import re, os
from ebook_converter.ebooks.chardet import strip_encoding_declarations


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal '
__docformat__ = 'restructuredtext en'


def update_internal_links(mobi8_reader, log):
    """Resolve position based internal links in every part of the book.

    Each quoted ``kindle:pos:fid:XXXX:off:YYYY`` reference found inside a tag
    is replaced by a quoted ``file#id`` target obtained from
    ``mobi8_reader.get_id_tag_by_pos_fid``.  A reference for which that lookup
    raises ``ValueError`` is replaced by ``"#"`` and a warning is logged.

    :param mobi8_reader: object providing ``parts`` (iterable of bytes),
        ``header.codec`` and ``get_id_tag_by_pos_fid(pos_fid, offset)``.
    :param log: object providing ``warn(message)``.
    :return: list of str, one per part, decoded with ``header.codec``
        (undecodable bytes are replaced after logging a warning).
    """
    # need to update all links that are internal which
    # are based on positions within the xhtml files **BEFORE**
    # cutting and pasting any pieces into the xhtml text files

    # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
    # XXXX is the offset in records into divtbl
    # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get
    # final position
    mr = mobi8_reader

    # Split on whole tags so that only markup (never text content) is
    # inspected; odd indices of the split result are the captured tags.
    posfid_pattern = re.compile(br'''(<[^>]*>)''')
    posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

    def resolve(match):
        # Return the quoted replacement for one kindle:pos:fid reference.
        try:
            filename, idtag = mr.get_id_tag_by_pos_fid(
                int(match.group(1), 32), int(match.group(2), 32))
        except ValueError:
            log.warn('Invalid link, points to nowhere, ignoring')
            replacement = b'#'
        else:
            suffix = (b'#' + idtag) if idtag else b''
            replacement = filename.split('/')[-1].encode(
                mr.header.codec) + suffix
        # The target is emitted inside double quotes, so a literal quote in
        # it must be escaped or it would end the attribute value early.
        replacement = replacement.replace(b'"', b'&quot;')
        return b'"' + replacement + b'"'

    parts = []
    for part in mr.parts:
        srcpieces = posfid_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            # A function replacement is used so that backslashes in the
            # target are not interpreted as regex template escapes.
            srcpieces[j] = posfid_index_pattern.sub(resolve, srcpieces[j])
        raw = b''.join(srcpieces)
        try:
            parts.append(raw.decode(mr.header.codec))
        except UnicodeDecodeError:
            log.warn('Failed to decode text in KF8 part, replacing bad bytes')
            parts.append(raw.decode(mr.header.codec, 'replace'))

    # All parts are now unicode and have no internal links
    return parts


def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
    """Strip Kindlegen/calibre bookkeeping attributes from *parts* in place.

    * ``aid``/``cid`` attributes are removed from tags; when the attribute
      value is in *linked_aids* it is replaced by
      ``id="<value>-<aid_anchor_suffix>"`` instead.
    * ``data-AmznPageBreak="X"`` attributes are replaced by
      ``style="page-break-after:X"``.

    Only tags are rewritten; text between tags is left untouched.

    :param parts: list of str, modified in place.
    :param aid_anchor_suffix: str appended (after ``-``) to kept aid values.
    :param linked_aids: container of aid values that must survive as ids.
    :return: None
    """
    # we can safely remove all of the Kindlegen generated aid attributes and
    # calibre generated cid attributes
    find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\s[ac]id\s*=[^>]*>)''',
                                           re.IGNORECASE)
    within_tag_aid_position_pattern = re.compile(r'''\s[ac]id\s*=['"]([^'"]*)['"]''')

    def replace_aid(match):
        # Keep linked aids as real ids, drop every other aid/cid attribute.
        aid = match.group(1)
        if aid in linked_aids:
            return ' id="%s"' % (aid + '-' + aid_anchor_suffix)
        return ''

    for i, part in enumerate(parts):
        srcpieces = find_tag_with_aid_pattern.split(part)
        # Odd indices are the captured tags; even indices are the text runs
        # in between and must not be modified.
        for j in range(1, len(srcpieces), 2):
            srcpieces[j] = within_tag_aid_position_pattern.sub(
                replace_aid, srcpieces[j])
        parts[i] = "".join(srcpieces)

    # we can safely remove all of the Kindlegen generated data-AmznPageBreak
    # attributes
    find_tag_with_AmznPageBreak_pattern = re.compile(
        r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
    within_tag_AmznPageBreak_position_pattern = re.compile(
        r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    for i, part in enumerate(parts):
        srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                lambda m: ' style="page-break-after:%s"' % m.group(1),
                srcpieces[j])
        parts[i] = "".join(srcpieces)


def update_flow_links(mobi8_reader, resource_map, log):
    # NOTE(review): only the opening statements of this function fall inside
    # this span; they are reproduced unchanged.
    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    # kindle:embed:XXXX (used for fonts)
    mr = mobi8_reader
    flows = []
    img_pattern = re.compile(r'''(<[img\s|image\s|svg:image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)
    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
    url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE) url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) for flow in mr.flows: if flow is None: # 0th flow is None flows.append(flow) continue if not isinstance(flow, str): try: flow = flow.decode(mr.header.codec) except UnicodeDecodeError: log.error('Flow part has invalid %s encoded bytes'%mr.header.codec) flow = flow.decode(mr.header.codec, 'replace') # links to raster image files from image tags # image_pattern srcpieces = img_pattern.split(flow) for j in range(1, len(srcpieces), 2): tag = srcpieces[j] if tag.startswith(']*>)''') flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) for i in range(len(parts)): part = parts[i] # flow pattern srcpieces = tag_pattern.split(part) for j in range(1, len(srcpieces),2): tag = srcpieces[j] if tag.startswith('<'): for m in flow_pattern.finditer(tag): num = int(m.group(1), 32) try: fi = mr.flowinfo[num] except IndexError: log.warn('Ignoring invalid flow reference: %s'%m.group()) tag = '' else: if fi.format == 'inline': tag = flows[num] else: replacement = '"../' + fi.dir + '/' + fi.fname + '"' tag = flow_pattern.sub(replacement, tag, 1) srcpieces[j] = tag part = "".join(srcpieces) # store away modified version parts[i] = part def insert_images_into_markup(parts, resource_map, log): # Handle any embedded raster images links in the xhtml text # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''') style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE) for i in range(len(parts)): part = parts[i] srcpieces = img_pattern.split(part) for j in range(1, len(srcpieces), 2): tag = srcpieces[j] if tag.startswith(']*>)''', re.IGNORECASE) for i in range(len(parts)): 
part = parts[i] # tag pattern srcpieces = re.split(tag_pattern, part) for j in range(1, len(srcpieces), 2): tag = srcpieces[j] if tag[:4].lower() == '', '', 1) f.write(part.encode('utf-8')) spine.append(f.name) for i, flow in enumerate(flows): fi = mobi8_reader.flowinfo[i] if fi.format == 'file': if not os.path.exists(fi.dir): os.mkdir(fi.dir) with open(os.path.join(fi.dir, fi.fname), 'wb') as f: f.write(flow.encode('utf-8')) return spine