mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-20 08:45:47 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been change, more cleanups will follow.
374 lines
15 KiB
Python
374 lines
15 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import re, os
|
|
|
|
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
|
from ebook_converter.polyglot.builtins import unicode_type, range
|
|
|
|
|
|
def update_internal_links(mobi8_reader, log):
|
|
# need to update all links that are internal which
|
|
# are based on positions within the xhtml files **BEFORE**
|
|
# cutting and pasting any pieces into the xhtml text files
|
|
|
|
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
|
|
# XXXX is the offset in records into divtbl
|
|
# YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
|
|
|
|
mr = mobi8_reader
|
|
|
|
# pos:fid pattern
|
|
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
|
|
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
|
|
|
|
parts = []
|
|
for part in mr.parts:
|
|
srcpieces = posfid_pattern.split(part)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
if tag.startswith(b'<'):
|
|
for m in posfid_index_pattern.finditer(tag):
|
|
posfid = m.group(1)
|
|
offset = m.group(2)
|
|
try:
|
|
filename, idtag = mr.get_id_tag_by_pos_fid(
|
|
int(posfid, 32), int(offset, 32))
|
|
except ValueError:
|
|
log.warn('Invalid link, points to nowhere, ignoring')
|
|
replacement = b'#'
|
|
else:
|
|
suffix = (b'#' + idtag) if idtag else b''
|
|
replacement = filename.split('/')[-1].encode(
|
|
mr.header.codec) + suffix
|
|
replacement = replacement.replace(b'"', b'"')
|
|
tag = posfid_index_pattern.sub(b'"' + replacement + b'"', tag, 1)
|
|
srcpieces[j] = tag
|
|
raw = b''.join(srcpieces)
|
|
try:
|
|
parts.append(raw.decode(mr.header.codec))
|
|
except UnicodeDecodeError:
|
|
log.warn('Failed to decode text in KF8 part, replacing bad bytes')
|
|
parts.append(raw.decode(mr.header.codec, 'replace'))
|
|
|
|
# All parts are now unicode and have no internal links
|
|
return parts
|
|
|
|
|
|
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
|
|
|
|
# we can safely remove all of the Kindlegen generated aid attributes and
|
|
# calibre generated cid attributes
|
|
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\s[ac]id\s*=[^>]*>)''',
|
|
re.IGNORECASE)
|
|
within_tag_aid_position_pattern = re.compile(r'''\s[ac]id\s*=['"]([^'"]*)['"]''')
|
|
|
|
for i in range(len(parts)):
|
|
part = parts[i]
|
|
srcpieces = find_tag_with_aid_pattern.split(part)
|
|
for j in range(len(srcpieces)):
|
|
tag = srcpieces[j]
|
|
if tag.startswith('<'):
|
|
for m in within_tag_aid_position_pattern.finditer(tag):
|
|
try:
|
|
aid = m.group(1)
|
|
except IndexError:
|
|
aid = None
|
|
replacement = ''
|
|
if aid in linked_aids:
|
|
replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
|
|
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
|
srcpieces[j] = tag
|
|
part = "".join(srcpieces)
|
|
parts[i] = part
|
|
|
|
# we can safely remove all of the Kindlegen generated data-AmznPageBreak
|
|
# attributes
|
|
find_tag_with_AmznPageBreak_pattern = re.compile(
|
|
r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
|
|
within_tag_AmznPageBreak_position_pattern = re.compile(
|
|
r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
|
|
|
|
for i in range(len(parts)):
|
|
part = parts[i]
|
|
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
|
|
for j in range(len(srcpieces)):
|
|
tag = srcpieces[j]
|
|
if tag.startswith('<'):
|
|
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
|
|
lambda m:' style="page-break-after:%s"'%m.group(1), tag)
|
|
part = "".join(srcpieces)
|
|
parts[i] = part
|
|
|
|
|
|
def update_flow_links(mobi8_reader, resource_map, log):
|
|
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
|
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
|
# kindle:embed:XXXX (used for fonts)
|
|
|
|
mr = mobi8_reader
|
|
flows = []
|
|
|
|
img_pattern = re.compile(r'''(<[img\s|image\s|svg:image\s][^>]*>)''', re.IGNORECASE)
|
|
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)
|
|
|
|
tag_pattern = re.compile(r'''(<[^>]*>)''')
|
|
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
|
|
|
|
url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
|
|
url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
|
|
font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
|
|
url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
|
|
|
|
for flow in mr.flows:
|
|
if flow is None: # 0th flow is None
|
|
flows.append(flow)
|
|
continue
|
|
|
|
if not isinstance(flow, unicode_type):
|
|
try:
|
|
flow = flow.decode(mr.header.codec)
|
|
except UnicodeDecodeError:
|
|
log.error('Flow part has invalid %s encoded bytes'%mr.header.codec)
|
|
flow = flow.decode(mr.header.codec, 'replace')
|
|
|
|
# links to raster image files from image tags
|
|
# image_pattern
|
|
srcpieces = img_pattern.split(flow)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
if tag.startswith('<im') or tag.startswith('<svg:image'):
|
|
for m in img_index_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
href = resource_map[num-1]
|
|
if href:
|
|
replacement = '"%s"'%('../'+ href)
|
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
|
else:
|
|
log.warn('Referenced image %s was not recognized '
|
|
'as a valid image in %s' % (num, tag))
|
|
srcpieces[j] = tag
|
|
flow = "".join(srcpieces)
|
|
|
|
# replacements inside css url():
|
|
srcpieces = url_pattern.split(flow)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
|
|
# process links to raster image files
|
|
for m in url_img_index_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
href = resource_map[num-1]
|
|
if href:
|
|
replacement = '"%s"'%('../'+ href)
|
|
tag = url_img_index_pattern.sub(replacement, tag, 1)
|
|
else:
|
|
log.warn('Referenced image %s was not recognized as a '
|
|
'valid image in %s' % (num, tag))
|
|
|
|
# process links to fonts
|
|
for m in font_index_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
href = resource_map[num-1]
|
|
if href is None:
|
|
log.warn('Referenced font %s was not recognized as a '
|
|
'valid font in %s' % (num, tag))
|
|
else:
|
|
replacement = '"%s"'%('../'+ href)
|
|
if href.endswith('.failed'):
|
|
replacement = '"%s"'%('failed-'+href)
|
|
tag = font_index_pattern.sub(replacement, tag, 1)
|
|
|
|
# process links to other css pieces
|
|
for m in url_css_index_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
fi = mr.flowinfo[num]
|
|
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
|
|
tag = url_css_index_pattern.sub(replacement, tag, 1)
|
|
|
|
srcpieces[j] = tag
|
|
flow = "".join(srcpieces)
|
|
|
|
# flow pattern not inside url()
|
|
srcpieces = re.split(tag_pattern, flow)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
if tag.startswith('<'):
|
|
for m in re.finditer(flow_pattern, tag):
|
|
try:
|
|
num = int(m.group(1), 32)
|
|
fi = mr.flowinfo[num]
|
|
except IndexError:
|
|
log.warn('Ignoring invalid flow reference in tag', tag)
|
|
tag = ''
|
|
else:
|
|
if fi.format == 'inline':
|
|
flowtext = mr.flows[num]
|
|
tag = flowtext
|
|
else:
|
|
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
|
|
tag = flow_pattern.sub(replacement, tag, 1)
|
|
srcpieces[j] = tag
|
|
flow = "".join(srcpieces)
|
|
|
|
flows.append(flow)
|
|
|
|
# All flows are now unicode and have links resolved
|
|
return flows
|
|
|
|
|
|
def insert_flows_into_markup(parts, flows, mobi8_reader, log):
|
|
mr = mobi8_reader
|
|
|
|
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
|
tag_pattern = re.compile(r'''(<[^>]*>)''')
|
|
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
|
|
for i in range(len(parts)):
|
|
part = parts[i]
|
|
|
|
# flow pattern
|
|
srcpieces = tag_pattern.split(part)
|
|
for j in range(1, len(srcpieces),2):
|
|
tag = srcpieces[j]
|
|
if tag.startswith('<'):
|
|
for m in flow_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
try:
|
|
fi = mr.flowinfo[num]
|
|
except IndexError:
|
|
log.warn('Ignoring invalid flow reference: %s'%m.group())
|
|
tag = ''
|
|
else:
|
|
if fi.format == 'inline':
|
|
tag = flows[num]
|
|
else:
|
|
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
|
|
tag = flow_pattern.sub(replacement, tag, 1)
|
|
srcpieces[j] = tag
|
|
part = "".join(srcpieces)
|
|
# store away modified version
|
|
parts[i] = part
|
|
|
|
|
|
def insert_images_into_markup(parts, resource_map, log):
|
|
# Handle any embedded raster images links in the xhtml text
|
|
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
|
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
|
|
img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')
|
|
|
|
style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
|
|
re.IGNORECASE)
|
|
|
|
for i in range(len(parts)):
|
|
part = parts[i]
|
|
srcpieces = img_pattern.split(part)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
if tag.startswith('<im'):
|
|
for m in img_index_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
href = resource_map[num-1]
|
|
if href:
|
|
replacement = '"%s"'%('../' + href)
|
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
|
else:
|
|
log.warn('Referenced image %s was not recognized as '
|
|
'a valid image in %s' % (num, tag))
|
|
srcpieces[j] = tag
|
|
part = "".join(srcpieces)
|
|
# store away modified version
|
|
parts[i] = part
|
|
|
|
# Replace urls used in style attributes
|
|
for i in range(len(parts)):
|
|
part = parts[i]
|
|
srcpieces = style_pattern.split(part)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
if 'kindle:embed' in tag:
|
|
for m in img_index_pattern.finditer(tag):
|
|
num = int(m.group(1), 32)
|
|
href = resource_map[num-1]
|
|
osep = m.group()[0]
|
|
csep = m.group()[-1]
|
|
if href:
|
|
replacement = '%s%s%s'%(osep, '../' + href, csep)
|
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
|
else:
|
|
log.warn('Referenced image %s was not recognized as '
|
|
'a valid image in %s' % (num, tag))
|
|
srcpieces[j] = tag
|
|
part = "".join(srcpieces)
|
|
# store away modified version
|
|
parts[i] = part
|
|
|
|
|
|
def upshift_markup(parts):
|
|
tag_pattern = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)
|
|
|
|
for i in range(len(parts)):
|
|
part = parts[i]
|
|
|
|
# tag pattern
|
|
srcpieces = re.split(tag_pattern, part)
|
|
for j in range(1, len(srcpieces), 2):
|
|
tag = srcpieces[j]
|
|
if tag[:4].lower() == '<svg':
|
|
tag = tag.replace('preserveaspectratio','preserveAspectRatio')
|
|
tag = tag.replace('viewbox','viewBox')
|
|
srcpieces[j] = tag
|
|
part = "".join(srcpieces)
|
|
# store away modified version
|
|
parts[i] = part
|
|
|
|
|
|
def expand_mobi8_markup(mobi8_reader, resource_map, log):
|
|
# First update all internal links that are based on offsets
|
|
parts = update_internal_links(mobi8_reader, log)
|
|
|
|
# Remove pointless markup inserted by kindlegen
|
|
remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)
|
|
|
|
# Handle substitutions for the flows pieces first as they may
|
|
# be inlined into the xhtml text
|
|
flows = update_flow_links(mobi8_reader, resource_map, log)
|
|
|
|
# Insert inline flows into the markup
|
|
insert_flows_into_markup(parts, flows, mobi8_reader, log)
|
|
|
|
# Insert raster images into markup
|
|
insert_images_into_markup(parts, resource_map, log)
|
|
|
|
# Perform general markup cleanups
|
|
upshift_markup(parts)
|
|
|
|
# Update the parts and flows stored in the reader
|
|
mobi8_reader.parts = parts
|
|
mobi8_reader.flows = flows
|
|
|
|
# write out the parts and file flows
|
|
os.mkdir('text') # directory containing all parts
|
|
spine = []
|
|
for i, part in enumerate(parts):
|
|
pi = mobi8_reader.partinfo[i]
|
|
with open(os.path.join(pi.type, pi.filename), 'wb') as f:
|
|
part = strip_encoding_declarations(part)
|
|
part = part.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
|
|
f.write(part.encode('utf-8'))
|
|
spine.append(f.name)
|
|
|
|
for i, flow in enumerate(flows):
|
|
fi = mobi8_reader.flowinfo[i]
|
|
if fi.format == 'file':
|
|
if not os.path.exists(fi.dir):
|
|
os.mkdir(fi.dir)
|
|
with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
|
|
f.write(flow.encode('utf-8'))
|
|
|
|
return spine
|