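"""Reader for MOBI (Mobipocket) and PalmDoc books.

MobiReader parses the PalmDB container, decompresses the text records,
extracts embedded images and rewrites the legacy MOBI markup into HTML
plus an OPF/NCX pair for the rest of the conversion pipeline.
"""
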
import io
import logging
import mimetypes
import os
import re
import shutil
import struct
import textwrap

from lxml import html, etree

from ebook_converter import xml_entity_to_unicode, entity_to_unicode
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.ebooks import DRMError, unit_convert
from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.huffcdic import HuffReader
from ebook_converter.ebooks.compression.palmdoc import decompress_doc
from ebook_converter.ebooks.metadata import MetaInformation
from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.reader.headers import BookHeader
from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
from ebook_converter.utils.img import AnimatedGIF
from ebook_converter.utils.imghdr import what


class TopazError(ValueError):
    pass


class KFXError(ValueError):

    def __init__(self):
        ValueError.__init__(self, 'This is an Amazon KFX book. It cannot be '
                            'processed. See https://www.mobileread.com/forums/'
                            'showthread.php?t=283371 for information on how '
                            'to handle KFX books.')


class MobiReader(object):
    PAGE_BREAK_PAT = re.compile(r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*)'
                                r'{0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}'
                                r'\s*mbp:pagebreak\s*/{0,1}\s*>)*',
                                re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
                 try_extra_data_fix=False):
        self.log = log
        self.debug = debug
        self.embedded_mi = None
        self.warned_about_trailing_entry_corruption = False
        self.base_css_rules = textwrap.dedent('''
            body { text-align: justify }

            blockquote { margin: 0em 0em 0em 2em; }

            p { margin: 0em; text-indent: 1.5em }

            .bold { font-weight: bold }

            .italic { font-style: italic }

            .underline { text-decoration: underline }

            .mbp_pagebreak {
                page-break-after: always; margin: 0; display: block
            }
            ''')
        self.tag_css_rules = {}
        self.left_margins = {}
        self.text_indents = {}

        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
            stream.seek(0)
        else:
            stream = open(filename_or_stream, 'rb')

        raw = stream.read()
        if raw.startswith(b'TPZ'):
            raise TopazError('This is an Amazon Topaz book. It cannot be '
                             'processed.')
        if raw.startswith(b'\xeaDRMION\xee'):
            raise KFXError()

        self.header = raw[0:72]
        self.name = self.header[:32].replace(b'\x00', b'')
        self.num_sections, = struct.unpack('>H', raw[76:78])

        self.ident = self.header[0x3C:0x3C + 8].upper()
        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % repr(self.ident))

        self.sections = []
        self.section_headers = []
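        # PalmDB layout: after the 78-byte header (name in the first 32
        # bytes, type/creator at offset 0x3C, record count at offset 76)
        # comes one 8-byte entry per record: a 4-byte offset, one byte of
        # flags and a 3-byte unique id.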
        for i in range(self.num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
                                                   raw[78 + i * 8:78 +
                                                       i * 8 + 8])
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            self.section_headers.append((offset, flags, val))

        def section(section_number):
            if section_number == self.num_sections - 1:
                end_off = len(raw)
            else:
                end_off = self.section_headers[section_number + 1][0]
            off = self.section_headers[section_number][0]
            return raw[off:end_off]

        for i in range(self.num_sections):
            self.sections.append((section(i), self.section_headers[i]))

        bh = BookHeader(self.sections[0][0], self.ident, user_encoding,
                        self.log, try_extra_data_fix=try_extra_data_fix)
        self.book_header = bh
        self.name = self.name.decode(self.book_header.codec, 'replace')
        self.kf8_type = None
        k8i = getattr(self.book_header.exth, 'kf8_header', None)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if (self.book_header.mobi_version == 8 and
                hasattr(self.book_header, 'skelidx')):
            self.kf8_type = 'standalone'
        elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
            try:
                raw = self.sections[k8i-1][0]
            except Exception:
                raw = None
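            # In a joint MOBI6/KF8 file a 'BOUNDARY' record separates the two
            # books, and the KF8 header's record numbers are relative to it,
            # hence the k8i offsets applied below.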
            if raw == b'BOUNDARY':
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                                                  self.ident, user_encoding,
                                                  self.log)
                    _kfii = self.book_header.first_image_index + k8i
                    self.book_header.kf8_first_image_index = _kfii
                    self.book_header.mobi6_records = bh.records

                    # Need the first_image_index from the mobi 6 header as
                    # well
                    for x in ('first_image_index',):
                        setattr(self.book_header, x, getattr(bh, x))

                    # We need to do this because the MOBI 6 text extract code
                    # does not know anything about the kf8 offset
                    if hasattr(self.book_header, 'huff_offset'):
                        self.book_header.huff_offset += k8i

                    self.kf8_type = 'joint'
                    self.kf8_boundary = k8i-1
                except Exception:
                    self.book_header = bh

    def check_for_drm(self):
        if self.book_header.encryption_type != 0:
            try:
                name = self.book_header.exth.mi.title
            except Exception:
                name = self.name
            if not name:
                name = self.name
            raise DRMError(name)

    def extract_content(self, output_dir, parse_cache):
        output_dir = os.path.abspath(output_dir)
        self.check_for_drm()
        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(
            self.book_header.codec, 'ignore')
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                                     self.processed_html)
        self.processed_html = self.processed_html.replace('\ufeff', '')
        # Remove tags of the form <xyz: ...> as they can cause issues further
        # along the pipeline
        self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                                     self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
                                     self.processed_html)
        image_name_map = self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html
                                       .replace('\x0c', '')
                                       .replace('\x14', ''))
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. '
                             'Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)
        if root.xpath('descendant::p/descendant::p'):
            from html5_parser import parse
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(
                self.processed_html)
            # These trip up the html5 parser causing all content to be placed
            # under the <guide> tag
            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '',
                                         self.processed_html, flags=re.I)
            self.processed_html = re.sub(r'<guide>.+?</guide>', '',
                                         self.processed_html, flags=re.I)
            try:
                root = parse(self.processed_html, maybe_xhtml=False,
                             keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random '
                                 'bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(
                    self.processed_html)
                root = parse(self.processed_html, maybe_xhtml=False,
                             keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                                                                  '')
                root = parse(self.processed_html, maybe_xhtml=False,
                             keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))

        if len(htmls) > 1:
            self.log.warn('Markup contains multiple <html> tags, merging.')
            # Merge all <head> and <body> sections
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            for x in list(root):  # copy: children are removed while iterating
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in list(h):  # copy for the same reason
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in list(b):
                    b.remove(x)
                    body.append(x)
            root.append(head)
            root.append(body)
        for x in root.xpath('//script'):
            x.getparent().remove(x)

        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {'type': 'text/css',
                                         'href': 'styles.css',
                                         'rel': 'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {'http-equiv': 'Content-Type',
                                      'content': 'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            try:
                title.text = self.book_header.title
            except ValueError:
                title.text = clean_ascii_chars(self.book_header.title)
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root, image_name_map)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 'index.html')
        try:
            for ref in guide.xpath('descendant::reference'):
                if 'href' in ref.attrib:
                    ref.attrib['href'] = (os.path.basename(htmlfile) +
                                          ref.attrib['href'])
        except AttributeError:
            pass

        def write_as_utf8(path, data):
            if isinstance(data, str):
                data = data.encode('utf-8')
            with open(path, 'wb') as f:
                f.write(data)

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        with open(self.created_opf_path, 'wb') as f:
            opf.render(f, ncx, ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            write_as_utf8(ncx_path, ncx)
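
        # Note: 'styles.css' below (and the relative cover/image paths used
        # in create_opf) are written relative to the current working
        # directory; this appears to assume the caller has chdir'ed into
        # output_dir first.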
        css = [self.base_css_rules, '\n\n']
        for cls, rule in self.tag_css_rules.items():
            css.append('.%s { %s }\n\n' % (cls, rule))
        write_as_utf8('styles.css', ''.join(css))

        if self.book_header.exth is not None or self.embedded_mi is not None:
            self.log.debug('Creating OPF...')
            ncx = io.BytesIO()
            opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
            with open(os.path.splitext(htmlfile)[0] + '.opf', 'wb') as f:
                opf.render(f, ncx, ncx_manifest_entry)
            ncx = ncx.getvalue()
            if ncx:
                write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)

    def read_embedded_metadata(self, root, elem, guide):
        raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
            html.tostring(elem, encoding='utf-8') + b'</package>'
        stream = io.BytesIO(raw)
        opf = OPF(stream)
        self.embedded_mi = opf.to_book_metadata()
        if guide is not None:
            for ref in guide.xpath('descendant::reference'):
                if 'cover' in ref.get('type', '').lower():
                    href = ref.get('href', '')
                    if href.startswith('#'):
                        href = href[1:]
                    anchors = root.xpath('//*[@id="%s"]' % href)
                    if anchors:
                        cpos = anchors[0]
                        reached = False
                        for elem in root.iter():
                            if elem is cpos:
                                reached = True
                            if reached and elem.tag == 'img':
                                cover = elem.get('src', None)
                                self.embedded_mi.cover = cover
                                elem.getparent().remove(elem)
                                break
                    break

    def cleanup_html(self):
        self.log.debug('Cleaning up HTML...')
        self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}">'
                                     '</div>', '', self.processed_html)
        if (self.book_header.ancient and
                b'<html' not in self.mobi_html[:300].lower()):
            self.processed_html = ('<html><p>' +
                                   self.processed_html.replace('\n\n', '<p>') +
                                   '</html>')
        self.processed_html = self.processed_html.replace('\r\n', '\n')
        self.processed_html = self.processed_html.replace('> <', '>\n<')
        self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
        self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
        self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'',
                                     self.processed_html)
        # Swap inline and block level elements, and order block level elements
        # according to priority
        # - lxml and beautifulsoup expect/assume a specific order based on
        #   xhtml spec
        self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|'
                                     r'small|big|strong|tt)>\s*){1,})'
                                     r'(?P<para><p[^>]*>)',
                                     r'\g<para>' + r'\g<styletags>',
                                     self.processed_html)
        self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*'
                                     r'(?P<styletags>(</(h\d+|i|b|u|em|small|'
                                     r'big|strong|tt)>\s*){1,})',
                                     r'\g<styletags>' + r'\g<para>',
                                     self.processed_html)
        self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)'
                                     r'[^>]*>\s*){1,})(?P<para></p[^>]*>)',
                                     r'\g<para>' + r'\g<blockquote>',
                                     self.processed_html)
        self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*'
                                     r'(?P<blockquote>(<(blockquote|div)[^>]*>'
                                     r'\s*){1,})',
                                     r'\g<blockquote>' + r'\g<para>',
                                     self.processed_html)
        bods = htmls = 0
        for x in re.finditer('</body>|</html>', self.processed_html):
            # finditer yields Match objects; compare the matched text itself
            if x.group() == '</body>':
                bods += 1
            else:
                htmls += 1
            if bods > 1 and htmls > 1:
                break
        if bods > 1:
            self.processed_html = self.processed_html.replace('</body>', '')
        if htmls > 1:
            self.processed_html = self.processed_html.replace('</html>', '')

    def remove_random_bytes(self, html):
        return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01'
                      '|\x02|\x03|\x04|\x05|\x06|\x07', '', html)

    def ensure_unit(self, raw, unit='px'):
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    def upshift_markup(self, root, image_name_map=None):
        self.log.debug('Converting style information to CSS...')
        image_name_map = image_name_map or {}
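        # Legacy <font size=N> values substituted when the size attribute is
        # given as a CSS keyword (see the 'font' branch below).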
        size_map = {
            'xx-small': '0.5',
            'x-small': '1',
            'small': '2',
            'medium': '3',
            'large': '4',
            'x-large': '5',
            'xx-large': '6',
        }

        def barename(x):
            return x.rpartition(':')[-1]

        mobi_version = self.book_header.mobi_version
        for x in root.xpath('//ncx'):
            x.getparent().remove(x)
        svg_tags = []
        forwardable_anchors = []
        pagebreak_anchors = []
        BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p'}
        for i, tag in enumerate(root.iter(etree.Element)):
            tag.attrib.pop('xmlns', '')
            for x in tag.attrib:
                if ':' in x:
                    del tag.attrib[x]
            if tag.tag and barename(tag.tag) == 'svg':
                svg_tags.append(tag)
            if tag.tag and barename(tag.tag.lower()) in \
                    ('country-region', 'place', 'placetype', 'placename',
                     'state', 'city', 'street', 'address', 'content', 'form'):
                tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
                for key in tag.attrib.keys():
                    tag.attrib.pop(key)
                continue
            styles, attrib = [], tag.attrib
            if 'style' in attrib:
                style = attrib.pop('style').strip()
                if style:
                    styles.append(style)
            if 'height' in attrib:
                height = attrib.pop('height').strip()
                if (height and '<' not in height and '>' not in height and
                        re.search(r'\d+', height)):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('height', height)
                    else:
                        if tag.tag == 'div' and not tag.text and \
                                (not tag.tail or not tag.tail.strip()) and \
                                not len(list(tag.iterdescendants())):
                            # Paragraph spacer
                            # Insert nbsp so that the element is never
                            # discarded by a renderer
                            tag.text = '\u00a0'  # nbsp
                            styles.append('height: %s' %
                                          self.ensure_unit(height))
                        else:
                            styles.append('margin-top: %s' %
                                          self.ensure_unit(height))
            if 'width' in attrib:
                width = attrib.pop('width').strip()
                if width and re.search(r'\d+', width):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('width', width)
                    else:
                        ewidth = self.ensure_unit(width)
                        styles.append('text-indent: %s' % ewidth)
                        try:
                            ewidth_val = unit_convert(ewidth, 12, 500, 166)
                            self.text_indents[tag] = ewidth_val
                        except Exception:
                            pass
                        if width.startswith('-'):
                            styles.append('margin-left: %s' %
                                          self.ensure_unit(width[1:]))
                            try:
                                ewidth_val = unit_convert(ewidth[1:],
                                                          12, 500, 166)
                                self.left_margins[tag] = ewidth_val
                            except Exception:
                                pass

            if 'align' in attrib:
                align = attrib.pop('align').strip()
                if align:
                    align = align.lower()
                    if align == 'baseline':
                        styles.append('vertical-align: '+align)
                    else:
                        styles.append('text-align: %s' % align)
            if tag.tag == 'hr':
                if mobi_version == 1:
                    tag.tag = 'div'
                    styles.append('page-break-before: always')
                    styles.append('display: block')
                    styles.append('margin: 0')
            elif tag.tag == 'i':
                tag.tag = 'span'
                tag.attrib['class'] = 'italic'
            elif tag.tag == 'u':
                tag.tag = 'span'
                tag.attrib['class'] = 'underline'
            elif tag.tag == 'b':
                tag.tag = 'span'
                tag.attrib['class'] = 'bold'
            elif tag.tag == 'font':
                sz = tag.get('size', '').lower()
                try:
                    float(sz)
                except ValueError:
                    if sz in size_map:
                        attrib['size'] = size_map[sz]
            elif tag.tag == 'img':
                recindex = None
                for attr in self.IMAGE_ATTRS:
                    recindex = attrib.pop(attr, None) or recindex
                if recindex is not None:
                    try:
                        recindex = int(recindex)
                    except Exception:
                        pass
                    else:
                        attrib['src'] = ('images/' +
                                         image_name_map.get(recindex,
                                                            '%05d.jpg' %
                                                            recindex))
                for attr in ('width', 'height'):
                    if attr in attrib:
                        val = attrib[attr]
                        if val.lower().endswith('em'):
                            try:
                                nval = float(val[:-2])
                                # Assume this was set using the Kindle profile
                                nval *= 16 * (168.451/72)
                                attrib[attr] = "%dpx" % int(nval)
                            except Exception:
                                del attrib[attr]
                        elif val.lower().endswith('%'):
                            del attrib[attr]
            elif tag.tag == 'pre':
                if not tag.text:
                    tag.tag = 'div'

            if (attrib.get('class', None) == 'mbp_pagebreak' and
                    tag.tag == 'div' and 'filepos-id' in attrib):
                pagebreak_anchors.append(tag)

            if 'color' in attrib:
                styles.append('color: ' + attrib.pop('color'))
            if 'bgcolor' in attrib:
                styles.append('background-color: ' + attrib.pop('bgcolor'))

            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
                if 'name' in attrib and attrib['name'] != attrib['id']:
                    attrib['name'] = attrib['id']
            if 'filepos' in attrib:
                filepos = attrib.pop('filepos')
                try:
                    attrib['href'] = "#filepos%d" % int(filepos)
                except ValueError:
                    pass
            if (tag.tag == 'a' and
                    attrib.get('id', '').startswith('filepos') and
                    not tag.text and len(tag) == 0 and
                    (tag.tail is None or not tag.tail.strip()) and
                    getattr(tag.getnext(), 'tag', None) in BLOCK_TAGS):
                # This is an empty anchor immediately before a block tag, move
                # the id onto the block tag instead
                forwardable_anchors.append(tag)

            if styles:
                ncls = None
                rule = '; '.join(styles)
                for sel, srule in self.tag_css_rules.items():
                    if srule == rule:
                        ncls = sel
                        break
                if ncls is None:
                    ncls = 'calibre_%d' % i
                    self.tag_css_rules[ncls] = rule
                cls = attrib.get('class', '')
                cls = cls + (' ' if cls else '') + ncls
                attrib['class'] = cls

        for tag in svg_tags:
            images = tag.xpath('descendant::img[@src]')
            parent = tag.getparent()

            if images and hasattr(parent, 'find'):
                index = parent.index(tag)
                for img in images:
                    img.getparent().remove(img)
                    img.tail = img.text = None
                    parent.insert(index, img)

            if hasattr(parent, 'remove'):
                parent.remove(tag)

        for tag in pagebreak_anchors:
            anchor = tag.attrib['id']
            del tag.attrib['id']
            if 'name' in tag.attrib:
                del tag.attrib['name']
            p = tag.getparent()
            a = p.makeelement('a')
            a.attrib['id'] = anchor
            p.insert(p.index(tag)+1, a)
            if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
                forwardable_anchors.append(a)

        for tag in forwardable_anchors:
            block = tag.getnext()
            tag.getparent().remove(tag)

            if 'id' in block.attrib:
                tag.tail = block.text
                block.text = None
                block.insert(0, tag)
            else:
                block.attrib['id'] = tag.attrib['id']

        # WebKit fails to navigate to anchors located on <br> tags
        # ('/body/...' was an absolute path that can never match below the
        # <html> root, so it is made relative to root here)
        for br in root.xpath('body/br[@id]'):
            br.tag = 'div'

    def get_left_whitespace(self, tag):
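        # Estimate the cumulative left offset (margin + text-indent, in px)
        # of a node; used by create_opf() below to infer TOC nesting depth
        # from indentation.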

        def whitespace(tag):
            lm = ti = 0.0
            if tag.tag == 'p':
                ti = unit_convert('1.5em', 12, 500, 166)
            if tag.tag == 'blockquote':
                lm = unit_convert('2em', 12, 500, 166)
            lm = self.left_margins.get(tag, lm)
            ti = self.text_indents.get(tag, ti)
            try:
                lm = float(lm)
            except Exception:
                lm = 0.0
            try:
                ti = float(ti)
            except Exception:
                ti = 0.0
            return lm + ti

        parent = tag
        ans = 0.0
        while parent is not None:
            ans += whitespace(parent)
            parent = parent.getparent()

        return ans

    def create_opf(self, htmlfile, guide=None, root=None):
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.book_header.title, ['Unknown'])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
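        # EXTH cover_offset is a 0-based index into the image records, while
        # extracted images are named from 00001, hence the +1 below.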
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = 'images/%05d.jpg' % (self.book_header
                                             .exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            opf.cover = 'images/%05d.jpg' % 1
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                                               *opf.cover.split('/'))):
                opf.cover = None

        cover = opf.cover
        cover_copied = None
        if cover is not None:
            cover = cover.replace('/', os.sep)
            if os.path.exists(cover):
                ncover = 'images'+os.sep+'calibre_cover.jpg'
                if os.path.exists(ncover):
                    os.remove(ncover)
                shutil.copyfile(cover, ncover)
                cover_copied = os.path.abspath(ncover)
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [(htmlfile, 'application/xhtml+xml'),
                    (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set()
        for i in getattr(self, 'image_names', []):
            path = os.path.join(bp, 'images', i)
            added.add(path)
            manifest.append((path,
                             mimetypes.guess_type(path)[0] or 'image/jpeg'))
        if cover_copied is not None:
            manifest.append((cover_copied, 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
                    toc = ref.href()

        ncx_manifest_entry = None
        if toc:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
                reached = False
                for x in root.iter():
                    if x == elems[-1]:
                        reached = True
                        continue
                    if reached and x.tag == 'a':
                        href = x.get('href', '')
                        if href and re.match(r'\w+://', href) is None:
                            try:
                                text = ' '.join([t.strip() for t in
                                                 x.xpath('descendant:'
                                                         ':text()')])
                            except Exception:
                                text = ''
                            text = ent_pat.sub(entity_to_unicode, text)
                            item = tocobj.add_item(toc.partition('#')[0],
                                                   href[1:], text)
                            item.left_space = int(self.get_left_whitespace(x))
                            found = True
                    if (reached and found and
                            x.get('class', None) == 'mbp_pagebreak'):
                        break
            if tocobj is not None:
                tocobj = self.structure_toc(tocobj)
                opf.set_toc(tocobj)

        return opf, ncx_manifest_entry

    def structure_toc(self, toc):
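        # Nest the flat TOC using the indentation (left_space) recorded for
        # each entry: distinct indent values become nesting levels.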
        indent_vals = set()
        for item in toc:
            indent_vals.add(item.left_space)
        if len(indent_vals) > 6 or len(indent_vals) < 2:
            # Too many or too few levels, give up
            return toc
        indent_vals = sorted(indent_vals)

        last_found = [None for i in indent_vals]

        newtoc = TOC()

        def find_parent(level):
            candidates = last_found[:level]
            for x in reversed(candidates):
                if x is not None:
                    return x
            return newtoc

        for item in toc:
            level = indent_vals.index(item.left_space)
            parent = find_parent(level)
            last_found[level] = parent.add_item(item.href, item.fragment,
                                                item.text)

        return newtoc

    def sizeof_trailing_entries(self, data):
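        # Each text record can carry trailing data entries selected by the
        # MOBI extra-data flags; each entry ends with its own size encoded as
        # a backward-read variable-width integer (7 bits per byte, high bit
        # set on the final byte). Flag bit 0 selects multibyte-character
        # overlap data, whose size lives in the low 2 bits of the last byte.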
        def sizeof_trailing_entry(ptr, psize):
            bitpos, result = 0, 0
            while True:
                v = ord(ptr[psize-1:psize])
                result |= (v & 0x7F) << bitpos
                bitpos += 7
                psize -= 1
                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                    return result

        num = 0
        size = len(data)
        flags = self.book_header.extra_flags >> 1
        while flags:
            if flags & 1:
                try:
                    num += sizeof_trailing_entry(data, size - num)
                except IndexError:
                    self.warn_about_trailing_entry_corruption()
                    return 0
            flags >>= 1
        if self.book_header.extra_flags & 1:
            off = size - num - 1
            num += (ord(data[off:off+1]) & 0x3) + 1
        return num

    def warn_about_trailing_entry_corruption(self):
        if not self.warned_about_trailing_entry_corruption:
            self.warned_about_trailing_entry_corruption = True
            self.log.warn('The trailing data entries in this MOBI file are '
                          'corrupted, you might see corrupted text in the '
                          'output')

    def text_section(self, index):
        data = self.sections[index][0]
        trail_size = self.sizeof_trailing_entries(data)
        return data[:len(data)-trail_size]

    def extract_text(self, offset=1):
        self.log.debug('Extracting text...')
        text_sections = [self.text_section(i)
                         for i in range(offset,
                                        min(self.book_header.records + offset,
                                            len(self.sections)))]
        processed_records = list(range(offset-1, self.book_header.records +
                                       offset))

        self.mobi_html = b''
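
        # MOBI compression types: b'DH' (0x4448) is HUFF/CDIC, 2 is PalmDoc
        # LZ77 and 1 is uncompressed.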
        if self.book_header.compression_type == b'DH':
            huffs = [self.sections[i][0]
                     for i in range(self.book_header.huff_offset,
                                    self.book_header.huff_offset +
                                    self.book_header.huff_number)]
            processed_records += list(range(self.book_header.huff_offset,
                                            self.book_header.huff_offset +
                                            self.book_header.huff_number))
            huff = HuffReader(huffs)
            unpack = huff.unpack

        elif self.book_header.compression_type == b'\x00\x02':
            unpack = decompress_doc

        elif self.book_header.compression_type == b'\x00\x01':
            unpack = lambda x: x  # noqa
        else:
            raise MobiError('Unknown compression algorithm: %r' %
                            self.book_header.compression_type)
        self.mobi_html = b''.join(map(unpack, text_sections))
        if self.mobi_html.endswith(b'#'):
            self.mobi_html = self.mobi_html[:-1]

        if (self.book_header.ancient and
                b'<html' not in self.mobi_html[:300].lower()):
            self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
        self.mobi_html = self.mobi_html.replace(b'\0', b'')
        if self.book_header.codec == 'cp1252':
            # record separator
            self.mobi_html = self.mobi_html.replace(b'\x1e', b'')
            # start of text
            self.mobi_html = self.mobi_html.replace(b'\x02', b'')
        return processed_records

    def replace_page_breaks(self):
        self.processed_html = self.PAGE_BREAK_PAT.sub(
            r'<div \1 class="mbp_pagebreak" />',
            self.processed_html)

    def add_anchors(self):
        self.log.debug('Adding anchors...')
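        # MOBI links address raw byte offsets (filepos) into the text, so
        # insert <a id="fileposNNN"> markers at those offsets, or a
        # filepos-id attribute when the offset lands inside an opening tag.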
        positions = set()
        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
                                  re.IGNORECASE)
        for match in link_pattern.finditer(self.mobi_html):
            positions.add(int(match.group(1)))
        pos = 0
        processed_html = []
        end_tag_re = re.compile(br'<\s*/')
        for end in sorted(positions):
            if end == 0:
                continue
            oend = end
            _l = self.mobi_html.find(b'<', end)
            r = self.mobi_html.find(b'>', end)
            anchor = b'<a id="filepos%d"></a>'
            if r > -1 and (r < _l or _l == end or _l == -1):
                p = self.mobi_html.rfind(b'<', 0, end + 1)
                if (pos < end and p > -1 and
                        not end_tag_re.match(self.mobi_html[p:r]) and
                        not self.mobi_html[p:r + 1].endswith(b'/>')):
                    anchor = b' filepos-id="filepos%d"'
                    end = r
                else:
                    end = r + 1
            processed_html.append(self.mobi_html[pos:end] + (anchor % oend))
            pos = end
        processed_html.append(self.mobi_html[pos:])
        processed_html = b''.join(processed_html)

        # Remove anchors placed inside entities
        self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)'
                                     br'([^;]*);', br'&\1\3;\2',
                                     processed_html)

    def extract_images(self, processed_records, output_dir):
        self.log.debug('Extracting images...')
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        image_index = 0
        self.image_names = []
        image_name_map = {}
        start = getattr(self.book_header, 'first_image_index', -1)
        if start > self.num_sections or start < 0:
            # BAEN PRC files have bad headers
            start = 0
        for i in range(start, self.num_sections):
            if i in processed_records:
                continue
            processed_records.append(i)
            data = self.sections[i][0]
            image_index += 1
            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                            b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI',
                            b'VIDE'}:
                # This record is a known non image type, no need to try to
                # load the image
                continue

            try:
                imgfmt = what(None, data)
            except Exception:
                continue
            if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}:
                continue
            if imgfmt == 'jpeg':
                imgfmt = 'jpg'
            if imgfmt == 'gif':
                try:
                    data = gif_data_to_png_data(data)
                    imgfmt = 'png'
                except AnimatedGIF:
                    pass
            path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
            image_name_map[image_index] = os.path.basename(path)
            if imgfmt == 'png':
                with open(path, 'wb') as f:
                    f.write(data)
            else:
                try:
                    save_cover_data_to(data, path, minify_to=(10000, 10000))
                except Exception:
                    logging.exception('Exception has been thrown during '
                                      'transforming image')
                    continue
            self.image_names.append(os.path.basename(path))
        return image_name_map


def test_mbp_regex():
    for raw, m in {'<mbp:pagebreak></mbp:pagebreak>': '',
                   '<mbp:pagebreak xxx></mbp:pagebreak>yyy': ' xxxyyy',
                   '<mbp:pagebreak> </mbp:pagebreak>': '',
                   '<mbp:pagebreak>xxx': 'xxx',
                   '<mbp:pagebreak/>xxx': 'xxx',
                   '<mbp:pagebreak sdf/ >xxx': ' sdfxxx',
                   '<mbp:pagebreak / >': ' ',
                   '</mbp:pagebreak>': '',
                   '</mbp:pagebreak sdf>': ' sdf',
                   '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':
                   'xxx'}.items():
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != m:
            raise Exception('%r != %r for %r' % (ans, m, raw))
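
# Minimal usage sketch (paths are hypothetical): chdir into the output
# directory first, since extract_content() writes some files relative to it.
#
#     os.chdir('/path/to/outdir')
#     reader = MobiReader('/path/to/book.mobi', logging.getLogger('mobi'))
#     reader.extract_content('/path/to/outdir', parse_cache={})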