diff --git a/ebook_converter/ebooks/mobi/mobiml.py b/ebook_converter/ebooks/mobi/mobiml.py new file mode 100644 index 0000000..94c8114 --- /dev/null +++ b/ebook_converter/ebooks/mobi/mobiml.py @@ -0,0 +1,622 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +''' +Transform XHTML/OPS-ish content into Mobipocket HTML 3.2. +''' + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import copy +import re +import numbers +from lxml import etree +from calibre.ebooks.oeb.base import namespace, barename +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.oeb.transforms.flatcss import KeyMapper +from calibre.ebooks.mobi.utils import convert_color_for_font_tag +from calibre.utils.imghdr import identify +from polyglot.builtins import unicode_type, string_or_bytes + +MBP_NS = 'http://mobipocket.com/ns/mbp' + + +def MBP(name): + return '{%s}%s' % (MBP_NS, name) + + +MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS} +INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'} +HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} +# GR: Added 'caption' to both sets +NESTABLE_TAGS = {'ol', 'ul', 'li', 'table', 'tr', 'td', 'th', 'caption'} +TABLE_TAGS = {'table', 'tr', 'td', 'th', 'caption'} + +SPECIAL_TAGS = {'hr', 'br'} +CONTENT_TAGS = {'img', 'hr', 'br'} + +NOT_VTAGS = HEADER_TAGS | NESTABLE_TAGS | TABLE_TAGS | SPECIAL_TAGS | \ + CONTENT_TAGS +LEAF_TAGS = {'base', 'basefont', 'frame', 'link', 'meta', 'area', 'br', +'col', 'hr', 'img', 'input', 'param'} +PAGE_BREAKS = {'always', 'left', 'right'} + +COLLAPSE = re.compile(r'[ \t\r\n\v]+') + + +def asfloat(value): + if not isinstance(value, numbers.Number): + return 0.0 + return float(value) + + +def isspace(text): + if not text: + return True + if '\xa0' in text: + return False + return text.isspace() + + +class BlockState(object): + + def __init__(self, body): + self.body = body + self.nested = [] + self.para = None + self.inline = None + self.anchor = None + self.vpadding = 0. + self.vmargin = 0. + self.pbreak = False + self.istate = None + self.content = False + + +class FormatState(object): + + def __init__(self): + self.rendered = False + self.left = 0. + self.halign = 'auto' + self.indent = 0. 
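+        # The fields below track the effective inline style while the tree
+        # is walked. fsize is on the 1-7 scale of the HTML <font size="...">
+        # attribute; 3 is the neutral default, so downstream code only emits
+        # a <font> wrapper when the mapped size differs from 3.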
+ self.fsize = 3 + self.ids = set() + self.italic = False + self.bold = False + self.strikethrough = False + self.underline = False + self.preserve = False + self.pre_wrap = False + self.family = 'serif' + self.bgcolor = 'transparent' + self.fgcolor = 'black' + self.href = None + self.list_num = 0 + self.attrib = {} + + def __eq__(self, other): + return self.fsize == other.fsize \ + and self.italic == other.italic \ + and self.bold == other.bold \ + and self.href == other.href \ + and self.preserve == other.preserve \ + and self.pre_wrap == other.pre_wrap \ + and self.family == other.family \ + and self.bgcolor == other.bgcolor \ + and self.fgcolor == other.fgcolor \ + and self.strikethrough == other.strikethrough \ + and self.underline == other.underline + + def __ne__(self, other): + return not self.__eq__(other) + + +class MobiMLizer(object): + + def __init__(self, ignore_tables=False): + self.ignore_tables = ignore_tables + + def __call__(self, oeb, context): + oeb.logger.info('Converting XHTML to Mobipocket markup...') + self.oeb = oeb + self.log = self.oeb.logger + self.opts = context + self.profile = profile = context.dest + self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items()) + self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys()) + self.mobimlize_spine() + + def mobimlize_spine(self): + 'Iterate over the spine and convert it to MOBIML' + for item in self.oeb.spine: + stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile) + body = item.data.find(XHTML('body')) + nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) + nbody = etree.SubElement(nroot, XHTML('body')) + self.current_spine_item = item + self.mobimlize_elem(body, stylizer, BlockState(nbody), + [FormatState()]) + item.data = nroot + # print(etree.tostring(nroot)) + + def mobimlize_font(self, ptsize): + return self.fnums[self.fmap[ptsize]] + + def mobimlize_measure(self, ptsize): + if isinstance(ptsize, string_or_bytes): + return ptsize + embase = self.profile.fbase + if round(ptsize) < embase: + return "%dpt" % int(round(ptsize)) + return "%dem" % int(round(ptsize / embase)) + + def preize_text(self, text, pre_wrap=False): + text = unicode_type(text) + if pre_wrap: + # Replace n consecutive spaces with n-1 NBSP + space + text = re.sub(r' {2,}', lambda m:('\xa0'*(len(m.group())-1) + ' '), text) + else: + text = text.replace(' ', '\xa0') + + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + lines = text.split('\n') + result = lines[:1] + for line in lines[1:]: + result.append(etree.Element(XHTML('br'))) + if line: + result.append(line) + return result + + def mobimlize_content(self, tag, text, bstate, istates): + 'Convert text content' + if text or tag != 'br': + bstate.content = True + istate = istates[-1] + para = bstate.para + if tag in SPECIAL_TAGS and not text: + para = para if para is not None else bstate.body + elif para is None or tag in ('td', 'th'): + body = bstate.body + if bstate.pbreak: + etree.SubElement(body, MBP('pagebreak')) + bstate.pbreak = False + bstate.istate = None + bstate.anchor = None + parent = bstate.nested[-1] if bstate.nested else bstate.body + indent = istate.indent + left = istate.left + if isinstance(indent, string_or_bytes): + indent = 0 + if indent < 0 and abs(indent) < left: + left += indent + indent = 0 + elif indent != 0 and abs(indent) < self.profile.fbase: + indent = (indent / abs(indent)) * self.profile.fbase + if tag in NESTABLE_TAGS and not istate.rendered: + para = wrapper = etree.SubElement( + parent, 
XHTML(tag), attrib=istate.attrib) + bstate.nested.append(para) + if tag == 'li' and len(istates) > 1: + istates[-2].list_num += 1 + para.attrib['value'] = unicode_type(istates[-2].list_num) + elif tag in NESTABLE_TAGS and istate.rendered: + para = wrapper = bstate.nested[-1] + elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0: + ems = self.profile.mobi_ems_per_blockquote + para = wrapper = etree.SubElement(parent, XHTML('blockquote')) + para = wrapper + emleft = int(round(left / self.profile.fbase)) - ems + emleft = min((emleft, 10)) + while emleft > ems / 2: + para = etree.SubElement(para, XHTML('blockquote')) + emleft -= ems + else: + para = wrapper = etree.SubElement(parent, XHTML('p')) + bstate.inline = bstate.para = para + vspace = bstate.vpadding + bstate.vmargin + bstate.vpadding = bstate.vmargin = 0 + if tag not in TABLE_TAGS: + if tag in ('ul', 'ol') and vspace > 0: + wrapper.addprevious(etree.Element(XHTML('div'), + height=self.mobimlize_measure(vspace))) + else: + wrapper.attrib['height'] = self.mobimlize_measure(vspace) + para.attrib['width'] = self.mobimlize_measure(indent) + elif tag == 'table' and vspace > 0: + vspace = int(round(vspace / self.profile.fbase)) + while vspace > 0: + wrapper.addprevious(etree.Element(XHTML('br'))) + vspace -= 1 + if istate.halign != 'auto' and isinstance(istate.halign, (bytes, unicode_type)): + if isinstance(istate.halign, bytes): + istate.halign = istate.halign.decode('utf-8') + para.attrib['align'] = istate.halign + istate.rendered = True + pstate = bstate.istate + if tag in CONTENT_TAGS: + bstate.inline = para + pstate = bstate.istate = None + try: + etree.SubElement(para, XHTML(tag), attrib=istate.attrib) + except: + print('Invalid subelement:', para, tag, istate.attrib) + raise + elif tag in TABLE_TAGS: + para.attrib['valign'] = 'top' + if istate.ids: + for id_ in istate.ids: + anchor = etree.Element(XHTML('a'), attrib={'id': id_}) + if tag == 'li': + try: + last = bstate.body[-1][-1] + except: + break + last.insert(0, anchor) + anchor.tail = last.text + last.text = None + else: + last = bstate.body[-1] + # We use append instead of addprevious so that inline + # anchors in large blocks point to the correct place. See + # https://bugs.launchpad.net/calibre/+bug/899831 + # This could potentially break if inserting an anchor at + # this point in the markup is illegal, but I cannot think + # of such a case offhand. 
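+                    # LEAF_TAGS are the void elements (br, hr, img, ...),
+                    # which cannot hold children, so the anchor must be
+                    # placed before them rather than appended inside.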
+ if barename(last.tag) in LEAF_TAGS: + last.addprevious(anchor) + else: + last.append(anchor) + + istate.ids.clear() + if not text: + return + if not pstate or istate != pstate: + inline = para + fsize = istate.fsize + href = istate.href + if not href: + bstate.anchor = None + elif pstate and pstate.href == href: + inline = bstate.anchor + else: + inline = etree.SubElement(inline, XHTML('a'), href=href) + bstate.anchor = inline + + if fsize != 3: + inline = etree.SubElement(inline, XHTML('font'), + size=unicode_type(fsize)) + if istate.family == 'monospace': + inline = etree.SubElement(inline, XHTML('tt')) + if istate.italic: + inline = etree.SubElement(inline, XHTML('i')) + if istate.bold: + inline = etree.SubElement(inline, XHTML('b')) + if istate.bgcolor is not None and istate.bgcolor != 'transparent' : + inline = etree.SubElement(inline, XHTML('span'), + bgcolor=convert_color_for_font_tag(istate.bgcolor)) + if istate.fgcolor != 'black': + inline = etree.SubElement(inline, XHTML('font'), + color=convert_color_for_font_tag(istate.fgcolor)) + if istate.strikethrough: + inline = etree.SubElement(inline, XHTML('s')) + if istate.underline: + inline = etree.SubElement(inline, XHTML('u')) + bstate.inline = inline + bstate.istate = istate + inline = bstate.inline + content = self.preize_text(text, pre_wrap=istate.pre_wrap) if istate.preserve or istate.pre_wrap else [text] + for item in content: + if isinstance(item, string_or_bytes): + if len(inline) == 0: + inline.text = (inline.text or '') + item + else: + last = inline[-1] + last.tail = (last.tail or '') + item + else: + inline.append(item) + + def mobimlize_elem(self, elem, stylizer, bstate, istates, + ignore_valign=False): + if not isinstance(elem.tag, string_or_bytes) \ + or namespace(elem.tag) != XHTML_NS: + return + style = stylizer.style(elem) + # does not exist lalalala + if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and + elem.get('data-calibre-jacket-searchable-tags', None) != '1'): + id_ = elem.get('id', None) + if id_: + # Keep anchors so people can use display:none + # to generate hidden TOCs + tail = elem.tail + elem.clear() + elem.text = None + elem.set('id', id_) + elem.tail = tail + elem.tag = XHTML('a') + else: + return + tag = barename(elem.tag) + istate = copy.copy(istates[-1]) + istate.rendered = False + istate.list_num = 0 + if tag == 'ol' and 'start' in elem.attrib: + try: + istate.list_num = int(elem.attrib['start'])-1 + except: + pass + istates.append(istate) + left = 0 + display = style['display'] + if display == 'table-cell': + display = 'inline' + elif display.startswith('table'): + display = 'block' + isblock = (not display.startswith('inline') and style['display'] != + 'none') + isblock = isblock and style['float'] == 'none' + isblock = isblock and tag != 'br' + if isblock: + bstate.para = None + istate.halign = style['text-align'] + rawti = style._get('text-indent') + istate.indent = style['text-indent'] + if hasattr(rawti, 'strip') and '%' in rawti: + # We have a percentage text indent, these can come out looking + # too large if the user chooses a wide output profile like + # tablet + istate.indent = min(style._unit_convert(rawti, base=500), istate.indent) + if style['margin-left'] == 'auto' \ + and style['margin-right'] == 'auto': + istate.halign = 'center' + margin = asfloat(style['margin-left']) + padding = asfloat(style['padding-left']) + if tag != 'body': + left = margin + padding + istate.left += left + vmargin = asfloat(style['margin-top']) + 
bstate.vmargin = max((bstate.vmargin, vmargin)) + vpadding = asfloat(style['padding-top']) + if vpadding > 0: + bstate.vpadding += bstate.vmargin + bstate.vmargin = 0 + bstate.vpadding += vpadding + elif not istate.href: + margin = asfloat(style['margin-left']) + padding = asfloat(style['padding-left']) + lspace = margin + padding + if lspace > 0: + spaces = int(round((lspace * 3) / style['font-size'])) + elem.text = ('\xa0' * spaces) + (elem.text or '') + margin = asfloat(style['margin-right']) + padding = asfloat(style['padding-right']) + rspace = margin + padding + if rspace > 0: + spaces = int(round((rspace * 3) / style['font-size'])) + if len(elem) == 0: + elem.text = (elem.text or '') + ('\xa0' * spaces) + else: + last = elem[-1] + last.text = (last.text or '') + ('\xa0' * spaces) + if bstate.content and style['page-break-before'] in PAGE_BREAKS: + bstate.pbreak = True + istate.fsize = self.mobimlize_font(style['font-size']) + istate.italic = True if style['font-style'] == 'italic' else False + weight = style['font-weight'] + istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400 + istate.preserve = style['white-space'] == 'pre' + istate.pre_wrap = style['white-space'] == 'pre-wrap' + istate.bgcolor = style['background-color'] + istate.fgcolor = style['color'] + istate.strikethrough = style.effective_text_decoration == 'line-through' + istate.underline = style.effective_text_decoration == 'underline' + ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else '' + if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'): + istate.family = 'monospace' + elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or + 'arial' in ff or 'helvetica' in ff): + istate.family = 'sans-serif' + else: + istate.family = 'serif' + if 'id' in elem.attrib: + istate.ids.add(elem.attrib['id']) + if 'name' in elem.attrib: + istate.ids.add(elem.attrib['name']) + if tag == 'a' and 'href' in elem.attrib: + istate.href = elem.attrib['href'] + istate.attrib.clear() + if tag == 'img' and 'src' in elem.attrib: + istate.attrib['src'] = elem.attrib['src'] + istate.attrib['align'] = 'baseline' + cssdict = style.cssdict() + valign = cssdict.get('vertical-align', None) + if valign in ('top', 'bottom', 'middle'): + istate.attrib['align'] = valign + for prop in ('width', 'height'): + if cssdict[prop] != 'auto': + value = style[prop] + if value == getattr(self.profile, prop): + result = '100%' + else: + # Amazon's renderer does not support + # img sizes in units other than px + # See #7520 for test case + try: + pixs = int(round(float(value) / + (72/self.profile.dpi))) + except: + continue + result = unicode_type(pixs) + istate.attrib[prop] = result + if 'width' not in istate.attrib or 'height' not in istate.attrib: + href = self.current_spine_item.abshref(elem.attrib['src']) + try: + item = self.oeb.manifest.hrefs[urlnormalize(href)] + except: + self.oeb.logger.warn('Failed to find image:', + href) + else: + try: + width, height = identify(item.data)[1:] + except Exception: + self.oeb.logger.warn('Invalid image:', href) + else: + if 'width' not in istate.attrib and 'height' not in \ + istate.attrib: + istate.attrib['width'] = unicode_type(width) + istate.attrib['height'] = unicode_type(height) + else: + ar = width / height + if 'width' not in istate.attrib: + try: + width = int(istate.attrib['height'])*ar + except: + pass + istate.attrib['width'] = unicode_type(int(width)) + else: + try: + height = int(istate.attrib['width'])/ar + except: + pass + 
istate.attrib['height'] = unicode_type(int(height)) + item.unload_data_from_memory() + elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}: + raww = style._get('width') + if hasattr(raww, 'strip') and '%' in raww: + istate.attrib['width'] = raww + else: + prop = style['width'] / self.profile.width + istate.attrib['width'] = "%d%%" % int(round(prop * 100)) + elif display == 'table': + tag = 'table' + elif display == 'table-row': + tag = 'tr' + elif display == 'table-cell': + tag = 'td' + if tag in TABLE_TAGS and self.ignore_tables: + tag = 'span' if tag == 'td' else 'div' + + if tag in ('table', 'td', 'tr'): + col = style.backgroundColor + if col: + elem.set('bgcolor', col) + css = style.cssdict() + if 'border' in css or 'border-width' in css: + elem.set('border', '1') + if tag in TABLE_TAGS: + for attr in ('rowspan', 'colspan', 'width', 'border', 'scope', + 'bgcolor'): + if attr in elem.attrib: + istate.attrib[attr] = elem.attrib[attr] + if tag == 'q': + t = elem.text + if not t: + t = '' + elem.text = '\u201c' + t + t = elem.tail + if not t: + t = '' + elem.tail = '\u201d' + t + text = None + if elem.text: + if istate.preserve or istate.pre_wrap: + text = elem.text + elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and + elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS): + text = None + else: + text = COLLAPSE.sub(' ', elem.text) + valign = style['vertical-align'] + not_baseline = valign in ('super', 'sub', 'text-top', + 'text-bottom', 'top', 'bottom') or ( + isinstance(valign, numbers.Number) and abs(valign) != 0) + issup = valign in ('super', 'text-top', 'top') or ( + isinstance(valign, numbers.Number) and valign > 0) + vtag = 'sup' if issup else 'sub' + if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock: + nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) + vbstate = BlockState(etree.SubElement(nroot, XHTML('body'))) + vbstate.para = etree.SubElement(vbstate.body, XHTML('p')) + self.mobimlize_elem(elem, stylizer, vbstate, istates, + ignore_valign=True) + if len(istates) > 0: + istates.pop() + if len(istates) == 0: + istates.append(FormatState()) + at_start = bstate.para is None + if at_start: + self.mobimlize_content('span', '', bstate, istates) + parent = bstate.para if bstate.inline is None else bstate.inline + if parent is not None: + vtag = etree.SubElement(parent, XHTML(vtag)) + vtag = etree.SubElement(vtag, XHTML('small')) + # Add anchors + for child in vbstate.body: + if child is not vbstate.para: + vtag.append(child) + else: + break + if vbstate.para is not None: + if vbstate.para.text: + vtag.text = vbstate.para.text + for child in vbstate.para: + vtag.append(child) + return + + if tag == 'blockquote': + old_mim = self.opts.mobi_ignore_margins + self.opts.mobi_ignore_margins = False + + if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or ( + # We have an id but no text and no children, the id should still + # be added. 
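+                # (e.g. an empty <a id="..."/> that exists purely as a
+                # link target)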
+ istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and + len(elem)==0)): + if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib: + try: + value = int(elem.attrib['value']) + istates[-2].list_num = value - 1 + except: + pass + self.mobimlize_content(tag, text, bstate, istates) + for child in elem: + self.mobimlize_elem(child, stylizer, bstate, istates) + tail = None + if child.tail: + if istate.preserve or istate.pre_wrap: + tail = child.tail + elif bstate.para is None and isspace(child.tail): + tail = None + else: + tail = COLLAPSE.sub(' ', child.tail) + if tail: + self.mobimlize_content(tag, tail, bstate, istates) + + if tag == 'blockquote': + self.opts.mobi_ignore_margins = old_mim + + if bstate.content and style['page-break-after'] in PAGE_BREAKS: + bstate.pbreak = True + if isblock: + para = bstate.para + if para is not None and para.text == '\xa0' and len(para) < 1: + if style.height > 2: + para.getparent().replace(para, etree.Element(XHTML('br'))) + else: + # This is too small to be rendered effectively, drop it + para.getparent().remove(para) + bstate.para = None + bstate.istate = None + vmargin = asfloat(style['margin-bottom']) + bstate.vmargin = max((bstate.vmargin, vmargin)) + vpadding = asfloat(style['padding-bottom']) + if vpadding > 0: + bstate.vpadding += bstate.vmargin + bstate.vmargin = 0 + bstate.vpadding += vpadding + if bstate.nested and bstate.nested[-1].tag == elem.tag: + bstate.nested.pop() + istates.pop() diff --git a/ebook_converter/ebooks/mobi/writer2/indexer.py b/ebook_converter/ebooks/mobi/writer2/indexer.py new file mode 100644 index 0000000..c5c9d81 --- /dev/null +++ b/ebook_converter/ebooks/mobi/writer2/indexer.py @@ -0,0 +1,891 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import numbers +from struct import pack +import io +from collections import OrderedDict, defaultdict + +from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, + encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_) +from polyglot.builtins import filter, iteritems, itervalues, map, range + + +class CNCX(CNCX_): # {{{ + + def __init__(self, toc, is_periodical): + strings = [] + for item in toc.iterdescendants(breadth_first=True): + strings.append(item.title) + if is_periodical: + strings.append(item.klass) + if item.author: + strings.append(item.author) + if item.description: + strings.append(item.description) + CNCX_.__init__(self, strings) +# }}} + + +class TAGX(object): # {{{ + + BITMASKS = {11:0b1} + BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])}) + BITMASKS.update({x:(1 << i) for i, x in enumerate([69, 70, 71, 72, 73])}) + + NUM_VALUES = defaultdict(lambda :1) + NUM_VALUES[11] = 3 + NUM_VALUES[0] = 0 + + def __init__(self): + self.byts = bytearray() + + def add_tag(self, tag): + buf = self.byts + buf.append(tag) + buf.append(self.NUM_VALUES[tag]) + # bitmask + buf.append(self.BITMASKS[tag] if tag else 0) + # eof + buf.append(0 if tag else 1) + + def header(self, control_byte_count): + header = b'TAGX' + # table length, control byte count + header += pack(b'>II', 12+len(self.byts), control_byte_count) + return header + + @property + def periodical(self): + ''' + TAGX block for the Primary index header of a periodical + ''' + list(map(self.add_tag, (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, + 73, 0))) + return 
self.header(2) + bytes(self.byts) + + @property + def secondary(self): + ''' + TAGX block for the secondary index header of a periodical + ''' + list(map(self.add_tag, (11, 0))) + return self.header(1) + bytes(self.byts) + + @property + def flat_book(self): + ''' + TAGX block for the primary index header of a flat book + ''' + list(map(self.add_tag, (1, 2, 3, 4, 0))) + return self.header(1) + bytes(self.byts) + + +# }}} + +# Index Entries {{{ + +class IndexEntry(object): + + TAG_VALUES = { + 'offset': 1, + 'size': 2, + 'label_offset': 3, + 'depth': 4, + 'class_offset': 5, + 'secondary': 11, + 'parent_index': 21, + 'first_child_index': 22, + 'last_child_index': 23, + 'image_index': 69, + 'desc_offset': 70, + 'author_offset': 71, + + } + RTAG_MAP = {v:k for k, v in iteritems(TAG_VALUES)} # noqa + + def __init__(self, offset, label_offset): + self.offset, self.label_offset = offset, label_offset + self.depth, self.class_offset = 0, None + self.control_byte_count = 1 + + self.length = 0 + self.index = 0 + + self.parent_index = None + self.first_child_index = None + self.last_child_index = None + + self.image_index = None + self.author_offset = None + self.desc_offset = None + + def __repr__(self): + return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,' + ' parent_index=%r)')%(self.offset, self.depth, self.length, + self.index, self.parent_index) + + @property + def size(self): + return self.length + + @size.setter + def size(self, val): + self.length = val + + @property + def next_offset(self): + return self.offset + self.length + + @property + def tag_nums(self): + for i in range(1, 5): + yield i + for attr in ('class_offset', 'parent_index', 'first_child_index', + 'last_child_index'): + if getattr(self, attr) is not None: + yield self.TAG_VALUES[attr] + + @property + def entry_type(self): + ans = 0 + for tag in self.tag_nums: + ans |= TAGX.BITMASKS[tag] + return ans + + def attr_for_tag(self, tag): + return self.RTAG_MAP[tag] + + @property + def bytestring(self): + buf = io.BytesIO() + if isinstance(self.index, numbers.Integral): + buf.write(encode_number_as_hex(self.index)) + else: + raw = bytearray(self.index.encode('ascii')) + raw.insert(0, len(raw)) + buf.write(bytes(raw)) + et = self.entry_type + buf.write(bytes(bytearray([et]))) + + if self.control_byte_count == 2: + flags = 0 + for attr in ('image_index', 'desc_offset', 'author_offset'): + val = getattr(self, attr) + if val is not None: + tag = self.TAG_VALUES[attr] + bm = TAGX.BITMASKS[tag] + flags |= bm + buf.write(bytes(bytearray([flags]))) + + for tag in self.tag_nums: + attr = self.attr_for_tag(tag) + val = getattr(self, attr) + if isinstance(val, numbers.Integral): + val = [val] + for x in val: + buf.write(encint(x)) + + if self.control_byte_count == 2: + for attr in ('image_index', 'desc_offset', 'author_offset'): + val = getattr(self, attr) + if val is not None: + buf.write(encint(val)) + + ans = buf.getvalue() + return ans + + +class PeriodicalIndexEntry(IndexEntry): + + def __init__(self, offset, label_offset, class_offset, depth): + IndexEntry.__init__(self, offset, label_offset) + self.depth = depth + self.class_offset = class_offset + self.control_byte_count = 2 + + +class SecondaryIndexEntry(IndexEntry): + + INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70, + 'mastheadImage':69} + + def __init__(self, index): + IndexEntry.__init__(self, 0, 0) + self.index = index + + tag = self.INDEX_MAP[index] + + # The values for this index entry + # I dont know what the 5 means, it is not the number of 
entries + self.secondary = [5 if tag == min( + itervalues(self.INDEX_MAP)) else 0, 0, tag] + + @property + def tag_nums(self): + yield 11 + + @property + def entry_type(self): + return 1 + + @classmethod + def entries(cls): + rmap = {v:k for k,v in iteritems(cls.INDEX_MAP)} + for tag in sorted(rmap, reverse=True): + yield cls(rmap[tag]) + +# }}} + + +class TBS(object): # {{{ + + ''' + Take the list of index nodes starting/ending on a record and calculate the + trailing byte sequence for the record. + ''' + + def __init__(self, data, is_periodical, first=False, section_map={}, + after_first=False): + self.section_map = section_map + + if is_periodical: + # The starting bytes. + # The value is zero which I think indicates the periodical + # index entry. The values for the various flags seem to be + # unused. If the 0b100 is present, it means that the record + # deals with section 1 (or is the final record with section + # transitions). + self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3) + self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0}, + flag_size=3) + self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0}, + flag_size=3) + self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: + 0}, flag_size=3) + + if not data: + byts = b'' + if after_first: + # This can happen if a record contains only text between + # the periodical start and the first section + byts = self.type_011 + self.bytestring = byts + else: + depth_map = defaultdict(list) + for x in ('starts', 'ends', 'completes'): + for idx in data[x]: + depth_map[idx.depth].append(idx) + for l in itervalues(depth_map): + l.sort(key=lambda x:x.offset) + self.periodical_tbs(data, first, depth_map) + else: + if not data: + self.bytestring = b'' + else: + self.book_tbs(data, first) + + def periodical_tbs(self, data, first, depth_map): + buf = io.BytesIO() + + has_section_start = (depth_map[1] and + set(depth_map[1]).intersection(set(data['starts']))) + spanner = data['spans'] + parent_section_index = -1 + + if depth_map[0]: + # We have a terminal record + + # Find the first non periodical node + first_node = None + for nodes in (depth_map[1], depth_map[2]): + for node in nodes: + if (first_node is None or (node.offset, node.depth) < + (first_node.offset, first_node.depth)): + first_node = node + + typ = (self.type_110 if has_section_start else self.type_010) + + # parent_section_index is needed for the last record + if first_node is not None and first_node.depth > 0: + parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index) + else: + parent_section_index = max(iter(self.section_map)) + + else: + # Non terminal record + + if spanner is not None: + # record is spanned by a single article + parent_section_index = spanner.parent_index + typ = (self.type_110 if parent_section_index == 1 else + self.type_010) + elif not depth_map[1]: + # has only article nodes, i.e. 
spanned by a section + parent_section_index = depth_map[2][0].parent_index + typ = (self.type_111 if parent_section_index == 1 else + self.type_010) + else: + # has section transitions + if depth_map[2]: + parent_section_index = depth_map[2][0].parent_index + else: + parent_section_index = depth_map[1][0].index + typ = self.type_011 + + buf.write(typ) + + if typ not in (self.type_110, self.type_111) and parent_section_index > 0: + extra = {} + # Write starting section information + if spanner is None: + num_articles = len([a for a in depth_map[1] if a.parent_index == parent_section_index]) + if not depth_map[1]: + extra = {0b0001: 0} + if num_articles > 1: + extra = {0b0100: num_articles} + buf.write(encode_tbs(parent_section_index, extra)) + + if spanner is None: + articles = depth_map[2] + sections = {self.section_map[a.parent_index] for a in + articles} + sections = sorted(sections, key=lambda x:x.offset) + section_map = {s:[a for a in articles if a.parent_index == + s.index] for s in sections} + for i, section in enumerate(sections): + # All the articles in this record that belong to section + articles = section_map[section] + first_article = articles[0] + last_article = articles[-1] + num = len(articles) + last_article_ends = (last_article in data['ends'] or + last_article in data['completes']) + + try: + next_sec = sections[i+1] + except: + next_sec = None + + extra = {} + if num > 1: + extra[0b0100] = num + if False and i == 0 and next_sec is not None: + # Write offset to next section from start of record + # I can't figure out exactly when Kindlegen decides to + # write this so I have disabled it for now. + extra[0b0001] = next_sec.offset - data['offset'] + + buf.write(encode_tbs(first_article.index-section.index, extra)) + + if next_sec is not None: + buf.write(encode_tbs(last_article.index-next_sec.index, + {0b1000: 0})) + + # If a section TOC starts and extends into the next record add + # a trailing vwi. We detect this by TBS type==3, processing last + # section present in the record, and the last article in that + # section either ends or completes and doesn't finish + # on the last byte of the record. 
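+                    # Worked example with illustrative numbers: RECORD_SIZE is
+                    # 4096, so an article at offset 4000 with size 96 ends
+                    # exactly on a record boundary, (4000 + 96) % 4096 == 0,
+                    # and no trailing vwi is needed; with size 100 it ends
+                    # mid-record and the extra vwi below is written.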
+ elif (typ == self.type_011 and last_article_ends and + ((last_article.offset+last_article.size) % RECORD_SIZE > 0) + ): + buf.write(encode_tbs(last_article.index-section.index-1, + {0b1000: 0})) + + else: + buf.write(encode_tbs(spanner.index - parent_section_index, + {0b0001: 0})) + + self.bytestring = buf.getvalue() + + def book_tbs(self, data, first): + spanner = data['spans'] + if spanner is not None: + self.bytestring = encode_tbs(spanner.index, {0b010: 0, 0b001: 0}, + flag_size=3) + else: + starts, completes, ends = (data['starts'], data['completes'], + data['ends']) + if (not completes and ( + (len(starts) == 1 and not ends) or (len(ends) == 1 and not + starts))): + node = starts[0] if starts else ends[0] + self.bytestring = encode_tbs(node.index, {0b010: 0}, flag_size=3) + else: + nodes = [] + for x in (starts, completes, ends): + nodes.extend(x) + nodes.sort(key=lambda x:x.index) + self.bytestring = encode_tbs(nodes[0].index, {0b010:0, + 0b100: len(nodes)}, flag_size=3) + +# }}} + + +class Indexer(object): # {{{ + + def __init__(self, serializer, number_of_text_records, + size_of_last_text_record, masthead_offset, is_periodical, + opts, oeb): + self.serializer = serializer + self.number_of_text_records = number_of_text_records + self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) + + size_of_last_text_record) + self.masthead_offset = masthead_offset + self.secondary_record_offset = None + + self.oeb = oeb + self.log = oeb.log + self.opts = opts + + self.is_periodical = is_periodical + if self.is_periodical and self.masthead_offset is None: + raise ValueError('Periodicals must have a masthead') + + self.log('Generating MOBI index for a %s'%('periodical' if + self.is_periodical else 'book')) + self.is_flat_periodical = False + if self.is_periodical: + periodical_node = next(iter(oeb.toc)) + sections = tuple(periodical_node) + self.is_flat_periodical = len(sections) == 1 + + self.records = [] + + if self.is_periodical: + # Ensure all articles have an author and description before + # creating the CNCX + for node in oeb.toc.iterdescendants(): + if node.klass == 'article': + aut, desc = node.author, node.description + if not aut: + aut = _('Unknown') + if not desc: + desc = _('No details available') + node.author, node.description = aut, desc + + self.cncx = CNCX(oeb.toc, self.is_periodical) + + if self.is_periodical: + self.indices = self.create_periodical_index() + else: + self.indices = self.create_book_index() + + if not self.indices: + raise ValueError('No valid entries in TOC, cannot generate index') + + self.records.append(self.create_index_record()) + self.records.insert(0, self.create_header()) + self.records.extend(self.cncx.records) + + if is_periodical: + self.secondary_record_offset = len(self.records) + self.records.append(self.create_header(secondary=True)) + self.records.append(self.create_index_record(secondary=True)) + + self.calculate_trailing_byte_sequences() + + def create_index_record(self, secondary=False): # {{{ + header_length = 192 + buf = io.BytesIO() + indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices + + # Write index entries + offsets = [] + for i in indices: + offsets.append(buf.tell()) + buf.write(i.bytestring) + + index_block = align_block(buf.getvalue()) + + # Write offsets to index entries as an IDXT block + idxt_block = b'IDXT' + buf.seek(0), buf.truncate(0) + for offset in offsets: + buf.write(pack(b'>H', header_length+offset)) + idxt_block = align_block(idxt_block + buf.getvalue()) + body = index_block + 
idxt_block + + header = b'INDX' + buf.seek(0), buf.truncate(0) + buf.write(pack(b'>I', header_length)) + buf.write(b'\0'*4) # Unknown + buf.write(pack(b'>I', 1)) # Header type? Or index record number? + buf.write(b'\0'*4) # Unknown + # IDXT block offset + buf.write(pack(b'>I', header_length + len(index_block))) + # Number of index entries + buf.write(pack(b'>I', len(offsets))) + # Unknown + buf.write(b'\xff'*8) + # Unknown + buf.write(b'\0'*156) + + header += buf.getvalue() + + ans = header + body + if len(ans) > 0x10000: + raise ValueError('Too many entries (%d) in the TOC'%len(offsets)) + return ans + # }}} + + def create_header(self, secondary=False): # {{{ + buf = io.BytesIO() + if secondary: + tagx_block = TAGX().secondary + else: + tagx_block = (TAGX().periodical if self.is_periodical else + TAGX().flat_book) + header_length = 192 + + # Ident 0 - 4 + buf.write(b'INDX') + + # Header length 4 - 8 + buf.write(pack(b'>I', header_length)) + + # Unknown 8-16 + buf.write(b'\0'*8) + + # Index type: 0 - normal, 2 - inflection 16 - 20 + buf.write(pack(b'>I', 2)) + + # IDXT offset 20-24 + buf.write(pack(b'>I', 0)) # Filled in later + + # Number of index records 24-28 + buf.write(pack(b'>I', 1 if secondary else len(self.records))) + + # Index Encoding 28-32 + buf.write(pack(b'>I', 65001)) # utf-8 + + # Unknown 32-36 + buf.write(b'\xff'*4) + + # Number of index entries 36-40 + indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices + buf.write(pack(b'>I', len(indices))) + + # ORDT offset 40-44 + buf.write(pack(b'>I', 0)) + + # LIGT offset 44-48 + buf.write(pack(b'>I', 0)) + + # Number of LIGT entries 48-52 + buf.write(pack(b'>I', 0)) + + # Number of CNCX records 52-56 + buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records))) + + # Unknown 56-180 + buf.write(b'\0'*124) + + # TAGX offset 180-184 + buf.write(pack(b'>I', header_length)) + + # Unknown 184-192 + buf.write(b'\0'*8) + + # TAGX block + buf.write(tagx_block) + + num = len(indices) + + # The index of the last entry in the NCX + idx = indices[-1].index + if isinstance(idx, numbers.Integral): + idx = encode_number_as_hex(idx) + else: + idx = idx.encode('ascii') + idx = (bytes(bytearray([len(idx)]))) + idx + buf.write(idx) + + # The number of entries in the NCX + buf.write(pack(b'>H', num)) + + # Padding + pad = (4 - (buf.tell()%4))%4 + if pad: + buf.write(b'\0'*pad) + + idxt_offset = buf.tell() + + buf.write(b'IDXT') + buf.write(pack(b'>H', header_length + len(tagx_block))) + buf.write(b'\0') + buf.seek(20) + buf.write(pack(b'>I', idxt_offset)) + + return align_block(buf.getvalue()) + # }}} + + def create_book_index(self): # {{{ + indices = [] + seen = set() + id_offsets = self.serializer.id_offsets + + # Flatten toc so that chapter to chapter jumps work with all sub + # chapter levels as well + for node in self.oeb.toc.iterdescendants(): + try: + offset = id_offsets[node.href] + label = self.cncx[node.title] + except: + self.log.warn('TOC item %s [%s] not found in document'%( + node.title, node.href)) + continue + + if offset in seen: + continue + seen.add(offset) + + indices.append(IndexEntry(offset, label)) + + indices.sort(key=lambda x:x.offset) + + # Set lengths + for i, index in enumerate(indices): + try: + next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + # Remove empty indices + indices = [x for x in indices if x.length > 0] + + # Reset lengths in case any were removed + for i, index in enumerate(indices): + try: + 
next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + # Set index values + for index, x in enumerate(indices): + x.index = index + + return indices + + # }}} + + def create_periodical_index(self): # {{{ + periodical_node = next(iter(self.oeb.toc)) + periodical_node_offset = self.serializer.body_start_offset + periodical_node_size = (self.serializer.body_end_offset - + periodical_node_offset) + + normalized_sections = [] + + id_offsets = self.serializer.id_offsets + + periodical = PeriodicalIndexEntry(periodical_node_offset, + self.cncx[periodical_node.title], + self.cncx[periodical_node.klass], 0) + periodical.length = periodical_node_size + periodical.first_child_index = 1 + periodical.image_index = self.masthead_offset + + seen_sec_offsets = set() + seen_art_offsets = set() + + for sec in periodical_node: + normalized_articles = [] + try: + offset = id_offsets[sec.href] + label = self.cncx[sec.title] + klass = self.cncx[sec.klass] + except: + continue + if offset in seen_sec_offsets: + continue + + seen_sec_offsets.add(offset) + section = PeriodicalIndexEntry(offset, label, klass, 1) + section.parent_index = 0 + + for art in sec: + try: + offset = id_offsets[art.href] + label = self.cncx[art.title] + klass = self.cncx[art.klass] + except: + continue + if offset in seen_art_offsets: + continue + seen_art_offsets.add(offset) + article = PeriodicalIndexEntry(offset, label, klass, 2) + normalized_articles.append(article) + article.author_offset = self.cncx[art.author] + article.desc_offset = self.cncx[art.description] + if getattr(art, 'toc_thumbnail', None) is not None: + try: + ii = self.serializer.images[art.toc_thumbnail] - 1 + if ii > -1: + article.image_index = ii + except KeyError: + pass # Image not found in serializer + + if normalized_articles: + normalized_articles.sort(key=lambda x:x.offset) + normalized_sections.append((section, normalized_articles)) + + normalized_sections.sort(key=lambda x:x[0].offset) + + # Set lengths + for s, x in enumerate(normalized_sections): + sec, normalized_articles = x + try: + sec.length = normalized_sections[s+1][0].offset - sec.offset + except: + sec.length = self.serializer.body_end_offset - sec.offset + for i, art in enumerate(normalized_articles): + try: + art.length = normalized_articles[i+1].offset - art.offset + except: + art.length = sec.offset + sec.length - art.offset + + # Filter + for i, x in list(enumerate(normalized_sections)): + sec, normalized_articles = x + normalized_articles = list(filter(lambda x: x.length > 0, + normalized_articles)) + normalized_sections[i] = (sec, normalized_articles) + + normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1], + normalized_sections)) + + # Set indices + i = 0 + for sec, articles in normalized_sections: + i += 1 + sec.index = i + sec.parent_index = 0 + + for sec, articles in normalized_sections: + for art in articles: + i += 1 + art.index = i + + art.parent_index = sec.index + + for sec, normalized_articles in normalized_sections: + sec.first_child_index = normalized_articles[0].index + sec.last_child_index = normalized_articles[-1].index + + # Set lengths again to close up any gaps left by filtering + for s, x in enumerate(normalized_sections): + sec, articles = x + try: + next_offset = normalized_sections[s+1][0].offset + except: + next_offset = self.serializer.body_end_offset + sec.length = next_offset - sec.offset + + for a, art in enumerate(articles): + try: + next_offset = 
articles[a+1].offset + except: + next_offset = sec.next_offset + art.length = next_offset - art.offset + + # Sanity check + for s, x in enumerate(normalized_sections): + sec, articles = x + try: + next_sec = normalized_sections[s+1][0] + except: + if (sec.length == 0 or sec.next_offset != + self.serializer.body_end_offset): + raise ValueError('Invalid section layout') + else: + if next_sec.offset != sec.next_offset or sec.length == 0: + raise ValueError('Invalid section layout') + for a, art in enumerate(articles): + try: + next_art = articles[a+1] + except: + if (art.length == 0 or art.next_offset != + sec.next_offset): + raise ValueError('Invalid article layout') + else: + if art.length == 0 or art.next_offset != next_art.offset: + raise ValueError('Invalid article layout') + + # Flatten + indices = [periodical] + for sec, articles in normalized_sections: + indices.append(sec) + periodical.last_child_index = sec.index + + for sec, articles in normalized_sections: + for a in articles: + indices.append(a) + + return indices + # }}} + + # TBS {{{ + def calculate_trailing_byte_sequences(self): + self.tbs_map = {} + found_node = False + sections = [i for i in self.indices if i.depth == 1] + section_map = OrderedDict((i.index, i) for i in + sorted(sections, key=lambda x:x.offset)) + + deepest = max(i.depth for i in self.indices) + + for i in range(self.number_of_text_records): + offset = i * RECORD_SIZE + next_offset = offset + RECORD_SIZE + data = {'ends':[], 'completes':[], 'starts':[], + 'spans':None, 'offset':offset, 'record_number':i+1} + + for index in self.indices: + + if index.offset >= next_offset: + # Node starts after current record + if index.depth == deepest: + break + else: + continue + if index.next_offset <= offset: + # Node ends before current record + continue + if index.offset >= offset: + # Node starts in current record + if index.next_offset <= next_offset: + # Node ends in current record + data['completes'].append(index) + else: + data['starts'].append(index) + else: + # Node starts before current records + if index.next_offset <= next_offset: + # Node ends in current record + data['ends'].append(index) + elif index.depth == deepest: + data['spans'] = index + + if (data['ends'] or data['completes'] or data['starts'] or + data['spans'] is not None): + self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not + found_node, section_map=section_map) + found_node = True + else: + self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False, + after_first=found_node, section_map=section_map) + + def get_trailing_byte_sequence(self, num): + return self.tbs_map[num].bytestring + # }}} + +# }}} diff --git a/ebook_converter/ebooks/mobi/writer2/main.py b/ebook_converter/ebooks/mobi/writer2/main.py new file mode 100644 index 0000000..884628d --- /dev/null +++ b/ebook_converter/ebooks/mobi/writer2/main.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import io, random, time +from struct import pack + +from calibre.ebooks import normalize +from calibre.ebooks.mobi.writer2.serializer import Serializer +from calibre.ebooks.compression.palmdoc import compress_doc +from calibre.ebooks.mobi.langcodes import iana2mobi +from calibre.utils.filenames import ascii_filename +from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED) +from 
calibre.ebooks.mobi.utils import (encint, encode_trailing_data, + align_block, detect_periodical, RECORD_SIZE, create_text_record) +from calibre.ebooks.mobi.writer2.indexer import Indexer +from polyglot.builtins import iteritems, unicode_type, range + +# Disabled as I dont care about uncrossable breaks +WRITE_UNCROSSABLE_BREAKS = False +NULL_INDEX = 0xffffffff + +FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ + b'\xff'*4) + + +def fcis(text_length): + fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + fcis += pack(b'>I', text_length) + fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + return fcis + + +class MobiWriter(object): + + def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True): + self.opts = opts + self.resources = resources + self.kf8 = kf8 + self.for_joint = kf8 is not None + self.write_page_breaks_after_item = write_page_breaks_after_item + self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC + self.prefer_author_sort = opts.prefer_author_sort + self.last_text_record_idx = 1 + + def __call__(self, oeb, path_or_stream): + self.log = oeb.log + pt = None + if oeb.metadata.publication_type: + x = unicode_type(oeb.metadata.publication_type[0]).split(':') + if len(x) > 1: + pt = x[1].lower() + self.publication_type = pt + + if hasattr(path_or_stream, 'write'): + return self.dump_stream(oeb, path_or_stream) + with open(path_or_stream, 'w+b') as stream: + return self.dump_stream(oeb, stream) + + def write(self, *args): + for datum in args: + self.stream.write(datum) + + def tell(self): + return self.stream.tell() + + def dump_stream(self, oeb, stream): + self.oeb = oeb + self.stream = stream + self.records = [None] + self.generate_content() + self.generate_joint_record0() if self.for_joint else self.generate_record0() + self.write_header() + self.write_content() + + def generate_content(self): + self.is_periodical = detect_periodical(self.oeb.toc, self.oeb.log) + # Image records are stored in their own list, they are merged into the + # main record list at the end + self.generate_images() + self.generate_text() + # The uncrossable breaks trailing entries come before the indexing + # trailing entries + self.write_uncrossable_breaks() + # Index records come after text records + self.generate_index() + + # Indexing {{{ + def generate_index(self): + self.primary_index_record_idx = None + if self.oeb.toc.count() < 1: + self.log.warn('No TOC, MOBI index not generated') + return + try: + self.indexer = Indexer(self.serializer, self.last_text_record_idx, + len(self.records[self.last_text_record_idx]), + self.masthead_offset, self.is_periodical, + self.opts, self.oeb) + except: + self.log.exception('Failed to generate MOBI index:') + else: + self.primary_index_record_idx = len(self.records) + for i in range(self.last_text_record_idx + 1): + if i == 0: + continue + tbs = self.indexer.get_trailing_byte_sequence(i) + self.records[i] += encode_trailing_data(tbs) + self.records.extend(self.indexer.records) + + # }}} + + def write_uncrossable_breaks(self): # {{{ + ''' + Write information about uncrossable breaks (non linear items in + the spine. 
+ ''' + if not WRITE_UNCROSSABLE_BREAKS: + return + + breaks = self.serializer.breaks + + for i in range(1, self.last_text_record_idx+1): + offset = i * RECORD_SIZE + pbreak = 0 + running = offset + + buf = io.BytesIO() + + while breaks and (breaks[0] - offset) < RECORD_SIZE: + pbreak = (breaks.pop(0) - running) >> 3 + encoded = encint(pbreak) + buf.write(encoded) + running += pbreak << 3 + encoded = encode_trailing_data(buf.getvalue()) + self.records[i] += encoded + # }}} + + # Images {{{ + + def generate_images(self): + resources = self.resources + image_records = resources.records + self.image_map = resources.item_map + self.masthead_offset = resources.masthead_offset + self.cover_offset = resources.cover_offset + self.thumbnail_offset = resources.thumbnail_offset + + if image_records and image_records[0] is None: + raise ValueError('Failed to find masthead image in manifest') + + # }}} + + def generate_text(self): # {{{ + self.oeb.logger.info('Serializing markup content...') + self.serializer = Serializer(self.oeb, self.image_map, + self.is_periodical, + write_page_breaks_after_item=self.write_page_breaks_after_item) + text = self.serializer() + self.text_length = len(text) + text = io.BytesIO(text) + nrecords = 0 + records_size = 0 + + if self.compression != UNCOMPRESSED: + self.oeb.logger.info(' Compressing markup content...') + + while text.tell() < self.text_length: + data, overlap = create_text_record(text) + if self.compression == PALMDOC: + data = compress_doc(data) + + data += overlap + data += pack(b'>B', len(overlap)) + + self.records.append(data) + records_size += len(data) + nrecords += 1 + + self.last_text_record_idx = nrecords + self.first_non_text_record_idx = nrecords + 1 + # Pad so that the next records starts at a 4 byte boundary + if records_size % 4 != 0: + self.records.append(b'\x00'*(records_size % 4)) + self.first_non_text_record_idx += 1 + # }}} + + def generate_record0(self): # MOBI header {{{ + metadata = self.oeb.metadata + bt = 0x002 + if self.primary_index_record_idx is not None: + if False and self.indexer.is_flat_periodical: + # Disabled as setting this to 0x102 causes the Kindle to not + # auto archive the issues + bt = 0x102 + elif self.indexer.is_periodical: + # If you change this, remember to change the cdetype in the EXTH + # header as well + bt = 0x103 if self.indexer.is_flat_periodical else 0x101 + + from calibre.ebooks.mobi.writer8.exth import build_exth + exth = build_exth(metadata, + prefer_author_sort=self.opts.prefer_author_sort, + is_periodical=self.is_periodical, + share_not_sync=self.opts.share_not_sync, + cover_offset=self.cover_offset, + thumbnail_offset=self.thumbnail_offset, + start_offset=self.serializer.start_offset, mobi_doctype=bt + ) + first_image_record = None + if self.resources: + used_images = self.serializer.used_images + first_image_record = len(self.records) + self.resources.serialize(self.records, used_images) + last_content_record = len(self.records) - 1 + + # FCIS/FLIS (Seems to serve no purpose) + flis_number = len(self.records) + self.records.append(FLIS) + fcis_number = len(self.records) + self.records.append(fcis(self.text_length)) + + # EOF record + self.records.append(b'\xE9\x8E\x0D\x0A') + + record0 = io.BytesIO() + # The MOBI Header + record0.write(pack(b'>HHIHHHH', + self.compression, # compression type # compression type + 0, # Unused + self.text_length, # Text length + self.last_text_record_idx, # Number of text records or last tr idx + RECORD_SIZE, # Text record size + 0, # Unused + 0 # Unused + )) # 0 - 15 
(0x0 - 0xf) + uid = random.randint(0, 0xffffffff) + title = normalize(unicode_type(metadata.title[0])).encode('utf-8') + + # 0x0 - 0x3 + record0.write(b'MOBI') + + # 0x4 - 0x7 : Length of header + # 0x8 - 0x11 : MOBI type + # type meaning + # 0x002 MOBI book (chapter - chapter navigation) + # 0x101 News - Hierarchical navigation with sections and articles + # 0x102 News feed - Flat navigation + # 0x103 News magazine - same as 0x101 + # 0xC - 0xF : Text encoding (65001 is utf-8) + # 0x10 - 0x13 : UID + # 0x14 - 0x17 : Generator version + + record0.write(pack(b'>IIIII', + 0xe8, bt, 65001, uid, 6)) + + # 0x18 - 0x1f : Unknown + record0.write(b'\xff' * 8) + + # 0x20 - 0x23 : Secondary index record + sir = 0xffffffff + if (self.primary_index_record_idx is not None and + self.indexer.secondary_record_offset is not None): + sir = (self.primary_index_record_idx + + self.indexer.secondary_record_offset) + record0.write(pack(b'>I', sir)) + + # 0x24 - 0x3f : Unknown + record0.write(b'\xff' * 28) + + # 0x40 - 0x43 : Offset of first non-text record + record0.write(pack(b'>I', + self.first_non_text_record_idx)) + + # 0x44 - 0x4b : title offset, title length + record0.write(pack(b'>II', + 0xe8 + 16 + len(exth), len(title))) + + # 0x4c - 0x4f : Language specifier + record0.write(iana2mobi( + unicode_type(metadata.language[0]))) + + # 0x50 - 0x57 : Input language and Output language + record0.write(b'\0' * 8) + + # 0x58 - 0x5b : Format version + # 0x5c - 0x5f : First image record number + record0.write(pack(b'>II', + 6, first_image_record if first_image_record else len(self.records))) + + # 0x60 - 0x63 : First HUFF/CDIC record number + # 0x64 - 0x67 : Number of HUFF/CDIC records + # 0x68 - 0x6b : First DATP record number + # 0x6c - 0x6f : Number of DATP records + record0.write(b'\0' * 16) + + # 0x70 - 0x73 : EXTH flags + # Bit 6 (0b1000000) being set indicates the presence of an EXTH header + # Bit 12 being set indicates the presence of embedded fonts + # The purpose of the other bits is unknown + exth_flags = 0b1010000 + if self.is_periodical: + exth_flags |= 0b1000 + if self.resources.has_fonts: + exth_flags |= 0b1000000000000 + record0.write(pack(b'>I', exth_flags)) + + # 0x74 - 0x93 : Unknown + record0.write(b'\0' * 32) + + # 0x94 - 0x97 : DRM offset + # 0x98 - 0x9b : DRM count + # 0x9c - 0x9f : DRM size + # 0xa0 - 0xa3 : DRM flags + record0.write(pack(b'>IIII', + 0xffffffff, 0xffffffff, 0, 0)) + + # 0xa4 - 0xaf : Unknown + record0.write(b'\0'*12) + + # 0xb0 - 0xb1 : First content record number + # 0xb2 - 0xb3 : last content record number + # (Includes Image, DATP, HUFF, DRM) + record0.write(pack(b'>HH', 1, last_content_record)) + + # 0xb4 - 0xb7 : Unknown + record0.write(b'\0\0\0\x01') + + # 0xb8 - 0xbb : FCIS record number + record0.write(pack(b'>I', fcis_number)) + + # 0xbc - 0xbf : Unknown (FCIS record count?) + record0.write(pack(b'>I', 1)) + + # 0xc0 - 0xc3 : FLIS record number + record0.write(pack(b'>I', flis_number)) + + # 0xc4 - 0xc7 : Unknown (FLIS record count?) 
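+        # Written as 1 to mirror the FCIS count above; like the FCIS/FLIS
+        # records themselves it seems to serve no purpose.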
+ record0.write(pack(b'>I', 1)) + + # 0xc8 - 0xcf : Unknown + record0.write(b'\0'*8) + + # 0xd0 - 0xdf : Unknown + record0.write(pack(b'>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff)) + + # 0xe0 - 0xe3 : Extra record data + # Extra record data flags: + # - 0b1 : + # - 0b10 : + # - 0b100: + # Setting bit 2 (0x2) disables functionality + extra_data_flags = 0b1 # Has multibyte overlap bytes + if self.primary_index_record_idx is not None: + extra_data_flags |= 0b10 + if WRITE_UNCROSSABLE_BREAKS: + extra_data_flags |= 0b100 + record0.write(pack(b'>I', extra_data_flags)) + + # 0xe4 - 0xe7 : Primary index record + record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx + is None else self.primary_index_record_idx)) + + record0.write(exth) + record0.write(title) + record0 = record0.getvalue() + # Add some buffer so that Amazon can add encryption information if this + # MOBI is submitted for publication + record0 += (b'\0' * (1024*8)) + self.records[0] = align_block(record0) + # }}} + + def generate_joint_record0(self): # {{{ + from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader, + HEADER_FIELDS) + from calibre.ebooks.mobi.writer8.exth import build_exth + + # Insert resource records + first_image_record = None + old = len(self.records) + if self.resources: + used_images = self.serializer.used_images | self.kf8.used_images + first_image_record = len(self.records) + self.resources.serialize(self.records, used_images) + resource_record_count = len(self.records) - old + last_content_record = len(self.records) - 1 + + # FCIS/FLIS (Seems to serve no purpose) + flis_number = len(self.records) + self.records.append(FLIS) + fcis_number = len(self.records) + self.records.append(fcis(self.text_length)) + + # Insert KF8 records + self.records.append(b'BOUNDARY') + kf8_header_index = len(self.records) + self.kf8.start_offset = (self.serializer.start_offset, + self.kf8.start_offset) + self.records.append(self.kf8.record0) + self.records.extend(self.kf8.records[1:]) + + first_image_record = (first_image_record if first_image_record else + len(self.records)) + + header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS} + + # Now change the header fields that need to be different in the MOBI 6 + # header + header_fields['first_resource_record'] = first_image_record + ef = 0b100001010000 # Kinglegen uses this + if self.resources.has_fonts: + ef |= 0b1000000000000 + header_fields['exth_flags'] = ef + header_fields['fdst_record'] = pack(b'>HH', 1, last_content_record) + header_fields['fdst_count'] = 1 # Why not 0? 
+        header_fields['flis_record'] = flis_number
+        header_fields['fcis_record'] = fcis_number
+        header_fields['text_length'] = self.text_length
+        extra_data_flags = 0b1  # Has multibyte overlap bytes
+        if self.primary_index_record_idx is not None:
+            extra_data_flags |= 0b10
+        header_fields['extra_data_flags'] = extra_data_flags
+
+        for k, v in iteritems({'last_text_record':'last_text_record_idx',
+                'first_non_text_record':'first_non_text_record_idx',
+                'ncx_index':'primary_index_record_idx',
+                }):
+            header_fields[k] = getattr(self, v)
+        if header_fields['ncx_index'] is None:
+            header_fields['ncx_index'] = NULL_INDEX
+
+        for x in ('skel', 'chunk', 'guide'):
+            header_fields[x+'_index'] = NULL_INDEX
+
+        # Create the MOBI 6 EXTH
+        opts = self.opts
+        kuc = 0 if resource_record_count > 0 else None
+
+        header_fields['exth'] = build_exth(self.oeb.metadata,
+                prefer_author_sort=opts.prefer_author_sort,
+                is_periodical=opts.mobi_periodical,
+                share_not_sync=opts.share_not_sync,
+                cover_offset=self.cover_offset,
+                thumbnail_offset=self.thumbnail_offset,
+                num_of_resources=resource_record_count,
+                kf8_unknown_count=kuc, be_kindlegen2=True,
+                kf8_header_index=kf8_header_index,
+                start_offset=self.serializer.start_offset,
+                mobi_doctype=2)
+        self.records[0] = MOBIHeader(file_version=6)(**header_fields)
+
+    # }}}
+
+    def write_header(self):  # PalmDB header {{{
+        '''
+        Write the PalmDB header
+        '''
+        title = ascii_filename(unicode_type(self.oeb.metadata.title[0])).replace(
+                ' ', '_')
+        if not isinstance(title, bytes):
+            title = title.encode('ascii')
+        title = title[:31]
+        title = title + (b'\0' * (32 - len(title)))
+        now = int(time.time())
+        nrecords = len(self.records)
+        self.write(title, pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
+            b'BOOK', b'MOBI', pack(b'>IIH', (2*nrecords)-1, 0, nrecords))
+        offset = self.tell() + (8 * nrecords) + 2
+        for i, record in enumerate(self.records):
+            self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
+            offset += len(record)
+        self.write(b'\0\0')
+    # }}}
+
+    def write_content(self):
+        for record in self.records:
+            self.write(record)
diff --git a/ebook_converter/ebooks/mobi/writer2/serializer.py b/ebook_converter/ebooks/mobi/writer2/serializer.py
new file mode 100644
index 0000000..756d2d7
--- /dev/null
+++ b/ebook_converter/ebooks/mobi/writer2/serializer.py
@@ -0,0 +1,396 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+import re
+import unicodedata
+from collections import defaultdict
+from io import BytesIO
+
+from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start
+from calibre.ebooks.oeb.base import (
+    OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize
+)
+from polyglot.builtins import unicode_type, string_or_bytes
+from polyglot.urllib import urldefrag
+
+
+class Buf(BytesIO):
+
+    def write(self, x):
+        if isinstance(x, unicode_type):
+            x = x.encode('utf-8')
+        BytesIO.write(self, x)
+
+
+class Serializer(object):
+    NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
+
+    def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True):
+        '''
+        Write all the HTML markup in oeb into a single in memory buffer
+        containing a single html document with links replaced by offsets into
+        the buffer.
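+
+        A hypothetical construction (names illustrative)::
+
+            s = Serializer(oeb, {'images/cover.jpg': 1}, is_periodical=False)
+            raw = s()  # one UTF-8 bytestring, filepos links already patched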
+
+        :param oeb: OEBBook object that encapsulates the document to be
+        processed.
+
+        :param images: Mapping of image hrefs (urlnormalized) to image record
+        indices.
+
+        :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
+        is written after every element of the spine in ``oeb``.
+        '''
+        self.oeb = oeb
+        # Map of image hrefs to image index in the MOBI file
+        self.images = images
+        self.used_images = set()
+        self.logger = oeb.logger
+        self.is_periodical = is_periodical
+        self.write_page_breaks_after_item = write_page_breaks_after_item
+
+        # If not None, this is a number pointing to the location at which to
+        # open the MOBI file on the Kindle
+        self.start_offset = None
+
+        # Mapping of hrefs (urlnormalized) to the offset in the buffer where
+        # the resource pointed to by the href lives. Used at the end to fill in
+        # the correct values into all filepos="..." links.
+        self.id_offsets = {}
+
+        # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
+        # where filepos="..." elements are written corresponding to links that
+        # point to the href. This is used at the end to fill in the correct values.
+        self.href_offsets = defaultdict(list)
+
+        # List of offsets in the buffer of non linear items in the spine. These
+        # become uncrossable breaks in the MOBI
+        self.breaks = []
+
+        self.find_blocks()
+
+    def find_blocks(self):
+        '''
+        Mark every item in the spine if it is the start/end of a
+        section/article, so that it can be wrapped in divs appropriately.
+        '''
+        for item in self.oeb.spine:
+            item.is_section_start = item.is_section_end = False
+            item.is_article_start = item.is_article_end = False
+
+        def spine_item(tocitem):
+            href = urldefrag(tocitem.href)[0]
+            for item in self.oeb.spine:
+                if item.href == href:
+                    return item
+
+        for item in self.oeb.toc.iterdescendants():
+            if item.klass == 'section':
+                articles = list(item)
+                if not articles:
+                    continue
+                spine_item(item).is_section_start = True
+                for i, article in enumerate(articles):
+                    si = spine_item(article)
+                    if si is not None:
+                        si.is_article_start = True
+
+        items = list(self.oeb.spine)
+        in_sec = in_art = False
+        for i, item in enumerate(items):
+            prev_item = items[i - 1] if i > 0 else None
+            if in_art and item.is_article_start is True:
+                prev_item.is_article_end = True
+                in_art = False
+            if in_sec and item.is_section_start is True:
+                prev_item.is_section_end = True
+                in_sec = False
+            if item.is_section_start:
+                in_sec = True
+            if item.is_article_start:
+                in_art = True
+
+        item.is_section_end = item.is_article_end = True
+
+    def __call__(self):
+        '''
+        Return the document serialized as a single UTF-8 encoded bytestring.
+        '''
+        buf = self.buf = Buf()
+        buf.write(b'<html>')
+        self.serialize_head()
+        self.serialize_body()
+        buf.write(b'</html>')
+        self.end_offset = buf.tell()
+        self.fixup_links()
+        if self.start_offset is None and not self.is_periodical:
+            # If we don't set a start offset, the stupid Kindle will
+            # open the book at the location of the first IndexEntry, which
+            # could be anywhere. So ensure the book is always opened at the
+            # beginning, instead.
+            self.start_offset = self.body_start_offset
+        return buf.getvalue()
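+
+    # Overall shape of the buffer produced by __call__ (a sketch, not
+    # byte-exact):
+    #
+    #   <html>
+    #     <head><guide>...</guide></head>
+    #     <body> ...spine items, linear items first... </body>
+    #   </html>
+    #
+    # Every internal link is emitted as filepos=0000000000 and patched to a
+    # real decimal byte offset by fixup_links() once all targets are known.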
+
+    def serialize_head(self):
+        buf = self.buf
+        buf.write(b'<head>')
+        if len(self.oeb.guide) > 0:
+            self.serialize_guide()
+        buf.write(b'</head>')
+
+    def serialize_guide(self):
+        '''
+        The Kindle decides where to open a book based on the presence of
+        an item in the guide that looks like
+        <reference type="other.start" href="..."/>
+
+        Similarly an item with type="toc" controls where the Goto Table of
+        Contents operation on the kindle goes.
+        '''
+
+        buf = self.buf
+        hrefs = self.oeb.manifest.hrefs
+        buf.write(b'<guide>')
+        for ref in self.oeb.guide.values():
+            path = urldefrag(ref.href)[0]
+            if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
+                continue
+
+            buf.write(b'<reference type="')
+            if ref.type.startswith('other.'):
+                self.serialize_text(ref.type.replace('other.', ''), quot=True)
+            else:
+                self.serialize_text(ref.type, quot=True)
+            buf.write(b'" ')
+            if ref.title:
+                buf.write(b'title="')
+                self.serialize_text(ref.title, quot=True)
+                buf.write(b'" ')
+            if is_guide_ref_start(ref):
+                self._start_href = ref.href
+            self.serialize_href(ref.href)
+            # Space required or won't work, I kid you not
+            buf.write(b' />')
+
+        buf.write(b'</guide>')
+
+    def serialize_href(self, href, base=None):
+        '''
+        Serialize the href attribute of an <a> or <reference> tag. It is
+        serialized as filepos="0000000000" and a pointer to its location is
+        stored in self.href_offsets so that the correct value can be filled in
+        at the end.
+        '''
+        hrefs = self.oeb.manifest.hrefs
+        try:
+            path, frag = urldefrag(urlnormalize(href))
+        except ValueError:
+            # Unparseable URL
+            return False
+        if path and base:
+            path = base.abshref(path)
+        if path and path not in hrefs:
+            return False
+        buf = self.buf
+        item = hrefs[path] if path else None
+        if item and item.spine_position is None:
+            return False
+        path = item.href if item else base.href
+        href = '#'.join((path, frag)) if frag else path
+        buf.write(b'filepos=')
+        self.href_offsets[href].append(buf.tell())
+        buf.write(b'0000000000')
+        return True
+
+    def serialize_body(self):
+        '''
+        Serialize all items in the spine of the document. Non linear items are
+        moved to the end.
+        '''
+        buf = self.buf
+
+        def serialize_toc_level(tocref, href=None):
+            # add the provided toc level to the output stream
+            # if href is provided add a link ref to the toc level output (e.g. feed_0/index.html)
+            if href is not None:
+                # resolve the section url in id_offsets
+                buf.write(b'<mbp:pagebreak />')
+                self.id_offsets[urlnormalize(href)] = buf.tell()
+
+            if tocref.klass == "periodical":
+                buf.write(b'
') + else: + t = tocref.title + if isinstance(t, unicode_type): + t = t.encode('utf-8') + buf.write(b'
')
+
+        self.anchor_offset = buf.tell()
+        buf.write(b'<body>')
+        self.body_start_offset = buf.tell()
+
+        if self.is_periodical:
+            top_toc = self.oeb.toc.nodes[0]
+            serialize_toc_level(top_toc)
+
+        spine = [item for item in self.oeb.spine if item.linear]
+        spine.extend([item for item in self.oeb.spine if not item.linear])
+
+        for item in spine:
+
+            if self.is_periodical and item.is_section_start:
+                for section_toc in top_toc.nodes:
+                    if urlnormalize(item.href) == section_toc.href:
+                        # create section url of the form r'feed_\d+/index.html'
+                        section_url = re.sub(r'article_\d+/', '', section_toc.href)
+                        serialize_toc_level(section_toc, section_url)
+                        section_toc.href = section_url
+                        break
+
+            self.serialize_item(item)
+
+        self.body_end_offset = buf.tell()
+        buf.write(b'</body>')
+
+    def serialize_item(self, item):
+        '''
+        Serialize an individual item from the spine of the input document.
+        A reference to this item is stored in self.href_offsets
+        '''
+        buf = self.buf
+        if not item.linear:
+            self.breaks.append(buf.tell() - 1)
+        self.id_offsets[urlnormalize(item.href)] = buf.tell()
+        if item.is_section_start:
+            buf.write(b'<a ></a> ')
+        if item.is_article_start:
+            buf.write(b'<a ></a> <a ></a>')
+        for elem in item.data.find(XHTML('body')):
+            self.serialize_elem(elem, item)
+        if self.write_page_breaks_after_item:
+            buf.write(b'<mbp:pagebreak/>')
+        if item.is_article_end:
+            # Kindle periodical article end marker
+            buf.write(b'<a ></a> <a ></a>')
+        if item.is_section_end:
+            buf.write(b' <a ></a>')
+        self.anchor_offset = None
+
+    def serialize_elem(self, elem, item, nsrmap=NSRMAP):
+        buf = self.buf
+        if not isinstance(elem.tag, string_or_bytes) \
+           or namespace(elem.tag) not in nsrmap:
+            return
+        tag = prefixname(elem.tag, nsrmap)
+        # Previous layers take care of @name
+        id_ = elem.attrib.pop('id', None)
+        if id_:
+            href = '#'.join((item.href, id_))
+            offset = self.anchor_offset or buf.tell()
+            key = urlnormalize(href)
+            # Only set this id_offset if it wasn't previously seen
+            self.id_offsets[key] = self.id_offsets.get(key, offset)
+        if self.anchor_offset is not None and \
+           tag == 'a' and not elem.attrib and \
+           not len(elem) and not elem.text:
+            return
+        self.anchor_offset = buf.tell()
+        buf.write(b'<')
+        buf.write(tag.encode('utf-8'))
+        if elem.attrib:
+            for attr, val in elem.attrib.items():
+                if namespace(attr) not in nsrmap:
+                    continue
+                attr = prefixname(attr, nsrmap)
+                buf.write(b' ')
+                if attr == 'href':
+                    if self.serialize_href(val, item):
+                        continue
+                elif attr == 'src':
+                    href = urlnormalize(item.abshref(val))
+                    if href in self.images:
+                        index = self.images[href]
+                        self.used_images.add(href)
+                        buf.write(b'recindex="%05d"' % index)
+                        continue
+                buf.write(attr.encode('utf-8'))
+                buf.write(b'="')
+                self.serialize_text(val, quot=True)
+                buf.write(b'"')
+        buf.write(b'>')
+        if elem.text or len(elem) > 0:
+            if elem.text:
+                self.anchor_offset = None
+                self.serialize_text(elem.text)
+            for child in elem:
+                self.serialize_elem(child, item)
+                if child.tail:
+                    self.anchor_offset = None
+                    self.serialize_text(child.tail)
+        buf.write(('</%s>' % tag).encode('utf-8'))
+
+    def serialize_text(self, text, quot=False):
+        text = text.replace('&', '&amp;')
+        text = text.replace('<', '&lt;')
+        text = text.replace('>', '&gt;')
+        text = text.replace(u'\u00AD', '')  # Soft-hyphen
+        if quot:
+            text = text.replace('"', '&quot;')
+        if isinstance(text, unicode_type):
+            text = unicodedata.normalize('NFC', text)
+        self.buf.write(text.encode('utf-8'))
+
+    def fixup_links(self):
+        '''
+        Fill in the correct values for all filepos="..." links with the offsets
+        of the linked to content (as stored in id_offsets).
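+
+        For example (illustrative values only): if id_offsets maps
+        'index.html#c1' to byte offset 123456, each placeholder recorded for
+        that href in href_offsets is seeked to, and the b'0000000000' written
+        by serialize_href() is overwritten in place with b'0000123456'.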
+ ''' + buf = self.buf + id_offsets = self.id_offsets + start_href = getattr(self, '_start_href', None) + for href, hoffs in self.href_offsets.items(): + is_start = (href and href == start_href) + # Iterate over all filepos items + if href not in id_offsets: + self.logger.warn('Hyperlink target %r not found' % href) + # Link to the top of the document, better than just ignoring + href, _ = urldefrag(href) + if href in self.id_offsets: + ioff = self.id_offsets[href] + if is_start: + self.start_offset = ioff + for hoff in hoffs: + buf.seek(hoff) + buf.write(('%010d' % ioff).encode('utf-8')) diff --git a/ebook_converter/ebooks/mobi/writer8/__init__.py b/ebook_converter/ebooks/mobi/writer8/__init__.py new file mode 100644 index 0000000..357137f --- /dev/null +++ b/ebook_converter/ebooks/mobi/writer8/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + + diff --git a/ebook_converter/ebooks/mobi/writer8/cleanup.py b/ebook_converter/ebooks/mobi/writer8/cleanup.py new file mode 100644 index 0000000..9483eea --- /dev/null +++ b/ebook_converter/ebooks/mobi/writer8/cleanup.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.ebooks.oeb.base import XPath + + +class CSSCleanup(object): + + def __init__(self, log, opts): + self.log, self.opts = log, opts + + def __call__(self, item, stylizer): + if not hasattr(item.data, 'xpath'): + return + + # The Kindle touch displays all black pages if the height is set on + # body + for body in XPath('//h:body')(item.data): + style = stylizer.style(body) + style.drop('height') + + +def remove_duplicate_anchors(oeb): + # The Kindle apparently has incorrect behavior for duplicate anchors, see + # https://bugs.launchpad.net/calibre/+bug/1454199 + for item in oeb.spine: + if not hasattr(item.data, 'xpath'): + continue + seen = set() + for tag in item.data.xpath('//*[@id or @name]'): + for attr in ('id', 'name'): + anchor = tag.get(attr) + if anchor is not None: + if anchor in seen: + oeb.log.debug('Removing duplicate anchor:', anchor) + tag.attrib.pop(attr) + else: + seen.add(anchor) diff --git a/ebook_converter/ebooks/mobi/writer8/exth.py b/ebook_converter/ebooks/mobi/writer8/exth.py new file mode 100644 index 0000000..d997b3a --- /dev/null +++ b/ebook_converter/ebooks/mobi/writer8/exth.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re +from struct import pack +from io import BytesIO + +from calibre.constants import iswindows, isosx +from calibre.ebooks.mobi.utils import (utf8_text, to_base) +from calibre.utils.localization import lang_as_iso639_1 +from calibre.ebooks.metadata import authors_to_sort_string +from polyglot.builtins import iteritems, unicode_type + +EXTH_CODES = { + 'creator': 100, + 'publisher': 101, + 'description': 103, + 'identifier': 104, + 'subject': 105, + 'pubdate': 106, + 'review': 107, + 'contributor': 108, + 'rights': 
109, + 'type': 111, + 'source': 112, + 'versionnumber': 114, + 'startreading': 116, + 'kf8_header_index': 121, + 'num_of_resources': 125, + 'kf8_thumbnail_uri': 129, + 'kf8_unknown_count': 131, + 'coveroffset': 201, + 'thumboffset': 202, + 'hasfakecover': 203, + 'lastupdatetime': 502, + 'title': 503, + 'language': 524, + 'primary_writing_mode': 525, + 'page_progression_direction': 527, +} + +COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') + + +def build_exth(metadata, prefer_author_sort=False, is_periodical=False, + share_not_sync=True, cover_offset=None, thumbnail_offset=None, + start_offset=None, mobi_doctype=2, num_of_resources=None, + kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None, + page_progression_direction=None, primary_writing_mode=None): + exth = BytesIO() + nrecs = 0 + + for term in metadata: + if term not in EXTH_CODES: + continue + code = EXTH_CODES[term] + items = metadata[term] + if term == 'creator': + if prefer_author_sort: + creators = [authors_to_sort_string([unicode_type(c)]) for c in + items] + else: + creators = [unicode_type(c) for c in items] + items = creators + elif term == 'rights': + try: + rights = utf8_text(unicode_type(metadata.rights[0])) + except: + rights = b'Unknown' + exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8)) + exth.write(rights) + nrecs += 1 + continue + + for item in items: + data = unicode_type(item) + if term != 'description': + data = COLLAPSE_RE.sub(' ', data) + if term == 'identifier': + if data.lower().startswith('urn:isbn:'): + data = data[9:] + elif item.scheme.lower() == 'isbn': + pass + else: + continue + if term == 'language': + d2 = lang_as_iso639_1(data) + if d2: + data = d2 + data = utf8_text(data) + exth.write(pack(b'>II', code, len(data) + 8)) + exth.write(data) + nrecs += 1 + + # Write UUID as ASIN + uuid = None + from calibre.ebooks.oeb.base import OPF + for x in metadata['identifier']: + if (x.get(OPF('scheme'), None).lower() == 'uuid' or + unicode_type(x).startswith('urn:uuid:')): + uuid = unicode_type(x).split(':')[-1] + break + if uuid is None: + from uuid import uuid4 + uuid = unicode_type(uuid4()) + + if isinstance(uuid, unicode_type): + uuid = uuid.encode('utf-8') + if not share_not_sync: + exth.write(pack(b'>II', 113, len(uuid) + 8)) + exth.write(uuid) + nrecs += 1 + + # Write UUID as SOURCE + c_uuid = b'calibre:%s' % uuid + exth.write(pack(b'>II', 112, len(c_uuid) + 8)) + exth.write(c_uuid) + nrecs += 1 + + # Write cdetype + if not is_periodical: + if not share_not_sync: + exth.write(pack(b'>II', 501, 12)) + exth.write(b'EBOK') + nrecs += 1 + else: + ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None) + if ids: + exth.write(pack(b'>II', 501, 12)) + exth.write(ids) + nrecs += 1 + + # Add a publication date entry + if metadata['date']: + datestr = unicode_type(metadata['date'][0]) + elif metadata['timestamp']: + datestr = unicode_type(metadata['timestamp'][0]) + + if datestr is None: + raise ValueError("missing date or timestamp") + + datestr = datestr.encode('utf-8') + exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8)) + exth.write(datestr) + nrecs += 1 + if is_periodical: + exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8)) + exth.write(datestr) + nrecs += 1 + + if be_kindlegen2: + mv = 200 if iswindows else 202 if isosx else 201 + vals = {204:mv, 205:2, 206:9, 207:0} + elif is_periodical: + # Pretend to be amazon's super secret periodical generator + vals = {204:201, 205:2, 206:0, 207:101} + else: + # Pretend to be kindlegen 1.2 + vals = {204:201, 
205:1, 206:2, 207:33307} + for code, val in iteritems(vals): + exth.write(pack(b'>III', code, 12, val)) + nrecs += 1 + if be_kindlegen2: + revnum = b'0730-890adc2' + exth.write(pack(b'>II', 535, 8 + len(revnum)) + revnum) + nrecs += 1 + + if cover_offset is not None: + exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12, + cover_offset)) + exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0)) + nrecs += 2 + if thumbnail_offset is not None: + exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12, + thumbnail_offset)) + thumbnail_uri_str = ('kindle:embed:%s' %(to_base(thumbnail_offset, base=32, min_num_digits=4))).encode('utf-8') + exth.write(pack(b'>II', EXTH_CODES['kf8_thumbnail_uri'], len(thumbnail_uri_str) + 8)) + exth.write(thumbnail_uri_str) + nrecs += 2 + + if start_offset is not None: + try: + len(start_offset) + except TypeError: + start_offset = [start_offset] + for so in start_offset: + if so is not None: + exth.write(pack(b'>III', EXTH_CODES['startreading'], 12, + so)) + nrecs += 1 + + if kf8_header_index is not None: + exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12, + kf8_header_index)) + nrecs += 1 + + if num_of_resources is not None: + exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12, + num_of_resources)) + nrecs += 1 + + if kf8_unknown_count is not None: + exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12, + kf8_unknown_count)) + nrecs += 1 + + if primary_writing_mode: + pwm = primary_writing_mode.encode('utf-8') + exth.write(pack(b'>II', EXTH_CODES['primary_writing_mode'], len(pwm) + 8)) + exth.write(pwm) + nrecs += 1 + + if page_progression_direction in {'rtl', 'ltr', 'default'}: + ppd = page_progression_direction.encode('ascii') + exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'], len(ppd) + 8)) + exth.write(ppd) + nrecs += 1 + + exth = exth.getvalue() + trail = len(exth) % 4 + pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte + exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad] + return b''.join(exth) diff --git a/ebook_converter/ebooks/oeb/transforms/htmltoc.py b/ebook_converter/ebooks/oeb/transforms/htmltoc.py new file mode 100644 index 0000000..df6f699 --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/htmltoc.py @@ -0,0 +1,128 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +''' +HTML-TOC-adding transform. +''' + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. 
Vandegrift ' + +from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS +from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME +from calibre.ebooks.oeb.base import element, XPath +from polyglot.builtins import unicode_type + +__all__ = ['HTMLTOCAdder'] + +DEFAULT_TITLE = __('Table of Contents') + +STYLE_CSS = { + 'nested': """ +.calibre_toc_header { + text-align: center; +} +.calibre_toc_block { + margin-left: 1.2em; + text-indent: -1.2em; +} +.calibre_toc_block .calibre_toc_block { + margin-left: 2.4em; +} +.calibre_toc_block .calibre_toc_block .calibre_toc_block { + margin-left: 3.6em; +} +""", + + 'centered': """ +.calibre_toc_header { + text-align: center; +} +.calibre_toc_block { + text-align: center; +} +body > .calibre_toc_block { + margin-top: 1.2em; +} +""" + } + + +class HTMLTOCAdder(object): + + def __init__(self, title=None, style='nested', position='end'): + self.title = title + self.style = style + self.position = position + + @classmethod + def config(cls, cfg): + group = cfg.add_group('htmltoc', _('HTML TOC generation options.')) + group('toc_title', ['--toc-title'], default=None, + help=_('Title for any generated in-line table of contents.')) + return cfg + + @classmethod + def generate(cls, opts): + return cls(title=opts.toc_title) + + def __call__(self, oeb, context): + has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False) + + if 'toc' in oeb.guide: + # Ensure toc pointed to in is in spine + from calibre.ebooks.oeb.base import urlnormalize + href = urlnormalize(oeb.guide['toc'].href) + if href in oeb.manifest.hrefs: + item = oeb.manifest.hrefs[href] + if (hasattr(item.data, 'xpath') and + XPath('//h:a[@href]')(item.data)): + if oeb.spine.index(item) < 0: + if self.position == 'end': + oeb.spine.add(item, linear=False) + else: + oeb.spine.insert(0, item, linear=True) + return + elif has_toc: + oeb.guide.remove('toc') + else: + oeb.guide.remove('toc') + if not has_toc: + return + oeb.logger.info('Generating in-line TOC...') + title = self.title or oeb.translate(DEFAULT_TITLE) + style = self.style + if style not in STYLE_CSS: + oeb.logger.error('Unknown TOC style %r' % style) + style = 'nested' + id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css') + oeb.manifest.add(id, css_href, CSS_MIME, data=STYLE_CSS[style]) + language = unicode_type(oeb.metadata.language[0]) + contents = element(None, XHTML('html'), nsmap={None: XHTML_NS}, + attrib={XML('lang'): language}) + head = element(contents, XHTML('head')) + htitle = element(head, XHTML('title')) + htitle.text = title + element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME, + href=css_href) + body = element(contents, XHTML('body'), + attrib={'class': 'calibre_toc'}) + h1 = element(body, XHTML('h2'), + attrib={'class': 'calibre_toc_header'}) + h1.text = title + self.add_toc_level(body, oeb.toc) + id, href = oeb.manifest.generate('contents', 'contents.xhtml') + item = oeb.manifest.add(id, href, XHTML_MIME, data=contents) + if self.position == 'end': + oeb.spine.add(item, linear=False) + else: + oeb.spine.insert(0, item, linear=True) + oeb.guide.add('toc', 'Table of Contents', href) + + def add_toc_level(self, elem, toc): + for node in toc: + block = element(elem, XHTML('div'), + attrib={'class': 'calibre_toc_block'}) + line = element(block, XHTML('a'), + attrib={'href': node.href, + 'class': 'calibre_toc_line'}) + line.text = node.title + self.add_toc_level(block, node) diff --git a/ebook_converter/ebooks/oeb/transforms/manglecase.py b/ebook_converter/ebooks/oeb/transforms/manglecase.py new file 
mode 100644 index 0000000..24c1a9a --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/manglecase.py @@ -0,0 +1,117 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +''' +CSS case-mangling transform. +''' + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +from lxml import etree +from calibre.ebooks.oeb.base import XHTML, XHTML_NS +from calibre.ebooks.oeb.base import CSS_MIME +from calibre.ebooks.oeb.base import namespace +from calibre.ebooks.oeb.stylizer import Stylizer +from polyglot.builtins import string_or_bytes + +CASE_MANGLER_CSS = """ +.calibre_lowercase { + font-variant: normal; + font-size: 0.65em; +} +""" + +TEXT_TRANSFORMS = {'capitalize', 'uppercase', 'lowercase'} + + +class CaseMangler(object): + + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): + oeb.logger.info('Applying case-transforming CSS...') + self.oeb = oeb + self.opts = context + self.profile = context.source + self.mangle_spine() + + def mangle_spine(self): + id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css') + self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS) + for item in self.oeb.spine: + html = item.data + relhref = item.relhref(href) + etree.SubElement(html.find(XHTML('head')), XHTML('link'), + rel='stylesheet', href=relhref, type=CSS_MIME) + stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile) + self.mangle_elem(html.find(XHTML('body')), stylizer) + + def text_transform(self, transform, text): + if transform == 'capitalize': + return icu_title(text) + elif transform == 'uppercase': + return icu_upper(text) + elif transform == 'lowercase': + return icu_lower(text) + return text + + def split_text(self, text): + results = [''] + isupper = text[0].isupper() + for char in text: + if char.isupper() == isupper: + results[-1] += char + else: + isupper = not isupper + results.append(char) + return results + + def smallcaps_elem(self, elem, attr): + texts = self.split_text(getattr(elem, attr)) + setattr(elem, attr, None) + last = elem if attr == 'tail' else None + attrib = {'class': 'calibre_lowercase'} + for text in texts: + if text.isupper(): + if last is None: + elem.text = text + else: + last.tail = text + else: + child = elem.makeelement(XHTML('span'), attrib=attrib) + child.text = text.upper() + if last is None: + elem.insert(0, child) + else: + # addnext() moves the tail for some reason + tail = last.tail + last.addnext(child) + last.tail = tail + child.tail = None + last = child + + def mangle_elem(self, elem, stylizer): + if not isinstance(elem.tag, string_or_bytes) or \ + namespace(elem.tag) != XHTML_NS: + return + children = list(elem) + style = stylizer.style(elem) + transform = style['text-transform'] + variant = style['font-variant'] + if elem.text: + if transform in TEXT_TRANSFORMS: + elem.text = self.text_transform(transform, elem.text) + if variant == 'small-caps': + self.smallcaps_elem(elem, 'text') + for child in children: + self.mangle_elem(child, stylizer) + if child.tail: + if transform in TEXT_TRANSFORMS: + child.tail = self.text_transform(transform, child.tail) + if variant == 'small-caps': + self.smallcaps_elem(child, 'tail') diff --git a/ebook_converter/ebooks/oeb/transforms/rasterize.py b/ebook_converter/ebooks/oeb/transforms/rasterize.py new file mode 100644 index 0000000..0a58dc6 --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/rasterize.py @@ -0,0 +1,239 @@ 
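+# A condensed sketch of the Qt round-trip used by this transform
+# (hypothetical standalone snippet; it assumes a Qt application object
+# already exists, e.g. via calibre.gui2.must_use_qt()):
+#
+#   from PyQt5.Qt import (QBuffer, QByteArray, QColor, QImage, QIODevice,
+#                         QPainter, QSvgRenderer)
+#   svg = QSvgRenderer(QByteArray(svg_bytes))
+#   image = QImage(svg.defaultSize(), QImage.Format_ARGB32_Premultiplied)
+#   image.fill(QColor("white").rgb())
+#   painter = QPainter(image)
+#   svg.render(painter)
+#   painter.end()
+#   array = QByteArray()
+#   buffer = QBuffer(array)
+#   buffer.open(QIODevice.WriteOnly)
+#   image.save(buffer, 'PNG')
+#   png_bytes = array.data()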
+from __future__ import absolute_import, division, print_function, unicode_literals +''' +SVG rasterization transform. +''' + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import os, re + +from PyQt5.Qt import ( + Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer) +from calibre.ebooks.oeb.base import XHTML, XLINK +from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME +from calibre.ebooks.oeb.base import xml2str, xpath +from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ptempfile import PersistentTemporaryFile +from calibre.utils.imghdr import what +from polyglot.builtins import unicode_type +from polyglot.urllib import urldefrag + +IMAGE_TAGS = {XHTML('img'), XHTML('object')} +KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'} + + +class Unavailable(Exception): + pass + + +class SVGRasterizer(object): + + def __init__(self, base_css=''): + self.base_css = base_css + from calibre.gui2 import must_use_qt + must_use_qt() + + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): + oeb.logger.info('Rasterizing SVG images...') + self.temp_files = [] + self.stylizer_cache = {} + self.oeb = oeb + self.opts = context + self.profile = context.dest + self.images = {} + self.dataize_manifest() + self.rasterize_spine() + self.rasterize_cover() + for pt in self.temp_files: + try: + os.remove(pt) + except: + pass + + def rasterize_svg(self, elem, width=0, height=0, format='PNG'): + view_box = elem.get('viewBox', elem.get('viewbox', None)) + sizes = None + logger = self.oeb.logger + + if view_box is not None: + try: + box = [float(x) for x in filter(None, re.split('[, ]', view_box))] + sizes = [box[2]-box[0], box[3] - box[1]] + except (TypeError, ValueError, IndexError): + logger.warn('SVG image has invalid viewBox="%s", ignoring the viewBox' % view_box) + else: + for image in elem.xpath('descendant::*[local-name()="image" and ' + '@height and contains(@height, "%")]'): + logger.info('Found SVG image height in %, trying to convert...') + try: + h = float(image.get('height').replace('%', ''))/100. 
+ image.set('height', unicode_type(h*sizes[1])) + except: + logger.exception('Failed to convert percentage height:', + image.get('height')) + + data = QByteArray(xml2str(elem, with_tail=False)) + svg = QSvgRenderer(data) + size = svg.defaultSize() + if size.width() == 100 and size.height() == 100 and sizes: + size.setWidth(sizes[0]) + size.setHeight(sizes[1]) + if width or height: + size.scale(width, height, Qt.KeepAspectRatio) + logger.info('Rasterizing %r to %dx%d' + % (elem, size.width(), size.height())) + image = QImage(size, QImage.Format_ARGB32_Premultiplied) + image.fill(QColor("white").rgb()) + painter = QPainter(image) + svg.render(painter) + painter.end() + array = QByteArray() + buffer = QBuffer(array) + buffer.open(QIODevice.WriteOnly) + image.save(buffer, format) + return array.data() + + def dataize_manifest(self): + for item in self.oeb.manifest.values(): + if item.media_type == SVG_MIME and item.data is not None: + self.dataize_svg(item) + + def dataize_svg(self, item, svg=None): + if svg is None: + svg = item.data + hrefs = self.oeb.manifest.hrefs + for elem in xpath(svg, '//svg:*[@xl:href]'): + href = urlnormalize(elem.attrib[XLINK('href')]) + path = urldefrag(href)[0] + if not path: + continue + abshref = item.abshref(path) + if abshref not in hrefs: + continue + linkee = hrefs[abshref] + data = linkee.bytes_representation + ext = what(None, data) or 'jpg' + with PersistentTemporaryFile(suffix='.'+ext) as pt: + pt.write(data) + self.temp_files.append(pt.name) + elem.attrib[XLINK('href')] = pt.name + return svg + + def stylizer(self, item): + ans = self.stylizer_cache.get(item, None) + if ans is None: + ans = Stylizer(item.data, item.href, self.oeb, self.opts, + self.profile, base_css=self.base_css) + self.stylizer_cache[item] = ans + return ans + + def rasterize_spine(self): + for item in self.oeb.spine: + self.rasterize_item(item) + + def rasterize_item(self, item): + html = item.data + hrefs = self.oeb.manifest.hrefs + for elem in xpath(html, '//h:img[@src]'): + src = urlnormalize(elem.attrib['src']) + image = hrefs.get(item.abshref(src), None) + if image and image.media_type == SVG_MIME: + style = self.stylizer(item).style(elem) + self.rasterize_external(elem, style, item, image) + for elem in xpath(html, '//h:object[@type="%s" and @data]' % SVG_MIME): + data = urlnormalize(elem.attrib['data']) + image = hrefs.get(item.abshref(data), None) + if image and image.media_type == SVG_MIME: + style = self.stylizer(item).style(elem) + self.rasterize_external(elem, style, item, image) + for elem in xpath(html, '//svg:svg'): + style = self.stylizer(item).style(elem) + self.rasterize_inline(elem, style, item) + + def rasterize_inline(self, elem, style, item): + width = style['width'] + height = style['height'] + width = (width / 72) * self.profile.dpi + height = (height / 72) * self.profile.dpi + elem = self.dataize_svg(item, elem) + data = self.rasterize_svg(elem, width, height) + manifest = self.oeb.manifest + href = os.path.splitext(item.href)[0] + '.png' + id, href = manifest.generate(item.id, href) + manifest.add(id, href, PNG_MIME, data=data) + img = elem.makeelement(XHTML('img'), src=item.relhref(href)) + elem.getparent().replace(elem, img) + for prop in ('width', 'height'): + if prop in elem.attrib: + img.attrib[prop] = elem.attrib[prop] + + def rasterize_external(self, elem, style, item, svgitem): + width = style['width'] + height = style['height'] + width = (width / 72) * self.profile.dpi + height = (height / 72) * self.profile.dpi + data = 
QByteArray(svgitem.bytes_representation) + svg = QSvgRenderer(data) + size = svg.defaultSize() + size.scale(width, height, Qt.KeepAspectRatio) + key = (svgitem.href, size.width(), size.height()) + if key in self.images: + href = self.images[key] + else: + logger = self.oeb.logger + logger.info('Rasterizing %r to %dx%d' + % (svgitem.href, size.width(), size.height())) + image = QImage(size, QImage.Format_ARGB32_Premultiplied) + image.fill(QColor("white").rgb()) + painter = QPainter(image) + svg.render(painter) + painter.end() + array = QByteArray() + buffer = QBuffer(array) + buffer.open(QIODevice.WriteOnly) + image.save(buffer, 'PNG') + data = array.data() + manifest = self.oeb.manifest + href = os.path.splitext(svgitem.href)[0] + '.png' + id, href = manifest.generate(svgitem.id, href) + manifest.add(id, href, PNG_MIME, data=data) + self.images[key] = href + elem.tag = XHTML('img') + for attr in elem.attrib: + if attr not in KEEP_ATTRS: + del elem.attrib[attr] + elem.attrib['src'] = item.relhref(href) + if elem.text: + elem.attrib['alt'] = elem.text + elem.text = None + for child in elem: + elem.remove(child) + + def rasterize_cover(self): + covers = self.oeb.metadata.cover + if not covers: + return + if unicode_type(covers[0]) not in self.oeb.manifest.ids: + self.oeb.logger.warn('Cover not in manifest, skipping.') + self.oeb.metadata.clear('cover') + return + cover = self.oeb.manifest.ids[unicode_type(covers[0])] + if not cover.media_type == SVG_MIME: + return + width = (self.profile.width / 72) * self.profile.dpi + height = (self.profile.height / 72) * self.profile.dpi + data = self.rasterize_svg(cover.data, width, height) + href = os.path.splitext(cover.href)[0] + '.png' + id, href = self.oeb.manifest.generate(cover.id, href) + self.oeb.manifest.add(id, href, PNG_MIME, data=data) + covers[0].value = id diff --git a/ebook_converter/ebooks/pdf/__init__.py b/ebook_converter/ebooks/pdf/__init__.py new file mode 100644 index 0000000..c23e175 --- /dev/null +++ b/ebook_converter/ebooks/pdf/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Used for pdf output for comic2pdf +''' diff --git a/ebook_converter/ebooks/pdf/pdftohtml.py b/ebook_converter/ebooks/pdf/pdftohtml.py new file mode 100644 index 0000000..9df116b --- /dev/null +++ b/ebook_converter/ebooks/pdf/pdftohtml.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2008, Kovid Goyal + +from __future__ import print_function, unicode_literals + +import errno +import os +import re +import shutil +import subprocess +import sys + +from calibre import CurrentDir, xml_replace_entities, prints +from calibre.constants import ( + filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows +) +from calibre.ebooks import ConversionError, DRMError +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ptempfile import PersistentTemporaryFile +from calibre.utils.cleantext import clean_xml_chars +from calibre.utils.ipc import eintr_retry_call + + +PDFTOHTML = 'pdftohtml' + + +def popen(cmd, **kw): + if not ispy3: + cmd = [x.encode(filesystem_encoding) if not isinstance(x, bytes) else x for x in cmd] + if iswindows: + kw['creationflags'] = 0x08 + return subprocess.Popen(cmd, **kw) + + +if isosx and hasattr(sys, 'frameworks_dir'): + base = 
os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS')
+    PDFTOHTML = os.path.join(base, PDFTOHTML)
+if iswindows and hasattr(sys, 'frozen'):
+    base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable)
+    PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
+if (islinux or isbsd) and getattr(sys, 'frozen', False):
+    PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
+
+
+def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
+    '''
+    Convert the pdf into html using the pdftohtml app.
+    This will write the html as index.html into output_dir.
+    It will also write all extracted images to the output_dir
+    '''
+
+    pdfsrc = os.path.join(output_dir, 'src.pdf')
+    index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))
+
+    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
+        shutil.copyfileobj(src, dest)
+
+    with CurrentDir(output_dir):
+
+        def a(x):
+            return os.path.basename(x)
+
+        exe = PDFTOHTML
+        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
+               '-nodrm', a(pdfsrc), a(index)]
+
+        if isbsd:
+            cmd.remove('-nodrm')
+        if no_images:
+            cmd.append('-i')
+        if as_xml:
+            cmd.append('-xml')
+
+        logf = PersistentTemporaryFile('pdftohtml_log')
+        try:
+            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
+                      stdin=subprocess.PIPE)
+        except OSError as err:
+            if err.errno == errno.ENOENT:
+                raise ConversionError(
+                    _('Could not find pdftohtml, check it is in your PATH'))
+            else:
+                raise
+        ret = eintr_retry_call(p.wait)
+        logf.flush()
+        logf.close()
+        out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
+        if ret != 0:
+            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
+        if out:
+            prints("pdftohtml log:")
+            prints(out)
+        if not os.path.exists(index) or os.stat(index).st_size < 100:
+            raise DRMError()
+
+        if not as_xml:
+            with lopen(index, 'r+b') as i:
+                raw = i.read().decode('utf-8', 'replace')
+                raw = flip_images(raw)
+                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
+                raw = xml_replace_entities(raw)
+                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
+                # breaks the pdf heuristics regexps, so replace them
+                raw = raw.replace('<br/>', '<br>')
+                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="p\1"', raw, flags=re.I)
+                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
+                i.seek(0)
+                i.truncate()
+                i.write(raw.encode('utf-8'))


def parse_outline(raw, output_dir):
+    from lxml import etree
+    from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
+    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
+    outline = etree.fromstring(raw).xpath('(//outline)[1]')
+    if outline:
+        outline = outline[0]
+        toc = TOC()
+        count = [0]
+
+        def process_node(node, toc):
+            for child in node.iterchildren('*'):
+                if child.tag == 'outline':
+                    parent = toc.children[-1] if toc.children else toc
+                    process_node(child, parent)
+                elif child.text:
+                    page = child.get('page', '1')
+                    toc.add(child.text, 'index.html', 'p' + page)
+                    count[0] += 1
+        process_node(outline, toc)
+        if count[0] > 2:
+            root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
+            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
+                f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
+
+
+def flip_image(img, flip):
+    from calibre.utils.img import flip_image, image_and_format_from_data, image_to_data
+    with lopen(img, 'r+b') as f:
+        img, fmt = image_and_format_from_data(f.read())
+        img = flip_image(img, horizontal='x' in flip, vertical='y' in flip)
+        f.seek(0), f.truncate()
+        f.write(image_to_data(img, fmt=fmt))
+
+
+def flip_images(raw):
+    for match in re.finditer('<img[^>]+/?>', raw, flags=re.I):
+        img = match.group()
+        m = re.search(r'class="(x|y|xy)flip"', img)
+        if m is None:
+            continue
+        flip = m.group(1)
+        src = re.search(r'src="([^"]+)"', img)
+        if src is None:
+            continue
+        img = src.group(1)
+        if not os.path.exists(img):
+            continue
+        flip_image(img, flip)
+    raw = re.sub(r'<style.+?</style>\s*', '', raw, flags=re.I|re.DOTALL)
+    return raw
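+
+# Hypothetical usage of the helpers above (not part of this module; paths are
+# illustrative and the output directory must already exist):
+#
+#   pdftohtml('/tmp/out', '/tmp/book.pdf', no_images=False)
+#   with lopen('/tmp/out/index.html', 'rb') as f:
+#       html = f.read().decode('utf-8')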