1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2025-12-30 14:02:27 +01:00

Added mobi writer files

This commit is contained in:
2020-04-13 15:24:23 +02:00
parent 79cad46732
commit ae80ae5640
12 changed files with 3346 additions and 0 deletions

View File

@@ -0,0 +1,622 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Transform XHTML/OPS-ish content into Mobipocket HTML 3.2.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
import copy
import re
import numbers
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.ebooks.mobi.utils import convert_color_for_font_tag
from calibre.utils.imghdr import identify
from polyglot.builtins import unicode_type, string_or_bytes
MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    """Qualify *name* with the Mobipocket namespace.

    The result uses Clark notation (``{namespace}name``), the form lxml
    expects for namespaced tag names.
    """
    return '{{{0}}}{1}'.format(MBP_NS, name)
# Namespace map used for every output <html> root: XHTML is the default
# namespace and the Mobipocket namespace is bound to the ``mbp`` prefix.
MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS}
# Tags whose presence right after leading whitespace keeps that whitespace
# significant (see the text handling in MobiMLizer.mobimlize_elem).
INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'}
HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
# GR: Added 'caption' to both sets
# Tags that are emitted as real, nestable wrapper elements instead of being
# flattened to <p>/<blockquote> (see MobiMLizer.mobimlize_content).
NESTABLE_TAGS = {'ol', 'ul', 'li', 'table', 'tr', 'td', 'th', 'caption'}
# Table-related tags: they get valign="top" and no height/width attributes.
TABLE_TAGS = {'table', 'tr', 'td', 'th', 'caption'}
# Empty elements that may be emitted without opening a new paragraph.
SPECIAL_TAGS = {'hr', 'br'}
# Tags that are written to the output as elements of their own.
CONTENT_TAGS = {'img', 'hr', 'br'}
# Tags that are never wrapped in <sup>/<sub> to emulate vertical-align.
NOT_VTAGS = HEADER_TAGS | NESTABLE_TAGS | TABLE_TAGS | SPECIAL_TAGS | \
CONTENT_TAGS
# Tags that cannot contain children; anchors are inserted before them rather
# than appended inside them.
LEAF_TAGS = {'base', 'basefont', 'frame', 'link', 'meta', 'area', 'br',
'col', 'hr', 'img', 'input', 'param'}
# Values of page-break-before/page-break-after that force a page break.
PAGE_BREAKS = {'always', 'left', 'right'}
# Runs of whitespace that are collapsed to a single space in normal text.
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
def asfloat(value):
    """Return *value* as a float, or 0.0 when it is not a number.

    Style lookups can yield keywords such as ``'auto'``; those count as zero.
    """
    return float(value) if isinstance(value, numbers.Number) else 0.0
def isspace(text):
    """Tell whether *text* is empty or consists only of collapsible whitespace.

    A non-breaking space (U+00A0) is treated as real content, so any text
    containing one is never considered whitespace.
    """
    if text and '\xa0' not in text:
        return text.isspace()
    # Either empty/None (-> True) or contains an NBSP (-> False)
    return not text
class BlockState(object):

    """Mutable block-level state shared while one output body is built.

    ``body`` is the output <body> element; the remaining attributes track the
    currently open wrappers, pending vertical space and page breaks.
    """

    def __init__(self, body):
        self.body = body
        # Stack of open nestable wrapper elements (lists, tables, ...)
        self.nested = []
        # Nothing is open yet: no paragraph, inline chain, anchor or
        # previously rendered inline format state
        self.para = self.inline = self.anchor = self.istate = None
        # Vertical space accumulated for the next block
        self.vpadding = self.vmargin = 0.0
        # No page break pending and no content emitted so far
        self.pbreak = self.content = False
class FormatState(object):

    """Inline formatting state for the element currently being converted.

    Two states compare equal when they would produce the same chain of inline
    markup (font size, italic, bold, link target, whitespace handling, font
    family, colours, strike-through and underline). Layout-only attributes
    such as ``left``, ``indent``, ``halign``, ``ids``, ``list_num`` and
    ``attrib`` do not take part in the comparison.
    """

    # Attributes that decide whether a new inline element chain is needed
    _COMPARED_FIELDS = (
        'fsize', 'italic', 'bold', 'href', 'preserve', 'pre_wrap', 'family',
        'bgcolor', 'fgcolor', 'strikethrough', 'underline',
    )

    def __init__(self):
        self.rendered = False
        self.left = 0.
        self.halign = 'auto'
        self.indent = 0.
        self.fsize = 3
        self.ids = set()
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        self.preserve = False
        self.pre_wrap = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
        self.fgcolor = 'black'
        self.href = None
        self.list_num = 0
        self.attrib = {}

    def __eq__(self, other):
        # Comparing against an unrelated object (for example None) used to
        # raise AttributeError; let Python fall back to its default handling.
        if not isinstance(other, FormatState):
            return NotImplemented
        return all(getattr(self, name) == getattr(other, name)
                   for name in self._COMPARED_FIELDS)

    def __ne__(self, other):
        result = self.__eq__(other)
        # NotImplemented must be passed through, never negated
        if result is NotImplemented:
            return result
        return not result
class MobiMLizer(object):
def __init__(self, ignore_tables=False):
self.ignore_tables = ignore_tables
def __call__(self, oeb, context):
    """Run the transform over every spine item of *oeb*.

    *context* supplies the conversion options; its ``dest`` attribute is the
    output profile used for font sizes.
    """
    oeb.logger.info('Converting XHTML to Mobipocket markup...')
    self.oeb = oeb
    self.log = self.oeb.logger
    self.opts = context
    profile = context.dest
    self.profile = profile
    # Invert the profile's font table so it maps size -> font number
    fnums = {size: num for num, size in profile.fnums.items()}
    self.fnums = fnums
    self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
    self.mobimlize_spine()
def mobimlize_spine(self):
    'Iterate over the spine and convert it to MOBIML'
    for item in self.oeb.spine:
        stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
        source_body = item.data.find(XHTML('body'))
        # Fresh output tree: <html><body/></html> in the MOBI namespaces
        new_root = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        new_body = etree.SubElement(new_root, XHTML('body'))
        # Remembered so that relative image hrefs can be resolved later
        self.current_spine_item = item
        block_state = BlockState(new_body)
        format_stack = [FormatState()]
        self.mobimlize_elem(source_body, stylizer, block_state, format_stack)
        item.data = new_root
        # print(etree.tostring(new_root))
def mobimlize_font(self, ptsize):
return self.fnums[self.fmap[ptsize]]
def mobimlize_measure(self, ptsize):
    """Format a length for use in a MOBI ``width``/``height`` attribute.

    Strings are passed through untouched. Numbers smaller than the profile's
    base font size are written in points, anything else in whole ems.
    """
    if isinstance(ptsize, string_or_bytes):
        return ptsize
    embase = self.profile.fbase
    rounded = round(ptsize)
    if rounded < embase:
        return "%dpt" % int(rounded)
    ems = round(ptsize / embase)
    return "%dem" % int(ems)
def preize_text(self, text, pre_wrap=False):
    """Split preformatted *text* into strings and <br> elements.

    Spaces are made non-breaking (for *pre_wrap* only runs of two or more,
    keeping one breakable space at the end of each run) and every line break
    becomes a ``<br>`` element. Returns a list mixing strings and elements.
    """
    text = unicode_type(text)
    if not pre_wrap:
        text = text.replace(' ', '\xa0')
    else:
        def pad(match):
            # Replace n consecutive spaces with n-1 NBSP + space
            return '\xa0' * (len(match.group()) - 1) + ' '
        text = re.sub(r' {2,}', pad, text)

    # Normalise line endings before splitting
    lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    pieces = lines[:1]
    for line in lines[1:]:
        pieces.append(etree.Element(XHTML('br')))
        if line:
            pieces.append(line)
    return pieces
def mobimlize_content(self, tag, text, bstate, istates):
'Convert text content'
# Writes `text` (or an empty element for `tag`) into the output tree tracked
# by `bstate`, formatted according to the innermost state istates[-1].
# A paragraph-level wrapper is opened first when none is currently open.
# NOTE(review): indentation was lost in this copy of the file; the comments
# below describe each statement group without asserting exact nesting.
# Anything except an empty <br> counts as real content for this body.
if text or tag != 'br':
bstate.content = True
istate = istates[-1]
para = bstate.para
# An empty <hr>/<br> reuses the open paragraph, or goes straight into body.
if tag in SPECIAL_TAGS and not text:
para = para if para is not None else bstate.body
elif para is None or tag in ('td', 'th'):
body = bstate.body
# Flush a pending page break before the new block starts.
if bstate.pbreak:
etree.SubElement(body, MBP('pagebreak'))
bstate.pbreak = False
bstate.istate = None
bstate.anchor = None
parent = bstate.nested[-1] if bstate.nested else bstate.body
indent = istate.indent
left = istate.left
# A string indent (unresolved value) is treated as no indent.
if isinstance(indent, string_or_bytes):
indent = 0
# A hanging indent smaller than the left margin is folded into the margin;
# any other non-zero indent smaller than the base font size is widened to
# one base font size, keeping its sign.
if indent < 0 and abs(indent) < left:
left += indent
indent = 0
elif indent != 0 and abs(indent) < self.profile.fbase:
indent = (indent / abs(indent)) * self.profile.fbase
# Choose the wrapper: a real nestable element (created once per source
# element, then reused), nested <blockquote>s emulating a left margin, or a
# plain <p>.
if tag in NESTABLE_TAGS and not istate.rendered:
para = wrapper = etree.SubElement(
parent, XHTML(tag), attrib=istate.attrib)
bstate.nested.append(para)
# List items are numbered through the parent (list) format state.
if tag == 'li' and len(istates) > 1:
istates[-2].list_num += 1
para.attrib['value'] = unicode_type(istates[-2].list_num)
elif tag in NESTABLE_TAGS and istate.rendered:
para = wrapper = bstate.nested[-1]
elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
ems = self.profile.mobi_ems_per_blockquote
para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
para = wrapper
# Remaining margin in ems after the first blockquote, capped at 10.
emleft = int(round(left / self.profile.fbase)) - ems
emleft = min((emleft, 10))
while emleft > ems / 2:
para = etree.SubElement(para, XHTML('blockquote'))
emleft -= ems
else:
para = wrapper = etree.SubElement(parent, XHTML('p'))
bstate.inline = bstate.para = para
# Consume the vertical space accumulated since the previous block.
vspace = bstate.vpadding + bstate.vmargin
bstate.vpadding = bstate.vmargin = 0
if tag not in TABLE_TAGS:
# Lists get a spacer <div> in front; other blocks carry height/width
# attributes themselves.
if tag in ('ul', 'ol') and vspace > 0:
wrapper.addprevious(etree.Element(XHTML('div'),
height=self.mobimlize_measure(vspace)))
else:
wrapper.attrib['height'] = self.mobimlize_measure(vspace)
para.attrib['width'] = self.mobimlize_measure(indent)
elif tag == 'table' and vspace > 0:
# Tables are preceded by one <br> per base-font-size of vertical space.
vspace = int(round(vspace / self.profile.fbase))
while vspace > 0:
wrapper.addprevious(etree.Element(XHTML('br')))
vspace -= 1
if istate.halign != 'auto' and isinstance(istate.halign, (bytes, unicode_type)):
if isinstance(istate.halign, bytes):
istate.halign = istate.halign.decode('utf-8')
para.attrib['align'] = istate.halign
istate.rendered = True
pstate = bstate.istate
if tag in CONTENT_TAGS:
# img/hr/br are emitted as elements and reset the inline chain.
bstate.inline = para
pstate = bstate.istate = None
try:
etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
except:
print('Invalid subelement:', para, tag, istate.attrib)
raise
elif tag in TABLE_TAGS:
para.attrib['valign'] = 'top'
# Emit an <a id=...> anchor for every id collected on the format state.
if istate.ids:
for id_ in istate.ids:
anchor = etree.Element(XHTML('a'), attrib={'id': id_})
if tag == 'li':
# For list items the anchor becomes the first child of the last
# grandchild of body; if there is none, anchor emission stops.
try:
last = bstate.body[-1][-1]
except:
break
last.insert(0, anchor)
anchor.tail = last.text
last.text = None
else:
last = bstate.body[-1]
# We use append instead of addprevious so that inline
# anchors in large blocks point to the correct place. See
# https://bugs.launchpad.net/calibre/+bug/899831
# This could potentially break if inserting an anchor at
# this point in the markup is illegal, but I cannot think
# of such a case offhand.
if barename(last.tag) in LEAF_TAGS:
last.addprevious(anchor)
else:
last.append(anchor)
istate.ids.clear()
if not text:
return
# Build a new chain of inline elements only when the formatting differs
# from what was last rendered into this paragraph.
if not pstate or istate != pstate:
inline = para
fsize = istate.fsize
href = istate.href
if not href:
bstate.anchor = None
elif pstate and pstate.href == href:
# Same link target as before: keep writing into the existing <a>.
inline = bstate.anchor
else:
inline = etree.SubElement(inline, XHTML('a'), href=href)
bstate.anchor = inline
# 3 is the default font number (see FormatState), so it needs no <font>.
if fsize != 3:
inline = etree.SubElement(inline, XHTML('font'),
size=unicode_type(fsize))
if istate.family == 'monospace':
inline = etree.SubElement(inline, XHTML('tt'))
if istate.italic:
inline = etree.SubElement(inline, XHTML('i'))
if istate.bold:
inline = etree.SubElement(inline, XHTML('b'))
if istate.bgcolor is not None and istate.bgcolor != 'transparent' :
inline = etree.SubElement(inline, XHTML('span'),
bgcolor=convert_color_for_font_tag(istate.bgcolor))
if istate.fgcolor != 'black':
inline = etree.SubElement(inline, XHTML('font'),
color=convert_color_for_font_tag(istate.fgcolor))
if istate.strikethrough:
inline = etree.SubElement(inline, XHTML('s'))
if istate.underline:
inline = etree.SubElement(inline, XHTML('u'))
bstate.inline = inline
bstate.istate = istate
inline = bstate.inline
# Preformatted text is split into strings and <br> elements first.
content = self.preize_text(text, pre_wrap=istate.pre_wrap) if istate.preserve or istate.pre_wrap else [text]
for item in content:
if isinstance(item, string_or_bytes):
# Strings go into .text while the element has no children yet,
# otherwise into the tail of its last child.
if len(inline) == 0:
inline.text = (inline.text or '') + item
else:
last = inline[-1]
last.tail = (last.tail or '') + item
else:
inline.append(item)
def mobimlize_elem(self, elem, stylizer, bstate, istates,
ignore_valign=False):
# Recursively converts one source element: derives a FormatState from its
# computed style, pushes it on `istates`, emits the element's own text via
# mobimlize_content, recurses into children (emitting their tails), and pops
# the state again. `ignore_valign` is set on the re-entrant call used to
# render vertically aligned content, so that it is not wrapped twice.
# NOTE(review): indentation was lost in this copy of the file; the comments
# below describe each statement group without asserting exact nesting.
# Only XHTML elements with a string tag are converted.
if not isinstance(elem.tag, string_or_bytes) \
or namespace(elem.tag) != XHTML_NS:
return
style = stylizer.style(elem)
# <mbp:frame-set/> does not exist lalalala
if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
id_ = elem.get('id', None)
if id_:
# Keep anchors so people can use display:none
# to generate hidden TOCs
# The element is emptied and turned into a bare <a id=...>, keeping
# its tail text.
tail = elem.tail
elem.clear()
elem.text = None
elem.set('id', id_)
elem.tail = tail
elem.tag = XHTML('a')
else:
return
tag = barename(elem.tag)
# copy.copy is shallow: the `ids` set and `attrib` dict are shared with the
# parent state.
istate = copy.copy(istates[-1])
istate.rendered = False
istate.list_num = 0
if tag == 'ol' and 'start' in elem.attrib:
try:
istate.list_num = int(elem.attrib['start'])-1
except:
pass
istates.append(istate)
left = 0
display = style['display']
# Normalise table display values: cells behave as inline, every other
# table-* value as block.
if display == 'table-cell':
display = 'inline'
elif display.startswith('table'):
display = 'block'
# Floats and <br> are never treated as blocks.
isblock = (not display.startswith('inline') and style['display'] !=
'none')
isblock = isblock and style['float'] == 'none'
isblock = isblock and tag != 'br'
if isblock:
# A block always starts a new paragraph.
bstate.para = None
istate.halign = style['text-align']
rawti = style._get('text-indent')
istate.indent = style['text-indent']
if hasattr(rawti, 'strip') and '%' in rawti:
# We have a percentage text indent, these can come out looking
# too large if the user chooses a wide output profile like
# tablet
istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
if style['margin-left'] == 'auto' \
and style['margin-right'] == 'auto':
istate.halign = 'center'
margin = asfloat(style['margin-left'])
padding = asfloat(style['padding-left'])
if tag != 'body':
left = margin + padding
istate.left += left
# Top margins collapse (max); a top padding converts the pending margin
# into padding and adds to it.
vmargin = asfloat(style['margin-top'])
bstate.vmargin = max((bstate.vmargin, vmargin))
vpadding = asfloat(style['padding-top'])
if vpadding > 0:
bstate.vpadding += bstate.vmargin
bstate.vmargin = 0
bstate.vpadding += vpadding
elif not istate.href:
# Horizontal margin/padding is emulated with non-breaking spaces:
# three per font-size of space.
margin = asfloat(style['margin-left'])
padding = asfloat(style['padding-left'])
lspace = margin + padding
if lspace > 0:
spaces = int(round((lspace * 3) / style['font-size']))
elem.text = ('\xa0' * spaces) + (elem.text or '')
margin = asfloat(style['margin-right'])
padding = asfloat(style['padding-right'])
rspace = margin + padding
if rspace > 0:
spaces = int(round((rspace * 3) / style['font-size']))
if len(elem) == 0:
elem.text = (elem.text or '') + ('\xa0' * spaces)
else:
# NOTE(review): this pads the *text* of the last child, i.e. the
# spaces land inside that child rather than after it (its tail).
# Confirm whether last.tail was intended.
last = elem[-1]
last.text = (last.text or '') + ('\xa0' * spaces)
if bstate.content and style['page-break-before'] in PAGE_BREAKS:
bstate.pbreak = True
# Inline formatting derived from the computed style.
istate.fsize = self.mobimlize_font(style['font-size'])
istate.italic = True if style['font-style'] == 'italic' else False
weight = style['font-weight']
istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
istate.preserve = style['white-space'] == 'pre'
istate.pre_wrap = style['white-space'] == 'pre-wrap'
istate.bgcolor = style['background-color']
istate.fgcolor = style['color']
istate.strikethrough = style.effective_text_decoration == 'line-through'
istate.underline = style.effective_text_decoration == 'underline'
# Classify the font family by substring match on the font-family value.
ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
istate.family = 'monospace'
elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
'arial' in ff or 'helvetica' in ff):
istate.family = 'sans-serif'
else:
istate.family = 'serif'
# Both id and name attributes become anchors in the output.
if 'id' in elem.attrib:
istate.ids.add(elem.attrib['id'])
if 'name' in elem.attrib:
istate.ids.add(elem.attrib['name'])
if tag == 'a' and 'href' in elem.attrib:
istate.href = elem.attrib['href']
istate.attrib.clear()
if tag == 'img' and 'src' in elem.attrib:
istate.attrib['src'] = elem.attrib['src']
istate.attrib['align'] = 'baseline'
cssdict = style.cssdict()
valign = cssdict.get('vertical-align', None)
if valign in ('top', 'bottom', 'middle'):
istate.attrib['align'] = valign
for prop in ('width', 'height'):
if cssdict[prop] != 'auto':
value = style[prop]
# A dimension equal to the profile's own becomes 100%.
if value == getattr(self.profile, prop):
result = '100%'
else:
# Amazon's renderer does not support
# img sizes in units other than px
# See #7520 for test case
try:
pixs = int(round(float(value) /
(72/self.profile.dpi)))
except:
continue
result = unicode_type(pixs)
istate.attrib[prop] = result
# Missing dimensions are filled in from the image data itself, keeping the
# aspect ratio when only one of them is known.
if 'width' not in istate.attrib or 'height' not in istate.attrib:
href = self.current_spine_item.abshref(elem.attrib['src'])
try:
item = self.oeb.manifest.hrefs[urlnormalize(href)]
except:
self.oeb.logger.warn('Failed to find image:',
href)
else:
try:
width, height = identify(item.data)[1:]
except Exception:
self.oeb.logger.warn('Invalid image:', href)
else:
if 'width' not in istate.attrib and 'height' not in \
istate.attrib:
istate.attrib['width'] = unicode_type(width)
istate.attrib['height'] = unicode_type(height)
else:
ar = width / height
if 'width' not in istate.attrib:
try:
width = int(istate.attrib['height'])*ar
except:
pass
istate.attrib['width'] = unicode_type(int(width))
else:
try:
height = int(istate.attrib['width'])/ar
except:
pass
istate.attrib['height'] = unicode_type(int(height))
item.unload_data_from_memory()
elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
# <hr> widths are always written as a percentage.
raww = style._get('width')
if hasattr(raww, 'strip') and '%' in raww:
istate.attrib['width'] = raww
else:
prop = style['width'] / self.profile.width
istate.attrib['width'] = "%d%%" % int(round(prop * 100))
# NOTE(review): `display` was rewritten above so that 'table-cell' became
# 'inline' and every other 'table*' value became 'block'; the three
# comparisons below therefore look unreachable. Confirm whether
# style['display'] was meant here.
elif display == 'table':
tag = 'table'
elif display == 'table-row':
tag = 'tr'
elif display == 'table-cell':
tag = 'td'
if tag in TABLE_TAGS and self.ignore_tables:
tag = 'span' if tag == 'td' else 'div'
if tag in ('table', 'td', 'tr'):
col = style.backgroundColor
if col:
elem.set('bgcolor', col)
css = style.cssdict()
if 'border' in css or 'border-width' in css:
elem.set('border', '1')
# Carry selected table attributes over to the output element.
if tag in TABLE_TAGS:
for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
'bgcolor'):
if attr in elem.attrib:
istate.attrib[attr] = elem.attrib[attr]
# <q> has no MOBI equivalent: wrap its content in curly quotes instead.
if tag == 'q':
t = elem.text
if not t:
t = ''
elem.text = '\u201c' + t
t = elem.tail
if not t:
t = ''
elem.tail = '\u201d' + t
text = None
if elem.text:
if istate.preserve or istate.pre_wrap:
text = elem.text
elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
# Whitespace-only text in front of a non-inline first child is dropped.
text = None
else:
text = COLLAPSE.sub(' ', elem.text)
valign = style['vertical-align']
not_baseline = valign in ('super', 'sub', 'text-top',
'text-bottom', 'top', 'bottom') or (
isinstance(valign, numbers.Number) and abs(valign) != 0)
issup = valign in ('super', 'text-top', 'top') or (
isinstance(valign, numbers.Number) and valign > 0)
vtag = 'sup' if issup else 'sub'
if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
# Vertically aligned inline content is rendered into a scratch tree first
# (with ignore_valign=True) and then moved into <sup>/<sub> + <small>.
nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
self.mobimlize_elem(elem, stylizer, vbstate, istates,
ignore_valign=True)
# Drop the state pushed above for this element, but never leave the stack
# empty.
if len(istates) > 0:
istates.pop()
if len(istates) == 0:
istates.append(FormatState())
at_start = bstate.para is None
if at_start:
# Make sure a paragraph exists to attach the <sup>/<sub> to.
self.mobimlize_content('span', '', bstate, istates)
parent = bstate.para if bstate.inline is None else bstate.inline
if parent is not None:
vtag = etree.SubElement(parent, XHTML(vtag))
vtag = etree.SubElement(vtag, XHTML('small'))
# Add anchors
for child in vbstate.body:
if child is not vbstate.para:
vtag.append(child)
else:
break
if vbstate.para is not None:
if vbstate.para.text:
vtag.text = vbstate.para.text
for child in vbstate.para:
vtag.append(child)
return
# Margins are always honoured inside a blockquote; the option is restored
# further down.
if tag == 'blockquote':
old_mim = self.opts.mobi_ignore_margins
self.opts.mobi_ignore_margins = False
if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
# We have an id but no text and no children, the id should still
# be added.
istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
len(elem)==0)):
# An explicit value on <li> resets the parent list's counter.
if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
try:
value = int(elem.attrib['value'])
istates[-2].list_num = value - 1
except:
pass
self.mobimlize_content(tag, text, bstate, istates)
# Children first, then each child's tail text with this element's format.
for child in elem:
self.mobimlize_elem(child, stylizer, bstate, istates)
tail = None
if child.tail:
if istate.preserve or istate.pre_wrap:
tail = child.tail
elif bstate.para is None and isspace(child.tail):
tail = None
else:
tail = COLLAPSE.sub(' ', child.tail)
if tail:
self.mobimlize_content(tag, tail, bstate, istates)
if tag == 'blockquote':
self.opts.mobi_ignore_margins = old_mim
if bstate.content and style['page-break-after'] in PAGE_BREAKS:
bstate.pbreak = True
if isblock:
para = bstate.para
# A paragraph holding only a single NBSP is a spacer: it becomes a <br>
# or is dropped, depending on the element's height.
if para is not None and para.text == '\xa0' and len(para) < 1:
if style.height > 2:
para.getparent().replace(para, etree.Element(XHTML('br')))
else:
# This is too small to be rendered effectively, drop it
para.getparent().remove(para)
bstate.para = None
bstate.istate = None
# Bottom spacing is accumulated the same way as top spacing.
vmargin = asfloat(style['margin-bottom'])
bstate.vmargin = max((bstate.vmargin, vmargin))
vpadding = asfloat(style['padding-bottom'])
if vpadding > 0:
bstate.vpadding += bstate.vmargin
bstate.vmargin = 0
bstate.vpadding += vpadding
# Close the nestable wrapper opened for this element, if any.
if bstate.nested and bstate.nested[-1].tag == elem.tag:
bstate.nested.pop()
istates.pop()

View File

@@ -0,0 +1,891 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import numbers
from struct import pack
import io
from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
from polyglot.builtins import filter, iteritems, itervalues, map, range
class CNCX(CNCX_): # {{{
# Collects the strings of a TOC tree and hands them to the base CNCX class
# imported from calibre.ebooks.mobi.utils.
def __init__(self, toc, is_periodical):
# toc: TOC tree; its descendants are visited breadth first.
# is_periodical: when true, each node's klass is stored as well.
strings = []
for item in toc.iterdescendants(breadth_first=True):
strings.append(item.title)
if is_periodical:
strings.append(item.klass)
# Author and description are stored only when they are non-empty.
if item.author:
strings.append(item.author)
if item.description:
strings.append(item.description)
CNCX_.__init__(self, strings)
# }}}
class TAGX(object):  # {{{

    """Builder for the TAGX block of an index header.

    Every tag contributes four bytes: tag number, number of values, bitmask
    and an end-of-control-byte flag. Tag ``0`` is the terminator.
    """

    # Tag number -> bit used for it in its control byte
    BITMASKS = {11: 0b1}
    for _group in ((1, 2, 3, 4, 5, 21, 22, 23), (69, 70, 71, 72, 73)):
        for _bit, _tag in enumerate(_group):
            BITMASKS[_tag] = 1 << _bit
    del _group, _bit, _tag

    # Tag number -> number of values stored for it (one unless listed)
    NUM_VALUES = defaultdict(lambda: 1, {11: 3, 0: 0})

    def __init__(self):
        self.byts = bytearray()

    def add_tag(self, tag):
        """Append the four byte description of *tag* to the table."""
        out = self.byts
        out.append(tag)
        out.append(self.NUM_VALUES[tag])
        if tag:
            out.append(self.BITMASKS[tag])  # bitmask
            out.append(0)                   # not the end of a control byte
        else:
            out.append(0)                   # the terminator has no bitmask
            out.append(1)                   # eof
    def header(self, control_byte_count):
        """Return the 12 byte TAGX header for the tags added so far."""
        # table length, control byte count
        sizes = pack(b'>II', 12 + len(self.byts), control_byte_count)
        return b'TAGX' + sizes

    def _build(self, tags, control_byte_count):
        # Add all tags, then serialise header + table
        for tag in tags:
            self.add_tag(tag)
        return self.header(control_byte_count) + bytes(self.byts)

    @property
    def periodical(self):
        '''
        TAGX block for the Primary index header of a periodical
        '''
        tags = (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, 73, 0)
        return self._build(tags, 2)

    @property
    def secondary(self):
        '''
        TAGX block for the secondary index header of a periodical
        '''
        return self._build((11, 0), 1)

    @property
    def flat_book(self):
        '''
        TAGX block for the primary index header of a flat book
        '''
        return self._build((1, 2, 3, 4, 0), 1)
# }}}
# Index Entries {{{
class IndexEntry(object):

    """A single entry of a MOBI index record.

    The serialised form (``bytestring``) is the entry's index, one or two
    control bytes, and then the values of every tag present on the entry.
    """

    # Attribute name -> TAGX tag number
    TAG_VALUES = {
            'offset': 1,
            'size': 2,
            'label_offset': 3,
            'depth': 4,
            'class_offset': 5,
            'secondary': 11,
            'parent_index': 21,
            'first_child_index': 22,
            'last_child_index': 23,
            'image_index': 69,
            'desc_offset': 70,
            'author_offset': 71,
    }
    # TAGX tag number -> attribute name
    RTAG_MAP = {v:k for k, v in iteritems(TAG_VALUES)}  # noqa

    def __init__(self, offset, label_offset):
        self.offset = offset
        self.label_offset = label_offset
        self.depth = 0
        self.class_offset = None
        self.control_byte_count = 1
        self.length = 0
        self.index = 0
        # Optional values; None means the tag is absent from the entry
        self.parent_index = None
        self.first_child_index = None
        self.last_child_index = None
        self.image_index = None
        self.author_offset = None
        self.desc_offset = None

    def __repr__(self):
        template = ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
                ' parent_index=%r)')
        return template % (self.offset, self.depth, self.length,
                self.index, self.parent_index)

    @property
    def size(self):
        """Alias for ``length``."""
        return self.length

    @size.setter
    def size(self, val):
        self.length = val

    @property
    def next_offset(self):
        """Offset of the first byte after this entry's content."""
        return self.offset + self.length

    @property
    def tag_nums(self):
        """Yield the tag numbers present in the first control byte."""
        # offset, size, label_offset and depth are always present
        for always_present in (1, 2, 3, 4):
            yield always_present
        optional = ('class_offset', 'parent_index', 'first_child_index',
                'last_child_index')
        for name in optional:
            if getattr(self, name) is not None:
                yield self.TAG_VALUES[name]

    @property
    def entry_type(self):
        """First control byte: the OR of the bitmasks of all present tags."""
        control = 0
        for tag_num in self.tag_nums:
            control |= TAGX.BITMASKS[tag_num]
        return control

    def attr_for_tag(self, tag):
        """Return the attribute name that stores the value of *tag*."""
        return self.RTAG_MAP[tag]

    @property
    def bytestring(self):
        """Serialised form of this entry."""
        second_byte_attrs = ('image_index', 'desc_offset', 'author_offset')
        out = io.BytesIO()

        # The index: hex encoded number, or a length prefixed ASCII string
        index = self.index
        if isinstance(index, numbers.Integral):
            out.write(encode_number_as_hex(index))
        else:
            encoded = bytearray(index.encode('ascii'))
            encoded.insert(0, len(encoded))
            out.write(bytes(encoded))

        # Control byte(s)
        out.write(bytes(bytearray([self.entry_type])))
        if self.control_byte_count == 2:
            flags = 0
            for name in second_byte_attrs:
                if getattr(self, name) is not None:
                    flags |= TAGX.BITMASKS[self.TAG_VALUES[name]]
            out.write(bytes(bytearray([flags])))

        # Values for the first control byte; a tag may carry several values
        for tag_num in self.tag_nums:
            values = getattr(self, self.attr_for_tag(tag_num))
            if isinstance(values, numbers.Integral):
                values = [values]
            for value in values:
                out.write(encint(value))

        # Values for the second control byte
        if self.control_byte_count == 2:
            for name in second_byte_attrs:
                value = getattr(self, name)
                if value is not None:
                    out.write(encint(value))

        return out.getvalue()
class PeriodicalIndexEntry(IndexEntry):

    """Index entry of a periodical: it has a class and two control bytes."""

    def __init__(self, offset, label_offset, class_offset, depth):
        super(PeriodicalIndexEntry, self).__init__(offset, label_offset)
        self.class_offset = class_offset
        self.depth = depth
        self.control_byte_count = 2
class SecondaryIndexEntry(IndexEntry):

    """Entry of the secondary index; it carries only tag 11."""

    # Entry name -> tag number stored as the entry's third value
    INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70,
                'mastheadImage':69}

    def __init__(self, index):
        IndexEntry.__init__(self, 0, 0)
        self.index = index

        tag = self.INDEX_MAP[index]
        lowest_tag = min(itervalues(self.INDEX_MAP))
        # The values for this index entry
        # I don't know what the 5 means, it is not the number of entries
        first_value = 5 if tag == lowest_tag else 0
        self.secondary = [first_value, 0, tag]

    @property
    def tag_nums(self):
        yield 11

    @property
    def entry_type(self):
        return 1

    @classmethod
    def entries(cls):
        """Yield one entry per INDEX_MAP item, highest tag number first."""
        name_for_tag = {tag: name for name, tag in iteritems(cls.INDEX_MAP)}
        for tag in sorted(name_for_tag, reverse=True):
            yield cls(name_for_tag[tag])
# }}}
class TBS(object): # {{{
'''
Take the list of index nodes starting/ending on a record and calculate the
trailing byte sequence for the record.
'''
def __init__(self, data, is_periodical, first=False, section_map={},
after_first=False):
self.section_map = section_map
if is_periodical:
# The starting bytes.
# The value is zero which I think indicates the periodical
# index entry. The values for the various flags seem to be
# unused. If the 0b100 is present, it means that the record
# deals with section 1 (or is the final record with section
# transitions).
self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
flag_size=3)
self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
flag_size=3)
self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
0}, flag_size=3)
if not data:
byts = b''
if after_first:
# This can happen if a record contains only text between
# the periodical start and the first section
byts = self.type_011
self.bytestring = byts
else:
depth_map = defaultdict(list)
for x in ('starts', 'ends', 'completes'):
for idx in data[x]:
depth_map[idx.depth].append(idx)
for l in itervalues(depth_map):
l.sort(key=lambda x:x.offset)
self.periodical_tbs(data, first, depth_map)
else:
if not data:
self.bytestring = b''
else:
self.book_tbs(data, first)
def periodical_tbs(self, data, first, depth_map):
# Builds self.bytestring for a record of a periodical.
# data: mapping with 'starts', 'ends', 'completes' and 'spans' entries.
# depth_map: nodes touching the record grouped by depth; the code reads
# depths 0, 1 and 2 (NOTE(review): presumably periodical, section and
# article respectively, going by the comments below -- confirm).
# `first` is not used in this method.
# NOTE(review): indentation was lost in this copy of the file; the comments
# below describe each statement group without asserting exact nesting.
buf = io.BytesIO()
has_section_start = (depth_map[1] and
set(depth_map[1]).intersection(set(data['starts'])))
spanner = data['spans']
parent_section_index = -1
if depth_map[0]:
# We have a terminal record
# Find the first non periodical node
first_node = None
for nodes in (depth_map[1], depth_map[2]):
for node in nodes:
if (first_node is None or (node.offset, node.depth) <
(first_node.offset, first_node.depth)):
first_node = node
typ = (self.type_110 if has_section_start else self.type_010)
# parent_section_index is needed for the last record
if first_node is not None and first_node.depth > 0:
parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index)
else:
# No section/article node here: fall back to the largest key of
# section_map.
parent_section_index = max(iter(self.section_map))
else:
# Non terminal record
if spanner is not None:
# record is spanned by a single article
parent_section_index = spanner.parent_index
typ = (self.type_110 if parent_section_index == 1 else
self.type_010)
elif not depth_map[1]:
# has only article nodes, i.e. spanned by a section
parent_section_index = depth_map[2][0].parent_index
typ = (self.type_111 if parent_section_index == 1 else
self.type_010)
else:
# has section transitions
if depth_map[2]:
parent_section_index = depth_map[2][0].parent_index
else:
parent_section_index = depth_map[1][0].index
typ = self.type_011
buf.write(typ)
# Types 110/111 need no separate section entry; every other type with a
# positive section index gets one.
if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
extra = {}
# Write starting section information
if spanner is None:
# NOTE(review): this counts depth 1 nodes whose parent_index equals the
# section index, although the variable is called num_articles; confirm
# whether depth_map[2] was intended.
num_articles = len([a for a in depth_map[1] if a.parent_index == parent_section_index])
if not depth_map[1]:
extra = {0b0001: 0}
if num_articles > 1:
extra = {0b0100: num_articles}
buf.write(encode_tbs(parent_section_index, extra))
if spanner is None:
# Group the depth 2 nodes of this record by their parent section, with the
# sections ordered by offset.
articles = depth_map[2]
sections = {self.section_map[a.parent_index] for a in
articles}
sections = sorted(sections, key=lambda x:x.offset)
section_map = {s:[a for a in articles if a.parent_index ==
s.index] for s in sections}
for i, section in enumerate(sections):
# All the articles in this record that belong to section
articles = section_map[section]
first_article = articles[0]
last_article = articles[-1]
num = len(articles)
last_article_ends = (last_article in data['ends'] or
last_article in data['completes'])
# next_sec is None for the last section in the record.
try:
next_sec = sections[i+1]
except:
next_sec = None
extra = {}
if num > 1:
extra[0b0100] = num
# The `False and` below deliberately disables this branch.
if False and i == 0 and next_sec is not None:
# Write offset to next section from start of record
# I can't figure out exactly when Kindlegen decides to
# write this so I have disabled it for now.
extra[0b0001] = next_sec.offset - data['offset']
buf.write(encode_tbs(first_article.index-section.index, extra))
if next_sec is not None:
buf.write(encode_tbs(last_article.index-next_sec.index,
{0b1000: 0}))
# If a section TOC starts and extends into the next record add
# a trailing vwi. We detect this by TBS type==3, processing last
# section present in the record, and the last article in that
# section either ends or completes and doesn't finish
# on the last byte of the record.
elif (typ == self.type_011 and last_article_ends and
((last_article.offset+last_article.size) % RECORD_SIZE > 0)
):
buf.write(encode_tbs(last_article.index-section.index-1,
{0b1000: 0}))
else:
# The record is spanned by one article: store its index relative to its
# parent section.
buf.write(encode_tbs(spanner.index - parent_section_index,
{0b0001: 0}))
self.bytestring = buf.getvalue()
def book_tbs(self, data, first):
    """Set ``self.bytestring`` for a record of an ordinary book.

    *data* maps ``spans``, ``starts``, ``completes`` and ``ends`` to the
    index nodes touching the record. *first* is unused.
    """
    spanner = data['spans']
    if spanner is not None:
        # The whole record lies inside a single node
        self.bytestring = encode_tbs(spanner.index, {0b010: 0, 0b001: 0},
                flag_size=3)
        return

    starts = data['starts']
    completes = data['completes']
    ends = data['ends']

    only_one_start = len(starts) == 1 and not ends
    only_one_end = len(ends) == 1 and not starts
    if not completes and (only_one_start or only_one_end):
        # Exactly one node boundary falls inside the record
        node = starts[0] if starts else ends[0]
        self.bytestring = encode_tbs(node.index, {0b010: 0}, flag_size=3)
        return

    # Several nodes: store the lowest index and the number of nodes
    nodes = []
    for group in (starts, completes, ends):
        nodes.extend(group)
    nodes.sort(key=lambda node: node.index)
    self.bytestring = encode_tbs(nodes[0].index, {0b010:0,
        0b100: len(nodes)}, flag_size=3)
# }}}
class Indexer(object): # {{{
def __init__(self, serializer, number_of_text_records,
size_of_last_text_record, masthead_offset, is_periodical,
opts, oeb):
# Builds all index records for the book and stores them in self.records.
# Raises ValueError when a periodical has no masthead or when the TOC
# yields no index entries.
# NOTE(review): indentation was lost in this copy of the file; the comments
# below describe each statement group without asserting exact nesting.
self.serializer = serializer
self.number_of_text_records = number_of_text_records
# Every text record except the last one is RECORD_SIZE bytes long.
self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
size_of_last_text_record)
self.masthead_offset = masthead_offset
self.secondary_record_offset = None
self.oeb = oeb
self.log = oeb.log
self.opts = opts
self.is_periodical = is_periodical
if self.is_periodical and self.masthead_offset is None:
raise ValueError('Periodicals must have a masthead')
self.log('Generating MOBI index for a %s'%('periodical' if
self.is_periodical else 'book'))
# A periodical whose first TOC node has exactly one child is "flat".
self.is_flat_periodical = False
if self.is_periodical:
periodical_node = next(iter(oeb.toc))
sections = tuple(periodical_node)
self.is_flat_periodical = len(sections) == 1
self.records = []
if self.is_periodical:
# Ensure all articles have an author and description before
# creating the CNCX
# NOTE(review): `_` is not defined in the visible part of this file;
# presumably it is a translation function installed as a builtin.
for node in oeb.toc.iterdescendants():
if node.klass == 'article':
aut, desc = node.author, node.description
if not aut:
aut = _('Unknown')
if not desc:
desc = _('No details available')
node.author, node.description = aut, desc
self.cncx = CNCX(oeb.toc, self.is_periodical)
if self.is_periodical:
self.indices = self.create_periodical_index()
else:
self.indices = self.create_book_index()
if not self.indices:
raise ValueError('No valid entries in TOC, cannot generate index')
# Order matters: the index record is appended first so that create_header,
# which stores len(self.records), sees exactly one record; the header is
# then inserted in front of it and the CNCX records follow.
self.records.append(self.create_index_record())
self.records.insert(0, self.create_header())
self.records.extend(self.cncx.records)
if is_periodical:
# The secondary index (header + one record) follows all primary records.
self.secondary_record_offset = len(self.records)
self.records.append(self.create_header(secondary=True))
self.records.append(self.create_index_record(secondary=True))
self.calculate_trailing_byte_sequences()
def create_index_record(self, secondary=False):  # {{{
    """Return the INDX record holding the serialised index entries.

    The record is a 192 byte header, the aligned entries, and an aligned
    IDXT block listing the offset of every entry. With *secondary* true the
    entries of the secondary index are used instead of ``self.indices``.

    Raises ValueError if the record would exceed 0x10000 bytes.
    """
    header_length = 192
    entries = list(SecondaryIndexEntry.entries()) if secondary else self.indices

    # Write index entries, remembering where each one starts
    offsets = []
    entry_data = io.BytesIO()
    for entry in entries:
        offsets.append(entry_data.tell())
        entry_data.write(entry.bytestring)
    index_block = align_block(entry_data.getvalue())

    # Write offsets to index entries as an IDXT block
    offset_table = b''.join(
        pack(b'>H', header_length + offset) for offset in offsets)
    idxt_block = align_block(b'IDXT' + offset_table)

    header = b''.join((
        b'INDX',
        pack(b'>I', header_length),
        b'\0'*4,  # Unknown
        pack(b'>I', 1),  # Header type? Or index record number?
        b'\0'*4,  # Unknown
        # IDXT block offset
        pack(b'>I', header_length + len(index_block)),
        # Number of index entries
        pack(b'>I', len(offsets)),
        # Unknown
        b'\xff'*8,
        # Unknown
        b'\0'*156,
    ))

    ans = header + index_block + idxt_block
    if len(ans) > 0x10000:
        raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
    return ans
# }}}
def create_header(self, secondary=False):  # {{{
    '''
    Build the INDX header record (192 byte header, TAGX block, a
    description of the last index entry and a short IDXT block).

    :param secondary: If True build the header for the secondary index,
        which uses the secondary TAGX block, the entries from
        ``SecondaryIndexEntry.entries()``, one index record and no CNCX
        records.
    :return: The record as a bytestring, passed through ``align_block``.
    '''
    header_length = 192
    if secondary:
        tagx_block = TAGX().secondary
        num_index_records = 1
        indices = list(SecondaryIndexEntry.entries())
        num_cncx_records = 0
    else:
        tagx = TAGX()
        tagx_block = tagx.periodical if self.is_periodical else tagx.flat_book
        num_index_records = len(self.records)
        indices = self.indices
        num_cncx_records = len(self.cncx.records)

    # The index of the last entry in the NCX, as a length prefixed string
    last = indices[-1].index
    if isinstance(last, numbers.Integral):
        last = encode_number_as_hex(last)
    else:
        last = last.encode('ascii')

    body = b''.join((
        tagx_block,
        bytes(bytearray([len(last)])),
        last,
        pack(b'>H', len(indices)),  # The number of entries in the NCX
    ))
    # Padding: the header is a multiple of 4 bytes long, so padding the
    # body to a multiple of 4 puts the IDXT block on a 4 byte boundary
    body += b'\0' * (-len(body) % 4)
    idxt_offset = header_length + len(body)

    header = b''.join((
        b'INDX',  # Ident 0 - 4
        pack(b'>I', header_length),  # Header length 4 - 8
        b'\0'*8,  # Unknown 8-16
        pack(b'>I', 2),  # Index type: 0 - normal, 2 - inflection 16 - 20
        pack(b'>I', idxt_offset),  # IDXT offset 20-24
        pack(b'>I', num_index_records),  # Number of index records 24-28
        pack(b'>I', 65001),  # Index Encoding 28-32 (utf-8)
        b'\xff'*4,  # Unknown 32-36
        pack(b'>I', len(indices)),  # Number of index entries 36-40
        pack(b'>I', 0),  # ORDT offset 40-44
        pack(b'>I', 0),  # LIGT offset 44-48
        pack(b'>I', 0),  # Number of LIGT entries 48-52
        pack(b'>I', num_cncx_records),  # Number of CNCX records 52-56
        b'\0'*124,  # Unknown 56-180
        pack(b'>I', header_length),  # TAGX offset 180-184
        b'\0'*8,  # Unknown 184-192
    ))

    idxt_block = b''.join((
        b'IDXT',
        pack(b'>H', header_length + len(tagx_block)),
        b'\0',
    ))
    return align_block(b''.join((header, body, idxt_block)))
# }}}
def create_book_index(self):  # {{{
    '''
    Build the flat list of index entries for a book.

    Every TOC node whose href is known to the serializer becomes one
    ``IndexEntry``. Nodes pointing at an offset that was already seen and
    entries that would have a non-positive length are dropped. The
    remaining entries are sorted by offset, each spans up to the next
    entry (the last one up to the end of the body), and they are numbered
    consecutively from 0.

    :return: List of index entries sorted by offset, possibly empty.
    '''
    indices = []
    seen = set()
    id_offsets = self.serializer.id_offsets

    def set_lengths(entries):
        # Every entry extends to the start of the next one, the last entry
        # extends to the end of the body
        last = len(entries) - 1
        for i, entry in enumerate(entries):
            if i < last:
                next_offset = entries[i+1].offset
            else:
                next_offset = self.serializer.body_end_offset
            entry.length = next_offset - entry.offset

    # Flatten toc so that chapter to chapter jumps work with all sub
    # chapter levels as well
    for node in self.oeb.toc.iterdescendants():
        try:
            offset = id_offsets[node.href]
            label = self.cncx[node.title]
        except Exception:
            # Deliberately broad (but no longer catching KeyboardInterrupt
            # or SystemExit): any node that cannot be resolved is skipped
            self.log.warn('TOC item %s [%s] not found in document'%(
                node.title, node.href))
            continue

        if offset in seen:
            continue
        seen.add(offset)

        indices.append(IndexEntry(offset, label))

    indices.sort(key=lambda x:x.offset)

    # Set lengths
    set_lengths(indices)

    # Remove empty indices
    indices = [x for x in indices if x.length > 0]

    # Reset lengths in case any were removed
    set_lengths(indices)

    # Set index values
    for index, x in enumerate(indices):
        x.index = index

    return indices
# }}}
def create_periodical_index(self): # {{{
# Build the flat list of index entries for a periodical. The returned list
# starts with the entry for the periodical itself, followed by all section
# entries and then all article entries. Sections and articles whose href or
# labels cannot be resolved, that duplicate an already seen offset, or that
# end up with a non-positive length are left out.
# Raises ValueError if the resulting layout is not contiguous.
# The first node of the TOC is the periodical; it spans the whole body
periodical_node = next(iter(self.oeb.toc))
periodical_node_offset = self.serializer.body_start_offset
periodical_node_size = (self.serializer.body_end_offset -
periodical_node_offset)
normalized_sections = []
id_offsets = self.serializer.id_offsets
# NOTE(review): the last argument of PeriodicalIndexEntry (0, 1, 2 below)
# looks like the depth of the entry -- confirm against the class definition
periodical = PeriodicalIndexEntry(periodical_node_offset,
self.cncx[periodical_node.title],
self.cncx[periodical_node.klass], 0)
periodical.length = periodical_node_size
periodical.first_child_index = 1
periodical.image_index = self.masthead_offset
seen_sec_offsets = set()
seen_art_offsets = set()
# Collect (section, [articles]) pairs, skipping anything unresolvable
for sec in periodical_node:
normalized_articles = []
try:
offset = id_offsets[sec.href]
label = self.cncx[sec.title]
klass = self.cncx[sec.klass]
except:
continue
if offset in seen_sec_offsets:
continue
seen_sec_offsets.add(offset)
section = PeriodicalIndexEntry(offset, label, klass, 1)
section.parent_index = 0
for art in sec:
try:
offset = id_offsets[art.href]
label = self.cncx[art.title]
klass = self.cncx[art.klass]
except:
continue
if offset in seen_art_offsets:
continue
seen_art_offsets.add(offset)
article = PeriodicalIndexEntry(offset, label, klass, 2)
normalized_articles.append(article)
article.author_offset = self.cncx[art.author]
article.desc_offset = self.cncx[art.description]
if getattr(art, 'toc_thumbnail', None) is not None:
try:
# The serializer image map value minus one is used as the image index
ii = self.serializer.images[art.toc_thumbnail] - 1
if ii > -1:
article.image_index = ii
except KeyError:
pass # Image not found in serializer
# Sections without any usable article are dropped
if normalized_articles:
normalized_articles.sort(key=lambda x:x.offset)
normalized_sections.append((section, normalized_articles))
normalized_sections.sort(key=lambda x:x[0].offset)
# Set lengths
# Each section/article runs up to the next one; the except branches
# handle the last element (IndexError on the [s+1]/[i+1] lookup)
for s, x in enumerate(normalized_sections):
sec, normalized_articles = x
try:
sec.length = normalized_sections[s+1][0].offset - sec.offset
except:
sec.length = self.serializer.body_end_offset - sec.offset
for i, art in enumerate(normalized_articles):
try:
art.length = normalized_articles[i+1].offset - art.offset
except:
art.length = sec.offset + sec.length - art.offset
# Filter
# Drop articles with non-positive length, then sections that have a
# non-positive length or no articles left
for i, x in list(enumerate(normalized_sections)):
sec, normalized_articles = x
normalized_articles = list(filter(lambda x: x.length > 0,
normalized_articles))
normalized_sections[i] = (sec, normalized_articles)
normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
normalized_sections))
# Set indices
# Index 0 is the periodical, sections are numbered first (1..n) and the
# articles continue the numbering after the last section
i = 0
for sec, articles in normalized_sections:
i += 1
sec.index = i
sec.parent_index = 0
for sec, articles in normalized_sections:
for art in articles:
i += 1
art.index = i
art.parent_index = sec.index
for sec, normalized_articles in normalized_sections:
sec.first_child_index = normalized_articles[0].index
sec.last_child_index = normalized_articles[-1].index
# Set lengths again to close up any gaps left by filtering
# NOTE(review): next_offset is read from the entries here; presumably it is
# offset + length, defined on the entry class which is not visible here
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_offset = normalized_sections[s+1][0].offset
except:
next_offset = self.serializer.body_end_offset
sec.length = next_offset - sec.offset
for a, art in enumerate(articles):
try:
next_offset = articles[a+1].offset
except:
next_offset = sec.next_offset
art.length = next_offset - art.offset
# Sanity check
# Every section must end where the next one starts (the last one at the
# end of the body) and every article where the next article starts (the
# last one at the end of its section); nothing may have zero length
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_sec = normalized_sections[s+1][0]
except:
if (sec.length == 0 or sec.next_offset !=
self.serializer.body_end_offset):
raise ValueError('Invalid section layout')
else:
if next_sec.offset != sec.next_offset or sec.length == 0:
raise ValueError('Invalid section layout')
for a, art in enumerate(articles):
try:
next_art = articles[a+1]
except:
if (art.length == 0 or art.next_offset !=
sec.next_offset):
raise ValueError('Invalid article layout')
else:
if art.length == 0 or art.next_offset != next_art.offset:
raise ValueError('Invalid article layout')
# Flatten
# Order of the result: periodical, all sections, all articles
indices = [periodical]
for sec, articles in normalized_sections:
indices.append(sec)
periodical.last_child_index = sec.index
for sec, articles in normalized_sections:
for a in articles:
indices.append(a)
return indices
# }}}
# TBS {{{
def calculate_trailing_byte_sequences(self):
    '''
    Populate ``self.tbs_map`` with one ``TBS`` object per text record,
    keyed by the 1-based record number.

    For every text record the index entries are classified by how they
    overlap the byte range of the record: entries that end in it, that lie
    completely in it, that start in it, and a single entry of the deepest
    level that spans it entirely.
    '''
    self.tbs_map = {}
    depth_one = [i for i in self.indices if i.depth == 1]
    section_map = OrderedDict(
        (i.index, i) for i in sorted(depth_one, key=lambda x:x.offset))
    deepest = max(i.depth for i in self.indices)
    found_node = False

    for rec in range(self.number_of_text_records):
        start = rec * RECORD_SIZE
        end = start + RECORD_SIZE
        record_number = rec + 1
        ends, completes, starts, spans = [], [], [], None

        for index in self.indices:
            if index.offset >= end:
                # Node starts after current record
                if index.depth == deepest:
                    break
                continue
            if index.next_offset <= start:
                # Node ends before current record
                continue

            starts_here = index.offset >= start
            ends_here = index.next_offset <= end
            if starts_here:
                (completes if ends_here else starts).append(index)
            elif ends_here:
                ends.append(index)
            elif index.depth == deepest:
                # Starts before and ends after the current record
                spans = index

        if ends or completes or starts or spans is not None:
            data = {'ends':ends, 'completes':completes, 'starts':starts,
                    'spans':spans, 'offset':start,
                    'record_number':record_number}
            self.tbs_map[record_number] = TBS(data, self.is_periodical,
                    first=not found_node, section_map=section_map)
            found_node = True
        else:
            # No index entry touches this record
            self.tbs_map[record_number] = TBS({}, self.is_periodical,
                    first=False, after_first=found_node,
                    section_map=section_map)
def get_trailing_byte_sequence(self, num):
    '''
    Return the bytestring of the trailing byte sequence stored in
    ``self.tbs_map`` for record number ``num``.

    :raises KeyError: If there is no entry for ``num``.
    '''
    tbs = self.tbs_map[num]
    return tbs.bytestring
# }}}
# }}}

View File

@@ -0,0 +1,480 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import io, random, time
from struct import pack
from calibre.ebooks import normalize
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
align_block, detect_periodical, RECORD_SIZE, create_text_record)
from calibre.ebooks.mobi.writer2.indexer import Indexer
from polyglot.builtins import iteritems, unicode_type, range
# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
# Value stored in header fields that refer to an index which is not present
NULL_INDEX = 0xffffffff
# Fixed contents of the FLIS record that is appended after the content records
FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
b'\xff'*4)
def fcis(text_length):
    '''
    Build the contents of the FCIS record.

    :param text_length: Text length, stored in the record as a big-endian
        unsigned 32 bit integer between two fixed byte sequences.
    :return: The 44 byte record as a bytestring.
    '''
    prefix = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
    suffix = b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
    return b''.join((prefix, pack(b'>I', text_length), suffix))
class MobiWriter(object):
def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
    '''
    :param opts: Conversion options; ``dont_compress`` and
        ``prefer_author_sort`` are read here.
    :param resources: Object providing the image/resource records.
    :param kf8: KF8 data for a joint MOBI 6/KF8 file, or None to write a
        plain MOBI 6 file.
    :param write_page_breaks_after_item: Passed on to the serializer.
    '''
    self.opts, self.resources, self.kf8 = opts, resources, kf8
    # A joint file is written whenever KF8 data is supplied
    self.for_joint = kf8 is not None
    self.write_page_breaks_after_item = write_page_breaks_after_item
    if opts.dont_compress:
        self.compression = UNCOMPRESSED
    else:
        self.compression = PALMDOC
    self.prefer_author_sort = opts.prefer_author_sort
    self.last_text_record_idx = 1
def __call__(self, oeb, path_or_stream):
    '''
    Write ``oeb`` as a MOBI file.

    :param oeb: The book to write.
    :param path_or_stream: Either an object with a ``write`` attribute,
        which is written to directly, or a path that is opened in
        ``'w+b'`` mode.
    '''
    self.log = oeb.log

    # The publication type is the part after the first ':' of the first
    # publication_type metadata entry, lower cased
    pub_type = None
    if oeb.metadata.publication_type:
        parts = unicode_type(oeb.metadata.publication_type[0]).split(':')
        if len(parts) > 1:
            pub_type = parts[1].lower()
    self.publication_type = pub_type

    if not hasattr(path_or_stream, 'write'):
        with open(path_or_stream, 'w+b') as stream:
            return self.dump_stream(oeb, stream)
    return self.dump_stream(oeb, path_or_stream)
def write(self, *args):
    '''
    Write every positional argument, in order, to ``self.stream``.
    '''
    out = self.stream
    for chunk in args:
        out.write(chunk)
def tell(self):
    '''
    Return the current position in ``self.stream``.
    '''
    position = self.stream.tell()
    return position
def dump_stream(self, oeb, stream):
    '''
    Generate all records for ``oeb`` and write them to ``stream``.

    Record 0 is reserved as a placeholder first and filled in once the
    content has been generated: with the joint MOBI 6/KF8 header when
    ``self.for_joint`` is set, otherwise with the plain MOBI 6 header.
    '''
    self.oeb, self.stream = oeb, stream
    self.records = [None]
    self.generate_content()
    if self.for_joint:
        self.generate_joint_record0()
    else:
        self.generate_record0()
    self.write_header()
    self.write_content()
def generate_content(self):
    '''
    Generate the image, text and index records, in that order.
    '''
    toc, log = self.oeb.toc, self.oeb.log
    self.is_periodical = detect_periodical(toc, log)
    steps = (
        # Image records are stored in their own list, they are merged into
        # the main record list at the end
        self.generate_images,
        self.generate_text,
        # The uncrossable breaks trailing entries come before the indexing
        # trailing entries
        self.write_uncrossable_breaks,
        # Index records come after text records
        self.generate_index,
    )
    for step in steps:
        step()
# Indexing {{{
def generate_index(self):
    '''
    Generate the index records and append the indexing trailing data to
    every text record.

    ``self.primary_index_record_idx`` is set to the number of the first
    index record, or left as None if there is no TOC or the index could
    not be generated (the failure is logged, not raised).
    '''
    self.primary_index_record_idx = None
    if self.oeb.toc.count() < 1:
        self.log.warn('No TOC, MOBI index not generated')
        return
    try:
        self.indexer = Indexer(self.serializer, self.last_text_record_idx,
                len(self.records[self.last_text_record_idx]),
                self.masthead_offset, self.is_periodical,
                self.opts, self.oeb)
    except Exception:
        # Best effort: a book without an index is still usable, so only log
        # the failure. KeyboardInterrupt/SystemExit are no longer swallowed.
        self.log.exception('Failed to generate MOBI index:')
        return

    self.primary_index_record_idx = len(self.records)
    # Record 0 is the header, text records start at 1
    for i in range(1, self.last_text_record_idx + 1):
        tbs = self.indexer.get_trailing_byte_sequence(i)
        self.records[i] += encode_trailing_data(tbs)
    self.records.extend(self.indexer.records)
# }}}
def write_uncrossable_breaks(self):  # {{{
    '''
    Write information about uncrossable breaks (non linear items in
    the spine). Does nothing unless WRITE_UNCROSSABLE_BREAKS is set.
    '''
    if WRITE_UNCROSSABLE_BREAKS:
        breaks = self.serializer.breaks

        for i in range(1, self.last_text_record_idx+1):
            record_offset = i * RECORD_SIZE
            running = record_offset
            buf = io.BytesIO()

            # Consume the breaks that lie less than one record size beyond
            # record_offset, storing each as the distance (in units of
            # 8 bytes) from the previous position
            while breaks and (breaks[0] - record_offset) < RECORD_SIZE:
                pbreak = (breaks.pop(0) - running) >> 3
                buf.write(encint(pbreak))
                running += pbreak << 3

            self.records[i] += encode_trailing_data(buf.getvalue())
# }}}
# Images {{{
def generate_images(self):
    '''
    Copy the image map and the masthead/cover/thumbnail offsets from
    ``self.resources``.

    :raises ValueError: If the first resource record is None, i.e. the
        masthead image was not found.
    '''
    res = self.resources
    records = res.records
    self.image_map = res.item_map
    self.masthead_offset = res.masthead_offset
    self.cover_offset = res.cover_offset
    self.thumbnail_offset = res.thumbnail_offset

    masthead_missing = bool(records) and records[0] is None
    if masthead_missing:
        raise ValueError('Failed to find masthead image in manifest')
# }}}
def generate_text(self):  # {{{
    '''
    Serialize the book markup and split it into text records, which are
    appended to ``self.records``.

    Every record consists of the (optionally PalmDoc compressed) data, the
    overlap bytes returned by ``create_text_record`` and a single byte
    holding the number of overlap bytes. Sets ``self.serializer``,
    ``self.text_length``, ``self.last_text_record_idx`` and
    ``self.first_non_text_record_idx``.
    '''
    log = self.oeb.logger
    log.info('Serializing markup content...')
    self.serializer = Serializer(self.oeb, self.image_map,
            self.is_periodical,
            write_page_breaks_after_item=self.write_page_breaks_after_item)
    raw = self.serializer()
    self.text_length = len(raw)
    stream = io.BytesIO(raw)

    if self.compression != UNCOMPRESSED:
        log.info(' Compressing markup content...')
    use_palmdoc = self.compression == PALMDOC

    nrecords = records_size = 0
    while stream.tell() < self.text_length:
        data, overlap = create_text_record(stream)
        if use_palmdoc:
            data = compress_doc(data)
        record = data + overlap + pack(b'>B', len(overlap))
        self.records.append(record)
        records_size += len(record)
        nrecords += 1

    self.last_text_record_idx = nrecords
    self.first_non_text_record_idx = nrecords + 1

    # Pad so that the next records starts at a 4 byte boundary
    # NOTE(review): the pad record is (records_size % 4) bytes long, not
    # 4 - (records_size % 4); confirm this is intended before changing it,
    # as it affects the bytes of every generated file.
    remainder = records_size % 4
    if remainder != 0:
        self.records.append(b'\x00'*remainder)
        self.first_non_text_record_idx += 1
# }}}
def generate_record0(self): # MOBI header {{{
# Build record 0 (PalmDOC header, MOBI header, EXTH header, title and 8KB
# of zero padding) for a plain MOBI 6 file and store it in self.records[0].
# As a side effect the resource records, the FLIS, FCIS and EOF records are
# appended to self.records, since their record numbers are needed in the
# header.
metadata = self.oeb.metadata
# MOBI type, see the table further down. Only changed from the default
# (book) when an index was generated and the indexer reports a periodical
bt = 0x002
if self.primary_index_record_idx is not None:
if False and self.indexer.is_flat_periodical:
# Disabled as setting this to 0x102 causes the Kindle to not
# auto archive the issues
bt = 0x102
elif self.indexer.is_periodical:
# If you change this, remember to change the cdetype in the EXTH
# header as well
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
from calibre.ebooks.mobi.writer8.exth import build_exth
exth = build_exth(metadata,
prefer_author_sort=self.opts.prefer_author_sort,
is_periodical=self.is_periodical,
share_not_sync=self.opts.share_not_sync,
cover_offset=self.cover_offset,
thumbnail_offset=self.thumbnail_offset,
start_offset=self.serializer.start_offset, mobi_doctype=bt
)
# Append the resource records; first_image_record stays None when there
# are no resources
first_image_record = None
if self.resources:
used_images = self.serializer.used_images
first_image_record = len(self.records)
self.resources.serialize(self.records, used_images)
last_content_record = len(self.records) - 1
# FCIS/FLIS (Seems to serve no purpose)
flis_number = len(self.records)
self.records.append(FLIS)
fcis_number = len(self.records)
self.records.append(fcis(self.text_length))
# EOF record
self.records.append(b'\xE9\x8E\x0D\x0A')
record0 = io.BytesIO()
# The PalmDOC header that precedes the MOBI header
record0.write(pack(b'>HHIHHHH',
self.compression, # compression type
0, # Unused
self.text_length, # Text length
self.last_text_record_idx, # Number of text records or last tr idx
RECORD_SIZE, # Text record size
0, # Unused
0 # Unused
)) # 0 - 15 (0x0 - 0xf)
uid = random.randint(0, 0xffffffff)
title = normalize(unicode_type(metadata.title[0])).encode('utf-8')
# All offsets in the comments below are relative to the start of the MOBI
# header, i.e. to the 'MOBI' identifier
# 0x0 - 0x3
record0.write(b'MOBI')
# 0x4 - 0x7 : Length of header
# 0x8 - 0xb : MOBI type
# type meaning
# 0x002 MOBI book (chapter - chapter navigation)
# 0x101 News - Hierarchical navigation with sections and articles
# 0x102 News feed - Flat navigation
# 0x103 News magazine - same as 0x101
# 0xC - 0xF : Text encoding (65001 is utf-8)
# 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version
record0.write(pack(b'>IIIII',
0xe8, bt, 65001, uid, 6))
# 0x18 - 0x1f : Unknown
record0.write(b'\xff' * 8)
# 0x20 - 0x23 : Secondary index record
sir = 0xffffffff
if (self.primary_index_record_idx is not None and
self.indexer.secondary_record_offset is not None):
sir = (self.primary_index_record_idx +
self.indexer.secondary_record_offset)
record0.write(pack(b'>I', sir))
# 0x24 - 0x3f : Unknown
record0.write(b'\xff' * 28)
# 0x40 - 0x43 : Offset of first non-text record
record0.write(pack(b'>I',
self.first_non_text_record_idx))
# 0x44 - 0x4b : title offset, title length
# The title follows the 16 byte PalmDOC header, the 0xe8 byte MOBI header
# and the EXTH header
record0.write(pack(b'>II',
0xe8 + 16 + len(exth), len(title)))
# 0x4c - 0x4f : Language specifier
record0.write(iana2mobi(
unicode_type(metadata.language[0])))
# 0x50 - 0x57 : Input language and Output language
record0.write(b'\0' * 8)
# 0x58 - 0x5b : Format version
# 0x5c - 0x5f : First image record number
# When there are no image records the number of records is written instead
record0.write(pack(b'>II',
6, first_image_record if first_image_record else len(self.records)))
# 0x60 - 0x63 : First HUFF/CDIC record number
# 0x64 - 0x67 : Number of HUFF/CDIC records
# 0x68 - 0x6b : First DATP record number
# 0x6c - 0x6f : Number of DATP records
record0.write(b'\0' * 16)
# 0x70 - 0x73 : EXTH flags
# Bit 6 (0b1000000) being set indicates the presence of an EXTH header
# Bit 12 being set indicates the presence of embedded fonts
# The purpose of the other bits is unknown
exth_flags = 0b1010000
if self.is_periodical:
exth_flags |= 0b1000
if self.resources.has_fonts:
exth_flags |= 0b1000000000000
record0.write(pack(b'>I', exth_flags))
# 0x74 - 0x93 : Unknown
record0.write(b'\0' * 32)
# 0x94 - 0x97 : DRM offset
# 0x98 - 0x9b : DRM count
# 0x9c - 0x9f : DRM size
# 0xa0 - 0xa3 : DRM flags
record0.write(pack(b'>IIII',
0xffffffff, 0xffffffff, 0, 0))
# 0xa4 - 0xaf : Unknown
record0.write(b'\0'*12)
# 0xb0 - 0xb1 : First content record number
# 0xb2 - 0xb3 : last content record number
# (Includes Image, DATP, HUFF, DRM)
record0.write(pack(b'>HH', 1, last_content_record))
# 0xb4 - 0xb7 : Unknown
record0.write(b'\0\0\0\x01')
# 0xb8 - 0xbb : FCIS record number
record0.write(pack(b'>I', fcis_number))
# 0xbc - 0xbf : Unknown (FCIS record count?)
record0.write(pack(b'>I', 1))
# 0xc0 - 0xc3 : FLIS record number
record0.write(pack(b'>I', flis_number))
# 0xc4 - 0xc7 : Unknown (FLIS record count?)
record0.write(pack(b'>I', 1))
# 0xc8 - 0xcf : Unknown
record0.write(b'\0'*8)
# 0xd0 - 0xdf : Unknown
record0.write(pack(b'>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff))
# 0xe0 - 0xe3 : Extra record data
# Extra record data flags:
# - 0b1 : <extra multibyte bytes><size>
# - 0b10 : <TBS indexing description of this HTML record><size>
# - 0b100: <uncrossable breaks><size>
# Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
if WRITE_UNCROSSABLE_BREAKS:
extra_data_flags |= 0b100
record0.write(pack(b'>I', extra_data_flags))
# 0xe4 - 0xe7 : Primary index record
record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
is None else self.primary_index_record_idx))
record0.write(exth)
record0.write(title)
record0 = record0.getvalue()
# Add some buffer so that Amazon can add encryption information if this
# MOBI is submitted for publication
record0 += (b'\0' * (1024*8))
self.records[0] = align_block(record0)
# }}}
def generate_joint_record0(self): # {{{
# Build record 0 for a joint MOBI 6/KF8 file and store it in
# self.records[0]. As a side effect the resource records, the FLIS and
# FCIS records, a BOUNDARY record and all KF8 records are appended to
# self.records. The MOBI 6 header starts from the KF8 header fields and
# overrides those that must differ.
from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
HEADER_FIELDS)
from calibre.ebooks.mobi.writer8.exth import build_exth
# Insert resource records
first_image_record = None
old = len(self.records)
if self.resources:
# Images used by either the MOBI 6 or the KF8 markup are kept
used_images = self.serializer.used_images | self.kf8.used_images
first_image_record = len(self.records)
self.resources.serialize(self.records, used_images)
resource_record_count = len(self.records) - old
last_content_record = len(self.records) - 1
# FCIS/FLIS (Seems to serve no purpose)
flis_number = len(self.records)
self.records.append(FLIS)
fcis_number = len(self.records)
self.records.append(fcis(self.text_length))
# Insert KF8 records
self.records.append(b'BOUNDARY')
kf8_header_index = len(self.records)
# Hand both start offsets (MOBI 6, KF8) to the KF8 object before its
# record0 is read
self.kf8.start_offset = (self.serializer.start_offset,
self.kf8.start_offset)
self.records.append(self.kf8.record0)
self.records.extend(self.kf8.records[1:])
# Without resource records the total number of records is used instead
first_image_record = (first_image_record if first_image_record else
len(self.records))
header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS}
# Now change the header fields that need to be different in the MOBI 6
# header
header_fields['first_resource_record'] = first_image_record
ef = 0b100001010000 # Kindlegen uses this
if self.resources.has_fonts:
ef |= 0b1000000000000
header_fields['exth_flags'] = ef
# In the MOBI 6 header this field holds the first and last content record
# numbers as two 16 bit values
header_fields['fdst_record'] = pack(b'>HH', 1, last_content_record)
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
header_fields['flis_record'] = flis_number
header_fields['fcis_record'] = fcis_number
header_fields['text_length'] = self.text_length
extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
header_fields['extra_data_flags'] = extra_data_flags
# Header field name -> name of the attribute of self that provides it
for k, v in iteritems({'last_text_record':'last_text_record_idx',
'first_non_text_record':'first_non_text_record_idx',
'ncx_index':'primary_index_record_idx',
}):
header_fields[k] = getattr(self, v)
if header_fields['ncx_index'] is None:
header_fields['ncx_index'] = NULL_INDEX
# These indices are always written as NULL_INDEX in the MOBI 6 header
for x in ('skel', 'chunk', 'guide'):
header_fields[x+'_index'] = NULL_INDEX
# Create the MOBI 6 EXTH
opts = self.opts
kuc = 0 if resource_record_count > 0 else None
header_fields['exth'] = build_exth(self.oeb.metadata,
prefer_author_sort=opts.prefer_author_sort,
is_periodical=opts.mobi_periodical,
share_not_sync=opts.share_not_sync,
cover_offset=self.cover_offset,
thumbnail_offset=self.thumbnail_offset,
num_of_resources=resource_record_count,
kf8_unknown_count=kuc, be_kindlegen2=True,
kf8_header_index=kf8_header_index,
start_offset=self.serializer.start_offset,
mobi_doctype=2)
self.records[0] = MOBIHeader(file_version=6)(**header_fields)
# }}}
def write_header(self):  # PalmDB header {{{
    '''
    Write the PalmDB header: the 32 byte database name derived from the
    book title, the fixed header fields and one 8 byte entry (offset,
    attributes, unique id) per record, followed by two zero bytes.
    '''
    raw_title = unicode_type(self.oeb.metadata.title[0])
    name = ascii_filename(raw_title).replace(' ', '_')
    if not isinstance(name, bytes):
        name = name.encode('ascii')
    # At most 31 bytes of the name, NUL padded to 32 bytes
    name = name[:31].ljust(32, b'\0')

    now = int(time.time())
    nrecords = len(self.records)
    dates_and_flags = pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)
    counts = pack(b'>IIH', (2*nrecords)-1, 0, nrecords)
    self.write(name, dates_and_flags, b'BOOK', b'MOBI', counts)

    # The first record starts after the record list (8 bytes per record)
    # and the two trailing zero bytes
    offset = self.tell() + (8 * nrecords) + 2
    for i, record in enumerate(self.records):
        # Record offset, one attribute byte and a 3 byte unique id (2*i)
        unique_id = pack(b'>I', 2*i)[1:]
        self.write(pack(b'>I', offset), b'\0', unique_id)
        offset += len(record)
    self.write(b'\0\0')
# }}}
def write_content(self):
    '''
    Write all records, in order, via ``self.write``.
    '''
    emit = self.write
    for rec in self.records:
        emit(rec)

View File

@@ -0,0 +1,396 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
import unicodedata
from collections import defaultdict
from io import BytesIO
from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.mobi.utils import is_guide_ref_start
from calibre.ebooks.oeb.base import (
OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize
)
from polyglot.builtins import unicode_type, string_or_bytes
from polyglot.urllib import urldefrag
class Buf(BytesIO):
    '''
    A BytesIO that also accepts text, which is encoded as UTF-8 before it
    is written.
    '''

    def write(self, x):
        '''
        Write ``x`` (bytes, or text that is encoded as UTF-8 first).

        :return: The number of bytes written, as ``BytesIO.write`` does.
        '''
        if isinstance(x, unicode_type):
            x = x.encode('utf-8')
        return BytesIO.write(self, x)
class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True):
    '''
    Write all the HTML markup in oeb into a single in memory buffer
    containing a single html document with links replaced by offsets into
    the buffer.

    :param oeb: OEBBook object that encapsulates the document to be
                processed.

    :param images: Mapping of image hrefs (urlnormalized) to image record
                indices.

    :param is_periodical: Whether the book is serialized as a periodical.

    :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
                is written after every element of the spine in ``oeb``.
    '''
    self.oeb = oeb
    self.logger = oeb.logger
    self.is_periodical = is_periodical
    self.write_page_breaks_after_item = write_page_breaks_after_item

    # Map of image hrefs to image index in the MOBI file, and the set of
    # those hrefs that were actually referenced while serializing
    self.images = images
    self.used_images = set()

    # If not None, this is a number pointing to the location at which to
    # open the MOBI file on the Kindle
    self.start_offset = None

    # Mapping of hrefs (urlnormalized) to the offset in the buffer where
    # the resource pointed to by the href lives. Used at the end to fill in
    # the correct values into all filepos="..." links.
    self.id_offsets = {}

    # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
    # where filepos="..." elements are written corresponding to links that
    # point to the href. This is used at the end to fill in the correct values.
    self.href_offsets = defaultdict(list)

    # List of offsets in the buffer of non linear items in the spine. These
    # become uncrossable breaks in the MOBI
    self.breaks = []

    # Must come last, it relies on self.oeb being set
    self.find_blocks()
def find_blocks(self):
    '''
    Mark every item in the spine if it is the start/end of a
    section/article, so that it can be wrapped in divs appropriately.

    Sets the boolean attributes ``is_section_start``, ``is_section_end``,
    ``is_article_start`` and ``is_article_end`` on every spine item. The
    last spine item always ends both a section and an article. An empty
    spine is left untouched.
    '''
    items = list(self.oeb.spine)
    for item in items:
        item.is_section_start = item.is_section_end = False
        item.is_article_start = item.is_article_end = False

    def spine_item(tocitem):
        # The spine item a TOC node points to, or None if there is none
        href = urldefrag(tocitem.href)[0]
        for candidate in items:
            if candidate.href == href:
                return candidate
        return None

    for node in self.oeb.toc.iterdescendants():
        if node.klass != 'section':
            continue
        articles = list(node)
        if not articles:
            continue
        # A section that does not point into the spine is skipped instead
        # of failing with an AttributeError on None
        section_item = spine_item(node)
        if section_item is not None:
            section_item.is_section_start = True
        for article in articles:
            article_item = spine_item(article)
            if article_item is not None:
                article_item.is_article_start = True

    # An article/section ends on the item before the one that starts the
    # next article/section
    in_sec = in_art = False
    prev_item = None
    for item in items:
        if in_art and item.is_article_start is True:
            prev_item.is_article_end = True
            in_art = False
        if in_sec and item.is_section_start is True:
            prev_item.is_section_end = True
            in_sec = False
        if item.is_section_start:
            in_sec = True
        if item.is_article_start:
            in_art = True
        prev_item = item

    # Refer to the last spine item explicitly instead of relying on a
    # leaked loop variable, which is unbound (or a TOC node) when the spine
    # is empty
    if items:
        items[-1].is_section_end = items[-1].is_article_end = True
def __call__(self):
    '''
    Return the document serialized as a single UTF-8 encoded bytestring.
    '''
    self.buf = Buf()
    out = self.buf
    out.write(b'<html>')
    self.serialize_head()
    self.serialize_body()
    out.write(b'</html>')
    self.end_offset = out.tell()
    self.fixup_links()

    no_start_set = self.start_offset is None
    if no_start_set and not self.is_periodical:
        # If we don't set a start offset, the stupid Kindle will
        # open the book at the location of the first IndexEntry, which
        # could be anywhere. So ensure the book is always opened at the
        # beginning, instead.
        self.start_offset = self.body_start_offset
    return out.getvalue()
def serialize_head(self):
    '''
    Write the <head> element. It contains the serialized guide if the
    guide is not empty and is otherwise empty.
    '''
    out = self.buf
    has_guide = len(self.oeb.guide) > 0
    out.write(b'<head>')
    if has_guide:
        self.serialize_guide()
    out.write(b'</head>')
def serialize_guide(self):
    '''
    The Kindle decides where to open a book based on the presence of
    an item in the guide that looks like
    <reference type="text" title="Start" href="chapter-one.xhtml"/>

    Similarly an item with type="toc" controls where the Goto Table of
    Contents operation on the kindle goes.

    Only references that point to a document in the manifest are written.
    '''
    out = self.buf
    hrefs = self.oeb.manifest.hrefs
    out.write(b'<guide>')
    for ref in self.oeb.guide.values():
        path = urldefrag(ref.href)[0]
        is_document = (path in hrefs and
                       hrefs[path].media_type in OEB_DOCS)
        if not is_document:
            continue

        out.write(b'<reference type="')
        ref_type = ref.type
        if ref_type.startswith('other.'):
            ref_type = ref_type.replace('other.','')
        self.serialize_text(ref_type, quot=True)
        out.write(b'" ')

        if ref.title is not None:
            out.write(b'title="')
            self.serialize_text(ref.title, quot=True)
            out.write(b'" ')

        if is_guide_ref_start(ref):
            self._start_href = ref.href
        self.serialize_href(ref.href)
        # Space required or won't work, I kid you not
        out.write(b' />')

    out.write(b'</guide>')
def serialize_href(self, href, base=None):
    '''
    Serialize the href attribute of an <a> or <reference> tag. It is
    serialized as filepos="000000000" and a pointer to its location is
    stored in self.href_offsets so that the correct value can be filled in
    at the end.

    :param href: The link target to serialize.
    :param base: The item relative to which ``href`` is resolved (anything
        with ``abshref()`` and ``href``), or None.
    :return: True if a filepos placeholder was written. False if the href
        cannot be parsed, does not point into the manifest, points to an
        item that is not in the spine, or has no path while no ``base``
        was given.
    '''
    hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urldefrag(urlnormalize(href))
    except ValueError:
        # Unparseable URL
        return False
    if path and base:
        path = base.abshref(path)
    if path and path not in hrefs:
        return False
    item = hrefs[path] if path else None
    if item and item.spine_position is None:
        return False

    if item:
        path = item.href
    elif base is not None:
        # A link without a path (pure fragment) refers to the base item
        path = base.href
    else:
        # Nothing to resolve a pure fragment against; previously this
        # failed with an AttributeError on None
        return False

    href = '#'.join((path, frag)) if frag else path
    buf = self.buf
    buf.write(b'filepos=')
    self.href_offsets[href].append(buf.tell())
    buf.write(b'0000000000')
    return True
def serialize_body(self):
'''
Serialize all items in the spine of the document. Non linear items are
moved to the end.
For periodicals a generated table of contents is written at the start of
the body and before the first item of every section. Records the offsets
of the start and end of the body content in self.body_start_offset and
self.body_end_offset.
'''
buf = self.buf
def serialize_toc_level(tocref, href=None):
# add the provided toc level to the output stream
# if href is provided add a link ref to the toc level output (e.g. feed_0/index.html)
if href is not None:
# resolve the section url in id_offsets
buf.write(b'<mbp:pagebreak />')
self.id_offsets[urlnormalize(href)] = buf.tell()
# The periodical level gets no heading, all other levels are headed by
# their title
if tocref.klass == "periodical":
buf.write(b'<div> <div height="1em"></div>')
else:
t = tocref.title
if isinstance(t, unicode_type):
t = t.encode('utf-8')
buf.write(b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' + t +
b'</b></font></h2> <div height="1em"></div>')
buf.write(b'<ul>')
# One list entry per child node; the filepos placeholder is registered in
# href_offsets so that it can be filled in later
for tocitem in tocref.nodes:
buf.write(b'<li><a filepos=')
itemhref = tocitem.href
if tocref.klass == 'periodical':
# This is a section node.
# For periodical tocs, the section urls are like r'feed_\d+/index.html'
# We don't want to point to the start of the first article
# so we change the href.
itemhref = re.sub(r'article_\d+/', '', itemhref)
self.href_offsets[itemhref].append(buf.tell())
buf.write(b'0000000000')
buf.write(b' ><font size="+1"><b><u>')
t = tocitem.title
if isinstance(t, unicode_type):
t = t.encode('utf-8')
buf.write(t)
buf.write(b'</u></b></font></a></li>')
buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')
self.anchor_offset = buf.tell()
buf.write(b'<body>')
self.body_start_offset = buf.tell()
if self.is_periodical:
top_toc = self.oeb.toc.nodes[0]
serialize_toc_level(top_toc)
# Linear items first, non linear items at the end
spine = [item for item in self.oeb.spine if item.linear]
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
if self.is_periodical and item.is_section_start:
# Write the TOC of the section that starts with this item
for section_toc in top_toc.nodes:
if urlnormalize(item.href) == section_toc.href:
# create section url of the form r'feed_\d+/index.html'
section_url = re.sub(r'article_\d+/', '', section_toc.href)
serialize_toc_level(section_toc, section_url)
# Note that this changes the href of the TOC node itself
section_toc.href = section_url
break
self.serialize_item(item)
self.body_end_offset = buf.tell()
buf.write(b'</body>')
def serialize_item(self, item):
    '''
    Serialize an individual item from the spine of the input document.
    The offset at which the item starts is stored in self.id_offsets and,
    for non linear items, the preceding offset is added to self.breaks.
    '''
    out = self.buf
    start = out.tell()
    if not item.linear:
        self.breaks.append(start - 1)
    self.id_offsets[urlnormalize(item.href)] = start

    # Start markers for periodical sections/articles
    if item.is_section_start:
        out.write(b'<a ></a> ')
    if item.is_article_start:
        out.write(b'<a ></a> <a ></a>')

    body = item.data.find(XHTML('body'))
    for elem in body:
        self.serialize_elem(elem, item)

    if self.write_page_breaks_after_item:
        out.write(b'<mbp:pagebreak/>')

    if item.is_article_end:
        # Kindle periodical article end marker
        out.write(b'<a ></a> <a ></a>')
    if item.is_section_end:
        out.write(b' <a ></a>')
    self.anchor_offset = None
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
    '''
    Serialize elem and, recursively, its children into self.buf.

    Elements whose tag is not a string, or whose namespace is not in
    nsrmap, are skipped entirely. The element's id attribute is removed
    from the tree and recorded in self.id_offsets instead, keyed by the
    normalized "<item href>#<id>".

    :param elem: the element to write
    :param item: the spine item that contains elem
    :param nsrmap: namespace -> prefix map used to filter and prefix
                   tag and attribute names; it is passed on unchanged to
                   child elements
    '''
    buf = self.buf
    if not isinstance(elem.tag, string_or_bytes) \
            or namespace(elem.tag) not in nsrmap:
        return
    tag = prefixname(elem.tag, nsrmap)
    # Previous layers take care of @name
    id_ = elem.attrib.pop('id', None)
    if id_:
        href = '#'.join((item.href, id_))
        offset = self.anchor_offset or buf.tell()
        key = urlnormalize(href)
        # Only set this id_offset if it wasn't previously seen
        self.id_offsets.setdefault(key, offset)
    # An empty, attribute-less <a> that directly follows another anchor
    # position carries no information any more, so it is not written.
    if self.anchor_offset is not None and \
            tag == 'a' and not elem.attrib and \
            not len(elem) and not elem.text:
        return
    self.anchor_offset = buf.tell()
    buf.write(b'<')
    buf.write(tag.encode('utf-8'))
    if elem.attrib:
        for attr, val in elem.attrib.items():
            if namespace(attr) not in nsrmap:
                continue
            attr = prefixname(attr, nsrmap)
            buf.write(b' ')
            if attr == 'href':
                # serialize_href() reports whether it wrote the link
                if self.serialize_href(val, item):
                    continue
            elif attr == 'src':
                href = urlnormalize(item.abshref(val))
                if href in self.images:
                    # Known images are referenced by record index
                    index = self.images[href]
                    self.used_images.add(href)
                    buf.write(b'recindex="%05d"' % index)
                    continue
            buf.write(attr.encode('utf-8'))
            buf.write(b'="')
            self.serialize_text(val, quot=True)
            buf.write(b'"')
    buf.write(b'>')
    if elem.text or len(elem) > 0:
        if elem.text:
            # Real text was written, so the anchor position is stale
            self.anchor_offset = None
            self.serialize_text(elem.text)
        for child in elem:
            # Pass nsrmap on so that a caller-supplied map also applies
            # to descendants, not only to the top-level element.
            self.serialize_elem(child, item, nsrmap)
            if child.tail:
                self.anchor_offset = None
                self.serialize_text(child.tail)
    buf.write(('</%s>' % tag).encode('utf-8'))
def serialize_text(self, text, quot=False):
    '''
    Escape text for markup and append it, UTF-8 encoded, to self.buf.

    "&", "<" and ">" are replaced by entities and soft hyphens are
    dropped. With quot=True double quotes are escaped as well, for use
    inside attribute values.
    '''
    # "&" has to come first so already-produced entities are not escaped
    substitutions = [
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        (u'\u00AD', ''),  # Soft-hyphen
    ]
    if quot:
        substitutions.append(('"', '&quot;'))
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    if isinstance(text, unicode_type):
        text = unicodedata.normalize('NFC', text)
    self.buf.write(text.encode('utf-8'))
def fixup_links(self):
    '''
    Patch every filepos="..." placeholder in the buffer with the offset
    of the content it links to, as recorded in self.id_offsets.

    Links whose target is unknown fall back to the offset of the target
    with its fragment stripped; if that is unknown too the placeholder
    is left untouched.
    '''
    out = self.buf
    known = self.id_offsets
    start_href = getattr(self, '_start_href', None)
    for href, placeholders in self.href_offsets.items():
        is_start = (href and href == start_href)
        if href not in known:
            self.logger.warn('Hyperlink target %r not found' % href)
            # Link to the top of the document, better than just ignoring
            href, _ = urldefrag(href)
        if href not in self.id_offsets:
            continue
        target = self.id_offsets[href]
        if is_start:
            self.start_offset = target
        encoded = ('%010d' % target).encode('utf-8')
        # Overwrite each placeholder in place
        for position in placeholders:
            out.seek(position)
            out.write(encoded)

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import XPath
class CSSCleanup(object):
    '''
    Per-item stylesheet clean-up: drops the height property from the
    computed style of every body element.
    '''

    def __init__(self, log, opts):
        self.log = log
        self.opts = opts

    def __call__(self, item, stylizer):
        document = item.data
        if hasattr(document, 'xpath'):
            # The Kindle touch displays all black pages if the height is
            # set on body
            find_bodies = XPath('//h:body')
            for body in find_bodies(document):
                stylizer.style(body).drop('height')
def remove_duplicate_anchors(oeb):
    '''
    Strip repeated id/name anchors from every parsed spine item.

    Within one item the first occurrence of an anchor value wins; any
    later id or name attribute carrying the same value is removed.
    '''
    # The Kindle apparently has incorrect behavior for duplicate anchors, see
    # https://bugs.launchpad.net/calibre/+bug/1454199
    for item in oeb.spine:
        document = item.data
        if not hasattr(document, 'xpath'):
            continue
        seen = set()
        for tag in document.xpath('//*[@id or @name]'):
            for attr in ('id', 'name'):
                anchor = tag.get(attr)
                if anchor is None:
                    continue
                if anchor not in seen:
                    seen.add(anchor)
                    continue
                oeb.log.debug('Removing duplicate anchor:', anchor)
                tag.attrib.pop(attr)

View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from struct import pack
from io import BytesIO
from calibre.constants import iswindows, isosx
from calibre.ebooks.mobi.utils import (utf8_text, to_base)
from calibre.utils.localization import lang_as_iso639_1
from calibre.ebooks.metadata import authors_to_sort_string
from polyglot.builtins import iteritems, unicode_type
# Maps metadata term names (and a few writer-internal record names) to the
# numeric EXTH record type written into the EXTH header by build_exth().
EXTH_CODES = {
'creator': 100,
'publisher': 101,
'description': 103,
'identifier': 104,
'subject': 105,
'pubdate': 106,
'review': 107,
'contributor': 108,
'rights': 109,
'type': 111,
'source': 112,
'versionnumber': 114,
'startreading': 116,
'kf8_header_index': 121,
'num_of_resources': 125,
'kf8_thumbnail_uri': 129,
'kf8_unknown_count': 131,
'coveroffset': 201,
'thumboffset': 202,
'hasfakecover': 203,
'lastupdatetime': 502,
'title': 503,
'language': 524,
'primary_writing_mode': 525,
'page_progression_direction': 527,
}
# Matches any run of whitespace; build_exth() collapses each run to a single
# space in every metadata value except the description.
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
        share_not_sync=True, cover_offset=None, thumbnail_offset=None,
        start_offset=None, mobi_doctype=2, num_of_resources=None,
        kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None,
        page_progression_direction=None, primary_writing_mode=None):
    '''
    Build the complete EXTH header for a MOBI file and return it as bytes.

    Every record is written as a big-endian (type, length) pair followed by
    the payload, where length includes the 8 bytes of the pair itself.

    :param metadata: OEB metadata; iterated for terms, indexed by term name
    :param prefer_author_sort: write creators in their sort form
    :param is_periodical: write periodical specific records (cdetype from
        mobi_doctype, last update time, periodical creator values)
    :param share_not_sync: when False, additionally write the UUID as
        record 113 and an EBOK cdetype record for books
    :param cover_offset: value of the cover offset record, or None
    :param thumbnail_offset: value of the thumbnail offset record, or None
    :param start_offset: a single offset or a sequence of offsets for the
        start reading record(s); None entries are skipped
    :param mobi_doctype: selects the cdetype written for periodicals
    :param num_of_resources: value of the resource count record, or None
    :param kf8_unknown_count: value of record 131, or None to omit it
    :param be_kindlegen2: write the creator records used for kindlegen 2
    :param kf8_header_index: value of the KF8 header index record, or None
    :param page_progression_direction: 'rtl', 'ltr' or 'default'; any other
        value is ignored
    :param primary_writing_mode: written verbatim when truthy
    :raises ValueError: if metadata has neither a date nor a timestamp
    '''
    exth = BytesIO()
    nrecs = 0

    for term in metadata:
        if term not in EXTH_CODES:
            continue
        code = EXTH_CODES[term]
        items = metadata[term]
        if term == 'creator':
            if prefer_author_sort:
                creators = [authors_to_sort_string([unicode_type(c)]) for c in
                            items]
            else:
                creators = [unicode_type(c) for c in items]
            items = creators
        elif term == 'rights':
            # Only the first rights entry is written; fall back to a
            # placeholder when it cannot be read or encoded
            try:
                rights = utf8_text(unicode_type(metadata.rights[0]))
            except Exception:
                rights = b'Unknown'
            exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
            exth.write(rights)
            nrecs += 1
            continue

        for item in items:
            data = unicode_type(item)
            if term != 'description':
                data = COLLAPSE_RE.sub(' ', data)
            if term == 'identifier':
                # Only ISBN identifiers are written as identifier records
                if data.lower().startswith('urn:isbn:'):
                    data = data[9:]
                elif (getattr(item, 'scheme', None) or '').lower() == 'isbn':
                    # An identifier without a scheme is simply not an ISBN
                    pass
                else:
                    continue
            if term == 'language':
                d2 = lang_as_iso639_1(data)
                if d2:
                    data = d2
            data = utf8_text(data)
            exth.write(pack(b'>II', code, len(data) + 8))
            exth.write(data)
            nrecs += 1

    # Write UUID as ASIN
    uuid = None
    from calibre.ebooks.oeb.base import OPF
    for x in metadata['identifier']:
        # The scheme attribute is optional, treat a missing one as empty
        scheme = x.get(OPF('scheme'), None) or ''
        if (scheme.lower() == 'uuid' or
                unicode_type(x).startswith('urn:uuid:')):
            uuid = unicode_type(x).split(':')[-1]
            break
    if uuid is None:
        from uuid import uuid4
        uuid = unicode_type(uuid4())

    if isinstance(uuid, unicode_type):
        uuid = uuid.encode('utf-8')
    if not share_not_sync:
        exth.write(pack(b'>II', 113, len(uuid) + 8))
        exth.write(uuid)
        nrecs += 1

    # Write UUID as SOURCE
    c_uuid = b'calibre:%s' % uuid
    exth.write(pack(b'>II', 112, len(c_uuid) + 8))
    exth.write(c_uuid)
    nrecs += 1

    # Write cdetype
    if not is_periodical:
        if not share_not_sync:
            exth.write(pack(b'>II', 501, 12))
            exth.write(b'EBOK')
            nrecs += 1
    else:
        ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
        if ids:
            exth.write(pack(b'>II', 501, 12))
            exth.write(ids)
            nrecs += 1

    # Add a publication date entry
    # datestr must exist even when neither branch below is taken, otherwise
    # the check that follows fails with UnboundLocalError instead of the
    # intended ValueError
    datestr = None
    if metadata['date']:
        datestr = unicode_type(metadata['date'][0])
    elif metadata['timestamp']:
        datestr = unicode_type(metadata['timestamp'][0])

    if datestr is None:
        raise ValueError("missing date or timestamp")

    datestr = datestr.encode('utf-8')
    exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
    exth.write(datestr)
    nrecs += 1
    if is_periodical:
        exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
        exth.write(datestr)
        nrecs += 1

    if be_kindlegen2:
        mv = 200 if iswindows else 202 if isosx else 201
        vals = {204:mv, 205:2, 206:9, 207:0}
    elif is_periodical:
        # Pretend to be amazon's super secret periodical generator
        vals = {204:201, 205:2, 206:0, 207:101}
    else:
        # Pretend to be kindlegen 1.2
        vals = {204:201, 205:1, 206:2, 207:33307}
    for code, val in iteritems(vals):
        exth.write(pack(b'>III', code, 12, val))
        nrecs += 1
    if be_kindlegen2:
        revnum = b'0730-890adc2'
        exth.write(pack(b'>II', 535, 8 + len(revnum)) + revnum)
        nrecs += 1

    if cover_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
            cover_offset))
        exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
        nrecs += 2
    if thumbnail_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
            thumbnail_offset))
        thumbnail_uri_str = ('kindle:embed:%s' %(to_base(thumbnail_offset, base=32, min_num_digits=4))).encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['kf8_thumbnail_uri'], len(thumbnail_uri_str) + 8))
        exth.write(thumbnail_uri_str)
        nrecs += 2

    if start_offset is not None:
        # Accept both a single offset and a sequence of offsets
        try:
            len(start_offset)
        except TypeError:
            start_offset = [start_offset]
        for so in start_offset:
            if so is not None:
                exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
                    so))
                nrecs += 1

    if kf8_header_index is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
            kf8_header_index))
        nrecs += 1

    if num_of_resources is not None:
        exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
            num_of_resources))
        nrecs += 1

    if kf8_unknown_count is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
            kf8_unknown_count))
        nrecs += 1

    if primary_writing_mode:
        pwm = primary_writing_mode.encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['primary_writing_mode'], len(pwm) + 8))
        exth.write(pwm)
        nrecs += 1

    if page_progression_direction in {'rtl', 'ltr', 'default'}:
        ppd = page_progression_direction.encode('ascii')
        exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'], len(ppd) + 8))
        exth.write(ppd)
        nrecs += 1

    exth = exth.getvalue()
    trail = len(exth) % 4
    pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
    # The length field covers the 12 byte EXTH preamble plus the records,
    # the padding is appended after it
    exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
    return b''.join(exth)

View File

@@ -0,0 +1,128 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
HTML-TOC-adding transform.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
from calibre.ebooks.oeb.base import element, XPath
from polyglot.builtins import unicode_type
__all__ = ['HTMLTOCAdder']
# Heading used for the generated TOC page when no explicit title is given.
DEFAULT_TITLE = __('Table of Contents')
# Stylesheets for the generated TOC page, keyed by style name. HTMLTOCAdder
# falls back to 'nested' when asked for a style that is not listed here.
STYLE_CSS = {
# Each nesting level is indented further than its parent.
'nested': """
.calibre_toc_header {
text-align: center;
}
.calibre_toc_block {
margin-left: 1.2em;
text-indent: -1.2em;
}
.calibre_toc_block .calibre_toc_block {
margin-left: 2.4em;
}
.calibre_toc_block .calibre_toc_block .calibre_toc_block {
margin-left: 3.6em;
}
""",
# Every entry is centered; top-level entries get extra space above.
'centered': """
.calibre_toc_header {
text-align: center;
}
.calibre_toc_block {
text-align: center;
}
body > .calibre_toc_block {
margin-top: 1.2em;
}
"""
}
class HTMLTOCAdder(object):
    '''
    Transform that makes sure the book has an in-line HTML table of
    contents: an existing one referenced from the guide is added to the
    spine if necessary, otherwise a new page is generated from oeb.toc.
    '''

    def __init__(self, title=None, style='nested', position='end'):
        self.title = title
        self.style = style
        self.position = position

    @classmethod
    def config(cls, cfg):
        group = cfg.add_group('htmltoc', _('HTML TOC generation options.'))
        group('toc_title', ['--toc-title'], default=None,
              help=_('Title for any generated in-line table of contents.'))
        return cfg

    @classmethod
    def generate(cls, opts):
        return cls(title=opts.toc_title)

    def _place_in_spine(self, oeb, item):
        # Non-linear at the end, or linear at the very beginning
        if self.position == 'end':
            oeb.spine.add(item, linear=False)
        else:
            oeb.spine.insert(0, item, linear=True)

    def __call__(self, oeb, context):
        has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

        if 'toc' in oeb.guide:
            # Ensure toc pointed to in <guide> is in spine
            from calibre.ebooks.oeb.base import urlnormalize
            href = urlnormalize(oeb.guide['toc'].href)
            if href not in oeb.manifest.hrefs:
                oeb.guide.remove('toc')
            else:
                item = oeb.manifest.hrefs[href]
                usable = (hasattr(item.data, 'xpath') and
                          XPath('//h:a[@href]')(item.data))
                if usable:
                    if oeb.spine.index(item) < 0:
                        self._place_in_spine(oeb, item)
                    return
                if has_toc:
                    oeb.guide.remove('toc')

        if not has_toc:
            return

        oeb.logger.info('Generating in-line TOC...')
        title = self.title or oeb.translate(DEFAULT_TITLE)
        style = self.style
        if style not in STYLE_CSS:
            oeb.logger.error('Unknown TOC style %r' % style)
            style = 'nested'

        # Stylesheet for the generated page
        css_id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
        oeb.manifest.add(css_id, css_href, CSS_MIME, data=STYLE_CSS[style])

        # The page itself: <html><head/><body><h2/>...blocks...</body></html>
        language = unicode_type(oeb.metadata.language[0])
        contents = element(None, XHTML('html'), nsmap={None: XHTML_NS},
                           attrib={XML('lang'): language})
        head = element(contents, XHTML('head'))
        element(head, XHTML('title')).text = title
        element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME,
                href=css_href)
        body = element(contents, XHTML('body'),
                       attrib={'class': 'calibre_toc'})
        heading = element(body, XHTML('h2'),
                          attrib={'class': 'calibre_toc_header'})
        heading.text = title
        self.add_toc_level(body, oeb.toc)

        page_id, page_href = oeb.manifest.generate('contents', 'contents.xhtml')
        item = oeb.manifest.add(page_id, page_href, XHTML_MIME, data=contents)
        self._place_in_spine(oeb, item)
        oeb.guide.add('toc', 'Table of Contents', page_href)

    def add_toc_level(self, elem, toc):
        # One block per TOC node holding its link; child nodes are nested
        # inside the block of their parent
        for node in toc:
            block = element(elem, XHTML('div'),
                            attrib={'class': 'calibre_toc_block'})
            link_attrs = {'href': node.href, 'class': 'calibre_toc_line'}
            link = element(block, XHTML('a'), attrib=link_attrs)
            link.text = node.title
            self.add_toc_level(block, node)

View File

@@ -0,0 +1,117 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
CSS case-mangling transform.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME
from calibre.ebooks.oeb.base import namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from polyglot.builtins import string_or_bytes
# Stylesheet added to the manifest by CaseMangler.mangle_spine(); the
# calibre_lowercase class is put on the spans that smallcaps_elem() creates.
CASE_MANGLER_CSS = """
.calibre_lowercase {
font-variant: normal;
font-size: 0.65em;
}
"""
# text-transform values that CaseMangler.mangle_elem() applies to the text.
TEXT_TRANSFORMS = {'capitalize', 'uppercase', 'lowercase'}
class CaseMangler(object):
    '''
    Transform that bakes CSS text-transform and font-variant: small-caps
    into the text of the spine documents.
    '''

    @classmethod
    def config(cls, cfg):
        return cfg

    @classmethod
    def generate(cls, opts):
        return cls()

    def __call__(self, oeb, context):
        oeb.logger.info('Applying case-transforming CSS...')
        self.oeb = oeb
        self.opts = context
        self.profile = context.source
        self.mangle_spine()

    def mangle_spine(self):
        manifest = self.oeb.manifest
        css_id, css_href = manifest.generate('manglecase', 'manglecase.css')
        manifest.add(css_id, css_href, CSS_MIME, data=CASE_MANGLER_CSS)
        for item in self.oeb.spine:
            document = item.data
            # Link the helper stylesheet from every spine document
            link_href = item.relhref(css_href)
            etree.SubElement(document.find(XHTML('head')), XHTML('link'),
                             rel='stylesheet', href=link_href, type=CSS_MIME)
            stylizer = Stylizer(document, item.href, self.oeb, self.opts,
                                self.profile)
            self.mangle_elem(document.find(XHTML('body')), stylizer)

    def text_transform(self, transform, text):
        # Any transform other than the three known ones leaves text as is
        if transform == 'lowercase':
            return icu_lower(text)
        if transform == 'uppercase':
            return icu_upper(text)
        if transform == 'capitalize':
            return icu_title(text)
        return text

    def split_text(self, text):
        # Cut text into alternating runs of upper-case characters and
        # everything else, keeping the original order
        runs = []
        current = ''
        in_upper = text[0].isupper()
        for char in text:
            if char.isupper() != in_upper:
                runs.append(current)
                current = char
                in_upper = not in_upper
            else:
                current += char
        runs.append(current)
        return runs

    def smallcaps_elem(self, elem, attr):
        runs = self.split_text(getattr(elem, attr))
        setattr(elem, attr, None)
        # Node after which the next piece goes; None means "the very
        # beginning of elem"
        previous = None
        if attr == 'tail':
            previous = elem
        span_attrib = {'class': 'calibre_lowercase'}
        for run in runs:
            if not run.isupper():
                # Non upper-case run: upper-cased inside a styled span
                span = elem.makeelement(XHTML('span'), attrib=span_attrib)
                span.text = run.upper()
                if previous is None:
                    elem.insert(0, span)
                else:
                    # addnext() moves the tail for some reason
                    kept_tail = previous.tail
                    previous.addnext(span)
                    previous.tail = kept_tail
                    span.tail = None
                previous = span
            elif previous is None:
                elem.text = run
            else:
                previous.tail = run

    def mangle_elem(self, elem, stylizer):
        is_xhtml = (isinstance(elem.tag, string_or_bytes) and
                    namespace(elem.tag) == XHTML_NS)
        if not is_xhtml:
            return
        # Snapshot the children, smallcaps_elem() inserts new elements
        children = list(elem)
        style = stylizer.style(elem)
        transform = style['text-transform']
        variant = style['font-variant']
        do_transform = transform in TEXT_TRANSFORMS
        do_smallcaps = variant == 'small-caps'
        if elem.text:
            if do_transform:
                elem.text = self.text_transform(transform, elem.text)
            if do_smallcaps:
                self.smallcaps_elem(elem, 'text')
        for child in children:
            self.mangle_elem(child, stylizer)
            if not child.tail:
                continue
            if do_transform:
                child.tail = self.text_transform(transform, child.tail)
            if do_smallcaps:
                self.smallcaps_elem(child, 'tail')

View File

@@ -0,0 +1,239 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
SVG rasterization transform.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os, re
from PyQt5.Qt import (
Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer)
from calibre.ebooks.oeb.base import XHTML, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type
from polyglot.urllib import urldefrag
# Qualified names of the XHTML elements that can reference an image.
IMAGE_TAGS = {XHTML('img'), XHTML('object')}
# Attributes SVGRasterizer.rasterize_external() keeps when it turns an
# element into an <img>; every other attribute is deleted.
KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'}
class Unavailable(Exception):
    '''Exception type declared by the SVG rasterization transform.'''
class SVGRasterizer(object):
    '''
    Transform that replaces SVG images with PNG renderings.

    Handles SVG files referenced from <img>/<object> elements, <svg>
    elements embedded in the spine documents and an SVG cover.
    '''

    def __init__(self, base_css=''):
        self.base_css = base_css
        from calibre.gui2 import must_use_qt
        must_use_qt()

    @classmethod
    def config(cls, cfg):
        return cfg

    @classmethod
    def generate(cls, opts):
        return cls()

    def __call__(self, oeb, context):
        oeb.logger.info('Rasterizing SVG images...')
        self.temp_files = []
        self.stylizer_cache = {}
        self.oeb = oeb
        self.opts = context
        self.profile = context.dest
        self.images = {}
        try:
            self.dataize_manifest()
            self.rasterize_spine()
            self.rasterize_cover()
        finally:
            # Clean up the temporary image copies even when rasterization
            # fails half way through
            for pt in self.temp_files:
                try:
                    os.remove(pt)
                except OSError:
                    pass

    def rasterize_svg(self, elem, width=0, height=0, format='PNG'):
        '''
        Render the SVG element elem and return the encoded image data.

        When width or height is non-zero the image is scaled to fit them,
        keeping its aspect ratio.
        '''
        view_box = elem.get('viewBox', elem.get('viewbox', None))
        sizes = None
        logger = self.oeb.logger

        if view_box is not None:
            try:
                box = [float(x) for x in filter(None, re.split('[, ]', view_box))]
                sizes = [box[2]-box[0], box[3] - box[1]]
            except (TypeError, ValueError, IndexError):
                logger.warn('SVG image has invalid viewBox="%s", ignoring the viewBox' % view_box)
            else:
                # Percentage heights of embedded images are resolved
                # against the height of the viewBox
                for image in elem.xpath('descendant::*[local-name()="image" and '
                        '@height and contains(@height, "%")]'):
                    logger.info('Found SVG image height in %, trying to convert...')
                    try:
                        h = float(image.get('height').replace('%', ''))/100.
                        image.set('height', unicode_type(h*sizes[1]))
                    except Exception:
                        logger.exception('Failed to convert percentage height:',
                                image.get('height'))

        data = QByteArray(xml2str(elem, with_tail=False))
        svg = QSvgRenderer(data)
        size = svg.defaultSize()
        # Qt takes integer pixel sizes; convert explicitly rather than
        # relying on implicit float -> int conversion, which newer Python
        # versions no longer perform for C level integer arguments
        if size.width() == 100 and size.height() == 100 and sizes:
            size.setWidth(int(sizes[0]))
            size.setHeight(int(sizes[1]))
        if width or height:
            size.scale(int(width), int(height), Qt.KeepAspectRatio)
        logger.info('Rasterizing %r to %dx%d'
                    % (elem, size.width(), size.height()))
        image = QImage(size, QImage.Format_ARGB32_Premultiplied)
        image.fill(QColor("white").rgb())
        painter = QPainter(image)
        svg.render(painter)
        painter.end()
        array = QByteArray()
        buffer = QBuffer(array)
        buffer.open(QIODevice.WriteOnly)
        image.save(buffer, format)
        return array.data()

    def dataize_manifest(self):
        for item in self.oeb.manifest.values():
            if item.media_type == SVG_MIME and item.data is not None:
                self.dataize_svg(item)

    def dataize_svg(self, item, svg=None):
        '''
        Point every xlink:href in svg that refers to a manifest item at a
        temporary file holding a copy of that item's data.

        svg defaults to item.data; it is modified in place and returned.
        The temporary files are recorded in self.temp_files.
        '''
        if svg is None:
            svg = item.data
        hrefs = self.oeb.manifest.hrefs
        for elem in xpath(svg, '//svg:*[@xl:href]'):
            href = urlnormalize(elem.attrib[XLINK('href')])
            path = urldefrag(href)[0]
            if not path:
                continue
            abshref = item.abshref(path)
            if abshref not in hrefs:
                continue
            linkee = hrefs[abshref]
            data = linkee.bytes_representation
            ext = what(None, data) or 'jpg'
            with PersistentTemporaryFile(suffix='.'+ext) as pt:
                pt.write(data)
                self.temp_files.append(pt.name)
            elem.attrib[XLINK('href')] = pt.name
        return svg

    def stylizer(self, item):
        # One Stylizer per item, created on first use
        ans = self.stylizer_cache.get(item, None)
        if ans is None:
            ans = Stylizer(item.data, item.href, self.oeb, self.opts,
                    self.profile, base_css=self.base_css)
            self.stylizer_cache[item] = ans
        return ans

    def rasterize_spine(self):
        for item in self.oeb.spine:
            self.rasterize_item(item)

    def rasterize_item(self, item):
        html = item.data
        hrefs = self.oeb.manifest.hrefs
        # <img src> and <object data> are handled the same way, only the
        # attribute holding the reference differs
        external = (
            ('//h:img[@src]', 'src'),
            ('//h:object[@type="%s" and @data]' % SVG_MIME, 'data'),
        )
        for expr, attr in external:
            for elem in xpath(html, expr):
                ref = urlnormalize(elem.attrib[attr])
                image = hrefs.get(item.abshref(ref), None)
                if image and image.media_type == SVG_MIME:
                    style = self.stylizer(item).style(elem)
                    self.rasterize_external(elem, style, item, image)
        for elem in xpath(html, '//svg:svg'):
            style = self.stylizer(item).style(elem)
            self.rasterize_inline(elem, style, item)

    def rasterize_inline(self, elem, style, item):
        '''
        Replace the inline <svg> element elem by an <img> that refers to a
        PNG rendering newly added to the manifest.
        '''
        width = style['width']
        height = style['height']
        width = (width / 72) * self.profile.dpi
        height = (height / 72) * self.profile.dpi
        elem = self.dataize_svg(item, elem)
        data = self.rasterize_svg(elem, width, height)
        manifest = self.oeb.manifest
        href = os.path.splitext(item.href)[0] + '.png'
        id, href = manifest.generate(item.id, href)
        manifest.add(id, href, PNG_MIME, data=data)
        img = elem.makeelement(XHTML('img'), src=item.relhref(href))
        elem.getparent().replace(elem, img)
        for prop in ('width', 'height'):
            if prop in elem.attrib:
                img.attrib[prop] = elem.attrib[prop]

    def rasterize_external(self, elem, style, item, svgitem):
        '''
        Turn elem, which refers to the SVG manifest item svgitem, into an
        <img> pointing at a PNG rendering of it.

        Renderings are cached in self.images by (href, width, height), so
        the same SVG at the same size is rasterized only once.
        '''
        width = style['width']
        height = style['height']
        width = (width / 72) * self.profile.dpi
        height = (height / 72) * self.profile.dpi
        data = QByteArray(svgitem.bytes_representation)
        svg = QSvgRenderer(data)
        size = svg.defaultSize()
        # See rasterize_svg() for why the conversion is explicit
        size.scale(int(width), int(height), Qt.KeepAspectRatio)
        key = (svgitem.href, size.width(), size.height())
        if key in self.images:
            href = self.images[key]
        else:
            logger = self.oeb.logger
            logger.info('Rasterizing %r to %dx%d'
                        % (svgitem.href, size.width(), size.height()))
            image = QImage(size, QImage.Format_ARGB32_Premultiplied)
            image.fill(QColor("white").rgb())
            painter = QPainter(image)
            svg.render(painter)
            painter.end()
            array = QByteArray()
            buffer = QBuffer(array)
            buffer.open(QIODevice.WriteOnly)
            image.save(buffer, 'PNG')
            data = array.data()
            manifest = self.oeb.manifest
            href = os.path.splitext(svgitem.href)[0] + '.png'
            id, href = manifest.generate(svgitem.id, href)
            manifest.add(id, href, PNG_MIME, data=data)
            self.images[key] = href
        elem.tag = XHTML('img')
        # Iterate over snapshots, both loops modify what they walk over
        for attr in list(elem.attrib):
            if attr not in KEEP_ATTRS:
                del elem.attrib[attr]
        elem.attrib['src'] = item.relhref(href)
        if elem.text:
            elem.attrib['alt'] = elem.text
            elem.text = None
        for child in list(elem):
            elem.remove(child)

    def rasterize_cover(self):
        '''
        Replace an SVG cover by a PNG rendering sized from the output
        profile. A cover id that is not in the manifest is dropped from the
        metadata instead.
        '''
        covers = self.oeb.metadata.cover
        if not covers:
            return
        if unicode_type(covers[0]) not in self.oeb.manifest.ids:
            self.oeb.logger.warn('Cover not in manifest, skipping.')
            self.oeb.metadata.clear('cover')
            return
        cover = self.oeb.manifest.ids[unicode_type(covers[0])]
        if not cover.media_type == SVG_MIME:
            return
        width = (self.profile.width / 72) * self.profile.dpi
        height = (self.profile.height / 72) * self.profile.dpi
        data = self.rasterize_svg(cover.data, width, height)
        href = os.path.splitext(cover.href)[0] + '.png'
        id, href = self.oeb.manifest.generate(cover.id, href)
        self.oeb.manifest.add(id, href, PNG_MIME, data=data)
        covers[0].value = id

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Used for pdf output for comic2pdf
'''

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import print_function, unicode_literals
import errno
import os
import re
import shutil
import subprocess
import sys
from calibre import CurrentDir, xml_replace_entities, prints
from calibre.constants import (
filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows
)
from calibre.ebooks import ConversionError, DRMError
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.ipc import eintr_retry_call
# Name of the pdftohtml executable; replaced further down by a full path
# when running from a frozen build.
PDFTOHTML = 'pdftohtml'
def popen(cmd, **kw):
    '''
    Start cmd with subprocess.Popen, forwarding the keyword arguments.

    On python 2 every non-bytes argument is encoded with the filesystem
    encoding first. On windows creationflags is always set to 0x08.
    '''
    if not ispy3:
        encoded = []
        for arg in cmd:
            if not isinstance(arg, bytes):
                arg = arg.encode(filesystem_encoding)
            encoded.append(arg)
        cmd = encoded
    if iswindows:
        kw.update(creationflags=0x08)
    return subprocess.Popen(cmd, **kw)
# The checks below replace PDFTOHTML with the full path of a bundled
# executable, based on attributes found on the sys module.
# macOS: sys.frameworks_dir is set; use the binary inside utils.app next to it.
if isosx and hasattr(sys, 'frameworks_dir'):
base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS')
PDFTOHTML = os.path.join(base, PDFTOHTML)
# Frozen windows build: pdftohtml.exe is in sys.extensions_location when
# sys.new_app_layout exists, otherwise next to the running executable.
if iswindows and hasattr(sys, 'frozen'):
base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable)
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
# Frozen linux/bsd build: the binary is in bin/ under sys.executables_location.
if (islinux or isbsd) and getattr(sys, 'frozen', False):
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: directory that receives the output; the conversion
        runs with it as the current directory
    :param pdf_path: path of the PDF file to convert
    :param no_images: pass -i to pdftohtml
    :param as_xml: pass -xml to pdftohtml and write index.xml instead of
        index.html; no post-processing or outline extraction is done then
    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code
    :raises DRMError: if the output file is missing or smaller than 100
        bytes
    '''
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))

    # pdftohtml works on a private copy of the input inside output_dir
    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
               '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is
        # closed again instead of being left to the garbage collector
        with lopen(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Numeric anchors become ids prefixed with "p", and links
                # to them become local fragment links
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: XML for the first page only, written to stdout,
            # which is handed to parse_outline()
            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                   '-nodrm', '-q', '-stdout', a(pdfsrc)]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best effort removal of the private copy of the input
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
def parse_outline(raw, output_dir):
    '''
    Turn the first <outline> element found in the XML raw into a toc.ncx
    file in output_dir.

    Nothing is written when there is no outline or when it yields two or
    fewer entries.
    '''
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    text = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = safe_xml_fromstring(clean_xml_chars(text))
    found = root.xpath('(//outline)[1]')
    if not found:
        return
    from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
    top = TOC()

    def process_node(node, toc):
        # Adds the entries below node to toc and returns how many were added
        added = 0
        for child in node.iterchildren('*'):
            if child.tag == 'outline':
                # A nested outline belongs to the entry that precedes it
                parent = toc.children[-1] if toc.children else toc
                added += process_node(child, parent)
            elif child.text:
                page = child.get('page', '1')
                toc.add(child.text, 'index.html', 'p' + page)
                added += 1
        return added

    total = process_node(found[0], top)
    if total > 2:
        ncx = create_ncx(top, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
        serialized = etree.tostring(ncx, pretty_print=True, with_tail=False,
                                    encoding='utf-8', xml_declaration=True)
        with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
            f.write(serialized)
def flip_image(img, flip):
    '''
    Flip the image file at path img in place.

    The image is mirrored horizontally when flip contains "x" and
    vertically when it contains "y".
    '''
    from calibre.utils.img import flip_image as do_flip
    from calibre.utils.img import image_and_format_from_data, image_to_data
    with lopen(img, 'r+b') as stream:
        image, fmt = image_and_format_from_data(stream.read())
        flipped = do_flip(image, horizontal='x' in flip, vertical='y' in flip)
        # Replace the file contents with the flipped image
        stream.seek(0)
        stream.truncate()
        stream.write(image_to_data(flipped, fmt=fmt))
def flip_images(raw):
    '''
    Flip, on disk, every existing image file that an <img> tag in raw
    marks with a class of xflip, yflip or xyflip.

    Returns raw with all <style> elements removed.
    '''
    tags = (m.group() for m in re.finditer('<IMG[^>]+/?>', raw, flags=re.I))
    for tag in tags:
        flip_match = re.search(r'class="(x|y|xy)flip"', tag)
        if flip_match is None:
            continue
        src_match = re.search(r'src="([^"]+)"', tag)
        if src_match is None:
            continue
        path = src_match.group(1)
        if os.path.exists(path):
            flip_image(path, flip_match.group(1))
    return re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I|re.DOTALL)