mirror of https://github.com/gryf/ebook-converter.git
synced 2025-12-30 14:02:27 +01:00

Added mobi writer files

This commit is contained in:

622  ebook_converter/ebooks/mobi/mobiml.py  Normal file
@@ -0,0 +1,622 @@
from __future__ import absolute_import, division, print_function, unicode_literals

'''
Transform XHTML/OPS-ish content into Mobipocket HTML 3.2.
'''

__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

import copy
import re
import numbers
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.ebooks.mobi.utils import convert_color_for_font_tag
from calibre.utils.imghdr import identify
from polyglot.builtins import unicode_type, string_or_bytes

MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    return '{%s}%s' % (MBP_NS, name)


MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS}
INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'}
HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
# GR: Added 'caption' to both sets
NESTABLE_TAGS = {'ol', 'ul', 'li', 'table', 'tr', 'td', 'th', 'caption'}
TABLE_TAGS = {'table', 'tr', 'td', 'th', 'caption'}

SPECIAL_TAGS = {'hr', 'br'}
CONTENT_TAGS = {'img', 'hr', 'br'}

NOT_VTAGS = HEADER_TAGS | NESTABLE_TAGS | TABLE_TAGS | SPECIAL_TAGS | \
    CONTENT_TAGS
LEAF_TAGS = {'base', 'basefont', 'frame', 'link', 'meta', 'area', 'br',
             'col', 'hr', 'img', 'input', 'param'}
PAGE_BREAKS = {'always', 'left', 'right'}

COLLAPSE = re.compile(r'[ \t\r\n\v]+')


def asfloat(value):
    if not isinstance(value, numbers.Number):
        return 0.0
    return float(value)


def isspace(text):
    if not text:
        return True
    if '\xa0' in text:
        return False
    return text.isspace()
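
# Added note: '\xa0' (non-breaking space) is deliberately treated as
# non-whitespace here -- NBSPs carry meaning in the generated Mobipocket
# markup, so isspace('\xa0') is False while isspace('  \n') is True.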


class BlockState(object):

    def __init__(self, body):
        self.body = body
        self.nested = []
        self.para = None
        self.inline = None
        self.anchor = None
        self.vpadding = 0.
        self.vmargin = 0.
        self.pbreak = False
        self.istate = None
        self.content = False


class FormatState(object):

    def __init__(self):
        self.rendered = False
        self.left = 0.
        self.halign = 'auto'
        self.indent = 0.
        self.fsize = 3
        self.ids = set()
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        self.preserve = False
        self.pre_wrap = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
        self.fgcolor = 'black'
        self.href = None
        self.list_num = 0
        self.attrib = {}

    def __eq__(self, other):
        return self.fsize == other.fsize \
            and self.italic == other.italic \
            and self.bold == other.bold \
            and self.href == other.href \
            and self.preserve == other.preserve \
            and self.pre_wrap == other.pre_wrap \
            and self.family == other.family \
            and self.bgcolor == other.bgcolor \
            and self.fgcolor == other.fgcolor \
            and self.strikethrough == other.strikethrough \
            and self.underline == other.underline

    def __ne__(self, other):
        return not self.__eq__(other)


class MobiMLizer(object):

    def __init__(self, ignore_tables=False):
        self.ignore_tables = ignore_tables

    def __call__(self, oeb, context):
        oeb.logger.info('Converting XHTML to Mobipocket markup...')
        self.oeb = oeb
        self.log = self.oeb.logger
        self.opts = context
        self.profile = profile = context.dest
        self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
        self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
        self.mobimlize_spine()

    def mobimlize_spine(self):
        'Iterate over the spine and convert it to MOBIML'
        for item in self.oeb.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
            body = item.data.find(XHTML('body'))
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            nbody = etree.SubElement(nroot, XHTML('body'))
            self.current_spine_item = item
            self.mobimlize_elem(body, stylizer, BlockState(nbody),
                                [FormatState()])
            item.data = nroot
            # print(etree.tostring(nroot))

    def mobimlize_font(self, ptsize):
        return self.fnums[self.fmap[ptsize]]

    def mobimlize_measure(self, ptsize):
        if isinstance(ptsize, string_or_bytes):
            return ptsize
        embase = self.profile.fbase
        if round(ptsize) < embase:
            return "%dpt" % int(round(ptsize))
        return "%dem" % int(round(ptsize / embase))
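
    # Illustration (added, assuming a profile font base of 16pt):
    # mobimlize_measure(8) -> '8pt' (below one em, kept in points) and
    # mobimlize_measure(32) -> '2em'; string values pass through unchanged.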

    def preize_text(self, text, pre_wrap=False):
        text = unicode_type(text)
        if pre_wrap:
            # Replace n consecutive spaces with n-1 NBSP + space
            text = re.sub(r' {2,}', lambda m: ('\xa0'*(len(m.group())-1) + ' '), text)
        else:
            text = text.replace(' ', '\xa0')

        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        lines = text.split('\n')
        result = lines[:1]
        for line in lines[1:]:
            result.append(etree.Element(XHTML('br')))
            if line:
                result.append(line)
        return result
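
    # Worked example (added): preize_text('a  b\nc', pre_wrap=True) returns
    # ['a\xa0 b', <br/>, 'c'] -- each run of n spaces becomes n-1 NBSPs plus
    # one real space so the renderer can still wrap there, while with
    # pre_wrap=False every space becomes an NBSP and the run is unbreakable.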

    def mobimlize_content(self, tag, text, bstate, istates):
        'Convert text content'
        if text or tag != 'br':
            bstate.content = True
        istate = istates[-1]
        para = bstate.para
        if tag in SPECIAL_TAGS and not text:
            para = para if para is not None else bstate.body
        elif para is None or tag in ('td', 'th'):
            body = bstate.body
            if bstate.pbreak:
                etree.SubElement(body, MBP('pagebreak'))
                bstate.pbreak = False
            bstate.istate = None
            bstate.anchor = None
            parent = bstate.nested[-1] if bstate.nested else bstate.body
            indent = istate.indent
            left = istate.left
            if isinstance(indent, string_or_bytes):
                indent = 0
            if indent < 0 and abs(indent) < left:
                left += indent
                indent = 0
            elif indent != 0 and abs(indent) < self.profile.fbase:
                indent = (indent / abs(indent)) * self.profile.fbase
            if tag in NESTABLE_TAGS and not istate.rendered:
                para = wrapper = etree.SubElement(
                    parent, XHTML(tag), attrib=istate.attrib)
                bstate.nested.append(para)
                if tag == 'li' and len(istates) > 1:
                    istates[-2].list_num += 1
                    para.attrib['value'] = unicode_type(istates[-2].list_num)
            elif tag in NESTABLE_TAGS and istate.rendered:
                para = wrapper = bstate.nested[-1]
            elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
                ems = self.profile.mobi_ems_per_blockquote
                para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
                para = wrapper
                emleft = int(round(left / self.profile.fbase)) - ems
                emleft = min((emleft, 10))
                while emleft > ems / 2:
                    para = etree.SubElement(para, XHTML('blockquote'))
                    emleft -= ems
            else:
                para = wrapper = etree.SubElement(parent, XHTML('p'))
            bstate.inline = bstate.para = para
            vspace = bstate.vpadding + bstate.vmargin
            bstate.vpadding = bstate.vmargin = 0
            if tag not in TABLE_TAGS:
                if tag in ('ul', 'ol') and vspace > 0:
                    wrapper.addprevious(etree.Element(XHTML('div'),
                        height=self.mobimlize_measure(vspace)))
                else:
                    wrapper.attrib['height'] = self.mobimlize_measure(vspace)
                para.attrib['width'] = self.mobimlize_measure(indent)
            elif tag == 'table' and vspace > 0:
                vspace = int(round(vspace / self.profile.fbase))
                while vspace > 0:
                    wrapper.addprevious(etree.Element(XHTML('br')))
                    vspace -= 1
            if istate.halign != 'auto' and isinstance(istate.halign, (bytes, unicode_type)):
                if isinstance(istate.halign, bytes):
                    istate.halign = istate.halign.decode('utf-8')
                para.attrib['align'] = istate.halign
        istate.rendered = True
        pstate = bstate.istate
        if tag in CONTENT_TAGS:
            bstate.inline = para
            pstate = bstate.istate = None
            try:
                etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
            except:
                print('Invalid subelement:', para, tag, istate.attrib)
                raise
        elif tag in TABLE_TAGS:
            para.attrib['valign'] = 'top'
        if istate.ids:
            for id_ in istate.ids:
                anchor = etree.Element(XHTML('a'), attrib={'id': id_})
                if tag == 'li':
                    try:
                        last = bstate.body[-1][-1]
                    except:
                        break
                    last.insert(0, anchor)
                    anchor.tail = last.text
                    last.text = None
                else:
                    last = bstate.body[-1]
                    # We use append instead of addprevious so that inline
                    # anchors in large blocks point to the correct place. See
                    # https://bugs.launchpad.net/calibre/+bug/899831
                    # This could potentially break if inserting an anchor at
                    # this point in the markup is illegal, but I cannot think
                    # of such a case offhand.
                    if barename(last.tag) in LEAF_TAGS:
                        last.addprevious(anchor)
                    else:
                        last.append(anchor)

            istate.ids.clear()
        if not text:
            return
        if not pstate or istate != pstate:
            inline = para
            fsize = istate.fsize
            href = istate.href
            if not href:
                bstate.anchor = None
            elif pstate and pstate.href == href:
                inline = bstate.anchor
            else:
                inline = etree.SubElement(inline, XHTML('a'), href=href)
                bstate.anchor = inline

            if fsize != 3:
                inline = etree.SubElement(inline, XHTML('font'),
                    size=unicode_type(fsize))
            if istate.family == 'monospace':
                inline = etree.SubElement(inline, XHTML('tt'))
            if istate.italic:
                inline = etree.SubElement(inline, XHTML('i'))
            if istate.bold:
                inline = etree.SubElement(inline, XHTML('b'))
            if istate.bgcolor is not None and istate.bgcolor != 'transparent':
                inline = etree.SubElement(inline, XHTML('span'),
                    bgcolor=convert_color_for_font_tag(istate.bgcolor))
            if istate.fgcolor != 'black':
                inline = etree.SubElement(inline, XHTML('font'),
                    color=convert_color_for_font_tag(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
            if istate.underline:
                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
        content = self.preize_text(text, pre_wrap=istate.pre_wrap) if istate.preserve or istate.pre_wrap else [text]
        for item in content:
            if isinstance(item, string_or_bytes):
                if len(inline) == 0:
                    inline.text = (inline.text or '') + item
                else:
                    last = inline[-1]
                    last.tail = (last.tail or '') + item
            else:
                inline.append(item)
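
    # Added note on the blockquote trick above: Mobipocket HTML 3.2 has no
    # dependable margin-left, so a left offset of `left` points is emulated
    # by nesting <blockquote> elements, each assumed to be worth
    # profile.mobi_ems_per_blockquote ems, with the extra nesting capped at
    # 10 ems.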

    def mobimlize_elem(self, elem, stylizer, bstate, istates,
                       ignore_valign=False):
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
                elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = XHTML('a')
            else:
                return
        tag = barename(elem.tag)
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start'])-1
            except:
                pass
        istates.append(istate)
        left = 0
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline') and style['display'] !=
                   'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            bstate.para = None
            istate.halign = style['text-align']
            rawti = style._get('text-indent')
            istate.indent = style['text-indent']
            if hasattr(rawti, 'strip') and '%' in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = ('\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + ('\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.tail = (last.tail or '') + ('\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = style['white-space'] == 'pre'
        istate.pre_wrap = style['white-space'] == 'pre-wrap'
        istate.bgcolor = style['background-color']
        istate.fgcolor = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
                'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) /
                                (72/self.profile.dpi)))
                        except:
                            continue
                        result = unicode_type(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:',
                                         href)
                else:
                    try:
                        width, height = identify(item.data)[1:]
                    except Exception:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                istate.attrib:
                            istate.attrib['width'] = unicode_type(width)
                            istate.attrib['height'] = unicode_type(height)
                        else:
                            ar = width / height
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height'])*ar
                                except:
                                    pass
                                istate.attrib['width'] = unicode_type(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width'])/ar
                                except:
                                    pass
                                istate.attrib['height'] = unicode_type(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
            raww = style._get('width')
            if hasattr(raww, 'strip') and '%' in raww:
                istate.attrib['width'] = raww
            else:
                prop = style['width'] / self.profile.width
                istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                         'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            t = elem.text
            if not t:
                t = ''
            elem.text = '\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = '\u201d' + t
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                  elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
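
        # Added note: the block below emulates CSS vertical-align. Content
        # sitting off the baseline is rendered into a scratch tree (vbstate)
        # and re-parented under <sup><small> or <sub><small>, which is the
        # closest approximation the Mobipocket markup offers.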
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top',
            'text-bottom', 'top', 'bottom') or (
            isinstance(valign, numbers.Number) and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (
            isinstance(valign, numbers.Number) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
            vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
            self.mobimlize_elem(elem, stylizer, vbstate, istates,
                                ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML('small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
            return

        if tag == 'blockquote':
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem) == 0)):
            if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                try:
                    value = int(elem.attrib['value'])
                    istates[-2].list_num = value - 1
                except:
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == '\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML('br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()

891  ebook_converter/ebooks/mobi/writer2/indexer.py  Normal file
@@ -0,0 +1,891 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import numbers
from struct import pack
import io
from collections import OrderedDict, defaultdict

from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
        encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
from polyglot.builtins import filter, iteritems, itervalues, map, range


class CNCX(CNCX_):  # {{{

    def __init__(self, toc, is_periodical):
        strings = []
        for item in toc.iterdescendants(breadth_first=True):
            strings.append(item.title)
            if is_periodical:
                strings.append(item.klass)
                if item.author:
                    strings.append(item.author)
                if item.description:
                    strings.append(item.description)
        CNCX_.__init__(self, strings)
# }}}
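
# Added note: the CNCX_ base class (CNCX from calibre.ebooks.mobi.utils)
# packs the collected strings into CNCX records and acts as a mapping from
# each string to its offset, which is how self.cncx[...] is used below.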


class TAGX(object):  # {{{

    BITMASKS = {11: 0b1}
    BITMASKS.update({x: (1 << i) for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])})
    BITMASKS.update({x: (1 << i) for i, x in enumerate([69, 70, 71, 72, 73])})

    NUM_VALUES = defaultdict(lambda: 1)
    NUM_VALUES[11] = 3
    NUM_VALUES[0] = 0

    def __init__(self):
        self.byts = bytearray()

    def add_tag(self, tag):
        buf = self.byts
        buf.append(tag)
        buf.append(self.NUM_VALUES[tag])
        # bitmask
        buf.append(self.BITMASKS[tag] if tag else 0)
        # eof
        buf.append(0 if tag else 1)

    def header(self, control_byte_count):
        header = b'TAGX'
        # table length, control byte count
        header += pack(b'>II', 12+len(self.byts), control_byte_count)
        return header

    @property
    def periodical(self):
        '''
        TAGX block for the Primary index header of a periodical
        '''
        list(map(self.add_tag, (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72,
            73, 0)))
        return self.header(2) + bytes(self.byts)

    @property
    def secondary(self):
        '''
        TAGX block for the secondary index header of a periodical
        '''
        list(map(self.add_tag, (11, 0)))
        return self.header(1) + bytes(self.byts)

    @property
    def flat_book(self):
        '''
        TAGX block for the primary index header of a flat book
        '''
        list(map(self.add_tag, (1, 2, 3, 4, 0)))
        return self.header(1) + bytes(self.byts)

# }}}
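
# Worked example (added): TAGX().flat_book yields b'TAGX', the table length
# (12 header bytes + 4 bytes per tag = 32) and a control byte count of 1,
# followed by one (tag, num_values, bitmask, eof) row per tag:
#   01 01 01 00 | 02 01 02 00 | 03 01 04 00 | 04 01 08 00 | 00 00 00 01
# The final row, with eof == 1, terminates the table.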

# Index Entries {{{

class IndexEntry(object):

    TAG_VALUES = {
        'offset': 1,
        'size': 2,
        'label_offset': 3,
        'depth': 4,
        'class_offset': 5,
        'secondary': 11,
        'parent_index': 21,
        'first_child_index': 22,
        'last_child_index': 23,
        'image_index': 69,
        'desc_offset': 70,
        'author_offset': 71,
    }
    RTAG_MAP = {v: k for k, v in iteritems(TAG_VALUES)}  # noqa

    def __init__(self, offset, label_offset):
        self.offset, self.label_offset = offset, label_offset
        self.depth, self.class_offset = 0, None
        self.control_byte_count = 1

        self.length = 0
        self.index = 0

        self.parent_index = None
        self.first_child_index = None
        self.last_child_index = None

        self.image_index = None
        self.author_offset = None
        self.desc_offset = None

    def __repr__(self):
        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
                ' parent_index=%r)')%(self.offset, self.depth, self.length,
                                      self.index, self.parent_index)

    @property
    def size(self):
        return self.length

    @size.setter
    def size(self, val):
        self.length = val

    @property
    def next_offset(self):
        return self.offset + self.length

    @property
    def tag_nums(self):
        for i in range(1, 5):
            yield i
        for attr in ('class_offset', 'parent_index', 'first_child_index',
                     'last_child_index'):
            if getattr(self, attr) is not None:
                yield self.TAG_VALUES[attr]

    @property
    def entry_type(self):
        ans = 0
        for tag in self.tag_nums:
            ans |= TAGX.BITMASKS[tag]
        return ans

    def attr_for_tag(self, tag):
        return self.RTAG_MAP[tag]
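
    # Added note: the serialized entry below is the index name (the entry
    # number as hex digits, or a length-prefixed ASCII label), one byte of
    # entry_type flags, an optional second control byte when
    # control_byte_count == 2, then every tag value as a MOBI variable-width
    # integer written by encint().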
    @property
    def bytestring(self):
        buf = io.BytesIO()
        if isinstance(self.index, numbers.Integral):
            buf.write(encode_number_as_hex(self.index))
        else:
            raw = bytearray(self.index.encode('ascii'))
            raw.insert(0, len(raw))
            buf.write(bytes(raw))
        et = self.entry_type
        buf.write(bytes(bytearray([et])))

        if self.control_byte_count == 2:
            flags = 0
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    tag = self.TAG_VALUES[attr]
                    bm = TAGX.BITMASKS[tag]
                    flags |= bm
            buf.write(bytes(bytearray([flags])))

        for tag in self.tag_nums:
            attr = self.attr_for_tag(tag)
            val = getattr(self, attr)
            if isinstance(val, numbers.Integral):
                val = [val]
            for x in val:
                buf.write(encint(x))

        if self.control_byte_count == 2:
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    buf.write(encint(val))

        ans = buf.getvalue()
        return ans


class PeriodicalIndexEntry(IndexEntry):

    def __init__(self, offset, label_offset, class_offset, depth):
        IndexEntry.__init__(self, offset, label_offset)
        self.depth = depth
        self.class_offset = class_offset
        self.control_byte_count = 2


class SecondaryIndexEntry(IndexEntry):

    INDEX_MAP = {'author': 73, 'caption': 72, 'credit': 71, 'description': 70,
                 'mastheadImage': 69}

    def __init__(self, index):
        IndexEntry.__init__(self, 0, 0)
        self.index = index

        tag = self.INDEX_MAP[index]

        # The values for this index entry
        # I don't know what the 5 means, it is not the number of entries
        self.secondary = [5 if tag == min(
            itervalues(self.INDEX_MAP)) else 0, 0, tag]

    @property
    def tag_nums(self):
        yield 11

    @property
    def entry_type(self):
        return 1

    @classmethod
    def entries(cls):
        rmap = {v: k for k, v in iteritems(cls.INDEX_MAP)}
        for tag in sorted(rmap, reverse=True):
            yield cls(rmap[tag])

# }}}


class TBS(object):  # {{{

    '''
    Take the list of index nodes starting/ending on a record and calculate the
    trailing byte sequence for the record.
    '''

    def __init__(self, data, is_periodical, first=False, section_map={},
                 after_first=False):
        self.section_map = section_map

        if is_periodical:
            # The starting bytes.
            # The value is zero which I think indicates the periodical
            # index entry. The values for the various flags seem to be
            # unused. If the 0b100 is present, it means that the record
            # deals with section 1 (or is the final record with section
            # transitions).
            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
                                       flag_size=3)
            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
                                       flag_size=3)
            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001: 0},
                                       flag_size=3)

            if not data:
                byts = b''
                if after_first:
                    # This can happen if a record contains only text between
                    # the periodical start and the first section
                    byts = self.type_011
                self.bytestring = byts
            else:
                depth_map = defaultdict(list)
                for x in ('starts', 'ends', 'completes'):
                    for idx in data[x]:
                        depth_map[idx.depth].append(idx)
                for l in itervalues(depth_map):
                    l.sort(key=lambda x: x.offset)
                self.periodical_tbs(data, first, depth_map)
        else:
            if not data:
                self.bytestring = b''
            else:
                self.book_tbs(data, first)

    def periodical_tbs(self, data, first, depth_map):
        buf = io.BytesIO()

        has_section_start = (depth_map[1] and
                set(depth_map[1]).intersection(set(data['starts'])))
        spanner = data['spans']
        parent_section_index = -1

        if depth_map[0]:
            # We have a terminal record

            # Find the first non periodical node
            first_node = None
            for nodes in (depth_map[1], depth_map[2]):
                for node in nodes:
                    if (first_node is None or (node.offset, node.depth) <
                            (first_node.offset, first_node.depth)):
                        first_node = node

            typ = (self.type_110 if has_section_start else self.type_010)

            # parent_section_index is needed for the last record
            if first_node is not None and first_node.depth > 0:
                parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index)
            else:
                parent_section_index = max(iter(self.section_map))

        else:
            # Non terminal record

            if spanner is not None:
                # record is spanned by a single article
                parent_section_index = spanner.parent_index
                typ = (self.type_110 if parent_section_index == 1 else
                       self.type_010)
            elif not depth_map[1]:
                # has only article nodes, i.e. spanned by a section
                parent_section_index = depth_map[2][0].parent_index
                typ = (self.type_111 if parent_section_index == 1 else
                       self.type_010)
            else:
                # has section transitions
                if depth_map[2]:
                    parent_section_index = depth_map[2][0].parent_index
                else:
                    parent_section_index = depth_map[1][0].index
                typ = self.type_011

        buf.write(typ)

        if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
            extra = {}
            # Write starting section information
            if spanner is None:
                num_articles = len([a for a in depth_map[2] if a.parent_index == parent_section_index])
                if not depth_map[1]:
                    extra = {0b0001: 0}
                if num_articles > 1:
                    extra = {0b0100: num_articles}
            buf.write(encode_tbs(parent_section_index, extra))

        if spanner is None:
            articles = depth_map[2]
            sections = {self.section_map[a.parent_index] for a in
                        articles}
            sections = sorted(sections, key=lambda x: x.offset)
            section_map = {s: [a for a in articles if a.parent_index ==
                           s.index] for s in sections}
            for i, section in enumerate(sections):
                # All the articles in this record that belong to section
                articles = section_map[section]
                first_article = articles[0]
                last_article = articles[-1]
                num = len(articles)
                last_article_ends = (last_article in data['ends'] or
                                     last_article in data['completes'])

                try:
                    next_sec = sections[i+1]
                except:
                    next_sec = None

                extra = {}
                if num > 1:
                    extra[0b0100] = num
                if False and i == 0 and next_sec is not None:
                    # Write offset to next section from start of record
                    # I can't figure out exactly when Kindlegen decides to
                    # write this so I have disabled it for now.
                    extra[0b0001] = next_sec.offset - data['offset']

                buf.write(encode_tbs(first_article.index-section.index, extra))

                if next_sec is not None:
                    buf.write(encode_tbs(last_article.index-next_sec.index,
                                         {0b1000: 0}))

                # If a section TOC starts and extends into the next record add
                # a trailing vwi. We detect this by TBS type==3, processing last
                # section present in the record, and the last article in that
                # section either ends or completes and doesn't finish
                # on the last byte of the record.
                elif (typ == self.type_011 and last_article_ends and
                      ((last_article.offset+last_article.size) % RECORD_SIZE > 0)):
                    buf.write(encode_tbs(last_article.index-section.index-1,
                                         {0b1000: 0}))

        else:
            buf.write(encode_tbs(spanner.index - parent_section_index,
                                 {0b0001: 0}))

        self.bytestring = buf.getvalue()

    def book_tbs(self, data, first):
        spanner = data['spans']
        if spanner is not None:
            self.bytestring = encode_tbs(spanner.index, {0b010: 0, 0b001: 0},
                                         flag_size=3)
        else:
            starts, completes, ends = (data['starts'], data['completes'],
                                       data['ends'])
            if (not completes and (
                    (len(starts) == 1 and not ends) or (len(ends) == 1 and not
                     starts))):
                node = starts[0] if starts else ends[0]
                self.bytestring = encode_tbs(node.index, {0b010: 0}, flag_size=3)
            else:
                nodes = []
                for x in (starts, completes, ends):
                    nodes.extend(x)
                nodes.sort(key=lambda x: x.index)
                self.bytestring = encode_tbs(nodes[0].index, {0b010: 0,
                                             0b100: len(nodes)}, flag_size=3)

# }}}


class Indexer(object):  # {{{

    def __init__(self, serializer, number_of_text_records,
                 size_of_last_text_record, masthead_offset, is_periodical,
                 opts, oeb):
        self.serializer = serializer
        self.number_of_text_records = number_of_text_records
        self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
                          size_of_last_text_record)
        self.masthead_offset = masthead_offset
        self.secondary_record_offset = None

        self.oeb = oeb
        self.log = oeb.log
        self.opts = opts

        self.is_periodical = is_periodical
        if self.is_periodical and self.masthead_offset is None:
            raise ValueError('Periodicals must have a masthead')

        self.log('Generating MOBI index for a %s'%('periodical' if
            self.is_periodical else 'book'))
        self.is_flat_periodical = False
        if self.is_periodical:
            periodical_node = next(iter(oeb.toc))
            sections = tuple(periodical_node)
            self.is_flat_periodical = len(sections) == 1

        self.records = []

        if self.is_periodical:
            # Ensure all articles have an author and description before
            # creating the CNCX
            for node in oeb.toc.iterdescendants():
                if node.klass == 'article':
                    aut, desc = node.author, node.description
                    if not aut:
                        aut = _('Unknown')
                    if not desc:
                        desc = _('No details available')
                    node.author, node.description = aut, desc

        self.cncx = CNCX(oeb.toc, self.is_periodical)

        if self.is_periodical:
            self.indices = self.create_periodical_index()
        else:
            self.indices = self.create_book_index()

        if not self.indices:
            raise ValueError('No valid entries in TOC, cannot generate index')

        self.records.append(self.create_index_record())
        self.records.insert(0, self.create_header())
        self.records.extend(self.cncx.records)

        if is_periodical:
            self.secondary_record_offset = len(self.records)
            self.records.append(self.create_header(secondary=True))
            self.records.append(self.create_index_record(secondary=True))

        self.calculate_trailing_byte_sequences()

    def create_index_record(self, secondary=False):  # {{{
        header_length = 192
        buf = io.BytesIO()
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices

        # Write index entries
        offsets = []
        for i in indices:
            offsets.append(buf.tell())
            buf.write(i.bytestring)

        index_block = align_block(buf.getvalue())

        # Write offsets to index entries as an IDXT block
        idxt_block = b'IDXT'
        buf.seek(0), buf.truncate(0)
        for offset in offsets:
            buf.write(pack(b'>H', header_length+offset))
        idxt_block = align_block(idxt_block + buf.getvalue())
        body = index_block + idxt_block

        header = b'INDX'
        buf.seek(0), buf.truncate(0)
        buf.write(pack(b'>I', header_length))
        buf.write(b'\0'*4)  # Unknown
        buf.write(pack(b'>I', 1))  # Header type? Or index record number?
        buf.write(b'\0'*4)  # Unknown
        # IDXT block offset
        buf.write(pack(b'>I', header_length + len(index_block)))
        # Number of index entries
        buf.write(pack(b'>I', len(offsets)))
        # Unknown
        buf.write(b'\xff'*8)
        # Unknown
        buf.write(b'\0'*156)

        header += buf.getvalue()

        ans = header + body
        if len(ans) > 0x10000:
            raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
        return ans
    # }}}
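
    # Added recap: an index record is a 192 byte INDX header, the
    # concatenated index entries (block aligned), then an IDXT block holding
    # a big-endian 16-bit offset to each entry; the finished record must stay
    # under 0x10000 bytes.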

    def create_header(self, secondary=False):  # {{{
        buf = io.BytesIO()
        if secondary:
            tagx_block = TAGX().secondary
        else:
            tagx_block = (TAGX().periodical if self.is_periodical else
                          TAGX().flat_book)
        header_length = 192

        # Ident 0 - 4
        buf.write(b'INDX')

        # Header length 4 - 8
        buf.write(pack(b'>I', header_length))

        # Unknown 8-16
        buf.write(b'\0'*8)

        # Index type: 0 - normal, 2 - inflection 16 - 20
        buf.write(pack(b'>I', 2))

        # IDXT offset 20-24
        buf.write(pack(b'>I', 0))  # Filled in later

        # Number of index records 24-28
        buf.write(pack(b'>I', 1 if secondary else len(self.records)))

        # Index Encoding 28-32
        buf.write(pack(b'>I', 65001))  # utf-8

        # Unknown 32-36
        buf.write(b'\xff'*4)

        # Number of index entries 36-40
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices
        buf.write(pack(b'>I', len(indices)))

        # ORDT offset 40-44
        buf.write(pack(b'>I', 0))

        # LIGT offset 44-48
        buf.write(pack(b'>I', 0))

        # Number of LIGT entries 48-52
        buf.write(pack(b'>I', 0))

        # Number of CNCX records 52-56
        buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records)))

        # Unknown 56-180
        buf.write(b'\0'*124)

        # TAGX offset 180-184
        buf.write(pack(b'>I', header_length))

        # Unknown 184-192
        buf.write(b'\0'*8)

        # TAGX block
        buf.write(tagx_block)

        num = len(indices)

        # The index of the last entry in the NCX
        idx = indices[-1].index
        if isinstance(idx, numbers.Integral):
            idx = encode_number_as_hex(idx)
        else:
            idx = idx.encode('ascii')
            idx = (bytes(bytearray([len(idx)]))) + idx
        buf.write(idx)

        # The number of entries in the NCX
        buf.write(pack(b'>H', num))

        # Padding
        pad = (4 - (buf.tell()%4))%4
        if pad:
            buf.write(b'\0'*pad)

        idxt_offset = buf.tell()

        buf.write(b'IDXT')
        buf.write(pack(b'>H', header_length + len(tagx_block)))
        buf.write(b'\0')
        buf.seek(20)
        buf.write(pack(b'>I', idxt_offset))

        return align_block(buf.getvalue())
    # }}}
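
    # Added note: the IDXT offset at bytes 20-24 above is first written as 0
    # and back-patched via buf.seek(20) once the TAGX block and last-entry
    # footer are in place, since its value depends on their lengths.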

    def create_book_index(self):  # {{{
        indices = []
        seen = set()
        id_offsets = self.serializer.id_offsets

        # Flatten toc so that chapter to chapter jumps work with all sub
        # chapter levels as well
        for node in self.oeb.toc.iterdescendants():
            try:
                offset = id_offsets[node.href]
                label = self.cncx[node.title]
            except:
                self.log.warn('TOC item %s [%s] not found in document'%(
                    node.title, node.href))
                continue

            if offset in seen:
                continue
            seen.add(offset)

            indices.append(IndexEntry(offset, label))

        indices.sort(key=lambda x: x.offset)

        # Set lengths
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Remove empty indices
        indices = [x for x in indices if x.length > 0]

        # Reset lengths in case any were removed
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Set index values
        for index, x in enumerate(indices):
            x.index = index

        return indices

    # }}}

    def create_periodical_index(self):  # {{{
        periodical_node = next(iter(self.oeb.toc))
        periodical_node_offset = self.serializer.body_start_offset
        periodical_node_size = (self.serializer.body_end_offset -
                periodical_node_offset)

        normalized_sections = []

        id_offsets = self.serializer.id_offsets

        periodical = PeriodicalIndexEntry(periodical_node_offset,
                self.cncx[periodical_node.title],
                self.cncx[periodical_node.klass], 0)
        periodical.length = periodical_node_size
        periodical.first_child_index = 1
        periodical.image_index = self.masthead_offset

        seen_sec_offsets = set()
        seen_art_offsets = set()

        for sec in periodical_node:
            normalized_articles = []
            try:
                offset = id_offsets[sec.href]
                label = self.cncx[sec.title]
                klass = self.cncx[sec.klass]
            except:
                continue
            if offset in seen_sec_offsets:
                continue

            seen_sec_offsets.add(offset)
            section = PeriodicalIndexEntry(offset, label, klass, 1)
            section.parent_index = 0

            for art in sec:
                try:
                    offset = id_offsets[art.href]
                    label = self.cncx[art.title]
                    klass = self.cncx[art.klass]
                except:
                    continue
                if offset in seen_art_offsets:
                    continue
                seen_art_offsets.add(offset)
                article = PeriodicalIndexEntry(offset, label, klass, 2)
                normalized_articles.append(article)
                article.author_offset = self.cncx[art.author]
                article.desc_offset = self.cncx[art.description]
                if getattr(art, 'toc_thumbnail', None) is not None:
                    try:
                        ii = self.serializer.images[art.toc_thumbnail] - 1
                        if ii > -1:
                            article.image_index = ii
                    except KeyError:
                        pass  # Image not found in serializer

            if normalized_articles:
                normalized_articles.sort(key=lambda x: x.offset)
                normalized_sections.append((section, normalized_articles))

        normalized_sections.sort(key=lambda x: x[0].offset)

        # Set lengths
        for s, x in enumerate(normalized_sections):
            sec, normalized_articles = x
            try:
                sec.length = normalized_sections[s+1][0].offset - sec.offset
            except:
                sec.length = self.serializer.body_end_offset - sec.offset
            for i, art in enumerate(normalized_articles):
                try:
                    art.length = normalized_articles[i+1].offset - art.offset
                except:
                    art.length = sec.offset + sec.length - art.offset

        # Filter
        for i, x in list(enumerate(normalized_sections)):
            sec, normalized_articles = x
            normalized_articles = list(filter(lambda x: x.length > 0,
                normalized_articles))
            normalized_sections[i] = (sec, normalized_articles)

        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
            normalized_sections))

        # Set indices
        i = 0
        for sec, articles in normalized_sections:
            i += 1
            sec.index = i
            sec.parent_index = 0

        for sec, articles in normalized_sections:
            for art in articles:
                i += 1
                art.index = i
                art.parent_index = sec.index

        for sec, normalized_articles in normalized_sections:
            sec.first_child_index = normalized_articles[0].index
            sec.last_child_index = normalized_articles[-1].index

        # Set lengths again to close up any gaps left by filtering
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_offset = normalized_sections[s+1][0].offset
            except:
                next_offset = self.serializer.body_end_offset
            sec.length = next_offset - sec.offset

            for a, art in enumerate(articles):
                try:
                    next_offset = articles[a+1].offset
                except:
                    next_offset = sec.next_offset
                art.length = next_offset - art.offset

        # Sanity check
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_sec = normalized_sections[s+1][0]
            except:
                if (sec.length == 0 or sec.next_offset !=
                        self.serializer.body_end_offset):
                    raise ValueError('Invalid section layout')
            else:
                if next_sec.offset != sec.next_offset or sec.length == 0:
                    raise ValueError('Invalid section layout')
            for a, art in enumerate(articles):
                try:
                    next_art = articles[a+1]
                except:
                    if (art.length == 0 or art.next_offset !=
                            sec.next_offset):
                        raise ValueError('Invalid article layout')
                else:
                    if art.length == 0 or art.next_offset != next_art.offset:
                        raise ValueError('Invalid article layout')

        # Flatten
        indices = [periodical]
        for sec, articles in normalized_sections:
            indices.append(sec)
            periodical.last_child_index = sec.index

        for sec, articles in normalized_sections:
            for a in articles:
                indices.append(a)

        return indices
    # }}}

    # TBS {{{
    def calculate_trailing_byte_sequences(self):
        self.tbs_map = {}
        found_node = False
        sections = [i for i in self.indices if i.depth == 1]
        section_map = OrderedDict((i.index, i) for i in
                                  sorted(sections, key=lambda x: x.offset))

        deepest = max(i.depth for i in self.indices)

        for i in range(self.number_of_text_records):
            offset = i * RECORD_SIZE
            next_offset = offset + RECORD_SIZE
            data = {'ends': [], 'completes': [], 'starts': [],
                    'spans': None, 'offset': offset, 'record_number': i+1}

            for index in self.indices:

                if index.offset >= next_offset:
                    # Node starts after current record
                    if index.depth == deepest:
                        break
                    else:
                        continue
                if index.next_offset <= offset:
                    # Node ends before current record
                    continue
                if index.offset >= offset:
                    # Node starts in current record
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['completes'].append(index)
                    else:
                        data['starts'].append(index)
                else:
                    # Node starts before current record
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['ends'].append(index)
                    elif index.depth == deepest:
                        data['spans'] = index

            if (data['ends'] or data['completes'] or data['starts'] or
                    data['spans'] is not None):
                self.tbs_map[i+1] = TBS(data, self.is_periodical,
                                        first=not found_node,
                                        section_map=section_map)
                found_node = True
            else:
                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
                                        after_first=found_node,
                                        section_map=section_map)

    def get_trailing_byte_sequence(self, num):
        return self.tbs_map[num].bytestring
    # }}}

# }}}

480  ebook_converter/ebooks/mobi/writer2/main.py  Normal file
@@ -0,0 +1,480 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import io, random, time
from struct import pack

from calibre.ebooks import normalize
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
        align_block, detect_periodical, RECORD_SIZE, create_text_record)
from calibre.ebooks.mobi.writer2.indexer import Indexer
from polyglot.builtins import iteritems, unicode_type, range

# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
NULL_INDEX = 0xffffffff

FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
        b'\xff'*4)


def fcis(text_length):
    fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
    fcis += pack(b'>I', text_length)
    fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
    return fcis
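
# Added note: FLIS and FCIS are fixed, boilerplate records kept only for
# compatibility (see the "Seems to serve no purpose" comment where they are
# appended); only the text length embedded in FCIS varies per book.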


class MobiWriter(object):

    def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
        self.opts = opts
        self.resources = resources
        self.kf8 = kf8
        self.for_joint = kf8 is not None
        self.write_page_breaks_after_item = write_page_breaks_after_item
        self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
        self.prefer_author_sort = opts.prefer_author_sort
        self.last_text_record_idx = 1

    def __call__(self, oeb, path_or_stream):
        self.log = oeb.log
        pt = None
        if oeb.metadata.publication_type:
            x = unicode_type(oeb.metadata.publication_type[0]).split(':')
            if len(x) > 1:
                pt = x[1].lower()
        self.publication_type = pt

        if hasattr(path_or_stream, 'write'):
            return self.dump_stream(oeb, path_or_stream)
        with open(path_or_stream, 'w+b') as stream:
            return self.dump_stream(oeb, stream)

    def write(self, *args):
        for datum in args:
            self.stream.write(datum)

    def tell(self):
        return self.stream.tell()

    def dump_stream(self, oeb, stream):
        self.oeb = oeb
        self.stream = stream
        self.records = [None]
        self.generate_content()
        self.generate_joint_record0() if self.for_joint else self.generate_record0()
        self.write_header()
        self.write_content()

    def generate_content(self):
        self.is_periodical = detect_periodical(self.oeb.toc, self.oeb.log)
        # Image records are stored in their own list, they are merged into the
        # main record list at the end
        self.generate_images()
        self.generate_text()
        # The uncrossable breaks trailing entries come before the indexing
        # trailing entries
        self.write_uncrossable_breaks()
        # Index records come after text records
        self.generate_index()

    # Indexing {{{
    def generate_index(self):
        self.primary_index_record_idx = None
        if self.oeb.toc.count() < 1:
            self.log.warn('No TOC, MOBI index not generated')
            return
        try:
            self.indexer = Indexer(self.serializer, self.last_text_record_idx,
                                   len(self.records[self.last_text_record_idx]),
                                   self.masthead_offset, self.is_periodical,
                                   self.opts, self.oeb)
        except:
            self.log.exception('Failed to generate MOBI index:')
        else:
            self.primary_index_record_idx = len(self.records)
            for i in range(self.last_text_record_idx + 1):
                if i == 0:
                    continue
                tbs = self.indexer.get_trailing_byte_sequence(i)
                self.records[i] += encode_trailing_data(tbs)
            self.records.extend(self.indexer.records)

    # }}}

    def write_uncrossable_breaks(self):  # {{{
        '''
        Write information about uncrossable breaks (non linear items in
        the spine).
        '''
        if not WRITE_UNCROSSABLE_BREAKS:
            return

        breaks = self.serializer.breaks

        for i in range(1, self.last_text_record_idx+1):
            offset = i * RECORD_SIZE
            pbreak = 0
            running = offset

            buf = io.BytesIO()

            while breaks and (breaks[0] - offset) < RECORD_SIZE:
                pbreak = (breaks.pop(0) - running) >> 3
                encoded = encint(pbreak)
                buf.write(encoded)
                running += pbreak << 3
            encoded = encode_trailing_data(buf.getvalue())
            self.records[i] += encoded
    # }}}

    # Images {{{

    def generate_images(self):
        resources = self.resources
        image_records = resources.records
        self.image_map = resources.item_map
        self.masthead_offset = resources.masthead_offset
        self.cover_offset = resources.cover_offset
        self.thumbnail_offset = resources.thumbnail_offset

        if image_records and image_records[0] is None:
            raise ValueError('Failed to find masthead image in manifest')

    # }}}
def generate_text(self): # {{{
|
||||
self.oeb.logger.info('Serializing markup content...')
|
||||
self.serializer = Serializer(self.oeb, self.image_map,
|
||||
self.is_periodical,
|
||||
write_page_breaks_after_item=self.write_page_breaks_after_item)
|
||||
text = self.serializer()
|
||||
self.text_length = len(text)
|
||||
text = io.BytesIO(text)
|
||||
nrecords = 0
|
||||
records_size = 0
|
||||
|
||||
if self.compression != UNCOMPRESSED:
|
||||
self.oeb.logger.info(' Compressing markup content...')
|
||||
|
||||
while text.tell() < self.text_length:
|
||||
data, overlap = create_text_record(text)
|
||||
if self.compression == PALMDOC:
|
||||
data = compress_doc(data)
|
||||
|
||||
data += overlap
|
||||
data += pack(b'>B', len(overlap))
|
||||
|
||||
self.records.append(data)
|
||||
records_size += len(data)
|
||||
nrecords += 1
|
||||
|
||||
self.last_text_record_idx = nrecords
|
||||
self.first_non_text_record_idx = nrecords + 1
|
||||
# Pad so that the next records starts at a 4 byte boundary
|
||||
if records_size % 4 != 0:
|
||||
self.records.append(b'\x00'*(records_size % 4))
|
||||
self.first_non_text_record_idx += 1
|
||||
# }}}
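
create_text_record() itself is not part of this diff; the overlap handling above suggests it returns a RECORD_SIZE chunk plus the continuation bytes of any UTF-8 character split at the record boundary, which are then appended after the (possibly compressed) record together with a one-byte count. A hedged sketch of that idea, under those assumptions (split_record_sketch and RECORD_SIZE_SKETCH are illustrative names, not the real API):

RECORD_SIZE_SKETCH = 4096

def split_record_sketch(stream):
    data = stream.read(RECORD_SIZE_SKETCH)
    overlap = b''
    # UTF-8 continuation bytes look like 0b10xxxxxx; pull in up to three of
    # them so a multibyte character split at the boundary stays decodable
    extra = stream.read(3)
    for ch in bytearray(extra):
        if ch & 0xc0 == 0x80:
            overlap += bytes(bytearray([ch]))
        else:
            break
    # put back whatever was not consumed as overlap
    stream.seek(-(len(extra) - len(overlap)), 1)
    return data, overlap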

    def generate_record0(self):  # MOBI header {{{
        metadata = self.oeb.metadata
        bt = 0x002
        if self.primary_index_record_idx is not None:
            if False and self.indexer.is_flat_periodical:
                # Disabled as setting this to 0x102 causes the Kindle to not
                # auto archive the issues
                bt = 0x102
            elif self.indexer.is_periodical:
                # If you change this, remember to change the cdetype in the EXTH
                # header as well
                bt = 0x103 if self.indexer.is_flat_periodical else 0x101

        from calibre.ebooks.mobi.writer8.exth import build_exth
        exth = build_exth(metadata,
                          prefer_author_sort=self.opts.prefer_author_sort,
                          is_periodical=self.is_periodical,
                          share_not_sync=self.opts.share_not_sync,
                          cover_offset=self.cover_offset,
                          thumbnail_offset=self.thumbnail_offset,
                          start_offset=self.serializer.start_offset, mobi_doctype=bt
                          )
        first_image_record = None
        if self.resources:
            used_images = self.serializer.used_images
            first_image_record = len(self.records)
            self.resources.serialize(self.records, used_images)
        last_content_record = len(self.records) - 1

        # FCIS/FLIS (Seems to serve no purpose)
        flis_number = len(self.records)
        self.records.append(FLIS)
        fcis_number = len(self.records)
        self.records.append(fcis(self.text_length))

        # EOF record
        self.records.append(b'\xE9\x8E\x0D\x0A')

        record0 = io.BytesIO()
        # The MOBI Header
        record0.write(pack(b'>HHIHHHH',
                           self.compression,  # compression type
                           0,  # Unused
                           self.text_length,  # Text length
                           self.last_text_record_idx,  # Number of text records or last tr idx
                           RECORD_SIZE,  # Text record size
                           0,  # Unused
                           0  # Unused
                           ))  # 0 - 15 (0x0 - 0xf)
        uid = random.randint(0, 0xffffffff)
        title = normalize(unicode_type(metadata.title[0])).encode('utf-8')

        # 0x0 - 0x3
        record0.write(b'MOBI')

        # 0x4 - 0x7   : Length of header
        # 0x8 - 0xb   : MOBI type
        #   type    meaning
        #   0x002   MOBI book (chapter - chapter navigation)
        #   0x101   News - Hierarchical navigation with sections and articles
        #   0x102   News feed - Flat navigation
        #   0x103   News magazine - same as 0x101
        # 0xC - 0xF   : Text encoding (65001 is utf-8)
        # 0x10 - 0x13 : UID
        # 0x14 - 0x17 : Generator version

        record0.write(pack(b'>IIIII',
                           0xe8, bt, 65001, uid, 6))

        # 0x18 - 0x1f : Unknown
        record0.write(b'\xff' * 8)

        # 0x20 - 0x23 : Secondary index record
        sir = 0xffffffff
        if (self.primary_index_record_idx is not None and
                self.indexer.secondary_record_offset is not None):
            sir = (self.primary_index_record_idx +
                   self.indexer.secondary_record_offset)
        record0.write(pack(b'>I', sir))

        # 0x24 - 0x3f : Unknown
        record0.write(b'\xff' * 28)

        # 0x40 - 0x43 : Offset of first non-text record
        record0.write(pack(b'>I',
                           self.first_non_text_record_idx))

        # 0x44 - 0x4b : Title offset, title length
        record0.write(pack(b'>II',
                           0xe8 + 16 + len(exth), len(title)))

        # 0x4c - 0x4f : Language specifier
        record0.write(iana2mobi(
            unicode_type(metadata.language[0])))

        # 0x50 - 0x57 : Input language and Output language
        record0.write(b'\0' * 8)

        # 0x58 - 0x5b : Format version
        # 0x5c - 0x5f : First image record number
        record0.write(pack(b'>II',
                           6, first_image_record if first_image_record else len(self.records)))

        # 0x60 - 0x63 : First HUFF/CDIC record number
        # 0x64 - 0x67 : Number of HUFF/CDIC records
        # 0x68 - 0x6b : First DATP record number
        # 0x6c - 0x6f : Number of DATP records
        record0.write(b'\0' * 16)

        # 0x70 - 0x73 : EXTH flags
        # Bit 6 (0b1000000) being set indicates the presence of an EXTH header
        # Bit 12 being set indicates the presence of embedded fonts
        # The purpose of the other bits is unknown
        exth_flags = 0b1010000
        if self.is_periodical:
            exth_flags |= 0b1000
        if self.resources.has_fonts:
            exth_flags |= 0b1000000000000
        record0.write(pack(b'>I', exth_flags))

        # 0x74 - 0x93 : Unknown
        record0.write(b'\0' * 32)

        # 0x94 - 0x97 : DRM offset
        # 0x98 - 0x9b : DRM count
        # 0x9c - 0x9f : DRM size
        # 0xa0 - 0xa3 : DRM flags
        record0.write(pack(b'>IIII',
                           0xffffffff, 0xffffffff, 0, 0))

        # 0xa4 - 0xaf : Unknown
        record0.write(b'\0'*12)

        # 0xb0 - 0xb1 : First content record number
        # 0xb2 - 0xb3 : last content record number
        # (Includes Image, DATP, HUFF, DRM)
        record0.write(pack(b'>HH', 1, last_content_record))

        # 0xb4 - 0xb7 : Unknown
        record0.write(b'\0\0\0\x01')

        # 0xb8 - 0xbb : FCIS record number
        record0.write(pack(b'>I', fcis_number))

        # 0xbc - 0xbf : Unknown (FCIS record count?)
        record0.write(pack(b'>I', 1))

        # 0xc0 - 0xc3 : FLIS record number
        record0.write(pack(b'>I', flis_number))

        # 0xc4 - 0xc7 : Unknown (FLIS record count?)
        record0.write(pack(b'>I', 1))

        # 0xc8 - 0xcf : Unknown
        record0.write(b'\0'*8)

        # 0xd0 - 0xdf : Unknown
        record0.write(pack(b'>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff))

        # 0xe0 - 0xe3 : Extra record data
        # Extra record data flags:
        #   - 0b1  : <extra multibyte bytes><size>
        #   - 0b10 : <TBS indexing description of this HTML record><size>
        #   - 0b100: <uncrossable breaks><size>
        # Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
        extra_data_flags = 0b1  # Has multibyte overlap bytes
        if self.primary_index_record_idx is not None:
            extra_data_flags |= 0b10
        if WRITE_UNCROSSABLE_BREAKS:
            extra_data_flags |= 0b100
        record0.write(pack(b'>I', extra_data_flags))

        # 0xe4 - 0xe7 : Primary index record
        record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
                           is None else self.primary_index_record_idx))

        record0.write(exth)
        record0.write(title)
        record0 = record0.getvalue()
        # Add some buffer so that Amazon can add encryption information if this
        # MOBI is submitted for publication
        record0 += (b'\0' * (1024*8))
        self.records[0] = align_block(record0)
    # }}}
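
A small read-back sketch that checks the layout documented in the comments above; it relies only on the struct formats that generate_record0() itself writes (the first 16 bytes are the PalmDoc header, the MOBI header follows at byte 16):

from struct import unpack

def check_record0_sketch(record0):
    compression, _, text_length, last_text_record, record_size, _, _ = \
        unpack(b'>HHIHHHH', record0[:16])
    mobi = record0[16:]                    # MOBI header, offsets as commented
    assert mobi[0:4] == b'MOBI'
    header_length, mobi_type, encoding = unpack(b'>III', mobi[4:16])
    assert header_length == 0xe8 and encoding == 65001
    return compression, text_length, last_text_record, record_size, mobi_type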

    def generate_joint_record0(self):  # {{{
        from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
                                                      HEADER_FIELDS)
        from calibre.ebooks.mobi.writer8.exth import build_exth

        # Insert resource records
        first_image_record = None
        old = len(self.records)
        if self.resources:
            used_images = self.serializer.used_images | self.kf8.used_images
            first_image_record = len(self.records)
            self.resources.serialize(self.records, used_images)
        resource_record_count = len(self.records) - old
        last_content_record = len(self.records) - 1

        # FCIS/FLIS (Seems to serve no purpose)
        flis_number = len(self.records)
        self.records.append(FLIS)
        fcis_number = len(self.records)
        self.records.append(fcis(self.text_length))

        # Insert KF8 records
        self.records.append(b'BOUNDARY')
        kf8_header_index = len(self.records)
        self.kf8.start_offset = (self.serializer.start_offset,
                                 self.kf8.start_offset)
        self.records.append(self.kf8.record0)
        self.records.extend(self.kf8.records[1:])

        first_image_record = (first_image_record if first_image_record else
                              len(self.records))

        header_fields = {k: getattr(self.kf8, k) for k in HEADER_FIELDS}

        # Now change the header fields that need to be different in the MOBI 6
        # header
        header_fields['first_resource_record'] = first_image_record
        ef = 0b100001010000  # Kindlegen uses this
        if self.resources.has_fonts:
            ef |= 0b1000000000000
        header_fields['exth_flags'] = ef
        header_fields['fdst_record'] = pack(b'>HH', 1, last_content_record)
        header_fields['fdst_count'] = 1  # Why not 0? Kindlegen uses 1
        header_fields['flis_record'] = flis_number
        header_fields['fcis_record'] = fcis_number
        header_fields['text_length'] = self.text_length
        extra_data_flags = 0b1  # Has multibyte overlap bytes
        if self.primary_index_record_idx is not None:
            extra_data_flags |= 0b10
        header_fields['extra_data_flags'] = extra_data_flags

        for k, v in iteritems({'last_text_record': 'last_text_record_idx',
                               'first_non_text_record': 'first_non_text_record_idx',
                               'ncx_index': 'primary_index_record_idx',
                               }):
            header_fields[k] = getattr(self, v)
        if header_fields['ncx_index'] is None:
            header_fields['ncx_index'] = NULL_INDEX

        for x in ('skel', 'chunk', 'guide'):
            header_fields[x+'_index'] = NULL_INDEX

        # Create the MOBI 6 EXTH
        opts = self.opts
        kuc = 0 if resource_record_count > 0 else None

        header_fields['exth'] = build_exth(self.oeb.metadata,
                                           prefer_author_sort=opts.prefer_author_sort,
                                           is_periodical=opts.mobi_periodical,
                                           share_not_sync=opts.share_not_sync,
                                           cover_offset=self.cover_offset,
                                           thumbnail_offset=self.thumbnail_offset,
                                           num_of_resources=resource_record_count,
                                           kf8_unknown_count=kuc, be_kindlegen2=True,
                                           kf8_header_index=kf8_header_index,
                                           start_offset=self.serializer.start_offset,
                                           mobi_doctype=2)
        self.records[0] = MOBIHeader(file_version=6)(**header_fields)

    # }}}

    def write_header(self):  # PalmDB header {{{
        '''
        Write the PalmDB header
        '''
        title = ascii_filename(unicode_type(self.oeb.metadata.title[0])).replace(
            ' ', '_')
        if not isinstance(title, bytes):
            title = title.encode('ascii')
        title = title[:31]
        title = title + (b'\0' * (32 - len(title)))
        now = int(time.time())
        nrecords = len(self.records)
        self.write(title, pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
                   b'BOOK', b'MOBI', pack(b'>IIH', (2*nrecords)-1, 0, nrecords))
        offset = self.tell() + (8 * nrecords) + 2
        for i, record in enumerate(self.records):
            self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
            offset += len(record)
        self.write(b'\0\0')
    # }}}
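
The offset arithmetic above can be checked by hand: the fixed PalmDB preamble is 32 (name) + 28 (pack '>HHIIIIII') + 8 (b'BOOK' + b'MOBI') + 10 (pack '>IIH') = 78 bytes, followed by one 8-byte entry per record plus the final 2 pad bytes, which is exactly the self.tell() + (8 * nrecords) + 2 computed above. As a one-line sketch:

def first_record_offset_sketch(nrecords):
    return 78 + 8 * nrecords + 2

assert first_record_offset_sketch(1) == 88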

    def write_content(self):
        for record in self.records:
            self.write(record)

396
ebook_converter/ebooks/mobi/writer2/serializer.py
Normal file
@@ -0,0 +1,396 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


import re
import unicodedata
from collections import defaultdict
from io import BytesIO

from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.mobi.utils import is_guide_ref_start
from calibre.ebooks.oeb.base import (
    OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize
)
from polyglot.builtins import unicode_type, string_or_bytes
from polyglot.urllib import urldefrag


class Buf(BytesIO):

    def write(self, x):
        if isinstance(x, unicode_type):
            x = x.encode('utf-8')
        BytesIO.write(self, x)


class Serializer(object):
    NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}

    def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True):
        '''
        Write all the HTML markup in oeb into a single in memory buffer
        containing a single html document with links replaced by offsets into
        the buffer.

        :param oeb: OEBBook object that encapsulates the document to be
                    processed.

        :param images: Mapping of image hrefs (urlnormalized) to image record
                       indices.

        :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
                                             is written after every element of
                                             the spine in ``oeb``.
        '''
        self.oeb = oeb
        # Map of image hrefs to image index in the MOBI file
        self.images = images
        self.used_images = set()
        self.logger = oeb.logger
        self.is_periodical = is_periodical
        self.write_page_breaks_after_item = write_page_breaks_after_item

        # If not None, this is a number pointing to the location at which to
        # open the MOBI file on the Kindle
        self.start_offset = None

        # Mapping of hrefs (urlnormalized) to the offset in the buffer where
        # the resource pointed to by the href lives. Used at the end to fill in
        # the correct values into all filepos="..." links.
        self.id_offsets = {}

        # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
        # where filepos="..." elements are written corresponding to links that
        # point to the href. This is used at the end to fill in the correct values.
        self.href_offsets = defaultdict(list)

        # List of offsets in the buffer of non linear items in the spine. These
        # become uncrossable breaks in the MOBI
        self.breaks = []

        self.find_blocks()

    def find_blocks(self):
        '''
        Mark every item in the spine if it is the start/end of a
        section/article, so that it can be wrapped in divs appropriately.
        '''
        for item in self.oeb.spine:
            item.is_section_start = item.is_section_end = False
            item.is_article_start = item.is_article_end = False

        def spine_item(tocitem):
            href = urldefrag(tocitem.href)[0]
            for item in self.oeb.spine:
                if item.href == href:
                    return item

        for item in self.oeb.toc.iterdescendants():
            if item.klass == 'section':
                articles = list(item)
                if not articles:
                    continue
                spine_item(item).is_section_start = True
                for i, article in enumerate(articles):
                    si = spine_item(article)
                    if si is not None:
                        si.is_article_start = True

        items = list(self.oeb.spine)
        in_sec = in_art = False
        for i, item in enumerate(items):
            try:
                prev_item = items[i-1]
            except:
                prev_item = None
            if in_art and item.is_article_start is True:
                prev_item.is_article_end = True
                in_art = False
            if in_sec and item.is_section_start is True:
                prev_item.is_section_end = True
                in_sec = False
            if item.is_section_start:
                in_sec = True
            if item.is_article_start:
                in_art = True

        item.is_section_end = item.is_article_end = True
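
A toy illustration of the marking logic above, with a stand-in _Item class instead of real spine items: the item before each new start gets flagged as an end, and the last item closes anything still open.

class _Item(object):
    def __init__(self, name, sec=False, art=False):
        self.name = name
        self.is_section_start, self.is_article_start = sec, art
        self.is_section_end = self.is_article_end = False

def mark_ends_sketch(items):
    in_sec = in_art = False
    prev = None
    for item in items:
        if in_art and item.is_article_start:
            prev.is_article_end, in_art = True, False
        if in_sec and item.is_section_start:
            prev.is_section_end, in_sec = True, False
        in_sec = in_sec or item.is_section_start
        in_art = in_art or item.is_article_start
        prev = item
    item.is_section_end = item.is_article_end = True

items = [_Item('s1', sec=True, art=True), _Item('a2', art=True),
         _Item('s2', sec=True, art=True)]
mark_ends_sketch(items)
assert items[1].is_article_end and items[1].is_section_end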

    def __call__(self):
        '''
        Return the document serialized as a single UTF-8 encoded bytestring.
        '''
        buf = self.buf = Buf()
        buf.write(b'<html>')
        self.serialize_head()
        self.serialize_body()
        buf.write(b'</html>')
        self.end_offset = buf.tell()
        self.fixup_links()
        if self.start_offset is None and not self.is_periodical:
            # If we don't set a start offset, the stupid Kindle will
            # open the book at the location of the first IndexEntry, which
            # could be anywhere. So ensure the book is always opened at the
            # beginning, instead.
            self.start_offset = self.body_start_offset
        return buf.getvalue()

    def serialize_head(self):
        buf = self.buf
        buf.write(b'<head>')
        if len(self.oeb.guide) > 0:
            self.serialize_guide()
        buf.write(b'</head>')

    def serialize_guide(self):
        '''
        The Kindle decides where to open a book based on the presence of
        an item in the guide that looks like
        <reference type="text" title="Start" href="chapter-one.xhtml"/>

        Similarly an item with type="toc" controls where the Goto Table of
        Contents operation on the Kindle goes.
        '''

        buf = self.buf
        hrefs = self.oeb.manifest.hrefs
        buf.write(b'<guide>')
        for ref in self.oeb.guide.values():
            path = urldefrag(ref.href)[0]
            if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
                continue

            buf.write(b'<reference type="')
            if ref.type.startswith('other.'):
                self.serialize_text(ref.type.replace('other.', ''), quot=True)
            else:
                self.serialize_text(ref.type, quot=True)
            buf.write(b'" ')
            if ref.title is not None:
                buf.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                buf.write(b'" ')
            if is_guide_ref_start(ref):
                self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
            buf.write(b' />')

        buf.write(b'</guide>')

    def serialize_href(self, href, base=None):
        '''
        Serialize the href attribute of an <a> or <reference> tag. It is
        serialized as filepos="0000000000" and a pointer to its location is
        stored in self.href_offsets so that the correct value can be filled in
        at the end.
        '''
        hrefs = self.oeb.manifest.hrefs
        try:
            path, frag = urldefrag(urlnormalize(href))
        except ValueError:
            # Unparseable URL
            return False
        if path and base:
            path = base.abshref(path)
        if path and path not in hrefs:
            return False
        buf = self.buf
        item = hrefs[path] if path else None
        if item and item.spine_position is None:
            return False
        path = item.href if item else base.href
        href = '#'.join((path, frag)) if frag else path
        buf.write(b'filepos=')
        self.href_offsets[href].append(buf.tell())
        buf.write(b'0000000000')
        return True

    def serialize_body(self):
        '''
        Serialize all items in the spine of the document. Non linear items are
        moved to the end.
        '''
        buf = self.buf

        def serialize_toc_level(tocref, href=None):
            # add the provided toc level to the output stream
            # if href is provided add a link ref to the toc level output (e.g. feed_0/index.html)
            if href is not None:
                # resolve the section url in id_offsets
                buf.write(b'<mbp:pagebreak />')
                self.id_offsets[urlnormalize(href)] = buf.tell()

            if tocref.klass == "periodical":
                buf.write(b'<div> <div height="1em"></div>')
            else:
                t = tocref.title
                if isinstance(t, unicode_type):
                    t = t.encode('utf-8')
                buf.write(b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' + t +
                          b'</b></font></h2> <div height="1em"></div>')

            buf.write(b'<ul>')

            for tocitem in tocref.nodes:
                buf.write(b'<li><a filepos=')
                itemhref = tocitem.href
                if tocref.klass == 'periodical':
                    # This is a section node.
                    # For periodical tocs, the section urls are like r'feed_\d+/index.html'
                    # We don't want to point to the start of the first article
                    # so we change the href.
                    itemhref = re.sub(r'article_\d+/', '', itemhref)
                self.href_offsets[itemhref].append(buf.tell())
                buf.write(b'0000000000')
                buf.write(b' ><font size="+1"><b><u>')
                t = tocitem.title
                if isinstance(t, unicode_type):
                    t = t.encode('utf-8')
                buf.write(t)
                buf.write(b'</u></b></font></a></li>')

            buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

        self.anchor_offset = buf.tell()
        buf.write(b'<body>')
        self.body_start_offset = buf.tell()

        if self.is_periodical:
            top_toc = self.oeb.toc.nodes[0]
            serialize_toc_level(top_toc)

        spine = [item for item in self.oeb.spine if item.linear]
        spine.extend([item for item in self.oeb.spine if not item.linear])

        for item in spine:

            if self.is_periodical and item.is_section_start:
                for section_toc in top_toc.nodes:
                    if urlnormalize(item.href) == section_toc.href:
                        # create section url of the form r'feed_\d+/index.html'
                        section_url = re.sub(r'article_\d+/', '', section_toc.href)
                        serialize_toc_level(section_toc, section_url)
                        section_toc.href = section_url
                        break

            self.serialize_item(item)

        self.body_end_offset = buf.tell()
        buf.write(b'</body>')

    def serialize_item(self, item):
        '''
        Serialize an individual item from the spine of the input document.
        The offset at which this item starts is stored in self.id_offsets
        '''
        buf = self.buf
        if not item.linear:
            self.breaks.append(buf.tell() - 1)
        self.id_offsets[urlnormalize(item.href)] = buf.tell()
        if item.is_section_start:
            buf.write(b'<a ></a> ')
        if item.is_article_start:
            buf.write(b'<a ></a> <a ></a>')
        for elem in item.data.find(XHTML('body')):
            self.serialize_elem(elem, item)
        if self.write_page_breaks_after_item:
            buf.write(b'<mbp:pagebreak/>')
        if item.is_article_end:
            # Kindle periodical article end marker
            buf.write(b'<a ></a> <a ></a>')
        if item.is_section_end:
            buf.write(b' <a ></a>')
        self.anchor_offset = None

    def serialize_elem(self, elem, item, nsrmap=NSRMAP):
        buf = self.buf
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) not in nsrmap:
            return
        tag = prefixname(elem.tag, nsrmap)
        # Previous layers take care of @name
        id_ = elem.attrib.pop('id', None)
        if id_:
            href = '#'.join((item.href, id_))
            offset = self.anchor_offset or buf.tell()
            key = urlnormalize(href)
            # Only set this id_offset if it wasn't previously seen
            self.id_offsets[key] = self.id_offsets.get(key, offset)
        if self.anchor_offset is not None and \
           tag == 'a' and not elem.attrib and \
           not len(elem) and not elem.text:
            return
        self.anchor_offset = buf.tell()
        buf.write(b'<')
        buf.write(tag.encode('utf-8'))
        if elem.attrib:
            for attr, val in elem.attrib.items():
                if namespace(attr) not in nsrmap:
                    continue
                attr = prefixname(attr, nsrmap)
                buf.write(b' ')
                if attr == 'href':
                    if self.serialize_href(val, item):
                        continue
                elif attr == 'src':
                    href = urlnormalize(item.abshref(val))
                    if href in self.images:
                        index = self.images[href]
                        self.used_images.add(href)
                        buf.write(b'recindex="%05d"' % index)
                        continue
                buf.write(attr.encode('utf-8'))
                buf.write(b'="')
                self.serialize_text(val, quot=True)
                buf.write(b'"')
        buf.write(b'>')
        if elem.text or len(elem) > 0:
            if elem.text:
                self.anchor_offset = None
                self.serialize_text(elem.text)
            for child in elem:
                self.serialize_elem(child, item)
                if child.tail:
                    self.anchor_offset = None
                    self.serialize_text(child.tail)
        buf.write(('</%s>' % tag).encode('utf-8'))

    def serialize_text(self, text, quot=False):
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        text = text.replace(u'\u00AD', '')  # Soft-hyphen
        if quot:
            text = text.replace('"', '&quot;')
        if isinstance(text, unicode_type):
            text = unicodedata.normalize('NFC', text)
        self.buf.write(text.encode('utf-8'))

    def fixup_links(self):
        '''
        Fill in the correct values for all filepos="..." links with the offsets
        of the linked to content (as stored in id_offsets).
        '''
        buf = self.buf
        id_offsets = self.id_offsets
        start_href = getattr(self, '_start_href', None)
        for href, hoffs in self.href_offsets.items():
            is_start = (href and href == start_href)
            # Iterate over all filepos items
            if href not in id_offsets:
                self.logger.warn('Hyperlink target %r not found' % href)
                # Link to the top of the document, better than just ignoring
                href, _ = urldefrag(href)
            if href in self.id_offsets:
                ioff = self.id_offsets[href]
                if is_start:
                    self.start_offset = ioff
                for hoff in hoffs:
                    buf.seek(hoff)
                    buf.write(('%010d' % ioff).encode('utf-8'))
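
The filepos round trip implemented by serialize_href() and fixup_links(), reduced to a self-contained sketch: write a fixed-width decimal placeholder, remember where it is, and patch in the real offset once it is known.

from io import BytesIO

buf = BytesIO()
buf.write(b'<a filepos=')
patch_at = buf.tell()
buf.write(b'0000000000')          # 10-digit placeholder, as above
buf.write(b' >link</a>... more content ...')
target_offset = buf.tell()        # pretend the link target starts here
buf.write(b'<p id="x">target</p>')

buf.seek(patch_at)
buf.write(('%010d' % target_offset).encode('utf-8'))
assert buf.getvalue().startswith(b'<a filepos=%010d' % target_offset)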

10
ebook_converter/ebooks/mobi/writer8/__init__.py
Normal file
@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

43
ebook_converter/ebooks/mobi/writer8/cleanup.py
Normal file
@@ -0,0 +1,43 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.ebooks.oeb.base import XPath


class CSSCleanup(object):

    def __init__(self, log, opts):
        self.log, self.opts = log, opts

    def __call__(self, item, stylizer):
        if not hasattr(item.data, 'xpath'):
            return

        # The Kindle touch displays all black pages if the height is set on
        # body
        for body in XPath('//h:body')(item.data):
            style = stylizer.style(body)
            style.drop('height')


def remove_duplicate_anchors(oeb):
    # The Kindle apparently has incorrect behavior for duplicate anchors, see
    # https://bugs.launchpad.net/calibre/+bug/1454199
    for item in oeb.spine:
        if not hasattr(item.data, 'xpath'):
            continue
        seen = set()
        for tag in item.data.xpath('//*[@id or @name]'):
            for attr in ('id', 'name'):
                anchor = tag.get(attr)
                if anchor is not None:
                    if anchor in seen:
                        oeb.log.debug('Removing duplicate anchor:', anchor)
                        tag.attrib.pop(attr)
                    else:
                        seen.add(anchor)
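
A self-contained illustration of the anchor-dropping loop above, using plain lxml in place of the oeb spine items:

from lxml import etree

root = etree.fromstring('<div><p id="a"/><p id="a"/><p name="a"/></div>')
seen = set()
for tag in root.xpath('//*[@id or @name]'):
    for attr in ('id', 'name'):
        anchor = tag.get(attr)
        if anchor is not None:
            if anchor in seen:
                tag.attrib.pop(attr)   # second and later uses are dropped
            else:
                seen.add(anchor)
assert etree.tostring(root) == b'<div><p id="a"/><p/><p/></div>'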

228
ebook_converter/ebooks/mobi/writer8/exth.py
Normal file
@@ -0,0 +1,228 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from struct import pack
from io import BytesIO

from calibre.constants import iswindows, isosx
from calibre.ebooks.mobi.utils import (utf8_text, to_base)
from calibre.utils.localization import lang_as_iso639_1
from calibre.ebooks.metadata import authors_to_sort_string
from polyglot.builtins import iteritems, unicode_type

EXTH_CODES = {
    'creator': 100,
    'publisher': 101,
    'description': 103,
    'identifier': 104,
    'subject': 105,
    'pubdate': 106,
    'review': 107,
    'contributor': 108,
    'rights': 109,
    'type': 111,
    'source': 112,
    'versionnumber': 114,
    'startreading': 116,
    'kf8_header_index': 121,
    'num_of_resources': 125,
    'kf8_thumbnail_uri': 129,
    'kf8_unknown_count': 131,
    'coveroffset': 201,
    'thumboffset': 202,
    'hasfakecover': 203,
    'lastupdatetime': 502,
    'title': 503,
    'language': 524,
    'primary_writing_mode': 525,
    'page_progression_direction': 527,
}

COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')


def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
               share_not_sync=True, cover_offset=None, thumbnail_offset=None,
               start_offset=None, mobi_doctype=2, num_of_resources=None,
               kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None,
               page_progression_direction=None, primary_writing_mode=None):
    exth = BytesIO()
    nrecs = 0

    for term in metadata:
        if term not in EXTH_CODES:
            continue
        code = EXTH_CODES[term]
        items = metadata[term]
        if term == 'creator':
            if prefer_author_sort:
                creators = [authors_to_sort_string([unicode_type(c)]) for c in
                            items]
            else:
                creators = [unicode_type(c) for c in items]
            items = creators
        elif term == 'rights':
            try:
                rights = utf8_text(unicode_type(metadata.rights[0]))
            except:
                rights = b'Unknown'
            exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
            exth.write(rights)
            nrecs += 1
            continue

        for item in items:
            data = unicode_type(item)
            if term != 'description':
                data = COLLAPSE_RE.sub(' ', data)
            if term == 'identifier':
                if data.lower().startswith('urn:isbn:'):
                    data = data[9:]
                elif item.scheme.lower() == 'isbn':
                    pass
                else:
                    continue
            if term == 'language':
                d2 = lang_as_iso639_1(data)
                if d2:
                    data = d2
            data = utf8_text(data)
            exth.write(pack(b'>II', code, len(data) + 8))
            exth.write(data)
            nrecs += 1

    # Write UUID as ASIN
    uuid = None
    from calibre.ebooks.oeb.base import OPF
    for x in metadata['identifier']:
        # The scheme attribute may be absent, so default it to an empty string
        if ((x.get(OPF('scheme'), '') or '').lower() == 'uuid' or
                unicode_type(x).startswith('urn:uuid:')):
            uuid = unicode_type(x).split(':')[-1]
            break
    if uuid is None:
        from uuid import uuid4
        uuid = unicode_type(uuid4())

    if isinstance(uuid, unicode_type):
        uuid = uuid.encode('utf-8')
    if not share_not_sync:
        exth.write(pack(b'>II', 113, len(uuid) + 8))
        exth.write(uuid)
        nrecs += 1

    # Write UUID as SOURCE
    c_uuid = b'calibre:%s' % uuid
    exth.write(pack(b'>II', 112, len(c_uuid) + 8))
    exth.write(c_uuid)
    nrecs += 1

    # Write cdetype
    if not is_periodical:
        if not share_not_sync:
            exth.write(pack(b'>II', 501, 12))
            exth.write(b'EBOK')
            nrecs += 1
    else:
        ids = {0x101: b'NWPR', 0x103: b'MAGZ'}.get(mobi_doctype, None)
        if ids:
            exth.write(pack(b'>II', 501, 12))
            exth.write(ids)
            nrecs += 1

    # Add a publication date entry
    datestr = None
    if metadata['date']:
        datestr = unicode_type(metadata['date'][0])
    elif metadata['timestamp']:
        datestr = unicode_type(metadata['timestamp'][0])

    if datestr is None:
        raise ValueError("missing date or timestamp")

    datestr = datestr.encode('utf-8')
    exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
    exth.write(datestr)
    nrecs += 1
    if is_periodical:
        exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
        exth.write(datestr)
        nrecs += 1

    if be_kindlegen2:
        mv = 200 if iswindows else 202 if isosx else 201
        vals = {204: mv, 205: 2, 206: 9, 207: 0}
    elif is_periodical:
        # Pretend to be amazon's super secret periodical generator
        vals = {204: 201, 205: 2, 206: 0, 207: 101}
    else:
        # Pretend to be kindlegen 1.2
        vals = {204: 201, 205: 1, 206: 2, 207: 33307}
    for code, val in iteritems(vals):
        exth.write(pack(b'>III', code, 12, val))
        nrecs += 1
    if be_kindlegen2:
        revnum = b'0730-890adc2'
        exth.write(pack(b'>II', 535, 8 + len(revnum)) + revnum)
        nrecs += 1

    if cover_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
                        cover_offset))
        exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
        nrecs += 2
    if thumbnail_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
                        thumbnail_offset))
        thumbnail_uri_str = ('kindle:embed:%s' % (to_base(thumbnail_offset, base=32, min_num_digits=4))).encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['kf8_thumbnail_uri'], len(thumbnail_uri_str) + 8))
        exth.write(thumbnail_uri_str)
        nrecs += 2

    if start_offset is not None:
        try:
            len(start_offset)
        except TypeError:
            start_offset = [start_offset]
        for so in start_offset:
            if so is not None:
                exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
                                so))
                nrecs += 1

    if kf8_header_index is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
                        kf8_header_index))
        nrecs += 1

    if num_of_resources is not None:
        exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
                        num_of_resources))
        nrecs += 1

    if kf8_unknown_count is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
                        kf8_unknown_count))
        nrecs += 1

    if primary_writing_mode:
        pwm = primary_writing_mode.encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['primary_writing_mode'], len(pwm) + 8))
        exth.write(pwm)
        nrecs += 1

    if page_progression_direction in {'rtl', 'ltr', 'default'}:
        ppd = page_progression_direction.encode('ascii')
        exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'], len(ppd) + 8))
        exth.write(ppd)
        nrecs += 1

    exth = exth.getvalue()
    trail = len(exth) % 4
    pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
    exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
    return b''.join(exth)
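
A sketch of reading such a block back, following only the layout written above: the literal b'EXTH', total length and record count packed as '>II', then per record a '>II' code/length header whose length field includes those 8 header bytes (the trailing pad is simply ignored):

from struct import unpack

def parse_exth_sketch(raw):
    assert raw[:4] == b'EXTH'
    length, nrecs = unpack(b'>II', raw[4:12])
    pos, records = 12, []
    for _ in range(nrecs):
        code, rlen = unpack(b'>II', raw[pos:pos+8])
        records.append((code, raw[pos+8:pos+rlen]))
        pos += rlen
    return records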

128
ebook_converter/ebooks/oeb/transforms/htmltoc.py
Normal file
@@ -0,0 +1,128 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
HTML-TOC-adding transform.
'''

__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
from calibre.ebooks.oeb.base import element, XPath
from polyglot.builtins import unicode_type

__all__ = ['HTMLTOCAdder']

DEFAULT_TITLE = __('Table of Contents')

STYLE_CSS = {
    'nested': """
.calibre_toc_header {
    text-align: center;
}
.calibre_toc_block {
    margin-left: 1.2em;
    text-indent: -1.2em;
}
.calibre_toc_block .calibre_toc_block {
    margin-left: 2.4em;
}
.calibre_toc_block .calibre_toc_block .calibre_toc_block {
    margin-left: 3.6em;
}
""",

    'centered': """
.calibre_toc_header {
    text-align: center;
}
.calibre_toc_block {
    text-align: center;
}
body > .calibre_toc_block {
    margin-top: 1.2em;
}
"""
}


class HTMLTOCAdder(object):

    def __init__(self, title=None, style='nested', position='end'):
        self.title = title
        self.style = style
        self.position = position

    @classmethod
    def config(cls, cfg):
        group = cfg.add_group('htmltoc', _('HTML TOC generation options.'))
        group('toc_title', ['--toc-title'], default=None,
              help=_('Title for any generated in-line table of contents.'))
        return cfg

    @classmethod
    def generate(cls, opts):
        return cls(title=opts.toc_title)

    def __call__(self, oeb, context):
        has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

        if 'toc' in oeb.guide:
            # Ensure toc pointed to in <guide> is in spine
            from calibre.ebooks.oeb.base import urlnormalize
            href = urlnormalize(oeb.guide['toc'].href)
            if href in oeb.manifest.hrefs:
                item = oeb.manifest.hrefs[href]
                if (hasattr(item.data, 'xpath') and
                        XPath('//h:a[@href]')(item.data)):
                    if oeb.spine.index(item) < 0:
                        if self.position == 'end':
                            oeb.spine.add(item, linear=False)
                        else:
                            oeb.spine.insert(0, item, linear=True)
                    return
                elif has_toc:
                    oeb.guide.remove('toc')
            else:
                oeb.guide.remove('toc')
        if not has_toc:
            return
        oeb.logger.info('Generating in-line TOC...')
        title = self.title or oeb.translate(DEFAULT_TITLE)
        style = self.style
        if style not in STYLE_CSS:
            oeb.logger.error('Unknown TOC style %r' % style)
            style = 'nested'
        id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
        oeb.manifest.add(id, css_href, CSS_MIME, data=STYLE_CSS[style])
        language = unicode_type(oeb.metadata.language[0])
        contents = element(None, XHTML('html'), nsmap={None: XHTML_NS},
                           attrib={XML('lang'): language})
        head = element(contents, XHTML('head'))
        htitle = element(head, XHTML('title'))
        htitle.text = title
        element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME,
                href=css_href)
        body = element(contents, XHTML('body'),
                       attrib={'class': 'calibre_toc'})
        h1 = element(body, XHTML('h2'),
                     attrib={'class': 'calibre_toc_header'})
        h1.text = title
        self.add_toc_level(body, oeb.toc)
        id, href = oeb.manifest.generate('contents', 'contents.xhtml')
        item = oeb.manifest.add(id, href, XHTML_MIME, data=contents)
        if self.position == 'end':
            oeb.spine.add(item, linear=False)
        else:
            oeb.spine.insert(0, item, linear=True)
        oeb.guide.add('toc', 'Table of Contents', href)

    def add_toc_level(self, elem, toc):
        for node in toc:
            block = element(elem, XHTML('div'),
                            attrib={'class': 'calibre_toc_block'})
            line = element(block, XHTML('a'),
                           attrib={'href': node.href,
                                   'class': 'calibre_toc_line'})
            line.text = node.title
            self.add_toc_level(block, node)
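
For reference, a sketch of the markup shape add_toc_level() produces for one TOC node, built here with plain lxml instead of the oeb element() helper (tag and class names as used above; the recursion simply nests another calibre_toc_block per child):

from lxml import etree

body = etree.Element('body')
block = etree.SubElement(body, 'div', attrib={'class': 'calibre_toc_block'})
line = etree.SubElement(block, 'a', attrib={'href': 'ch1.xhtml',
                                            'class': 'calibre_toc_line'})
line.text = 'Chapter One'
# each TOC child would add a nested <div class="calibre_toc_block"> here
print(etree.tostring(body, pretty_print=True).decode('utf-8'))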

117
ebook_converter/ebooks/oeb/transforms/manglecase.py
Normal file
@@ -0,0 +1,117 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
CSS case-mangling transform.
'''

__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME
from calibre.ebooks.oeb.base import namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from polyglot.builtins import string_or_bytes

CASE_MANGLER_CSS = """
.calibre_lowercase {
    font-variant: normal;
    font-size: 0.65em;
}
"""

TEXT_TRANSFORMS = {'capitalize', 'uppercase', 'lowercase'}


class CaseMangler(object):

    @classmethod
    def config(cls, cfg):
        return cfg

    @classmethod
    def generate(cls, opts):
        return cls()

    def __call__(self, oeb, context):
        oeb.logger.info('Applying case-transforming CSS...')
        self.oeb = oeb
        self.opts = context
        self.profile = context.source
        self.mangle_spine()

    def mangle_spine(self):
        id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css')
        self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS)
        for item in self.oeb.spine:
            html = item.data
            relhref = item.relhref(href)
            etree.SubElement(html.find(XHTML('head')), XHTML('link'),
                             rel='stylesheet', href=relhref, type=CSS_MIME)
            stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
            self.mangle_elem(html.find(XHTML('body')), stylizer)

    def text_transform(self, transform, text):
        if transform == 'capitalize':
            return icu_title(text)
        elif transform == 'uppercase':
            return icu_upper(text)
        elif transform == 'lowercase':
            return icu_lower(text)
        return text

    def split_text(self, text):
        results = ['']
        isupper = text[0].isupper()
        for char in text:
            if char.isupper() == isupper:
                results[-1] += char
            else:
                isupper = not isupper
                results.append(char)
        return results
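
For example, split_text() partitions a string into runs of consistent case, which smallcaps_elem() below then walks, wrapping the non-uppercase runs:

assert CaseMangler().split_text('ABCdefGH') == ['ABC', 'def', 'GH']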

    def smallcaps_elem(self, elem, attr):
        texts = self.split_text(getattr(elem, attr))
        setattr(elem, attr, None)
        last = elem if attr == 'tail' else None
        attrib = {'class': 'calibre_lowercase'}
        for text in texts:
            if text.isupper():
                if last is None:
                    elem.text = text
                else:
                    last.tail = text
            else:
                child = elem.makeelement(XHTML('span'), attrib=attrib)
                child.text = text.upper()
                if last is None:
                    elem.insert(0, child)
                else:
                    # addnext() moves the tail for some reason
                    tail = last.tail
                    last.addnext(child)
                    last.tail = tail
                    child.tail = None
                last = child

    def mangle_elem(self, elem, stylizer):
        if not isinstance(elem.tag, string_or_bytes) or \
           namespace(elem.tag) != XHTML_NS:
            return
        children = list(elem)
        style = stylizer.style(elem)
        transform = style['text-transform']
        variant = style['font-variant']
        if elem.text:
            if transform in TEXT_TRANSFORMS:
                elem.text = self.text_transform(transform, elem.text)
            if variant == 'small-caps':
                self.smallcaps_elem(elem, 'text')
        for child in children:
            self.mangle_elem(child, stylizer)
            if child.tail:
                if transform in TEXT_TRANSFORMS:
                    child.tail = self.text_transform(transform, child.tail)
                if variant == 'small-caps':
                    self.smallcaps_elem(child, 'tail')

239
ebook_converter/ebooks/oeb/transforms/rasterize.py
Normal file
@@ -0,0 +1,239 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
SVG rasterization transform.
'''

__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

import os, re

from PyQt5.Qt import (
    Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer)
from calibre.ebooks.oeb.base import XHTML, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type
from polyglot.urllib import urldefrag

IMAGE_TAGS = {XHTML('img'), XHTML('object')}
KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'}


class Unavailable(Exception):
    pass


class SVGRasterizer(object):

    def __init__(self, base_css=''):
        self.base_css = base_css
        from calibre.gui2 import must_use_qt
        must_use_qt()

    @classmethod
    def config(cls, cfg):
        return cfg

    @classmethod
    def generate(cls, opts):
        return cls()

    def __call__(self, oeb, context):
        oeb.logger.info('Rasterizing SVG images...')
        self.temp_files = []
        self.stylizer_cache = {}
        self.oeb = oeb
        self.opts = context
        self.profile = context.dest
        self.images = {}
        self.dataize_manifest()
        self.rasterize_spine()
        self.rasterize_cover()
        for pt in self.temp_files:
            try:
                os.remove(pt)
            except:
                pass

    def rasterize_svg(self, elem, width=0, height=0, format='PNG'):
        view_box = elem.get('viewBox', elem.get('viewbox', None))
        sizes = None
        logger = self.oeb.logger

        if view_box is not None:
            try:
                box = [float(x) for x in filter(None, re.split('[, ]', view_box))]
                sizes = [box[2]-box[0], box[3] - box[1]]
            except (TypeError, ValueError, IndexError):
                logger.warn('SVG image has invalid viewBox="%s", ignoring the viewBox' % view_box)
            else:
                for image in elem.xpath('descendant::*[local-name()="image" and '
                                        '@height and contains(@height, "%")]'):
                    logger.info('Found SVG image height in %, trying to convert...')
                    try:
                        h = float(image.get('height').replace('%', ''))/100.
                        image.set('height', unicode_type(h*sizes[1]))
                    except:
                        logger.exception('Failed to convert percentage height:',
                                         image.get('height'))

        data = QByteArray(xml2str(elem, with_tail=False))
        svg = QSvgRenderer(data)
        size = svg.defaultSize()
        if size.width() == 100 and size.height() == 100 and sizes:
            size.setWidth(sizes[0])
            size.setHeight(sizes[1])
        if width or height:
            size.scale(width, height, Qt.KeepAspectRatio)
        logger.info('Rasterizing %r to %dx%d'
                    % (elem, size.width(), size.height()))
        image = QImage(size, QImage.Format_ARGB32_Premultiplied)
        image.fill(QColor("white").rgb())
        painter = QPainter(image)
        svg.render(painter)
        painter.end()
        array = QByteArray()
        buffer = QBuffer(array)
        buffer.open(QIODevice.WriteOnly)
        image.save(buffer, format)
        return array.data()

    def dataize_manifest(self):
        for item in self.oeb.manifest.values():
            if item.media_type == SVG_MIME and item.data is not None:
                self.dataize_svg(item)

    def dataize_svg(self, item, svg=None):
        if svg is None:
            svg = item.data
        hrefs = self.oeb.manifest.hrefs
        for elem in xpath(svg, '//svg:*[@xl:href]'):
            href = urlnormalize(elem.attrib[XLINK('href')])
            path = urldefrag(href)[0]
            if not path:
                continue
            abshref = item.abshref(path)
            if abshref not in hrefs:
                continue
            linkee = hrefs[abshref]
            data = linkee.bytes_representation
            ext = what(None, data) or 'jpg'
            with PersistentTemporaryFile(suffix='.'+ext) as pt:
                pt.write(data)
                self.temp_files.append(pt.name)
            elem.attrib[XLINK('href')] = pt.name
        return svg

    def stylizer(self, item):
        ans = self.stylizer_cache.get(item, None)
        if ans is None:
            ans = Stylizer(item.data, item.href, self.oeb, self.opts,
                           self.profile, base_css=self.base_css)
            self.stylizer_cache[item] = ans
        return ans

    def rasterize_spine(self):
        for item in self.oeb.spine:
            self.rasterize_item(item)

    def rasterize_item(self, item):
        html = item.data
        hrefs = self.oeb.manifest.hrefs
        for elem in xpath(html, '//h:img[@src]'):
            src = urlnormalize(elem.attrib['src'])
            image = hrefs.get(item.abshref(src), None)
            if image and image.media_type == SVG_MIME:
                style = self.stylizer(item).style(elem)
                self.rasterize_external(elem, style, item, image)
        for elem in xpath(html, '//h:object[@type="%s" and @data]' % SVG_MIME):
            data = urlnormalize(elem.attrib['data'])
            image = hrefs.get(item.abshref(data), None)
            if image and image.media_type == SVG_MIME:
                style = self.stylizer(item).style(elem)
                self.rasterize_external(elem, style, item, image)
        for elem in xpath(html, '//svg:svg'):
            style = self.stylizer(item).style(elem)
            self.rasterize_inline(elem, style, item)

    def rasterize_inline(self, elem, style, item):
        width = style['width']
        height = style['height']
        width = (width / 72) * self.profile.dpi
        height = (height / 72) * self.profile.dpi
        elem = self.dataize_svg(item, elem)
        data = self.rasterize_svg(elem, width, height)
        manifest = self.oeb.manifest
        href = os.path.splitext(item.href)[0] + '.png'
        id, href = manifest.generate(item.id, href)
        manifest.add(id, href, PNG_MIME, data=data)
        img = elem.makeelement(XHTML('img'), src=item.relhref(href))
        elem.getparent().replace(elem, img)
        for prop in ('width', 'height'):
            if prop in elem.attrib:
                img.attrib[prop] = elem.attrib[prop]

    def rasterize_external(self, elem, style, item, svgitem):
        width = style['width']
        height = style['height']
        width = (width / 72) * self.profile.dpi
        height = (height / 72) * self.profile.dpi
        data = QByteArray(svgitem.bytes_representation)
        svg = QSvgRenderer(data)
        size = svg.defaultSize()
        size.scale(width, height, Qt.KeepAspectRatio)
        key = (svgitem.href, size.width(), size.height())
        if key in self.images:
            href = self.images[key]
        else:
            logger = self.oeb.logger
            logger.info('Rasterizing %r to %dx%d'
                        % (svgitem.href, size.width(), size.height()))
            image = QImage(size, QImage.Format_ARGB32_Premultiplied)
            image.fill(QColor("white").rgb())
            painter = QPainter(image)
            svg.render(painter)
            painter.end()
            array = QByteArray()
            buffer = QBuffer(array)
            buffer.open(QIODevice.WriteOnly)
            image.save(buffer, 'PNG')
            data = array.data()
            manifest = self.oeb.manifest
            href = os.path.splitext(svgitem.href)[0] + '.png'
            id, href = manifest.generate(svgitem.id, href)
            manifest.add(id, href, PNG_MIME, data=data)
            self.images[key] = href
        elem.tag = XHTML('img')
        for attr in list(elem.attrib):
            if attr not in KEEP_ATTRS:
                del elem.attrib[attr]
        elem.attrib['src'] = item.relhref(href)
        if elem.text:
            elem.attrib['alt'] = elem.text
            elem.text = None
        for child in elem:
            elem.remove(child)

    def rasterize_cover(self):
        covers = self.oeb.metadata.cover
        if not covers:
            return
        if unicode_type(covers[0]) not in self.oeb.manifest.ids:
            self.oeb.logger.warn('Cover not in manifest, skipping.')
            self.oeb.metadata.clear('cover')
            return
        cover = self.oeb.manifest.ids[unicode_type(covers[0])]
        if not cover.media_type == SVG_MIME:
            return
        width = (self.profile.width / 72) * self.profile.dpi
        height = (self.profile.height / 72) * self.profile.dpi
        data = self.rasterize_svg(cover.data, width, height)
        href = os.path.splitext(cover.href)[0] + '.png'
        id, href = self.oeb.manifest.generate(cover.id, href)
        self.oeb.manifest.add(id, href, PNG_MIME, data=data)
        covers[0].value = id

10
ebook_converter/ebooks/pdf/__init__.py
Normal file
@@ -0,0 +1,10 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
Used for pdf output for comic2pdf
'''

182
ebook_converter/ebooks/pdf/pdftohtml.py
Normal file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import errno
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from calibre import CurrentDir, xml_replace_entities, prints
|
||||
from calibre.constants import (
|
||||
filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows
|
||||
)
|
||||
from calibre.ebooks import ConversionError, DRMError
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from calibre.utils.ipc import eintr_retry_call
|
||||
|
||||
|
||||
PDFTOHTML = 'pdftohtml'
|
||||
|
||||
|
||||
def popen(cmd, **kw):
|
||||
if not ispy3:
|
||||
cmd = [x.encode(filesystem_encoding) if not isinstance(x, bytes) else x for x in cmd]
|
||||
if iswindows:
|
||||
kw['creationflags'] = 0x08
|
||||
return subprocess.Popen(cmd, **kw)
|
||||
|
||||
|
||||
if isosx and hasattr(sys, 'frameworks_dir'):
|
||||
base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS')
|
||||
PDFTOHTML = os.path.join(base, PDFTOHTML)
|
||||
if iswindows and hasattr(sys, 'frozen'):
|
||||
base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable)
|
||||
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
|
||||
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
||||
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
||||
|
||||
|

def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        # -enc: output encoding, -noframes: single document, -p: convert
        # internal PDF links, -nomerge: do not merge paragraphs,
        # -nodrm: override the PDF's copy-protection flags
        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
               '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
            # the pdftohtml shipped on BSD does not support -nodrm
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            # an essentially empty index usually means extraction was
            # blocked by the PDF's DRM settings
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # normalize the anchors pdftohtml emits for pages:
                # <a name=3> -> <a id="p3">, index.html#3 -> #p3
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', '&#160;')

                i.write(raw.encode('utf-8'))

            # run pdftohtml again on just the first page, as XML on
            # stdout, solely to recover the document outline (if any)
            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                   '-nodrm', '-q', '-stdout', a(pdfsrc)]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

    try:
        os.remove(pdfsrc)
    except:
        pass
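
# Illustrative usage sketch (not part of the module; the paths are
# assumptions): with poppler's pdftohtml on PATH, this converts a PDF
# into out_dir/index.html plus any extracted images, and writes a
# toc.ncx when the PDF carries a usable outline.
#
#     import tempfile
#     out_dir = tempfile.mkdtemp()
#     pdftohtml(out_dir, '/tmp/book.pdf', no_images=False)
#     with lopen(os.path.join(out_dir, 'index.html'), 'rb') as f:
#         html = f.read().decode('utf-8')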


def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = safe_xml_fromstring(raw).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]

        def process_node(node, toc):
            for child in node.iterchildren('*'):
                if child.tag == 'outline':
                    # a nested <outline> becomes a child of the most
                    # recently added TOC entry
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    if child.text:
                        page = child.get('page', '1')
                        toc.add(child.text, 'index.html', 'p' + page)
                        count[0] += 1
        process_node(outline, toc)
        if count[0] > 2:
            # ignore trivial outlines (two entries or fewer)
            root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
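
# For reference, the outline XML that parse_outline() above consumes
# looks roughly like this (shape inferred from the parsing code, not
# from the poppler docs; titles and page numbers are made up):
#
#     <outline>
#       <item page="1">Chapter One</item>
#       <outline>
#         <item page="3">Section 1.1</item>
#       </outline>
#     </outline>
#
# Each nested <outline> attaches to the preceding TOC entry, and page N
# maps to the anchor #pN that pdftohtml() created in index.html.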


def flip_image(img, flip):
    # note: the imported flip_image shadows this function inside its body
    from calibre.utils.img import flip_image, image_and_format_from_data, image_to_data
    with lopen(img, 'r+b') as f:
        img, fmt = image_and_format_from_data(f.read())
        img = flip_image(img, horizontal='x' in flip, vertical='y' in flip)
        f.seek(0), f.truncate()
        f.write(image_to_data(img, fmt=fmt))


def flip_images(raw):
    for match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I):
        img = match.group()
        m = re.search(r'class="(x|y|xy)flip"', img)
        if m is None:
            continue
        flip = m.group(1)
        src = re.search(r'src="([^"]+)"', img)
        if src is None:
            continue
        img = src.group(1)
        if not os.path.exists(img):
            continue
        flip_image(img, flip)
    # the <STYLE> block pdftohtml emits applies the flips in CSS; once
    # the image files have been flipped on disk it must be removed
    raw = re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I|re.DOTALL)
    return raw
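
# For reference, a sketch of the markup flip_images() above acts on
# (the class names follow the regex; the file name is illustrative):
#
#     <IMG class="xyflip" src="index-001.png"/>
#
# flip_image() mirrors index-001.png in place along the axes named by
# the class, so the page renders correctly once the CSS rules are gone.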