1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-16 22:25:54 +01:00
Files
ebook-converter/ebook_converter/ebooks/mobi/mobiml.py
gryf ce89f5c9d1 Use the real constants module.
This is an ongoing refactor of the calibre code to make it more
readable and to transform it into something more coherent.

In this patch, the imports of some modules are changed: modules are now
imported and referenced by name, instead of polluting each module's namespace
with symbols of other modules, which were often themselves imported from yet other modules. Yuck.
2020-05-29 17:04:53 +02:00

622 lines
26 KiB
Python

"""
Transform XHTML/OPS-ish content into Mobipocket HTML 3.2.
"""
import copy
import re
import numbers
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.ebooks.oeb.transforms.flatcss import KeyMapper
from ebook_converter.ebooks.mobi.utils import convert_color_for_font_tag
from ebook_converter.utils.imghdr import identify
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
# Namespace URI that MBP() puts in front of a tag name to build
# Mobipocket-specific element names such as 'pagebreak'.
MBP_NS = 'http://mobipocket.com/ns/mbp'
def MBP(name):
    """Return *name* qualified with the Mobipocket namespace as '{ns}name'."""
    return '{{{0}}}{1}'.format(MBP_NS, name)
# Namespace map handed to etree.Element for generated root elements: XHTML is
# the default namespace and the 'mbp' prefix maps to const.MBP_NS.
MOBI_NSMAP = {None: const.XHTML_NS, 'mbp': const.MBP_NS}
# Bare tag names considered inline. Whitespace-only text in front of a first
# child that is NOT one of these is dropped by mobimlize_elem.
INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'}
# Heading tags; in this module they are only used as part of NOT_VTAGS.
HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
# GR: Added 'caption' to both sets
# NESTABLE_TAGS are re-created under their own name in the output and tracked
# on BlockState.nested; TABLE_TAGS additionally get valign="top" and have
# their table attributes copied over.
NESTABLE_TAGS = {'ol', 'ul', 'li', 'table', 'tr', 'td', 'th', 'caption'}
TABLE_TAGS = {'table', 'tr', 'td', 'th', 'caption'}
# Tags that, when they carry no text, are placed into the open paragraph (or
# the body) without opening a new wrapper element.
SPECIAL_TAGS = {'hr', 'br'}
# Tags that mobimlize_content emits as child elements of the current paragraph.
CONTENT_TAGS = {'img', 'hr', 'br'}
# Tags that are never wrapped in <sup>/<sub> for a non-baseline vertical-align.
NOT_VTAGS = HEADER_TAGS | NESTABLE_TAGS | TABLE_TAGS | SPECIAL_TAGS | \
    CONTENT_TAGS
# For these tags an anchor is inserted as a preceding sibling instead of
# being appended as a child (see mobimlize_content).
LEAF_TAGS = {'base', 'basefont', 'frame', 'link', 'meta', 'area', 'br',
             'col', 'hr', 'img', 'input', 'param'}
# Values of page-break-before / page-break-after that request a page break.
PAGE_BREAKS = {'always', 'left', 'right'}
# Runs of space, tab, CR, LF and VT; replaced by a single space in text that
# is not preformatted.
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
def asfloat(value):
    """Return *value* as a float, or 0.0 when it is not a number at all."""
    return float(value) if isinstance(value, numbers.Number) else 0.0
def isspace(text):
    """Tell whether *text* is empty or consists only of collapsible whitespace.

    A non-breaking space (U+00A0) is treated as real content, so any text
    containing one is reported as not being whitespace.
    """
    return not text or ('\xa0' not in text and text.isspace())
class BlockState(object):
    """Block-level bookkeeping for one output body while it is being built."""

    def __init__(self, body):
        # Output element that receives generated content.
        self.body = body
        # Currently open paragraph/wrapper, the element that receives text,
        # and the <a> element opened for the active href (all unset at first).
        self.para = self.inline = self.anchor = None
        # Format state that was last rendered into the output.
        self.istate = None
        # Stack of currently open nestable output elements.
        self.nested = []
        # Vertical space collected for the next wrapper element.
        self.vmargin = self.vpadding = 0.0
        # Flags: a page break is pending / some content has been emitted.
        self.pbreak = self.content = False
class FormatState(object):
    """Inline formatting state that is copied and refined per element.

    Two states compare equal when every attribute listed in ``_COMPARED``
    matches; the remaining attributes (``rendered``, ``left``, ``halign``,
    ``indent``, ``ids``, ``list_num``, ``attrib``) do not take part in the
    comparison.
    """

    # Attributes compared by __eq__, in comparison order.
    _COMPARED = ('fsize', 'italic', 'bold', 'href', 'preserve', 'pre_wrap',
                 'family', 'bgcolor', 'fgcolor', 'strikethrough',
                 'underline')

    def __init__(self):
        self.rendered = False
        self.left = 0.
        self.halign = 'auto'
        self.indent = 0.
        self.fsize = 3
        self.ids = set()
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        self.preserve = False
        self.pre_wrap = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
        self.fgcolor = 'black'
        self.href = None
        self.list_num = 0
        self.attrib = {}

    def __eq__(self, other):
        # Comparing against None or an unrelated object must not blow up with
        # AttributeError; let Python fall back to its default handling.
        if not isinstance(other, FormatState):
            return NotImplemented
        return all(getattr(self, name) == getattr(other, name)
                   for name in self._COMPARED)

    def __ne__(self, other):
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    # Instances are mutable and define __eq__, so they are deliberately
    # unhashable (this is also what Python 3 does implicitly).
    __hash__ = None
class MobiMLizer(object):
def __init__(self, ignore_tables=False):
    """Create the converter.

    :param ignore_tables: stored on the instance; when true,
        mobimlize_elem rewrites table tags to 'span' / 'div'.
    """
    self.ignore_tables = ignore_tables
def __call__(self, oeb, context):
    """Convert every spine item of *oeb* to Mobipocket markup in place.

    :param oeb: the book being converted; its ``logger`` and ``spine``
        are used.
    :param context: conversion options; ``context.dest`` is taken as the
        output profile.
    """
    oeb.logger.info('Converting XHTML to Mobipocket markup...')
    self.oeb = oeb
    self.log = oeb.logger
    self.opts = context
    profile = context.dest
    self.profile = profile
    # profile.fnums inverted, so a value of that mapping leads back to its key.
    inverted = {value: key for key, value in profile.fnums.items()}
    self.fnums = inverted
    self.fmap = KeyMapper(profile.fbase, profile.fbase, inverted.keys())
    self.mobimlize_spine()
def mobimlize_spine(self):
    """Replace the data of each spine item with its MobiML rendering."""
    for spine_item in self.oeb.spine:
        stylizer = Stylizer(spine_item.data, spine_item.href, self.oeb,
                            self.opts, self.profile)
        source_body = spine_item.data.find(base.tag('xhtml', 'body'))
        # Fresh <html><body/></html> tree that receives the converted markup.
        new_root = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
        new_body = etree.SubElement(new_root, base.tag('xhtml', 'body'))
        self.current_spine_item = spine_item
        block_state = BlockState(new_body)
        format_stack = [FormatState()]
        self.mobimlize_elem(source_body, stylizer, block_state, format_stack)
        spine_item.data = new_root
def mobimlize_font(self, ptsize):
    """Translate *ptsize* through ``self.fmap`` and then ``self.fnums``."""
    mapped_size = self.fmap[ptsize]
    return self.fnums[mapped_size]
def mobimlize_measure(self, ptsize):
    """Format a length for use as an attribute value.

    Strings (and bytes) are returned untouched. Numbers that round to less
    than the profile's base font size are written as whole points
    (``'6pt'``), anything else as a whole multiple of that base size
    (``'2em'``).
    """
    if isinstance(ptsize, (str, bytes)):
        return ptsize
    base_size = self.profile.fbase
    rounded = round(ptsize)
    if rounded >= base_size:
        return '%dem' % int(round(ptsize / base_size))
    return '%dpt' % int(rounded)
def preize_text(self, text, pre_wrap=False):
    """Split preformatted *text* into strings and <br> elements.

    With ``pre_wrap`` a run of n >= 2 spaces becomes n-1 non-breaking
    spaces followed by one ordinary space; otherwise every space becomes a
    non-breaking space. Line endings are normalised to '\\n' and each one is
    turned into a <br> element.

    :return: a list whose first entry is the first line (possibly empty),
        followed by a <br> element and, when non-empty, the text of each
        further line.
    """
    text = str(text)
    if pre_wrap:
        def keep_last_space(match):
            return '\xa0' * (len(match.group()) - 1) + ' '
        text = re.sub(r' {2,}', keep_last_space, text)
    else:
        text = text.replace(' ', '\xa0')
    normalised = text.replace('\r\n', '\n').replace('\r', '\n')
    first, *following = normalised.split('\n')
    pieces = [first]
    for segment in following:
        pieces.append(etree.Element(base.tag('xhtml', 'br')))
        if segment:
            pieces.append(segment)
    return pieces
def mobimlize_content(self, tag, text, bstate, istates):
    """Emit *text*, and any element implied by *tag*, into the output tree.

    When no paragraph is open (and always for 'td' / 'th') a new wrapper
    element is created first. Ids collected on the current format state are
    written out as anchors, and the inline formatting elements are rebuilt
    whenever the format state differs from the one rendered last.

    :param tag: bare tag name of the source element being rendered.
    :param text: text to append; may be empty or None.
    :param bstate: BlockState of the output body being built.
    :param istates: stack of FormatState objects; the last entry is used and
        the one before it holds the list counter for 'li'.
    """
    if text or tag != 'br':
        bstate.content = True
    istate = istates[-1]
    para = bstate.para
    if tag in SPECIAL_TAGS and not text:
        # A bare <hr>/<br> goes into the open paragraph, or directly into
        # the body when there is none.
        para = para if para is not None else bstate.body
    elif para is None or tag in ('td', 'th'):
        body = bstate.body
        if bstate.pbreak:
            etree.SubElement(body, MBP('pagebreak'))
            bstate.pbreak = False
        bstate.istate = None
        bstate.anchor = None
        parent = bstate.nested[-1] if bstate.nested else bstate.body
        indent = istate.indent
        left = istate.left
        if isinstance(indent, (str, bytes)):
            indent = 0
        if indent < 0 and abs(indent) < left:
            # A small negative indent is absorbed by the left margin.
            left += indent
            indent = 0
        elif indent != 0 and abs(indent) < self.profile.fbase:
            # Enlarge tiny indents to one base font size, keeping the sign.
            indent = (indent / abs(indent)) * self.profile.fbase
        if tag in NESTABLE_TAGS and not istate.rendered:
            para = wrapper = etree.SubElement(
                parent, base.tag('xhtml', tag), attrib=istate.attrib)
            bstate.nested.append(para)
            if tag == 'li' and len(istates) > 1:
                istates[-2].list_num += 1
                para.attrib['value'] = str(istates[-2].list_num)
        elif tag in NESTABLE_TAGS and istate.rendered:
            para = wrapper = bstate.nested[-1]
        elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
            # Express the left margin as nested blockquotes.
            ems = self.profile.mobi_ems_per_blockquote
            para = wrapper = etree.SubElement(
                parent, base.tag('xhtml', 'blockquote'))
            emleft = int(round(left / self.profile.fbase)) - ems
            emleft = min((emleft, 10))
            while emleft > ems / 2:
                para = etree.SubElement(para, base.tag('xhtml', 'blockquote'))
                emleft -= ems
        else:
            para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'p'))
        bstate.inline = bstate.para = para
        vspace = bstate.vpadding + bstate.vmargin
        bstate.vpadding = bstate.vmargin = 0
        if tag not in TABLE_TAGS:
            if tag in ('ul', 'ol') and vspace > 0:
                wrapper.addprevious(etree.Element(
                    base.tag('xhtml', 'div'),
                    height=self.mobimlize_measure(vspace)))
            else:
                wrapper.attrib['height'] = self.mobimlize_measure(vspace)
            para.attrib['width'] = self.mobimlize_measure(indent)
        elif tag == 'table' and vspace > 0:
            # Vertical space before a table is emitted as <br> elements.
            vspace = int(round(vspace / self.profile.fbase))
            while vspace > 0:
                wrapper.addprevious(etree.Element(base.tag('xhtml', 'br')))
                vspace -= 1
        if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)):
            if isinstance(istate.halign, bytes):
                istate.halign = istate.halign.decode('utf-8')
            para.attrib['align'] = istate.halign
        istate.rendered = True
    pstate = bstate.istate
    if tag in CONTENT_TAGS:
        bstate.inline = para
        pstate = bstate.istate = None
        try:
            etree.SubElement(para, base.tag('xhtml', tag), attrib=istate.attrib)
        except Exception:
            print('Invalid subelement:', para, tag, istate.attrib)
            raise
    elif tag in TABLE_TAGS:
        para.attrib['valign'] = 'top'
    if istate.ids:
        for id_ in istate.ids:
            anchor = etree.Element(base.tag('xhtml', 'a'), attrib={'id': id_})
            if tag == 'li':
                try:
                    last = bstate.body[-1][-1]
                except IndexError:
                    # Nothing to attach the anchor to; give up on the rest.
                    break
                last.insert(0, anchor)
                anchor.tail = last.text
                last.text = None
            else:
                last = bstate.body[-1]
                # We use append instead of addprevious so that inline
                # anchors in large blocks point to the correct place. See
                # https://bugs.launchpad.net/calibre/+bug/899831
                # This could potentially break if inserting an anchor at
                # this point in the markup is illegal, but I cannot think
                # of such a case offhand.
                if parse_utils.barename(last.tag) in LEAF_TAGS:
                    last.addprevious(anchor)
                else:
                    last.append(anchor)
        istate.ids.clear()
    if not text:
        return
    if not pstate or istate != pstate:
        # Formatting changed: rebuild the chain of inline elements.
        inline = para
        fsize = istate.fsize
        href = istate.href
        if not href:
            bstate.anchor = None
        elif pstate and pstate.href == href:
            # Still inside the same link; keep writing into its <a>.
            inline = bstate.anchor
        else:
            inline = etree.SubElement(inline, base.tag('xhtml', 'a'), href=href)
            bstate.anchor = inline
        if fsize != 3:
            inline = etree.SubElement(inline, base.tag('xhtml', 'font'),
                                      size=str(fsize))
        if istate.family == 'monospace':
            inline = etree.SubElement(inline, base.tag('xhtml', 'tt'))
        if istate.italic:
            inline = etree.SubElement(inline, base.tag('xhtml', 'i'))
        if istate.bold:
            inline = etree.SubElement(inline, base.tag('xhtml', 'b'))
        if istate.bgcolor is not None and istate.bgcolor != 'transparent':
            inline = etree.SubElement(
                inline, base.tag('xhtml', 'span'),
                bgcolor=convert_color_for_font_tag(istate.bgcolor))
        if istate.fgcolor != 'black':
            inline = etree.SubElement(
                inline, base.tag('xhtml', 'font'),
                color=convert_color_for_font_tag(istate.fgcolor))
        if istate.strikethrough:
            inline = etree.SubElement(inline, base.tag('xhtml', 's'))
        if istate.underline:
            inline = etree.SubElement(inline, base.tag('xhtml', 'u'))
        bstate.inline = inline
    bstate.istate = istate
    inline = bstate.inline
    if istate.preserve or istate.pre_wrap:
        content = self.preize_text(text, pre_wrap=istate.pre_wrap)
    else:
        content = [text]
    for item in content:
        if isinstance(item, (str, bytes)):
            # Text goes into .text of an empty element, otherwise into the
            # .tail of its last child.
            if len(inline) == 0:
                inline.text = (inline.text or '') + item
            else:
                last = inline[-1]
                last.tail = (last.tail or '') + item
        else:
            inline.append(item)
def mobimlize_elem(self, elem, stylizer, bstate, istates,
                   ignore_valign=False):
    """Convert *elem* and its descendants, writing output through *bstate*.

    A copy of the current FormatState is pushed onto *istates*, updated from
    the element's computed style, used to render the element's own text and
    (recursively) its children with their tails, and popped again at the end.

    :param elem: source element; anything that is not an XHTML element is
        skipped.
    :param stylizer: object whose ``style(elem)`` gives the computed style.
    :param bstate: BlockState of the output body being built.
    :param istates: stack of FormatState objects.
    :param ignore_valign: when true, do not wrap the element in
        <sup>/<sub>; set by the recursive call that renders the content to
        be wrapped.
    """
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) != const.XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
         style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = base.tag('xhtml', 'a')
        else:
            return
    tag = parse_utils.barename(elem.tag)
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start'])-1
        except (TypeError, ValueError):
            # Non-numeric start attribute: keep counting from zero.
            pass
    istates.append(istate)
    left = 0
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and style['display'] !=
               'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        istate.indent = style['text-indent']
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500),
                                istate.indent)
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: horizontal margin/padding is
        # approximated with non-breaking spaces.
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = ('\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + ('\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + ('\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = style['font-style'] == 'italic'
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
          'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(round(float(value) /
                                         (72/self.profile.dpi)))
                    except Exception:
                        # Best effort: leave this dimension unset.
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is missing: read the real image size.
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
            except Exception:
                self.oeb.logger.warn('Failed to find image:',
                                     href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        # Derive the missing dimension from the aspect
                        # ratio; fall back to the image's own size when the
                        # known dimension is not a plain integer.
                        ar = width / height
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height'])*ar
                            except (TypeError, ValueError,
                                    ZeroDivisionError):
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width'])/ar
                            except (TypeError, ValueError,
                                    ZeroDivisionError):
                                pass
                            istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    # NOTE(review): `display` was rewritten above ('table-cell' -> 'inline',
    # every other 'table*' value -> 'block'), so the three comparisons below
    # cannot match as written. Confirm whether style['display'] was intended.
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'
    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                     'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Surround quotations with typographic double quotes.
        t = elem.text
        if not t:
            t = ''
        elem.text = '\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = '\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and
              hasattr(elem[0].tag, 'rpartition') and
              elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            # Whitespace in front of a non-inline first child is dropped.
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
                              'text-bottom', 'top', 'bottom') or (
        isinstance(valign, numbers.Number) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, numbers.Number) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Render the element into a scratch body first, then move the
        # result into a <sup>/<sub><small> pair in the real output.
        nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, base.tag('xhtml', 'body')))
        vbstate.para = etree.SubElement(vbstate.body, base.tag('xhtml', 'p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                            ignore_valign=True)
        # Drop the state this call pushed (the recursive call has already
        # removed its own), but never leave the stack empty.
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
            vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return
    if tag == 'blockquote':
        # Margins are always honoured inside a blockquote; the previous
        # setting is restored below even if converting the content fails.
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False
    try:
        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should
                # still be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem) == 0)):
            if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                try:
                    value = int(elem.attrib['value'])
                    istates[-2].list_num = value - 1
                except (TypeError, ValueError):
                    # Non-numeric value attribute: keep the running count.
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)
    finally:
        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim
    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        if para is not None and para.text == '\xa0' and len(para) < 1:
            # A block holding nothing but one non-breaking space.
            if style.height > 2:
                para.getparent().replace(para, etree.Element(base.tag('xhtml', 'br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    istates.pop()