mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-31 09:23:32 +02:00

Initial import

2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions


@@ -0,0 +1,4 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

File diff suppressed because it is too large.


@@ -0,0 +1,437 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import numbers
from functools import wraps
from css_parser.css import PropertyValue
from css_parser import profile as cssprofiles, CSSParser
from tinycss.fonts3 import parse_font, serialize_font_family
from calibre.ebooks.oeb.base import css_text
from polyglot.builtins import iteritems, string_or_bytes, unicode_type, zip
DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll', # {{{
'background-color': 'transparent', 'background-image': 'none',
'background-position': '0% 0%', 'background-repeat': 'repeat',
'border-bottom-color': 'currentColor', 'border-bottom-style':
'none', 'border-bottom-width': 'medium', 'border-collapse':
'separate', 'border-left-color': 'currentColor',
'border-left-style': 'none', 'border-left-width': 'medium',
'border-right-color': 'currentColor', 'border-right-style': 'none',
'border-right-width': 'medium', 'border-spacing': 0,
'border-top-color': 'currentColor', 'border-top-style': 'none',
'border-top-width': 'medium', 'bottom': 'auto', 'caption-side':
'top', 'clear': 'none', 'clip': 'auto', 'color': 'black',
'content': 'normal', 'counter-increment': 'none', 'counter-reset':
'none', 'cue-after': 'none', 'cue-before': 'none', 'cursor':
'auto', 'direction': 'ltr', 'display': 'inline', 'elevation':
'level', 'empty-cells': 'show', 'float': 'none', 'font-family':
'serif', 'font-size': 'medium', 'font-stretch': 'normal', 'font-style': 'normal',
'font-variant': 'normal', 'font-weight': 'normal', 'height':
'auto', 'left': 'auto', 'letter-spacing': 'normal', 'line-height':
'normal', 'list-style-image': 'none', 'list-style-position':
'outside', 'list-style-type': 'disc', 'margin-bottom': 0,
'margin-left': 0, 'margin-right': 0, 'margin-top': 0, 'max-height':
'none', 'max-width': 'none', 'min-height': 0, 'min-width': 0,
'orphans': '2', 'outline-color': 'invert', 'outline-style': 'none',
'outline-width': 'medium', 'overflow': 'visible', 'padding-bottom':
0, 'padding-left': 0, 'padding-right': 0, 'padding-top': 0,
'page-break-after': 'auto', 'page-break-before': 'auto',
'page-break-inside': 'auto', 'pause-after': 0, 'pause-before': 0,
'pitch': 'medium', 'pitch-range': '50', 'play-during': 'auto',
'position': 'static', 'quotes': u"'' '' '' ''", 'richness':
'50', 'right': 'auto', 'speak': 'normal', 'speak-header': 'once',
'speak-numeral': 'continuous', 'speak-punctuation': 'none',
'speech-rate': 'medium', 'stress': '50', 'table-layout': 'auto',
'text-align': 'auto', 'text-decoration': 'none', 'text-indent': 0,
'text-shadow': 'none', 'text-transform': 'none', 'top': 'auto',
'unicode-bidi': 'normal', 'vertical-align': 'baseline',
'visibility': 'visible', 'voice-family': 'default', 'volume':
'medium', 'white-space': 'normal', 'widows': '2', 'width': 'auto',
'word-spacing': 'normal', 'z-index': 'auto'}
# }}}
EDGES = ('top', 'right', 'bottom', 'left')
BORDER_PROPS = ('color', 'style', 'width')
def normalize_edge(name, cssvalue):
style = {}
if isinstance(cssvalue, PropertyValue):
primitives = [css_text(v) for v in cssvalue]
else:
primitives = [css_text(cssvalue)]
if len(primitives) == 1:
value, = primitives
values = (value, value, value, value)
elif len(primitives) == 2:
vert, horiz = primitives
values = (vert, horiz, vert, horiz)
elif len(primitives) == 3:
top, horiz, bottom = primitives
values = (top, horiz, bottom, horiz)
else:
values = primitives[:4]
if '-' in name:
l, _, r = name.partition('-')
for edge, value in zip(EDGES, values):
style['%s-%s-%s' % (l, edge, r)] = value
else:
for edge, value in zip(EDGES, values):
style['%s-%s' % (name, edge)] = value
return style
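# Illustrative note (not in the upstream source): normalize_edge expands the
# standard 1/2/3/4 value CSS edge shorthands. For example, a parsed
# "margin: 1em 2em" yields {'margin-top': '1em', 'margin-right': '2em',
# 'margin-bottom': '1em', 'margin-left': '2em'}, while "border-width: thin"
# copies 'thin' into all four border-*-width longhands.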
def simple_normalizer(prefix, names, check_inherit=True):
composition = tuple('%s-%s' %(prefix, n) for n in names)
@wraps(normalize_simple_composition)
def wrapper(name, cssvalue):
return normalize_simple_composition(name, cssvalue, composition, check_inherit=check_inherit)
return wrapper
def normalize_simple_composition(name, cssvalue, composition, check_inherit=True):
if check_inherit and css_text(cssvalue) == 'inherit':
style = {k:'inherit' for k in composition}
else:
style = {k:DEFAULTS[k] for k in composition}
try:
primitives = [css_text(v) for v in cssvalue]
except TypeError:
primitives = [css_text(cssvalue)]
while primitives:
value = primitives.pop()
for key in composition:
if cssprofiles.validate(key, value):
style[key] = value
break
return style
font_composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family')
def normalize_font(cssvalue, font_family_as_list=False):
# See https://developer.mozilla.org/en-US/docs/Web/CSS/font
composition = font_composition
val = css_text(cssvalue)
if val == 'inherit':
ans = {k:'inherit' for k in composition}
elif val in {'caption', 'icon', 'menu', 'message-box', 'small-caption', 'status-bar'}:
ans = {k:DEFAULTS[k] for k in composition}
else:
ans = {k:DEFAULTS[k] for k in composition}
ans.update(parse_font(val))
if font_family_as_list:
if isinstance(ans['font-family'], string_or_bytes):
ans['font-family'] = [x.strip() for x in ans['font-family'].split(',')]
else:
if not isinstance(ans['font-family'], string_or_bytes):
ans['font-family'] = serialize_font_family(ans['font-family'])
return ans
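# Illustrative note (not in the upstream source): the shorthand
# "font: bold italic large serif" expands to font-weight: bold,
# font-style: italic, font-size: large, font-family: serif, with the
# remaining longhands (font-variant, line-height) filled from DEFAULTS.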
def normalize_border(name, cssvalue):
style = normalizers['border-' + EDGES[0]]('border-' + EDGES[0], cssvalue)
vals = style.copy()
for edge in EDGES[1:]:
style.update({k.replace(EDGES[0], edge):v for k, v in iteritems(vals)})
return style
normalizers = {
'list-style': simple_normalizer('list-style', ('type', 'position', 'image')),
'font': lambda prop, v: normalize_font(v),
'border': normalize_border,
}
for x in ('margin', 'padding', 'border-style', 'border-width', 'border-color'):
normalizers[x] = normalize_edge
for x in EDGES:
name = 'border-' + x
normalizers[name] = simple_normalizer(name, BORDER_PROPS, check_inherit=False)
SHORTHAND_DEFAULTS = {
'margin': '0', 'padding': '0', 'border-style': 'none', 'border-width': '0', 'border-color': 'currentColor',
'border':'none', 'border-left': 'none', 'border-right':'none', 'border-top': 'none', 'border-bottom': 'none',
'list-style': 'inherit', 'font': 'inherit',
}
_safe_parser = None
def safe_parser():
global _safe_parser
if _safe_parser is None:
import logging
_safe_parser = CSSParser(loglevel=logging.CRITICAL, validate=False)
return _safe_parser
def normalize_filter_css(props):
ans = set()
p = safe_parser()
for prop in props:
n = normalizers.get(prop, None)
ans.add(prop)
if n is not None and prop in SHORTHAND_DEFAULTS:
dec = p.parseStyle('%s: %s' % (prop, SHORTHAND_DEFAULTS[prop]))
cssvalue = dec.getPropertyCSSValue(dec.item(0))
ans |= set(n(prop, cssvalue))
return ans
def condense_edge(vals):
edges = {x.name.rpartition('-')[-1]:x.value for x in vals}
if len(edges) != 4 or set(edges) != {'left', 'top', 'right', 'bottom'}:
return
ce = {}
for (x, y) in [('left', 'right'), ('top', 'bottom')]:
if edges[x] == edges[y]:
ce[x] = edges[x]
else:
ce[x], ce[y] = edges[x], edges[y]
if len(ce) == 4:
return ' '.join(ce[x] for x in ('top', 'right', 'bottom', 'left'))
if len(ce) == 3:
if 'right' in ce:
return ' '.join(ce[x] for x in ('top', 'right', 'top', 'left'))
return ' '.join(ce[x] for x in ('top', 'left', 'bottom'))
if len(ce) == 2:
if ce['top'] == ce['left']:
return ce['top']
return ' '.join(ce[x] for x in ('top', 'left'))
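# Illustrative note (not in the upstream source): condense_edge is the inverse
# of normalize_edge. Given the four margin-* longhands with top=2pt,
# right=3pt, bottom=4pt, left=1pt it returns '2pt 3pt 4pt 1pt'; four identical
# values collapse to a single value, and matching left/right or top/bottom
# pairs collapse to the two- or three-value forms.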
def simple_condenser(prefix, func):
@wraps(func)
def condense_simple(style, props):
cp = func(props)
if cp is not None:
for prop in props:
style.removeProperty(prop.name)
style.setProperty(prefix, cp)
return condense_simple
def condense_border(style, props):
prop_map = {p.name:p for p in props}
edge_vals = []
for edge in EDGES:
name = 'border-%s' % edge
vals = []
for prop in BORDER_PROPS:
x = prop_map.get('%s-%s' % (name, prop), None)
if x is not None:
vals.append(x)
if len(vals) == 3:
for prop in vals:
style.removeProperty(prop.name)
style.setProperty(name, ' '.join(x.value for x in vals))
prop_map[name] = style.getProperty(name)
x = prop_map.get(name, None)
if x is not None:
edge_vals.append(x)
if len(edge_vals) == 4 and len({x.value for x in edge_vals}) == 1:
for prop in edge_vals:
style.removeProperty(prop.name)
style.setProperty('border', edge_vals[0].value)
condensers = {'margin': simple_condenser('margin', condense_edge), 'padding': simple_condenser('padding', condense_edge), 'border': condense_border}
def condense_rule(style):
expanded = {'margin-':[], 'padding-':[], 'border-':[]}
for prop in style.getProperties():
for x in expanded:
if prop.name and prop.name.startswith(x):
expanded[x].append(prop)
break
for prefix, vals in iteritems(expanded):
if len(vals) > 1 and {x.priority for x in vals} == {''}:
condensers[prefix[:-1]](style, vals)
def condense_sheet(sheet):
for rule in sheet.cssRules:
if rule.type == rule.STYLE_RULE:
condense_rule(rule.style)
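# Minimal usage sketch (not in the upstream source; assumes the cssutils-style
# css_parser.parseString API):
#   import css_parser
#   sheet = css_parser.parseString(
#       'p { margin-top: 1pt; margin-right: 1pt; margin-bottom: 1pt; margin-left: 1pt }')
#   condense_sheet(sheet)  # rewrites the rule in place as "p { margin: 1pt }"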
def test_normalization(return_tests=False): # {{{
import unittest
from css_parser import parseStyle
from itertools import product
class TestNormalization(unittest.TestCase):
longMessage = True
maxDiff = None
def test_font_normalization(self):
def font_dict(expected):
ans = {k:DEFAULTS[k] for k in font_composition} if expected else {}
ans.update(expected)
return ans
for raw, expected in iteritems({
'some_font': {'font-family':'some_font'}, 'inherit':{k:'inherit' for k in font_composition},
'1.2pt/1.4 A_Font': {'font-family':'A_Font', 'font-size':'1.2pt', 'line-height':'1.4'},
'bad font': {'font-family':'"bad font"'}, '10% serif': {'font-family':'serif', 'font-size':'10%'},
'12px "My Font", serif': {'font-family':'"My Font", serif', 'font-size': '12px'},
'normal 0.6em/135% arial,sans-serif': {'font-family': 'arial, sans-serif', 'font-size': '0.6em', 'line-height':'135%', 'font-style':'normal'},
'bold italic large serif': {'font-family':'serif', 'font-weight':'bold', 'font-style':'italic', 'font-size':'large'},
'bold italic small-caps larger/normal serif':
{'font-family':'serif', 'font-weight':'bold', 'font-style':'italic', 'font-size':'larger',
'line-height':'normal', 'font-variant':'small-caps'},
'2em A B': {'font-family': '"A B"', 'font-size': '2em'},
}):
val = tuple(parseStyle('font: %s' % raw, validate=False))[0].cssValue
style = normalizers['font']('font', val)
self.assertDictEqual(font_dict(expected), style, raw)
def test_border_normalization(self):
def border_edge_dict(expected, edge='right'):
ans = {'border-%s-%s' % (edge, x): DEFAULTS['border-%s-%s' % (edge, x)] for x in ('style', 'width', 'color')}
for x, v in iteritems(expected):
ans['border-%s-%s' % (edge, x)] = v
return ans
def border_dict(expected):
ans = {}
for edge in EDGES:
ans.update(border_edge_dict(expected, edge))
return ans
def border_val_dict(expected, val='color'):
ans = {'border-%s-%s' % (edge, val): DEFAULTS['border-%s-%s' % (edge, val)] for edge in EDGES}
for edge in EDGES:
ans['border-%s-%s' % (edge, val)] = expected
return ans
for raw, expected in iteritems({
'solid 1px red': {'color':'red', 'width':'1px', 'style':'solid'},
'1px': {'width': '1px'}, '#aaa': {'color': '#aaa'},
'2em groove': {'width':'2em', 'style':'groove'},
}):
for edge in EDGES:
br = 'border-%s' % edge
val = tuple(parseStyle('%s: %s' % (br, raw), validate=False))[0].cssValue
self.assertDictEqual(border_edge_dict(expected, edge), normalizers[br](br, val))
for raw, expected in iteritems({
'solid 1px red': {'color':'red', 'width':'1px', 'style':'solid'},
'1px': {'width': '1px'}, '#aaa': {'color': '#aaa'},
'thin groove': {'width':'thin', 'style':'groove'},
}):
val = tuple(parseStyle('%s: %s' % ('border', raw), validate=False))[0].cssValue
self.assertDictEqual(border_dict(expected), normalizers['border']('border', val))
for name, val in iteritems({
'width': '10%', 'color': 'rgb(0, 1, 1)', 'style': 'double',
}):
cval = tuple(parseStyle('border-%s: %s' % (name, val), validate=False))[0].cssValue
self.assertDictEqual(border_val_dict(val, name), normalizers['border-'+name]('border-'+name, cval))
def test_edge_normalization(self):
def edge_dict(prefix, expected):
return {'%s-%s' % (prefix, edge) : x for edge, x in zip(EDGES, expected)}
for raw, expected in iteritems({
'2px': ('2px', '2px', '2px', '2px'),
'1em 2em': ('1em', '2em', '1em', '2em'),
'1em 2em 3em': ('1em', '2em', '3em', '2em'),
'1 2 3 4': ('1', '2', '3', '4'),
}):
for prefix in ('margin', 'padding'):
cval = tuple(parseStyle('%s: %s' % (prefix, raw), validate=False))[0].cssValue
self.assertDictEqual(edge_dict(prefix, expected), normalizers[prefix](prefix, cval))
def test_list_style_normalization(self):
def ls_dict(expected):
ans = {'list-style-%s' % x : DEFAULTS['list-style-%s' % x] for x in ('type', 'image', 'position')}
for k, v in iteritems(expected):
ans['list-style-%s' % k] = v
return ans
for raw, expected in iteritems({
'url(http://www.example.com/images/list.png)': {'image': 'url(http://www.example.com/images/list.png)'},
'inside square': {'position':'inside', 'type':'square'},
'upper-roman url(img) outside': {'position':'outside', 'type':'upper-roman', 'image':'url(img)'},
}):
cval = tuple(parseStyle('list-style: %s' % raw, validate=False))[0].cssValue
self.assertDictEqual(ls_dict(expected), normalizers['list-style']('list-style', cval))
def test_filter_css_normalization(self):
ae = self.assertEqual
ae({'font'} | set(font_composition), normalize_filter_css({'font'}))
for p in ('margin', 'padding'):
ae({p} | {p + '-' + x for x in EDGES}, normalize_filter_css({p}))
bvals = {'border-%s-%s' % (edge, x) for edge in EDGES for x in BORDER_PROPS}
ae(bvals | {'border'}, normalize_filter_css({'border'}))
for x in BORDER_PROPS:
sbvals = {'border-%s-%s' % (e, x) for e in EDGES}
ae(sbvals | {'border-%s' % x}, normalize_filter_css({'border-%s' % x}))
for e in EDGES:
sbvals = {'border-%s-%s' % (e, x) for x in BORDER_PROPS}
ae(sbvals | {'border-%s' % e}, normalize_filter_css({'border-%s' % e}))
ae({'list-style', 'list-style-image', 'list-style-type', 'list-style-position'}, normalize_filter_css({'list-style'}))
def test_edge_condensation(self):
for s, v in iteritems({
(1, 1, 3) : None,
(1, 2, 3, 4) : '2pt 3pt 4pt 1pt',
(1, 2, 3, 2) : '2pt 3pt 2pt 1pt',
(1, 2, 1, 3) : '2pt 1pt 3pt',
(1, 2, 1, 2) : '2pt 1pt',
(1, 1, 1, 1) : '1pt',
('2%', '2%', '2%', '2%') : '2%',
tuple('0 0 0 0'.split()) : '0',
}):
for prefix in ('margin', 'padding'):
css = {'%s-%s' % (prefix, x) : unicode_type(y)+'pt' if isinstance(y, numbers.Number) else y
for x, y in zip(('left', 'top', 'right', 'bottom'), s)}
css = '; '.join(('%s:%s' % (k, v) for k, v in iteritems(css)))
style = parseStyle(css)
condense_rule(style)
val = getattr(style.getProperty(prefix), 'value', None)
self.assertEqual(v, val)
if val is not None:
for edge in EDGES:
self.assertFalse(getattr(style.getProperty('%s-%s' % (prefix, edge)), 'value', None))
def test_border_condensation(self):
vals = 'red solid 5px'
css = '; '.join('border-%s-%s: %s' % (edge, p, v) for edge in EDGES for p, v in zip(BORDER_PROPS, vals.split()))
style = parseStyle(css)
condense_rule(style)
for e, p in product(EDGES, BORDER_PROPS):
self.assertFalse(style.getProperty('border-%s-%s' % (e, p)))
self.assertFalse(style.getProperty('border-%s' % e))
self.assertFalse(style.getProperty('border-%s' % p))
self.assertEqual(style.getProperty('border').value, vals)
css = '; '.join('border-%s-%s: %s' % (edge, p, v) for edge in ('top',) for p, v in zip(BORDER_PROPS, vals.split()))
style = parseStyle(css)
condense_rule(style)
self.assertEqual(css_text(style), 'border-top: %s' % vals)
css += ';' + '; '.join('border-%s-%s: %s' % (edge, p, v) for edge in ('right', 'left', 'bottom') for p, v in
zip(BORDER_PROPS, vals.replace('red', 'green').split()))
style = parseStyle(css)
condense_rule(style)
self.assertEqual(len(style.getProperties()), 4)
self.assertEqual(style.getProperty('border-top').value, vals)
self.assertEqual(style.getProperty('border-left').value, vals.replace('red', 'green'))
tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestNormalization)
if return_tests:
return tests
unittest.TextTestRunner(verbosity=4).run(tests)
# }}}
if __name__ == '__main__':
test_normalization()


@@ -0,0 +1,389 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from lxml import etree, html
from calibre import xml_replace_entities, force_unicode
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.constants import filesystem_encoding
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
class NotHTML(Exception):
def __init__(self, root_tag):
Exception.__init__(self, 'Data is not HTML')
self.root_tag = root_tag
def barename(name):
return name.rpartition('}')[-1]
def namespace(name):
return name.rpartition('}')[0][1:]
def XHTML(name):
return '{%s}%s' % (XHTML_NS, name)
def xpath(elem, expr):
return elem.xpath(expr, namespaces={'h':XHTML_NS})
def XPath(expr):
return etree.XPath(expr, namespaces={'h':XHTML_NS})
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
def merge_multiple_html_heads_and_bodies(root, log=None):
heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
if not (len(heads) > 1 or len(bodies) > 1):
return root
for child in root:
root.remove(child)
head = root.makeelement(XHTML('head'))
body = root.makeelement(XHTML('body'))
for h in heads:
for x in h:
head.append(x)
for b in bodies:
for x in b:
body.append(x)
tuple(map(root.append, (head, body)))
if log is not None:
log.warn('Merging multiple <head> and <body> sections')
return root
def clone_element(elem, nsmap={}, in_context=True):
if in_context:
maker = elem.getroottree().getroot().makeelement
else:
maker = etree.Element
nelem = maker(elem.tag, attrib=elem.attrib,
nsmap=nsmap)
nelem.text, nelem.tail = elem.text, elem.tail
nelem.extend(elem)
return nelem
def node_depth(node):
ans = 0
p = node.getparent()
while p is not None:
ans += 1
p = p.getparent()
return ans
def html5_parse(data, max_nesting_depth=100):
from html5_parser import parse
from calibre.utils.cleantext import clean_xml_chars
data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
# Check that the asinine HTML 5 algorithm did not result in a tree with
# insane nesting depths
for x in data.iterdescendants():
if isinstance(x.tag, string_or_bytes) and not len(x): # Leaf node
depth = node_depth(x)
if depth > max_nesting_depth:
raise ValueError('HTML 5 parsing resulted in a tree with nesting'
' depth > %d'%max_nesting_depth)
return data
def _html4_parse(data):
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
for elem in data.iter(tag=etree.Comment):
if elem.text:
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding='unicode')
data = safe_xml_fromstring(data)
return data
def clean_word_doc(data, log):
prefixes = []
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
prefixes.append(match.group(1))
if prefixes:
log.warn('Found microsoft markup, cleaning...')
# Remove empty tags as they are not rendered by browsers
# but can become renderable HTML tags like <p/> if the
# document is parsed by an HTML parser
pat = re.compile(
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
re.DOTALL)
data = pat.sub('', data)
pat = re.compile(
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
data = pat.sub('', data)
return data
def ensure_namespace_prefixes(node, nsmap):
namespace_uris = frozenset(itervalues(nsmap))
fnsmap = {k:v for k, v in iteritems(node.nsmap) if v not in namespace_uris}
fnsmap.update(nsmap)
if fnsmap != dict(node.nsmap):
node = clone_element(node, nsmap=fnsmap, in_context=False)
return node
class HTML5Doc(ValueError):
pass
def check_for_html5(prefix, root):
if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None:
if root.xpath('//svg'):
raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser')
def parse_html(data, log=None, decoder=None, preprocessor=None,
filename='<string>', non_html_file_tags=frozenset()):
if log is None:
from calibre.utils.logging import default_log
log = default_log
filename = force_unicode(filename, enc=filesystem_encoding)
if not isinstance(data, unicode_type):
if decoder is not None:
data = decoder(data)
else:
data = xml_to_unicode(data)[0]
data = strip_encoding_declarations(data)
# Remove DOCTYPE declaration as it messes up parsing
# In particular, it causes tostring to insert xmlns
# declarations, which messes up the coercing logic
pre = ''
idx = data.find('<html')
if idx == -1:
idx = data.find('<HTML')
has_html4_doctype = False
if idx > -1:
pre = data[:idx]
data = data[idx:]
if '<!DOCTYPE' in pre: # Handle user defined entities
# kindlegen produces invalid xhtml with uppercase attribute names
# if fed HTML 4 with uppercase attribute names, so try to detect
# and compensate for that.
has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
# Process private entities
user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
user_entities[match.group(1)] = val
if user_entities:
pat = re.compile(r'&(%s);'%('|'.join(list(user_entities.keys()))))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
if preprocessor is not None:
data = preprocessor(data)
# There could be null bytes in data if it had &#0; entities in it
data = data.replace('\0', '')
data = raw = clean_word_doc(data, log)
# Try with more & more drastic measures to parse
try:
data = safe_xml_fromstring(data, recover=False)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more'
' forgiving parsers')
raw = data = xml_replace_entities(raw)
try:
data = safe_xml_fromstring(data, recover=False)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename)
data = raw
try:
data = html5_parse(data)
except Exception:
log.exception(
'HTML 5 parsing failed, falling back to older parsers')
data = _html4_parse(data)
if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
# Lower case all tag and attribute names
data.tag = data.tag.lower()
for x in data.iterdescendants():
try:
x.tag = x.tag.lower()
for key, val in list(iteritems(x.attrib)):
del x.attrib[key]
key = key.lower()
x.attrib[key] = val
except:
pass
if barename(data.tag) != 'html':
if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = safe_xml_fromstring('<html></html>')
has_body = False
for child in list(data):
if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
has_body = True
break
parent = nroot
if not has_body:
log.warn('File %r appears to be an HTML fragment'%filename)
nroot = safe_xml_fromstring('<html><body/></html>')
parent = nroot[0]
for child in list(data.iter()):
oparent = child.getparent()
if oparent is not None:
oparent.remove(child)
parent.append(child)
data = nroot
# Force into the XHTML namespace
if not namespace(data.tag):
log.warn('Forcing', filename, 'into XHTML namespace')
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data, encoding='unicode')
try:
data = safe_xml_fromstring(data, recover=False)
except:
data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '')
try:
data = safe_xml_fromstring(data, recover=False)
except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'%
filename)
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
data)
data = data.replace(
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
'')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try:
data = safe_xml_fromstring(data)
except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = safe_xml_fromstring(data)
elif namespace(data.tag) != XHTML_NS:
# OEB_DOC_NS, but possibly others
ns = namespace(data.tag)
attrib = dict(data.attrib)
nroot = etree.Element(XHTML('html'),
nsmap={None: XHTML_NS}, attrib=attrib)
for elem in data.iterdescendants():
if isinstance(elem.tag, string_or_bytes) and \
namespace(elem.tag) == ns:
elem.tag = XHTML(barename(elem.tag))
for elem in data:
nroot.append(elem)
data = nroot
# Remove non default prefixes referring to the XHTML namespace
data = ensure_namespace_prefixes(data, {None: XHTML_NS})
data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/>
head = xpath(data, '/h:html/h:head')
head = head[0] if head else None
if head is None:
log.warn('File %s missing <head/> element' % filename)
head = etree.Element(XHTML('head'))
data.insert(0, head)
title = etree.SubElement(head, XHTML('title'))
title.text = _('Unknown')
elif not xpath(data, '/h:html/h:head/h:title'):
title = etree.SubElement(head, XHTML('title'))
title.text = _('Unknown')
# Ensure <title> is not empty
title = xpath(data, '/h:html/h:head/h:title')[0]
if not title.text or not title.text.strip():
title.text = _('Unknown')
# Remove any encoding-specifying <meta/> elements
for meta in META_XP(data):
meta.getparent().remove(meta)
meta = etree.SubElement(head, XHTML('meta'),
attrib={'http-equiv': 'Content-Type'})
meta.set('content', 'text/html; charset=utf-8') # Ensure content is second attribute
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'):
body = xpath(data, '//h:body')
if body:
body = body[0]
body.getparent().remove(body)
data.append(body)
else:
log.warn('File %s missing <body/> element' % filename)
etree.SubElement(data, XHTML('body'))
# Remove microsoft office markup
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
for x in r:
x.tag = XHTML('span')
def remove_elem(a):
p = a.getparent()
idx = p.index(a) -1
p.remove(a)
if a.tail:
if idx < 0:
if p.text is None:
p.text = ''
p.text += a.tail
else:
if p[idx].tail is None:
p[idx].tail = ''
p[idx].tail += a.tail
# Remove hyperlinks with no content as they cause rendering
# artifacts in browser based renderers
# Also remove empty <b>, <u> and <i> tags
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
if a.get('id', None) is None and a.get('name', None) is None \
and len(a) == 0 and not a.text:
remove_elem(a)
# Convert <br>s with content into paragraphs as ADE can't handle
# them
for br in xpath(data, '//h:br'):
if len(br) > 0 or br.text:
br.tag = XHTML('div')
# Remove any stray text in the <head> section and format it nicely
data.text = '\n '
head = xpath(data, '//h:head')
if head:
head = head[0]
head.text = '\n '
head.tail = '\n '
for child in head:
child.tail = '\n '
child.tail = '\n '
return data
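# Minimal usage sketch (not in the upstream source): parse_html accepts bytes
# or unicode and always returns an lxml root element coerced into the XHTML
# namespace, with <head>, <title>, <body> and a charset <meta> guaranteed to
# exist. 'chapter.html' below is only a placeholder file name:
#   root = parse_html(open('chapter.html', 'rb').read(), filename='chapter.html')
#   body = xpath(root, '/h:html/h:body')[0]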


@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

File diff suppressed because it is too large.


@@ -0,0 +1,23 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks import DRMError as _DRMError
class InvalidBook(ValueError):
pass
class DRMError(_DRMError):
def __init__(self):
super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
class MalformedMarkup(ValueError):
pass


@@ -0,0 +1,52 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml import etree
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES
from calibre.utils.localization import canonicalize_lang
def get_book_language(container):
for lang in container.opf_xpath('//dc:language'):
raw = lang.text
if raw:
code = canonicalize_lang(raw.split(',')[0].strip())
if code:
return code
def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = '{%s}reference' % OPF_NAMESPACES['opf']
href = None
if name:
href = container.name_to_href(name, container.opf_name)
if frag:
href += '#' + frag
guides = container.opf_xpath('//opf:guide')
if not guides and href:
g = container.opf.makeelement('{%s}guide' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']})
container.insert_into_xml(container.opf, g)
guides = [g]
for guide in guides:
matches = []
for child in guide.iterchildren(etree.Element):
if child.tag == ref_tag and child.get('type', '').lower() == item_type.lower():
matches.append(child)
if not matches and href:
r = guide.makeelement(ref_tag, type=item_type, nsmap={'opf':OPF_NAMESPACES['opf']})
container.insert_into_xml(guide, r)
matches.append(r)
for m in matches:
if href:
m.set('title', title), m.set('href', href), m.set('type', item_type)
else:
container.remove_from_xml(m)
container.dirty(container.opf_name)


@@ -0,0 +1,99 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml.etree import Element as LxmlElement
import html5_parser
from calibre import xml_replace_entities
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from calibre.utils.cleantext import clean_xml_chars
from polyglot.builtins import unicode_type
XHTML_NS = 'http://www.w3.org/1999/xhtml'
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities:
raw = xml_replace_entities(raw)
if fix_newlines:
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
raw = clean_xml_chars(raw)
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
if (discard_namespaces and root.tag != 'html') or (
not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
return root
def handle_private_entities(data):
# Process private entities
pre = ''
idx = data.find('<html')
if idx == -1:
idx = data.find('<HTML')
if idx > -1:
pre = data[:idx]
num_of_nl_in_pre = pre.count('\n')
if '<!DOCTYPE' in pre: # Handle user defined entities
user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
user_entities[match.group(1)] = val
if user_entities:
data = ('\n' * num_of_nl_in_pre) + data[idx:]
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
return data
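# Illustrative note (not in the upstream source): given markup such as
#   <!DOCTYPE html [ <!ENTITY co "Acme Corp"> ]>
#   <html><body><p>&co;</p></body></html>
# handle_private_entities substitutes "Acme Corp" for &co; and replaces the
# stripped preamble with the same number of newlines, so line numbers in the
# rest of the document are preserved.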
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = handle_private_entities(raw)
if replace_entities:
raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0;
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
# Remove any preamble before the opening html tag as it can cause problems,
# especially doctypes; preserve the original line numbers by inserting
# newlines at the start
pre = raw[:2048]
for match in re.finditer(r'<\s*html', pre, flags=re.I):
newlines = raw.count('\n', 0, match.start())
raw = ('\n' * newlines) + raw[match.start():]
break
raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
try:
ans = safe_xml_fromstring(raw, recover=False)
if ans.tag != '{%s}html' % XHTML_NS:
raise ValueError('Root tag is not <html> in the XHTML namespace')
if linenumber_attribute:
for elem in ans.iter(LxmlElement):
if elem.sourceline is not None:
elem.set(linenumber_attribute, unicode_type(elem.sourceline))
return ans
except Exception:
if log is not None:
log.exception('Failed to parse as XML, parsing as tag soup')
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
if __name__ == '__main__':
from lxml import etree
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0>&nbsp;\n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
print(etree.tostring(root, encoding='utf-8'))
print()


@@ -0,0 +1,252 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap
from polyglot.builtins import iteritems, map
# from lxml.etree import Element
from calibre import force_unicode
from calibre.ebooks.oeb.base import (
serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG)
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.utils.icu import sort_key
def isspace(x):
return not x.strip('\u0009\u000a\u000c\u000d\u0020')
def pretty_xml_tree(elem, level=0, indent=' '):
''' XML beautifier, assumes that elements that have children do not have
textual content. Also assumes that there is no text immediately after
closing tags. These are true for opf/ncx and container.xml files. If either
of the assumptions is violated, there should be no data loss, but pretty
printing won't produce optimal results.'''
if (not elem.text and len(elem) > 0) or (elem.text and isspace(elem.text)):
elem.text = '\n' + (indent * (level+1))
for i, child in enumerate(elem):
pretty_xml_tree(child, level=level+1, indent=indent)
if not child.tail or isspace(child.tail):
l = level + 1
if i == len(elem) - 1:
l -= 1
child.tail = '\n' + (indent * l)
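# Illustrative note (not in the upstream source): pretty_xml_tree indents a
# tree purely by rewriting the .text/.tail whitespace of its elements, e.g.
#   <package><metadata><dc:title>X</dc:title></metadata></package>
# becomes (with the default two-space indent)
#   <package>
#     <metadata>
#       <dc:title>X</dc:title>
#     </metadata>
#   </package>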
def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for
# the rest.
def dckey(x):
return {'title':0, 'creator':1}.get(barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc'])
dc_tags.sort(key=dckey)
for x in reversed(dc_tags):
metadata.insert(0, x)
# Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)}
def manifest_key(x):
mt = x.get('media-type', '')
href = x.get('href', '')
ext = href.rpartition('.')[-1].lower()
cat = 1000
if mt in OEB_DOCS:
cat = 0
elif mt == guess_type('a.ncx'):
cat = 1
elif mt in OEB_STYLES:
cat = 2
elif mt.startswith('image/'):
cat = 3
elif ext in {'otf', 'ttf', 'woff'}:
cat = 4
elif mt.startswith('audio/'):
cat = 5
elif mt.startswith('video/'):
cat = 6
if cat == 0:
i = spine_ids.get(x.get('id', None), 1000000000)
else:
i = sort_key(href)
return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES):
try:
children = sorted(manifest, key=manifest_key)
except AttributeError:
continue # There are comments, so don't sort since that would mess up the comments
for x in reversed(children):
manifest.insert(0, x)
SVG_TAG = SVG('svg')
BLOCK_TAGS = frozenset(map(XHTML, (
'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas', 'col', 'colgroup', 'dd',
'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li',
'noscript', 'ol', 'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img'))) | {SVG_TAG}
def isblock(x):
if callable(x.tag) or not x.tag:
return True
if x.tag in BLOCK_TAGS:
return True
return False
def has_only_blocks(x):
if hasattr(x.tag, 'split') and len(x) == 0:
# Tag with no children
return False
if x.text and not isspace(x.text):
return False
for child in x:
if not isblock(child) or (child.tail and not isspace(child.tail)):
return False
return True
def indent_for_tag(x):
prev = x.getprevious()
x = x.getparent().text if prev is None else prev.tail
if not x:
return ''
s = x.rpartition('\n')[-1]
return s if isspace(s) else ''
def set_indent(elem, attr, indent):
x = getattr(elem, attr)
if not x:
x = indent
else:
lines = x.splitlines()
if isspace(lines[-1]):
lines[-1] = indent
else:
lines.append(indent)
x = '\n'.join(lines)
setattr(elem, attr, x)
def pretty_block(parent, level=1, indent=' '):
''' Surround block tags with blank lines and recurse into child block tags
that contain only other block tags '''
if not parent.text or isspace(parent.text):
parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent)
elif child.tag == SVG_TAG:
pretty_xml_tree(child, level=level, indent=indent)
l = level
if i == len(parent) - 1:
l -= 1
if not child.tail or isspace(child.tail):
child.tail = ''
child.tail = child.tail + nn + (indent * l)
def pretty_script_or_style(container, child):
if child.text:
indent = indent_for_tag(child)
if child.tag.endswith('style'):
child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
child.text = textwrap.dedent(child.text)
child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
set_indent(child, 'text', indent)
def pretty_html_tree(container, root):
root.text = '\n\n'
for child in root:
child.tail = '\n\n'
if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'):
pretty_xml_tree(child)
for body in root.findall('h:body', namespaces=XPNSMAP):
pretty_block(body)
# Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag
# even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
pretty_block(body[0], level=2)
if container is not None:
# Handle <script> and <style> tags
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
pretty_script_or_style(container, child)
def fix_html(container, raw):
' Fix any parsing errors in the HTML represented as a string in raw. Fixing is done using the HTML5 parsing algorithm. '
root = container.parse_xhtml(raw)
return serialize(root, 'text/html')
def pretty_html(container, name, raw):
' Pretty print the HTML represented as a string in raw '
root = container.parse_xhtml(raw)
pretty_html_tree(container, root)
return serialize(root, 'text/html')
def pretty_css(container, name, raw):
' Pretty print the CSS represented as a string in raw '
sheet = container.parse_css(raw)
return serialize(sheet, 'text/css')
def pretty_xml(container, name, raw):
' Pretty print the XML represented as a string in raw. If ``name`` is the name of the OPF, extra OPF-specific prettying is performed. '
root = container.parse_xml(raw)
if name == container.opf_name:
pretty_opf(root)
pretty_xml_tree(root)
return serialize(root, 'text/xml')
def fix_all_html(container):
' Fix any parsing errors in all HTML files in the container. Fixing is done using the HTML5 parsing algorithm. '
for name, mt in iteritems(container.mime_map):
if mt in OEB_DOCS:
container.parsed(name)
container.dirty(name)
def pretty_all(container):
' Pretty print all HTML/CSS/XML files in the container '
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
for name, mt in iteritems(container.mime_map):
prettied = False
if mt in OEB_DOCS:
pretty_html_tree(container, container.parsed(name))
prettied = True
elif mt in OEB_STYLES:
container.parsed(name)
prettied = True
elif name == container.opf_name:
root = container.parsed(name)
pretty_opf(root)
pretty_xml_tree(root)
prettied = True
elif mt in xml_types:
pretty_xml_tree(container.parsed(name))
prettied = True
if prettied:
container.dirty(name)
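# Minimal usage sketch (not in the upstream source; assumes the
# calibre.ebooks.oeb.polish.container.get_container helper):
#   from calibre.ebooks.oeb.polish.container import get_container
#   container = get_container('book.epub', tweak_mode=True)
#   fix_all_html(container)
#   pretty_all(container)
#   container.commit()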


@@ -0,0 +1,891 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from collections import Counter, OrderedDict
from functools import partial
from operator import itemgetter
from lxml import etree
from lxml.builder import ElementMaker
from calibre import __version__
from calibre.ebooks.oeb.base import (
XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize, EPUB_NS, XML_NS, OEB_DOCS)
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.utils import guess_type, extract
from calibre.ebooks.oeb.polish.opf import set_guide_item, get_book_language
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
from calibre.translations.dynamic import translate
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, map, unicode_type
from polyglot.urllib import urlparse
ns = etree.FunctionNamespace('calibre_xpath_extensions')
ns.prefix = 'calibre'
ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x
class TOC(object):
toc_title = None
def __init__(self, title=None, dest=None, frag=None):
self.title, self.dest, self.frag = title, dest, frag
self.dest_exists = self.dest_error = None
if self.title:
self.title = self.title.strip()
self.parent = None
self.children = []
self.page_list = []
def add(self, title, dest, frag=None):
c = TOC(title, dest, frag)
self.children.append(c)
c.parent = self
return c
def remove(self, child):
self.children.remove(child)
child.parent = None
def remove_from_parent(self):
if self.parent is None:
return
idx = self.parent.children.index(self)
for child in reversed(self.children):
child.parent = self.parent
self.parent.children.insert(idx, child)
self.parent.children.remove(self)
self.parent = None
def __iter__(self):
for c in self.children:
yield c
def __len__(self):
return len(self.children)
def iterdescendants(self, level=None):
gc_level = None if level is None else level + 1
for child in self:
if level is None:
yield child
else:
yield level, child
for gc in child.iterdescendants(level=gc_level):
yield gc
def remove_duplicates(self, only_text=True):
seen = set()
remove = []
for child in self:
key = child.title if only_text else (child.title, child.dest, (child.frag or None))
if key in seen:
remove.append(child)
else:
seen.add(key)
child.remove_duplicates()
for child in remove:
self.remove(child)
@property
def depth(self):
"""The maximum depth of the navigation tree rooted at this node."""
try:
return max(node.depth for node in self) + 1
except ValueError:
return 1
@property
def last_child(self):
return self.children[-1] if self.children else None
def get_lines(self, lvl=0):
frag = ('#'+self.frag) if self.frag else ''
ans = [('\t'*lvl) + 'TOC: %s --> %s%s'%(self.title, self.dest, frag)]
for child in self:
ans.extend(child.get_lines(lvl+1))
return ans
def __str__(self):
return '\n'.join(self.get_lines())
def to_dict(self, node_counter=None):
ans = {
'title':self.title, 'dest':self.dest, 'frag':self.frag,
'children':[c.to_dict(node_counter) for c in self.children]
}
if self.dest_exists is not None:
ans['dest_exists'] = self.dest_exists
if self.dest_error is not None:
ans['dest_error'] = self.dest_error
if node_counter is not None:
ans['id'] = next(node_counter)
return ans
@property
def as_dict(self):
return self.to_dict()
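# Illustrative note (not in the upstream source): TOC nodes form a simple
# tree; add() returns the new child, so nesting can be built incrementally:
#   toc = TOC()
#   ch1 = toc.add('Chapter 1', 'ch1.html')
#   ch1.add('Section 1.1', 'ch1.html', frag='s11')
#   print(toc)  # tab-indented "TOC: title --> dest#frag" listing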
def child_xpath(tag, name):
return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]'%name)
def add_from_navpoint(container, navpoint, parent, ncx_name):
dest = frag = text = None
nl = child_xpath(navpoint, 'navlabel')
if nl:
nl = nl[0]
text = ''
for txt in child_xpath(nl, 'text'):
text += etree.tostring(txt, method='text',
encoding='unicode', with_tail=False)
content = child_xpath(navpoint, 'content')
if content:
content = content[0]
href = content.get('src', None)
if href:
dest = container.href_to_name(href, base=ncx_name)
frag = urlparse(href).fragment or None
return parent.add(text or None, dest or None, frag or None)
def process_ncx_node(container, node, toc_parent, ncx_name):
for navpoint in node.xpath('./*[calibre:lower-case(local-name()) = "navpoint"]'):
child = add_from_navpoint(container, navpoint, toc_parent, ncx_name)
if child is not None:
process_ncx_node(container, navpoint, child, ncx_name)
def parse_ncx(container, ncx_name):
root = container.parsed(ncx_name)
toc_root = TOC()
navmaps = root.xpath('//*[calibre:lower-case(local-name()) = "navmap"]')
if navmaps:
process_ncx_node(container, navmaps[0], toc_root, ncx_name)
toc_root.lang = toc_root.uid = None
for attr, val in iteritems(root.attrib):
if attr.endswith('lang'):
toc_root.lang = unicode_type(val)
break
for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'):
if uid:
toc_root.uid = unicode_type(uid)
break
for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'):
pagenum = pt.get('value')
if pagenum:
href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src')
if href:
dest = container.href_to_name(href[0], base=ncx_name)
frag = urlparse(href[0]).fragment or None
toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag})
return toc_root
def add_from_li(container, li, parent, nav_name):
dest = frag = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')):
text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
href = x.get('href')
if href:
dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name)
frag = urlparse(href).fragment or None
break
return parent.add(text or None, dest or None, frag or None)
def first_child(parent, tagname):
try:
return next(parent.iterchildren(tagname))
except StopIteration:
return None
def process_nav_node(container, node, toc_parent, nav_name):
for li in node.iterchildren(XHTML('li')):
child = add_from_li(container, li, toc_parent, nav_name)
ol = first_child(li, XHTML('ol'))
if child is not None and ol is not None:
process_nav_node(container, ol, child, nav_name)
def parse_nav(container, nav_name):
root = container.parsed(nav_name)
toc_root = TOC()
toc_root.lang = toc_root.uid = None
et = '{%s}type' % EPUB_NS
for nav in root.iterdescendants(XHTML('nav')):
if nav.get(et) == 'toc':
ol = first_child(nav, XHTML('ol'))
if ol is not None:
process_nav_node(container, ol, toc_root, nav_name)
for h in nav.iterchildren(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
text = etree.tostring(h, method='text', encoding='unicode', with_tail=False) or h.get('title')
if text:
toc_root.toc_title = text
break
break
return toc_root
def verify_toc_destinations(container, toc):
anchor_map = {}
anchor_xpath = XPath('//*/@id|//h:a/@name')
for item in toc.iterdescendants():
name = item.dest
if not name:
item.dest_exists = False
item.dest_error = _('No file named %s exists')%name
continue
try:
root = container.parsed(name)
except KeyError:
item.dest_exists = False
item.dest_error = _('No file named %s exists')%name
continue
if not hasattr(root, 'xpath'):
item.dest_exists = False
item.dest_error = _('No HTML file named %s exists')%name
continue
if not item.frag:
item.dest_exists = True
continue
if name not in anchor_map:
anchor_map[name] = frozenset(anchor_xpath(root))
item.dest_exists = item.frag in anchor_map[name]
if not item.dest_exists:
item.dest_error = _(
'The anchor %(a)s does not exist in file %(f)s')%dict(
a=item.frag, f=name)
def find_existing_ncx_toc(container):
toc = container.opf_xpath('//opf:spine/@toc')
if toc:
toc = container.manifest_id_map.get(toc[0], None)
if not toc:
ncx = guess_type('a.ncx')
toc = container.manifest_type_map.get(ncx, [None])[0]
return toc or None
def find_existing_nav_toc(container):
for name in container.manifest_items_with_property('nav'):
return name
def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
def empty_toc():
ans = TOC()
ans.lang = ans.uid = None
return ans
toc = find_toc(container)
ans = empty_toc() if toc is None or not container.has_name(toc) else parse_toc(container, toc)
ans.toc_file_name = toc if toc and container.has_name(toc) else None
if verify_destinations:
verify_toc_destinations(container, ans)
return ans
def get_toc(container, verify_destinations=True):
ver = container.opf_version_parsed
if ver.major < 3:
return get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
else:
ans = get_x_toc(container, find_existing_nav_toc, parse_nav, verify_destinations=verify_destinations)
if len(ans) == 0:
ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
return ans
def get_guide_landmarks(container):
for ref in container.opf_xpath('./opf:guide/opf:reference'):
href, title, rtype = ref.get('href'), ref.get('title'), ref.get('type')
href, frag = href.partition('#')[::2]
name = container.href_to_name(href, container.opf_name)
if container.has_name(name):
yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
def get_nav_landmarks(container):
nav = find_existing_nav_toc(container)
if nav and container.has_name(nav):
root = container.parsed(nav)
et = '{%s}type' % EPUB_NS
for elem in root.iterdescendants(XHTML('nav')):
if elem.get(et) == 'landmarks':
for li in elem.iterdescendants(XHTML('li')):
for a in li.iterdescendants(XHTML('a')):
href, rtype = a.get('href'), a.get(et)
if href:
title = etree.tostring(a, method='text', encoding='unicode', with_tail=False).strip()
href, frag = href.partition('#')[::2]
name = container.href_to_name(href, nav)
if container.has_name(name):
yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
break
def get_landmarks(container):
ver = container.opf_version_parsed
if ver.major < 3:
return list(get_guide_landmarks(container))
ans = list(get_nav_landmarks(container))
if len(ans) == 0:
ans = list(get_guide_landmarks(container))
return ans
def ensure_id(elem, all_ids):
elem_id = elem.get('id')
if elem_id:
return False, elem_id
if elem.tag == XHTML('a'):
anchor = elem.get('name', None)
if anchor:
elem.set('id', anchor)
return False, anchor
c = 0
while True:
c += 1
q = 'toc_{}'.format(c)
if q not in all_ids:
elem.set('id', q)
all_ids.add(q)
break
return True, elem.get('id')
def elem_to_toc_text(elem):
text = xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
if not text:
text = _('(Untitled)')
return text
def item_at_top(elem):
try:
body = XPath('//h:body')(elem.getroottree().getroot())[0]
except (TypeError, IndexError, KeyError, AttributeError):
return False
tree = body.getroottree()
path = tree.getpath(elem)
for el in body.iterdescendants(etree.Element):
epath = tree.getpath(el)
if epath == path:
break
try:
if el.tag.endswith('}img') or (el.text and el.text.strip()):
return False
except:
return False
if not path.startswith(epath):
# Only check tail of non-parent elements
if el.tail and el.tail.strip():
return False
return True
def from_xpaths(container, xpaths):
'''
Generate a Table of Contents from a list of XPath expressions. Each
expression in the list corresponds to a level of the generated ToC. For
example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three level
Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
'''
tocroot = TOC()
xpaths = [XPath(xp) for xp in xpaths]
# Find those levels that have no elements in all spine items
maps = OrderedDict()
empty_levels = {i+1 for i, xp in enumerate(xpaths)}
for spinepath in container.spine_items:
name = container.abspath_to_name(spinepath)
root = container.parsed(name)
level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
for lvl, elems in iteritems(level_item_map):
if elems:
empty_levels.discard(lvl)
# Remove empty levels from all level_maps
if empty_levels:
for name, lmap in tuple(iteritems(maps)):
lmap = {lvl:items for lvl, items in iteritems(lmap) if lvl not in empty_levels}
lmap = sorted(iteritems(lmap), key=itemgetter(0))
lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
maps[name] = lmap
node_level_map = {tocroot: 0}
def parent_for_level(child_level):
limit = child_level - 1
def process_node(node):
child = node.last_child
if child is None:
return node
lvl = node_level_map[child]
return node if lvl > limit else child if lvl == limit else process_node(child)
return process_node(tocroot)
for name, level_item_map in iteritems(maps):
root = container.parsed(name)
item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems}
item_dirtied = False
all_ids = set(root.xpath('//*/@id'))
for item in root.iterdescendants(etree.Element):
lvl = item_level_map.get(item, None)
if lvl is None:
continue
text = elem_to_toc_text(item)
parent = parent_for_level(lvl)
if item_at_top(item):
dirtied, elem_id = False, None
else:
dirtied, elem_id = ensure_id(item, all_ids)
item_dirtied = dirtied or item_dirtied
toc = parent.add(text, name, elem_id)
node_level_map[toc] = lvl
toc.dest_exists = True
if item_dirtied:
container.commit_item(name, keep_parsed=True)
return tocroot
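# Minimal usage sketch (not in the upstream source; assumes an open polish
# Container instance named `container`): build a two level ToC from the
# <h1> and <h2> tags of every spine item and write it out as an NCX:
#   toc = from_xpaths(container, ['//h:h1', '//h:h2'])
#   commit_ncx_toc(container, toc)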
def from_links(container):
'''
Generate a Table of Contents from links in the book.
'''
toc = TOC()
link_path = XPath('//h:a[@href]')
seen_titles, seen_dests = set(), set()
for name, is_linear in container.spine_names:
root = container.parsed(name)
for a in link_path(root):
href = a.get('href')
if not href or not href.strip():
continue
frag = None
if href.startswith('#'):
dest = name
frag = href[1:]
else:
href, _, frag = href.partition('#')
dest = container.href_to_name(href, base=name)
frag = frag or None
if (dest, frag) in seen_dests:
continue
seen_dests.add((dest, frag))
text = elem_to_toc_text(a)
if text in seen_titles:
continue
seen_titles.add(text)
toc.add(text, dest, frag=frag)
verify_toc_destinations(container, toc)
for child in toc:
if not child.dest_exists:
toc.remove(child)
return toc
def find_text(node):
LIMIT = 200
pat = re.compile(r'\s+')
for child in node:
if isinstance(child, etree._Element):
text = xml2text(child).strip()
text = pat.sub(' ', text)
if len(text) < 1:
continue
if len(text) > LIMIT:
# Look for less text in a child of this node, recursively
ntext = find_text(child)
return ntext or (text[:LIMIT] + '...')
else:
return text
def from_files(container):
'''
Generate a Table of Contents from files in the book.
'''
toc = TOC()
for i, spinepath in enumerate(container.spine_items):
name = container.abspath_to_name(spinepath)
root = container.parsed(name)
body = XPath('//h:body')(root)
if not body:
continue
text = find_text(body[0])
if not text:
text = name.rpartition('/')[-1]
if i == 0 and text.rpartition('.')[0].lower() in {'titlepage', 'cover'}:
text = _('Cover')
toc.add(text, name)
return toc
def node_from_loc(root, locs, totals=None):
node = root.xpath('//*[local-name()="body"]')[0]
for i, loc in enumerate(locs):
children = tuple(node.iterchildren(etree.Element))
if totals is not None and totals[i] != len(children):
raise MalformedMarkup()
node = children[loc]
return node
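# For example (sketch): locs == [1, 0] selects body -> second element child ->
# its first element child. When 'totals' is supplied it must list the expected
# child counts at each step; a mismatch raises MalformedMarkup.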
def add_id(container, name, loc, totals=None):
root = container.parsed(name)
try:
node = node_from_loc(root, loc, totals=totals)
except MalformedMarkup:
# The webkit HTML parser and the container parser have yielded
# different node counts; this can happen if the file is valid XML
# but contains constructs like nested <p> tags. So force parse it
# with the HTML 5 parser and try again.
raw = container.raw_data(name)
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
try:
node = node_from_loc(root, loc, totals=totals)
except MalformedMarkup:
raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
' before editing.') % name)
container.replace(name, root)
if not node.get('id'):
ensure_id(node, set(root.xpath('//*/@id')))
container.commit_item(name, keep_parsed=True)
return node.get('id')
def create_ncx(toc, to_href, btitle, lang, uid):
lang = lang.replace('_', '-')
ncx = etree.Element(NCX('ncx'),
attrib={'version': '2005-1', XML('lang'): lang},
nsmap={None: NCX_NS})
head = etree.SubElement(ncx, NCX('head'))
etree.SubElement(head, NCX('meta'),
name='dtb:uid', content=unicode_type(uid))
etree.SubElement(head, NCX('meta'),
name='dtb:depth', content=unicode_type(toc.depth))
generator = ''.join(['calibre (', __version__, ')'])
etree.SubElement(head, NCX('meta'),
name='dtb:generator', content=generator)
etree.SubElement(head, NCX('meta'), name='dtb:totalPageCount', content='0')
etree.SubElement(head, NCX('meta'), name='dtb:maxPageNumber', content='0')
title = etree.SubElement(ncx, NCX('docTitle'))
text = etree.SubElement(title, NCX('text'))
text.text = btitle
navmap = etree.SubElement(ncx, NCX('navMap'))
spat = re.compile(r'\s+')
play_order = Counter()
def process_node(xml_parent, toc_parent):
for child in toc_parent:
play_order['c'] += 1
point = etree.SubElement(xml_parent, NCX('navPoint'), id='num_%d' % play_order['c'],
playOrder=unicode_type(play_order['c']))
label = etree.SubElement(point, NCX('navLabel'))
title = child.title
if title:
title = spat.sub(' ', title)
etree.SubElement(label, NCX('text')).text = title
if child.dest:
href = to_href(child.dest)
if child.frag:
href += '#'+child.frag
etree.SubElement(point, NCX('content'), src=href)
process_node(point, child)
process_node(navmap, toc)
return ncx
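# Rough shape of the markup produced above (illustrative values only):
#   <navMap>
#     <navPoint id="num_1" playOrder="1">
#       <navLabel><text>Chapter 1</text></navLabel>
#       <content src="chapter1.html#start"/>
#       <!-- nested navPoint elements for child TOC nodes -->
#     </navPoint>
#   </navMap>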
def commit_ncx_toc(container, toc, lang=None, uid=None):
tocname = find_existing_ncx_toc(container)
if tocname is None:
item = container.generate_item('toc.ncx', id_prefix='toc')
tocname = container.href_to_name(item.get('href'), base=container.opf_name)
ncx_id = item.get('id')
[s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')]
if not lang:
lang = get_lang()
for l in container.opf_xpath('//dc:language'):
l = canonicalize_lang(xml2text(l).strip())
if l:
lang = l
lang = lang_as_iso639_1(l) or l
break
lang = lang_as_iso639_1(lang) or lang
if not uid:
uid = uuid_id()
eid = container.opf.get('unique-identifier', None)
if eid:
m = container.opf_xpath('//*[@id="%s"]'%eid)
if m:
uid = xml2text(m[0])
title = _('Table of Contents')
m = container.opf_xpath('//dc:title')
if m:
x = xml2text(m[0]).strip()
title = x or title
to_href = partial(container.name_to_href, base=tocname)
root = create_ncx(toc, to_href, title, lang, uid)
container.replace(tocname, root)
container.pretty_print.add(tocname)
def ensure_single_nav_of_type(root, ntype='toc'):
et = '{%s}type' % EPUB_NS
navs = [n for n in root.iterdescendants(XHTML('nav')) if n.get(et) == ntype]
for x in navs[1:]:
extract(x)
if navs:
nav = navs[0]
tail = nav.tail
attrib = dict(nav.attrib)
nav.clear()
nav.attrib.update(attrib)
nav.tail = tail
else:
nav = root.makeelement(XHTML('nav'))
first_child(root, XHTML('body')).append(nav)
nav.set('{%s}type' % EPUB_NS, ntype)
return nav
def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree
tocname = find_existing_nav_toc(container)
if previous_nav is not None:
nav_name = container.href_to_name(previous_nav[0])
if nav_name and container.exists(nav_name):
tocname = nav_name
container.apply_unique_properties(tocname, 'nav')
if tocname is None:
item = container.generate_item('nav.xhtml', id_prefix='nav')
item.set('properties', 'nav')
tocname = container.href_to_name(item.get('href'), base=container.opf_name)
if previous_nav is not None:
root = previous_nav[1]
else:
root = container.parse_xhtml(P('templates/new_nav.html', data=True).decode('utf-8'))
container.replace(tocname, root)
else:
root = container.parsed(tocname)
if lang:
lang = lang_as_iso639_1(lang) or lang
root.set('lang', lang)
root.set('{%s}lang' % XML_NS, lang)
nav = ensure_single_nav_of_type(root, 'toc')
if toc.toc_title:
nav.append(nav.makeelement(XHTML('h1')))
nav[-1].text = toc.toc_title
rnode = nav.makeelement(XHTML('ol'))
nav.append(rnode)
to_href = partial(container.name_to_href, base=tocname)
spat = re.compile(r'\s+')
def process_node(xml_parent, toc_parent):
for child in toc_parent:
li = xml_parent.makeelement(XHTML('li'))
xml_parent.append(li)
title = child.title or ''
title = spat.sub(' ', title).strip()
a = li.makeelement(XHTML('a' if child.dest else 'span'))
a.text = title
li.append(a)
if child.dest:
href = to_href(child.dest)
if child.frag:
href += '#'+child.frag
a.set('href', href)
if len(child):
ol = li.makeelement(XHTML('ol'))
li.append(ol)
process_node(ol, child)
process_node(rnode, toc)
pretty_xml_tree(nav)
def collapse_li(parent):
for li in parent.iterdescendants(XHTML('li')):
if len(li) == 1:
li.text = None
li[0].tail = None
collapse_li(nav)
nav.tail = '\n'
def create_li(ol, entry):
li = ol.makeelement(XHTML('li'))
ol.append(li)
a = li.makeelement(XHTML('a'))
li.append(a)
href = container.name_to_href(entry['dest'], tocname)
if entry['frag']:
href += '#' + entry['frag']
a.set('href', href)
return a
if landmarks is not None:
nav = ensure_single_nav_of_type(root, 'landmarks')
nav.set('hidden', '')
ol = nav.makeelement(XHTML('ol'))
nav.append(ol)
for entry in landmarks:
if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
a = create_li(ol, entry)
a.set('{%s}type' % EPUB_NS, entry['type'])
a.text = entry['title'] or None
pretty_xml_tree(nav)
collapse_li(nav)
if toc.page_list:
nav = ensure_single_nav_of_type(root, 'page-list')
nav.set('hidden', '')
ol = nav.makeelement(XHTML('ol'))
nav.append(ol)
for entry in toc.page_list:
if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
a = create_li(ol, entry)
a.text = unicode_type(entry['pagenum'])
pretty_xml_tree(nav)
collapse_li(nav)
container.replace(tocname, root)
def commit_toc(container, toc, lang=None, uid=None):
commit_ncx_toc(container, toc, lang=lang, uid=uid)
if container.opf_version_parsed.major > 2:
commit_nav_toc(container, toc, lang=lang)
def remove_names_from_toc(container, names):
changed = []
names = frozenset(names)
for find_toc, parse_toc, commit_toc in (
(find_existing_ncx_toc, parse_ncx, commit_ncx_toc),
(find_existing_nav_toc, parse_nav, commit_nav_toc),
):
toc = get_x_toc(container, find_toc, parse_toc, verify_destinations=False)
if len(toc) > 0:
remove = []
for node in toc.iterdescendants():
if node.dest in names:
remove.append(node)
if remove:
for node in reversed(remove):
node.remove_from_parent()
commit_toc(container, toc)
changed.append(find_toc(container))
return changed
def find_inline_toc(container):
for name, linear in container.spine_names:
if container.parsed(name).xpath('//*[local-name()="body" and @id="calibre_generated_inline_toc"]'):
return name
def toc_to_html(toc, container, toc_name, title, lang=None):
def process_node(html_parent, toc, level=1, indent=' ', style_level=2):
li = html_parent.makeelement(XHTML('li'))
li.tail = '\n'+ (indent*level)
html_parent.append(li)
name, frag = toc.dest, toc.frag
href = '#'
if name:
href = container.name_to_href(name, toc_name)
if frag:
href += '#' + frag
a = li.makeelement(XHTML('a'), href=href)
a.text = toc.title
li.append(a)
if len(toc) > 0:
parent = li.makeelement(XHTML('ul'))
parent.set('class', 'level%d' % (style_level))
li.append(parent)
a.tail = '\n\n' + (indent*(level+2))
parent.text = '\n'+(indent*(level+3))
parent.tail = '\n\n' + (indent*(level+1))
for child in toc:
process_node(parent, child, level+3, style_level=style_level + 1)
parent[-1].tail = '\n' + (indent*(level+2))
E = ElementMaker(namespace=XHTML_NS, nsmap={None:XHTML_NS})
html = E.html(
E.head(
E.title(title),
E.style(P('templates/inline_toc_styles.css', data=True), type='text/css'),
),
E.body(
E.h2(title),
E.ul(),
id="calibre_generated_inline_toc",
)
)
ul = html[1][1]
ul.set('class', 'level1')
for child in toc:
process_node(ul, child)
if lang:
html.set('lang', lang)
pretty_html_tree(container, html)
return html
def create_inline_toc(container, title=None):
'''
Create an inline (HTML) Table of Contents from an existing NCX Table of Contents.
:param title: The title for this table of contents.
'''
lang = get_book_language(container)
default_title = 'Table of Contents'
if lang:
lang = lang_as_iso639_1(lang) or lang
default_title = translate(lang, default_title)
title = title or default_title
toc = get_toc(container)
if len(toc) == 0:
return None
toc_name = find_inline_toc(container)
name = toc_name
html = toc_to_html(toc, container, name, title, lang)
raw = serialize(html, 'text/html')
if name is None:
name, c = 'toc.xhtml', 0
while container.has_name(name):
c += 1
name = 'toc%d.xhtml' % c
container.add_file(name, raw, spine_index=0)
else:
with container.open(name, 'wb') as f:
f.write(raw)
set_guide_item(container, 'toc', title, name, frag='calibre_generated_inline_toc')
return name
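# Usage sketch (hypothetical): 'container' is assumed to be an open book
# container. Returns the name of the generated HTML TOC file, or None when the
# book has no TOC to inline.
def _example_inline_toc(container):
    return create_inline_toc(container, title='Contents')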

View File

@@ -0,0 +1,231 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re, os
from bisect import bisect
from calibre import guess_type as _guess_type, replace_entities
from polyglot.builtins import filter
def guess_type(x):
return _guess_type(x)[0] or 'application/octet-stream'
def setup_css_parser_serialization(tab_width=2):
import css_parser
prefs = css_parser.ser.prefs
prefs.indent = tab_width * ' '
prefs.indentClosingBrace = False
prefs.omitLastSemicolon = False
def actual_case_for_name(container, name):
from calibre.utils.filenames import samefile
if not container.exists(name):
raise ValueError('Cannot get actual case for %s as it does not exist' % name)
parts = name.split('/')
base = ''
ans = []
for i, x in enumerate(parts):
base = '/'.join(ans + [x])
path = container.name_to_abspath(base)
pdir = os.path.dirname(path)
candidates = {os.path.join(pdir, q) for q in os.listdir(pdir)}
if x in candidates:
correctx = x
else:
for q in candidates:
if samefile(q, path):
correctx = os.path.basename(q)
break
else:
raise RuntimeError('Something bad happened')
ans.append(correctx)
return '/'.join(ans)
def corrected_case_for_name(container, name):
parts = name.split('/')
ans = []
base = ''
for i, x in enumerate(parts):
base = '/'.join(ans + [x])
if container.exists(base):
correctx = x
else:
try:
candidates = {q for q in os.listdir(os.path.dirname(container.name_to_abspath(base)))}
except EnvironmentError:
return None # one of the non-terminal components of name is a file instead of a directory
for q in candidates:
if q.lower() == x.lower():
correctx = q
break
else:
return None
ans.append(correctx)
return '/'.join(ans)
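# For example (sketch): if the book contains 'Images/Cover.jpg' but a reference
# uses 'images/cover.jpg', corrected_case_for_name(container,
# 'images/cover.jpg') returns 'Images/Cover.jpg'; it returns None when no
# case-insensitive match exists.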
class PositionFinder(object):
def __init__(self, raw):
pat = br'\n' if isinstance(raw, bytes) else r'\n'
self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))
def __call__(self, pos):
lnum = bisect(self.new_lines, pos)
try:
offset = abs(pos - self.new_lines[lnum - 1])
except IndexError:
offset = pos
return (lnum + 1, offset)
class CommentFinder(object):
def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
self.starts, self.ends = [], []
for m in re.finditer(pat, raw):
start, end = m.span()
self.starts.append(start), self.ends.append(end)
def __call__(self, offset):
if not self.starts:
return False
q = bisect(self.starts, offset) - 1
return q >= 0 and self.starts[q] <= offset <= self.ends[q]
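# Usage sketch: map a character offset in raw CSS to a (line, column) pair and
# check whether it falls inside a /* ... */ comment. The inputs are
# illustrative only.
def _example_locate(raw_css, offset):
    line, col = PositionFinder(raw_css)(offset)
    inside_comment = CommentFinder(raw_css)(offset)
    return line, col, inside_comment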
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
from calibre.ebooks.oeb.base import XPath, XHTML
changed_names = set()
snames = set(sheets)
lp = XPath('//h:link[@href]')
hp = XPath('//h:head')
for name in names:
root = container.parsed(name)
if remove:
for link in lp(root):
if (link.get('type', mtype) or mtype) == mtype:
container.remove_from_xml(link)
changed_names.add(name)
container.dirty(name)
existing = {container.href_to_name(l.get('href'), name) for l in lp(root) if (l.get('type', mtype) or mtype) == mtype}
extra = snames - existing
if extra:
changed_names.add(name)
try:
parent = hp(root)[0]
except (TypeError, IndexError):
parent = root.makeelement(XHTML('head'))
container.insert_into_xml(root, parent, index=0)
for sheet in sheets:
if sheet in extra:
container.insert_into_xml(
parent, parent.makeelement(XHTML('link'), rel='stylesheet', type=mtype,
href=container.name_to_href(sheet, name)))
container.dirty(name)
return changed_names
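# Usage sketch (hypothetical sheet name): link 'styles/main.css' into the given
# XHTML files, removing any previously linked stylesheets first. Returns the
# set of files that were modified.
def _example_relink(container, names):
    return link_stylesheets(container, names, ['styles/main.css'], remove=True)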
def lead_text(top_elem, num_words=10):
''' Return the leading text contained in top_elem (including descendants)
up to a maximum of num_words words. More efficient than using
etree.tostring(method='text') as it does not have to serialize the entire
sub-tree rooted at top_elem.'''
pat = re.compile(r'\s+', flags=re.UNICODE)
words = []
def get_text(x, attr='text'):
ans = getattr(x, attr)
if ans:
words.extend(filter(None, pat.split(ans)))
stack = [(top_elem, 'text')]
while stack and len(words) < num_words:
elem, attr = stack.pop()
get_text(elem, attr)
if attr == 'text':
if elem is not top_elem:
stack.append((elem, 'tail'))
stack.extend(reversed(list((c, 'text') for c in elem.iterchildren('*'))))
return ' '.join(words[:num_words])
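# For instance (sketch), given <div><p>Hello <b>brave</b> new</p> world</div>,
# lead_text(div, num_words=3) returns 'Hello brave new' without serializing
# the whole subtree.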
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
if log_level is None:
import logging
log_level = logging.WARNING
from css_parser import CSSParser, log
from calibre.ebooks.oeb.base import _css_logger
log.setLevel(log_level)
log.raiseExceptions = False
data = data or ''
if isinstance(data, bytes):
data = data.decode('utf-8') if decode is None else decode(data)
if css_preprocessor is not None:
data = css_preprocessor(data)
parser = CSSParser(loglevel=log_level,
# We don't care about @import rules
fetcher=lambda x: (None, None), log=_css_logger)
if is_declaration:
data = parser.parseStyle(data, validate=False)
else:
data = parser.parseString(data, href=fname, validate=False)
return data
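# Usage sketch with illustrative inputs: parse a small stylesheet and a bare
# declaration block.
def _example_parse_css():
    sheet = parse_css('p { margin: 0 }', fname='example.css')
    decl = parse_css('color: red; font-weight: bold', is_declaration=True)
    return sheet, decl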
def handle_entities(text, func):
return func(replace_entities(text))
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
'''Apply the specified function to individual groups in the match object (the
result of re.search()), or to the whole match if no groups were defined.
Returns the replaced string.'''
found_groups = False
i = 0
parts, pos = [], match.start()
f = lambda text:handle_entities(text, func)
while True:
i += 1
try:
start, end = match.span(i)
except IndexError:
break
found_groups = True
if start > -1:
parts.append(match.string[pos:start])
parts.append(f(match.string[start:end]))
pos = end
if not found_groups:
return f(match.group())
parts.append(match.string[pos:match.end()])
return ''.join(parts)
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
''' Apply the specified function only to text between HTML tag definitions. '''
f = lambda text:handle_entities(text, func)
parts = re.split(r'(<[^>]+>)', match.group())
parts = (x if x.startswith('<') else f(x) for x in parts)
return ''.join(parts)
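# Usage sketch (illustrative pattern): upper-case only the text of each match,
# leaving the markup untouched, by passing the function above as a re.sub()
# replacement callback (it relies on the module's default func, icu_upper).
def _example_upcase_bold(html):
    return re.sub(r'<b>.*?</b>', apply_func_to_html_text, html)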
def extract(elem):
''' Remove an element from the tree, keeping elem.tail '''
p = elem.getparent()
if p is not None:
idx = p.index(elem)
p.remove(elem)
if elem.tail:
if idx > 0:
p[idx-1].tail = (p[idx-1].tail or '') + elem.tail
else:
p.text = (p.text or '') + elem.tail
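# For example (sketch): extracting the <b> element from
#   <p>one <b>two</b> three</p>
# yields <p>one  three</p>; the tail text ' three' is merged into p.text, or
# into the previous sibling's tail when one exists.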

View File

@@ -0,0 +1,720 @@
"""
Container-/OPF-based input OEBBook reader.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, uuid, copy, re, io
from collections import defaultdict
from lxml import etree
from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
DC_NSES, OPF, xml2text, XHTML_MIME
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
MS_COVER_TYPE, iterlinks
from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.localization import get_lang
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from calibre import guess_type, xml_replace_entities
from polyglot.builtins import unicode_type, zip
from polyglot.urllib import unquote, urldefrag, urlparse
__all__ = ['OEBReader']
class OEBReader(object):
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
Container = DirContainer
"""Container type used to access book files. Override in sub-classes."""
DEFAULT_PROFILE = 'PRS505'
"""Default renderer profile for content read with this Reader."""
TRANSFORMS = []
"""List of transforms to apply to content read with this Reader."""
@classmethod
def config(cls, cfg):
"""Add any book-reading options to the :class:`Config` object
:param:`cfg`.
"""
return
@classmethod
def generate(cls, opts):
"""Generate a Reader instance from command-line options."""
return cls()
def __call__(self, oeb, path):
"""Read the book at :param:`path` into the :class:`OEBBook` object
:param:`oeb`.
"""
self.oeb = oeb
self.logger = self.log = oeb.logger
oeb.container = self.Container(path, self.logger)
oeb.container.log = oeb.log
opf = self._read_opf()
self._all_from_opf(opf)
return oeb
def _clean_opf(self, opf):
nsmap = {}
for elem in opf.iter(tag=etree.Element):
nsmap.update(elem.nsmap)
for elem in opf.iter(tag=etree.Element):
if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
elem.tag = OPF(barename(elem.tag))
nsmap.update(OPF2_NSMAP)
attrib = dict(opf.attrib)
nroot = etree.Element(OPF('package'),
nsmap={None: OPF2_NS}, attrib=attrib)
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
for elem in xpath(opf, 'o2:metadata//*'):
if elem.tag in ignored:
continue
if namespace(elem.tag) in DC_NSES:
tag = barename(elem.tag).lower()
elem.tag = '{%s}%s' % (DC11_NS, tag)
if elem.tag.startswith('dc:'):
tag = elem.tag.partition(':')[-1].lower()
elem.tag = '{%s}%s' % (DC11_NS, tag)
metadata.append(elem)
for element in xpath(opf, 'o2:metadata//o2:meta'):
metadata.append(element)
for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
for element in xpath(opf, tag):
nroot.append(element)
return nroot
def _read_opf(self):
data = self.oeb.container.read(None)
data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data)
data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
OPF1_NS, data)
try:
opf = safe_xml_fromstring(data)
except etree.XMLSyntaxError:
data = xml_replace_entities(clean_xml_chars(data), encoding=None)
try:
opf = safe_xml_fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities')
except etree.XMLSyntaxError:
data = re.sub(r'(?is)<tours>.+</tours>', '', data)
data = data.replace('<dc-metadata>',
'<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
opf = safe_xml_fromstring(data)
self.logger.warn('OPF contains invalid tours section')
ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS):
raise OEBError('Invalid namespace %r for OPF document' % ns)
opf = self._clean_opf(opf)
return opf
def _metadata_from_opf(self, opf):
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
o = OPF(stream)
pwm = o.primary_writing_mode
if pwm:
self.oeb.metadata.primary_writing_mode = pwm
mi = o.to_book_metadata()
if not mi.language:
mi.language = get_lang().replace('_', '-')
self.oeb.metadata.add('language', mi.language)
if not mi.book_producer:
mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\
dict(a=__appname__, v=__version__)
meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
m = self.oeb.metadata
m.add('identifier', unicode_type(uuid.uuid4()), id='uuid_id', scheme='uuid')
self.oeb.uid = self.oeb.metadata.identifier[-1]
if not m.title:
m.add('title', self.oeb.translate(__('Unknown')))
has_aut = False
for x in m.creator:
if getattr(x, 'role', '').lower() in ('', 'aut'):
has_aut = True
break
if not has_aut:
m.add('creator', self.oeb.translate(__('Unknown')), role='aut')
def _manifest_prune_invalid(self):
'''
Remove items from the manifest that contain invalid data. This prevents
a catastrophic conversion failure when only a few files contain corrupted
data.
'''
bad = []
check = OEB_DOCS.union(OEB_STYLES)
for item in list(self.oeb.manifest.values()):
if item.media_type in check:
try:
item.data
except KeyboardInterrupt:
raise
except:
self.logger.exception('Failed to parse content in %s'%
item.href)
bad.append(item)
self.oeb.manifest.remove(item)
return bad
def _manifest_add_missing(self, invalid):
import css_parser
manifest = self.oeb.manifest
known = set(manifest.hrefs)
unchecked = set(manifest.values())
cdoc = OEB_DOCS|OEB_STYLES
invalid = set()
while unchecked:
new = set()
for item in unchecked:
data = None
if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')):
try:
data = item.data
except:
self.oeb.log.exception('Failed to read from manifest '
'entry with id: %s, ignoring'%item.id)
invalid.add(item)
continue
if data is None:
continue
if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')):
hrefs = [r[2] for r in iterlinks(data)]
for href in hrefs:
if isinstance(href, bytes):
href = href.decode('utf-8')
href, _ = urldefrag(href)
if not href:
continue
try:
href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme
except:
self.oeb.log.exception(
'Skipping invalid href: %r'%href)
continue
if not scheme and href not in known:
new.add(href)
elif item.media_type in OEB_STYLES:
try:
urls = list(css_parser.getUrls(data))
except:
urls = []
for url in urls:
href, _ = urldefrag(url)
href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme
if not scheme and href not in known:
new.add(href)
unchecked.clear()
warned = set()
for href in new:
known.add(href)
is_invalid = False
for item in invalid:
if href == item.abshref(urlnormalize(href)):
is_invalid = True
break
if is_invalid:
continue
if not self.oeb.container.exists(href):
if href not in warned:
self.logger.warn('Referenced file %r not found' % href)
warned.add(href)
continue
if href not in warned:
self.logger.warn('Referenced file %r not in manifest' % href)
warned.add(href)
id, _ = manifest.generate(id='added')
guessed = guess_type(href)[0]
media_type = guessed or BINARY_MIME
added = manifest.add(id, href, media_type)
unchecked.add(added)
for item in invalid:
self.oeb.manifest.remove(item)
def _manifest_from_opf(self, opf):
manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
id = elem.get('id')
href = elem.get('href')
media_type = elem.get('media-type', None)
if media_type is None:
media_type = elem.get('mediatype', None)
if not media_type or media_type == 'text/xml':
guessed = guess_type(href)[0]
media_type = guessed or media_type or BINARY_MIME
if hasattr(media_type, 'lower'):
media_type = media_type.lower()
fallback = elem.get('fallback')
if href in manifest.hrefs:
self.logger.warn('Duplicate manifest entry for %r' % href)
continue
if not self.oeb.container.exists(href):
self.logger.warn('Manifest item %r not found' % href)
continue
if id in manifest.ids:
self.logger.warn('Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback)
invalid = self._manifest_prune_invalid()
self._manifest_add_missing(invalid)
def _spine_add_extra(self):
manifest = self.oeb.manifest
spine = self.oeb.spine
unchecked = set(spine)
selector = XPath('h:body//h:a/@href')
extras = set()
while unchecked:
new = set()
for item in unchecked:
if item.media_type not in OEB_DOCS:
# TODO: handle fallback chains
continue
for href in selector(item.data):
href, _ = urldefrag(href)
if not href:
continue
try:
href = item.abshref(urlnormalize(href))
except ValueError: # Malformed URL
continue
if href not in manifest.hrefs:
continue
found = manifest.hrefs[href]
if found.media_type not in OEB_DOCS or \
found in spine or found in extras:
continue
new.add(found)
extras.update(new)
unchecked = new
version = int(self.oeb.version[0])
removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
for item in sorted(extras):
if item.href in removed_items_to_ignore:
continue
if version >= 2:
self.logger.warn(
'Spine-referenced file %r not in spine' % item.href)
spine.add(item, linear=False)
def _spine_from_opf(self, opf):
spine = self.oeb.spine
manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref')
if idref not in manifest.ids:
self.logger.warn('Spine item %r not found' % idref)
continue
item = manifest.ids[idref]
if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'):
spine.add(item, elem.get('linear'))
else:
if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'):
item.media_type = XHTML_MIME
spine.add(item, elem.get('linear'))
else:
self.oeb.log.warn('The item %s is not an XML document.'
' Removing it from spine.'%item.href)
if len(spine) == 0:
raise OEBError("Spine is empty")
self._spine_add_extra()
for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
if val in {'ltr', 'rtl'}:
spine.page_progression_direction = val
def _guide_from_opf(self, opf):
guide = self.oeb.guide
manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
ref_href = elem.get('href')
path = urlnormalize(urldefrag(ref_href)[0])
if path not in manifest.hrefs:
corrected_href = None
for href in manifest.hrefs:
if href.lower() == path.lower():
corrected_href = href
break
if corrected_href is None:
self.logger.warn('Guide reference %r not found' % ref_href)
continue
ref_href = corrected_href
typ = elem.get('type')
if typ not in guide:
guide.add(typ, elem.get('title'), ref_href)
def _find_ncx(self, opf):
result = xpath(opf, '/o2:package/o2:spine/@toc')
if result:
id = result[0]
if id not in self.oeb.manifest.ids:
return None
item = self.oeb.manifest.ids[id]
self.oeb.manifest.remove(item)
return item
for item in self.oeb.manifest.values():
if item.media_type == NCX_MIME:
self.oeb.manifest.remove(item)
return item
return None
def _toc_from_navpoint(self, item, toc, navpoint):
children = xpath(navpoint, 'ncx:navPoint')
for child in children:
title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip())
href = xpath(child, 'ncx:content/@src')
if not title:
self._toc_from_navpoint(item, toc, child)
continue
if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
# This node is useless
continue
href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
path, _ = urldefrag(href)
if path and path not in self.oeb.manifest.hrefs:
path = urlnormalize(path)
if href and path not in self.oeb.manifest.hrefs:
self.logger.warn('TOC reference %r not found' % href)
gc = xpath(child, 'ncx:navPoint')
if not gc:
# This node is useless
continue
id = child.get('id')
klass = child.get('class', 'chapter')
try:
po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
except:
po = self.oeb.toc.next_play_order()
authorElement = xpath(child,
'descendant::calibre:meta[@name = "author"]')
if authorElement:
author = authorElement[0].text
else:
author = None
descriptionElement = xpath(child,
'descendant::calibre:meta[@name = "description"]')
if descriptionElement:
description = etree.tostring(descriptionElement[0],
method='text', encoding='unicode').strip()
if not description:
description = None
else:
description = None
index_image = xpath(child,
'descendant::calibre:meta[@name = "toc_thumbnail"]')
toc_thumbnail = (index_image[0].text if index_image else None)
if not toc_thumbnail or not toc_thumbnail.strip():
toc_thumbnail = None
node = toc.add(title, href, id=id, klass=klass,
play_order=po, description=description, author=author,
toc_thumbnail=toc_thumbnail)
self._toc_from_navpoint(item, node, child)
def _toc_from_ncx(self, item):
if (item is None) or (item.data is None):
return False
self.log.debug('Reading TOC from NCX...')
ncx = item.data
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip())
title = title or unicode_type(self.oeb.metadata.title[0])
toc = self.oeb.toc
toc.title = title
navmaps = xpath(ncx, 'ncx:navMap')
for navmap in navmaps:
self._toc_from_navpoint(item, toc, navmap)
return True
def _toc_from_tour(self, opf):
result = xpath(opf, 'o2:tours/o2:tour')
if not result:
return False
self.log.debug('Reading TOC from tour...')
tour = result[0]
toc = self.oeb.toc
toc.title = tour.get('title')
sites = xpath(tour, 'o2:site')
for site in sites:
title = site.get('title')
href = site.get('href')
if not title or not href:
continue
path, _ = urldefrag(urlnormalize(href))
if path not in self.oeb.manifest.hrefs:
self.logger.warn('TOC reference %r not found' % href)
continue
id = site.get('id')
toc.add(title, href, id=id)
return True
def _toc_from_html(self, opf):
if 'toc' not in self.oeb.guide:
return False
self.log.debug('Reading TOC from HTML...')
itempath, frag = urldefrag(self.oeb.guide['toc'].href)
item = self.oeb.manifest.hrefs[itempath]
html = item.data
if frag:
elems = xpath(html, './/*[@id="%s"]' % frag)
if not elems:
elems = xpath(html, './/*[@name="%s"]' % frag)
elem = elems[0] if elems else html
while elem != html and not xpath(elem, './/h:a[@href]'):
elem = elem.getparent()
html = elem
titles = defaultdict(list)
order = []
for anchor in xpath(html, './/h:a[@href]'):
href = anchor.attrib['href']
href = item.abshref(urlnormalize(href))
path, frag = urldefrag(href)
if path not in self.oeb.manifest.hrefs:
continue
title = xml2text(anchor)
title = COLLAPSE_RE.sub(' ', title.strip())
if href not in titles:
order.append(href)
titles[href].append(title)
toc = self.oeb.toc
for href in order:
toc.add(' '.join(titles[href]), href)
return True
def _toc_from_spine(self, opf):
self.log.warn('Generating default TOC from spine...')
toc = self.oeb.toc
titles = []
headers = []
for item in self.oeb.spine:
if not item.linear:
continue
html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
title = COLLAPSE_RE.sub(' ', title.strip())
if title:
titles.append(title)
headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()'
header = ''.join(xpath(html, expr % tag))
header = COLLAPSE_RE.sub(' ', header.strip())
if header:
headers[-1] = header
break
use = titles
if len(titles) > len(set(titles)):
use = headers
for title, item in zip(use, self.oeb.spine):
if not item.linear:
continue
toc.add(title, item.href)
return True
def _toc_from_opf(self, opf, item):
self.oeb.auto_generated_toc = False
if self._toc_from_ncx(item):
return
# Prefer HTML to tour based TOC, since several LIT files
# have good HTML TOCs but bad tour based TOCs
if self._toc_from_html(opf):
return
if self._toc_from_tour(opf):
return
self._toc_from_spine(opf)
self.oeb.auto_generated_toc = True
def _pages_from_ncx(self, opf, item):
if item is None:
return False
ncx = item.data
if ncx is None:
return False
ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
if not ptargets:
return False
pages = self.oeb.pages
for ptarget in ptargets:
name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
name = COLLAPSE_RE.sub(' ', name.strip())
href = xpath(ptarget, 'ncx:content/@src')
if not href:
continue
href = item.abshref(urlnormalize(href[0]))
id = ptarget.get('id')
type = ptarget.get('type', 'normal')
klass = ptarget.get('class')
pages.add(name, href, type=type, id=id, klass=klass)
return True
def _find_page_map(self, opf):
result = xpath(opf, '/o2:package/o2:spine/@page-map')
if result:
id = result[0]
if id not in self.oeb.manifest.ids:
return None
item = self.oeb.manifest.ids[id]
self.oeb.manifest.remove(item)
return item
for item in self.oeb.manifest.values():
if item.media_type == PAGE_MAP_MIME:
self.oeb.manifest.remove(item)
return item
return None
def _pages_from_page_map(self, opf):
item = self._find_page_map(opf)
if item is None:
return False
pmap = item.data
pages = self.oeb.pages
for page in xpath(pmap, 'o2:page'):
name = page.get('name', '')
href = page.get('href')
if not href:
continue
name = COLLAPSE_RE.sub(' ', name.strip())
href = item.abshref(urlnormalize(href))
type = 'normal'
if not name:
type = 'special'
elif name.lower().strip('ivxlcdm') == '':
type = 'front'
pages.add(name, href, type=type)
return True
def _pages_from_opf(self, opf, item):
if self._pages_from_ncx(opf, item):
return
if self._pages_from_page_map(opf):
return
return
def _cover_from_html(self, hcover):
from calibre.ebooks import render_html_svg_workaround
with TemporaryDirectory('_html_cover') as tdir:
writer = OEBWriter()
writer(self.oeb, tdir)
path = os.path.join(tdir, unquote(hcover.href))
data = render_html_svg_workaround(path, self.logger)
if not data:
data = b''
id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
return item
def _locate_cover_image(self):
if self.oeb.metadata.cover:
id = unicode_type(self.oeb.metadata.cover[0])
item = self.oeb.manifest.ids.get(id, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
else:
self.logger.warn('Invalid cover image @id %r' % id)
hcover = self.oeb.spine[0]
if 'cover' in self.oeb.guide:
href = self.oeb.guide['cover'].href
item = self.oeb.manifest.hrefs[href]
media_type = item.media_type
if media_type in OEB_IMAGES:
return item
elif media_type in OEB_DOCS:
hcover = item
html = hcover.data
if MS_COVER_TYPE in self.oeb.guide:
href = self.oeb.guide[MS_COVER_TYPE].href
item = self.oeb.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
if self.COVER_SVG_XP(html):
svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
href = os.path.splitext(hcover.href)[0] + '.svg'
id, href = self.oeb.manifest.generate(hcover.id, href)
item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
return item
if self.COVER_OBJECT_XP(html):
object = self.COVER_OBJECT_XP(html)[0]
href = hcover.abshref(object.get('data'))
item = self.oeb.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES:
return item
return self._cover_from_html(hcover)
def _ensure_cover_image(self):
cover = self._locate_cover_image()
if self.oeb.metadata.cover:
self.oeb.metadata.cover[0].value = cover.id
return
self.oeb.metadata.add('cover', cover.id)
def _manifest_remove_duplicates(self):
seen = set()
dups = set()
for item in self.oeb.manifest:
if item.href in seen:
dups.add(item.href)
seen.add(item.href)
for href in dups:
items = [x for x in self.oeb.manifest if x.href == href]
for x in items:
if x not in self.oeb.spine:
self.oeb.log.warn('Removing duplicate manifest item with id:', x.id)
self.oeb.manifest.remove_duplicate_item(x)
def _all_from_opf(self, opf):
self.oeb.version = opf.get('version', '1.2')
self._metadata_from_opf(opf)
self._manifest_from_opf(opf)
self._spine_from_opf(opf)
self._manifest_remove_duplicates()
self._guide_from_opf(opf)
item = self._find_ncx(opf)
self._toc_from_opf(opf, item)
self._pages_from_opf(opf, item)
# self._ensure_cover_image()
def main(argv=sys.argv):
reader = OEBReader()
for arg in argv[1:]:
oeb = reader(OEBBook(), arg)
for name, doc in oeb.to_opf1().values():
print(etree.tostring(doc, pretty_print=True))
for name, doc in oeb.to_opf2(page_map=True).values():
print(etree.tostring(doc, pretty_print=True))
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -0,0 +1,808 @@
# -*- encoding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
CSS property propagation class.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os, re, logging, copy, unicodedata, numbers
from operator import itemgetter
from weakref import WeakKeyDictionary
from xml.dom import SyntaxErr as CSSSyntaxError
from css_parser.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
cssproperties)
from css_parser import (profile as cssprofiles, parseString, parseStyle, log as
css_parser_log, CSSParser, profiles, replaceUrls)
from calibre import force_unicode, as_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
from polyglot.builtins import iteritems, unicode_type, filter
from tinycss.media3 import CSSMedia3Parser
css_parser_log.setLevel(logging.WARN)
_html_css_stylesheet = None
def html_css_stylesheet():
global _html_css_stylesheet
if _html_css_stylesheet is None:
with open(P('templates/html.css'), 'rb') as f:
html_css = f.read().decode('utf-8')
_html_css_stylesheet = parseString(html_css, validate=False)
return _html_css_stylesheet
INHERITED = {
'azimuth', 'border-collapse', 'border-spacing', 'caption-side', 'color',
'cursor', 'direction', 'elevation', 'empty-cells', 'font-family',
'font-size', 'font-style', 'font-variant', 'font-weight', 'letter-spacing',
'line-height', 'list-style-image', 'list-style-position',
'list-style-type', 'orphans', 'page-break-inside', 'pitch-range', 'pitch',
'quotes', 'richness', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speak', 'speech-rate', 'stress', 'text-align', 'text-indent',
'text-transform', 'visibility', 'voice-family', 'volume', 'white-space',
'widows', 'word-spacing', 'text-shadow',
}
FONT_SIZE_NAMES = {
'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'
}
ALLOWED_MEDIA_TYPES = frozenset({'screen', 'all', 'aural', 'amzn-kf8'})
IGNORED_MEDIA_FEATURES = frozenset('width min-width max-width height min-height max-height device-width min-device-width max-device-width device-height min-device-height max-device-height aspect-ratio min-aspect-ratio max-aspect-ratio device-aspect-ratio min-device-aspect-ratio max-device-aspect-ratio color min-color max-color color-index min-color-index max-color-index monochrome min-monochrome max-monochrome -webkit-min-device-pixel-ratio resolution min-resolution max-resolution scan grid'.split()) # noqa
def media_ok(raw):
if not raw:
return True
if raw == 'amzn-mobi': # Optimization for the common case
return False
def query_ok(mq):
matched = True
if mq.media_type not in ALLOWED_MEDIA_TYPES:
matched = False
# Media queries that test for device specific features always fail
for media_feature, expr in mq.expressions:
if media_feature in IGNORED_MEDIA_FEATURES:
matched = False
return mq.negated ^ matched
try:
for mq in CSSMedia3Parser().parse_stylesheet('@media %s {}' % raw).rules[0].media:
if query_ok(mq):
return True
return False
except Exception:
pass
return True
def test_media_ok():
assert media_ok(None)
assert media_ok('')
assert not media_ok('amzn-mobi')
assert media_ok('amzn-kf8')
assert media_ok('screen')
assert media_ok('only screen')
assert not media_ok('not screen')
assert not media_ok('(device-width:10px)')
assert media_ok('screen, (device-width:10px)')
assert not media_ok('screen and (device-width:10px)')
class StylizerRules(object):
def __init__(self, opts, profile, stylesheets):
self.opts, self.profile, self.stylesheets = opts, profile, stylesheets
index = 0
self.rules = []
self.page_rule = {}
self.font_face_rules = []
for sheet_index, stylesheet in enumerate(stylesheets):
href = stylesheet.href
for rule in stylesheet.cssRules:
if rule.type == rule.MEDIA_RULE:
if media_ok(rule.media.mediaText):
for subrule in rule.cssRules:
self.rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
index += 1
else:
self.rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
index = index + 1
self.rules.sort(key=itemgetter(0)) # sort by specificity
def flatten_rule(self, rule, href, index, is_user_agent_sheet=False):
results = []
sheet_index = 0 if is_user_agent_sheet else 1
if isinstance(rule, CSSStyleRule):
style = self.flatten_style(rule.style)
for selector in rule.selectorList:
specificity = (sheet_index,) + selector.specificity + (index,)
text = selector.selectorText
selector = list(selector.seq)
results.append((specificity, selector, style, text, href))
elif isinstance(rule, CSSPageRule):
style = self.flatten_style(rule.style)
self.page_rule.update(style)
elif isinstance(rule, CSSFontFaceRule):
if rule.style.length > 1:
# Ignore the meaningless font face rules generated by the
# benighted MS Word that contain only a font-family declaration
# and nothing else
self.font_face_rules.append(rule)
return results
def flatten_style(self, cssstyle):
style = {}
for prop in cssstyle:
name = prop.name
normalizer = normalizers.get(name, None)
if normalizer is not None:
style.update(normalizer(name, prop.cssValue))
elif name == 'text-align':
style['text-align'] = self._apply_text_align(prop.value)
else:
style[name] = prop.value
if 'font-size' in style:
size = style['font-size']
if size == 'normal':
size = 'medium'
if size == 'smallest':
size = 'xx-small'
if size in FONT_SIZE_NAMES:
style['font-size'] = "%.1frem" % (self.profile.fnames[size] / float(self.profile.fbase))
if '-epub-writing-mode' in style:
for x in ('-webkit-writing-mode', 'writing-mode'):
style[x] = style.get(x, style['-epub-writing-mode'])
return style
def _apply_text_align(self, text):
if text in ('left', 'justify') and self.opts.change_justification in ('left', 'justify'):
text = self.opts.change_justification
return text
def same_rules(self, opts, profile, stylesheets):
if self.opts != opts:
# it's unlikely to happen, but better safe than sorry
return False
if self.profile != profile:
return False
if len(self.stylesheets) != len(stylesheets):
return False
for index, stylesheet in enumerate(self.stylesheets):
if stylesheet != stylesheets[index]:
return False
return True
class Stylizer(object):
STYLESHEETS = WeakKeyDictionary()
def __init__(self, tree, path, oeb, opts, profile=None,
extra_css='', user_css='', base_css=''):
self.oeb, self.opts = oeb, opts
self.profile = profile
if self.profile is None:
# Use the default profile. This should really be using
# opts.output_profile, but I don't want to risk changing it, as
# doing so might well have hard to debug font size effects.
from calibre.customize.ui import output_profiles
for x in output_profiles():
if x.short_name == 'default':
self.profile = x
break
if self.profile is None:
# Just in case the default profile is removed in the future :)
self.profile = opts.output_profile
self.body_font_size = self.profile.fbase
self.logger = oeb.logger
item = oeb.manifest.hrefs[path]
basename = os.path.basename(path)
cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [html_css_stylesheet()]
if base_css:
stylesheets.append(parseString(base_css, validate=False))
style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')
# Add css_parser parsing profiles from output_profile
for profile in self.opts.output_profile.extra_css_modules:
cssprofiles.addProfile(profile['name'],
profile['props'],
profile['macros'])
parser = CSSParser(fetcher=self._fetch_css_file,
log=logging.getLogger('calibre.css'))
for elem in style_tags:
if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
text = elem.text if elem.text else ''
for x in elem:
t = getattr(x, 'text', None)
if t:
text += '\n\n' + force_unicode(t, 'utf-8')
t = getattr(x, 'tail', None)
if t:
text += '\n\n' + force_unicode(t, 'utf-8')
if text:
text = oeb.css_preprocessor(text)
# We handle @import rules separately
parser.setFetcher(lambda x: ('utf-8', b''))
stylesheet = parser.parseString(text, href=cssname,
validate=False)
parser.setFetcher(self._fetch_css_file)
for rule in stylesheet.cssRules:
if rule.type == rule.IMPORT_RULE:
ihref = item.abshref(rule.href)
if not media_ok(rule.media.mediaText):
continue
hrefs = self.oeb.manifest.hrefs
if ihref not in hrefs:
self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
continue
sitem = hrefs[ihref]
if sitem.media_type not in OEB_STYLES:
self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
continue
stylesheets.append(sitem.data)
# Make links to resources absolute, since these rules will
# be folded into a stylesheet at the root
replaceUrls(stylesheet, item.abshref,
ignoreImportRules=True)
stylesheets.append(stylesheet)
elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
):
href = urlnormalize(elem.attrib['href'])
path = item.abshref(href)
sitem = oeb.manifest.hrefs.get(path, None)
if sitem is None:
self.logger.warn(
'Stylesheet %r referenced by file %r not in manifest' %
(path, item.href))
continue
if not hasattr(sitem.data, 'cssRules'):
self.logger.warn(
'Stylesheet %r referenced by file %r is not CSS'%(path,
item.href))
continue
stylesheets.append(sitem.data)
csses = {'extra_css':extra_css, 'user_css':user_css}
for w, x in csses.items():
if x:
try:
text = x
stylesheet = parser.parseString(text, href=cssname,
validate=False)
stylesheets.append(stylesheet)
except Exception:
self.logger.exception('Failed to parse %s, ignoring.'%w)
self.logger.debug('Bad css: ')
self.logger.debug(x)
# Cache the rules, page rule and font face rules on the oeb object,
# regenerating them only if opts, profile or stylesheets differ
if (not hasattr(self.oeb, 'stylizer_rules')) \
or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile, stylesheets)
self.rules = self.oeb.stylizer_rules.rules
self.page_rule = self.oeb.stylizer_rules.page_rule
self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
self.flatten_style = self.oeb.stylizer_rules.flatten_style
self._styles = {}
pseudo_pat = re.compile(':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
select = Select(tree, ignore_inappropriate_pseudo_classes=True)
for _, _, cssdict, text, _ in self.rules:
fl = pseudo_pat.search(text)
try:
matches = tuple(select(text))
except SelectorError as err:
self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
continue
if fl is not None:
fl = fl.group(1)
if fl == 'first-letter' and getattr(self.oeb,
'plumber_output_format', '').lower() in {'mobi', 'docx'}:
# Fake first-letter
for elem in matches:
for x in elem.iter('*'):
if x.text:
punctuation_chars = []
text = unicode_type(x.text)
while text:
category = unicodedata.category(text[0])
if category[0] not in {'P', 'Z'}:
break
punctuation_chars.append(text[0])
text = text[1:]
special_text = ''.join(punctuation_chars) + \
(text[0] if text else '')
span = x.makeelement('{%s}span' % XHTML_NS)
span.text = special_text
span.set('data-fake-first-letter', '1')
span.tail = text[1:]
x.text = None
x.insert(0, span)
self.style(span)._update_cssdict(cssdict)
break
else: # Element pseudo-class
for elem in matches:
self.style(elem)._update_pseudo_class(fl, cssdict)
else:
for elem in matches:
self.style(elem)._update_cssdict(cssdict)
for elem in xpath(tree, '//h:*[@style]'):
self.style(elem)._apply_style_attr(url_replacer=item.abshref)
num_pat = re.compile(r'[0-9.]+$')
for elem in xpath(tree, '//h:img[@width or @height]'):
style = self.style(elem)
# Check if either height or width is not default
is_styled = style._style.get('width', 'auto') != 'auto' or \
style._style.get('height', 'auto') != 'auto'
if not is_styled:
# Update img style dimension using width and height
upd = {}
for prop in ('width', 'height'):
val = elem.get(prop, '').strip()
try:
del elem.attrib[prop]
except:
pass
if val:
if num_pat.match(val) is not None:
val += 'px'
upd[prop] = val
if upd:
style._update_cssdict(upd)
def _fetch_css_file(self, path):
hrefs = self.oeb.manifest.hrefs
if path not in hrefs:
self.logger.warn('CSS import of missing file %r' % path)
return (None, None)
item = hrefs[path]
if item.media_type not in OEB_STYLES:
self.logger.warn('CSS import of non-CSS file %r' % path)
return (None, None)
data = item.data.cssText
if not isinstance(data, bytes):
data = data.encode('utf-8')
return ('utf-8', data)
def style(self, element):
try:
return self._styles[element]
except KeyError:
return Style(element, self)
def stylesheet(self, name, font_scale=None):
rules = []
for _, _, style, selector, href in self.rules:
if href != name:
continue
if font_scale and 'font-size' in style and \
style['font-size'].endswith('pt'):
style = copy.copy(style)
size = float(style['font-size'][:-2])
style['font-size'] = "%.2fpt" % (size * font_scale)
style = ';\n '.join(': '.join(item) for item in style.items())
rules.append('%s {\n %s;\n}' % (selector, style))
return '\n'.join(rules)
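# Usage sketch (comments only, since tree/oeb/opts come from a full conversion
# pipeline and are assumed here):
#   stylizer = Stylizer(tree, item.href, oeb, opts)
#   style = stylizer.style(some_element)
#   size_in_pts = style['font-size']  # computed value, converted to pts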
class Style(object):
MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
def __init__(self, element, stylizer):
self._element = element
self._profile = stylizer.profile
self._stylizer = stylizer
self._style = {}
self._fontSize = None
self._width = None
self._height = None
self._lineHeight = None
self._bgcolor = None
self._pseudo_classes = {}
stylizer._styles[element] = self
def set(self, prop, val):
self._style[prop] = val
def drop(self, prop, default=None):
return self._style.pop(prop, default)
def _update_cssdict(self, cssdict):
self._style.update(cssdict)
def _update_pseudo_class(self, name, cssdict):
orig = self._pseudo_classes.get(name, {})
orig.update(cssdict)
self._pseudo_classes[name] = orig
def _apply_style_attr(self, url_replacer=None):
attrib = self._element.attrib
if 'style' not in attrib:
return
css = attrib['style'].split(';')
css = filter(None, (x.strip() for x in css))
css = [y.strip() for y in css]
css = [y for y in css if self.MS_PAT.match(y) is None]
css = '; '.join(css)
try:
style = parseStyle(css, validate=False)
except CSSSyntaxError:
return
if url_replacer is not None:
replaceUrls(style, url_replacer, ignoreImportRules=True)
self._style.update(self._stylizer.flatten_style(style))
def _has_parent(self):
try:
return self._element.getparent() is not None
except AttributeError:
return False # self._element is None
def _get_parent(self):
elem = self._element.getparent()
if elem is None:
return None
return self._stylizer.style(elem)
def __getitem__(self, name):
domname = cssproperties._toDOMname(name)
if hasattr(self, domname):
return getattr(self, domname)
return self._unit_convert(self._get(name))
def _get(self, name):
result = None
if name in self._style:
result = self._style[name]
if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())):
stylizer = self._stylizer
result = stylizer.style(self._element.getparent())._get(name)
if result is None:
result = DEFAULTS[name]
return result
def get(self, name, default=None):
return self._style.get(name, default)
def _unit_convert(self, value, base=None, font=None):
'Return value in pts'
if base is None:
base = self.width
if not font and font != 0:
font = self.fontSize
return unit_convert(value, base, font, self._profile.dpi, body_font_size=self._stylizer.body_font_size)
def pt_to_px(self, value):
return (self._profile.dpi / 72) * value
@property
def backgroundColor(self):
'''
Return the background color by parsing both the background-color and
background shortcut properties. Note that inheritance/default values
are not used. None is returned if no background color is set.
'''
def validate_color(col):
return cssprofiles.validateWithProfile('color',
col,
profiles=[profiles.Profiles.CSS_LEVEL_2])[1]
if self._bgcolor is None:
col = None
val = self._style.get('background-color', None)
if val and validate_color(val):
col = val
else:
val = self._style.get('background', None)
if val is not None:
try:
style = parseStyle('background: '+val, validate=False)
val = style.getProperty('background').cssValue
try:
val = list(val)
except:
# val is CSSPrimitiveValue
val = [val]
for c in val:
c = c.cssText
if isinstance(c, bytes):
c = c.decode('utf-8', 'replace')
if validate_color(c):
col = c
break
except:
pass
if col is None:
self._bgcolor = False
else:
self._bgcolor = col
return self._bgcolor if self._bgcolor else None
@property
def fontSize(self):
def normalize_fontsize(value, base):
value = value.replace('"', '').replace("'", '')
result = None
factor = None
if value == 'inherit':
value = base
if value in FONT_SIZE_NAMES:
result = self._profile.fnames[value]
elif value == 'smaller':
factor = 1.0/1.2
for _, _, size in self._profile.fsizes:
if base <= size:
break
factor = None
result = size
elif value == 'larger':
factor = 1.2
for _, _, size in reversed(self._profile.fsizes):
if base >= size:
break
factor = None
result = size
else:
result = self._unit_convert(value, base=base, font=base)
if not isinstance(result, numbers.Number):
return base
if result < 0:
result = normalize_fontsize("smaller", base)
if factor:
result = factor * base
return result
if self._fontSize is None:
result = None
parent = self._get_parent()
if parent is not None:
base = parent.fontSize
else:
base = self._profile.fbase
if 'font-size' in self._style:
size = self._style['font-size']
result = normalize_fontsize(size, base)
else:
result = base
self._fontSize = result
return self._fontSize
def img_dimension(self, attr, img_size):
ans = None
parent = self._get_parent()
if parent is not None:
base = getattr(parent, attr)
else:
base = getattr(self._profile, attr + '_pts')
x = self._style.get(attr)
if x is not None:
if x == 'auto':
ans = self._unit_convert(unicode_type(img_size) + 'px', base=base)
else:
x = self._unit_convert(x, base=base)
if isinstance(x, numbers.Number):
ans = x
if ans is None:
x = self._element.get(attr)
if x is not None:
x = self._unit_convert(x + 'px', base=base)
if isinstance(x, numbers.Number):
ans = x
if ans is None:
ans = self._unit_convert(unicode_type(img_size) + 'px', base=base)
maa = self._style.get('max-' + attr)
if maa is not None:
x = self._unit_convert(maa, base=base)
if isinstance(x, numbers.Number) and (ans is None or x < ans):
ans = x
return ans
def img_size(self, width, height):
' Return the final size of an <img> given that it points to an image of size widthxheight '
w, h = self._get('width'), self._get('height')
answ, ansh = self.img_dimension('width', width), self.img_dimension('height', height)
if w == 'auto' and h != 'auto':
answ = (float(width)/height) * ansh
elif h == 'auto' and w != 'auto':
ansh = (float(height)/width) * answ
return answ, ansh
@property
def width(self):
if self._width is None:
width = None
base = None
parent = self._get_parent()
if parent is not None:
base = parent.width
else:
base = self._profile.width_pts
if 'width' in self._element.attrib:
width = self._element.attrib['width']
elif 'width' in self._style:
width = self._style['width']
if not width or width == 'auto':
result = base
else:
result = self._unit_convert(width, base=base)
if isinstance(result, (unicode_type, bytes)):
result = self._profile.width
self._width = result
if 'max-width' in self._style:
result = self._unit_convert(self._style['max-width'], base=base)
if isinstance(result, (unicode_type, bytes)):
result = self._width
if result < self._width:
self._width = result
return self._width
@property
def parent_width(self):
parent = self._get_parent()
if parent is None:
return self.width
return parent.width
@property
def height(self):
if self._height is None:
height = None
base = None
parent = self._get_parent()
if parent is not None:
base = parent.height
else:
base = self._profile.height_pts
if 'height' in self._element.attrib:
height = self._element.attrib['height']
elif 'height' in self._style:
height = self._style['height']
if not height or height == 'auto':
result = base
else:
result = self._unit_convert(height, base=base)
if isinstance(result, (unicode_type, bytes)):
result = self._profile.height
self._height = result
if 'max-height' in self._style:
result = self._unit_convert(self._style['max-height'], base=base)
if isinstance(result, (unicode_type, bytes)):
result = self._height
if result < self._height:
self._height = result
return self._height
@property
def lineHeight(self):
if self._lineHeight is None:
result = None
parent = self._get_parent()
if 'line-height' in self._style:
lineh = self._style['line-height']
if lineh == 'normal':
lineh = '1.2'
try:
result = float(lineh) * self.fontSize
except ValueError:
result = self._unit_convert(lineh, base=self.fontSize)
elif parent is not None:
# TODO: proper inheritance
result = parent.lineHeight
else:
result = 1.2 * self.fontSize
self._lineHeight = result
return self._lineHeight
@property
def effective_text_decoration(self):
'''
Browsers do this creepy thing with text-decoration where even though the
property is not inherited, it looks like it is because containing
blocks apply it. The actual algorithm is utterly ridiculous, see
http://reference.sitepoint.com/css/text-decoration
This matters for MOBI output, where text-decoration is mapped to <u>
and <st> tags. Trying to implement the actual algorithm is too much
work, so we just use a simple fake that should cover most cases.
'''
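# Illustrative example: in <div style="text-decoration: underline"><span>x</span></div>
# the span declares no text-decoration of its own, but this property reports
# 'underline', mimicking the decoration drawn by the containing block.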
css = self._style.get('text-decoration', None)
pcss = None
parent = self._get_parent()
if parent is not None:
pcss = parent._style.get('text-decoration', None)
if css in ('none', None, 'inherit') and pcss not in (None, 'none'):
return pcss
return css
@property
def first_vertical_align(self):
''' For docx output where tags are not nested, we cannot directly
simulate the HTML vertical-align rendering model. Instead use the
approximation of considering the first non-default vertical-align '''
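# e.g. a <span> with the default 'baseline' value nested inside an inline
# parent that sets 'vertical-align: super' reports 'super', the first
# non-default value found while walking up the inline ancestors.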
val = self['vertical-align']
if val != 'baseline':
raw_val = self._get('vertical-align')
if '%' in raw_val:
val = self._unit_convert(raw_val, base=self['line-height'])
return val
parent = self._get_parent()
if parent is not None and 'inline' in parent['display']:
return parent.first_vertical_align
@property
def marginTop(self):
return self._unit_convert(
self._get('margin-top'), base=self.parent_width)
@property
def marginBottom(self):
return self._unit_convert(
self._get('margin-bottom'), base=self.parent_width)
@property
def marginLeft(self):
return self._unit_convert(
self._get('margin-left'), base=self.parent_width)
@property
def marginRight(self):
return self._unit_convert(
self._get('margin-right'), base=self.parent_width)
@property
def paddingTop(self):
return self._unit_convert(
self._get('padding-top'), base=self.parent_width)
@property
def paddingBottom(self):
return self._unit_convert(
self._get('padding-bottom'), base=self.parent_width)
@property
def paddingLeft(self):
return self._unit_convert(
self._get('padding-left'), base=self.parent_width)
@property
def paddingRight(self):
return self._unit_convert(
self._get('padding-right'), base=self.parent_width)
def __str__(self):
items = sorted(iteritems(self._style))
return '; '.join("%s: %s" % (key, val) for key, val in items)
def cssdict(self):
return dict(self._style)
def pseudo_classes(self, filter_css):
if filter_css:
css = copy.deepcopy(self._pseudo_classes)
for psel, cssdict in iteritems(css):
for k in filter_css:
cssdict.pop(k, None)
else:
css = self._pseudo_classes
return {k:v for k, v in iteritems(css) if v}
@property
def is_hidden(self):
return self._style.get('display') == 'none' or self._style.get('visibility') == 'hidden'


@@ -0,0 +1,7 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


@@ -0,0 +1,52 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.oeb.base import XPath, urlunquote
from polyglot.builtins import as_bytes
class DataURL(object):
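# Rewrites <img src="data:image/..."> references: the embedded image bytes are
# decoded, added to the manifest as a regular image file, and the src attribute
# is repointed at the new item, so later stages only ever see ordinary images.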
def __call__(self, oeb, opts):
from calibre.utils.imghdr import what
self.log = oeb.log
attr_path = XPath('//h:img[@src]')
for item in oeb.spine:
root = item.data
if not hasattr(root, 'xpath'):
continue
for img in attr_path(root):
raw = img.get('src', '')
if not raw.startswith('data:'):
continue
header, data = raw.partition(',')[0::2]
if not header.startswith('data:image/') or not data:
continue
if ';base64' in header:
data = re.sub(r'\s+', '', data)
from polyglot.binary import from_base64_bytes
try:
data = from_base64_bytes(data)
except Exception:
self.log.error('Found invalid base64 encoded data URI, ignoring it')
continue
else:
data = urlunquote(data)
data = as_bytes(data)
fmt = what(None, data)
if not fmt:
self.log.warn('Image encoded as data URL has unknown format, ignoring')
continue
img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
def convert_image_data_uri(self, data, fmt, oeb):
self.log('Found image encoded as data URI, converting it to a normal image')
from calibre import guess_type
item_id, item_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt)
oeb.manifest.add(item_id, item_href, guess_type(item_href)[0], data=data)
return item_href


@@ -0,0 +1,684 @@
'''
CSS flattening transform.
'''
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import re, operator, math, numbers
from collections import defaultdict
from xml.dom import SyntaxErr
from lxml import etree
import css_parser
from css_parser.css import Property
from calibre import guess_type
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES,
namespace, barename, XPath, css_text)
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.filenames import ascii_filename, ascii_text
from calibre.utils.icu import numeric_sort_key
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, map
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
STRIPNUM = re.compile(r'[-0-9]+$')
def asfloat(value, default):
if not isinstance(value, numbers.Number):
value = default
return float(value)
class KeyMapper(object):
def __init__(self, sbase, dbase, dkey):
self.sbase = float(sbase)
self.dprop = [(self.relate(x, dbase), float(x)) for x in dkey]
self.cache = {}
@staticmethod
def relate(size, base):
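# Place 'size' on a signed, roughly logarithmic scale centred on 'base':
# 0 means equal to the base, negative values are smaller, positive values
# larger, with 0pt and 36pt acting as the lower and upper anchor points.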
if size == 0:
return base
size = float(size)
base = float(base)
if abs(size - base) < 0.1:
return 0
sign = -1 if size < base else 1
endp = 0 if size < base else 36
diff = (abs(base - size) * 3) + ((36 - size) / 100)
logb = abs(base - endp)
if logb == 1.0:
logb = 1.1
try:
result = sign * math.log(diff, logb)
except ValueError:
if diff < 0:
# Size is both very large and close to base
return 0
if logb == 0:
logb = 1e-6
if diff == 0:
diff = 1e-6
result = sign * math.log(diff, logb)
return result
def __getitem__(self, ssize):
ssize = asfloat(ssize, 0)
if ssize in self.cache:
return self.cache[ssize]
dsize = self.map(ssize)
self.cache[ssize] = dsize
return dsize
def map(self, ssize):
sbase = self.sbase
prop = self.relate(ssize, sbase)
diff = [(abs(prop - p), s) for p, s in self.dprop]
dsize = min(diff)[1]
return dsize
class ScaleMapper(object):
def __init__(self, sbase, dbase):
self.dscale = float(dbase) / float(sbase)
def __getitem__(self, ssize):
ssize = asfloat(ssize, 0)
dsize = ssize * self.dscale
return dsize
class NullMapper(object):
def __init__(self):
pass
def __getitem__(self, ssize):
return ssize
def FontMapper(sbase=None, dbase=None, dkey=None):
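# Pick a font-size mapping strategy: a KeyMapper when the source base size,
# destination base size and the destination's key sizes are all known, a
# purely linear ScaleMapper when only the two base sizes are known, and a
# pass-through NullMapper otherwise. For instance (illustrative values only),
# FontMapper(12.0, 16.0, [12, 14, 16, 18, 20])[12.0] maps the source base
# size onto the destination base size of 16pt.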
if sbase and dbase and dkey:
return KeyMapper(sbase, dbase, dkey)
elif sbase and dbase:
return ScaleMapper(sbase, dbase)
else:
return NullMapper()
class EmbedFontsCSSRules(object):
def __init__(self, body_font_family, rules):
self.body_font_family, self.rules = body_font_family, rules
self.href = None
def __call__(self, oeb):
if not self.body_font_family:
return None
if not self.href:
iid, href = oeb.manifest.generate('page_styles', 'page_styles.css')
rules = [css_text(x) for x in self.rules]
rules = '\n\n'.join(rules)
sheet = css_parser.parseString(rules, validate=False)
self.href = oeb.manifest.add(iid, href, guess_type(href)[0],
data=sheet).href
return self.href
class CSSFlattener(object):
def __init__(self, fbase=None, fkey=None, lineh=None, unfloat=False,
untable=False, page_break_on_body=False, specializer=None,
transform_css_rules=()):
self.fbase = fbase
self.transform_css_rules = transform_css_rules
if self.transform_css_rules:
from calibre.ebooks.css_transform_rules import compile_rules
self.transform_css_rules = compile_rules(self.transform_css_rules)
self.fkey = fkey
self.lineh = lineh
self.unfloat = unfloat
self.untable = untable
self.specializer = specializer
self.page_break_on_body = page_break_on_body
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Flattening CSS and remapping font sizes...')
self.context = self.opts = context
self.oeb = oeb
self.items = list(self.oeb.spine)
titlepage = self.oeb.guide.get('titlepage')
if titlepage is not None:
titlepage = titlepage.item
if titlepage is not None and titlepage not in self.items:
self.items.append(titlepage)
epub3_nav = None
if getattr(self.opts, 'epub3_nav_href', None):
epub3_nav = self.oeb.manifest.hrefs.get(self.opts.epub3_nav_href)
if epub3_nav is not None and epub3_nav not in self.items:
self.items.append(epub3_nav)
self.filter_css = frozenset()
if self.opts.filter_css:
try:
self.filter_css = {x.strip().lower() for x in
self.opts.filter_css.split(',')}
except:
self.oeb.log.warning('Failed to parse filter_css, ignoring')
else:
from calibre.ebooks.oeb.normalize_css import normalize_filter_css
self.filter_css = frozenset(normalize_filter_css(self.filter_css))
self.oeb.log.debug('Filtering CSS properties: %s'%
', '.join(self.filter_css))
for item in oeb.manifest.values():
# Make all links to resources absolute, as these sheets will be
# consolidated into a single stylesheet at the root of the document
if item.media_type in OEB_STYLES:
css_parser.replaceUrls(item.data, item.abshref,
ignoreImportRules=True)
self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
self.opts.embed_font_family)
# Store for use in output plugins/transforms that generate content,
# like the AZW3 output inline ToC.
self.oeb.store_embed_font_rules = EmbedFontsCSSRules(self.body_font_family,
self.embed_font_rules)
self.stylize_spine()
self.sbase = self.baseline_spine() if self.fbase else None
self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
self.flatten_spine()
if epub3_nav is not None:
self.opts.epub3_nav_parsed = epub3_nav.data
self.store_page_margins()
def store_page_margins(self):
self.opts._stored_page_margins = {}
for item, stylizer in iteritems(self.stylizers):
margins = self.opts._stored_page_margins[item.href] = {}
for prop, val in stylizer.page_rule.items():
p, w = prop.partition('-')[::2]
if p == 'margin':
margins[w] = unit_convert(
val, stylizer.profile.width_pts, stylizer.body_font_size,
stylizer.profile.dpi, body_font_size=stylizer.body_font_size)
def get_embed_font_info(self, family, failure_critical=True):
efi = []
body_font_family = None
if not family:
return body_font_family, efi
from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family
try:
faces = font_scanner.fonts_for_family(family)
except NoFonts:
msg = ('No embeddable fonts found for family: %r'%family)
if failure_critical:
raise ValueError(msg)
self.oeb.log.warn(msg)
return body_font_family, efi
if not faces:
msg = ('No embeddable fonts found for family: %r'%family)
if failure_critical:
raise ValueError(msg)
self.oeb.log.warn(msg)
return body_font_family, efi
for i, font in enumerate(faces):
ext = 'otf' if font['is_otf'] else 'ttf'
fid, href = self.oeb.manifest.generate(id=u'font',
href='fonts/%s.%s'%(ascii_filename(font['full_name']).replace(' ', '-'), ext))
item = self.oeb.manifest.add(fid, href,
guess_type('dummy.'+ext)[0],
data=font_scanner.get_font_data(font))
item.unload_data_from_memory()
cfont = {
'font-family': '"%s"'%font['font-family'],
'panose-1': ' '.join(map(unicode_type, font['panose'])),
'src': 'url(%s)'%item.href,
}
if i == 0:
generic_family = panose_to_css_generic_family(font['panose'])
body_font_family = "'%s',%s"%(font['font-family'], generic_family)
self.oeb.log('Embedding font: %s'%font['font-family'])
for k in ('font-weight', 'font-style', 'font-stretch'):
if font[k] != 'normal':
cfont[k] = font[k]
rule = '@font-face { %s }'%('; '.join('%s:%s'%(k, v) for k, v in
iteritems(cfont)))
rule = css_parser.parseString(rule)
efi.append(rule)
return body_font_family, efi
def stylize_spine(self):
self.stylizers = {}
profile = self.context.source
css = ''
for item in self.items:
html = item.data
body = html.find(XHTML('body'))
if 'style' in html.attrib:
b = body.attrib.get('style', '')
body.set('style', html.get('style') + ';' + b)
del html.attrib['style']
bs = body.get('style', '').split(';')
bs.append('margin-top: 0pt')
bs.append('margin-bottom: 0pt')
if float(self.context.margin_left) >= 0:
bs.append('margin-left : %gpt'%
float(self.context.margin_left))
if float(self.context.margin_right) >= 0:
bs.append('margin-right : %gpt'%
float(self.context.margin_right))
bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
if self.page_break_on_body:
bs.extend(['page-break-before: always'])
if self.context.change_justification != 'original':
bs.append('text-align: '+ self.context.change_justification)
if self.body_font_family:
bs.append('font-family: '+self.body_font_family)
body.set('style', '; '.join(bs))
stylizer = Stylizer(html, item.href, self.oeb, self.context, profile,
user_css=self.context.extra_css,
extra_css=css)
self.stylizers[item] = stylizer
def baseline_node(self, node, stylizer, sizes, csize):
csize = stylizer.style(node)['font-size']
if node.text:
sizes[csize] += len(COLLAPSE.sub(' ', node.text))
for child in node:
self.baseline_node(child, stylizer, sizes, csize)
if child.tail:
sizes[csize] += len(COLLAPSE.sub(' ', child.tail))
def baseline_spine(self):
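# Estimate the dominant ("base") font size of the source book: count how many
# characters are rendered at each computed size and keep the size with the
# highest character count, defaulting to 12pt if nothing can be measured.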
sizes = defaultdict(float)
for item in self.items:
html = item.data
stylizer = self.stylizers[item]
body = html.find(XHTML('body'))
fsize = self.context.source.fbase
self.baseline_node(body, stylizer, sizes, fsize)
try:
sbase = max(list(sizes.items()), key=operator.itemgetter(1))[0]
except:
sbase = 12.0
self.oeb.logger.info(
"Source base font size is %0.05fpt" % sbase)
return sbase
def clean_edges(self, cssdict, style, fsize):
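# Normalise vertical margins and paddings: values up to one source line height
# collapse to a single destination line height, larger values are rounded to a
# whole number of line heights, and everything is emitted in em units so it
# scales with the surrounding font size.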
slineh = self.sbase * 1.26
dlineh = self.lineh
for kind in ('margin', 'padding'):
for edge in ('bottom', 'top'):
property = "%s-%s" % (kind, edge)
if property not in cssdict:
continue
if '%' in cssdict[property]:
continue
value = style[property]
if value == 0:
continue
elif value <= slineh:
cssdict[property] = "%0.5fem" % (dlineh / fsize)
else:
try:
value = round(value / slineh) * dlineh
except:
self.oeb.logger.warning(
'Invalid length:', value)
value = 0.0
cssdict[property] = "%0.5fem" % (value / fsize)
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
if not isinstance(node.tag, string_or_bytes) \
or namespace(node.tag) != XHTML_NS:
return
tag = barename(node.tag)
style = stylizer.style(node)
cssdict = style.cssdict()
try:
font_size = style['font-size']
except:
font_size = self.sbase if self.sbase is not None else \
self.context.source.fbase
if tag == 'body' and isinstance(font_size, numbers.Number):
stylizer.body_font_size = font_size
if 'align' in node.attrib:
if tag != 'img':
cssdict['text-align'] = node.attrib['align']
if cssdict['text-align'] == 'center':
# align=center causes tables to be center aligned,
# which text-align does not. And the ever trustworthy Word
# uses this construct in its HTML output. See
# https://bugs.launchpad.net/bugs/1569583
if tag == 'table':
if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
else:
for table in node.iterchildren(XHTML("table")):
ts = stylizer.style(table)
if ts.get('margin-left') is None and ts.get('margin-right') is None:
ts.set('margin-left', 'auto')
ts.set('margin-right', 'auto')
else:
val = node.attrib['align']
if val in ('middle', 'bottom', 'top'):
cssdict['vertical-align'] = val
elif val in ('left', 'right'):
cssdict['float'] = val
del node.attrib['align']
if 'valign' in node.attrib and tag == 'td':
if cssdict.get('vertical-align') == 'inherit':
cssdict['vertical-align'] = node.attrib['valign']
del node.attrib['valign']
if node.tag == XHTML('font'):
tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
tag = 'div' if XPath('|'.join(tags))(node) else 'span'
node.tag = XHTML(tag)
if 'size' in node.attrib:
def force_int(raw):
return int(re.search(r'([0-9+-]+)', raw).group(1))
size = node.attrib['size'].strip()
if size:
fnums = self.context.source.fnums
if size[0] in ('+', '-'):
# Oh, the warcrimes
try:
esize = 3 + force_int(size)
except:
esize = 3
if esize < 1:
esize = 1
if esize > 7:
esize = 7
font_size = fnums[esize]
else:
try:
font_size = fnums[force_int(size)]
except:
font_size = fnums[3]
cssdict['font-size'] = '%.1fpt'%font_size
del node.attrib['size']
if 'face' in node.attrib:
cssdict['font-family'] = node.attrib['face']
del node.attrib['face']
if 'color' in node.attrib:
try:
cssdict['color'] = Property('color', node.attrib['color']).value
except (ValueError, SyntaxErr):
pass
del node.attrib['color']
if 'bgcolor' in node.attrib:
try:
cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
except (ValueError, SyntaxErr):
pass
del node.attrib['bgcolor']
if tag == 'ol' and 'type' in node.attrib:
del node.attrib['type']
if cssdict.get('font-weight', '').lower() == 'medium':
cssdict['font-weight'] = 'normal' # ADE chokes on font-weight medium
fsize = font_size
is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
# Detect drop caps generated by the docx input plugin
if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
dp = node.getparent()
if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
if stylizer.style(dp).cssdict().get('float', None) == 'left':
is_drop_cap = True
if not self.context.disable_font_rescaling and not is_drop_cap:
_sbase = self.sbase if self.sbase is not None else \
self.context.source.fbase
dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
if dyn_rescale is not None:
try:
dyn_rescale = float(dyn_rescale) / 100
except Exception:
dyn_rescale = 1
fsize = self.fmap[_sbase]
fsize *= dyn_rescale
cssdict['font-size'] = '%0.5fem'%(fsize/psize)
psize = fsize
elif 'font-size' in cssdict or tag == 'body':
fsize = self.fmap[font_size]
try:
cssdict['font-size'] = "%0.5fem" % (fsize / psize)
except ZeroDivisionError:
cssdict['font-size'] = '%.1fpt'%fsize
psize = fsize
try:
minlh = self.context.minimum_line_height / 100.
slh = style['line-height']
if not is_drop_cap and isinstance(slh, numbers.Number) and slh < minlh * fsize:
cssdict['line-height'] = unicode_type(minlh)
except Exception:
self.oeb.logger.exception('Failed to set minimum line-height')
if cssdict:
for x in self.filter_css:
popval = cssdict.pop(x, None)
if self.body_font_family and popval and x == 'font-family' \
and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
cssdict[x] = popval
if cssdict:
if self.lineh and self.fbase and tag != 'body':
self.clean_edges(cssdict, style, psize)
if 'display' in cssdict and cssdict['display'] == 'in-line':
cssdict['display'] = 'inline'
if self.unfloat and 'float' in cssdict \
and cssdict.get('display', 'none') != 'none':
del cssdict['display']
if self.untable and 'display' in cssdict \
and cssdict['display'].startswith('table'):
display = cssdict['display']
if display == 'table-cell':
cssdict['display'] = 'inline'
else:
cssdict['display'] = 'block'
if 'vertical-align' in cssdict \
and cssdict['vertical-align'] == 'sup':
cssdict['vertical-align'] = 'super'
if self.lineh and 'line-height' not in cssdict:
lineh = self.lineh / psize
cssdict['line-height'] = "%0.5fem" % lineh
if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
for prop in ('margin', 'padding', 'border'):
for edge in ('top', 'bottom'):
cssdict['%s-%s'%(prop, edge)] = '0pt'
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = \
'%fem'%self.context.insert_blank_line_size
indent_size = self.context.remove_paragraph_spacing_indent_size
keep_indents = indent_size < 0.0
if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
cssdict['text-indent'] = "%1.1fem" % indent_size
pseudo_classes = style.pseudo_classes(self.filter_css)
if cssdict or pseudo_classes:
keep_classes = set()
if cssdict:
items = sorted(iteritems(cssdict))
css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
classes = node.get('class', '').strip() or 'calibre'
classes_list = classes.split()
# lower() because otherwise if the document uses the same class
# name with different case, both cases will apply, leading
# to incorrect results.
klass = ascii_text(STRIPNUM.sub('', classes_list[0])).lower().strip().replace(' ', '_')
if css in styles:
match = styles[css]
else:
match = klass + unicode_type(names[klass] or '')
styles[css] = match
names[klass] += 1
node.attrib['class'] = match
keep_classes.add(match)
for psel, cssdict in iteritems(pseudo_classes):
items = sorted(iteritems(cssdict))
css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
pstyles = pseudo_styles[psel]
if css in pstyles:
match = pstyles[css]
else:
# We have to use a different class for each psel as
# otherwise you can have incorrect styles for a situation
# like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
# If the pcalibre class for a:hover and a:link is the same,
# then the class attribute for a.x tags will contain both
# that class and the class for a.x:hover, which is wrong.
klass = 'pcalibre'
match = klass + unicode_type(names[klass] or '')
pstyles[css] = match
names[klass] += 1
keep_classes.add(match)
node.attrib['class'] = ' '.join(keep_classes)
elif 'class' in node.attrib:
del node.attrib['class']
if 'style' in node.attrib:
del node.attrib['style']
if recurse:
for child in node:
self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
def flatten_head(self, item, href, global_href):
html = item.data
head = html.find(XHTML('head'))
def safe_lower(x):
try:
x = x.lower()
except Exception:
pass
return x
for node in html.xpath('//*[local-name()="style" or local-name()="link"]'):
if node.tag == XHTML('link') \
and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \
and safe_lower(node.get('type', CSS_MIME)) in OEB_STYLES:
node.getparent().remove(node)
elif node.tag == XHTML('style') \
and node.get('type', CSS_MIME) in OEB_STYLES:
node.getparent().remove(node)
href = item.relhref(href)
l = etree.SubElement(head, XHTML('link'),
rel='stylesheet', type=CSS_MIME, href=href)
l.tail='\n'
if global_href:
href = item.relhref(global_href)
l = etree.SubElement(head, XHTML('link'),
rel='stylesheet', type=CSS_MIME, href=href)
l.tail = '\n'
def replace_css(self, css):
manifest = self.oeb.manifest
for item in manifest.values():
if item.media_type in OEB_STYLES:
manifest.remove(item)
id, href = manifest.generate('css', 'stylesheet.css')
sheet = css_parser.parseString(css, validate=False)
if self.transform_css_rules:
from calibre.ebooks.css_transform_rules import transform_sheet
transform_sheet(self.transform_css_rules, sheet)
item = manifest.add(id, href, CSS_MIME, data=sheet)
self.oeb.manifest.main_stylesheet = item
return href
def collect_global_css(self):
global_css = defaultdict(list)
for item in self.items:
stylizer = self.stylizers[item]
if float(self.context.margin_top) >= 0:
stylizer.page_rule['margin-top'] = '%gpt'%\
float(self.context.margin_top)
if float(self.context.margin_bottom) >= 0:
stylizer.page_rule['margin-bottom'] = '%gpt'%\
float(self.context.margin_bottom)
items = sorted(stylizer.page_rule.items())
css = ';\n'.join("%s: %s" % (key, val) for key, val in items)
css = ('@page {\n%s\n}\n'%css) if items else ''
rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
raw = '\n\n'.join(rules)
css += '\n\n' + raw
global_css[css].append(item)
gc_map = {}
manifest = self.oeb.manifest
for css in global_css:
href = None
if css.strip():
id_, href = manifest.generate('page_css', 'page_styles.css')
sheet = css_parser.parseString(css, validate=False)
if self.transform_css_rules:
from calibre.ebooks.css_transform_rules import transform_sheet
transform_sheet(self.transform_css_rules, sheet)
manifest.add(id_, href, CSS_MIME, data=sheet)
gc_map[css] = href
ans = {}
for css, items in iteritems(global_css):
for item in items:
ans[item] = gc_map[css]
return ans
def flatten_spine(self):
names = defaultdict(int)
styles, pseudo_styles = {}, defaultdict(dict)
for item in self.items:
html = item.data
stylizer = self.stylizers[item]
if self.specializer is not None:
self.specializer(item, stylizer)
fsize = self.context.dest.fbase
self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False)
self.flatten_node(html.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id)
items = sorted(((key, val) for (val, key) in iteritems(styles)), key=lambda x:numeric_sort_key(x[0]))
# :hover must come after link and :active must come after :hover
psels = sorted(pseudo_styles, key=lambda x :
{'hover':1, 'active':2}.get(x, 0))
for psel in psels:
styles = pseudo_styles[psel]
if not styles:
continue
x = sorted(((k+':'+psel, v) for v, k in iteritems(styles)))
items.extend(x)
css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)
href = self.replace_css(css)
global_css = self.collect_global_css()
for item in self.items:
stylizer = self.stylizers[item]
self.flatten_head(item, href, global_css[item])


@@ -0,0 +1,55 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class Clean(object):
'''Clean up guide, leaving only known values '''
def __call__(self, oeb, opts):
self.oeb, self.log, self.opts = oeb, oeb.log, opts
if 'cover' not in self.oeb.guide:
covers = []
for x in ('other.ms-coverimage-standard', 'coverimagestandard',
'other.ms-titleimage-standard', 'other.ms-titleimage',
'other.ms-coverimage', 'other.ms-thumbimage-standard',
'other.ms-thumbimage', 'thumbimagestandard'):
if x in self.oeb.guide:
href = self.oeb.guide[x].href
try:
item = self.oeb.manifest.hrefs[href]
except KeyError:
continue
else:
covers.append([self.oeb.guide[x], len(item.data)])
covers.sort(key=lambda x: x[1], reverse=True)
if covers:
ref = covers[0][0]
if len(covers) > 1:
self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
ref.type = 'cover'
self.oeb.guide.refs['cover'] = ref
if ('start' in self.oeb.guide and 'text' not in self.oeb.guide):
# Prefer text to start as per the OPF 2.0 spec
x = self.oeb.guide['start']
self.oeb.guide.add('text', x.title, x.href)
self.oeb.guide.remove('start')
for x in list(self.oeb.guide):
if x.lower() not in {
'cover', 'titlepage', 'masthead', 'toc', 'title-page',
'copyright-page', 'text', 'index', 'glossary',
'acknowledgements', 'bibliography', 'colophon',
'copyright-page', 'dedication', 'epigraph', 'foreword',
'loi', 'lot', 'notes', 'preface'}:
item = self.oeb.guide[x]
if item.title and item.title.lower() == 'start':
continue
self.oeb.guide.remove(x)


@@ -0,0 +1,395 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, re
from xml.sax.saxutils import escape
from string import Formatter
from calibre import guess_type, strftime
from calibre.constants import iswindows
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urldefrag, urlnormalize
from calibre.library.comments import comments_to_html, markdown
from calibre.utils.date import is_date_undefined, as_local_time
from calibre.utils.icu import sort_key
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.metadata import fmt_sidx, rating_to_stars
from polyglot.builtins import unicode_type, map
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
class SafeFormatter(Formatter):
def get_value(self, *args, **kwargs):
try:
return Formatter.get_value(self, *args, **kwargs)
except KeyError:
return ''
class Base(object):
def remove_images(self, item, limit=1):
path = XPath('//h:img[@src]')
removed = 0
for img in path(item.data):
if removed >= limit:
break
href = item.abshref(img.get('src'))
image = self.oeb.manifest.hrefs.get(href)
if image is None:
href = urlnormalize(href)
image = self.oeb.manifest.hrefs.get(href)
if image is not None:
self.oeb.manifest.remove(image)
self.oeb.guide.remove_by_href(href)
img.getparent().remove(img)
removed += 1
return removed
class RemoveFirstImage(Base):
def remove_first_image(self):
deleted_item = None
for item in self.oeb.spine:
if XPath(JACKET_XPATH)(item.data):
continue
removed = self.remove_images(item)
if removed > 0:
self.log('Removed first image')
body = XPath('//h:body')(item.data)
if body:
raw = xml2text(body[0]).strip()
imgs = XPath('//h:img|//svg:svg')(item.data)
if not raw and not imgs:
self.log('Removing %s as it has no content'%item.href)
self.oeb.manifest.remove(item)
deleted_item = item
break
else:
self.log.warn('Could not find first image to remove')
if deleted_item is not None:
for item in list(self.oeb.toc):
href = urldefrag(item.href)[0]
if href == deleted_item.href:
self.oeb.toc.remove(item)
self.oeb.guide.remove_by_href(deleted_item.href)
def __call__(self, oeb, opts, metadata):
'''
Remove the first image from the book if requested in opts
'''
self.oeb, self.opts, self.log = oeb, opts, oeb.log
if opts.remove_first_image:
self.remove_first_image()
class Jacket(Base):
'''
Book jacket manipulation. Remove first image and insert comments at start of
book.
'''
def insert_metadata(self, mi):
self.log('Inserting metadata into book...')
try:
tags = list(map(unicode_type, self.oeb.metadata.subject))
except Exception:
tags = []
try:
comments = unicode_type(self.oeb.metadata.description[0])
except:
comments = ''
try:
title = unicode_type(self.oeb.metadata.title[0])
except:
title = _('Unknown')
try:
authors = list(map(unicode_type, self.oeb.metadata.creator))
except:
authors = [_('Unknown')]
root = render_jacket(mi, self.opts.output_profile,
alt_title=title, alt_tags=tags, alt_authors=authors,
alt_comments=comments, rescale_fonts=True)
id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')
jacket = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root)
self.oeb.spine.insert(0, jacket, True)
self.oeb.inserted_metadata_jacket = jacket
for img, path in referenced_images(root):
self.oeb.log('Embedding referenced image %s into jacket' % path)
ext = path.rpartition('.')[-1].lower()
item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext)
with open(path, 'rb') as f:
item = self.oeb.manifest.add(item_id, href, guess_type(href)[0], data=f.read())
item.unload_data_from_memory()
img.set('src', jacket.relhref(item.href))
def remove_existing_jacket(self):
for x in self.oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data):
self.remove_images(x, limit=sys.maxsize)
self.oeb.manifest.remove(x)
self.log('Removed existing jacket')
break
def __call__(self, oeb, opts, metadata):
'''
Add metadata in jacket.xhtml if specified in opts
If not specified, remove previous jacket instance
'''
self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.remove_existing_jacket()
if opts.insert_metadata:
self.insert_metadata(metadata)
# Render Jacket {{{
def get_rating(rating, rchar, e_rchar):
ans = ''
try:
num = float(rating)/2
except:
return ans
num = max(0, num)
num = min(num, 5)
if num < 1:
return ans
ans = ("%s%s") % (rchar * int(num), e_rchar * (5 - int(num)))
return ans
class Series(unicode_type):
def __new__(self, series, series_index):
if series and series_index is not None:
roman = _('{1} of <em>{0}</em>').format(
escape(series), escape(fmt_sidx(series_index, use_roman=True)))
combined = _('{1} of <em>{0}</em>').format(
escape(series), escape(fmt_sidx(series_index, use_roman=False)))
else:
combined = roman = escape(series or u'')
s = unicode_type.__new__(self, combined)
s.roman = roman
s.name = escape(series or '')
s.number = escape(fmt_sidx(series_index or 1.0, use_roman=False))
s.roman_number = escape(fmt_sidx(series_index or 1.0, use_roman=True))
return s
class Tags(unicode_type):
def __new__(self, tags, output_profile):
tags = [escape(x) for x in tags or ()]
t = unicode_type.__new__(self, ', '.join(tags))
t.alphabetical = ', '.join(sorted(tags, key=sort_key))
t.tags_list = tags
return t
def postprocess_jacket(root, output_profile, has_data):
# Post-process the generated html to strip out empty header items
def extract(tag):
parent = tag.getparent()
idx = parent.index(tag)
parent.remove(tag)
if tag.tail:
if idx == 0:
parent.text = (parent.text or '') + tag.tail
else:
if idx >= len(parent):
idx = -1
parent[-1].tail = (parent[-1].tail or '') + tag.tail
def extract_class(cls):
for tag in root.xpath('//*[@class="_"]'.replace('_', cls)):
extract(tag)
for key in 'series rating tags'.split():
if not has_data[key]:
extract_class('cbj_' + key)
if not has_data['pubdate']:
extract_class('cbj_pubdata')
if output_profile.short_name != 'kindle':
extract_class('cbj_kindle_banner_hr')
def render_jacket(mi, output_profile,
alt_title=_('Unknown'), alt_tags=[], alt_comments='',
alt_publisher='', rescale_fonts=False, alt_authors=None):
css = P('jacket/stylesheet.css', data=True).decode('utf-8')
template = P('jacket/template.xhtml', data=True).decode('utf-8')
template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)
try:
title_str = alt_title if mi.is_null('title') else mi.title
except:
title_str = _('Unknown')
title_str = escape(title_str)
title = '<span class="title">%s</span>' % title_str
series = Series(mi.series, mi.series_index)
try:
publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
except:
publisher = ''
publisher = escape(publisher)
try:
if is_date_undefined(mi.pubdate):
pubdate = ''
else:
dt = as_local_time(mi.pubdate)
pubdate = strftime('%Y', dt.timetuple())
except:
pubdate = ''
rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)
tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)
comments = mi.comments if mi.comments else alt_comments
comments = comments.strip()
if comments:
comments = comments_to_html(comments)
orig = mi.authors
if mi.is_null('authors'):
mi.authors = list(alt_authors or (_('Unknown'),))
try:
author = mi.format_authors()
except:
author = ''
mi.authors = orig
author = escape(author)
has_data = {}
def generate_html(comments):
args = dict(xmlns=XHTML_NS,
title_str=title_str,
css=css,
title=title,
author=author,
publisher=publisher,
pubdate_label=_('Published'), pubdate=pubdate,
series_label=_('Series'), series=series,
rating_label=_('Rating'), rating=rating,
tags_label=_('Tags'), tags=tags,
comments=comments,
footer='',
searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
)
for key in mi.custom_field_keys():
m = mi.get_user_metadata(key, False) or {}
try:
display_name, val = mi.format_field_extended(key)[:2]
dkey = key.replace('#', '_')
dt = m.get('datatype')
if dt == 'series':
args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
elif dt == 'rating':
args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
elif dt == 'comments':
val = val or ''
display = m.get('display', {})
ctype = display.get('interpret_as') or 'html'
if ctype == 'long-text':
val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
elif ctype == 'short-text':
val = '<span>%s</span>' % escape(val)
elif ctype == 'markdown':
val = markdown(val)
else:
val = comments_to_html(val)
args[dkey] = val
else:
args[dkey] = escape(val)
args[dkey+'_label'] = escape(display_name)
except Exception:
# if the val (custom column contents) is None, don't add to args
pass
if False:
print("Custom column values available in jacket template:")
for key in args.keys():
if key.startswith('_') and not key.endswith('_label'):
print(" %s: %s" % ('#' + key[1:], args[key]))
# Used in the comment describing use of custom columns in templates
# Don't change this unless you also change it in template.xhtml
args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
args['_genre'] = args.get('_genre', '{_genre}')
formatter = SafeFormatter()
generated_html = formatter.format(template, **args)
has_data['series'] = bool(series)
has_data['tags'] = bool(tags)
has_data['rating'] = bool(rating)
has_data['pubdate'] = bool(pubdate)
return strip_encoding_declarations(generated_html)
from calibre.ebooks.oeb.polish.parsing import parse
raw = generate_html(comments)
root = parse(raw, line_numbers=False, force_html5_parse=True)
if rescale_fonts:
# We ensure that the conversion pipeline will set the font sizes for
# text in the jacket to the same size as the font sizes for the rest of
# the text in the book. That means that as long as the jacket uses
# relative font sizes (em or %), the post conversion font size will be
# the same as for text in the main book. So text with size x em will
# be rescaled to the same value in both the jacket and the main content.
#
# We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(XHTML('div'))
fw.set('data-calibre-rescale', '100')
for child in body:
fw.append(child)
body.append(fw)
postprocess_jacket(root, output_profile, has_data)
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
pretty_html_tree(None, root)
return root
# }}}
def linearize_jacket(oeb):
for x in oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data):
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
e.tag = XHTML('div')
for e in XPath('//h:td')(x.data):
e.tag = XHTML('span')
break
def referenced_images(root):
for img in XPath('//h:img[@src]')(root):
src = img.get('src')
if src.startswith('file://'):
path = src[7:]
if iswindows and path.startswith('/'):
path = path[1:]
if os.path.exists(path):
yield img, path


@@ -0,0 +1,218 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
from calibre.utils.date import isoformat, now
from calibre import guess_type
from polyglot.builtins import iteritems
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
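# Copy the fields of a calibre Metadata object (mi) into the OEB metadata
# collection (m): non-empty fields replace whatever the input book declared,
# and, when override_input_metadata is True, fields that mi leaves empty are
# cleared from the book as well.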
from calibre.ebooks.oeb.base import OPF
if not mi.is_null('title'):
m.clear('title')
m.add('title', mi.title)
if mi.title_sort:
if not m.title:
m.add('title', mi.title_sort)
m.clear('title_sort')
m.add('title_sort', mi.title_sort)
if not mi.is_null('authors'):
m.filter('creator', lambda x : x.role.lower() in ['aut', ''])
for a in mi.authors:
attrib = {'role':'aut'}
if mi.author_sort:
attrib[OPF('file-as')] = mi.author_sort
m.add('creator', a, attrib=attrib)
if not mi.is_null('book_producer'):
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
m.add('contributor', mi.book_producer, role='bkp')
elif override_input_metadata:
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
if not mi.is_null('comments'):
m.clear('description')
m.add('description', mi.comments)
elif override_input_metadata:
m.clear('description')
if not mi.is_null('publisher'):
m.clear('publisher')
m.add('publisher', mi.publisher)
elif override_input_metadata:
m.clear('publisher')
if not mi.is_null('series'):
m.clear('series')
m.add('series', mi.series)
elif override_input_metadata:
m.clear('series')
identifiers = mi.get_identifiers()
set_isbn = False
for typ, val in iteritems(identifiers):
has = False
if typ.lower() == 'isbn':
set_isbn = True
for x in m.identifier:
if x.scheme.lower() == typ.lower():
x.content = val
has = True
if not has:
m.add('identifier', val, scheme=typ.upper())
if override_input_metadata and not set_isbn:
m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
if not mi.is_null('languages'):
m.clear('language')
for lang in mi.languages:
if lang and lang.lower() not in ('und', ''):
m.add('language', lang)
if not mi.is_null('series_index'):
m.clear('series_index')
m.add('series_index', mi.format_series_index())
elif override_input_metadata:
m.clear('series_index')
if not mi.is_null('rating'):
m.clear('rating')
m.add('rating', '%.2f'%mi.rating)
elif override_input_metadata:
m.clear('rating')
if not mi.is_null('tags'):
m.clear('subject')
for t in mi.tags:
m.add('subject', t)
elif override_input_metadata:
m.clear('subject')
if not mi.is_null('pubdate'):
m.clear('date')
m.add('date', isoformat(mi.pubdate))
if not mi.is_null('timestamp'):
m.clear('timestamp')
m.add('timestamp', isoformat(mi.timestamp))
if not mi.is_null('rights'):
m.clear('rights')
m.add('rights', mi.rights)
if not mi.is_null('publication_type'):
m.clear('publication_type')
m.add('publication_type', mi.publication_type)
if not m.timestamp:
m.add('timestamp', isoformat(now()))
class MergeMetadata(object):
'Merge in user metadata, including cover'
def __call__(self, oeb, mi, opts, override_input_metadata=False):
self.oeb, self.log = oeb, oeb.log
m = self.oeb.metadata
self.log('Merging user specified metadata...')
meta_info_to_oeb_metadata(mi, m, oeb.log,
override_input_metadata=override_input_metadata)
cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
m.clear('cover')
if cover_id is not None:
m.add('cover', cover_id)
if mi.uuid is not None:
m.filter('identifier', lambda x:x.id=='uuid_id')
self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
scheme='uuid')
self.oeb.uid = self.oeb.metadata.identifier[-1]
if mi.application_id is not None:
m.filter('identifier', lambda x:x.scheme=='calibre')
self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre')
def set_cover(self, mi, prefer_metadata_cover):
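# Work out which cover image the output should use: cover data supplied via
# mi normally wins, unless prefer_metadata_cover is set and the book already
# declares a cover in its guide, in which case the existing cover is kept.
# Returns the manifest id of the chosen cover image, or None if nothing usable
# is found.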
cdata, ext = b'', 'jpg'
if mi.cover and os.access(mi.cover, os.R_OK):
with open(mi.cover, 'rb') as f:
cdata = f.read()
ext = mi.cover.rpartition('.')[-1].lower().strip()
elif mi.cover_data and mi.cover_data[-1]:
cdata = mi.cover_data[1]
ext = mi.cover_data[0]
if ext not in ('png', 'jpg', 'jpeg'):
ext = 'jpg'
id = old_cover = None
if 'cover' in self.oeb.guide:
old_cover = self.oeb.guide['cover']
if prefer_metadata_cover and old_cover is not None:
cdata = b''
if cdata:
self.oeb.guide.remove('cover')
self.oeb.guide.remove('titlepage')
elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None:
# The Amazon formats don't support HTML cover pages, so remove them
# even if no cover was specified.
self.oeb.guide.remove('titlepage')
do_remove_old_cover = False
if old_cover is not None:
if old_cover.href in self.oeb.manifest.hrefs:
item = self.oeb.manifest.hrefs[old_cover.href]
if not cdata:
return item.id
do_remove_old_cover = True
elif not cdata:
id = self.oeb.manifest.generate(id='cover')[0]
self.oeb.manifest.add(id, old_cover.href, 'image/jpeg')
return id
new_cover_item = None
if cdata:
id, href = self.oeb.manifest.generate('cover', 'cover.'+ext)
new_cover_item = self.oeb.manifest.add(id, href, guess_type('cover.'+ext)[0], data=cdata)
self.oeb.guide.add('cover', 'Cover', href)
if do_remove_old_cover:
self.remove_old_cover(item, new_cover_item.href)
return id
def remove_old_cover(self, cover_item, new_cover_href=None):
from calibre.ebooks.oeb.base import XPath, XLINK
from lxml import etree
self.oeb.manifest.remove(cover_item)
# Remove any references to the cover in the HTML
affected_items = set()
xp = XPath('//h:img[@src]|//svg:image[@xl:href]')
for i, item in enumerate(self.oeb.spine):
try:
images = xp(item.data)
except Exception:
images = ()
removed = False
for img in images:
href = img.get('src') or img.get(XLINK('href'))
try:
href = item.abshref(href)
except Exception:
continue # Invalid URL, ignore
if href == cover_item.href:
if new_cover_href is not None:
replacement_href = item.relhref(new_cover_href)
attr = 'src' if img.tag.endswith('img') else XLINK('href')
img.set(attr, replacement_href)
else:
p = img.getparent()
if p.tag.endswith('}svg'):
p.getparent().remove(p)
else:
p.remove(img)
removed = True
if removed:
affected_items.add(item)
# Check if the resulting HTML has no content, if so remove it
for item in affected_items:
body = XPath('//h:body')(item.data)
if body:
text = etree.tostring(body[0], method='text', encoding='unicode')
else:
text = ''
text = re.sub(r'\s+', '', text)
if not text and not XPath('//h:img|//svg:svg')(item.data):
self.log('Removing %s as it is a wrapper around'
' the cover image'%item.href)
self.oeb.spine.remove(item)
self.oeb.manifest.remove(item)
self.oeb.guide.remove_by_href(item.href)


@@ -0,0 +1,189 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import numbers
from collections import Counter
from calibre.ebooks.oeb.base import barename, XPath
from polyglot.builtins import iteritems
class RemoveAdobeMargins(object):
'''
Remove margins specified in Adobe's page templates.
'''
def __call__(self, oeb, log, opts):
self.oeb, self.opts, self.log = oeb, opts, log
for item in self.oeb.manifest:
if item.media_type in {
'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml',
'application/adobe-page-template+xml', 'application/adobe.page-template+xml',
} and hasattr(item.data, 'xpath'):
self.log('Removing page margins specified in the'
' Adobe page template')
for elem in item.data.xpath(
'//*[@margin-bottom or @margin-top '
'or @margin-left or @margin-right]'):
for margin in ('left', 'right', 'top', 'bottom'):
attr = 'margin-'+margin
elem.attrib.pop(attr, None)
class NegativeTextIndent(Exception):
pass
class RemoveFakeMargins(object):
'''
Remove left and right margins from paragraph/divs if the same margin is specified
on almost all the elements at that level.
Must be called only after CSS flattening
'''
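# For example, if 96% of the <p> elements at a given nesting level share a
# 'margin-left: 5pt', that margin is judged to be a fake page margin and is
# stripped from those elements (see analyze_stats for the 95% threshold).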
def __call__(self, oeb, log, opts):
if not opts.remove_fake_margins:
return
self.oeb, self.log, self.opts = oeb, log, opts
stylesheet = None
self.levels = {}
self.stats = {}
self.selector_map = {}
stylesheet = self.oeb.manifest.main_stylesheet
if stylesheet is None:
return
self.log('Removing fake margins...')
stylesheet = stylesheet.data
from css_parser.css import CSSRule
for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
self.selector_map[rule.selectorList.selectorText] = rule.style
self.find_levels()
for level in self.levels:
try:
self.process_level(level)
except NegativeTextIndent:
self.log.debug('Negative text indent detected at level '
'%s, ignoring this level' % level)
def get_margins(self, elem):
cls = elem.get('class', None)
if cls:
style = self.selector_map.get('.'+cls, None)
if style:
try:
ti = style['text-indent']
except:
pass
else:
if ((hasattr(ti, 'startswith') and ti.startswith('-')) or
isinstance(ti, numbers.Number) and ti < 0):
raise NegativeTextIndent()
return style.marginLeft, style.marginRight, style
return '', '', None
def process_level(self, level):
elems = self.levels[level]
self.stats[level+'_left'] = Counter()
self.stats[level+'_right'] = Counter()
for elem in elems:
lm, rm = self.get_margins(elem)[:2]
self.stats[level+'_left'][lm] += 1
self.stats[level+'_right'][rm] += 1
self.log.debug(level, ' left margin stats:', self.stats[level+'_left'])
self.log.debug(level, ' right margin stats:', self.stats[level+'_right'])
remove_left = self.analyze_stats(self.stats[level+'_left'])
remove_right = self.analyze_stats(self.stats[level+'_right'])
if remove_left:
mcl = self.stats[level+'_left'].most_common(1)[0][0]
self.log('Removing level %s left margin of:'%level, mcl)
if remove_right:
mcr = self.stats[level+'_right'].most_common(1)[0][0]
self.log('Removing level %s right margin of:'%level, mcr)
if remove_left or remove_right:
for elem in elems:
lm, rm, style = self.get_margins(elem)
if remove_left and lm == mcl:
style.removeProperty('margin-left')
if remove_right and rm == mcr:
style.removeProperty('margin-right')
def find_levels(self):
def level_of(elem, body):
ans = 1
while elem.getparent() is not body:
ans += 1
elem = elem.getparent()
return ans
paras = XPath('descendant::h:p|descendant::h:div')
for item in self.oeb.spine:
body = XPath('//h:body')(item.data)
if not body:
continue
body = body[0]
for p in paras(body):
level = level_of(p, body)
level = '%s_%d'%(barename(p.tag), level)
if level not in self.levels:
self.levels[level] = []
self.levels[level].append(p)
remove = set()
for k, v in iteritems(self.levels):
num = len(v)
self.log.debug('Found %d items of level:'%num, k)
level = int(k.split('_')[-1])
tag = k.split('_')[0]
if tag == 'p' and num < 25:
remove.add(k)
if tag == 'div':
if level > 2 and num < 25:
remove.add(k)
elif level < 3:
# Check each level < 3 element and only keep those
# that have many child paras
for elem in list(v):
children = len(paras(elem))
if children < 5:
v.remove(elem)
for k in remove:
self.levels.pop(k)
self.log.debug('Ignoring level', k)
def analyze_stats(self, stats):
if not stats:
return False
mc = stats.most_common(1)
if len(mc) > 1:
return False
mc = mc[0]
most_common, most_common_count = mc
if not most_common or most_common == '0':
return False
total = sum(stats.values())
# True if greater than 95% of elements have the same margin
return most_common_count/total > 0.95


@@ -0,0 +1,324 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, uuid
from lxml import etree
from collections import OrderedDict, Counter
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
from calibre.ebooks import ConversionError
from polyglot.builtins import itervalues, unicode_type
from polyglot.urllib import urlparse
def XPath(x):
try:
return etree.XPath(x, namespaces=XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
def isspace(x):
return not x or x.replace('\xa0', '').isspace()
def at_start(elem):
' Return True if there is no content before elem '
body = XPath('ancestor-or-self::h:body')(elem)
if not body:
return True
body = body[0]
ancestors = frozenset(XPath('ancestor::*')(elem))
for x in body.iter():
if x is elem:
return True
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
return False
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
continue
return False
return False
class DetectStructure(object):
def __call__(self, oeb, opts):
self.log = oeb.log
self.oeb = oeb
self.opts = opts
self.log('Detecting structure...')
self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc
self.oeb.toc = TOC()
self.create_level_based_toc()
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
self.create_toc_from_chapters()
if self.oeb.toc.count() < opts.toc_threshold:
self.create_toc_from_links()
if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
self.oeb.toc = orig_toc
else:
self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count())
if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter)
for node in list(self.oeb.toc.iter()):
if not node.title or regexp.search(node.title) is not None:
self.log('Filtering', node.title if node.title else
'empty node', 'from TOC')
self.oeb.toc.remove(node)
if opts.page_breaks_before is not None:
pb_xpath = XPath(opts.page_breaks_before)
for item in oeb.spine:
for elem in pb_xpath(item.data):
try:
prev = next(elem.itersiblings(tag=etree.Element,
preceding=True))
if (barename(elem.tag) in {'h1', 'h2'} and barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())):
# We have two adjacent headings, do not put a page
# break on the second one
continue
except StopIteration:
pass
style = elem.get('style', '')
if style:
style += '; '
elem.set('style', style+'page-break-before:always')
for node in self.oeb.toc.iter():
if not node.title or not node.title.strip():
node.title = _('Unnamed')
if self.opts.start_reading_at:
self.detect_start_reading()
def detect_start_reading(self):
expr = self.opts.start_reading_at
try:
expr = XPath(expr)
except:
self.log.warn(
'Invalid start reading at XPath expression, ignoring: %s'%expr)
return
for item in self.oeb.spine:
if not hasattr(item.data, 'xpath'):
continue
matches = expr(item.data)
if matches:
elem = matches[0]
eid = elem.get('id', None)
if not eid:
eid = 'start_reading_at_'+unicode_type(uuid.uuid4()).replace('-', '')
elem.set('id', eid)
if 'text' in self.oeb.guide:
self.oeb.guide.remove('text')
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
self.log('Setting start reading at position to %s in %s'%(
self.opts.start_reading_at, item.href))
return
self.log.warn("Failed to find start reading at position: %s"%
self.opts.start_reading_at)
def get_toc_parts_for_xpath(self, expr):
# if an attribute is selected by the xpath expr then truncate it
# from the path and instead return it as where to find the title text
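# e.g. "//h:h1/@title" -> ("//h:h1", "title"), while "//h:h1" -> ("//h:h1", None)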
title_attribute_regex = re.compile(r'/@([-\w]+)$')
match = title_attribute_regex.search(expr)
if match is not None:
return expr[0:match.start()], match.group(1)
return expr, None
def detect_chapters(self):
self.detected_chapters = []
self.chapter_title_attribute = None
def find_matches(expr, doc):
try:
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
return []
if self.opts.chapter:
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
self.chapter_title_attribute = title_attribute
for item in self.oeb.spine:
for x in find_matches(chapter_path, item.data):
self.detected_chapters.append((item, x))
chapter_mark = self.opts.chapter_mark
page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always'
c = Counter()
for item, elem in self.detected_chapters:
c[item] += 1
text = xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
if chapter_mark == 'rule':
mark = elem.makeelement(XHTML('hr'))
elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they
# are at the start of the file, in which case inserting a
# page break is unnecessary and can lead to extra blank
# pages in the PDF Output plugin. We need to use two as
# feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression.
continue
mark = elem.makeelement(XHTML('div'), style=page_break_after)
else: # chapter_mark == 'both':
mark = elem.makeelement(XHTML('hr'), style=page_break_before)
try:
elem.addprevious(mark)
except TypeError:
self.log.exception('Failed to mark chapter')
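    # Rough illustration of the marking above (markup is hypothetical): with
    # --chapter-mark pagebreak, a matched heading that is not at the start of
    # its file gains a preceding sibling along the lines of
    #   <div style="display: block; page-break-after: always"/>
    #   <h1>Chapter Two</h1>
    # while 'rule' inserts a plain <hr/> and 'both' an <hr/> carrying the
    # page-break-before style.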
def create_level_based_toc(self):
if self.opts.level1_toc is not None:
self.add_leveled_toc_items()
def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
self.oeb.toc.add(text, href, play_order=counter)
counter += 1
def create_toc_from_links(self):
num = 0
for item in self.oeb.spine:
for a in XPath('//h:a[@href]')(item.data):
href = a.get('href')
try:
purl = urlparse(href)
except ValueError:
self.log.warning('Ignoring malformed URL:', href)
continue
if not purl[0] or purl[0] == 'file':
href, frag = purl.path, purl.fragment
href = item.abshref(href)
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = xml2text(a)
text = text[:100].strip()
if (not self.opts.duplicate_links_in_toc and
self.oeb.toc.has_text(text)):
continue
try:
self.oeb.toc.add(text, href,
play_order=self.oeb.toc.next_play_order())
num += 1
except ValueError:
self.oeb.log.exception('Failed to process link: %r' % href)
continue # Most likely an incorrectly URL encoded link
if self.opts.max_toc_links > 0 and \
num >= self.opts.max_toc_links:
self.log('Maximum TOC links reached, stopping.')
return
def elem_to_link(self, item, elem, title_attribute, counter):
text = ''
if title_attribute is not None:
text = elem.get(title_attribute, '')
if not text:
text = xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id)
href = '#'.join((item.href, id))
return text, href
def add_leveled_toc_items(self):
added = OrderedDict()
added2 = OrderedDict()
counter = 1
def find_matches(expr, doc):
try:
ans = XPath(expr)(doc)
len(ans)
return ans
            except Exception:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
return []
for document in self.oeb.spine:
previous_level1 = list(itervalues(added))[-1] if added else None
previous_level2 = list(itervalues(added2))[-1] if added2 else None
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
for elem in find_matches(level1_toc, document.data):
text, _href = self.elem_to_link(document, elem, level1_title, counter)
counter += 1
if text:
node = self.oeb.toc.add(text, _href,
play_order=self.oeb.toc.next_play_order())
added[elem] = node
# node.add(_('Top'), _href)
if self.opts.level2_toc is not None and added:
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
for elem in find_matches(level2_toc, document.data):
level1 = None
for item in document.data.iterdescendants():
if item in added:
level1 = added[item]
elif item == elem:
if level1 is None:
if previous_level1 is None:
break
level1 = previous_level1
text, _href = self.elem_to_link(document, elem, level2_title, counter)
counter += 1
if text:
added2[elem] = level1.add(text, _href,
play_order=self.oeb.toc.next_play_order())
break
if self.opts.level3_toc is not None and added2:
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
for elem in find_matches(level3_toc, document.data):
level2 = None
for item in document.data.iterdescendants():
if item in added2:
level2 = added2[item]
elif item == elem:
if level2 is None:
if previous_level2 is None:
break
level2 = previous_level2
text, _href = \
self.elem_to_link(document, elem, level3_title, counter)
counter += 1
if text:
level2.add(text, _href,
play_order=self.oeb.toc.next_play_order())
break
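    # Hedged walk-through of the nesting above (option values are
    # illustrative, not defaults): given
    #   --level1-toc //h:h1  --level2-toc //h:h2  --level3-toc //h:h3/@title
    # each matched <h1> becomes a top-level TOC node, each <h2> nests under
    # the nearest matched <h1> preceding it in the document (or, failing
    # that, under the last level-1 node from an earlier spine item), and each
    # <h3> nests one level further, taking its text from the title attribute
    # rather than from the element text.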

View File

@@ -0,0 +1,73 @@
'''
OPF manifest trimming transform.
'''
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
from polyglot.urllib import urldefrag
class ManifestTrimmer(object):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
import css_parser
oeb.logger.info('Trimming unused files from manifest...')
self.opts = context
used = set()
for term in oeb.metadata:
for item in oeb.metadata[term]:
if item.value in oeb.manifest.hrefs:
used.add(oeb.manifest.hrefs[item.value])
elif item.value in oeb.manifest.ids:
used.add(oeb.manifest.ids[item.value])
for ref in oeb.guide.values():
path, _ = urldefrag(ref.href)
if path in oeb.manifest.hrefs:
used.add(oeb.manifest.hrefs[path])
# TOC items are required to be in the spine
for item in oeb.spine:
used.add(item)
unchecked = used
while unchecked:
new = set()
for item in unchecked:
if (item.media_type in OEB_DOCS or
item.media_type[-4:] in ('/xml', '+xml')) and \
item.data is not None:
hrefs = [r[2] for r in iterlinks(item.data)]
for href in hrefs:
if isinstance(href, bytes):
href = href.decode('utf-8')
try:
href = item.abshref(urlnormalize(href))
                        except Exception:
continue
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used:
new.add(found)
elif item.media_type == CSS_MIME:
for href in css_parser.getUrls(item.data):
href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used:
new.add(found)
used.update(new)
unchecked = new
for item in oeb.manifest.values():
if item not in used:
oeb.logger.info('Trimming %r from manifest' % item.href)
oeb.manifest.remove(item)
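# Usage sketch: the transform is normally driven by the conversion pipeline;
# ``opts``, ``oeb`` and ``context`` below are assumptions standing for the
# pipeline options and the OEBBook being converted, not values defined here.
#
#   trimmer = ManifestTrimmer.generate(opts)
#   trimmer(oeb, context)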

View File

@@ -0,0 +1,78 @@
'''
Directory output OEBBook writer.
'''
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
from calibre.ebooks.oeb.base import DirContainer, OEBError
__all__ = ['OEBWriter']
class OEBWriter(object):
DEFAULT_PROFILE = 'PRS505'
"""Default renderer profile for content written with this Writer."""
TRANSFORMS = []
"""List of transforms to apply to content written with this Writer."""
def __init__(self, version='2.0', page_map=False, pretty_print=False):
self.version = version
self.page_map = page_map
self.pretty_print = pretty_print
@classmethod
def config(cls, cfg):
"""Add any book-writing options to the :class:`Config` object
:param:`cfg`.
"""
oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.'))
versions = ['1.2', '2.0']
oeb('opf_version', ['--opf-version'], default='2.0', choices=versions,
help=_('OPF version to generate. Default is %default.'))
oeb('adobe_page_map', ['--adobe-page-map'], default=False,
help=_('Generate an Adobe "page-map" file if pagination '
'information is available.'))
return cfg
@classmethod
def generate(cls, opts):
"""Generate a Writer instance from command-line options."""
version = opts.opf_version
page_map = opts.adobe_page_map
pretty_print = opts.pretty_print
return cls(version=version, page_map=page_map,
pretty_print=pretty_print)
def __call__(self, oeb, path):
"""
Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
at :param:`path`.
"""
version = int(self.version[0])
opfname = None
if os.path.splitext(path)[1].lower() == '.opf':
opfname = os.path.basename(path)
path = os.path.dirname(path)
if not os.path.isdir(path):
os.mkdir(path)
output = DirContainer(path, oeb.log)
for item in oeb.manifest.values():
output.write(item.href, item.bytes_representation)
if version == 1:
metadata = oeb.to_opf1()
elif version == 2:
metadata = oeb.to_opf2(page_map=self.page_map)
else:
raise OEBError("Unrecognized OPF version %r" % self.version)
pretty_print = self.pretty_print
for mime, (href, data) in metadata.items():
if opfname and mime == OPF_MIME:
href = opfname
output.write(href, xml2str(data, pretty_print=pretty_print))
return
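# Usage sketch (the ``oeb`` object and output path are illustrative): write an
# OEBBook out as an exploded directory, naming the OPF file explicitly so the
# writer reuses that name for the OPF instead of the metadata href.
#
#   writer = OEBWriter(version='2.0', page_map=False, pretty_print=True)
#   writer(oeb, '/tmp/exploded-book/content.opf')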