mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-31 09:23:32 +02:00
Initial import
This commit is contained in:
4
ebook_converter/ebooks/oeb/__init__.py
Normal file
4
ebook_converter/ebooks/oeb/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
2023
ebook_converter/ebooks/oeb/base.py
Normal file
2023
ebook_converter/ebooks/oeb/base.py
Normal file
File diff suppressed because it is too large
Load Diff
437
ebook_converter/ebooks/oeb/normalize_css.py
Normal file
437
ebook_converter/ebooks/oeb/normalize_css.py
Normal file
@@ -0,0 +1,437 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import numbers
|
||||
from functools import wraps
|
||||
|
||||
from css_parser.css import PropertyValue
|
||||
from css_parser import profile as cssprofiles, CSSParser
|
||||
from tinycss.fonts3 import parse_font, serialize_font_family
|
||||
from calibre.ebooks.oeb.base import css_text
|
||||
from polyglot.builtins import iteritems, string_or_bytes, unicode_type, zip
|
||||
|
||||
# CSS 2.1 initial (default) values for properties. Used by the normalizers
# to fill in components a shorthand declaration does not specify explicitly.
DEFAULTS = {'azimuth': 'center', 'background-attachment': 'scroll', # {{{
    'background-color': 'transparent', 'background-image': 'none',
    'background-position': '0% 0%', 'background-repeat': 'repeat',
    'border-bottom-color': 'currentColor', 'border-bottom-style':
    'none', 'border-bottom-width': 'medium', 'border-collapse':
    'separate', 'border-left-color': 'currentColor',
    'border-left-style': 'none', 'border-left-width': 'medium',
    'border-right-color': 'currentColor', 'border-right-style': 'none',
    'border-right-width': 'medium', 'border-spacing': 0,
    'border-top-color': 'currentColor', 'border-top-style': 'none',
    'border-top-width': 'medium', 'bottom': 'auto', 'caption-side':
    'top', 'clear': 'none', 'clip': 'auto', 'color': 'black',
    'content': 'normal', 'counter-increment': 'none', 'counter-reset':
    'none', 'cue-after': 'none', 'cue-before': 'none', 'cursor':
    'auto', 'direction': 'ltr', 'display': 'inline', 'elevation':
    'level', 'empty-cells': 'show', 'float': 'none', 'font-family':
    'serif', 'font-size': 'medium', 'font-stretch': 'normal', 'font-style': 'normal',
    'font-variant': 'normal', 'font-weight': 'normal', 'height':
    'auto', 'left': 'auto', 'letter-spacing': 'normal', 'line-height':
    'normal', 'list-style-image': 'none', 'list-style-position':
    'outside', 'list-style-type': 'disc', 'margin-bottom': 0,
    'margin-left': 0, 'margin-right': 0, 'margin-top': 0, 'max-height':
    'none', 'max-width': 'none', 'min-height': 0, 'min-width': 0,
    'orphans': '2', 'outline-color': 'invert', 'outline-style': 'none',
    'outline-width': 'medium', 'overflow': 'visible', 'padding-bottom':
    0, 'padding-left': 0, 'padding-right': 0, 'padding-top': 0,
    'page-break-after': 'auto', 'page-break-before': 'auto',
    'page-break-inside': 'auto', 'pause-after': 0, 'pause-before': 0,
    'pitch': 'medium', 'pitch-range': '50', 'play-during': 'auto',
    'position': 'static', 'quotes': u"'“' '”' '‘' '’'", 'richness':
    '50', 'right': 'auto', 'speak': 'normal', 'speak-header': 'once',
    'speak-numeral': 'continuous', 'speak-punctuation': 'none',
    'speech-rate': 'medium', 'stress': '50', 'table-layout': 'auto',
    'text-align': 'auto', 'text-decoration': 'none', 'text-indent': 0,
    'text-shadow': 'none', 'text-transform': 'none', 'top': 'auto',
    'unicode-bidi': 'normal', 'vertical-align': 'baseline',
    'visibility': 'visible', 'voice-family': 'default', 'volume':
    'medium', 'white-space': 'normal', 'widows': '2', 'width': 'auto',
    'word-spacing': 'normal', 'z-index': 'auto'}

# }}}

# Edge order used throughout this module; matches the CSS shorthand
# top/right/bottom/left convention.
EDGES = ('top', 'right', 'bottom', 'left')
# The longhand components of each border-<edge> shorthand.
BORDER_PROPS = ('color', 'style', 'width')
|
||||
|
||||
|
||||
def normalize_edge(name, cssvalue):
    """Expand an edge shorthand (margin, padding, border-width, ...) into
    its four per-edge longhand properties.

    Follows the standard CSS rules: one value applies to all edges, two
    values are vertical/horizontal, three are top/horizontal/bottom, and
    four are top/right/bottom/left.
    """
    # A PropertyValue may hold multiple primitive values; anything else is
    # treated as a single value.
    if isinstance(cssvalue, PropertyValue):
        tokens = [css_text(v) for v in cssvalue]
    else:
        tokens = [css_text(cssvalue)]

    count = len(tokens)
    if count == 1:
        expanded = tokens * 4
    elif count == 2:
        # (vertical, horizontal) -> top right bottom left
        expanded = tokens * 2
    elif count == 3:
        # (top, horizontal, bottom) -> repeat horizontal for left
        expanded = tokens + [tokens[1]]
    else:
        expanded = tokens[:4]

    style = {}
    if '-' in name:
        # e.g. border-width -> border-<edge>-width
        head, _sep, tail = name.partition('-')
        for edge, token in zip(EDGES, expanded):
            style['%s-%s-%s' % (head, edge, tail)] = token
    else:
        # e.g. margin -> margin-<edge>
        for edge, token in zip(EDGES, expanded):
            style['%s-%s' % (name, edge)] = token
    return style
|
||||
|
||||
|
||||
def simple_normalizer(prefix, names, check_inherit=True):
    """Build a normalizer for a shorthand that expands to ``prefix-<name>``
    longhands, delegating to :func:`normalize_simple_composition`."""
    sub_properties = tuple('%s-%s' % (prefix, n) for n in names)

    @wraps(normalize_simple_composition)
    def wrapper(name, cssvalue):
        return normalize_simple_composition(
            name, cssvalue, sub_properties, check_inherit=check_inherit)

    return wrapper
|
||||
|
||||
|
||||
def normalize_simple_composition(name, cssvalue, composition, check_inherit=True):
    """Expand a shorthand *cssvalue* into the longhands listed in
    *composition*.

    Each primitive value is assigned to the first longhand in *composition*
    that validates it; unmatched longhands keep their defaults (or
    'inherit' when the whole shorthand is 'inherit').
    """
    if check_inherit and css_text(cssvalue) == 'inherit':
        style = {prop: 'inherit' for prop in composition}
    else:
        style = {prop: DEFAULTS[prop] for prop in composition}
    try:
        tokens = [css_text(v) for v in cssvalue]
    except TypeError:
        # Not iterable: a single primitive value
        tokens = [css_text(cssvalue)]
    while tokens:
        token = tokens.pop()
        for prop in composition:
            # Assign the token to the first longhand that accepts it
            if cssprofiles.validate(prop, token):
                style[prop] = token
                break
    return style
|
||||
|
||||
|
||||
# Longhands produced by the 'font' shorthand.
font_composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family')
|
||||
|
||||
|
||||
def normalize_font(cssvalue, font_family_as_list=False):
    """Expand the 'font' shorthand into its longhand properties.

    See https://developer.mozilla.org/en-US/docs/Web/CSS/font

    :param font_family_as_list: when True, return 'font-family' as a list
        of family names; otherwise serialize it to a single string.
    """
    val = css_text(cssvalue)
    if val == 'inherit':
        ans = {prop: 'inherit' for prop in font_composition}
    else:
        ans = {prop: DEFAULTS[prop] for prop in font_composition}
        # System-font keywords expand to pure defaults; anything else is
        # parsed as a real shorthand value.
        if val not in {'caption', 'icon', 'menu', 'message-box', 'small-caption', 'status-bar'}:
            ans.update(parse_font(val))
    if font_family_as_list:
        if isinstance(ans['font-family'], string_or_bytes):
            ans['font-family'] = [part.strip() for part in ans['font-family'].split(',')]
    elif not isinstance(ans['font-family'], string_or_bytes):
        ans['font-family'] = serialize_font_family(ans['font-family'])
    return ans
|
||||
|
||||
|
||||
def normalize_border(name, cssvalue):
    """Expand the 'border' shorthand into all twelve
    border-<edge>-<color|style|width> longhands by normalizing one edge and
    replicating the result across the remaining edges."""
    first = EDGES[0]
    style = normalizers['border-' + first]('border-' + first, cssvalue)
    template = dict(style)
    for other_edge in EDGES[1:]:
        for key, value in iteritems(template):
            style[key.replace(first, other_edge)] = value
    return style
|
||||
|
||||
|
||||
# Map of shorthand property name -> function expanding it into longhands.
normalizers = {
    'list-style': simple_normalizer('list-style', ('type', 'position', 'image')),
    'font': lambda prop, v: normalize_font(v),
    'border': normalize_border,
}

# Edge shorthands all follow the standard CSS 1-4 value expansion rules.
for x in ('margin', 'padding', 'border-style', 'border-width', 'border-color'):
    normalizers[x] = normalize_edge

# border-top/right/bottom/left each expand into color/style/width longhands.
for x in EDGES:
    name = 'border-' + x
    normalizers[name] = simple_normalizer(name, BORDER_PROPS, check_inherit=False)

# Representative values used by normalize_filter_css() to discover which
# longhands a given shorthand expands into.
SHORTHAND_DEFAULTS = {
    'margin': '0', 'padding': '0', 'border-style': 'none', 'border-width': '0', 'border-color': 'currentColor',
    'border':'none', 'border-left': 'none', 'border-right':'none', 'border-top': 'none', 'border-bottom': 'none',
    'list-style': 'inherit', 'font': 'inherit',
}

# Lazily created shared parser instance; see safe_parser().
_safe_parser = None
|
||||
|
||||
|
||||
def safe_parser():
    """Return a module-wide shared CSSParser that does not validate and
    logs only critical errors. Created lazily on first use."""
    global _safe_parser
    if _safe_parser is None:
        import logging
        _safe_parser = CSSParser(loglevel=logging.CRITICAL, validate=False)
    return _safe_parser
|
||||
|
||||
|
||||
def normalize_filter_css(props):
    """Return the set of property names in *props* plus, for every known
    shorthand among them, all the longhand names it expands into."""
    expanded = set()
    parser = safe_parser()
    for prop in props:
        expanded.add(prop)
        normalizer = normalizers.get(prop, None)
        if normalizer is not None and prop in SHORTHAND_DEFAULTS:
            # Parse a representative declaration just to obtain a value we
            # can run through the normalizer; only the keys are used.
            decl = parser.parseStyle('%s: %s' % (prop, SHORTHAND_DEFAULTS[prop]))
            cssvalue = decl.getPropertyCSSValue(decl.item(0))
            expanded |= set(normalizer(prop, cssvalue))
    return expanded
|
||||
|
||||
|
||||
def condense_edge(vals):
    """Collapse four per-edge longhand properties into the shortest
    equivalent CSS shorthand value.

    :param vals: property objects with ``name`` ('<prefix>-<edge>') and
        ``value`` attributes.
    :return: the shorthand value string, or None when not all four edges
        are present (condensation is then impossible).
    """
    edges = {prop.name.rpartition('-')[-1]: prop.value for prop in vals}
    if set(edges) != {'left', 'top', 'right', 'bottom'} or len(edges) != 4:
        return
    # Record only the distinct members of each opposing pair.
    ce = {}
    for first, second in (('left', 'right'), ('top', 'bottom')):
        if edges[first] == edges[second]:
            ce[first] = edges[first]
        else:
            ce[first] = edges[first]
            ce[second] = edges[second]
    distinct = len(ce)
    if distinct == 4:
        return ' '.join(ce[k] for k in ('top', 'right', 'bottom', 'left'))
    if distinct == 3:
        if 'right' in ce:
            # top == bottom but left != right: the 3-value form cannot
            # express this, so emit four values (bottom repeats top).
            return ' '.join(ce[k] for k in ('top', 'right', 'top', 'left'))
        # left == right: top / horizontal / bottom
        return ' '.join(ce[k] for k in ('top', 'left', 'bottom'))
    if distinct == 2:
        if ce['top'] == ce['left']:
            return ce['top']
        # vertical / horizontal
        return ' '.join(ce[k] for k in ('top', 'left'))
|
||||
|
||||
|
||||
def simple_condenser(prefix, func):
    """Wrap *func* — which maps a list of longhand properties to a
    shorthand value (or None when no condensation is possible) — into a
    condenser that rewrites a css_parser style declaration in place."""
    @wraps(func)
    def condense_simple(style, props):
        shorthand = func(props)
        if shorthand is None:
            return
        # Replace the longhands with the single shorthand property.
        for prop in props:
            style.removeProperty(prop.name)
        style.setProperty(prefix, shorthand)
    return condense_simple
|
||||
|
||||
|
||||
def condense_border(style, props):
    """Condense border longhands in *style* in place.

    First folds complete border-<edge>-<color|style|width> triples into the
    corresponding border-<edge> shorthand; then, if all four edge
    shorthands exist with an identical value, folds them into a single
    'border' property.
    """
    prop_map = {p.name:p for p in props}
    edge_vals = []
    for edge in EDGES:
        name = 'border-%s' % edge
        vals = []
        for prop in BORDER_PROPS:
            x = prop_map.get('%s-%s' % (name, prop), None)
            if x is not None:
                vals.append(x)
        if len(vals) == 3:
            # All three longhands present: replace them with border-<edge>
            for prop in vals:
                style.removeProperty(prop.name)
            style.setProperty(name, ' '.join(x.value for x in vals))
            prop_map[name] = style.getProperty(name)
        x = prop_map.get(name, None)
        if x is not None:
            edge_vals.append(x)
    # All four edges present and identical -> single 'border' shorthand
    if len(edge_vals) == 4 and len({x.value for x in edge_vals}) == 1:
        for prop in edge_vals:
            style.removeProperty(prop.name)
        style.setProperty('border', edge_vals[0].value)
|
||||
|
||||
|
||||
# Map of shorthand prefix -> function that condenses its longhands in place.
condensers = {'margin': simple_condenser('margin', condense_edge), 'padding': simple_condenser('padding', condense_edge), 'border': condense_border}
|
||||
|
||||
|
||||
def condense_rule(style):
    """Condense margin/padding/border longhand properties of a single style
    declaration in place, using the registered condensers."""
    grouped = {'margin-': [], 'padding-': [], 'border-': []}
    for prop in style.getProperties():
        prop_name = prop.name
        if not prop_name:
            continue
        for family in grouped:
            if prop_name.startswith(family):
                grouped[family].append(prop)
                break
    for family, members in iteritems(grouped):
        # Only condense when there is something to merge and no member
        # carries a priority (e.g. !important).
        if len(members) > 1 and {m.priority for m in members} == {''}:
            condensers[family[:-1]](style, members)
|
||||
|
||||
|
||||
def condense_sheet(sheet):
    """Apply condense_rule() to every style rule of a stylesheet in place."""
    for rule in sheet.cssRules:
        if rule.type != rule.STYLE_RULE:
            continue
        condense_rule(rule.style)
|
||||
|
||||
|
||||
def test_normalization(return_tests=False):  # {{{
    """Self-tests for the normalizers and condensers in this module.

    :param return_tests: when True, return the unittest suite instead of
        running it with a text runner.
    """
    import unittest
    from css_parser import parseStyle
    from itertools import product

    class TestNormalization(unittest.TestCase):
        longMessage = True
        maxDiff = None

        def test_font_normalization(self):
            def font_dict(expected):
                # Expected longhands on top of the defaults; an empty
                # expectation means normalization produced nothing.
                ans = {k:DEFAULTS[k] for k in font_composition} if expected else {}
                ans.update(expected)
                return ans

            for raw, expected in iteritems({
                'some_font': {'font-family':'some_font'}, 'inherit':{k:'inherit' for k in font_composition},
                '1.2pt/1.4 A_Font': {'font-family':'A_Font', 'font-size':'1.2pt', 'line-height':'1.4'},
                'bad font': {'font-family':'"bad font"'}, '10% serif': {'font-family':'serif', 'font-size':'10%'},
                '12px "My Font", serif': {'font-family':'"My Font", serif', 'font-size': '12px'},
                'normal 0.6em/135% arial,sans-serif': {'font-family': 'arial, sans-serif', 'font-size': '0.6em', 'line-height':'135%', 'font-style':'normal'},
                'bold italic large serif': {'font-family':'serif', 'font-weight':'bold', 'font-style':'italic', 'font-size':'large'},
                'bold italic small-caps larger/normal serif':
                {'font-family':'serif', 'font-weight':'bold', 'font-style':'italic', 'font-size':'larger',
                 'line-height':'normal', 'font-variant':'small-caps'},
                '2em A B': {'font-family': '"A B"', 'font-size': '2em'},
            }):
                val = tuple(parseStyle('font: %s' % raw, validate=False))[0].cssValue
                style = normalizers['font']('font', val)
                self.assertDictEqual(font_dict(expected), style, raw)

        def test_border_normalization(self):
            def border_edge_dict(expected, edge='right'):
                # Expected longhands for one edge, on top of the defaults.
                ans = {'border-%s-%s' % (edge, x): DEFAULTS['border-%s-%s' % (edge, x)] for x in ('style', 'width', 'color')}
                for x, v in iteritems(expected):
                    ans['border-%s-%s' % (edge, x)] = v
                return ans

            def border_dict(expected):
                # Expected longhands for all four edges.
                ans = {}
                for edge in EDGES:
                    ans.update(border_edge_dict(expected, edge))
                return ans

            def border_val_dict(expected, val='color'):
                # Expected per-edge values for a single border-<val> shorthand.
                ans = {'border-%s-%s' % (edge, val): DEFAULTS['border-%s-%s' % (edge, val)] for edge in EDGES}
                for edge in EDGES:
                    ans['border-%s-%s' % (edge, val)] = expected
                return ans

            for raw, expected in iteritems({
                'solid 1px red': {'color':'red', 'width':'1px', 'style':'solid'},
                '1px': {'width': '1px'}, '#aaa': {'color': '#aaa'},
                '2em groove': {'width':'2em', 'style':'groove'},
            }):
                for edge in EDGES:
                    br = 'border-%s' % edge
                    val = tuple(parseStyle('%s: %s' % (br, raw), validate=False))[0].cssValue
                    self.assertDictEqual(border_edge_dict(expected, edge), normalizers[br](br, val))

            for raw, expected in iteritems({
                'solid 1px red': {'color':'red', 'width':'1px', 'style':'solid'},
                '1px': {'width': '1px'}, '#aaa': {'color': '#aaa'},
                'thin groove': {'width':'thin', 'style':'groove'},
            }):
                val = tuple(parseStyle('%s: %s' % ('border', raw), validate=False))[0].cssValue
                self.assertDictEqual(border_dict(expected), normalizers['border']('border', val))

            for name, val in iteritems({
                'width': '10%', 'color': 'rgb(0, 1, 1)', 'style': 'double',
            }):
                cval = tuple(parseStyle('border-%s: %s' % (name, val), validate=False))[0].cssValue
                self.assertDictEqual(border_val_dict(val, name), normalizers['border-'+name]('border-'+name, cval))

        def test_edge_normalization(self):
            def edge_dict(prefix, expected):
                return {'%s-%s' % (prefix, edge) : x for edge, x in zip(EDGES, expected)}
            for raw, expected in iteritems({
                '2px': ('2px', '2px', '2px', '2px'),
                '1em 2em': ('1em', '2em', '1em', '2em'),
                '1em 2em 3em': ('1em', '2em', '3em', '2em'),
                '1 2 3 4': ('1', '2', '3', '4'),
            }):
                for prefix in ('margin', 'padding'):
                    cval = tuple(parseStyle('%s: %s' % (prefix, raw), validate=False))[0].cssValue
                    self.assertDictEqual(edge_dict(prefix, expected), normalizers[prefix](prefix, cval))

        def test_list_style_normalization(self):
            def ls_dict(expected):
                ans = {'list-style-%s' % x : DEFAULTS['list-style-%s' % x] for x in ('type', 'image', 'position')}
                for k, v in iteritems(expected):
                    ans['list-style-%s' % k] = v
                return ans
            for raw, expected in iteritems({
                'url(http://www.example.com/images/list.png)': {'image': 'url(http://www.example.com/images/list.png)'},
                'inside square': {'position':'inside', 'type':'square'},
                'upper-roman url(img) outside': {'position':'outside', 'type':'upper-roman', 'image':'url(img)'},
            }):
                cval = tuple(parseStyle('list-style: %s' % raw, validate=False))[0].cssValue
                self.assertDictEqual(ls_dict(expected), normalizers['list-style']('list-style', cval))

        def test_filter_css_normalization(self):
            ae = self.assertEqual
            ae({'font'} | set(font_composition), normalize_filter_css({'font'}))
            for p in ('margin', 'padding'):
                ae({p} | {p + '-' + x for x in EDGES}, normalize_filter_css({p}))
            bvals = {'border-%s-%s' % (edge, x) for edge in EDGES for x in BORDER_PROPS}
            ae(bvals | {'border'}, normalize_filter_css({'border'}))
            for x in BORDER_PROPS:
                sbvals = {'border-%s-%s' % (e, x) for e in EDGES}
                ae(sbvals | {'border-%s' % x}, normalize_filter_css({'border-%s' % x}))
            for e in EDGES:
                sbvals = {'border-%s-%s' % (e, x) for x in BORDER_PROPS}
                ae(sbvals | {'border-%s' % e}, normalize_filter_css({'border-%s' % e}))
            ae({'list-style', 'list-style-image', 'list-style-type', 'list-style-position'}, normalize_filter_css({'list-style'}))

        def test_edge_condensation(self):
            # Keys are per-edge values zipped with (left, top, right, bottom);
            # values are the expected condensed shorthand (None = no change).
            for s, v in iteritems({
                (1, 1, 3) : None,
                (1, 2, 3, 4) : '2pt 3pt 4pt 1pt',
                (1, 2, 3, 2) : '2pt 3pt 2pt 1pt',
                (1, 2, 1, 3) : '2pt 1pt 3pt',
                (1, 2, 1, 2) : '2pt 1pt',
                (1, 1, 1, 1) : '1pt',
                ('2%', '2%', '2%', '2%') : '2%',
                tuple('0 0 0 0'.split()) : '0',
            }):
                for prefix in ('margin', 'padding'):
                    css = {'%s-%s' % (prefix, x) : unicode_type(y)+'pt' if isinstance(y, numbers.Number) else y
                           for x, y in zip(('left', 'top', 'right', 'bottom'), s)}
                    css = '; '.join(('%s:%s' % (k, v) for k, v in iteritems(css)))
                    style = parseStyle(css)
                    condense_rule(style)
                    val = getattr(style.getProperty(prefix), 'value', None)
                    self.assertEqual(v, val)
                    if val is not None:
                        # The longhands must be gone after condensation
                        for edge in EDGES:
                            self.assertFalse(getattr(style.getProperty('%s-%s' % (prefix, edge)), 'value', None))

        def test_border_condensation(self):
            vals = 'red solid 5px'
            css = '; '.join('border-%s-%s: %s' % (edge, p, v) for edge in EDGES for p, v in zip(BORDER_PROPS, vals.split()))
            style = parseStyle(css)
            condense_rule(style)
            # All longhands and intermediate shorthands should be gone
            for e, p in product(EDGES, BORDER_PROPS):
                self.assertFalse(style.getProperty('border-%s-%s' % (e, p)))
                self.assertFalse(style.getProperty('border-%s' % e))
                self.assertFalse(style.getProperty('border-%s' % p))
            self.assertEqual(style.getProperty('border').value, vals)
            # A single edge condenses only to border-top
            css = '; '.join('border-%s-%s: %s' % (edge, p, v) for edge in ('top',) for p, v in zip(BORDER_PROPS, vals.split()))
            style = parseStyle(css)
            condense_rule(style)
            self.assertEqual(css_text(style), 'border-top: %s' % vals)
            # Differing edges condense per-edge but not to 'border'
            css += ';' + '; '.join('border-%s-%s: %s' % (edge, p, v) for edge in ('right', 'left', 'bottom') for p, v in
                    zip(BORDER_PROPS, vals.replace('red', 'green').split()))
            style = parseStyle(css)
            condense_rule(style)
            self.assertEqual(len(style.getProperties()), 4)
            self.assertEqual(style.getProperty('border-top').value, vals)
            self.assertEqual(style.getProperty('border-left').value, vals.replace('red', 'green'))

    tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestNormalization)
    if return_tests:
        return tests
    unittest.TextTestRunner(verbosity=4).run(tests)
# }}}
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the module self-tests when executed directly
    test_normalization()
|
||||
389
ebook_converter/ebooks/oeb/parse_utils.py
Normal file
389
ebook_converter/ebooks/oeb/parse_utils.py
Normal file
@@ -0,0 +1,389 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree, html
|
||||
|
||||
from calibre import xml_replace_entities, force_unicode
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.constants import filesystem_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map
|
||||
|
||||
# Forgiving XML parser used for fallback parsing; never touches the
# network and does not resolve entities.
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
|
||||
|
||||
|
||||
class NotHTML(Exception):
    """Raised when parsed data turns out not to be an HTML document."""

    def __init__(self, root_tag):
        super(NotHTML, self).__init__('Data is not HTML')
        # Keep the offending root tag so callers can inspect what was found
        self.root_tag = root_tag
|
||||
|
||||
|
||||
def barename(name):
    """Return the local part of a Clark-notation name: '{ns}tag' -> 'tag'.
    Names without a namespace are returned unchanged."""
    _ns, _sep, local = name.rpartition('}')
    return local
|
||||
|
||||
|
||||
def namespace(name):
    """Return the namespace URI of a Clark-notation name: '{ns}tag' -> 'ns'.
    Returns the empty string when the name has no namespace."""
    braced = name.rpartition('}')[0]
    return braced[1:]
|
||||
|
||||
|
||||
def XHTML(name):
    """Return *name* qualified with the XHTML namespace in Clark notation."""
    return '{%s}%s' % (XHTML_NS, name)
|
||||
|
||||
|
||||
def xpath(elem, expr):
    """Evaluate XPath *expr* on *elem* with prefix 'h' bound to XHTML."""
    return elem.xpath(expr, namespaces={'h':XHTML_NS})
|
||||
|
||||
|
||||
def XPath(expr):
    """Compile XPath *expr* with prefix 'h' bound to the XHTML namespace."""
    return etree.XPath(expr, namespaces={'h':XHTML_NS})
|
||||
|
||||
|
||||
# Matches encoding-declaring <meta http-equiv="Content-Type"> elements in <head>
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
|
||||
|
||||
|
||||
def merge_multiple_html_heads_and_bodies(root, log=None):
    """Merge multiple <head>/<body> sections under *root* into a single
    <head> followed by a single <body>, concatenating children in document
    order. Returns *root* unchanged when there is at most one of each."""
    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
    if not (len(heads) > 1 or len(bodies) > 1):
        return root
    # NOTE(review): removing while iterating relies on lxml's child-list
    # semantics; presumably intentional here — verify before changing.
    for child in root:
        root.remove(child)
    head = root.makeelement(XHTML('head'))
    body = root.makeelement(XHTML('body'))
    for h in heads:
        for x in h:
            head.append(x)
    for b in bodies:
        for x in b:
            body.append(x)
    # Append the merged head first, then the merged body
    tuple(map(root.append, (head, body)))
    if log is not None:
        log.warn('Merging multiple <head> and <body> sections')
    return root
|
||||
|
||||
|
||||
def clone_element(elem, nsmap={}, in_context=True):
    """Create a copy of *elem* with the given namespace map.

    NOTE(review): lxml's ``extend`` reparents nodes, so the children of
    *elem* are MOVED into the clone, leaving the original childless —
    presumably intentional since callers replace the original element.
    The mutable default ``nsmap={}`` is only read here, never mutated.

    :param in_context: when True, create the clone via the tree's root
        makeelement so it shares the original document's context.
    """
    if in_context:
        maker = elem.getroottree().getroot().makeelement
    else:
        maker = etree.Element
    nelem = maker(elem.tag, attrib=elem.attrib,
            nsmap=nsmap)
    nelem.text, nelem.tail = elem.text, elem.tail
    nelem.extend(elem)  # moves (does not copy) the children
    return nelem
|
||||
|
||||
|
||||
def node_depth(node):
    """Return the number of ancestors of *node* (0 for a root node)."""
    depth = 0
    parent = node.getparent()
    while parent is not None:
        depth += 1
        parent = parent.getparent()
    return depth
|
||||
|
||||
|
||||
def html5_parse(data, max_nesting_depth=100):
    """Parse markup with the HTML 5 parsing algorithm (html5_parser).

    :raises ValueError: when the resulting tree has a leaf deeper than
        *max_nesting_depth*, which the HTML 5 error-recovery algorithm can
        produce from pathological input.
    """
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, string_or_bytes) and not len(x):  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                        ' depth > %d'%max_nesting_depth)
    return data
|
||||
|
||||
|
||||
def _html4_parse(data):
    """Last-resort parse: use lxml's forgiving HTML parser, then re-serialize
    and re-parse as XML so the result is a proper XML tree."""
    data = html.fromstring(data)
    data.attrib.pop('xmlns', None)
    # Strip '-' runs from comment text: '--' inside a comment would make
    # the serialized document ill-formed XML
    for elem in data.iter(tag=etree.Comment):
        if elem.text:
            elem.text = elem.text.strip('-')
    data = etree.tostring(data, encoding='unicode')

    data = safe_xml_fromstring(data)
    return data
|
||||
|
||||
|
||||
def clean_word_doc(data, log):
    """Strip Microsoft Office markup from raw markup text.

    Detects Microsoft namespace prefixes from xmlns declarations, then
    removes empty and self-closing tags in those namespaces. Returns the
    (possibly unchanged) string.
    """
    ms_prefixes = [m.group(1) for m in
                   re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data)]
    if ms_prefixes:
        log.warn('Found microsoft markup, cleaning...')
        # Remove empty tags as they are not rendered by browsers
        # but can become renderable HTML tags like <p/> if the
        # document is parsed by an HTML parser
        alternation = '|'.join(ms_prefixes)
        empty_pair = re.compile(
            r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>' % alternation,
            re.DOTALL)
        data = empty_pair.sub('', data)
        self_closing = re.compile(
            r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>' % alternation)
        data = self_closing.sub('', data)
    return data
|
||||
|
||||
|
||||
def ensure_namespace_prefixes(node, nsmap):
    """Return *node* (or a rebuilt copy) whose namespace map uses exactly
    the prefixes in *nsmap* for their URIs, dropping any other prefixes
    that point at the same URIs."""
    namespace_uris = frozenset(itervalues(nsmap))
    # Keep existing prefixes only if they refer to unrelated namespaces
    fnsmap = {k:v for k, v in iteritems(node.nsmap) if v not in namespace_uris}
    fnsmap.update(nsmap)
    if fnsmap != dict(node.nsmap):
        # lxml namespace maps are immutable, so the element must be rebuilt
        node = clone_element(node, nsmap=fnsmap, in_context=False)
    return node
|
||||
|
||||
|
||||
class HTML5Doc(ValueError):
    """Signals that a document is un-namespaced HTML 5 and must be handled
    by the HTML 5 parser instead of the XML parser."""
    pass
|
||||
|
||||
|
||||
def check_for_html5(prefix, root):
    """Raise HTML5Doc when *prefix* contains an HTML 5 doctype and the
    parsed tree has un-namespaced <svg> elements (a sign the XML parse did
    not apply the XHTML namespace)."""
    has_h5_doctype = re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None
    if has_h5_doctype and root.xpath('//svg'):
        raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser')
|
||||
|
||||
|
||||
def parse_html(data, log=None, decoder=None, preprocessor=None,
        filename='<string>', non_html_file_tags=frozenset()):
    """Parse arbitrary (possibly broken) HTML/XHTML into a normalized lxml
    tree in the XHTML namespace.

    Tries progressively more forgiving parsers (strict XML -> XML with
    entity replacement -> HTML 5 -> lxml HTML 4), then fixes up the tree:
    forces the XHTML namespace, merges duplicate head/body sections,
    guarantees <head>, <title> and <body> exist, strips Microsoft markup,
    removes empty inline anchors/b/i/u, and converts contentful <br> to
    <div>.

    :param decoder: optional callable to decode byte input; defaults to
        charset detection via xml_to_unicode.
    :param preprocessor: optional callable applied to the text before
        parsing.
    :param non_html_file_tags: root tag names for which NotHTML is raised
        instead of wrapping the content in an <html> skeleton.
    :raises NotHTML: when the root tag is in *non_html_file_tags*.
    """
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    # Decode bytes input to text
    if not isinstance(data, unicode_type):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                # Substitute private entity references directly in the text
                pat = re.compile(r'&(%s);'%('|'.join(list(user_entities.keys()))))
                data = pat.sub(lambda m:user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Try with more & more drastic measures to parse
    try:
        data = safe_xml_fromstring(data, recover=False)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more'
                ' forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = safe_xml_fromstring(data, recover=False)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception(
                    'HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(iteritems(x.attrib)):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except:
                # Comments/processing instructions have non-string tags
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML'%filename)
        nroot = safe_xml_fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment'%filename)
            nroot = safe_xml_fromstring('<html><body/></html>')
            parent = nroot[0]
        # Reparent every node under the new skeleton
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = XHTML_NS
        data = etree.tostring(data, encoding='unicode')

        try:
            data = safe_xml_fromstring(data, recover=False)
        except:
            # Progressive cleanup of constructs that break XML parsing
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = safe_xml_fromstring(data, recover=False)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s'%
                        filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
                        data)
                data = data.replace(
                        "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
                        '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = safe_xml_fromstring(data)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s'% filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = safe_xml_fromstring(data)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'),
            nsmap={None: XHTML_NS}, attrib=attrib)
        # Re-tag every element in the foreign namespace as XHTML
        for elem in data.iterdescendants():
            if isinstance(elem.tag, string_or_bytes) and \
                namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    # Remove non default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = _('Unknown')
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'),
        attrib={'http-equiv': 'Content-Type'})
    meta.set('content', 'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            # A body exists but in the wrong place: move it to the top level
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        # Remove element *a*, preserving its tail text by attaching it to
        # the previous sibling (or the parent's text when a is first).
        p = a.getparent()
        idx = p.index(a) -1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n '
        head.tail = '\n '
        for child in head:
            child.tail = '\n '
        child.tail = '\n '

    return data
|
||||
10
ebook_converter/ebooks/oeb/polish/__init__.py
Normal file
10
ebook_converter/ebooks/oeb/polish/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
1552
ebook_converter/ebooks/oeb/polish/container.py
Normal file
1552
ebook_converter/ebooks/oeb/polish/container.py
Normal file
File diff suppressed because it is too large
Load Diff
23
ebook_converter/ebooks/oeb/polish/errors.py
Normal file
23
ebook_converter/ebooks/oeb/polish/errors.py
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.ebooks import DRMError as _DRMError
|
||||
|
||||
|
||||
class InvalidBook(ValueError):
    """Raised when a book container cannot be constructed from the input."""
    pass
|
||||
|
||||
|
||||
class DRMError(_DRMError):
    """DRM error carrying a fixed, user-facing message for the editor."""

    def __init__(self):
        super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
|
||||
|
||||
|
||||
class MalformedMarkup(ValueError):
    """Raised when markup is found to be malformed."""
    pass
|
||||
52
ebook_converter/ebooks/oeb/polish/opf.py
Normal file
52
ebook_converter/ebooks/oeb/polish/opf.py
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
|
||||
def get_book_language(container):
    """Return the canonical language code declared in the book's OPF.

    Scans the <dc:language> elements in document order and returns the
    first value that canonicalizes to a known language code, or None
    when no usable declaration is found.
    """
    for element in container.opf_xpath('//dc:language'):
        declared = element.text
        if not declared:
            continue
        # Some books stuff several comma-separated languages into one
        # tag; only the first is considered.
        code = canonicalize_lang(declared.split(',')[0].strip())
        if code:
            return code
|
||||
|
||||
|
||||
def set_guide_item(container, item_type, title, name, frag=None):
    """Set (or remove) the <guide> reference of type ``item_type`` in the OPF.

    :param container: The book container whose OPF is modified.
    :param item_type: The value of the reference's ``type`` attribute (matched
        case-insensitively against existing references).
    :param title: The human readable title for the reference.
    :param name: The name of the file the reference points to. If falsy, any
        existing references of ``item_type`` are removed instead.
    :param frag: Optional fragment appended to the href as ``#frag``.
    """
    ref_tag = '{%s}reference' % OPF_NAMESPACES['opf']
    href = None
    if name:
        href = container.name_to_href(name, container.opf_name)
        if frag:
            href += '#' + frag

    guides = container.opf_xpath('//opf:guide')
    # Create a <guide> element only when we actually have something to add
    if not guides and href:
        g = container.opf.makeelement('{%s}guide' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']})
        container.insert_into_xml(container.opf, g)
        guides = [g]

    for guide in guides:
        # Collect existing references of this type (case-insensitive match)
        matches = []
        for child in guide.iterchildren(etree.Element):
            if child.tag == ref_tag and child.get('type', '').lower() == item_type.lower():
                matches.append(child)
        if not matches and href:
            r = guide.makeelement(ref_tag, type=item_type, nsmap={'opf':OPF_NAMESPACES['opf']})
            container.insert_into_xml(guide, r)
            matches.append(r)
        for m in matches:
            if href:
                # Update in place
                m.set('title', title), m.set('href', href), m.set('type', item_type)
            else:
                # No destination: remove the reference entirely
                container.remove_from_xml(m)
    container.dirty(container.opf_name)
|
||||
|
||||
99
ebook_converter/ebooks/oeb/polish/parsing.py
Normal file
99
ebook_converter/ebooks/oeb/polish/parsing.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
|
||||
from lxml.etree import Element as LxmlElement
|
||||
import html5_parser
|
||||
|
||||
from calibre import xml_replace_entities
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||
|
||||
|
||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse markup with the HTML 5 parsing algorithm and return the lxml root.

    Bytes input is decoded (via *decoder* or charset autodetection),
    entities and newlines are optionally normalized, and the result is
    validated to have an <html> root in the expected namespace.
    Raises ValueError when the parse does not produce such a root.
    """
    if isinstance(raw, bytes):
        raw = decoder(raw) if decoder is not None else xml_to_unicode(raw)[0]
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(
        raw, maybe_xhtml=not discard_namespaces,
        line_number_attr=linenumber_attribute, keep_doctype=False,
        sanitize_names=True)
    if discard_namespaces:
        root_ok = root.tag == 'html'
    else:
        root_ok = root.tag == '{%s}%s' % (XHTML_NS, 'html') and not root.prefix
    if not root_ok:
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
|
||||
|
||||
|
||||
def handle_private_entities(data):
    """Expand user-defined <!ENTITY> declarations found before <html>.

    If the preamble contains a DOCTYPE with entity definitions, the
    preamble is replaced by an equivalent number of newlines (so source
    line numbers are preserved) and every ``&name;`` reference in the
    rest of the document is substituted with its defined value.
    """
    prefix = ''
    start = data.find('<html')
    if start == -1:
        start = data.find('<HTML')
    if start > -1:
        prefix = data[:start]
    newline_count = prefix.count('\n')
    if '<!DOCTYPE' in prefix:  # Handle user defined entities
        entities = {}
        for m in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', prefix):
            value = m.group(2)
            if value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            entities[m.group(1)] = value
        if entities:
            # Drop the preamble but keep its newlines for line numbering
            data = ('\n' * newline_count) + data[start:]
            pattern = re.compile(r'&(%s);' % '|'.join(entities))
            data = pattern.sub(lambda m: entities[m.group(1)], data)
    return data
|
||||
|
||||
|
||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    """Parse XHTML, trying strict XML first and falling back to HTML 5.

    The input is decoded, private entities are expanded, and any preamble
    before <html> is stripped (with line numbers preserved). If strict XML
    parsing fails, or ``force_html5_parse`` is True, the tag-soup HTML 5
    parser is used instead.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            # Record source line numbers as attributes on every element
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, unicode_type(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc smoke test: parse deliberately malformed markup and dump the tree
    from lxml import etree
    root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> \n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
    print(etree.tostring(root, encoding='utf-8'))
    print()
|
||||
252
ebook_converter/ebooks/oeb/polish/pretty.py
Normal file
252
ebook_converter/ebooks/oeb/polish/pretty.py
Normal file
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import textwrap
|
||||
from polyglot.builtins import iteritems, map
|
||||
|
||||
# from lxml.etree import Element
|
||||
|
||||
from calibre import force_unicode
|
||||
from calibre.ebooks.oeb.base import (
|
||||
serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG)
|
||||
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES
|
||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||
from calibre.utils.icu import sort_key
|
||||
|
||||
|
||||
def isspace(x):
    """Return True if *x* contains only HTML whitespace characters
    (tab, LF, FF, CR, space). The empty string counts as whitespace."""
    return x.strip('\u0009\u000a\u000c\u000d\u0020') == ''
|
||||
|
||||
|
||||
def pretty_xml_tree(elem, level=0, indent='  '):
    ''' XML beautifier, assumes that elements that have children do not have
    textual content. Also assumes that there is no text immediately after
    closing tags. These are true for opf/ncx and container.xml files. If either
    of the assumptions are violated, there should be no data loss, but pretty
    printing wont produce optimal results.'''
    if (not elem.text and len(elem) > 0) or (elem.text and isspace(elem.text)):
        # Indent the first child one level deeper than this element
        elem.text = '\n' + (indent * (level+1))
    for i, child in enumerate(elem):
        pretty_xml_tree(child, level=level+1, indent=indent)
        if not child.tail or isspace(child.tail):
            l = level + 1
            if i == len(elem) - 1:
                # The last child's tail precedes this element's closing
                # tag, so dedent one level
                l -= 1
            child.tail = '\n' + (indent * l)
|
||||
|
||||
|
||||
def pretty_opf(root):
    """Normalize element ordering in an OPF tree prior to pretty printing.

    Metadata: all dc: tags are moved to the front, title first, then
    creators, keeping the relative order of everything else. Manifest:
    items are grouped by kind (text, ncx, styles, images, fonts, audio,
    video, other) and sorted within each group.
    """
    # Put all dc: tags first starting with title and author. Preserve order for
    # the rest.
    def dckey(x):
        return {'title':0, 'creator':1}.get(barename(x.tag), 2)
    for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES):
        dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc'])
        dc_tags.sort(key=dckey)
        for x in reversed(dc_tags):
            metadata.insert(0, x)

    # Group items in the manifest
    spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES)
    spine_ids = {x:i for i, x in enumerate(spine_ids)}

    def manifest_key(x):
        # Sort key: (category, secondary). Text documents sort by spine
        # position, everything else alphabetically by href.
        mt = x.get('media-type', '')
        href = x.get('href', '')
        ext = href.rpartition('.')[-1].lower()
        cat = 1000
        if mt in OEB_DOCS:
            cat = 0
        elif mt == guess_type('a.ncx'):
            cat = 1
        elif mt in OEB_STYLES:
            cat = 2
        elif mt.startswith('image/'):
            cat = 3
        elif ext in {'otf', 'ttf', 'woff'}:
            cat = 4
        elif mt.startswith('audio/'):
            cat = 5
        elif mt.startswith('video/'):
            cat = 6

        if cat == 0:
            i = spine_ids.get(x.get('id', None), 1000000000)
        else:
            i = sort_key(href)
        return (cat, i)

    for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES):
        try:
            children = sorted(manifest, key=manifest_key)
        except AttributeError:
            continue  # There are comments so dont sort since that would mess up the comments
        for x in reversed(children):
            manifest.insert(0, x)
|
||||
|
||||
|
||||
# Tags treated as block-level for pretty printing purposes. Note that <img>
# and <svg> are included even though they are inline in CSS terms.
SVG_TAG = SVG('svg')
BLOCK_TAGS = frozenset(map(XHTML, (
    'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas', 'col', 'colgroup', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li',
    'noscript', 'ol', 'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img'))) | {SVG_TAG}
|
||||
|
||||
|
||||
def isblock(x):
    """Return True if *x* should be treated as block-level when pretty
    printing. Comments/processing instructions (callable tags) and
    elements with empty tags also count as blocks."""
    return callable(x.tag) or not x.tag or x.tag in BLOCK_TAGS
|
||||
|
||||
|
||||
def has_only_blocks(x):
    """Return True if element *x* contains only block-level children and
    no significant text of its own (neither direct text nor child tails)."""
    if hasattr(x.tag, 'split') and len(x) == 0:
        # Tag with no children,
        return False
    if x.text and not isspace(x.text):
        return False
    for child in x:
        if not isblock(child) or (child.tail and not isspace(child.tail)):
            return False
    return True
|
||||
|
||||
|
||||
def indent_for_tag(x):
    """Return the whitespace that indents element *x* on its line, i.e.
    the text after the last newline preceding it, or '' when there is
    none or it is not pure whitespace."""
    prev = x.getprevious()
    before = x.getparent().text if prev is None else prev.tail
    if not before:
        return ''
    last_line = before.rpartition('\n')[-1]
    return last_line if isspace(last_line) else ''
|
||||
|
||||
|
||||
def set_indent(elem, attr, indent):
    """Ensure the text in ``elem.<attr>`` ends with the line *indent*.

    A trailing whitespace-only line is replaced by *indent*; otherwise a
    new line containing *indent* is appended. Empty/missing text becomes
    *indent* itself.
    """
    current = getattr(elem, attr)
    if not current:
        current = indent
    else:
        lines = current.splitlines()
        if isspace(lines[-1]):
            lines[-1] = indent
        else:
            lines.append(indent)
        current = '\n'.join(lines)
    setattr(elem, attr, current)
|
||||
|
||||
|
||||
def pretty_block(parent, level=1, indent='  '):
    ''' Surround block tags with blank lines and recurse into child block tags
    that contain only other block tags '''
    if not parent.text or isspace(parent.text):
        parent.text = ''
    # Table rows and cells are separated by single newlines, everything
    # else by blank lines
    nn = '\n' if hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
    parent.text = parent.text + nn + (indent * level)
    for i, child in enumerate(parent):
        if isblock(child) and has_only_blocks(child):
            pretty_block(child, level=level+1, indent=indent)
        elif child.tag == SVG_TAG:
            pretty_xml_tree(child, level=level, indent=indent)
        l = level
        if i == len(parent) - 1:
            # Dedent before the parent's closing tag
            l -= 1
        if not child.tail or isspace(child.tail):
            child.tail = ''
        child.tail = child.tail + nn + (indent * l)
|
||||
|
||||
|
||||
def pretty_script_or_style(container, child):
    """Re-indent the text of an inline <script>/<style> element so it
    lines up with the element's own indentation. Style contents are also
    run through the CSS pretty printer."""
    if child.text:
        indent = indent_for_tag(child)
        if child.tag.endswith('style'):
            child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
        child.text = textwrap.dedent(child.text)
        child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
        set_indent(child, 'text', indent)
|
||||
|
||||
|
||||
def pretty_html_tree(container, root):
    """Pretty print an XHTML tree in place.

    The <head> is formatted as plain XML, each <body> is formatted with
    block-aware rules, and inline <script>/<style> contents are
    re-indented when *container* is available.
    """
    root.text = '\n\n'
    for child in root:
        child.tail = '\n\n'
        if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'):
            pretty_xml_tree(child)
    for body in root.findall('h:body', namespaces=XPNSMAP):
        pretty_block(body)
        # Special case the handling of a body that contains a single block tag
        # with all content. In this case we prettify the containing block tag
        # even if it has non block children.
        if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
                body[0]) and barename(body[0].tag) not in (
                    'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
            pretty_block(body[0], level=2)

    if container is not None:
        # Handle <script> and <style> tags
        for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
            pretty_script_or_style(container, child)
|
||||
|
||||
|
||||
def fix_html(container, raw):
    """Fix any parsing errors in the HTML string *raw* and return the
    repaired markup. Fixing is done using the HTML5 parsing algorithm."""
    repaired = container.parse_xhtml(raw)
    return serialize(repaired, 'text/html')
|
||||
|
||||
|
||||
def pretty_html(container, name, raw):
    """Pretty print the HTML string *raw* and return the result."""
    tree = container.parse_xhtml(raw)
    pretty_html_tree(container, tree)
    return serialize(tree, 'text/html')
|
||||
|
||||
|
||||
def pretty_css(container, name, raw):
    """Pretty print the CSS string *raw* and return the result."""
    stylesheet = container.parse_css(raw)
    return serialize(stylesheet, 'text/css')
|
||||
|
||||
|
||||
def pretty_xml(container, name, raw):
    """Pretty print the XML string *raw*. If *name* is the OPF, OPF-specific
    element reordering is performed first."""
    tree = container.parse_xml(raw)
    if name == container.opf_name:
        pretty_opf(tree)
    pretty_xml_tree(tree)
    return serialize(tree, 'text/xml')
|
||||
|
||||
|
||||
def fix_all_html(container):
    """Fix parsing errors in every HTML file in the container, using the
    HTML5 parsing algorithm. Files are reparsed and marked dirty so the
    repaired markup is written back."""
    for name, media_type in iteritems(container.mime_map):
        if media_type not in OEB_DOCS:
            continue
        container.parsed(name)
        container.dirty(name)
|
||||
|
||||
|
||||
def pretty_all(container):
    ' Pretty print all HTML/CSS/XML files in the container '
    xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
    for name, mt in iteritems(container.mime_map):
        prettied = False
        if mt in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
            prettied = True
        elif mt in OEB_STYLES:
            # Parsing alone is enough: serialization of the parsed sheet on
            # write-back produces the pretty printed form
            container.parsed(name)
            prettied = True
        elif name == container.opf_name:
            root = container.parsed(name)
            pretty_opf(root)
            pretty_xml_tree(root)
            prettied = True
        elif mt in xml_types:
            pretty_xml_tree(container.parsed(name))
            prettied = True
        if prettied:
            # Mark dirty so the prettified tree is serialized to disk
            container.dirty(name)
|
||||
891
ebook_converter/ebooks/oeb/polish/toc.py
Normal file
891
ebook_converter/ebooks/oeb/polish/toc.py
Normal file
@@ -0,0 +1,891 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from collections import Counter, OrderedDict
|
||||
from functools import partial
|
||||
from operator import itemgetter
|
||||
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from calibre import __version__
|
||||
from calibre.ebooks.oeb.base import (
|
||||
XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize, EPUB_NS, XML_NS, OEB_DOCS)
|
||||
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||
from calibre.ebooks.oeb.polish.utils import guess_type, extract
|
||||
from calibre.ebooks.oeb.polish.opf import set_guide_item, get_book_language
|
||||
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
|
||||
from calibre.translations.dynamic import translate
|
||||
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
|
||||
from polyglot.builtins import iteritems, map, unicode_type
|
||||
from polyglot.urllib import urlparse
|
||||
|
||||
# Register a custom XPath extension function, calibre:lower-case(), used
# below for case-insensitive matching of NCX element names.
ns = etree.FunctionNamespace('calibre_xpath_extensions')
ns.prefix = 'calibre'
ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x
|
||||
|
||||
|
||||
class TOC(object):
    """A node in a Table of Contents tree.

    The root is created with no arguments; entries are added with
    :meth:`add`. Each node records a title, a destination file name
    (``dest``) and an optional fragment anchor (``frag``).
    """

    # Heading text of the <nav> this ToC was parsed from, if any
    toc_title = None

    def __init__(self, title=None, dest=None, frag=None):
        self.title, self.dest, self.frag = title, dest, frag
        # Populated later by verify_toc_destinations()
        self.dest_exists = self.dest_error = None
        if self.title:
            self.title = self.title.strip()
        self.parent = None
        self.children = []
        self.page_list = []

    def add(self, title, dest, frag=None):
        """Append a new child entry and return it."""
        c = TOC(title, dest, frag)
        self.children.append(c)
        c.parent = self
        return c

    def remove(self, child):
        """Detach the direct child *child* from this node."""
        self.children.remove(child)
        child.parent = None

    def remove_from_parent(self):
        """Remove this node from the tree, promoting its children into
        its position in the parent."""
        if self.parent is None:
            return
        idx = self.parent.children.index(self)
        for child in reversed(self.children):
            child.parent = self.parent
            self.parent.children.insert(idx, child)
        self.parent.children.remove(self)
        self.parent = None

    def __iter__(self):
        for c in self.children:
            yield c

    def __len__(self):
        return len(self.children)

    def iterdescendants(self, level=None):
        """Depth-first traversal of all descendants. When *level* is
        given, yields ``(level, node)`` pairs instead of bare nodes."""
        gc_level = None if level is None else level + 1
        for child in self:
            if level is None:
                yield child
            else:
                yield level, child
            for gc in child.iterdescendants(level=gc_level):
                yield gc

    def remove_duplicates(self, only_text=True):
        """Recursively drop sibling entries with a repeated title (or
        repeated title+dest+frag when *only_text* is False)."""
        seen = set()
        remove = []
        for child in self:
            key = child.title if only_text else (child.title, child.dest, (child.frag or None))
            if key in seen:
                remove.append(child)
            else:
                seen.add(key)
                child.remove_duplicates()
        for child in remove:
            self.remove(child)

    @property
    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        try:
            return max(node.depth for node in self) + 1
        except ValueError:
            # max() over no children: this is a leaf
            return 1

    @property
    def last_child(self):
        return self.children[-1] if self.children else None

    def get_lines(self, lvl=0):
        # Tab-indented text rendering of the subtree, used by __str__
        frag = ('#'+self.frag) if self.frag else ''
        ans = [('\t'*lvl) + 'TOC: %s --> %s%s'%(self.title, self.dest, frag)]
        for child in self:
            ans.extend(child.get_lines(lvl+1))
        return ans

    def __str__(self):
        return '\n'.join(self.get_lines())

    def to_dict(self, node_counter=None):
        """Serialize the subtree to plain dicts. *node_counter*, if
        given, is an iterator whose values become per-node 'id' keys."""
        ans = {
            'title':self.title, 'dest':self.dest, 'frag':self.frag,
            'children':[c.to_dict(node_counter) for c in self.children]
        }
        if self.dest_exists is not None:
            ans['dest_exists'] = self.dest_exists
        if self.dest_error is not None:
            ans['dest_error'] = self.dest_error
        if node_counter is not None:
            ans['id'] = next(node_counter)
        return ans

    @property
    def as_dict(self):
        return self.to_dict()
|
||||
|
||||
|
||||
def child_xpath(tag, name):
    """Return the direct children of *tag* whose local name equals *name*,
    compared case-insensitively."""
    query = './*[calibre:lower-case(local-name()) = "%s"]' % name
    return tag.xpath(query)
|
||||
|
||||
|
||||
def add_from_navpoint(container, navpoint, parent, ncx_name):
    """Create a TOC entry under *parent* from an NCX <navPoint>.

    The title comes from the navLabel/text children, the destination and
    fragment from the content/@src attribute. Returns the new TOC node.
    """
    dest = frag = text = None
    nl = child_xpath(navpoint, 'navlabel')
    if nl:
        nl = nl[0]
        text = ''
        for txt in child_xpath(nl, 'text'):
            text += etree.tostring(txt, method='text',
                    encoding='unicode', with_tail=False)
    content = child_xpath(navpoint, 'content')
    if content:
        content = content[0]
        href = content.get('src', None)
        if href:
            # Resolve the href relative to the NCX file's location
            dest = container.href_to_name(href, base=ncx_name)
            frag = urlparse(href).fragment or None
    return parent.add(text or None, dest or None, frag or None)
|
||||
|
||||
|
||||
def process_ncx_node(container, node, toc_parent, ncx_name):
    """Recursively convert the <navPoint> children of *node* into TOC
    entries under *toc_parent*."""
    query = './*[calibre:lower-case(local-name()) = "navpoint"]'
    for navpoint in node.xpath(query):
        entry = add_from_navpoint(container, navpoint, toc_parent, ncx_name)
        if entry is not None:
            process_ncx_node(container, navpoint, entry, ncx_name)
|
||||
|
||||
|
||||
def parse_ncx(container, ncx_name):
    """Parse the NCX file *ncx_name* into a TOC tree.

    Also records the declared language (``lang``), the dtb:uid (``uid``)
    and any pageList targets (``page_list``) on the returned root.
    """
    root = container.parsed(ncx_name)
    toc_root = TOC()
    navmaps = root.xpath('//*[calibre:lower-case(local-name()) = "navmap"]')
    if navmaps:
        process_ncx_node(container, navmaps[0], toc_root, ncx_name)
    toc_root.lang = toc_root.uid = None
    for attr, val in iteritems(root.attrib):
        if attr.endswith('lang'):
            toc_root.lang = unicode_type(val)
            break
    for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'):
        if uid:
            toc_root.uid = unicode_type(uid)
            break
    for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
        for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'):
            pagenum = pt.get('value')
            if pagenum:
                href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src')
                if href:
                    dest = container.href_to_name(href[0], base=ncx_name)
                    frag = urlparse(href[0]).fragment or None
                    toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag})
    return toc_root
|
||||
|
||||
|
||||
def add_from_li(container, li, parent, nav_name):
    """Create a TOC entry under *parent* from a nav document <li>.

    Only the first <a>/<span> child is considered; its text (falling
    back to descendant title attributes) becomes the entry title.
    """
    dest = frag = text = None
    for x in li.iterchildren(XHTML('a'), XHTML('span')):
        text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
        href = x.get('href')
        if href:
            # Bare fragments point back into the nav document itself
            dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name)
            frag = urlparse(href).fragment or None
        break
    return parent.add(text or None, dest or None, frag or None)
|
||||
|
||||
|
||||
def first_child(parent, tagname):
    """Return the first direct child of *parent* with tag *tagname*, or
    None when there is no such child."""
    for child in parent.iterchildren(tagname):
        return child
    return None
|
||||
|
||||
|
||||
def process_nav_node(container, node, toc_parent, nav_name):
    """Recursively convert the <li> children of an EPUB 3 nav <ol> into
    TOC entries under *toc_parent*."""
    for li in node.iterchildren(XHTML('li')):
        entry = add_from_li(container, li, toc_parent, nav_name)
        if entry is None:
            continue
        nested = first_child(li, XHTML('ol'))
        if nested is not None:
            process_nav_node(container, nested, entry, nav_name)
|
||||
|
||||
|
||||
def parse_nav(container, nav_name):
    """Parse the EPUB 3 nav document *nav_name* into a TOC tree.

    Only the first <nav epub:type="toc"> is used; its heading, if any,
    is stored as ``toc_title`` on the returned root.
    """
    root = container.parsed(nav_name)
    toc_root = TOC()
    toc_root.lang = toc_root.uid = None
    et = '{%s}type' % EPUB_NS
    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(container, ol, toc_root, nav_name)
                for h in nav.iterchildren(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
                    text = etree.tostring(h, method='text', encoding='unicode', with_tail=False) or h.get('title')
                    if text:
                        toc_root.toc_title = text
                        break
                break
    return toc_root
|
||||
|
||||
|
||||
def verify_toc_destinations(container, toc):
    """Check every ToC entry's destination, setting ``dest_exists`` and
    ``dest_error`` on each node in place."""
    anchor_map = {}
    anchor_xpath = XPath('//*/@id|//h:a/@name')
    for item in toc.iterdescendants():
        name = item.dest
        if not name:
            item.dest_exists = False
            item.dest_error = _('No file named %s exists')%name
            continue
        try:
            root = container.parsed(name)
        except KeyError:
            item.dest_exists = False
            item.dest_error = _('No file named %s exists')%name
            continue
        if not hasattr(root, 'xpath'):
            # Parsed to something that is not an XML tree, i.e. not HTML
            item.dest_exists = False
            item.dest_error = _('No HTML file named %s exists')%name
            continue
        if not item.frag:
            item.dest_exists = True
            continue
        if name not in anchor_map:
            # Cache the anchors per file, they are expensive to collect
            anchor_map[name] = frozenset(anchor_xpath(root))
        item.dest_exists = item.frag in anchor_map[name]
        if not item.dest_exists:
            item.dest_error = _(
                'The anchor %(a)s does not exist in file %(f)s')%dict(
                    a=item.frag, f=name)
|
||||
|
||||
|
||||
def find_existing_ncx_toc(container):
    """Return the name of the book's NCX ToC file, or None.

    The spine's toc attribute is consulted first, then the manifest is
    searched for an item with the NCX media type.
    """
    toc = container.opf_xpath('//opf:spine/@toc')
    toc = container.manifest_id_map.get(toc[0], None) if toc else None
    if not toc:
        ncx_mt = guess_type('a.ncx')
        toc = container.manifest_type_map.get(ncx_mt, [None])[0]
    return toc or None
|
||||
|
||||
|
||||
def find_existing_nav_toc(container):
    """Return the name of the EPUB 3 nav document, or None if absent."""
    for name in container.manifest_items_with_property('nav'):
        return name
    return None
|
||||
|
||||
|
||||
def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
    """Generic ToC reader: locate the ToC file with *find_toc* and parse
    it with *parse_toc*, returning an empty TOC when none exists.

    The returned root carries ``toc_file_name`` (or None) and, when
    *verify_destinations* is True, per-entry existence information.
    """
    def empty_toc():
        ans = TOC()
        ans.lang = ans.uid = None
        return ans
    toc = find_toc(container)
    ans = empty_toc() if toc is None or not container.has_name(toc) else parse_toc(container, toc)
    ans.toc_file_name = toc if toc and container.has_name(toc) else None
    if verify_destinations:
        verify_toc_destinations(container, ans)
    return ans
|
||||
|
||||
|
||||
def get_toc(container, verify_destinations=True):
    """Return the book's ToC tree.

    EPUB 2 books use the NCX; EPUB 3 books prefer the nav document,
    falling back to the NCX when the nav yields no entries.
    """
    if container.opf_version_parsed.major < 3:
        return get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
    toc = get_x_toc(container, find_existing_nav_toc, parse_nav, verify_destinations=verify_destinations)
    if len(toc) == 0:
        toc = get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
    return toc
|
||||
|
||||
|
||||
def get_guide_landmarks(container):
    """Yield landmark dicts from the EPUB 2 <guide> section of the OPF.

    References pointing at files that do not exist in the container are
    skipped.
    """
    for ref in container.opf_xpath('./opf:guide/opf:reference'):
        href = ref.get('href')
        title = ref.get('title')
        rtype = ref.get('type')
        href, frag = href.partition('#')[::2]
        name = container.href_to_name(href, container.opf_name)
        if not container.has_name(name):
            continue
        yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
|
||||
|
||||
|
||||
def get_nav_landmarks(container):
    """Yield landmark dicts from the <nav epub:type="landmarks"> section
    of the EPUB 3 nav document, skipping links to missing files."""
    nav = find_existing_nav_toc(container)
    if nav and container.has_name(nav):
        root = container.parsed(nav)
        et = '{%s}type' % EPUB_NS
        for elem in root.iterdescendants(XHTML('nav')):
            if elem.get(et) == 'landmarks':
                for li in elem.iterdescendants(XHTML('li')):
                    for a in li.iterdescendants(XHTML('a')):
                        href, rtype = a.get('href'), a.get(et)
                        if href:
                            title = etree.tostring(a, method='text', encoding='unicode', with_tail=False).strip()
                            href, frag = href.partition('#')[::2]
                            name = container.href_to_name(href, nav)
                            if container.has_name(name):
                                yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
                            # Only the first <a> with an href per <li>
                            break
|
||||
|
||||
|
||||
def get_landmarks(container):
    """Return the book's landmarks as a list of dicts.

    EPUB 2 books use the OPF <guide>; EPUB 3 books prefer the nav
    document, falling back to the guide when it yields nothing.
    """
    if container.opf_version_parsed.major < 3:
        return list(get_guide_landmarks(container))
    landmarks = list(get_nav_landmarks(container))
    if not landmarks:
        landmarks = list(get_guide_landmarks(container))
    return landmarks
|
||||
|
||||
|
||||
def ensure_id(elem, all_ids):
    """Make sure *elem* has an id, returning ``(changed, id)``.

    An existing id (or, for ``<a>`` tags, a legacy ``name`` attribute that
    can be promoted to an id) is reused without counting as a change.
    Otherwise a fresh ``toc_N`` id not present in *all_ids* is generated
    and recorded there.
    """
    existing = elem.get('id')
    if existing:
        return False, existing
    if elem.tag == XHTML('a'):
        anchor = elem.get('name', None)
        if anchor:
            # Promote the legacy anchor name to an id so links keep working.
            elem.set('id', anchor)
            return False, anchor
    counter = 0
    candidate = None
    while candidate is None or candidate in all_ids:
        counter += 1
        candidate = 'toc_{}'.format(counter)
    elem.set('id', candidate)
    all_ids.add(candidate)
    return True, elem.get('id')
|
||||
|
||||
|
||||
def elem_to_toc_text(elem):
    """Derive a ToC entry title from *elem*: its text content, falling back
    to the ``title`` then ``alt`` attributes, whitespace-collapsed and
    capped at 1000 characters; never empty."""
    label = xml2text(elem).strip()
    if not label:
        label = elem.get('title', '')
    if not label:
        label = elem.get('alt', '')
    label = re.sub(r'\s+', ' ', label.strip())
    label = label[:1000].strip()
    if not label:
        label = _('(Untitled)')
    return label
|
||||
|
||||
|
||||
def item_at_top(elem):
    """Return True if *elem* is effectively at the very top of its page,
    i.e. no image or non-whitespace text precedes it in document order.
    Used to decide whether a ToC entry needs a fragment anchor at all."""
    try:
        body = XPath('//h:body')(elem.getroottree().getroot())[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        return False
    tree = body.getroottree()
    path = tree.getpath(elem)
    # Walk everything before elem in document order; stop when we reach it.
    for el in body.iterdescendants(etree.Element):
        epath = tree.getpath(el)
        if epath == path:
            break
        try:
            # Any image or visible text before elem disqualifies it.
            if el.tag.endswith('}img') or (el.text and el.text.strip()):
                return False
        except:
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            # (a parent's tail comes *after* elem, so it does not count).
            if el.tail and el.tail.strip():
                return False
    return True
|
||||
|
||||
|
||||
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generate ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three level
    Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
    '''
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
    empty_levels = {i+1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        # Map level number -> set of elements matching that level's XPath.
        level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        for lvl, elems in iteritems(level_item_map):
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps
    if empty_levels:
        for name, lmap in tuple(iteritems(maps)):
            lmap = {lvl:items for lvl, items in iteritems(lmap) if lvl not in empty_levels}
            # Renumber surviving levels contiguously starting at 1.
            lmap = sorted(iteritems(lmap), key=itemgetter(0))
            lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    # Map from ToC node -> the level it was created at; the root is level 0.
    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        # Find the deepest, most recently added node whose level is just
        # above child_level, walking the last-child chain from the root.
        limit = child_level - 1

        def process_node(node):
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            return node if lvl > limit else child if lvl == limit else process_node(child)

        return process_node(tocroot)

    for name, level_item_map in iteritems(maps):
        root = container.parsed(name)
        # Invert: element -> its ToC level.
        item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems}
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        # Iterate in document order so entries appear in reading order.
        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                # The item is at the top of the file: link to the file itself.
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            # ids were added to the document; write the changes back.
            container.commit_item(name, keep_parsed=True)

    return tocroot
|
||||
|
||||
|
||||
def from_links(container):
    '''
    Generate a Table of Contents from links in the book.

    Duplicate destinations and duplicate link texts are skipped; entries
    whose destinations do not exist are pruned from the result.
    '''
    toc = TOC()
    link_path = XPath('//h:a[@href]')
    seen_titles, seen_dests = set(), set()
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        for a in link_path(root):
            href = a.get('href')
            if not href or not href.strip():
                continue
            frag = None
            if href.startswith('#'):
                # Pure-fragment link: destination is the current file.
                dest = name
                frag = href[1:]
            else:
                href, _, frag = href.partition('#')
                dest = container.href_to_name(href, base=name)
                frag = frag or None
            if (dest, frag) in seen_dests:
                continue
            seen_dests.add((dest, frag))
            text = elem_to_toc_text(a)
            if text in seen_titles:
                continue
            seen_titles.add(text)
            toc.add(text, dest, frag=frag)
    verify_toc_destinations(container, toc)
    # Drop top-level entries pointing at files that do not exist.
    for child in toc:
        if not child.dest_exists:
            toc.remove(child)
    return toc
|
||||
|
||||
|
||||
def find_text(node):
    """Return a short representative text snippet from the first child of
    *node* that has any, recursing into overly long children; returns None
    (implicitly) when no child has text."""
    LIMIT = 200
    pat = re.compile(r'\s+')
    for child in node:
        # lxml iteration can yield comments/PIs too; only consider elements.
        if isinstance(child, etree._Element):
            text = xml2text(child).strip()
            text = pat.sub(' ', text)
            if len(text) < 1:
                continue
            if len(text) > LIMIT:
                # Look for less text in a child of this node, recursively
                ntext = find_text(child)
                return ntext or (text[:LIMIT] + '...')
            else:
                return text
|
||||
|
||||
|
||||
def from_files(container):
    '''
    Generate a Table of Contents from files in the book.

    One flat entry per spine item, titled with a text snippet from its body
    or, failing that, its file name (the first item is labelled as the
    cover when its name suggests so).
    '''
    toc = TOC()
    for index, spinepath in enumerate(container.spine_items):
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        bodies = XPath('//h:body')(root)
        if not bodies:
            continue
        title = find_text(bodies[0])
        if not title:
            # No usable text: fall back to the bare file name.
            title = name.rpartition('/')[-1]
            stem = title.rpartition('.')[0].lower()
            if index == 0 and stem in {'titlepage', 'cover'}:
                title = _('Cover')
        toc.add(title, name)
    return toc
|
||||
|
||||
|
||||
def node_from_loc(root, locs, totals=None):
    """Walk from the ``<body>`` of *root* down a list of child indices and
    return the element reached. When *totals* is given, the child count at
    each depth is validated against it and :class:`MalformedMarkup` is
    raised on mismatch (the two parsers disagreed about the tree)."""
    current = root.xpath('//*[local-name()="body"]')[0]
    for depth, child_index in enumerate(locs):
        kids = tuple(current.iterchildren(etree.Element))
        if totals is not None and totals[depth] != len(kids):
            raise MalformedMarkup()
        current = kids[child_index]
    return current
|
||||
|
||||
|
||||
def add_id(container, name, loc, totals=None):
    """Ensure the element at tree-location *loc* inside file *name* has an
    id, re-parsing with the HTML 5 parser when the location does not match,
    and return that id. The file is committed with any changes."""
    root = container.parsed(name)
    try:
        node = node_from_loc(root, loc, totals=totals)
    except MalformedMarkup:
        # The webkit HTML parser and the container parser have yielded
        # different node counts, this can happen if the file is valid XML
        # but contains constructs like nested <p> tags. So force parse it
        # with the HTML 5 parser and try again.
        raw = container.raw_data(name)
        root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
        try:
            node = node_from_loc(root, loc, totals=totals)
        except MalformedMarkup:
            raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
                                    ' before editing.') % name)
        # The re-parsed tree becomes the canonical one for this file.
        container.replace(name, root)

    if not node.get('id'):
        ensure_id(node, set(root.xpath('//*/@id')))
    container.commit_item(name, keep_parsed=True)
    return node.get('id')
|
||||
|
||||
|
||||
def create_ncx(toc, to_href, btitle, lang, uid):
    """Build and return an NCX document (lxml tree) from the TOC object
    *toc*. *to_href* converts a spine name to an href relative to the NCX
    file; *btitle*, *lang* and *uid* fill the required metadata."""
    # NCX requires BCP-47 style language codes (hyphens, not underscores).
    lang = lang.replace('_', '-')
    ncx = etree.Element(NCX('ncx'),
        attrib={'version': '2005-1', XML('lang'): lang},
        nsmap={None: NCX_NS})
    head = etree.SubElement(ncx, NCX('head'))
    etree.SubElement(head, NCX('meta'),
        name='dtb:uid', content=unicode_type(uid))
    etree.SubElement(head, NCX('meta'),
        name='dtb:depth', content=unicode_type(toc.depth))
    generator = ''.join(['calibre (', __version__, ')'])
    etree.SubElement(head, NCX('meta'),
        name='dtb:generator', content=generator)
    # Page counts are required by the spec but unused; zero is conventional.
    etree.SubElement(head, NCX('meta'), name='dtb:totalPageCount', content='0')
    etree.SubElement(head, NCX('meta'), name='dtb:maxPageNumber', content='0')
    title = etree.SubElement(ncx, NCX('docTitle'))
    text = etree.SubElement(title, NCX('text'))
    text.text = btitle
    navmap = etree.SubElement(ncx, NCX('navMap'))
    spat = re.compile(r'\s+')

    # playOrder must increase monotonically across the whole navMap.
    play_order = Counter()

    def process_node(xml_parent, toc_parent):
        # Recursively mirror the TOC tree as nested navPoint elements.
        for child in toc_parent:
            play_order['c'] += 1
            point = etree.SubElement(xml_parent, NCX('navPoint'), id='num_%d' % play_order['c'],
                            playOrder=unicode_type(play_order['c']))
            label = etree.SubElement(point, NCX('navLabel'))
            title = child.title
            if title:
                title = spat.sub(' ', title)
            etree.SubElement(label, NCX('text')).text = title
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                etree.SubElement(point, NCX('content'), src=href)
            process_node(point, child)

    process_node(navmap, toc)
    return ncx
|
||||
|
||||
|
||||
def commit_ncx_toc(container, toc, lang=None, uid=None):
    """Serialize *toc* as the book's NCX, creating the NCX file and wiring
    it into the spine if the book has none. Language and uid default to
    values discovered in the OPF metadata."""
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        # No NCX yet: create one and point every <spine toc=...> at it.
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        ncx_id = item.get('id')
        [s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')]
    if not lang:
        lang = get_lang()
        # Prefer the first valid dc:language declared in the OPF.
        for l in container.opf_xpath('//dc:language'):
            l = canonicalize_lang(xml2text(l).strip())
            if l:
                lang = l
                lang = lang_as_iso639_1(l) or l
                break
    lang = lang_as_iso639_1(lang) or lang
    if not uid:
        uid = uuid_id()
        # Reuse the package's unique-identifier value when present.
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]'%eid)
            if m:
                uid = xml2text(m[0])

    title = _('Table of Contents')
    m = container.opf_xpath('//dc:title')
    if m:
        x = xml2text(m[0]).strip()
        title = x or title

    to_href = partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
|
||||
|
||||
|
||||
def ensure_single_nav_of_type(root, ntype='toc'):
    """Return a single empty ``<nav epub:type=ntype>`` element in *root*,
    removing any duplicates and clearing the survivor's children while
    preserving its attributes and tail; creates one in <body> if absent."""
    et = '{%s}type' % EPUB_NS
    navs = [n for n in root.iterdescendants(XHTML('nav')) if n.get(et) == ntype]
    # Keep only the first matching nav; extract() preserves sibling tails.
    for x in navs[1:]:
        extract(x)
    if navs:
        nav = navs[0]
        # clear() wipes attributes and tail too, so save and restore them.
        tail = nav.tail
        attrib = dict(nav.attrib)
        nav.clear()
        nav.attrib.update(attrib)
        nav.tail = tail
    else:
        nav = root.makeelement(XHTML('nav'))
        first_child(root, XHTML('body')).append(nav)
    nav.set('{%s}type' % EPUB_NS, ntype)
    return nav
|
||||
|
||||
|
||||
def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
    """Serialize *toc* into the EPUB 3 nav document, (re)creating the nav
    file if needed and optionally regenerating the landmarks and page-list
    navs. *previous_nav* is an optional (href, parsed-root) pair to reuse."""
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree
    tocname = find_existing_nav_toc(container)
    if previous_nav is not None:
        # Prefer an explicitly supplied pre-existing nav file.
        nav_name = container.href_to_name(previous_nav[0])
        if nav_name and container.exists(nav_name):
            tocname = nav_name
            container.apply_unique_properties(tocname, 'nav')
    if tocname is None:
        # No nav document: create one and mark it with the 'nav' property.
        item = container.generate_item('nav.xhtml', id_prefix='nav')
        item.set('properties', 'nav')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        if previous_nav is not None:
            root = previous_nav[1]
        else:
            root = container.parse_xhtml(P('templates/new_nav.html', data=True).decode('utf-8'))
        container.replace(tocname, root)
    else:
        root = container.parsed(tocname)
    if lang:
        lang = lang_as_iso639_1(lang) or lang
        root.set('lang', lang)
        root.set('{%s}lang' % XML_NS, lang)
    nav = ensure_single_nav_of_type(root, 'toc')
    if toc.toc_title:
        nav.append(nav.makeelement(XHTML('h1')))
        nav[-1].text = toc.toc_title

    rnode = nav.makeelement(XHTML('ol'))
    nav.append(rnode)
    to_href = partial(container.name_to_href, base=tocname)
    spat = re.compile(r'\s+')

    def process_node(xml_parent, toc_parent):
        # Recursively mirror the TOC as nested <ol>/<li> lists.
        for child in toc_parent:
            li = xml_parent.makeelement(XHTML('li'))
            xml_parent.append(li)
            title = child.title or ''
            title = spat.sub(' ', title).strip()
            # Entries without a destination become <span>, not <a>.
            a = li.makeelement(XHTML('a' if child.dest else 'span'))
            a.text = title
            li.append(a)
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                a.set('href', href)
            if len(child):
                ol = li.makeelement(XHTML('ol'))
                li.append(ol)
                process_node(ol, child)
    process_node(rnode, toc)
    pretty_xml_tree(nav)

    def collapse_li(parent):
        # Render single-child list items on one line (undo pretty-printing).
        for li in parent.iterdescendants(XHTML('li')):
            if len(li) == 1:
                li.text = None
                li[0].tail = None
    collapse_li(nav)
    nav.tail = '\n'

    def create_li(ol, entry):
        # Shared helper for landmarks/page-list entries; returns the <a>.
        li = ol.makeelement(XHTML('li'))
        ol.append(li)
        a = li.makeelement(XHTML('a'))
        li.append(a)
        href = container.name_to_href(entry['dest'], tocname)
        if entry['frag']:
            href += '#' + entry['frag']
        a.set('href', href)
        return a

    if landmarks is not None:
        nav = ensure_single_nav_of_type(root, 'landmarks')
        # hidden="" keeps the nav machine-readable without rendering it.
        nav.set('hidden', '')
        ol = nav.makeelement(XHTML('ol'))
        nav.append(ol)
        for entry in landmarks:
            # Only typed entries pointing at existing HTML documents qualify.
            if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
                a = create_li(ol, entry)
                a.set('{%s}type' % EPUB_NS, entry['type'])
                a.text = entry['title'] or None
        pretty_xml_tree(nav)
        collapse_li(nav)

    if toc.page_list:
        nav = ensure_single_nav_of_type(root, 'page-list')
        nav.set('hidden', '')
        ol = nav.makeelement(XHTML('ol'))
        nav.append(ol)
        for entry in toc.page_list:
            if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
                a = create_li(ol, entry)
                a.text = unicode_type(entry['pagenum'])
        pretty_xml_tree(nav)
        collapse_li(nav)
    container.replace(tocname, root)
|
||||
|
||||
|
||||
def commit_toc(container, toc, lang=None, uid=None):
    """Write *toc* back to the book: always to the NCX, and additionally to
    the nav document for EPUB 3 books."""
    commit_ncx_toc(container, toc, lang=lang, uid=uid)
    is_epub3 = container.opf_version_parsed.major > 2
    if is_epub3:
        commit_nav_toc(container, toc, lang=lang)
|
||||
|
||||
|
||||
def remove_names_from_toc(container, names):
    """Remove every ToC entry whose destination is in *names* from both the
    NCX and nav ToCs, committing whichever changed; returns the list of ToC
    file names that were modified."""
    changed = []
    names = frozenset(names)
    # NOTE: the loop variable commit_toc shadows the module-level
    # commit_toc() inside this function body.
    for find_toc, parse_toc, commit_toc in (
            (find_existing_ncx_toc, parse_ncx, commit_ncx_toc),
            (find_existing_nav_toc, parse_nav, commit_nav_toc),
    ):
        toc = get_x_toc(container, find_toc, parse_toc, verify_destinations=False)
        if len(toc) > 0:
            remove = []
            for node in toc.iterdescendants():
                if node.dest in names:
                    remove.append(node)
            if remove:
                # Remove deepest-first so parents are still valid when cut.
                for node in reversed(remove):
                    node.remove_from_parent()
                commit_toc(container, toc)
                changed.append(find_toc(container))
    return changed
|
||||
|
||||
|
||||
def find_inline_toc(container):
    """Return the spine name of the calibre-generated inline ToC page, or
    None when the book has none."""
    query = '//*[local-name()="body" and @id="calibre_generated_inline_toc"]'
    for name, _linear in container.spine_names:
        if container.parsed(name).xpath(query):
            return name
    return None
|
||||
|
||||
|
||||
def toc_to_html(toc, container, toc_name, title, lang=None):
    """Render *toc* as a standalone XHTML page (nested <ul>/<li> lists) for
    use as an inline Table of Contents; returns the lxml tree."""

    def process_node(html_parent, toc, level=1, indent='  ', style_level=2):
        # Build one <li> per entry; the tail/text strings below hand-craft
        # the indentation of the serialized output.
        li = html_parent.makeelement(XHTML('li'))
        li.tail = '\n'+ (indent*level)
        html_parent.append(li)
        name, frag = toc.dest, toc.frag
        href = '#'
        if name:
            href = container.name_to_href(name, toc_name)
            if frag:
                href += '#' + frag
        a = li.makeelement(XHTML('a'), href=href)
        a.text = toc.title
        li.append(a)
        if len(toc) > 0:
            # Children go into a nested <ul class="levelN">.
            parent = li.makeelement(XHTML('ul'))
            parent.set('class', 'level%d' % (style_level))
            li.append(parent)
            a.tail = '\n\n' + (indent*(level+2))
            parent.text = '\n'+(indent*(level+3))
            parent.tail = '\n\n' + (indent*(level+1))
            for child in toc:
                process_node(parent, child, level+3, style_level=style_level + 1)
            parent[-1].tail = '\n' + (indent*(level+2))

    E = ElementMaker(namespace=XHTML_NS, nsmap={None:XHTML_NS})
    html = E.html(
        E.head(
            E.title(title),
            E.style(P('templates/inline_toc_styles.css', data=True), type='text/css'),
        ),
        E.body(
            E.h2(title),
            E.ul(),
            # This id is how find_inline_toc() recognizes the page later.
            id="calibre_generated_inline_toc",
        )
    )

    ul = html[1][1]
    ul.set('class', 'level1')
    for child in toc:
        process_node(ul, child)
    if lang:
        html.set('lang', lang)
    pretty_html_tree(container, html)
    return html
|
||||
|
||||
|
||||
def create_inline_toc(container, title=None):
    '''
    Create an inline (HTML) Table of Contents from an existing NCX Table of Contents.

    :param title: The title for this table of contents.
    :return: The name of the inline ToC file, or None if the book has an
        empty ToC.
    '''
    lang = get_book_language(container)
    default_title = 'Table of Contents'
    if lang:
        lang = lang_as_iso639_1(lang) or lang
        default_title = translate(lang, default_title)
    title = title or default_title
    toc = get_toc(container)
    if len(toc) == 0:
        return None
    toc_name = find_inline_toc(container)

    name = toc_name
    html = toc_to_html(toc, container, name, title, lang)
    raw = serialize(html, 'text/html')
    if name is None:
        # No existing inline ToC: add a new file at the start of the spine,
        # picking the first free toc*.xhtml name.
        name, c = 'toc.xhtml', 0
        while container.has_name(name):
            c += 1
            name = 'toc%d.xhtml' % c
        container.add_file(name, raw, spine_index=0)
    else:
        # Overwrite the existing inline ToC page in place.
        with container.open(name, 'wb') as f:
            f.write(raw)
    set_guide_item(container, 'toc', title, name, frag='calibre_generated_inline_toc')
    return name
|
||||
231
ebook_converter/ebooks/oeb/polish/utils.py
Normal file
231
ebook_converter/ebooks/oeb/polish/utils.py
Normal file
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re, os
|
||||
from bisect import bisect
|
||||
|
||||
from calibre import guess_type as _guess_type, replace_entities
|
||||
from polyglot.builtins import filter
|
||||
|
||||
|
||||
def guess_type(x):
    """Best-effort MIME type for *x*, falling back to the generic binary
    type when none can be guessed."""
    mime = _guess_type(x)[0]
    if mime:
        return mime
    return 'application/octet-stream'
|
||||
|
||||
|
||||
def setup_css_parser_serialization(tab_width=2):
    """Configure css_parser's global serializer preferences: *tab_width*
    spaces of indent, unindented closing braces and trailing semicolons."""
    import css_parser
    serializer_prefs = css_parser.ser.prefs
    serializer_prefs.indent = ' ' * tab_width
    serializer_prefs.indentClosingBrace = False
    serializer_prefs.omitLastSemicolon = False
|
||||
|
||||
|
||||
def actual_case_for_name(container, name):
    """Return *name* with every path component's case corrected to match
    what is actually on disk (useful on case-insensitive filesystems).

    :raises ValueError: if *name* does not exist in the container.
    """
    from calibre.utils.filenames import samefile
    if not container.exists(name):
        raise ValueError('Cannot get actual case for %s as it does not exist' % name)
    parts = name.split('/')
    base = ''
    ans = []
    for i, x in enumerate(parts):
        base = '/'.join(ans + [x])
        path = container.name_to_abspath(base)
        pdir = os.path.dirname(path)
        candidates = {os.path.join(pdir, q) for q in os.listdir(pdir)}
        # Fast path: the exact-case absolute path is a directory entry.
        # (Previously this compared the bare component x against the set of
        # absolute paths, so the check never matched.)
        if path in candidates:
            correctx = x
        else:
            # Fall back to an inode comparison to find the real-case entry.
            for q in candidates:
                if samefile(q, path):
                    correctx = os.path.basename(q)
                    break
            else:
                raise RuntimeError('Something bad happened')
        ans.append(correctx)
    return '/'.join(ans)
|
||||
|
||||
|
||||
def corrected_case_for_name(container, name):
    """Return *name* with each path component's case fixed to match an
    existing file, comparing case-insensitively; None when no match exists
    or a non-terminal component is not a directory."""
    corrected = []
    for component in name.split('/'):
        candidate = '/'.join(corrected + [component])
        if container.exists(candidate):
            corrected.append(component)
            continue
        parent_dir = os.path.dirname(container.name_to_abspath(candidate))
        try:
            siblings = set(os.listdir(parent_dir))
        except EnvironmentError:
            # One of the non-terminal components of name is a file, not a
            # directory, so the path cannot be corrected.
            return None
        wanted = component.lower()
        for sibling in siblings:
            if sibling.lower() == wanted:
                corrected.append(sibling)
                break
        else:
            return None
    return '/'.join(corrected)
|
||||
|
||||
|
||||
class PositionFinder(object):
    """Map a character (or byte) offset in a string to a ``(line, column)``
    pair. Lines are 1-based; the column is the distance from the start of
    the line. Accepts either text or bytes input."""

    def __init__(self, raw):
        # Use a bytes pattern when raw is bytes so re accepts it.
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        # Offsets at which each line after the first begins.
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        lnum = bisect(self.new_lines, pos)
        if lnum:
            offset = pos - self.new_lines[lnum - 1]
        else:
            # pos lies on the first line. The previous implementation
            # indexed new_lines[lnum - 1] == new_lines[-1] here, silently
            # measuring the column from the start of the *last* line and
            # producing a wrong offset for any multi-line input.
            offset = pos
        return (lnum + 1, offset)
|
||||
|
||||
|
||||
class CommentFinder(object):
    """Answer whether a character offset of the string given at
    construction time falls inside a comment (CSS block comments by
    default; the pattern is configurable)."""

    def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
        spans = [m.span() for m in re.finditer(pat, raw)]
        self.starts = [s for s, e in spans]
        self.ends = [e for s, e in spans]

    def __call__(self, offset):
        if not self.starts:
            return False
        idx = bisect(self.starts, offset) - 1
        if idx < 0:
            return False
        return self.starts[idx] <= offset <= self.ends[idx]
|
||||
|
||||
|
||||
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    """Ensure every HTML file in *names* links to every stylesheet in
    *sheets* (optionally removing all existing stylesheet links first);
    returns the set of file names that were changed."""
    from calibre.ebooks.oeb.base import XPath, XHTML
    changed_names = set()
    snames = set(sheets)
    lp = XPath('//h:link[@href]')
    hp = XPath('//h:head')
    for name in names:
        root = container.parsed(name)
        if remove:
            # Strip all existing stylesheet links of the given MIME type.
            for link in lp(root):
                if (link.get('type', mtype) or mtype) == mtype:
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)
        # Sheets already linked, resolved to container names.
        existing = {container.href_to_name(l.get('href'), name) for l in lp(root) if (l.get('type', mtype) or mtype) == mtype}
        extra = snames - existing
        if extra:
            changed_names.add(name)
            try:
                parent = hp(root)[0]
            except (TypeError, IndexError):
                # Document without a <head>: create one at the top.
                parent = root.makeelement(XHTML('head'))
                container.insert_into_xml(root, parent, index=0)
            # Iterate sheets (not extra) to preserve the caller's ordering.
            for sheet in sheets:
                if sheet in extra:
                    container.insert_into_xml(
                        parent, parent.makeelement(XHTML('link'), rel='stylesheet', type=mtype,
                                                   href=container.name_to_href(sheet, name)))
            container.dirty(name)

    return changed_names
|
||||
|
||||
|
||||
def lead_text(top_elem, num_words=10):
    ''' Return the leading text contained in top_elem (including descendants)
    up to a maximum of num_words words. More efficient than using
    etree.tostring(method='text') as it does not have to serialize the entire
    sub-tree rooted at top_elem.'''
    ws_pat = re.compile(r'\s+', flags=re.UNICODE)
    collected = []
    # Explicit stack of (node, 'text'|'tail') pairs, popped in document order.
    pending = [(top_elem, 'text')]
    while pending and len(collected) < num_words:
        node, which = pending.pop()
        chunk = getattr(node, which)
        if chunk:
            collected.extend(w for w in ws_pat.split(chunk) if w)
        if which == 'text':
            # A node's tail comes after its entire subtree; the root's tail
            # lies outside the requested region and is skipped.
            if node is not top_elem:
                pending.append((node, 'tail'))
            pending.extend((child, 'text') for child in reversed(list(node.iterchildren('*'))))
    return ' '.join(collected[:num_words])
|
||||
|
||||
|
||||
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    """Parse CSS with css_parser, without validation and ignoring @import
    rules. *data* may be text or bytes (decoded as UTF-8 unless *decode* is
    given); set *is_declaration* to parse a bare style declaration instead
    of a full stylesheet."""
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from css_parser import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False
    text = data or ''
    if isinstance(text, bytes):
        text = text.decode('utf-8') if decode is None else decode(text)
    if css_preprocessor is not None:
        text = css_preprocessor(text)
    parser = CSSParser(loglevel=log_level,
                       # We dont care about @import rules
                       fetcher=lambda x: (None, None), log=_css_logger)
    if is_declaration:
        return parser.parseStyle(text, validate=False)
    return parser.parseString(text, href=fname, validate=False)
|
||||
|
||||
|
||||
def handle_entities(text, func):
    """Decode HTML entities in *text*, then apply *func* to the result."""
    decoded = replace_entities(text)
    return func(decoded)
|
||||
|
||||
|
||||
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
    '''Apply the specified function to individual groups in the match object (the result of re.search() or
    the whole match if no groups were defined. Returns the replaced string.'''
    found_groups = False
    i = 0
    # parts accumulates alternating untouched text and transformed groups;
    # pos tracks how far into match.string we have copied so far.
    parts, pos = [], match.start()
    f = lambda text:handle_entities(text, func)
    while True:
        i += 1
        try:
            start, end = match.span(i)
        except IndexError:
            # No more groups in the pattern.
            break
        found_groups = True
        # span is (-1, -1) for a group that did not participate in the match.
        if start > -1:
            parts.append(match.string[pos:start])
            parts.append(f(match.string[start:end]))
            pos = end
    if not found_groups:
        return f(match.group())
    # Copy any remaining text after the last group, up to the match end.
    parts.append(match.string[pos:match.end()])
    return ''.join(parts)
|
||||
|
||||
|
||||
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
    """Apply *func* only to the text between HTML tags in the matched
    region, leaving the tags themselves untouched."""
    def transform(fragment):
        return handle_entities(fragment, func)
    # Capturing split keeps the tags in the result list.
    pieces = re.split(r'(<[^>]+>)', match.group())
    out = []
    for piece in pieces:
        out.append(piece if piece.startswith('<') else transform(piece))
    return ''.join(out)
|
||||
|
||||
|
||||
def extract(elem):
    '''Detach *elem* from its tree without losing its tail text: the tail
    is merged into the previous sibling's tail, or into the parent's text
    when *elem* was the first child.'''
    parent = elem.getparent()
    if parent is None:
        return
    pos = parent.index(elem)
    parent.remove(elem)
    tail = elem.tail
    if not tail:
        return
    if pos > 0:
        prev = parent[pos - 1]
        prev.tail = (prev.tail or '') + tail
    else:
        parent.text = (parent.text or '') + tail
|
||||
720
ebook_converter/ebooks/oeb/reader.py
Normal file
720
ebook_converter/ebooks/oeb/reader.py
Normal file
@@ -0,0 +1,720 @@
|
||||
"""
|
||||
Container-/OPF-based input OEBBook reader.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
import sys, os, uuid, copy, re, io
|
||||
from collections import defaultdict
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
|
||||
DC_NSES, OPF, xml2text, XHTML_MIME
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
|
||||
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
|
||||
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
|
||||
MS_COVER_TYPE, iterlinks
|
||||
from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
|
||||
urlnormalize, BINARY_MIME, \
|
||||
OEBError, OEBBook, DirContainer
|
||||
from calibre.ebooks.oeb.writer import OEBWriter
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import __appname__, __version__
|
||||
from calibre import guess_type, xml_replace_entities
|
||||
from polyglot.builtins import unicode_type, zip
|
||||
from polyglot.urllib import unquote, urldefrag, urlparse
|
||||
|
||||
__all__ = ['OEBReader']
|
||||
|
||||
|
||||
class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""

    # First <svg> inside <body>: used to detect SVG-based cover pages.
    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
    # First <object data=...> inside <body>: embedded-object cover pages.
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    Container = DirContainer
    """Container type used to access book files. Override in sub-classes."""

    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content read with this Reader."""

    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""
|
||||
    @classmethod
    def config(cls, cfg):
        """Add any book-reading options to the :class:`Config` object
        :param:`cfg`.
        """
        # No-op hook: subclasses override to register reader-specific options.
        return
|
||||
    @classmethod
    def generate(cls, opts):
        """Generate a Reader instance from command-line options."""
        # opts is accepted for interface symmetry; the base reader needs none.
        return cls()
|
||||
|
||||
    def __call__(self, oeb, path):
        """Read the book at :param:`path` into the :class:`OEBBook` object
        :param:`oeb`, and return it.
        """
        self.oeb = oeb
        # Both aliases are kept: older code uses .logger, newer code .log.
        self.logger = self.log = oeb.logger
        oeb.container = self.Container(path, self.logger)
        oeb.container.log = oeb.log
        opf = self._read_opf()
        self._all_from_opf(opf)
        return oeb
|
||||
|
||||
    def _clean_opf(self, opf):
        """Normalize a parsed OPF tree (possibly OEB 1.x) into a clean
        OPF 2 <package> tree and return it: namespaces are fixed up, Dublin
        Core elements are moved to the DC 1.1 namespace, and the manifest,
        spine, tours and guide sections are carried over."""
        nsmap = {}
        # Collect every namespace prefix declared anywhere in the document.
        for elem in opf.iter(tag=etree.Element):
            nsmap.update(elem.nsmap)
        for elem in opf.iter(tag=etree.Element):
            # Move un-namespaced / OPF1 elements into the OPF2 namespace.
            if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
                elem.tag = OPF(barename(elem.tag))
        nsmap.update(OPF2_NSMAP)
        attrib = dict(opf.attrib)
        nroot = etree.Element(OPF('package'),
            nsmap={None: OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
        # OEB1 wrapper elements whose children are hoisted directly.
        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
        for elem in xpath(opf, 'o2:metadata//*'):
            if elem.tag in ignored:
                continue
            # Map any recognized Dublin Core namespace to DC 1.1.
            if namespace(elem.tag) in DC_NSES:
                tag = barename(elem.tag).lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            # Handle literal 'dc:' prefixed tags with no namespace binding.
            if elem.tag.startswith('dc:'):
                tag = elem.tag.partition(':')[-1].lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            metadata.append(elem)
        for element in xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
            for element in xpath(opf, tag):
                nroot.append(element)
        return nroot
|
||||
|
||||
    def _read_opf(self):
        """Read, repair and parse the book's OPF file, returning the
        cleaned tree; raises OEBError for an unrecognized namespace."""
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = XMLDECL_RE.sub('', data)
        # Normalize OEB1 namespace variants (trailing slashes) to OPF1_NS.
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                OPF1_NS, data)
        try:
            opf = safe_xml_fromstring(data)
        except etree.XMLSyntaxError:
            # First repair attempt: replace HTML named entities and strip
            # invalid XML characters.
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = safe_xml_fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                # Second repair attempt: drop the (unused) tours section and
                # declare the legacy dc namespace on <dc-metadata>.
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace('<dc-metadata>',
                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
                opf = safe_xml_fromstring(data)
                self.logger.warn('OPF contains invalid tours section')

        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf
|
||||
|
||||
    def _metadata_from_opf(self, opf):
        """Populate self.oeb.metadata from the OPF tree, supplying defaults
        for language, book producer, uid, title and author when missing."""
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        # Round-trip through the opf2.OPF reader to get book metadata.
        stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
        o = OPF(stream)
        pwm = o.primary_writing_mode
        if pwm:
            self.oeb.metadata.primary_writing_mode = pwm
        mi = o.to_book_metadata()
        if not mi.language:
            # Fall back to the interface language, BCP-47 style.
            mi.language = get_lang().replace('_', '-')
        self.oeb.metadata.add('language', mi.language)
        if not mi.book_producer:
            mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\
                dict(a=__appname__, v=__version__)
        meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
        m = self.oeb.metadata
        # Always add a fresh uuid identifier and use it as the book uid.
        m.add('identifier', unicode_type(uuid.uuid4()), id='uuid_id', scheme='uuid')
        self.oeb.uid = self.oeb.metadata.identifier[-1]
        if not m.title:
            m.add('title', self.oeb.translate(__('Unknown')))
        # Guarantee at least one creator with an author ('aut') role.
        has_aut = False
        for x in m.creator:
            if getattr(x, 'role', '').lower() in ('', 'aut'):
                has_aut = True
                break
        if not has_aut:
            m.add('creator', self.oeb.translate(__('Unknown')), role='aut')
|
||||
def _manifest_prune_invalid(self):
    '''
    Remove items from manifest that contain invalid data. This prevents
    catastrophic conversion failure, when a few files contain corrupted
    data.

    Returns the list of manifest items that were removed.
    '''
    bad = []
    check = OEB_DOCS.union(OEB_STYLES)
    for item in list(self.oeb.manifest.values()):
        if item.media_type in check:
            try:
                # Accessing item.data triggers parsing; corrupt content raises
                item.data
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed (the
                # original re-raised KeyboardInterrupt explicitly but
                # still caught SystemExit).
                self.logger.exception('Failed to parse content in %s'%
                        item.href)
                bad.append(item)
                self.oeb.manifest.remove(item)
    return bad
|
||||
|
||||
def _manifest_add_missing(self, invalid):
    """Add files that are referenced from manifest items but missing
    from the manifest.

    Transitively follows links in XML/HTML documents and url()
    references in stylesheets; any existing, not-yet-manifested local
    file found this way is added (and then itself scanned).

    NOTE(review): the *invalid* parameter is immediately shadowed by the
    local ``invalid = set()`` below, so the list passed in by
    _manifest_from_opf is effectively ignored — confirm this is intended.
    """
    import css_parser
    manifest = self.oeb.manifest
    known = set(manifest.hrefs)
    unchecked = set(manifest.values())
    cdoc = OEB_DOCS|OEB_STYLES
    invalid = set()
    while unchecked:
        new = set()
        for item in unchecked:
            data = None
            if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')):
                try:
                    data = item.data
                except:
                    self.oeb.log.exception('Failed to read from manifest '
                            'entry with id: %s, ignoring'%item.id)
                    invalid.add(item)
                    continue
            if data is None:
                continue

            if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')):
                # Collect target hrefs of every link in the document
                hrefs = [r[2] for r in iterlinks(data)]
                for href in hrefs:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except:
                        self.oeb.log.exception(
                            'Skipping invalid href: %r'%href)
                        continue
                    # Only local (scheme-less) unknown files are candidates
                    if not scheme and href not in known:
                        new.add(href)
            elif item.media_type in OEB_STYLES:
                try:
                    urls = list(css_parser.getUrls(data))
                except:
                    urls = []
                for url in urls:
                    href, _ = urldefrag(url)
                    href = item.abshref(urlnormalize(href))
                    scheme = urlparse(href).scheme
                    if not scheme and href not in known:
                        new.add(href)
        unchecked.clear()
        warned = set()
        for href in new:
            known.add(href)
            # Skip hrefs belonging to items that failed to read above
            is_invalid = False
            for item in invalid:
                if href == item.abshref(urlnormalize(href)):
                    is_invalid = True
                    break
            if is_invalid:
                continue
            if not self.oeb.container.exists(href):
                if href not in warned:
                    self.logger.warn('Referenced file %r not found' % href)
                    warned.add(href)
                continue
            if href not in warned:
                self.logger.warn('Referenced file %r not in manifest' % href)
                warned.add(href)
            id, _ = manifest.generate(id='added')
            guessed = guess_type(href)[0]
            media_type = guessed or BINARY_MIME
            added = manifest.add(id, href, media_type)
            # Newly added items are themselves scanned on the next pass
            unchecked.add(added)

    # Finally drop items whose data could not be read
    for item in invalid:
        self.oeb.manifest.remove(item)
|
||||
|
||||
def _manifest_from_opf(self, opf):
    """Build self.oeb.manifest from the OPF <manifest> section.

    Sanitizes media types (guessing from the file extension when the
    attribute is absent or a generic 'text/xml'), skips duplicate or
    missing files, regenerates colliding ids, then prunes unparseable
    items and adds referenced-but-unlisted files.
    """
    manifest = self.oeb.manifest
    for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
        id = elem.get('id')
        href = elem.get('href')
        media_type = elem.get('media-type', None)
        if media_type is None:
            # OEB 1.x spelled the attribute 'mediatype'
            media_type = elem.get('mediatype', None)
        if not media_type or media_type == 'text/xml':
            guessed = guess_type(href)[0]
            media_type = guessed or media_type or BINARY_MIME
        if hasattr(media_type, 'lower'):
            media_type = media_type.lower()
        fallback = elem.get('fallback')
        if href in manifest.hrefs:
            self.logger.warn('Duplicate manifest entry for %r' % href)
            continue
        if not self.oeb.container.exists(href):
            self.logger.warn('Manifest item %r not found' % href)
            continue
        if id in manifest.ids:
            self.logger.warn('Duplicate manifest id %r' % id)
            id, href = manifest.generate(id, href)
        manifest.add(id, href, media_type, fallback)
    invalid = self._manifest_prune_invalid()
    self._manifest_add_missing(invalid)
|
||||
|
||||
def _spine_add_extra(self):
    """Append linked-to but unspined HTML documents to the spine.

    Transitively follows <a href> links from spine documents; every
    manifested OEB document reachable that way is added as a non-linear
    spine item (with a warning when the package version is 2 or later).
    """
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    unchecked = set(spine)
    selector = XPath('h:body//h:a/@href')
    extras = set()
    while unchecked:
        new = set()
        for item in unchecked:
            if item.media_type not in OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for href in selector(item.data):
                href, _ = urldefrag(href)
                if not href:
                    continue
                try:
                    href = item.abshref(urlnormalize(href))
                except ValueError:  # Malformed URL
                    continue
                if href not in manifest.hrefs:
                    continue
                found = manifest.hrefs[href]
                if found.media_type not in OEB_DOCS or \
                   found in spine or found in extras:
                    continue
                new.add(found)
        extras.update(new)
        # Newly discovered documents are scanned on the next pass
        unchecked = new
    version = int(self.oeb.version[0])
    removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
    for item in sorted(extras):
        if item.href in removed_items_to_ignore:
            continue
        if version >= 2:
            # Only warn for version >= 2 packages
            self.logger.warn(
                'Spine-referenced file %r not in spine' % item.href)
        spine.add(item, linear=False)
|
||||
|
||||
def _spine_from_opf(self, opf):
    """Build self.oeb.spine from the OPF <spine> section.

    Items whose data is not an XML tree are dropped (HTML items
    mislabelled with another media type are coerced to XHTML first);
    NCX documents accidentally referenced from the spine are skipped.
    Raises OEBError when the resulting spine is empty.
    """
    spine = self.oeb.spine
    manifest = self.oeb.manifest
    for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
        idref = elem.get('idref')
        if idref not in manifest.ids:
            self.logger.warn('Spine item %r not found' % idref)
            continue
        item = manifest.ids[idref]
        if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'):
            spine.add(item, elem.get('linear'))
        else:
            if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'):
                # Parsed as an (X)HTML tree despite its media type: fix it
                item.media_type = XHTML_MIME
                spine.add(item, elem.get('linear'))
            else:
                self.oeb.log.warn('The item %s is not a XML document.'
                        ' Removing it from spine.'%item.href)
    if len(spine) == 0:
        raise OEBError("Spine is empty")
    self._spine_add_extra()
    for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
        if val in {'ltr', 'rtl'}:
            spine.page_progression_direction = val
|
||||
|
||||
def _guide_from_opf(self, opf):
    """Populate self.oeb.guide from the OPF <guide> section.

    References whose target is missing from the manifest are retried
    with a case-insensitive href match before being dropped with a
    warning; only the first reference of each type is kept.
    """
    guide = self.oeb.guide
    manifest = self.oeb.manifest
    for ref in xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = ref.get('href')
        path = urlnormalize(urldefrag(ref_href)[0])
        if path not in manifest.hrefs:
            # Retry case-insensitively against the manifest hrefs
            match = next(
                (h for h in manifest.hrefs if h.lower() == path.lower()),
                None)
            if match is None:
                self.logger.warn('Guide reference %r not found' % ref_href)
                continue
            ref_href = match
        ref_type = ref.get('type')
        if ref_type not in guide:
            guide.add(ref_type, ref.get('title'), ref_href)
|
||||
|
||||
def _find_ncx(self, opf):
    """Locate the NCX ToC document and remove it from the manifest.

    Prefers the item named by the spine's @toc attribute, falling back
    to the first manifest item with the NCX media type. Returns the
    removed item, or None when no NCX exists.
    """
    toc_ids = xpath(opf, '/o2:package/o2:spine/@toc')
    if toc_ids:
        ncx_id = toc_ids[0]
        if ncx_id not in self.oeb.manifest.ids:
            return None
        ncx = self.oeb.manifest.ids[ncx_id]
        self.oeb.manifest.remove(ncx)
        return ncx
    for candidate in self.oeb.manifest.values():
        if candidate.media_type == NCX_MIME:
            self.oeb.manifest.remove(candidate)
            return candidate
    return None
|
||||
|
||||
def _toc_from_navpoint(self, item, toc, navpoint):
    """Recursively convert the NCX <navPoint> children of *navpoint*
    into entries under *toc*.

    Untitled nodes are flattened (their children attach directly to
    *toc*); childless nodes without an href are dropped. Also extracts
    the calibre-specific author/description/toc_thumbnail <meta>
    extensions into the ToC node.
    """
    children = xpath(navpoint, 'ncx:navPoint')
    for child in children:
        title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        href = xpath(child, 'ncx:content/@src')
        if not title:
            # Untitled node: promote its children to the current level
            self._toc_from_navpoint(item, toc, child)
            continue
        if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
            # This node is useless
            continue
        href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
        path, _ = urldefrag(href)
        if path and path not in self.oeb.manifest.hrefs:
            path = urlnormalize(path)
        if href and path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            gc = xpath(child, 'ncx:navPoint')
            if not gc:
                # This node is useless
                continue
        id = child.get('id')
        klass = child.get('class', 'chapter')

        # Malformed playOrder values fall back to the next free slot
        try:
            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
        except:
            po = self.oeb.toc.next_play_order()

        authorElement = xpath(child,
                'descendant::calibre:meta[@name = "author"]')
        if authorElement:
            author = authorElement[0].text
        else:
            author = None

        descriptionElement = xpath(child,
                'descendant::calibre:meta[@name = "description"]')
        if descriptionElement:
            description = etree.tostring(descriptionElement[0],
            method='text', encoding='unicode').strip()
            if not description:
                description = None
        else:
            description = None

        index_image = xpath(child,
                'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = (index_image[0].text if index_image else None)
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None

        node = toc.add(title, href, id=id, klass=klass,
                play_order=po, description=description, author=author,
                toc_thumbnail=toc_thumbnail)

        # Recurse into this node's own children
        self._toc_from_navpoint(item, node, child)
|
||||
|
||||
def _toc_from_ncx(self, item):
    """Build the book ToC from an NCX document; return True on success."""
    if item is None or item.data is None:
        return False
    self.log.debug('Reading TOC from NCX...')
    ncx = item.data
    raw_title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
    raw_title = COLLAPSE_RE.sub(' ', raw_title.strip())
    toc = self.oeb.toc
    # Fall back to the book title when the NCX docTitle is empty
    toc.title = raw_title or unicode_type(self.oeb.metadata.title[0])
    for navmap in xpath(ncx, 'ncx:navMap'):
        self._toc_from_navpoint(item, toc, navmap)
    return True
|
||||
|
||||
def _toc_from_tour(self, opf):
    """Build the ToC from the first OEB 1.x <tour>; True on success."""
    result = xpath(opf, 'o2:tours/o2:tour')
    if not result:
        return False
    self.log.debug('Reading TOC from tour...')
    tour = result[0]
    toc = self.oeb.toc
    toc.title = tour.get('title')
    sites = xpath(tour, 'o2:site')
    for site in sites:
        title = site.get('title')
        href = site.get('href')
        if not title or not href:
            continue
        path, _ = urldefrag(urlnormalize(href))
        if path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            continue
        id = site.get('id')
        toc.add(title, href, id=id)
    return True
|
||||
|
||||
def _toc_from_html(self, opf):
    """Build the ToC from the guide's 'toc' HTML page; True on success.

    When the guide href carries a fragment, only the smallest ancestor
    of that anchor that still contains links is scanned. Multiple links
    to the same target are merged into one entry whose title is the
    space-joined concatenation of the individual link texts.
    """
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    html = item.data
    if frag:
        elems = xpath(html, './/*[@id="%s"]' % frag)
        if not elems:
            elems = xpath(html, './/*[@name="%s"]' % frag)
        elem = elems[0] if elems else html
        while elem != html and not xpath(elem, './/h:a[@href]'):
            # Walk up until the subtree actually contains links
            elem = elem.getparent()
        html = elem
    titles = defaultdict(list)
    order = []
    for anchor in xpath(html, './/h:a[@href]'):
        href = anchor.attrib['href']
        href = item.abshref(urlnormalize(href))
        path, frag = urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            continue
        title = xml2text(anchor)
        title = COLLAPSE_RE.sub(' ', title.strip())
        if href not in titles:
            # Remember first-seen order of each distinct target
            order.append(href)
        titles[href].append(title)
    toc = self.oeb.toc
    for href in order:
        toc.add(' '.join(titles[href]), href)
    return True
|
||||
|
||||
def _toc_from_spine(self, opf):
    """Generate a flat ToC from the linear spine items; always True.

    Each document contributes its <title>; when titles are not unique,
    the first heading found ('(unlabled)' placeholder when none) is used
    instead.
    """
    self.log.warn('Generating default TOC from spine...')
    toc = self.oeb.toc
    titles = []
    headers = []
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        if title:
            titles.append(title)
        headers.append('(unlabled)')
        # First matching heading (or <strong>) replaces the placeholder
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            header = ''.join(xpath(html, expr % tag))
            header = COLLAPSE_RE.sub(' ', header.strip())
            if header:
                headers[-1] = header
                break
    use = titles
    if len(titles) > len(set(titles)):
        # Duplicate titles: headings are more discriminating
        use = headers
    for title, item in zip(use, self.oeb.spine):
        if not item.linear:
            continue
        toc.add(title, item.href)
    return True
|
||||
|
||||
def _toc_from_opf(self, opf, item):
    """Build the ToC, trying NCX, then HTML guide page, then tour, and
    finally falling back to generating one from the spine."""
    self.oeb.auto_generated_toc = False
    # Prefer HTML to tour based TOC, since several LIT files
    # have good HTML TOCs but bad tour based TOCs
    builders = (
        lambda: self._toc_from_ncx(item),
        lambda: self._toc_from_html(opf),
        lambda: self._toc_from_tour(opf),
    )
    for build in builders:
        if build():
            return
    self._toc_from_spine(opf)
    self.oeb.auto_generated_toc = True
|
||||
|
||||
def _pages_from_ncx(self, opf, item):
    """Read the page list from the NCX <pageList>; True on success.

    *opf* is accepted for signature symmetry with the other page
    builders but is not used here.
    """
    if item is None:
        return False
    ncx = item.data
    if ncx is None:
        return False
    ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
    if not ptargets:
        return False
    pages = self.oeb.pages
    for ptarget in ptargets:
        name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
        name = COLLAPSE_RE.sub(' ', name.strip())
        href = xpath(ptarget, 'ncx:content/@src')
        if not href:
            continue
        href = item.abshref(urlnormalize(href[0]))
        id = ptarget.get('id')
        type = ptarget.get('type', 'normal')
        klass = ptarget.get('class')
        pages.add(name, href, type=type, id=id, klass=klass)
    return True
|
||||
|
||||
def _find_page_map(self, opf):
    """Locate the page-map document and remove it from the manifest.

    Prefers the item named by the spine's @page-map attribute, falling
    back to the first manifest item with the page-map media type.
    Returns the removed item, or None when no page map exists.
    """
    ids = xpath(opf, '/o2:package/o2:spine/@page-map')
    if ids:
        pm_id = ids[0]
        if pm_id not in self.oeb.manifest.ids:
            return None
        pmap = self.oeb.manifest.ids[pm_id]
        self.oeb.manifest.remove(pmap)
        return pmap
    for candidate in self.oeb.manifest.values():
        if candidate.media_type == PAGE_MAP_MIME:
            self.oeb.manifest.remove(candidate)
            return candidate
    return None
|
||||
|
||||
def _pages_from_page_map(self, opf):
    """Read the page list from a page-map document; True on success."""
    item = self._find_page_map(opf)
    if item is None:
        return False
    pmap = item.data
    pages = self.oeb.pages
    for page in xpath(pmap, 'o2:page'):
        name = page.get('name', '')
        href = page.get('href')
        if not href:
            continue
        name = COLLAPSE_RE.sub(' ', name.strip())
        href = item.abshref(urlnormalize(href))
        type = 'normal'
        if not name:
            type = 'special'
        elif name.lower().strip('ivxlcdm') == '':
            # Name consists solely of roman-numeral letters: front matter
            type = 'front'
        pages.add(name, href, type=type)
    return True
|
||||
|
||||
def _pages_from_opf(self, opf, item):
    """Populate the page list, preferring the NCX <pageList> and falling
    back to a page-map document.

    *item* is the NCX manifest item (may be None). The redundant
    trailing ``return`` of the earlier version was dead code and has
    been removed; behavior is unchanged.
    """
    if self._pages_from_ncx(opf, item):
        return
    self._pages_from_page_map(opf)
|
||||
|
||||
def _cover_from_html(self, hcover):
    """Rasterize the HTML cover page *hcover* into a JPEG manifest item.

    Writes the whole book to a temporary directory and renders the cover
    page (with the SVG workaround). Returns the new manifest item; its
    data is empty bytes when rendering fails.
    """
    from calibre.ebooks import render_html_svg_workaround
    with TemporaryDirectory('_html_cover') as tdir:
        writer = OEBWriter()
        writer(self.oeb, tdir)
        path = os.path.join(tdir, unquote(hcover.href))
        data = render_html_svg_workaround(path, self.logger)
        if not data:
            data = b''
    id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
    item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
    return item
|
||||
|
||||
def _locate_cover_image(self):
    """Find (or synthesize) the cover image manifest item.

    Search order: the metadata cover id, the guide 'cover' reference,
    the MS cover-type guide reference, an inline <svg> or <object> in
    the cover page, and finally rasterizing the cover HTML page
    (defaulting to the first spine item).
    """
    if self.oeb.metadata.cover:
        id = unicode_type(self.oeb.metadata.cover[0])
        item = self.oeb.manifest.ids.get(id, None)
        if item is not None and item.media_type in OEB_IMAGES:
            return item
        else:
            self.logger.warn('Invalid cover image @id %r' % id)
    # Fall back to inspecting the (presumed) HTML cover page
    hcover = self.oeb.spine[0]
    if 'cover' in self.oeb.guide:
        href = self.oeb.guide['cover'].href
        item = self.oeb.manifest.hrefs[href]
        media_type = item.media_type
        if media_type in OEB_IMAGES:
            return item
        elif media_type in OEB_DOCS:
            hcover = item
    html = hcover.data
    if MS_COVER_TYPE in self.oeb.guide:
        href = self.oeb.guide[MS_COVER_TYPE].href
        item = self.oeb.manifest.hrefs.get(href, None)
        if item is not None and item.media_type in OEB_IMAGES:
            return item
    if self.COVER_SVG_XP(html):
        # Cover page is an inline SVG: extract it into its own item
        svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
        href = os.path.splitext(hcover.href)[0] + '.svg'
        id, href = self.oeb.manifest.generate(hcover.id, href)
        item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
        return item
    if self.COVER_OBJECT_XP(html):
        object = self.COVER_OBJECT_XP(html)[0]
        href = hcover.abshref(object.get('data'))
        item = self.oeb.manifest.hrefs.get(href, None)
        if item is not None and item.media_type in OEB_IMAGES:
            return item
    # Last resort: rasterize the cover page
    return self._cover_from_html(hcover)
|
||||
|
||||
def _ensure_cover_image(self):
    """Make the metadata cover entry point at an actual cover image."""
    cover = self._locate_cover_image()
    metadata = self.oeb.metadata
    if metadata.cover:
        # Repoint the existing cover entry at the located image
        metadata.cover[0].value = cover.id
    else:
        metadata.add('cover', cover.id)
|
||||
|
||||
def _manifest_remove_duplicates(self):
    """Drop manifest items whose href duplicates another item's,
    keeping any copy that belongs to the spine."""
    manifest = self.oeb.manifest
    seen = set()
    dups = set()
    for entry in manifest:
        if entry.href in seen:
            dups.add(entry.href)
        seen.add(entry.href)

    spine = self.oeb.spine
    for href in dups:
        for candidate in [x for x in manifest if x.href == href]:
            if candidate not in spine:
                self.oeb.log.warn('Removing duplicate manifest item with id:', candidate.id)
                manifest.remove_duplicate_item(candidate)
|
||||
|
||||
def _all_from_opf(self, opf):
    """Populate every part of the OEB book from the parsed OPF tree.

    Order matters: the manifest must exist before the spine is built,
    and both before duplicate pruning, the guide and the ToC/page-list
    builders run.
    """
    self.oeb.version = opf.get('version', '1.2')
    self._metadata_from_opf(opf)
    self._manifest_from_opf(opf)
    self._spine_from_opf(opf)
    self._manifest_remove_duplicates()
    self._guide_from_opf(opf)
    item = self._find_ncx(opf)
    self._toc_from_opf(opf, item)
    self._pages_from_opf(opf, item)
    # self._ensure_cover_image()
|
||||
|
||||
|
||||
def main(argv=sys.argv):
    """Command-line entry point: read each argument as an OEB book and
    print its OPF 1 and OPF 2 serializations. Returns an exit code."""
    reader = OEBReader()
    for arg in argv[1:]:
        oeb = reader(OEBBook(), arg)
        for name, doc in oeb.to_opf1().values():
            # etree.tostring returns bytes; decode so print() emits the
            # XML itself rather than a bytes repr under Python 3
            print(etree.tostring(doc, pretty_print=True).decode('utf-8'))
        for name, doc in oeb.to_opf2(page_map=True).values():
            print(etree.tostring(doc, pretty_print=True).decode('utf-8'))
    return 0
|
||||
|
||||
|
||||
# Allow running this module directly as a diagnostic script.
if __name__ == '__main__':
    sys.exit(main())
|
||||
808
ebook_converter/ebooks/oeb/stylizer.py
Normal file
808
ebook_converter/ebooks/oeb/stylizer.py
Normal file
@@ -0,0 +1,808 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
CSS property propagation class.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
import os, re, logging, copy, unicodedata, numbers
|
||||
from operator import itemgetter
|
||||
from weakref import WeakKeyDictionary
|
||||
from xml.dom import SyntaxErr as CSSSyntaxError
|
||||
from css_parser.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
|
||||
cssproperties)
|
||||
from css_parser import (profile as cssprofiles, parseString, parseStyle, log as
|
||||
css_parser_log, CSSParser, profiles, replaceUrls)
|
||||
from calibre import force_unicode, as_unicode
|
||||
from calibre.ebooks import unit_convert
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
|
||||
from calibre.ebooks.oeb.normalize_css import DEFAULTS, normalizers
|
||||
from css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
|
||||
from polyglot.builtins import iteritems, unicode_type, filter
|
||||
from tinycss.media3 import CSSMedia3Parser
|
||||
|
||||
css_parser_log.setLevel(logging.WARN)
|
||||
|
||||
_html_css_stylesheet = None
|
||||
|
||||
|
||||
def html_css_stylesheet():
    """Return the parsed default HTML stylesheet, loading it lazily on
    first use and caching it in the module-level global."""
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        with open(P('templates/html.css'), 'rb') as f:
            raw = f.read()
        _html_css_stylesheet = parseString(raw.decode('utf-8'), validate=False)
    return _html_css_stylesheet
|
||||
|
||||
|
||||
# CSS properties treated as inherited from the parent element when
# cascading styles in the Stylizer.
INHERITED = {
    'azimuth', 'border-collapse', 'border-spacing', 'caption-side', 'color',
    'cursor', 'direction', 'elevation', 'empty-cells', 'font-family',
    'font-size', 'font-style', 'font-variant', 'font-weight', 'letter-spacing',
    'line-height', 'list-style-image', 'list-style-position',
    'list-style-type', 'orphans', 'page-break-inside', 'pitch-range', 'pitch',
    'quotes', 'richness', 'speak-header', 'speak-numeral', 'speak-punctuation',
    'speak', 'speech-rate', 'stress', 'text-align', 'text-indent',
    'text-transform', 'visibility', 'voice-family', 'volume', 'white-space',
    'widows', 'word-spacing', 'text-shadow',
}

# Named absolute font sizes; converted to rem in StylizerRules.flatten_style()
FONT_SIZE_NAMES = {
    'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'
}

# Media types whose rules are applied; everything else is ignored by media_ok()
ALLOWED_MEDIA_TYPES = frozenset({'screen', 'all', 'aural', 'amzn-kf8'})
# Device-specific media-query features; queries testing any of these never match
IGNORED_MEDIA_FEATURES = frozenset('width min-width max-width height min-height max-height device-width min-device-width max-device-width device-height min-device-height max-device-height aspect-ratio min-aspect-ratio max-aspect-ratio device-aspect-ratio min-device-aspect-ratio max-device-aspect-ratio color min-color max-color color-index min-color-index max-color-index monochrome min-monochrome max-monochrome -webkit-min-device-pixel-ratio resolution min-resolution max-resolution scan grid'.split())  # noqa
|
||||
|
||||
|
||||
def media_ok(raw):
    """Return True if the media query string *raw* applies.

    An empty/None query always matches; 'amzn-mobi' never does.
    Otherwise *raw* is parsed as a media query list: a single query
    matches when its media type is in ALLOWED_MEDIA_TYPES and it tests
    no device-specific feature, with 'not' negation honored. Queries
    that fail to parse conservatively match.
    """
    if not raw:
        return True
    if raw == 'amzn-mobi':  # Optimization for the common case
        return False

    def query_ok(mq):
        matched = True
        if mq.media_type not in ALLOWED_MEDIA_TYPES:
            matched = False
        # Media queries that test for device specific features always fail
        for media_feature, expr in mq.expressions:
            if media_feature in IGNORED_MEDIA_FEATURES:
                matched = False
        return mq.negated ^ matched

    try:
        # A comma separated query list matches if any single query matches
        for mq in CSSMedia3Parser().parse_stylesheet('@media %s {}' % raw).rules[0].media:
            if query_ok(mq):
                return True
        return False
    except Exception:
        pass
    return True
|
||||
|
||||
|
||||
def test_media_ok():
    """Self-test for media_ok() covering empty input, the Amazon types,
    negation and device-specific feature handling."""
    assert media_ok(None)
    assert media_ok('')
    assert not media_ok('amzn-mobi')
    assert media_ok('amzn-kf8')
    assert media_ok('screen')
    assert media_ok('only screen')
    assert not media_ok('not screen')
    assert not media_ok('(device-width:10px)')
    assert media_ok('screen, (device-width:10px)')
    assert not media_ok('screen and (device-width:10px)')
|
||||
|
||||
|
||||
class StylizerRules(object):
    """Flattened, specificity-sorted CSS rules extracted from a list of
    stylesheets, reusable across documents sharing the same sheets."""

    def __init__(self, opts, profile, stylesheets):
        # The first stylesheet is treated as the user-agent sheet; its
        # rules sort below all others regardless of selector specificity
        # (see the sheet_index component of the specificity tuple below).
        self.opts, self.profile, self.stylesheets = opts, profile, stylesheets

        index = 0  # monotonically increasing rule index breaks specificity ties
        self.rules = []
        self.page_rule = {}
        self.font_face_rules = []
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Descend into @media blocks whose query applies
                    if media_ok(rule.media.mediaText):
                        for subrule in rule.cssRules:
                            self.rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                            index += 1
                else:
                    self.rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                    index = index + 1
        self.rules.sort(key=itemgetter(0))  # sort by specificity

    def flatten_rule(self, rule, href, index, is_user_agent_sheet=False):
        """Convert one CSS rule into a list of
        (specificity, selector-seq, style-dict, selector-text, href)
        tuples.

        @page and @font-face rules produce no tuples; they are
        accumulated on self.page_rule / self.font_face_rules instead.
        """
        results = []
        sheet_index = 0 if is_user_agent_sheet else 1
        if isinstance(rule, CSSStyleRule):
            style = self.flatten_style(rule.style)
            for selector in rule.selectorList:
                specificity = (sheet_index,) + selector.specificity + (index,)
                text = selector.selectorText
                selector = list(selector.seq)
                results.append((specificity, selector, style, text, href))
        elif isinstance(rule, CSSPageRule):
            style = self.flatten_style(rule.style)
            self.page_rule.update(style)
        elif isinstance(rule, CSSFontFaceRule):
            if rule.style.length > 1:
                # Ignore the meaningless font face rules generated by the
                # benighted MS Word that contain only a font-family declaration
                # and nothing else
                self.font_face_rules.append(rule)
        return results

    def flatten_style(self, cssstyle):
        """Expand a declaration block into a plain {property: value}
        dict, applying normalizers, the text-align option, named
        font-size conversion (to rem) and -epub-writing-mode aliases."""
        style = {}
        for prop in cssstyle:
            name = prop.name
            normalizer = normalizers.get(name, None)
            if normalizer is not None:
                # Registered normalizer expands this property (e.g. shorthands)
                style.update(normalizer(name, prop.cssValue))
            elif name == 'text-align':
                style['text-align'] = self._apply_text_align(prop.value)
            else:
                style[name] = prop.value
        if 'font-size' in style:
            size = style['font-size']
            # Map non-standard aliases onto named sizes first
            if size == 'normal':
                size = 'medium'
            if size == 'smallest':
                size = 'xx-small'
            if size in FONT_SIZE_NAMES:
                # Convert named sizes to rem using the output profile's
                # named-size table and base font size
                style['font-size'] = "%.1frem" % (self.profile.fnames[size] / float(self.profile.fbase))
        if '-epub-writing-mode' in style:
            # Mirror the EPUB property onto the webkit/standard names
            # without overwriting explicit values
            for x in ('-webkit-writing-mode', 'writing-mode'):
                style[x] = style.get(x, style['-epub-writing-mode'])
        return style

    def _apply_text_align(self, text):
        # Honor the user's change_justification option for left/justify text
        if text in ('left', 'justify') and self.opts.change_justification in ('left', 'justify'):
            text = self.opts.change_justification
        return text

    def same_rules(self, opts, profile, stylesheets):
        """Return True when this rule set was built from the same
        options, profile and stylesheet sequence (cache validation)."""
        if self.opts != opts:
            # it's unlikely to happen, but better safe than sorry
            return False
        if self.profile != profile:
            return False
        if len(self.stylesheets) != len(stylesheets):
            return False
        for index, stylesheet in enumerate(self.stylesheets):
            if stylesheet != stylesheets[index]:
                return False
        return True
|
||||
|
||||
|
||||
class Stylizer(object):
    """Computes the effective CSS style for every element of one XHTML
    spine item: collects all applicable stylesheets (html defaults, base
    css, <style>/<link> tags, @imports, extra/user css), flattens the
    matched rules onto each element, and caches per-element Style objects.
    """

    # Class-level cache; keys are held weakly so parsed sheets can be
    # garbage collected with their owners.
    STYLESHEETS = WeakKeyDictionary()

    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        """Build the stylizer for the document *tree* located at *path*
        inside *oeb*. Optional extra/user/base CSS is parsed and applied
        in addition to the document's own stylesheets."""
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Synthetic href used when parsing inline <style> text
        cssname = os.path.splitext(basename)[0] + '.css'
        # Stylesheets are accumulated in cascade order, starting with the
        # built-in HTML user-agent stylesheet.
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add css_parser parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                   profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        for elem in style_tags:
            if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
                # Inline <style>: gather text including text/tails of any
                # child nodes (e.g. CDATA wrappers, comments).
                text = elem.text if elem.text else ''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately, so disable
                    # fetching while parsing this sheet.
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
                    'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                    'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
            ):
                # External stylesheet referenced via <link>: look it up in
                # the manifest (it was parsed when the book was read).
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        # extra_css/user_css come last so they win the cascade at equal
        # specificity.
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except Exception:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)

        # using oeb to store the rules, page rule and font face rules
        # and generating them again if opts, profile or stylesheets are different
        if (not hasattr(self.oeb, 'stylizer_rules')) \
            or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
            self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile, stylesheets)
        self.rules = self.oeb.stylizer_rules.rules
        self.page_rule = self.oeb.stylizer_rules.page_rule
        self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
        self.flatten_style = self.oeb.stylizer_rules.flatten_style

        # element -> Style cache, populated as rules are applied
        self._styles = {}
        pseudo_pat = re.compile(':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in self.rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {'mobi', 'docx'}:
                    # Fake first-letter: these formats cannot render the
                    # pseudo-element, so wrap leading punctuation plus the
                    # first letter in a real <span> and style that.
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                punctuation_chars = []
                                text = unicode_type(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = ''.join(punctuation_chars) + \
                                        (text[0] if text else '')
                                span = x.makeelement('{%s}span' % XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        # Inline style="" attributes override stylesheet rules.
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                # attributes (bare numbers are treated as pixels).
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
|
||||
def _fetch_css_file(self, path):
    """css_parser fetcher callback: resolve a stylesheet reference against
    the book manifest. Returns an (encoding, data) pair, or (None, None)
    when the target is missing or not a stylesheet."""
    item = self.oeb.manifest.hrefs.get(path)
    if item is None:
        self.logger.warn('CSS import of missing file %r' % path)
        return (None, None)
    if item.media_type not in OEB_STYLES:
        self.logger.warn('CSS import of non-CSS file %r' % path)
        return (None, None)
    data = item.data.cssText
    # css_parser may hand back text; normalise to UTF-8 bytes.
    if isinstance(data, bytes):
        return ('utf-8', data)
    return ('utf-8', data.encode('utf-8'))
|
||||
|
||||
def style(self, element):
    """Return the cached Style for *element*, creating (and caching, via
    Style.__init__) a new one on first access."""
    if element in self._styles:
        return self._styles[element]
    return Style(element, self)
|
||||
|
||||
def stylesheet(self, name, font_scale=None):
    """Serialize all cached rules originating from stylesheet *name*
    back into CSS text, optionally multiplying pt font sizes by
    *font_scale*."""
    rendered = []
    for _, _, style, selector, href in self.rules:
        if href != name:
            continue
        if (font_scale and 'font-size' in style and
                style['font-size'].endswith('pt')):
            # Copy before scaling so the shared rule dict is untouched.
            style = copy.copy(style)
            size = float(style['font-size'][:-2])
            style['font-size'] = "%.2fpt" % (size * font_scale)
        body = ';\n '.join(': '.join(item) for item in style.items())
        rendered.append('%s {\n %s;\n}' % (selector, body))
    return '\n'.join(rendered)
|
||||
|
||||
|
||||
class Style(object):
    """The computed style of a single element; resolves inherited and
    relative values lazily through its owning Stylizer."""

    # Microsoft-Office-only CSS properties to strip from inline styles.
    MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')

    def __init__(self, element, stylizer):
        self._element = element
        self._profile = stylizer.profile
        self._stylizer = stylizer
        # Flattened property-name -> value mapping for this element
        self._style = {}
        # Lazily computed caches for expensive derived values
        self._fontSize = None
        self._width = None
        self._height = None
        self._lineHeight = None
        self._bgcolor = None
        # pseudo-class name -> cssdict mapping (e.g. hover styling)
        self._pseudo_classes = {}
        # Register in the stylizer's element -> Style cache
        stylizer._styles[element] = self
|
||||
|
||||
def set(self, prop, val):
    """Directly assign a CSS property on this element's computed style."""
    self._style.update({prop: val})
|
||||
|
||||
def drop(self, prop, default=None):
    """Remove *prop* from the computed style, returning its previous
    value, or *default* when it was not set."""
    try:
        return self._style.pop(prop)
    except KeyError:
        return default
|
||||
|
||||
def _update_cssdict(self, cssdict):
    """Merge a dict of flattened CSS properties into this style."""
    for prop, val in cssdict.items():
        self._style[prop] = val
|
||||
|
||||
def _update_pseudo_class(self, name, cssdict):
    """Merge *cssdict* into the stored properties for pseudo-class
    *name*, creating the entry on first use."""
    self._pseudo_classes.setdefault(name, {}).update(cssdict)
|
||||
|
||||
def _apply_style_attr(self, url_replacer=None):
    """Parse the element's inline style="" attribute and fold it into
    the computed style, dropping Microsoft-Office-only properties and
    (optionally) rewriting url(...) references via *url_replacer*."""
    attrib = self._element.attrib
    if 'style' not in attrib:
        return
    css = attrib['style'].split(';')
    css = filter(None, (x.strip() for x in css))
    css = [y.strip() for y in css]
    # Drop MS Office cruft (mso-*, panose-*, ...)
    css = [y for y in css if self.MS_PAT.match(y) is None]
    css = '; '.join(css)
    try:
        style = parseStyle(css, validate=False)
    except CSSSyntaxError:
        # Unparseable inline style: ignore it entirely
        return
    if url_replacer is not None:
        # Make resource references absolute relative to this document
        replaceUrls(style, url_replacer, ignoreImportRules=True)
    self._style.update(self._stylizer.flatten_style(style))
|
||||
|
||||
def _has_parent(self):
    """Return True when the element has a parent in the tree; False for
    the root or when self._element is None."""
    try:
        parent = self._element.getparent()
    except AttributeError:
        return False  # self._element is None
    return parent is not None
|
||||
|
||||
def _get_parent(self):
    """Return the Style of the parent element, or None at the tree root."""
    parent_elem = self._element.getparent()
    if parent_elem is None:
        return None
    return self._stylizer.style(parent_elem)
|
||||
|
||||
def __getitem__(self, name):
    """Resolve CSS property *name* to its computed value in pts,
    preferring a dedicated computed property (fontSize, width, ...)
    when one exists for the DOM-style name."""
    domname = cssproperties._toDOMname(name)
    if hasattr(self, domname):
        return getattr(self, domname)
    return self._unit_convert(self._get(name))

def _get(self, name):
    """Resolve the raw value of *name*: explicit value on this element,
    else inherited from the parent (for 'inherit' or inheritable
    properties), else the CSS defaults table."""
    result = None
    if name in self._style:
        result = self._style[name]
    if (result == 'inherit' or (result is None and name in INHERITED and self._has_parent())):
        stylizer = self._stylizer
        # Recurse up the element tree for the inherited value
        result = stylizer.style(self._element.getparent())._get(name)
    if result is None:
        result = DEFAULTS[name]
    return result
|
||||
|
||||
def get(self, name, default=None):
    """Return the raw value of *name* set directly on this element, or
    *default* — no inheritance or unit conversion is performed."""
    if name in self._style:
        return self._style[name]
    return default
|
||||
|
||||
def _unit_convert(self, value, base=None, font=None):
    'Return value in pts'
    # Percentages resolve against *base* (defaults to this element's
    # width); em/ex resolve against *font*.
    if base is None:
        base = self.width
    # NOTE: this deliberately keeps an explicit font size of 0 — only
    # falsy non-zero values (None, '') fall back to self.fontSize.
    if not font and font != 0:
        font = self.fontSize
    return unit_convert(value, base, font, self._profile.dpi, body_font_size=self._stylizer.body_font_size)
|
||||
|
||||
def pt_to_px(self, value):
    """Convert *value* from points to pixels at the profile's DPI
    (72 points per inch)."""
    px_per_pt = self._profile.dpi / 72
    return px_per_pt * value
|
||||
|
||||
@property
def backgroundColor(self):
    '''
    Return the background color by parsing both the background-color and
    background shortcut properties. Note that inheritance/default values
    are not used. None is returned if no background color is set.
    '''

    def validate_color(col):
        # True when col is a valid CSS 2 color value
        return cssprofiles.validateWithProfile('color',
                    col,
                    profiles=[profiles.Profiles.CSS_LEVEL_2])[1]

    if self._bgcolor is None:
        # Cached as False when no color is found, so we only parse once.
        col = None
        val = self._style.get('background-color', None)
        if val and validate_color(val):
            col = val
        else:
            # Fall back to the 'background' shorthand and scan its
            # component values for the first valid color.
            val = self._style.get('background', None)
            if val is not None:
                try:
                    style = parseStyle('background: '+val, validate=False)
                    val = style.getProperty('background').cssValue
                    try:
                        val = list(val)
                    except:
                        # val is CSSPrimitiveValue
                        val = [val]
                    for c in val:
                        c = c.cssText
                        if isinstance(c, bytes):
                            c = c.decode('utf-8', 'replace')
                        if validate_color(c):
                            col = c
                            break
                except:
                    pass
        if col is None:
            self._bgcolor = False
        else:
            self._bgcolor = col
    return self._bgcolor if self._bgcolor else None

@property
def fontSize(self):
    """Computed font size in pts, resolved against the parent's size
    (or the profile base size at the root) and cached."""
    def normalize_fontsize(value, base):
        # Resolve keywords ('larger', 'smaller', named sizes) and
        # relative units against *base*; returns a number in pts.
        value = value.replace('"', '').replace("'", '')
        result = None
        factor = None
        if value == 'inherit':
            value = base
        if value in FONT_SIZE_NAMES:
            result = self._profile.fnames[value]
        elif value == 'smaller':
            # Step down the profile's size ladder; factor is used only
            # when base is below the smallest ladder entry.
            factor = 1.0/1.2
            for _, _, size in self._profile.fsizes:
                if base <= size:
                    break
                factor = None
                result = size
        elif value == 'larger':
            factor = 1.2
            for _, _, size in reversed(self._profile.fsizes):
                if base >= size:
                    break
                factor = None
                result = size
        else:
            result = self._unit_convert(value, base=base, font=base)
            if not isinstance(result, numbers.Number):
                return base
            if result < 0:
                # Negative sizes are invalid; treat as 'smaller'
                result = normalize_fontsize("smaller", base)
        if factor:
            result = factor * base
        return result
    if self._fontSize is None:
        result = None
        parent = self._get_parent()
        if parent is not None:
            base = parent.fontSize
        else:
            base = self._profile.fbase
        if 'font-size' in self._style:
            size = self._style['font-size']
            result = normalize_fontsize(size, base)
        else:
            result = base
        self._fontSize = result
    return self._fontSize
|
||||
|
||||
def img_dimension(self, attr, img_size):
    """Resolve the final width or height (*attr*) in pts for an <img>
    whose intrinsic dimension is *img_size* px: CSS value first, then
    the HTML attribute, then the intrinsic size; finally clamped to
    max-width/max-height."""
    ans = None
    parent = self._get_parent()
    if parent is not None:
        base = getattr(parent, attr)
    else:
        # At the root, resolve percentages against the page size
        base = getattr(self._profile, attr + '_pts')
    x = self._style.get(attr)
    if x is not None:
        if x == 'auto':
            # 'auto' means use the intrinsic pixel size
            ans = self._unit_convert(unicode_type(img_size) + 'px', base=base)
        else:
            x = self._unit_convert(x, base=base)
            if isinstance(x, numbers.Number):
                ans = x
    if ans is None:
        # Fall back to the width/height HTML attribute (pixels)
        x = self._element.get(attr)
        if x is not None:
            x = self._unit_convert(x + 'px', base=base)
            if isinstance(x, numbers.Number):
                ans = x
    if ans is None:
        ans = self._unit_convert(unicode_type(img_size) + 'px', base=base)
    # Clamp to max-width/max-height when specified and smaller
    maa = self._style.get('max-' + attr)
    if maa is not None:
        x = self._unit_convert(maa, base=base)
        if isinstance(x, numbers.Number) and (ans is None or x < ans):
            ans = x
    return ans
|
||||
|
||||
def img_size(self, width, height):
    """Return the final rendered (width, height) of an <img> whose
    intrinsic size is width x height px, preserving the aspect ratio
    when only one dimension is explicitly specified."""
    spec_w, spec_h = self._get('width'), self._get('height')
    final_w = self.img_dimension('width', width)
    final_h = self.img_dimension('height', height)
    if spec_w == 'auto' and spec_h != 'auto':
        # Only height given: derive width from the intrinsic ratio
        final_w = (float(width)/height) * final_h
    elif spec_h == 'auto' and spec_w != 'auto':
        final_h = (float(height)/width) * final_w
    return final_w, final_h
|
||||
|
||||
@property
def width(self):
    """Effective width in pts, computed lazily: CSS/attribute width
    resolved against the parent width (profile page width at the root),
    then clamped by max-width."""
    if self._width is None:
        width = None
        base = None
        parent = self._get_parent()
        if parent is not None:
            base = parent.width
        else:
            base = self._profile.width_pts
        # HTML width attribute takes precedence over the CSS property here
        if 'width' in self._element.attrib:
            width = self._element.attrib['width']
        elif 'width' in self._style:
            width = self._style['width']
        if not width or width == 'auto':
            result = base
        else:
            result = self._unit_convert(width, base=base)
            if isinstance(result, (unicode_type, bytes)):
                # Unconvertible value (e.g. a keyword): fall back to the
                # profile width
                result = self._profile.width
        self._width = result
        if 'max-width' in self._style:
            result = self._unit_convert(self._style['max-width'], base=base)
            if isinstance(result, (unicode_type, bytes)):
                result = self._width
            if result < self._width:
                self._width = result

    return self._width

@property
def parent_width(self):
    """Width of the containing element; own width at the tree root."""
    parent = self._get_parent()
    if parent is None:
        return self.width
    return parent.width

@property
def height(self):
    """Effective height in pts; mirrors the width computation, clamped
    by max-height."""
    if self._height is None:
        height = None
        base = None
        parent = self._get_parent()
        if parent is not None:
            base = parent.height
        else:
            base = self._profile.height_pts
        if 'height' in self._element.attrib:
            height = self._element.attrib['height']
        elif 'height' in self._style:
            height = self._style['height']
        if not height or height == 'auto':
            result = base
        else:
            result = self._unit_convert(height, base=base)
            if isinstance(result, (unicode_type, bytes)):
                # Unconvertible value: fall back to the profile height
                result = self._profile.height
        self._height = result
        if 'max-height' in self._style:
            result = self._unit_convert(self._style['max-height'], base=base)
            if isinstance(result, (unicode_type, bytes)):
                result = self._height
            if result < self._height:
                self._height = result

    return self._height

@property
def lineHeight(self):
    """Line height in pts: 'normal' maps to 1.2; a bare number
    multiplies the font size; other values are unit-converted relative
    to the font size; otherwise inherited from the parent."""
    if self._lineHeight is None:
        result = None
        parent = self._get_parent()
        if 'line-height' in self._style:
            lineh = self._style['line-height']
            if lineh == 'normal':
                lineh = '1.2'
            try:
                result = float(lineh) * self.fontSize
            except ValueError:
                result = self._unit_convert(lineh, base=self.fontSize)
        elif parent is not None:
            # TODO: proper inheritance
            result = parent.lineHeight
        else:
            result = 1.2 * self.fontSize
        self._lineHeight = result
    return self._lineHeight
|
||||
|
||||
@property
def effective_text_decoration(self):
    '''
    Browsers do this creepy thing with text-decoration where even though the
    property is not inherited, it looks like it is because containing
    blocks apply it. The actual algorithm is utterly ridiculous, see
    http://reference.sitepoint.com/css/text-decoration
    This matters for MOBI output, where text-decoration is mapped to <u>
    and <st> tags. Trying to implement the actual algorithm is too much
    work, so we just use a simple fake that should cover most cases.
    '''
    css = self._style.get('text-decoration', None)
    pcss = None
    parent = self._get_parent()
    if parent is not None:
        pcss = parent._style.get('text-decoration', None)
    # Fall back to the parent's decoration when ours is none/inherit
    if css in ('none', None, 'inherit') and pcss not in (None, 'none'):
        return pcss
    return css

@property
def first_vertical_align(self):
    ''' For docx output where tags are not nested, we cannot directly
    simulate the HTML vertical-align rendering model. Instead use the
    approximation of considering the first non-default vertical-align '''
    val = self['vertical-align']
    if val != 'baseline':
        raw_val = self._get('vertical-align')
        if '%' in raw_val:
            # Percentages are relative to the line height
            val = self._unit_convert(raw_val, base=self['line-height'])
        return val
    parent = self._get_parent()
    if parent is not None and 'inline' in parent['display']:
        return parent.first_vertical_align
    # Implicit None when baseline is reached at a block boundary

# Margins and paddings resolve percentages against the containing
# block's width, per the CSS box model; values are returned in pts.

@property
def marginTop(self):
    return self._unit_convert(
        self._get('margin-top'), base=self.parent_width)

@property
def marginBottom(self):
    return self._unit_convert(
        self._get('margin-bottom'), base=self.parent_width)

@property
def marginLeft(self):
    return self._unit_convert(
        self._get('margin-left'), base=self.parent_width)

@property
def marginRight(self):
    return self._unit_convert(
        self._get('margin-right'), base=self.parent_width)

@property
def paddingTop(self):
    return self._unit_convert(
        self._get('padding-top'), base=self.parent_width)

@property
def paddingBottom(self):
    return self._unit_convert(
        self._get('padding-bottom'), base=self.parent_width)

@property
def paddingLeft(self):
    return self._unit_convert(
        self._get('padding-left'), base=self.parent_width)

@property
def paddingRight(self):
    return self._unit_convert(
        self._get('padding-right'), base=self.parent_width)
|
||||
|
||||
def __str__(self):
    """Serialize the computed style as 'prop: value; ...', sorted by
    property name."""
    return '; '.join('%s: %s' % kv for kv in sorted(self._style.items()))
|
||||
|
||||
def cssdict(self):
    """Return a shallow copy of the computed property dict."""
    return {prop: val for prop, val in self._style.items()}
|
||||
|
||||
def pseudo_classes(self, filter_css):
    """Return the non-empty pseudo-class cssdicts for this element,
    with any properties named in *filter_css* removed (the stored data
    is deep-copied first, so it is never mutated)."""
    if filter_css:
        css = copy.deepcopy(self._pseudo_classes)
        for cssdict in css.values():
            for prop in filter_css:
                cssdict.pop(prop, None)
    else:
        css = self._pseudo_classes
    return {sel: props for sel, props in css.items() if props}
|
||||
|
||||
@property
def is_hidden(self):
    """True when CSS hides this element via display:none or
    visibility:hidden."""
    if self._style.get('display') == 'none':
        return True
    return self._style.get('visibility') == 'hidden'
|
||||
7
ebook_converter/ebooks/oeb/transforms/__init__.py
Normal file
7
ebook_converter/ebooks/oeb/transforms/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
52
ebook_converter/ebooks/oeb/transforms/data_url.py
Normal file
52
ebook_converter/ebooks/oeb/transforms/data_url.py
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
from calibre.ebooks.oeb.base import XPath, urlunquote
|
||||
from polyglot.builtins import as_bytes
|
||||
|
||||
|
||||
class DataURL(object):
    """Transform that replaces data: URI images in the spine with real
    image files added to the book manifest."""

    def __call__(self, oeb, opts):
        from calibre.utils.imghdr import what
        self.log = oeb.log
        attr_path = XPath('//h:img[@src]')
        for item in oeb.spine:
            root = item.data
            if not hasattr(root, 'xpath'):
                # Not a parsed XML tree; nothing to scan
                continue
            for img in attr_path(root):
                raw = img.get('src', '')
                if not raw.startswith('data:'):
                    continue
                # Split 'data:image/...;base64' header from the payload
                header, data = raw.partition(',')[0::2]
                if not header.startswith('data:image/') or not data:
                    continue
                if ';base64' in header:
                    # Whitespace is legal inside base64 data URIs
                    data = re.sub(r'\s+', '', data)
                    from polyglot.binary import from_base64_bytes
                    try:
                        data = from_base64_bytes(data)
                    except Exception:
                        self.log.error('Found invalid base64 encoded data URI, ignoring it')
                        continue
                else:
                    data = urlunquote(data)
                data = as_bytes(data)
                # Detect the image format from the raw bytes
                fmt = what(None, data)
                if not fmt:
                    self.log.warn('Image encoded as data URL has unknown format, ignoring')
                    continue
                img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb)))

    def convert_image_data_uri(self, data, fmt, oeb):
        """Add *data* to the manifest as a new image item and return its
        href for use in the rewritten src attribute."""
        self.log('Found image encoded as data URI converting it to normal image')
        from calibre import guess_type
        item_id, item_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt)
        oeb.manifest.add(item_id, item_href, guess_type(item_href)[0], data=data)
        return item_href
|
||||
684
ebook_converter/ebooks/oeb/transforms/flatcss.py
Normal file
684
ebook_converter/ebooks/oeb/transforms/flatcss.py
Normal file
@@ -0,0 +1,684 @@
|
||||
'''
|
||||
CSS flattening transform.
|
||||
'''
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
import re, operator, math, numbers
|
||||
from collections import defaultdict
|
||||
from xml.dom import SyntaxErr
|
||||
|
||||
from lxml import etree
|
||||
import css_parser
|
||||
from css_parser.css import Property
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.ebooks import unit_convert
|
||||
from calibre.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES,
|
||||
namespace, barename, XPath, css_text)
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.utils.filenames import ascii_filename, ascii_text
|
||||
from calibre.utils.icu import numeric_sort_key
|
||||
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, map
|
||||
|
||||
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
|
||||
STRIPNUM = re.compile(r'[-0-9]+$')
|
||||
|
||||
|
||||
def asfloat(value, default):
    """Coerce *value* to float, substituting *default* when it is not a
    number (e.g. a CSS keyword string)."""
    if isinstance(value, numbers.Number):
        return float(value)
    return float(default)
|
||||
|
||||
|
||||
class KeyMapper(object):
    """Maps source font sizes onto a fixed set of destination key sizes,
    preserving each size's relative 'visual rank' with respect to its
    base size."""

    def __init__(self, sbase, dbase, dkey):
        # sbase: source base size; dkey: destination key sizes, each
        # pre-related to the destination base dbase.
        self.sbase = float(sbase)
        self.dprop = [(self.relate(x, dbase), float(x)) for x in dkey]
        self.cache = {}

    @staticmethod
    def relate(size, base):
        """Signed log-scale measure of how far *size* is from *base*;
        0 means effectively equal."""
        if size == 0:
            return base
        size = float(size)
        base = float(base)
        if abs(size - base) < 0.1:
            return 0
        sign = -1 if size < base else 1
        # 36pt is treated as the top of the useful size range
        endp = 0 if size < base else 36
        diff = (abs(base - size) * 3) + ((36 - size) / 100)
        logb = abs(base - endp)
        if logb == 1.0:
            # log base 1 is undefined; nudge it
            logb = 1.1
        try:
            result = sign * math.log(diff, logb)
        except ValueError:
            if diff < 0:
                # Size is both very large and close to base
                return 0
            # Guard against degenerate log arguments and retry
            if logb == 0:
                logb = 1e-6
            if diff == 0:
                diff = 1e-6
            result = sign * math.log(diff, logb)
        return result

    def __getitem__(self, ssize):
        # Memoized source-size -> destination-size lookup
        ssize = asfloat(ssize, 0)
        if ssize in self.cache:
            return self.cache[ssize]
        dsize = self.map(ssize)
        self.cache[ssize] = dsize
        return dsize

    def map(self, ssize):
        """Pick the destination key size whose relation to its base is
        closest to this size's relation to the source base."""
        sbase = self.sbase
        prop = self.relate(ssize, sbase)
        diff = [(abs(prop - p), s) for p, s in self.dprop]
        dsize = min(diff)[1]
        return dsize
|
||||
|
||||
|
||||
class ScaleMapper(object):
    """Linearly rescales font sizes by the ratio of destination to
    source base size."""

    def __init__(self, sbase, dbase):
        # Constant scale factor applied to every size
        self.dscale = float(dbase) / float(sbase)

    def __getitem__(self, ssize):
        # Non-numeric sizes (CSS keywords etc.) are treated as 0
        if not isinstance(ssize, numbers.Number):
            ssize = 0
        return float(ssize) * self.dscale
|
||||
|
||||
|
||||
class NullMapper(object):
    """Identity mapper used when no font rescaling is requested."""

    def __init__(self):
        pass

    def __getitem__(self, ssize):
        # Sizes pass through untouched
        return ssize
|
||||
|
||||
|
||||
def FontMapper(sbase=None, dbase=None, dkey=None):
    """Factory: choose a size mapper based on which parameters were
    supplied — key-table mapping, linear scaling, or identity."""
    if sbase and dbase:
        if dkey:
            return KeyMapper(sbase, dbase, dkey)
        return ScaleMapper(sbase, dbase)
    return NullMapper()
|
||||
|
||||
|
||||
class EmbedFontsCSSRules(object):
    """Lazily creates a page_styles.css manifest item holding the
    @font-face rules for embedded fonts; calling the instance returns
    that stylesheet's href (or None when no font is embedded)."""

    def __init__(self, body_font_family, rules):
        self.body_font_family, self.rules = body_font_family, rules
        # href of the generated stylesheet; created on first call
        self.href = None

    def __call__(self, oeb):
        if not self.body_font_family:
            return None
        if not self.href:
            # Serialize the @font-face rules into a new stylesheet item
            iid, href = oeb.manifest.generate('page_styles', 'page_styles.css')
            rules = [css_text(x) for x in self.rules]
            rules = '\n\n'.join(rules)
            sheet = css_parser.parseString(rules, validate=False)
            self.href = oeb.manifest.add(iid, href, guess_type(href)[0],
                    data=sheet).href
        return self.href
|
||||
|
||||
|
||||
class CSSFlattener(object):
|
||||
|
||||
def __init__(self, fbase=None, fkey=None, lineh=None, unfloat=False,
        untable=False, page_break_on_body=False, specializer=None,
        transform_css_rules=()):
    """Configure the flattener. fbase/fkey drive font-size remapping,
    lineh forces a base line height, unfloat/untable strip floats and
    tables for limited output formats."""
    self.fbase = fbase
    self.transform_css_rules = transform_css_rules
    if self.transform_css_rules:
        # Pre-compile user-supplied CSS transform rules once
        from calibre.ebooks.css_transform_rules import compile_rules
        self.transform_css_rules = compile_rules(self.transform_css_rules)
    self.fkey = fkey
    self.lineh = lineh
    self.unfloat = unfloat
    self.untable = untable
    self.specializer = specializer
    self.page_break_on_body = page_break_on_body

@classmethod
def config(cls, cfg):
    # Plugin configuration hook; no options are added here.
    return cfg

@classmethod
def generate(cls, opts):
    # Construct with defaults; actual options are applied in __call__.
    return cls()

def __call__(self, oeb, context):
    """Run the transform: stylize every spine item (plus titlepage and
    EPUB3 nav when present), then flatten all CSS into per-item
    stylesheets with remapped font sizes."""
    oeb.logger.info('Flattening CSS and remapping font sizes...')
    self.context = self.opts = context
    self.oeb = oeb
    self.items = list(self.oeb.spine)
    titlepage = self.oeb.guide.get('titlepage')
    if titlepage is not None:
        titlepage = titlepage.item
        if titlepage is not None and titlepage not in self.items:
            self.items.append(titlepage)
    epub3_nav = None
    if getattr(self.opts, 'epub3_nav_href', None):
        epub3_nav = self.oeb.manifest.hrefs.get(self.opts.epub3_nav_href)
        if epub3_nav is not None and epub3_nav not in self.items:
            self.items.append(epub3_nav)

    # Parse the user's comma-separated list of properties to drop
    self.filter_css = frozenset()
    if self.opts.filter_css:
        try:
            self.filter_css = {x.strip().lower() for x in
                self.opts.filter_css.split(',')}
        except:
            self.oeb.log.warning('Failed to parse filter_css, ignoring')
        else:
            from calibre.ebooks.oeb.normalize_css import normalize_filter_css
            self.filter_css = frozenset(normalize_filter_css(self.filter_css))
            self.oeb.log.debug('Filtering CSS properties: %s'%
                ', '.join(self.filter_css))

    for item in oeb.manifest.values():
        # Make all links to resources absolute, as these sheets will be
        # consolidated into a single stylesheet at the root of the document
        if item.media_type in OEB_STYLES:
            css_parser.replaceUrls(item.data, item.abshref,
                    ignoreImportRules=True)

    self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
        self.opts.embed_font_family)
    # Store for use in output plugins/transforms that generate content,
    # like the AZW3 output inline ToC.
    self.oeb.store_embed_font_rules = EmbedFontsCSSRules(self.body_font_family,
            self.embed_font_rules)
    self.stylize_spine()
    self.sbase = self.baseline_spine() if self.fbase else None
    self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
    self.flatten_spine()
    if epub3_nav is not None:
        self.opts.epub3_nav_parsed = epub3_nav.data

    self.store_page_margins()

def store_page_margins(self):
    """Record each spine item's @page margins (converted to pts) on the
    conversion options, for output plugins to consume after flattening
    discards the page rules."""
    self.opts._stored_page_margins = {}
    for item, stylizer in iteritems(self.stylizers):
        margins = self.opts._stored_page_margins[item.href] = {}
        for prop, val in stylizer.page_rule.items():
            # e.g. 'margin-top' -> ('margin', 'top')
            p, w = prop.partition('-')[::2]
            if p == 'margin':
                margins[w] = unit_convert(
                    val, stylizer.profile.width_pts, stylizer.body_font_size,
                    stylizer.profile.dpi, body_font_size=stylizer.body_font_size)

def get_embed_font_info(self, family, failure_critical=True):
    """Locate the faces of the font *family* on the system, add their
    data to the manifest and build @font-face rules for them. Returns
    (body_font_family_css, list_of_rule_sheets). Raises ValueError when
    no fonts are found and failure_critical is True."""
    efi = []
    body_font_family = None
    if not family:
        return body_font_family, efi
    from calibre.utils.fonts.scanner import font_scanner, NoFonts
    from calibre.utils.fonts.utils import panose_to_css_generic_family
    try:
        faces = font_scanner.fonts_for_family(family)
    except NoFonts:
        msg = ('No embeddable fonts found for family: %r'%family)
        if failure_critical:
            raise ValueError(msg)
        self.oeb.log.warn(msg)
        return body_font_family, efi
    if not faces:
        msg = ('No embeddable fonts found for family: %r'%family)
        if failure_critical:
            raise ValueError(msg)
        self.oeb.log.warn(msg)
        return body_font_family, efi

    for i, font in enumerate(faces):
        ext = 'otf' if font['is_otf'] else 'ttf'
        fid, href = self.oeb.manifest.generate(id=u'font',
            href='fonts/%s.%s'%(ascii_filename(font['full_name']).replace(' ', '-'), ext))
        item = self.oeb.manifest.add(fid, href,
                guess_type('dummy.'+ext)[0],
                data=font_scanner.get_font_data(font))
        # Font data can be large; keep it on disk until output time
        item.unload_data_from_memory()

        cfont = {
            'font-family': '"%s"'%font['font-family'],
            'panose-1': ' '.join(map(unicode_type, font['panose'])),
            'src': 'url(%s)'%item.href,
        }

        if i == 0:
            # First face defines the body font-family with a generic
            # fallback derived from its PANOSE classification
            generic_family = panose_to_css_generic_family(font['panose'])
            body_font_family = "'%s',%s"%(font['font-family'], generic_family)
            self.oeb.log('Embedding font: %s'%font['font-family'])
        for k in ('font-weight', 'font-style', 'font-stretch'):
            if font[k] != 'normal':
                cfont[k] = font[k]
        rule = '@font-face { %s }'%('; '.join('%s:%s'%(k, v) for k, v in
            iteritems(cfont)))
        rule = css_parser.parseString(rule)
        efi.append(rule)

    return body_font_family, efi
|
||||
|
||||
def stylize_spine(self):
    """Create a Stylizer for every spine item, after injecting conversion
    options (margins, padding reset, justification, embedded body font)
    into the <body> tag's inline style."""
    self.stylizers = {}
    profile = self.context.source
    css = ''
    for item in self.items:
        html = item.data
        body = html.find(XHTML('body'))
        # Move any style attribute from <html> down onto <body>, so it is
        # seen by the stylizer (html-level style takes precedence).
        if 'style' in html.attrib:
            b = body.attrib.get('style', '')
            body.set('style', html.get('style') + ';' + b)
            del html.attrib['style']
        bs = body.get('style', '').split(';')
        bs.append('margin-top: 0pt')
        bs.append('margin-bottom: 0pt')
        # Negative margin options mean "keep the book's own margins"
        if float(self.context.margin_left) >= 0:
            bs.append('margin-left : %gpt'%
                    float(self.context.margin_left))
        if float(self.context.margin_right) >= 0:
            bs.append('margin-right : %gpt'%
                    float(self.context.margin_right))
        bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
        if self.page_break_on_body:
            bs.extend(['page-break-before: always'])
        if self.context.change_justification != 'original':
            bs.append('text-align: '+ self.context.change_justification)
        if self.body_font_family:
            bs.append('font-family: '+self.body_font_family)
        body.set('style', '; '.join(bs))
        stylizer = Stylizer(html, item.href, self.oeb, self.context, profile,
                user_css=self.context.extra_css,
                extra_css=css)
        self.stylizers[item] = stylizer
|
||||
|
||||
def baseline_node(self, node, stylizer, sizes, csize):
    """Recursively accumulate, into ``sizes``, the amount of (whitespace
    collapsed) text rendered at each computed font size.

    ``csize`` is the inherited size used for tail text of children.
    """
    csize = stylizer.style(node)['font-size']
    if node.text:
        sizes[csize] += len(COLLAPSE.sub(' ', node.text))
    for child in node:
        self.baseline_node(child, stylizer, sizes, csize)
        if child.tail:
            # Tail text is rendered at this node's size, not the child's
            sizes[csize] += len(COLLAPSE.sub(' ', child.tail))
|
||||
|
||||
def baseline_spine(self):
    """Estimate the source base font size, as the font size at which the
    largest amount of text in the spine is rendered. Falls back to 12pt
    when the book contains no measurable text."""
    sizes = defaultdict(float)
    for item in self.items:
        html = item.data
        stylizer = self.stylizers[item]
        body = html.find(XHTML('body'))
        fsize = self.context.source.fbase
        self.baseline_node(body, stylizer, sizes, fsize)
    try:
        # Font size with the most text attributed to it
        sbase = max(list(sizes.items()), key=operator.itemgetter(1))[0]
    except ValueError:
        # max() on an empty sequence: no text at all. (Was a bare except.)
        sbase = 12.0
    self.oeb.logger.info(
        "Source base font size is %0.05fpt" % sbase)
    return sbase
|
||||
|
||||
def clean_edges(self, cssdict, style, fsize):
    """Snap vertical margins/paddings in ``cssdict`` to multiples of the
    destination line height, expressed in em relative to ``fsize``.

    Percentage values are left alone; zero values are left alone.
    """
    # Estimated source line height (1.26 * source base font size)
    slineh = self.sbase * 1.26
    dlineh = self.lineh
    for kind in ('margin', 'padding'):
        for edge in ('bottom', 'top'):
            # 'property' shadowed the builtin; renamed to prop
            prop = "%s-%s" % (kind, edge)
            if prop not in cssdict:
                continue
            if '%' in cssdict[prop]:
                continue
            value = style[prop]
            if value == 0:
                continue
            elif value <= slineh:
                # Anything up to one source line becomes one dest line
                cssdict[prop] = "%0.5fem" % (dlineh / fsize)
            else:
                try:
                    # Round to the nearest whole number of line heights
                    value = round(value / slineh) * dlineh
                except Exception:
                    # Was a bare except:, which also swallowed
                    # KeyboardInterrupt/SystemExit
                    self.oeb.logger.warning(
                            'Invalid length:', value)
                    value = 0.0
                cssdict[prop] = "%0.5fem" % (value / fsize)
|
||||
|
||||
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
    """Flatten the computed style of ``node`` into a generated CSS class.

    Converts presentational attributes (align/valign/font/bgcolor/...) to
    CSS, rescales font sizes via ``self.fmap``, filters properties listed
    in ``self.filter_css``, then replaces the node's class/style attributes
    with a synthetic class registered in ``styles``/``pseudo_styles``.
    ``psize`` is the parent's absolute font size in pts, used to compute
    relative (em) sizes. Recurses into children unless ``recurse`` is False.
    """
    # Only process XHTML elements (skip comments, PIs, foreign namespaces)
    if not isinstance(node.tag, string_or_bytes) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    # --- Presentational attribute conversion ---
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(XHTML("table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get('margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            # On images, align maps to vertical-align or float
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    if node.tag == XHTML('font'):
        # <font> becomes <div> if it contains block-level descendants,
        # otherwise <span>
        tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                # Extract the first integer-ish token from a size attr
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt'%font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    # --- Font size rescaling (skipped for drop caps) ---
    fsize = font_size
    is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
        len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
        if dyn_rescale is not None:
            # Explicit percentage rescale requested by an input plugin
            try:
                dyn_rescale = float(dyn_rescale) / 100
            except Exception:
                dyn_rescale = 1
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem'%(fsize/psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt'%fsize
            psize = fsize

    # --- Minimum line height enforcement ---
    try:
        minlh = self.context.minimum_line_height / 100.
        slh = style['line-height']
        if not is_drop_cap and isinstance(slh, numbers.Number) and slh < minlh * fsize:
            cssdict['line-height'] = unicode_type(minlh)
    except Exception:
        self.oeb.logger.exception('Failed to set minimum line-height')

    # --- Property filtering (keep the embedded body font-family) ---
    if cssdict:
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            if self.body_font_family and popval and x == 'font-family' \
               and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    # --- Normalizations and option-driven rewrites ---
    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
        # The jacket is exempt unless the output is Kindle
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s'%(prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

    # --- Replace class/style attributes with synthetic classes ---
    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            items = sorted(iteritems(cssdict))
            css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            classes_list = classes.split()
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub('', classes_list[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + unicode_type(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        for psel, cssdict in iteritems(pseudo_classes):
            items = sorted(iteritems(cssdict))
            css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + unicode_type(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    if recurse:
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
|
||||
|
||||
def flatten_head(self, item, href, global_href):
    """Remove all stylesheet <link>/<style> elements from ``item`` and
    link the consolidated per-book stylesheet (``href``) and, if present,
    the shared page/font stylesheet (``global_href``) instead."""
    html = item.data
    head = html.find(XHTML('head'))

    def safe_lower(x):
        # Attribute values are not guaranteed to be lowercase-able
        try:
            x = x.lower()
        except Exception:  # was a bare except
            pass
        return x

    def append_stylesheet_link(sheet_href):
        # Shared helper: the original duplicated this element creation
        # for the per-item and global stylesheets
        l = etree.SubElement(head, XHTML('link'),
                rel='stylesheet', type=CSS_MIME, href=sheet_href)
        l.tail = '\n'

    for node in html.xpath('//*[local-name()="style" or local-name()="link"]'):
        if node.tag == XHTML('link') \
           and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \
           and safe_lower(node.get('type', CSS_MIME)) in OEB_STYLES:
            node.getparent().remove(node)
        elif node.tag == XHTML('style') \
             and node.get('type', CSS_MIME) in OEB_STYLES:
            node.getparent().remove(node)
    append_stylesheet_link(item.relhref(href))
    if global_href:
        append_stylesheet_link(item.relhref(global_href))
|
||||
|
||||
def replace_css(self, css):
    """Remove every stylesheet from the manifest and install ``css`` as
    the single, book-wide stylesheet. Returns the new stylesheet's href."""
    manifest = self.oeb.manifest
    # Snapshot first so we never remove while iterating the manifest
    old_sheets = [x for x in manifest.values() if x.media_type in OEB_STYLES]
    for sheet_item in old_sheets:
        manifest.remove(sheet_item)
    new_id, href = manifest.generate('css', 'stylesheet.css')
    sheet = css_parser.parseString(css, validate=False)
    if self.transform_css_rules:
        from calibre.ebooks.css_transform_rules import transform_sheet
        transform_sheet(self.transform_css_rules, sheet)
    item = manifest.add(new_id, href, CSS_MIME, data=sheet)
    self.oeb.manifest.main_stylesheet = item
    return href
|
||||
|
||||
def collect_global_css(self):
    """Group spine items by their @page + @font-face CSS, write one
    page_styles.css per distinct group, and return a mapping of
    spine item -> href of its page stylesheet (None for empty CSS)."""
    global_css = defaultdict(list)
    for item in self.items:
        stylizer = self.stylizers[item]
        # Negative margin options mean "keep the book's own margins"
        if float(self.context.margin_top) >= 0:
            stylizer.page_rule['margin-top'] = '%gpt'%\
                float(self.context.margin_top)
        if float(self.context.margin_bottom) >= 0:
            stylizer.page_rule['margin-bottom'] = '%gpt'%\
                float(self.context.margin_bottom)
        items = sorted(stylizer.page_rule.items())
        css = ';\n'.join("%s: %s" % (key, val) for key, val in items)
        css = ('@page {\n%s\n}\n'%css) if items else ''
        rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
        raw = '\n\n'.join(rules)
        css += '\n\n' + raw
        # Items with byte-identical page CSS share one stylesheet
        global_css[css].append(item)

    gc_map = {}
    manifest = self.oeb.manifest
    for css in global_css:
        href = None
        if css.strip():
            id_, href = manifest.generate('page_css', 'page_styles.css')
            sheet = css_parser.parseString(css, validate=False)
            if self.transform_css_rules:
                from calibre.ebooks.css_transform_rules import transform_sheet
                transform_sheet(self.transform_css_rules, sheet)
            manifest.add(id_, href, CSS_MIME, data=sheet)
        gc_map[css] = href

    ans = {}
    for css, items in iteritems(global_css):
        for item in items:
            ans[item] = gc_map[css]
    return ans
|
||||
|
||||
def flatten_spine(self):
    """Flatten inline/linked styles for the whole spine into generated
    classes, write the consolidated stylesheet, and rewrite each item's
    <head> to link it (plus any per-item page stylesheet)."""
    names = defaultdict(int)
    # styles: css-text -> class name; pseudo_styles: pseudo-selector ->
    # (css-text -> class name)
    styles, pseudo_styles = {}, defaultdict(dict)
    for item in self.items:
        html = item.data
        stylizer = self.stylizers[item]
        if self.specializer is not None:
            self.specializer(item, stylizer)
        fsize = self.context.dest.fbase
        # Flatten the root element without recursing, then the body tree
        self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False)
        self.flatten_node(html.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id)
    items = sorted(((key, val) for (val, key) in iteritems(styles)), key=lambda x:numeric_sort_key(x[0]))
    # :hover must come after link and :active must come after :hover
    psels = sorted(pseudo_styles, key=lambda x :
            {'hover':1, 'active':2}.get(x, 0))
    for psel in psels:
        styles = pseudo_styles[psel]
        if not styles:
            continue
        x = sorted(((k+':'+psel, v) for v, k in iteritems(styles)))
        items.extend(x)

    css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)

    href = self.replace_css(css)
    global_css = self.collect_global_css()
    for item in self.items:
        stylizer = self.stylizers[item]
        self.flatten_head(item, href, global_css[item])
|
||||
55
ebook_converter/ebooks/oeb/transforms/guide.py
Normal file
55
ebook_converter/ebooks/oeb/transforms/guide.py
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
class Clean(object):
    '''Clean up guide, leaving only known values '''

    def __call__(self, oeb, opts):
        """Normalize the OPF guide: pick a cover reference, prefer 'text'
        over 'start', and drop references with unrecognized types."""
        self.oeb, self.log, self.opts = oeb, oeb.log, opts

        if 'cover' not in self.oeb.guide:
            # Choose the largest of the various MS cover/title/thumb images
            covers = []
            for x in ('other.ms-coverimage-standard', 'coverimagestandard',
                    'other.ms-titleimage-standard', 'other.ms-titleimage',
                    'other.ms-coverimage', 'other.ms-thumbimage-standard',
                    'other.ms-thumbimage', 'thumbimagestandard'):
                if x in self.oeb.guide:
                    href = self.oeb.guide[x].href
                    try:
                        item = self.oeb.manifest.hrefs[href]
                    except KeyError:
                        continue
                    else:
                        covers.append([self.oeb.guide[x], len(item.data)])

            covers.sort(key=lambda x: x[1], reverse=True)
            if covers:
                ref = covers[0][0]
                if len(covers) > 1:
                    self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
                ref.type = 'cover'
                self.oeb.guide.refs['cover'] = ref

        if ('start' in self.oeb.guide and 'text' not in self.oeb.guide):
            # Prefer text to start as per the OPF 2.0 spec
            x = self.oeb.guide['start']
            self.oeb.guide.add('text', x.title, x.href)
            self.oeb.guide.remove('start')

        for x in list(self.oeb.guide):
            # Note: the original set listed 'copyright-page' twice; the
            # duplicate literal has been removed (sets dedupe anyway).
            if x.lower() not in {
                    'cover', 'titlepage', 'masthead', 'toc', 'title-page',
                    'copyright-page', 'text', 'index', 'glossary',
                    'acknowledgements', 'bibliography', 'colophon',
                    'dedication', 'epigraph', 'foreword',
                    'loi', 'lot', 'notes', 'preface'}:
                item = self.oeb.guide[x]
                # Keep anything explicitly titled 'Start'
                if item.title and item.title.lower() == 'start':
                    continue
                self.oeb.guide.remove(x)
|
||||
395
ebook_converter/ebooks/oeb/transforms/jacket.py
Normal file
395
ebook_converter/ebooks/oeb/transforms/jacket.py
Normal file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os, re
|
||||
from xml.sax.saxutils import escape
|
||||
from string import Formatter
|
||||
|
||||
from calibre import guess_type, strftime
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urldefrag, urlnormalize
|
||||
from calibre.library.comments import comments_to_html, markdown
|
||||
from calibre.utils.date import is_date_undefined, as_local_time
|
||||
from calibre.utils.icu import sort_key
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
from calibre.ebooks.metadata import fmt_sidx, rating_to_stars
|
||||
from polyglot.builtins import unicode_type, map
|
||||
|
||||
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
|
||||
|
||||
|
||||
class SafeFormatter(Formatter):
    """A string.Formatter that substitutes '' for any template field that
    has no corresponding argument, instead of raising KeyError."""

    def get_value(self, *args, **kwargs):
        try:
            return super(SafeFormatter, self).get_value(*args, **kwargs)
        except KeyError:
            # Unknown field name: render nothing rather than fail
            return ''
|
||||
|
||||
|
||||
class Base(object):
    """Shared helper for the jacket transforms."""

    def remove_images(self, item, limit=1):
        """Remove up to ``limit`` <img> elements from ``item``, deleting the
        referenced images from the manifest and guide as well.

        Returns the number of images actually removed.
        """
        path = XPath('//h:img[@src]')
        removed = 0
        for img in path(item.data):
            if removed >= limit:
                break
            href = item.abshref(img.get('src'))
            image = self.oeb.manifest.hrefs.get(href)
            if image is None:
                # Retry with a normalized URL before giving up
                href = urlnormalize(href)
                image = self.oeb.manifest.hrefs.get(href)
            if image is not None:
                self.oeb.manifest.remove(image)
                self.oeb.guide.remove_by_href(href)
                img.getparent().remove(img)
                removed += 1
        return removed
|
||||
|
||||
|
||||
class RemoveFirstImage(Base):
    """Transform that removes the first image of the book (typically a
    duplicated cover), cleaning up the manifest, TOC and guide."""

    def remove_first_image(self):
        deleted_item = None
        for item in self.oeb.spine:
            # Skip the generated metadata jacket
            if XPath(JACKET_XPATH)(item.data):
                continue
            removed = self.remove_images(item)
            if removed > 0:
                self.log('Removed first image')
                body = XPath('//h:body')(item.data)
                if body:
                    raw = xml2text(body[0]).strip()
                    imgs = XPath('//h:img|//svg:svg')(item.data)
                    # If the page is now empty, drop it entirely
                    if not raw and not imgs:
                        self.log('Removing %s as it has no content'%item.href)
                        self.oeb.manifest.remove(item)
                        deleted_item = item
                break
        else:
            self.log.warn('Could not find first image to remove')
        if deleted_item is not None:
            # Purge dangling TOC and guide entries for the deleted page
            for item in list(self.oeb.toc):
                href = urldefrag(item.href)[0]
                if href == deleted_item.href:
                    self.oeb.toc.remove(item)
            self.oeb.guide.remove_by_href(deleted_item.href)

    def __call__(self, oeb, opts, metadata):
        '''
        Add metadata in jacket.xhtml if specified in opts
        If not specified, remove previous jacket instance
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        if opts.remove_first_image:
            self.remove_first_image()
|
||||
|
||||
|
||||
class Jacket(Base):
    '''
    Book jacket manipulation. Remove first image and insert comments at start of
    book.
    '''

    def insert_metadata(self, mi):
        """Render a jacket page from ``mi`` and insert it at the start of
        the spine, embedding any local images it references."""
        self.log('Inserting metadata into book...')

        try:
            tags = list(map(unicode_type, self.oeb.metadata.subject))
        except Exception:
            tags = []

        try:
            comments = unicode_type(self.oeb.metadata.description[0])
        except:
            comments = ''

        try:
            title = unicode_type(self.oeb.metadata.title[0])
        except:
            title = _('Unknown')

        try:
            authors = list(map(unicode_type, self.oeb.metadata.creator))
        except:
            authors = [_('Unknown')]

        root = render_jacket(mi, self.opts.output_profile,
                alt_title=title, alt_tags=tags, alt_authors=authors,
                alt_comments=comments, rescale_fonts=True)
        id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')

        jacket = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root)
        # linear=True: the jacket is part of the main reading order
        self.oeb.spine.insert(0, jacket, True)
        self.oeb.inserted_metadata_jacket = jacket
        # Copy file:// images referenced by the template into the book
        for img, path in referenced_images(root):
            self.oeb.log('Embedding referenced image %s into jacket' % path)
            ext = path.rpartition('.')[-1].lower()
            item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext)
            with open(path, 'rb') as f:
                item = self.oeb.manifest.add(item_id, href, guess_type(href)[0], data=f.read())
            item.unload_data_from_memory()
            img.set('src', jacket.relhref(item.href))

    def remove_existing_jacket(self):
        # A previous jacket, if any, lives in the first few spine items
        for x in self.oeb.spine[:4]:
            if XPath(JACKET_XPATH)(x.data):
                self.remove_images(x, limit=sys.maxsize)
                self.oeb.manifest.remove(x)
                self.log('Removed existing jacket')
                break

    def __call__(self, oeb, opts, metadata):
        '''
        Add metadata in jacket.xhtml if specified in opts
        If not specified, remove previous jacket instance
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.remove_existing_jacket()
        if opts.insert_metadata:
            self.insert_metadata(metadata)
|
||||
|
||||
# Render Jacket {{{
|
||||
|
||||
|
||||
def get_rating(rating, rchar, e_rchar):
    """Render a 0-10 ``rating`` as a five-position star string.

    ``rchar`` fills the earned stars and ``e_rchar`` the remainder.
    Returns '' for missing/unparseable ratings or ratings below one star.
    """
    ans = ''
    try:
        num = float(rating)/2
    except (ValueError, TypeError):
        # Narrowed from a bare except: float() raises only these for
        # None / non-numeric input
        return ans
    num = max(0, num)
    num = min(num, 5)
    if num < 1:
        return ans

    ans = ("%s%s") % (rchar * int(num), e_rchar * (5 - int(num)))
    return ans
|
||||
|
||||
|
||||
class Series(unicode_type):
    """A string subclass holding pre-escaped series info for the jacket
    template. The string value is '<index> of <em><series></em>'; the
    attributes roman, name, number and roman_number expose the parts."""

    def __new__(self, series, series_index):
        if series and series_index is not None:
            roman = _('{1} of <em>{0}</em>').format(
                escape(series), escape(fmt_sidx(series_index, use_roman=True)))
            combined = _('{1} of <em>{0}</em>').format(
                escape(series), escape(fmt_sidx(series_index, use_roman=False)))
        else:
            combined = roman = escape(series or u'')
        s = unicode_type.__new__(self, combined)
        # Attributes for template fields: {series.roman}, {series.name}, ...
        s.roman = roman
        s.name = escape(series or '')
        s.number = escape(fmt_sidx(series_index or 1.0, use_roman=False))
        s.roman_number = escape(fmt_sidx(series_index or 1.0, use_roman=True))
        return s
|
||||
|
||||
|
||||
class Tags(unicode_type):
    """A comma-joined, HTML-escaped tag list for the jacket template, with
    an ``alphabetical`` variant and the raw ``tags_list``.
    ``output_profile`` is accepted for interface compatibility but unused."""

    def __new__(self, tags, output_profile):
        escaped = [escape(t) for t in (tags or ())]
        inst = unicode_type.__new__(self, ', '.join(escaped))
        inst.alphabetical = ', '.join(sorted(escaped, key=sort_key))
        inst.tags_list = escaped
        return inst
|
||||
|
||||
|
||||
def postprocess_jacket(root, output_profile, has_data):
    # Post-process the generated html to strip out empty header items

    def extract(tag):
        # Remove ``tag`` from the tree while preserving its tail text by
        # merging it into the parent's text (first child) or a sibling's tail.
        parent = tag.getparent()
        idx = parent.index(tag)
        parent.remove(tag)
        if tag.tail:
            if idx == 0:
                parent.text = (parent.text or '') + tag.tail
            else:
                if idx >= len(parent):
                    idx = -1
                # FIX: use the clamped ``idx`` here. The previous code always
                # indexed parent[-1], which made the clamp above a dead store
                # and attached tail text to the wrong sibling whenever the
                # removed element was not the last child.
                parent[idx].tail = (parent[idx].tail or '') + tag.tail

    def extract_class(cls):
        for tag in root.xpath('//*[@class="_"]'.replace('_', cls)):
            extract(tag)

    # Drop sections whose corresponding metadata is absent
    for key in 'series rating tags'.split():
        if not has_data[key]:
            extract_class('cbj_' + key)
    if not has_data['pubdate']:
        extract_class('cbj_pubdata')
    # The banner rule is only meaningful on Kindle output
    if output_profile.short_name != 'kindle':
        extract_class('cbj_kindle_banner_hr')
|
||||
|
||||
|
||||
def render_jacket(mi, output_profile,
        alt_title=_('Unknown'), alt_tags=[], alt_comments='',
        alt_publisher='', rescale_fonts=False, alt_authors=None):
    """Render the metadata jacket page for ``mi`` and return it as a parsed
    HTML tree.

    The alt_* parameters supply fallbacks for fields missing from ``mi``.
    NOTE(review): ``alt_tags=[]`` is a mutable default; it is only read
    here (never mutated), so it is benign — confirm before changing.
    """
    css = P('jacket/stylesheet.css', data=True).decode('utf-8')
    template = P('jacket/template.xhtml', data=True).decode('utf-8')

    # Strip comments from the template/CSS before formatting
    template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)

    try:
        title_str = alt_title if mi.is_null('title') else mi.title
    except:
        title_str = _('Unknown')
    title_str = escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
    except:
        publisher = ''
    publisher = escape(publisher)

    try:
        if is_date_undefined(mi.pubdate):
            pubdate = ''
        else:
            # Only the year is shown on the jacket
            dt = as_local_time(mi.pubdate)
            pubdate = strftime('%Y', dt.timetuple())
    except:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

    comments = mi.comments if mi.comments else alt_comments
    comments = comments.strip()
    if comments:
        comments = comments_to_html(comments)

    # Temporarily substitute fallback authors, then restore
    orig = mi.authors
    if mi.is_null('authors'):
        mi.authors = list(alt_authors or (_('Unknown'),))
    try:
        author = mi.format_authors()
    except:
        author = ''
    mi.authors = orig
    author = escape(author)
    # Filled in by generate_html; consumed by postprocess_jacket
    has_data = {}

    def generate_html(comments):
        # Build the template argument dict, including custom columns
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer='',
                    searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                    )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                # '#col' becomes the template key '_col'
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    display = m.get('display', {})
                    ctype = display.get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey+'_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)
        has_data['series'] = bool(series)
        has_data['tags'] = bool(tags)
        has_data['rating'] = bool(rating)
        has_data['pubdate'] = bool(pubdate)

        return strip_encoding_declarations(generated_html)

    from calibre.ebooks.oeb.polish.parsing import parse
    raw = generate_html(comments)
    root = parse(raw, line_numbers=False, force_html5_parse=True)

    if rescale_fonts:
        # We ensure that the conversion pipeline will set the font sizes for
        # text in the jacket to the same size as the font sizes for the rest of
        # the text in the book. That means that as long as the jacket uses
        # relative font sizes (em or %), the post conversion font size will be
        # the same as for text in the main book. So text with size x em will
        # be rescaled to the same value in both the jacket and the main content.
        #
        # We cannot use data-calibre-rescale 100 on the body tag as that will just
        # give the body tag a font size of 1em, which is useless.
        for body in root.xpath('//*[local-name()="body"]'):
            fw = body.makeelement(XHTML('div'))
            fw.set('data-calibre-rescale', '100')
            for child in body:
                fw.append(child)
            body.append(fw)
    postprocess_jacket(root, output_profile, has_data)
    from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
    pretty_html_tree(None, root)
    return root
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
def linearize_jacket(oeb):
    """Flatten table markup in the book jacket page, if one is present.

    Looks through the first four spine items for a jacket page (matched
    by JACKET_XPATH). On the first match, its table/tr/th elements are
    turned into <div>s and its td cells into <span>s, then the scan
    stops — at most one spine item is modified.
    """
    is_jacket = XPath(JACKET_XPATH)
    block_elems = XPath('//h:table|//h:tr|//h:th')
    cell_elems = XPath('//h:td')
    for page in oeb.spine[:4]:
        root = page.data
        if not is_jacket(root):
            continue
        for elem in block_elems(root):
            elem.tag = XHTML('div')
        for elem in cell_elems(root):
            elem.tag = XHTML('span')
        break
|
||||
|
||||
|
||||
def referenced_images(root):
    """Yield (img_element, local_path) for every <img> in *root* whose
    src attribute is a file:// URL that resolves to an existing file.

    On Windows, the spurious leading slash from the file:///C:/... URL
    form is stripped before the existence check.
    """
    prefix = 'file://'
    for img in XPath('//h:img[@src]')(root):
        src = img.get('src')
        if not src.startswith(prefix):
            continue
        path = src[len(prefix):]
        if iswindows and path.startswith('/'):
            path = path[1:]
        if os.path.exists(path):
            yield img, path
|
||||
218
ebook_converter/ebooks/oeb/transforms/metadata.py
Normal file
218
ebook_converter/ebooks/oeb/transforms/metadata.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re
|
||||
from calibre.utils.date import isoformat, now
|
||||
from calibre import guess_type
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
|
||||
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
    """Copy metadata from the Metadata object *mi* into the OEB metadata
    collection *m*.

    For most fields: if the field is set ("not null") on *mi*, the
    corresponding entries in *m* are cleared and replaced. If it is null,
    *m* is normally left untouched, unless ``override_input_metadata`` is
    True, in which case several fields (book producer, description,
    publisher, series, series_index, rating, subject, non-user ISBN) are
    cleared from *m* as well so that *mi* fully supersedes the input
    book's metadata.

    :param mi: source calibre Metadata object
    :param m: destination OEB metadata object (mutated in place)
    :param log: accepted for interface compatibility; not used in this body
    :param override_input_metadata: when True, null fields on *mi* clear
        the corresponding fields in *m* instead of preserving them
    """
    from calibre.ebooks.oeb.base import OPF
    if not mi.is_null('title'):
        m.clear('title')
        m.add('title', mi.title)
    if mi.title_sort:
        # Ensure there is at least one title entry before adding title_sort
        if not m.title:
            m.add('title', mi.title_sort)
        m.clear('title_sort')
        m.add('title_sort', mi.title_sort)
    if not mi.is_null('authors'):
        # Drop existing authors (role 'aut' or empty) before adding new ones
        m.filter('creator', lambda x : x.role.lower() in ['aut', ''])
        for a in mi.authors:
            attrib = {'role':'aut'}
            if mi.author_sort:
                # OPF file-as attribute carries the author sort string
                attrib[OPF('file-as')] = mi.author_sort
            m.add('creator', a, attrib=attrib)
    if not mi.is_null('book_producer'):
        m.filter('contributor', lambda x : x.role.lower() == 'bkp')
        m.add('contributor', mi.book_producer, role='bkp')
    elif override_input_metadata:
        m.filter('contributor', lambda x : x.role.lower() == 'bkp')
    if not mi.is_null('comments'):
        m.clear('description')
        m.add('description', mi.comments)
    elif override_input_metadata:
        m.clear('description')
    if not mi.is_null('publisher'):
        m.clear('publisher')
        m.add('publisher', mi.publisher)
    elif override_input_metadata:
        m.clear('publisher')
    if not mi.is_null('series'):
        m.clear('series')
        m.add('series', mi.series)
    elif override_input_metadata:
        m.clear('series')
    identifiers = mi.get_identifiers()
    set_isbn = False
    for typ, val in iteritems(identifiers):
        has = False
        if typ.lower() == 'isbn':
            set_isbn = True
        # Update in place any identifier with a matching scheme; otherwise
        # append a new one with the scheme upper-cased.
        for x in m.identifier:
            if x.scheme.lower() == typ.lower():
                x.content = val
                has = True
        if not has:
            m.add('identifier', val, scheme=typ.upper())
    if override_input_metadata and not set_isbn:
        # mi carried no ISBN: purge any stale ISBN from the input metadata
        m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
    if not mi.is_null('languages'):
        m.clear('language')
        for lang in mi.languages:
            # 'und' (undetermined) and empty codes are not worth recording
            if lang and lang.lower() not in ('und', ''):
                m.add('language', lang)
    if not mi.is_null('series_index'):
        m.clear('series_index')
        m.add('series_index', mi.format_series_index())
    elif override_input_metadata:
        m.clear('series_index')
    if not mi.is_null('rating'):
        m.clear('rating')
        m.add('rating', '%.2f'%mi.rating)
    elif override_input_metadata:
        m.clear('rating')
    if not mi.is_null('tags'):
        m.clear('subject')
        for t in mi.tags:
            m.add('subject', t)
    elif override_input_metadata:
        m.clear('subject')
    if not mi.is_null('pubdate'):
        m.clear('date')
        m.add('date', isoformat(mi.pubdate))
    if not mi.is_null('timestamp'):
        m.clear('timestamp')
        m.add('timestamp', isoformat(mi.timestamp))
    if not mi.is_null('rights'):
        m.clear('rights')
        m.add('rights', mi.rights)
    if not mi.is_null('publication_type'):
        m.clear('publication_type')
        m.add('publication_type', mi.publication_type)

    # Guarantee that the output always carries a timestamp
    if not m.timestamp:
        m.add('timestamp', isoformat(now()))
|
||||
|
||||
|
||||
class MergeMetadata(object):
    'Merge in user metadata, including cover'

    def __call__(self, oeb, mi, opts, override_input_metadata=False):
        # Entry point: merge the user-supplied Metadata object `mi` into the
        # OEB book's metadata, then reconcile the cover image/page.
        self.oeb, self.log = oeb, oeb.log
        m = self.oeb.metadata
        self.log('Merging user specified metadata...')
        meta_info_to_oeb_metadata(mi, m, oeb.log,
                override_input_metadata=override_input_metadata)
        cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
        # The cover metadata entry is rebuilt from scratch each time
        m.clear('cover')
        if cover_id is not None:
            m.add('cover', cover_id)
        if mi.uuid is not None:
            # Replace any previous uuid identifier and make it the book uid
            m.filter('identifier', lambda x:x.id=='uuid_id')
            self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
                                  scheme='uuid')
            self.oeb.uid = self.oeb.metadata.identifier[-1]
        if mi.application_id is not None:
            # Only one calibre-scheme identifier should remain
            m.filter('identifier', lambda x:x.scheme=='calibre')
            self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre')

    def set_cover(self, mi, prefer_metadata_cover):
        """Install the cover image from *mi* into the manifest/guide.

        Returns the manifest id of the cover item, or None when no cover
        could be determined. With prefer_metadata_cover=True an existing
        cover in the input book wins over the one supplied in *mi*.
        """
        # cdata holds the raw cover bytes; empty means "no new cover"
        cdata, ext = b'', 'jpg'
        if mi.cover and os.access(mi.cover, os.R_OK):
            with open(mi.cover, 'rb') as f:
                cdata = f.read()
            ext = mi.cover.rpartition('.')[-1].lower().strip()
        elif mi.cover_data and mi.cover_data[-1]:
            # cover_data is a (format, bytes) pair
            cdata = mi.cover_data[1]
            ext = mi.cover_data[0]
        if ext not in ('png', 'jpg', 'jpeg'):
            ext = 'jpg'
        id = old_cover = None
        if 'cover' in self.oeb.guide:
            old_cover = self.oeb.guide['cover']
        if prefer_metadata_cover and old_cover is not None:
            # Discard the new cover bytes: the input book's cover wins
            cdata = b''
        if cdata:
            self.oeb.guide.remove('cover')
            self.oeb.guide.remove('titlepage')
        elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None:
            # The amazon formats dont support html cover pages, so remove them
            # even if no cover was specified.
            self.oeb.guide.remove('titlepage')
        do_remove_old_cover = False
        if old_cover is not None:
            if old_cover.href in self.oeb.manifest.hrefs:
                # `item` (the old cover manifest entry) is used below only
                # when do_remove_old_cover is True, so it is always bound then.
                item = self.oeb.manifest.hrefs[old_cover.href]
                if not cdata:
                    # Keep the existing cover as-is
                    return item.id
                do_remove_old_cover = True
            elif not cdata:
                # Guide points at a cover missing from the manifest: register it
                id = self.oeb.manifest.generate(id='cover')[0]
                self.oeb.manifest.add(id, old_cover.href, 'image/jpeg')
                return id
        new_cover_item = None
        if cdata:
            id, href = self.oeb.manifest.generate('cover', 'cover.'+ext)
            new_cover_item = self.oeb.manifest.add(id, href, guess_type('cover.'+ext)[0], data=cdata)
            self.oeb.guide.add('cover', 'Cover', href)
        if do_remove_old_cover:
            self.remove_old_cover(item, new_cover_item.href)
        return id

    def remove_old_cover(self, cover_item, new_cover_href=None):
        """Delete *cover_item* from the manifest and repoint (or drop) all
        HTML references to it; spine pages that become empty wrappers
        around the cover are removed entirely."""
        from calibre.ebooks.oeb.base import XPath, XLINK
        from lxml import etree

        self.oeb.manifest.remove(cover_item)

        # Remove any references to the cover in the HTML
        affected_items = set()
        xp = XPath('//h:img[@src]|//svg:image[@xl:href]')
        for i, item in enumerate(self.oeb.spine):
            try:
                images = xp(item.data)
            except Exception:
                images = ()
            removed = False
            for img in images:
                # <img> uses src; <svg:image> uses xlink:href
                href = img.get('src') or img.get(XLINK('href'))
                try:
                    href = item.abshref(href)
                except Exception:
                    continue  # Invalid URL, ignore
                if href == cover_item.href:
                    if new_cover_href is not None:
                        # Repoint the reference at the new cover
                        replacement_href = item.relhref(new_cover_href)
                        attr = 'src' if img.tag.endswith('img') else XLINK('href')
                        img.set(attr, replacement_href)
                    else:
                        # No replacement: drop the element (and its parent
                        # <svg> wrapper, when present)
                        p = img.getparent()
                        if p.tag.endswith('}svg'):
                            p.getparent().remove(p)
                        else:
                            p.remove(img)
                    removed = True
            if removed:
                affected_items.add(item)

        # Check if the resulting HTML has no content, if so remove it
        for item in affected_items:
            body = XPath('//h:body')(item.data)
            if body:
                text = etree.tostring(body[0], method='text', encoding='unicode')
            else:
                text = ''
            text = re.sub(r'\s+', '', text)
            if not text and not XPath('//h:img|//svg:svg')(item.data):
                self.log('Removing %s as it is a wrapper around'
                        ' the cover image'%item.href)
                self.oeb.spine.remove(item)
                self.oeb.manifest.remove(item)
                self.oeb.guide.remove_by_href(item.href)
|
||||
189
ebook_converter/ebooks/oeb/transforms/page_margin.py
Normal file
189
ebook_converter/ebooks/oeb/transforms/page_margin.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import numbers
|
||||
from collections import Counter
|
||||
|
||||
from calibre.ebooks.oeb.base import barename, XPath
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
|
||||
class RemoveAdobeMargins(object):
    '''
    Remove margins specified in Adobe's page templates.
    '''

    # Manifest media types under which Adobe page templates appear.
    PAGE_TEMPLATE_TYPES = frozenset((
        'application/vnd.adobe-page-template+xml',
        'application/vnd.adobe.page-template+xml',
        'application/adobe-page-template+xml',
        'application/adobe.page-template+xml',
    ))

    def __call__(self, oeb, log, opts):
        """Strip every margin-* attribute from each Adobe page template
        found in the manifest of *oeb*."""
        self.oeb, self.opts, self.log = oeb, opts, log

        for item in self.oeb.manifest:
            is_template = item.media_type in self.PAGE_TEMPLATE_TYPES
            if not (is_template and hasattr(item.data, 'xpath')):
                continue
            self.log('Removing page margins specified in the'
                     ' Adobe page template')
            query = ('//*[@margin-bottom or @margin-top '
                     'or @margin-left or @margin-right]')
            for elem in item.data.xpath(query):
                for side in ('left', 'right', 'top', 'bottom'):
                    elem.attrib.pop('margin-' + side, None)
|
||||
|
||||
|
||||
class NegativeTextIndent(Exception):
    """Raised by RemoveFakeMargins.get_margins() when a class style uses a
    negative text-indent; the caller then skips margin removal for the
    whole level."""
    pass
|
||||
|
||||
|
||||
class RemoveFakeMargins(object):

    '''
    Remove left and right margins from paragraph/divs if the same margin is specified
    on almost all the elements at that level.

    Must be called only after CSS flattening
    '''

    def __call__(self, oeb, log, opts):
        # No-op unless the user asked for fake-margin removal
        if not opts.remove_fake_margins:
            return
        self.oeb, self.log, self.opts = oeb, log, opts
        stylesheet = None
        # levels: {'tag_depth': [elements]}; stats: per-level margin Counters;
        # selector_map: CSS selector text -> style declaration
        self.levels = {}
        self.stats = {}
        self.selector_map = {}

        stylesheet = self.oeb.manifest.main_stylesheet
        if stylesheet is None:
            return

        self.log('Removing fake margins...')

        stylesheet = stylesheet.data

        from css_parser.css import CSSRule
        for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            self.selector_map[rule.selectorList.selectorText] = rule.style

        self.find_levels()

        for level in self.levels:
            try:
                self.process_level(level)
            except NegativeTextIndent:
                # A negative text-indent usually means a hanging-indent
                # layout; leave this level's margins alone.
                self.log.debug('Negative text indent detected at level '
                        ' %s, ignoring this level'%level)

    def get_margins(self, elem):
        """Return (marginLeft, marginRight, style) for *elem*'s class style,
        or ('', '', None) when no class/style applies.

        Raises NegativeTextIndent if the style's text-indent is negative
        (string starting with '-' or a negative number).
        """
        cls = elem.get('class', None)
        if cls:
            style = self.selector_map.get('.'+cls, None)
            if style:
                try:
                    ti = style['text-indent']
                except:
                    pass
                else:
                    if ((hasattr(ti, 'startswith') and ti.startswith('-')) or
                            isinstance(ti, numbers.Number) and ti < 0):
                        raise NegativeTextIndent()
                return style.marginLeft, style.marginRight, style
        return '', '', None

    def process_level(self, level):
        """Tally left/right margins across all elements at *level* and, when
        one margin value dominates (>95%), strip it from those elements."""
        elems = self.levels[level]
        self.stats[level+'_left'] = Counter()
        self.stats[level+'_right'] = Counter()

        for elem in elems:
            lm, rm = self.get_margins(elem)[:2]
            self.stats[level+'_left'][lm] += 1
            self.stats[level+'_right'][rm] += 1

        self.log.debug(level, ' left margin stats:', self.stats[level+'_left'])
        self.log.debug(level, ' right margin stats:', self.stats[level+'_right'])

        remove_left = self.analyze_stats(self.stats[level+'_left'])
        remove_right = self.analyze_stats(self.stats[level+'_right'])

        # mcl/mcr are only bound when the corresponding flag is True, and
        # are only read below under that same flag.
        if remove_left:
            mcl = self.stats[level+'_left'].most_common(1)[0][0]
            self.log('Removing level %s left margin of:'%level, mcl)

        if remove_right:
            mcr = self.stats[level+'_right'].most_common(1)[0][0]
            self.log('Removing level %s right margin of:'%level, mcr)

        if remove_left or remove_right:
            for elem in elems:
                lm, rm, style = self.get_margins(elem)
                if remove_left and lm == mcl:
                    style.removeProperty('margin-left')
                if remove_right and rm == mcr:
                    style.removeProperty('margin-right')

    def find_levels(self):
        """Group every <p>/<div> in the spine by 'tag_depth' (depth relative
        to <body>), then discard groups too small to be meaningful."""

        def level_of(elem, body):
            # Depth of elem below body: direct child == 1
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        paras = XPath('descendant::h:p|descendant::h:div')

        for item in self.oeb.spine:
            body = XPath('//h:body')(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = level_of(p, body)
                # Key example: 'p_2' == <p> two levels below <body>
                level = '%s_%d'%(barename(p.tag), level)
                if level not in self.levels:
                    self.levels[level] = []
                self.levels[level].append(p)

        remove = set()
        for k, v in iteritems(self.levels):
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            level = int(k.split('_')[-1])
            tag = k.split('_')[0]
            # Small groups (< 25) are not statistically useful
            if tag == 'p' and num < 25:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < 25:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have many child paras
                    for elem in list(v):
                        children = len(paras(elem))
                        if children < 5:
                            v.remove(elem)

        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)

    def analyze_stats(self, stats):
        """Return True when one margin value accounts for > 95% of the
        elements in *stats* (and is not empty or '0')."""
        if not stats:
            return False
        mc = stats.most_common(1)
        # NOTE(review): most_common(1) returns at most one entry, so this
        # guard can never fire; possibly most_common(2) was intended to
        # detect ties — confirm against upstream before changing.
        if len(mc) > 1:
            return False
        mc = mc[0]
        most_common, most_common_count = mc
        if not most_common or most_common == '0':
            return False
        total = sum(stats.values())
        # True if greater than 95% of elements have the same margin
        return most_common_count/total > 0.95
|
||||
324
ebook_converter/ebooks/oeb/transforms/structure.py
Normal file
324
ebook_converter/ebooks/oeb/transforms/structure.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, uuid
|
||||
|
||||
from lxml import etree
|
||||
from collections import OrderedDict, Counter
|
||||
|
||||
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
|
||||
from calibre.ebooks import ConversionError
|
||||
from polyglot.builtins import itervalues, unicode_type
|
||||
from polyglot.urllib import urlparse
|
||||
|
||||
|
||||
def XPath(x):
    """Compile *x* into an lxml XPath evaluator using the OEB namespace
    map (XPNSMAP).

    Raises ConversionError, rather than the raw lxml exception, when the
    expression does not parse.
    """
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        raise ConversionError(
            'The syntax of the XPath expression %s is invalid.' % repr(x))
    return compiled
|
||||
|
||||
|
||||
def isspace(x):
    """Return True when *x* is empty/None or consists only of whitespace.

    Non-breaking spaces (U+00A0) are stripped before the check, so a
    string of nothing but NBSPs is NOT considered whitespace (an empty
    string is not str.isspace()).
    """
    if not x:
        return True
    return x.replace('\xa0', '').isspace()
|
||||
|
||||
|
||||
def at_start(elem):
    ' Return True if there is no content before elem '
    body = XPath('ancestor-or-self::h:body')(elem)
    if not body:
        # Not inside a body at all; treat as being at the start
        return True
    body = body[0]
    ancestors = frozenset(XPath('ancestor::*')(elem))
    # Walk the body in document order until elem is reached; any visible
    # content encountered first means elem is not at the start.
    for x in body.iter():
        if x is elem:
            return True
        # img/svg count as content regardless of text
        if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
            return False
        # Whitespace-only text is ignorable; for non-ancestors the tail
        # must be whitespace too (an ancestor's tail comes after elem).
        if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
            continue
        return False
    # elem was never reached while iterating body
    return False
|
||||
|
||||
|
||||
class DetectStructure(object):
    """Detect chapter boundaries in the spine and build a table of
    contents from XPath expressions supplied via conversion options."""

    def __call__(self, oeb, opts):
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            # Try, in order: level-based TOC, chapter-based TOC, link-based
            # TOC; fall back to the original TOC if the result is too small.
            orig_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
                self.oeb.toc = orig_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                        self.oeb.toc.count())

        if opts.toc_filter is not None:
            # Drop TOC nodes whose titles match the filter regex (or are empty)
            regexp = re.compile(opts.toc_filter)
            for node in list(self.oeb.toc.iter()):
                if not node.title or regexp.search(node.title) is not None:
                    self.log('Filtering', node.title if node.title else
                            'empty node', 'from TOC')
                    self.oeb.toc.remove(node)

        if opts.page_breaks_before is not None:
            # Force page-break-before on every element matching the option
            pb_xpath = XPath(opts.page_breaks_before)
            for item in oeb.spine:
                for elem in pb_xpath(item.data):
                    try:
                        prev = next(elem.itersiblings(tag=etree.Element,
                                preceding=True))
                        if (barename(elem.tag) in {'h1', 'h2'} and barename(
                                prev.tag) in {'h1', 'h2'} and (not prev.tail or
                                not prev.tail.split())):
                            # We have two adjacent headings, do not put a page
                            # break on the second one
                            continue
                    except StopIteration:
                        pass

                    style = elem.get('style', '')
                    if style:
                        style += '; '
                    elem.set('style', style+'page-break-before:always')

        for node in self.oeb.toc.iter():
            if not node.title or not node.title.strip():
                # _ is the (globally installed) translation function
                node.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()

    def detect_start_reading(self):
        """Set the guide 'text' (start reading) reference to the first
        element matching the start_reading_at XPath option."""
        expr = self.opts.start_reading_at
        try:
            expr = XPath(expr)
        except:
            self.log.warn(
                'Invalid start reading at XPath expression, ignoring: %s'%expr)
            return
        for item in self.oeb.spine:
            if not hasattr(item.data, 'xpath'):
                continue
            matches = expr(item.data)
            if matches:
                elem = matches[0]
                eid = elem.get('id', None)
                if not eid:
                    # Give the element a unique id so it can be linked to
                    eid = 'start_reading_at_'+unicode_type(uuid.uuid4()).replace('-', '')
                    elem.set('id', eid)
                if 'text' in self.oeb.guide:
                    self.oeb.guide.remove('text')
                self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
                self.log('Setting start reading at position to %s in %s'%(
                    self.opts.start_reading_at, item.href))
                return
        self.log.warn("Failed to find start reading at position: %s"%
                self.opts.start_reading_at)

    def get_toc_parts_for_xpath(self, expr):
        # if an attribute is selected by the xpath expr then truncate it
        # from the path and instead return it as where to find the title text
        title_attribute_regex = re.compile(r'/@([-\w]+)$')
        match = title_attribute_regex.search(expr)
        if match is not None:
            return expr[0:match.start()], match.group(1)

        return expr, None

    def detect_chapters(self):
        """Populate self.detected_chapters with (spine_item, element) pairs
        matching opts.chapter, and insert chapter marks (hr / page break)
        per opts.chapter_mark."""
        self.detected_chapters = []
        self.chapter_title_attribute = None

        def find_matches(expr, doc):
            # Evaluate the XPath, tolerating bad expressions/results
            try:
                ans = XPath(expr)(doc)
                len(ans)
                return ans
            except:
                self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
                return []

        if self.opts.chapter:
            chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
            self.chapter_title_attribute = title_attribute
            for item in self.oeb.spine:
                for x in find_matches(chapter_path, item.data):
                    self.detected_chapters.append((item, x))

            chapter_mark = self.opts.chapter_mark
            page_break_before = 'display: block; page-break-before: always'
            page_break_after = 'display: block; page-break-after: always'
            # c counts chapters seen per spine item (used by the at_start check)
            c = Counter()
            for item, elem in self.detected_chapters:
                c[item] += 1
                text = xml2text(elem).strip()
                text = re.sub(r'\s+', ' ', text.strip())
                self.log('\tDetected chapter:', text[:50])
                if chapter_mark == 'none':
                    continue
                if chapter_mark == 'rule':
                    mark = elem.makeelement(XHTML('hr'))
                elif chapter_mark == 'pagebreak':
                    if c[item] < 3 and at_start(elem):
                        # For the first two elements in this item, check if they
                        # are at the start of the file, in which case inserting a
                        # page break in unnecessary and can lead to extra blank
                        # pages in the PDF Output plugin. We need to use two as
                        # feedbooks epubs match both a heading tag and its
                        # containing div with the default chapter expression.
                        continue
                    mark = elem.makeelement(XHTML('div'), style=page_break_after)
                else:  # chapter_mark == 'both':
                    mark = elem.makeelement(XHTML('hr'), style=page_break_before)
                try:
                    elem.addprevious(mark)
                except TypeError:
                    self.log.exception('Failed to mark chapter')

    def create_level_based_toc(self):
        # Only build a leveled TOC when at least level1 is configured
        if self.opts.level1_toc is not None:
            self.add_leveled_toc_items()

    def create_toc_from_chapters(self):
        counter = self.oeb.toc.next_play_order()
        for item, elem in self.detected_chapters:
            text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
            self.oeb.toc.add(text, href, play_order=counter)
            counter += 1

    def create_toc_from_links(self):
        """Build TOC entries from in-book <a href> links, up to
        opts.max_toc_links entries."""
        num = 0
        for item in self.oeb.spine:
            for a in XPath('//h:a[@href]')(item.data):
                href = a.get('href')
                try:
                    purl = urlparse(href)
                except ValueError:
                    self.log.warning('Ignoring malformed URL:', href)
                    continue
                # Only local (no-scheme or file:) links qualify
                if not purl[0] or purl[0] == 'file':
                    href, frag = purl.path, purl.fragment
                    href = item.abshref(href)
                    if frag:
                        href = '#'.join((href, frag))
                    if not self.oeb.toc.has_href(href):
                        text = xml2text(a)
                        text = text[:100].strip()
                        if (not self.opts.duplicate_links_in_toc and
                                self.oeb.toc.has_text(text)):
                            continue
                        try:
                            self.oeb.toc.add(text, href,
                                play_order=self.oeb.toc.next_play_order())
                            num += 1
                        except ValueError:
                            self.oeb.log.exception('Failed to process link: %r' % href)
                            continue  # Most likely an incorrectly URL encoded link
                        if self.opts.max_toc_links > 0 and \
                                num >= self.opts.max_toc_links:
                            self.log('Maximum TOC links reached, stopping.')
                            return

    def elem_to_link(self, item, elem, title_attribute, counter):
        """Return (title_text, href) for a TOC entry pointing at *elem*,
        assigning the element an id when it lacks one."""
        text = ''
        if title_attribute is not None:
            text = elem.get(title_attribute, '')
        # Fallback chain for the title: element text, title attr, alt attr
        if not text:
            text = xml2text(elem).strip()
        if not text:
            text = elem.get('title', '')
        if not text:
            text = elem.get('alt', '')
        text = re.sub(r'\s+', ' ', text.strip())
        text = text[:1000].strip()
        id = elem.get('id', 'calibre_toc_%d'%counter)
        elem.set('id', id)
        href = '#'.join((item.href, id))
        return text, href

    def add_leveled_toc_items(self):
        """Build a (up to) three-level TOC from the level1/2/3_toc XPath
        options. Level2/3 entries are attached to the nearest preceding
        level1/2 node (possibly from an earlier spine document)."""
        added = OrderedDict()   # level1 element -> TOC node
        added2 = OrderedDict()  # level2 element -> TOC node
        counter = 1

        def find_matches(expr, doc):
            try:
                ans = XPath(expr)(doc)
                len(ans)
                return ans
            except:
                self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
                return []

        for document in self.oeb.spine:
            # Parents carried over from previous documents, for matches
            # that occur before any level1/2 match in this document
            previous_level1 = list(itervalues(added))[-1] if added else None
            previous_level2 = list(itervalues(added2))[-1] if added2 else None

            level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
            for elem in find_matches(level1_toc, document.data):
                text, _href = self.elem_to_link(document, elem, level1_title, counter)
                counter += 1
                if text:
                    node = self.oeb.toc.add(text, _href,
                            play_order=self.oeb.toc.next_play_order())
                    added[elem] = node
                    # node.add(_('Top'), _href)

            if self.opts.level2_toc is not None and added:
                level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
                for elem in find_matches(level2_toc, document.data):
                    level1 = None
                    # Walk the document to find the level1 node preceding elem
                    for item in document.data.iterdescendants():
                        if item in added:
                            level1 = added[item]
                        elif item == elem:
                            if level1 is None:
                                if previous_level1 is None:
                                    break
                                level1 = previous_level1
                            text, _href = self.elem_to_link(document, elem, level2_title, counter)
                            counter += 1
                            if text:
                                added2[elem] = level1.add(text, _href,
                                    play_order=self.oeb.toc.next_play_order())
                            break

                if self.opts.level3_toc is not None and added2:
                    level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
                    for elem in find_matches(level3_toc, document.data):
                        level2 = None
                        for item in document.data.iterdescendants():
                            if item in added2:
                                level2 = added2[item]
                            elif item == elem:
                                if level2 is None:
                                    if previous_level2 is None:
                                        break
                                    level2 = previous_level2
                                text, _href = \
                                    self.elem_to_link(document, elem, level3_title, counter)
                                counter += 1
                                if text:
                                    level2.add(text, _href,
                                        play_order=self.oeb.toc.next_play_order())
                                break
|
||||
73
ebook_converter/ebooks/oeb/transforms/trimmanifest.py
Normal file
73
ebook_converter/ebooks/oeb/transforms/trimmanifest.py
Normal file
@@ -0,0 +1,73 @@
|
||||
'''
|
||||
OPF manifest trimming transform.
|
||||
'''
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
|
||||
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
|
||||
from polyglot.urllib import urldefrag
|
||||
|
||||
|
||||
class ManifestTrimmer(object):
    # Removes manifest items that are unreachable from the metadata, guide
    # and spine via link traversal.

    @classmethod
    def config(cls, cfg):
        # This transform adds no configuration options
        return cfg

    @classmethod
    def generate(cls, opts):
        # Factory used by the plumber; options are not needed here
        return cls()

    def __call__(self, oeb, context):
        """Compute the set of manifest items reachable from metadata,
        guide and spine (following HTML and CSS links transitively), then
        remove everything else from the manifest."""
        import css_parser
        oeb.logger.info('Trimming unused files from manifest...')
        self.opts = context
        used = set()
        # Seed: items referenced by metadata values (by href or by id)
        for term in oeb.metadata:
            for item in oeb.metadata[term]:
                if item.value in oeb.manifest.hrefs:
                    used.add(oeb.manifest.hrefs[item.value])
                elif item.value in oeb.manifest.ids:
                    used.add(oeb.manifest.ids[item.value])
        # Seed: items referenced by guide entries (fragment stripped)
        for ref in oeb.guide.values():
            path, _ = urldefrag(ref.href)
            if path in oeb.manifest.hrefs:
                used.add(oeb.manifest.hrefs[path])
        # TOC items are required to be in the spine
        for item in oeb.spine:
            used.add(item)
        # Fixpoint: follow links out of each newly reached item until no
        # new items are discovered.
        unchecked = used
        while unchecked:
            new = set()
            for item in unchecked:
                if (item.media_type in OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')) and \
                        item.data is not None:
                    # XML/HTML documents: follow every link target
                    hrefs = [r[2] for r in iterlinks(item.data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        try:
                            href = item.abshref(urlnormalize(href))
                        except:
                            continue
                        if href in oeb.manifest.hrefs:
                            found = oeb.manifest.hrefs[href]
                            if found not in used:
                                new.add(found)
                elif item.media_type == CSS_MIME:
                    # Stylesheets: follow url(...) references
                    for href in css_parser.getUrls(item.data):
                        href = item.abshref(urlnormalize(href))
                        if href in oeb.manifest.hrefs:
                            found = oeb.manifest.hrefs[href]
                            if found not in used:
                                new.add(found)
            used.update(new)
            unchecked = new
        for item in oeb.manifest.values():
            if item not in used:
                oeb.logger.info('Trimming %r from manifest' % item.href)
                oeb.manifest.remove(item)
|
||||
78
ebook_converter/ebooks/oeb/writer.py
Normal file
78
ebook_converter/ebooks/oeb/writer.py
Normal file
@@ -0,0 +1,78 @@
|
||||
'''
|
||||
Directory output OEBBook writer.
|
||||
'''
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
import os
|
||||
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
|
||||
from calibre.ebooks.oeb.base import DirContainer, OEBError
|
||||
|
||||
__all__ = ['OEBWriter']
|
||||
|
||||
|
||||
class OEBWriter(object):
    # Writes an OEBBook out as a directory of files plus an OPF package.

    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content written with this Writer."""

    TRANSFORMS = []
    """List of transforms to apply to content written with this Writer."""

    def __init__(self, version='2.0', page_map=False, pretty_print=False):
        # version: OPF version string ('1.2' or '2.0'); only the major
        # digit is used for dispatch in __call__
        self.version = version
        # page_map: emit an Adobe page-map (OPF 2 only)
        self.page_map = page_map
        self.pretty_print = pretty_print

    @classmethod
    def config(cls, cfg):
        """Add any book-writing options to the :class:`Config` object
        :param:`cfg`.
        """
        oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.'))
        versions = ['1.2', '2.0']
        oeb('opf_version', ['--opf-version'], default='2.0', choices=versions,
            help=_('OPF version to generate. Default is %default.'))
        oeb('adobe_page_map', ['--adobe-page-map'], default=False,
            help=_('Generate an Adobe "page-map" file if pagination '
                   'information is available.'))
        return cfg

    @classmethod
    def generate(cls, opts):
        """Generate a Writer instance from command-line options."""
        version = opts.opf_version
        page_map = opts.adobe_page_map
        pretty_print = opts.pretty_print
        return cls(version=version, page_map=page_map,
                   pretty_print=pretty_print)

    def __call__(self, oeb, path):
        """
        Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
        at :param:`path`.
        """
        # Dispatch on the major version digit ('1.2' -> 1, '2.0' -> 2)
        version = int(self.version[0])
        opfname = None
        if os.path.splitext(path)[1].lower() == '.opf':
            # A .opf path names the package file; its directory is the output
            opfname = os.path.basename(path)
            path = os.path.dirname(path)
        if not os.path.isdir(path):
            os.mkdir(path)
        output = DirContainer(path, oeb.log)
        for item in oeb.manifest.values():
            output.write(item.href, item.bytes_representation)

        if version == 1:
            metadata = oeb.to_opf1()
        elif version == 2:
            metadata = oeb.to_opf2(page_map=self.page_map)
        else:
            raise OEBError("Unrecognized OPF version %r" % self.version)
        pretty_print = self.pretty_print
        # metadata maps mime type -> (href, document); honour the caller's
        # requested OPF filename when one was given.
        for mime, (href, data) in metadata.items():
            if opfname and mime == OPF_MIME:
                href = opfname
            output.write(href, xml2str(data, pretty_print=pretty_print))
        return
|
||||
Reference in New Issue
Block a user