mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-28 14:33:31 +01:00
Initial import
This commit is contained in:
7
ebook_converter/ebooks/oeb/transforms/__init__.py
Normal file
7
ebook_converter/ebooks/oeb/transforms/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
52
ebook_converter/ebooks/oeb/transforms/data_url.py
Normal file
52
ebook_converter/ebooks/oeb/transforms/data_url.py
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
from calibre.ebooks.oeb.base import XPath, urlunquote
|
||||
from polyglot.builtins import as_bytes
|
||||
|
||||
|
||||
class DataURL(object):
    '''
    Transform that turns images embedded as ``data:image/...`` URLs in
    ``<img src>`` attributes into ordinary manifest items and points the
    ``src`` attribute at the newly added item.
    '''

    def __call__(self, oeb, opts):
        from calibre.utils.imghdr import what
        self.log = oeb.log
        find_images = XPath('//h:img[@src]')
        for spine_item in oeb.spine:
            tree = spine_item.data
            if not hasattr(tree, 'xpath'):
                # Item data cannot be queried with XPath, nothing to scan
                continue
            for img in find_images(tree):
                src = img.get('src', '')
                if not src.startswith('data:'):
                    continue
                # Everything before the first comma is the header, everything
                # after it is the payload
                header, _, payload = src.partition(',')
                if not (header.startswith('data:image/') and payload):
                    continue
                if ';base64' in header:
                    payload = re.sub(r'\s+', '', payload)
                    from polyglot.binary import from_base64_bytes
                    try:
                        payload = from_base64_bytes(payload)
                    except Exception:
                        self.log.error('Found invalid base64 encoded data URI, ignoring it')
                        continue
                else:
                    payload = urlunquote(payload)
                payload = as_bytes(payload)
                fmt = what(None, payload)
                if not fmt:
                    self.log.warn('Image encoded as data URL has unknown format, ignoring')
                    continue
                new_href = self.convert_image_data_uri(payload, fmt, oeb)
                img.set('src', spine_item.relhref(new_href))

    def convert_image_data_uri(self, data, fmt, oeb):
        '''Add ``data`` to the manifest as a new image and return its href.'''
        self.log('Found image encoded as data URI converting it to normal image')
        from calibre import guess_type
        new_id, new_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt)
        mime_type = guess_type(new_href)[0]
        oeb.manifest.add(new_id, new_href, mime_type, data=data)
        return new_href
|
||||
684
ebook_converter/ebooks/oeb/transforms/flatcss.py
Normal file
684
ebook_converter/ebooks/oeb/transforms/flatcss.py
Normal file
@@ -0,0 +1,684 @@
|
||||
'''
|
||||
CSS flattening transform.
|
||||
'''
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
import re, operator, math, numbers
|
||||
from collections import defaultdict
|
||||
from xml.dom import SyntaxErr
|
||||
|
||||
from lxml import etree
|
||||
import css_parser
|
||||
from css_parser.css import Property
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.ebooks import unit_convert
|
||||
from calibre.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES,
|
||||
namespace, barename, XPath, css_text)
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.utils.filenames import ascii_filename, ascii_text
|
||||
from calibre.utils.icu import numeric_sort_key
|
||||
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, map
|
||||
|
||||
# Matches one or more consecutive whitespace characters (space, tab, CR, LF,
# vertical tab).
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
# Matches a trailing run of digits and/or hyphens at the end of a string.
STRIPNUM = re.compile(r'[-0-9]+$')
|
||||
|
||||
|
||||
def asfloat(value, default):
    '''
    Return ``value`` as a float. When ``value`` is not a number, ``default``
    is converted instead.
    '''
    numeric = value if isinstance(value, numbers.Number) else default
    return float(numeric)
|
||||
|
||||
|
||||
class KeyMapper(object):
    '''
    Maps a source font size onto one of the sizes listed in a destination
    font key, by comparing how each size relates to its base size.
    '''

    def __init__(self, sbase, dbase, dkey):
        self.sbase = float(sbase)
        # (relation to destination base, destination size) for each key entry
        self.dprop = []
        for candidate in dkey:
            self.dprop.append((self.relate(candidate, dbase), float(candidate)))
        # Memoised results of map(), keyed by source size
        self.cache = {}

    @staticmethod
    def relate(size, base):
        '''
        Return a signed logarithmic measure of how far ``size`` is from
        ``base``: negative when smaller, positive when larger and 0 when the
        two differ by less than 0.1. A ``size`` of 0 returns ``base`` as is.
        '''
        if size == 0:
            return base
        size, base = float(size), float(base)
        if abs(size - base) < 0.1:
            return 0
        if size < base:
            sign, endp = -1, 0
        else:
            sign, endp = 1, 36
        diff = (abs(base - size) * 3) + ((36 - size) / 100)
        logb = abs(base - endp)
        if logb == 1.0:
            # A logarithm base of exactly 1 is not usable
            logb = 1.1
        try:
            return sign * math.log(diff, logb)
        except ValueError:
            if diff < 0:
                # Size is both very large and close to base
                return 0
            if logb == 0:
                logb = 1e-6
            if diff == 0:
                diff = 1e-6
            return sign * math.log(diff, logb)

    def __getitem__(self, ssize):
        ssize = asfloat(ssize, 0)
        try:
            return self.cache[ssize]
        except KeyError:
            pass
        dsize = self.cache[ssize] = self.map(ssize)
        return dsize

    def map(self, ssize):
        '''Return the destination size whose relation is closest to that of ``ssize``.'''
        prop = self.relate(ssize, self.sbase)
        return min((abs(prop - p), s) for p, s in self.dprop)[1]
|
||||
|
||||
|
||||
class ScaleMapper(object):
    '''Maps font sizes by multiplying them with the ratio dbase / sbase.'''

    def __init__(self, sbase, dbase):
        self.dscale = float(dbase) / float(sbase)

    def __getitem__(self, ssize):
        # Non-numeric sizes are treated as 0
        return asfloat(ssize, 0) * self.dscale
|
||||
|
||||
|
||||
class NullMapper(object):
    '''Identity mapper: every size maps to itself.'''

    def __init__(self):
        pass

    def __getitem__(self, ssize):
        # No conversion of any kind is applied
        return ssize
|
||||
|
||||
|
||||
def FontMapper(sbase=None, dbase=None, dkey=None):
    '''
    Pick a font size mapper for the given parameters: a KeyMapper when both
    base sizes and a font key are supplied, a ScaleMapper when only the two
    base sizes are supplied and a NullMapper otherwise.
    '''
    if not (sbase and dbase):
        return NullMapper()
    if dkey:
        return KeyMapper(sbase, dbase, dkey)
    return ScaleMapper(sbase, dbase)
|
||||
|
||||
|
||||
class EmbedFontsCSSRules(object):
    '''
    Lazily adds a stylesheet built from ``rules`` to the manifest the first
    time it is called and returns the href of that stylesheet afterwards.
    '''

    def __init__(self, body_font_family, rules):
        self.body_font_family = body_font_family
        self.rules = rules
        # href of the generated stylesheet, set by the first successful call
        self.href = None

    def __call__(self, oeb):
        if not self.body_font_family:
            return None
        if self.href:
            return self.href
        iid, href = oeb.manifest.generate('page_styles', 'page_styles.css')
        text = '\n\n'.join([css_text(rule) for rule in self.rules])
        sheet = css_parser.parseString(text, validate=False)
        added = oeb.manifest.add(iid, href, guess_type(href)[0], data=sheet)
        self.href = added.href
        return self.href
|
||||
|
||||
|
||||
class CSSFlattener(object):
|
||||
|
||||
def __init__(self, fbase=None, fkey=None, lineh=None, unfloat=False,
|
||||
untable=False, page_break_on_body=False, specializer=None,
|
||||
transform_css_rules=()):
|
||||
self.fbase = fbase
|
||||
self.transform_css_rules = transform_css_rules
|
||||
if self.transform_css_rules:
|
||||
from calibre.ebooks.css_transform_rules import compile_rules
|
||||
self.transform_css_rules = compile_rules(self.transform_css_rules)
|
||||
self.fkey = fkey
|
||||
self.lineh = lineh
|
||||
self.unfloat = unfloat
|
||||
self.untable = untable
|
||||
self.specializer = specializer
|
||||
self.page_break_on_body = page_break_on_body
|
||||
|
||||
@classmethod
|
||||
def config(cls, cfg):
|
||||
return cfg
|
||||
|
||||
@classmethod
|
||||
def generate(cls, opts):
|
||||
return cls()
|
||||
|
||||
def __call__(self, oeb, context):
    '''
    Run the transform on ``oeb`` using the conversion options in ``context``.

    The items processed are the spine items plus, when present, the guide
    titlepage item and the EPUB 3 nav item. The steps are: parse the
    ``filter_css`` option, rewrite URLs in all stylesheets, collect embedded
    font information, stylize and flatten the items and finally store the
    page margins on the options object.
    '''
    oeb.logger.info('Flattening CSS and remapping font sizes...')
    self.context = self.opts = context
    self.oeb = oeb
    self.items = list(self.oeb.spine)
    titlepage = self.oeb.guide.get('titlepage')
    if titlepage is not None:
        titlepage = titlepage.item
        if titlepage is not None and titlepage not in self.items:
            self.items.append(titlepage)
    epub3_nav = None
    if getattr(self.opts, 'epub3_nav_href', None):
        epub3_nav = self.oeb.manifest.hrefs.get(self.opts.epub3_nav_href)
        if epub3_nav is not None and epub3_nav not in self.items:
            self.items.append(epub3_nav)

    self.filter_css = frozenset()
    if self.opts.filter_css:
        try:
            self.filter_css = {x.strip().lower() for x in
                self.opts.filter_css.split(',')}
        except Exception:
            # Best effort: an unparseable option is ignored. Only Exception
            # is caught so that KeyboardInterrupt/SystemExit still propagate.
            self.oeb.log.warning('Failed to parse filter_css, ignoring')
        else:
            from calibre.ebooks.oeb.normalize_css import normalize_filter_css
            self.filter_css = frozenset(normalize_filter_css(self.filter_css))
            self.oeb.log.debug('Filtering CSS properties: %s'%
                ', '.join(self.filter_css))

    for item in oeb.manifest.values():
        # Make all links to resources absolute, as these sheets will be
        # consolidated into a single stylesheet at the root of the document
        if item.media_type in OEB_STYLES:
            css_parser.replaceUrls(item.data, item.abshref,
                ignoreImportRules=True)

    self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
            self.opts.embed_font_family)
    # Store for use in output plugins/transforms that generate content,
    # like the AZW3 output inline ToC.
    self.oeb.store_embed_font_rules = EmbedFontsCSSRules(self.body_font_family,
            self.embed_font_rules)
    self.stylize_spine()
    # The source base size is only needed when a destination base is set
    self.sbase = self.baseline_spine() if self.fbase else None
    self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
    self.flatten_spine()
    if epub3_nav is not None:
        self.opts.epub3_nav_parsed = epub3_nav.data

    self.store_page_margins()
|
||||
|
||||
def store_page_margins(self):
    '''
    Record the ``margin-*`` values of every stylizer's page rule, converted
    with ``unit_convert``, in ``self.opts._stored_page_margins``. The result
    is keyed by item href and then by the part of the property name after
    ``margin-``.
    '''
    self.opts._stored_page_margins = {}
    for item, stylizer in iteritems(self.stylizers):
        per_item = self.opts._stored_page_margins[item.href] = {}
        for prop, val in stylizer.page_rule.items():
            kind, _, side = prop.partition('-')
            if kind != 'margin':
                continue
            per_item[side] = unit_convert(
                val, stylizer.profile.width_pts, stylizer.body_font_size,
                stylizer.profile.dpi, body_font_size=stylizer.body_font_size)
|
||||
|
||||
def get_embed_font_info(self, family, failure_critical=True):
    '''
    Add every face of the font ``family`` to the manifest and build a
    matching ``@font-face`` rule for each.

    Returns ``(body_font_family, rules)``. ``body_font_family`` is derived
    from the first face and ``rules`` holds the parsed ``@font-face``
    sheets. ``(None, [])`` is returned for an empty ``family``. When no
    faces are found a ValueError is raised if ``failure_critical`` is true,
    otherwise a warning is logged and ``(None, [])`` is returned.
    '''
    efi = []
    body_font_family = None
    if not family:
        return body_font_family, efi
    from calibre.utils.fonts.scanner import font_scanner, NoFonts
    from calibre.utils.fonts.utils import panose_to_css_generic_family

    def nothing_found():
        # Shared handling for "scanner raised" and "scanner returned nothing"
        msg = ('No embeddable fonts found for family: %r'%family)
        if failure_critical:
            raise ValueError(msg)
        self.oeb.log.warn(msg)
        return body_font_family, efi

    try:
        faces = font_scanner.fonts_for_family(family)
    except NoFonts:
        return nothing_found()
    if not faces:
        return nothing_found()

    is_first = True
    for font in faces:
        ext = 'otf' if font['is_otf'] else 'ttf'
        font_href = 'fonts/%s.%s'%(ascii_filename(font['full_name']).replace(' ', '-'), ext)
        fid, href = self.oeb.manifest.generate(id=u'font', href=font_href)
        item = self.oeb.manifest.add(fid, href,
                guess_type('dummy.'+ext)[0],
                data=font_scanner.get_font_data(font))
        item.unload_data_from_memory()

        cfont = {}
        cfont['font-family'] = '"%s"'%font['font-family']
        cfont['panose-1'] = ' '.join(map(unicode_type, font['panose']))
        cfont['src'] = 'url(%s)'%item.href

        if is_first:
            # The first face determines the family used for the body text
            is_first = False
            generic_family = panose_to_css_generic_family(font['panose'])
            body_font_family = "'%s',%s"%(font['font-family'], generic_family)
            self.oeb.log('Embedding font: %s'%font['font-family'])
        for k in ('font-weight', 'font-style', 'font-stretch'):
            if font[k] != 'normal':
                cfont[k] = font[k]
        declarations = '; '.join('%s:%s'%(k, v) for k, v in iteritems(cfont))
        efi.append(css_parser.parseString('@font-face { %s }'%declarations))

    return body_font_family, efi
|
||||
|
||||
def stylize_spine(self):
    '''
    Build a Stylizer for every item in ``self.items`` and store it in
    ``self.stylizers``. Beforehand any inline style on the root element is
    moved onto ``<body>`` and the margin, padding, page break,
    justification and font family declarations implied by the conversion
    options are appended to the body style.
    '''
    self.stylizers = {}
    profile = self.context.source
    css = ''
    for item in self.items:
        root = item.data
        body = root.find(XHTML('body'))
        if 'style' in root.attrib:
            existing = body.attrib.get('style', '')
            body.set('style', root.get('style') + ';' + existing)
            del root.attrib['style']
        declarations = body.get('style', '').split(';')
        declarations.extend(['margin-top: 0pt', 'margin-bottom: 0pt'])
        # Negative margins mean "leave the side margin alone"
        if float(self.context.margin_left) >= 0:
            declarations.append(
                'margin-left : %gpt'%float(self.context.margin_left))
        if float(self.context.margin_right) >= 0:
            declarations.append(
                'margin-right : %gpt'%float(self.context.margin_right))
        declarations.append('padding-left: 0pt')
        declarations.append('padding-right: 0pt')
        if self.page_break_on_body:
            declarations.append('page-break-before: always')
        if self.context.change_justification != 'original':
            declarations.append('text-align: '+ self.context.change_justification)
        if self.body_font_family:
            declarations.append('font-family: '+self.body_font_family)
        body.set('style', '; '.join(declarations))
        self.stylizers[item] = Stylizer(
            root, item.href, self.oeb, self.context, profile,
            user_css=self.context.extra_css, extra_css=css)
|
||||
|
||||
def baseline_node(self, node, stylizer, sizes, csize):
    '''
    Add the length of the whitespace-collapsed text of ``node`` and of the
    tails of its children to ``sizes``, keyed by the font size of ``node``,
    then recurse into the children.

    The ``csize`` value passed in is not used; it is replaced by the font
    size computed for ``node``.
    '''
    csize = stylizer.style(node)['font-size']

    def count(text):
        sizes[csize] += len(COLLAPSE.sub(' ', text))

    if node.text:
        count(node.text)
    for child in node:
        self.baseline_node(child, stylizer, sizes, csize)
        if child.tail:
            # Tail text belongs to the parent, so it counts at its size
            count(child.tail)
|
||||
|
||||
def baseline_spine(self):
    '''
    Return the font size that carries the most text across all items.

    Every item body is walked with ``baseline_node`` to total the number of
    characters per font size. When no sizes were collected at all, 12.0 is
    returned.
    '''
    sizes = defaultdict(float)
    for item in self.items:
        html = item.data
        stylizer = self.stylizers[item]
        body = html.find(XHTML('body'))
        fsize = self.context.source.fbase
        self.baseline_node(body, stylizer, sizes, fsize)
    try:
        sbase = max(sizes.items(), key=operator.itemgetter(1))[0]
    except ValueError:
        # max() of an empty sequence: no text was found anywhere
        sbase = 12.0
    self.oeb.logger.info(
        "Source base font size is %0.05fpt" % sbase)
    return sbase
|
||||
|
||||
def clean_edges(self, cssdict, style, fsize):
    '''
    Snap the top/bottom margins and paddings in ``cssdict`` to multiples of
    the destination line height ``self.lineh``, expressed in em relative to
    ``fsize``.

    Properties missing from ``cssdict``, percentage values, zero values and
    values that ``style`` does not resolve to a number (such as ``auto``)
    are left untouched. Values up to one source line height (``self.sbase``
    times 1.26) become exactly one destination line; larger values are
    rounded to a whole number of lines.
    '''
    slineh = self.sbase * 1.26
    dlineh = self.lineh
    for kind in ('margin', 'padding'):
        for edge in ('bottom', 'top'):
            prop = "%s-%s" % (kind, edge)
            if prop not in cssdict:
                continue
            if '%' in cssdict[prop]:
                continue
            value = style[prop]
            # Comparing a non-numeric value with a float raises TypeError on
            # Python 3, so such values are skipped instead of being snapped
            if value == 0 or not isinstance(value, numbers.Number):
                continue
            if value <= slineh:
                cssdict[prop] = "%0.5fem" % (dlineh / fsize)
            else:
                try:
                    value = round(value / slineh) * dlineh
                except Exception:
                    self.oeb.logger.warning(
                        'Invalid length:', value)
                    value = 0.0
                cssdict[prop] = "%0.5fem" % (value / fsize)
|
||||
|
||||
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
    '''
    Flatten the style of ``node`` into a generated class name and, when
    ``recurse`` is true, do the same for all of its children.

    Nodes whose tag is not a string or is not in the XHTML namespace are
    skipped. Presentational attributes (``align``, ``valign``, ``size``,
    ``face``, ``color``, ``bgcolor`` and ``type`` on ``<ol>``) are turned
    into CSS or dropped, font sizes are remapped through ``self.fmap`` and
    written in em relative to ``psize``, and the resulting declarations are
    registered in ``styles`` / ``pseudo_styles``.

    :param stylizer: object whose ``style(node)`` gives the computed style
    :param names: counter of how often each class name stem has been used
    :param styles: maps CSS declaration text to the class name assigned to it
    :param pseudo_styles: per pseudo selector, the same kind of mapping
    :param psize: font size of the parent, used as the base for em values
    :param item_id: id of the item being processed
    '''
    if not isinstance(node.tag, string_or_bytes) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        # Fall back to the detected source base size, or the profile's
        font_size = self.sbase if self.sbase is not None else \
                    self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(XHTML("table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get('margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            # On images, align maps to vertical-align or float
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    if node.tag == XHTML('font'):
        # <font> becomes a <div> when it contains block level descendants
        # and a <span> otherwise
        tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                # First run of digits and sign characters, as an int
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    # Signed sizes are offsets from 3, clamped to 1..7
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt'%font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            # Unparseable colors are dropped together with the attribute
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A left floated, childless node with an explicit font size that holds
    # one character (or two, when the first is in U+2000..U+206F) is
    # treated as a drop cap
    is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
        len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        # data-calibre-rescale is a percentage applied to the mapped base
        # size; the attribute is removed from the node here
        dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
        if dyn_rescale is not None:
            try:
                dyn_rescale = float(dyn_rescale) / 100
            except Exception:
                dyn_rescale = 1
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem'%(fsize/psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                # No usable parent size, emit an absolute size instead
                cssdict['font-size'] = '%.1fpt'%fsize
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        slh = style['line-height']
        if not is_drop_cap and isinstance(slh, numbers.Number) and slh < minlh * fsize:
            cssdict['line-height'] = unicode_type(minlh)
    except Exception:
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            # A filtered font-family is kept when its first family name
            # matches that of the embedded body font
            if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s'%(prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem'%self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size means existing indents are left alone
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            items = sorted(iteritems(cssdict))
            css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            classes_list = classes.split()
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub('', classes_list[0])).lower().strip().replace(' ', '_')
            if css in styles:
                # Identical declarations share a single class
                match = styles[css]
            else:
                match = klass + unicode_type(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        for psel, cssdict in iteritems(pseudo_classes):
            items = sorted(iteritems(cssdict))
            css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + unicode_type(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    if recurse:
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
|
||||
|
||||
def flatten_head(self, item, href, global_href):
    '''
    Remove the stylesheet ``<link>`` and ``<style>`` elements from the
    document of ``item`` and append to its ``<head>`` a link to ``href``
    and, when ``global_href`` is truthy, a second link to ``global_href``.
    '''
    root = item.data
    head = root.find(XHTML('head'))

    def lowered(value):
        # Lower-case when possible, otherwise hand the value back untouched
        try:
            return value.lower()
        except Exception:
            return value

    def is_stylesheet(node):
        if node.tag == XHTML('link'):
            return (lowered(node.get('rel', 'stylesheet')) == 'stylesheet'
                    and lowered(node.get('type', CSS_MIME)) in OEB_STYLES)
        if node.tag == XHTML('style'):
            return node.get('type', CSS_MIME) in OEB_STYLES
        return False

    def add_link(target):
        link = etree.SubElement(head, XHTML('link'),
            rel='stylesheet', type=CSS_MIME, href=item.relhref(target))
        link.tail = '\n'

    for node in root.xpath('//*[local-name()="style" or local-name()="link"]'):
        if is_stylesheet(node):
            node.getparent().remove(node)
    add_link(href)
    if global_href:
        add_link(global_href)
|
||||
|
||||
def replace_css(self, css):
    '''
    Replace every stylesheet in the manifest with one new stylesheet parsed
    from the text ``css``.

    The configured CSS transform rules, if any, are applied to the new
    sheet, which is then recorded as ``manifest.main_stylesheet``.

    :return: the href of the new stylesheet
    '''
    manifest = self.oeb.manifest
    # Iterate over a snapshot, since items are removed during the loop
    for item in list(manifest.values()):
        if item.media_type in OEB_STYLES:
            manifest.remove(item)
    css_id, href = manifest.generate('css', 'stylesheet.css')
    sheet = css_parser.parseString(css, validate=False)
    if self.transform_css_rules:
        from calibre.ebooks.css_transform_rules import transform_sheet
        transform_sheet(self.transform_css_rules, sheet)
    item = manifest.add(css_id, href, CSS_MIME, data=sheet)
    self.oeb.manifest.main_stylesheet = item
    return href
|
||||
|
||||
def collect_global_css(self):
    '''
    Build the ``@page`` and ``@font-face`` CSS for every item and add one
    stylesheet to the manifest per distinct, non-blank CSS text.

    :return: dict mapping each item to the href of its stylesheet, or to
        None when the CSS for that item is blank
    '''
    grouped = defaultdict(list)
    for item in self.items:
        stylizer = self.stylizers[item]
        # Negative margin options leave the page rule untouched
        if float(self.context.margin_top) >= 0:
            stylizer.page_rule['margin-top'] = '%gpt'%\
                    float(self.context.margin_top)
        if float(self.context.margin_bottom) >= 0:
            stylizer.page_rule['margin-bottom'] = '%gpt'%\
                    float(self.context.margin_bottom)
        page_props = sorted(stylizer.page_rule.items())
        if page_props:
            body = ';\n'.join("%s: %s" % (key, val) for key, val in page_props)
            text = '@page {\n%s\n}\n'%body
        else:
            text = ''
        font_rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
        text += '\n\n' + '\n\n'.join(font_rules)
        grouped[text].append(item)

    manifest = self.oeb.manifest
    href_for = {}
    for text in grouped:
        href = None
        if text.strip():
            id_, href = manifest.generate('page_css', 'page_styles.css')
            sheet = css_parser.parseString(text, validate=False)
            if self.transform_css_rules:
                from calibre.ebooks.css_transform_rules import transform_sheet
                transform_sheet(self.transform_css_rules, sheet)
            manifest.add(id_, href, CSS_MIME, data=sheet)
        href_for[text] = href

    ans = {}
    for text, members in iteritems(grouped):
        for member in members:
            ans[member] = href_for[text]
    return ans
|
||||
|
||||
def flatten_spine(self):
    '''
    Flatten every item in ``self.items``, write the collected class rules
    into a single replacement stylesheet and link that stylesheet, plus the
    per-item global stylesheet, from the head of each item.
    '''
    names = defaultdict(int)
    styles = {}
    pseudo_styles = defaultdict(dict)
    for item in self.items:
        root = item.data
        stylizer = self.stylizers[item]
        if self.specializer is not None:
            self.specializer(item, stylizer)
        fsize = self.context.dest.fbase
        # The root element itself is flattened without descending, the
        # body is flattened recursively
        self.flatten_node(root, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False)
        self.flatten_node(root.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id)

    rules = [(klass, text) for text, klass in iteritems(styles)]
    rules.sort(key=lambda pair: numeric_sort_key(pair[0]))

    # :hover must come after link and :active must come after :hover
    def pseudo_order(psel):
        return {'hover':1, 'active':2}.get(psel, 0)

    for psel in sorted(pseudo_styles, key=pseudo_order):
        pstyles = pseudo_styles[psel]
        if not pstyles:
            continue
        rules.extend(sorted((klass+':'+psel, text) for text, klass in iteritems(pstyles)))

    css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in rules)

    href = self.replace_css(css)
    global_css = self.collect_global_css()
    for item in self.items:
        stylizer = self.stylizers[item]
        self.flatten_head(item, href, global_css[item])
|
||||
55
ebook_converter/ebooks/oeb/transforms/guide.py
Normal file
55
ebook_converter/ebooks/oeb/transforms/guide.py
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
class Clean(object):
    '''Clean up guide, leaving only known values '''

    # Guide types that may stand in for a missing cover reference
    COVER_CANDIDATES = (
        'other.ms-coverimage-standard', 'coverimagestandard',
        'other.ms-titleimage-standard', 'other.ms-titleimage',
        'other.ms-coverimage', 'other.ms-thumbimage-standard',
        'other.ms-thumbimage', 'thumbimagestandard')

    # Guide types that are always kept
    KNOWN_TYPES = frozenset((
        'cover', 'titlepage', 'masthead', 'toc', 'title-page',
        'copyright-page', 'text', 'index', 'glossary',
        'acknowledgements', 'bibliography', 'colophon',
        'copyright-page', 'dedication', 'epigraph', 'foreword',
        'loi', 'lot', 'notes', 'preface'))

    def __call__(self, oeb, opts):
        self.oeb, self.log, self.opts = oeb, oeb.log, opts
        guide = self.oeb.guide

        if 'cover' not in guide:
            covers = []
            for kind in self.COVER_CANDIDATES:
                if kind not in guide:
                    continue
                try:
                    item = self.oeb.manifest.hrefs[guide[kind].href]
                except KeyError:
                    continue
                covers.append([guide[kind], len(item.data)])

            if covers:
                # The candidate with the most data wins; on a tie the one
                # listed first is chosen
                ref = max(covers, key=lambda entry: entry[1])[0]
                if len(covers) > 1:
                    self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
                ref.type = 'cover'
                guide.refs['cover'] = ref

        if 'start' in guide and 'text' not in guide:
            # Prefer text to start as per the OPF 2.0 spec
            start = guide['start']
            guide.add('text', start.title, start.href)
            guide.remove('start')

        for kind in list(guide):
            if kind.lower() in self.KNOWN_TYPES:
                continue
            ref = guide[kind]
            if ref.title and ref.title.lower() == 'start':
                continue
            guide.remove(kind)
|
||||
395
ebook_converter/ebooks/oeb/transforms/jacket.py
Normal file
395
ebook_converter/ebooks/oeb/transforms/jacket.py
Normal file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os, re
|
||||
from xml.sax.saxutils import escape
|
||||
from string import Formatter
|
||||
|
||||
from calibre import guess_type, strftime
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urldefrag, urlnormalize
|
||||
from calibre.library.comments import comments_to_html, markdown
|
||||
from calibre.utils.date import is_date_undefined, as_local_time
|
||||
from calibre.utils.icu import sort_key
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
from calibre.ebooks.metadata import fmt_sidx, rating_to_stars
|
||||
from polyglot.builtins import unicode_type, map
|
||||
|
||||
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
|
||||
|
||||
|
||||
class SafeFormatter(Formatter):
    '''
    A :class:`string.Formatter` that renders fields whose lookup raises
    ``KeyError`` (i.e. unknown named fields) as the empty string.
    '''

    def get_value(self, *args, **kwargs):
        try:
            value = Formatter.get_value(self, *args, **kwargs)
        except KeyError:
            # Unknown field name: substitute nothing instead of failing
            value = ''
        return value
|
||||
|
||||
|
||||
class Base(object):
    '''
    Shared helper for the image/jacket transforms in this module.

    :meth:`remove_images` reads ``self.oeb``, so it must be set before the
    method is called.
    '''

    def remove_images(self, item, limit=1):
        '''
        Remove up to ``limit`` ``<img src="...">`` elements from ``item.data``.

        For every removed element, the image it points to is also dropped from
        the manifest and the guide when it can be found in
        ``self.oeb.manifest.hrefs`` (looked up first by the absolute href and
        then by its normalized form). Text that follows a removed image
        (its tail) is kept in the document.

        :param item: manifest item whose ``data`` is a parsed XHTML tree
        :param limit: maximum number of images to remove
        :return: the number of ``<img>`` elements that were removed
        '''
        path = XPath('//h:img[@src]')
        removed = 0
        for img in path(item.data):
            if removed >= limit:
                break
            href = item.abshref(img.get('src'))
            image = self.oeb.manifest.hrefs.get(href)
            if image is None:
                href = urlnormalize(href)
                image = self.oeb.manifest.hrefs.get(href)
            if image is not None:
                self.oeb.manifest.remove(image)
                self.oeb.guide.remove_by_href(href)
            parent = img.getparent()
            if img.tail:
                # lxml detaches an element together with its tail text, so
                # hand the text that follows the image over to whatever
                # precedes it before removing the image itself.
                prev = img.getprevious()
                if prev is None:
                    parent.text = (parent.text or '') + img.tail
                else:
                    prev.tail = (prev.tail or '') + img.tail
            parent.remove(img)
            removed += 1
        return removed
|
||||
|
||||
|
||||
class RemoveFirstImage(Base):
    '''
    Transform that removes the first image found in spine order, skipping
    any page that carries the calibre jacket marker.
    '''

    def remove_first_image(self):
        '''
        Remove the first image of the book. If the page that held it is left
        with no text and no images, that page is removed from the manifest
        and every TOC/guide entry pointing at it is dropped as well.
        '''
        is_jacket = XPath(JACKET_XPATH)
        emptied_page = None
        for page in self.oeb.spine:
            if is_jacket(page.data):
                continue
            if not self.remove_images(page) > 0:
                continue
            self.log('Removed first image')
            bodies = XPath('//h:body')(page.data)
            if bodies:
                text = xml2text(bodies[0]).strip()
                pictures = XPath('//h:img|//svg:svg')(page.data)
                if not (text or pictures):
                    self.log('Removing %s as it has no content'%page.href)
                    self.oeb.manifest.remove(page)
                    emptied_page = page
            # Only the very first image is of interest
            break
        else:
            self.log.warn('Could not find first image to remove')

        if emptied_page is None:
            return
        for node in list(self.oeb.toc):
            target = urldefrag(node.href)[0]
            if target == emptied_page.href:
                self.oeb.toc.remove(node)
        self.oeb.guide.remove_by_href(emptied_page.href)

    def __call__(self, oeb, opts, metadata):
        '''
        Remove the first image of the book when ``opts.remove_first_image``
        is set. ``metadata`` is accepted but not used.
        '''
        self.oeb = oeb
        self.opts = opts
        self.log = oeb.log
        if opts.remove_first_image:
            self.remove_first_image()
|
||||
|
||||
|
||||
class Jacket(Base):
    '''
    Book jacket manipulation. Removes a previously inserted jacket page and,
    when requested, inserts a freshly rendered metadata jacket at the start of
    the book.
    '''

    def insert_metadata(self, mi):
        '''
        Render a jacket page from ``mi`` and insert it as the first spine
        item.

        Tags, comments, title and authors read from the book's own OEB
        metadata are passed to :func:`render_jacket` as fallbacks. Images the
        rendered jacket references via existing ``file://`` paths are added to
        the manifest and the ``src`` attributes are rewritten to point at
        them.

        :param mi: the metadata object handed to :func:`render_jacket`
        '''
        self.log('Inserting metadata into book...')

        # Each lookup is best effort: missing or malformed OEB metadata just
        # yields the corresponding fallback value.
        try:
            tags = list(map(unicode_type, self.oeb.metadata.subject))
        except Exception:
            tags = []

        try:
            comments = unicode_type(self.oeb.metadata.description[0])
        except Exception:
            comments = ''

        try:
            title = unicode_type(self.oeb.metadata.title[0])
        except Exception:
            title = _('Unknown')

        try:
            authors = list(map(unicode_type, self.oeb.metadata.creator))
        except Exception:
            authors = [_('Unknown')]

        root = render_jacket(mi, self.opts.output_profile,
                alt_title=title, alt_tags=tags, alt_authors=authors,
                alt_comments=comments, rescale_fonts=True)
        jacket_id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')

        jacket = self.oeb.manifest.add(jacket_id, href, guess_type(href)[0], data=root)
        self.oeb.spine.insert(0, jacket, True)
        self.oeb.inserted_metadata_jacket = jacket
        for img, path in referenced_images(root):
            self.oeb.log('Embedding referenced image %s into jacket' % path)
            ext = path.rpartition('.')[-1].lower()
            item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext)
            with open(path, 'rb') as f:
                item = self.oeb.manifest.add(item_id, href, guess_type(href)[0], data=f.read())
            item.unload_data_from_memory()
            img.set('src', jacket.relhref(item.href))

    def remove_existing_jacket(self):
        '''
        Look for the jacket marker in the first four spine items and remove
        the first page that has it, together with all images it references.
        '''
        for x in self.oeb.spine[:4]:
            if XPath(JACKET_XPATH)(x.data):
                self.remove_images(x, limit=sys.maxsize)
                self.oeb.manifest.remove(x)
                self.log('Removed existing jacket')
                break

    def __call__(self, oeb, opts, metadata):
        '''
        Add metadata in jacket.xhtml if specified in opts
        If not specified, remove previous jacket instance
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.remove_existing_jacket()
        if opts.insert_metadata:
            self.insert_metadata(metadata)
|
||||
|
||||
# Render Jacket {{{
|
||||
|
||||
|
||||
def get_rating(rating, rchar, e_rchar):
    '''
    Render ``rating`` as a five character string of stars.

    ``rating`` is halved and clamped to the range 0..5; the integer part of
    the result is the number of ``rchar`` characters, padded to five with
    ``e_rchar``.

    :param rating: a value convertible with ``float()``
    :param rchar: character used for a filled star
    :param e_rchar: character used for an empty star
    :return: the star string, or ``''`` when ``rating`` cannot be converted
             to a number or the halved value is below one
    '''
    try:
        num = float(rating)/2
    except Exception:
        # None, non-numeric strings and the like mean "no rating"
        return ''
    num = min(max(0, num), 5)
    if num < 1:
        return ''

    filled = int(num)
    return ("%s%s") % (rchar * filled, e_rchar * (5 - filled))
|
||||
|
||||
|
||||
class Series(unicode_type):
    '''
    Text describing a book's position in a series, already XML-escaped.

    The string value uses the non-roman index. Extra attributes: ``roman``
    (same text with a roman numeral index), ``name`` (escaped series name),
    ``number`` and ``roman_number`` (the formatted index alone; an index of
    1.0 is used when ``series_index`` is falsy).
    '''

    def __new__(cls, series, series_index):
        if series and series_index is not None:
            def describe(use_roman):
                return _('{1} of <em>{0}</em>').format(
                    escape(series),
                    escape(fmt_sidx(series_index, use_roman=use_roman)))
            roman = describe(True)
            combined = describe(False)
        else:
            combined = roman = escape(series or u'')

        obj = unicode_type.__new__(cls, combined)
        obj.roman = roman
        obj.name = escape(series or '')
        effective_index = series_index or 1.0
        obj.number = escape(fmt_sidx(effective_index, use_roman=False))
        obj.roman_number = escape(fmt_sidx(effective_index, use_roman=True))
        return obj
|
||||
|
||||
|
||||
class Tags(unicode_type):
    '''
    Comma separated, XML-escaped list of tags.

    Extra attributes: ``alphabetical`` (the same tags ordered with
    ``sort_key``) and ``tags_list`` (the escaped tags as a list).
    ``output_profile`` is accepted but not used.
    '''

    def __new__(cls, tags, output_profile):
        separator = ', '
        escaped = [escape(tag) for tag in (tags or ())]
        obj = unicode_type.__new__(cls, separator.join(escaped))
        obj.alphabetical = separator.join(sorted(escaped, key=sort_key))
        obj.tags_list = escaped
        return obj
|
||||
|
||||
|
||||
def postprocess_jacket(root, output_profile, has_data):
    '''
    Strip the parts of a rendered jacket that have nothing to show.

    Elements whose ``class`` attribute is exactly ``cbj_series``,
    ``cbj_rating`` or ``cbj_tags`` are removed when the matching entry of
    ``has_data`` is falsy, ``cbj_pubdata`` is removed when
    ``has_data['pubdate']`` is falsy, and ``cbj_kindle_banner_hr`` is removed
    unless ``output_profile.short_name`` is ``'kindle'``.

    :param root: parsed jacket tree (must support ``xpath()``)
    :param output_profile: object with a ``short_name`` attribute
    :param has_data: dict with the keys series, rating, tags and pubdate
    '''

    def extract(tag):
        # Remove tag but keep the text that followed it, re-attaching that
        # text to whatever now precedes its position in the parent.
        parent = tag.getparent()
        idx = parent.index(tag)
        tail = tag.tail
        parent.remove(tag)
        if tail:
            if idx == 0:
                parent.text = (parent.text or '') + tail
            else:
                # The element that was at idx - 1 is still there after the
                # removal and is the one immediately before the gap.
                prev = parent[idx - 1]
                prev.tail = (prev.tail or '') + tail

    def extract_class(cls):
        for tag in root.xpath('//*[@class="_"]'.replace('_', cls)):
            extract(tag)

    for key in 'series rating tags'.split():
        if not has_data[key]:
            extract_class('cbj_' + key)
    if not has_data['pubdate']:
        extract_class('cbj_pubdata')
    if output_profile.short_name != 'kindle':
        extract_class('cbj_kindle_banner_hr')
|
||||
|
||||
|
||||
def render_jacket(mi, output_profile,
        alt_title=_('Unknown'), alt_tags=[], alt_comments='',
        alt_publisher='', rescale_fonts=False, alt_authors=None):
    '''
    Render the jacket page for a book and return it as a parsed tree.

    The ``jacket/template.xhtml`` and ``jacket/stylesheet.css`` resources are
    loaded, stripped of comments and filled in with values taken from ``mi``.
    The ``alt_*`` arguments are used when the corresponding field of ``mi``
    is empty. Custom fields of ``mi`` are exposed to the template under their
    key with ``#`` replaced by ``_``.

    :param mi: metadata object supplying title, authors, series, etc.
    :param output_profile: profile providing the rating characters and the
                           ``short_name`` used by the post-processing step
    :param alt_title: fallback title
    :param alt_tags: fallback tags (only read, never modified)
    :param alt_comments: fallback comments
    :param alt_publisher: fallback publisher
    :param rescale_fonts: when true, wrap the body content in a
                          ``<div data-calibre-rescale="100">``
    :param alt_authors: fallback list of authors
    :return: the root element of the parsed and post-processed jacket
    '''
    css = P('jacket/stylesheet.css', data=True).decode('utf-8')
    template = P('jacket/template.xhtml', data=True).decode('utf-8')

    template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)

    try:
        title_str = alt_title if mi.is_null('title') else mi.title
    except Exception:
        title_str = _('Unknown')
    title_str = escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
    except Exception:
        publisher = ''
    publisher = escape(publisher)

    try:
        if is_date_undefined(mi.pubdate):
            pubdate = ''
        else:
            dt = as_local_time(mi.pubdate)
            pubdate = strftime('%Y', dt.timetuple())
    except Exception:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

    comments = mi.comments if mi.comments else alt_comments
    comments = comments.strip()
    if comments:
        comments = comments_to_html(comments)

    # Temporarily substitute the fallback authors so that format_authors()
    # can be used, then restore whatever mi had before.
    orig = mi.authors
    if mi.is_null('authors'):
        mi.authors = list(alt_authors or (_('Unknown'),))
    try:
        author = mi.format_authors()
    except Exception:
        author = ''
    mi.authors = orig
    author = escape(author)
    # Filled in by generate_html(), consumed by postprocess_jacket()
    has_data = {}

    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer='',
                    # tags_list is already XML-escaped by Tags, escaping it
                    # again here would turn '&amp;' into '&amp;amp;'
                    searchable_tags=' '.join(t+'ttt' for t in tags.tags_list),
                    )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    display = m.get('display', {})
                    ctype = display.get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey+'_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)
        has_data['series'] = bool(series)
        has_data['tags'] = bool(tags)
        has_data['rating'] = bool(rating)
        has_data['pubdate'] = bool(pubdate)

        return strip_encoding_declarations(generated_html)

    from calibre.ebooks.oeb.polish.parsing import parse
    raw = generate_html(comments)
    root = parse(raw, line_numbers=False, force_html5_parse=True)

    if rescale_fonts:
        # We ensure that the conversion pipeline will set the font sizes for
        # text in the jacket to the same size as the font sizes for the rest of
        # the text in the book. That means that as long as the jacket uses
        # relative font sizes (em or %), the post conversion font size will be
        # the same as for text in the main book. So text with size x em will
        # be rescaled to the same value in both the jacket and the main content.
        #
        # We cannot use data-calibre-rescale 100 on the body tag as that will just
        # give the body tag a font size of 1em, which is useless.
        for body in root.xpath('//*[local-name()="body"]'):
            fw = body.makeelement(XHTML('div'))
            fw.set('data-calibre-rescale', '100')
            for child in body:
                fw.append(child)
            body.append(fw)
    postprocess_jacket(root, output_profile, has_data)
    from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
    pretty_html_tree(None, root)
    return root
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
def linearize_jacket(oeb):
    '''
    Flatten the table layout of the jacket page, if there is one among the
    first four spine items: ``table``, ``tr`` and ``th`` elements become
    ``div`` and ``td`` elements become ``span``. Only the first jacket page
    found is changed.
    '''
    retag = (
        ('//h:table|//h:tr|//h:th', 'div'),
        ('//h:td', 'span'),
    )
    for page in oeb.spine[:4]:
        if not XPath(JACKET_XPATH)(page.data):
            continue
        for expression, new_tag in retag:
            for elem in XPath(expression)(page.data):
                elem.tag = XHTML(new_tag)
        break
|
||||
|
||||
|
||||
def referenced_images(root):
    '''
    Yield ``(img, path)`` for every ``<img>`` under ``root`` whose ``src``
    starts with ``file://`` and whose path exists on disk. On Windows a
    single leading ``/`` is dropped from the path.
    '''
    scheme = 'file://'
    for img in XPath('//h:img[@src]')(root):
        src = img.get('src')
        if not src.startswith(scheme):
            continue
        path = src[len(scheme):]
        if iswindows and path.startswith('/'):
            path = path[1:]
        if not os.path.exists(path):
            continue
        yield img, path
|
||||
218
ebook_converter/ebooks/oeb/transforms/metadata.py
Normal file
218
ebook_converter/ebooks/oeb/transforms/metadata.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re
|
||||
from calibre.utils.date import isoformat, now
|
||||
from calibre import guess_type
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
|
||||
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
    '''
    Copy the fields set on ``mi`` into the OEB metadata object ``m``.

    A field that is set on ``mi`` replaces the corresponding entries of
    ``m``. When ``override_input_metadata`` is true, book producer, comments,
    publisher, series, series index, rating, tags and the ISBN identifier
    are removed from ``m`` when ``mi`` does not provide them. A timestamp is
    added if ``m`` ends up without one. ``log`` is accepted but not used.
    '''
    from calibre.ebooks.oeb.base import OPF

    def replace(term, *values):
        # Drop all existing entries for term, then add the given ones
        m.clear(term)
        for value in values:
            m.add(term, value)

    def is_author(x):
        return x.role.lower() in ['aut', '']

    def is_producer(x):
        return x.role.lower() == 'bkp'

    if not mi.is_null('title'):
        replace('title', mi.title)
    if mi.title_sort:
        if not m.title:
            m.add('title', mi.title_sort)
        replace('title_sort', mi.title_sort)

    if not mi.is_null('authors'):
        m.filter('creator', is_author)
        for name in mi.authors:
            attrib = {'role':'aut'}
            if mi.author_sort:
                attrib[OPF('file-as')] = mi.author_sort
            m.add('creator', name, attrib=attrib)

    if not mi.is_null('book_producer'):
        m.filter('contributor', is_producer)
        m.add('contributor', mi.book_producer, role='bkp')
    elif override_input_metadata:
        m.filter('contributor', is_producer)

    if not mi.is_null('comments'):
        replace('description', mi.comments)
    elif override_input_metadata:
        replace('description')

    if not mi.is_null('publisher'):
        replace('publisher', mi.publisher)
    elif override_input_metadata:
        replace('publisher')

    if not mi.is_null('series'):
        replace('series', mi.series)
    elif override_input_metadata:
        replace('series')

    have_isbn = False
    for typ, val in iteritems(mi.get_identifiers()):
        scheme = typ.lower()
        if scheme == 'isbn':
            have_isbn = True
        updated = False
        for existing in m.identifier:
            if existing.scheme.lower() == scheme:
                existing.content = val
                updated = True
        if not updated:
            m.add('identifier', val, scheme=typ.upper())
    if override_input_metadata and not have_isbn:
        m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')

    if not mi.is_null('languages'):
        m.clear('language')
        for lang in mi.languages:
            # Skip empty and undetermined language codes
            if lang and lang.lower() not in ('und', ''):
                m.add('language', lang)

    if not mi.is_null('series_index'):
        replace('series_index', mi.format_series_index())
    elif override_input_metadata:
        replace('series_index')

    if not mi.is_null('rating'):
        replace('rating', '%.2f'%mi.rating)
    elif override_input_metadata:
        replace('rating')

    if not mi.is_null('tags'):
        m.clear('subject')
        for tag in mi.tags:
            m.add('subject', tag)
    elif override_input_metadata:
        replace('subject')

    if not mi.is_null('pubdate'):
        replace('date', isoformat(mi.pubdate))
    if not mi.is_null('timestamp'):
        replace('timestamp', isoformat(mi.timestamp))
    if not mi.is_null('rights'):
        replace('rights', mi.rights)
    if not mi.is_null('publication_type'):
        replace('publication_type', mi.publication_type)

    if not m.timestamp:
        m.add('timestamp', isoformat(now()))
|
||||
|
||||
|
||||
class MergeMetadata(object):
    'Merge in user metadata, including cover'

    def __call__(self, oeb, mi, opts, override_input_metadata=False):
        '''
        Merge ``mi`` into ``oeb.metadata``, set the cover and refresh the
        ``uuid`` and ``calibre`` identifiers when ``mi`` provides them.
        '''
        self.oeb = oeb
        self.log = oeb.log
        metadata = self.oeb.metadata
        self.log('Merging user specified metadata...')
        meta_info_to_oeb_metadata(mi, metadata, oeb.log,
                override_input_metadata=override_input_metadata)

        cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
        metadata.clear('cover')
        if cover_id is not None:
            metadata.add('cover', cover_id)

        if mi.uuid is not None:
            metadata.filter('identifier', lambda x:x.id=='uuid_id')
            self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
                                  scheme='uuid')
            self.oeb.uid = self.oeb.metadata.identifier[-1]

        if mi.application_id is not None:
            metadata.filter('identifier', lambda x:x.scheme=='calibre')
            self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre')

    def set_cover(self, mi, prefer_metadata_cover):
        '''
        Install the cover supplied by ``mi`` (a readable file at ``mi.cover``
        or the data in ``mi.cover_data``) and return the manifest id of the
        cover item, or ``None`` when there is no cover.

        With ``prefer_metadata_cover`` set, a cover already referenced from
        the guide wins over the one supplied by ``mi``.
        '''
        cdata, ext = b'', 'jpg'
        if mi.cover and os.access(mi.cover, os.R_OK):
            with open(mi.cover, 'rb') as f:
                cdata = f.read()
            ext = mi.cover.rpartition('.')[-1].lower().strip()
        elif mi.cover_data and mi.cover_data[-1]:
            ext, cdata = mi.cover_data[0], mi.cover_data[1]
        if ext not in ('png', 'jpg', 'jpeg'):
            ext = 'jpg'

        guide, manifest = self.oeb.guide, self.oeb.manifest
        cover_id = None
        old_cover = guide['cover'] if 'cover' in guide else None
        if prefer_metadata_cover and old_cover is not None:
            cdata = b''

        if cdata:
            guide.remove('cover')
            guide.remove('titlepage')
        elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None:
            # The amazon formats don't support html cover pages, so remove them
            # even if no cover was specified.
            guide.remove('titlepage')

        old_item = None
        if old_cover is not None:
            if old_cover.href in manifest.hrefs:
                old_item = manifest.hrefs[old_cover.href]
                if not cdata:
                    # Keep using the cover the book already has
                    return old_item.id
            elif not cdata:
                # The guide points at a cover the manifest does not know
                cover_id = manifest.generate(id='cover')[0]
                manifest.add(cover_id, old_cover.href, 'image/jpeg')
                return cover_id

        new_cover_item = None
        if cdata:
            cover_id, href = manifest.generate('cover', 'cover.'+ext)
            new_cover_item = manifest.add(cover_id, href, guess_type('cover.'+ext)[0], data=cdata)
            guide.add('cover', 'Cover', href)
        if old_item is not None:
            self.remove_old_cover(old_item, new_cover_item.href)
        return cover_id

    def remove_old_cover(self, cover_item, new_cover_href=None):
        '''
        Remove ``cover_item`` from the manifest and fix up the spine pages
        that reference it: references are re-pointed at ``new_cover_href``
        when given, otherwise the referencing image is removed. Pages left
        without text and images afterwards are removed from the book.
        '''
        from calibre.ebooks.oeb.base import XPath, XLINK
        from lxml import etree

        self.oeb.manifest.remove(cover_item)

        # Remove any references to the cover in the HTML
        find_images = XPath('//h:img[@src]|//svg:image[@xl:href]')
        affected_items = set()
        for page in self.oeb.spine:
            try:
                images = find_images(page.data)
            except Exception:
                images = ()
            stripped = False
            for img in images:
                href = img.get('src') or img.get(XLINK('href'))
                try:
                    href = page.abshref(href)
                except Exception:
                    continue  # Invalid URL, ignore
                if href != cover_item.href:
                    continue
                if new_cover_href is not None:
                    attr = 'src' if img.tag.endswith('img') else XLINK('href')
                    img.set(attr, page.relhref(new_cover_href))
                    continue
                parent = img.getparent()
                if parent.tag.endswith('}svg'):
                    # Drop the whole <svg> wrapper, not just the image
                    parent.getparent().remove(parent)
                else:
                    parent.remove(img)
                stripped = True
            if stripped:
                affected_items.add(page)

        # Check if the resulting HTML has no content, if so remove it
        for page in affected_items:
            body = XPath('//h:body')(page.data)
            text = etree.tostring(body[0], method='text', encoding='unicode') if body else ''
            text = re.sub(r'\s+', '', text)
            if text or XPath('//h:img|//svg:svg')(page.data):
                continue
            self.log('Removing %s as it is a wrapper around'
                    ' the cover image'%page.href)
            self.oeb.spine.remove(page)
            self.oeb.manifest.remove(page)
            self.oeb.guide.remove_by_href(page.href)
|
||||
189
ebook_converter/ebooks/oeb/transforms/page_margin.py
Normal file
189
ebook_converter/ebooks/oeb/transforms/page_margin.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import numbers
|
||||
from collections import Counter
|
||||
|
||||
from calibre.ebooks.oeb.base import barename, XPath
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
|
||||
class RemoveAdobeMargins(object):
    '''
    Remove margins specified in Adobe's page templates.
    '''

    def __call__(self, oeb, log, opts):
        '''
        Strip the ``margin-left/right/top/bottom`` attributes from every
        element of each Adobe page template found in the manifest.
        '''
        self.oeb = oeb
        self.opts = opts
        self.log = log

        template_types = {
            'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml',
            'application/adobe-page-template+xml', 'application/adobe.page-template+xml',
        }
        margin_attrs = tuple('margin-'+side for side in ('left', 'right', 'top', 'bottom'))
        selector = ('//*[@margin-bottom or @margin-top '
                    'or @margin-left or @margin-right]')

        for item in self.oeb.manifest:
            if item.media_type not in template_types:
                continue
            # Only touch item.data once the media type has matched
            if not hasattr(item.data, 'xpath'):
                continue
            self.log('Removing page margins specified in the'
                    ' Adobe page template')
            for elem in item.data.xpath(selector):
                for attr in margin_attrs:
                    elem.attrib.pop(attr, None)
|
||||
|
||||
|
||||
class NegativeTextIndent(Exception):
    '''Raised when a style with a negative text-indent is encountered.'''
|
||||
|
||||
|
||||
class RemoveFakeMargins(object):
|
||||
|
||||
'''
|
||||
Remove left and right margins from paragraph/divs if the same margin is specified
|
||||
on almost all the elements at that level.
|
||||
|
||||
Must be called only after CSS flattening
|
||||
'''
|
||||
|
||||
def __call__(self, oeb, log, opts):
|
||||
if not opts.remove_fake_margins:
|
||||
return
|
||||
self.oeb, self.log, self.opts = oeb, log, opts
|
||||
stylesheet = None
|
||||
self.levels = {}
|
||||
self.stats = {}
|
||||
self.selector_map = {}
|
||||
|
||||
stylesheet = self.oeb.manifest.main_stylesheet
|
||||
if stylesheet is None:
|
||||
return
|
||||
|
||||
self.log('Removing fake margins...')
|
||||
|
||||
stylesheet = stylesheet.data
|
||||
|
||||
from css_parser.css import CSSRule
|
||||
for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
|
||||
self.selector_map[rule.selectorList.selectorText] = rule.style
|
||||
|
||||
self.find_levels()
|
||||
|
||||
for level in self.levels:
|
||||
try:
|
||||
self.process_level(level)
|
||||
except NegativeTextIndent:
|
||||
self.log.debug('Negative text indent detected at level '
|
||||
' %s, ignoring this level'%level)
|
||||
|
||||
def get_margins(self, elem):
|
||||
cls = elem.get('class', None)
|
||||
if cls:
|
||||
style = self.selector_map.get('.'+cls, None)
|
||||
if style:
|
||||
try:
|
||||
ti = style['text-indent']
|
||||
except:
|
||||
pass
|
||||
else:
|
||||
if ((hasattr(ti, 'startswith') and ti.startswith('-')) or
|
||||
isinstance(ti, numbers.Number) and ti < 0):
|
||||
raise NegativeTextIndent()
|
||||
return style.marginLeft, style.marginRight, style
|
||||
return '', '', None
|
||||
|
||||
def process_level(self, level):
|
||||
elems = self.levels[level]
|
||||
self.stats[level+'_left'] = Counter()
|
||||
self.stats[level+'_right'] = Counter()
|
||||
|
||||
for elem in elems:
|
||||
lm, rm = self.get_margins(elem)[:2]
|
||||
self.stats[level+'_left'][lm] += 1
|
||||
self.stats[level+'_right'][rm] += 1
|
||||
|
||||
self.log.debug(level, ' left margin stats:', self.stats[level+'_left'])
|
||||
self.log.debug(level, ' right margin stats:', self.stats[level+'_right'])
|
||||
|
||||
remove_left = self.analyze_stats(self.stats[level+'_left'])
|
||||
remove_right = self.analyze_stats(self.stats[level+'_right'])
|
||||
|
||||
if remove_left:
|
||||
mcl = self.stats[level+'_left'].most_common(1)[0][0]
|
||||
self.log('Removing level %s left margin of:'%level, mcl)
|
||||
|
||||
if remove_right:
|
||||
mcr = self.stats[level+'_right'].most_common(1)[0][0]
|
||||
self.log('Removing level %s right margin of:'%level, mcr)
|
||||
|
||||
if remove_left or remove_right:
|
||||
for elem in elems:
|
||||
lm, rm, style = self.get_margins(elem)
|
||||
if remove_left and lm == mcl:
|
||||
style.removeProperty('margin-left')
|
||||
if remove_right and rm == mcr:
|
||||
style.removeProperty('margin-right')
|
||||
|
||||
def find_levels(self):
|
||||
|
||||
def level_of(elem, body):
|
||||
ans = 1
|
||||
while elem.getparent() is not body:
|
||||
ans += 1
|
||||
elem = elem.getparent()
|
||||
return ans
|
||||
|
||||
paras = XPath('descendant::h:p|descendant::h:div')
|
||||
|
||||
for item in self.oeb.spine:
|
||||
body = XPath('//h:body')(item.data)
|
||||
if not body:
|
||||
continue
|
||||
body = body[0]
|
||||
|
||||
for p in paras(body):
|
||||
level = level_of(p, body)
|
||||
level = '%s_%d'%(barename(p.tag), level)
|
||||
if level not in self.levels:
|
||||
self.levels[level] = []
|
||||
self.levels[level].append(p)
|
||||
|
||||
remove = set()
|
||||
for k, v in iteritems(self.levels):
|
||||
num = len(v)
|
||||
self.log.debug('Found %d items of level:'%num, k)
|
||||
level = int(k.split('_')[-1])
|
||||
tag = k.split('_')[0]
|
||||
if tag == 'p' and num < 25:
|
||||
remove.add(k)
|
||||
if tag == 'div':
|
||||
if level > 2 and num < 25:
|
||||
remove.add(k)
|
||||
elif level < 3:
|
||||
# Check each level < 3 element and only keep those
|
||||
# that have many child paras
|
||||
for elem in list(v):
|
||||
children = len(paras(elem))
|
||||
if children < 5:
|
||||
v.remove(elem)
|
||||
|
||||
for k in remove:
|
||||
self.levels.pop(k)
|
||||
self.log.debug('Ignoring level', k)
|
||||
|
||||
def analyze_stats(self, stats):
|
||||
if not stats:
|
||||
return False
|
||||
mc = stats.most_common(1)
|
||||
if len(mc) > 1:
|
||||
return False
|
||||
mc = mc[0]
|
||||
most_common, most_common_count = mc
|
||||
if not most_common or most_common == '0':
|
||||
return False
|
||||
total = sum(stats.values())
|
||||
# True if greater than 95% of elements have the same margin
|
||||
return most_common_count/total > 0.95
|
||||
324
ebook_converter/ebooks/oeb/transforms/structure.py
Normal file
324
ebook_converter/ebooks/oeb/transforms/structure.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, uuid
|
||||
|
||||
from lxml import etree
|
||||
from collections import OrderedDict, Counter
|
||||
|
||||
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
|
||||
from calibre.ebooks import ConversionError
|
||||
from polyglot.builtins import itervalues, unicode_type
|
||||
from polyglot.urllib import urlparse
|
||||
|
||||
|
||||
def XPath(x):
    '''
    Compile the XPath expression ``x`` using the ``XPNSMAP`` namespace map.

    :raises ConversionError: if ``x`` is not a syntactically valid expression
    '''
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        message = 'The syntax of the XPath expression %s is invalid.' % repr(x)
        raise ConversionError(message)
    return compiled
|
||||
|
||||
|
||||
def isspace(x):
    '''
    Return True if ``x`` is empty/None or consists only of whitespace,
    with non-breaking spaces counting as whitespace.
    '''
    # ''.isspace() is False, so text made up solely of non-breaking spaces
    # must not be tested with isspace() after they have been removed.
    return not x or not x.replace('\xa0', '').strip()
|
||||
|
||||
|
||||
def at_start(elem):
    ' Return True if there is no content before elem '
    bodies = XPath('ancestor-or-self::h:body')(elem)
    if not bodies:
        return True
    ancestors = frozenset(XPath('ancestor::*')(elem))
    for node in bodies[0].iter():
        if node is elem:
            return True
        tag = getattr(node, 'tag', None)
        # Images count as content even though they carry no text
        if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] in {'img', 'svg'}:
            return False
        if not isspace(getattr(node, 'text', None)):
            return False
        # The tail of an ancestor comes after elem, so it does not matter
        if node not in ancestors and not isspace(getattr(node, 'tail', None)):
            return False
    return False
|
||||
|
||||
|
||||
class DetectStructure(object):
|
||||
|
||||
def __call__(self, oeb, opts):
    '''
    Detect chapters, build or filter the table of contents, insert the
    requested page breaks and set the start reading position.
    '''
    self.log = oeb.log
    self.oeb = oeb
    self.opts = opts
    self.log('Detecting structure...')

    self.detect_chapters()
    if self.oeb.auto_generated_toc or opts.use_auto_toc:
        orig_toc = self.oeb.toc
        self.oeb.toc = TOC()
        self.create_level_based_toc()
        if self.oeb.toc.count() < 1:
            if self.detected_chapters and not opts.no_chapters_in_toc:
                self.create_toc_from_chapters()
            if self.oeb.toc.count() < opts.toc_threshold:
                self.create_toc_from_links()
        if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
            # The generated TOC is worse than the one we started with
            self.oeb.toc = orig_toc
        else:
            self.oeb.auto_generated_toc = True
            self.log('Auto generated TOC with %d entries.' %
                    self.oeb.toc.count())

    if opts.toc_filter is not None:
        regexp = re.compile(opts.toc_filter)
        for node in list(self.oeb.toc.iter()):
            if node.title and regexp.search(node.title) is None:
                continue
            self.log('Filtering', node.title or 'empty node', 'from TOC')
            self.oeb.toc.remove(node)

    if opts.page_breaks_before is not None:
        headings = {'h1', 'h2'}
        pb_xpath = XPath(opts.page_breaks_before)
        for item in oeb.spine:
            for elem in pb_xpath(item.data):
                prev = next(elem.itersiblings(tag=etree.Element,
                    preceding=True), None)
                if (prev is not None
                        and barename(elem.tag) in headings
                        and barename(prev.tag) in headings
                        and not (prev.tail and prev.tail.split())):
                    # We have two adjacent headings, do not put a page
                    # break on the second one
                    continue

                style = elem.get('style', '')
                if style:
                    style += '; '
                elem.set('style', style+'page-break-before:always')

    for node in self.oeb.toc.iter():
        if not (node.title and node.title.strip()):
            node.title = _('Unnamed')

    if self.opts.start_reading_at:
        self.detect_start_reading()
|
||||
|
||||
def detect_start_reading(self):
    '''
    Point the guide's 'text' reference at the first element matched by the
    ``start_reading_at`` XPath option.

    Spine items are scanned in order and the first match wins. If the
    matched element has no ``id`` attribute, a unique one is generated and
    set on it so the guide reference can target it. Any existing 'text'
    guide entry is replaced. A warning is logged when the expression cannot
    be compiled or matches nothing.
    '''
    raw_expr = self.opts.start_reading_at
    try:
        matcher = XPath(raw_expr)
    except Exception:
        # Only real errors are treated as "invalid expression"; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        self.log.warn(
            'Invalid start reading at XPath expression, ignoring: %s'%raw_expr)
        return
    for item in self.oeb.spine:
        # Skip spine items whose data cannot be queried with XPath
        if not hasattr(item.data, 'xpath'):
            continue
        matches = matcher(item.data)
        if not matches:
            continue
        elem = matches[0]
        eid = elem.get('id', None)
        if not eid:
            # The guide reference needs a fragment to point at
            eid = 'start_reading_at_'+unicode_type(uuid.uuid4()).replace('-', '')
            elem.set('id', eid)
        if 'text' in self.oeb.guide:
            self.oeb.guide.remove('text')
        self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
        self.log('Setting start reading at position to %s in %s'%(
            self.opts.start_reading_at, item.href))
        return
    self.log.warn("Failed to find start reading at position: %s"%
        self.opts.start_reading_at)
|
||||
|
||||
def get_toc_parts_for_xpath(self, expr):
    '''
    Split a TOC XPath expression into ``(element_path, title_attribute)``.

    When the expression ends in an attribute step (``.../@name``), that step
    is cut off and its name is returned as the place to read the title text
    from. Otherwise the expression is returned unchanged together with
    ``None``.
    '''
    attr_step = re.search(r'/@([-\w]+)$', expr)
    if attr_step is None:
        return expr, None
    return expr[:attr_step.start()], attr_step.group(1)
|
||||
|
||||
def detect_chapters(self):
    '''
    Find chapter elements using the ``chapter`` XPath option and mark them.

    Populates ``self.detected_chapters`` with ``(spine item, element)``
    pairs and ``self.chapter_title_attribute`` with the attribute name taken
    from a trailing ``/@attr`` step of the expression (or None). Depending
    on the ``chapter_mark`` option, a marker element is inserted before each
    chapter: an ``hr`` for 'rule', a page-breaking ``div`` for 'pagebreak',
    nothing for 'none', and a page-breaking ``hr`` for any other value
    (normally 'both').
    '''
    self.detected_chapters = []
    self.chapter_title_attribute = None

    def find_matches(expr, doc):
        try:
            ans = XPath(expr)(doc)
            # Sanity check: raises for results that are not sized
            # (presumably scalar XPath results), which are then ignored.
            len(ans)
            return ans
        except Exception:
            # Narrower than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
            return []

    if not self.opts.chapter:
        return

    chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
    self.chapter_title_attribute = title_attribute
    for item in self.oeb.spine:
        for x in find_matches(chapter_path, item.data):
            self.detected_chapters.append((item, x))

    chapter_mark = self.opts.chapter_mark
    page_break_before = 'display: block; page-break-before: always'
    page_break_after = 'display: block; page-break-after: always'
    # Number of chapters seen so far in each spine item
    c = Counter()
    for item, elem in self.detected_chapters:
        c[item] += 1
        text = re.sub(r'\s+', ' ', xml2text(elem).strip())
        self.log('\tDetected chapter:', text[:50])
        if chapter_mark == 'none':
            continue
        if chapter_mark == 'rule':
            mark = elem.makeelement(XHTML('hr'))
        elif chapter_mark == 'pagebreak':
            if c[item] < 3 and at_start(elem):
                # For the first two elements in this item, check if they
                # are at the start of the file, in which case inserting a
                # page break is unnecessary and can lead to extra blank
                # pages in the PDF Output plugin. We need to use two as
                # feedbooks epubs match both a heading tag and its
                # containing div with the default chapter expression.
                continue
            mark = elem.makeelement(XHTML('div'), style=page_break_after)
        else:  # any other value, normally chapter_mark == 'both'
            mark = elem.makeelement(XHTML('hr'), style=page_break_before)
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception('Failed to mark chapter')
|
||||
|
||||
def create_level_based_toc(self):
    '''
    Build the multi-level TOC, but only when a ``level1_toc`` expression
    has been configured.
    '''
    if self.opts.level1_toc is None:
        return
    self.add_leveled_toc_items()
|
||||
|
||||
def create_toc_from_chapters(self):
    '''
    Add one TOC entry per detected chapter.

    Play orders are consecutive, starting at the TOC's next free play
    order; the same number is passed to ``elem_to_link`` as the counter for
    generated element ids.
    '''
    first_order = self.oeb.toc.next_play_order()
    numbered = enumerate(self.detected_chapters, first_order)
    for play_order, (item, elem) in numbered:
        title, target = self.elem_to_link(
            item, elem, self.chapter_title_attribute, play_order)
        self.oeb.toc.add(title, target, play_order=play_order)
|
||||
|
||||
def create_toc_from_links(self):
    '''
    Populate the TOC from the hyperlinks found in the spine documents.

    Only links without a URL scheme, or with the ``file`` scheme, are
    considered. A link is skipped when its target is already in the TOC or,
    unless ``duplicate_links_in_toc`` is set, when its text is. Processing
    stops once ``max_toc_links`` entries have been added (a value of zero
    or less disables the limit).
    '''
    toc = self.oeb.toc
    find_links = XPath('//h:a[@href]')
    added = 0
    for item in self.oeb.spine:
        for anchor in find_links(item.data):
            href = anchor.get('href')
            try:
                parsed = urlparse(href)
            except ValueError:
                self.log.warning('Ignoring malformed URL:', href)
                continue
            if parsed[0] and parsed[0] != 'file':
                # Link carries a non-file scheme, ignore it
                continue
            href = item.abshref(parsed.path)
            if parsed.fragment:
                href = '#'.join((href, parsed.fragment))
            if toc.has_href(href):
                continue
            text = xml2text(anchor)[:100].strip()
            if not self.opts.duplicate_links_in_toc and toc.has_text(text):
                continue
            try:
                toc.add(text, href, play_order=toc.next_play_order())
            except ValueError:
                # Most likely an incorrectly URL encoded link
                self.oeb.log.exception('Failed to process link: %r' % href)
                continue
            added += 1
            if 0 < self.opts.max_toc_links <= added:
                self.log('Maximum TOC links reached, stopping.')
                return
|
||||
|
||||
def elem_to_link(self, item, elem, title_attribute, counter):
    '''
    Return ``(text, href)`` for a TOC entry pointing at ``elem``.

    The text is the first non-empty value among: the ``title_attribute``
    attribute (when one is given), the element's text content, its
    ``title`` attribute and its ``alt`` attribute. Whitespace runs are
    collapsed and the result is limited to 1000 characters. The element
    keeps its ``id`` or is given ``calibre_toc_<counter>``; the href is
    ``item.href`` joined to that id with ``#``.
    '''
    label = '' if title_attribute is None else elem.get(title_attribute, '')
    # Each fallback is only evaluated when everything before it was empty
    label = (label or xml2text(elem).strip() or elem.get('title', '') or
             elem.get('alt', ''))
    label = re.sub(r'\s+', ' ', label.strip())
    label = label[:1000].strip()
    anchor = elem.get('id', 'calibre_toc_%d'%counter)
    elem.set('id', anchor)
    return label, '#'.join((item.href, anchor))
|
||||
|
||||
def add_leveled_toc_items(self):
    '''
    Build a TOC of up to three levels from the ``level1_toc``,
    ``level2_toc`` and ``level3_toc`` XPath options.

    For every spine document, level 1 matches become top-level TOC nodes.
    Each level 2 match is attached to the closest level 1 element that
    precedes it in document order, falling back to the last level 1 node
    created in an earlier document. Level 3 matches are attached to level 2
    nodes in the same way. Matches that yield no text, or that have no
    parent node to attach to, are skipped.
    '''
    added = OrderedDict()   # level 1 element -> TOC node
    added2 = OrderedDict()  # level 2 element -> TOC node
    counter = 1

    def find_matches(expr, doc):
        try:
            ans = XPath(expr)(doc)
            # Sanity check: raises for results that are not sized
            # (presumably scalar XPath results), which are then ignored.
            len(ans)
            return ans
        except Exception:
            # Narrower than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
            return []

    for document in self.oeb.spine:
        # Parents carried over from earlier documents, used when a lower
        # level entry appears before any higher level entry in this one.
        previous_level1 = list(itervalues(added))[-1] if added else None
        previous_level2 = list(itervalues(added2))[-1] if added2 else None

        level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
        for elem in find_matches(level1_toc, document.data):
            text, _href = self.elem_to_link(document, elem, level1_title, counter)
            counter += 1
            if text:
                node = self.oeb.toc.add(text, _href,
                    play_order=self.oeb.toc.next_play_order())
                added[elem] = node

        if self.opts.level2_toc is not None and added:
            level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
            for elem in find_matches(level2_toc, document.data):
                level1 = None
                # Walk the document in order, remembering the most recent
                # level 1 element seen before reaching elem.
                for item in document.data.iterdescendants():
                    if item in added:
                        level1 = added[item]
                    elif item == elem:
                        if level1 is None:
                            if previous_level1 is None:
                                break
                            level1 = previous_level1
                        text, _href = self.elem_to_link(document, elem, level2_title, counter)
                        counter += 1
                        if text:
                            added2[elem] = level1.add(text, _href,
                                play_order=self.oeb.toc.next_play_order())
                        break

            # added2 is only ever filled by the level 2 pass above
            if self.opts.level3_toc is not None and added2:
                level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
                for elem in find_matches(level3_toc, document.data):
                    level2 = None
                    for item in document.data.iterdescendants():
                        if item in added2:
                            level2 = added2[item]
                        elif item == elem:
                            if level2 is None:
                                if previous_level2 is None:
                                    break
                                level2 = previous_level2
                            text, _href = \
                                self.elem_to_link(document, elem, level3_title, counter)
                            counter += 1
                            if text:
                                level2.add(text, _href,
                                    play_order=self.oeb.toc.next_play_order())
                            break
|
||||
73
ebook_converter/ebooks/oeb/transforms/trimmanifest.py
Normal file
73
ebook_converter/ebooks/oeb/transforms/trimmanifest.py
Normal file
@@ -0,0 +1,73 @@
|
||||
'''
|
||||
OPF manifest trimming transform.
|
||||
'''
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
|
||||
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
|
||||
from polyglot.urllib import urldefrag
|
||||
|
||||
|
||||
class ManifestTrimmer(object):
    '''
    Transform that removes manifest items which nothing refers to.

    An item is kept when it is referenced from the metadata, from the
    guide, is part of the spine, or is linked (directly or transitively)
    from an item that is kept.
    '''

    @classmethod
    def config(cls, cfg):
        '''Return ``cfg`` unchanged; this transform adds no options.'''
        return cfg

    @classmethod
    def generate(cls, opts):
        '''Return a new instance; ``opts`` is not used.'''
        return cls()

    def __call__(self, oeb, context):
        '''
        Remove every unused item from ``oeb.manifest``.

        :param oeb: The book whose manifest is trimmed in place.
        :param context: Stored on the instance as ``self.opts``.
        '''
        import css_parser
        oeb.logger.info('Trimming unused files from manifest...')
        self.opts = context
        used = set()
        for term in oeb.metadata:
            for item in oeb.metadata[term]:
                # A metadata value may name a manifest item by href or id
                if item.value in oeb.manifest.hrefs:
                    used.add(oeb.manifest.hrefs[item.value])
                elif item.value in oeb.manifest.ids:
                    used.add(oeb.manifest.ids[item.value])
        for ref in oeb.guide.values():
            path = urldefrag(ref.href)[0]
            if path in oeb.manifest.hrefs:
                used.add(oeb.manifest.hrefs[path])
        # TOC items are required to be in the spine
        for item in oeb.spine:
            used.add(item)
        # Follow links outwards from the used items until no new items
        # turn up. ``unchecked`` holds the items found in the last round.
        unchecked = used
        while unchecked:
            new = set()
            for item in unchecked:
                if (item.media_type in OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')) and \
                        item.data is not None:
                    hrefs = [r[2] for r in iterlinks(item.data)]
                    for href in hrefs:
                        try:
                            if isinstance(href, bytes):
                                href = href.decode('utf-8')
                            href = item.abshref(urlnormalize(href))
                        except Exception:
                            # Best effort: a link that cannot be decoded or
                            # normalized is ignored. Unlike a bare except
                            # this lets KeyboardInterrupt/SystemExit through.
                            continue
                        if href in oeb.manifest.hrefs:
                            found = oeb.manifest.hrefs[href]
                            if found not in used:
                                new.add(found)
                elif item.media_type == CSS_MIME:
                    for href in css_parser.getUrls(item.data):
                        href = item.abshref(urlnormalize(href))
                        if href in oeb.manifest.hrefs:
                            found = oeb.manifest.hrefs[href]
                            if found not in used:
                                new.add(found)
            used.update(new)
            unchecked = new
        # Snapshot the items first, since the manifest is modified inside
        # the loop.
        for item in list(oeb.manifest.values()):
            if item not in used:
                oeb.logger.info('Trimming %r from manifest' % item.href)
                oeb.manifest.remove(item)
|
||||
Reference in New Issue
Block a user