1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-28 14:33:31 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.oeb.base import XPath, urlunquote
from polyglot.builtins import as_bytes
class DataURL(object):
def __call__(self, oeb, opts):
from calibre.utils.imghdr import what
self.log = oeb.log
attr_path = XPath('//h:img[@src]')
for item in oeb.spine:
root = item.data
if not hasattr(root, 'xpath'):
continue
for img in attr_path(root):
raw = img.get('src', '')
if not raw.startswith('data:'):
continue
header, data = raw.partition(',')[0::2]
if not header.startswith('data:image/') or not data:
continue
if ';base64' in header:
data = re.sub(r'\s+', '', data)
from polyglot.binary import from_base64_bytes
try:
data = from_base64_bytes(data)
except Exception:
self.log.error('Found invalid base64 encoded data URI, ignoring it')
continue
else:
data = urlunquote(data)
data = as_bytes(data)
fmt = what(None, data)
if not fmt:
self.log.warn('Image encoded as data URL has unknown format, ignoring')
continue
img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
def convert_image_data_uri(self, data, fmt, oeb):
self.log('Found image encoded as data URI converting it to normal image')
from calibre import guess_type
item_id, item_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt)
oeb.manifest.add(item_id, item_href, guess_type(item_href)[0], data=data)
return item_href

View File

@@ -0,0 +1,684 @@
'''
CSS flattening transform.
'''
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import re, operator, math, numbers
from collections import defaultdict
from xml.dom import SyntaxErr
from lxml import etree
import css_parser
from css_parser.css import Property
from calibre import guess_type
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES,
namespace, barename, XPath, css_text)
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.filenames import ascii_filename, ascii_text
from calibre.utils.icu import numeric_sort_key
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, map
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
STRIPNUM = re.compile(r'[-0-9]+$')
def asfloat(value, default):
if not isinstance(value, numbers.Number):
value = default
return float(value)
class KeyMapper(object):
def __init__(self, sbase, dbase, dkey):
self.sbase = float(sbase)
self.dprop = [(self.relate(x, dbase), float(x)) for x in dkey]
self.cache = {}
@staticmethod
def relate(size, base):
if size == 0:
return base
size = float(size)
base = float(base)
if abs(size - base) < 0.1:
return 0
sign = -1 if size < base else 1
endp = 0 if size < base else 36
diff = (abs(base - size) * 3) + ((36 - size) / 100)
logb = abs(base - endp)
if logb == 1.0:
logb = 1.1
try:
result = sign * math.log(diff, logb)
except ValueError:
if diff < 0:
# Size is both very large and close to base
return 0
if logb == 0:
logb = 1e-6
if diff == 0:
diff = 1e-6
result = sign * math.log(diff, logb)
return result
def __getitem__(self, ssize):
ssize = asfloat(ssize, 0)
if ssize in self.cache:
return self.cache[ssize]
dsize = self.map(ssize)
self.cache[ssize] = dsize
return dsize
def map(self, ssize):
sbase = self.sbase
prop = self.relate(ssize, sbase)
diff = [(abs(prop - p), s) for p, s in self.dprop]
dsize = min(diff)[1]
return dsize
class ScaleMapper(object):
def __init__(self, sbase, dbase):
self.dscale = float(dbase) / float(sbase)
def __getitem__(self, ssize):
ssize = asfloat(ssize, 0)
dsize = ssize * self.dscale
return dsize
class NullMapper(object):
def __init__(self):
pass
def __getitem__(self, ssize):
return ssize
def FontMapper(sbase=None, dbase=None, dkey=None):
if sbase and dbase and dkey:
return KeyMapper(sbase, dbase, dkey)
elif sbase and dbase:
return ScaleMapper(sbase, dbase)
else:
return NullMapper()
class EmbedFontsCSSRules(object):
def __init__(self, body_font_family, rules):
self.body_font_family, self.rules = body_font_family, rules
self.href = None
def __call__(self, oeb):
if not self.body_font_family:
return None
if not self.href:
iid, href = oeb.manifest.generate('page_styles', 'page_styles.css')
rules = [css_text(x) for x in self.rules]
rules = '\n\n'.join(rules)
sheet = css_parser.parseString(rules, validate=False)
self.href = oeb.manifest.add(iid, href, guess_type(href)[0],
data=sheet).href
return self.href
class CSSFlattener(object):
def __init__(self, fbase=None, fkey=None, lineh=None, unfloat=False,
untable=False, page_break_on_body=False, specializer=None,
transform_css_rules=()):
self.fbase = fbase
self.transform_css_rules = transform_css_rules
if self.transform_css_rules:
from calibre.ebooks.css_transform_rules import compile_rules
self.transform_css_rules = compile_rules(self.transform_css_rules)
self.fkey = fkey
self.lineh = lineh
self.unfloat = unfloat
self.untable = untable
self.specializer = specializer
self.page_break_on_body = page_break_on_body
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Flattening CSS and remapping font sizes...')
self.context = self.opts = context
self.oeb = oeb
self.items = list(self.oeb.spine)
titlepage = self.oeb.guide.get('titlepage')
if titlepage is not None:
titlepage = titlepage.item
if titlepage is not None and titlepage not in self.items:
self.items.append(titlepage)
epub3_nav = None
if getattr(self.opts, 'epub3_nav_href', None):
epub3_nav = self.oeb.manifest.hrefs.get(self.opts.epub3_nav_href)
if epub3_nav is not None and epub3_nav not in self.items:
self.items.append(epub3_nav)
self.filter_css = frozenset()
if self.opts.filter_css:
try:
self.filter_css = {x.strip().lower() for x in
self.opts.filter_css.split(',')}
except:
self.oeb.log.warning('Failed to parse filter_css, ignoring')
else:
from calibre.ebooks.oeb.normalize_css import normalize_filter_css
self.filter_css = frozenset(normalize_filter_css(self.filter_css))
self.oeb.log.debug('Filtering CSS properties: %s'%
', '.join(self.filter_css))
for item in oeb.manifest.values():
# Make all links to resources absolute, as these sheets will be
# consolidated into a single stylesheet at the root of the document
if item.media_type in OEB_STYLES:
css_parser.replaceUrls(item.data, item.abshref,
ignoreImportRules=True)
self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
self.opts.embed_font_family)
# Store for use in output plugins/transforms that generate content,
# like the AZW3 output inline ToC.
self.oeb.store_embed_font_rules = EmbedFontsCSSRules(self.body_font_family,
self.embed_font_rules)
self.stylize_spine()
self.sbase = self.baseline_spine() if self.fbase else None
self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
self.flatten_spine()
if epub3_nav is not None:
self.opts.epub3_nav_parsed = epub3_nav.data
self.store_page_margins()
def store_page_margins(self):
self.opts._stored_page_margins = {}
for item, stylizer in iteritems(self.stylizers):
margins = self.opts._stored_page_margins[item.href] = {}
for prop, val in stylizer.page_rule.items():
p, w = prop.partition('-')[::2]
if p == 'margin':
margins[w] = unit_convert(
val, stylizer.profile.width_pts, stylizer.body_font_size,
stylizer.profile.dpi, body_font_size=stylizer.body_font_size)
def get_embed_font_info(self, family, failure_critical=True):
efi = []
body_font_family = None
if not family:
return body_font_family, efi
from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family
try:
faces = font_scanner.fonts_for_family(family)
except NoFonts:
msg = ('No embeddable fonts found for family: %r'%family)
if failure_critical:
raise ValueError(msg)
self.oeb.log.warn(msg)
return body_font_family, efi
if not faces:
msg = ('No embeddable fonts found for family: %r'%family)
if failure_critical:
raise ValueError(msg)
self.oeb.log.warn(msg)
return body_font_family, efi
for i, font in enumerate(faces):
ext = 'otf' if font['is_otf'] else 'ttf'
fid, href = self.oeb.manifest.generate(id=u'font',
href='fonts/%s.%s'%(ascii_filename(font['full_name']).replace(' ', '-'), ext))
item = self.oeb.manifest.add(fid, href,
guess_type('dummy.'+ext)[0],
data=font_scanner.get_font_data(font))
item.unload_data_from_memory()
cfont = {
'font-family': '"%s"'%font['font-family'],
'panose-1': ' '.join(map(unicode_type, font['panose'])),
'src': 'url(%s)'%item.href,
}
if i == 0:
generic_family = panose_to_css_generic_family(font['panose'])
body_font_family = "'%s',%s"%(font['font-family'], generic_family)
self.oeb.log('Embedding font: %s'%font['font-family'])
for k in ('font-weight', 'font-style', 'font-stretch'):
if font[k] != 'normal':
cfont[k] = font[k]
rule = '@font-face { %s }'%('; '.join('%s:%s'%(k, v) for k, v in
iteritems(cfont)))
rule = css_parser.parseString(rule)
efi.append(rule)
return body_font_family, efi
def stylize_spine(self):
self.stylizers = {}
profile = self.context.source
css = ''
for item in self.items:
html = item.data
body = html.find(XHTML('body'))
if 'style' in html.attrib:
b = body.attrib.get('style', '')
body.set('style', html.get('style') + ';' + b)
del html.attrib['style']
bs = body.get('style', '').split(';')
bs.append('margin-top: 0pt')
bs.append('margin-bottom: 0pt')
if float(self.context.margin_left) >= 0:
bs.append('margin-left : %gpt'%
float(self.context.margin_left))
if float(self.context.margin_right) >= 0:
bs.append('margin-right : %gpt'%
float(self.context.margin_right))
bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
if self.page_break_on_body:
bs.extend(['page-break-before: always'])
if self.context.change_justification != 'original':
bs.append('text-align: '+ self.context.change_justification)
if self.body_font_family:
bs.append('font-family: '+self.body_font_family)
body.set('style', '; '.join(bs))
stylizer = Stylizer(html, item.href, self.oeb, self.context, profile,
user_css=self.context.extra_css,
extra_css=css)
self.stylizers[item] = stylizer
def baseline_node(self, node, stylizer, sizes, csize):
csize = stylizer.style(node)['font-size']
if node.text:
sizes[csize] += len(COLLAPSE.sub(' ', node.text))
for child in node:
self.baseline_node(child, stylizer, sizes, csize)
if child.tail:
sizes[csize] += len(COLLAPSE.sub(' ', child.tail))
def baseline_spine(self):
sizes = defaultdict(float)
for item in self.items:
html = item.data
stylizer = self.stylizers[item]
body = html.find(XHTML('body'))
fsize = self.context.source.fbase
self.baseline_node(body, stylizer, sizes, fsize)
try:
sbase = max(list(sizes.items()), key=operator.itemgetter(1))[0]
except:
sbase = 12.0
self.oeb.logger.info(
"Source base font size is %0.05fpt" % sbase)
return sbase
def clean_edges(self, cssdict, style, fsize):
slineh = self.sbase * 1.26
dlineh = self.lineh
for kind in ('margin', 'padding'):
for edge in ('bottom', 'top'):
property = "%s-%s" % (kind, edge)
if property not in cssdict:
continue
if '%' in cssdict[property]:
continue
value = style[property]
if value == 0:
continue
elif value <= slineh:
cssdict[property] = "%0.5fem" % (dlineh / fsize)
else:
try:
value = round(value / slineh) * dlineh
except:
self.oeb.logger.warning(
'Invalid length:', value)
value = 0.0
cssdict[property] = "%0.5fem" % (value / fsize)
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
if not isinstance(node.tag, string_or_bytes) \
or namespace(node.tag) != XHTML_NS:
return
tag = barename(node.tag)
style = stylizer.style(node)
cssdict = style.cssdict()
try:
font_size = style['font-size']
except:
font_size = self.sbase if self.sbase is not None else \
self.context.source.fbase
if tag == 'body' and isinstance(font_size, numbers.Number):
stylizer.body_font_size = font_size
if 'align' in node.attrib:
if tag != 'img':
cssdict['text-align'] = node.attrib['align']
if cssdict['text-align'] == 'center':
# align=center causes tables to be center aligned,
# which text-align does not. And the ever trustworthy Word
# uses this construct in its HTML output. See
# https://bugs.launchpad.net/bugs/1569583
if tag == 'table':
if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
else:
for table in node.iterchildren(XHTML("table")):
ts = stylizer.style(table)
if ts.get('margin-left') is None and ts.get('margin-right') is None:
ts.set('margin-left', 'auto')
ts.set('margin-right', 'auto')
else:
val = node.attrib['align']
if val in ('middle', 'bottom', 'top'):
cssdict['vertical-align'] = val
elif val in ('left', 'right'):
cssdict['float'] = val
del node.attrib['align']
if 'valign' in node.attrib and tag == 'td':
if cssdict.get('vertical-align') == 'inherit':
cssdict['vertical-align'] = node.attrib['valign']
del node.attrib['valign']
if node.tag == XHTML('font'):
tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
tag = 'div' if XPath('|'.join(tags))(node) else 'span'
node.tag = XHTML(tag)
if 'size' in node.attrib:
def force_int(raw):
return int(re.search(r'([0-9+-]+)', raw).group(1))
size = node.attrib['size'].strip()
if size:
fnums = self.context.source.fnums
if size[0] in ('+', '-'):
# Oh, the warcrimes
try:
esize = 3 + force_int(size)
except:
esize = 3
if esize < 1:
esize = 1
if esize > 7:
esize = 7
font_size = fnums[esize]
else:
try:
font_size = fnums[force_int(size)]
except:
font_size = fnums[3]
cssdict['font-size'] = '%.1fpt'%font_size
del node.attrib['size']
if 'face' in node.attrib:
cssdict['font-family'] = node.attrib['face']
del node.attrib['face']
if 'color' in node.attrib:
try:
cssdict['color'] = Property('color', node.attrib['color']).value
except (ValueError, SyntaxErr):
pass
del node.attrib['color']
if 'bgcolor' in node.attrib:
try:
cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
except (ValueError, SyntaxErr):
pass
del node.attrib['bgcolor']
if tag == 'ol' and 'type' in node.attrib:
del node.attrib['type']
if cssdict.get('font-weight', '').lower() == 'medium':
cssdict['font-weight'] = 'normal' # ADE chokes on font-weight medium
fsize = font_size
is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
# Detect drop caps generated by the docx input plugin
if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
dp = node.getparent()
if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
if stylizer.style(dp).cssdict().get('float', None) == 'left':
is_drop_cap = True
if not self.context.disable_font_rescaling and not is_drop_cap:
_sbase = self.sbase if self.sbase is not None else \
self.context.source.fbase
dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
if dyn_rescale is not None:
try:
dyn_rescale = float(dyn_rescale) / 100
except Exception:
dyn_rescale = 1
fsize = self.fmap[_sbase]
fsize *= dyn_rescale
cssdict['font-size'] = '%0.5fem'%(fsize/psize)
psize = fsize
elif 'font-size' in cssdict or tag == 'body':
fsize = self.fmap[font_size]
try:
cssdict['font-size'] = "%0.5fem" % (fsize / psize)
except ZeroDivisionError:
cssdict['font-size'] = '%.1fpt'%fsize
psize = fsize
try:
minlh = self.context.minimum_line_height / 100.
slh = style['line-height']
if not is_drop_cap and isinstance(slh, numbers.Number) and slh < minlh * fsize:
cssdict['line-height'] = unicode_type(minlh)
except Exception:
self.oeb.logger.exception('Failed to set minimum line-height')
if cssdict:
for x in self.filter_css:
popval = cssdict.pop(x, None)
if self.body_font_family and popval and x == 'font-family' \
and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
cssdict[x] = popval
if cssdict:
if self.lineh and self.fbase and tag != 'body':
self.clean_edges(cssdict, style, psize)
if 'display' in cssdict and cssdict['display'] == 'in-line':
cssdict['display'] = 'inline'
if self.unfloat and 'float' in cssdict \
and cssdict.get('display', 'none') != 'none':
del cssdict['display']
if self.untable and 'display' in cssdict \
and cssdict['display'].startswith('table'):
display = cssdict['display']
if display == 'table-cell':
cssdict['display'] = 'inline'
else:
cssdict['display'] = 'block'
if 'vertical-align' in cssdict \
and cssdict['vertical-align'] == 'sup':
cssdict['vertical-align'] = 'super'
if self.lineh and 'line-height' not in cssdict:
lineh = self.lineh / psize
cssdict['line-height'] = "%0.5fem" % lineh
if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
for prop in ('margin', 'padding', 'border'):
for edge in ('top', 'bottom'):
cssdict['%s-%s'%(prop, edge)] = '0pt'
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = \
'%fem'%self.context.insert_blank_line_size
indent_size = self.context.remove_paragraph_spacing_indent_size
keep_indents = indent_size < 0.0
if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
cssdict['text-indent'] = "%1.1fem" % indent_size
pseudo_classes = style.pseudo_classes(self.filter_css)
if cssdict or pseudo_classes:
keep_classes = set()
if cssdict:
items = sorted(iteritems(cssdict))
css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
classes = node.get('class', '').strip() or 'calibre'
classes_list = classes.split()
# lower() because otherwise if the document uses the same class
# name with different case, both cases will apply, leading
# to incorrect results.
klass = ascii_text(STRIPNUM.sub('', classes_list[0])).lower().strip().replace(' ', '_')
if css in styles:
match = styles[css]
else:
match = klass + unicode_type(names[klass] or '')
styles[css] = match
names[klass] += 1
node.attrib['class'] = match
keep_classes.add(match)
for psel, cssdict in iteritems(pseudo_classes):
items = sorted(iteritems(cssdict))
css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
pstyles = pseudo_styles[psel]
if css in pstyles:
match = pstyles[css]
else:
# We have to use a different class for each psel as
# otherwise you can have incorrect styles for a situation
# like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
# If the pcalibre class for a:hover and a:link is the same,
# then the class attribute for a.x tags will contain both
# that class and the class for a.x:hover, which is wrong.
klass = 'pcalibre'
match = klass + unicode_type(names[klass] or '')
pstyles[css] = match
names[klass] += 1
keep_classes.add(match)
node.attrib['class'] = ' '.join(keep_classes)
elif 'class' in node.attrib:
del node.attrib['class']
if 'style' in node.attrib:
del node.attrib['style']
if recurse:
for child in node:
self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
def flatten_head(self, item, href, global_href):
html = item.data
head = html.find(XHTML('head'))
def safe_lower(x):
try:
x = x.lower()
except Exception:
pass
return x
for node in html.xpath('//*[local-name()="style" or local-name()="link"]'):
if node.tag == XHTML('link') \
and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \
and safe_lower(node.get('type', CSS_MIME)) in OEB_STYLES:
node.getparent().remove(node)
elif node.tag == XHTML('style') \
and node.get('type', CSS_MIME) in OEB_STYLES:
node.getparent().remove(node)
href = item.relhref(href)
l = etree.SubElement(head, XHTML('link'),
rel='stylesheet', type=CSS_MIME, href=href)
l.tail='\n'
if global_href:
href = item.relhref(global_href)
l = etree.SubElement(head, XHTML('link'),
rel='stylesheet', type=CSS_MIME, href=href)
l.tail = '\n'
def replace_css(self, css):
manifest = self.oeb.manifest
for item in manifest.values():
if item.media_type in OEB_STYLES:
manifest.remove(item)
id, href = manifest.generate('css', 'stylesheet.css')
sheet = css_parser.parseString(css, validate=False)
if self.transform_css_rules:
from calibre.ebooks.css_transform_rules import transform_sheet
transform_sheet(self.transform_css_rules, sheet)
item = manifest.add(id, href, CSS_MIME, data=sheet)
self.oeb.manifest.main_stylesheet = item
return href
def collect_global_css(self):
global_css = defaultdict(list)
for item in self.items:
stylizer = self.stylizers[item]
if float(self.context.margin_top) >= 0:
stylizer.page_rule['margin-top'] = '%gpt'%\
float(self.context.margin_top)
if float(self.context.margin_bottom) >= 0:
stylizer.page_rule['margin-bottom'] = '%gpt'%\
float(self.context.margin_bottom)
items = sorted(stylizer.page_rule.items())
css = ';\n'.join("%s: %s" % (key, val) for key, val in items)
css = ('@page {\n%s\n}\n'%css) if items else ''
rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
raw = '\n\n'.join(rules)
css += '\n\n' + raw
global_css[css].append(item)
gc_map = {}
manifest = self.oeb.manifest
for css in global_css:
href = None
if css.strip():
id_, href = manifest.generate('page_css', 'page_styles.css')
sheet = css_parser.parseString(css, validate=False)
if self.transform_css_rules:
from calibre.ebooks.css_transform_rules import transform_sheet
transform_sheet(self.transform_css_rules, sheet)
manifest.add(id_, href, CSS_MIME, data=sheet)
gc_map[css] = href
ans = {}
for css, items in iteritems(global_css):
for item in items:
ans[item] = gc_map[css]
return ans
def flatten_spine(self):
names = defaultdict(int)
styles, pseudo_styles = {}, defaultdict(dict)
for item in self.items:
html = item.data
stylizer = self.stylizers[item]
if self.specializer is not None:
self.specializer(item, stylizer)
fsize = self.context.dest.fbase
self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False)
self.flatten_node(html.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id)
items = sorted(((key, val) for (val, key) in iteritems(styles)), key=lambda x:numeric_sort_key(x[0]))
# :hover must come after link and :active must come after :hover
psels = sorted(pseudo_styles, key=lambda x :
{'hover':1, 'active':2}.get(x, 0))
for psel in psels:
styles = pseudo_styles[psel]
if not styles:
continue
x = sorted(((k+':'+psel, v) for v, k in iteritems(styles)))
items.extend(x)
css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)
href = self.replace_css(css)
global_css = self.collect_global_css()
for item in self.items:
stylizer = self.stylizers[item]
self.flatten_head(item, href, global_css[item])

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class Clean(object):
'''Clean up guide, leaving only known values '''
def __call__(self, oeb, opts):
self.oeb, self.log, self.opts = oeb, oeb.log, opts
if 'cover' not in self.oeb.guide:
covers = []
for x in ('other.ms-coverimage-standard', 'coverimagestandard',
'other.ms-titleimage-standard', 'other.ms-titleimage',
'other.ms-coverimage', 'other.ms-thumbimage-standard',
'other.ms-thumbimage', 'thumbimagestandard'):
if x in self.oeb.guide:
href = self.oeb.guide[x].href
try:
item = self.oeb.manifest.hrefs[href]
except KeyError:
continue
else:
covers.append([self.oeb.guide[x], len(item.data)])
covers.sort(key=lambda x: x[1], reverse=True)
if covers:
ref = covers[0][0]
if len(covers) > 1:
self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
ref.type = 'cover'
self.oeb.guide.refs['cover'] = ref
if ('start' in self.oeb.guide and 'text' not in self.oeb.guide):
# Prefer text to start as per the OPF 2.0 spec
x = self.oeb.guide['start']
self.oeb.guide.add('text', x.title, x.href)
self.oeb.guide.remove('start')
for x in list(self.oeb.guide):
if x.lower() not in {
'cover', 'titlepage', 'masthead', 'toc', 'title-page',
'copyright-page', 'text', 'index', 'glossary',
'acknowledgements', 'bibliography', 'colophon',
'copyright-page', 'dedication', 'epigraph', 'foreword',
'loi', 'lot', 'notes', 'preface'}:
item = self.oeb.guide[x]
if item.title and item.title.lower() == 'start':
continue
self.oeb.guide.remove(x)

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, re
from xml.sax.saxutils import escape
from string import Formatter
from calibre import guess_type, strftime
from calibre.constants import iswindows
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urldefrag, urlnormalize
from calibre.library.comments import comments_to_html, markdown
from calibre.utils.date import is_date_undefined, as_local_time
from calibre.utils.icu import sort_key
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.metadata import fmt_sidx, rating_to_stars
from polyglot.builtins import unicode_type, map
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
class SafeFormatter(Formatter):
def get_value(self, *args, **kwargs):
try:
return Formatter.get_value(self, *args, **kwargs)
except KeyError:
return ''
class Base(object):
def remove_images(self, item, limit=1):
path = XPath('//h:img[@src]')
removed = 0
for img in path(item.data):
if removed >= limit:
break
href = item.abshref(img.get('src'))
image = self.oeb.manifest.hrefs.get(href)
if image is None:
href = urlnormalize(href)
image = self.oeb.manifest.hrefs.get(href)
if image is not None:
self.oeb.manifest.remove(image)
self.oeb.guide.remove_by_href(href)
img.getparent().remove(img)
removed += 1
return removed
class RemoveFirstImage(Base):
def remove_first_image(self):
deleted_item = None
for item in self.oeb.spine:
if XPath(JACKET_XPATH)(item.data):
continue
removed = self.remove_images(item)
if removed > 0:
self.log('Removed first image')
body = XPath('//h:body')(item.data)
if body:
raw = xml2text(body[0]).strip()
imgs = XPath('//h:img|//svg:svg')(item.data)
if not raw and not imgs:
self.log('Removing %s as it has no content'%item.href)
self.oeb.manifest.remove(item)
deleted_item = item
break
else:
self.log.warn('Could not find first image to remove')
if deleted_item is not None:
for item in list(self.oeb.toc):
href = urldefrag(item.href)[0]
if href == deleted_item.href:
self.oeb.toc.remove(item)
self.oeb.guide.remove_by_href(deleted_item.href)
def __call__(self, oeb, opts, metadata):
'''
Add metadata in jacket.xhtml if specified in opts
If not specified, remove previous jacket instance
'''
self.oeb, self.opts, self.log = oeb, opts, oeb.log
if opts.remove_first_image:
self.remove_first_image()
class Jacket(Base):
'''
Book jacket manipulation. Remove first image and insert comments at start of
book.
'''
def insert_metadata(self, mi):
self.log('Inserting metadata into book...')
try:
tags = list(map(unicode_type, self.oeb.metadata.subject))
except Exception:
tags = []
try:
comments = unicode_type(self.oeb.metadata.description[0])
except:
comments = ''
try:
title = unicode_type(self.oeb.metadata.title[0])
except:
title = _('Unknown')
try:
authors = list(map(unicode_type, self.oeb.metadata.creator))
except:
authors = [_('Unknown')]
root = render_jacket(mi, self.opts.output_profile,
alt_title=title, alt_tags=tags, alt_authors=authors,
alt_comments=comments, rescale_fonts=True)
id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')
jacket = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root)
self.oeb.spine.insert(0, jacket, True)
self.oeb.inserted_metadata_jacket = jacket
for img, path in referenced_images(root):
self.oeb.log('Embedding referenced image %s into jacket' % path)
ext = path.rpartition('.')[-1].lower()
item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext)
with open(path, 'rb') as f:
item = self.oeb.manifest.add(item_id, href, guess_type(href)[0], data=f.read())
item.unload_data_from_memory()
img.set('src', jacket.relhref(item.href))
def remove_existing_jacket(self):
for x in self.oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data):
self.remove_images(x, limit=sys.maxsize)
self.oeb.manifest.remove(x)
self.log('Removed existing jacket')
break
def __call__(self, oeb, opts, metadata):
'''
Add metadata in jacket.xhtml if specified in opts
If not specified, remove previous jacket instance
'''
self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.remove_existing_jacket()
if opts.insert_metadata:
self.insert_metadata(metadata)
# Render Jacket {{{
def get_rating(rating, rchar, e_rchar):
ans = ''
try:
num = float(rating)/2
except:
return ans
num = max(0, num)
num = min(num, 5)
if num < 1:
return ans
ans = ("%s%s") % (rchar * int(num), e_rchar * (5 - int(num)))
return ans
class Series(unicode_type):
def __new__(self, series, series_index):
if series and series_index is not None:
roman = _('{1} of <em>{0}</em>').format(
escape(series), escape(fmt_sidx(series_index, use_roman=True)))
combined = _('{1} of <em>{0}</em>').format(
escape(series), escape(fmt_sidx(series_index, use_roman=False)))
else:
combined = roman = escape(series or u'')
s = unicode_type.__new__(self, combined)
s.roman = roman
s.name = escape(series or '')
s.number = escape(fmt_sidx(series_index or 1.0, use_roman=False))
s.roman_number = escape(fmt_sidx(series_index or 1.0, use_roman=True))
return s
class Tags(unicode_type):
def __new__(self, tags, output_profile):
tags = [escape(x) for x in tags or ()]
t = unicode_type.__new__(self, ', '.join(tags))
t.alphabetical = ', '.join(sorted(tags, key=sort_key))
t.tags_list = tags
return t
def postprocess_jacket(root, output_profile, has_data):
# Post-process the generated html to strip out empty header items
def extract(tag):
parent = tag.getparent()
idx = parent.index(tag)
parent.remove(tag)
if tag.tail:
if idx == 0:
parent.text = (parent.text or '') + tag.tail
else:
if idx >= len(parent):
idx = -1
parent[-1].tail = (parent[-1].tail or '') + tag.tail
def extract_class(cls):
for tag in root.xpath('//*[@class="_"]'.replace('_', cls)):
extract(tag)
for key in 'series rating tags'.split():
if not has_data[key]:
extract_class('cbj_' + key)
if not has_data['pubdate']:
extract_class('cbj_pubdata')
if output_profile.short_name != 'kindle':
extract_class('cbj_kindle_banner_hr')
def render_jacket(mi, output_profile,
alt_title=_('Unknown'), alt_tags=[], alt_comments='',
alt_publisher='', rescale_fonts=False, alt_authors=None):
css = P('jacket/stylesheet.css', data=True).decode('utf-8')
template = P('jacket/template.xhtml', data=True).decode('utf-8')
template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)
try:
title_str = alt_title if mi.is_null('title') else mi.title
except:
title_str = _('Unknown')
title_str = escape(title_str)
title = '<span class="title">%s</span>' % title_str
series = Series(mi.series, mi.series_index)
try:
publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
except:
publisher = ''
publisher = escape(publisher)
try:
if is_date_undefined(mi.pubdate):
pubdate = ''
else:
dt = as_local_time(mi.pubdate)
pubdate = strftime('%Y', dt.timetuple())
except:
pubdate = ''
rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)
tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)
comments = mi.comments if mi.comments else alt_comments
comments = comments.strip()
if comments:
comments = comments_to_html(comments)
orig = mi.authors
if mi.is_null('authors'):
mi.authors = list(alt_authors or (_('Unknown'),))
try:
author = mi.format_authors()
except:
author = ''
mi.authors = orig
author = escape(author)
has_data = {}
def generate_html(comments):
args = dict(xmlns=XHTML_NS,
title_str=title_str,
css=css,
title=title,
author=author,
publisher=publisher,
pubdate_label=_('Published'), pubdate=pubdate,
series_label=_('Series'), series=series,
rating_label=_('Rating'), rating=rating,
tags_label=_('Tags'), tags=tags,
comments=comments,
footer='',
searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
)
for key in mi.custom_field_keys():
m = mi.get_user_metadata(key, False) or {}
try:
display_name, val = mi.format_field_extended(key)[:2]
dkey = key.replace('#', '_')
dt = m.get('datatype')
if dt == 'series':
args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
elif dt == 'rating':
args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
elif dt == 'comments':
val = val or ''
display = m.get('display', {})
ctype = display.get('interpret_as') or 'html'
if ctype == 'long-text':
val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
elif ctype == 'short-text':
val = '<span>%s</span>' % escape(val)
elif ctype == 'markdown':
val = markdown(val)
else:
val = comments_to_html(val)
args[dkey] = val
else:
args[dkey] = escape(val)
args[dkey+'_label'] = escape(display_name)
except Exception:
# if the val (custom column contents) is None, don't add to args
pass
if False:
print("Custom column values available in jacket template:")
for key in args.keys():
if key.startswith('_') and not key.endswith('_label'):
print(" %s: %s" % ('#' + key[1:], args[key]))
# Used in the comment describing use of custom columns in templates
# Don't change this unless you also change it in template.xhtml
args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
args['_genre'] = args.get('_genre', '{_genre}')
formatter = SafeFormatter()
generated_html = formatter.format(template, **args)
has_data['series'] = bool(series)
has_data['tags'] = bool(tags)
has_data['rating'] = bool(rating)
has_data['pubdate'] = bool(pubdate)
return strip_encoding_declarations(generated_html)
from calibre.ebooks.oeb.polish.parsing import parse
raw = generate_html(comments)
root = parse(raw, line_numbers=False, force_html5_parse=True)
if rescale_fonts:
# We ensure that the conversion pipeline will set the font sizes for
# text in the jacket to the same size as the font sizes for the rest of
# the text in the book. That means that as long as the jacket uses
# relative font sizes (em or %), the post conversion font size will be
# the same as for text in the main book. So text with size x em will
# be rescaled to the same value in both the jacket and the main content.
#
# We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(XHTML('div'))
fw.set('data-calibre-rescale', '100')
for child in body:
fw.append(child)
body.append(fw)
postprocess_jacket(root, output_profile, has_data)
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
pretty_html_tree(None, root)
return root
# }}}
def linearize_jacket(oeb):
for x in oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data):
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
e.tag = XHTML('div')
for e in XPath('//h:td')(x.data):
e.tag = XHTML('span')
break
def referenced_images(root):
for img in XPath('//h:img[@src]')(root):
src = img.get('src')
if src.startswith('file://'):
path = src[7:]
if iswindows and path.startswith('/'):
path = path[1:]
if os.path.exists(path):
yield img, path

View File

@@ -0,0 +1,218 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
from calibre.utils.date import isoformat, now
from calibre import guess_type
from polyglot.builtins import iteritems
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
from calibre.ebooks.oeb.base import OPF
if not mi.is_null('title'):
m.clear('title')
m.add('title', mi.title)
if mi.title_sort:
if not m.title:
m.add('title', mi.title_sort)
m.clear('title_sort')
m.add('title_sort', mi.title_sort)
if not mi.is_null('authors'):
m.filter('creator', lambda x : x.role.lower() in ['aut', ''])
for a in mi.authors:
attrib = {'role':'aut'}
if mi.author_sort:
attrib[OPF('file-as')] = mi.author_sort
m.add('creator', a, attrib=attrib)
if not mi.is_null('book_producer'):
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
m.add('contributor', mi.book_producer, role='bkp')
elif override_input_metadata:
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
if not mi.is_null('comments'):
m.clear('description')
m.add('description', mi.comments)
elif override_input_metadata:
m.clear('description')
if not mi.is_null('publisher'):
m.clear('publisher')
m.add('publisher', mi.publisher)
elif override_input_metadata:
m.clear('publisher')
if not mi.is_null('series'):
m.clear('series')
m.add('series', mi.series)
elif override_input_metadata:
m.clear('series')
identifiers = mi.get_identifiers()
set_isbn = False
for typ, val in iteritems(identifiers):
has = False
if typ.lower() == 'isbn':
set_isbn = True
for x in m.identifier:
if x.scheme.lower() == typ.lower():
x.content = val
has = True
if not has:
m.add('identifier', val, scheme=typ.upper())
if override_input_metadata and not set_isbn:
m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
if not mi.is_null('languages'):
m.clear('language')
for lang in mi.languages:
if lang and lang.lower() not in ('und', ''):
m.add('language', lang)
if not mi.is_null('series_index'):
m.clear('series_index')
m.add('series_index', mi.format_series_index())
elif override_input_metadata:
m.clear('series_index')
if not mi.is_null('rating'):
m.clear('rating')
m.add('rating', '%.2f'%mi.rating)
elif override_input_metadata:
m.clear('rating')
if not mi.is_null('tags'):
m.clear('subject')
for t in mi.tags:
m.add('subject', t)
elif override_input_metadata:
m.clear('subject')
if not mi.is_null('pubdate'):
m.clear('date')
m.add('date', isoformat(mi.pubdate))
if not mi.is_null('timestamp'):
m.clear('timestamp')
m.add('timestamp', isoformat(mi.timestamp))
if not mi.is_null('rights'):
m.clear('rights')
m.add('rights', mi.rights)
if not mi.is_null('publication_type'):
m.clear('publication_type')
m.add('publication_type', mi.publication_type)
if not m.timestamp:
m.add('timestamp', isoformat(now()))
class MergeMetadata(object):
'Merge in user metadata, including cover'
def __call__(self, oeb, mi, opts, override_input_metadata=False):
self.oeb, self.log = oeb, oeb.log
m = self.oeb.metadata
self.log('Merging user specified metadata...')
meta_info_to_oeb_metadata(mi, m, oeb.log,
override_input_metadata=override_input_metadata)
cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
m.clear('cover')
if cover_id is not None:
m.add('cover', cover_id)
if mi.uuid is not None:
m.filter('identifier', lambda x:x.id=='uuid_id')
self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
scheme='uuid')
self.oeb.uid = self.oeb.metadata.identifier[-1]
if mi.application_id is not None:
m.filter('identifier', lambda x:x.scheme=='calibre')
self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre')
def set_cover(self, mi, prefer_metadata_cover):
cdata, ext = b'', 'jpg'
if mi.cover and os.access(mi.cover, os.R_OK):
with open(mi.cover, 'rb') as f:
cdata = f.read()
ext = mi.cover.rpartition('.')[-1].lower().strip()
elif mi.cover_data and mi.cover_data[-1]:
cdata = mi.cover_data[1]
ext = mi.cover_data[0]
if ext not in ('png', 'jpg', 'jpeg'):
ext = 'jpg'
id = old_cover = None
if 'cover' in self.oeb.guide:
old_cover = self.oeb.guide['cover']
if prefer_metadata_cover and old_cover is not None:
cdata = b''
if cdata:
self.oeb.guide.remove('cover')
self.oeb.guide.remove('titlepage')
elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None:
# The amazon formats dont support html cover pages, so remove them
# even if no cover was specified.
self.oeb.guide.remove('titlepage')
do_remove_old_cover = False
if old_cover is not None:
if old_cover.href in self.oeb.manifest.hrefs:
item = self.oeb.manifest.hrefs[old_cover.href]
if not cdata:
return item.id
do_remove_old_cover = True
elif not cdata:
id = self.oeb.manifest.generate(id='cover')[0]
self.oeb.manifest.add(id, old_cover.href, 'image/jpeg')
return id
new_cover_item = None
if cdata:
id, href = self.oeb.manifest.generate('cover', 'cover.'+ext)
new_cover_item = self.oeb.manifest.add(id, href, guess_type('cover.'+ext)[0], data=cdata)
self.oeb.guide.add('cover', 'Cover', href)
if do_remove_old_cover:
self.remove_old_cover(item, new_cover_item.href)
return id
def remove_old_cover(self, cover_item, new_cover_href=None):
from calibre.ebooks.oeb.base import XPath, XLINK
from lxml import etree
self.oeb.manifest.remove(cover_item)
# Remove any references to the cover in the HTML
affected_items = set()
xp = XPath('//h:img[@src]|//svg:image[@xl:href]')
for i, item in enumerate(self.oeb.spine):
try:
images = xp(item.data)
except Exception:
images = ()
removed = False
for img in images:
href = img.get('src') or img.get(XLINK('href'))
try:
href = item.abshref(href)
except Exception:
continue # Invalid URL, ignore
if href == cover_item.href:
if new_cover_href is not None:
replacement_href = item.relhref(new_cover_href)
attr = 'src' if img.tag.endswith('img') else XLINK('href')
img.set(attr, replacement_href)
else:
p = img.getparent()
if p.tag.endswith('}svg'):
p.getparent().remove(p)
else:
p.remove(img)
removed = True
if removed:
affected_items.add(item)
# Check if the resulting HTML has no content, if so remove it
for item in affected_items:
body = XPath('//h:body')(item.data)
if body:
text = etree.tostring(body[0], method='text', encoding='unicode')
else:
text = ''
text = re.sub(r'\s+', '', text)
if not text and not XPath('//h:img|//svg:svg')(item.data):
self.log('Removing %s as it is a wrapper around'
' the cover image'%item.href)
self.oeb.spine.remove(item)
self.oeb.manifest.remove(item)
self.oeb.guide.remove_by_href(item.href)

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import numbers
from collections import Counter
from calibre.ebooks.oeb.base import barename, XPath
from polyglot.builtins import iteritems
class RemoveAdobeMargins(object):
'''
Remove margins specified in Adobe's page templates.
'''
def __call__(self, oeb, log, opts):
self.oeb, self.opts, self.log = oeb, opts, log
for item in self.oeb.manifest:
if item.media_type in {
'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml',
'application/adobe-page-template+xml', 'application/adobe.page-template+xml',
} and hasattr(item.data, 'xpath'):
self.log('Removing page margins specified in the'
' Adobe page template')
for elem in item.data.xpath(
'//*[@margin-bottom or @margin-top '
'or @margin-left or @margin-right]'):
for margin in ('left', 'right', 'top', 'bottom'):
attr = 'margin-'+margin
elem.attrib.pop(attr, None)
class NegativeTextIndent(Exception):
pass
class RemoveFakeMargins(object):
'''
Remove left and right margins from paragraph/divs if the same margin is specified
on almost all the elements at that level.
Must be called only after CSS flattening
'''
def __call__(self, oeb, log, opts):
if not opts.remove_fake_margins:
return
self.oeb, self.log, self.opts = oeb, log, opts
stylesheet = None
self.levels = {}
self.stats = {}
self.selector_map = {}
stylesheet = self.oeb.manifest.main_stylesheet
if stylesheet is None:
return
self.log('Removing fake margins...')
stylesheet = stylesheet.data
from css_parser.css import CSSRule
for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
self.selector_map[rule.selectorList.selectorText] = rule.style
self.find_levels()
for level in self.levels:
try:
self.process_level(level)
except NegativeTextIndent:
self.log.debug('Negative text indent detected at level '
' %s, ignoring this level'%level)
def get_margins(self, elem):
cls = elem.get('class', None)
if cls:
style = self.selector_map.get('.'+cls, None)
if style:
try:
ti = style['text-indent']
except:
pass
else:
if ((hasattr(ti, 'startswith') and ti.startswith('-')) or
isinstance(ti, numbers.Number) and ti < 0):
raise NegativeTextIndent()
return style.marginLeft, style.marginRight, style
return '', '', None
def process_level(self, level):
elems = self.levels[level]
self.stats[level+'_left'] = Counter()
self.stats[level+'_right'] = Counter()
for elem in elems:
lm, rm = self.get_margins(elem)[:2]
self.stats[level+'_left'][lm] += 1
self.stats[level+'_right'][rm] += 1
self.log.debug(level, ' left margin stats:', self.stats[level+'_left'])
self.log.debug(level, ' right margin stats:', self.stats[level+'_right'])
remove_left = self.analyze_stats(self.stats[level+'_left'])
remove_right = self.analyze_stats(self.stats[level+'_right'])
if remove_left:
mcl = self.stats[level+'_left'].most_common(1)[0][0]
self.log('Removing level %s left margin of:'%level, mcl)
if remove_right:
mcr = self.stats[level+'_right'].most_common(1)[0][0]
self.log('Removing level %s right margin of:'%level, mcr)
if remove_left or remove_right:
for elem in elems:
lm, rm, style = self.get_margins(elem)
if remove_left and lm == mcl:
style.removeProperty('margin-left')
if remove_right and rm == mcr:
style.removeProperty('margin-right')
def find_levels(self):
def level_of(elem, body):
ans = 1
while elem.getparent() is not body:
ans += 1
elem = elem.getparent()
return ans
paras = XPath('descendant::h:p|descendant::h:div')
for item in self.oeb.spine:
body = XPath('//h:body')(item.data)
if not body:
continue
body = body[0]
for p in paras(body):
level = level_of(p, body)
level = '%s_%d'%(barename(p.tag), level)
if level not in self.levels:
self.levels[level] = []
self.levels[level].append(p)
remove = set()
for k, v in iteritems(self.levels):
num = len(v)
self.log.debug('Found %d items of level:'%num, k)
level = int(k.split('_')[-1])
tag = k.split('_')[0]
if tag == 'p' and num < 25:
remove.add(k)
if tag == 'div':
if level > 2 and num < 25:
remove.add(k)
elif level < 3:
# Check each level < 3 element and only keep those
# that have many child paras
for elem in list(v):
children = len(paras(elem))
if children < 5:
v.remove(elem)
for k in remove:
self.levels.pop(k)
self.log.debug('Ignoring level', k)
def analyze_stats(self, stats):
if not stats:
return False
mc = stats.most_common(1)
if len(mc) > 1:
return False
mc = mc[0]
most_common, most_common_count = mc
if not most_common or most_common == '0':
return False
total = sum(stats.values())
# True if greater than 95% of elements have the same margin
return most_common_count/total > 0.95

View File

@@ -0,0 +1,324 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, uuid
from lxml import etree
from collections import OrderedDict, Counter
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
from calibre.ebooks import ConversionError
from polyglot.builtins import itervalues, unicode_type
from polyglot.urllib import urlparse
def XPath(x):
try:
return etree.XPath(x, namespaces=XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
def isspace(x):
return not x or x.replace('\xa0', '').isspace()
def at_start(elem):
' Return True if there is no content before elem '
body = XPath('ancestor-or-self::h:body')(elem)
if not body:
return True
body = body[0]
ancestors = frozenset(XPath('ancestor::*')(elem))
for x in body.iter():
if x is elem:
return True
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
return False
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
continue
return False
return False
class DetectStructure(object):
def __call__(self, oeb, opts):
self.log = oeb.log
self.oeb = oeb
self.opts = opts
self.log('Detecting structure...')
self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc
self.oeb.toc = TOC()
self.create_level_based_toc()
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
self.create_toc_from_chapters()
if self.oeb.toc.count() < opts.toc_threshold:
self.create_toc_from_links()
if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
self.oeb.toc = orig_toc
else:
self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count())
if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter)
for node in list(self.oeb.toc.iter()):
if not node.title or regexp.search(node.title) is not None:
self.log('Filtering', node.title if node.title else
'empty node', 'from TOC')
self.oeb.toc.remove(node)
if opts.page_breaks_before is not None:
pb_xpath = XPath(opts.page_breaks_before)
for item in oeb.spine:
for elem in pb_xpath(item.data):
try:
prev = next(elem.itersiblings(tag=etree.Element,
preceding=True))
if (barename(elem.tag) in {'h1', 'h2'} and barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())):
# We have two adjacent headings, do not put a page
# break on the second one
continue
except StopIteration:
pass
style = elem.get('style', '')
if style:
style += '; '
elem.set('style', style+'page-break-before:always')
for node in self.oeb.toc.iter():
if not node.title or not node.title.strip():
node.title = _('Unnamed')
if self.opts.start_reading_at:
self.detect_start_reading()
def detect_start_reading(self):
expr = self.opts.start_reading_at
try:
expr = XPath(expr)
except:
self.log.warn(
'Invalid start reading at XPath expression, ignoring: %s'%expr)
return
for item in self.oeb.spine:
if not hasattr(item.data, 'xpath'):
continue
matches = expr(item.data)
if matches:
elem = matches[0]
eid = elem.get('id', None)
if not eid:
eid = 'start_reading_at_'+unicode_type(uuid.uuid4()).replace('-', '')
elem.set('id', eid)
if 'text' in self.oeb.guide:
self.oeb.guide.remove('text')
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
self.log('Setting start reading at position to %s in %s'%(
self.opts.start_reading_at, item.href))
return
self.log.warn("Failed to find start reading at position: %s"%
self.opts.start_reading_at)
def get_toc_parts_for_xpath(self, expr):
# if an attribute is selected by the xpath expr then truncate it
# from the path and instead return it as where to find the title text
title_attribute_regex = re.compile(r'/@([-\w]+)$')
match = title_attribute_regex.search(expr)
if match is not None:
return expr[0:match.start()], match.group(1)
return expr, None
def detect_chapters(self):
self.detected_chapters = []
self.chapter_title_attribute = None
def find_matches(expr, doc):
try:
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
return []
if self.opts.chapter:
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
self.chapter_title_attribute = title_attribute
for item in self.oeb.spine:
for x in find_matches(chapter_path, item.data):
self.detected_chapters.append((item, x))
chapter_mark = self.opts.chapter_mark
page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always'
c = Counter()
for item, elem in self.detected_chapters:
c[item] += 1
text = xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
if chapter_mark == 'rule':
mark = elem.makeelement(XHTML('hr'))
elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they
# are at the start of the file, in which case inserting a
# page break in unnecessary and can lead to extra blank
# pages in the PDF Output plugin. We need to use two as
# feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression.
continue
mark = elem.makeelement(XHTML('div'), style=page_break_after)
else: # chapter_mark == 'both':
mark = elem.makeelement(XHTML('hr'), style=page_break_before)
try:
elem.addprevious(mark)
except TypeError:
self.log.exception('Failed to mark chapter')
def create_level_based_toc(self):
if self.opts.level1_toc is not None:
self.add_leveled_toc_items()
def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
self.oeb.toc.add(text, href, play_order=counter)
counter += 1
def create_toc_from_links(self):
num = 0
for item in self.oeb.spine:
for a in XPath('//h:a[@href]')(item.data):
href = a.get('href')
try:
purl = urlparse(href)
except ValueError:
self.log.warning('Ignoring malformed URL:', href)
continue
if not purl[0] or purl[0] == 'file':
href, frag = purl.path, purl.fragment
href = item.abshref(href)
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = xml2text(a)
text = text[:100].strip()
if (not self.opts.duplicate_links_in_toc and
self.oeb.toc.has_text(text)):
continue
try:
self.oeb.toc.add(text, href,
play_order=self.oeb.toc.next_play_order())
num += 1
except ValueError:
self.oeb.log.exception('Failed to process link: %r' % href)
continue # Most likely an incorrectly URL encoded link
if self.opts.max_toc_links > 0 and \
num >= self.opts.max_toc_links:
self.log('Maximum TOC links reached, stopping.')
return
def elem_to_link(self, item, elem, title_attribute, counter):
text = ''
if title_attribute is not None:
text = elem.get(title_attribute, '')
if not text:
text = xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id)
href = '#'.join((item.href, id))
return text, href
def add_leveled_toc_items(self):
added = OrderedDict()
added2 = OrderedDict()
counter = 1
def find_matches(expr, doc):
try:
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
return []
for document in self.oeb.spine:
previous_level1 = list(itervalues(added))[-1] if added else None
previous_level2 = list(itervalues(added2))[-1] if added2 else None
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
for elem in find_matches(level1_toc, document.data):
text, _href = self.elem_to_link(document, elem, level1_title, counter)
counter += 1
if text:
node = self.oeb.toc.add(text, _href,
play_order=self.oeb.toc.next_play_order())
added[elem] = node
# node.add(_('Top'), _href)
if self.opts.level2_toc is not None and added:
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
for elem in find_matches(level2_toc, document.data):
level1 = None
for item in document.data.iterdescendants():
if item in added:
level1 = added[item]
elif item == elem:
if level1 is None:
if previous_level1 is None:
break
level1 = previous_level1
text, _href = self.elem_to_link(document, elem, level2_title, counter)
counter += 1
if text:
added2[elem] = level1.add(text, _href,
play_order=self.oeb.toc.next_play_order())
break
if self.opts.level3_toc is not None and added2:
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
for elem in find_matches(level3_toc, document.data):
level2 = None
for item in document.data.iterdescendants():
if item in added2:
level2 = added2[item]
elif item == elem:
if level2 is None:
if previous_level2 is None:
break
level2 = previous_level2
text, _href = \
self.elem_to_link(document, elem, level3_title, counter)
counter += 1
if text:
level2.add(text, _href,
play_order=self.oeb.toc.next_play_order())
break

View File

@@ -0,0 +1,73 @@
'''
OPF manifest trimming transform.
'''
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
from polyglot.urllib import urldefrag
class ManifestTrimmer(object):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
import css_parser
oeb.logger.info('Trimming unused files from manifest...')
self.opts = context
used = set()
for term in oeb.metadata:
for item in oeb.metadata[term]:
if item.value in oeb.manifest.hrefs:
used.add(oeb.manifest.hrefs[item.value])
elif item.value in oeb.manifest.ids:
used.add(oeb.manifest.ids[item.value])
for ref in oeb.guide.values():
path, _ = urldefrag(ref.href)
if path in oeb.manifest.hrefs:
used.add(oeb.manifest.hrefs[path])
# TOC items are required to be in the spine
for item in oeb.spine:
used.add(item)
unchecked = used
while unchecked:
new = set()
for item in unchecked:
if (item.media_type in OEB_DOCS or
item.media_type[-4:] in ('/xml', '+xml')) and \
item.data is not None:
hrefs = [r[2] for r in iterlinks(item.data)]
for href in hrefs:
if isinstance(href, bytes):
href = href.decode('utf-8')
try:
href = item.abshref(urlnormalize(href))
except:
continue
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used:
new.add(found)
elif item.media_type == CSS_MIME:
for href in css_parser.getUrls(item.data):
href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used:
new.add(found)
used.update(new)
unchecked = new
for item in oeb.manifest.values():
if item not in used:
oeb.logger.info('Trimming %r from manifest' % item.href)
oeb.manifest.remove(item)