1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-05 12:23:34 +02:00

Fixing leftovers from first concept of constants

This commit is contained in:
2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions

View File

@@ -1,5 +1,8 @@
import collections
import errno
import hashlib
import io
import itertools
import logging
import os
import re
@@ -7,13 +10,10 @@ import shutil
import sys
import time
import unicodedata
import uuid
from collections import defaultdict
from io import BytesIO
from itertools import count
import urllib.parse
import uuid
from css_parser import getUrls, replaceUrls
import css_parser
from lxml import etree
from ebook_converter import constants as const
@@ -35,10 +35,7 @@ from ebook_converter.ebooks.metadata.utils import parse_opf_version
from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
from ebook_converter.ebooks.mobi.tweak import set_cover
from ebook_converter.ebooks.oeb.base import (
OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote
)
from ebook_converter.ebooks.oeb import base as oeb_base
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook
from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak
@@ -96,7 +93,7 @@ def abspath_to_name(path, root):
return relpath(os.path.abspath(path), root).replace(os.sep, '/')
def name_to_href(name, root, base=None, quote=urlquote):
def name_to_href(name, root, base=None, quote=oeb_base.urlquote):
fullpath = name_to_abspath(name, root)
basepath = root if base is None else os.path.dirname(name_to_abspath(base, root))
path = relpath(fullpath, basepath).replace(os.sep, '/')
@@ -111,7 +108,7 @@ def href_to_name(href, root, base=None):
return None
if purl.scheme or not purl.path:
return None
href = urlunquote(purl.path)
href = oeb_base.urlunquote(purl.path)
if iswindows and ':' in href:
# path manipulations on windows fail for paths with : in them, so we
# assume all such paths are invalid/absolute paths.
@@ -324,7 +321,7 @@ class Container(ContainerBase): # {{{
item_id = 'id' + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
href = self.name_to_href(name, self.opf_name)
item = manifest.makeelement(const.OPF_ITEM,
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href)
item.set('media-type', self.mime_map[name])
self.insert_into_xml(manifest, item)
@@ -340,7 +337,7 @@ class Container(ContainerBase): # {{{
def make_name_unique(self, name):
''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. '''
counter = count()
counter = itertools.count()
while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
c = next(counter) + 1
base, ext = name.rpartition('.')[::2]
@@ -377,10 +374,10 @@ class Container(ContainerBase): # {{{
if self.ok_to_be_unmanifested(name):
return name
item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
if mt in OEB_DOCS:
if mt in oeb_base.OEB_DOCS:
manifest = self.opf_xpath('//opf:manifest')[0]
spine = self.opf_xpath('//opf:spine')[0]
si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
si = manifest.makeelement(oeb_base.tag('opf', 'itemref'), idref=item_id)
self.insert_into_xml(spine, si, index=spine_index)
return name
@@ -442,12 +439,12 @@ class Container(ContainerBase): # {{{
replace_func.file_type = 'opf'
for elem in self.opf_xpath('//*[@href]'):
elem.set('href', replace_func(elem.get('href')))
elif media_type.lower() in OEB_DOCS:
elif media_type.lower() in oeb_base.OEB_DOCS:
replace_func.file_type = 'text'
rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in OEB_STYLES:
oeb_base.rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in oeb_base.OEB_STYLES:
replace_func.file_type = 'style'
replaceUrls(self.parsed(name), replace_func)
css_parser.replaceUrls(self.parsed(name), replace_func)
elif media_type.lower() == guess_type('toc.ncx'):
replace_func.file_type = 'ncx'
for elem in self.parsed(name).xpath('//*[@src]'):
@@ -467,21 +464,21 @@ class Container(ContainerBase): # {{{
if name == self.opf_name:
for elem in self.opf_xpath('//*[@href]'):
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
elif media_type.lower() in OEB_DOCS:
for el, attr, link, pos in iterlinks(self.parsed(name)):
elif media_type.lower() in oeb_base.OEB_DOCS:
for el, attr, link, pos in oeb_base.iterlinks(self.parsed(name)):
yield (link, el.sourceline, pos) if get_line_numbers else link
elif media_type.lower() in OEB_STYLES:
elif media_type.lower() in oeb_base.OEB_STYLES:
if get_line_numbers:
with self.open(name, 'rb') as f:
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
position = PositionFinder(raw)
is_in_comment = CommentFinder(raw)
for link, offset in itercsslinks(raw):
for link, offset in oeb_base.itercsslinks(raw):
if not is_in_comment(offset):
lnum, col = position(offset)
yield link, lnum, col
else:
for link in getUrls(self.parsed(name)):
for link in css_parser.getUrls(self.parsed(name)):
yield link
elif media_type.lower() == guess_type('toc.ncx'):
for elem in self.parsed(name).xpath('//*[@src]'):
@@ -533,7 +530,7 @@ class Container(ContainerBase): # {{{
def opf_xpath(self, expr):
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)
return self.opf.xpath(expr, namespaces=oeb_base.tag('opf', 'namespaces'))
def has_name(self, name):
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
@@ -580,11 +577,11 @@ class Container(ContainerBase): # {{{
def parse(self, path, mime):
with open(path, 'rb') as src:
data = src.read()
if mime in OEB_DOCS:
if mime in oeb_base.OEB_DOCS:
data = self.parse_xhtml(data, self.relpath(path))
elif mime[-4:] in {'+xml', '/xml'}:
data = self.parse_xml(data)
elif mime in OEB_STYLES:
elif mime in oeb_base.OEB_STYLES:
data = self.parse_css(data, self.relpath(path))
return data
@@ -597,7 +594,7 @@ class Container(ContainerBase): # {{{
'''
ans = self.open(name).read()
mime = self.mime_map.get(name, guess_type(name))
if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
if decode and (mime in oeb_base.OEB_STYLES or mime in oeb_base.OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
return ans
@@ -637,7 +634,7 @@ class Container(ContainerBase): # {{{
so use it sparingly. '''
from ebook_converter.ebooks.metadata.opf2 import OPF as O
mi = self.serialize_item(self.opf_name)
return O(BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
return O(io.BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
populate_spine=False).to_book_metadata()
@property
@@ -662,7 +659,7 @@ class Container(ContainerBase): # {{{
@property
def manifest_type_map(self):
' Mapping of manifest media-type to list of canonical names of that media-type '
ans = defaultdict(list)
ans = collections.defaultdict(list)
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
ans[item.get('media-type').lower()].append(self.href_to_name(
item.get('href'), self.opf_name))
@@ -813,7 +810,7 @@ class Container(ContainerBase): # {{{
spine = self.opf_xpath('//opf:spine')[0]
spine.text = tail
for name, linear in spine_items:
i = spine.makeelement(const.OPF_ITEMREF,
i = spine.makeelement(oeb_base.tag('opf', 'itemref'),
nsmap={'opf': const.OPF2_NS})
i.tail = tail
i.set('idref', imap[name])
@@ -922,7 +919,7 @@ class Container(ContainerBase): # {{{
return ans[0]
self.dirty(self.opf_name)
package = self.opf_xpath('//opf:package')[0]
item = package.makeelement(OPF(name))
item = package.makeelement(oeb_base.tag('opf', name))
item.tail = '\n'
package.append(item)
return item
@@ -945,7 +942,7 @@ class Container(ContainerBase): # {{{
item_id = id_prefix + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
item = manifest.makeelement(const.OPF_ITEM,
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href)
item.set('media-type', media_type)
self.insert_into_xml(manifest, item)
@@ -992,7 +989,7 @@ class Container(ContainerBase): # {{{
data = root = self.parsed(name)
if name == self.opf_name:
self.format_opf()
data = serialize(data, self.mime_map[name], pretty_print=name in
data = oeb_base.serialize(data, self.mime_map[name], pretty_print=name in
self.pretty_print)
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
# Needed as I can't get lxml to output opf:role and
@@ -1181,7 +1178,7 @@ class EpubContainer(Container):
)
if not opf_files:
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
opf_path = os.path.join(self.root, *(oeb_base.urlunquote(opf_files[0].get('full-path')).split('/')))
if not exists(opf_path):
raise InvalidEpub('OPF file does not exist at location pointed to'
' by META-INF/container.xml')
@@ -1412,7 +1409,7 @@ def do_explode(path, dest):
def opf_to_azw3(opf, outpath, container):
from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook
class Item(Manifest.Item):
class Item(oeb_base.Manifest.Item):
def _parse_css(self, data):
# The default CSS parser used by oeb.base inserts the h namespace

View File

@@ -1,22 +1,16 @@
from collections import defaultdict
from functools import partial
import collections
import functools
from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish import pretty
from ebook_converter.utils.icu import numeric_sort_key
from ebook_converter.css_selectors import Select, SelectorError
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
def filter_used_rules(rules, log, select):
for rule in rules:
used = False
@@ -34,7 +28,8 @@ def filter_used_rules(rules, log, select):
yield rule
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
def get_imported_sheets(name, container, sheets, recursion_level=10,
sheet=None):
ans = set()
sheet = sheet or sheets[name]
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
@@ -44,7 +39,8 @@ def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None)
ans.add(iname)
if recursion_level > 0:
for imported_sheet in tuple(ans):
ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
ans |= get_imported_sheets(imported_sheet, container, sheets,
recursion_level=recursion_level-1)
ans.discard(name)
return ans
@@ -56,7 +52,7 @@ def merge_declarations(first, second):
def merge_identical_selectors(sheet):
' Merge rules that have identical selectors '
selector_map = defaultdict(list)
selector_map = collections.defaultdict(list)
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
selector_map[rule.selectorText].append(rule)
remove = []
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
return len(remove)
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
'''
Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
def remove_unused_css(container, report=None, remove_unused_classes=False,
merge_rules=False):
"""
Remove all unused CSS rules from the book. An unused CSS rule is one that
does not match any actual content.
:param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
:param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
:param report: An optional callable that takes a single argument. It is
called with information about the operations being
performed.
:param remove_unused_classes: If True, class attributes in the HTML that
do not match any CSS rules are also removed.
:param merge_rules: If True, rules with identical selectors are merged.
'''
report = report or (lambda x:x)
"""
report = report or (lambda x: x)
def safe_parse(name):
try:
return container.parsed(name)
except TypeError:
pass
sheets = {name:safe_parse(name) for name, mt in container.mime_map.items() if mt in OEB_STYLES}
sheets = {k:v for k, v in sheets.items() if v is not None}
sheets = {name: safe_parse(name) for name, mt in container.mime_map.items()
if mt in base.OEB_STYLES and safe_parse(name) is not None}
num_merged = 0
if merge_rules:
for name, sheet in sheets.items():
@@ -106,7 +108,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_of_removed_rules = num_of_removed_classes = 0
for name, mt in container.mime_map.items():
if mt not in OEB_DOCS:
if mt not in base.OEB_DOCS:
continue
root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True)
@@ -120,31 +122,39 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_merged += num
container.dirty(name)
if remove_unused_classes:
used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
used_classes |= {x.lower() for x in
classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets,
sheet=sheet)
for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
style_rules[imported_sheet] = tuple(filter_used_rules(
style_rules[imported_sheet], container.log, select))
if remove_unused_classes:
used_classes |= class_map[imported_sheet]
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(rules, container.log, select))
unused_rules = tuple(filter_used_rules(rules, container.log,
select))
if unused_rules:
num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules]
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
pretty.pretty_script_or_style(container, style)
container.dirty(name)
for link in root.xpath('//*[local-name()="link" and @href]'):
sname = container.href_to_name(link.get('href'), name)
if sname not in sheets:
continue
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
style_rules[sname] = tuple(filter_used_rules(style_rules[sname],
container.log,
select))
if remove_unused_classes:
used_classes |= class_map[sname]
for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
style_rules[iname] = tuple(
filter_used_rules(style_rules[iname], container.log,
select))
if remove_unused_classes:
used_classes |= class_map[iname]
@@ -159,7 +169,8 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
elem.set('class', ' '.join(classes))
else:
del elem.attrib['class']
num_of_removed_classes += len(original_classes) - len(classes)
num_of_removed_classes += (len(original_classes) -
len(classes))
container.dirty(name)
for name, sheet in sheets.items():
@@ -195,7 +206,7 @@ def filter_declaration(style, properties=()):
changed = True
all_props = set(style.keys())
for prop in style.getProperties():
n = normalizers.get(prop.name, None)
n = base.normalize_css.normalizers.get(prop.name, None)
if n is not None:
normalized = n(prop.name, prop.propertyValue)
removed = properties.intersection(set(normalized))
@@ -225,12 +236,13 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
root = container.parsed(name)
changed = False
for style in root.xpath('//*[local-name()="style"]'):
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
if style.text and (style.get('type') or
'text/css').lower() == 'text/css':
sheet = container.parse_css(style.text)
if transform_sheet(sheet):
changed = True
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
pretty.pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'):
text = elem.get('style', None)
if text:
@@ -240,13 +252,16 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
if style.length == 0:
del elem.attrib['style']
else:
elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
elem.set('style',
force_unicode(style.getCssText(separator=' '),
'utf-8'))
return changed
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
def transform_css(container, transform_sheet=None, transform_style=None,
names=()):
if not names:
types = OEB_STYLES | OEB_DOCS
types = base.OEB_STYLES | base.OEB_DOCS
names = []
for name, mt in container.mime_map.items():
if mt in types:
@@ -256,13 +271,14 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
for name in names:
mt = container.mime_map[name]
if mt in OEB_STYLES:
if mt in base.OEB_STYLES:
sheet = container.parsed(name)
if transform_sheet(sheet):
container.dirty(name)
doc_changed = True
elif mt in OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet, transform_style):
elif mt in base.OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet,
transform_style):
container.dirty(name)
doc_changed = True
@@ -270,15 +286,21 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
def filter_css(container, properties, names=()):
'''
"""
Remove the specified CSS properties from all CSS rules in the book.
:param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
:param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
'''
properties = normalize_filter_css(properties)
return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties),
transform_style=partial(filter_declaration, properties=properties), names=names)
:param properties: Set of properties to remove. For example:
:code:`{'font-family', 'color'}`.
:param names: The files from which to remove the properties. Defaults to
all HTML and CSS files in the book.
"""
properties = base.normalize_css.normalize_filter_css(properties)
return transform_css(container,
transform_sheet=functools.partial(
filter_sheet, properties=properties),
transform_style=functools.partial(
filter_declaration, properties=properties),
names=names)
def _classes_in_selector(selector, classes):
@@ -331,21 +353,29 @@ def remove_property_value(prop, predicate):
if len(removed_vals) == len(prop.propertyValue):
prop.parent.removeProperty(prop.name)
else:
x = css_text(prop.propertyValue)
x = base.css_text(prop.propertyValue)
for v in removed_vals:
x = x.replace(css_text(v), '').strip()
x = x.replace(base.css_text(v), '').strip()
prop.propertyValue.cssText = x
return bool(removed_vals)
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
RULE_PRIORITIES = {t: i for i, t in enumerate((CSSRule.COMMENT,
CSSRule.CHARSET_RULE,
CSSRule.IMPORT_RULE,
CSSRule.NAMESPACE_RULE))}
def sort_sheet(container, sheet_or_text):
''' Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be safe.
'''
sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, str) else sheet_or_text
"""
Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be
safe.
"""
if isinstance(sheet_or_text, str):
sheet = container.parse_css(sheet_or_text)
else:
sheet = sheet_or_text
def text_sort_key(x):
return numeric_sort_key(str(x or ''))
@@ -364,7 +394,8 @@ def sort_sheet(container, sheet_or_text):
rule.selectorText = ', '.join(s.selectorText for s in selectors)
elif rule.type == CSSRule.FONT_FACE_RULE:
try:
tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
tertiary = text_sort_key(rule.style.getPropertyValue('font-'
'family'))
except Exception:
pass
@@ -379,11 +410,14 @@ def add_stylesheet_links(container, name, text):
if not head:
return
head = head[0]
sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
sheets = tuple(container.manifest_items_of_type(lambda mt:
mt in base.OEB_STYLES))
if not sheets:
return
for sname in sheets:
link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
link = head.makeelement(base.tag('xhtml', 'link'), type='text/css',
rel='stylesheet',
href=container.name_to_href(sname, name))
head.append(link)
pretty_xml_tree(head)
return serialize(root, 'text/html')
pretty.pretty_xml_tree(head)
return pretty.serialize(root, 'text/html')

View File

@@ -1,6 +1,7 @@
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.localization import canonicalize_lang
@@ -14,7 +15,7 @@ def get_book_language(container):
def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = const.OPF_REFERENCE
ref_tag = base.tag('opf', 'reference')
href = None
if name:
href = container.name_to_href(name, container.opf_name)
@@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None):
guides = container.opf_xpath('//opf:guide')
if not guides and href:
g = container.opf.makeelement(const.OPF_GUIDE,
g = container.opf.makeelement(base.tag('opf', 'guide'),
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(container.opf, g)
guides = [g]

View File

@@ -1,18 +1,13 @@
import textwrap
# from lxml.etree import Element
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def isspace(x):
return not x.strip('\u0009\u000a\u000c\u000d\u0020')
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent=' '):
for i, child in enumerate(elem):
pretty_xml_tree(child, level=level+1, indent=indent)
if not child.tail or isspace(child.tail):
l = level + 1
new_level = level + 1
if i == len(elem) - 1:
l -= 1
child.tail = '\n' + (indent * l)
new_level -= 1
child.tail = '\n' + (indent * new_level)
def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for
# the rest.
def dckey(x):
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata',
namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey)
for x in reversed(dc_tags):
metadata.insert(0, x)
# Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)}
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
namespaces=const.OPF_NAMESPACES)
spine_ids = {x: i for i, x in enumerate(spine_ids)}
def manifest_key(x):
mt = x.get('media-type', '')
href = x.get('href', '')
ext = href.rpartition('.')[-1].lower()
cat = 1000
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
cat = 0
elif mt == guess_type('a.ncx'):
cat = 1
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
cat = 2
elif mt.startswith('image/'):
cat = 3
@@ -75,20 +73,23 @@ def pretty_opf(root):
i = sort_key(href)
return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
for manifest in root.xpath('//opf:manifest',
namespaces=const.OPF_NAMESPACES):
try:
children = sorted(manifest, key=manifest_key)
except AttributeError:
continue # There are comments so dont sort since that would mess up the comments
# There are comments so dont sort since that would mess up the
# comments.
continue
for x in reversed(children):
manifest.insert(0, x)
def isblock(x):
if callable(x.tag) or not x.tag:
return True
if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
if x.tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}:
return True
return False
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags '''
if not parent.text or isspace(parent.text):
parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
if (hasattr(parent.tag, 'strip') and
parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
nn = '\n'
else:
nn = '\n\n'
parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent)
elif child.tag == const.SVG_SVG:
elif child.tag == base.tag('svg', 'svg'):
pretty_xml_tree(child, level=level, indent=indent)
l = level
new_level = level
if i == len(parent) - 1:
l -= 1
new_level -= 1
if not child.tail or isspace(child.tail):
child.tail = ''
child.tail = child.tail + nn + (indent * l)
child.tail = child.tail + nn + (indent * new_level)
def pretty_script_or_style(container, child):
if child.text:
indent = indent_for_tag(child)
if child.tag.endswith('style'):
child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
child.text = force_unicode(pretty_css(container, '', child.text),
'utf-8')
child.text = textwrap.dedent(child.text)
child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
child.text = '\n' + '\n'.join([(indent + x) if x else ''
for x in child.text.splitlines()])
set_indent(child, 'text', indent)
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
# Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag
# even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and parse_utils.barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
if (len(body) == 1 and
not callable(body[0].tag) and
isblock(body[0]) and
not has_only_blocks(body[0]) and
parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
'h2', 'h3', 'h4',
'h5', 'h6') and
len(body[0]) > 0):
pretty_block(body[0], level=2)
if container is not None:
# Handle <script> and <style> tags
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
for child in root.xpath('//*[local-name()="script" or local-name()='
'"style"]'):
pretty_script_or_style(container, child)
def fix_html(container, raw):
' Fix any parsing errors in the HTML represented as a string in raw. Fixing is done using the HTML5 parsing algorithm. '
"""
Fix any parsing errors in the HTML represented as a string in raw. Fixing
is done using the HTML5 parsing algorithm.
"""
root = container.parse_xhtml(raw)
return serialize(root, 'text/html')
return base.serialize(root, 'text/html')
def pretty_html(container, name, raw):
' Pretty print the HTML represented as a string in raw '
"""
Pretty print the HTML represented as a string in raw
"""
root = container.parse_xhtml(raw)
pretty_html_tree(container, root)
return serialize(root, 'text/html')
return base.serialize(root, 'text/html')
def pretty_css(container, name, raw):
' Pretty print the CSS represented as a string in raw '
"""
Pretty print the CSS represented as a string in raw
"""
sheet = container.parse_css(raw)
return serialize(sheet, 'text/css')
return base.serialize(sheet, 'text/css')
def pretty_xml(container, name, raw):
' Pretty print the XML represented as a string in raw. If ``name`` is the name of the OPF, extra OPF-specific prettying is performed. '
"""
Pretty print the XML represented as a string in raw. If ``name`` is the
name of the OPF, extra OPF-specific prettying is performed.
"""
root = container.parse_xml(raw)
if name == container.opf_name:
pretty_opf(root)
pretty_xml_tree(root)
return serialize(root, 'text/xml')
return base.serialize(root, 'text/xml')
def fix_all_html(container):
' Fix any parsing errors in all HTML files in the container. Fixing is done using the HTML5 parsing algorithm. '
"""
Fix any parsing errors in all HTML files in the container. Fixing is done
using the HTML5 parsing algorithm. """
for name, mt in container.mime_map.items():
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
container.parsed(name)
container.dirty(name)
def pretty_all(container):
' Pretty print all HTML/CSS/XML files in the container '
"""
Pretty print all HTML/CSS/XML files in the container
"""
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
for name, mt in container.mime_map.items():
prettied = False
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
pretty_html_tree(container, container.parsed(name))
prettied = True
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
container.parsed(name)
prettied = True
elif name == container.opf_name:

View File

@@ -7,6 +7,7 @@ import urllib.parse
from ebook_converter import constants as const
from ebook_converter import guess_type, strftime
from ebook_converter.constants_old import iswindows
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
from ebook_converter.library.comments import comments_to_html, markdown
from ebook_converter.utils.date import is_date_undefined, as_local_time
@@ -371,7 +372,7 @@ def render_jacket(mi, output_profile,
# We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(const.XHTML_DIV)
fw = body.makeelement(base.tag('xhtml', 'div'))
fw.set('data-calibre-rescale', '100')
for child in body:
fw.append(child)
@@ -388,9 +389,9 @@ def linearize_jacket(oeb):
for x in oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data):
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
e.tag = const.XHTML_DIV
e.tag = base.tag('xhtml', 'div')
for e in XPath('//h:td')(x.data):
e.tag = const.XHTML_SPAN
e.tag = base.tag('xhtml', 'span')
break

View File

@@ -3,8 +3,11 @@ Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform.
"""
import os, functools, collections, re, copy
from collections import OrderedDict
import collections
import copy
import functools
import os
import re
import urllib.parse
from lxml.etree import XPath as _XPath
@@ -13,8 +16,7 @@ from lxml import etree
from ebook_converter import constants as const
from ebook_converter import as_unicode, force_unicode
from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb.base import \
OEB_STYLES, rewrite_links, urlnormalize
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote
from ebook_converter.css_selectors import Select, SelectorError
@@ -44,14 +46,15 @@ class SplitError(ValueError):
class Split(object):
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
max_flow_size=0, remove_css_pagebreaks=True):
max_flow_size=0, remove_css_pagebreaks=True):
self.split_on_page_breaks = split_on_page_breaks
self.page_breaks_xpath = page_breaks_xpath
self.max_flow_size = max_flow_size
self.page_break_selectors = None
self.remove_css_pagebreaks = remove_css_pagebreaks
if self.page_breaks_xpath is not None:
self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]
self.page_break_selectors = [(XPath(self.page_breaks_xpath),
False)]
def __call__(self, oeb, opts):
self.oeb = oeb
@@ -71,7 +74,7 @@ class Split(object):
page_breaks, page_break_ids = self.find_page_breaks(item)
splitter = FlowSplitter(item, page_breaks, page_break_ids,
self.max_flow_size, self.oeb, self.opts)
self.max_flow_size, self.oeb, self.opts)
if splitter.was_split:
am = splitter.anchor_map
self.map[item.href] = collections.defaultdict(
@@ -81,25 +84,27 @@ class Split(object):
if self.page_break_selectors is None:
self.page_break_selectors = set()
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES]
base.OEB_STYLES]
for rule in rules(stylesheets):
before = force_unicode(getattr(rule.style.getPropertyCSSValue(
'page-break-before'), 'cssText', '').strip().lower())
after = force_unicode(getattr(rule.style.getPropertyCSSValue(
after = force_unicode(getattr(rule.style.getPropertyCSSValue(
'page-break-after'), 'cssText', '').strip().lower())
try:
if before and before not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, True))
self.page_break_selectors.add((rule.selectorText,
True))
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-before')
except:
except Exception:
pass
try:
if after and after not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, False))
self.page_break_selectors.add((rule.selectorText,
False))
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-after')
except:
except Exception:
pass
page_breaks = set()
select = Select(item.data)
@@ -110,14 +115,18 @@ class Split(object):
return [], []
descendants = frozenset(body[0].iterdescendants('*'))
_tags = {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}
for selector, before in self.page_break_selectors:
try:
for elem in select(selector):
if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
if (elem in descendants and
elem.tag.rpartition('}')[2].lower() not in _tags):
elem.set('pb_before', '1' if before else '0')
page_breaks.add(elem)
except SelectorError as err:
self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
self.log.warn('Ignoring page breaks specified with invalid '
'CSS selector: %r (%s)' %
(selector, as_unicode(err)))
for i, elem in enumerate(item.data.iter('*')):
try:
@@ -126,23 +135,23 @@ class Split(object):
continue
page_breaks = list(page_breaks)
page_breaks.sort(key=lambda x:int(x.get('pb_order')))
page_breaks.sort(key=lambda x: int(x.get('pb_order')))
page_break_ids, page_breaks_ = [], []
for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i))
x.set('id', x.get('id', 'calibre_pb_%d' % i))
id = x.get('id')
try:
xp = XPath('//*[@id="%s"]'%id)
except:
xp = XPath('//*[@id="%s"]' % id)
except Exception:
try:
xp = XPath("//*[@id='%s']"%id)
except:
xp = XPath("//*[@id='%s']" % id)
except Exception:
# The id has both a quote and an apostrophe or some other problem.
# Just replace it since I doubt its going to work anywhere else
# either
id = 'calibre_pb_%d'%i
# Just replace it since I doubt it's going to work anywhere
# else either
id = 'calibre_pb_%d' % i
x.set('id', id)
xp = XPath('//*[@id=%r]'%id)
xp = XPath('//*[@id=%r]' % id)
page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
page_break_ids.append(id)
@@ -159,7 +168,7 @@ class Split(object):
for item in self.oeb.manifest:
if etree.iselement(item.data):
self.current_item = item
rewrite_links(item.data, self.rewrite_links)
base.rewrite_links(item.data, self.rewrite_links)
def rewrite_links(self, url):
href, frag = urllib.parse.urldefrag(url)
@@ -169,7 +178,7 @@ class Split(object):
# Unparseable URL
return url
try:
href = urlnormalize(href)
href = base.urlnormalize(href)
except ValueError:
# href has non utf-8 quoting
return url
@@ -188,19 +197,19 @@ class FlowSplitter(object):
'The actual splitting logic'
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
opts):
self.item = item
self.oeb = oeb
self.opts = opts
self.log = oeb.log
self.page_breaks = page_breaks
opts):
self.item = item
self.oeb = oeb
self.opts = opts
self.log = oeb.log
self.page_breaks = page_breaks
self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size
self.base = item.href
self.csp_counter = 0
self.max_flow_size = max_flow_size
self.base = item.href
self.csp_counter = 0
base, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%.3d'+ext
name, ext = os.path.splitext(self.base)
self.base = name.replace('%', '%%') + '_split_%.3d' + ext
self.trees = [self.item.data.getroottree()]
self.splitting_on_page_breaks = True
@@ -210,13 +219,13 @@ class FlowSplitter(object):
if self.max_flow_size > 0:
lt_found = False
self.log('\tLooking for large trees in %s...'%item.href)
self.log('\tLooking for large trees in %s...' % item.href)
trees = list(self.trees)
self.tree_map = {}
for i, tree in enumerate(trees):
size = len(tostring(tree.getroot()))
if size > self.max_flow_size:
self.log('\tFound large tree #%d'%i)
self.log('\tFound large tree #%d' % i)
lt_found = True
self.split_trees = []
self.split_to_size(tree)
@@ -229,11 +238,11 @@ class FlowSplitter(object):
self.was_split = len(self.trees) > 1
if self.was_split:
self.log('\tSplit into %d parts'%len(self.trees))
self.log('\tSplit into %d parts' % len(self.trees))
self.commit()
def split_on_page_breaks(self, orig_tree):
ordered_ids = OrderedDict()
ordered_ids = collections.OrderedDict()
all_page_break_ids = frozenset(self.page_break_ids)
for elem_id in orig_tree.xpath('//*/@id'):
if elem_id in all_page_break_ids:
@@ -248,9 +257,10 @@ class FlowSplitter(object):
tree = self.trees[i]
elem = pattern(tree)
if elem:
self.log.debug('\t\tSplitting on page-break at id=%s'%
elem[0].get('id'))
before_tree, after_tree = self.do_split(tree, elem[0], before)
self.log.debug('\t\tSplitting on page-break at id=%s' %
elem[0].get('id'))
before_tree, after_tree = self.do_split(tree, elem[0],
before)
self.trees[i:i+1] = [before_tree, after_tree]
break
@@ -269,7 +279,11 @@ class FlowSplitter(object):
if body is not None:
existing_ids = frozenset(body.xpath('//*/@id'))
for x in ids - existing_ids:
body.insert(0, body.makeelement(const.XHTML_div, id=x, style='height:0pt'))
body.insert(0,
body.makeelement(base.tag('xhtml',
'div'),
id=x,
style='height:0pt'))
ids = set()
trees.append(tree)
self.trees = trees
@@ -281,12 +295,13 @@ class FlowSplitter(object):
return body[0]
def do_split(self, tree, split_point, before):
'''
"""
Split ``tree`` into a *before* and *after* tree at ``split_point``.
:param before: If True tree is split before split_point, otherwise after split_point
:param before: If True tree is split before split_point, otherwise
after split_point
:return: before_tree, after_tree
'''
"""
return do_split(split_point, self.log, before=before)
def is_page_empty(self, root):
@@ -294,7 +309,7 @@ class FlowSplitter(object):
if body is None:
return False
txt = re.sub(r'\s+|\xa0', '',
etree.tostring(body, method='text', encoding='unicode'))
etree.tostring(body, method='text', encoding='unicode'))
if len(txt) > 1:
return False
for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
@@ -305,13 +320,13 @@ class FlowSplitter(object):
return True
def split_text(self, text, root, size):
self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
rest = text.replace('\r', '')
parts = re.split('\n\n', rest)
self.log.debug('\t\t\t\tFound %d parts'%len(parts))
self.log.debug('\t\t\t\tFound %d parts' % len(parts))
if max(map(len, parts)) > size:
raise SplitError('Cannot split as file contains a <pre> tag '
'with a very large paragraph', root)
'with a very large paragraph', root)
ans = []
buf = ''
for part in parts:
@@ -331,7 +346,8 @@ class FlowSplitter(object):
continue
if pre.text and len(pre.text) > self.max_flow_size*0.5:
self.log.debug('\t\tSplitting large <pre> tag')
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
frags = self.split_text(pre.text, root,
int(0.2 * self.max_flow_size))
new_pres = []
for frag in frags:
pre2 = copy.copy(pre)
@@ -346,7 +362,8 @@ class FlowSplitter(object):
split_point, before = self.find_split_point(root)
if split_point is None:
raise SplitError(self.item.href, root)
self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
self.log.debug('\t\t\tSplit point:', split_point.tag,
tree.getpath(split_point))
trees = self.do_split(tree, split_point, before)
sizes = [len(tostring(t.getroot())) for t in trees]
@@ -361,12 +378,11 @@ class FlowSplitter(object):
continue
elif size <= self.max_flow_size:
self.split_trees.append(t)
self.log.debug(
'\t\t\tCommitted sub-tree #%d (%d KB)'%(
len(self.split_trees), size/1024.))
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)' %
(len(self.split_trees), size/1024.))
else:
self.log.debug(
'\t\t\tSplit tree still too large: %d KB' % (size/1024.))
# The division must be parenthesised: '%' and '/' share the same precedence
# and associate left to right, so without the parentheses the string is
# formatted first and the resulting str is then divided by 1024 (TypeError).
self.log.debug('\t\t\tSplit tree still too large: %d KB' %
               (size / 1024))
self.split_to_size(t)
def find_split_point(self, root):
@@ -385,8 +401,8 @@ class FlowSplitter(object):
'''
def pick_elem(elems):
if elems:
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
'1']
elems = [i for i in elems
if i.get(SPLIT_POINT_ATTR, '0') != '1']
if elems:
i = int(len(elems)//2)
elems[i].set(SPLIT_POINT_ATTR, '1')
@@ -407,7 +423,7 @@ class FlowSplitter(object):
if elem is not None:
try:
XPath(elem.getroottree().getpath(elem))
except:
except Exception:
continue
return elem, True
@@ -421,23 +437,24 @@ class FlowSplitter(object):
'''
if not self.was_split:
return
self.anchor_map = collections.defaultdict(lambda :self.base%0)
self.anchor_map = collections.defaultdict(lambda: self.base % 0)
self.files = []
for i, tree in enumerate(self.trees):
root = tree.getroot()
self.files.append(self.base%i)
self.files.append(self.base % i)
for elem in root.xpath('//*[@id or @name]'):
for anchor in elem.get('id', ''), elem.get('name', ''):
if anchor != '' and anchor not in self.anchor_map:
self.anchor_map[anchor] = self.files[-1]
for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
for elem in root.xpath('//*[@%s]' % SPLIT_POINT_ATTR):
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
spine_pos = self.item.spine_position
for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP):
for a in tree.getroot().xpath('//h:a[@href]',
namespaces=const.XPNSMAP):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]
@@ -448,7 +465,8 @@ class FlowSplitter(object):
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
new_item = self.oeb.manifest.add(new_id, current,
self.item.media_type, data=tree.getroot())
self.item.media_type,
data=tree.getroot())
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide:

View File

@@ -7,7 +7,7 @@ from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import TOC, xml2text
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks import ConversionError
@@ -15,8 +15,8 @@ def XPath(x):
try:
return etree.XPath(x, namespaces=const.XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
raise ConversionError('The syntax of the XPath expression %s is '
'invalid.' % repr(x))
def isspace(x):
@@ -33,9 +33,13 @@ def at_start(elem):
for x in body.iter():
if x is elem:
return True
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
if hasattr(getattr(x, 'tag', None),
'rpartition') and x.tag.rpartition('}')[-1] in {'img',
'svg'}:
return False
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
if isspace(getattr(x, 'text', None)) and (x in ancestors or
isspace(getattr(x, 'tail',
None))):
continue
return False
return False
@@ -52,7 +56,7 @@ class DetectStructure(object):
self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc
self.oeb.toc = TOC()
self.oeb.toc = base.TOC()
self.create_level_based_toc()
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
@@ -64,14 +68,14 @@ class DetectStructure(object):
else:
self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count())
self.oeb.toc.count())
if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter)
for node in list(self.oeb.toc.iter()):
if not node.title or regexp.search(node.title) is not None:
self.log('Filtering', node.title if node.title else
'empty node', 'from TOC')
'empty node', 'from TOC')
self.oeb.toc.remove(node)
if opts.page_breaks_before is not None:
@@ -80,10 +84,11 @@ class DetectStructure(object):
for elem in pb_xpath(item.data):
try:
prev = next(elem.itersiblings(tag=etree.Element,
preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())):
preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and
parse_utils.barename(prev.tag) in {'h1',
'h2'} and
(not prev.tail or not prev.tail.split())):
# We have two adjacent headings, do not put a page
# break on the second one
continue
@@ -106,9 +111,9 @@ class DetectStructure(object):
expr = self.opts.start_reading_at
try:
expr = XPath(expr)
except:
self.log.warn(
'Invalid start reading at XPath expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid start reading at XPath expression, '
'ignoring: %s' % expr)
return
for item in self.oeb.spine:
if not hasattr(item.data, 'xpath'):
@@ -118,16 +123,17 @@ class DetectStructure(object):
elem = matches[0]
eid = elem.get('id', None)
if not eid:
eid = 'start_reading_at_'+str(uuid.uuid4()).replace('-', '')
eid = 'start_reading_at_' + str(uuid.uuid4()).replace('-',
'')
elem.set('id', eid)
if 'text' in self.oeb.guide:
self.oeb.guide.remove('text')
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
self.log('Setting start reading at position to %s in %s'%(
self.opts.start_reading_at, item.href))
self.log('Setting start reading at position to %s in %s' %
(self.opts.start_reading_at, item.href))
return
self.log.warn("Failed to find start reading at position: %s"%
self.opts.start_reading_at)
self.log.warn("Failed to find start reading at position: %s" %
self.opts.start_reading_at)
def get_toc_parts_for_xpath(self, expr):
# if an attribute is selected by the xpath expr then truncate it
@@ -148,12 +154,14 @@ class DetectStructure(object):
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid chapter expression, ignoring: %s' %
expr)
return []
if self.opts.chapter:
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
chapter_path, title_attribute = (
self.get_toc_parts_for_xpath(self.opts.chapter))
self.chapter_title_attribute = title_attribute
for item in self.oeb.spine:
for x in find_matches(chapter_path, item.data):
@@ -165,25 +173,28 @@ class DetectStructure(object):
c = collections.Counter()
for item, elem in self.detected_chapters:
c[item] += 1
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
if chapter_mark == 'rule':
mark = elem.makeelement(const.XHTML_HR)
mark = elem.makeelement(base.tag('xhtml', 'hr'))
elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they
# are at the start of the file, in which case inserting a
# page break in unnecessary and can lead to extra blank
# pages in the PDF Output plugin. We need to use two as
# feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression.
# For the first two elements in this item, check if
# they are at the start of the file, in which case
# inserting a page break is unnecessary and can lead
# to extra blank pages in the PDF Output plugin. We
# need to use two as feedbooks epubs match both a
# heading tag and its containing div with the default
# chapter expression.
continue
mark = elem.makeelement(const.XHTML_DIV, style=page_break_after)
mark = elem.makeelement(base.tag('xhtml', 'div'),
style=page_break_after)
else: # chapter_mark == 'both':
mark = elem.makeelement(const.XHTML_HR, style=page_break_before)
mark = elem.makeelement(base.tag('xhtml', 'hr'),
style=page_break_before)
try:
elem.addprevious(mark)
except TypeError:
@@ -196,7 +207,9 @@ class DetectStructure(object):
def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
text, href = self.elem_to_link(item, elem,
self.chapter_title_attribute,
counter)
self.oeb.toc.add(text, href, play_order=counter)
counter += 1
@@ -216,18 +229,21 @@ class DetectStructure(object):
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = xml2text(a)
text = base.xml2text(a)
text = text[:100].strip()
if (not self.opts.duplicate_links_in_toc and
self.oeb.toc.has_text(text)):
continue
try:
self.oeb.toc.add(text, href,
self.oeb.toc.add(
text, href,
play_order=self.oeb.toc.next_play_order())
num += 1
except ValueError:
self.oeb.log.exception('Failed to process link: %r' % href)
continue # Most likely an incorrectly URL encoded link
self.oeb.log.exception('Failed to process link: '
'%r' % href)
# Most likely an incorrectly URL encoded link
continue
if self.opts.max_toc_links > 0 and \
num >= self.opts.max_toc_links:
self.log('Maximum TOC links reached, stopping.')
@@ -238,14 +254,14 @@ class DetectStructure(object):
if title_attribute is not None:
text = elem.get(title_attribute, '')
if not text:
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
id = elem.get('id', 'calibre_toc_%d' % counter)
elem.set('id', id)
href = '#'.join((item.href, id))
return text, href
@@ -260,26 +276,29 @@ class DetectStructure(object):
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid ToC expression, ignoring: %s' % expr)
return []
for document in self.oeb.spine:
previous_level1 = list(added.values())[-1] if added else None
previous_level2 = list(added2.values())[-1] if added2 else None
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
(level1_toc,
level1_title) = self.get_toc_parts_for_xpath(self.opts.level1_toc)
for elem in find_matches(level1_toc, document.data):
text, _href = self.elem_to_link(document, elem, level1_title, counter)
text, _href = self.elem_to_link(document, elem, level1_title,
counter)
counter += 1
if text:
node = self.oeb.toc.add(text, _href,
play_order=self.oeb.toc.next_play_order())
node = self.oeb.toc.add(
text, _href, play_order=self.oeb.toc.next_play_order())
added[elem] = node
# node.add('Top', _href)
if self.opts.level2_toc is not None and added:
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
level2_toc, level2_title = self.get_toc_parts_for_xpath(
self.opts.level2_toc)
for elem in find_matches(level2_toc, document.data):
level1 = None
for item in document.data.iterdescendants():
@@ -290,15 +309,19 @@ class DetectStructure(object):
if previous_level1 is None:
break
level1 = previous_level1
text, _href = self.elem_to_link(document, elem, level2_title, counter)
text, _href = self.elem_to_link(document, elem,
level2_title,
counter)
counter += 1
if text:
added2[elem] = level1.add(text, _href,
added2[elem] = level1.add(
text, _href,
play_order=self.oeb.toc.next_play_order())
break
if self.opts.level3_toc is not None and added2:
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
level3_toc, level3_title = self.get_toc_parts_for_xpath(
self.opts.level3_toc)
for elem in find_matches(level3_toc, document.data):
level2 = None
for item in document.data.iterdescendants():
@@ -309,10 +332,13 @@ class DetectStructure(object):
if previous_level2 is None:
break
level2 = previous_level2
text, _href = \
self.elem_to_link(document, elem, level3_title, counter)
text, _href = self.elem_to_link(document,
elem,
level3_title,
counter)
counter += 1
if text:
level2.add(text, _href,
play_order=self.oeb.toc.next_play_order())
play_order=self.oeb
.toc.next_play_order())
break