1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-01 14:15:54 +01:00

Fixing leftovers from first concept of constants

This commit is contained in:
2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions

View File

@@ -1,14 +1,16 @@
import collections
import functools
import json import json
import re import re
from collections import defaultdict, namedtuple
from functools import wraps
from lxml import etree from lxml import etree
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import prints from ebook_converter import prints
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors from ebook_converter.ebooks.metadata import authors_to_string
from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata import check_isbn
from ebook_converter.ebooks.metadata import string_to_authors
from ebook_converter.ebooks.metadata.book import base
from ebook_converter.ebooks.metadata.book.json_codec import ( from ebook_converter.ebooks.metadata.book.json_codec import (
decode_is_multiple, encode_is_multiple, object_to_unicode decode_is_multiple, encode_is_multiple, object_to_unicode
) )
@@ -17,17 +19,30 @@ from ebook_converter.ebooks.metadata.utils import (
pretty_print_opf pretty_print_opf
) )
from ebook_converter.utils.config import from_json, to_json from ebook_converter.utils.config import from_json, to_json
from ebook_converter.utils.date import ( from ebook_converter.utils.date import (fix_only_date, is_date_undefined,
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow, isoformat, parse_date as parse_date_,
w3cdtf utcnow, w3cdtf)
)
from ebook_converter.utils.iso8601 import parse_iso8601 from ebook_converter.utils.iso8601 import parse_iso8601
from ebook_converter.utils.localization import canonicalize_lang from ebook_converter.utils.localization import canonicalize_lang
# URI that the 'calibre' prefix expands to in property names.
CALIBRE_PREFIX = 'https://calibre-ebook.com'

# Reserved prefix -> vocabulary URI map.
RES_PREFIXES = dict(
    dcterms='http://purl.org/dc/terms/',
    epubsc='http://idpf.org/epub/vocab/sc/#',
    marc='http://id.loc.gov/vocabulary/',
    media='http://www.idpf.org/epub/vocab/overlays/#',
    onix=('http://www.editeur.org/ONIX/book/codelists/'
          'current.html#'),
    rendition='http://www.idpf.org/vocab/rendition/#',
    schema='http://schema.org/',
    xsd='http://www.w3.org/2001/XMLSchema#',
)

# The reserved prefixes plus the 'calibre' one (independent copy).
KNOWN_PREFIXES = dict(RES_PREFIXES, calibre=CALIBRE_PREFIX)
# Utils {{{ # Utils {{{
# Memoization tables filled lazily by XPath() and regex() respectively.
_XPATH_CACHE = {}  # expression string -> compiled etree.XPath
_RE_CACHE = {}     # (pattern, flags) -> compiled regular expression
def uniq(vals): def uniq(vals):
@@ -39,22 +54,23 @@ def uniq(vals):
def dump_dict(cats):
    """Serialize *cats* to a JSON string.

    A falsy *cats* (``None`` or an empty mapping) is serialized as ``{}``.
    The value is first passed through ``object_to_unicode``. Non-ASCII
    characters are written unescaped and keys that JSON cannot represent
    are skipped instead of raising.
    """
    return json.dumps(object_to_unicode(cats or {}), ensure_ascii=False,
                      skipkeys=True)
def XPath(x):
    """Return a compiled ``etree.XPath`` for the expression *x*.

    Expressions are compiled with the ``const.OPF2_NSMAP`` namespace map and
    memoized in ``_XPATH_CACHE``, so each distinct expression string is
    compiled only once.
    """
    try:
        return _XPATH_CACHE[x]
    except KeyError:
        _XPATH_CACHE[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
        return ans
def regex(r, flags=0):
    """Return the compiled regular expression for pattern *r* and *flags*.

    Compiled patterns are memoized in ``_RE_CACHE`` keyed on the
    ``(pattern, flags)`` pair.
    """
    try:
        return _RE_CACHE[(r, flags)]
    except KeyError:
        _RE_CACHE[(r, flags)] = ans = re.compile(r, flags)
        return ans
@@ -82,7 +98,7 @@ def properties_for_id(item_id, refines):
def properties_for_id_with_scheme(item_id, prefixes, refines): def properties_for_id_with_scheme(item_id, prefixes, refines):
ans = defaultdict(list) ans = collections.defaultdict(list)
if item_id: if item_id:
for elem in refines[item_id]: for elem in refines[item_id]:
key = elem.get('property') key = elem.get('property')
@@ -126,7 +142,7 @@ def normalize_whitespace(text):
def simple_text(f):
    """Decorator: pass the return value of *f* through normalize_whitespace.

    The wrapper keeps the name and docstring of *f* (``functools.wraps``).
    """
    @functools.wraps(f)
    def wrapper(*args, **kw):
        return normalize_whitespace(f(*args, **kw))
    return wrapper
@@ -135,7 +151,7 @@ def simple_text(f):
def items_with_property(root, q, prefixes=None): def items_with_property(root, q, prefixes=None):
if prefixes is None: if prefixes is None:
prefixes = read_prefixes(root) prefixes = read_prefixes(root)
q = expand_prefix(q, known_prefixes).lower() q = expand_prefix(q, KNOWN_PREFIXES).lower()
for item in XPath("./opf:manifest/opf:item[@properties]")(root): for item in XPath("./opf:manifest/opf:item[@properties]")(root):
for prop in (item.get('properties') or '').lower().split(): for prop in (item.get('properties') or '').lower().split():
prop = expand_prefix(prop, prefixes) prop = expand_prefix(prop, prefixes)
@@ -150,43 +166,32 @@ def items_with_property(root, q, prefixes=None):
# http://www.idpf.org/epub/vocab/package/pfx/ # http://www.idpf.org/epub/vocab/package/pfx/
reserved_prefixes = {
'dcterms': 'http://purl.org/dc/terms/',
'epubsc': 'http://idpf.org/epub/vocab/sc/#',
'marc': 'http://id.loc.gov/vocabulary/',
'media': 'http://www.idpf.org/epub/vocab/overlays/#',
'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#',
'rendition':'http://www.idpf.org/vocab/rendition/#',
'schema': 'http://schema.org/',
'xsd': 'http://www.w3.org/2001/XMLSchema#',
}
CALIBRE_PREFIX = 'https://calibre-ebook.com'
known_prefixes = reserved_prefixes.copy()
known_prefixes['calibre'] = CALIBRE_PREFIX
def parse_prefixes(x):
    """Parse a ``prefix`` attribute value into a ``{prefix: uri}`` dict.

    *x* is a string of ``name: uri`` pairs; the colon must be followed by
    at least one space. If a prefix occurs more than once the last
    occurrence wins. An empty string yields an empty dict.
    """
    return {m.group(1): m.group(2)
            for m in re.finditer(r'(\S+): \s*(\S+)', x)}
def read_prefixes(root):
    """Return the prefix map in effect for *root*.

    Starts from a copy of ``RES_PREFIXES`` and overlays whatever the
    ``prefix`` attribute of *root* declares (a missing attribute is treated
    as empty), so declared prefixes override reserved ones.
    """
    ans = RES_PREFIXES.copy()
    ans.update(parse_prefixes(root.get('prefix') or ''))
    return ans
def expand_prefix(raw, prefixes):
    """Expand ``prefix:name`` tokens in *raw* using the *prefixes* map.

    Each match of ``prefix : name`` is rewritten as ``<expansion>:name``,
    where the expansion is ``prefixes[prefix]``; a prefix that is not in the
    map is kept unchanged. A falsy *raw* is treated as the empty string.
    """
    def _expand(match):
        prefix, name = match.group(1), match.group(2)
        return prefixes.get(prefix, prefix) + ':' + name

    return regex(r'(\S+)\s*:\s*(\S+)').sub(_expand, raw or '')
def ensure_prefix(root, prefixes, prefix, value=None):
    """Make sure *prefix* is usable in *root* and update its ``prefix`` attr.

    :param root: element carrying the ``prefix`` attribute.
    :param prefixes: current prefix map; read from *root* when ``None``.
        Note that a passed-in dict is mutated: *prefix* is added to it.
    :param prefix: the prefix name to ensure.
    :param value: URI for *prefix*; when falsy the reserved URI from
        ``RES_PREFIXES`` is used (``KeyError`` if *prefix* is not reserved).

    Only mappings that differ from the reserved defaults are written to the
    attribute; if none remain the attribute is removed.
    """
    if prefixes is None:
        prefixes = read_prefixes(root)
    prefixes[prefix] = value or RES_PREFIXES[prefix]
    # Reserved prefixes with their default URI need no explicit declaration.
    prefixes = {k: v for k, v in prefixes.items()
                if RES_PREFIXES.get(k) != v}
    if prefixes:
        root.set('prefix', ' '.join('%s: %s' % (k, v)
                                    for k, v in prefixes.items()))
    else:
        root.attrib.pop('prefix', None)
@@ -196,7 +201,7 @@ def ensure_prefix(root, prefixes, prefix, value=None):
def read_refines(root): def read_refines(root):
ans = defaultdict(list) ans = collections.defaultdict(list)
for meta in XPath('./opf:metadata/opf:meta[@refines]')(root): for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
r = meta.get('refines') or '' r = meta.get('refines') or ''
if r.startswith('#'): if r.startswith('#'):
@@ -213,7 +218,7 @@ def set_refines(elem, existing_refines, *new_refines):
remove_refines(elem, existing_refines) remove_refines(elem, existing_refines)
for ref in reversed(new_refines): for ref in reversed(new_refines):
prop, val, scheme = ref prop, val, scheme = ref
r = elem.makeelement(const.OPF_META) r = elem.makeelement(base.tag('opf', 'meta'))
r.set('refines', '#' + eid), r.set('property', prop) r.set('refines', '#' + eid), r.set('property', prop)
r.text = val.strip() r.text = val.strip()
if scheme: if scheme:
@@ -249,7 +254,7 @@ def parse_identifier(ident, val, refines):
# Try the OPF 2 style opf:scheme attribute, which will be present, for # Try the OPF 2 style opf:scheme attribute, which will be present, for
# example, in EPUB 3 files that have had their metadata set by an # example, in EPUB 3 files that have had their metadata set by an
# application that only understands EPUB 2. # application that only understands EPUB 2.
scheme = ident.get(const.OPF_SCHEME) scheme = ident.get(base.tag('opf', 'scheme'))
if scheme and not lval.startswith('urn:'): if scheme and not lval.startswith('urn:'):
return finalize(scheme, val) return finalize(scheme, val)
@@ -267,7 +272,7 @@ def parse_identifier(ident, val, refines):
def read_identifiers(root, prefixes, refines): def read_identifiers(root, prefixes, refines):
ans = defaultdict(list) ans = collections.defaultdict(list)
for ident in XPath('./opf:metadata/dc:identifier')(root): for ident in XPath('./opf:metadata/dc:identifier')(root):
val = (ident.text or '').strip() val = (ident.text or '').strip()
if val: if val:
@@ -277,7 +282,8 @@ def read_identifiers(root, prefixes, refines):
return ans return ans
def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=False): def set_identifiers(root, prefixes, refines, new_identifiers,
force_identifiers=False):
uid = root.get('unique-identifier') uid = root.get('unique-identifier')
package_identifier = None package_identifier = None
for ident in XPath('./opf:metadata/dc:identifier')(root): for ident in XPath('./opf:metadata/dc:identifier')(root):
@@ -289,12 +295,15 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=
ident.getparent().remove(ident) ident.getparent().remove(ident)
continue continue
scheme, val = parse_identifier(ident, val, refines) scheme, val = parse_identifier(ident, val, refines)
if not scheme or not val or force_identifiers or scheme in new_identifiers: if (not scheme or
not val or
force_identifiers or
scheme in new_identifiers):
remove_element(ident, refines) remove_element(ident, refines)
continue continue
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
for scheme, val in new_identifiers.items(): for scheme, val in new_identifiers.items():
ident = metadata.makeelement(const.DC_IDENT) ident = metadata.makeelement(base.tag('dc', 'ident'))
ident.text = '%s:%s' % (scheme, val) ident.text = '%s:%s' % (scheme, val)
if package_identifier is None: if package_identifier is None:
metadata.append(ident) metadata.append(ident)
@@ -312,11 +321,12 @@ def identifier_writer(name):
if is_package_id: if is_package_id:
package_identifier = ident package_identifier = ident
val = (ident.text or '').strip() val = (ident.text or '').strip()
if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id: if (val.startswith(name + ':') or
ident.get(base.tag('opf', 'scheme')) == name) and not is_package_id:
remove_element(ident, refines) remove_element(ident, refines)
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
if ival: if ival:
ident = metadata.makeelement(const.DC_IDENT) ident = metadata.makeelement(base.tag('dc', 'ident'))
ident.text = '%s:%s' % (name, ival) ident.text = '%s:%s' % (name, ival)
if package_identifier is None: if package_identifier is None:
metadata.append(ident) metadata.append(ident)
@@ -366,7 +376,8 @@ def read_title_sort(root, prefixes, refines):
if fa: if fa:
return fa return fa
# Look for OPF 2.0 style title_sort # Look for OPF 2.0 style title_sort
for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root): for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
'title_sort"]')(root):
ans = m.get('content') ans = m.get('content')
if ans: if ans:
return ans return ans
@@ -376,12 +387,13 @@ def set_title(root, prefixes, refines, title, title_sort=None):
main_title = find_main_title(root, refines, remove_blanks=True) main_title = find_main_title(root, refines, remove_blanks=True)
if main_title is None: if main_title is None:
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
main_title = m.makeelement(const.DC_TITLE) main_title = m.makeelement(base.tag('dc', 'title'))
m.insert(0, main_title) m.insert(0, main_title)
main_title.text = title or None main_title.text = title or None
ts = [refdef('file-as', title_sort)] if title_sort else () ts = [refdef('file-as', title_sort)] if title_sort else ()
set_refines(main_title, refines, refdef('title-type', 'main'), *ts) set_refines(main_title, refines, refdef('title-type', 'main'), *ts)
for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root): for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
'title_sort"]')(root):
remove_element(m, refines) remove_element(m, refines)
# }}} # }}}
@@ -405,28 +417,32 @@ def set_languages(root, prefixes, refines, languages):
val = (lang.text or '').strip() val = (lang.text or '').strip()
if val: if val:
opf_languages.append(val) opf_languages.append(val)
languages = list(filter(lambda x: x and x != 'und', normalize_languages(opf_languages, languages))) languages = list(filter(lambda x: x and x != 'und',
normalize_languages(opf_languages, languages)))
if not languages: if not languages:
# EPUB spec says dc:language is required # EPUB spec says dc:language is required
languages = ['und'] languages = ['und']
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
for lang in uniq(languages): for lang in uniq(languages):
l = metadata.makeelement(const.DC_LANG) dc_lang = metadata.makeelement(base.tag('dc', 'lang'))
l.text = lang dc_lang.text = lang
metadata.append(l) metadata.append(dc_lang)
# }}} # }}}
# Creator/Contributor {{{ # Creator/Contributor {{{
# An author record: display name plus the sort ("file-as") form.
Author = collections.namedtuple('Author', 'name sort')
def is_relators_role(props, q):
    """Tell whether *props* carries the role *q*.

    ``props['role']`` is expected to be an iterable of
    ``(scheme_ns, scheme, role)`` triples (falsy entries are ignored). A
    triple matches when its role equals *q* case-insensitively and it either
    has no scheme namespace or uses the ``relators`` scheme of the reserved
    ``marc`` vocabulary. A missing ``'role'`` entry means "no match"
    instead of raising ``TypeError``.
    """
    for role in props.get('role') or ():
        if role:
            scheme_ns, scheme, role = role
            if (role.lower() == q and
                    (scheme_ns is None or
                     (scheme_ns, scheme) == (RES_PREFIXES['marc'],
                                             'relators'))):
                return True
    return False
@@ -440,15 +456,16 @@ def read_authors(root, prefixes, refines):
if file_as: if file_as:
aus = file_as[0][-1] aus = file_as[0][-1]
else: else:
aus = item.get(const.OPF_FILE_AS) or None aus = item.get(base.tag('opf', 'file_as')) or None
return Author(normalize_whitespace(val), normalize_whitespace(aus)) return Author(normalize_whitespace(val), normalize_whitespace(aus))
for item in XPath('./opf:metadata/dc:creator')(root): for item in XPath('./opf:metadata/dc:creator')(root):
val = (item.text or '').strip() val = (item.text or '').strip()
if val: if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) props = properties_for_id_with_scheme(item.get('id'), prefixes,
refines)
role = props.get('role') role = props.get('role')
opf_role = item.get(const.OPF_ROLE) opf_role = item.get(base.tag('opf', 'role'))
if role: if role:
if is_relators_role(props, 'aut'): if is_relators_role(props, 'aut'):
roled_authors.append(author(item, props, val)) roled_authors.append(author(item, props, val))
@@ -464,23 +481,30 @@ def read_authors(root, prefixes, refines):
def set_authors(root, prefixes, refines, authors):
    """Replace the author ``dc:creator`` elements of *root* with *authors*.

    Existing creators are removed unless they are marked with a role other
    than ``aut`` (through the ``opf:role`` attribute or a refining ``role``
    property). For every author with a non-empty ``name`` a new
    ``dc:creator`` is appended together with a refining ``role`` meta
    (``marc:relators`` / ``aut``) and, when ``sort`` is set, a ``file-as``
    meta.
    """
    ensure_prefix(root, prefixes, 'marc')
    for item in XPath('./opf:metadata/dc:creator')(root):
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        opf_role = item.get(base.tag('opf', 'role'))
        # Leave creators that are explicitly something other than an author.
        if ((opf_role and opf_role.lower() != 'aut') or
                (props.get('role') and not is_relators_role(props, 'aut'))):
            continue
        remove_element(item, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    for author in authors:
        if author.name:
            a = metadata.makeelement(base.tag('dc', 'creator'))
            aid = ensure_id(a)
            a.text = author.name
            metadata.append(a)
            m = metadata.makeelement(base.tag('opf', 'meta'),
                                     attrib={'refines': '#' + aid,
                                             'property': 'role',
                                             'scheme': 'marc:relators'})
            m.text = 'aut'
            metadata.append(m)
            if author.sort:
                m = metadata.makeelement(base.tag('opf', 'meta'),
                                         attrib={'refines': '#' + aid,
                                                 'property': 'file-as'})
                m.text = author.sort
                metadata.append(m)
@@ -490,9 +514,10 @@ def read_book_producers(root, prefixes, refines):
for item in XPath('./opf:metadata/dc:contributor')(root): for item in XPath('./opf:metadata/dc:contributor')(root):
val = (item.text or '').strip() val = (item.text or '').strip()
if val: if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) props = properties_for_id_with_scheme(item.get('id'), prefixes,
refines)
role = props.get('role') role = props.get('role')
opf_role = item.get(const.OPF_ROLE) opf_role = item.get(base.tag('opf', 'role'))
if role: if role:
if is_relators_role(props, 'bkp'): if is_relators_role(props, 'bkp'):
ans.append(normalize_whitespace(val)) ans.append(normalize_whitespace(val))
@@ -503,19 +528,24 @@ def read_book_producers(root, prefixes, refines):
def set_book_producers(root, prefixes, refines, producers):
    """Replace the book-producer ``dc:contributor`` elements of *root*.

    Existing contributors are removed unless they are marked with a role
    other than ``bkp`` (through the ``opf:role`` attribute or a refining
    ``role`` property). For every truthy entry of *producers* a new
    ``dc:contributor`` is appended together with a refining ``role`` meta
    (``marc:relators`` / ``bkp``).
    """
    for item in XPath('./opf:metadata/dc:contributor')(root):
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        opf_role = item.get(base.tag('opf', 'role'))
        # Leave contributors that are explicitly not book producers.
        if ((opf_role and opf_role.lower() != 'bkp') or
                (props.get('role') and not is_relators_role(props, 'bkp'))):
            continue
        remove_element(item, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    for bkp in producers:
        if bkp:
            a = metadata.makeelement(base.tag('dc', 'contributor'))
            aid = ensure_id(a)
            a.text = bkp
            metadata.append(a)
            m = metadata.makeelement(base.tag('opf', 'meta'),
                                     attrib={'refines': '#' + aid,
                                             'property': 'role',
                                             'scheme': 'marc:relators'})
            m.text = 'bkp'
            metadata.append(m)
# }}} # }}}
@@ -531,7 +561,9 @@ def parse_date(raw, is_w3cdtf=False):
ans = fix_only_date(ans) ans = fix_only_date(ans)
else: else:
ans = parse_date_(raw, assume_utc=True) ans = parse_date_(raw, assume_utc=True)
if ' ' not in raw and 'T' not in raw and (ans.hour, ans.minute, ans.second) == (0, 0, 0): if (' ' not in raw and
'T' not in raw and
(ans.hour, ans.minute, ans.second) == (0, 0, 0)):
ans = fix_only_date(ans) ans = fix_only_date(ans)
return ans return ans
@@ -552,14 +584,14 @@ def set_pubdate(root, prefixes, refines, val):
if not is_date_undefined(val): if not is_date_undefined(val):
val = isoformat(val) val = isoformat(val)
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(const.DC_DATE) d = m.makeelement(base.tag('dc', 'date'))
d.text = val d.text = val
m.append(d) m.append(d)
def read_timestamp(root, prefixes, refines): def read_timestamp(root, prefixes, refines):
pq = '%s:timestamp' % CALIBRE_PREFIX pq = '%s:timestamp' % CALIBRE_PREFIX
sq = '%s:w3cdtf' % reserved_prefixes['dcterms'] sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
for meta in XPath('./opf:metadata/opf:meta[@property]')(root): for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
val = (meta.text or '').strip() val = (meta.text or '').strip()
if val: if val:
@@ -570,7 +602,8 @@ def read_timestamp(root, prefixes, refines):
return parse_date(val, is_w3cdtf=scheme == sq) return parse_date(val, is_w3cdtf=scheme == sq)
except Exception: except Exception:
continue continue
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:timestamp"]')(root): for meta in XPath('./opf:metadata/opf:meta[@name="calibre:'
'timestamp"]')(root):
val = meta.get('content') val = meta.get('content')
if val: if val:
try: try:
@@ -584,7 +617,9 @@ def create_timestamp(root, prefixes, m, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
ensure_prefix(root, prefixes, 'dcterms') ensure_prefix(root, prefixes, 'dcterms')
val = w3cdtf(val) val = w3cdtf(val)
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'}) d = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'calibre:timestamp',
'scheme': 'dcterms:W3CDTF'})
d.text = val d.text = val
m.append(d) m.append(d)
@@ -599,8 +634,8 @@ def set_timestamp(root, prefixes, refines, val):
def read_last_modified(root, prefixes, refines): def read_last_modified(root, prefixes, refines):
pq = '%s:modified' % reserved_prefixes['dcterms'] pq = '%s:modified' % RES_PREFIXES['dcterms']
sq = '%s:w3cdtf' % reserved_prefixes['dcterms'] sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
for meta in XPath('./opf:metadata/opf:meta[@property]')(root): for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
val = (meta.text or '').strip() val = (meta.text or '').strip()
if val: if val:
@@ -614,7 +649,7 @@ def read_last_modified(root, prefixes, refines):
def set_last_modified(root, prefixes, refines, val=None): def set_last_modified(root, prefixes, refines, val=None):
pq = '%s:modified' % reserved_prefixes['dcterms'] pq = '%s:modified' % RES_PREFIXES['dcterms']
val = w3cdtf(val or utcnow()) val = w3cdtf(val or utcnow())
for meta in XPath('./opf:metadata/opf:meta[@property]')(root): for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
prop = expand_prefix(meta.get('property'), prefixes) prop = expand_prefix(meta.get('property'), prefixes)
@@ -625,7 +660,9 @@ def set_last_modified(root, prefixes, refines, val=None):
else: else:
ensure_prefix(root, prefixes, 'dcterms') ensure_prefix(root, prefixes, 'dcterms')
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'}) meta = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'dcterms:modified',
'scheme': 'dcterms:W3CDTF'})
m.append(meta) m.append(meta)
meta.text = val meta.text = val
# }}} # }}}
@@ -648,7 +685,7 @@ def set_comments(root, prefixes, refines, val):
if val: if val:
val = val.strip() val = val.strip()
if val: if val:
c = m.makeelement(const.DC_DESC) c = m.makeelement(base.tag('dc', 'desc'))
c.text = val c.text = val
m.append(c) m.append(c)
# }}} # }}}
@@ -670,7 +707,7 @@ def set_publisher(root, prefixes, refines, val):
if val: if val:
val = val.strip() val = val.strip()
if val: if val:
c = m.makeelement(const.DC_PUBLISHER('publisher')) c = m.makeelement(base.tag('dc', 'publisher'))
c.text = normalize_whitespace(val) c.text = normalize_whitespace(val)
m.append(c) m.append(c)
# }}} # }}}
@@ -693,7 +730,7 @@ def set_tags(root, prefixes, refines, val):
if val: if val:
val = uniq(list(filter(None, val))) val = uniq(list(filter(None, val)))
for x in val: for x in val:
c = m.makeelement(const.DC_SUBJ) c = m.makeelement(base.tag('dc', 'subj'))
c.text = normalize_whitespace(x) c.text = normalize_whitespace(x)
if c.text: if c.text:
m.append(c) m.append(c)
@@ -725,7 +762,7 @@ def read_rating(root, prefixes, refines):
def create_rating(root, prefixes, val):
    """Append a ``calibre:rating`` meta element with text *val*.

    Declares the ``calibre`` prefix on *root* first. *val* is assigned to
    the element text as-is, so it is presumably already a string - verify
    against the caller.
    """
    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    m = XPath('./opf:metadata')(root)[0]
    d = m.makeelement(base.tag('opf', 'meta'),
                      attrib={'property': 'calibre:rating'})
    d.text = val
    m.append(d)
@@ -747,7 +784,8 @@ def set_rating(root, prefixes, refines, val):
def read_series(root, prefixes, refines): def read_series(root, prefixes, refines):
series_index = 1.0 series_index = 1.0
for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection" and @id]')(root): for meta in XPath('./opf:metadata/opf:meta[@property="'
'belongs-to-collection" and @id]')(root):
val = (meta.text or '').strip() val = (meta.text or '').strip()
if val: if val:
props = properties_for_id(meta.get('id'), refines) props = properties_for_id(meta.get('id'), refines)
@@ -757,13 +795,15 @@ def read_series(root, prefixes, refines):
except Exception: except Exception:
pass pass
return normalize_whitespace(val), series_index return normalize_whitespace(val), series_index
for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]/@content')(root): for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]'
'/@content')(root):
try: try:
series_index = float(si) series_index = float(si)
break break
except: except Exception:
pass pass
for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]/@content')(root): for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]'
'/@content')(root):
s = normalize_whitespace(s) s = normalize_whitespace(s)
if s: if s:
return s, series_index return s, series_index
@@ -772,16 +812,20 @@ def read_series(root, prefixes, refines):
def create_series(root, refines, series, series_index):
    """Append a ``belongs-to-collection`` meta element for *series*.

    The new element gets two refinements: ``collection-type`` = ``series``
    and ``group-position`` = *series_index*.
    """
    m = XPath('./opf:metadata')(root)[0]
    d = m.makeelement(base.tag('opf', 'meta'),
                      attrib={'property': 'belongs-to-collection'})
    d.text = series
    m.append(d)
    set_refines(d, refines, refdef('collection-type', 'series'),
                refdef('group-position', series_index))
def set_series(root, prefixes, refines, series, series_index):
    """Replace all series information in *root*.

    Removes both the ``calibre:series`` / ``calibre:series_index`` name-style
    metas and every ``belongs-to-collection`` meta, then, when *series* is
    truthy, creates a fresh collection entry.
    """
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or '
                      '@name="calibre:series_index"]')(root):
        remove_element(meta, refines)
    for meta in XPath('./opf:metadata/opf:meta[@property="'
                      'belongs-to-collection"]')(root):
        remove_element(meta, refines)
    if series:
        # NOTE(review): '%.2g' keeps only two significant digits, so an
        # index such as 123 is written as '1.2e+02' - confirm this is
        # intended.
        create_series(root, refines, series, '%.2g' % series_index)
@@ -806,7 +850,8 @@ def dict_reader(name, load=json.loads, try2=True):
except Exception: except Exception:
continue continue
if try2: if try2:
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root): for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
name)(root):
val = meta.get('content') val = meta.get('content')
if val: if val:
try: try:
@@ -827,7 +872,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
def writer(root, prefixes, refines, val): def writer(root, prefixes, refines, val):
if remove2: if remove2:
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root): for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
name)(root):
remove_element(meta, refines) remove_element(meta, refines)
for meta in XPath('./opf:metadata/opf:meta[@property]')(root): for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
prop = expand_prefix(meta.get('property'), prefixes) prop = expand_prefix(meta.get('property'), prefixes)
@@ -836,7 +882,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
if val: if val:
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name}) d = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'calibre:%s' % name})
d.text = serialize(val) d.text = serialize(val)
m.append(d) m.append(d)
return writer return writer
@@ -855,12 +902,14 @@ def deserialize_user_metadata(val):
return ans return ans
# Reader for the 'user_metadata' entry built by dict_reader; values are
# decoded with deserialize_user_metadata and try2 is switched off.
read_user_metadata3 = dict_reader('user_metadata',
                                  load=deserialize_user_metadata, try2=False)
def read_user_metadata2(root, remove_tags=False): def read_user_metadata2(root, remove_tags=False):
ans = {} ans = {}
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root): for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
'"calibre:user_metadata:")]')(root):
name = meta.get('name') name = meta.get('name')
name = ':'.join(name.split(':')[2:]) name = ':'.join(name.split(':')[2:])
if not name or not name.startswith('#'): if not name or not name.startswith('#'):
@@ -881,18 +930,23 @@ def read_user_metadata2(root, remove_tags=False):
def read_user_metadata(root, prefixes, refines):
    """Return the user metadata stored in *root*.

    Uses the result of ``read_user_metadata3`` when it is truthy and falls
    back to ``read_user_metadata2`` otherwise.
    """
    return (read_user_metadata3(root, prefixes, refines) or
            read_user_metadata2(root))
def serialize_user_metadata(val):
    """Serialize *val* to an indented, key-sorted JSON string.

    *val* is passed through ``object_to_unicode`` first; objects JSON cannot
    encode natively are handed to ``to_json``. Non-ASCII characters are
    written unescaped.
    """
    return json.dumps(object_to_unicode(val), ensure_ascii=False,
                      default=to_json, indent=2, sort_keys=True)
# Writer for the 'user_metadata' entry built by dict_writer; values are
# encoded with serialize_user_metadata and remove2 is switched off.
set_user_metadata3 = dict_writer('user_metadata',
                                 serialize=serialize_user_metadata,
                                 remove2=False)
def set_user_metadata(root, prefixes, refines, val): def set_user_metadata(root, prefixes, refines, val):
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root): for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
'"calibre:user_metadata:")]')(root):
remove_element(meta, refines) remove_element(meta, refines)
if val: if val:
nval = {} nval = {}
@@ -921,26 +975,32 @@ def read_raster_cover(root, prefixes, refines):
if href: if href:
return href return href
for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]/@content')(root): for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]'
for item in XPath('./opf:manifest/opf:item[@id and @href and @media-type]')(root): '/@content')(root):
for item in XPath('./opf:manifest/opf:item[@id and @href and '
'@media-type]')(root):
if item.get('id') == item_id: if item.get('id') == item_id:
href = get_href(item) href = get_href(item)
if href: if href:
return href return href
def ensure_is_only_raster_cover(root, prefixes, refines,
                                raster_cover_item_href):
    """Flag the manifest item at ``raster_cover_item_href`` as the cover.

    Works in three passes over ``root``:

    1. every ``<meta name="cover">`` element in the metadata section is
       removed via ``remove_element``;
    2. ``cover-image`` is stripped from the ``properties`` attribute of each
       item reported by ``items_with_property``; the attribute is deleted
       when nothing else remains in it;
    3. ``cover-image`` is appended to the ``properties`` of each manifest
       item whose ``href`` equals ``raster_cover_item_href``.
    """
    find_cover_metas = XPath('./opf:metadata/opf:meta[@name="cover"]')
    for cover_meta in find_cover_metas(root):
        remove_element(cover_meta, refines)

    for flagged in items_with_property(root, 'cover-image', prefixes):
        stripped = flagged.get('properties').replace('cover-image', '')
        remaining = normalize_whitespace(stripped)
        if not remaining:
            # Nothing but the cover flag was set; drop the attribute.
            del flagged.attrib['properties']
        else:
            flagged.set('properties', remaining)

    for entry in XPath('./opf:manifest/opf:item')(root):
        if entry.get('href') != raster_cover_item_href:
            continue
        existing = entry.get('properties') or ''
        entry.set('properties',
                  normalize_whitespace(existing + ' cover-image'))
# }}} # }}}
@@ -960,7 +1020,7 @@ def set_last_modified_in_opf(root):
def read_metadata(root, ver=None, return_extra_data=False): def read_metadata(root, ver=None, return_extra_data=False):
ans = Metadata('Unknown', ['Unknown']) ans = base.Metadata('Unknown', ['Unknown'])
prefixes, refines = read_prefixes(root), read_refines(root) prefixes, refines = read_prefixes(root), read_refines(root)
identifiers = read_identifiers(root, prefixes, refines) identifiers = read_identifiers(root, prefixes, refines)
ids = {} ids = {}
@@ -1000,12 +1060,16 @@ def read_metadata(root, ver=None, return_extra_data=False):
s, si = read_series(root, prefixes, refines) s, si = read_series(root, prefixes, refines)
if s: if s:
ans.series, ans.series_index = s, si ans.series, ans.series_index = s, si
ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map ans.author_link_map = read_author_link_map(root, prefixes,
ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories refines) or ans.author_link_map
for name, fm in (read_user_metadata(root, prefixes, refines) or {}).items(): ans.user_categories = read_user_categories(root, prefixes,
refines) or ans.user_categories
for name, fm in (read_user_metadata(root, prefixes,
refines) or {}).items():
ans.set_user_metadata(name, fm) ans.set_user_metadata(name, fm)
if return_extra_data: if return_extra_data:
ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines) ans = (ans, ver, read_raster_cover(root, prefixes, refines),
first_spine_item(root, prefixes, refines))
return ans return ans
@@ -1014,7 +1078,9 @@ def get_metadata(stream):
return read_metadata(root) return read_metadata(root)
def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True): def apply_metadata(root, mi, cover_prefix='', cover_data=None,
apply_null=False, update_timestamp=False,
force_identifiers=False, add_missing_cover=True):
prefixes, refines = read_prefixes(root), read_refines(root) prefixes, refines = read_prefixes(root), read_refines(root)
current_mi = read_metadata(root) current_mi = read_metadata(root)
if apply_null: if apply_null:
@@ -1024,7 +1090,8 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
def ok(x): def ok(x):
return not mi.is_null(x) return not mi.is_null(x)
if ok('identifiers'): if ok('identifiers'):
set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers) set_identifiers(root, prefixes, refines, mi.identifiers,
force_identifiers=force_identifiers)
if ok('title'): if ok('title'):
set_title(root, prefixes, refines, mi.title, mi.title_sort) set_title(root, prefixes, refines, mi.title, mi.title_sort)
if ok('languages'): if ok('languages'):
@@ -1052,16 +1119,21 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
if ok('series'): if ok('series'):
set_series(root, prefixes, refines, mi.series, mi.series_index or 1) set_series(root, prefixes, refines, mi.series, mi.series_index or 1)
if ok('author_link_map'): if ok('author_link_map'):
set_author_link_map(root, prefixes, refines, getattr(mi, 'author_link_map', None)) set_author_link_map(root, prefixes, refines,
getattr(mi, 'author_link_map', None))
if ok('user_categories'): if ok('user_categories'):
set_user_categories(root, prefixes, refines, getattr(mi, 'user_categories', None)) set_user_categories(root, prefixes, refines,
getattr(mi, 'user_categories', None))
# We ignore apply_null for the next two to match the behavior with opf2.py # We ignore apply_null for the next two to match the behavior with opf2.py
if mi.application_id: if mi.application_id:
set_application_id(root, prefixes, refines, mi.application_id) set_application_id(root, prefixes, refines, mi.application_id)
if mi.uuid: if mi.uuid:
set_uuid(root, prefixes, refines, mi.uuid) set_uuid(root, prefixes, refines, mi.uuid)
new_user_metadata, current_user_metadata = mi.get_all_user_metadata(True), current_mi.get_all_user_metadata(True)
new_user_metadata = mi.get_all_user_metadata(True)
current_user_metadata = current_mi.get_all_user_metadata(True)
missing = object() missing = object()
for key in tuple(new_user_metadata): for key in tuple(new_user_metadata):
meta = new_user_metadata.get(key) meta = new_user_metadata.get(key)
if meta is None: if meta is None:
@@ -1098,7 +1170,9 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
return raster_cover return raster_cover
def set_metadata(stream, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True): def set_metadata(stream, mi, cover_prefix='', cover_data=None,
apply_null=False, update_timestamp=False,
force_identifiers=False, add_missing_cover=True):
root = parse_opf(stream) root = parse_opf(stream)
return apply_metadata( return apply_metadata(
root, mi, cover_prefix=cover_prefix, cover_data=cover_data, root, mi, cover_prefix=cover_prefix, cover_data=cover_data,

View File

@@ -8,37 +8,28 @@ import uuid
from lxml import etree from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX
from ebook_converter.ebooks.mobi.reader.index import read_index from ebook_converter.ebooks.mobi.reader.index import read_index
from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc
from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup
from ebook_converter.ebooks.mobi.reader.containers import Container, find_imgtype from ebook_converter.ebooks.mobi.reader import containers
from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator
from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.utils import read_font_record from ebook_converter.ebooks.mobi.utils import read_font_record
from ebook_converter.ebooks.oeb.parse_utils import parse_html from ebook_converter.ebooks.oeb.parse_utils import parse_html
from ebook_converter.ebooks.oeb.base import XPath, xml2text from ebook_converter.ebooks.oeb import base
from ebook_converter.polyglot.builtins import as_unicode from ebook_converter.polyglot.builtins import as_unicode
# Byte-string patterns compiled once at import time; Mobi8Reader instances
# reference them as id_re, name_re and aid_re.
# Tag carrying an id/ID attribute; group 1 captures the attribute value.
ID_RE = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
# <a> tag whose name/NAME attribute directly follows the tag name; group 1
# captures the attribute value.
NAME_RE = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
# Tag carrying an aid/AID attribute; group 1 captures the attribute value.
AID_RE = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
# Lightweight record types used by Mobi8Reader.
Part = collections.namedtuple('Part', 'num type filename start end aid')
Elem = collections.namedtuple('Elem', 'insert_pos toc_text file_number '
                              'sequence_number start_pos length')
FlowInfo = collections.namedtuple('FlowInfo', 'type format dir fname')
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
Part = collections.namedtuple('Part',
'num type filename start end aid')
Elem = collections.namedtuple('Elem',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
FlowInfo = collections.namedtuple('FlowInfo',
'type format dir fname')
# locate beginning and ending positions of tag with specific aid attribute # locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid): def locate_beg_end_of_tag(ml, aid):
pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid
aid_pattern = re.compile(pattern, re.IGNORECASE) aid_pattern = re.compile(pattern, re.IGNORECASE)
@@ -64,7 +55,8 @@ def reverse_tag_iter(block):
end = plt end = plt
def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number): def get_first_resource_index(first_image_index, num_of_text_records,
first_text_record_number):
first_resource_index = first_image_index first_resource_index = first_image_index
if first_resource_index in {-1, NULL_INDEX}: if first_resource_index in {-1, NULL_INDEX}:
first_resource_index = num_of_text_records + first_text_record_number first_resource_index = num_of_text_records + first_text_record_number
@@ -78,23 +70,27 @@ class Mobi8Reader(object):
self.mobi6_reader, self.log = mobi6_reader, log self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header self.header = mobi6_reader.book_header
self.encrypted_fonts = [] self.encrypted_fonts = []
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''') self.id_re = ID_RE
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''') self.name_re = NAME_RE
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') self.aid_re = AID_RE
def __call__(self): def __call__(self):
self.mobi6_reader.check_for_drm() self.mobi6_reader.check_for_drm()
self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8') self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8')
bh = self.mobi6_reader.book_header bh = self.mobi6_reader.book_header
_gfri = get_first_resource_index
if self.mobi6_reader.kf8_type == 'joint': if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2 offset = self.mobi6_reader.kf8_boundary + 2
self.resource_offsets = [ self.resource_offsets = [(_gfri(bh.first_image_index,
(get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2), bh.mobi6_records, 1), offset - 2),
(get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)), (_gfri(bh.kf8_first_image_index,
] bh.records, offset),
len(self.mobi6_reader.sections))]
else: else:
offset = 1 offset = 1
self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))] self.resource_offsets = [(_gfri(bh.first_image_index, bh.records,
offset),
len(self.mobi6_reader.sections))]
self.processed_records = self.mobi6_reader.extract_text(offset=offset) self.processed_records = self.mobi6_reader.extract_text(offset=offset)
self.raw_ml = self.mobi6_reader.mobi_html self.raw_ml = self.mobi6_reader.mobi_html
@@ -123,37 +119,37 @@ class Mobi8Reader(object):
raise ValueError('KF8 does not have a valid FDST record') raise ValueError('KF8 does not have a valid FDST record')
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4) sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
secs = struct.unpack_from(b'>%dL' % (num_sections*2), secs = struct.unpack_from(b'>%dL' % (num_sections*2),
header, sec_start) header, sec_start)
self.flow_table = tuple(zip(secs[::2], secs[1::2])) self.flow_table = tuple(zip(secs[::2], secs[1::2]))
self.files = [] self.files = []
if self.header.skelidx != NULL_INDEX: if self.header.skelidx != NULL_INDEX:
table = read_index(self.kf8_sections, self.header.skelidx, table = read_index(self.kf8_sections, self.header.skelidx,
self.header.codec)[0] self.header.codec)[0]
File = collections.namedtuple('File', File = collections.namedtuple('File', 'file_number name '
'file_number name divtbl_count start_position length') 'divtbl_count start_position length')
for i, text in enumerate(table): for i, text in enumerate(table):
tag_map = table[text] tag_map = table[text]
self.files.append(File(i, text, tag_map[1][0], self.files.append(File(i, text, tag_map[1][0],
tag_map[6][0], tag_map[6][1])) tag_map[6][0], tag_map[6][1]))
self.elems = [] self.elems = []
if self.header.dividx != NULL_INDEX: if self.header.dividx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.dividx, table, cncx = read_index(self.kf8_sections, self.header.dividx,
self.header.codec) self.header.codec)
for i, text in enumerate(table): for i, text in enumerate(table):
tag_map = table[text] tag_map = table[text]
toc_text = cncx[tag_map[2][0]] toc_text = cncx[tag_map[2][0]]
self.elems.append(Elem(int(text), toc_text, tag_map[3][0], self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
tag_map[4][0], tag_map[6][0], tag_map[6][1])) tag_map[4][0], tag_map[6][0],
tag_map[6][1]))
self.guide = [] self.guide = []
if self.header.othidx != NULL_INDEX: if self.header.othidx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.othidx, table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec) self.header.codec)
Item = collections.namedtuple('Item', Item = collections.namedtuple('Item', 'type title pos_fid')
'type title pos_fid')
for i, ref_type in enumerate(table): for i, ref_type in enumerate(table):
tag_map = table[ref_type] tag_map = table[ref_type]
@@ -161,7 +157,7 @@ class Mobi8Reader(object):
title = cncx[tag_map[1][0]] title = cncx[tag_map[1][0]]
fileno = None fileno = None
if 3 in list(tag_map.keys()): if 3 in list(tag_map.keys()):
fileno = tag_map[3][0] fileno = tag_map[3][0]
if 6 in list(tag_map.keys()): if 6 in list(tag_map.keys()):
fileno = tag_map[6] fileno = tag_map[6]
if isinstance(ref_type, bytes): if isinstance(ref_type, bytes):
@@ -205,17 +201,19 @@ class Mobi8Reader(object):
head = skeleton[:insertpos] head = skeleton[:insertpos]
tail = skeleton[insertpos:] tail = skeleton[insertpos:]
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
head.rfind(b'<')): head.rfind(b'<')):
# There is an incomplete tag in either the head or tail. # There is an incomplete tag in either the head or tail.
# This can happen for some badly formed KF8 files, see for # This can happen for some badly formed KF8 files, see for
# example, https://bugs.launchpad.net/bugs/1082669 # example, https://bugs.launchpad.net/bugs/1082669
if not inspos_warned: if not inspos_warned:
self.log.warn( self.log.warn('The div table for %s has incorrect '
'The div table for %s has incorrect insert ' 'insert positions. Calculating '
'positions. Calculating manually.'%skelname) 'manually.' % skelname)
inspos_warned = True inspos_warned = True
bp, ep = locate_beg_end_of_tag(skeleton, aidtext if bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
isinstance(aidtext, bytes) else aidtext.encode('utf-8')) isinstance(aidtext, bytes)
else
aidtext.encode('utf-8'))
if bp != ep: if bp != ep:
insertpos = ep + 1 + startpos insertpos = ep + 1 + startpos
@@ -228,7 +226,7 @@ class Mobi8Reader(object):
aidtext = str(uuid.uuid4()) aidtext = str(uuid.uuid4())
filename = aidtext + '.html' filename = aidtext + '.html'
self.partinfo.append(Part(skelnum, 'text', filename, skelpos, self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
baseptr, aidtext)) baseptr, aidtext))
# The primary css style sheet is typically stored next followed by any # The primary css style sheet is typically stored next followed by any
# snippets of code that were previously inlined in the # snippets of code that were previously inlined in the
@@ -238,10 +236,10 @@ class Mobi8Reader(object):
# The problem is that for most browsers and ereaders, you can not # The problem is that for most browsers and ereaders, you can not
# use <img src="imageXXXX.svg" /> to import any svg image that itself # use <img src="imageXXXX.svg" /> to import any svg image that itself
# properly uses an <image/> tag to import some raster image - it # properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all browsers # should work according to the spec but does not for almost all
# and ereaders and causes epub validation issues because those raster # browsers and ereaders and causes epub validation issues because
# images are in manifest but not in xhtml text - since they only # those raster images are in manifest but not in xhtml text - since
# referenced from an svg image # they only referenced from an svg image
# So we need to check the remaining flow pieces to see if they are css # So we need to check the remaining flow pieces to see if they are css
# or svg images. if svg images, we must check if they have an <image/> # or svg images. if svg images, we must check if they have an <image/>
@@ -252,7 +250,8 @@ class Mobi8Reader(object):
self.flowinfo.append(FlowInfo(None, None, None, None)) self.flowinfo.append(FlowInfo(None, None, None, None))
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE) svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE) image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''',
re.IGNORECASE)
for j in range(1, len(self.flows)): for j in range(1, len(self.flows)):
flowpart = self.flows[j] flowpart = self.flows[j]
nstr = '%04d' % j nstr = '%04d' % j
@@ -276,7 +275,8 @@ class Mobi8Reader(object):
# search for CDATA and if exists inline it # search for CDATA and if exists inline it
if flowpart.find(b'[CDATA[') >= 0: if flowpart.find(b'[CDATA[') >= 0:
typ = 'css' typ = 'css'
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n' flowpart = (b'<style type="text/css">\n' + flowpart +
b'\n</style>\n')
format = 'inline' format = 'inline'
dir = None dir = None
fname = None fname = None
@@ -300,7 +300,8 @@ class Mobi8Reader(object):
def get_id_tag_by_pos_fid(self, posfid, offset): def get_id_tag_by_pos_fid(self, posfid, offset):
# first convert kindle:pos:fid and offset info to position in file # first convert kindle:pos:fid and offset info to position in file
insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid] (insertpos, idtext, filenum,
seqnm, startpos, length) = self.elems[posfid]
pos = insertpos + offset pos = insertpos + offset
fi = self.get_file_info(pos) fi = self.get_file_info(pos)
# an existing "id=" must exist in original xhtml otherwise it would not # an existing "id=" must exist in original xhtml otherwise it would not
@@ -311,20 +312,20 @@ class Mobi8Reader(object):
# so find the closest "id=" before position the file by actually # so find the closest "id=" before position the file by actually
# searching in that file # searching in that file
idtext = self.get_id_tag(pos) idtext = self.get_id_tag(pos)
return '%s/%s'%(fi.type, fi.filename), idtext return '%s/%s' % (fi.type, fi.filename), idtext
def get_id_tag(self, pos): def get_id_tag(self, pos):
# Find the first tag with a named anchor (name or id attribute) before # Find the first tag with a named anchor (name or id attribute) before
# pos # pos
fi = self.get_file_info(pos) fi = self.get_file_info(pos)
if fi.num is None and fi.start is None: if fi.num is None and fi.start is None:
raise ValueError('No file contains pos: %d'%pos) raise ValueError('No file contains pos: %d' % pos)
textblock = self.parts[fi.num] textblock = self.parts[fi.num]
npos = pos - fi.start npos = pos - fi.start
pgt = textblock.find(b'>', npos) pgt = textblock.find(b'>', npos)
plt = textblock.find(b'<', npos) plt = textblock.find(b'<', npos)
# if npos inside a tag then search all text before the its end of tag marker # if npos inside a tag then search all text before the its end of tag
# else not in a tag need to search the preceding tag # marker else not in a tag need to search the preceding tag
if plt == npos or pgt < plt: if plt == npos or pgt < plt:
npos = pgt + 1 npos = pgt + 1
textblock = textblock[0:npos] textblock = textblock[0:npos]
@@ -371,7 +372,7 @@ class Mobi8Reader(object):
linktgt = fi.filename linktgt = fi.filename
if idtext: if idtext:
linktgt += '#' + idtext linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwd()) g = Guide.Reference('%s/%s' % (fi.type, linktgt), os.getcwd())
g.title, g.type = 'start', 'text' g.title, g.type = 'start', 'text'
guide.append(g) guide.append(g)
@@ -379,7 +380,7 @@ class Mobi8Reader(object):
def create_ncx(self): def create_ncx(self):
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx, index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
self.header.codec) self.header.codec)
remove = [] remove = []
# Add href and anchor info to the index entries # Add href and anchor info to the index entries
@@ -389,15 +390,15 @@ class Mobi8Reader(object):
pos = entry['pos'] pos = entry['pos']
fi = self.get_file_info(pos) fi = self.get_file_info(pos)
if fi.filename is None: if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos) raise ValueError('Index entry has invalid pos: %d' % pos)
idtag = self.get_id_tag(pos) idtag = self.get_id_tag(pos)
href = '%s/%s'%(fi.type, fi.filename) href = '%s/%s' % (fi.type, fi.filename)
else: else:
try: try:
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid) href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
except ValueError: except ValueError:
self.log.warn('Invalid entry in NCX (title: %s), ignoring' self.log.warn('Invalid entry in NCX (title: %s), '
%entry['text']) 'ignoring' % entry['text'])
remove.append(entry) remove.append(entry)
continue continue
@@ -411,7 +412,8 @@ class Mobi8Reader(object):
return build_toc(index_entries) return build_toc(index_entries)
def extract_resources(self, sections): def extract_resources(self, sections):
from ebook_converter.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF from ebook_converter.ebooks.mobi.writer2.resources import \
PLACEHOLDER_GIF
resource_map = [] resource_map = []
container = None container = None
for x in ('fonts', 'images'): for x in ('fonts', 'images'):
@@ -424,16 +426,18 @@ class Mobi8Reader(object):
typ = data[:4] typ = data[:4]
href = None href = None
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN', if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}: b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC',
b'CMET', b'PAGE'}:
pass # Ignore these records pass # Ignore these records
elif typ == b'FONT': elif typ == b'FONT':
font = read_font_record(data) font = read_font_record(data)
href = "fonts/%05d.%s" % (fname_idx, font['ext']) href = "fonts/%05d.%s" % (fname_idx, font['ext'])
if font['err']: if font['err']:
self.log.warn('Reading font record %d failed: %s'%( self.log.warn('Reading font record %d failed: %s' %
fname_idx, font['err'])) (fname_idx, font['err']))
if font['headers']: if font['headers']:
self.log.debug('Font record headers: %s'%font['headers']) self.log.debug('Font record headers: %s' %
font['headers'])
with open(href.replace('/', os.sep), 'wb') as f: with open(href.replace('/', os.sep), 'wb') as f:
f.write(font['font_data'] if font['font_data'] else f.write(font['font_data'] if font['font_data'] else
font['raw_data']) font['raw_data'])
@@ -443,19 +447,23 @@ class Mobi8Reader(object):
if data == b'CONTBOUNDARY': if data == b'CONTBOUNDARY':
container = None container = None
continue continue
container = Container(data) container = containers.Container(data)
elif typ == b'CRES': elif typ == b'CRES':
data, imgtype = container.load_image(data) data, imgtype = container.load_image(data)
if data is not None: if data is not None:
href = 'images/%05d.%s'%(container.resource_index, imgtype) href = 'images/%05d.%s' % (container.resource_index,
imgtype)
with open(href.replace('/', os.sep), 'wb') as f: with open(href.replace('/', os.sep), 'wb') as f:
f.write(data) f.write(data)
elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None: elif (typ == b'\xa0\xa0\xa0\xa0' and
len(data) == 4 and
container is not None):
container.resource_index += 1 container.resource_index += 1
elif container is None: elif container is None:
if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF): if not (len(data) == len(PLACEHOLDER_GIF) and
imgtype = find_imgtype(data) data == PLACEHOLDER_GIF):
href = 'images/%05d.%s'%(fname_idx, imgtype) imgtype = containers.find_imgtype(data)
href = 'images/%05d.%s' % (fname_idx, imgtype)
with open(href.replace('/', os.sep), 'wb') as f: with open(href.replace('/', os.sep), 'wb') as f:
f.write(data) f.write(data)
@@ -482,7 +490,7 @@ class Mobi8Reader(object):
if os.path.exists(href.replace('/', os.sep)): if os.path.exists(href.replace('/', os.sep)):
try: try:
toc = self.read_inline_toc(href, frag) toc = self.read_inline_toc(href, frag)
except: except Exception:
self.log.exception('Failed to read inline ToC') self.log.exception('Failed to read inline ToC')
opf = OPFCreator(os.getcwd(), mi) opf = OPFCreator(os.getcwd(), mi)
@@ -493,7 +501,8 @@ class Mobi8Reader(object):
# If there are no images then the azw3 input plugin dumps all # If there are no images then the azw3 input plugin dumps all
# binary records as .unknown images, remove them # binary records as .unknown images, remove them
if self.for_tweak and os.path.exists('images') and os.path.isdir('images'): if (self.for_tweak and os.path.exists('images') and
os.path.isdir('images')):
files = os.listdir('images') files = os.listdir('images')
unknown = [x for x in files if x.endswith('.unknown')] unknown = [x for x in files if x.endswith('.unknown')]
if len(files) == len(unknown): if len(files) == len(unknown):
@@ -502,7 +511,7 @@ class Mobi8Reader(object):
if self.for_tweak: if self.for_tweak:
try: try:
os.remove('debug-raw.html') os.remove('debug-raw.html')
except: except Exception:
pass pass
opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude) opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
@@ -528,7 +537,7 @@ class Mobi8Reader(object):
with open(href.replace('/', os.sep), 'rb') as f: with open(href.replace('/', os.sep), 'rb') as f:
raw = f.read().decode(self.header.codec) raw = f.read().decode(self.header.codec)
root = parse_html(raw, log=self.log) root = parse_html(raw, log=self.log)
body = XPath('//h:body')(root) body = base.XPath('//h:body')(root)
reached = False reached = False
if body: if body:
start = body[0] start = body[0]
@@ -536,7 +545,7 @@ class Mobi8Reader(object):
start = None start = None
reached = True reached = True
if frag: if frag:
elems = XPath('//*[@id="%s"]'%frag)(root) elems = base.XPath('//*[@id="%s"]' % frag)(root)
if elems: if elems:
start = elems[0] start = elems[0]
@@ -554,12 +563,13 @@ class Mobi8Reader(object):
seen = set() seen = set()
links = [] links = []
for elem in root.iterdescendants(etree.Element): for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == const.XHTML_A and elem.get('href', if reached and elem.tag == base.tag('xhtml',
'a') and elem.get('href',
False): False):
href = elem.get('href') href = elem.get('href')
href, frag = urllib.parse.urldefrag(href) href, frag = urllib.parse.urldefrag(href)
href = base_href + '/' + href href = base_href + '/' + href
text = xml2text(elem).strip() text = base.xml2text(elem).strip()
if (text, href, frag) in seen: if (text, href, frag) in seen:
continue continue
seen.add((text, href, frag)) seen.add((text, href, frag))
@@ -568,7 +578,7 @@ class Mobi8Reader(object):
reached = True reached = True
depths = sorted(set(x[-1] for x in links)) depths = sorted(set(x[-1] for x in links))
depth_map = {x:i for i, x in enumerate(depths)} depth_map = {x: i for i, x in enumerate(depths)}
for text, href, frag, depth in links: for text, href, frag, depth in links:
depth = depth_map[depth] depth = depth_map[depth]
if current_depth is None: if current_depth is None:

View File

@@ -1,5 +1,8 @@
import collections
import errno import errno
import hashlib import hashlib
import io
import itertools
import logging import logging
import os import os
import re import re
@@ -7,13 +10,10 @@ import shutil
import sys import sys
import time import time
import unicodedata import unicodedata
import uuid
from collections import defaultdict
from io import BytesIO
from itertools import count
import urllib.parse import urllib.parse
import uuid
from css_parser import getUrls, replaceUrls import css_parser
from lxml import etree from lxml import etree
from ebook_converter import constants as const from ebook_converter import constants as const
@@ -35,10 +35,7 @@ from ebook_converter.ebooks.metadata.utils import parse_opf_version
from ebook_converter.ebooks.mobi import MobiError from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
from ebook_converter.ebooks.mobi.tweak import set_cover from ebook_converter.ebooks.mobi.tweak import set_cover
from ebook_converter.ebooks.oeb.base import ( from ebook_converter.ebooks.oeb import base as oeb_base
OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote
)
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook
from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak
@@ -96,7 +93,7 @@ def abspath_to_name(path, root):
return relpath(os.path.abspath(path), root).replace(os.sep, '/') return relpath(os.path.abspath(path), root).replace(os.sep, '/')
def name_to_href(name, root, base=None, quote=urlquote): def name_to_href(name, root, base=None, quote=oeb_base.urlquote):
fullpath = name_to_abspath(name, root) fullpath = name_to_abspath(name, root)
basepath = root if base is None else os.path.dirname(name_to_abspath(base, root)) basepath = root if base is None else os.path.dirname(name_to_abspath(base, root))
path = relpath(fullpath, basepath).replace(os.sep, '/') path = relpath(fullpath, basepath).replace(os.sep, '/')
@@ -111,7 +108,7 @@ def href_to_name(href, root, base=None):
return None return None
if purl.scheme or not purl.path: if purl.scheme or not purl.path:
return None return None
href = urlunquote(purl.path) href = oeb_base.urlunquote(purl.path)
if iswindows and ':' in href: if iswindows and ':' in href:
# path manipulations on windows fail for paths with : in them, so we # path manipulations on windows fail for paths with : in them, so we
# assume all such paths are invalid/absolute paths. # assume all such paths are invalid/absolute paths.
@@ -324,7 +321,7 @@ class Container(ContainerBase): # {{{
item_id = 'id' + '%d'%c item_id = 'id' + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
href = self.name_to_href(name, self.opf_name) href = self.name_to_href(name, self.opf_name)
item = manifest.makeelement(const.OPF_ITEM, item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href) id=item_id, href=href)
item.set('media-type', self.mime_map[name]) item.set('media-type', self.mime_map[name])
self.insert_into_xml(manifest, item) self.insert_into_xml(manifest, item)
@@ -340,7 +337,7 @@ class Container(ContainerBase): # {{{
def make_name_unique(self, name): def make_name_unique(self, name):
''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. ''' ''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. '''
counter = count() counter = itertools.count()
while self.has_name_case_insensitive(name) or self.manifest_has_name(name): while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
c = next(counter) + 1 c = next(counter) + 1
base, ext = name.rpartition('.')[::2] base, ext = name.rpartition('.')[::2]
@@ -377,10 +374,10 @@ class Container(ContainerBase): # {{{
if self.ok_to_be_unmanifested(name): if self.ok_to_be_unmanifested(name):
return name return name
item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item) item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
if mt in OEB_DOCS: if mt in oeb_base.OEB_DOCS:
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
spine = self.opf_xpath('//opf:spine')[0] spine = self.opf_xpath('//opf:spine')[0]
si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id) si = manifest.makeelement(oeb_base.tag('opf', 'itemref'), idref=item_id)
self.insert_into_xml(spine, si, index=spine_index) self.insert_into_xml(spine, si, index=spine_index)
return name return name
@@ -442,12 +439,12 @@ class Container(ContainerBase): # {{{
replace_func.file_type = 'opf' replace_func.file_type = 'opf'
for elem in self.opf_xpath('//*[@href]'): for elem in self.opf_xpath('//*[@href]'):
elem.set('href', replace_func(elem.get('href'))) elem.set('href', replace_func(elem.get('href')))
elif media_type.lower() in OEB_DOCS: elif media_type.lower() in oeb_base.OEB_DOCS:
replace_func.file_type = 'text' replace_func.file_type = 'text'
rewrite_links(self.parsed(name), replace_func) oeb_base.rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in OEB_STYLES: elif media_type.lower() in oeb_base.OEB_STYLES:
replace_func.file_type = 'style' replace_func.file_type = 'style'
replaceUrls(self.parsed(name), replace_func) css_parser.replaceUrls(self.parsed(name), replace_func)
elif media_type.lower() == guess_type('toc.ncx'): elif media_type.lower() == guess_type('toc.ncx'):
replace_func.file_type = 'ncx' replace_func.file_type = 'ncx'
for elem in self.parsed(name).xpath('//*[@src]'): for elem in self.parsed(name).xpath('//*[@src]'):
@@ -467,21 +464,21 @@ class Container(ContainerBase): # {{{
if name == self.opf_name: if name == self.opf_name:
for elem in self.opf_xpath('//*[@href]'): for elem in self.opf_xpath('//*[@href]'):
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href') yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
elif media_type.lower() in OEB_DOCS: elif media_type.lower() in oeb_base.OEB_DOCS:
for el, attr, link, pos in iterlinks(self.parsed(name)): for el, attr, link, pos in oeb_base.iterlinks(self.parsed(name)):
yield (link, el.sourceline, pos) if get_line_numbers else link yield (link, el.sourceline, pos) if get_line_numbers else link
elif media_type.lower() in OEB_STYLES: elif media_type.lower() in oeb_base.OEB_STYLES:
if get_line_numbers: if get_line_numbers:
with self.open(name, 'rb') as f: with self.open(name, 'rb') as f:
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n') raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
position = PositionFinder(raw) position = PositionFinder(raw)
is_in_comment = CommentFinder(raw) is_in_comment = CommentFinder(raw)
for link, offset in itercsslinks(raw): for link, offset in oeb_base.itercsslinks(raw):
if not is_in_comment(offset): if not is_in_comment(offset):
lnum, col = position(offset) lnum, col = position(offset)
yield link, lnum, col yield link, lnum, col
else: else:
for link in getUrls(self.parsed(name)): for link in css_parser.getUrls(self.parsed(name)):
yield link yield link
elif media_type.lower() == guess_type('toc.ncx'): elif media_type.lower() == guess_type('toc.ncx'):
for elem in self.parsed(name).xpath('//*[@src]'): for elem in self.parsed(name).xpath('//*[@src]'):
@@ -533,7 +530,7 @@ class Container(ContainerBase): # {{{
def opf_xpath(self, expr): def opf_xpath(self, expr):
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. ' ' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES) return self.opf.xpath(expr, namespaces=oeb_base.tag('opf', 'namespaces'))
def has_name(self, name): def has_name(self, name):
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. ''' ''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
@@ -580,11 +577,11 @@ class Container(ContainerBase): # {{{
def parse(self, path, mime): def parse(self, path, mime):
with open(path, 'rb') as src: with open(path, 'rb') as src:
data = src.read() data = src.read()
if mime in OEB_DOCS: if mime in oeb_base.OEB_DOCS:
data = self.parse_xhtml(data, self.relpath(path)) data = self.parse_xhtml(data, self.relpath(path))
elif mime[-4:] in {'+xml', '/xml'}: elif mime[-4:] in {'+xml', '/xml'}:
data = self.parse_xml(data) data = self.parse_xml(data)
elif mime in OEB_STYLES: elif mime in oeb_base.OEB_STYLES:
data = self.parse_css(data, self.relpath(path)) data = self.parse_css(data, self.relpath(path))
return data return data
@@ -597,7 +594,7 @@ class Container(ContainerBase): # {{{
''' '''
ans = self.open(name).read() ans = self.open(name).read()
mime = self.mime_map.get(name, guess_type(name)) mime = self.mime_map.get(name, guess_type(name))
if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}): if decode and (mime in oeb_base.OEB_STYLES or mime in oeb_base.OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc) ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
return ans return ans
@@ -637,7 +634,7 @@ class Container(ContainerBase): # {{{
so use it sparingly. ''' so use it sparingly. '''
from ebook_converter.ebooks.metadata.opf2 import OPF as O from ebook_converter.ebooks.metadata.opf2 import OPF as O
mi = self.serialize_item(self.opf_name) mi = self.serialize_item(self.opf_name)
return O(BytesIO(mi), basedir=self.opf_dir, unquote_urls=False, return O(io.BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
populate_spine=False).to_book_metadata() populate_spine=False).to_book_metadata()
@property @property
@@ -662,7 +659,7 @@ class Container(ContainerBase): # {{{
@property @property
def manifest_type_map(self): def manifest_type_map(self):
' Mapping of manifest media-type to list of canonical names of that media-type ' ' Mapping of manifest media-type to list of canonical names of that media-type '
ans = defaultdict(list) ans = collections.defaultdict(list)
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'): for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
ans[item.get('media-type').lower()].append(self.href_to_name( ans[item.get('media-type').lower()].append(self.href_to_name(
item.get('href'), self.opf_name)) item.get('href'), self.opf_name))
@@ -813,7 +810,7 @@ class Container(ContainerBase): # {{{
spine = self.opf_xpath('//opf:spine')[0] spine = self.opf_xpath('//opf:spine')[0]
spine.text = tail spine.text = tail
for name, linear in spine_items: for name, linear in spine_items:
i = spine.makeelement(const.OPF_ITEMREF, i = spine.makeelement(oeb_base.tag('opf', 'itemref'),
nsmap={'opf': const.OPF2_NS}) nsmap={'opf': const.OPF2_NS})
i.tail = tail i.tail = tail
i.set('idref', imap[name]) i.set('idref', imap[name])
@@ -922,7 +919,7 @@ class Container(ContainerBase): # {{{
return ans[0] return ans[0]
self.dirty(self.opf_name) self.dirty(self.opf_name)
package = self.opf_xpath('//opf:package')[0] package = self.opf_xpath('//opf:package')[0]
item = package.makeelement(OPF(name)) item = package.makeelement(oeb_base.tag('opf', name))
item.tail = '\n' item.tail = '\n'
package.append(item) package.append(item)
return item return item
@@ -945,7 +942,7 @@ class Container(ContainerBase): # {{{
item_id = id_prefix + '%d'%c item_id = id_prefix + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
item = manifest.makeelement(const.OPF_ITEM, item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href) id=item_id, href=href)
item.set('media-type', media_type) item.set('media-type', media_type)
self.insert_into_xml(manifest, item) self.insert_into_xml(manifest, item)
@@ -992,7 +989,7 @@ class Container(ContainerBase): # {{{
data = root = self.parsed(name) data = root = self.parsed(name)
if name == self.opf_name: if name == self.opf_name:
self.format_opf() self.format_opf()
data = serialize(data, self.mime_map[name], pretty_print=name in data = oeb_base.serialize(data, self.mime_map[name], pretty_print=name in
self.pretty_print) self.pretty_print)
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS: if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
# Needed as I can't get lxml to output opf:role and # Needed as I can't get lxml to output opf:role and
@@ -1181,7 +1178,7 @@ class EpubContainer(Container):
) )
if not opf_files: if not opf_files:
raise InvalidEpub('META-INF/container.xml contains no link to OPF file') raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/'))) opf_path = os.path.join(self.root, *(oeb_base.urlunquote(opf_files[0].get('full-path')).split('/')))
if not exists(opf_path): if not exists(opf_path):
raise InvalidEpub('OPF file does not exist at location pointed to' raise InvalidEpub('OPF file does not exist at location pointed to'
' by META-INF/container.xml') ' by META-INF/container.xml')
@@ -1412,7 +1409,7 @@ def do_explode(path, dest):
def opf_to_azw3(opf, outpath, container): def opf_to_azw3(opf, outpath, container):
from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook
class Item(Manifest.Item): class Item(oeb_base.Manifest.Item):
def _parse_css(self, data): def _parse_css(self, data):
# The default CSS parser used by oeb.base inserts the h namespace # The default CSS parser used by oeb.base inserts the h namespace

View File

@@ -1,22 +1,16 @@
from collections import defaultdict import collections
from functools import partial import functools
from css_parser.css import CSSRule, CSSStyleDeclaration from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter import constants as const
from ebook_converter import force_unicode from ebook_converter import force_unicode
from ebook_converter.css_selectors import parse, SelectorSyntaxError from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers from ebook_converter.ebooks.oeb.polish import pretty
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from ebook_converter.utils.icu import numeric_sort_key from ebook_converter.utils.icu import numeric_sort_key
from ebook_converter.css_selectors import Select, SelectorError from ebook_converter.css_selectors import Select, SelectorError
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
def filter_used_rules(rules, log, select): def filter_used_rules(rules, log, select):
for rule in rules: for rule in rules:
used = False used = False
@@ -34,7 +28,8 @@ def filter_used_rules(rules, log, select):
yield rule yield rule
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None): def get_imported_sheets(name, container, sheets, recursion_level=10,
sheet=None):
ans = set() ans = set()
sheet = sheet or sheets[name] sheet = sheet or sheets[name]
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE): for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
@@ -44,7 +39,8 @@ def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None)
ans.add(iname) ans.add(iname)
if recursion_level > 0: if recursion_level > 0:
for imported_sheet in tuple(ans): for imported_sheet in tuple(ans):
ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1) ans |= get_imported_sheets(imported_sheet, container, sheets,
recursion_level=recursion_level-1)
ans.discard(name) ans.discard(name)
return ans return ans
@@ -56,7 +52,7 @@ def merge_declarations(first, second):
def merge_identical_selectors(sheet): def merge_identical_selectors(sheet):
' Merge rules that have identical selectors ' ' Merge rules that have identical selectors '
selector_map = defaultdict(list) selector_map = collections.defaultdict(list)
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE): for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
selector_map[rule.selectorText].append(rule) selector_map[rule.selectorText].append(rule)
remove = [] remove = []
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
return len(remove) return len(remove)
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False): def remove_unused_css(container, report=None, remove_unused_classes=False,
''' merge_rules=False):
Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content. """
Remove all unused CSS rules from the book. An unused CSS rule is one that
does not match any actual content.
:param report: An optional callable that takes a single argument. It is called with information about the operations being performed. :param report: An optional callable that takes a single argument. It is
:param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed. called with information about the operations being
performed.
:param remove_unused_classes: If True, class attributes in the HTML that
do not match any CSS rules are also removed.
:param merge_rules: If True, rules with identical selectors are merged. :param merge_rules: If True, rules with identical selectors are merged.
''' """
report = report or (lambda x:x) report = report or (lambda x: x)
def safe_parse(name): def safe_parse(name):
try: try:
return container.parsed(name) return container.parsed(name)
except TypeError: except TypeError:
pass pass
sheets = {name:safe_parse(name) for name, mt in container.mime_map.items() if mt in OEB_STYLES}
sheets = {k:v for k, v in sheets.items() if v is not None} sheets = {name: safe_parse(name) for name, mt in container.mime_map.items()
if mt in base.OEB_STYLES and safe_parse(name) is not None}
num_merged = 0 num_merged = 0
if merge_rules: if merge_rules:
for name, sheet in sheets.items(): for name, sheet in sheets.items():
@@ -106,7 +108,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_of_removed_rules = num_of_removed_classes = 0 num_of_removed_rules = num_of_removed_classes = 0
for name, mt in container.mime_map.items(): for name, mt in container.mime_map.items():
if mt not in OEB_DOCS: if mt not in base.OEB_DOCS:
continue continue
root = container.parsed(name) root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True) select = Select(root, ignore_inappropriate_pseudo_classes=True)
@@ -120,31 +122,39 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_merged += num num_merged += num
container.dirty(name) container.dirty(name)
if remove_unused_classes: if remove_unused_classes:
used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)} used_classes |= {x.lower() for x in
imports = get_imported_sheets(name, container, sheets, sheet=sheet) classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets,
sheet=sheet)
for imported_sheet in imports: for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select)) style_rules[imported_sheet] = tuple(filter_used_rules(
style_rules[imported_sheet], container.log, select))
if remove_unused_classes: if remove_unused_classes:
used_classes |= class_map[imported_sheet] used_classes |= class_map[imported_sheet]
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(rules, container.log, select)) unused_rules = tuple(filter_used_rules(rules, container.log,
select))
if unused_rules: if unused_rules:
num_of_removed_rules += len(unused_rules) num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules] [sheet.cssRules.remove(r) for r in unused_rules]
style.text = force_unicode(sheet.cssText, 'utf-8') style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style) pretty.pretty_script_or_style(container, style)
container.dirty(name) container.dirty(name)
for link in root.xpath('//*[local-name()="link" and @href]'): for link in root.xpath('//*[local-name()="link" and @href]'):
sname = container.href_to_name(link.get('href'), name) sname = container.href_to_name(link.get('href'), name)
if sname not in sheets: if sname not in sheets:
continue continue
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select)) style_rules[sname] = tuple(filter_used_rules(style_rules[sname],
container.log,
select))
if remove_unused_classes: if remove_unused_classes:
used_classes |= class_map[sname] used_classes |= class_map[sname]
for iname in import_map[sname]: for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select)) style_rules[iname] = tuple(
filter_used_rules(style_rules[iname], container.log,
select))
if remove_unused_classes: if remove_unused_classes:
used_classes |= class_map[iname] used_classes |= class_map[iname]
@@ -159,7 +169,8 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
elem.set('class', ' '.join(classes)) elem.set('class', ' '.join(classes))
else: else:
del elem.attrib['class'] del elem.attrib['class']
num_of_removed_classes += len(original_classes) - len(classes) num_of_removed_classes += (len(original_classes) -
len(classes))
container.dirty(name) container.dirty(name)
for name, sheet in sheets.items(): for name, sheet in sheets.items():
@@ -195,7 +206,7 @@ def filter_declaration(style, properties=()):
changed = True changed = True
all_props = set(style.keys()) all_props = set(style.keys())
for prop in style.getProperties(): for prop in style.getProperties():
n = normalizers.get(prop.name, None) n = base.normalize_css.normalizers.get(prop.name, None)
if n is not None: if n is not None:
normalized = n(prop.name, prop.propertyValue) normalized = n(prop.name, prop.propertyValue)
removed = properties.intersection(set(normalized)) removed = properties.intersection(set(normalized))
@@ -225,12 +236,13 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
root = container.parsed(name) root = container.parsed(name)
changed = False changed = False
for style in root.xpath('//*[local-name()="style"]'): for style in root.xpath('//*[local-name()="style"]'):
if style.text and (style.get('type') or 'text/css').lower() == 'text/css': if style.text and (style.get('type') or
'text/css').lower() == 'text/css':
sheet = container.parse_css(style.text) sheet = container.parse_css(style.text)
if transform_sheet(sheet): if transform_sheet(sheet):
changed = True changed = True
style.text = force_unicode(sheet.cssText, 'utf-8') style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style) pretty.pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'): for elem in root.xpath('//*[@style]'):
text = elem.get('style', None) text = elem.get('style', None)
if text: if text:
@@ -240,13 +252,16 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
if style.length == 0: if style.length == 0:
del elem.attrib['style'] del elem.attrib['style']
else: else:
elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8')) elem.set('style',
force_unicode(style.getCssText(separator=' '),
'utf-8'))
return changed return changed
def transform_css(container, transform_sheet=None, transform_style=None, names=()): def transform_css(container, transform_sheet=None, transform_style=None,
names=()):
if not names: if not names:
types = OEB_STYLES | OEB_DOCS types = base.OEB_STYLES | base.OEB_DOCS
names = [] names = []
for name, mt in container.mime_map.items(): for name, mt in container.mime_map.items():
if mt in types: if mt in types:
@@ -256,13 +271,14 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
for name in names: for name in names:
mt = container.mime_map[name] mt = container.mime_map[name]
if mt in OEB_STYLES: if mt in base.OEB_STYLES:
sheet = container.parsed(name) sheet = container.parsed(name)
if transform_sheet(sheet): if transform_sheet(sheet):
container.dirty(name) container.dirty(name)
doc_changed = True doc_changed = True
elif mt in OEB_DOCS: elif mt in base.OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet, transform_style): if transform_inline_styles(container, name, transform_sheet,
transform_style):
container.dirty(name) container.dirty(name)
doc_changed = True doc_changed = True
@@ -270,15 +286,21 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
def filter_css(container, properties, names=()): def filter_css(container, properties, names=()):
''' """
Remove the specified CSS properties from all CSS rules in the book. Remove the specified CSS properties from all CSS rules in the book.
:param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`. :param properties: Set of properties to remove. For example:
:param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book. :code:`{'font-family', 'color'}`.
''' :param names: The files from which to remove the properties. Defaults to
properties = normalize_filter_css(properties) all HTML and CSS files in the book.
return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties), """
transform_style=partial(filter_declaration, properties=properties), names=names) properties = base.normalize_css.normalize_filter_css(properties)
return transform_css(container,
transform_sheet=functools.partial(
filter_sheet, properties=properties),
transform_style=functools.partial(
filter_declaration, properties=properties),
names=names)
def _classes_in_selector(selector, classes): def _classes_in_selector(selector, classes):
@@ -331,21 +353,29 @@ def remove_property_value(prop, predicate):
if len(removed_vals) == len(prop.propertyValue): if len(removed_vals) == len(prop.propertyValue):
prop.parent.removeProperty(prop.name) prop.parent.removeProperty(prop.name)
else: else:
x = css_text(prop.propertyValue) x = base.css_text(prop.propertyValue)
for v in removed_vals: for v in removed_vals:
x = x.replace(css_text(v), '').strip() x = x.replace(base.css_text(v), '').strip()
prop.propertyValue.cssText = x prop.propertyValue.cssText = x
return bool(removed_vals) return bool(removed_vals)
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))} RULE_PRIORITIES = {t: i for i, t in enumerate((CSSRule.COMMENT,
CSSRule.CHARSET_RULE,
CSSRule.IMPORT_RULE,
CSSRule.NAMESPACE_RULE))}
def sort_sheet(container, sheet_or_text): def sort_sheet(container, sheet_or_text):
''' Sort the rules in a stylesheet. Note that in the general case this can """
change the effective styles, but for most common sheets, it should be safe. Sort the rules in a stylesheet. Note that in the general case this can
''' change the effective styles, but for most common sheets, it should be
sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, str) else sheet_or_text safe.
"""
if isinstance(sheet_or_text, str):
sheet = container.parse_css(sheet_or_text)
else:
sheet = sheet_or_text
def text_sort_key(x): def text_sort_key(x):
return numeric_sort_key(str(x or '')) return numeric_sort_key(str(x or ''))
@@ -364,7 +394,8 @@ def sort_sheet(container, sheet_or_text):
rule.selectorText = ', '.join(s.selectorText for s in selectors) rule.selectorText = ', '.join(s.selectorText for s in selectors)
elif rule.type == CSSRule.FONT_FACE_RULE: elif rule.type == CSSRule.FONT_FACE_RULE:
try: try:
tertiary = text_sort_key(rule.style.getPropertyValue('font-family')) tertiary = text_sort_key(rule.style.getPropertyValue('font-'
'family'))
except Exception: except Exception:
pass pass
@@ -379,11 +410,14 @@ def add_stylesheet_links(container, name, text):
if not head: if not head:
return return
head = head[0] head = head[0]
sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES)) sheets = tuple(container.manifest_items_of_type(lambda mt:
mt in base.OEB_STYLES))
if not sheets: if not sheets:
return return
for sname in sheets: for sname in sheets:
link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name)) link = head.makeelement(base.tag('xhtml', 'link'), type='text/css',
rel='stylesheet',
href=container.name_to_href(sname, name))
head.append(link) head.append(link)
pretty_xml_tree(head) pretty.pretty_xml_tree(head)
return serialize(root, 'text/html') return pretty.serialize(root, 'text/html')

View File

@@ -1,6 +1,7 @@
from lxml import etree from lxml import etree
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.localization import canonicalize_lang from ebook_converter.utils.localization import canonicalize_lang
@@ -14,7 +15,7 @@ def get_book_language(container):
def set_guide_item(container, item_type, title, name, frag=None): def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = const.OPF_REFERENCE ref_tag = base.tag('opf', 'reference')
href = None href = None
if name: if name:
href = container.name_to_href(name, container.opf_name) href = container.name_to_href(name, container.opf_name)
@@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None):
guides = container.opf_xpath('//opf:guide') guides = container.opf_xpath('//opf:guide')
if not guides and href: if not guides and href:
g = container.opf.makeelement(const.OPF_GUIDE, g = container.opf.makeelement(base.tag('opf', 'guide'),
nsmap={'opf': const.OPF2_NS}) nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(container.opf, g) container.insert_into_xml(container.opf, g)
guides = [g] guides = [g]

View File

@@ -1,18 +1,13 @@
import textwrap import textwrap
# from lxml.etree import Element
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import force_unicode from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key from ebook_converter.utils.icu import sort_key
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def isspace(x): def isspace(x):
return not x.strip('\u0009\u000a\u000c\u000d\u0020') return not x.strip('\u0009\u000a\u000c\u000d\u0020')
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent=' '):
for i, child in enumerate(elem): for i, child in enumerate(elem):
pretty_xml_tree(child, level=level+1, indent=indent) pretty_xml_tree(child, level=level+1, indent=indent)
if not child.tail or isspace(child.tail): if not child.tail or isspace(child.tail):
l = level + 1 new_level = level + 1
if i == len(elem) - 1: if i == len(elem) - 1:
l -= 1 new_level -= 1
child.tail = '\n' + (indent * l) child.tail = '\n' + (indent * new_level)
def pretty_opf(root): def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for # Put all dc: tags first starting with title and author. Preserve order for
# the rest. # the rest.
def dckey(x): def dckey(x):
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2) return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
for metadata in root.xpath('//opf:metadata',
namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS) dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey) dc_tags.sort(key=dckey)
for x in reversed(dc_tags): for x in reversed(dc_tags):
metadata.insert(0, x) metadata.insert(0, x)
# Group items in the manifest # Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES) spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
spine_ids = {x:i for i, x in enumerate(spine_ids)} namespaces=const.OPF_NAMESPACES)
spine_ids = {x: i for i, x in enumerate(spine_ids)}
def manifest_key(x): def manifest_key(x):
mt = x.get('media-type', '') mt = x.get('media-type', '')
href = x.get('href', '') href = x.get('href', '')
ext = href.rpartition('.')[-1].lower() ext = href.rpartition('.')[-1].lower()
cat = 1000 cat = 1000
if mt in OEB_DOCS: if mt in base.OEB_DOCS:
cat = 0 cat = 0
elif mt == guess_type('a.ncx'): elif mt == guess_type('a.ncx'):
cat = 1 cat = 1
elif mt in OEB_STYLES: elif mt in base.OEB_STYLES:
cat = 2 cat = 2
elif mt.startswith('image/'): elif mt.startswith('image/'):
cat = 3 cat = 3
@@ -75,20 +73,23 @@ def pretty_opf(root):
i = sort_key(href) i = sort_key(href)
return (cat, i) return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES): for manifest in root.xpath('//opf:manifest',
namespaces=const.OPF_NAMESPACES):
try: try:
children = sorted(manifest, key=manifest_key) children = sorted(manifest, key=manifest_key)
except AttributeError: except AttributeError:
continue # There are comments so dont sort since that would mess up the comments # There are comments so dont sort since that would mess up the
# comments.
continue
for x in reversed(children): for x in reversed(children):
manifest.insert(0, x) manifest.insert(0, x)
def isblock(x): def isblock(x):
if callable(x.tag) or not x.tag: if callable(x.tag) or not x.tag:
return True return True
if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}: if x.tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}:
return True return True
return False return False
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags ''' that contain only other block tags '''
if not parent.text or isspace(parent.text): if not parent.text or isspace(parent.text):
parent.text = '' parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n' if (hasattr(parent.tag, 'strip') and
parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
nn = '\n'
else:
nn = '\n\n'
parent.text = parent.text + nn + (indent * level) parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent): for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child): if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent) pretty_block(child, level=level+1, indent=indent)
elif child.tag == const.SVG_SVG: elif child.tag == base.tag('svg', 'svg'):
pretty_xml_tree(child, level=level, indent=indent) pretty_xml_tree(child, level=level, indent=indent)
l = level new_level = level
if i == len(parent) - 1: if i == len(parent) - 1:
l -= 1 new_level -= 1
if not child.tail or isspace(child.tail): if not child.tail or isspace(child.tail):
child.tail = '' child.tail = ''
child.tail = child.tail + nn + (indent * l) child.tail = child.tail + nn + (indent * new_level)
def pretty_script_or_style(container, child):
    """
    Re-indent, in place, the text content of a <script> or <style> element.

    Elements without text are left untouched. For tags ending in ``style``
    the text is first run through ``pretty_css``. The text is then dedented
    and every non-empty line is prefixed with the indent computed by
    ``indent_for_tag``; empty lines stay empty.
    """
    if not child.text:
        return
    indent = indent_for_tag(child)
    if child.tag.endswith('style'):
        child.text = force_unicode(pretty_css(container, '', child.text),
                                   'utf-8')
    child.text = textwrap.dedent(child.text)
    lines = [(indent + x) if x else '' for x in child.text.splitlines()]
    child.text = '\n' + '\n'.join(lines)
    set_indent(child, 'text', indent)
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
# Special case the handling of a body that contains a single block tag # Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag # with all content. In this case we prettify the containing block tag
# even if it has non block children. # even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks( if (len(body) == 1 and
body[0]) and parse_utils.barename(body[0].tag) not in ( not callable(body[0].tag) and
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0): isblock(body[0]) and
not has_only_blocks(body[0]) and
parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
'h2', 'h3', 'h4',
'h5', 'h6') and
len(body[0]) > 0):
pretty_block(body[0], level=2) pretty_block(body[0], level=2)
if container is not None: if container is not None:
# Handle <script> and <style> tags # Handle <script> and <style> tags
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'): for child in root.xpath('//*[local-name()="script" or local-name()='
'"style"]'):
pretty_script_or_style(container, child) pretty_script_or_style(container, child)
def fix_html(container, raw):
    """
    Fix any parsing errors in the HTML represented as a string in raw.

    Fixing is done using the HTML5 parsing algorithm: ``raw`` is parsed with
    ``container.parse_xhtml`` and the resulting tree is serialized back as
    ``text/html``.
    """
    root = container.parse_xhtml(raw)
    return base.serialize(root, 'text/html')
def pretty_html(container, name, raw):
    """
    Pretty print the HTML represented as a string in raw.

    ``raw`` is parsed with ``container.parse_xhtml``, prettified by
    ``pretty_html_tree`` and serialized as ``text/html``. ``name`` is not
    used by this function.
    """
    root = container.parse_xhtml(raw)
    pretty_html_tree(container, root)
    return base.serialize(root, 'text/html')
def pretty_css(container, name, raw):
    """
    Pretty print the CSS represented as a string in raw.

    The stylesheet is parsed with ``container.parse_css`` and serialized as
    ``text/css``. ``name`` is not used by this function.
    """
    sheet = container.parse_css(raw)
    return base.serialize(sheet, 'text/css')
def pretty_xml(container, name, raw):
    """
    Pretty print the XML represented as a string in raw.

    If ``name`` is the name of the OPF (``container.opf_name``), extra
    OPF-specific prettying is performed via ``pretty_opf`` before the
    generic ``pretty_xml_tree`` pass. The result is serialized as
    ``text/xml``.
    """
    root = container.parse_xml(raw)
    if name == container.opf_name:
        pretty_opf(root)
    pretty_xml_tree(root)
    return base.serialize(root, 'text/xml')
def fix_all_html(container):
    """
    Fix any parsing errors in all HTML files in the container.

    Fixing is done using the HTML5 parsing algorithm. Every entry of
    ``container.mime_map`` whose media type is in ``base.OEB_DOCS`` is
    parsed and then marked dirty (presumably so that it gets re-serialized
    when the container is committed - TODO confirm).
    """
    for name, mt in container.mime_map.items():
        if mt in base.OEB_DOCS:
            container.parsed(name)
            container.dirty(name)
def pretty_all(container): def pretty_all(container):
' Pretty print all HTML/CSS/XML files in the container ' """
Pretty print all HTML/CSS/XML files in the container
"""
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')} xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
for name, mt in container.mime_map.items(): for name, mt in container.mime_map.items():
prettied = False prettied = False
if mt in OEB_DOCS: if mt in base.OEB_DOCS:
pretty_html_tree(container, container.parsed(name)) pretty_html_tree(container, container.parsed(name))
prettied = True prettied = True
elif mt in OEB_STYLES: elif mt in base.OEB_STYLES:
container.parsed(name) container.parsed(name)
prettied = True prettied = True
elif name == container.opf_name: elif name == container.opf_name:

View File

@@ -7,6 +7,7 @@ import urllib.parse
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import guess_type, strftime from ebook_converter import guess_type, strftime
from ebook_converter.constants_old import iswindows from ebook_converter.constants_old import iswindows
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
from ebook_converter.library.comments import comments_to_html, markdown from ebook_converter.library.comments import comments_to_html, markdown
from ebook_converter.utils.date import is_date_undefined, as_local_time from ebook_converter.utils.date import is_date_undefined, as_local_time
@@ -371,7 +372,7 @@ def render_jacket(mi, output_profile,
# We cannot use data-calibre-rescale 100 on the body tag as that will just # We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless. # give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'): for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(const.XHTML_DIV) fw = body.makeelement(base.tag('xhtml', 'div'))
fw.set('data-calibre-rescale', '100') fw.set('data-calibre-rescale', '100')
for child in body: for child in body:
fw.append(child) fw.append(child)
@@ -388,9 +389,9 @@ def linearize_jacket(oeb):
for x in oeb.spine[:4]: for x in oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data): if XPath(JACKET_XPATH)(x.data):
for e in XPath('//h:table|//h:tr|//h:th')(x.data): for e in XPath('//h:table|//h:tr|//h:th')(x.data):
e.tag = const.XHTML_DIV e.tag = base.tag('xhtml', 'div')
for e in XPath('//h:td')(x.data): for e in XPath('//h:td')(x.data):
e.tag = const.XHTML_SPAN e.tag = base.tag('xhtml', 'span')
break break

View File

@@ -3,8 +3,11 @@ Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forced at "likely" locations to conform to size limitations. This transform forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform. assumes a prior call to the flatcss transform.
""" """
import os, functools, collections, re, copy import collections
from collections import OrderedDict import copy
import functools
import os
import re
import urllib.parse import urllib.parse
from lxml.etree import XPath as _XPath from lxml.etree import XPath as _XPath
@@ -13,8 +16,7 @@ from lxml import etree
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import as_unicode, force_unicode from ebook_converter import as_unicode, force_unicode
from ebook_converter.ebooks.epub import rules from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb.base import \ from ebook_converter.ebooks.oeb import base
OEB_STYLES, rewrite_links, urlnormalize
from ebook_converter.ebooks.oeb.polish.split import do_split from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
from ebook_converter.css_selectors import Select, SelectorError from ebook_converter.css_selectors import Select, SelectorError
@@ -44,14 +46,15 @@ class SplitError(ValueError):
class Split(object): class Split(object):
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
             max_flow_size=0, remove_css_pagebreaks=True):
    """
    Store the splitting options on the instance.

    When ``page_breaks_xpath`` is given it is compiled with ``XPath`` and
    becomes the only page break selector, paired with ``False`` (the second
    tuple item is the "break before" flag used elsewhere in this class).
    Otherwise ``page_break_selectors`` stays ``None``.
    """
    self.split_on_page_breaks = split_on_page_breaks
    self.page_breaks_xpath = page_breaks_xpath
    self.max_flow_size = max_flow_size
    self.page_break_selectors = None
    self.remove_css_pagebreaks = remove_css_pagebreaks
    if self.page_breaks_xpath is not None:
        self.page_break_selectors = [(XPath(self.page_breaks_xpath),
                                      False)]
def __call__(self, oeb, opts): def __call__(self, oeb, opts):
self.oeb = oeb self.oeb = oeb
@@ -71,7 +74,7 @@ class Split(object):
page_breaks, page_break_ids = self.find_page_breaks(item) page_breaks, page_break_ids = self.find_page_breaks(item)
splitter = FlowSplitter(item, page_breaks, page_break_ids, splitter = FlowSplitter(item, page_breaks, page_break_ids,
self.max_flow_size, self.oeb, self.opts) self.max_flow_size, self.oeb, self.opts)
if splitter.was_split: if splitter.was_split:
am = splitter.anchor_map am = splitter.anchor_map
self.map[item.href] = collections.defaultdict( self.map[item.href] = collections.defaultdict(
@@ -81,25 +84,27 @@ class Split(object):
if self.page_break_selectors is None: if self.page_break_selectors is None:
self.page_break_selectors = set() self.page_break_selectors = set()
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES] base.OEB_STYLES]
for rule in rules(stylesheets): for rule in rules(stylesheets):
before = force_unicode(getattr(rule.style.getPropertyCSSValue( before = force_unicode(getattr(rule.style.getPropertyCSSValue(
'page-break-before'), 'cssText', '').strip().lower()) 'page-break-before'), 'cssText', '').strip().lower())
after = force_unicode(getattr(rule.style.getPropertyCSSValue( after = force_unicode(getattr(rule.style.getPropertyCSSValue(
'page-break-after'), 'cssText', '').strip().lower()) 'page-break-after'), 'cssText', '').strip().lower())
try: try:
if before and before not in {'avoid', 'auto', 'inherit'}: if before and before not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, True)) self.page_break_selectors.add((rule.selectorText,
True))
if self.remove_css_pagebreaks: if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-before') rule.style.removeProperty('page-break-before')
except: except Exception:
pass pass
try: try:
if after and after not in {'avoid', 'auto', 'inherit'}: if after and after not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, False)) self.page_break_selectors.add((rule.selectorText,
False))
if self.remove_css_pagebreaks: if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-after') rule.style.removeProperty('page-break-after')
except: except Exception:
pass pass
page_breaks = set() page_breaks = set()
select = Select(item.data) select = Select(item.data)
@@ -110,14 +115,18 @@ class Split(object):
return [], [] return [], []
descendants = frozenset(body[0].iterdescendants('*')) descendants = frozenset(body[0].iterdescendants('*'))
_tags = {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}
for selector, before in self.page_break_selectors: for selector, before in self.page_break_selectors:
try: try:
for elem in select(selector): for elem in select(selector):
if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}: if (elem in descendants and
elem.tag.rpartition('}')[2].lower() not in _tags):
elem.set('pb_before', '1' if before else '0') elem.set('pb_before', '1' if before else '0')
page_breaks.add(elem) page_breaks.add(elem)
except SelectorError as err: except SelectorError as err:
self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err))) self.log.warn('Ignoring page breaks specified with invalid '
'CSS selector: %r (%s)' %
(selector, as_unicode(err)))
for i, elem in enumerate(item.data.iter('*')): for i, elem in enumerate(item.data.iter('*')):
try: try:
@@ -126,23 +135,23 @@ class Split(object):
continue continue
page_breaks = list(page_breaks) page_breaks = list(page_breaks)
page_breaks.sort(key=lambda x:int(x.get('pb_order'))) page_breaks.sort(key=lambda x: int(x.get('pb_order')))
page_break_ids, page_breaks_ = [], [] page_break_ids, page_breaks_ = [], []
for i, x in enumerate(page_breaks): for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i)) x.set('id', x.get('id', 'calibre_pb_%d' % i))
id = x.get('id') id = x.get('id')
try: try:
xp = XPath('//*[@id="%s"]'%id) xp = XPath('//*[@id="%s"]' % id)
except: except Exception:
try: try:
xp = XPath("//*[@id='%s']"%id) xp = XPath("//*[@id='%s']" % id)
except: except Exception:
# The id has both a quote and an apostrophe or some other # The id has both a quote and an apostrophe or some other
# Just replace it since I doubt its going to work anywhere else # Just replace it since I doubt its going to work anywhere
# either # else either
id = 'calibre_pb_%d'%i id = 'calibre_pb_%d' % i
x.set('id', id) x.set('id', id)
xp = XPath('//*[@id=%r]'%id) xp = XPath('//*[@id=%r]' % id)
page_breaks_.append((xp, x.get('pb_before', '0') == '1')) page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
page_break_ids.append(id) page_break_ids.append(id)
@@ -159,7 +168,7 @@ class Split(object):
for item in self.oeb.manifest: for item in self.oeb.manifest:
if etree.iselement(item.data): if etree.iselement(item.data):
self.current_item = item self.current_item = item
rewrite_links(item.data, self.rewrite_links) base.rewrite_links(item.data, self.rewrite_links)
def rewrite_links(self, url): def rewrite_links(self, url):
href, frag = urllib.parse.urldefrag(url) href, frag = urllib.parse.urldefrag(url)
@@ -169,7 +178,7 @@ class Split(object):
# Unparseable URL # Unparseable URL
return url return url
try: try:
href = urlnormalize(href) href = base.urlnormalize(href)
except ValueError: except ValueError:
# href has non utf-8 quoting # href has non utf-8 quoting
return url return url
@@ -188,19 +197,19 @@ class FlowSplitter(object):
'The actual splitting logic' 'The actual splitting logic'
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb, def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
opts): opts):
self.item = item self.item = item
self.oeb = oeb self.oeb = oeb
self.opts = opts self.opts = opts
self.log = oeb.log self.log = oeb.log
self.page_breaks = page_breaks self.page_breaks = page_breaks
self.page_break_ids = page_break_ids self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size self.max_flow_size = max_flow_size
self.base = item.href self.base = item.href
self.csp_counter = 0 self.csp_counter = 0
base, ext = os.path.splitext(self.base) name, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%.3d'+ext self.base = name.replace('%', '%%') + '_split_%.3d' + ext
self.trees = [self.item.data.getroottree()] self.trees = [self.item.data.getroottree()]
self.splitting_on_page_breaks = True self.splitting_on_page_breaks = True
@@ -210,13 +219,13 @@ class FlowSplitter(object):
if self.max_flow_size > 0: if self.max_flow_size > 0:
lt_found = False lt_found = False
self.log('\tLooking for large trees in %s...'%item.href) self.log('\tLooking for large trees in %s...' % item.href)
trees = list(self.trees) trees = list(self.trees)
self.tree_map = {} self.tree_map = {}
for i, tree in enumerate(trees): for i, tree in enumerate(trees):
size = len(tostring(tree.getroot())) size = len(tostring(tree.getroot()))
if size > self.max_flow_size: if size > self.max_flow_size:
self.log('\tFound large tree #%d'%i) self.log('\tFound large tree #%d' % i)
lt_found = True lt_found = True
self.split_trees = [] self.split_trees = []
self.split_to_size(tree) self.split_to_size(tree)
@@ -229,11 +238,11 @@ class FlowSplitter(object):
self.was_split = len(self.trees) > 1 self.was_split = len(self.trees) > 1
if self.was_split: if self.was_split:
self.log('\tSplit into %d parts'%len(self.trees)) self.log('\tSplit into %d parts' % len(self.trees))
self.commit() self.commit()
def split_on_page_breaks(self, orig_tree): def split_on_page_breaks(self, orig_tree):
ordered_ids = OrderedDict() ordered_ids = collections.OrderedDict()
all_page_break_ids = frozenset(self.page_break_ids) all_page_break_ids = frozenset(self.page_break_ids)
for elem_id in orig_tree.xpath('//*/@id'): for elem_id in orig_tree.xpath('//*/@id'):
if elem_id in all_page_break_ids: if elem_id in all_page_break_ids:
@@ -248,9 +257,10 @@ class FlowSplitter(object):
tree = self.trees[i] tree = self.trees[i]
elem = pattern(tree) elem = pattern(tree)
if elem: if elem:
self.log.debug('\t\tSplitting on page-break at id=%s'% self.log.debug('\t\tSplitting on page-break at id=%s' %
elem[0].get('id')) elem[0].get('id'))
before_tree, after_tree = self.do_split(tree, elem[0], before) before_tree, after_tree = self.do_split(tree, elem[0],
before)
self.trees[i:i+1] = [before_tree, after_tree] self.trees[i:i+1] = [before_tree, after_tree]
break break
@@ -269,7 +279,11 @@ class FlowSplitter(object):
if body is not None: if body is not None:
existing_ids = frozenset(body.xpath('//*/@id')) existing_ids = frozenset(body.xpath('//*/@id'))
for x in ids - existing_ids: for x in ids - existing_ids:
body.insert(0, body.makeelement(const.XHTML_div, id=x, style='height:0pt')) body.insert(0,
body.makeelement(base.tag('xhtml',
'div'),
id=x,
style='height:0pt'))
ids = set() ids = set()
trees.append(tree) trees.append(tree)
self.trees = trees self.trees = trees
@@ -281,12 +295,13 @@ class FlowSplitter(object):
return body[0] return body[0]
def do_split(self, tree, split_point, before):
    """
    Split ``tree`` into a *before* and *after* tree at ``split_point``.

    This is a thin wrapper around the module level ``do_split`` helper;
    note that ``tree`` itself is not forwarded, only ``split_point``,
    ``self.log`` and ``before`` are.

    :param before: If True tree is split before split_point, otherwise
                   after split_point
    :return: before_tree, after_tree
    """
    return do_split(split_point, self.log, before=before)
def is_page_empty(self, root): def is_page_empty(self, root):
@@ -294,7 +309,7 @@ class FlowSplitter(object):
if body is None: if body is None:
return False return False
txt = re.sub(r'\s+|\xa0', '', txt = re.sub(r'\s+|\xa0', '',
etree.tostring(body, method='text', encoding='unicode')) etree.tostring(body, method='text', encoding='unicode'))
if len(txt) > 1: if len(txt) > 1:
return False return False
for img in root.xpath('//h:img', namespaces=const.XPNSMAP): for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
@@ -305,13 +320,13 @@ class FlowSplitter(object):
return True return True
def split_text(self, text, root, size): def split_text(self, text, root, size):
self.log.debug('\t\t\tSplitting text of length: %d'%len(text)) self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
rest = text.replace('\r', '') rest = text.replace('\r', '')
parts = re.split('\n\n', rest) parts = re.split('\n\n', rest)
self.log.debug('\t\t\t\tFound %d parts'%len(parts)) self.log.debug('\t\t\t\tFound %d parts' % len(parts))
if max(map(len, parts)) > size: if max(map(len, parts)) > size:
raise SplitError('Cannot split as file contains a <pre> tag ' raise SplitError('Cannot split as file contains a <pre> tag '
'with a very large paragraph', root) 'with a very large paragraph', root)
ans = [] ans = []
buf = '' buf = ''
for part in parts: for part in parts:
@@ -331,7 +346,8 @@ class FlowSplitter(object):
continue continue
if pre.text and len(pre.text) > self.max_flow_size*0.5: if pre.text and len(pre.text) > self.max_flow_size*0.5:
self.log.debug('\t\tSplitting large <pre> tag') self.log.debug('\t\tSplitting large <pre> tag')
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size)) frags = self.split_text(pre.text, root,
int(0.2 * self.max_flow_size))
new_pres = [] new_pres = []
for frag in frags: for frag in frags:
pre2 = copy.copy(pre) pre2 = copy.copy(pre)
@@ -346,7 +362,8 @@ class FlowSplitter(object):
split_point, before = self.find_split_point(root) split_point, before = self.find_split_point(root)
if split_point is None: if split_point is None:
raise SplitError(self.item.href, root) raise SplitError(self.item.href, root)
self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point)) self.log.debug('\t\t\tSplit point:', split_point.tag,
tree.getpath(split_point))
trees = self.do_split(tree, split_point, before) trees = self.do_split(tree, split_point, before)
sizes = [len(tostring(t.getroot())) for t in trees] sizes = [len(tostring(t.getroot())) for t in trees]
@@ -361,12 +378,11 @@ class FlowSplitter(object):
continue continue
elif size <= self.max_flow_size: elif size <= self.max_flow_size:
self.split_trees.append(t) self.split_trees.append(t)
self.log.debug( self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)' %
'\t\t\tCommitted sub-tree #%d (%d KB)'%( (len(self.split_trees), size/1024.))
len(self.split_trees), size/1024.))
else: else:
self.log.debug( self.log.debug('\t\t\tSplit tree still too large: %d KB' %
'\t\t\tSplit tree still too large: %d KB' % (size/1024.)) size/1024)
self.split_to_size(t) self.split_to_size(t)
def find_split_point(self, root): def find_split_point(self, root):
@@ -385,8 +401,8 @@ class FlowSplitter(object):
''' '''
def pick_elem(elems): def pick_elem(elems):
if elems: if elems:
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != elems = [i for i in elems
'1'] if i.get(SPLIT_POINT_ATTR, '0') != '1']
if elems: if elems:
i = int(len(elems)//2) i = int(len(elems)//2)
elems[i].set(SPLIT_POINT_ATTR, '1') elems[i].set(SPLIT_POINT_ATTR, '1')
@@ -407,7 +423,7 @@ class FlowSplitter(object):
if elem is not None: if elem is not None:
try: try:
XPath(elem.getroottree().getpath(elem)) XPath(elem.getroottree().getpath(elem))
except: except Exception:
continue continue
return elem, True return elem, True
@@ -421,23 +437,24 @@ class FlowSplitter(object):
''' '''
if not self.was_split: if not self.was_split:
return return
self.anchor_map = collections.defaultdict(lambda :self.base%0) self.anchor_map = collections.defaultdict(lambda: self.base % 0)
self.files = [] self.files = []
for i, tree in enumerate(self.trees): for i, tree in enumerate(self.trees):
root = tree.getroot() root = tree.getroot()
self.files.append(self.base%i) self.files.append(self.base % i)
for elem in root.xpath('//*[@id or @name]'): for elem in root.xpath('//*[@id or @name]'):
for anchor in elem.get('id', ''), elem.get('name', ''): for anchor in elem.get('id', ''), elem.get('name', ''):
if anchor != '' and anchor not in self.anchor_map: if anchor != '' and anchor not in self.anchor_map:
self.anchor_map[anchor] = self.files[-1] self.anchor_map[anchor] = self.files[-1]
for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR): for elem in root.xpath('//*[@%s]' % SPLIT_POINT_ATTR):
elem.attrib.pop(SPLIT_POINT_ATTR, '0') elem.attrib.pop(SPLIT_POINT_ATTR, '0')
spine_pos = self.item.spine_position spine_pos = self.item.spine_position
for current, tree in zip(*map(reversed, (self.files, self.trees))): for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP): for a in tree.getroot().xpath('//h:a[@href]',
namespaces=const.XPNSMAP):
href = a.get('href').strip() href = a.get('href').strip()
if href.startswith('#'): if href.startswith('#'):
anchor = href[1:] anchor = href[1:]
@@ -448,7 +465,8 @@ class FlowSplitter(object):
new_id = self.oeb.manifest.generate(id=self.item.id)[0] new_id = self.oeb.manifest.generate(id=self.item.id)[0]
new_item = self.oeb.manifest.add(new_id, current, new_item = self.oeb.manifest.add(new_id, current,
self.item.media_type, data=tree.getroot()) self.item.media_type,
data=tree.getroot())
self.oeb.spine.insert(spine_pos, new_item, self.item.linear) self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide: if self.oeb.guide:

View File

@@ -7,7 +7,7 @@ from lxml import etree
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import TOC, xml2text from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks import ConversionError from ebook_converter.ebooks import ConversionError
@@ -15,8 +15,8 @@ def XPath(x):
try: try:
return etree.XPath(x, namespaces=const.XPNSMAP) return etree.XPath(x, namespaces=const.XPNSMAP)
except etree.XPathSyntaxError: except etree.XPathSyntaxError:
raise ConversionError( raise ConversionError('The syntax of the XPath expression %s is '
'The syntax of the XPath expression %s is invalid.' % repr(x)) 'invalid.' % repr(x))
def isspace(x): def isspace(x):
@@ -33,9 +33,13 @@ def at_start(elem):
for x in body.iter(): for x in body.iter():
if x is elem: if x is elem:
return True return True
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}: if hasattr(getattr(x, 'tag', None),
'rpartition') and x.tag.rpartition('}')[-1] in {'img',
'svg'}:
return False return False
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))): if isspace(getattr(x, 'text', None)) and (x in ancestors or
isspace(getattr(x, 'tail',
None))):
continue continue
return False return False
return False return False
@@ -52,7 +56,7 @@ class DetectStructure(object):
self.detect_chapters() self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc: if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc orig_toc = self.oeb.toc
self.oeb.toc = TOC() self.oeb.toc = base.TOC()
self.create_level_based_toc() self.create_level_based_toc()
if self.oeb.toc.count() < 1: if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters: if not opts.no_chapters_in_toc and self.detected_chapters:
@@ -64,14 +68,14 @@ class DetectStructure(object):
else: else:
self.oeb.auto_generated_toc = True self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' % self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count()) self.oeb.toc.count())
if opts.toc_filter is not None: if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter) regexp = re.compile(opts.toc_filter)
for node in list(self.oeb.toc.iter()): for node in list(self.oeb.toc.iter()):
if not node.title or regexp.search(node.title) is not None: if not node.title or regexp.search(node.title) is not None:
self.log('Filtering', node.title if node.title else self.log('Filtering', node.title if node.title else
'empty node', 'from TOC') 'empty node', 'from TOC')
self.oeb.toc.remove(node) self.oeb.toc.remove(node)
if opts.page_breaks_before is not None: if opts.page_breaks_before is not None:
@@ -80,10 +84,11 @@ class DetectStructure(object):
for elem in pb_xpath(item.data): for elem in pb_xpath(item.data):
try: try:
prev = next(elem.itersiblings(tag=etree.Element, prev = next(elem.itersiblings(tag=etree.Element,
preceding=True)) preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename( if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and
prev.tag) in {'h1', 'h2'} and (not prev.tail or parse_utils.barename(prev.tag) in {'h1',
not prev.tail.split())): 'h2'} and
(not prev.tail or not prev.tail.split())):
# We have two adjacent headings, do not put a page # We have two adjacent headings, do not put a page
# break on the second one # break on the second one
continue continue
@@ -106,9 +111,9 @@ class DetectStructure(object):
expr = self.opts.start_reading_at expr = self.opts.start_reading_at
try: try:
expr = XPath(expr) expr = XPath(expr)
except: except Exception:
self.log.warn( self.log.warn('Invalid start reading at XPath expression, '
'Invalid start reading at XPath expression, ignoring: %s'%expr) 'ignoring: %s' % expr)
return return
for item in self.oeb.spine: for item in self.oeb.spine:
if not hasattr(item.data, 'xpath'): if not hasattr(item.data, 'xpath'):
@@ -118,16 +123,17 @@ class DetectStructure(object):
elem = matches[0] elem = matches[0]
eid = elem.get('id', None) eid = elem.get('id', None)
if not eid: if not eid:
eid = 'start_reading_at_'+str(uuid.uuid4()).replace('-', '') eid = 'start_reading_at_' + str(uuid.uuid4()).replace('-',
'')
elem.set('id', eid) elem.set('id', eid)
if 'text' in self.oeb.guide: if 'text' in self.oeb.guide:
self.oeb.guide.remove('text') self.oeb.guide.remove('text')
self.oeb.guide.add('text', 'Start', item.href+'#'+eid) self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
self.log('Setting start reading at position to %s in %s'%( self.log('Setting start reading at position to %s in %s' %
self.opts.start_reading_at, item.href)) (self.opts.start_reading_at, item.href))
return return
self.log.warn("Failed to find start reading at position: %s"% self.log.warn("Failed to find start reading at position: %s" %
self.opts.start_reading_at) self.opts.start_reading_at)
def get_toc_parts_for_xpath(self, expr): def get_toc_parts_for_xpath(self, expr):
# if an attribute is selected by the xpath expr then truncate it # if an attribute is selected by the xpath expr then truncate it
@@ -148,12 +154,14 @@ class DetectStructure(object):
ans = XPath(expr)(doc) ans = XPath(expr)(doc)
len(ans) len(ans)
return ans return ans
except: except Exception:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr) self.log.warn('Invalid chapter expression, ignoring: %s' %
expr)
return [] return []
if self.opts.chapter: if self.opts.chapter:
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter) chapter_path, title_attribute = (
self.get_toc_parts_for_xpath(self.opts.chapter))
self.chapter_title_attribute = title_attribute self.chapter_title_attribute = title_attribute
for item in self.oeb.spine: for item in self.oeb.spine:
for x in find_matches(chapter_path, item.data): for x in find_matches(chapter_path, item.data):
@@ -165,25 +173,28 @@ class DetectStructure(object):
c = collections.Counter() c = collections.Counter()
for item, elem in self.detected_chapters: for item, elem in self.detected_chapters:
c[item] += 1 c[item] += 1
text = xml2text(elem).strip() text = base.xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip()) text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50]) self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none': if chapter_mark == 'none':
continue continue
if chapter_mark == 'rule': if chapter_mark == 'rule':
mark = elem.makeelement(const.XHTML_HR) mark = elem.makeelement(base.tag('xhtml', 'hr'))
elif chapter_mark == 'pagebreak': elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem): if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they # For the first two elements in this item, check if
# are at the start of the file, in which case inserting a # they are at the start of the file, in which case
# page break in unnecessary and can lead to extra blank # inserting a page break in unnecessary and can lead
# pages in the PDF Output plugin. We need to use two as # to extra blank pages in the PDF Output plugin. We
# feedbooks epubs match both a heading tag and its # need to use two as feedbooks epubs match both a
# containing div with the default chapter expression. # heading tag and its containing div with the default
# chapter expression.
continue continue
mark = elem.makeelement(const.XHTML_DIV, style=page_break_after) mark = elem.makeelement(base.tag('xhtml', 'div'),
style=page_break_after)
else: # chapter_mark == 'both': else: # chapter_mark == 'both':
mark = elem.makeelement(const.XHTML_HR, style=page_break_before) mark = elem.makeelement(base.tag('xhtml', 'hr'),
style=page_break_before)
try: try:
elem.addprevious(mark) elem.addprevious(mark)
except TypeError: except TypeError:
@@ -196,7 +207,9 @@ class DetectStructure(object):
def create_toc_from_chapters(self): def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order() counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters: for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter) text, href = self.elem_to_link(item, elem,
self.chapter_title_attribute,
counter)
self.oeb.toc.add(text, href, play_order=counter) self.oeb.toc.add(text, href, play_order=counter)
counter += 1 counter += 1
@@ -216,18 +229,21 @@ class DetectStructure(object):
if frag: if frag:
href = '#'.join((href, frag)) href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href): if not self.oeb.toc.has_href(href):
text = xml2text(a) text = base.xml2text(a)
text = text[:100].strip() text = text[:100].strip()
if (not self.opts.duplicate_links_in_toc and if (not self.opts.duplicate_links_in_toc and
self.oeb.toc.has_text(text)): self.oeb.toc.has_text(text)):
continue continue
try: try:
self.oeb.toc.add(text, href, self.oeb.toc.add(
text, href,
play_order=self.oeb.toc.next_play_order()) play_order=self.oeb.toc.next_play_order())
num += 1 num += 1
except ValueError: except ValueError:
self.oeb.log.exception('Failed to process link: %r' % href) self.oeb.log.exception('Failed to process link: '
continue # Most likely an incorrectly URL encoded link '%r' % href)
# Most likely an incorrectly URL encoded link
continue
if self.opts.max_toc_links > 0 and \ if self.opts.max_toc_links > 0 and \
num >= self.opts.max_toc_links: num >= self.opts.max_toc_links:
self.log('Maximum TOC links reached, stopping.') self.log('Maximum TOC links reached, stopping.')
@@ -238,14 +254,14 @@ class DetectStructure(object):
if title_attribute is not None: if title_attribute is not None:
text = elem.get(title_attribute, '') text = elem.get(title_attribute, '')
if not text: if not text:
text = xml2text(elem).strip() text = base.xml2text(elem).strip()
if not text: if not text:
text = elem.get('title', '') text = elem.get('title', '')
if not text: if not text:
text = elem.get('alt', '') text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip()) text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip() text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter) id = elem.get('id', 'calibre_toc_%d' % counter)
elem.set('id', id) elem.set('id', id)
href = '#'.join((item.href, id)) href = '#'.join((item.href, id))
return text, href return text, href
@@ -260,26 +276,29 @@ class DetectStructure(object):
ans = XPath(expr)(doc) ans = XPath(expr)(doc)
len(ans) len(ans)
return ans return ans
except: except Exception:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr) self.log.warn('Invalid ToC expression, ignoring: %s' % expr)
return [] return []
for document in self.oeb.spine: for document in self.oeb.spine:
previous_level1 = list(added.values())[-1] if added else None previous_level1 = list(added.values())[-1] if added else None
previous_level2 = list(added2.values())[-1] if added2 else None previous_level2 = list(added2.values())[-1] if added2 else None
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc) (level1_toc,
level1_title) = self.get_toc_parts_for_xpath(self.opts.level1_toc)
for elem in find_matches(level1_toc, document.data): for elem in find_matches(level1_toc, document.data):
text, _href = self.elem_to_link(document, elem, level1_title, counter) text, _href = self.elem_to_link(document, elem, level1_title,
counter)
counter += 1 counter += 1
if text: if text:
node = self.oeb.toc.add(text, _href, node = self.oeb.toc.add(
play_order=self.oeb.toc.next_play_order()) text, _href, play_order=self.oeb.toc.next_play_order())
added[elem] = node added[elem] = node
# node.add('Top', _href) # node.add('Top', _href)
if self.opts.level2_toc is not None and added: if self.opts.level2_toc is not None and added:
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc) level2_toc, level2_title = self.get_toc_parts_for_xpath(
self.opts.level2_toc)
for elem in find_matches(level2_toc, document.data): for elem in find_matches(level2_toc, document.data):
level1 = None level1 = None
for item in document.data.iterdescendants(): for item in document.data.iterdescendants():
@@ -290,15 +309,19 @@ class DetectStructure(object):
if previous_level1 is None: if previous_level1 is None:
break break
level1 = previous_level1 level1 = previous_level1
text, _href = self.elem_to_link(document, elem, level2_title, counter) text, _href = self.elem_to_link(document, elem,
level2_title,
counter)
counter += 1 counter += 1
if text: if text:
added2[elem] = level1.add(text, _href, added2[elem] = level1.add(
text, _href,
play_order=self.oeb.toc.next_play_order()) play_order=self.oeb.toc.next_play_order())
break break
if self.opts.level3_toc is not None and added2: if self.opts.level3_toc is not None and added2:
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc) level3_toc, level3_title = self.get_toc_parts_for_xpath(
self.opts.level3_toc)
for elem in find_matches(level3_toc, document.data): for elem in find_matches(level3_toc, document.data):
level2 = None level2 = None
for item in document.data.iterdescendants(): for item in document.data.iterdescendants():
@@ -309,10 +332,13 @@ class DetectStructure(object):
if previous_level2 is None: if previous_level2 is None:
break break
level2 = previous_level2 level2 = previous_level2
text, _href = \ text, _href = self.elem_to_link(document,
self.elem_to_link(document, elem, level3_title, counter) elem,
level3_title,
counter)
counter += 1 counter += 1
if text: if text:
level2.add(text, _href, level2.add(text, _href,
play_order=self.oeb.toc.next_play_order()) play_order=self.oeb
.toc.next_play_order())
break break