mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-15 00:04:10 +01:00

Fixing leftovers from first concept of constants

2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions

View File

@@ -1,14 +1,16 @@
import collections
import functools
import json
import re
from collections import defaultdict, namedtuple
from functools import wraps
from lxml import etree
from ebook_converter import constants as const
from ebook_converter import prints
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors
from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.ebooks.metadata import authors_to_string
from ebook_converter.ebooks.metadata import check_isbn
from ebook_converter.ebooks.metadata import string_to_authors
from ebook_converter.ebooks.metadata.book import base
from ebook_converter.ebooks.metadata.book.json_codec import (
decode_is_multiple, encode_is_multiple, object_to_unicode
)
@@ -17,17 +19,30 @@ from ebook_converter.ebooks.metadata.utils import (
pretty_print_opf
)
from ebook_converter.utils.config import from_json, to_json
from ebook_converter.utils.date import (
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow,
w3cdtf
)
from ebook_converter.utils.date import (fix_only_date, is_date_undefined,
isoformat, parse_date as parse_date_,
utcnow, w3cdtf)
from ebook_converter.utils.iso8601 import parse_iso8601
from ebook_converter.utils.localization import canonicalize_lang
RES_PREFIXES = {'dcterms': 'http://purl.org/dc/terms/',
'epubsc': 'http://idpf.org/epub/vocab/sc/#',
'marc': 'http://id.loc.gov/vocabulary/',
'media': 'http://www.idpf.org/epub/vocab/overlays/#',
'onix': 'http://www.editeur.org/ONIX/book/codelists/'
'current.html#',
'rendition': 'http://www.idpf.org/vocab/rendition/#',
'schema': 'http://schema.org/',
'xsd': 'http://www.w3.org/2001/XMLSchema#'}
CALIBRE_PREFIX = 'https://calibre-ebook.com'
KNOWN_PREFIXES = RES_PREFIXES.copy()
KNOWN_PREFIXES['calibre'] = CALIBRE_PREFIX
# Utils {{{
_xpath_cache = {}
_re_cache = {}
_XPATH_CACHE = {}
_RE_CACHE = {}
def uniq(vals):
@@ -39,22 +54,23 @@ def uniq(vals):
def dump_dict(cats):
return json.dumps(object_to_unicode(cats or {}), ensure_ascii=False, skipkeys=True)
return json.dumps(object_to_unicode(cats or {}), ensure_ascii=False,
skipkeys=True)
def XPath(x):
try:
return _xpath_cache[x]
return _XPATH_CACHE[x]
except KeyError:
_xpath_cache[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
_XPATH_CACHE[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
return ans
def regex(r, flags=0):
try:
return _re_cache[(r, flags)]
return _RE_CACHE[(r, flags)]
except KeyError:
_re_cache[(r, flags)] = ans = re.compile(r, flags)
_RE_CACHE[(r, flags)] = ans = re.compile(r, flags)
return ans
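
Both cache helpers above use the same try/except memoization idiom: look the key up in a module-level dict first, and compile and store only on a KeyError. A minimal illustration of the effect (repeat calls return the identical compiled object):

    pat_a = regex(r'(\S+)\s*:\s*(\S+)')
    pat_b = regex(r'(\S+)\s*:\s*(\S+)')
    assert pat_a is pat_b  # second call is a cache hit, nothing is recompiled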
@@ -82,7 +98,7 @@ def properties_for_id(item_id, refines):
def properties_for_id_with_scheme(item_id, prefixes, refines):
ans = defaultdict(list)
ans = collections.defaultdict(list)
if item_id:
for elem in refines[item_id]:
key = elem.get('property')
@@ -126,7 +142,7 @@ def normalize_whitespace(text):
def simple_text(f):
@wraps(f)
@functools.wraps(f)
def wrapper(*args, **kw):
return normalize_whitespace(f(*args, **kw))
return wrapper
@@ -135,7 +151,7 @@ def simple_text(f):
def items_with_property(root, q, prefixes=None):
if prefixes is None:
prefixes = read_prefixes(root)
q = expand_prefix(q, known_prefixes).lower()
q = expand_prefix(q, KNOWN_PREFIXES).lower()
for item in XPath("./opf:manifest/opf:item[@properties]")(root):
for prop in (item.get('properties') or '').lower().split():
prop = expand_prefix(prop, prefixes)
@@ -150,43 +166,32 @@ def items_with_property(root, q, prefixes=None):
# http://www.idpf.org/epub/vocab/package/pfx/
reserved_prefixes = {
'dcterms': 'http://purl.org/dc/terms/',
'epubsc': 'http://idpf.org/epub/vocab/sc/#',
'marc': 'http://id.loc.gov/vocabulary/',
'media': 'http://www.idpf.org/epub/vocab/overlays/#',
'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#',
'rendition':'http://www.idpf.org/vocab/rendition/#',
'schema': 'http://schema.org/',
'xsd': 'http://www.w3.org/2001/XMLSchema#',
}
CALIBRE_PREFIX = 'https://calibre-ebook.com'
known_prefixes = reserved_prefixes.copy()
known_prefixes['calibre'] = CALIBRE_PREFIX
def parse_prefixes(x):
return {m.group(1):m.group(2) for m in re.finditer(r'(\S+): \s*(\S+)', x)}
return {m.group(1): m.group(2)
for m in re.finditer(r'(\S+): \s*(\S+)', x)}
def read_prefixes(root):
ans = reserved_prefixes.copy()
ans = RES_PREFIXES.copy()
ans.update(parse_prefixes(root.get('prefix') or ''))
return ans
def expand_prefix(raw, prefixes):
return regex(r'(\S+)\s*:\s*(\S+)').sub(lambda m:(prefixes.get(m.group(1), m.group(1)) + ':' + m.group(2)), raw or '')
return (regex(r'(\S+)\s*:\s*(\S+)')
.sub(lambda m: (prefixes.get(m.group(1),
m.group(1)) + ':' + m.group(2)),
raw or ''))
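
Given the substitution above, expand_prefix maps a `prefix:reference` onto the registered vocabulary URI, and falls back to the prefix itself when it is unknown (the dict.get default). For example:

    expand_prefix('marc:relators', RES_PREFIXES)
    # -> 'http://id.loc.gov/vocabulary/:relators'
    expand_prefix('unregistered:thing', RES_PREFIXES)
    # -> 'unregistered:thing' (prefix not in the map, left untouched)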
def ensure_prefix(root, prefixes, prefix, value=None):
if prefixes is None:
prefixes = read_prefixes(root)
prefixes[prefix] = value or reserved_prefixes[prefix]
prefixes = {k:v for k, v in prefixes.items() if reserved_prefixes.get(k) != v}
prefixes[prefix] = value or RES_PREFIXES[prefix]
prefixes = {k: v for k, v in prefixes.items() if RES_PREFIXES.get(k) != v}
if prefixes:
root.set('prefix', ' '.join('%s: %s' % (k, v) for k, v in prefixes.items()))
root.set('prefix', ' '.join('%s: %s' % (k, v)
for k, v in prefixes.items()))
else:
root.attrib.pop('prefix', None)
@@ -196,7 +201,7 @@ def ensure_prefix(root, prefixes, prefix, value=None):
def read_refines(root):
ans = defaultdict(list)
ans = collections.defaultdict(list)
for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
r = meta.get('refines') or ''
if r.startswith('#'):
@@ -213,7 +218,7 @@ def set_refines(elem, existing_refines, *new_refines):
remove_refines(elem, existing_refines)
for ref in reversed(new_refines):
prop, val, scheme = ref
r = elem.makeelement(const.OPF_META)
r = elem.makeelement(base.tag('opf', 'meta'))
r.set('refines', '#' + eid), r.set('property', prop)
r.text = val.strip()
if scheme:
@@ -249,7 +254,7 @@ def parse_identifier(ident, val, refines):
# Try the OPF 2 style opf:scheme attribute, which will be present, for
# example, in EPUB 3 files that have had their metadata set by an
# application that only understands EPUB 2.
scheme = ident.get(const.OPF_SCHEME)
scheme = ident.get(base.tag('opf', 'scheme'))
if scheme and not lval.startswith('urn:'):
return finalize(scheme, val)
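
The two identifier spellings handled here look like this in OPF markup (a sketch; the ISBN digits are placeholders):

    # EPUB 3 style, scheme embedded in the value itself:
    #   <dc:identifier>urn:isbn:0000000000000</dc:identifier>
    # EPUB 2 style, scheme carried by an opf:scheme attribute:
    #   <dc:identifier opf:scheme="ISBN">0000000000000</dc:identifier>
    # Both are meant to resolve to the same (scheme, value) pair.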
@@ -267,7 +272,7 @@ def parse_identifier(ident, val, refines):
def read_identifiers(root, prefixes, refines):
ans = defaultdict(list)
ans = collections.defaultdict(list)
for ident in XPath('./opf:metadata/dc:identifier')(root):
val = (ident.text or '').strip()
if val:
@@ -277,7 +282,8 @@ def read_identifiers(root, prefixes, refines):
return ans
def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=False):
def set_identifiers(root, prefixes, refines, new_identifiers,
force_identifiers=False):
uid = root.get('unique-identifier')
package_identifier = None
for ident in XPath('./opf:metadata/dc:identifier')(root):
@@ -289,12 +295,15 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=
ident.getparent().remove(ident)
continue
scheme, val = parse_identifier(ident, val, refines)
if not scheme or not val or force_identifiers or scheme in new_identifiers:
if (not scheme or
not val or
force_identifiers or
scheme in new_identifiers):
remove_element(ident, refines)
continue
metadata = XPath('./opf:metadata')(root)[0]
for scheme, val in new_identifiers.items():
ident = metadata.makeelement(const.DC_IDENT)
ident = metadata.makeelement(base.tag('dc', 'ident'))
ident.text = '%s:%s' % (scheme, val)
if package_identifier is None:
metadata.append(ident)
@@ -312,11 +321,12 @@ def identifier_writer(name):
if is_package_id:
package_identifier = ident
val = (ident.text or '').strip()
if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id:
if (val.startswith(name + ':') or
ident.get(base.tag('opf', 'scheme')) == name) and not is_package_id:
remove_element(ident, refines)
metadata = XPath('./opf:metadata')(root)[0]
if ival:
ident = metadata.makeelement(const.DC_IDENT)
ident = metadata.makeelement(base.tag('dc', 'ident'))
ident.text = '%s:%s' % (name, ival)
if package_identifier is None:
metadata.append(ident)
@@ -366,7 +376,8 @@ def read_title_sort(root, prefixes, refines):
if fa:
return fa
# Look for OPF 2.0 style title_sort
for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root):
for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
'title_sort"]')(root):
ans = m.get('content')
if ans:
return ans
@@ -376,12 +387,13 @@ def set_title(root, prefixes, refines, title, title_sort=None):
main_title = find_main_title(root, refines, remove_blanks=True)
if main_title is None:
m = XPath('./opf:metadata')(root)[0]
main_title = m.makeelement(const.DC_TITLE)
main_title = m.makeelement(base.tag('dc', 'title'))
m.insert(0, main_title)
main_title.text = title or None
ts = [refdef('file-as', title_sort)] if title_sort else ()
set_refines(main_title, refines, refdef('title-type', 'main'), *ts)
for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root):
for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
'title_sort"]')(root):
remove_element(m, refines)
# }}}
@@ -405,28 +417,32 @@ def set_languages(root, prefixes, refines, languages):
val = (lang.text or '').strip()
if val:
opf_languages.append(val)
languages = list(filter(lambda x: x and x != 'und', normalize_languages(opf_languages, languages)))
languages = list(filter(lambda x: x and x != 'und',
normalize_languages(opf_languages, languages)))
if not languages:
# EPUB spec says dc:language is required
languages = ['und']
metadata = XPath('./opf:metadata')(root)[0]
for lang in uniq(languages):
l = metadata.makeelement(const.DC_LANG)
l.text = lang
metadata.append(l)
dc_lang = metadata.makeelement(base.tag('dc', 'lang'))
dc_lang.text = lang
metadata.append(dc_lang)
# }}}
# Creator/Contributor {{{
Author = namedtuple('Author', 'name sort')
Author = collections.namedtuple('Author', 'name sort')
def is_relators_role(props, q):
for role in props.get('role'):
if role:
scheme_ns, scheme, role = role
if role.lower() == q and (scheme_ns is None or (scheme_ns, scheme) == (reserved_prefixes['marc'], 'relators')):
if (role.lower() == q and
(scheme_ns is None or
(scheme_ns, scheme) == (RES_PREFIXES['marc'],
'relators'))):
return True
return False
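
A role matched by is_relators_role is expressed in OPF as a refining meta scoped to the MARC relators vocabulary; this is exactly the shape set_authors and set_book_producers write below ("Some Author" is a placeholder):

    # <dc:creator id="id-1">Some Author</dc:creator>
    # <meta refines="#id-1" property="role" scheme="marc:relators">aut</meta>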
@@ -440,15 +456,16 @@ def read_authors(root, prefixes, refines):
if file_as:
aus = file_as[0][-1]
else:
aus = item.get(const.OPF_FILE_AS) or None
aus = item.get(base.tag('opf', 'file_as')) or None
return Author(normalize_whitespace(val), normalize_whitespace(aus))
for item in XPath('./opf:metadata/dc:creator')(root):
val = (item.text or '').strip()
if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
props = properties_for_id_with_scheme(item.get('id'), prefixes,
refines)
role = props.get('role')
opf_role = item.get(const.OPF_ROLE)
opf_role = item.get(base.tag('opf', 'role'))
if role:
if is_relators_role(props, 'aut'):
roled_authors.append(author(item, props, val))
@@ -464,23 +481,30 @@ def read_authors(root, prefixes, refines):
def set_authors(root, prefixes, refines, authors):
ensure_prefix(root, prefixes, 'marc')
for item in XPath('./opf:metadata/dc:creator')(root):
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
opf_role = item.get(const.OPF_ROLE)
if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')):
props = properties_for_id_with_scheme(item.get('id'), prefixes,
refines)
opf_role = item.get(base.tag('opf', 'role'))
if ((opf_role and opf_role.lower() != 'aut') or
(props.get('role') and not is_relators_role(props, 'aut'))):
continue
remove_element(item, refines)
metadata = XPath('./opf:metadata')(root)[0]
for author in authors:
if author.name:
a = metadata.makeelement(const.DC_CREATOR)
a = metadata.makeelement(base.tag('dc', 'creator'))
aid = ensure_id(a)
a.text = author.name
metadata.append(a)
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m = metadata.makeelement(base.tag('opf', 'meta'),
attrib={'refines': '#' + aid,
'property': 'role',
'scheme': 'marc:relators'})
m.text = 'aut'
metadata.append(m)
if author.sort:
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'file-as'})
m = metadata.makeelement(base.tag('opf', 'meta'),
attrib={'refines': '#' + aid,
'property': 'file-as'})
m.text = author.sort
metadata.append(m)
@@ -490,9 +514,10 @@ def read_book_producers(root, prefixes, refines):
for item in XPath('./opf:metadata/dc:contributor')(root):
val = (item.text or '').strip()
if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
props = properties_for_id_with_scheme(item.get('id'), prefixes,
refines)
role = props.get('role')
opf_role = item.get(const.OPF_ROLE)
opf_role = item.get(base.tag('opf', 'role'))
if role:
if is_relators_role(props, 'bkp'):
ans.append(normalize_whitespace(val))
@@ -503,19 +528,24 @@ def read_book_producers(root, prefixes, refines):
def set_book_producers(root, prefixes, refines, producers):
for item in XPath('./opf:metadata/dc:contributor')(root):
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
opf_role = item.get(const.OPF_ROLE)
if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')):
props = properties_for_id_with_scheme(item.get('id'), prefixes,
refines)
opf_role = item.get(base.tag('opf', 'role'))
if ((opf_role and opf_role.lower() != 'bkp') or
(props.get('role') and not is_relators_role(props, 'bkp'))):
continue
remove_element(item, refines)
metadata = XPath('./opf:metadata')(root)[0]
for bkp in producers:
if bkp:
a = metadata.makeelement(const.DC_CONTRIBUTOR)
a = metadata.makeelement(base.tag('dc', 'contributor'))
aid = ensure_id(a)
a.text = bkp
metadata.append(a)
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m = metadata.makeelement(base.tag('opf', 'meta'),
attrib={'refines': '#' + aid,
'property': 'role',
'scheme': 'marc:relators'})
m.text = 'bkp'
metadata.append(m)
# }}}
@@ -531,7 +561,9 @@ def parse_date(raw, is_w3cdtf=False):
ans = fix_only_date(ans)
else:
ans = parse_date_(raw, assume_utc=True)
if ' ' not in raw and 'T' not in raw and (ans.hour, ans.minute, ans.second) == (0, 0, 0):
if (' ' not in raw and
'T' not in raw and
(ans.hour, ans.minute, ans.second) == (0, 0, 0)):
ans = fix_only_date(ans)
return ans
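
In effect, a raw value carrying no time-of-day information is demoted to a date-only value: no space, no 'T' separator, and a parsed time of exactly midnight all have to hold. A hedged sketch of the two cases (values are illustrative):

    parse_date('2020-06-07')            # date only -> normalized via fix_only_date
    parse_date('2020-06-07T11:59:00Z')  # has 'T' -> kept as a full timestamp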
@@ -552,14 +584,14 @@ def set_pubdate(root, prefixes, refines, val):
if not is_date_undefined(val):
val = isoformat(val)
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(const.DC_DATE)
d = m.makeelement(base.tag('dc', 'date'))
d.text = val
m.append(d)
def read_timestamp(root, prefixes, refines):
pq = '%s:timestamp' % CALIBRE_PREFIX
sq = '%s:w3cdtf' % reserved_prefixes['dcterms']
sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
val = (meta.text or '').strip()
if val:
@@ -570,7 +602,8 @@ def read_timestamp(root, prefixes, refines):
return parse_date(val, is_w3cdtf=scheme == sq)
except Exception:
continue
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:timestamp"]')(root):
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:'
'timestamp"]')(root):
val = meta.get('content')
if val:
try:
@@ -584,7 +617,9 @@ def create_timestamp(root, prefixes, m, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
ensure_prefix(root, prefixes, 'dcterms')
val = w3cdtf(val)
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
d = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'calibre:timestamp',
'scheme': 'dcterms:W3CDTF'})
d.text = val
m.append(d)
@@ -599,8 +634,8 @@ def set_timestamp(root, prefixes, refines, val):
def read_last_modified(root, prefixes, refines):
pq = '%s:modified' % reserved_prefixes['dcterms']
sq = '%s:w3cdtf' % reserved_prefixes['dcterms']
pq = '%s:modified' % RES_PREFIXES['dcterms']
sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
val = (meta.text or '').strip()
if val:
@@ -614,7 +649,7 @@ def read_last_modified(root, prefixes, refines):
def set_last_modified(root, prefixes, refines, val=None):
pq = '%s:modified' % reserved_prefixes['dcterms']
pq = '%s:modified' % RES_PREFIXES['dcterms']
val = w3cdtf(val or utcnow())
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
prop = expand_prefix(meta.get('property'), prefixes)
@@ -625,7 +660,9 @@ def set_last_modified(root, prefixes, refines, val=None):
else:
ensure_prefix(root, prefixes, 'dcterms')
m = XPath('./opf:metadata')(root)[0]
meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
meta = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'dcterms:modified',
'scheme': 'dcterms:W3CDTF'})
m.append(meta)
meta.text = val
# }}}
@@ -648,7 +685,7 @@ def set_comments(root, prefixes, refines, val):
if val:
val = val.strip()
if val:
c = m.makeelement(const.DC_DESC)
c = m.makeelement(base.tag('dc', 'desc'))
c.text = val
m.append(c)
# }}}
@@ -670,7 +707,7 @@ def set_publisher(root, prefixes, refines, val):
if val:
val = val.strip()
if val:
c = m.makeelement(const.DC_PUBLISHER)
c = m.makeelement(base.tag('dc', 'publisher'))
c.text = normalize_whitespace(val)
m.append(c)
# }}}
@@ -693,7 +730,7 @@ def set_tags(root, prefixes, refines, val):
if val:
val = uniq(list(filter(None, val)))
for x in val:
c = m.makeelement(const.DC_SUBJ)
c = m.makeelement(base.tag('dc', 'subj'))
c.text = normalize_whitespace(x)
if c.text:
m.append(c)
@@ -725,7 +762,7 @@ def read_rating(root, prefixes, refines):
def create_rating(root, prefixes, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:rating'})
d = m.makeelement(base.tag('opf', 'meta'), attrib={'property': 'calibre:rating'})
d.text = val
m.append(d)
@@ -747,7 +784,8 @@ def set_rating(root, prefixes, refines, val):
def read_series(root, prefixes, refines):
series_index = 1.0
for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection" and @id]')(root):
for meta in XPath('./opf:metadata/opf:meta[@property="'
'belongs-to-collection" and @id]')(root):
val = (meta.text or '').strip()
if val:
props = properties_for_id(meta.get('id'), refines)
@@ -757,13 +795,15 @@ def read_series(root, prefixes, refines):
except Exception:
pass
return normalize_whitespace(val), series_index
for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]/@content')(root):
for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]'
'/@content')(root):
try:
series_index = float(si)
break
except:
except Exception:
pass
for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]/@content')(root):
for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]'
'/@content')(root):
s = normalize_whitespace(s)
if s:
return s, series_index
@@ -772,16 +812,20 @@ def read_series(root, prefixes, refines):
def create_series(root, refines, series, series_index):
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(const.OPF_META, attrib={'property':'belongs-to-collection'})
d = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'belongs-to-collection'})
d.text = series
m.append(d)
set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index))
set_refines(d, refines, refdef('collection-type', 'series'),
refdef('group-position', series_index))
def set_series(root, prefixes, refines, series, series_index):
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or @name="calibre:series_index"]')(root):
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or '
'@name="calibre:series_index"]')(root):
remove_element(meta, refines)
for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection"]')(root):
for meta in XPath('./opf:metadata/opf:meta[@property="'
'belongs-to-collection"]')(root):
remove_element(meta, refines)
if series:
create_series(root, refines, series, '%.2g' % series_index)
@@ -806,7 +850,8 @@ def dict_reader(name, load=json.loads, try2=True):
except Exception:
continue
if try2:
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root):
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
name)(root):
val = meta.get('content')
if val:
try:
@@ -827,7 +872,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
def writer(root, prefixes, refines, val):
if remove2:
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root):
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
name)(root):
remove_element(meta, refines)
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
prop = expand_prefix(meta.get('property'), prefixes)
@@ -836,7 +882,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
if val:
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name})
d = m.makeelement(base.tag('opf', 'meta'),
attrib={'property': 'calibre:%s' % name})
d.text = serialize(val)
m.append(d)
return writer
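
dict_reader and dict_writer are factories: each takes a calibre field name and returns a function that reads or writes the `calibre:<name>` meta property, with optional fallback to (or cleanup of) the legacy @name/@content form. The concrete accessors in this module, such as the user_metadata pair further below, are built exactly this way; schematically, for a hypothetical field:

    # 'example_field' is purely illustrative, not a real calibre field.
    read_example_field = dict_reader('example_field')
    set_example_field = dict_writer('example_field')
    # value = read_example_field(root, prefixes, refines)
    # set_example_field(root, prefixes, refines, {'key': 'value'})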
@@ -855,12 +902,14 @@ def deserialize_user_metadata(val):
return ans
read_user_metadata3 = dict_reader('user_metadata', load=deserialize_user_metadata, try2=False)
read_user_metadata3 = dict_reader('user_metadata',
load=deserialize_user_metadata, try2=False)
def read_user_metadata2(root, remove_tags=False):
ans = {}
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root):
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
'"calibre:user_metadata:")]')(root):
name = meta.get('name')
name = ':'.join(name.split(':')[2:])
if not name or not name.startswith('#'):
@@ -881,18 +930,23 @@ def read_user_metadata2(root, remove_tags=False):
def read_user_metadata(root, prefixes, refines):
return read_user_metadata3(root, prefixes, refines) or read_user_metadata2(root)
return read_user_metadata3(root, prefixes,
refines) or read_user_metadata2(root)
def serialize_user_metadata(val):
return json.dumps(object_to_unicode(val), ensure_ascii=False, default=to_json, indent=2, sort_keys=True)
return json.dumps(object_to_unicode(val), ensure_ascii=False,
default=to_json, indent=2, sort_keys=True)
set_user_metadata3 = dict_writer('user_metadata', serialize=serialize_user_metadata, remove2=False)
set_user_metadata3 = dict_writer('user_metadata',
serialize=serialize_user_metadata,
remove2=False)
def set_user_metadata(root, prefixes, refines, val):
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root):
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
'"calibre:user_metadata:")]')(root):
remove_element(meta, refines)
if val:
nval = {}
@@ -921,26 +975,32 @@ def read_raster_cover(root, prefixes, refines):
if href:
return href
for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]/@content')(root):
for item in XPath('./opf:manifest/opf:item[@id and @href and @media-type]')(root):
for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]'
'/@content')(root):
for item in XPath('./opf:manifest/opf:item[@id and @href and '
'@media-type]')(root):
if item.get('id') == item_id:
href = get_href(item)
if href:
return href
def ensure_is_only_raster_cover(root, prefixes, refines, raster_cover_item_href):
def ensure_is_only_raster_cover(root, prefixes, refines,
raster_cover_item_href):
for item in XPath('./opf:metadata/opf:meta[@name="cover"]')(root):
remove_element(item, refines)
for item in items_with_property(root, 'cover-image', prefixes):
prop = normalize_whitespace(item.get('properties').replace('cover-image', ''))
prop = normalize_whitespace(item.get('properties')
.replace('cover-image', ''))
if prop:
item.set('properties', prop)
else:
del item.attrib['properties']
for item in XPath('./opf:manifest/opf:item')(root):
if item.get('href') == raster_cover_item_href:
item.set('properties', normalize_whitespace((item.get('properties') or '') + ' cover-image'))
item.set('properties',
normalize_whitespace((item.get('properties')
or '') + ' cover-image'))
# }}}
@@ -960,7 +1020,7 @@ def set_last_modified_in_opf(root):
def read_metadata(root, ver=None, return_extra_data=False):
ans = Metadata('Unknown', ['Unknown'])
ans = base.Metadata('Unknown', ['Unknown'])
prefixes, refines = read_prefixes(root), read_refines(root)
identifiers = read_identifiers(root, prefixes, refines)
ids = {}
@@ -1000,12 +1060,16 @@ def read_metadata(root, ver=None, return_extra_data=False):
s, si = read_series(root, prefixes, refines)
if s:
ans.series, ans.series_index = s, si
ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
for name, fm in (read_user_metadata(root, prefixes, refines) or {}).items():
ans.author_link_map = read_author_link_map(root, prefixes,
refines) or ans.author_link_map
ans.user_categories = read_user_categories(root, prefixes,
refines) or ans.user_categories
for name, fm in (read_user_metadata(root, prefixes,
refines) or {}).items():
ans.set_user_metadata(name, fm)
if return_extra_data:
ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
ans = (ans, ver, read_raster_cover(root, prefixes, refines),
first_spine_item(root, prefixes, refines))
return ans
@@ -1014,7 +1078,9 @@ def get_metadata(stream):
return read_metadata(root)
def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
def apply_metadata(root, mi, cover_prefix='', cover_data=None,
apply_null=False, update_timestamp=False,
force_identifiers=False, add_missing_cover=True):
prefixes, refines = read_prefixes(root), read_refines(root)
current_mi = read_metadata(root)
if apply_null:
@@ -1024,7 +1090,8 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
def ok(x):
return not mi.is_null(x)
if ok('identifiers'):
set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers)
set_identifiers(root, prefixes, refines, mi.identifiers,
force_identifiers=force_identifiers)
if ok('title'):
set_title(root, prefixes, refines, mi.title, mi.title_sort)
if ok('languages'):
@@ -1052,16 +1119,21 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
if ok('series'):
set_series(root, prefixes, refines, mi.series, mi.series_index or 1)
if ok('author_link_map'):
set_author_link_map(root, prefixes, refines, getattr(mi, 'author_link_map', None))
set_author_link_map(root, prefixes, refines,
getattr(mi, 'author_link_map', None))
if ok('user_categories'):
set_user_categories(root, prefixes, refines, getattr(mi, 'user_categories', None))
set_user_categories(root, prefixes, refines,
getattr(mi, 'user_categories', None))
# We ignore apply_null for the next two to match the behavior with opf2.py
if mi.application_id:
set_application_id(root, prefixes, refines, mi.application_id)
if mi.uuid:
set_uuid(root, prefixes, refines, mi.uuid)
new_user_metadata, current_user_metadata = mi.get_all_user_metadata(True), current_mi.get_all_user_metadata(True)
new_user_metadata = mi.get_all_user_metadata(True)
current_user_metadata = current_mi.get_all_user_metadata(True)
missing = object()
for key in tuple(new_user_metadata):
meta = new_user_metadata.get(key)
if meta is None:
@@ -1098,7 +1170,9 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
return raster_cover
def set_metadata(stream, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
def set_metadata(stream, mi, cover_prefix='', cover_data=None,
apply_null=False, update_timestamp=False,
force_identifiers=False, add_missing_cover=True):
root = parse_opf(stream)
return apply_metadata(
root, mi, cover_prefix=cover_prefix, cover_data=cover_data,

View File

@@ -8,37 +8,28 @@ import uuid
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX
from ebook_converter.ebooks.mobi.reader.index import read_index
from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc
from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup
from ebook_converter.ebooks.mobi.reader.containers import Container, find_imgtype
from ebook_converter.ebooks.mobi.reader import containers
from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator
from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.utils import read_font_record
from ebook_converter.ebooks.oeb.parse_utils import parse_html
from ebook_converter.ebooks.oeb.base import XPath, xml2text
from ebook_converter.ebooks.oeb import base
from ebook_converter.polyglot.builtins import as_unicode
ID_RE = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
NAME_RE = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
AID_RE = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
Part = collections.namedtuple('Part', 'num type filename start end aid')
Elem = collections.namedtuple('Elem', 'insert_pos toc_text file_number '
'sequence_number start_pos length')
FlowInfo = collections.namedtuple('FlowInfo', 'type format dir fname')
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
Part = collections.namedtuple('Part',
'num type filename start end aid')
Elem = collections.namedtuple('Elem',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
FlowInfo = collections.namedtuple('FlowInfo',
'type format dir fname')
# locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid):
pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid
aid_pattern = re.compile(pattern, re.IGNORECASE)
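
The pattern interpolates the aid value into a tag-matching regex, so for aid b'0A' it finds an opening tag such as <p aid="0A">. A small usage sketch with invented markup:

    ml = b'<html><body><p aid="0A">text</p></body></html>'
    m = re.search(br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % b'0A', ml)
    # m.start() and m.end() bracket the opening <p aid="0A"> tag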
@@ -64,7 +55,8 @@ def reverse_tag_iter(block):
end = plt
def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
def get_first_resource_index(first_image_index, num_of_text_records,
first_text_record_number):
first_resource_index = first_image_index
if first_resource_index in {-1, NULL_INDEX}:
first_resource_index = num_of_text_records + first_text_record_number
@@ -78,23 +70,27 @@ class Mobi8Reader(object):
self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header
self.encrypted_fonts = []
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
self.id_re = ID_RE
self.name_re = NAME_RE
self.aid_re = AID_RE
def __call__(self):
self.mobi6_reader.check_for_drm()
self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8')
bh = self.mobi6_reader.book_header
_gfri = get_first_resource_index
if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2
self.resource_offsets = [
(get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
(get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
]
self.resource_offsets = [(_gfri(bh.first_image_index,
bh.mobi6_records, 1), offset - 2),
(_gfri(bh.kf8_first_image_index,
bh.records, offset),
len(self.mobi6_reader.sections))]
else:
offset = 1
self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]
self.resource_offsets = [(_gfri(bh.first_image_index, bh.records,
offset),
len(self.mobi6_reader.sections))]
self.processed_records = self.mobi6_reader.extract_text(offset=offset)
self.raw_ml = self.mobi6_reader.mobi_html
@@ -123,37 +119,37 @@ class Mobi8Reader(object):
raise ValueError('KF8 does not have a valid FDST record')
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
header, sec_start)
header, sec_start)
self.flow_table = tuple(zip(secs[::2], secs[1::2]))
self.files = []
if self.header.skelidx != NULL_INDEX:
table = read_index(self.kf8_sections, self.header.skelidx,
self.header.codec)[0]
File = collections.namedtuple('File',
'file_number name divtbl_count start_position length')
self.header.codec)[0]
File = collections.namedtuple('File', 'file_number name '
'divtbl_count start_position length')
for i, text in enumerate(table):
tag_map = table[text]
self.files.append(File(i, text, tag_map[1][0],
tag_map[6][0], tag_map[6][1]))
tag_map[6][0], tag_map[6][1]))
self.elems = []
if self.header.dividx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.dividx,
self.header.codec)
self.header.codec)
for i, text in enumerate(table):
tag_map = table[text]
toc_text = cncx[tag_map[2][0]]
self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
tag_map[4][0], tag_map[6][0], tag_map[6][1]))
tag_map[4][0], tag_map[6][0],
tag_map[6][1]))
self.guide = []
if self.header.othidx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = collections.namedtuple('Item',
'type title pos_fid')
self.header.codec)
Item = collections.namedtuple('Item', 'type title pos_fid')
for i, ref_type in enumerate(table):
tag_map = table[ref_type]
@@ -161,7 +157,7 @@ class Mobi8Reader(object):
title = cncx[tag_map[1][0]]
fileno = None
if 3 in list(tag_map.keys()):
fileno = tag_map[3][0]
fileno = tag_map[3][0]
if 6 in list(tag_map.keys()):
fileno = tag_map[6]
if isinstance(ref_type, bytes):
@@ -205,17 +201,19 @@ class Mobi8Reader(object):
head = skeleton[:insertpos]
tail = skeleton[insertpos:]
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
head.rfind(b'<')):
head.rfind(b'<')):
# There is an incomplete tag in either the head or tail.
# This can happen for some badly formed KF8 files, see for
# example, https://bugs.launchpad.net/bugs/1082669
if not inspos_warned:
self.log.warn(
'The div table for %s has incorrect insert '
'positions. Calculating manually.'%skelname)
self.log.warn('The div table for %s has incorrect '
'insert positions. Calculating '
'manually.' % skelname)
inspos_warned = True
bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
isinstance(aidtext, bytes)
else
aidtext.encode('utf-8'))
if bp != ep:
insertpos = ep + 1 + startpos
@@ -228,7 +226,7 @@ class Mobi8Reader(object):
aidtext = str(uuid.uuid4())
filename = aidtext + '.html'
self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
baseptr, aidtext))
baseptr, aidtext))
# The primary css style sheet is typically stored next followed by any
# snippets of code that were previously inlined in the
@@ -238,10 +236,10 @@ class Mobi8Reader(object):
# The problem is that for most browsers and ereaders, you can not
# use <img src="imageXXXX.svg" /> to import any svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all browsers
# and ereaders and causes epub validation issues because those raster
# images are in manifest but not in xhtml text - since they only
# referenced from an svg image
# should work according to the spec but does not for almost all
# browsers and ereaders and causes epub validation issues because
# those raster images are in the manifest but not in the xhtml text -
# since they are only referenced from an svg image
# So we need to check the remaining flow pieces to see if they are css
# or svg images. if svg images, we must check if they have an <image/>
@@ -252,7 +250,8 @@ class Mobi8Reader(object):
self.flowinfo.append(FlowInfo(None, None, None, None))
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''',
re.IGNORECASE)
for j in range(1, len(self.flows)):
flowpart = self.flows[j]
nstr = '%04d' % j
@@ -276,7 +275,8 @@ class Mobi8Reader(object):
# search for CDATA and if exists inline it
if flowpart.find(b'[CDATA[') >= 0:
typ = 'css'
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
flowpart = (b'<style type="text/css">\n' + flowpart +
b'\n</style>\n')
format = 'inline'
dir = None
fname = None
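
Flow classification is done by sniffing: svg_tag_pattern decides whether a flow is an SVG document, image_tag_pattern whether that SVG imports a raster image (which forces it to stay inline), and a [CDATA[ marker flags CSS that gets wrapped in a <style> element as above. For example, with an invented flow:

    flowpart = b'<svg xmlns="http://www.w3.org/2000/svg"><image href="i.png"/></svg>'
    bool(svg_tag_pattern.search(flowpart))    # True -> an svg flow
    bool(image_tag_pattern.search(flowpart))  # True -> imports a raster image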
@@ -300,7 +300,8 @@ class Mobi8Reader(object):
def get_id_tag_by_pos_fid(self, posfid, offset):
# first convert kindle:pos:fid and offset info to position in file
insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
(insertpos, idtext, filenum,
seqnm, startpos, length) = self.elems[posfid]
pos = insertpos + offset
fi = self.get_file_info(pos)
# an existing "id=" must exist in original xhtml otherwise it would not
@@ -311,20 +312,20 @@ class Mobi8Reader(object):
# so find the closest "id=" before position the file by actually
# searching in that file
idtext = self.get_id_tag(pos)
return '%s/%s'%(fi.type, fi.filename), idtext
return '%s/%s' % (fi.type, fi.filename), idtext
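
Resolving a kindle:pos:fid link is a two-step lookup: the fid indexes the elems (div) table, whose insert_pos plus the link offset gives an absolute position in the reassembled markup, and get_file_info then maps that position back to a part file. Schematically, with the Elem fields named above:

    elem = self.elems[posfid]        # Elem(insert_pos, toc_text, file_number, ...)
    pos = elem.insert_pos + offset   # absolute position in the raw markup
    fi = self.get_file_info(pos)     # -> the part whose range contains pos
    # result: '<type>/<filename>' plus the nearest preceding id/name anchor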
def get_id_tag(self, pos):
# Find the first tag with a named anchor (name or id attribute) before
# pos
fi = self.get_file_info(pos)
if fi.num is None and fi.start is None:
raise ValueError('No file contains pos: %d'%pos)
raise ValueError('No file contains pos: %d' % pos)
textblock = self.parts[fi.num]
npos = pos - fi.start
pgt = textblock.find(b'>', npos)
plt = textblock.find(b'<', npos)
# if npos inside a tag then search all text before the its end of tag marker
# else not in a tag need to search the preceding tag
# if npos is inside a tag then search all text before its end-of-tag
# marker, else we are not in a tag and need to search the preceding tag
if plt == npos or pgt < plt:
npos = pgt + 1
textblock = textblock[0:npos]
@@ -371,7 +372,7 @@ class Mobi8Reader(object):
linktgt = fi.filename
if idtext:
linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwd())
g = Guide.Reference('%s/%s' % (fi.type, linktgt), os.getcwd())
g.title, g.type = 'start', 'text'
guide.append(g)
@@ -379,7 +380,7 @@ class Mobi8Reader(object):
def create_ncx(self):
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
self.header.codec)
self.header.codec)
remove = []
# Add href and anchor info to the index entries
@@ -389,15 +390,15 @@ class Mobi8Reader(object):
pos = entry['pos']
fi = self.get_file_info(pos)
if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos)
raise ValueError('Index entry has invalid pos: %d' % pos)
idtag = self.get_id_tag(pos)
href = '%s/%s'%(fi.type, fi.filename)
href = '%s/%s' % (fi.type, fi.filename)
else:
try:
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
except ValueError:
self.log.warn('Invalid entry in NCX (title: %s), ignoring'
%entry['text'])
self.log.warn('Invalid entry in NCX (title: %s), '
'ignoring' % entry['text'])
remove.append(entry)
continue
@@ -411,7 +412,8 @@ class Mobi8Reader(object):
return build_toc(index_entries)
def extract_resources(self, sections):
from ebook_converter.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
from ebook_converter.ebooks.mobi.writer2.resources import \
PLACEHOLDER_GIF
resource_map = []
container = None
for x in ('fonts', 'images'):
@@ -424,16 +426,18 @@ class Mobi8Reader(object):
typ = data[:4]
href = None
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC',
b'CMET', b'PAGE'}:
pass # Ignore these records
elif typ == b'FONT':
font = read_font_record(data)
href = "fonts/%05d.%s" % (fname_idx, font['ext'])
if font['err']:
self.log.warn('Reading font record %d failed: %s'%(
fname_idx, font['err']))
self.log.warn('Reading font record %d failed: %s' %
(fname_idx, font['err']))
if font['headers']:
self.log.debug('Font record headers: %s'%font['headers'])
self.log.debug('Font record headers: %s' %
font['headers'])
with open(href.replace('/', os.sep), 'wb') as f:
f.write(font['font_data'] if font['font_data'] else
font['raw_data'])
@@ -443,19 +447,23 @@ class Mobi8Reader(object):
if data == b'CONTBOUNDARY':
container = None
continue
container = Container(data)
container = containers.Container(data)
elif typ == b'CRES':
data, imgtype = container.load_image(data)
if data is not None:
href = 'images/%05d.%s'%(container.resource_index, imgtype)
href = 'images/%05d.%s' % (container.resource_index,
imgtype)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(data)
elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
elif (typ == b'\xa0\xa0\xa0\xa0' and
len(data) == 4 and
container is not None):
container.resource_index += 1
elif container is None:
if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
imgtype = find_imgtype(data)
href = 'images/%05d.%s'%(fname_idx, imgtype)
if not (len(data) == len(PLACEHOLDER_GIF) and
data == PLACEHOLDER_GIF):
imgtype = containers.find_imgtype(data)
href = 'images/%05d.%s' % (fname_idx, imgtype)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(data)
@@ -482,7 +490,7 @@ class Mobi8Reader(object):
if os.path.exists(href.replace('/', os.sep)):
try:
toc = self.read_inline_toc(href, frag)
except:
except Exception:
self.log.exception('Failed to read inline ToC')
opf = OPFCreator(os.getcwd(), mi)
@@ -493,7 +501,8 @@ class Mobi8Reader(object):
# If there are no images then the azw3 input plugin dumps all
# binary records as .unknown images, remove them
if self.for_tweak and os.path.exists('images') and os.path.isdir('images'):
if (self.for_tweak and os.path.exists('images') and
os.path.isdir('images')):
files = os.listdir('images')
unknown = [x for x in files if x.endswith('.unknown')]
if len(files) == len(unknown):
@@ -502,7 +511,7 @@ class Mobi8Reader(object):
if self.for_tweak:
try:
os.remove('debug-raw.html')
except:
except Exception:
pass
opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
@@ -528,7 +537,7 @@ class Mobi8Reader(object):
with open(href.replace('/', os.sep), 'rb') as f:
raw = f.read().decode(self.header.codec)
root = parse_html(raw, log=self.log)
body = XPath('//h:body')(root)
body = base.XPath('//h:body')(root)
reached = False
if body:
start = body[0]
@@ -536,7 +545,7 @@ class Mobi8Reader(object):
start = None
reached = True
if frag:
elems = XPath('//*[@id="%s"]'%frag)(root)
elems = base.XPath('//*[@id="%s"]' % frag)(root)
if elems:
start = elems[0]
@@ -554,12 +563,13 @@ class Mobi8Reader(object):
seen = set()
links = []
for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == const.XHTML_A and elem.get('href',
if reached and elem.tag == base.tag('xhtml',
'a') and elem.get('href',
False):
href = elem.get('href')
href, frag = urllib.parse.urldefrag(href)
href = base_href + '/' + href
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
if (text, href, frag) in seen:
continue
seen.add((text, href, frag))
@@ -568,7 +578,7 @@ class Mobi8Reader(object):
reached = True
depths = sorted(set(x[-1] for x in links))
depth_map = {x:i for i, x in enumerate(depths)}
depth_map = {x: i for i, x in enumerate(depths)}
for text, href, frag, depth in links:
depth = depth_map[depth]
if current_depth is None:

View File

@@ -1,5 +1,8 @@
import collections
import errno
import hashlib
import io
import itertools
import logging
import os
import re
@@ -7,13 +10,10 @@ import shutil
import sys
import time
import unicodedata
import uuid
from collections import defaultdict
from io import BytesIO
from itertools import count
import urllib.parse
import uuid
from css_parser import getUrls, replaceUrls
import css_parser
from lxml import etree
from ebook_converter import constants as const
@@ -35,10 +35,7 @@ from ebook_converter.ebooks.metadata.utils import parse_opf_version
from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
from ebook_converter.ebooks.mobi.tweak import set_cover
from ebook_converter.ebooks.oeb.base import (
OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote
)
from ebook_converter.ebooks.oeb import base as oeb_base
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook
from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak
@@ -96,7 +93,7 @@ def abspath_to_name(path, root):
return relpath(os.path.abspath(path), root).replace(os.sep, '/')
def name_to_href(name, root, base=None, quote=urlquote):
def name_to_href(name, root, base=None, quote=oeb_base.urlquote):
fullpath = name_to_abspath(name, root)
basepath = root if base is None else os.path.dirname(name_to_abspath(base, root))
path = relpath(fullpath, basepath).replace(os.sep, '/')
@@ -111,7 +108,7 @@ def href_to_name(href, root, base=None):
return None
if purl.scheme or not purl.path:
return None
href = urlunquote(purl.path)
href = oeb_base.urlunquote(purl.path)
if iswindows and ':' in href:
# path manipulations on windows fail for paths with : in them, so we
# assume all such paths are invalid/absolute paths.
@@ -324,7 +321,7 @@ class Container(ContainerBase): # {{{
item_id = 'id' + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
href = self.name_to_href(name, self.opf_name)
item = manifest.makeelement(const.OPF_ITEM,
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href)
item.set('media-type', self.mime_map[name])
self.insert_into_xml(manifest, item)
@@ -340,7 +337,7 @@ class Container(ContainerBase): # {{{
def make_name_unique(self, name):
''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. '''
counter = count()
counter = itertools.count()
while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
c = next(counter) + 1
base, ext = name.rpartition('.')[::2]
@@ -377,10 +374,10 @@ class Container(ContainerBase): # {{{
if self.ok_to_be_unmanifested(name):
return name
item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
if mt in OEB_DOCS:
if mt in oeb_base.OEB_DOCS:
manifest = self.opf_xpath('//opf:manifest')[0]
spine = self.opf_xpath('//opf:spine')[0]
si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
si = manifest.makeelement(oeb_base.tag('opf', 'itemref'), idref=item_id)
self.insert_into_xml(spine, si, index=spine_index)
return name
@@ -442,12 +439,12 @@ class Container(ContainerBase): # {{{
replace_func.file_type = 'opf'
for elem in self.opf_xpath('//*[@href]'):
elem.set('href', replace_func(elem.get('href')))
elif media_type.lower() in OEB_DOCS:
elif media_type.lower() in oeb_base.OEB_DOCS:
replace_func.file_type = 'text'
rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in OEB_STYLES:
oeb_base.rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in oeb_base.OEB_STYLES:
replace_func.file_type = 'style'
replaceUrls(self.parsed(name), replace_func)
css_parser.replaceUrls(self.parsed(name), replace_func)
elif media_type.lower() == guess_type('toc.ncx'):
replace_func.file_type = 'ncx'
for elem in self.parsed(name).xpath('//*[@src]'):
@@ -467,21 +464,21 @@ class Container(ContainerBase): # {{{
if name == self.opf_name:
for elem in self.opf_xpath('//*[@href]'):
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
elif media_type.lower() in OEB_DOCS:
for el, attr, link, pos in iterlinks(self.parsed(name)):
elif media_type.lower() in oeb_base.OEB_DOCS:
for el, attr, link, pos in oeb_base.iterlinks(self.parsed(name)):
yield (link, el.sourceline, pos) if get_line_numbers else link
elif media_type.lower() in OEB_STYLES:
elif media_type.lower() in oeb_base.OEB_STYLES:
if get_line_numbers:
with self.open(name, 'rb') as f:
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
position = PositionFinder(raw)
is_in_comment = CommentFinder(raw)
for link, offset in itercsslinks(raw):
for link, offset in oeb_base.itercsslinks(raw):
if not is_in_comment(offset):
lnum, col = position(offset)
yield link, lnum, col
else:
for link in getUrls(self.parsed(name)):
for link in css_parser.getUrls(self.parsed(name)):
yield link
elif media_type.lower() == guess_type('toc.ncx'):
for elem in self.parsed(name).xpath('//*[@src]'):
@@ -533,7 +530,7 @@ class Container(ContainerBase): # {{{
def opf_xpath(self, expr):
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)
return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)
def has_name(self, name):
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
@@ -580,11 +577,11 @@ class Container(ContainerBase): # {{{
def parse(self, path, mime):
with open(path, 'rb') as src:
data = src.read()
if mime in OEB_DOCS:
if mime in oeb_base.OEB_DOCS:
data = self.parse_xhtml(data, self.relpath(path))
elif mime[-4:] in {'+xml', '/xml'}:
data = self.parse_xml(data)
elif mime in OEB_STYLES:
elif mime in oeb_base.OEB_STYLES:
data = self.parse_css(data, self.relpath(path))
return data
@@ -597,7 +594,7 @@ class Container(ContainerBase): # {{{
'''
ans = self.open(name).read()
mime = self.mime_map.get(name, guess_type(name))
if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
if decode and (mime in oeb_base.OEB_STYLES or mime in oeb_base.OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
return ans
@@ -637,7 +634,7 @@ class Container(ContainerBase): # {{{
so use it sparingly. '''
from ebook_converter.ebooks.metadata.opf2 import OPF as O
mi = self.serialize_item(self.opf_name)
return O(BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
return O(io.BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
populate_spine=False).to_book_metadata()
@property
@@ -662,7 +659,7 @@ class Container(ContainerBase): # {{{
@property
def manifest_type_map(self):
' Mapping of manifest media-type to list of canonical names of that media-type '
ans = defaultdict(list)
ans = collections.defaultdict(list)
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
ans[item.get('media-type').lower()].append(self.href_to_name(
item.get('href'), self.opf_name))
@@ -813,7 +810,7 @@ class Container(ContainerBase): # {{{
spine = self.opf_xpath('//opf:spine')[0]
spine.text = tail
for name, linear in spine_items:
i = spine.makeelement(const.OPF_ITEMREF,
i = spine.makeelement(oeb_base.tag('opf', 'itemref'),
nsmap={'opf': const.OPF2_NS})
i.tail = tail
i.set('idref', imap[name])
@@ -922,7 +919,7 @@ class Container(ContainerBase): # {{{
return ans[0]
self.dirty(self.opf_name)
package = self.opf_xpath('//opf:package')[0]
item = package.makeelement(OPF(name))
item = package.makeelement(oeb_base.tag('opf', name))
item.tail = '\n'
package.append(item)
return item
@@ -945,7 +942,7 @@ class Container(ContainerBase): # {{{
item_id = id_prefix + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
item = manifest.makeelement(const.OPF_ITEM,
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href)
item.set('media-type', media_type)
self.insert_into_xml(manifest, item)
@@ -992,7 +989,7 @@ class Container(ContainerBase): # {{{
data = root = self.parsed(name)
if name == self.opf_name:
self.format_opf()
data = serialize(data, self.mime_map[name], pretty_print=name in
data = oeb_base.serialize(data, self.mime_map[name], pretty_print=name in
self.pretty_print)
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
# Needed as I can't get lxml to output opf:role and
@@ -1181,7 +1178,7 @@ class EpubContainer(Container):
)
if not opf_files:
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
opf_path = os.path.join(self.root, *(oeb_base.urlunquote(opf_files[0].get('full-path')).split('/')))
if not exists(opf_path):
raise InvalidEpub('OPF file does not exist at location pointed to'
' by META-INF/container.xml')
@@ -1412,7 +1409,7 @@ def do_explode(path, dest):
def opf_to_azw3(opf, outpath, container):
from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook
class Item(Manifest.Item):
class Item(oeb_base.Manifest.Item):
def _parse_css(self, data):
# The default CSS parser used by oeb.base inserts the h namespace

View File

@@ -1,22 +1,16 @@
from collections import defaultdict
from functools import partial
import collections
import functools
from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import normalize_css
from ebook_converter.ebooks.oeb.polish import pretty
from ebook_converter.utils.icu import numeric_sort_key
from ebook_converter.css_selectors import Select, SelectorError
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
def filter_used_rules(rules, log, select):
for rule in rules:
used = False
@@ -34,7 +28,8 @@ def filter_used_rules(rules, log, select):
yield rule
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
def get_imported_sheets(name, container, sheets, recursion_level=10,
sheet=None):
ans = set()
sheet = sheet or sheets[name]
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
@@ -44,7 +39,8 @@ def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None)
ans.add(iname)
if recursion_level > 0:
for imported_sheet in tuple(ans):
ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
ans |= get_imported_sheets(imported_sheet, container, sheets,
recursion_level=recursion_level-1)
ans.discard(name)
return ans
@@ -56,7 +52,7 @@ def merge_declarations(first, second):
def merge_identical_selectors(sheet):
' Merge rules that have identical selectors '
selector_map = defaultdict(list)
selector_map = collections.defaultdict(list)
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
selector_map[rule.selectorText].append(rule)
remove = []
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
return len(remove)
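
Merging keeps the first rule for each distinct selector text and folds the declarations of every later duplicate into it via merge_declarations, so, illustratively:

    # before:  p { color: red }   ...   p { margin: 0 }
    # after:   p { color: red; margin: 0 }   (one rule removed, count 1 returned)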
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
'''
Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
def remove_unused_css(container, report=None, remove_unused_classes=False,
merge_rules=False):
"""
Remove all unused CSS rules from the book. An unused CSS rule is one that
does not match any actual content.
:param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
:param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
:param report: An optional callable that takes a single argument. It is
called with information about the operations being
performed.
:param remove_unused_classes: If True, class attributes in the HTML that
do not match any CSS rules are also removed.
:param merge_rules: If True, rules with identical selectors are merged.
'''
report = report or (lambda x:x)
"""
report = report or (lambda x: x)
def safe_parse(name):
try:
return container.parsed(name)
except TypeError:
pass
sheets = {name:safe_parse(name) for name, mt in container.mime_map.items() if mt in OEB_STYLES}
sheets = {k:v for k, v in sheets.items() if v is not None}
sheets = {name: safe_parse(name) for name, mt in container.mime_map.items()
if mt in base.OEB_STYLES and safe_parse(name) is not None}
num_merged = 0
if merge_rules:
for name, sheet in sheets.items():
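
A typical call, per the docstring above (the container comes from the polish container machinery; here report just collects the progress messages):

    messages = []
    remove_unused_css(container, report=messages.append,
                      remove_unused_classes=True, merge_rules=True)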
@@ -106,7 +108,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_of_removed_rules = num_of_removed_classes = 0
for name, mt in container.mime_map.items():
if mt not in OEB_DOCS:
if mt not in base.OEB_DOCS:
continue
root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True)
@@ -120,31 +122,39 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_merged += num
container.dirty(name)
if remove_unused_classes:
used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
used_classes |= {x.lower() for x in
classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets,
sheet=sheet)
for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
style_rules[imported_sheet] = tuple(filter_used_rules(
style_rules[imported_sheet], container.log, select))
if remove_unused_classes:
used_classes |= class_map[imported_sheet]
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(rules, container.log, select))
unused_rules = tuple(filter_used_rules(rules, container.log,
select))
if unused_rules:
num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules]
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
pretty.pretty_script_or_style(container, style)
container.dirty(name)
for link in root.xpath('//*[local-name()="link" and @href]'):
sname = container.href_to_name(link.get('href'), name)
if sname not in sheets:
continue
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
style_rules[sname] = tuple(filter_used_rules(style_rules[sname],
container.log,
select))
if remove_unused_classes:
used_classes |= class_map[sname]
for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
style_rules[iname] = tuple(
filter_used_rules(style_rules[iname], container.log,
select))
if remove_unused_classes:
used_classes |= class_map[iname]
@@ -159,7 +169,8 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
elem.set('class', ' '.join(classes))
else:
del elem.attrib['class']
num_of_removed_classes += len(original_classes) - len(classes)
num_of_removed_classes += (len(original_classes) -
len(classes))
container.dirty(name)
for name, sheet in sheets.items():
@@ -195,7 +206,7 @@ def filter_declaration(style, properties=()):
changed = True
all_props = set(style.keys())
for prop in style.getProperties():
n = normalizers.get(prop.name, None)
n = normalize_css.normalizers.get(prop.name, None)
if n is not None:
normalized = n(prop.name, prop.propertyValue)
removed = properties.intersection(set(normalized))
@@ -225,12 +236,13 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
root = container.parsed(name)
changed = False
for style in root.xpath('//*[local-name()="style"]'):
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
if style.text and (style.get('type') or
'text/css').lower() == 'text/css':
sheet = container.parse_css(style.text)
if transform_sheet(sheet):
changed = True
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
pretty.pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'):
text = elem.get('style', None)
if text:
@@ -240,13 +252,16 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
if style.length == 0:
del elem.attrib['style']
else:
elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
elem.set('style',
force_unicode(style.getCssText(separator=' '),
'utf-8'))
return changed
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
def transform_css(container, transform_sheet=None, transform_style=None,
names=()):
if not names:
types = OEB_STYLES | OEB_DOCS
types = base.OEB_STYLES | base.OEB_DOCS
names = []
for name, mt in container.mime_map.items():
if mt in types:
@@ -256,13 +271,14 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
for name in names:
mt = container.mime_map[name]
if mt in OEB_STYLES:
if mt in base.OEB_STYLES:
sheet = container.parsed(name)
if transform_sheet(sheet):
container.dirty(name)
doc_changed = True
elif mt in OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet, transform_style):
elif mt in base.OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet,
transform_style):
container.dirty(name)
doc_changed = True
@@ -270,15 +286,21 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
def filter_css(container, properties, names=()):
'''
"""
Remove the specified CSS properties from all CSS rules in the book.
:param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
:param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
'''
properties = normalize_filter_css(properties)
return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties),
transform_style=partial(filter_declaration, properties=properties), names=names)
:param properties: Set of properties to remove. For example:
:code:`{'font-family', 'color'}`.
:param names: The files from which to remove the properties. Defaults to
all HTML and CSS files in the book.
"""
properties = normalize_css.normalize_filter_css(properties)
return transform_css(container,
transform_sheet=functools.partial(
filter_sheet, properties=properties),
transform_style=functools.partial(
filter_declaration, properties=properties),
names=names)
def _classes_in_selector(selector, classes):
@@ -331,21 +353,29 @@ def remove_property_value(prop, predicate):
if len(removed_vals) == len(prop.propertyValue):
prop.parent.removeProperty(prop.name)
else:
x = css_text(prop.propertyValue)
x = base.css_text(prop.propertyValue)
for v in removed_vals:
x = x.replace(css_text(v), '').strip()
x = x.replace(base.css_text(v), '').strip()
prop.propertyValue.cssText = x
return bool(removed_vals)
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
RULE_PRIORITIES = {t: i for i, t in enumerate((CSSRule.COMMENT,
CSSRule.CHARSET_RULE,
CSSRule.IMPORT_RULE,
CSSRule.NAMESPACE_RULE))}
def sort_sheet(container, sheet_or_text):
''' Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be safe.
'''
sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, str) else sheet_or_text
"""
Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be
safe.
"""
if isinstance(sheet_or_text, str):
sheet = container.parse_css(sheet_or_text)
else:
sheet = sheet_or_text
def text_sort_key(x):
return numeric_sort_key(str(x or ''))
@@ -364,7 +394,8 @@ def sort_sheet(container, sheet_or_text):
rule.selectorText = ', '.join(s.selectorText for s in selectors)
elif rule.type == CSSRule.FONT_FACE_RULE:
try:
tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
except Exception:
pass
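For illustration, a sketch of the selector normalisation this enables (the container argument is only used to parse raw text, so a parsed sheet can be passed with container=None; the exact output is an assumption):

import css_parser
from ebook_converter.ebooks.oeb.polish.css import sort_sheet

sheet = css_parser.parseString('div.z, a.b { color: red }')
sort_sheet(None, sheet)
print(sheet.cssText)  # the selector list should now read "a.b, div.z"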
@@ -379,11 +410,14 @@ def add_stylesheet_links(container, name, text):
if not head:
return
head = head[0]
sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
sheets = tuple(container.manifest_items_of_type(lambda mt:
mt in base.OEB_STYLES))
if not sheets:
return
for sname in sheets:
link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
link = head.makeelement(base.tag('xhtml', 'link'), type='text/css',
rel='stylesheet',
href=container.name_to_href(sname, name))
head.append(link)
pretty_xml_tree(head)
return serialize(root, 'text/html')
pretty.pretty_xml_tree(head)
return pretty.serialize(root, 'text/html')
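End to end, these helpers are driven from a container object. A minimal sketch; get_container and commit mirror calibre's polish API and are assumptions here, as is the input path:

from ebook_converter.ebooks.oeb.polish.container import get_container
from ebook_converter.ebooks.oeb.polish.css import filter_css, remove_unused_css

container = get_container('book.epub')
remove_unused_css(container, report=print, remove_unused_classes=True,
                  merge_rules=True)
filter_css(container, {'font-family', 'color'})
container.commit()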
View File
@@ -1,6 +1,7 @@
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.localization import canonicalize_lang
@@ -14,7 +15,7 @@ def get_book_language(container):
def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = const.OPF_REFERENCE
ref_tag = base.tag('opf', 'reference')
href = None
if name:
href = container.name_to_href(name, container.opf_name)
@@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None):
guides = container.opf_xpath('//opf:guide')
if not guides and href:
g = container.opf.makeelement(const.OPF_GUIDE,
g = container.opf.makeelement(base.tag('opf', 'guide'),
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(container.opf, g)
guides = [g]
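Usage sketch (the container again comes from the polish API; the file names are hypothetical, and the removal behaviour for name=None is an assumption based on the href handling above):

# Point the <guide> cover reference at an actual file:
set_guide_item(container, 'cover', 'Cover', 'images/cover.jpg')
# Passing name=None should drop any existing reference of that type:
set_guide_item(container, 'titlepage', 'Title page', None)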
View File
@@ -1,18 +1,13 @@
import textwrap
# from lxml.etree import Element
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def isspace(x):
return not x.strip('\u0009\u000a\u000c\u000d\u0020')
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent=' '):
for i, child in enumerate(elem):
pretty_xml_tree(child, level=level+1, indent=indent)
if not child.tail or isspace(child.tail):
l = level + 1
new_level = level + 1
if i == len(elem) - 1:
l -= 1
child.tail = '\n' + (indent * l)
new_level -= 1
child.tail = '\n' + (indent * new_level)
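A sketch of the in-place indentation (assumes this module is importable as ebook_converter.ebooks.oeb.polish.pretty):

from lxml import etree
from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree

root = etree.fromstring('<a><b><c/></b></a>')
pretty_xml_tree(root)
# Each child element now starts on its own indented line.
print(etree.tostring(root, encoding='unicode'))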
def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for
# the rest.
def dckey(x):
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata',
namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey)
for x in reversed(dc_tags):
metadata.insert(0, x)
# Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)}
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
namespaces=const.OPF_NAMESPACES)
spine_ids = {x: i for i, x in enumerate(spine_ids)}
def manifest_key(x):
mt = x.get('media-type', '')
href = x.get('href', '')
ext = href.rpartition('.')[-1].lower()
cat = 1000
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
cat = 0
elif mt == guess_type('a.ncx'):
cat = 1
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
cat = 2
elif mt.startswith('image/'):
cat = 3
@@ -75,20 +73,23 @@ def pretty_opf(root):
i = sort_key(href)
return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
for manifest in root.xpath('//opf:manifest',
namespaces=const.OPF_NAMESPACES):
try:
children = sorted(manifest, key=manifest_key)
except AttributeError:
continue # There are comments so don't sort since that would mess up the comments
# There are comments so don't sort since that would mess up the
# comments.
continue
for x in reversed(children):
manifest.insert(0, x)
def isblock(x):
if callable(x.tag) or not x.tag:
return True
if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
if x.tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}:
return True
return False
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags '''
if not parent.text or isspace(parent.text):
parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
if (hasattr(parent.tag, 'strip') and
parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
nn = '\n'
else:
nn = '\n\n'
parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent)
elif child.tag == const.SVG_SVG:
elif child.tag == base.tag('svg', 'svg'):
pretty_xml_tree(child, level=level, indent=indent)
l = level
new_level = level
if i == len(parent) - 1:
l -= 1
new_level -= 1
if not child.tail or isspace(child.tail):
child.tail = ''
child.tail = child.tail + nn + (indent * l)
child.tail = child.tail + nn + (indent * new_level)
def pretty_script_or_style(container, child):
if child.text:
indent = indent_for_tag(child)
if child.tag.endswith('style'):
child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
child.text = force_unicode(pretty_css(container, '', child.text),
'utf-8')
child.text = textwrap.dedent(child.text)
child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
child.text = '\n' + '\n'.join([(indent + x) if x else ''
for x in child.text.splitlines()])
set_indent(child, 'text', indent)
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
# Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag
# even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and parse_utils.barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
if (len(body) == 1 and
not callable(body[0].tag) and
isblock(body[0]) and
not has_only_blocks(body[0]) and
parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
'h2', 'h3', 'h4',
'h5', 'h6') and
len(body[0]) > 0):
pretty_block(body[0], level=2)
if container is not None:
# Handle <script> and <style> tags
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
for child in root.xpath('//*[local-name()="script" or local-name()='
'"style"]'):
pretty_script_or_style(container, child)
def fix_html(container, raw):
' Fix any parsing errors in the HTML represented as a string in raw. Fixing is done using the HTML5 parsing algorithm. '
"""
Fix any parsing errors in the HTML represented as a string in raw. Fixing
is done using the HTML5 parsing algorithm.
"""
root = container.parse_xhtml(raw)
return serialize(root, 'text/html')
return base.serialize(root, 'text/html')
def pretty_html(container, name, raw):
' Pretty print the HTML represented as a string in raw '
"""
Pretty print the HTML represented as a string in raw
"""
root = container.parse_xhtml(raw)
pretty_html_tree(container, root)
return serialize(root, 'text/html')
return base.serialize(root, 'text/html')
def pretty_css(container, name, raw):
' Pretty print the CSS represented as a string in raw '
"""
Pretty print the CSS represented as a string in raw
"""
sheet = container.parse_css(raw)
return serialize(sheet, 'text/css')
return base.serialize(sheet, 'text/css')
def pretty_xml(container, name, raw):
' Pretty print the XML represented as a string in raw. If ``name`` is the name of the OPF, extra OPF-specific prettying is performed. '
"""
Pretty print the XML represented as a string in raw. If ``name`` is the
name of the OPF, extra OPF-specific prettying is performed.
"""
root = container.parse_xml(raw)
if name == container.opf_name:
pretty_opf(root)
pretty_xml_tree(root)
return serialize(root, 'text/xml')
return base.serialize(root, 'text/xml')
def fix_all_html(container):
' Fix any parsing errors in all HTML files in the container. Fixing is done using the HTML5 parsing algorithm. '
"""
Fix any parsing errors in all HTML files in the container. Fixing is done
using the HTML5 parsing algorithm.
"""
for name, mt in container.mime_map.items():
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
container.parsed(name)
container.dirty(name)
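Typical driver code, sketched under the same container assumption as elsewhere in this series:

from ebook_converter.ebooks.oeb.polish.pretty import fix_all_html, pretty_all

fix_all_html(container)  # re-serialize every HTML file via the HTML5 parser
pretty_all(container)    # then pretty print the HTML/CSS/OPF/NCX files
container.commit()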
def pretty_all(container):
' Pretty print all HTML/CSS/XML files in the container '
"""
Pretty print all HTML/CSS/XML files in the container
"""
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
for name, mt in container.mime_map.items():
prettied = False
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
pretty_html_tree(container, container.parsed(name))
prettied = True
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
container.parsed(name)
prettied = True
elif name == container.opf_name:
View File
@@ -7,6 +7,7 @@ import urllib.parse
from ebook_converter import constants as const
from ebook_converter import guess_type, strftime
from ebook_converter.constants_old import iswindows
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
from ebook_converter.library.comments import comments_to_html, markdown
from ebook_converter.utils.date import is_date_undefined, as_local_time
@@ -371,7 +372,7 @@ def render_jacket(mi, output_profile,
# We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(const.XHTML_DIV)
fw = body.makeelement(base.tag('xhtml', 'div'))
fw.set('data-calibre-rescale', '100')
for child in body:
fw.append(child)
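# The loop above rewrites the jacket body roughly as:
#   <body>
#     <div data-calibre-rescale="100"> ...former body children... </div>
#   </body>
# so the rescale hint lives on a wrapper div instead of <body> itself.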
@@ -388,9 +389,9 @@ def linearize_jacket(oeb):
for x in oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data):
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
e.tag = const.XHTML_DIV
e.tag = base.tag('xhtml', 'div')
for e in XPath('//h:td')(x.data):
e.tag = const.XHTML_SPAN
e.tag = base.tag('xhtml', 'span')
break
View File
@@ -3,8 +3,11 @@ Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform.
"""
import os, functools, collections, re, copy
from collections import OrderedDict
import collections
import copy
import functools
import os
import re
import urllib.parse
from lxml.etree import XPath as _XPath
@@ -13,8 +16,7 @@ from lxml import etree
from ebook_converter import constants as const
from ebook_converter import as_unicode, force_unicode
from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb.base import \
OEB_STYLES, rewrite_links, urlnormalize
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote
from ebook_converter.css_selectors import Select, SelectorError
@@ -44,14 +46,15 @@ class SplitError(ValueError):
class Split(object):
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
max_flow_size=0, remove_css_pagebreaks=True):
self.split_on_page_breaks = split_on_page_breaks
self.page_breaks_xpath = page_breaks_xpath
self.max_flow_size = max_flow_size
self.page_break_selectors = None
self.remove_css_pagebreaks = remove_css_pagebreaks
if self.page_breaks_xpath is not None:
self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]
self.page_break_selectors = [(XPath(self.page_breaks_xpath),
False)]
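For example (a sketch: oeb and opts come from the conversion pipeline, and the size ceiling is just an illustrative value):

split = Split(split_on_page_breaks=True,
              page_breaks_xpath='//h:h1',
              max_flow_size=260 * 1024)
split(oeb, opts)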
def __call__(self, oeb, opts):
self.oeb = oeb
@@ -71,7 +74,7 @@ class Split(object):
page_breaks, page_break_ids = self.find_page_breaks(item)
splitter = FlowSplitter(item, page_breaks, page_break_ids,
self.max_flow_size, self.oeb, self.opts)
if splitter.was_split:
am = splitter.anchor_map
self.map[item.href] = collections.defaultdict(
@@ -81,25 +84,27 @@ class Split(object):
if self.page_break_selectors is None:
self.page_break_selectors = set()
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES]
base.OEB_STYLES]
for rule in rules(stylesheets):
before = force_unicode(getattr(rule.style.getPropertyCSSValue(
'page-break-before'), 'cssText', '').strip().lower())
after = force_unicode(getattr(rule.style.getPropertyCSSValue(
'page-break-after'), 'cssText', '').strip().lower())
try:
if before and before not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, True))
self.page_break_selectors.add((rule.selectorText,
True))
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-before')
except:
except Exception:
pass
try:
if after and after not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, False))
self.page_break_selectors.add((rule.selectorText,
False))
if self.remove_css_pagebreaks:
rule.style.removeProperty('page-break-after')
except:
except Exception:
pass
page_breaks = set()
select = Select(item.data)
@@ -110,14 +115,18 @@ class Split(object):
return [], []
descendants = frozenset(body[0].iterdescendants('*'))
_tags = {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}
for selector, before in self.page_break_selectors:
try:
for elem in select(selector):
if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
if (elem in descendants and
elem.tag.rpartition('}')[2].lower() not in _tags):
elem.set('pb_before', '1' if before else '0')
page_breaks.add(elem)
except SelectorError as err:
self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
self.log.warn('Ignoring page breaks specified with invalid '
'CSS selector: %r (%s)' %
(selector, as_unicode(err)))
for i, elem in enumerate(item.data.iter('*')):
try:
@@ -126,23 +135,23 @@ class Split(object):
continue
page_breaks = list(page_breaks)
page_breaks.sort(key=lambda x:int(x.get('pb_order')))
page_breaks.sort(key=lambda x: int(x.get('pb_order')))
page_break_ids, page_breaks_ = [], []
for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i))
x.set('id', x.get('id', 'calibre_pb_%d' % i))
id = x.get('id')
try:
xp = XPath('//*[@id="%s"]'%id)
except:
xp = XPath('//*[@id="%s"]' % id)
except Exception:
try:
xp = XPath("//*[@id='%s']"%id)
except:
xp = XPath("//*[@id='%s']" % id)
except Exception:
# The id has both a quote and an apostrophe, which an XPath string
# literal cannot express.
# Just replace it since I doubt it's going to work anywhere else
# either
id = 'calibre_pb_%d'%i
# Just replace it since I doubt it's going to work anywhere
# else either
id = 'calibre_pb_%d' % i
x.set('id', id)
xp = XPath('//*[@id=%r]'%id)
xp = XPath('//*[@id=%r]' % id)
page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
page_break_ids.append(id)
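# Illustration of the quoting fallback above:
#   XPath('//*[@id="say \'hi\'"]')   # apostrophes inside double quotes: fine
#   XPath("//*[@id='say \"hi\"']")   # double quotes inside apostrophes: fine
# XPath 1.0 string literals have no escape syntax, so an id containing
# both kinds of quote cannot be expressed at all and is renamed to
# calibre_pb_<i> instead.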
@@ -159,7 +168,7 @@ class Split(object):
for item in self.oeb.manifest:
if etree.iselement(item.data):
self.current_item = item
rewrite_links(item.data, self.rewrite_links)
base.rewrite_links(item.data, self.rewrite_links)
def rewrite_links(self, url):
href, frag = urllib.parse.urldefrag(url)
@@ -169,7 +178,7 @@ class Split(object):
# Unparseable URL
return url
try:
href = urlnormalize(href)
href = base.urlnormalize(href)
except ValueError:
# href has non utf-8 quoting
return url
@@ -188,19 +197,19 @@ class FlowSplitter(object):
'The actual splitting logic'
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
opts):
self.item = item
self.oeb = oeb
self.opts = opts
self.log = oeb.log
self.page_breaks = page_breaks
self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size
self.base = item.href
self.csp_counter = 0
base, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%.3d'+ext
name, ext = os.path.splitext(self.base)
self.base = name.replace('%', '%%') + '_split_%.3d' + ext
self.trees = [self.item.data.getroottree()]
self.splitting_on_page_breaks = True
@@ -210,13 +219,13 @@ class FlowSplitter(object):
if self.max_flow_size > 0:
lt_found = False
self.log('\tLooking for large trees in %s...'%item.href)
self.log('\tLooking for large trees in %s...' % item.href)
trees = list(self.trees)
self.tree_map = {}
for i, tree in enumerate(trees):
size = len(tostring(tree.getroot()))
if size > self.max_flow_size:
self.log('\tFound large tree #%d'%i)
self.log('\tFound large tree #%d' % i)
lt_found = True
self.split_trees = []
self.split_to_size(tree)
@@ -229,11 +238,11 @@ class FlowSplitter(object):
self.was_split = len(self.trees) > 1
if self.was_split:
self.log('\tSplit into %d parts'%len(self.trees))
self.log('\tSplit into %d parts' % len(self.trees))
self.commit()
def split_on_page_breaks(self, orig_tree):
ordered_ids = OrderedDict()
ordered_ids = collections.OrderedDict()
all_page_break_ids = frozenset(self.page_break_ids)
for elem_id in orig_tree.xpath('//*/@id'):
if elem_id in all_page_break_ids:
@@ -248,9 +257,10 @@ class FlowSplitter(object):
tree = self.trees[i]
elem = pattern(tree)
if elem:
self.log.debug('\t\tSplitting on page-break at id=%s'%
elem[0].get('id'))
before_tree, after_tree = self.do_split(tree, elem[0], before)
self.log.debug('\t\tSplitting on page-break at id=%s' %
elem[0].get('id'))
before_tree, after_tree = self.do_split(tree, elem[0],
before)
self.trees[i:i+1] = [before_tree, after_tree]
break
@@ -269,7 +279,11 @@ class FlowSplitter(object):
if body is not None:
existing_ids = frozenset(body.xpath('//*/@id'))
for x in ids - existing_ids:
body.insert(0, body.makeelement(const.XHTML_div, id=x, style='height:0pt'))
body.insert(0, body.makeelement(base.tag('xhtml', 'div'),
id=x, style='height:0pt'))
ids = set()
trees.append(tree)
self.trees = trees
@@ -281,12 +295,13 @@ class FlowSplitter(object):
return body[0]
def do_split(self, tree, split_point, before):
'''
"""
Split ``tree`` into a *before* and *after* tree at ``split_point``.
:param before: If True tree is split before split_point, otherwise after split_point
:param before: If True tree is split before split_point, otherwise
after split_point
:return: before_tree, after_tree
'''
"""
return do_split(split_point, self.log, before=before)
def is_page_empty(self, root):
@@ -294,7 +309,7 @@ class FlowSplitter(object):
if body is None:
return False
txt = re.sub(r'\s+|\xa0', '',
etree.tostring(body, method='text', encoding='unicode'))
if len(txt) > 1:
return False
for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
@@ -305,13 +320,13 @@ class FlowSplitter(object):
return True
def split_text(self, text, root, size):
self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
rest = text.replace('\r', '')
parts = re.split('\n\n', rest)
self.log.debug('\t\t\t\tFound %d parts'%len(parts))
self.log.debug('\t\t\t\tFound %d parts' % len(parts))
if max(map(len, parts)) > size:
raise SplitError('Cannot split as file contains a <pre> tag '
'with a very large paragraph', root)
ans = []
buf = ''
for part in parts:
@@ -331,7 +346,8 @@ class FlowSplitter(object):
continue
if pre.text and len(pre.text) > self.max_flow_size*0.5:
self.log.debug('\t\tSplitting large <pre> tag')
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
frags = self.split_text(pre.text, root,
int(0.2 * self.max_flow_size))
new_pres = []
for frag in frags:
pre2 = copy.copy(pre)
@@ -346,7 +362,8 @@ class FlowSplitter(object):
split_point, before = self.find_split_point(root)
if split_point is None:
raise SplitError(self.item.href, root)
self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
self.log.debug('\t\t\tSplit point:', split_point.tag,
tree.getpath(split_point))
trees = self.do_split(tree, split_point, before)
sizes = [len(tostring(t.getroot())) for t in trees]
@@ -361,12 +378,11 @@ class FlowSplitter(object):
continue
elif size <= self.max_flow_size:
self.split_trees.append(t)
self.log.debug(
'\t\t\tCommitted sub-tree #%d (%d KB)'%(
len(self.split_trees), size/1024.))
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)' %
(len(self.split_trees), size/1024.))
else:
self.log.debug(
'\t\t\tSplit tree still too large: %d KB' % (size/1024.))
self.log.debug('\t\t\tSplit tree still too large: %d KB' %
(size / 1024.))
self.split_to_size(t)
def find_split_point(self, root):
@@ -385,8 +401,8 @@ class FlowSplitter(object):
'''
def pick_elem(elems):
if elems:
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
'1']
elems = [i for i in elems
if i.get(SPLIT_POINT_ATTR, '0') != '1']
if elems:
i = int(len(elems)//2)
elems[i].set(SPLIT_POINT_ATTR, '1')
@@ -407,7 +423,7 @@ class FlowSplitter(object):
if elem is not None:
try:
XPath(elem.getroottree().getpath(elem))
except:
except Exception:
continue
return elem, True
@@ -421,23 +437,24 @@ class FlowSplitter(object):
'''
if not self.was_split:
return
self.anchor_map = collections.defaultdict(lambda :self.base%0)
self.anchor_map = collections.defaultdict(lambda: self.base % 0)
self.files = []
for i, tree in enumerate(self.trees):
root = tree.getroot()
self.files.append(self.base%i)
self.files.append(self.base % i)
for elem in root.xpath('//*[@id or @name]'):
for anchor in elem.get('id', ''), elem.get('name', ''):
if anchor != '' and anchor not in self.anchor_map:
self.anchor_map[anchor] = self.files[-1]
for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
for elem in root.xpath('//*[@%s]' % SPLIT_POINT_ATTR):
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
spine_pos = self.item.spine_position
for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP):
for a in tree.getroot().xpath('//h:a[@href]',
namespaces=const.XPNSMAP):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]
@@ -448,7 +465,8 @@ class FlowSplitter(object):
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
new_item = self.oeb.manifest.add(new_id, current,
self.item.media_type, data=tree.getroot())
self.item.media_type,
data=tree.getroot())
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide:
View File
@@ -7,7 +7,7 @@ from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import TOC, xml2text
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks import ConversionError
@@ -15,8 +15,8 @@ def XPath(x):
try:
return etree.XPath(x, namespaces=const.XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
raise ConversionError('The syntax of the XPath expression %s is '
'invalid.' % repr(x))
def isspace(x):
@@ -33,9 +33,13 @@ def at_start(elem):
for x in body.iter():
if x is elem:
return True
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
if hasattr(getattr(x, 'tag', None),
'rpartition') and x.tag.rpartition('}')[-1] in {'img',
'svg'}:
return False
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
if isspace(getattr(x, 'text', None)) and (x in ancestors or
isspace(getattr(x, 'tail',
None))):
continue
return False
return False
@@ -52,7 +56,7 @@ class DetectStructure(object):
self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc
self.oeb.toc = TOC()
self.oeb.toc = base.TOC()
self.create_level_based_toc()
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
@@ -64,14 +68,14 @@ class DetectStructure(object):
else:
self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count())
if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter)
for node in list(self.oeb.toc.iter()):
if not node.title or regexp.search(node.title) is not None:
self.log('Filtering', node.title if node.title else
'empty node', 'from TOC')
self.oeb.toc.remove(node)
if opts.page_breaks_before is not None:
@@ -80,10 +84,11 @@ class DetectStructure(object):
for elem in pb_xpath(item.data):
try:
prev = next(elem.itersiblings(tag=etree.Element,
preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())):
preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and
parse_utils.barename(prev.tag) in {'h1',
'h2'} and
(not prev.tail or not prev.tail.split())):
# We have two adjacent headings, do not put a page
# break on the second one
continue
@@ -106,9 +111,9 @@ class DetectStructure(object):
expr = self.opts.start_reading_at
try:
expr = XPath(expr)
except:
self.log.warn(
'Invalid start reading at XPath expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid start reading at XPath expression, '
'ignoring: %s' % expr)
return
for item in self.oeb.spine:
if not hasattr(item.data, 'xpath'):
@@ -118,16 +123,17 @@ class DetectStructure(object):
elem = matches[0]
eid = elem.get('id', None)
if not eid:
eid = 'start_reading_at_'+str(uuid.uuid4()).replace('-', '')
eid = 'start_reading_at_' + str(uuid.uuid4()).replace('-',
'')
elem.set('id', eid)
if 'text' in self.oeb.guide:
self.oeb.guide.remove('text')
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
self.log('Setting start reading at position to %s in %s'%(
self.opts.start_reading_at, item.href))
self.log('Setting start reading at position to %s in %s' %
(self.opts.start_reading_at, item.href))
return
self.log.warn("Failed to find start reading at position: %s"%
self.opts.start_reading_at)
self.log.warn("Failed to find start reading at position: %s" %
self.opts.start_reading_at)
def get_toc_parts_for_xpath(self, expr):
# if an attribute is selected by the xpath expr then truncate it
@@ -148,12 +154,14 @@ class DetectStructure(object):
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid chapter expression, ignoring: %s' %
expr)
return []
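# Illustration of the attribute truncation mentioned above (values are
# hypothetical):
#   expr = '//h:h1/@title'
#   self.get_toc_parts_for_xpath(expr) -> ('//h:h1', 'title')
# i.e. match the <h1> elements, but take each TOC entry's text from the
# element's title attribute instead of its text content.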
if self.opts.chapter:
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
chapter_path, title_attribute = (
self.get_toc_parts_for_xpath(self.opts.chapter))
self.chapter_title_attribute = title_attribute
for item in self.oeb.spine:
for x in find_matches(chapter_path, item.data):
@@ -165,25 +173,28 @@ class DetectStructure(object):
c = collections.Counter()
for item, elem in self.detected_chapters:
c[item] += 1
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
if chapter_mark == 'rule':
mark = elem.makeelement(const.XHTML_HR)
mark = elem.makeelement(base.tag('xhtml', 'hr'))
elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they
# are at the start of the file, in which case inserting a
# page break is unnecessary and can lead to extra blank
# pages in the PDF Output plugin. We need to use two as
# feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression.
# For the first two elements in this item, check if
# they are at the start of the file, in which case
# inserting a page break is unnecessary and can lead
# to extra blank pages in the PDF Output plugin. We
# need to use two as feedbooks epubs match both a
# heading tag and its containing div with the default
# chapter expression.
continue
mark = elem.makeelement(const.XHTML_DIV, style=page_break_after)
mark = elem.makeelement(base.tag('xhtml', 'div'),
style=page_break_after)
else: # chapter_mark == 'both':
mark = elem.makeelement(const.XHTML_HR, style=page_break_before)
mark = elem.makeelement(base.tag('xhtml', 'hr'),
style=page_break_before)
try:
elem.addprevious(mark)
except TypeError:
@@ -196,7 +207,9 @@ class DetectStructure(object):
def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
text, href = self.elem_to_link(item, elem,
self.chapter_title_attribute,
counter)
self.oeb.toc.add(text, href, play_order=counter)
counter += 1
@@ -216,18 +229,21 @@ class DetectStructure(object):
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = xml2text(a)
text = base.xml2text(a)
text = text[:100].strip()
if (not self.opts.duplicate_links_in_toc and
self.oeb.toc.has_text(text)):
continue
try:
self.oeb.toc.add(text, href,
self.oeb.toc.add(
text, href,
play_order=self.oeb.toc.next_play_order())
num += 1
except ValueError:
self.oeb.log.exception('Failed to process link: %r' % href)
continue # Most likely an incorrectly URL encoded link
self.oeb.log.exception('Failed to process link: '
'%r' % href)
# Most likely an incorrectly URL encoded link
continue
if self.opts.max_toc_links > 0 and \
num >= self.opts.max_toc_links:
self.log('Maximum TOC links reached, stopping.')
@@ -238,14 +254,14 @@ class DetectStructure(object):
if title_attribute is not None:
text = elem.get(title_attribute, '')
if not text:
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
id = elem.get('id', 'calibre_toc_%d' % counter)
elem.set('id', id)
href = '#'.join((item.href, id))
return text, href
@@ -260,26 +276,29 @@ class DetectStructure(object):
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid ToC expression, ignoring: %s' % expr)
return []
for document in self.oeb.spine:
previous_level1 = list(added.values())[-1] if added else None
previous_level2 = list(added2.values())[-1] if added2 else None
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
(level1_toc,
level1_title) = self.get_toc_parts_for_xpath(self.opts.level1_toc)
for elem in find_matches(level1_toc, document.data):
text, _href = self.elem_to_link(document, elem, level1_title, counter)
text, _href = self.elem_to_link(document, elem, level1_title,
counter)
counter += 1
if text:
node = self.oeb.toc.add(text, _href,
play_order=self.oeb.toc.next_play_order())
node = self.oeb.toc.add(
text, _href, play_order=self.oeb.toc.next_play_order())
added[elem] = node
# node.add('Top', _href)
if self.opts.level2_toc is not None and added:
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
level2_toc, level2_title = self.get_toc_parts_for_xpath(
self.opts.level2_toc)
for elem in find_matches(level2_toc, document.data):
level1 = None
for item in document.data.iterdescendants():
@@ -290,15 +309,19 @@ class DetectStructure(object):
if previous_level1 is None:
break
level1 = previous_level1
text, _href = self.elem_to_link(document, elem, level2_title, counter)
text, _href = self.elem_to_link(document, elem,
level2_title,
counter)
counter += 1
if text:
added2[elem] = level1.add(text, _href,
added2[elem] = level1.add(
text, _href,
play_order=self.oeb.toc.next_play_order())
break
if self.opts.level3_toc is not None and added2:
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
level3_toc, level3_title = self.get_toc_parts_for_xpath(
self.opts.level3_toc)
for elem in find_matches(level3_toc, document.data):
level2 = None
for item in document.data.iterdescendants():
@@ -309,10 +332,13 @@ class DetectStructure(object):
if previous_level2 is None:
break
level2 = previous_level2
text, _href = \
self.elem_to_link(document, elem, level3_title, counter)
text, _href = self.elem_to_link(document,
elem,
level3_title,
counter)
counter += 1
if text:
level2.add(text, _href,
play_order=self.oeb.toc.next_play_order())
break
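Pulling the three levels together, a sketch of the options that drive this method (the attribute names follow the opts usage above; oeb, opts and the XPath values are illustrative):

opts.level1_toc = '//h:h2'           # chapters
opts.level2_toc = '//h:h3'           # sections, nested under the last h2
opts.level3_toc = '//h:h4/@title'    # sub-sections, titled by attribute
DetectStructure()(oeb, opts)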