mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-01 14:15:54 +01:00
Fixing leftovers from first concept of constants
This commit is contained in:
@@ -1,14 +1,16 @@
|
|||||||
|
import collections
|
||||||
|
import functools
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from collections import defaultdict, namedtuple
|
|
||||||
from functools import wraps
|
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter import prints
|
from ebook_converter import prints
|
||||||
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors
|
from ebook_converter.ebooks.metadata import authors_to_string
|
||||||
from ebook_converter.ebooks.metadata.book.base import Metadata
|
from ebook_converter.ebooks.metadata import check_isbn
|
||||||
|
from ebook_converter.ebooks.metadata import string_to_authors
|
||||||
|
from ebook_converter.ebooks.metadata.book import base
|
||||||
from ebook_converter.ebooks.metadata.book.json_codec import (
|
from ebook_converter.ebooks.metadata.book.json_codec import (
|
||||||
decode_is_multiple, encode_is_multiple, object_to_unicode
|
decode_is_multiple, encode_is_multiple, object_to_unicode
|
||||||
)
|
)
|
||||||
@@ -17,17 +19,30 @@ from ebook_converter.ebooks.metadata.utils import (
|
|||||||
pretty_print_opf
|
pretty_print_opf
|
||||||
)
|
)
|
||||||
from ebook_converter.utils.config import from_json, to_json
|
from ebook_converter.utils.config import from_json, to_json
|
||||||
from ebook_converter.utils.date import (
|
from ebook_converter.utils.date import (fix_only_date, is_date_undefined,
|
||||||
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow,
|
isoformat, parse_date as parse_date_,
|
||||||
w3cdtf
|
utcnow, w3cdtf)
|
||||||
)
|
|
||||||
from ebook_converter.utils.iso8601 import parse_iso8601
|
from ebook_converter.utils.iso8601 import parse_iso8601
|
||||||
from ebook_converter.utils.localization import canonicalize_lang
|
from ebook_converter.utils.localization import canonicalize_lang
|
||||||
|
|
||||||
|
|
||||||
|
# Prefix -> URI table that read_prefixes() starts from and that
# ensure_prefix() never writes back into the ``prefix`` attribute.
# See http://www.idpf.org/epub/vocab/package/pfx/
RES_PREFIXES = {
    'dcterms': 'http://purl.org/dc/terms/',
    'epubsc': 'http://idpf.org/epub/vocab/sc/#',
    'marc': 'http://id.loc.gov/vocabulary/',
    'media': 'http://www.idpf.org/epub/vocab/overlays/#',
    'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#',
    'rendition': 'http://www.idpf.org/vocab/rendition/#',
    'schema': 'http://schema.org/',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
}

CALIBRE_PREFIX = 'https://calibre-ebook.com'

# RES_PREFIXES plus the calibre specific prefix. A separate dict, so adding
# the 'calibre' key does not leak into RES_PREFIXES.
KNOWN_PREFIXES = RES_PREFIXES.copy()
KNOWN_PREFIXES['calibre'] = CALIBRE_PREFIX

# Utils {{{
# Memoisation tables: expression string -> compiled XPath, and
# (pattern, flags) -> compiled regular expression.
_XPATH_CACHE = {}
_RE_CACHE = {}
|
||||||
|
|
||||||
|
|
||||||
def uniq(vals):
|
def uniq(vals):
|
||||||
@@ -39,22 +54,23 @@ def uniq(vals):
|
|||||||
|
|
||||||
|
|
||||||
def dump_dict(cats):
    """Serialize ``cats`` to a JSON string.

    A falsy ``cats`` (``None``, empty dict) is serialized as an empty dict.
    The value is first passed through ``object_to_unicode``. Non-ASCII
    characters are written verbatim (``ensure_ascii=False``) and dict keys
    that JSON cannot represent are dropped instead of raising
    (``skipkeys=True``).
    """
    return json.dumps(object_to_unicode(cats or {}), ensure_ascii=False,
                      skipkeys=True)
|
||||||
|
|
||||||
|
|
||||||
def XPath(x):
    """Return a compiled ``etree.XPath`` object for the expression ``x``.

    The expression is compiled with the ``const.OPF2_NSMAP`` namespace map.
    Compiled objects are memoised in ``_XPATH_CACHE`` keyed by the expression
    string, so every distinct expression is compiled only once.
    """
    try:
        return _XPATH_CACHE[x]
    except KeyError:
        _XPATH_CACHE[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
        return ans
|
||||||
|
|
||||||
|
|
||||||
def regex(r, flags=0):
    """Return the compiled regular expression for pattern ``r``.

    Compiled patterns are memoised in ``_RE_CACHE`` keyed by the
    ``(pattern, flags)`` pair.
    """
    try:
        return _RE_CACHE[(r, flags)]
    except KeyError:
        _RE_CACHE[(r, flags)] = ans = re.compile(r, flags)
        return ans
|
||||||
|
|
||||||
|
|
||||||
@@ -82,7 +98,7 @@ def properties_for_id(item_id, refines):
|
|||||||
|
|
||||||
|
|
||||||
def properties_for_id_with_scheme(item_id, prefixes, refines):
|
def properties_for_id_with_scheme(item_id, prefixes, refines):
|
||||||
ans = defaultdict(list)
|
ans = collections.defaultdict(list)
|
||||||
if item_id:
|
if item_id:
|
||||||
for elem in refines[item_id]:
|
for elem in refines[item_id]:
|
||||||
key = elem.get('property')
|
key = elem.get('property')
|
||||||
@@ -126,7 +142,7 @@ def normalize_whitespace(text):
|
|||||||
|
|
||||||
|
|
||||||
def simple_text(f):
    """Decorator passing the return value of ``f`` through
    ``normalize_whitespace``.

    The wrapper forwards all positional and keyword arguments unchanged and
    keeps the metadata (name, docstring) of ``f``.
    """
    @functools.wraps(f)
    def wrapper(*args, **kw):
        return normalize_whitespace(f(*args, **kw))
    return wrapper
|
||||||
@@ -135,7 +151,7 @@ def simple_text(f):
|
|||||||
def items_with_property(root, q, prefixes=None):
|
def items_with_property(root, q, prefixes=None):
|
||||||
if prefixes is None:
|
if prefixes is None:
|
||||||
prefixes = read_prefixes(root)
|
prefixes = read_prefixes(root)
|
||||||
q = expand_prefix(q, known_prefixes).lower()
|
q = expand_prefix(q, KNOWN_PREFIXES).lower()
|
||||||
for item in XPath("./opf:manifest/opf:item[@properties]")(root):
|
for item in XPath("./opf:manifest/opf:item[@properties]")(root):
|
||||||
for prop in (item.get('properties') or '').lower().split():
|
for prop in (item.get('properties') or '').lower().split():
|
||||||
prop = expand_prefix(prop, prefixes)
|
prop = expand_prefix(prop, prefixes)
|
||||||
@@ -150,43 +166,32 @@ def items_with_property(root, q, prefixes=None):
|
|||||||
# http://www.idpf.org/epub/vocab/package/pfx/
|
# http://www.idpf.org/epub/vocab/package/pfx/
|
||||||
|
|
||||||
|
|
||||||
reserved_prefixes = {
|
|
||||||
'dcterms': 'http://purl.org/dc/terms/',
|
|
||||||
'epubsc': 'http://idpf.org/epub/vocab/sc/#',
|
|
||||||
'marc': 'http://id.loc.gov/vocabulary/',
|
|
||||||
'media': 'http://www.idpf.org/epub/vocab/overlays/#',
|
|
||||||
'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#',
|
|
||||||
'rendition':'http://www.idpf.org/vocab/rendition/#',
|
|
||||||
'schema': 'http://schema.org/',
|
|
||||||
'xsd': 'http://www.w3.org/2001/XMLSchema#',
|
|
||||||
}
|
|
||||||
|
|
||||||
CALIBRE_PREFIX = 'https://calibre-ebook.com'
|
|
||||||
known_prefixes = reserved_prefixes.copy()
|
|
||||||
known_prefixes['calibre'] = CALIBRE_PREFIX
|
|
||||||
|
|
||||||
|
|
||||||
def parse_prefixes(x):
    """Parse a string of ``name: value`` pairs into a dict.

    Every match of "non-whitespace run, colon, one space, optional further
    whitespace, non-whitespace run" contributes one ``name -> value`` entry.
    When a name occurs more than once the last occurrence wins. An empty
    string gives an empty dict.
    """
    return {m.group(1): m.group(2)
            for m in re.finditer(r'(\S+): \s*(\S+)', x)}
|
||||||
|
|
||||||
|
|
||||||
def read_prefixes(root):
    """Return the prefix -> URI mapping in effect for ``root``.

    Starts from a copy of ``RES_PREFIXES`` and overlays whatever is declared
    in the ``prefix`` attribute of ``root`` (a missing attribute is treated
    as an empty declaration). ``RES_PREFIXES`` itself is not modified.
    """
    ans = RES_PREFIXES.copy()
    ans.update(parse_prefixes(root.get('prefix') or ''))
    return ans
|
||||||
|
|
||||||
|
|
||||||
def expand_prefix(raw, prefixes):
    """Replace every ``prefix:name`` in ``raw`` using the ``prefixes`` map.

    For each ``<prefix> : <name>`` match the prefix is looked up in
    ``prefixes``; unknown prefixes are kept as they are. The result is
    re-joined as ``<expanded>:<name>``, i.e. whitespace around the colon is
    dropped. A falsy ``raw`` is treated as an empty string.
    """
    def expand(match):
        prefix, name = match.group(1), match.group(2)
        return prefixes.get(prefix, prefix) + ':' + name

    return regex(r'(\S+)\s*:\s*(\S+)').sub(expand, raw or '')
|
||||||
|
|
||||||
|
|
||||||
def ensure_prefix(root, prefixes, prefix, value=None):
    """Make sure ``prefix`` is declared on ``root`` and rewrite its
    ``prefix`` attribute.

    ``prefixes`` is the current prefix map; when ``None`` it is read from
    ``root``. When a dict is passed, the new entry is stored in it in place,
    so the caller's map sees the addition. ``value`` is the URI for
    ``prefix``; when falsy, the URI is taken from ``RES_PREFIXES``, which
    raises ``KeyError`` for a prefix that is not in that table.

    Only entries that differ from ``RES_PREFIXES`` are written to the
    attribute. If nothing is left to write, the attribute is removed.
    """
    if prefixes is None:
        prefixes = read_prefixes(root)
    prefixes[prefix] = value or RES_PREFIXES[prefix]
    # From here on work with a filtered copy; the caller's dict keeps all
    # entries.
    prefixes = {k: v for k, v in prefixes.items()
                if RES_PREFIXES.get(k) != v}
    if prefixes:
        root.set('prefix', ' '.join('%s: %s' % (k, v)
                                    for k, v in prefixes.items()))
    else:
        root.attrib.pop('prefix', None)
|
||||||
|
|
||||||
@@ -196,7 +201,7 @@ def ensure_prefix(root, prefixes, prefix, value=None):
|
|||||||
|
|
||||||
|
|
||||||
def read_refines(root):
|
def read_refines(root):
|
||||||
ans = defaultdict(list)
|
ans = collections.defaultdict(list)
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
|
||||||
r = meta.get('refines') or ''
|
r = meta.get('refines') or ''
|
||||||
if r.startswith('#'):
|
if r.startswith('#'):
|
||||||
@@ -213,7 +218,7 @@ def set_refines(elem, existing_refines, *new_refines):
|
|||||||
remove_refines(elem, existing_refines)
|
remove_refines(elem, existing_refines)
|
||||||
for ref in reversed(new_refines):
|
for ref in reversed(new_refines):
|
||||||
prop, val, scheme = ref
|
prop, val, scheme = ref
|
||||||
r = elem.makeelement(const.OPF_META)
|
r = elem.makeelement(base.tag('opf', 'meta'))
|
||||||
r.set('refines', '#' + eid), r.set('property', prop)
|
r.set('refines', '#' + eid), r.set('property', prop)
|
||||||
r.text = val.strip()
|
r.text = val.strip()
|
||||||
if scheme:
|
if scheme:
|
||||||
@@ -249,7 +254,7 @@ def parse_identifier(ident, val, refines):
|
|||||||
# Try the OPF 2 style opf:scheme attribute, which will be present, for
|
# Try the OPF 2 style opf:scheme attribute, which will be present, for
|
||||||
# example, in EPUB 3 files that have had their metadata set by an
|
# example, in EPUB 3 files that have had their metadata set by an
|
||||||
# application that only understands EPUB 2.
|
# application that only understands EPUB 2.
|
||||||
scheme = ident.get(const.OPF_SCHEME)
|
scheme = ident.get(base.tag('opf', 'scheme'))
|
||||||
if scheme and not lval.startswith('urn:'):
|
if scheme and not lval.startswith('urn:'):
|
||||||
return finalize(scheme, val)
|
return finalize(scheme, val)
|
||||||
|
|
||||||
@@ -267,7 +272,7 @@ def parse_identifier(ident, val, refines):
|
|||||||
|
|
||||||
|
|
||||||
def read_identifiers(root, prefixes, refines):
|
def read_identifiers(root, prefixes, refines):
|
||||||
ans = defaultdict(list)
|
ans = collections.defaultdict(list)
|
||||||
for ident in XPath('./opf:metadata/dc:identifier')(root):
|
for ident in XPath('./opf:metadata/dc:identifier')(root):
|
||||||
val = (ident.text or '').strip()
|
val = (ident.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
@@ -277,7 +282,8 @@ def read_identifiers(root, prefixes, refines):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=False):
|
def set_identifiers(root, prefixes, refines, new_identifiers,
|
||||||
|
force_identifiers=False):
|
||||||
uid = root.get('unique-identifier')
|
uid = root.get('unique-identifier')
|
||||||
package_identifier = None
|
package_identifier = None
|
||||||
for ident in XPath('./opf:metadata/dc:identifier')(root):
|
for ident in XPath('./opf:metadata/dc:identifier')(root):
|
||||||
@@ -289,12 +295,15 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=
|
|||||||
ident.getparent().remove(ident)
|
ident.getparent().remove(ident)
|
||||||
continue
|
continue
|
||||||
scheme, val = parse_identifier(ident, val, refines)
|
scheme, val = parse_identifier(ident, val, refines)
|
||||||
if not scheme or not val or force_identifiers or scheme in new_identifiers:
|
if (not scheme or
|
||||||
|
not val or
|
||||||
|
force_identifiers or
|
||||||
|
scheme in new_identifiers):
|
||||||
remove_element(ident, refines)
|
remove_element(ident, refines)
|
||||||
continue
|
continue
|
||||||
metadata = XPath('./opf:metadata')(root)[0]
|
metadata = XPath('./opf:metadata')(root)[0]
|
||||||
for scheme, val in new_identifiers.items():
|
for scheme, val in new_identifiers.items():
|
||||||
ident = metadata.makeelement(const.DC_IDENT)
|
ident = metadata.makeelement(base.tag('dc', 'ident'))
|
||||||
ident.text = '%s:%s' % (scheme, val)
|
ident.text = '%s:%s' % (scheme, val)
|
||||||
if package_identifier is None:
|
if package_identifier is None:
|
||||||
metadata.append(ident)
|
metadata.append(ident)
|
||||||
@@ -312,11 +321,12 @@ def identifier_writer(name):
|
|||||||
if is_package_id:
|
if is_package_id:
|
||||||
package_identifier = ident
|
package_identifier = ident
|
||||||
val = (ident.text or '').strip()
|
val = (ident.text or '').strip()
|
||||||
if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id:
|
if (val.startswith(name + ':') or
|
||||||
|
ident.get(base.tag('opf', 'scheme')) == name) and not is_package_id:
|
||||||
remove_element(ident, refines)
|
remove_element(ident, refines)
|
||||||
metadata = XPath('./opf:metadata')(root)[0]
|
metadata = XPath('./opf:metadata')(root)[0]
|
||||||
if ival:
|
if ival:
|
||||||
ident = metadata.makeelement(const.DC_IDENT)
|
ident = metadata.makeelement(base.tag('dc', 'ident'))
|
||||||
ident.text = '%s:%s' % (name, ival)
|
ident.text = '%s:%s' % (name, ival)
|
||||||
if package_identifier is None:
|
if package_identifier is None:
|
||||||
metadata.append(ident)
|
metadata.append(ident)
|
||||||
@@ -366,7 +376,8 @@ def read_title_sort(root, prefixes, refines):
|
|||||||
if fa:
|
if fa:
|
||||||
return fa
|
return fa
|
||||||
# Look for OPF 2.0 style title_sort
|
# Look for OPF 2.0 style title_sort
|
||||||
for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root):
|
for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
|
||||||
|
'title_sort"]')(root):
|
||||||
ans = m.get('content')
|
ans = m.get('content')
|
||||||
if ans:
|
if ans:
|
||||||
return ans
|
return ans
|
||||||
@@ -376,12 +387,13 @@ def set_title(root, prefixes, refines, title, title_sort=None):
|
|||||||
main_title = find_main_title(root, refines, remove_blanks=True)
|
main_title = find_main_title(root, refines, remove_blanks=True)
|
||||||
if main_title is None:
|
if main_title is None:
|
||||||
m = XPath('./opf:metadata')(root)[0]
|
m = XPath('./opf:metadata')(root)[0]
|
||||||
main_title = m.makeelement(const.DC_TITLE)
|
main_title = m.makeelement(base.tag('dc', 'title'))
|
||||||
m.insert(0, main_title)
|
m.insert(0, main_title)
|
||||||
main_title.text = title or None
|
main_title.text = title or None
|
||||||
ts = [refdef('file-as', title_sort)] if title_sort else ()
|
ts = [refdef('file-as', title_sort)] if title_sort else ()
|
||||||
set_refines(main_title, refines, refdef('title-type', 'main'), *ts)
|
set_refines(main_title, refines, refdef('title-type', 'main'), *ts)
|
||||||
for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root):
|
for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
|
||||||
|
'title_sort"]')(root):
|
||||||
remove_element(m, refines)
|
remove_element(m, refines)
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
@@ -405,28 +417,32 @@ def set_languages(root, prefixes, refines, languages):
|
|||||||
val = (lang.text or '').strip()
|
val = (lang.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
opf_languages.append(val)
|
opf_languages.append(val)
|
||||||
languages = list(filter(lambda x: x and x != 'und', normalize_languages(opf_languages, languages)))
|
languages = list(filter(lambda x: x and x != 'und',
|
||||||
|
normalize_languages(opf_languages, languages)))
|
||||||
if not languages:
|
if not languages:
|
||||||
# EPUB spec says dc:language is required
|
# EPUB spec says dc:language is required
|
||||||
languages = ['und']
|
languages = ['und']
|
||||||
metadata = XPath('./opf:metadata')(root)[0]
|
metadata = XPath('./opf:metadata')(root)[0]
|
||||||
for lang in uniq(languages):
|
for lang in uniq(languages):
|
||||||
l = metadata.makeelement(const.DC_LANG)
|
dc_lang = metadata.makeelement(base.tag('dc', 'lang'))
|
||||||
l.text = lang
|
dc_lang.text = lang
|
||||||
metadata.append(l)
|
metadata.append(dc_lang)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# Creator/Contributor {{{
|
# Creator/Contributor {{{
|
||||||
|
|
||||||
|
|
||||||
Author = namedtuple('Author', 'name sort')
|
Author = collections.namedtuple('Author', 'name sort')
|
||||||
|
|
||||||
|
|
||||||
def is_relators_role(props, q):
    """Tell whether ``props`` contains the role ``q``.

    ``props`` is a mapping whose ``'role'`` entry is an iterable of
    ``(scheme_ns, scheme, role)`` triples; falsy items are ignored. A triple
    matches when its role equals ``q`` case-insensitively (``q`` is expected
    to be lower case) and its scheme is either unspecified (``scheme_ns`` is
    ``None``) or is ``relators`` in the ``marc`` namespace.

    A missing or empty ``'role'`` entry gives ``False``.
    """
    # ``or ()`` guards against a missing key: ``.get`` would return None,
    # which is not iterable.
    for role in props.get('role') or ():
        if role:
            scheme_ns, scheme, role = role
            if (role.lower() == q and
                    (scheme_ns is None or
                     (scheme_ns, scheme) == (RES_PREFIXES['marc'],
                                             'relators'))):
                return True
    return False
|
||||||
|
|
||||||
@@ -440,15 +456,16 @@ def read_authors(root, prefixes, refines):
|
|||||||
if file_as:
|
if file_as:
|
||||||
aus = file_as[0][-1]
|
aus = file_as[0][-1]
|
||||||
else:
|
else:
|
||||||
aus = item.get(const.OPF_FILE_AS) or None
|
aus = item.get(base.tag('opf', 'file_as')) or None
|
||||||
return Author(normalize_whitespace(val), normalize_whitespace(aus))
|
return Author(normalize_whitespace(val), normalize_whitespace(aus))
|
||||||
|
|
||||||
for item in XPath('./opf:metadata/dc:creator')(root):
|
for item in XPath('./opf:metadata/dc:creator')(root):
|
||||||
val = (item.text or '').strip()
|
val = (item.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
|
props = properties_for_id_with_scheme(item.get('id'), prefixes,
|
||||||
|
refines)
|
||||||
role = props.get('role')
|
role = props.get('role')
|
||||||
opf_role = item.get(const.OPF_ROLE)
|
opf_role = item.get(base.tag('opf', 'role'))
|
||||||
if role:
|
if role:
|
||||||
if is_relators_role(props, 'aut'):
|
if is_relators_role(props, 'aut'):
|
||||||
roled_authors.append(author(item, props, val))
|
roled_authors.append(author(item, props, val))
|
||||||
@@ -464,23 +481,30 @@ def read_authors(root, prefixes, refines):
|
|||||||
def set_authors(root, prefixes, refines, authors):
    """Replace the author ``dc:creator`` elements of ``root``.

    Existing ``dc:creator`` elements are removed, except those which are
    explicitly marked with a role other than ``aut`` (either through the
    ``opf:role`` attribute or through a refining ``role`` property).

    Then, for every item of ``authors`` with a non empty ``name``, a new
    ``dc:creator`` is appended together with a refining ``role`` meta
    (``aut``, scheme ``marc:relators``) and, if ``sort`` is set, a refining
    ``file-as`` meta. Items are expected to have ``name`` and ``sort``
    attributes.
    """
    # The role metas written below use the marc:relators scheme.
    ensure_prefix(root, prefixes, 'marc')
    for item in XPath('./opf:metadata/dc:creator')(root):
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        opf_role = item.get(base.tag('opf', 'role'))
        if ((opf_role and opf_role.lower() != 'aut') or
                (props.get('role') and not is_relators_role(props, 'aut'))):
            # A creator with some other role: leave it alone.
            continue
        remove_element(item, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    for author in authors:
        if author.name:
            a = metadata.makeelement(base.tag('dc', 'creator'))
            aid = ensure_id(a)
            a.text = author.name
            metadata.append(a)
            m = metadata.makeelement(base.tag('opf', 'meta'),
                                     attrib={'refines': '#' + aid,
                                             'property': 'role',
                                             'scheme': 'marc:relators'})
            m.text = 'aut'
            metadata.append(m)
            if author.sort:
                m = metadata.makeelement(base.tag('opf', 'meta'),
                                         attrib={'refines': '#' + aid,
                                                 'property': 'file-as'})
                m.text = author.sort
                metadata.append(m)
|
||||||
|
|
||||||
@@ -490,9 +514,10 @@ def read_book_producers(root, prefixes, refines):
|
|||||||
for item in XPath('./opf:metadata/dc:contributor')(root):
|
for item in XPath('./opf:metadata/dc:contributor')(root):
|
||||||
val = (item.text or '').strip()
|
val = (item.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
|
props = properties_for_id_with_scheme(item.get('id'), prefixes,
|
||||||
|
refines)
|
||||||
role = props.get('role')
|
role = props.get('role')
|
||||||
opf_role = item.get(const.OPF_ROLE)
|
opf_role = item.get(base.tag('opf', 'role'))
|
||||||
if role:
|
if role:
|
||||||
if is_relators_role(props, 'bkp'):
|
if is_relators_role(props, 'bkp'):
|
||||||
ans.append(normalize_whitespace(val))
|
ans.append(normalize_whitespace(val))
|
||||||
@@ -503,19 +528,24 @@ def read_book_producers(root, prefixes, refines):
|
|||||||
|
|
||||||
def set_book_producers(root, prefixes, refines, producers):
    """Replace the book producer ``dc:contributor`` elements of ``root``.

    Existing ``dc:contributor`` elements are removed, except those which are
    explicitly marked with a role other than ``bkp`` (either through the
    ``opf:role`` attribute or through a refining ``role`` property).

    Then, for every non empty item of ``producers``, a new
    ``dc:contributor`` is appended together with a refining ``role`` meta
    (``bkp``, scheme ``marc:relators``).
    """
    for item in XPath('./opf:metadata/dc:contributor')(root):
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        opf_role = item.get(base.tag('opf', 'role'))
        if ((opf_role and opf_role.lower() != 'bkp') or
                (props.get('role') and not is_relators_role(props, 'bkp'))):
            # A contributor with some other role: leave it alone.
            continue
        remove_element(item, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    for bkp in producers:
        if bkp:
            a = metadata.makeelement(base.tag('dc', 'contributor'))
            aid = ensure_id(a)
            a.text = bkp
            metadata.append(a)
            m = metadata.makeelement(base.tag('opf', 'meta'),
                                     attrib={'refines': '#' + aid,
                                             'property': 'role',
                                             'scheme': 'marc:relators'})
            m.text = 'bkp'
            metadata.append(m)
|
||||||
# }}}
|
# }}}
|
||||||
@@ -531,7 +561,9 @@ def parse_date(raw, is_w3cdtf=False):
|
|||||||
ans = fix_only_date(ans)
|
ans = fix_only_date(ans)
|
||||||
else:
|
else:
|
||||||
ans = parse_date_(raw, assume_utc=True)
|
ans = parse_date_(raw, assume_utc=True)
|
||||||
if ' ' not in raw and 'T' not in raw and (ans.hour, ans.minute, ans.second) == (0, 0, 0):
|
if (' ' not in raw and
|
||||||
|
'T' not in raw and
|
||||||
|
(ans.hour, ans.minute, ans.second) == (0, 0, 0)):
|
||||||
ans = fix_only_date(ans)
|
ans = fix_only_date(ans)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
@@ -552,14 +584,14 @@ def set_pubdate(root, prefixes, refines, val):
|
|||||||
if not is_date_undefined(val):
|
if not is_date_undefined(val):
|
||||||
val = isoformat(val)
|
val = isoformat(val)
|
||||||
m = XPath('./opf:metadata')(root)[0]
|
m = XPath('./opf:metadata')(root)[0]
|
||||||
d = m.makeelement(const.DC_DATE)
|
d = m.makeelement(base.tag('dc', 'date'))
|
||||||
d.text = val
|
d.text = val
|
||||||
m.append(d)
|
m.append(d)
|
||||||
|
|
||||||
|
|
||||||
def read_timestamp(root, prefixes, refines):
|
def read_timestamp(root, prefixes, refines):
|
||||||
pq = '%s:timestamp' % CALIBRE_PREFIX
|
pq = '%s:timestamp' % CALIBRE_PREFIX
|
||||||
sq = '%s:w3cdtf' % reserved_prefixes['dcterms']
|
sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
||||||
val = (meta.text or '').strip()
|
val = (meta.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
@@ -570,7 +602,8 @@ def read_timestamp(root, prefixes, refines):
|
|||||||
return parse_date(val, is_w3cdtf=scheme == sq)
|
return parse_date(val, is_w3cdtf=scheme == sq)
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:timestamp"]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:'
|
||||||
|
'timestamp"]')(root):
|
||||||
val = meta.get('content')
|
val = meta.get('content')
|
||||||
if val:
|
if val:
|
||||||
try:
|
try:
|
||||||
@@ -584,7 +617,9 @@ def create_timestamp(root, prefixes, m, val):
|
|||||||
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
||||||
ensure_prefix(root, prefixes, 'dcterms')
|
ensure_prefix(root, prefixes, 'dcterms')
|
||||||
val = w3cdtf(val)
|
val = w3cdtf(val)
|
||||||
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
|
d = m.makeelement(base.tag('opf', 'meta'),
|
||||||
|
attrib={'property': 'calibre:timestamp',
|
||||||
|
'scheme': 'dcterms:W3CDTF'})
|
||||||
d.text = val
|
d.text = val
|
||||||
m.append(d)
|
m.append(d)
|
||||||
|
|
||||||
@@ -599,8 +634,8 @@ def set_timestamp(root, prefixes, refines, val):
|
|||||||
|
|
||||||
|
|
||||||
def read_last_modified(root, prefixes, refines):
|
def read_last_modified(root, prefixes, refines):
|
||||||
pq = '%s:modified' % reserved_prefixes['dcterms']
|
pq = '%s:modified' % RES_PREFIXES['dcterms']
|
||||||
sq = '%s:w3cdtf' % reserved_prefixes['dcterms']
|
sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
||||||
val = (meta.text or '').strip()
|
val = (meta.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
@@ -614,7 +649,7 @@ def read_last_modified(root, prefixes, refines):
|
|||||||
|
|
||||||
|
|
||||||
def set_last_modified(root, prefixes, refines, val=None):
|
def set_last_modified(root, prefixes, refines, val=None):
|
||||||
pq = '%s:modified' % reserved_prefixes['dcterms']
|
pq = '%s:modified' % RES_PREFIXES['dcterms']
|
||||||
val = w3cdtf(val or utcnow())
|
val = w3cdtf(val or utcnow())
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
||||||
prop = expand_prefix(meta.get('property'), prefixes)
|
prop = expand_prefix(meta.get('property'), prefixes)
|
||||||
@@ -625,7 +660,9 @@ def set_last_modified(root, prefixes, refines, val=None):
|
|||||||
else:
|
else:
|
||||||
ensure_prefix(root, prefixes, 'dcterms')
|
ensure_prefix(root, prefixes, 'dcterms')
|
||||||
m = XPath('./opf:metadata')(root)[0]
|
m = XPath('./opf:metadata')(root)[0]
|
||||||
meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
|
meta = m.makeelement(base.tag('opf', 'meta'),
|
||||||
|
attrib={'property': 'dcterms:modified',
|
||||||
|
'scheme': 'dcterms:W3CDTF'})
|
||||||
m.append(meta)
|
m.append(meta)
|
||||||
meta.text = val
|
meta.text = val
|
||||||
# }}}
|
# }}}
|
||||||
@@ -648,7 +685,7 @@ def set_comments(root, prefixes, refines, val):
|
|||||||
if val:
|
if val:
|
||||||
val = val.strip()
|
val = val.strip()
|
||||||
if val:
|
if val:
|
||||||
c = m.makeelement(const.DC_DESC)
|
c = m.makeelement(base.tag('dc', 'desc'))
|
||||||
c.text = val
|
c.text = val
|
||||||
m.append(c)
|
m.append(c)
|
||||||
# }}}
|
# }}}
|
||||||
@@ -670,7 +707,7 @@ def set_publisher(root, prefixes, refines, val):
|
|||||||
if val:
|
if val:
|
||||||
val = val.strip()
|
val = val.strip()
|
||||||
if val:
|
if val:
|
||||||
c = m.makeelement(const.DC_PUBLISHER('publisher'))
|
c = m.makeelement(base.tag('dc', 'publisher'))
|
||||||
c.text = normalize_whitespace(val)
|
c.text = normalize_whitespace(val)
|
||||||
m.append(c)
|
m.append(c)
|
||||||
# }}}
|
# }}}
|
||||||
@@ -693,7 +730,7 @@ def set_tags(root, prefixes, refines, val):
|
|||||||
if val:
|
if val:
|
||||||
val = uniq(list(filter(None, val)))
|
val = uniq(list(filter(None, val)))
|
||||||
for x in val:
|
for x in val:
|
||||||
c = m.makeelement(const.DC_SUBJ)
|
c = m.makeelement(base.tag('dc', 'subj'))
|
||||||
c.text = normalize_whitespace(x)
|
c.text = normalize_whitespace(x)
|
||||||
if c.text:
|
if c.text:
|
||||||
m.append(c)
|
m.append(c)
|
||||||
@@ -725,7 +762,7 @@ def read_rating(root, prefixes, refines):
|
|||||||
def create_rating(root, prefixes, val):
    """Append a ``calibre:rating`` meta element with text ``val`` to the
    metadata section of ``root``.

    The ``calibre`` prefix is declared on ``root`` first, as the property
    name depends on it.
    """
    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    m = XPath('./opf:metadata')(root)[0]
    d = m.makeelement(base.tag('opf', 'meta'),
                      attrib={'property': 'calibre:rating'})
    d.text = val
    m.append(d)
|
||||||
|
|
||||||
@@ -747,7 +784,8 @@ def set_rating(root, prefixes, refines, val):
|
|||||||
|
|
||||||
def read_series(root, prefixes, refines):
|
def read_series(root, prefixes, refines):
|
||||||
series_index = 1.0
|
series_index = 1.0
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection" and @id]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@property="'
|
||||||
|
'belongs-to-collection" and @id]')(root):
|
||||||
val = (meta.text or '').strip()
|
val = (meta.text or '').strip()
|
||||||
if val:
|
if val:
|
||||||
props = properties_for_id(meta.get('id'), refines)
|
props = properties_for_id(meta.get('id'), refines)
|
||||||
@@ -757,13 +795,15 @@ def read_series(root, prefixes, refines):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return normalize_whitespace(val), series_index
|
return normalize_whitespace(val), series_index
|
||||||
for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]/@content')(root):
|
for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]'
|
||||||
|
'/@content')(root):
|
||||||
try:
|
try:
|
||||||
series_index = float(si)
|
series_index = float(si)
|
||||||
break
|
break
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]/@content')(root):
|
for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]'
|
||||||
|
'/@content')(root):
|
||||||
s = normalize_whitespace(s)
|
s = normalize_whitespace(s)
|
||||||
if s:
|
if s:
|
||||||
return s, series_index
|
return s, series_index
|
||||||
@@ -772,16 +812,20 @@ def read_series(root, prefixes, refines):
|
|||||||
|
|
||||||
def create_series(root, refines, series, series_index):
    """Append a ``belongs-to-collection`` meta element for ``series`` to the
    metadata section of ``root``.

    The new element is refined with ``collection-type`` set to ``series``
    and ``group-position`` set to ``series_index``.
    """
    m = XPath('./opf:metadata')(root)[0]
    d = m.makeelement(base.tag('opf', 'meta'),
                      attrib={'property': 'belongs-to-collection'})
    d.text = series
    # The element has to be in the tree before refines are attached to it.
    m.append(d)
    set_refines(d, refines, refdef('collection-type', 'series'),
                refdef('group-position', series_index))
|
||||||
|
|
||||||
|
|
||||||
def set_series(root, prefixes, refines, series, series_index):
|
def set_series(root, prefixes, refines, series, series_index):
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or @name="calibre:series_index"]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or '
|
||||||
|
'@name="calibre:series_index"]')(root):
|
||||||
remove_element(meta, refines)
|
remove_element(meta, refines)
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection"]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@property="'
|
||||||
|
'belongs-to-collection"]')(root):
|
||||||
remove_element(meta, refines)
|
remove_element(meta, refines)
|
||||||
if series:
|
if series:
|
||||||
create_series(root, refines, series, '%.2g' % series_index)
|
create_series(root, refines, series, '%.2g' % series_index)
|
||||||
@@ -806,7 +850,8 @@ def dict_reader(name, load=json.loads, try2=True):
|
|||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
if try2:
|
if try2:
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root):
|
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
|
||||||
|
name)(root):
|
||||||
val = meta.get('content')
|
val = meta.get('content')
|
||||||
if val:
|
if val:
|
||||||
try:
|
try:
|
||||||
@@ -827,7 +872,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
|
|||||||
|
|
||||||
def writer(root, prefixes, refines, val):
|
def writer(root, prefixes, refines, val):
|
||||||
if remove2:
|
if remove2:
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root):
|
for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
|
||||||
|
name)(root):
|
||||||
remove_element(meta, refines)
|
remove_element(meta, refines)
|
||||||
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
|
||||||
prop = expand_prefix(meta.get('property'), prefixes)
|
prop = expand_prefix(meta.get('property'), prefixes)
|
||||||
@@ -836,7 +882,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
|
|||||||
if val:
|
if val:
|
||||||
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
||||||
m = XPath('./opf:metadata')(root)[0]
|
m = XPath('./opf:metadata')(root)[0]
|
||||||
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name})
|
d = m.makeelement(base.tag('opf', 'meta'),
|
||||||
|
attrib={'property': 'calibre:%s' % name})
|
||||||
d.text = serialize(val)
|
d.text = serialize(val)
|
||||||
m.append(d)
|
m.append(d)
|
||||||
return writer
|
return writer
|
||||||
@@ -855,12 +902,14 @@ def deserialize_user_metadata(val):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
read_user_metadata3 = dict_reader('user_metadata', load=deserialize_user_metadata, try2=False)
|
read_user_metadata3 = dict_reader('user_metadata',
|
||||||
|
load=deserialize_user_metadata, try2=False)
|
||||||
|
|
||||||
|
|
||||||
def read_user_metadata2(root, remove_tags=False):
|
def read_user_metadata2(root, remove_tags=False):
|
||||||
ans = {}
|
ans = {}
|
||||||
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
|
||||||
|
'"calibre:user_metadata:")]')(root):
|
||||||
name = meta.get('name')
|
name = meta.get('name')
|
||||||
name = ':'.join(name.split(':')[2:])
|
name = ':'.join(name.split(':')[2:])
|
||||||
if not name or not name.startswith('#'):
|
if not name or not name.startswith('#'):
|
||||||
@@ -881,18 +930,23 @@ def read_user_metadata2(root, remove_tags=False):
|
|||||||
|
|
||||||
|
|
||||||
def read_user_metadata(root, prefixes, refines):
|
def read_user_metadata(root, prefixes, refines):
|
||||||
return read_user_metadata3(root, prefixes, refines) or read_user_metadata2(root)
|
return read_user_metadata3(root, prefixes,
|
||||||
|
refines) or read_user_metadata2(root)
|
||||||
|
|
||||||
|
|
||||||
def serialize_user_metadata(val):
|
def serialize_user_metadata(val):
|
||||||
return json.dumps(object_to_unicode(val), ensure_ascii=False, default=to_json, indent=2, sort_keys=True)
|
return json.dumps(object_to_unicode(val), ensure_ascii=False,
|
||||||
|
default=to_json, indent=2, sort_keys=True)
|
||||||
|
|
||||||
|
|
||||||
set_user_metadata3 = dict_writer('user_metadata', serialize=serialize_user_metadata, remove2=False)
|
set_user_metadata3 = dict_writer('user_metadata',
|
||||||
|
serialize=serialize_user_metadata,
|
||||||
|
remove2=False)
|
||||||
|
|
||||||
|
|
||||||
def set_user_metadata(root, prefixes, refines, val):
|
def set_user_metadata(root, prefixes, refines, val):
|
||||||
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root):
|
for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
|
||||||
|
'"calibre:user_metadata:")]')(root):
|
||||||
remove_element(meta, refines)
|
remove_element(meta, refines)
|
||||||
if val:
|
if val:
|
||||||
nval = {}
|
nval = {}
|
||||||
@@ -921,26 +975,32 @@ def read_raster_cover(root, prefixes, refines):
|
|||||||
if href:
|
if href:
|
||||||
return href
|
return href
|
||||||
|
|
||||||
for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]/@content')(root):
|
for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]'
|
||||||
for item in XPath('./opf:manifest/opf:item[@id and @href and @media-type]')(root):
|
'/@content')(root):
|
||||||
|
for item in XPath('./opf:manifest/opf:item[@id and @href and '
|
||||||
|
'@media-type]')(root):
|
||||||
if item.get('id') == item_id:
|
if item.get('id') == item_id:
|
||||||
href = get_href(item)
|
href = get_href(item)
|
||||||
if href:
|
if href:
|
||||||
return href
|
return href
|
||||||
|
|
||||||
|
|
||||||
def ensure_is_only_raster_cover(root, prefixes, refines, raster_cover_item_href):
|
def ensure_is_only_raster_cover(root, prefixes, refines,
|
||||||
|
raster_cover_item_href):
|
||||||
for item in XPath('./opf:metadata/opf:meta[@name="cover"]')(root):
|
for item in XPath('./opf:metadata/opf:meta[@name="cover"]')(root):
|
||||||
remove_element(item, refines)
|
remove_element(item, refines)
|
||||||
for item in items_with_property(root, 'cover-image', prefixes):
|
for item in items_with_property(root, 'cover-image', prefixes):
|
||||||
prop = normalize_whitespace(item.get('properties').replace('cover-image', ''))
|
prop = normalize_whitespace(item.get('properties')
|
||||||
|
.replace('cover-image', ''))
|
||||||
if prop:
|
if prop:
|
||||||
item.set('properties', prop)
|
item.set('properties', prop)
|
||||||
else:
|
else:
|
||||||
del item.attrib['properties']
|
del item.attrib['properties']
|
||||||
for item in XPath('./opf:manifest/opf:item')(root):
|
for item in XPath('./opf:manifest/opf:item')(root):
|
||||||
if item.get('href') == raster_cover_item_href:
|
if item.get('href') == raster_cover_item_href:
|
||||||
item.set('properties', normalize_whitespace((item.get('properties') or '') + ' cover-image'))
|
item.set('properties',
|
||||||
|
normalize_whitespace((item.get('properties')
|
||||||
|
or '') + ' cover-image'))
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@@ -960,7 +1020,7 @@ def set_last_modified_in_opf(root):
|
|||||||
|
|
||||||
|
|
||||||
def read_metadata(root, ver=None, return_extra_data=False):
|
def read_metadata(root, ver=None, return_extra_data=False):
|
||||||
ans = Metadata('Unknown', ['Unknown'])
|
ans = base.Metadata('Unknown', ['Unknown'])
|
||||||
prefixes, refines = read_prefixes(root), read_refines(root)
|
prefixes, refines = read_prefixes(root), read_refines(root)
|
||||||
identifiers = read_identifiers(root, prefixes, refines)
|
identifiers = read_identifiers(root, prefixes, refines)
|
||||||
ids = {}
|
ids = {}
|
||||||
@@ -1000,12 +1060,16 @@ def read_metadata(root, ver=None, return_extra_data=False):
|
|||||||
s, si = read_series(root, prefixes, refines)
|
s, si = read_series(root, prefixes, refines)
|
||||||
if s:
|
if s:
|
||||||
ans.series, ans.series_index = s, si
|
ans.series, ans.series_index = s, si
|
||||||
ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
|
ans.author_link_map = read_author_link_map(root, prefixes,
|
||||||
ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
|
refines) or ans.author_link_map
|
||||||
for name, fm in (read_user_metadata(root, prefixes, refines) or {}).items():
|
ans.user_categories = read_user_categories(root, prefixes,
|
||||||
|
refines) or ans.user_categories
|
||||||
|
for name, fm in (read_user_metadata(root, prefixes,
|
||||||
|
refines) or {}).items():
|
||||||
ans.set_user_metadata(name, fm)
|
ans.set_user_metadata(name, fm)
|
||||||
if return_extra_data:
|
if return_extra_data:
|
||||||
ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
|
ans = (ans, ver, read_raster_cover(root, prefixes, refines),
|
||||||
|
first_spine_item(root, prefixes, refines))
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
@@ -1014,7 +1078,9 @@ def get_metadata(stream):
|
|||||||
return read_metadata(root)
|
return read_metadata(root)
|
||||||
|
|
||||||
|
|
||||||
def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
|
def apply_metadata(root, mi, cover_prefix='', cover_data=None,
|
||||||
|
apply_null=False, update_timestamp=False,
|
||||||
|
force_identifiers=False, add_missing_cover=True):
|
||||||
prefixes, refines = read_prefixes(root), read_refines(root)
|
prefixes, refines = read_prefixes(root), read_refines(root)
|
||||||
current_mi = read_metadata(root)
|
current_mi = read_metadata(root)
|
||||||
if apply_null:
|
if apply_null:
|
||||||
@@ -1024,7 +1090,8 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
|
|||||||
def ok(x):
|
def ok(x):
|
||||||
return not mi.is_null(x)
|
return not mi.is_null(x)
|
||||||
if ok('identifiers'):
|
if ok('identifiers'):
|
||||||
set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers)
|
set_identifiers(root, prefixes, refines, mi.identifiers,
|
||||||
|
force_identifiers=force_identifiers)
|
||||||
if ok('title'):
|
if ok('title'):
|
||||||
set_title(root, prefixes, refines, mi.title, mi.title_sort)
|
set_title(root, prefixes, refines, mi.title, mi.title_sort)
|
||||||
if ok('languages'):
|
if ok('languages'):
|
||||||
@@ -1052,16 +1119,21 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
|
|||||||
if ok('series'):
|
if ok('series'):
|
||||||
set_series(root, prefixes, refines, mi.series, mi.series_index or 1)
|
set_series(root, prefixes, refines, mi.series, mi.series_index or 1)
|
||||||
if ok('author_link_map'):
|
if ok('author_link_map'):
|
||||||
set_author_link_map(root, prefixes, refines, getattr(mi, 'author_link_map', None))
|
set_author_link_map(root, prefixes, refines,
|
||||||
|
getattr(mi, 'author_link_map', None))
|
||||||
if ok('user_categories'):
|
if ok('user_categories'):
|
||||||
set_user_categories(root, prefixes, refines, getattr(mi, 'user_categories', None))
|
set_user_categories(root, prefixes, refines,
|
||||||
|
getattr(mi, 'user_categories', None))
|
||||||
# We ignore apply_null for the next two to match the behavior with opf2.py
|
# We ignore apply_null for the next two to match the behavior with opf2.py
|
||||||
if mi.application_id:
|
if mi.application_id:
|
||||||
set_application_id(root, prefixes, refines, mi.application_id)
|
set_application_id(root, prefixes, refines, mi.application_id)
|
||||||
if mi.uuid:
|
if mi.uuid:
|
||||||
set_uuid(root, prefixes, refines, mi.uuid)
|
set_uuid(root, prefixes, refines, mi.uuid)
|
||||||
new_user_metadata, current_user_metadata = mi.get_all_user_metadata(True), current_mi.get_all_user_metadata(True)
|
|
||||||
|
new_user_metadata = mi.get_all_user_metadata(True)
|
||||||
|
current_user_metadata = current_mi.get_all_user_metadata(True)
|
||||||
missing = object()
|
missing = object()
|
||||||
|
|
||||||
for key in tuple(new_user_metadata):
|
for key in tuple(new_user_metadata):
|
||||||
meta = new_user_metadata.get(key)
|
meta = new_user_metadata.get(key)
|
||||||
if meta is None:
|
if meta is None:
|
||||||
@@ -1098,7 +1170,9 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False,
|
|||||||
return raster_cover
|
return raster_cover
|
||||||
|
|
||||||
|
|
||||||
def set_metadata(stream, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
|
def set_metadata(stream, mi, cover_prefix='', cover_data=None,
|
||||||
|
apply_null=False, update_timestamp=False,
|
||||||
|
force_identifiers=False, add_missing_cover=True):
|
||||||
root = parse_opf(stream)
|
root = parse_opf(stream)
|
||||||
return apply_metadata(
|
return apply_metadata(
|
||||||
root, mi, cover_prefix=cover_prefix, cover_data=cover_data,
|
root, mi, cover_prefix=cover_prefix, cover_data=cover_data,
|
||||||
|
|||||||
@@ -8,37 +8,28 @@ import uuid
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
|
||||||
from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX
|
from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX
|
||||||
from ebook_converter.ebooks.mobi.reader.index import read_index
|
from ebook_converter.ebooks.mobi.reader.index import read_index
|
||||||
from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc
|
from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc
|
||||||
from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup
|
from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup
|
||||||
from ebook_converter.ebooks.mobi.reader.containers import Container, find_imgtype
|
from ebook_converter.ebooks.mobi.reader import containers
|
||||||
from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator
|
from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator
|
||||||
from ebook_converter.ebooks.metadata.toc import TOC
|
from ebook_converter.ebooks.metadata.toc import TOC
|
||||||
from ebook_converter.ebooks.mobi.utils import read_font_record
|
from ebook_converter.ebooks.mobi.utils import read_font_record
|
||||||
from ebook_converter.ebooks.oeb.parse_utils import parse_html
|
from ebook_converter.ebooks.oeb.parse_utils import parse_html
|
||||||
from ebook_converter.ebooks.oeb.base import XPath, xml2text
|
from ebook_converter.ebooks.oeb import base
|
||||||
from ebook_converter.polyglot.builtins import as_unicode
|
from ebook_converter.polyglot.builtins import as_unicode
|
||||||
|
|
||||||
|
ID_RE = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
NAME_RE = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
AID_RE = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
Part = collections.namedtuple('Part', 'num type filename start end aid')
|
||||||
|
Elem = collections.namedtuple('Elem', 'insert_pos toc_text file_number '
|
||||||
|
'sequence_number start_pos length')
|
||||||
|
FlowInfo = collections.namedtuple('FlowInfo', 'type format dir fname')
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
Part = collections.namedtuple('Part',
|
|
||||||
'num type filename start end aid')
|
|
||||||
|
|
||||||
Elem = collections.namedtuple('Elem',
|
|
||||||
'insert_pos toc_text file_number sequence_number start_pos '
|
|
||||||
'length')
|
|
||||||
|
|
||||||
FlowInfo = collections.namedtuple('FlowInfo',
|
|
||||||
'type format dir fname')
|
|
||||||
|
|
||||||
# locate beginning and ending positions of tag with specific aid attribute
|
# locate beginning and ending positions of tag with specific aid attribute
|
||||||
|
|
||||||
|
|
||||||
def locate_beg_end_of_tag(ml, aid):
|
def locate_beg_end_of_tag(ml, aid):
|
||||||
pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid
|
pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid
|
||||||
aid_pattern = re.compile(pattern, re.IGNORECASE)
|
aid_pattern = re.compile(pattern, re.IGNORECASE)
|
||||||
@@ -64,7 +55,8 @@ def reverse_tag_iter(block):
|
|||||||
end = plt
|
end = plt
|
||||||
|
|
||||||
|
|
||||||
def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
|
def get_first_resource_index(first_image_index, num_of_text_records,
|
||||||
|
first_text_record_number):
|
||||||
first_resource_index = first_image_index
|
first_resource_index = first_image_index
|
||||||
if first_resource_index in {-1, NULL_INDEX}:
|
if first_resource_index in {-1, NULL_INDEX}:
|
||||||
first_resource_index = num_of_text_records + first_text_record_number
|
first_resource_index = num_of_text_records + first_text_record_number
|
||||||
@@ -78,23 +70,27 @@ class Mobi8Reader(object):
|
|||||||
self.mobi6_reader, self.log = mobi6_reader, log
|
self.mobi6_reader, self.log = mobi6_reader, log
|
||||||
self.header = mobi6_reader.book_header
|
self.header = mobi6_reader.book_header
|
||||||
self.encrypted_fonts = []
|
self.encrypted_fonts = []
|
||||||
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
self.id_re = ID_RE
|
||||||
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
self.name_re = NAME_RE
|
||||||
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
self.aid_re = AID_RE
|
||||||
|
|
||||||
def __call__(self):
|
def __call__(self):
|
||||||
self.mobi6_reader.check_for_drm()
|
self.mobi6_reader.check_for_drm()
|
||||||
self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8')
|
self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8')
|
||||||
bh = self.mobi6_reader.book_header
|
bh = self.mobi6_reader.book_header
|
||||||
|
_gfri = get_first_resource_index
|
||||||
if self.mobi6_reader.kf8_type == 'joint':
|
if self.mobi6_reader.kf8_type == 'joint':
|
||||||
offset = self.mobi6_reader.kf8_boundary + 2
|
offset = self.mobi6_reader.kf8_boundary + 2
|
||||||
self.resource_offsets = [
|
self.resource_offsets = [(_gfri(bh.first_image_index,
|
||||||
(get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
|
bh.mobi6_records, 1), offset - 2),
|
||||||
(get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
|
(_gfri(bh.kf8_first_image_index,
|
||||||
]
|
bh.records, offset),
|
||||||
|
len(self.mobi6_reader.sections))]
|
||||||
else:
|
else:
|
||||||
offset = 1
|
offset = 1
|
||||||
self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]
|
self.resource_offsets = [(_gfri(bh.first_image_index, bh.records,
|
||||||
|
offset),
|
||||||
|
len(self.mobi6_reader.sections))]
|
||||||
|
|
||||||
self.processed_records = self.mobi6_reader.extract_text(offset=offset)
|
self.processed_records = self.mobi6_reader.extract_text(offset=offset)
|
||||||
self.raw_ml = self.mobi6_reader.mobi_html
|
self.raw_ml = self.mobi6_reader.mobi_html
|
||||||
@@ -123,37 +119,37 @@ class Mobi8Reader(object):
|
|||||||
raise ValueError('KF8 does not have a valid FDST record')
|
raise ValueError('KF8 does not have a valid FDST record')
|
||||||
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
|
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
|
||||||
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
|
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
|
||||||
header, sec_start)
|
header, sec_start)
|
||||||
self.flow_table = tuple(zip(secs[::2], secs[1::2]))
|
self.flow_table = tuple(zip(secs[::2], secs[1::2]))
|
||||||
|
|
||||||
self.files = []
|
self.files = []
|
||||||
if self.header.skelidx != NULL_INDEX:
|
if self.header.skelidx != NULL_INDEX:
|
||||||
table = read_index(self.kf8_sections, self.header.skelidx,
|
table = read_index(self.kf8_sections, self.header.skelidx,
|
||||||
self.header.codec)[0]
|
self.header.codec)[0]
|
||||||
File = collections.namedtuple('File',
|
File = collections.namedtuple('File', 'file_number name '
|
||||||
'file_number name divtbl_count start_position length')
|
'divtbl_count start_position length')
|
||||||
|
|
||||||
for i, text in enumerate(table):
|
for i, text in enumerate(table):
|
||||||
tag_map = table[text]
|
tag_map = table[text]
|
||||||
self.files.append(File(i, text, tag_map[1][0],
|
self.files.append(File(i, text, tag_map[1][0],
|
||||||
tag_map[6][0], tag_map[6][1]))
|
tag_map[6][0], tag_map[6][1]))
|
||||||
|
|
||||||
self.elems = []
|
self.elems = []
|
||||||
if self.header.dividx != NULL_INDEX:
|
if self.header.dividx != NULL_INDEX:
|
||||||
table, cncx = read_index(self.kf8_sections, self.header.dividx,
|
table, cncx = read_index(self.kf8_sections, self.header.dividx,
|
||||||
self.header.codec)
|
self.header.codec)
|
||||||
for i, text in enumerate(table):
|
for i, text in enumerate(table):
|
||||||
tag_map = table[text]
|
tag_map = table[text]
|
||||||
toc_text = cncx[tag_map[2][0]]
|
toc_text = cncx[tag_map[2][0]]
|
||||||
self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
|
self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
|
||||||
tag_map[4][0], tag_map[6][0], tag_map[6][1]))
|
tag_map[4][0], tag_map[6][0],
|
||||||
|
tag_map[6][1]))
|
||||||
|
|
||||||
self.guide = []
|
self.guide = []
|
||||||
if self.header.othidx != NULL_INDEX:
|
if self.header.othidx != NULL_INDEX:
|
||||||
table, cncx = read_index(self.kf8_sections, self.header.othidx,
|
table, cncx = read_index(self.kf8_sections, self.header.othidx,
|
||||||
self.header.codec)
|
self.header.codec)
|
||||||
Item = collections.namedtuple('Item',
|
Item = collections.namedtuple('Item', 'type title pos_fid')
|
||||||
'type title pos_fid')
|
|
||||||
|
|
||||||
for i, ref_type in enumerate(table):
|
for i, ref_type in enumerate(table):
|
||||||
tag_map = table[ref_type]
|
tag_map = table[ref_type]
|
||||||
@@ -161,7 +157,7 @@ class Mobi8Reader(object):
|
|||||||
title = cncx[tag_map[1][0]]
|
title = cncx[tag_map[1][0]]
|
||||||
fileno = None
|
fileno = None
|
||||||
if 3 in list(tag_map.keys()):
|
if 3 in list(tag_map.keys()):
|
||||||
fileno = tag_map[3][0]
|
fileno = tag_map[3][0]
|
||||||
if 6 in list(tag_map.keys()):
|
if 6 in list(tag_map.keys()):
|
||||||
fileno = tag_map[6]
|
fileno = tag_map[6]
|
||||||
if isinstance(ref_type, bytes):
|
if isinstance(ref_type, bytes):
|
||||||
@@ -205,17 +201,19 @@ class Mobi8Reader(object):
|
|||||||
head = skeleton[:insertpos]
|
head = skeleton[:insertpos]
|
||||||
tail = skeleton[insertpos:]
|
tail = skeleton[insertpos:]
|
||||||
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
|
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
|
||||||
head.rfind(b'<')):
|
head.rfind(b'<')):
|
||||||
# There is an incomplete tag in either the head or tail.
|
# There is an incomplete tag in either the head or tail.
|
||||||
# This can happen for some badly formed KF8 files, see for
|
# This can happen for some badly formed KF8 files, see for
|
||||||
# example, https://bugs.launchpad.net/bugs/1082669
|
# example, https://bugs.launchpad.net/bugs/1082669
|
||||||
if not inspos_warned:
|
if not inspos_warned:
|
||||||
self.log.warn(
|
self.log.warn('The div table for %s has incorrect '
|
||||||
'The div table for %s has incorrect insert '
|
'insert positions. Calculating '
|
||||||
'positions. Calculating manually.'%skelname)
|
'manually.' % skelname)
|
||||||
inspos_warned = True
|
inspos_warned = True
|
||||||
bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
|
bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
|
||||||
isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
|
isinstance(aidtext, bytes)
|
||||||
|
else
|
||||||
|
aidtext.encode('utf-8'))
|
||||||
if bp != ep:
|
if bp != ep:
|
||||||
insertpos = ep + 1 + startpos
|
insertpos = ep + 1 + startpos
|
||||||
|
|
||||||
@@ -228,7 +226,7 @@ class Mobi8Reader(object):
|
|||||||
aidtext = str(uuid.uuid4())
|
aidtext = str(uuid.uuid4())
|
||||||
filename = aidtext + '.html'
|
filename = aidtext + '.html'
|
||||||
self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
|
self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
|
||||||
baseptr, aidtext))
|
baseptr, aidtext))
|
||||||
|
|
||||||
# The primary css style sheet is typically stored next followed by any
|
# The primary css style sheet is typically stored next followed by any
|
||||||
# snippets of code that were previously inlined in the
|
# snippets of code that were previously inlined in the
|
||||||
@@ -238,10 +236,10 @@ class Mobi8Reader(object):
|
|||||||
# The problem is that for most browsers and ereaders, you can not
|
# The problem is that for most browsers and ereaders, you can not
|
||||||
# use <img src="imageXXXX.svg" /> to import any svg image that itself
|
# use <img src="imageXXXX.svg" /> to import any svg image that itself
|
||||||
# properly uses an <image/> tag to import some raster image - it
|
# properly uses an <image/> tag to import some raster image - it
|
||||||
# should work according to the spec but does not for almost all browsers
|
# should work according to the spec but does not for almost all
|
||||||
# and ereaders and causes epub validation issues because those raster
|
# browsers and ereaders and causes epub validation issues because
|
||||||
# images are in manifest but not in xhtml text - since they only
|
# those raster images are in manifest but not in xhtml text - since
|
||||||
# referenced from an svg image
|
# they only referenced from an svg image
|
||||||
|
|
||||||
# So we need to check the remaining flow pieces to see if they are css
|
# So we need to check the remaining flow pieces to see if they are css
|
||||||
# or svg images. if svg images, we must check if they have an <image/>
|
# or svg images. if svg images, we must check if they have an <image/>
|
||||||
@@ -252,7 +250,8 @@ class Mobi8Reader(object):
|
|||||||
|
|
||||||
self.flowinfo.append(FlowInfo(None, None, None, None))
|
self.flowinfo.append(FlowInfo(None, None, None, None))
|
||||||
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
|
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
|
||||||
image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
|
image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''',
|
||||||
|
re.IGNORECASE)
|
||||||
for j in range(1, len(self.flows)):
|
for j in range(1, len(self.flows)):
|
||||||
flowpart = self.flows[j]
|
flowpart = self.flows[j]
|
||||||
nstr = '%04d' % j
|
nstr = '%04d' % j
|
||||||
@@ -276,7 +275,8 @@ class Mobi8Reader(object):
|
|||||||
# search for CDATA and if exists inline it
|
# search for CDATA and if exists inline it
|
||||||
if flowpart.find(b'[CDATA[') >= 0:
|
if flowpart.find(b'[CDATA[') >= 0:
|
||||||
typ = 'css'
|
typ = 'css'
|
||||||
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
|
flowpart = (b'<style type="text/css">\n' + flowpart +
|
||||||
|
b'\n</style>\n')
|
||||||
format = 'inline'
|
format = 'inline'
|
||||||
dir = None
|
dir = None
|
||||||
fname = None
|
fname = None
|
||||||
@@ -300,7 +300,8 @@ class Mobi8Reader(object):
|
|||||||
|
|
||||||
def get_id_tag_by_pos_fid(self, posfid, offset):
|
def get_id_tag_by_pos_fid(self, posfid, offset):
|
||||||
# first convert kindle:pos:fid and offset info to position in file
|
# first convert kindle:pos:fid and offset info to position in file
|
||||||
insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
|
(insertpos, idtext, filenum,
|
||||||
|
seqnm, startpos, length) = self.elems[posfid]
|
||||||
pos = insertpos + offset
|
pos = insertpos + offset
|
||||||
fi = self.get_file_info(pos)
|
fi = self.get_file_info(pos)
|
||||||
# an existing "id=" must exist in original xhtml otherwise it would not
|
# an existing "id=" must exist in original xhtml otherwise it would not
|
||||||
@@ -311,20 +312,20 @@ class Mobi8Reader(object):
|
|||||||
# so find the closest "id=" before position the file by actually
|
# so find the closest "id=" before position the file by actually
|
||||||
# searching in that file
|
# searching in that file
|
||||||
idtext = self.get_id_tag(pos)
|
idtext = self.get_id_tag(pos)
|
||||||
return '%s/%s'%(fi.type, fi.filename), idtext
|
return '%s/%s' % (fi.type, fi.filename), idtext
|
||||||
|
|
||||||
def get_id_tag(self, pos):
|
def get_id_tag(self, pos):
|
||||||
# Find the first tag with a named anchor (name or id attribute) before
|
# Find the first tag with a named anchor (name or id attribute) before
|
||||||
# pos
|
# pos
|
||||||
fi = self.get_file_info(pos)
|
fi = self.get_file_info(pos)
|
||||||
if fi.num is None and fi.start is None:
|
if fi.num is None and fi.start is None:
|
||||||
raise ValueError('No file contains pos: %d'%pos)
|
raise ValueError('No file contains pos: %d' % pos)
|
||||||
textblock = self.parts[fi.num]
|
textblock = self.parts[fi.num]
|
||||||
npos = pos - fi.start
|
npos = pos - fi.start
|
||||||
pgt = textblock.find(b'>', npos)
|
pgt = textblock.find(b'>', npos)
|
||||||
plt = textblock.find(b'<', npos)
|
plt = textblock.find(b'<', npos)
|
||||||
# if npos inside a tag then search all text before the its end of tag marker
|
# if npos inside a tag then search all text before the its end of tag
|
||||||
# else not in a tag need to search the preceding tag
|
# marker else not in a tag need to search the preceding tag
|
||||||
if plt == npos or pgt < plt:
|
if plt == npos or pgt < plt:
|
||||||
npos = pgt + 1
|
npos = pgt + 1
|
||||||
textblock = textblock[0:npos]
|
textblock = textblock[0:npos]
|
||||||
@@ -371,7 +372,7 @@ class Mobi8Reader(object):
|
|||||||
linktgt = fi.filename
|
linktgt = fi.filename
|
||||||
if idtext:
|
if idtext:
|
||||||
linktgt += '#' + idtext
|
linktgt += '#' + idtext
|
||||||
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwd())
|
g = Guide.Reference('%s/%s' % (fi.type, linktgt), os.getcwd())
|
||||||
g.title, g.type = 'start', 'text'
|
g.title, g.type = 'start', 'text'
|
||||||
guide.append(g)
|
guide.append(g)
|
||||||
|
|
||||||
@@ -379,7 +380,7 @@ class Mobi8Reader(object):
|
|||||||
|
|
||||||
def create_ncx(self):
|
def create_ncx(self):
|
||||||
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
|
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
|
||||||
self.header.codec)
|
self.header.codec)
|
||||||
remove = []
|
remove = []
|
||||||
|
|
||||||
# Add href and anchor info to the index entries
|
# Add href and anchor info to the index entries
|
||||||
@@ -389,15 +390,15 @@ class Mobi8Reader(object):
|
|||||||
pos = entry['pos']
|
pos = entry['pos']
|
||||||
fi = self.get_file_info(pos)
|
fi = self.get_file_info(pos)
|
||||||
if fi.filename is None:
|
if fi.filename is None:
|
||||||
raise ValueError('Index entry has invalid pos: %d'%pos)
|
raise ValueError('Index entry has invalid pos: %d' % pos)
|
||||||
idtag = self.get_id_tag(pos)
|
idtag = self.get_id_tag(pos)
|
||||||
href = '%s/%s'%(fi.type, fi.filename)
|
href = '%s/%s' % (fi.type, fi.filename)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
|
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.log.warn('Invalid entry in NCX (title: %s), ignoring'
|
self.log.warn('Invalid entry in NCX (title: %s), '
|
||||||
%entry['text'])
|
'ignoring' % entry['text'])
|
||||||
remove.append(entry)
|
remove.append(entry)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -411,7 +412,8 @@ class Mobi8Reader(object):
|
|||||||
return build_toc(index_entries)
|
return build_toc(index_entries)
|
||||||
|
|
||||||
def extract_resources(self, sections):
|
def extract_resources(self, sections):
|
||||||
from ebook_converter.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
|
from ebook_converter.ebooks.mobi.writer2.resources import \
|
||||||
|
PLACEHOLDER_GIF
|
||||||
resource_map = []
|
resource_map = []
|
||||||
container = None
|
container = None
|
||||||
for x in ('fonts', 'images'):
|
for x in ('fonts', 'images'):
|
||||||
@@ -424,16 +426,18 @@ class Mobi8Reader(object):
|
|||||||
typ = data[:4]
|
typ = data[:4]
|
||||||
href = None
|
href = None
|
||||||
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
|
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
|
||||||
b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
|
b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC',
|
||||||
|
b'CMET', b'PAGE'}:
|
||||||
pass # Ignore these records
|
pass # Ignore these records
|
||||||
elif typ == b'FONT':
|
elif typ == b'FONT':
|
||||||
font = read_font_record(data)
|
font = read_font_record(data)
|
||||||
href = "fonts/%05d.%s" % (fname_idx, font['ext'])
|
href = "fonts/%05d.%s" % (fname_idx, font['ext'])
|
||||||
if font['err']:
|
if font['err']:
|
||||||
self.log.warn('Reading font record %d failed: %s'%(
|
self.log.warn('Reading font record %d failed: %s' %
|
||||||
fname_idx, font['err']))
|
(fname_idx, font['err']))
|
||||||
if font['headers']:
|
if font['headers']:
|
||||||
self.log.debug('Font record headers: %s'%font['headers'])
|
self.log.debug('Font record headers: %s' %
|
||||||
|
font['headers'])
|
||||||
with open(href.replace('/', os.sep), 'wb') as f:
|
with open(href.replace('/', os.sep), 'wb') as f:
|
||||||
f.write(font['font_data'] if font['font_data'] else
|
f.write(font['font_data'] if font['font_data'] else
|
||||||
font['raw_data'])
|
font['raw_data'])
|
||||||
@@ -443,19 +447,23 @@ class Mobi8Reader(object):
|
|||||||
if data == b'CONTBOUNDARY':
|
if data == b'CONTBOUNDARY':
|
||||||
container = None
|
container = None
|
||||||
continue
|
continue
|
||||||
container = Container(data)
|
container = containers.Container(data)
|
||||||
elif typ == b'CRES':
|
elif typ == b'CRES':
|
||||||
data, imgtype = container.load_image(data)
|
data, imgtype = container.load_image(data)
|
||||||
if data is not None:
|
if data is not None:
|
||||||
href = 'images/%05d.%s'%(container.resource_index, imgtype)
|
href = 'images/%05d.%s' % (container.resource_index,
|
||||||
|
imgtype)
|
||||||
with open(href.replace('/', os.sep), 'wb') as f:
|
with open(href.replace('/', os.sep), 'wb') as f:
|
||||||
f.write(data)
|
f.write(data)
|
||||||
elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
|
elif (typ == b'\xa0\xa0\xa0\xa0' and
|
||||||
|
len(data) == 4 and
|
||||||
|
container is not None):
|
||||||
container.resource_index += 1
|
container.resource_index += 1
|
||||||
elif container is None:
|
elif container is None:
|
||||||
if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
|
if not (len(data) == len(PLACEHOLDER_GIF) and
|
||||||
imgtype = find_imgtype(data)
|
data == PLACEHOLDER_GIF):
|
||||||
href = 'images/%05d.%s'%(fname_idx, imgtype)
|
imgtype = containers.find_imgtype(data)
|
||||||
|
href = 'images/%05d.%s' % (fname_idx, imgtype)
|
||||||
with open(href.replace('/', os.sep), 'wb') as f:
|
with open(href.replace('/', os.sep), 'wb') as f:
|
||||||
f.write(data)
|
f.write(data)
|
||||||
|
|
||||||
@@ -482,7 +490,7 @@ class Mobi8Reader(object):
|
|||||||
if os.path.exists(href.replace('/', os.sep)):
|
if os.path.exists(href.replace('/', os.sep)):
|
||||||
try:
|
try:
|
||||||
toc = self.read_inline_toc(href, frag)
|
toc = self.read_inline_toc(href, frag)
|
||||||
except:
|
except Exception:
|
||||||
self.log.exception('Failed to read inline ToC')
|
self.log.exception('Failed to read inline ToC')
|
||||||
|
|
||||||
opf = OPFCreator(os.getcwd(), mi)
|
opf = OPFCreator(os.getcwd(), mi)
|
||||||
@@ -493,7 +501,8 @@ class Mobi8Reader(object):
|
|||||||
|
|
||||||
# If there are no images then the azw3 input plugin dumps all
|
# If there are no images then the azw3 input plugin dumps all
|
||||||
# binary records as .unknown images, remove them
|
# binary records as .unknown images, remove them
|
||||||
if self.for_tweak and os.path.exists('images') and os.path.isdir('images'):
|
if (self.for_tweak and os.path.exists('images') and
|
||||||
|
os.path.isdir('images')):
|
||||||
files = os.listdir('images')
|
files = os.listdir('images')
|
||||||
unknown = [x for x in files if x.endswith('.unknown')]
|
unknown = [x for x in files if x.endswith('.unknown')]
|
||||||
if len(files) == len(unknown):
|
if len(files) == len(unknown):
|
||||||
@@ -502,7 +511,7 @@ class Mobi8Reader(object):
|
|||||||
if self.for_tweak:
|
if self.for_tweak:
|
||||||
try:
|
try:
|
||||||
os.remove('debug-raw.html')
|
os.remove('debug-raw.html')
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
|
opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
|
||||||
@@ -528,7 +537,7 @@ class Mobi8Reader(object):
|
|||||||
with open(href.replace('/', os.sep), 'rb') as f:
|
with open(href.replace('/', os.sep), 'rb') as f:
|
||||||
raw = f.read().decode(self.header.codec)
|
raw = f.read().decode(self.header.codec)
|
||||||
root = parse_html(raw, log=self.log)
|
root = parse_html(raw, log=self.log)
|
||||||
body = XPath('//h:body')(root)
|
body = base.XPath('//h:body')(root)
|
||||||
reached = False
|
reached = False
|
||||||
if body:
|
if body:
|
||||||
start = body[0]
|
start = body[0]
|
||||||
@@ -536,7 +545,7 @@ class Mobi8Reader(object):
|
|||||||
start = None
|
start = None
|
||||||
reached = True
|
reached = True
|
||||||
if frag:
|
if frag:
|
||||||
elems = XPath('//*[@id="%s"]'%frag)(root)
|
elems = base.XPath('//*[@id="%s"]' % frag)(root)
|
||||||
if elems:
|
if elems:
|
||||||
start = elems[0]
|
start = elems[0]
|
||||||
|
|
||||||
@@ -554,12 +563,13 @@ class Mobi8Reader(object):
|
|||||||
seen = set()
|
seen = set()
|
||||||
links = []
|
links = []
|
||||||
for elem in root.iterdescendants(etree.Element):
|
for elem in root.iterdescendants(etree.Element):
|
||||||
if reached and elem.tag == const.XHTML_A and elem.get('href',
|
if reached and elem.tag == base.tag('xhtml',
|
||||||
|
'a') and elem.get('href',
|
||||||
False):
|
False):
|
||||||
href = elem.get('href')
|
href = elem.get('href')
|
||||||
href, frag = urllib.parse.urldefrag(href)
|
href, frag = urllib.parse.urldefrag(href)
|
||||||
href = base_href + '/' + href
|
href = base_href + '/' + href
|
||||||
text = xml2text(elem).strip()
|
text = base.xml2text(elem).strip()
|
||||||
if (text, href, frag) in seen:
|
if (text, href, frag) in seen:
|
||||||
continue
|
continue
|
||||||
seen.add((text, href, frag))
|
seen.add((text, href, frag))
|
||||||
@@ -568,7 +578,7 @@ class Mobi8Reader(object):
|
|||||||
reached = True
|
reached = True
|
||||||
|
|
||||||
depths = sorted(set(x[-1] for x in links))
|
depths = sorted(set(x[-1] for x in links))
|
||||||
depth_map = {x:i for i, x in enumerate(depths)}
|
depth_map = {x: i for i, x in enumerate(depths)}
|
||||||
for text, href, frag, depth in links:
|
for text, href, frag, depth in links:
|
||||||
depth = depth_map[depth]
|
depth = depth_map[depth]
|
||||||
if current_depth is None:
|
if current_depth is None:
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
|
import collections
|
||||||
import errno
|
import errno
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -7,13 +10,10 @@ import shutil
|
|||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import uuid
|
|
||||||
from collections import defaultdict
|
|
||||||
from io import BytesIO
|
|
||||||
from itertools import count
|
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
import uuid
|
||||||
|
|
||||||
from css_parser import getUrls, replaceUrls
|
import css_parser
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
@@ -35,10 +35,7 @@ from ebook_converter.ebooks.metadata.utils import parse_opf_version
|
|||||||
from ebook_converter.ebooks.mobi import MobiError
|
from ebook_converter.ebooks.mobi import MobiError
|
||||||
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
|
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
|
||||||
from ebook_converter.ebooks.mobi.tweak import set_cover
|
from ebook_converter.ebooks.mobi.tweak import set_cover
|
||||||
from ebook_converter.ebooks.oeb.base import (
|
from ebook_converter.ebooks.oeb import base as oeb_base
|
||||||
OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
|
|
||||||
rewrite_links, serialize, urlquote, urlunquote
|
|
||||||
)
|
|
||||||
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
|
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
|
||||||
from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook
|
from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook
|
||||||
from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak
|
from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak
|
||||||
@@ -96,7 +93,7 @@ def abspath_to_name(path, root):
|
|||||||
return relpath(os.path.abspath(path), root).replace(os.sep, '/')
|
return relpath(os.path.abspath(path), root).replace(os.sep, '/')
|
||||||
|
|
||||||
|
|
||||||
def name_to_href(name, root, base=None, quote=urlquote):
|
def name_to_href(name, root, base=None, quote=oeb_base.urlquote):
|
||||||
fullpath = name_to_abspath(name, root)
|
fullpath = name_to_abspath(name, root)
|
||||||
basepath = root if base is None else os.path.dirname(name_to_abspath(base, root))
|
basepath = root if base is None else os.path.dirname(name_to_abspath(base, root))
|
||||||
path = relpath(fullpath, basepath).replace(os.sep, '/')
|
path = relpath(fullpath, basepath).replace(os.sep, '/')
|
||||||
@@ -111,7 +108,7 @@ def href_to_name(href, root, base=None):
|
|||||||
return None
|
return None
|
||||||
if purl.scheme or not purl.path:
|
if purl.scheme or not purl.path:
|
||||||
return None
|
return None
|
||||||
href = urlunquote(purl.path)
|
href = oeb_base.urlunquote(purl.path)
|
||||||
if iswindows and ':' in href:
|
if iswindows and ':' in href:
|
||||||
# path manipulations on windows fail for paths with : in them, so we
|
# path manipulations on windows fail for paths with : in them, so we
|
||||||
# assume all such paths are invalid/absolute paths.
|
# assume all such paths are invalid/absolute paths.
|
||||||
@@ -324,7 +321,7 @@ class Container(ContainerBase): # {{{
|
|||||||
item_id = 'id' + '%d'%c
|
item_id = 'id' + '%d'%c
|
||||||
manifest = self.opf_xpath('//opf:manifest')[0]
|
manifest = self.opf_xpath('//opf:manifest')[0]
|
||||||
href = self.name_to_href(name, self.opf_name)
|
href = self.name_to_href(name, self.opf_name)
|
||||||
item = manifest.makeelement(const.OPF_ITEM,
|
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
|
||||||
id=item_id, href=href)
|
id=item_id, href=href)
|
||||||
item.set('media-type', self.mime_map[name])
|
item.set('media-type', self.mime_map[name])
|
||||||
self.insert_into_xml(manifest, item)
|
self.insert_into_xml(manifest, item)
|
||||||
@@ -340,7 +337,7 @@ class Container(ContainerBase): # {{{
|
|||||||
|
|
||||||
def make_name_unique(self, name):
|
def make_name_unique(self, name):
|
||||||
''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. '''
|
''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. '''
|
||||||
counter = count()
|
counter = itertools.count()
|
||||||
while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
|
while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
|
||||||
c = next(counter) + 1
|
c = next(counter) + 1
|
||||||
base, ext = name.rpartition('.')[::2]
|
base, ext = name.rpartition('.')[::2]
|
||||||
@@ -377,10 +374,10 @@ class Container(ContainerBase): # {{{
|
|||||||
if self.ok_to_be_unmanifested(name):
|
if self.ok_to_be_unmanifested(name):
|
||||||
return name
|
return name
|
||||||
item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
|
item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
|
||||||
if mt in OEB_DOCS:
|
if mt in oeb_base.OEB_DOCS:
|
||||||
manifest = self.opf_xpath('//opf:manifest')[0]
|
manifest = self.opf_xpath('//opf:manifest')[0]
|
||||||
spine = self.opf_xpath('//opf:spine')[0]
|
spine = self.opf_xpath('//opf:spine')[0]
|
||||||
si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
|
si = manifest.makeelement(oeb_base.tag('opf', 'itemref'), idref=item_id)
|
||||||
self.insert_into_xml(spine, si, index=spine_index)
|
self.insert_into_xml(spine, si, index=spine_index)
|
||||||
return name
|
return name
|
||||||
|
|
||||||
@@ -442,12 +439,12 @@ class Container(ContainerBase): # {{{
|
|||||||
replace_func.file_type = 'opf'
|
replace_func.file_type = 'opf'
|
||||||
for elem in self.opf_xpath('//*[@href]'):
|
for elem in self.opf_xpath('//*[@href]'):
|
||||||
elem.set('href', replace_func(elem.get('href')))
|
elem.set('href', replace_func(elem.get('href')))
|
||||||
elif media_type.lower() in OEB_DOCS:
|
elif media_type.lower() in oeb_base.OEB_DOCS:
|
||||||
replace_func.file_type = 'text'
|
replace_func.file_type = 'text'
|
||||||
rewrite_links(self.parsed(name), replace_func)
|
oeb_base.rewrite_links(self.parsed(name), replace_func)
|
||||||
elif media_type.lower() in OEB_STYLES:
|
elif media_type.lower() in oeb_base.OEB_STYLES:
|
||||||
replace_func.file_type = 'style'
|
replace_func.file_type = 'style'
|
||||||
replaceUrls(self.parsed(name), replace_func)
|
css_parser.replaceUrls(self.parsed(name), replace_func)
|
||||||
elif media_type.lower() == guess_type('toc.ncx'):
|
elif media_type.lower() == guess_type('toc.ncx'):
|
||||||
replace_func.file_type = 'ncx'
|
replace_func.file_type = 'ncx'
|
||||||
for elem in self.parsed(name).xpath('//*[@src]'):
|
for elem in self.parsed(name).xpath('//*[@src]'):
|
||||||
@@ -467,21 +464,21 @@ class Container(ContainerBase): # {{{
|
|||||||
if name == self.opf_name:
|
if name == self.opf_name:
|
||||||
for elem in self.opf_xpath('//*[@href]'):
|
for elem in self.opf_xpath('//*[@href]'):
|
||||||
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
|
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
|
||||||
elif media_type.lower() in OEB_DOCS:
|
elif media_type.lower() in oeb_base.OEB_DOCS:
|
||||||
for el, attr, link, pos in iterlinks(self.parsed(name)):
|
for el, attr, link, pos in oeb_base.iterlinks(self.parsed(name)):
|
||||||
yield (link, el.sourceline, pos) if get_line_numbers else link
|
yield (link, el.sourceline, pos) if get_line_numbers else link
|
||||||
elif media_type.lower() in OEB_STYLES:
|
elif media_type.lower() in oeb_base.OEB_STYLES:
|
||||||
if get_line_numbers:
|
if get_line_numbers:
|
||||||
with self.open(name, 'rb') as f:
|
with self.open(name, 'rb') as f:
|
||||||
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
|
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
|
||||||
position = PositionFinder(raw)
|
position = PositionFinder(raw)
|
||||||
is_in_comment = CommentFinder(raw)
|
is_in_comment = CommentFinder(raw)
|
||||||
for link, offset in itercsslinks(raw):
|
for link, offset in oeb_base.itercsslinks(raw):
|
||||||
if not is_in_comment(offset):
|
if not is_in_comment(offset):
|
||||||
lnum, col = position(offset)
|
lnum, col = position(offset)
|
||||||
yield link, lnum, col
|
yield link, lnum, col
|
||||||
else:
|
else:
|
||||||
for link in getUrls(self.parsed(name)):
|
for link in css_parser.getUrls(self.parsed(name)):
|
||||||
yield link
|
yield link
|
||||||
elif media_type.lower() == guess_type('toc.ncx'):
|
elif media_type.lower() == guess_type('toc.ncx'):
|
||||||
for elem in self.parsed(name).xpath('//*[@src]'):
|
for elem in self.parsed(name).xpath('//*[@src]'):
|
||||||
@@ -533,7 +530,7 @@ class Container(ContainerBase): # {{{
|
|||||||
|
|
||||||
def opf_xpath(self, expr):
|
def opf_xpath(self, expr):
|
||||||
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
|
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
|
||||||
return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)
|
return self.opf.xpath(expr, namespaces=oeb_base.tag('opf', 'namespaces'))
|
||||||
|
|
||||||
def has_name(self, name):
|
def has_name(self, name):
|
||||||
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
|
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
|
||||||
@@ -580,11 +577,11 @@ class Container(ContainerBase): # {{{
|
|||||||
def parse(self, path, mime):
|
def parse(self, path, mime):
|
||||||
with open(path, 'rb') as src:
|
with open(path, 'rb') as src:
|
||||||
data = src.read()
|
data = src.read()
|
||||||
if mime in OEB_DOCS:
|
if mime in oeb_base.OEB_DOCS:
|
||||||
data = self.parse_xhtml(data, self.relpath(path))
|
data = self.parse_xhtml(data, self.relpath(path))
|
||||||
elif mime[-4:] in {'+xml', '/xml'}:
|
elif mime[-4:] in {'+xml', '/xml'}:
|
||||||
data = self.parse_xml(data)
|
data = self.parse_xml(data)
|
||||||
elif mime in OEB_STYLES:
|
elif mime in oeb_base.OEB_STYLES:
|
||||||
data = self.parse_css(data, self.relpath(path))
|
data = self.parse_css(data, self.relpath(path))
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@@ -597,7 +594,7 @@ class Container(ContainerBase): # {{{
|
|||||||
'''
|
'''
|
||||||
ans = self.open(name).read()
|
ans = self.open(name).read()
|
||||||
mime = self.mime_map.get(name, guess_type(name))
|
mime = self.mime_map.get(name, guess_type(name))
|
||||||
if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
|
if decode and (mime in oeb_base.OEB_STYLES or mime in oeb_base.OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
|
||||||
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
|
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
@@ -637,7 +634,7 @@ class Container(ContainerBase): # {{{
|
|||||||
so use it sparingly. '''
|
so use it sparingly. '''
|
||||||
from ebook_converter.ebooks.metadata.opf2 import OPF as O
|
from ebook_converter.ebooks.metadata.opf2 import OPF as O
|
||||||
mi = self.serialize_item(self.opf_name)
|
mi = self.serialize_item(self.opf_name)
|
||||||
return O(BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
|
return O(io.BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
|
||||||
populate_spine=False).to_book_metadata()
|
populate_spine=False).to_book_metadata()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -662,7 +659,7 @@ class Container(ContainerBase): # {{{
|
|||||||
@property
|
@property
|
||||||
def manifest_type_map(self):
|
def manifest_type_map(self):
|
||||||
' Mapping of manifest media-type to list of canonical names of that media-type '
|
' Mapping of manifest media-type to list of canonical names of that media-type '
|
||||||
ans = defaultdict(list)
|
ans = collections.defaultdict(list)
|
||||||
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
|
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
|
||||||
ans[item.get('media-type').lower()].append(self.href_to_name(
|
ans[item.get('media-type').lower()].append(self.href_to_name(
|
||||||
item.get('href'), self.opf_name))
|
item.get('href'), self.opf_name))
|
||||||
@@ -813,7 +810,7 @@ class Container(ContainerBase): # {{{
|
|||||||
spine = self.opf_xpath('//opf:spine')[0]
|
spine = self.opf_xpath('//opf:spine')[0]
|
||||||
spine.text = tail
|
spine.text = tail
|
||||||
for name, linear in spine_items:
|
for name, linear in spine_items:
|
||||||
i = spine.makeelement(const.OPF_ITEMREF,
|
i = spine.makeelement(oeb_base.tag('opf', 'itemref'),
|
||||||
nsmap={'opf': const.OPF2_NS})
|
nsmap={'opf': const.OPF2_NS})
|
||||||
i.tail = tail
|
i.tail = tail
|
||||||
i.set('idref', imap[name])
|
i.set('idref', imap[name])
|
||||||
@@ -922,7 +919,7 @@ class Container(ContainerBase): # {{{
|
|||||||
return ans[0]
|
return ans[0]
|
||||||
self.dirty(self.opf_name)
|
self.dirty(self.opf_name)
|
||||||
package = self.opf_xpath('//opf:package')[0]
|
package = self.opf_xpath('//opf:package')[0]
|
||||||
item = package.makeelement(OPF(name))
|
item = package.makeelement(oeb_base.tag('opf', name))
|
||||||
item.tail = '\n'
|
item.tail = '\n'
|
||||||
package.append(item)
|
package.append(item)
|
||||||
return item
|
return item
|
||||||
@@ -945,7 +942,7 @@ class Container(ContainerBase): # {{{
|
|||||||
item_id = id_prefix + '%d'%c
|
item_id = id_prefix + '%d'%c
|
||||||
|
|
||||||
manifest = self.opf_xpath('//opf:manifest')[0]
|
manifest = self.opf_xpath('//opf:manifest')[0]
|
||||||
item = manifest.makeelement(const.OPF_ITEM,
|
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
|
||||||
id=item_id, href=href)
|
id=item_id, href=href)
|
||||||
item.set('media-type', media_type)
|
item.set('media-type', media_type)
|
||||||
self.insert_into_xml(manifest, item)
|
self.insert_into_xml(manifest, item)
|
||||||
@@ -992,7 +989,7 @@ class Container(ContainerBase): # {{{
|
|||||||
data = root = self.parsed(name)
|
data = root = self.parsed(name)
|
||||||
if name == self.opf_name:
|
if name == self.opf_name:
|
||||||
self.format_opf()
|
self.format_opf()
|
||||||
data = serialize(data, self.mime_map[name], pretty_print=name in
|
data = oeb_base.serialize(data, self.mime_map[name], pretty_print=name in
|
||||||
self.pretty_print)
|
self.pretty_print)
|
||||||
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
|
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
|
||||||
# Needed as I can't get lxml to output opf:role and
|
# Needed as I can't get lxml to output opf:role and
|
||||||
@@ -1181,7 +1178,7 @@ class EpubContainer(Container):
|
|||||||
)
|
)
|
||||||
if not opf_files:
|
if not opf_files:
|
||||||
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
|
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
|
||||||
opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
|
opf_path = os.path.join(self.root, *(oeb_base.urlunquote(opf_files[0].get('full-path')).split('/')))
|
||||||
if not exists(opf_path):
|
if not exists(opf_path):
|
||||||
raise InvalidEpub('OPF file does not exist at location pointed to'
|
raise InvalidEpub('OPF file does not exist at location pointed to'
|
||||||
' by META-INF/container.xml')
|
' by META-INF/container.xml')
|
||||||
@@ -1412,7 +1409,7 @@ def do_explode(path, dest):
|
|||||||
def opf_to_azw3(opf, outpath, container):
|
def opf_to_azw3(opf, outpath, container):
|
||||||
from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook
|
from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook
|
||||||
|
|
||||||
class Item(Manifest.Item):
|
class Item(oeb_base.Manifest.Item):
|
||||||
|
|
||||||
def _parse_css(self, data):
|
def _parse_css(self, data):
|
||||||
# The default CSS parser used by oeb.base inserts the h namespace
|
# The default CSS parser used by oeb.base inserts the h namespace
|
||||||
|
|||||||
@@ -1,22 +1,16 @@
|
|||||||
from collections import defaultdict
|
import collections
|
||||||
from functools import partial
|
import functools
|
||||||
|
|
||||||
from css_parser.css import CSSRule, CSSStyleDeclaration
|
from css_parser.css import CSSRule, CSSStyleDeclaration
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
|
||||||
from ebook_converter import force_unicode
|
from ebook_converter import force_unicode
|
||||||
from ebook_converter.css_selectors import parse, SelectorSyntaxError
|
from ebook_converter.css_selectors import parse, SelectorSyntaxError
|
||||||
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
|
from ebook_converter.ebooks.oeb import base
|
||||||
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
|
from ebook_converter.ebooks.oeb.polish import pretty
|
||||||
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
|
|
||||||
from ebook_converter.utils.icu import numeric_sort_key
|
from ebook_converter.utils.icu import numeric_sort_key
|
||||||
from ebook_converter.css_selectors import Select, SelectorError
|
from ebook_converter.css_selectors import Select, SelectorError
|
||||||
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
||||||
|
|
||||||
|
|
||||||
def filter_used_rules(rules, log, select):
|
def filter_used_rules(rules, log, select):
|
||||||
for rule in rules:
|
for rule in rules:
|
||||||
used = False
|
used = False
|
||||||
@@ -34,7 +28,8 @@ def filter_used_rules(rules, log, select):
|
|||||||
yield rule
|
yield rule
|
||||||
|
|
||||||
|
|
||||||
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
|
def get_imported_sheets(name, container, sheets, recursion_level=10,
|
||||||
|
sheet=None):
|
||||||
ans = set()
|
ans = set()
|
||||||
sheet = sheet or sheets[name]
|
sheet = sheet or sheets[name]
|
||||||
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
|
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
|
||||||
@@ -44,7 +39,8 @@ def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None)
|
|||||||
ans.add(iname)
|
ans.add(iname)
|
||||||
if recursion_level > 0:
|
if recursion_level > 0:
|
||||||
for imported_sheet in tuple(ans):
|
for imported_sheet in tuple(ans):
|
||||||
ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
|
ans |= get_imported_sheets(imported_sheet, container, sheets,
|
||||||
|
recursion_level=recursion_level-1)
|
||||||
ans.discard(name)
|
ans.discard(name)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
@@ -56,7 +52,7 @@ def merge_declarations(first, second):
|
|||||||
|
|
||||||
def merge_identical_selectors(sheet):
|
def merge_identical_selectors(sheet):
|
||||||
' Merge rules that have identical selectors '
|
' Merge rules that have identical selectors '
|
||||||
selector_map = defaultdict(list)
|
selector_map = collections.defaultdict(list)
|
||||||
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
|
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
|
||||||
selector_map[rule.selectorText].append(rule)
|
selector_map[rule.selectorText].append(rule)
|
||||||
remove = []
|
remove = []
|
||||||
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
|
|||||||
return len(remove)
|
return len(remove)
|
||||||
|
|
||||||
|
|
||||||
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
|
def remove_unused_css(container, report=None, remove_unused_classes=False,
|
||||||
'''
|
merge_rules=False):
|
||||||
Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
|
"""
|
||||||
|
Remove all unused CSS rules from the book. An unused CSS rule is one that
|
||||||
|
does not match any actual content.
|
||||||
|
|
||||||
:param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
|
:param report: An optional callable that takes a single argument. It is
|
||||||
:param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
|
called with information about the operations being
|
||||||
|
performed.
|
||||||
|
:param remove_unused_classes: If True, class attributes in the HTML that
|
||||||
|
do not match any CSS rules are also removed.
|
||||||
:param merge_rules: If True, rules with identical selectors are merged.
|
:param merge_rules: If True, rules with identical selectors are merged.
|
||||||
'''
|
"""
|
||||||
report = report or (lambda x:x)
|
report = report or (lambda x: x)
|
||||||
|
|
||||||
def safe_parse(name):
|
def safe_parse(name):
|
||||||
try:
|
try:
|
||||||
return container.parsed(name)
|
return container.parsed(name)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
pass
|
pass
|
||||||
sheets = {name:safe_parse(name) for name, mt in container.mime_map.items() if mt in OEB_STYLES}
|
|
||||||
sheets = {k:v for k, v in sheets.items() if v is not None}
|
sheets = {name: safe_parse(name) for name, mt in container.mime_map.items()
|
||||||
|
if mt in base.OEB_STYLES and safe_parse(name) is not None}
|
||||||
num_merged = 0
|
num_merged = 0
|
||||||
if merge_rules:
|
if merge_rules:
|
||||||
for name, sheet in sheets.items():
|
for name, sheet in sheets.items():
|
||||||
@@ -106,7 +108,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
|
|||||||
num_of_removed_rules = num_of_removed_classes = 0
|
num_of_removed_rules = num_of_removed_classes = 0
|
||||||
|
|
||||||
for name, mt in container.mime_map.items():
|
for name, mt in container.mime_map.items():
|
||||||
if mt not in OEB_DOCS:
|
if mt not in base.OEB_DOCS:
|
||||||
continue
|
continue
|
||||||
root = container.parsed(name)
|
root = container.parsed(name)
|
||||||
select = Select(root, ignore_inappropriate_pseudo_classes=True)
|
select = Select(root, ignore_inappropriate_pseudo_classes=True)
|
||||||
@@ -120,31 +122,39 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
|
|||||||
num_merged += num
|
num_merged += num
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
if remove_unused_classes:
|
if remove_unused_classes:
|
||||||
used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)}
|
used_classes |= {x.lower() for x in
|
||||||
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
|
classes_in_rule_list(sheet.cssRules)}
|
||||||
|
imports = get_imported_sheets(name, container, sheets,
|
||||||
|
sheet=sheet)
|
||||||
for imported_sheet in imports:
|
for imported_sheet in imports:
|
||||||
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
|
style_rules[imported_sheet] = tuple(filter_used_rules(
|
||||||
|
style_rules[imported_sheet], container.log, select))
|
||||||
if remove_unused_classes:
|
if remove_unused_classes:
|
||||||
used_classes |= class_map[imported_sheet]
|
used_classes |= class_map[imported_sheet]
|
||||||
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
|
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
|
||||||
unused_rules = tuple(filter_used_rules(rules, container.log, select))
|
unused_rules = tuple(filter_used_rules(rules, container.log,
|
||||||
|
select))
|
||||||
if unused_rules:
|
if unused_rules:
|
||||||
num_of_removed_rules += len(unused_rules)
|
num_of_removed_rules += len(unused_rules)
|
||||||
[sheet.cssRules.remove(r) for r in unused_rules]
|
[sheet.cssRules.remove(r) for r in unused_rules]
|
||||||
style.text = force_unicode(sheet.cssText, 'utf-8')
|
style.text = force_unicode(sheet.cssText, 'utf-8')
|
||||||
pretty_script_or_style(container, style)
|
pretty.pretty_script_or_style(container, style)
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
|
|
||||||
for link in root.xpath('//*[local-name()="link" and @href]'):
|
for link in root.xpath('//*[local-name()="link" and @href]'):
|
||||||
sname = container.href_to_name(link.get('href'), name)
|
sname = container.href_to_name(link.get('href'), name)
|
||||||
if sname not in sheets:
|
if sname not in sheets:
|
||||||
continue
|
continue
|
||||||
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
|
style_rules[sname] = tuple(filter_used_rules(style_rules[sname],
|
||||||
|
container.log,
|
||||||
|
select))
|
||||||
if remove_unused_classes:
|
if remove_unused_classes:
|
||||||
used_classes |= class_map[sname]
|
used_classes |= class_map[sname]
|
||||||
|
|
||||||
for iname in import_map[sname]:
|
for iname in import_map[sname]:
|
||||||
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
|
style_rules[iname] = tuple(
|
||||||
|
filter_used_rules(style_rules[iname], container.log,
|
||||||
|
select))
|
||||||
if remove_unused_classes:
|
if remove_unused_classes:
|
||||||
used_classes |= class_map[iname]
|
used_classes |= class_map[iname]
|
||||||
|
|
||||||
@@ -159,7 +169,8 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
|
|||||||
elem.set('class', ' '.join(classes))
|
elem.set('class', ' '.join(classes))
|
||||||
else:
|
else:
|
||||||
del elem.attrib['class']
|
del elem.attrib['class']
|
||||||
num_of_removed_classes += len(original_classes) - len(classes)
|
num_of_removed_classes += (len(original_classes) -
|
||||||
|
len(classes))
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
|
|
||||||
for name, sheet in sheets.items():
|
for name, sheet in sheets.items():
|
||||||
@@ -195,7 +206,7 @@ def filter_declaration(style, properties=()):
|
|||||||
changed = True
|
changed = True
|
||||||
all_props = set(style.keys())
|
all_props = set(style.keys())
|
||||||
for prop in style.getProperties():
|
for prop in style.getProperties():
|
||||||
n = normalizers.get(prop.name, None)
|
n = base.normalize_css.normalizers.get(prop.name, None)
|
||||||
if n is not None:
|
if n is not None:
|
||||||
normalized = n(prop.name, prop.propertyValue)
|
normalized = n(prop.name, prop.propertyValue)
|
||||||
removed = properties.intersection(set(normalized))
|
removed = properties.intersection(set(normalized))
|
||||||
@@ -225,12 +236,13 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
|
|||||||
root = container.parsed(name)
|
root = container.parsed(name)
|
||||||
changed = False
|
changed = False
|
||||||
for style in root.xpath('//*[local-name()="style"]'):
|
for style in root.xpath('//*[local-name()="style"]'):
|
||||||
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
|
if style.text and (style.get('type') or
|
||||||
|
'text/css').lower() == 'text/css':
|
||||||
sheet = container.parse_css(style.text)
|
sheet = container.parse_css(style.text)
|
||||||
if transform_sheet(sheet):
|
if transform_sheet(sheet):
|
||||||
changed = True
|
changed = True
|
||||||
style.text = force_unicode(sheet.cssText, 'utf-8')
|
style.text = force_unicode(sheet.cssText, 'utf-8')
|
||||||
pretty_script_or_style(container, style)
|
pretty.pretty_script_or_style(container, style)
|
||||||
for elem in root.xpath('//*[@style]'):
|
for elem in root.xpath('//*[@style]'):
|
||||||
text = elem.get('style', None)
|
text = elem.get('style', None)
|
||||||
if text:
|
if text:
|
||||||
@@ -240,13 +252,16 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
|
|||||||
if style.length == 0:
|
if style.length == 0:
|
||||||
del elem.attrib['style']
|
del elem.attrib['style']
|
||||||
else:
|
else:
|
||||||
elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
|
elem.set('style',
|
||||||
|
force_unicode(style.getCssText(separator=' '),
|
||||||
|
'utf-8'))
|
||||||
return changed
|
return changed
|
||||||
|
|
||||||
|
|
||||||
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
|
def transform_css(container, transform_sheet=None, transform_style=None,
|
||||||
|
names=()):
|
||||||
if not names:
|
if not names:
|
||||||
types = OEB_STYLES | OEB_DOCS
|
types = base.OEB_STYLES | base.OEB_DOCS
|
||||||
names = []
|
names = []
|
||||||
for name, mt in container.mime_map.items():
|
for name, mt in container.mime_map.items():
|
||||||
if mt in types:
|
if mt in types:
|
||||||
@@ -256,13 +271,14 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
|
|||||||
|
|
||||||
for name in names:
|
for name in names:
|
||||||
mt = container.mime_map[name]
|
mt = container.mime_map[name]
|
||||||
if mt in OEB_STYLES:
|
if mt in base.OEB_STYLES:
|
||||||
sheet = container.parsed(name)
|
sheet = container.parsed(name)
|
||||||
if transform_sheet(sheet):
|
if transform_sheet(sheet):
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
doc_changed = True
|
doc_changed = True
|
||||||
elif mt in OEB_DOCS:
|
elif mt in base.OEB_DOCS:
|
||||||
if transform_inline_styles(container, name, transform_sheet, transform_style):
|
if transform_inline_styles(container, name, transform_sheet,
|
||||||
|
transform_style):
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
doc_changed = True
|
doc_changed = True
|
||||||
|
|
||||||
@@ -270,15 +286,21 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
|
|||||||
|
|
||||||
|
|
||||||
def filter_css(container, properties, names=()):
|
def filter_css(container, properties, names=()):
|
||||||
'''
|
"""
|
||||||
Remove the specified CSS properties from all CSS rules in the book.
|
Remove the specified CSS properties from all CSS rules in the book.
|
||||||
|
|
||||||
:param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
|
:param properties: Set of properties to remove. For example:
|
||||||
:param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
|
:code:`{'font-family', 'color'}`.
|
||||||
'''
|
:param names: The files from which to remove the properties. Defaults to
|
||||||
properties = normalize_filter_css(properties)
|
all HTML and CSS files in the book.
|
||||||
return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties),
|
"""
|
||||||
transform_style=partial(filter_declaration, properties=properties), names=names)
|
properties = base.normalize_css.normalize_filter_css(properties)
|
||||||
|
return transform_css(container,
|
||||||
|
transform_sheet=functools.partial(
|
||||||
|
filter_sheet, properties=properties),
|
||||||
|
transform_style=functools.partial(
|
||||||
|
filter_declaration, properties=properties),
|
||||||
|
names=names)
|
||||||
|
|
||||||
|
|
||||||
def _classes_in_selector(selector, classes):
|
def _classes_in_selector(selector, classes):
|
||||||
@@ -331,21 +353,29 @@ def remove_property_value(prop, predicate):
|
|||||||
if len(removed_vals) == len(prop.propertyValue):
|
if len(removed_vals) == len(prop.propertyValue):
|
||||||
prop.parent.removeProperty(prop.name)
|
prop.parent.removeProperty(prop.name)
|
||||||
else:
|
else:
|
||||||
x = css_text(prop.propertyValue)
|
x = base.css_text(prop.propertyValue)
|
||||||
for v in removed_vals:
|
for v in removed_vals:
|
||||||
x = x.replace(css_text(v), '').strip()
|
x = x.replace(base.css_text(v), '').strip()
|
||||||
prop.propertyValue.cssText = x
|
prop.propertyValue.cssText = x
|
||||||
return bool(removed_vals)
|
return bool(removed_vals)
|
||||||
|
|
||||||
|
|
||||||
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
|
RULE_PRIORITIES = {t: i for i, t in enumerate((CSSRule.COMMENT,
|
||||||
|
CSSRule.CHARSET_RULE,
|
||||||
|
CSSRule.IMPORT_RULE,
|
||||||
|
CSSRule.NAMESPACE_RULE))}
|
||||||
|
|
||||||
|
|
||||||
def sort_sheet(container, sheet_or_text):
|
def sort_sheet(container, sheet_or_text):
|
||||||
''' Sort the rules in a stylesheet. Note that in the general case this can
|
"""
|
||||||
change the effective styles, but for most common sheets, it should be safe.
|
Sort the rules in a stylesheet. Note that in the general case this can
|
||||||
'''
|
change the effective styles, but for most common sheets, it should be
|
||||||
sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, str) else sheet_or_text
|
safe.
|
||||||
|
"""
|
||||||
|
if isinstance(sheet_or_text, str):
|
||||||
|
sheet = container.parse_css(sheet_or_text)
|
||||||
|
else:
|
||||||
|
sheet = sheet_or_text
|
||||||
|
|
||||||
def text_sort_key(x):
|
def text_sort_key(x):
|
||||||
return numeric_sort_key(str(x or ''))
|
return numeric_sort_key(str(x or ''))
|
||||||
@@ -364,7 +394,8 @@ def sort_sheet(container, sheet_or_text):
|
|||||||
rule.selectorText = ', '.join(s.selectorText for s in selectors)
|
rule.selectorText = ', '.join(s.selectorText for s in selectors)
|
||||||
elif rule.type == CSSRule.FONT_FACE_RULE:
|
elif rule.type == CSSRule.FONT_FACE_RULE:
|
||||||
try:
|
try:
|
||||||
tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
|
tertiary = text_sort_key(rule.style.getPropertyValue('font-'
|
||||||
|
'family'))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -379,11 +410,14 @@ def add_stylesheet_links(container, name, text):
|
|||||||
if not head:
|
if not head:
|
||||||
return
|
return
|
||||||
head = head[0]
|
head = head[0]
|
||||||
sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
|
sheets = tuple(container.manifest_items_of_type(lambda mt:
|
||||||
|
mt in base.OEB_STYLES))
|
||||||
if not sheets:
|
if not sheets:
|
||||||
return
|
return
|
||||||
for sname in sheets:
|
for sname in sheets:
|
||||||
link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
|
link = head.makeelement(base.tag('xhtml', 'link'), type='text/css',
|
||||||
|
rel='stylesheet',
|
||||||
|
href=container.name_to_href(sname, name))
|
||||||
head.append(link)
|
head.append(link)
|
||||||
pretty_xml_tree(head)
|
pretty.pretty_xml_tree(head)
|
||||||
return serialize(root, 'text/html')
|
return pretty.serialize(root, 'text/html')
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
|
from ebook_converter.ebooks.oeb import base
|
||||||
from ebook_converter.utils.localization import canonicalize_lang
|
from ebook_converter.utils.localization import canonicalize_lang
|
||||||
|
|
||||||
|
|
||||||
@@ -14,7 +15,7 @@ def get_book_language(container):
|
|||||||
|
|
||||||
|
|
||||||
def set_guide_item(container, item_type, title, name, frag=None):
|
def set_guide_item(container, item_type, title, name, frag=None):
|
||||||
ref_tag = const.OPF_REFERENCE
|
ref_tag = base.tag('opf', 'reference')
|
||||||
href = None
|
href = None
|
||||||
if name:
|
if name:
|
||||||
href = container.name_to_href(name, container.opf_name)
|
href = container.name_to_href(name, container.opf_name)
|
||||||
@@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None):
|
|||||||
|
|
||||||
guides = container.opf_xpath('//opf:guide')
|
guides = container.opf_xpath('//opf:guide')
|
||||||
if not guides and href:
|
if not guides and href:
|
||||||
g = container.opf.makeelement(const.OPF_GUIDE,
|
g = container.opf.makeelement(base.tag('opf', 'guide'),
|
||||||
nsmap={'opf': const.OPF2_NS})
|
nsmap={'opf': const.OPF2_NS})
|
||||||
container.insert_into_xml(container.opf, g)
|
container.insert_into_xml(container.opf, g)
|
||||||
guides = [g]
|
guides = [g]
|
||||||
|
|||||||
@@ -1,18 +1,13 @@
|
|||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
# from lxml.etree import Element
|
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter import force_unicode
|
from ebook_converter import force_unicode
|
||||||
from ebook_converter.ebooks.oeb import parse_utils
|
from ebook_converter.ebooks.oeb import parse_utils
|
||||||
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
|
from ebook_converter.ebooks.oeb import base
|
||||||
from ebook_converter.ebooks.oeb.polish.utils import guess_type
|
from ebook_converter.ebooks.oeb.polish.utils import guess_type
|
||||||
from ebook_converter.utils.icu import sort_key
|
from ebook_converter.utils.icu import sort_key
|
||||||
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
||||||
|
|
||||||
def isspace(x):
|
def isspace(x):
|
||||||
return not x.strip('\u0009\u000a\u000c\u000d\u0020')
|
return not x.strip('\u0009\u000a\u000c\u000d\u0020')
|
||||||
|
|
||||||
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent=' '):
|
|||||||
for i, child in enumerate(elem):
|
for i, child in enumerate(elem):
|
||||||
pretty_xml_tree(child, level=level+1, indent=indent)
|
pretty_xml_tree(child, level=level+1, indent=indent)
|
||||||
if not child.tail or isspace(child.tail):
|
if not child.tail or isspace(child.tail):
|
||||||
l = level + 1
|
new_level = level + 1
|
||||||
if i == len(elem) - 1:
|
if i == len(elem) - 1:
|
||||||
l -= 1
|
new_level -= 1
|
||||||
child.tail = '\n' + (indent * l)
|
child.tail = '\n' + (indent * new_level)
|
||||||
|
|
||||||
|
|
||||||
def pretty_opf(root):
|
def pretty_opf(root):
|
||||||
# Put all dc: tags first starting with title and author. Preserve order for
|
# Put all dc: tags first starting with title and author. Preserve order for
|
||||||
# the rest.
|
# the rest.
|
||||||
def dckey(x):
|
def dckey(x):
|
||||||
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
|
return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
|
||||||
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
|
|
||||||
|
for metadata in root.xpath('//opf:metadata',
|
||||||
|
namespaces=const.OPF_NAMESPACES):
|
||||||
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
|
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
|
||||||
dc_tags.sort(key=dckey)
|
dc_tags.sort(key=dckey)
|
||||||
for x in reversed(dc_tags):
|
for x in reversed(dc_tags):
|
||||||
metadata.insert(0, x)
|
metadata.insert(0, x)
|
||||||
|
|
||||||
# Group items in the manifest
|
# Group items in the manifest
|
||||||
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
|
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
|
||||||
spine_ids = {x:i for i, x in enumerate(spine_ids)}
|
namespaces=const.OPF_NAMESPACES)
|
||||||
|
spine_ids = {x: i for i, x in enumerate(spine_ids)}
|
||||||
|
|
||||||
def manifest_key(x):
|
def manifest_key(x):
|
||||||
mt = x.get('media-type', '')
|
mt = x.get('media-type', '')
|
||||||
href = x.get('href', '')
|
href = x.get('href', '')
|
||||||
ext = href.rpartition('.')[-1].lower()
|
ext = href.rpartition('.')[-1].lower()
|
||||||
cat = 1000
|
cat = 1000
|
||||||
if mt in OEB_DOCS:
|
if mt in base.OEB_DOCS:
|
||||||
cat = 0
|
cat = 0
|
||||||
elif mt == guess_type('a.ncx'):
|
elif mt == guess_type('a.ncx'):
|
||||||
cat = 1
|
cat = 1
|
||||||
elif mt in OEB_STYLES:
|
elif mt in base.OEB_STYLES:
|
||||||
cat = 2
|
cat = 2
|
||||||
elif mt.startswith('image/'):
|
elif mt.startswith('image/'):
|
||||||
cat = 3
|
cat = 3
|
||||||
@@ -75,20 +73,23 @@ def pretty_opf(root):
|
|||||||
i = sort_key(href)
|
i = sort_key(href)
|
||||||
return (cat, i)
|
return (cat, i)
|
||||||
|
|
||||||
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
|
for manifest in root.xpath('//opf:manifest',
|
||||||
|
namespaces=const.OPF_NAMESPACES):
|
||||||
try:
|
try:
|
||||||
children = sorted(manifest, key=manifest_key)
|
children = sorted(manifest, key=manifest_key)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
continue # There are comments so dont sort since that would mess up the comments
|
# There are comments so dont sort since that would mess up the
|
||||||
|
# comments.
|
||||||
|
continue
|
||||||
|
|
||||||
for x in reversed(children):
|
for x in reversed(children):
|
||||||
manifest.insert(0, x)
|
manifest.insert(0, x)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def isblock(x):
|
def isblock(x):
|
||||||
if callable(x.tag) or not x.tag:
|
if callable(x.tag) or not x.tag:
|
||||||
return True
|
return True
|
||||||
if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
|
if x.tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent=' '):
|
|||||||
that contain only other block tags '''
|
that contain only other block tags '''
|
||||||
if not parent.text or isspace(parent.text):
|
if not parent.text or isspace(parent.text):
|
||||||
parent.text = ''
|
parent.text = ''
|
||||||
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
|
if (hasattr(parent.tag, 'strip') and
|
||||||
|
parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
|
||||||
|
nn = '\n'
|
||||||
|
else:
|
||||||
|
nn = '\n\n'
|
||||||
parent.text = parent.text + nn + (indent * level)
|
parent.text = parent.text + nn + (indent * level)
|
||||||
for i, child in enumerate(parent):
|
for i, child in enumerate(parent):
|
||||||
if isblock(child) and has_only_blocks(child):
|
if isblock(child) and has_only_blocks(child):
|
||||||
pretty_block(child, level=level+1, indent=indent)
|
pretty_block(child, level=level+1, indent=indent)
|
||||||
elif child.tag == const.SVG_SVG:
|
elif child.tag == base.tag('svg', 'svg'):
|
||||||
pretty_xml_tree(child, level=level, indent=indent)
|
pretty_xml_tree(child, level=level, indent=indent)
|
||||||
l = level
|
new_level = level
|
||||||
if i == len(parent) - 1:
|
if i == len(parent) - 1:
|
||||||
l -= 1
|
new_level -= 1
|
||||||
if not child.tail or isspace(child.tail):
|
if not child.tail or isspace(child.tail):
|
||||||
child.tail = ''
|
child.tail = ''
|
||||||
child.tail = child.tail + nn + (indent * l)
|
child.tail = child.tail + nn + (indent * new_level)
|
||||||
|
|
||||||
|
|
||||||
def pretty_script_or_style(container, child):
|
def pretty_script_or_style(container, child):
|
||||||
if child.text:
|
if child.text:
|
||||||
indent = indent_for_tag(child)
|
indent = indent_for_tag(child)
|
||||||
if child.tag.endswith('style'):
|
if child.tag.endswith('style'):
|
||||||
child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
|
child.text = force_unicode(pretty_css(container, '', child.text),
|
||||||
|
'utf-8')
|
||||||
child.text = textwrap.dedent(child.text)
|
child.text = textwrap.dedent(child.text)
|
||||||
child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
|
child.text = '\n' + '\n'.join([(indent + x) if x else ''
|
||||||
|
for x in child.text.splitlines()])
|
||||||
set_indent(child, 'text', indent)
|
set_indent(child, 'text', indent)
|
||||||
|
|
||||||
|
|
||||||
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
|
|||||||
# Special case the handling of a body that contains a single block tag
|
# Special case the handling of a body that contains a single block tag
|
||||||
# with all content. In this case we prettify the containing block tag
|
# with all content. In this case we prettify the containing block tag
|
||||||
# even if it has non block children.
|
# even if it has non block children.
|
||||||
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
|
if (len(body) == 1 and
|
||||||
body[0]) and parse_utils.barename(body[0].tag) not in (
|
not callable(body[0].tag) and
|
||||||
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
|
isblock(body[0]) and
|
||||||
|
not has_only_blocks(body[0]) and
|
||||||
|
parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
|
||||||
|
'h2', 'h3', 'h4',
|
||||||
|
'h5', 'h6') and
|
||||||
|
len(body[0]) > 0):
|
||||||
pretty_block(body[0], level=2)
|
pretty_block(body[0], level=2)
|
||||||
|
|
||||||
if container is not None:
|
if container is not None:
|
||||||
# Handle <script> and <style> tags
|
# Handle <script> and <style> tags
|
||||||
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
|
for child in root.xpath('//*[local-name()="script" or local-name()='
|
||||||
|
'"style"]'):
|
||||||
pretty_script_or_style(container, child)
|
pretty_script_or_style(container, child)
|
||||||
|
|
||||||
|
|
||||||
def fix_html(container, raw):
|
def fix_html(container, raw):
|
||||||
' Fix any parsing errors in the HTML represented as a string in raw. Fixing is done using the HTML5 parsing algorithm. '
|
"""
|
||||||
|
Fix any parsing errors in the HTML represented as a string in raw. Fixing
|
||||||
|
is done using the HTML5 parsing algorithm.
|
||||||
|
"""
|
||||||
root = container.parse_xhtml(raw)
|
root = container.parse_xhtml(raw)
|
||||||
return serialize(root, 'text/html')
|
return base.serialize(root, 'text/html')
|
||||||
|
|
||||||
|
|
||||||
def pretty_html(container, name, raw):
|
def pretty_html(container, name, raw):
|
||||||
' Pretty print the HTML represented as a string in raw '
|
"""
|
||||||
|
Pretty print the HTML represented as a string in raw
|
||||||
|
"""
|
||||||
root = container.parse_xhtml(raw)
|
root = container.parse_xhtml(raw)
|
||||||
pretty_html_tree(container, root)
|
pretty_html_tree(container, root)
|
||||||
return serialize(root, 'text/html')
|
return base.serialize(root, 'text/html')
|
||||||
|
|
||||||
|
|
||||||
def pretty_css(container, name, raw):
|
def pretty_css(container, name, raw):
|
||||||
' Pretty print the CSS represented as a string in raw '
|
"""
|
||||||
|
Pretty print the CSS represented as a string in raw
|
||||||
|
"""
|
||||||
sheet = container.parse_css(raw)
|
sheet = container.parse_css(raw)
|
||||||
return serialize(sheet, 'text/css')
|
return base.serialize(sheet, 'text/css')
|
||||||
|
|
||||||
|
|
||||||
def pretty_xml(container, name, raw):
|
def pretty_xml(container, name, raw):
|
||||||
' Pretty print the XML represented as a string in raw. If ``name`` is the name of the OPF, extra OPF-specific prettying is performed. '
|
"""
|
||||||
|
Pretty print the XML represented as a string in raw. If ``name`` is the
|
||||||
|
name of the OPF, extra OPF-specific prettying is performed.
|
||||||
|
"""
|
||||||
root = container.parse_xml(raw)
|
root = container.parse_xml(raw)
|
||||||
if name == container.opf_name:
|
if name == container.opf_name:
|
||||||
pretty_opf(root)
|
pretty_opf(root)
|
||||||
pretty_xml_tree(root)
|
pretty_xml_tree(root)
|
||||||
return serialize(root, 'text/xml')
|
return base.serialize(root, 'text/xml')
|
||||||
|
|
||||||
|
|
||||||
def fix_all_html(container):
|
def fix_all_html(container):
|
||||||
' Fix any parsing errors in all HTML files in the container. Fixing is done using the HTML5 parsing algorithm. '
|
"""
|
||||||
|
Fix any parsing errors in all HTML files in the container. Fixing is done
|
||||||
|
using the HTML5 parsing algorithm. """
|
||||||
for name, mt in container.mime_map.items():
|
for name, mt in container.mime_map.items():
|
||||||
if mt in OEB_DOCS:
|
if mt in base.OEB_DOCS:
|
||||||
container.parsed(name)
|
container.parsed(name)
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
|
|
||||||
|
|
||||||
def pretty_all(container):
|
def pretty_all(container):
|
||||||
' Pretty print all HTML/CSS/XML files in the container '
|
"""
|
||||||
|
Pretty print all HTML/CSS/XML files in the container
|
||||||
|
"""
|
||||||
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
|
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
|
||||||
for name, mt in container.mime_map.items():
|
for name, mt in container.mime_map.items():
|
||||||
prettied = False
|
prettied = False
|
||||||
if mt in OEB_DOCS:
|
if mt in base.OEB_DOCS:
|
||||||
pretty_html_tree(container, container.parsed(name))
|
pretty_html_tree(container, container.parsed(name))
|
||||||
prettied = True
|
prettied = True
|
||||||
elif mt in OEB_STYLES:
|
elif mt in base.OEB_STYLES:
|
||||||
container.parsed(name)
|
container.parsed(name)
|
||||||
prettied = True
|
prettied = True
|
||||||
elif name == container.opf_name:
|
elif name == container.opf_name:
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import urllib.parse
|
|||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter import guess_type, strftime
|
from ebook_converter import guess_type, strftime
|
||||||
from ebook_converter.constants_old import iswindows
|
from ebook_converter.constants_old import iswindows
|
||||||
|
from ebook_converter.ebooks.oeb import base
|
||||||
from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
|
from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
|
||||||
from ebook_converter.library.comments import comments_to_html, markdown
|
from ebook_converter.library.comments import comments_to_html, markdown
|
||||||
from ebook_converter.utils.date import is_date_undefined, as_local_time
|
from ebook_converter.utils.date import is_date_undefined, as_local_time
|
||||||
@@ -371,7 +372,7 @@ def render_jacket(mi, output_profile,
|
|||||||
# We cannot use data-calibre-rescale 100 on the body tag as that will just
|
# We cannot use data-calibre-rescale 100 on the body tag as that will just
|
||||||
# give the body tag a font size of 1em, which is useless.
|
# give the body tag a font size of 1em, which is useless.
|
||||||
for body in root.xpath('//*[local-name()="body"]'):
|
for body in root.xpath('//*[local-name()="body"]'):
|
||||||
fw = body.makeelement(const.XHTML_DIV)
|
fw = body.makeelement(base.tag('xhtml', 'div'))
|
||||||
fw.set('data-calibre-rescale', '100')
|
fw.set('data-calibre-rescale', '100')
|
||||||
for child in body:
|
for child in body:
|
||||||
fw.append(child)
|
fw.append(child)
|
||||||
@@ -388,9 +389,9 @@ def linearize_jacket(oeb):
|
|||||||
for x in oeb.spine[:4]:
|
for x in oeb.spine[:4]:
|
||||||
if XPath(JACKET_XPATH)(x.data):
|
if XPath(JACKET_XPATH)(x.data):
|
||||||
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
|
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
|
||||||
e.tag = const.XHTML_DIV
|
e.tag = base.tag('xhtml', 'div')
|
||||||
for e in XPath('//h:td')(x.data):
|
for e in XPath('//h:td')(x.data):
|
||||||
e.tag = const.XHTML_SPAN
|
e.tag = base.tag('xhtml', 'span')
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,11 @@ Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
|
|||||||
forced at "likely" locations to conform to size limitations. This transform
|
forced at "likely" locations to conform to size limitations. This transform
|
||||||
assumes a prior call to the flatcss transform.
|
assumes a prior call to the flatcss transform.
|
||||||
"""
|
"""
|
||||||
import os, functools, collections, re, copy
|
import collections
|
||||||
from collections import OrderedDict
|
import copy
|
||||||
|
import functools
|
||||||
|
import os
|
||||||
|
import re
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
from lxml.etree import XPath as _XPath
|
from lxml.etree import XPath as _XPath
|
||||||
@@ -13,8 +16,7 @@ from lxml import etree
|
|||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter import as_unicode, force_unicode
|
from ebook_converter import as_unicode, force_unicode
|
||||||
from ebook_converter.ebooks.epub import rules
|
from ebook_converter.ebooks.epub import rules
|
||||||
from ebook_converter.ebooks.oeb.base import \
|
from ebook_converter.ebooks.oeb import base
|
||||||
OEB_STYLES, rewrite_links, urlnormalize
|
|
||||||
from ebook_converter.ebooks.oeb.polish.split import do_split
|
from ebook_converter.ebooks.oeb.polish.split import do_split
|
||||||
from ebook_converter.polyglot.urllib import unquote
|
from ebook_converter.polyglot.urllib import unquote
|
||||||
from ebook_converter.css_selectors import Select, SelectorError
|
from ebook_converter.css_selectors import Select, SelectorError
|
||||||
@@ -44,14 +46,15 @@ class SplitError(ValueError):
|
|||||||
class Split(object):
|
class Split(object):
|
||||||
|
|
||||||
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
|
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
|
||||||
max_flow_size=0, remove_css_pagebreaks=True):
|
max_flow_size=0, remove_css_pagebreaks=True):
|
||||||
self.split_on_page_breaks = split_on_page_breaks
|
self.split_on_page_breaks = split_on_page_breaks
|
||||||
self.page_breaks_xpath = page_breaks_xpath
|
self.page_breaks_xpath = page_breaks_xpath
|
||||||
self.max_flow_size = max_flow_size
|
self.max_flow_size = max_flow_size
|
||||||
self.page_break_selectors = None
|
self.page_break_selectors = None
|
||||||
self.remove_css_pagebreaks = remove_css_pagebreaks
|
self.remove_css_pagebreaks = remove_css_pagebreaks
|
||||||
if self.page_breaks_xpath is not None:
|
if self.page_breaks_xpath is not None:
|
||||||
self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]
|
self.page_break_selectors = [(XPath(self.page_breaks_xpath),
|
||||||
|
False)]
|
||||||
|
|
||||||
def __call__(self, oeb, opts):
|
def __call__(self, oeb, opts):
|
||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
@@ -71,7 +74,7 @@ class Split(object):
|
|||||||
page_breaks, page_break_ids = self.find_page_breaks(item)
|
page_breaks, page_break_ids = self.find_page_breaks(item)
|
||||||
|
|
||||||
splitter = FlowSplitter(item, page_breaks, page_break_ids,
|
splitter = FlowSplitter(item, page_breaks, page_break_ids,
|
||||||
self.max_flow_size, self.oeb, self.opts)
|
self.max_flow_size, self.oeb, self.opts)
|
||||||
if splitter.was_split:
|
if splitter.was_split:
|
||||||
am = splitter.anchor_map
|
am = splitter.anchor_map
|
||||||
self.map[item.href] = collections.defaultdict(
|
self.map[item.href] = collections.defaultdict(
|
||||||
@@ -81,25 +84,27 @@ class Split(object):
|
|||||||
if self.page_break_selectors is None:
|
if self.page_break_selectors is None:
|
||||||
self.page_break_selectors = set()
|
self.page_break_selectors = set()
|
||||||
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
|
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
|
||||||
OEB_STYLES]
|
base.OEB_STYLES]
|
||||||
for rule in rules(stylesheets):
|
for rule in rules(stylesheets):
|
||||||
before = force_unicode(getattr(rule.style.getPropertyCSSValue(
|
before = force_unicode(getattr(rule.style.getPropertyCSSValue(
|
||||||
'page-break-before'), 'cssText', '').strip().lower())
|
'page-break-before'), 'cssText', '').strip().lower())
|
||||||
after = force_unicode(getattr(rule.style.getPropertyCSSValue(
|
after = force_unicode(getattr(rule.style.getPropertyCSSValue(
|
||||||
'page-break-after'), 'cssText', '').strip().lower())
|
'page-break-after'), 'cssText', '').strip().lower())
|
||||||
try:
|
try:
|
||||||
if before and before not in {'avoid', 'auto', 'inherit'}:
|
if before and before not in {'avoid', 'auto', 'inherit'}:
|
||||||
self.page_break_selectors.add((rule.selectorText, True))
|
self.page_break_selectors.add((rule.selectorText,
|
||||||
|
True))
|
||||||
if self.remove_css_pagebreaks:
|
if self.remove_css_pagebreaks:
|
||||||
rule.style.removeProperty('page-break-before')
|
rule.style.removeProperty('page-break-before')
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
if after and after not in {'avoid', 'auto', 'inherit'}:
|
if after and after not in {'avoid', 'auto', 'inherit'}:
|
||||||
self.page_break_selectors.add((rule.selectorText, False))
|
self.page_break_selectors.add((rule.selectorText,
|
||||||
|
False))
|
||||||
if self.remove_css_pagebreaks:
|
if self.remove_css_pagebreaks:
|
||||||
rule.style.removeProperty('page-break-after')
|
rule.style.removeProperty('page-break-after')
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
page_breaks = set()
|
page_breaks = set()
|
||||||
select = Select(item.data)
|
select = Select(item.data)
|
||||||
@@ -110,14 +115,18 @@ class Split(object):
|
|||||||
return [], []
|
return [], []
|
||||||
descendants = frozenset(body[0].iterdescendants('*'))
|
descendants = frozenset(body[0].iterdescendants('*'))
|
||||||
|
|
||||||
|
_tags = {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}
|
||||||
for selector, before in self.page_break_selectors:
|
for selector, before in self.page_break_selectors:
|
||||||
try:
|
try:
|
||||||
for elem in select(selector):
|
for elem in select(selector):
|
||||||
if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
|
if (elem in descendants and
|
||||||
|
elem.tag.rpartition('}')[2].lower() not in _tags):
|
||||||
elem.set('pb_before', '1' if before else '0')
|
elem.set('pb_before', '1' if before else '0')
|
||||||
page_breaks.add(elem)
|
page_breaks.add(elem)
|
||||||
except SelectorError as err:
|
except SelectorError as err:
|
||||||
self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
|
self.log.warn('Ignoring page breaks specified with invalid '
|
||||||
|
'CSS selector: %r (%s)' %
|
||||||
|
(selector, as_unicode(err)))
|
||||||
|
|
||||||
for i, elem in enumerate(item.data.iter('*')):
|
for i, elem in enumerate(item.data.iter('*')):
|
||||||
try:
|
try:
|
||||||
@@ -126,23 +135,23 @@ class Split(object):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
page_breaks = list(page_breaks)
|
page_breaks = list(page_breaks)
|
||||||
page_breaks.sort(key=lambda x:int(x.get('pb_order')))
|
page_breaks.sort(key=lambda x: int(x.get('pb_order')))
|
||||||
page_break_ids, page_breaks_ = [], []
|
page_break_ids, page_breaks_ = [], []
|
||||||
for i, x in enumerate(page_breaks):
|
for i, x in enumerate(page_breaks):
|
||||||
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
x.set('id', x.get('id', 'calibre_pb_%d' % i))
|
||||||
id = x.get('id')
|
id = x.get('id')
|
||||||
try:
|
try:
|
||||||
xp = XPath('//*[@id="%s"]'%id)
|
xp = XPath('//*[@id="%s"]' % id)
|
||||||
except:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
xp = XPath("//*[@id='%s']"%id)
|
xp = XPath("//*[@id='%s']" % id)
|
||||||
except:
|
except Exception:
|
||||||
# The id has both a quote and an apostrophe or some other
|
# The id has both a quote and an apostrophe or some other
|
||||||
# Just replace it since I doubt its going to work anywhere else
|
# Just replace it since I doubt its going to work anywhere
|
||||||
# either
|
# else either
|
||||||
id = 'calibre_pb_%d'%i
|
id = 'calibre_pb_%d' % i
|
||||||
x.set('id', id)
|
x.set('id', id)
|
||||||
xp = XPath('//*[@id=%r]'%id)
|
xp = XPath('//*[@id=%r]' % id)
|
||||||
page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
|
page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
|
||||||
page_break_ids.append(id)
|
page_break_ids.append(id)
|
||||||
|
|
||||||
@@ -159,7 +168,7 @@ class Split(object):
|
|||||||
for item in self.oeb.manifest:
|
for item in self.oeb.manifest:
|
||||||
if etree.iselement(item.data):
|
if etree.iselement(item.data):
|
||||||
self.current_item = item
|
self.current_item = item
|
||||||
rewrite_links(item.data, self.rewrite_links)
|
base.rewrite_links(item.data, self.rewrite_links)
|
||||||
|
|
||||||
def rewrite_links(self, url):
|
def rewrite_links(self, url):
|
||||||
href, frag = urllib.parse.urldefrag(url)
|
href, frag = urllib.parse.urldefrag(url)
|
||||||
@@ -169,7 +178,7 @@ class Split(object):
|
|||||||
# Unparseable URL
|
# Unparseable URL
|
||||||
return url
|
return url
|
||||||
try:
|
try:
|
||||||
href = urlnormalize(href)
|
href = base.urlnormalize(href)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# href has non utf-8 quoting
|
# href has non utf-8 quoting
|
||||||
return url
|
return url
|
||||||
@@ -188,19 +197,19 @@ class FlowSplitter(object):
|
|||||||
'The actual splitting logic'
|
'The actual splitting logic'
|
||||||
|
|
||||||
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
|
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
|
||||||
opts):
|
opts):
|
||||||
self.item = item
|
self.item = item
|
||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.log = oeb.log
|
self.log = oeb.log
|
||||||
self.page_breaks = page_breaks
|
self.page_breaks = page_breaks
|
||||||
self.page_break_ids = page_break_ids
|
self.page_break_ids = page_break_ids
|
||||||
self.max_flow_size = max_flow_size
|
self.max_flow_size = max_flow_size
|
||||||
self.base = item.href
|
self.base = item.href
|
||||||
self.csp_counter = 0
|
self.csp_counter = 0
|
||||||
|
|
||||||
base, ext = os.path.splitext(self.base)
|
name, ext = os.path.splitext(self.base)
|
||||||
self.base = base.replace('%', '%%')+'_split_%.3d'+ext
|
self.base = name.replace('%', '%%') + '_split_%.3d' + ext
|
||||||
|
|
||||||
self.trees = [self.item.data.getroottree()]
|
self.trees = [self.item.data.getroottree()]
|
||||||
self.splitting_on_page_breaks = True
|
self.splitting_on_page_breaks = True
|
||||||
@@ -210,13 +219,13 @@ class FlowSplitter(object):
|
|||||||
|
|
||||||
if self.max_flow_size > 0:
|
if self.max_flow_size > 0:
|
||||||
lt_found = False
|
lt_found = False
|
||||||
self.log('\tLooking for large trees in %s...'%item.href)
|
self.log('\tLooking for large trees in %s...' % item.href)
|
||||||
trees = list(self.trees)
|
trees = list(self.trees)
|
||||||
self.tree_map = {}
|
self.tree_map = {}
|
||||||
for i, tree in enumerate(trees):
|
for i, tree in enumerate(trees):
|
||||||
size = len(tostring(tree.getroot()))
|
size = len(tostring(tree.getroot()))
|
||||||
if size > self.max_flow_size:
|
if size > self.max_flow_size:
|
||||||
self.log('\tFound large tree #%d'%i)
|
self.log('\tFound large tree #%d' % i)
|
||||||
lt_found = True
|
lt_found = True
|
||||||
self.split_trees = []
|
self.split_trees = []
|
||||||
self.split_to_size(tree)
|
self.split_to_size(tree)
|
||||||
@@ -229,11 +238,11 @@ class FlowSplitter(object):
|
|||||||
|
|
||||||
self.was_split = len(self.trees) > 1
|
self.was_split = len(self.trees) > 1
|
||||||
if self.was_split:
|
if self.was_split:
|
||||||
self.log('\tSplit into %d parts'%len(self.trees))
|
self.log('\tSplit into %d parts' % len(self.trees))
|
||||||
self.commit()
|
self.commit()
|
||||||
|
|
||||||
def split_on_page_breaks(self, orig_tree):
|
def split_on_page_breaks(self, orig_tree):
|
||||||
ordered_ids = OrderedDict()
|
ordered_ids = collections.OrderedDict()
|
||||||
all_page_break_ids = frozenset(self.page_break_ids)
|
all_page_break_ids = frozenset(self.page_break_ids)
|
||||||
for elem_id in orig_tree.xpath('//*/@id'):
|
for elem_id in orig_tree.xpath('//*/@id'):
|
||||||
if elem_id in all_page_break_ids:
|
if elem_id in all_page_break_ids:
|
||||||
@@ -248,9 +257,10 @@ class FlowSplitter(object):
|
|||||||
tree = self.trees[i]
|
tree = self.trees[i]
|
||||||
elem = pattern(tree)
|
elem = pattern(tree)
|
||||||
if elem:
|
if elem:
|
||||||
self.log.debug('\t\tSplitting on page-break at id=%s'%
|
self.log.debug('\t\tSplitting on page-break at id=%s' %
|
||||||
elem[0].get('id'))
|
elem[0].get('id'))
|
||||||
before_tree, after_tree = self.do_split(tree, elem[0], before)
|
before_tree, after_tree = self.do_split(tree, elem[0],
|
||||||
|
before)
|
||||||
self.trees[i:i+1] = [before_tree, after_tree]
|
self.trees[i:i+1] = [before_tree, after_tree]
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -269,7 +279,11 @@ class FlowSplitter(object):
|
|||||||
if body is not None:
|
if body is not None:
|
||||||
existing_ids = frozenset(body.xpath('//*/@id'))
|
existing_ids = frozenset(body.xpath('//*/@id'))
|
||||||
for x in ids - existing_ids:
|
for x in ids - existing_ids:
|
||||||
body.insert(0, body.makeelement(const.XHTML_div, id=x, style='height:0pt'))
|
body.insert(0,
|
||||||
|
body.makeelement(base.tag('xhtml',
|
||||||
|
'div'),
|
||||||
|
id=x,
|
||||||
|
style='height:0pt'))
|
||||||
ids = set()
|
ids = set()
|
||||||
trees.append(tree)
|
trees.append(tree)
|
||||||
self.trees = trees
|
self.trees = trees
|
||||||
@@ -281,12 +295,13 @@ class FlowSplitter(object):
|
|||||||
return body[0]
|
return body[0]
|
||||||
|
|
||||||
def do_split(self, tree, split_point, before):
|
def do_split(self, tree, split_point, before):
|
||||||
'''
|
"""
|
||||||
Split ``tree`` into a *before* and *after* tree at ``split_point``.
|
Split ``tree`` into a *before* and *after* tree at ``split_point``.
|
||||||
|
|
||||||
:param before: If True tree is split before split_point, otherwise after split_point
|
:param before: If True tree is split before split_point, otherwise
|
||||||
|
after split_point
|
||||||
:return: before_tree, after_tree
|
:return: before_tree, after_tree
|
||||||
'''
|
"""
|
||||||
return do_split(split_point, self.log, before=before)
|
return do_split(split_point, self.log, before=before)
|
||||||
|
|
||||||
def is_page_empty(self, root):
|
def is_page_empty(self, root):
|
||||||
@@ -294,7 +309,7 @@ class FlowSplitter(object):
|
|||||||
if body is None:
|
if body is None:
|
||||||
return False
|
return False
|
||||||
txt = re.sub(r'\s+|\xa0', '',
|
txt = re.sub(r'\s+|\xa0', '',
|
||||||
etree.tostring(body, method='text', encoding='unicode'))
|
etree.tostring(body, method='text', encoding='unicode'))
|
||||||
if len(txt) > 1:
|
if len(txt) > 1:
|
||||||
return False
|
return False
|
||||||
for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
|
for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
|
||||||
@@ -305,13 +320,13 @@ class FlowSplitter(object):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def split_text(self, text, root, size):
|
def split_text(self, text, root, size):
|
||||||
self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
|
self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
|
||||||
rest = text.replace('\r', '')
|
rest = text.replace('\r', '')
|
||||||
parts = re.split('\n\n', rest)
|
parts = re.split('\n\n', rest)
|
||||||
self.log.debug('\t\t\t\tFound %d parts'%len(parts))
|
self.log.debug('\t\t\t\tFound %d parts' % len(parts))
|
||||||
if max(map(len, parts)) > size:
|
if max(map(len, parts)) > size:
|
||||||
raise SplitError('Cannot split as file contains a <pre> tag '
|
raise SplitError('Cannot split as file contains a <pre> tag '
|
||||||
'with a very large paragraph', root)
|
'with a very large paragraph', root)
|
||||||
ans = []
|
ans = []
|
||||||
buf = ''
|
buf = ''
|
||||||
for part in parts:
|
for part in parts:
|
||||||
@@ -331,7 +346,8 @@ class FlowSplitter(object):
|
|||||||
continue
|
continue
|
||||||
if pre.text and len(pre.text) > self.max_flow_size*0.5:
|
if pre.text and len(pre.text) > self.max_flow_size*0.5:
|
||||||
self.log.debug('\t\tSplitting large <pre> tag')
|
self.log.debug('\t\tSplitting large <pre> tag')
|
||||||
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
|
frags = self.split_text(pre.text, root,
|
||||||
|
int(0.2 * self.max_flow_size))
|
||||||
new_pres = []
|
new_pres = []
|
||||||
for frag in frags:
|
for frag in frags:
|
||||||
pre2 = copy.copy(pre)
|
pre2 = copy.copy(pre)
|
||||||
@@ -346,7 +362,8 @@ class FlowSplitter(object):
|
|||||||
split_point, before = self.find_split_point(root)
|
split_point, before = self.find_split_point(root)
|
||||||
if split_point is None:
|
if split_point is None:
|
||||||
raise SplitError(self.item.href, root)
|
raise SplitError(self.item.href, root)
|
||||||
self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
|
self.log.debug('\t\t\tSplit point:', split_point.tag,
|
||||||
|
tree.getpath(split_point))
|
||||||
|
|
||||||
trees = self.do_split(tree, split_point, before)
|
trees = self.do_split(tree, split_point, before)
|
||||||
sizes = [len(tostring(t.getroot())) for t in trees]
|
sizes = [len(tostring(t.getroot())) for t in trees]
|
||||||
@@ -361,12 +378,11 @@ class FlowSplitter(object):
|
|||||||
continue
|
continue
|
||||||
elif size <= self.max_flow_size:
|
elif size <= self.max_flow_size:
|
||||||
self.split_trees.append(t)
|
self.split_trees.append(t)
|
||||||
self.log.debug(
|
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)' %
|
||||||
'\t\t\tCommitted sub-tree #%d (%d KB)'%(
|
(len(self.split_trees), size/1024.))
|
||||||
len(self.split_trees), size/1024.))
|
|
||||||
else:
|
else:
|
||||||
self.log.debug(
|
self.log.debug('\t\t\tSplit tree still too large: %d KB' %
|
||||||
'\t\t\tSplit tree still too large: %d KB' % (size/1024.))
|
size/1024)
|
||||||
self.split_to_size(t)
|
self.split_to_size(t)
|
||||||
|
|
||||||
def find_split_point(self, root):
|
def find_split_point(self, root):
|
||||||
@@ -385,8 +401,8 @@ class FlowSplitter(object):
|
|||||||
'''
|
'''
|
||||||
def pick_elem(elems):
|
def pick_elem(elems):
|
||||||
if elems:
|
if elems:
|
||||||
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
|
elems = [i for i in elems
|
||||||
'1']
|
if i.get(SPLIT_POINT_ATTR, '0') != '1']
|
||||||
if elems:
|
if elems:
|
||||||
i = int(len(elems)//2)
|
i = int(len(elems)//2)
|
||||||
elems[i].set(SPLIT_POINT_ATTR, '1')
|
elems[i].set(SPLIT_POINT_ATTR, '1')
|
||||||
@@ -407,7 +423,7 @@ class FlowSplitter(object):
|
|||||||
if elem is not None:
|
if elem is not None:
|
||||||
try:
|
try:
|
||||||
XPath(elem.getroottree().getpath(elem))
|
XPath(elem.getroottree().getpath(elem))
|
||||||
except:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
return elem, True
|
return elem, True
|
||||||
|
|
||||||
@@ -421,23 +437,24 @@ class FlowSplitter(object):
|
|||||||
'''
|
'''
|
||||||
if not self.was_split:
|
if not self.was_split:
|
||||||
return
|
return
|
||||||
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
self.anchor_map = collections.defaultdict(lambda: self.base % 0)
|
||||||
self.files = []
|
self.files = []
|
||||||
|
|
||||||
for i, tree in enumerate(self.trees):
|
for i, tree in enumerate(self.trees):
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
self.files.append(self.base%i)
|
self.files.append(self.base % i)
|
||||||
for elem in root.xpath('//*[@id or @name]'):
|
for elem in root.xpath('//*[@id or @name]'):
|
||||||
for anchor in elem.get('id', ''), elem.get('name', ''):
|
for anchor in elem.get('id', ''), elem.get('name', ''):
|
||||||
if anchor != '' and anchor not in self.anchor_map:
|
if anchor != '' and anchor not in self.anchor_map:
|
||||||
self.anchor_map[anchor] = self.files[-1]
|
self.anchor_map[anchor] = self.files[-1]
|
||||||
for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
|
for elem in root.xpath('//*[@%s]' % SPLIT_POINT_ATTR):
|
||||||
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
||||||
|
|
||||||
spine_pos = self.item.spine_position
|
spine_pos = self.item.spine_position
|
||||||
|
|
||||||
for current, tree in zip(*map(reversed, (self.files, self.trees))):
|
for current, tree in zip(*map(reversed, (self.files, self.trees))):
|
||||||
for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP):
|
for a in tree.getroot().xpath('//h:a[@href]',
|
||||||
|
namespaces=const.XPNSMAP):
|
||||||
href = a.get('href').strip()
|
href = a.get('href').strip()
|
||||||
if href.startswith('#'):
|
if href.startswith('#'):
|
||||||
anchor = href[1:]
|
anchor = href[1:]
|
||||||
@@ -448,7 +465,8 @@ class FlowSplitter(object):
|
|||||||
|
|
||||||
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
|
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
|
||||||
new_item = self.oeb.manifest.add(new_id, current,
|
new_item = self.oeb.manifest.add(new_id, current,
|
||||||
self.item.media_type, data=tree.getroot())
|
self.item.media_type,
|
||||||
|
data=tree.getroot())
|
||||||
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
|
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
|
||||||
|
|
||||||
if self.oeb.guide:
|
if self.oeb.guide:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from lxml import etree
|
|||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter.ebooks.oeb import parse_utils
|
from ebook_converter.ebooks.oeb import parse_utils
|
||||||
from ebook_converter.ebooks.oeb.base import TOC, xml2text
|
from ebook_converter.ebooks.oeb import base
|
||||||
from ebook_converter.ebooks import ConversionError
|
from ebook_converter.ebooks import ConversionError
|
||||||
|
|
||||||
|
|
||||||
@@ -15,8 +15,8 @@ def XPath(x):
|
|||||||
try:
|
try:
|
||||||
return etree.XPath(x, namespaces=const.XPNSMAP)
|
return etree.XPath(x, namespaces=const.XPNSMAP)
|
||||||
except etree.XPathSyntaxError:
|
except etree.XPathSyntaxError:
|
||||||
raise ConversionError(
|
raise ConversionError('The syntax of the XPath expression %s is '
|
||||||
'The syntax of the XPath expression %s is invalid.' % repr(x))
|
'invalid.' % repr(x))
|
||||||
|
|
||||||
|
|
||||||
def isspace(x):
|
def isspace(x):
|
||||||
@@ -33,9 +33,13 @@ def at_start(elem):
|
|||||||
for x in body.iter():
|
for x in body.iter():
|
||||||
if x is elem:
|
if x is elem:
|
||||||
return True
|
return True
|
||||||
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
|
if hasattr(getattr(x, 'tag', None),
|
||||||
|
'rpartition') and x.tag.rpartition('}')[-1] in {'img',
|
||||||
|
'svg'}:
|
||||||
return False
|
return False
|
||||||
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
|
if isspace(getattr(x, 'text', None)) and (x in ancestors or
|
||||||
|
isspace(getattr(x, 'tail',
|
||||||
|
None))):
|
||||||
continue
|
continue
|
||||||
return False
|
return False
|
||||||
return False
|
return False
|
||||||
@@ -52,7 +56,7 @@ class DetectStructure(object):
|
|||||||
self.detect_chapters()
|
self.detect_chapters()
|
||||||
if self.oeb.auto_generated_toc or opts.use_auto_toc:
|
if self.oeb.auto_generated_toc or opts.use_auto_toc:
|
||||||
orig_toc = self.oeb.toc
|
orig_toc = self.oeb.toc
|
||||||
self.oeb.toc = TOC()
|
self.oeb.toc = base.TOC()
|
||||||
self.create_level_based_toc()
|
self.create_level_based_toc()
|
||||||
if self.oeb.toc.count() < 1:
|
if self.oeb.toc.count() < 1:
|
||||||
if not opts.no_chapters_in_toc and self.detected_chapters:
|
if not opts.no_chapters_in_toc and self.detected_chapters:
|
||||||
@@ -64,14 +68,14 @@ class DetectStructure(object):
|
|||||||
else:
|
else:
|
||||||
self.oeb.auto_generated_toc = True
|
self.oeb.auto_generated_toc = True
|
||||||
self.log('Auto generated TOC with %d entries.' %
|
self.log('Auto generated TOC with %d entries.' %
|
||||||
self.oeb.toc.count())
|
self.oeb.toc.count())
|
||||||
|
|
||||||
if opts.toc_filter is not None:
|
if opts.toc_filter is not None:
|
||||||
regexp = re.compile(opts.toc_filter)
|
regexp = re.compile(opts.toc_filter)
|
||||||
for node in list(self.oeb.toc.iter()):
|
for node in list(self.oeb.toc.iter()):
|
||||||
if not node.title or regexp.search(node.title) is not None:
|
if not node.title or regexp.search(node.title) is not None:
|
||||||
self.log('Filtering', node.title if node.title else
|
self.log('Filtering', node.title if node.title else
|
||||||
'empty node', 'from TOC')
|
'empty node', 'from TOC')
|
||||||
self.oeb.toc.remove(node)
|
self.oeb.toc.remove(node)
|
||||||
|
|
||||||
if opts.page_breaks_before is not None:
|
if opts.page_breaks_before is not None:
|
||||||
@@ -80,10 +84,11 @@ class DetectStructure(object):
|
|||||||
for elem in pb_xpath(item.data):
|
for elem in pb_xpath(item.data):
|
||||||
try:
|
try:
|
||||||
prev = next(elem.itersiblings(tag=etree.Element,
|
prev = next(elem.itersiblings(tag=etree.Element,
|
||||||
preceding=True))
|
preceding=True))
|
||||||
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename(
|
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and
|
||||||
prev.tag) in {'h1', 'h2'} and (not prev.tail or
|
parse_utils.barename(prev.tag) in {'h1',
|
||||||
not prev.tail.split())):
|
'h2'} and
|
||||||
|
(not prev.tail or not prev.tail.split())):
|
||||||
# We have two adjacent headings, do not put a page
|
# We have two adjacent headings, do not put a page
|
||||||
# break on the second one
|
# break on the second one
|
||||||
continue
|
continue
|
||||||
@@ -106,9 +111,9 @@ class DetectStructure(object):
|
|||||||
expr = self.opts.start_reading_at
|
expr = self.opts.start_reading_at
|
||||||
try:
|
try:
|
||||||
expr = XPath(expr)
|
expr = XPath(expr)
|
||||||
except:
|
except Exception:
|
||||||
self.log.warn(
|
self.log.warn('Invalid start reading at XPath expression, '
|
||||||
'Invalid start reading at XPath expression, ignoring: %s'%expr)
|
'ignoring: %s' % expr)
|
||||||
return
|
return
|
||||||
for item in self.oeb.spine:
|
for item in self.oeb.spine:
|
||||||
if not hasattr(item.data, 'xpath'):
|
if not hasattr(item.data, 'xpath'):
|
||||||
@@ -118,16 +123,17 @@ class DetectStructure(object):
|
|||||||
elem = matches[0]
|
elem = matches[0]
|
||||||
eid = elem.get('id', None)
|
eid = elem.get('id', None)
|
||||||
if not eid:
|
if not eid:
|
||||||
eid = 'start_reading_at_'+str(uuid.uuid4()).replace('-', '')
|
eid = 'start_reading_at_' + str(uuid.uuid4()).replace('-',
|
||||||
|
'')
|
||||||
elem.set('id', eid)
|
elem.set('id', eid)
|
||||||
if 'text' in self.oeb.guide:
|
if 'text' in self.oeb.guide:
|
||||||
self.oeb.guide.remove('text')
|
self.oeb.guide.remove('text')
|
||||||
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
|
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
|
||||||
self.log('Setting start reading at position to %s in %s'%(
|
self.log('Setting start reading at position to %s in %s' %
|
||||||
self.opts.start_reading_at, item.href))
|
(self.opts.start_reading_at, item.href))
|
||||||
return
|
return
|
||||||
self.log.warn("Failed to find start reading at position: %s"%
|
self.log.warn("Failed to find start reading at position: %s" %
|
||||||
self.opts.start_reading_at)
|
self.opts.start_reading_at)
|
||||||
|
|
||||||
def get_toc_parts_for_xpath(self, expr):
|
def get_toc_parts_for_xpath(self, expr):
|
||||||
# if an attribute is selected by the xpath expr then truncate it
|
# if an attribute is selected by the xpath expr then truncate it
|
||||||
@@ -148,12 +154,14 @@ class DetectStructure(object):
|
|||||||
ans = XPath(expr)(doc)
|
ans = XPath(expr)(doc)
|
||||||
len(ans)
|
len(ans)
|
||||||
return ans
|
return ans
|
||||||
except:
|
except Exception:
|
||||||
self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
|
self.log.warn('Invalid chapter expression, ignoring: %s' %
|
||||||
|
expr)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if self.opts.chapter:
|
if self.opts.chapter:
|
||||||
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
|
chapter_path, title_attribute = (
|
||||||
|
self.get_toc_parts_for_xpath(self.opts.chapter))
|
||||||
self.chapter_title_attribute = title_attribute
|
self.chapter_title_attribute = title_attribute
|
||||||
for item in self.oeb.spine:
|
for item in self.oeb.spine:
|
||||||
for x in find_matches(chapter_path, item.data):
|
for x in find_matches(chapter_path, item.data):
|
||||||
@@ -165,25 +173,28 @@ class DetectStructure(object):
|
|||||||
c = collections.Counter()
|
c = collections.Counter()
|
||||||
for item, elem in self.detected_chapters:
|
for item, elem in self.detected_chapters:
|
||||||
c[item] += 1
|
c[item] += 1
|
||||||
text = xml2text(elem).strip()
|
text = base.xml2text(elem).strip()
|
||||||
text = re.sub(r'\s+', ' ', text.strip())
|
text = re.sub(r'\s+', ' ', text.strip())
|
||||||
self.log('\tDetected chapter:', text[:50])
|
self.log('\tDetected chapter:', text[:50])
|
||||||
if chapter_mark == 'none':
|
if chapter_mark == 'none':
|
||||||
continue
|
continue
|
||||||
if chapter_mark == 'rule':
|
if chapter_mark == 'rule':
|
||||||
mark = elem.makeelement(const.XHTML_HR)
|
mark = elem.makeelement(base.tag('xhtml', 'hr'))
|
||||||
elif chapter_mark == 'pagebreak':
|
elif chapter_mark == 'pagebreak':
|
||||||
if c[item] < 3 and at_start(elem):
|
if c[item] < 3 and at_start(elem):
|
||||||
# For the first two elements in this item, check if they
|
# For the first two elements in this item, check if
|
||||||
# are at the start of the file, in which case inserting a
|
# they are at the start of the file, in which case
|
||||||
# page break in unnecessary and can lead to extra blank
|
# inserting a page break in unnecessary and can lead
|
||||||
# pages in the PDF Output plugin. We need to use two as
|
# to extra blank pages in the PDF Output plugin. We
|
||||||
# feedbooks epubs match both a heading tag and its
|
# need to use two as feedbooks epubs match both a
|
||||||
# containing div with the default chapter expression.
|
# heading tag and its containing div with the default
|
||||||
|
# chapter expression.
|
||||||
continue
|
continue
|
||||||
mark = elem.makeelement(const.XHTML_DIV, style=page_break_after)
|
mark = elem.makeelement(base.tag('xhtml', 'div'),
|
||||||
|
style=page_break_after)
|
||||||
else: # chapter_mark == 'both':
|
else: # chapter_mark == 'both':
|
||||||
mark = elem.makeelement(const.XHTML_HR, style=page_break_before)
|
mark = elem.makeelement(base.tag('xhtml', 'hr'),
|
||||||
|
style=page_break_before)
|
||||||
try:
|
try:
|
||||||
elem.addprevious(mark)
|
elem.addprevious(mark)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
@@ -196,7 +207,9 @@ class DetectStructure(object):
|
|||||||
def create_toc_from_chapters(self):
|
def create_toc_from_chapters(self):
|
||||||
counter = self.oeb.toc.next_play_order()
|
counter = self.oeb.toc.next_play_order()
|
||||||
for item, elem in self.detected_chapters:
|
for item, elem in self.detected_chapters:
|
||||||
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
|
text, href = self.elem_to_link(item, elem,
|
||||||
|
self.chapter_title_attribute,
|
||||||
|
counter)
|
||||||
self.oeb.toc.add(text, href, play_order=counter)
|
self.oeb.toc.add(text, href, play_order=counter)
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|
||||||
@@ -216,18 +229,21 @@ class DetectStructure(object):
|
|||||||
if frag:
|
if frag:
|
||||||
href = '#'.join((href, frag))
|
href = '#'.join((href, frag))
|
||||||
if not self.oeb.toc.has_href(href):
|
if not self.oeb.toc.has_href(href):
|
||||||
text = xml2text(a)
|
text = base.xml2text(a)
|
||||||
text = text[:100].strip()
|
text = text[:100].strip()
|
||||||
if (not self.opts.duplicate_links_in_toc and
|
if (not self.opts.duplicate_links_in_toc and
|
||||||
self.oeb.toc.has_text(text)):
|
self.oeb.toc.has_text(text)):
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
self.oeb.toc.add(text, href,
|
self.oeb.toc.add(
|
||||||
|
text, href,
|
||||||
play_order=self.oeb.toc.next_play_order())
|
play_order=self.oeb.toc.next_play_order())
|
||||||
num += 1
|
num += 1
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.oeb.log.exception('Failed to process link: %r' % href)
|
self.oeb.log.exception('Failed to process link: '
|
||||||
continue # Most likely an incorrectly URL encoded link
|
'%r' % href)
|
||||||
|
# Most likely an incorrectly URL encoded link
|
||||||
|
continue
|
||||||
if self.opts.max_toc_links > 0 and \
|
if self.opts.max_toc_links > 0 and \
|
||||||
num >= self.opts.max_toc_links:
|
num >= self.opts.max_toc_links:
|
||||||
self.log('Maximum TOC links reached, stopping.')
|
self.log('Maximum TOC links reached, stopping.')
|
||||||
@@ -238,14 +254,14 @@ class DetectStructure(object):
|
|||||||
if title_attribute is not None:
|
if title_attribute is not None:
|
||||||
text = elem.get(title_attribute, '')
|
text = elem.get(title_attribute, '')
|
||||||
if not text:
|
if not text:
|
||||||
text = xml2text(elem).strip()
|
text = base.xml2text(elem).strip()
|
||||||
if not text:
|
if not text:
|
||||||
text = elem.get('title', '')
|
text = elem.get('title', '')
|
||||||
if not text:
|
if not text:
|
||||||
text = elem.get('alt', '')
|
text = elem.get('alt', '')
|
||||||
text = re.sub(r'\s+', ' ', text.strip())
|
text = re.sub(r'\s+', ' ', text.strip())
|
||||||
text = text[:1000].strip()
|
text = text[:1000].strip()
|
||||||
id = elem.get('id', 'calibre_toc_%d'%counter)
|
id = elem.get('id', 'calibre_toc_%d' % counter)
|
||||||
elem.set('id', id)
|
elem.set('id', id)
|
||||||
href = '#'.join((item.href, id))
|
href = '#'.join((item.href, id))
|
||||||
return text, href
|
return text, href
|
||||||
@@ -260,26 +276,29 @@ class DetectStructure(object):
|
|||||||
ans = XPath(expr)(doc)
|
ans = XPath(expr)(doc)
|
||||||
len(ans)
|
len(ans)
|
||||||
return ans
|
return ans
|
||||||
except:
|
except Exception:
|
||||||
self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
|
self.log.warn('Invalid ToC expression, ignoring: %s' % expr)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for document in self.oeb.spine:
|
for document in self.oeb.spine:
|
||||||
previous_level1 = list(added.values())[-1] if added else None
|
previous_level1 = list(added.values())[-1] if added else None
|
||||||
previous_level2 = list(added2.values())[-1] if added2 else None
|
previous_level2 = list(added2.values())[-1] if added2 else None
|
||||||
|
|
||||||
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
|
(level1_toc,
|
||||||
|
level1_title) = self.get_toc_parts_for_xpath(self.opts.level1_toc)
|
||||||
for elem in find_matches(level1_toc, document.data):
|
for elem in find_matches(level1_toc, document.data):
|
||||||
text, _href = self.elem_to_link(document, elem, level1_title, counter)
|
text, _href = self.elem_to_link(document, elem, level1_title,
|
||||||
|
counter)
|
||||||
counter += 1
|
counter += 1
|
||||||
if text:
|
if text:
|
||||||
node = self.oeb.toc.add(text, _href,
|
node = self.oeb.toc.add(
|
||||||
play_order=self.oeb.toc.next_play_order())
|
text, _href, play_order=self.oeb.toc.next_play_order())
|
||||||
added[elem] = node
|
added[elem] = node
|
||||||
# node.add('Top', _href)
|
# node.add('Top', _href)
|
||||||
|
|
||||||
if self.opts.level2_toc is not None and added:
|
if self.opts.level2_toc is not None and added:
|
||||||
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
|
level2_toc, level2_title = self.get_toc_parts_for_xpath(
|
||||||
|
self.opts.level2_toc)
|
||||||
for elem in find_matches(level2_toc, document.data):
|
for elem in find_matches(level2_toc, document.data):
|
||||||
level1 = None
|
level1 = None
|
||||||
for item in document.data.iterdescendants():
|
for item in document.data.iterdescendants():
|
||||||
@@ -290,15 +309,19 @@ class DetectStructure(object):
|
|||||||
if previous_level1 is None:
|
if previous_level1 is None:
|
||||||
break
|
break
|
||||||
level1 = previous_level1
|
level1 = previous_level1
|
||||||
text, _href = self.elem_to_link(document, elem, level2_title, counter)
|
text, _href = self.elem_to_link(document, elem,
|
||||||
|
level2_title,
|
||||||
|
counter)
|
||||||
counter += 1
|
counter += 1
|
||||||
if text:
|
if text:
|
||||||
added2[elem] = level1.add(text, _href,
|
added2[elem] = level1.add(
|
||||||
|
text, _href,
|
||||||
play_order=self.oeb.toc.next_play_order())
|
play_order=self.oeb.toc.next_play_order())
|
||||||
break
|
break
|
||||||
|
|
||||||
if self.opts.level3_toc is not None and added2:
|
if self.opts.level3_toc is not None and added2:
|
||||||
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
|
level3_toc, level3_title = self.get_toc_parts_for_xpath(
|
||||||
|
self.opts.level3_toc)
|
||||||
for elem in find_matches(level3_toc, document.data):
|
for elem in find_matches(level3_toc, document.data):
|
||||||
level2 = None
|
level2 = None
|
||||||
for item in document.data.iterdescendants():
|
for item in document.data.iterdescendants():
|
||||||
@@ -309,10 +332,13 @@ class DetectStructure(object):
|
|||||||
if previous_level2 is None:
|
if previous_level2 is None:
|
||||||
break
|
break
|
||||||
level2 = previous_level2
|
level2 = previous_level2
|
||||||
text, _href = \
|
text, _href = self.elem_to_link(document,
|
||||||
self.elem_to_link(document, elem, level3_title, counter)
|
elem,
|
||||||
|
level3_title,
|
||||||
|
counter)
|
||||||
counter += 1
|
counter += 1
|
||||||
if text:
|
if text:
|
||||||
level2.add(text, _href,
|
level2.add(text, _href,
|
||||||
play_order=self.oeb.toc.next_play_order())
|
play_order=self.oeb
|
||||||
|
.toc.next_play_order())
|
||||||
break
|
break
|
||||||
|
|||||||
Reference in New Issue
Block a user