diff --git a/ebook_converter/ebooks/metadata/opf3.py b/ebook_converter/ebooks/metadata/opf3.py index dd146a6..0f4fd1d 100644 --- a/ebook_converter/ebooks/metadata/opf3.py +++ b/ebook_converter/ebooks/metadata/opf3.py @@ -1,14 +1,16 @@ +import collections +import functools import json import re -from collections import defaultdict, namedtuple -from functools import wraps from lxml import etree from ebook_converter import constants as const from ebook_converter import prints -from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors -from ebook_converter.ebooks.metadata.book.base import Metadata +from ebook_converter.ebooks.metadata import authors_to_string +from ebook_converter.ebooks.metadata import check_isbn +from ebook_converter.ebooks.metadata import string_to_authors +from ebook_converter.ebooks.metadata.book import base from ebook_converter.ebooks.metadata.book.json_codec import ( decode_is_multiple, encode_is_multiple, object_to_unicode ) @@ -17,17 +19,30 @@ from ebook_converter.ebooks.metadata.utils import ( pretty_print_opf ) from ebook_converter.utils.config import from_json, to_json -from ebook_converter.utils.date import ( - fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow, - w3cdtf -) +from ebook_converter.utils.date import (fix_only_date, is_date_undefined, + isoformat, parse_date as parse_date_, + utcnow, w3cdtf) from ebook_converter.utils.iso8601 import parse_iso8601 from ebook_converter.utils.localization import canonicalize_lang +RES_PREFIXES = {'dcterms': 'http://purl.org/dc/terms/', + 'epubsc': 'http://idpf.org/epub/vocab/sc/#', + 'marc': 'http://id.loc.gov/vocabulary/', + 'media': 'http://www.idpf.org/epub/vocab/overlays/#', + 'onix': 'http://www.editeur.org/ONIX/book/codelists/' + 'current.html#', + 'rendition': 'http://www.idpf.org/vocab/rendition/#', + 'schema': 'http://schema.org/', + 'xsd': 'http://www.w3.org/2001/XMLSchema#'} + +CALIBRE_PREFIX = 'https://calibre-ebook.com' +KNOWN_PREFIXES = RES_PREFIXES.copy() +KNOWN_PREFIXES['calibre'] = CALIBRE_PREFIX + # Utils {{{ -_xpath_cache = {} -_re_cache = {} +_XPATH_CACHE = {} +_RE_CACHE = {} def uniq(vals): @@ -39,22 +54,23 @@ def uniq(vals): def dump_dict(cats): - return json.dumps(object_to_unicode(cats or {}), ensure_ascii=False, skipkeys=True) + return json.dumps(object_to_unicode(cats or {}), ensure_ascii=False, + skipkeys=True) def XPath(x): try: - return _xpath_cache[x] + return _XPATH_CACHE[x] except KeyError: - _xpath_cache[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP) + _XPATH_CACHE[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP) return ans def regex(r, flags=0): try: - return _re_cache[(r, flags)] + return _RE_CACHE[(r, flags)] except KeyError: - _re_cache[(r, flags)] = ans = re.compile(r, flags) + _RE_CACHE[(r, flags)] = ans = re.compile(r, flags) return ans @@ -82,7 +98,7 @@ def properties_for_id(item_id, refines): def properties_for_id_with_scheme(item_id, prefixes, refines): - ans = defaultdict(list) + ans = collections.defaultdict(list) if item_id: for elem in refines[item_id]: key = elem.get('property') @@ -126,7 +142,7 @@ def normalize_whitespace(text): def simple_text(f): - @wraps(f) + @functools.wraps(f) def wrapper(*args, **kw): return normalize_whitespace(f(*args, **kw)) return wrapper @@ -135,7 +151,7 @@ def simple_text(f): def items_with_property(root, q, prefixes=None): if prefixes is None: prefixes = read_prefixes(root) - q = expand_prefix(q, known_prefixes).lower() + q = expand_prefix(q, KNOWN_PREFIXES).lower() 
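# --- editor's note: illustrative sketch, not part of the patch above ------
# The _XPATH_CACHE/_RE_CACHE pattern introduced in this hunk compiles each
# XPath or regular expression once per process and reuses it afterwards.
# A self-contained restatement of the idea (cached_xpath and the sample
# document are hypothetical; etree.XPath and its namespaces argument are
# real lxml API):

from lxml import etree

_COMPILED = {}


def cached_xpath(expr, nsmap):
    # First call compiles and caches; later calls are plain dict lookups.
    try:
        return _COMPILED[expr]
    except KeyError:
        _COMPILED[expr] = ans = etree.XPath(expr, namespaces=nsmap)
        return ans


def _demo_cached_xpath():
    # The namespace URIs below are the standard OPF 2 / Dublin Core ones
    # that const.OPF2_NSMAP is assumed to contain.
    nsmap = {'opf': 'http://www.idpf.org/2007/opf',
             'dc': 'http://purl.org/dc/elements/1.1/'}
    root = etree.fromstring(
        b'<package xmlns="http://www.idpf.org/2007/opf">'
        b'<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
        b'<dc:title>Example</dc:title></metadata></package>')
    title = cached_xpath('./opf:metadata/dc:title', nsmap)(root)[0]
    assert title.text == 'Example'
# ---------------------------------------------------------------------------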
for item in XPath("./opf:manifest/opf:item[@properties]")(root): for prop in (item.get('properties') or '').lower().split(): prop = expand_prefix(prop, prefixes) @@ -150,43 +166,32 @@ def items_with_property(root, q, prefixes=None): # http://www.idpf.org/epub/vocab/package/pfx/ -reserved_prefixes = { - 'dcterms': 'http://purl.org/dc/terms/', - 'epubsc': 'http://idpf.org/epub/vocab/sc/#', - 'marc': 'http://id.loc.gov/vocabulary/', - 'media': 'http://www.idpf.org/epub/vocab/overlays/#', - 'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#', - 'rendition':'http://www.idpf.org/vocab/rendition/#', - 'schema': 'http://schema.org/', - 'xsd': 'http://www.w3.org/2001/XMLSchema#', -} - -CALIBRE_PREFIX = 'https://calibre-ebook.com' -known_prefixes = reserved_prefixes.copy() -known_prefixes['calibre'] = CALIBRE_PREFIX - - def parse_prefixes(x): - return {m.group(1):m.group(2) for m in re.finditer(r'(\S+): \s*(\S+)', x)} + return {m.group(1): m.group(2) + for m in re.finditer(r'(\S+): \s*(\S+)', x)} def read_prefixes(root): - ans = reserved_prefixes.copy() + ans = RES_PREFIXES.copy() ans.update(parse_prefixes(root.get('prefix') or '')) return ans def expand_prefix(raw, prefixes): - return regex(r'(\S+)\s*:\s*(\S+)').sub(lambda m:(prefixes.get(m.group(1), m.group(1)) + ':' + m.group(2)), raw or '') + return (regex(r'(\S+)\s*:\s*(\S+)') + .sub(lambda m: (prefixes.get(m.group(1), + m.group(1)) + ':' + m.group(2)), + raw or '')) def ensure_prefix(root, prefixes, prefix, value=None): if prefixes is None: prefixes = read_prefixes(root) - prefixes[prefix] = value or reserved_prefixes[prefix] - prefixes = {k:v for k, v in prefixes.items() if reserved_prefixes.get(k) != v} + prefixes[prefix] = value or RES_PREFIXES[prefix] + prefixes = {k: v for k, v in prefixes.items() if RES_PREFIXES.get(k) != v} if prefixes: - root.set('prefix', ' '.join('%s: %s' % (k, v) for k, v in prefixes.items())) + root.set('prefix', ' '.join('%s: %s' % (k, v) + for k, v in prefixes.items())) else: root.attrib.pop('prefix', None) @@ -196,7 +201,7 @@ def ensure_prefix(root, prefixes, prefix, value=None): def read_refines(root): - ans = defaultdict(list) + ans = collections.defaultdict(list) for meta in XPath('./opf:metadata/opf:meta[@refines]')(root): r = meta.get('refines') or '' if r.startswith('#'): @@ -213,7 +218,7 @@ def set_refines(elem, existing_refines, *new_refines): remove_refines(elem, existing_refines) for ref in reversed(new_refines): prop, val, scheme = ref - r = elem.makeelement(const.OPF_META) + r = elem.makeelement(base.tag('opf', 'meta')) r.set('refines', '#' + eid), r.set('property', prop) r.text = val.strip() if scheme: @@ -249,7 +254,7 @@ def parse_identifier(ident, val, refines): # Try the OPF 2 style opf:scheme attribute, which will be present, for # example, in EPUB 3 files that have had their metadata set by an # application that only understands EPUB 2. 
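# --- editor's note: illustrative sketch, not part of the patch above ------
# How the prefix helpers defined in this hunk fit together: parse_prefixes()
# reads an EPUB 3 package 'prefix' attribute, read_prefixes() layers the
# result over RES_PREFIXES, and expand_prefix() rewrites 'prefix:suffix'
# values to full vocabulary IRIs.  A runnable check (inside this module),
# using the foaf example from the EPUB 3 specification:


def _demo_prefix_expansion():
    prefixes = parse_prefixes('foaf: http://xmlns.com/foaf/spec/')
    assert prefixes == {'foaf': 'http://xmlns.com/foaf/spec/'}
    assert (expand_prefix('foaf:name', prefixes) ==
            'http://xmlns.com/foaf/spec/:name')
    # Reserved vocabularies resolve through KNOWN_PREFIXES, which is how
    # is_relators_role() later recognises marc:relators role schemes.
    assert (expand_prefix('marc:relators', KNOWN_PREFIXES) ==
            'http://id.loc.gov/vocabulary/:relators')
# ---------------------------------------------------------------------------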
- scheme = ident.get(const.OPF_SCHEME) + scheme = ident.get(base.tag('opf', 'scheme')) if scheme and not lval.startswith('urn:'): return finalize(scheme, val) @@ -267,7 +272,7 @@ def parse_identifier(ident, val, refines): def read_identifiers(root, prefixes, refines): - ans = defaultdict(list) + ans = collections.defaultdict(list) for ident in XPath('./opf:metadata/dc:identifier')(root): val = (ident.text or '').strip() if val: @@ -277,7 +282,8 @@ def read_identifiers(root, prefixes, refines): return ans -def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=False): +def set_identifiers(root, prefixes, refines, new_identifiers, + force_identifiers=False): uid = root.get('unique-identifier') package_identifier = None for ident in XPath('./opf:metadata/dc:identifier')(root): @@ -289,12 +295,15 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers= ident.getparent().remove(ident) continue scheme, val = parse_identifier(ident, val, refines) - if not scheme or not val or force_identifiers or scheme in new_identifiers: + if (not scheme or + not val or + force_identifiers or + scheme in new_identifiers): remove_element(ident, refines) continue metadata = XPath('./opf:metadata')(root)[0] for scheme, val in new_identifiers.items(): - ident = metadata.makeelement(const.DC_IDENT) + ident = metadata.makeelement(base.tag('dc', 'ident')) ident.text = '%s:%s' % (scheme, val) if package_identifier is None: metadata.append(ident) @@ -312,11 +321,12 @@ def identifier_writer(name): if is_package_id: package_identifier = ident val = (ident.text or '').strip() - if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id: + if (val.startswith(name + ':') or + ident.get(base.tag('opf', 'scheme')) == name) and not is_package_id: remove_element(ident, refines) metadata = XPath('./opf:metadata')(root)[0] if ival: - ident = metadata.makeelement(const.DC_IDENT) + ident = metadata.makeelement(base.tag('dc', 'ident')) ident.text = '%s:%s' % (name, ival) if package_identifier is None: metadata.append(ident) @@ -366,7 +376,8 @@ def read_title_sort(root, prefixes, refines): if fa: return fa # Look for OPF 2.0 style title_sort - for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root): + for m in XPath('./opf:metadata/opf:meta[@name="calibre:' + 'title_sort"]')(root): ans = m.get('content') if ans: return ans @@ -376,12 +387,13 @@ def set_title(root, prefixes, refines, title, title_sort=None): main_title = find_main_title(root, refines, remove_blanks=True) if main_title is None: m = XPath('./opf:metadata')(root)[0] - main_title = m.makeelement(const.DC_TITLE) + main_title = m.makeelement(base.tag('dc', 'title')) m.insert(0, main_title) main_title.text = title or None ts = [refdef('file-as', title_sort)] if title_sort else () set_refines(main_title, refines, refdef('title-type', 'main'), *ts) - for m in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root): + for m in XPath('./opf:metadata/opf:meta[@name="calibre:' + 'title_sort"]')(root): remove_element(m, refines) # }}} @@ -405,28 +417,32 @@ def set_languages(root, prefixes, refines, languages): val = (lang.text or '').strip() if val: opf_languages.append(val) - languages = list(filter(lambda x: x and x != 'und', normalize_languages(opf_languages, languages))) + languages = list(filter(lambda x: x and x != 'und', + normalize_languages(opf_languages, languages))) if not languages: # EPUB spec says dc:language is required languages = ['und'] metadata = 
XPath('./opf:metadata')(root)[0] for lang in uniq(languages): - l = metadata.makeelement(const.DC_LANG) - l.text = lang - metadata.append(l) + dc_lang = metadata.makeelement(base.tag('dc', 'lang')) + dc_lang.text = lang + metadata.append(dc_lang) # }}} # Creator/Contributor {{{ -Author = namedtuple('Author', 'name sort') +Author = collections.namedtuple('Author', 'name sort') def is_relators_role(props, q): for role in props.get('role'): if role: scheme_ns, scheme, role = role - if role.lower() == q and (scheme_ns is None or (scheme_ns, scheme) == (reserved_prefixes['marc'], 'relators')): + if (role.lower() == q and + (scheme_ns is None or + (scheme_ns, scheme) == (RES_PREFIXES['marc'], + 'relators'))): return True return False @@ -440,15 +456,16 @@ def read_authors(root, prefixes, refines): if file_as: aus = file_as[0][-1] else: - aus = item.get(const.OPF_FILE_AS) or None + aus = item.get(base.tag('opf', 'file_as')) or None return Author(normalize_whitespace(val), normalize_whitespace(aus)) for item in XPath('./opf:metadata/dc:creator')(root): val = (item.text or '').strip() if val: - props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) + props = properties_for_id_with_scheme(item.get('id'), prefixes, + refines) role = props.get('role') - opf_role = item.get(const.OPF_ROLE) + opf_role = item.get(base.tag('opf', 'role')) if role: if is_relators_role(props, 'aut'): roled_authors.append(author(item, props, val)) @@ -464,23 +481,30 @@ def read_authors(root, prefixes, refines): def set_authors(root, prefixes, refines, authors): ensure_prefix(root, prefixes, 'marc') for item in XPath('./opf:metadata/dc:creator')(root): - props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) - opf_role = item.get(const.OPF_ROLE) - if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')): + props = properties_for_id_with_scheme(item.get('id'), prefixes, + refines) + opf_role = item.get(base.tag('opf', 'role')) + if ((opf_role and opf_role.lower() != 'aut') or + (props.get('role') and not is_relators_role(props, 'aut'))): continue remove_element(item, refines) metadata = XPath('./opf:metadata')(root)[0] for author in authors: if author.name: - a = metadata.makeelement(const.DC_CREATOR) + a = metadata.makeelement(base.tag('dc', 'creator')) aid = ensure_id(a) a.text = author.name metadata.append(a) - m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) + m = metadata.makeelement(base.tag('opf', 'meta'), + attrib={'refines': '#' + aid, + 'property': 'role', + 'scheme': 'marc:relators'}) m.text = 'aut' metadata.append(m) if author.sort: - m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'file-as'}) + m = metadata.makeelement(base.tag('opf', 'meta'), + attrib={'refines': '#' + aid, + 'property': 'file-as'}) m.text = author.sort metadata.append(m) @@ -490,9 +514,10 @@ def read_book_producers(root, prefixes, refines): for item in XPath('./opf:metadata/dc:contributor')(root): val = (item.text or '').strip() if val: - props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) + props = properties_for_id_with_scheme(item.get('id'), prefixes, + refines) role = props.get('role') - opf_role = item.get(const.OPF_ROLE) + opf_role = item.get(base.tag('opf', 'role')) if role: if is_relators_role(props, 'bkp'): ans.append(normalize_whitespace(val)) @@ -503,19 +528,24 @@ def read_book_producers(root, prefixes, refines): def 
set_book_producers(root, prefixes, refines, producers): for item in XPath('./opf:metadata/dc:contributor')(root): - props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) - opf_role = item.get(const.OPF_ROLE) - if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')): + props = properties_for_id_with_scheme(item.get('id'), prefixes, + refines) + opf_role = item.get(base.tag('opf', 'role')) + if ((opf_role and opf_role.lower() != 'bkp') or + (props.get('role') and not is_relators_role(props, 'bkp'))): continue remove_element(item, refines) metadata = XPath('./opf:metadata')(root)[0] for bkp in producers: if bkp: - a = metadata.makeelement(const.DC_CONTRIBUTOR) + a = metadata.makeelement(base.tag('dc', 'contributor')) aid = ensure_id(a) a.text = bkp metadata.append(a) - m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) + m = metadata.makeelement(base.tag('opf', 'meta'), + attrib={'refines': '#' + aid, + 'property': 'role', + 'scheme': 'marc:relators'}) m.text = 'bkp' metadata.append(m) # }}} @@ -531,7 +561,9 @@ def parse_date(raw, is_w3cdtf=False): ans = fix_only_date(ans) else: ans = parse_date_(raw, assume_utc=True) - if ' ' not in raw and 'T' not in raw and (ans.hour, ans.minute, ans.second) == (0, 0, 0): + if (' ' not in raw and + 'T' not in raw and + (ans.hour, ans.minute, ans.second) == (0, 0, 0)): ans = fix_only_date(ans) return ans @@ -552,14 +584,14 @@ def set_pubdate(root, prefixes, refines, val): if not is_date_undefined(val): val = isoformat(val) m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(const.DC_DATE) + d = m.makeelement(base.tag('dc', 'date')) d.text = val m.append(d) def read_timestamp(root, prefixes, refines): pq = '%s:timestamp' % CALIBRE_PREFIX - sq = '%s:w3cdtf' % reserved_prefixes['dcterms'] + sq = '%s:w3cdtf' % RES_PREFIXES['dcterms'] for meta in XPath('./opf:metadata/opf:meta[@property]')(root): val = (meta.text or '').strip() if val: @@ -570,7 +602,8 @@ def read_timestamp(root, prefixes, refines): return parse_date(val, is_w3cdtf=scheme == sq) except Exception: continue - for meta in XPath('./opf:metadata/opf:meta[@name="calibre:timestamp"]')(root): + for meta in XPath('./opf:metadata/opf:meta[@name="calibre:' + 'timestamp"]')(root): val = meta.get('content') if val: try: @@ -584,7 +617,9 @@ def create_timestamp(root, prefixes, m, val): ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'dcterms') val = w3cdtf(val) - d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'}) + d = m.makeelement(base.tag('opf', 'meta'), + attrib={'property': 'calibre:timestamp', + 'scheme': 'dcterms:W3CDTF'}) d.text = val m.append(d) @@ -599,8 +634,8 @@ def set_timestamp(root, prefixes, refines, val): def read_last_modified(root, prefixes, refines): - pq = '%s:modified' % reserved_prefixes['dcterms'] - sq = '%s:w3cdtf' % reserved_prefixes['dcterms'] + pq = '%s:modified' % RES_PREFIXES['dcterms'] + sq = '%s:w3cdtf' % RES_PREFIXES['dcterms'] for meta in XPath('./opf:metadata/opf:meta[@property]')(root): val = (meta.text or '').strip() if val: @@ -614,7 +649,7 @@ def read_last_modified(root, prefixes, refines): def set_last_modified(root, prefixes, refines, val=None): - pq = '%s:modified' % reserved_prefixes['dcterms'] + pq = '%s:modified' % RES_PREFIXES['dcterms'] val = w3cdtf(val or utcnow()) for meta in XPath('./opf:metadata/opf:meta[@property]')(root): prop = 
expand_prefix(meta.get('property'), prefixes) @@ -625,7 +660,9 @@ def set_last_modified(root, prefixes, refines, val=None): else: ensure_prefix(root, prefixes, 'dcterms') m = XPath('./opf:metadata')(root)[0] - meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'}) + meta = m.makeelement(base.tag('opf', 'meta'), + attrib={'property': 'dcterms:modified', + 'scheme': 'dcterms:W3CDTF'}) m.append(meta) meta.text = val # }}} @@ -648,7 +685,7 @@ def set_comments(root, prefixes, refines, val): if val: val = val.strip() if val: - c = m.makeelement(const.DC_DESC) + c = m.makeelement(base.tag('dc', 'desc')) c.text = val m.append(c) # }}} @@ -670,7 +707,7 @@ def set_publisher(root, prefixes, refines, val): if val: val = val.strip() if val: - c = m.makeelement(const.DC_PUBLISHER('publisher')) + c = m.makeelement(base.tag('dc', 'publisher')) c.text = normalize_whitespace(val) m.append(c) # }}} @@ -693,7 +730,7 @@ def set_tags(root, prefixes, refines, val): if val: val = uniq(list(filter(None, val))) for x in val: - c = m.makeelement(const.DC_SUBJ) + c = m.makeelement(base.tag('dc', 'subj')) c.text = normalize_whitespace(x) if c.text: m.append(c) @@ -725,7 +762,7 @@ def read_rating(root, prefixes, refines): def create_rating(root, prefixes, val): ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(const.OPF_META, attrib={'property':'calibre:rating'}) + d = m.makeelement(base.tag('opf', 'meta'), attrib={'property': 'calibre:rating'}) d.text = val m.append(d) @@ -747,7 +784,8 @@ def set_rating(root, prefixes, refines, val): def read_series(root, prefixes, refines): series_index = 1.0 - for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection" and @id]')(root): + for meta in XPath('./opf:metadata/opf:meta[@property="' + 'belongs-to-collection" and @id]')(root): val = (meta.text or '').strip() if val: props = properties_for_id(meta.get('id'), refines) @@ -757,13 +795,15 @@ def read_series(root, prefixes, refines): except Exception: pass return normalize_whitespace(val), series_index - for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]/@content')(root): + for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]' + '/@content')(root): try: series_index = float(si) break - except: + except Exception: pass - for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]/@content')(root): + for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]' + '/@content')(root): s = normalize_whitespace(s) if s: return s, series_index @@ -772,16 +812,20 @@ def read_series(root, prefixes, refines): def create_series(root, refines, series, series_index): m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(const.OPF_META, attrib={'property':'belongs-to-collection'}) + d = m.makeelement(base.tag('opf', 'meta'), + attrib={'property': 'belongs-to-collection'}) d.text = series m.append(d) - set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index)) + set_refines(d, refines, refdef('collection-type', 'series'), + refdef('group-position', series_index)) def set_series(root, prefixes, refines, series, series_index): - for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or @name="calibre:series_index"]')(root): + for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or ' + '@name="calibre:series_index"]')(root): remove_element(meta, refines) - for meta in 
XPath('./opf:metadata/opf:meta[@property="belongs-to-collection"]')(root): + for meta in XPath('./opf:metadata/opf:meta[@property="' + 'belongs-to-collection"]')(root): remove_element(meta, refines) if series: create_series(root, refines, series, '%.2g' % series_index) @@ -806,7 +850,8 @@ def dict_reader(name, load=json.loads, try2=True): except Exception: continue if try2: - for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root): + for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % + name)(root): val = meta.get('content') if val: try: @@ -827,7 +872,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True): def writer(root, prefixes, refines, val): if remove2: - for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root): + for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % + name)(root): remove_element(meta, refines) for meta in XPath('./opf:metadata/opf:meta[@property]')(root): prop = expand_prefix(meta.get('property'), prefixes) @@ -836,7 +882,8 @@ def dict_writer(name, serialize=dump_dict, remove2=True): if val: ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name}) + d = m.makeelement(base.tag('opf', 'meta'), + attrib={'property': 'calibre:%s' % name}) d.text = serialize(val) m.append(d) return writer @@ -855,12 +902,14 @@ def deserialize_user_metadata(val): return ans -read_user_metadata3 = dict_reader('user_metadata', load=deserialize_user_metadata, try2=False) +read_user_metadata3 = dict_reader('user_metadata', + load=deserialize_user_metadata, try2=False) def read_user_metadata2(root, remove_tags=False): ans = {} - for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root): + for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, ' + '"calibre:user_metadata:")]')(root): name = meta.get('name') name = ':'.join(name.split(':')[2:]) if not name or not name.startswith('#'): @@ -881,18 +930,23 @@ def read_user_metadata2(root, remove_tags=False): def read_user_metadata(root, prefixes, refines): - return read_user_metadata3(root, prefixes, refines) or read_user_metadata2(root) + return read_user_metadata3(root, prefixes, + refines) or read_user_metadata2(root) def serialize_user_metadata(val): - return json.dumps(object_to_unicode(val), ensure_ascii=False, default=to_json, indent=2, sort_keys=True) + return json.dumps(object_to_unicode(val), ensure_ascii=False, + default=to_json, indent=2, sort_keys=True) -set_user_metadata3 = dict_writer('user_metadata', serialize=serialize_user_metadata, remove2=False) +set_user_metadata3 = dict_writer('user_metadata', + serialize=serialize_user_metadata, + remove2=False) def set_user_metadata(root, prefixes, refines, val): - for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root): + for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, ' + '"calibre:user_metadata:")]')(root): remove_element(meta, refines) if val: nval = {} @@ -921,26 +975,32 @@ def read_raster_cover(root, prefixes, refines): if href: return href - for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]/@content')(root): - for item in XPath('./opf:manifest/opf:item[@id and @href and @media-type]')(root): + for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]' + '/@content')(root): + for item in XPath('./opf:manifest/opf:item[@id and @href and ' + '@media-type]')(root): if item.get('id') == 
item_id: href = get_href(item) if href: return href -def ensure_is_only_raster_cover(root, prefixes, refines, raster_cover_item_href): +def ensure_is_only_raster_cover(root, prefixes, refines, + raster_cover_item_href): for item in XPath('./opf:metadata/opf:meta[@name="cover"]')(root): remove_element(item, refines) for item in items_with_property(root, 'cover-image', prefixes): - prop = normalize_whitespace(item.get('properties').replace('cover-image', '')) + prop = normalize_whitespace(item.get('properties') + .replace('cover-image', '')) if prop: item.set('properties', prop) else: del item.attrib['properties'] for item in XPath('./opf:manifest/opf:item')(root): if item.get('href') == raster_cover_item_href: - item.set('properties', normalize_whitespace((item.get('properties') or '') + ' cover-image')) + item.set('properties', + normalize_whitespace((item.get('properties') + or '') + ' cover-image')) # }}} @@ -960,7 +1020,7 @@ def set_last_modified_in_opf(root): def read_metadata(root, ver=None, return_extra_data=False): - ans = Metadata('Unknown', ['Unknown']) + ans = base.Metadata('Unknown', ['Unknown']) prefixes, refines = read_prefixes(root), read_refines(root) identifiers = read_identifiers(root, prefixes, refines) ids = {} @@ -1000,12 +1060,16 @@ def read_metadata(root, ver=None, return_extra_data=False): s, si = read_series(root, prefixes, refines) if s: ans.series, ans.series_index = s, si - ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map - ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories - for name, fm in (read_user_metadata(root, prefixes, refines) or {}).items(): + ans.author_link_map = read_author_link_map(root, prefixes, + refines) or ans.author_link_map + ans.user_categories = read_user_categories(root, prefixes, + refines) or ans.user_categories + for name, fm in (read_user_metadata(root, prefixes, + refines) or {}).items(): ans.set_user_metadata(name, fm) if return_extra_data: - ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines) + ans = (ans, ver, read_raster_cover(root, prefixes, refines), + first_spine_item(root, prefixes, refines)) return ans @@ -1014,7 +1078,9 @@ def get_metadata(stream): return read_metadata(root) -def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True): +def apply_metadata(root, mi, cover_prefix='', cover_data=None, + apply_null=False, update_timestamp=False, + force_identifiers=False, add_missing_cover=True): prefixes, refines = read_prefixes(root), read_refines(root) current_mi = read_metadata(root) if apply_null: @@ -1024,7 +1090,8 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, def ok(x): return not mi.is_null(x) if ok('identifiers'): - set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers) + set_identifiers(root, prefixes, refines, mi.identifiers, + force_identifiers=force_identifiers) if ok('title'): set_title(root, prefixes, refines, mi.title, mi.title_sort) if ok('languages'): @@ -1052,16 +1119,21 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, if ok('series'): set_series(root, prefixes, refines, mi.series, mi.series_index or 1) if ok('author_link_map'): - set_author_link_map(root, prefixes, refines, getattr(mi, 'author_link_map', None)) + set_author_link_map(root, prefixes, refines, + getattr(mi, 
'author_link_map', None)) if ok('user_categories'): - set_user_categories(root, prefixes, refines, getattr(mi, 'user_categories', None)) + set_user_categories(root, prefixes, refines, + getattr(mi, 'user_categories', None)) # We ignore apply_null for the next two to match the behavior with opf2.py if mi.application_id: set_application_id(root, prefixes, refines, mi.application_id) if mi.uuid: set_uuid(root, prefixes, refines, mi.uuid) - new_user_metadata, current_user_metadata = mi.get_all_user_metadata(True), current_mi.get_all_user_metadata(True) + + new_user_metadata = mi.get_all_user_metadata(True) + current_user_metadata = current_mi.get_all_user_metadata(True) missing = object() + for key in tuple(new_user_metadata): meta = new_user_metadata.get(key) if meta is None: @@ -1098,7 +1170,9 @@ def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, return raster_cover -def set_metadata(stream, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True): +def set_metadata(stream, mi, cover_prefix='', cover_data=None, + apply_null=False, update_timestamp=False, + force_identifiers=False, add_missing_cover=True): root = parse_opf(stream) return apply_metadata( root, mi, cover_prefix=cover_prefix, cover_data=cover_data, diff --git a/ebook_converter/ebooks/mobi/reader/mobi8.py b/ebook_converter/ebooks/mobi/reader/mobi8.py index 02b2f0e..8665cbe 100644 --- a/ebook_converter/ebooks/mobi/reader/mobi8.py +++ b/ebook_converter/ebooks/mobi/reader/mobi8.py @@ -8,37 +8,28 @@ import uuid from lxml import etree -from ebook_converter import constants as const from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX from ebook_converter.ebooks.mobi.reader.index import read_index from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup -from ebook_converter.ebooks.mobi.reader.containers import Container, find_imgtype +from ebook_converter.ebooks.mobi.reader import containers from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.mobi.utils import read_font_record from ebook_converter.ebooks.oeb.parse_utils import parse_html -from ebook_converter.ebooks.oeb.base import XPath, xml2text +from ebook_converter.ebooks.oeb import base from ebook_converter.polyglot.builtins import as_unicode +ID_RE = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''') +NAME_RE = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''') +AID_RE = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') +Part = collections.namedtuple('Part', 'num type filename start end aid') +Elem = collections.namedtuple('Elem', 'insert_pos toc_text file_number ' + 'sequence_number start_pos length') +FlowInfo = collections.namedtuple('FlowInfo', 'type format dir fname') -__license__ = 'GPL v3' -__copyright__ = '2012, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - -Part = collections.namedtuple('Part', - 'num type filename start end aid') - -Elem = collections.namedtuple('Elem', - 'insert_pos toc_text file_number sequence_number start_pos ' - 'length') - -FlowInfo = collections.namedtuple('FlowInfo', - 'type format dir fname') # locate beginning and ending positions of tag with specific aid attribute - - def locate_beg_end_of_tag(ml, aid): pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid aid_pattern = re.compile(pattern, 
re.IGNORECASE) @@ -64,7 +55,8 @@ def reverse_tag_iter(block): end = plt -def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number): +def get_first_resource_index(first_image_index, num_of_text_records, + first_text_record_number): first_resource_index = first_image_index if first_resource_index in {-1, NULL_INDEX}: first_resource_index = num_of_text_records + first_text_record_number @@ -78,23 +70,27 @@ class Mobi8Reader(object): self.mobi6_reader, self.log = mobi6_reader, log self.header = mobi6_reader.book_header self.encrypted_fonts = [] - self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''') - self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''') - self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') + self.id_re = ID_RE + self.name_re = NAME_RE + self.aid_re = AID_RE def __call__(self): self.mobi6_reader.check_for_drm() self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8') bh = self.mobi6_reader.book_header + _gfri = get_first_resource_index if self.mobi6_reader.kf8_type == 'joint': offset = self.mobi6_reader.kf8_boundary + 2 - self.resource_offsets = [ - (get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2), - (get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)), - ] + self.resource_offsets = [(_gfri(bh.first_image_index, + bh.mobi6_records, 1), offset - 2), + (_gfri(bh.kf8_first_image_index, + bh.records, offset), + len(self.mobi6_reader.sections))] else: offset = 1 - self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))] + self.resource_offsets = [(_gfri(bh.first_image_index, bh.records, + offset), + len(self.mobi6_reader.sections))] self.processed_records = self.mobi6_reader.extract_text(offset=offset) self.raw_ml = self.mobi6_reader.mobi_html @@ -123,37 +119,37 @@ class Mobi8Reader(object): raise ValueError('KF8 does not have a valid FDST record') sec_start, num_sections = struct.unpack_from(b'>LL', header, 4) secs = struct.unpack_from(b'>%dL' % (num_sections*2), - header, sec_start) + header, sec_start) self.flow_table = tuple(zip(secs[::2], secs[1::2])) self.files = [] if self.header.skelidx != NULL_INDEX: table = read_index(self.kf8_sections, self.header.skelidx, - self.header.codec)[0] - File = collections.namedtuple('File', - 'file_number name divtbl_count start_position length') + self.header.codec)[0] + File = collections.namedtuple('File', 'file_number name ' + 'divtbl_count start_position length') for i, text in enumerate(table): tag_map = table[text] self.files.append(File(i, text, tag_map[1][0], - tag_map[6][0], tag_map[6][1])) + tag_map[6][0], tag_map[6][1])) self.elems = [] if self.header.dividx != NULL_INDEX: table, cncx = read_index(self.kf8_sections, self.header.dividx, - self.header.codec) + self.header.codec) for i, text in enumerate(table): tag_map = table[text] toc_text = cncx[tag_map[2][0]] self.elems.append(Elem(int(text), toc_text, tag_map[3][0], - tag_map[4][0], tag_map[6][0], tag_map[6][1])) + tag_map[4][0], tag_map[6][0], + tag_map[6][1])) self.guide = [] if self.header.othidx != NULL_INDEX: table, cncx = read_index(self.kf8_sections, self.header.othidx, - self.header.codec) - Item = collections.namedtuple('Item', - 'type title pos_fid') + self.header.codec) + Item = collections.namedtuple('Item', 'type title pos_fid') for i, ref_type in enumerate(table): tag_map = table[ref_type] @@ -161,7 
+157,7 @@ class Mobi8Reader(object):
                 title = cncx[tag_map[1][0]]
                 fileno = None
                 if 3 in list(tag_map.keys()):
-                        fileno = tag_map[3][0]
+                    fileno = tag_map[3][0]
                 if 6 in list(tag_map.keys()):
                     fileno = tag_map[6]
                 if isinstance(ref_type, bytes):
@@ -205,17 +201,19 @@
             head = skeleton[:insertpos]
             tail = skeleton[insertpos:]
             if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
-                head.rfind(b'<')):
+                    head.rfind(b'<')):
                 # There is an incomplete tag in either the head or tail.
                 # This can happen for some badly formed KF8 files, see for
                 # example, https://bugs.launchpad.net/bugs/1082669
                 if not inspos_warned:
-                    self.log.warn(
-                        'The div table for %s has incorrect insert '
-                        'positions. Calculating manually.'%skelname)
+                    self.log.warn('The div table for %s has incorrect '
+                                  'insert positions. Calculating '
+                                  'manually.' % skelname)
                     inspos_warned = True
                 bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
-                        isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
+                                               isinstance(aidtext, bytes)
+                                               else
+                                               aidtext.encode('utf-8'))
                 if bp != ep:
                     insertpos = ep + 1 + startpos
@@ -228,7 +226,7 @@
                 aidtext = str(uuid.uuid4())
                 filename = aidtext + '.html'
             self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
-                baseptr, aidtext))
+                                      baseptr, aidtext))

         # The primary css style sheet is typically stored next followed by any
         # snippets of code that were previously inlined in the
@@ -238,10 +236,10 @@

         # The problem is that for most browsers and ereaders, you can not
         # use <img> to import any svg image that itself
         # properly uses an <image> tag to import some raster image - it
-        # should work according to the spec but does not for almost all browsers
-        # and ereaders and causes epub validation issues because those raster
-        # images are in manifest but not in xhtml text - since they only
-        # referenced from an svg image
+        # should work according to the spec but does not for almost all
+        # browsers and ereaders and causes epub validation issues because
+        # those raster images are in manifest but not in xhtml text - since
+        # they are only referenced from an svg image
         # So we need to check the remaining flow pieces to see if they are css
         # or svg images.  if svg images, we must check if they have an
@@ -252,7 +250,8 @@

         self.flowinfo.append(FlowInfo(None, None, None, None))
         svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
-        image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
+        image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''',
+                                       re.IGNORECASE)
         for j in range(1, len(self.flows)):
             flowpart = self.flows[j]
             nstr = '%04d' % j
@@ -276,7 +275,8 @@
             # search for CDATA and if exists inline it
             if flowpart.find(b'[CDATA[') >= 0:
                 typ = 'css'
-                flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
+                flowpart = (b'<style type="text/css">\n' + flowpart +
+                            b'\n</style>\n')
                 format = 'inline'
                 dir = None
                 fname = None
@@ -300,7 +300,8 @@

     def get_id_tag_by_pos_fid(self, posfid, offset):
         # first convert kindle:pos:fid and offset info to position in file
-        insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
+        (insertpos, idtext, filenum,
+         seqnm, startpos, length) = self.elems[posfid]
         pos = insertpos + offset
         fi = self.get_file_info(pos)
         # an existing "id=" must exist in original xhtml otherwise it would not
@@ -311,20 +312,20 @@
         # so find the closest "id=" before position the file by actually
         # searching in that file
         idtext = self.get_id_tag(pos)
-        return '%s/%s'%(fi.type, fi.filename), idtext
+        return '%s/%s' % (fi.type, fi.filename), idtext

     def get_id_tag(self, pos):
         # Find the first tag with a named anchor (name or id attribute) before
         # pos
         fi = self.get_file_info(pos)
         if fi.num is None and fi.start is None:
-            raise ValueError('No file contains pos: %d'%pos)
+            raise ValueError('No file contains pos: %d' % pos)
         textblock = self.parts[fi.num]
         npos = pos - fi.start
         pgt = textblock.find(b'>', npos)
         plt = textblock.find(b'<', npos)
-        # if npos inside a tag then search all text before the its end of tag marker
-        # else not in a tag need to search the preceding tag
+        # if npos is inside a tag then search all text before its end-of-tag
+        # marker, else we need to search the preceding tag
         if plt == npos or pgt < plt:
             npos = pgt + 1
             textblock = textblock[0:npos]
@@ -371,7 +372,7 @@
             linktgt = fi.filename
             if idtext:
                 linktgt += '#' + idtext
-            g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwd())
+            g = Guide.Reference('%s/%s' % (fi.type, linktgt), os.getcwd())
             g.title, g.type = 'start', 'text'
             guide.append(g)

@@ -379,7 +380,7 @@

     def create_ncx(self):
         index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
-                self.header.codec)
+                                 self.header.codec)
         remove = []

         # Add href and anchor info to the index entries
@@ -389,15 +390,15 @@
                 pos = entry['pos']
                 fi = self.get_file_info(pos)
                 if fi.filename is None:
-                    raise ValueError('Index entry has invalid pos: %d'%pos)
+                    raise ValueError('Index entry has invalid pos: %d' % pos)
                 idtag = self.get_id_tag(pos)
-                href = '%s/%s'%(fi.type, fi.filename)
+                href = '%s/%s' % (fi.type, fi.filename)
             else:
                 try:
                     href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
                 except ValueError:
-                    self.log.warn('Invalid entry in NCX (title: %s), ignoring'
-                            %entry['text'])
+                    self.log.warn('Invalid entry in NCX (title: %s), '
+                                  'ignoring' % entry['text'])
                     remove.append(entry)
                     continue

@@ -411,7 +412,8 @@
         return build_toc(index_entries)

     def extract_resources(self, sections):
-        from ebook_converter.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
+        from ebook_converter.ebooks.mobi.writer2.resources import \
+            PLACEHOLDER_GIF
         resource_map = []
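# --- editor's note: illustrative sketch, not part of the patch above ------
# Summary of the flow-classification logic a few hunks up: a flow containing
# a CDATA marker is CSS that was once inlined (it gets re-wrapped in a
# <style> element), a flow matching svg_tag_pattern is treated as SVG, and
# anything else passes through.  classify_flow below is a simplified,
# hypothetical restatement, not the reader's actual method:

import re

_SVG_TAG = re.compile(br'(<svg[^>]*>)', re.IGNORECASE)


def classify_flow(flowpart):
    # Inlined CSS was dumped into its own flow wrapped in CDATA.
    if flowpart.find(b'[CDATA[') >= 0:
        wrapped = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
        return 'css', wrapped
    # SVG flows keep their markup; the real code additionally checks for
    # embedded raster <image> tags to decide between file and inline.
    if _SVG_TAG.search(flowpart):
        return 'svg', flowpart
    return 'other', flowpart


assert classify_flow(b'<svg xmlns="http://www.w3.org/2000/svg"/>')[0] == 'svg'
# ---------------------------------------------------------------------------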
container = None for x in ('fonts', 'images'): @@ -424,16 +426,18 @@ class Mobi8Reader(object): typ = data[:4] href = None if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN', - b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}: + b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', + b'CMET', b'PAGE'}: pass # Ignore these records elif typ == b'FONT': font = read_font_record(data) href = "fonts/%05d.%s" % (fname_idx, font['ext']) if font['err']: - self.log.warn('Reading font record %d failed: %s'%( - fname_idx, font['err'])) + self.log.warn('Reading font record %d failed: %s' % + (fname_idx, font['err'])) if font['headers']: - self.log.debug('Font record headers: %s'%font['headers']) + self.log.debug('Font record headers: %s' % + font['headers']) with open(href.replace('/', os.sep), 'wb') as f: f.write(font['font_data'] if font['font_data'] else font['raw_data']) @@ -443,19 +447,23 @@ class Mobi8Reader(object): if data == b'CONTBOUNDARY': container = None continue - container = Container(data) + container = containers.Container(data) elif typ == b'CRES': data, imgtype = container.load_image(data) if data is not None: - href = 'images/%05d.%s'%(container.resource_index, imgtype) + href = 'images/%05d.%s' % (container.resource_index, + imgtype) with open(href.replace('/', os.sep), 'wb') as f: f.write(data) - elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None: + elif (typ == b'\xa0\xa0\xa0\xa0' and + len(data) == 4 and + container is not None): container.resource_index += 1 elif container is None: - if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF): - imgtype = find_imgtype(data) - href = 'images/%05d.%s'%(fname_idx, imgtype) + if not (len(data) == len(PLACEHOLDER_GIF) and + data == PLACEHOLDER_GIF): + imgtype = containers.find_imgtype(data) + href = 'images/%05d.%s' % (fname_idx, imgtype) with open(href.replace('/', os.sep), 'wb') as f: f.write(data) @@ -482,7 +490,7 @@ class Mobi8Reader(object): if os.path.exists(href.replace('/', os.sep)): try: toc = self.read_inline_toc(href, frag) - except: + except Exception: self.log.exception('Failed to read inline ToC') opf = OPFCreator(os.getcwd(), mi) @@ -493,7 +501,8 @@ class Mobi8Reader(object): # If there are no images then the azw3 input plugin dumps all # binary records as .unknown images, remove them - if self.for_tweak and os.path.exists('images') and os.path.isdir('images'): + if (self.for_tweak and os.path.exists('images') and + os.path.isdir('images')): files = os.listdir('images') unknown = [x for x in files if x.endswith('.unknown')] if len(files) == len(unknown): @@ -502,7 +511,7 @@ class Mobi8Reader(object): if self.for_tweak: try: os.remove('debug-raw.html') - except: + except Exception: pass opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude) @@ -528,7 +537,7 @@ class Mobi8Reader(object): with open(href.replace('/', os.sep), 'rb') as f: raw = f.read().decode(self.header.codec) root = parse_html(raw, log=self.log) - body = XPath('//h:body')(root) + body = base.XPath('//h:body')(root) reached = False if body: start = body[0] @@ -536,7 +545,7 @@ class Mobi8Reader(object): start = None reached = True if frag: - elems = XPath('//*[@id="%s"]'%frag)(root) + elems = base.XPath('//*[@id="%s"]' % frag)(root) if elems: start = elems[0] @@ -554,12 +563,13 @@ class Mobi8Reader(object): seen = set() links = [] for elem in root.iterdescendants(etree.Element): - if reached and elem.tag == const.XHTML_A and elem.get('href', + if reached and elem.tag == base.tag('xhtml', + 
'a') and elem.get('href', False): href = elem.get('href') href, frag = urllib.parse.urldefrag(href) href = base_href + '/' + href - text = xml2text(elem).strip() + text = base.xml2text(elem).strip() if (text, href, frag) in seen: continue seen.add((text, href, frag)) @@ -568,7 +578,7 @@ class Mobi8Reader(object): reached = True depths = sorted(set(x[-1] for x in links)) - depth_map = {x:i for i, x in enumerate(depths)} + depth_map = {x: i for i, x in enumerate(depths)} for text, href, frag, depth in links: depth = depth_map[depth] if current_depth is None: diff --git a/ebook_converter/ebooks/oeb/polish/container.py b/ebook_converter/ebooks/oeb/polish/container.py index 2ee2957..f4fb288 100644 --- a/ebook_converter/ebooks/oeb/polish/container.py +++ b/ebook_converter/ebooks/oeb/polish/container.py @@ -1,5 +1,8 @@ +import collections import errno import hashlib +import io +import itertools import logging import os import re @@ -7,13 +10,10 @@ import shutil import sys import time import unicodedata -import uuid -from collections import defaultdict -from io import BytesIO -from itertools import count import urllib.parse +import uuid -from css_parser import getUrls, replaceUrls +import css_parser from lxml import etree from ebook_converter import constants as const @@ -35,10 +35,7 @@ from ebook_converter.ebooks.metadata.utils import parse_opf_version from ebook_converter.ebooks.mobi import MobiError from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader from ebook_converter.ebooks.mobi.tweak import set_cover -from ebook_converter.ebooks.oeb.base import ( - OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks, - rewrite_links, serialize, urlquote, urlunquote -) +from ebook_converter.ebooks.oeb import base as oeb_base from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak @@ -96,7 +93,7 @@ def abspath_to_name(path, root): return relpath(os.path.abspath(path), root).replace(os.sep, '/') -def name_to_href(name, root, base=None, quote=urlquote): +def name_to_href(name, root, base=None, quote=oeb_base.urlquote): fullpath = name_to_abspath(name, root) basepath = root if base is None else os.path.dirname(name_to_abspath(base, root)) path = relpath(fullpath, basepath).replace(os.sep, '/') @@ -111,7 +108,7 @@ def href_to_name(href, root, base=None): return None if purl.scheme or not purl.path: return None - href = urlunquote(purl.path) + href = oeb_base.urlunquote(purl.path) if iswindows and ':' in href: # path manipulations on windows fail for paths with : in them, so we # assume all such paths are invalid/absolute paths. @@ -324,7 +321,7 @@ class Container(ContainerBase): # {{{ item_id = 'id' + '%d'%c manifest = self.opf_xpath('//opf:manifest')[0] href = self.name_to_href(name, self.opf_name) - item = manifest.makeelement(const.OPF_ITEM, + item = manifest.makeelement(oeb_base.tag('opf', 'item'), id=item_id, href=href) item.set('media-type', self.mime_map[name]) self.insert_into_xml(manifest, item) @@ -340,7 +337,7 @@ class Container(ContainerBase): # {{{ def make_name_unique(self, name): ''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. 
        '''
-        counter = count()
+        counter = itertools.count()
         while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
             c = next(counter) + 1
             base, ext = name.rpartition('.')[::2]
@@ -377,10 +374,10 @@
         if self.ok_to_be_unmanifested(name):
             return name
         item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
-        if mt in OEB_DOCS:
+        if mt in oeb_base.OEB_DOCS:
             manifest = self.opf_xpath('//opf:manifest')[0]
             spine = self.opf_xpath('//opf:spine')[0]
-            si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
+            si = manifest.makeelement(oeb_base.tag('opf', 'itemref'), idref=item_id)
             self.insert_into_xml(spine, si, index=spine_index)
         return name

@@ -442,12 +439,12 @@
             replace_func.file_type = 'opf'
             for elem in self.opf_xpath('//*[@href]'):
                 elem.set('href', replace_func(elem.get('href')))
-        elif media_type.lower() in OEB_DOCS:
+        elif media_type.lower() in oeb_base.OEB_DOCS:
             replace_func.file_type = 'text'
-            rewrite_links(self.parsed(name), replace_func)
-        elif media_type.lower() in OEB_STYLES:
+            oeb_base.rewrite_links(self.parsed(name), replace_func)
+        elif media_type.lower() in oeb_base.OEB_STYLES:
             replace_func.file_type = 'style'
-            replaceUrls(self.parsed(name), replace_func)
+            css_parser.replaceUrls(self.parsed(name), replace_func)
         elif media_type.lower() == guess_type('toc.ncx'):
             replace_func.file_type = 'ncx'
             for elem in self.parsed(name).xpath('//*[@src]'):
@@ -467,21 +464,21 @@
         if name == self.opf_name:
             for elem in self.opf_xpath('//*[@href]'):
                 yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
-        elif media_type.lower() in OEB_DOCS:
-            for el, attr, link, pos in iterlinks(self.parsed(name)):
+        elif media_type.lower() in oeb_base.OEB_DOCS:
+            for el, attr, link, pos in oeb_base.iterlinks(self.parsed(name)):
                 yield (link, el.sourceline, pos) if get_line_numbers else link
-        elif media_type.lower() in OEB_STYLES:
+        elif media_type.lower() in oeb_base.OEB_STYLES:
             if get_line_numbers:
                 with self.open(name, 'rb') as f:
                     raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
                     position = PositionFinder(raw)
                     is_in_comment = CommentFinder(raw)
-                    for link, offset in itercsslinks(raw):
+                    for link, offset in oeb_base.itercsslinks(raw):
                         if not is_in_comment(offset):
                             lnum, col = position(offset)
                             yield link, lnum, col
             else:
-                for link in getUrls(self.parsed(name)):
+                for link in css_parser.getUrls(self.parsed(name)):
                     yield link
         elif media_type.lower() == guess_type('toc.ncx'):
             for elem in self.parsed(name).xpath('//*[@src]'):
@@ -533,7 +530,7 @@

     def opf_xpath(self, expr):
         ' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
         return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)

     def has_name(self, name):
         ''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive.
''' @@ -580,11 +577,11 @@ class Container(ContainerBase): # {{{ def parse(self, path, mime): with open(path, 'rb') as src: data = src.read() - if mime in OEB_DOCS: + if mime in oeb_base.OEB_DOCS: data = self.parse_xhtml(data, self.relpath(path)) elif mime[-4:] in {'+xml', '/xml'}: data = self.parse_xml(data) - elif mime in OEB_STYLES: + elif mime in oeb_base.OEB_STYLES: data = self.parse_css(data, self.relpath(path)) return data @@ -597,7 +594,7 @@ class Container(ContainerBase): # {{{ ''' ans = self.open(name).read() mime = self.mime_map.get(name, guess_type(name)) - if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}): + if decode and (mime in oeb_base.OEB_STYLES or mime in oeb_base.OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}): ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc) return ans @@ -637,7 +634,7 @@ class Container(ContainerBase): # {{{ so use it sparingly. ''' from ebook_converter.ebooks.metadata.opf2 import OPF as O mi = self.serialize_item(self.opf_name) - return O(BytesIO(mi), basedir=self.opf_dir, unquote_urls=False, + return O(io.BytesIO(mi), basedir=self.opf_dir, unquote_urls=False, populate_spine=False).to_book_metadata() @property @@ -662,7 +659,7 @@ class Container(ContainerBase): # {{{ @property def manifest_type_map(self): ' Mapping of manifest media-type to list of canonical names of that media-type ' - ans = defaultdict(list) + ans = collections.defaultdict(list) for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'): ans[item.get('media-type').lower()].append(self.href_to_name( item.get('href'), self.opf_name)) @@ -813,7 +810,7 @@ class Container(ContainerBase): # {{{ spine = self.opf_xpath('//opf:spine')[0] spine.text = tail for name, linear in spine_items: - i = spine.makeelement(const.OPF_ITEMREF, + i = spine.makeelement(oeb_base.tag('opf', 'itemref'), nsmap={'opf': const.OPF2_NS}) i.tail = tail i.set('idref', imap[name]) @@ -922,7 +919,7 @@ class Container(ContainerBase): # {{{ return ans[0] self.dirty(self.opf_name) package = self.opf_xpath('//opf:package')[0] - item = package.makeelement(OPF(name)) + item = package.makeelement(oeb_base.tag('opf', name)) item.tail = '\n' package.append(item) return item @@ -945,7 +942,7 @@ class Container(ContainerBase): # {{{ item_id = id_prefix + '%d'%c manifest = self.opf_xpath('//opf:manifest')[0] - item = manifest.makeelement(const.OPF_ITEM, + item = manifest.makeelement(oeb_base.tag('opf', 'item'), id=item_id, href=href) item.set('media-type', media_type) self.insert_into_xml(manifest, item) @@ -992,7 +989,7 @@ class Container(ContainerBase): # {{{ data = root = self.parsed(name) if name == self.opf_name: self.format_opf() - data = serialize(data, self.mime_map[name], pretty_print=name in + data = oeb_base.serialize(data, self.mime_map[name], pretty_print=name in self.pretty_print) if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS: # Needed as I can't get lxml to output opf:role and @@ -1181,7 +1178,7 @@ class EpubContainer(Container): ) if not opf_files: raise InvalidEpub('META-INF/container.xml contains no link to OPF file') - opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/'))) + opf_path = os.path.join(self.root, *(oeb_base.urlunquote(opf_files[0].get('full-path')).split('/'))) if not exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to' ' by META-INF/container.xml') @@ -1412,7 +1409,7 @@ def do_explode(path, dest): def 
opf_to_azw3(opf, outpath, container):
     from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook

-    class Item(Manifest.Item):
+    class Item(oeb_base.Manifest.Item):

         def _parse_css(self, data):
             # The default CSS parser used by oeb.base inserts the h namespace
diff --git a/ebook_converter/ebooks/oeb/polish/css.py b/ebook_converter/ebooks/oeb/polish/css.py
index 43b209c..3274f79 100644
--- a/ebook_converter/ebooks/oeb/polish/css.py
+++ b/ebook_converter/ebooks/oeb/polish/css.py
@@ -1,22 +1,17 @@
-from collections import defaultdict
-from functools import partial
+import collections
+import functools
 from css_parser.css import CSSRule, CSSStyleDeclaration
-from ebook_converter import constants as const
 from ebook_converter import force_unicode
 from ebook_converter.css_selectors import parse, SelectorSyntaxError
-from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
-from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
-from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
+from ebook_converter.ebooks.oeb import base
+from ebook_converter.ebooks.oeb import normalize_css
+from ebook_converter.ebooks.oeb.polish import pretty
 from ebook_converter.utils.icu import numeric_sort_key
 from ebook_converter.css_selectors import Select, SelectorError
-
-__license__ = 'GPL v3'
-__copyright__ = '2014, Kovid Goyal '
-
-
 def filter_used_rules(rules, log, select):
     for rule in rules:
         used = False
@@ -34,7 +28,8 @@ def filter_used_rules(rules, log, select):
     yield rule


-def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
+def get_imported_sheets(name, container, sheets, recursion_level=10,
+                        sheet=None):
     ans = set()
     sheet = sheet or sheets[name]
     for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
@@ -44,7 +39,8 @@ def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None)
             ans.add(iname)
     if recursion_level > 0:
         for imported_sheet in tuple(ans):
-            ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
+            ans |= get_imported_sheets(imported_sheet, container, sheets,
+                                       recursion_level=recursion_level-1)
     ans.discard(name)
     return ans
@@ -56,7 +52,7 @@ def merge_declarations(first, second):

 def merge_identical_selectors(sheet):
     ' Merge rules that have identical selectors '
-    selector_map = defaultdict(list)
+    selector_map = collections.defaultdict(list)
     for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
         selector_map[rule.selectorText].append(rule)
     remove = []
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
     return len(remove)


-def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
-    '''
-    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
+def remove_unused_css(container, report=None, remove_unused_classes=False,
+                      merge_rules=False):
+    """
+    Remove all unused CSS rules from the book. An unused CSS rule is one that
+    does not match any actual content.

-    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
-    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
+    :param report: An optional callable that takes a single argument. It is
+                   called with information about the operations being
+                   performed.
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
     return len(remove)
 
 
-def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
-    '''
-    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
+def remove_unused_css(container, report=None, remove_unused_classes=False,
+                      merge_rules=False):
+    """
+    Remove all unused CSS rules from the book. An unused CSS rule is one that
+    does not match any actual content.
 
-    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
-    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
+    :param report: An optional callable that takes a single argument. It is
+                   called with information about the operations being
+                   performed.
+    :param remove_unused_classes: If True, class attributes in the HTML that
+                                  do not match any CSS rules are also removed.
     :param merge_rules: If True, rules with identical selectors are merged.
-    '''
-    report = report or (lambda x:x)
+    """
+    report = report or (lambda x: x)
 
     def safe_parse(name):
         try:
             return container.parsed(name)
         except TypeError:
             pass
-    sheets = {name:safe_parse(name) for name, mt in container.mime_map.items() if mt in OEB_STYLES}
-    sheets = {k:v for k, v in sheets.items() if v is not None}
+
+    sheets = {name: safe_parse(name) for name, mt in container.mime_map.items()
+              if mt in base.OEB_STYLES and safe_parse(name) is not None}
     num_merged = 0
     if merge_rules:
         for name, sheet in sheets.items():
@@ -106,7 +108,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
 
     num_of_removed_rules = num_of_removed_classes = 0
     for name, mt in container.mime_map.items():
-        if mt not in OEB_DOCS:
+        if mt not in base.OEB_DOCS:
             continue
         root = container.parsed(name)
         select = Select(root, ignore_inappropriate_pseudo_classes=True)
@@ -120,31 +122,39 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
                 num_merged += num
                 container.dirty(name)
             if remove_unused_classes:
-                used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)}
-            imports = get_imported_sheets(name, container, sheets, sheet=sheet)
+                used_classes |= {x.lower() for x in
+                                 classes_in_rule_list(sheet.cssRules)}
+            imports = get_imported_sheets(name, container, sheets,
+                                          sheet=sheet)
             for imported_sheet in imports:
-                style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
+                style_rules[imported_sheet] = tuple(filter_used_rules(
+                    style_rules[imported_sheet], container.log, select))
                 if remove_unused_classes:
                     used_classes |= class_map[imported_sheet]
             rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
-            unused_rules = tuple(filter_used_rules(rules, container.log, select))
+            unused_rules = tuple(filter_used_rules(rules, container.log,
+                                                   select))
            if unused_rules:
                num_of_removed_rules += len(unused_rules)
                [sheet.cssRules.remove(r) for r in unused_rules]
                style.text = force_unicode(sheet.cssText, 'utf-8')
-                pretty_script_or_style(container, style)
+                pretty.pretty_script_or_style(container, style)
                container.dirty(name)
 
         for link in root.xpath('//*[local-name()="link" and @href]'):
             sname = container.href_to_name(link.get('href'), name)
             if sname not in sheets:
                 continue
-            style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
+            style_rules[sname] = tuple(filter_used_rules(style_rules[sname],
+                                                         container.log,
+                                                         select))
             if remove_unused_classes:
                 used_classes |= class_map[sname]
             for iname in import_map[sname]:
-                style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
+                style_rules[iname] = tuple(
+                    filter_used_rules(style_rules[iname], container.log,
+                                      select))
                 if remove_unused_classes:
                     used_classes |= class_map[iname]
 
@@ -159,7 +169,8 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
                         elem.set('class', ' '.join(classes))
                     else:
                         del elem.attrib['class']
-                    num_of_removed_classes += len(original_classes) - len(classes)
+                    num_of_removed_classes += (len(original_classes) -
+                                               len(classes))
                 container.dirty(name)
 
     for name, sheet in sheets.items():
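A usage sketch for `remove_unused_css()` as refactored above. `get_container` and `commit()` are assumed to come from the polish container machinery and may be spelled differently in this tree:

from ebook_converter.ebooks.oeb.polish.container import get_container
from ebook_converter.ebooks.oeb.polish.css import remove_unused_css

container = get_container('book.epub')  # path is illustrative
remove_unused_css(container, report=print, remove_unused_classes=True,
                  merge_rules=True)
container.commit()  # write the modified files back to the book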
@@ -195,7 +206,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
             changed = True
     all_props = set(style.keys())
     for prop in style.getProperties():
-        n = normalizers.get(prop.name, None)
+        n = base.normalize_css.normalizers.get(prop.name, None)
         if n is not None:
             normalized = n(prop.name, prop.propertyValue)
             removed = properties.intersection(set(normalized))
@@ -225,12 +236,13 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
     root = container.parsed(name)
     changed = False
     for style in root.xpath('//*[local-name()="style"]'):
-        if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
+        if style.text and (style.get('type') or
+                           'text/css').lower() == 'text/css':
             sheet = container.parse_css(style.text)
             if transform_sheet(sheet):
                 changed = True
                 style.text = force_unicode(sheet.cssText, 'utf-8')
-                pretty_script_or_style(container, style)
+                pretty.pretty_script_or_style(container, style)
     for elem in root.xpath('//*[@style]'):
         text = elem.get('style', None)
         if text:
@@ -240,13 +252,16 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
             if style.length == 0:
                 del elem.attrib['style']
             else:
-                elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
+                elem.set('style',
+                         force_unicode(style.getCssText(separator=' '),
+                                       'utf-8'))
     return changed
 
 
-def transform_css(container, transform_sheet=None, transform_style=None, names=()):
+def transform_css(container, transform_sheet=None, transform_style=None,
+                  names=()):
     if not names:
-        types = OEB_STYLES | OEB_DOCS
+        types = base.OEB_STYLES | base.OEB_DOCS
         names = []
         for name, mt in container.mime_map.items():
             if mt in types:
@@ -256,13 +271,14 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
 
     for name in names:
         mt = container.mime_map[name]
-        if mt in OEB_STYLES:
+        if mt in base.OEB_STYLES:
             sheet = container.parsed(name)
             if transform_sheet(sheet):
                 container.dirty(name)
                 doc_changed = True
-        elif mt in OEB_DOCS:
-            if transform_inline_styles(container, name, transform_sheet, transform_style):
+        elif mt in base.OEB_DOCS:
+            if transform_inline_styles(container, name, transform_sheet,
+                                       transform_style):
                 container.dirty(name)
                 doc_changed = True
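`transform_css()` drives one callback over full stylesheets and another over inline style="..." attributes. A sketch of such a callback pair that strips every `color` property; signatures are inferred from the code above and are not guaranteed:

from css_parser.css import CSSRule

def strip_color_sheet(sheet):
    # Return True if the sheet was modified.
    changed = False
    for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        # removeProperty() returns the removed value, '' if absent.
        if rule.style.removeProperty('color'):
            changed = True
    return changed

def strip_color_style(style):
    # style is the parsed CSSStyleDeclaration of one inline style attribute.
    return bool(style.removeProperty('color'))

# transform_css(container, transform_sheet=strip_color_sheet,
#               transform_style=strip_color_style)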
+ """ + properties = base.normalize_css.normalize_filter_css(properties) + return transform_css(container, + transform_sheet=functools.partial( + filter_sheet, properties=properties), + transform_style=functools.partial( + filter_declaration, properties=properties), + names=names) def _classes_in_selector(selector, classes): @@ -331,21 +353,29 @@ def remove_property_value(prop, predicate): if len(removed_vals) == len(prop.propertyValue): prop.parent.removeProperty(prop.name) else: - x = css_text(prop.propertyValue) + x = base.css_text(prop.propertyValue) for v in removed_vals: - x = x.replace(css_text(v), '').strip() + x = x.replace(base.css_text(v), '').strip() prop.propertyValue.cssText = x return bool(removed_vals) -RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))} +RULE_PRIORITIES = {t: i for i, t in enumerate((CSSRule.COMMENT, + CSSRule.CHARSET_RULE, + CSSRule.IMPORT_RULE, + CSSRule.NAMESPACE_RULE))} def sort_sheet(container, sheet_or_text): - ''' Sort the rules in a stylesheet. Note that in the general case this can - change the effective styles, but for most common sheets, it should be safe. - ''' - sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, str) else sheet_or_text + """ + Sort the rules in a stylesheet. Note that in the general case this can + change the effective styles, but for most common sheets, it should be + safe. + """ + if isinstance(sheet_or_text, str): + sheet = container.parse_css(sheet_or_text) + else: + sheet = sheet_or_text def text_sort_key(x): return numeric_sort_key(str(x or '')) @@ -364,7 +394,8 @@ def sort_sheet(container, sheet_or_text): rule.selectorText = ', '.join(s.selectorText for s in selectors) elif rule.type == CSSRule.FONT_FACE_RULE: try: - tertiary = text_sort_key(rule.style.getPropertyValue('font-family')) + tertiary = text_sort_key(rule.style.getPropertyValue('font-' + 'family')) except Exception: pass @@ -379,11 +410,14 @@ def add_stylesheet_links(container, name, text): if not head: return head = head[0] - sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES)) + sheets = tuple(container.manifest_items_of_type(lambda mt: + mt in base.OEB_STYLES)) if not sheets: return for sname in sheets: - link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name)) + link = head.makeelement(base.tag('xhtml', 'link'), type='text/css', + rel='stylesheet', + href=container.name_to_href(sname, name)) head.append(link) - pretty_xml_tree(head) - return serialize(root, 'text/html') + pretty.pretty_xml_tree(head) + return pretty.serialize(root, 'text/html') diff --git a/ebook_converter/ebooks/oeb/polish/opf.py b/ebook_converter/ebooks/oeb/polish/opf.py index 77cd848..4c8e372 100644 --- a/ebook_converter/ebooks/oeb/polish/opf.py +++ b/ebook_converter/ebooks/oeb/polish/opf.py @@ -1,6 +1,7 @@ from lxml import etree from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base from ebook_converter.utils.localization import canonicalize_lang @@ -14,7 +15,7 @@ def get_book_language(container): def set_guide_item(container, item_type, title, name, frag=None): - ref_tag = const.OPF_REFERENCE + ref_tag = base.tag('opf', 'reference') href = None if name: href = container.name_to_href(name, container.opf_name) @@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None): guides = container.opf_xpath('//opf:guide') if not guides and href: - g = 
diff --git a/ebook_converter/ebooks/oeb/polish/opf.py b/ebook_converter/ebooks/oeb/polish/opf.py
index 77cd848..4c8e372 100644
--- a/ebook_converter/ebooks/oeb/polish/opf.py
+++ b/ebook_converter/ebooks/oeb/polish/opf.py
@@ -1,6 +1,7 @@
 from lxml import etree
 
 from ebook_converter import constants as const
+from ebook_converter.ebooks.oeb import base
 from ebook_converter.utils.localization import canonicalize_lang
 
 
@@ -14,7 +15,7 @@ def get_book_language(container):
 
 def set_guide_item(container, item_type, title, name, frag=None):
-    ref_tag = const.OPF_REFERENCE
+    ref_tag = base.tag('opf', 'reference')
     href = None
     if name:
         href = container.name_to_href(name, container.opf_name)
@@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None):
 
     guides = container.opf_xpath('//opf:guide')
     if not guides and href:
-        g = container.opf.makeelement(const.OPF_GUIDE,
+        g = container.opf.makeelement(base.tag('opf', 'guide'),
                                       nsmap={'opf': const.OPF2_NS})
         container.insert_into_xml(container.opf, g)
         guides = [g]
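`set_guide_item()` above rewrites an entry in the OPF 2 `<guide>` element. A hedged usage sketch; the file name is illustrative and the container comes from the earlier sketches:

from ebook_converter.ebooks.oeb.polish.opf import set_guide_item

# Point the guide's cover reference at a book-internal file.
set_guide_item(container, 'cover', 'Cover', 'images/cover.jpg')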
diff --git a/ebook_converter/ebooks/oeb/polish/pretty.py b/ebook_converter/ebooks/oeb/polish/pretty.py
index 8620f9e..85774a8 100644
--- a/ebook_converter/ebooks/oeb/polish/pretty.py
+++ b/ebook_converter/ebooks/oeb/polish/pretty.py
@@ -1,18 +1,13 @@
 import textwrap
 
-# from lxml.etree import Element
-
 from ebook_converter import constants as const
 from ebook_converter import force_unicode
 from ebook_converter.ebooks.oeb import parse_utils
-from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
+from ebook_converter.ebooks.oeb import base
 from ebook_converter.ebooks.oeb.polish.utils import guess_type
 from ebook_converter.utils.icu import sort_key
 
-__license__ = 'GPL v3'
-__copyright__ = '2013, Kovid Goyal '
-
 
 def isspace(x):
     return not x.strip('\u0009\u000a\u000c\u000d\u0020')
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent='  '):
     for i, child in enumerate(elem):
         pretty_xml_tree(child, level=level+1, indent=indent)
         if not child.tail or isspace(child.tail):
-            l = level + 1
+            new_level = level + 1
             if i == len(elem) - 1:
-                l -= 1
-            child.tail = '\n' + (indent * l)
+                new_level -= 1
+            child.tail = '\n' + (indent * new_level)
 
 
 def pretty_opf(root):
     # Put all dc: tags first starting with title and author. Preserve order
     # for the rest.
     def dckey(x):
-        return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
-    for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
+        return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
+
+    for metadata in root.xpath('//opf:metadata',
+                               namespaces=const.OPF_NAMESPACES):
         dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
         dc_tags.sort(key=dckey)
         for x in reversed(dc_tags):
             metadata.insert(0, x)
 
     # Group items in the manifest
-    spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
-    spine_ids = {x:i for i, x in enumerate(spine_ids)}
+    spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
+                           namespaces=const.OPF_NAMESPACES)
+    spine_ids = {x: i for i, x in enumerate(spine_ids)}
 
     def manifest_key(x):
         mt = x.get('media-type', '')
         href = x.get('href', '')
         ext = href.rpartition('.')[-1].lower()
         cat = 1000
-        if mt in OEB_DOCS:
+        if mt in base.OEB_DOCS:
             cat = 0
         elif mt == guess_type('a.ncx'):
             cat = 1
-        elif mt in OEB_STYLES:
+        elif mt in base.OEB_STYLES:
             cat = 2
         elif mt.startswith('image/'):
             cat = 3
@@ -75,20 +73,23 @@ def pretty_opf(root):
             i = sort_key(href)
         return (cat, i)
 
-    for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
+    for manifest in root.xpath('//opf:manifest',
+                               namespaces=const.OPF_NAMESPACES):
         try:
             children = sorted(manifest, key=manifest_key)
         except AttributeError:
-            continue  # There are comments so dont sort since that would mess up the comments
+            # There are comments so don't sort since that would mess up the
+            # comments.
+            continue
+
         for x in reversed(children):
             manifest.insert(0, x)
 
-
 def isblock(x):
     if callable(x.tag) or not x.tag:
         return True
-    if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
+    if x.tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}:
         return True
     return False
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent='  '):
     that contain only other block tags '''
     if not parent.text or isspace(parent.text):
         parent.text = ''
-    nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
+    if (hasattr(parent.tag, 'strip') and
+            parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
+        nn = '\n'
+    else:
+        nn = '\n\n'
     parent.text = parent.text + nn + (indent * level)
     for i, child in enumerate(parent):
         if isblock(child) and has_only_blocks(child):
             pretty_block(child, level=level+1, indent=indent)
-        elif child.tag == const.SVG_SVG:
+        elif child.tag == base.tag('svg', 'svg'):
             pretty_xml_tree(child, level=level, indent=indent)
-        l = level
+        new_level = level
         if i == len(parent) - 1:
-            l -= 1
+            new_level -= 1
         if not child.tail or isspace(child.tail):
             child.tail = ''
-        child.tail = child.tail + nn + (indent * l)
+        child.tail = child.tail + nn + (indent * new_level)
 
 
 def pretty_script_or_style(container, child):
     if child.text:
         indent = indent_for_tag(child)
         if child.tag.endswith('style'):
-            child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
+            child.text = force_unicode(pretty_css(container, '', child.text),
+                                       'utf-8')
         child.text = textwrap.dedent(child.text)
-        child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
+        child.text = '\n' + '\n'.join([(indent + x) if x else ''
+                                       for x in child.text.splitlines()])
         set_indent(child, 'text', indent)
 
 
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
     # Special case the handling of a body that contains a single block tag
     # with all content. In this case we prettify the containing block tag
     # even if it has non block children.
-    if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
-            body[0]) and parse_utils.barename(body[0].tag) not in (
-            'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
+    if (len(body) == 1 and
+            not callable(body[0].tag) and
+            isblock(body[0]) and
+            not has_only_blocks(body[0]) and
+            parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
+                                                      'h2', 'h3', 'h4',
+                                                      'h5', 'h6') and
+            len(body[0]) > 0):
         pretty_block(body[0], level=2)
 
     if container is not None:
        # Handle