mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-29 01:25:46 +01:00
1188 lines
40 KiB
Python
1188 lines
40 KiB
Python
import collections
|
|
import functools
|
|
import json
|
|
import re
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter import prints
|
|
from ebook_converter.ebooks.metadata import authors_to_string
|
|
from ebook_converter.ebooks.metadata import check_isbn
|
|
from ebook_converter.ebooks.metadata import string_to_authors
|
|
from ebook_converter.ebooks.oeb import base as oeb_base
|
|
from ebook_converter.ebooks.metadata.book import base
|
|
from ebook_converter.ebooks.metadata.book.json_codec import (
|
|
decode_is_multiple, encode_is_multiple, object_to_unicode
|
|
)
|
|
from ebook_converter.ebooks.metadata.utils import (
|
|
create_manifest_item, ensure_unique, normalize_languages, parse_opf,
|
|
pretty_print_opf
|
|
)
|
|
from ebook_converter.utils.config import from_json, to_json
|
|
from ebook_converter.utils.date import (fix_only_date, is_date_undefined,
|
|
isoformat, parse_date as parse_date_,
|
|
utcnow, w3cdtf)
|
|
from ebook_converter.utils.iso8601 import parse_iso8601
|
|
from ebook_converter.utils.localization import canonicalize_lang
|
|
|
|
|
|
RES_PREFIXES = {'dcterms': 'http://purl.org/dc/terms/',
|
|
'epubsc': 'http://idpf.org/epub/vocab/sc/#',
|
|
'marc': 'http://id.loc.gov/vocabulary/',
|
|
'media': 'http://www.idpf.org/epub/vocab/overlays/#',
|
|
'onix': 'http://www.editeur.org/ONIX/book/codelists/'
|
|
'current.html#',
|
|
'rendition': 'http://www.idpf.org/vocab/rendition/#',
|
|
'schema': 'http://schema.org/',
|
|
'xsd': 'http://www.w3.org/2001/XMLSchema#'}
|
|
|
|
CALIBRE_PREFIX = 'https://calibre-ebook.com'
|
|
KNOWN_PREFIXES = RES_PREFIXES.copy()
|
|
KNOWN_PREFIXES['calibre'] = CALIBRE_PREFIX
|
|
|
|
# Utils {{{
|
|
_XPATH_CACHE = {}
|
|
_RE_CACHE = {}
|
|
|
|
|
|
def uniq(vals):
    """Return a list of the items in *vals* with duplicates dropped,
    keeping the first occurrence of each item in its original order.
    Falsy input yields an empty list."""
    seen = set()
    out = []
    for item in (vals or ()):
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out
|
|
|
|
|
|
def dump_dict(cats):
    """Serialize *cats* (or an empty dict when falsy) as JSON with
    non-ASCII characters preserved; keys that cannot be serialized are
    silently skipped."""
    data = object_to_unicode(cats or {})
    return json.dumps(data, skipkeys=True, ensure_ascii=False)
|
|
|
|
|
|
def XPath(x):
    """Compile XPath expression *x* with the OPF 2 namespace map,
    memoizing compiled expressions in the module-level _XPATH_CACHE."""
    try:
        return _XPATH_CACHE[x]
    except KeyError:
        # Compile once and cache; callers reuse the compiled object
        _XPATH_CACHE[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
        return ans
|
|
|
|
|
|
def regex(r, flags=0):
    """Compile the regular expression *r* with *flags*, memoizing
    compiled patterns in the module-level _RE_CACHE keyed on
    (pattern, flags)."""
    try:
        return _RE_CACHE[(r, flags)]
    except KeyError:
        _RE_CACHE[(r, flags)] = ans = re.compile(r, flags)
        return ans
|
|
|
|
|
|
def remove_refines(e, refines):
    """Detach every <meta refines="#id"> element targeting *e* from its
    parent and drop the corresponding entry from the *refines* map."""
    eid = e.get('id')
    for meta in refines[eid]:
        meta.getparent().remove(meta)
    refines.pop(eid, None)
|
|
|
|
|
|
def remove_element(e, refines):
    """Remove element *e* from its parent, along with all the
    <meta refines> elements that target it."""
    remove_refines(e, refines)
    e.getparent().remove(e)
|
|
|
|
|
|
def properties_for_id(item_id, refines):
    """Collect the refine properties targeting *item_id* as a dict of
    property name -> stripped, non-empty text value. Later refines with
    the same property overwrite earlier ones. A falsy *item_id* yields
    an empty dict."""
    props = {}
    if not item_id:
        return props
    for meta in refines[item_id]:
        prop = meta.get('property')
        if not prop:
            continue
        text = (meta.text or '').strip()
        if text:
            props[prop] = text
    return props
|
|
|
|
|
|
def properties_for_id_with_scheme(item_id, prefixes, refines):
    """Like properties_for_id(), but keeps every value and resolves the
    optional ``scheme`` attribute against *prefixes*. Returns a
    defaultdict mapping property -> list of (scheme_ns, scheme, value)
    triples, where scheme_ns is the expanded vocabulary URI or None."""
    ans = collections.defaultdict(list)
    if not item_id:
        return ans
    for meta in refines[item_id]:
        prop = meta.get('property')
        if not prop:
            continue
        val = (meta.text or '').strip()
        if not val:
            continue
        scheme = meta.get('scheme') or None
        scheme_ns = None
        if scheme is not None:
            prefix, _, rest = scheme.partition(':')
            if prefix and rest:
                ns = prefixes.get(prefix)
                if ns:
                    scheme_ns, scheme = ns, rest
        ans[prop].append((scheme_ns, scheme, val))
    return ans
|
|
|
|
|
|
def getroot(elem):
    """Walk up the parent chain of *elem* and return the root of the
    tree (the first ancestor with no parent)."""
    parent = elem.getparent()
    while parent is not None:
        elem, parent = parent, parent.getparent()
    return elem
|
|
|
|
|
|
def ensure_id(elem):
    """Return the id of *elem*, generating and assigning a new id that
    is unique within the whole document when the element has none."""
    root = getroot(elem)
    eid = elem.get('id')
    if not eid:
        # Pick an id not used anywhere else in the document
        eid = ensure_unique('id', frozenset(XPath('//*/@id')(root)))
        elem.set('id', eid)
    return eid
|
|
|
|
|
|
def normalize_whitespace(text):
    """Collapse runs of whitespace in *text* into single spaces and
    strip leading/trailing whitespace. Falsy input (None, '') is
    returned unchanged."""
    if not text:
        return text
    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so join gives the normalized form
    return ' '.join(text.split())
|
|
|
|
|
|
def simple_text(f):
    """Decorator that normalizes whitespace in the string returned by
    the wrapped function (None passes through unchanged)."""
    @functools.wraps(f)
    def wrapper(*args, **kw):
        return normalize_whitespace(f(*args, **kw))
    return wrapper
|
|
|
|
|
|
def items_with_property(root, q, prefixes=None):
    """Yield manifest items whose ``properties`` attribute contains the
    property *q*, comparing after prefix expansion and lowercasing."""
    if prefixes is None:
        prefixes = read_prefixes(root)
    # The query is expanded against the well-known prefixes
    q = expand_prefix(q, KNOWN_PREFIXES).lower()
    for item in XPath("./opf:manifest/opf:item[@properties]")(root):
        for prop in (item.get('properties') or '').lower().split():
            prop = expand_prefix(prop, prefixes)
            if prop == q:
                yield item
                break
|
|
|
|
# }}}
|
|
|
|
# Prefixes {{{
|
|
|
|
# http://www.idpf.org/epub/vocab/package/pfx/
|
|
|
|
|
|
def parse_prefixes(x):
    """Parse an EPUB 3 package ``prefix`` attribute value into a
    mapping of prefix name -> vocabulary URI."""
    ans = {}
    for m in re.finditer(r'(\S+): \s*(\S+)', x):
        ans[m.group(1)] = m.group(2)
    return ans
|
|
|
|
|
|
def read_prefixes(root):
    """Return the vocabulary prefix mapping in effect for this package:
    the reserved prefixes overlaid with any declared on the package
    element's ``prefix`` attribute."""
    ans = RES_PREFIXES.copy()
    ans.update(parse_prefixes(root.get('prefix') or ''))
    return ans
|
|
|
|
|
|
def expand_prefix(raw, prefixes):
    """Expand ``prefix:value`` references in *raw* to their full
    ``vocabulary-uri:value`` form using *prefixes*; unknown prefixes
    are left as-is. None/empty input yields ''."""
    return (regex(r'(\S+)\s*:\s*(\S+)')
            .sub(lambda m: (prefixes.get(m.group(1),
                                         m.group(1)) + ':' + m.group(2)),
                 raw or ''))
|
|
|
|
|
|
def ensure_prefix(root, prefixes, prefix, value=None):
    """Record *prefix* (mapped to *value* or its reserved URI) and
    rewrite the package's ``prefix`` attribute, omitting prefixes that
    are already reserved by the spec."""
    if prefixes is None:
        prefixes = read_prefixes(root)
    prefixes[prefix] = value or RES_PREFIXES[prefix]
    # Reserved prefixes must not be declared explicitly
    prefixes = {k: v for k, v in prefixes.items() if RES_PREFIXES.get(k) != v}
    if prefixes:
        root.set('prefix', ' '.join('%s: %s' % (k, v)
                                    for k, v in prefixes.items()))
    else:
        root.attrib.pop('prefix', None)
|
|
|
|
# }}}
|
|
|
|
# Refines {{{
|
|
|
|
|
|
def read_refines(root):
    """Map element id -> list of <meta refines="#id"> elements found in
    the package metadata. Only local (#-prefixed) references are kept."""
    ans = collections.defaultdict(list)
    for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
        r = meta.get('refines') or ''
        if r.startswith('#'):
            ans[r[1:]].append(meta)
    return ans
|
|
|
|
|
|
def refdef(prop, val, scheme=None):
    """Build a (property, value, scheme) triple for use with
    set_refines()."""
    return prop, val, scheme
|
|
|
|
|
|
def set_refines(elem, existing_refines, *new_refines):
    """Replace all refines targeting *elem* with *new_refines* (triples
    from refdef()), inserting each new meta element immediately after
    *elem* in the tree."""
    eid = ensure_id(elem)
    remove_refines(elem, existing_refines)
    # Insert in reverse so the final document order matches new_refines
    for ref in reversed(new_refines):
        prop, val, scheme = ref
        r = elem.makeelement(oeb_base.tag('opf', 'meta'))
        r.set('refines', '#' + eid), r.set('property', prop)
        r.text = val.strip()
        if scheme:
            r.set('scheme', scheme)
        p = elem.getparent()
        p.insert(p.index(elem)+1, r)
|
|
# }}}
|
|
|
|
# Identifiers {{{
|
|
|
|
|
|
def parse_identifier(ident, val, refines):
    """Parse the dc:identifier element *ident* with text *val* into a
    (scheme, value) pair, returning (None, None) when no usable
    identifier can be extracted (URLs and invalid ISBNs rejected)."""
    idid = ident.get('id')
    refines = refines[idid]
    scheme = None
    lval = val.lower()

    def finalize(scheme, val):
        # Normalize the scheme; ISBN values are validated via
        # check_isbn() and http(s) "identifiers" are rejected
        if not scheme or not val:
            return None, None
        scheme = scheme.lower()
        if scheme in ('http', 'https'):
            return None, None
        if scheme.startswith('isbn'):
            scheme = 'isbn'
        if scheme == 'isbn':
            val = val.split(':')[-1]
            val = check_isbn(val)
            if val is None:
                return None, None
        return scheme, val

    # Try the OPF 2 style opf:scheme attribute, which will be present, for
    # example, in EPUB 3 files that have had their metadata set by an
    # application that only understands EPUB 2.
    scheme = ident.get(oeb_base.tag('opf', 'scheme'))
    if scheme and not lval.startswith('urn:'):
        return finalize(scheme, val)

    # Technically, we should be looking for refines that define the scheme, but
    # the IDioticPF created such a bad spec that they got their own
    # examples wrong, so I cannot be bothered doing this.
    # http://www.idpf.org/epub/301/spec/epub-publications-errata/

    # Parse the value for the scheme
    if lval.startswith('urn:'):
        val = val[4:]

    prefix, rest = val.partition(':')[::2]
    return finalize(prefix, rest)
|
|
|
|
|
|
def read_identifiers(root, prefixes, refines):
    """Collect all parseable identifiers from the package metadata as a
    mapping of scheme -> list of values."""
    ans = collections.defaultdict(list)
    for ident in XPath('./opf:metadata/dc:identifier')(root):
        val = (ident.text or '').strip()
        if val:
            scheme, val = parse_identifier(ident, val, refines)
            if scheme and val:
                ans[scheme].append(val)
    return ans
|
|
|
|
|
|
def set_identifiers(root, prefixes, refines, new_identifiers,
                    force_identifiers=False):
    """Write *new_identifiers* (mapping scheme -> value) into the
    package metadata.

    Existing dc:identifier elements that are blank, unparseable, or
    whose scheme appears in *new_identifiers* are removed; when
    *force_identifiers* is True all non-package identifiers are
    removed. The element referenced by the package's unique-identifier
    attribute is always preserved, and new identifiers are inserted
    before it so it remains valid."""
    uid = root.get('unique-identifier')
    package_identifier = None
    for ident in XPath('./opf:metadata/dc:identifier')(root):
        if uid is not None and uid == ident.get('id'):
            # Never remove the package's unique-identifier element
            package_identifier = ident
            continue
        val = (ident.text or '').strip()
        if not val:
            ident.getparent().remove(ident)
            continue
        scheme, val = parse_identifier(ident, val, refines)
        if (not scheme or
                not val or
                force_identifiers or
                scheme in new_identifiers):
            remove_element(ident, refines)
            continue
    metadata = XPath('./opf:metadata')(root)[0]
    for scheme, val in new_identifiers.items():
        # Fix: the element must be dc:identifier -- 'ident' is not a
        # valid Dublin Core element name and produced broken OPF
        ident = metadata.makeelement(oeb_base.tag('dc', 'identifier'))
        ident.text = '%s:%s' % (scheme, val)
        if package_identifier is None:
            metadata.append(ident)
        else:
            p = package_identifier.getparent()
            p.insert(p.index(package_identifier), ident)
|
|
|
|
|
|
def identifier_writer(name):
    """Build a writer for identifiers of type *name* (e.g. 'calibre' or
    'uuid').

    The returned writer removes existing identifiers of that type
    (except the package's unique-identifier element) and, when *ival*
    is truthy, inserts a new dc:identifier with text 'name:ival'."""
    def writer(root, prefixes, refines, ival=None):
        uid = root.get('unique-identifier')
        package_identifier = None
        for ident in XPath('./opf:metadata/dc:identifier')(root):
            is_package_id = uid is not None and uid == ident.get('id')
            if is_package_id:
                package_identifier = ident
            val = (ident.text or '').strip()
            if (val.startswith(name + ':') or
                    ident.get(oeb_base.tag('opf', 'scheme')) == name) and not is_package_id:
                remove_element(ident, refines)
        metadata = XPath('./opf:metadata')(root)[0]
        if ival:
            # Fix: the element must be dc:identifier -- 'ident' is not
            # a valid Dublin Core element name
            ident = metadata.makeelement(oeb_base.tag('dc', 'identifier'))
            ident.text = '%s:%s' % (name, ival)
            if package_identifier is None:
                metadata.append(ident)
            else:
                # Keep the package identifier last among identifiers so
                # the unique-identifier reference stays intact
                p = package_identifier.getparent()
                p.insert(p.index(package_identifier), ident)
    return writer
|
|
|
|
|
|
# Writers for the identifiers managed internally: the calibre
# application id and the book uuid
set_application_id = identifier_writer('calibre')
set_uuid = identifier_writer('uuid')
|
|
|
|
# }}}
|
|
|
|
# Title {{{
|
|
|
|
|
|
def find_main_title(root, refines, remove_blanks=False):
    """Return the dc:title element holding the main title: the first
    one refined with title-type == 'main', or failing that the first
    non-blank title. Blank titles are removed when *remove_blanks* is
    True. Returns None when there is no usable title."""
    first_title = main_title = None
    for title in XPath('./opf:metadata/dc:title')(root):
        if not title.text or not title.text.strip():
            if remove_blanks:
                remove_element(title, refines)
            continue
        if first_title is None:
            first_title = title
        props = properties_for_id(title.get('id'), refines)
        if props.get('title-type') == 'main':
            main_title = title
            break
    else:
        # No explicitly marked main title: fall back to the first one
        main_title = first_title
    return main_title
|
|
|
|
|
|
@simple_text
def read_title(root, prefixes, refines):
    """Return the whitespace-normalized main title, or None."""
    main_title = find_main_title(root, refines)
    return None if main_title is None else main_title.text.strip()
|
|
|
|
|
|
@simple_text
def read_title_sort(root, prefixes, refines):
    """Return the title sort string: the main title's 'file-as' refine
    if present, otherwise the legacy calibre:title_sort meta. None when
    neither exists."""
    main_title = find_main_title(root, refines)
    if main_title is not None:
        fa = properties_for_id(main_title.get('id'), refines).get('file-as')
        if fa:
            return fa
    # Look for OPF 2.0 style title_sort
    for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
                   'title_sort"]')(root):
        ans = m.get('content')
        if ans:
            return ans
|
|
|
|
|
|
def set_title(root, prefixes, refines, title, title_sort=None):
    """Set the main dc:title (creating it first in metadata if needed),
    mark it with a title-type == 'main' refine plus an optional
    file-as refine, and drop legacy calibre:title_sort metas."""
    main_title = find_main_title(root, refines, remove_blanks=True)
    if main_title is None:
        m = XPath('./opf:metadata')(root)[0]
        main_title = m.makeelement(oeb_base.tag('dc', 'title'))
        m.insert(0, main_title)
    main_title.text = title or None
    ts = [refdef('file-as', title_sort)] if title_sort else ()
    set_refines(main_title, refines, refdef('title-type', 'main'), *ts)
    for m in XPath('./opf:metadata/opf:meta[@name="calibre:'
                   'title_sort"]')(root):
        remove_element(m, refines)
|
|
|
|
# }}}
|
|
|
|
# Languages {{{
|
|
|
|
|
|
def read_languages(root, prefixes, refines):
    """Return the list of canonicalized language codes, excluding
    duplicates and the 'und' (undefined) code."""
    ans = []
    for lang in XPath('./opf:metadata/dc:language')(root):
        val = canonicalize_lang((lang.text or '').strip())
        if val and val not in ans and val != 'und':
            ans.append(val)
    return uniq(ans)
|
|
|
|
|
|
def set_languages(root, prefixes, refines, languages):
    """Replace all dc:language elements with *languages*.

    The new languages are normalized against the languages previously
    present in the OPF, de-duplicated, and 'und' entries dropped; when
    nothing remains a single 'und' element is written since the EPUB
    spec requires at least one dc:language."""
    opf_languages = []
    for lang in XPath('./opf:metadata/dc:language')(root):
        remove_element(lang, refines)
        val = (lang.text or '').strip()
        if val:
            opf_languages.append(val)
    languages = list(filter(lambda x: x and x != 'und',
                            normalize_languages(opf_languages, languages)))
    if not languages:
        # EPUB spec says dc:language is required
        languages = ['und']
    metadata = XPath('./opf:metadata')(root)[0]
    for lang in uniq(languages):
        # Fix: the element name is dc:language -- 'lang' is not a valid
        # Dublin Core element name and produced broken OPF
        dc_lang = metadata.makeelement(oeb_base.tag('dc', 'language'))
        dc_lang.text = lang
        metadata.append(dc_lang)
|
|
# }}}
|
|
|
|
# Creator/Contributor {{{
|
|
|
|
|
|
# An author with a display name and an optional sort string (may be None)
Author = collections.namedtuple('Author', 'name sort')
|
|
|
|
|
|
def is_relators_role(props, q):
    """Return True if the scheme-resolved 'role' entries in *props*
    contain the MARC relator code *q* (e.g. 'aut' or 'bkp'). Roles with
    no scheme namespace are accepted as well as explicit marc:relators
    schemes."""
    for role in props.get('role'):
        if role:
            scheme_ns, scheme, role = role
            if (role.lower() == q and
                    (scheme_ns is None or
                     (scheme_ns, scheme) == (RES_PREFIXES['marc'],
                                             'relators'))):
                return True
    return False
|
|
|
|
|
|
def read_authors(root, prefixes, refines):
    """Return the list of Author(name, sort) tuples for this package.

    Creators explicitly roled as 'aut' (via refines or the OPF 2
    opf:role attribute) are preferred; when none exist, creators with
    no role at all are used. Duplicates are removed preserving order."""
    roled_authors, unroled_authors = [], []

    def author(item, props, val):
        # The sort string comes from the file-as refine, falling back
        # to the OPF 2 style attribute
        aus = None
        file_as = props.get('file-as')
        if file_as:
            aus = file_as[0][-1]
        else:
            # Fix: the OPF 2 attribute is opf:file-as, not opf:file_as;
            # the old name never matched so author sort was lost
            aus = item.get(oeb_base.tag('opf', 'file-as')) or None
        return Author(normalize_whitespace(val), normalize_whitespace(aus))

    for item in XPath('./opf:metadata/dc:creator')(root):
        val = (item.text or '').strip()
        if val:
            props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                                  refines)
            role = props.get('role')
            opf_role = item.get(oeb_base.tag('opf', 'role'))
            if role:
                if is_relators_role(props, 'aut'):
                    roled_authors.append(author(item, props, val))
            elif opf_role:
                if opf_role.lower() == 'aut':
                    roled_authors.append(author(item, props, val))
            else:
                unroled_authors.append(author(item, props, val))

    return uniq(roled_authors or unroled_authors)
|
|
|
|
|
|
def set_authors(root, prefixes, refines, authors):
    """Replace author dc:creator elements with *authors* (Author
    tuples). Creators with an explicit non-author role are left alone.
    Each new creator gets a marc:relators 'aut' role refine and, when
    available, a file-as refine for the sort string."""
    ensure_prefix(root, prefixes, 'marc')
    for item in XPath('./opf:metadata/dc:creator')(root):
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        opf_role = item.get(oeb_base.tag('opf', 'role'))
        # Skip creators that are explicitly something other than author
        if ((opf_role and opf_role.lower() != 'aut') or
                (props.get('role') and not is_relators_role(props, 'aut'))):
            continue
        remove_element(item, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    for author in authors:
        if author.name:
            a = metadata.makeelement(oeb_base.tag('dc', 'creator'))
            aid = ensure_id(a)
            a.text = author.name
            metadata.append(a)
            m = metadata.makeelement(oeb_base.tag('opf', 'meta'),
                                     attrib={'refines': '#' + aid,
                                             'property': 'role',
                                             'scheme': 'marc:relators'})
            m.text = 'aut'
            metadata.append(m)
            if author.sort:
                m = metadata.makeelement(oeb_base.tag('opf', 'meta'),
                                         attrib={'refines': '#' + aid,
                                                 'property': 'file-as'})
                m.text = author.sort
                metadata.append(m)
|
|
|
|
|
|
def read_book_producers(root, prefixes, refines):
    """Return the list of contributor names whose role is 'bkp' (book
    producer), accepting either refine-based roles or the OPF 2
    opf:role attribute."""
    ans = []
    for item in XPath('./opf:metadata/dc:contributor')(root):
        val = (item.text or '').strip()
        if val:
            props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                                  refines)
            role = props.get('role')
            opf_role = item.get(oeb_base.tag('opf', 'role'))
            if role:
                if is_relators_role(props, 'bkp'):
                    ans.append(normalize_whitespace(val))
            elif opf_role and opf_role.lower() == 'bkp':
                ans.append(normalize_whitespace(val))
    return ans
|
|
|
|
|
|
def set_book_producers(root, prefixes, refines, producers):
    """Replace book producer dc:contributor elements with *producers*,
    leaving contributors that have some other explicit role untouched.
    Each new contributor gets a marc:relators 'bkp' role refine."""
    for item in XPath('./opf:metadata/dc:contributor')(root):
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        opf_role = item.get(oeb_base.tag('opf', 'role'))
        # Skip contributors that are explicitly not book producers
        if ((opf_role and opf_role.lower() != 'bkp') or
                (props.get('role') and not is_relators_role(props, 'bkp'))):
            continue
        remove_element(item, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    for bkp in producers:
        if bkp:
            a = metadata.makeelement(oeb_base.tag('dc', 'contributor'))
            aid = ensure_id(a)
            a.text = bkp
            metadata.append(a)
            m = metadata.makeelement(oeb_base.tag('opf', 'meta'),
                                     attrib={'refines': '#' + aid,
                                             'property': 'role',
                                             'scheme': 'marc:relators'})
            m.text = 'bkp'
            metadata.append(m)
|
|
# }}}
|
|
|
|
# Dates {{{
|
|
|
|
|
|
def parse_date(raw, is_w3cdtf=False):
    """Parse a date string, using strict ISO 8601 parsing when
    *is_w3cdtf* is True. Values with no time component are normalized
    via fix_only_date()."""
    raw = raw.strip()
    if is_w3cdtf:
        ans = parse_iso8601(raw, assume_utc=True)
        if 'T' not in raw and ' ' not in raw:
            ans = fix_only_date(ans)
    else:
        ans = parse_date_(raw, assume_utc=True)
        # Treat as date-only when no time separator is present and the
        # parsed time is exactly midnight
        if (' ' not in raw and
                'T' not in raw and
                (ans.hour, ans.minute, ans.second) == (0, 0, 0)):
            ans = fix_only_date(ans)
    return ans
|
|
|
|
|
|
def read_pubdate(root, prefixes, refines):
    """Return the first parseable dc:date as the publication date, or
    None when there is none."""
    for date in XPath('./opf:metadata/dc:date')(root):
        val = (date.text or '').strip()
        if val:
            try:
                return parse_date(val)
            except Exception:
                continue
|
|
|
|
|
|
def set_pubdate(root, prefixes, refines, val):
    """Replace all dc:date elements with the ISO formatted *val*; no
    element is written when the date is undefined."""
    for date in XPath('./opf:metadata/dc:date')(root):
        remove_element(date, refines)
    if not is_date_undefined(val):
        val = isoformat(val)
        m = XPath('./opf:metadata')(root)[0]
        d = m.makeelement(oeb_base.tag('dc', 'date'))
        d.text = val
        m.append(d)
|
|
|
|
|
|
def read_timestamp(root, prefixes, refines):
    """Return the calibre:timestamp as a datetime, trying the EPUB 3
    property form first, then the legacy OPF 2 meta tag. Returns None
    when absent or unparseable."""
    pq = '%s:timestamp' % CALIBRE_PREFIX
    sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        val = (meta.text or '').strip()
        if val:
            prop = expand_prefix(meta.get('property'), prefixes)
            if prop.lower() == pq:
                # The scheme decides whether strict W3CDTF parsing is used
                scheme = expand_prefix(meta.get('scheme'), prefixes).lower()
                try:
                    return parse_date(val, is_w3cdtf=scheme == sq)
                except Exception:
                    continue
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:'
                      'timestamp"]')(root):
        val = meta.get('content')
        if val:
            try:
                return parse_date(val, is_w3cdtf=True)
            except Exception:
                continue
|
|
|
|
|
|
def create_timestamp(root, prefixes, m, val):
    """Append a calibre:timestamp meta (dcterms:W3CDTF scheme) holding
    *val* to the metadata element *m*; no-op for undefined dates."""
    if not is_date_undefined(val):
        ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
        ensure_prefix(root, prefixes, 'dcterms')
        val = w3cdtf(val)
        d = m.makeelement(oeb_base.tag('opf', 'meta'),
                          attrib={'property': 'calibre:timestamp',
                                  'scheme': 'dcterms:W3CDTF'})
        d.text = val
        m.append(d)
|
|
|
|
|
|
def set_timestamp(root, prefixes, refines, val):
    """Remove all existing calibre timestamp metas (both EPUB 3 and
    legacy forms) and write *val* as the new timestamp."""
    pq = '%s:timestamp' % CALIBRE_PREFIX
    for meta in XPath('./opf:metadata/opf:meta')(root):
        prop = expand_prefix(meta.get('property'), prefixes)
        if prop.lower() == pq or meta.get('name') == 'calibre:timestamp':
            remove_element(meta, refines)
    create_timestamp(root, prefixes, XPath('./opf:metadata')(root)[0], val)
|
|
|
|
|
|
def read_last_modified(root, prefixes, refines):
    """Return the dcterms:modified timestamp as a datetime, or None
    when absent or unparseable."""
    pq = '%s:modified' % RES_PREFIXES['dcterms']
    sq = '%s:w3cdtf' % RES_PREFIXES['dcterms']
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        val = (meta.text or '').strip()
        if val:
            prop = expand_prefix(meta.get('property'), prefixes)
            if prop.lower() == pq:
                scheme = expand_prefix(meta.get('scheme'), prefixes).lower()
                try:
                    return parse_date(val, is_w3cdtf=scheme == sq)
                except Exception:
                    continue
|
|
|
|
|
|
def set_last_modified(root, prefixes, refines, val=None):
    """Set the dcterms:modified timestamp to *val* (defaults to now in
    W3CDTF form), reusing an existing modified meta that has no refines
    targeting it, or creating a new one."""
    pq = '%s:modified' % RES_PREFIXES['dcterms']
    val = w3cdtf(val or utcnow())
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        prop = expand_prefix(meta.get('property'), prefixes)
        if prop.lower() == pq:
            iid = meta.get('id')
            if not iid or not refines[iid]:
                # Reuse this element: nothing refines it
                break
    else:
        ensure_prefix(root, prefixes, 'dcterms')
        m = XPath('./opf:metadata')(root)[0]
        meta = m.makeelement(oeb_base.tag('opf', 'meta'),
                             attrib={'property': 'dcterms:modified',
                                     'scheme': 'dcterms:W3CDTF'})
        m.append(meta)
    meta.text = val
|
|
# }}}
|
|
|
|
# Comments {{{
|
|
|
|
|
|
def read_comments(root, prefixes, refines):
    """Concatenate the text of all dc:description elements into a
    single newline-separated comments string ('' when none exist)."""
    parts = []
    for dc in XPath('./opf:metadata/dc:description')(root):
        if dc.text:
            parts.append(dc.text.strip())
    return '\n'.join(parts).strip()
|
|
|
|
|
|
def set_comments(root, prefixes, refines, val):
    """Replace all dc:description elements with the single stripped
    value *val*; nothing is written when *val* is blank."""
    for dc in XPath('./opf:metadata/dc:description')(root):
        remove_element(dc, refines)
    m = XPath('./opf:metadata')(root)[0]
    if val:
        val = val.strip()
        if val:
            # Fix: the element is dc:description -- 'desc' is not a
            # valid Dublin Core element name and produced broken OPF
            c = m.makeelement(oeb_base.tag('dc', 'description'))
            c.text = val
            m.append(c)
|
|
# }}}
|
|
|
|
# Publisher {{{
|
|
|
|
|
|
@simple_text
def read_publisher(root, prefixes, refines):
    """Return the text of the first non-empty dc:publisher, or None."""
    for dc in XPath('./opf:metadata/dc:publisher')(root):
        if dc.text:
            return dc.text
|
|
|
|
|
|
def set_publisher(root, prefixes, refines, val):
    """Replace all dc:publisher elements with the single value *val*;
    nothing is written when *val* is blank."""
    for dc in XPath('./opf:metadata/dc:publisher')(root):
        remove_element(dc, refines)
    m = XPath('./opf:metadata')(root)[0]
    if val:
        val = val.strip()
        if val:
            c = m.makeelement(oeb_base.tag('dc', 'publisher'))
            c.text = normalize_whitespace(val)
            m.append(c)
|
|
# }}}
|
|
|
|
# Tags {{{
|
|
|
|
|
|
def read_tags(root, prefixes, refines):
    """Return the de-duplicated list of tags, splitting
    comma-separated dc:subject values and dropping empties."""
    ans = []
    for dc in XPath('./opf:metadata/dc:subject')(root):
        if dc.text:
            ans.extend(map(normalize_whitespace, dc.text.split(',')))
    return uniq(list(filter(None, ans)))
|
|
|
|
|
|
def set_tags(root, prefixes, refines, val):
    """Replace all dc:subject elements with one element per tag in
    *val* (de-duplicated, empties dropped)."""
    for dc in XPath('./opf:metadata/dc:subject')(root):
        remove_element(dc, refines)
    m = XPath('./opf:metadata')(root)[0]
    if val:
        val = uniq(list(filter(None, val)))
        for x in val:
            # Fix: the element is dc:subject -- 'subj' is not a valid
            # Dublin Core element name and produced broken OPF
            c = m.makeelement(oeb_base.tag('dc', 'subject'))
            c.text = normalize_whitespace(x)
            if c.text:
                m.append(c)
|
|
# }}}
|
|
|
|
# Rating {{{
|
|
|
|
|
|
def read_rating(root, prefixes, refines):
    """Return the rating as a float, trying the EPUB 3 calibre:rating
    property first and then the legacy OPF 2 meta tag. None when
    absent or unparseable."""
    pq = '%s:rating' % CALIBRE_PREFIX
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        val = (meta.text or '').strip()
        if val:
            prop = expand_prefix(meta.get('property'), prefixes)
            if prop.lower() == pq:
                try:
                    return float(val)
                except Exception:
                    continue
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:rating"]')(root):
        val = meta.get('content')
        if val:
            try:
                return float(val)
            except Exception:
                continue
|
|
|
|
|
|
def create_rating(root, prefixes, val):
    """Append a calibre:rating meta with text *val* to the package
    metadata, declaring the calibre prefix."""
    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    m = XPath('./opf:metadata')(root)[0]
    d = m.makeelement(oeb_base.tag('opf', 'meta'), attrib={'property': 'calibre:rating'})
    d.text = val
    m.append(d)
|
|
|
|
|
|
def set_rating(root, prefixes, refines, val):
    """Remove existing rating metas (legacy and EPUB 3 forms) and,
    when *val* is truthy, write it formatted to two significant
    digits."""
    pq = '%s:rating' % CALIBRE_PREFIX
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:rating"]')(root):
        remove_element(meta, refines)
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        prop = expand_prefix(meta.get('property'), prefixes)
        if prop.lower() == pq:
            remove_element(meta, refines)
    if val:
        create_rating(root, prefixes, '%.2g' % val)
|
|
# }}}
|
|
|
|
# Series {{{
|
|
|
|
|
|
def read_series(root, prefixes, refines):
    """Return (series_name, series_index): the EPUB 3
    belongs-to-collection metadata with collection-type == 'series' is
    preferred, falling back to the legacy calibre:series /
    calibre:series_index meta tags. series_name is None when absent and
    the index defaults to 1.0."""
    series_index = 1.0
    for meta in XPath('./opf:metadata/opf:meta[@property="'
                      'belongs-to-collection" and @id]')(root):
        val = (meta.text or '').strip()
        if val:
            props = properties_for_id(meta.get('id'), refines)
            if props.get('collection-type') == 'series':
                try:
                    series_index = float(props.get('group-position').strip())
                except Exception:
                    # Missing/invalid group-position: keep the default
                    pass
                return normalize_whitespace(val), series_index
    for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]'
                    '/@content')(root):
        try:
            series_index = float(si)
            break
        except Exception:
            pass
    for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]'
                   '/@content')(root):
        s = normalize_whitespace(s)
        if s:
            return s, series_index
    return None, series_index
|
|
|
|
|
|
def create_series(root, refines, series, series_index):
    """Append a belongs-to-collection meta for *series*, refined with
    collection-type == 'series' and the given group-position."""
    m = XPath('./opf:metadata')(root)[0]
    d = m.makeelement(oeb_base.tag('opf', 'meta'),
                      attrib={'property': 'belongs-to-collection'})
    d.text = series
    m.append(d)
    set_refines(d, refines, refdef('collection-type', 'series'),
                refdef('group-position', series_index))
|
|
|
|
|
|
def set_series(root, prefixes, refines, series, series_index):
    """Remove all series metadata (legacy calibre metas and EPUB 3
    belongs-to-collection) and write *series*/*series_index* when a
    series name is given."""
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or '
                      '@name="calibre:series_index"]')(root):
        remove_element(meta, refines)
    for meta in XPath('./opf:metadata/opf:meta[@property="'
                      'belongs-to-collection"]')(root):
        remove_element(meta, refines)
    if series:
        create_series(root, refines, series, '%.2g' % series_index)
|
|
# }}}
|
|
|
|
# User metadata {{{
|
|
|
|
|
|
def dict_reader(name, load=json.loads, try2=True):
    """Build a reader for a calibre dict-valued metadata field.

    The returned reader tries the EPUB 3 calibre:*name* property first
    and, when *try2* is True, the legacy OPF 2 meta tag as a fallback.
    Values are decoded with *load* and only dict results are accepted."""
    pq = '%s:%s' % (CALIBRE_PREFIX, name)

    def reader(root, prefixes, refines):
        for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
            val = (meta.text or '').strip()
            if val:
                prop = expand_prefix(meta.get('property'), prefixes)
                if prop.lower() == pq:
                    try:
                        ans = load(val)
                        if isinstance(ans, dict):
                            return ans
                    except Exception:
                        continue
        if try2:
            for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
                              name)(root):
                val = meta.get('content')
                if val:
                    try:
                        ans = load(val)
                        if isinstance(ans, dict):
                            return ans
                    except Exception:
                        continue
    return reader
|
|
|
|
|
|
# Readers for calibre's dict-valued fields
read_user_categories = dict_reader('user_categories')
read_author_link_map = dict_reader('author_link_map')
|
|
|
|
|
|
def dict_writer(name, serialize=dump_dict, remove2=True):
    """Build a writer for a calibre dict-valued metadata field.

    The returned writer removes existing entries (including the legacy
    OPF 2 form when *remove2* is True) and stores *val* serialized via
    *serialize* under the EPUB 3 calibre:*name* property."""
    pq = '%s:%s' % (CALIBRE_PREFIX, name)

    def writer(root, prefixes, refines, val):
        if remove2:
            for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' %
                              name)(root):
                remove_element(meta, refines)
        for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
            prop = expand_prefix(meta.get('property'), prefixes)
            if prop.lower() == pq:
                remove_element(meta, refines)
        if val:
            ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
            m = XPath('./opf:metadata')(root)[0]
            d = m.makeelement(oeb_base.tag('opf', 'meta'),
                              attrib={'property': 'calibre:%s' % name})
            d.text = serialize(val)
            m.append(d)
    return writer
|
|
|
|
|
|
# Writers for calibre's dict-valued fields
set_user_categories = dict_writer('user_categories')
set_author_link_map = dict_writer('author_link_map')
|
|
|
|
|
|
def deserialize_user_metadata(val):
    """Decode the JSON serialization of calibre custom column
    metadata, restoring is_multiple values per field."""
    val = json.loads(val, object_hook=from_json)
    ans = {}
    for name, fm in val.items():
        decode_is_multiple(fm)
        ans[name] = fm
    return ans
|
|
|
|
|
|
# EPUB 3 style custom column reader; the OPF 2 fallback is handled
# explicitly by read_user_metadata(), so try2 is disabled here
read_user_metadata3 = dict_reader('user_metadata',
                                  load=deserialize_user_metadata, try2=False)
|
|
|
|
|
|
def read_user_metadata2(root, remove_tags=False):
    """Read calibre custom column metadata from legacy OPF 2 style
    ``calibre:user_metadata:#name`` meta tags, optionally removing the
    tags as they are read. Unreadable entries are logged and skipped."""
    ans = {}
    for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
                      '"calibre:user_metadata:")]')(root):
        name = meta.get('name')
        name = ':'.join(name.split(':')[2:])
        # Custom column names always start with '#'
        if not name or not name.startswith('#'):
            continue
        fm = meta.get('content')
        if remove_tags:
            meta.getparent().remove(meta)
        try:
            fm = json.loads(fm, object_hook=from_json)
            decode_is_multiple(fm)
            ans[name] = fm
        except Exception:
            prints('Failed to read user metadata:', name)
            import traceback
            traceback.print_exc()
            continue
    return ans
|
|
|
|
|
|
def read_user_metadata(root, prefixes, refines):
    """Read calibre custom column metadata, preferring the EPUB 3
    property form and falling back to the legacy OPF 2 meta tags."""
    return read_user_metadata3(root, prefixes,
                               refines) or read_user_metadata2(root)
|
|
|
|
|
|
def serialize_user_metadata(val):
    """Serialize custom column metadata as stable (sorted, indented),
    non-ASCII-preserving JSON."""
    return json.dumps(object_to_unicode(val), ensure_ascii=False,
                      default=to_json, indent=2, sort_keys=True)
|
|
|
|
|
|
# EPUB 3 style custom column writer; the legacy OPF 2 tags are removed
# separately in set_user_metadata(), so remove2 is disabled here
set_user_metadata3 = dict_writer('user_metadata',
                                 serialize=serialize_user_metadata,
                                 remove2=False)
|
|
|
|
|
|
def set_user_metadata(root, prefixes, refines, val):
    """Write calibre custom column metadata: legacy OPF 2 style tags
    are removed and the is_multiple-encoded field metadata stored via
    set_user_metadata3()."""
    for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, '
                      '"calibre:user_metadata:")]')(root):
        remove_element(meta, refines)
    if val:
        nval = {}
        for name, fm in val.items():
            # Copy before encoding so the caller's dicts are untouched
            fm = fm.copy()
            encode_is_multiple(fm)
            nval[name] = fm
        set_user_metadata3(root, prefixes, refines, nval)
|
|
|
|
# }}}
|
|
|
|
# Covers {{{
|
|
|
|
|
|
def read_raster_cover(root, prefixes, refines):
    """Return the manifest href of the raster cover image, preferring
    the EPUB 3 cover-image property and falling back to the OPF 2
    <meta name="cover"> pointer. Items whose media-type mentions xml or
    html are skipped (covers must be raster images)."""

    def get_href(item):
        mt = item.get('media-type')
        if mt and 'xml' not in mt and 'html' not in mt:
            href = item.get('href')
            if href:
                return href

    for item in items_with_property(root, 'cover-image', prefixes):
        href = get_href(item)
        if href:
            return href

    for item_id in XPath('./opf:metadata/opf:meta[@name="cover"]'
                         '/@content')(root):
        for item in XPath('./opf:manifest/opf:item[@id and @href and '
                          '@media-type]')(root):
            if item.get('id') == item_id:
                href = get_href(item)
                if href:
                    return href
|
|
|
|
|
|
def ensure_is_only_raster_cover(root, prefixes, refines,
                                raster_cover_item_href):
    """Make the manifest item with *raster_cover_item_href* the only
    cover: OPF 2 cover metas are removed and the cover-image property
    is stripped from all other manifest items."""
    for item in XPath('./opf:metadata/opf:meta[@name="cover"]')(root):
        remove_element(item, refines)
    for item in items_with_property(root, 'cover-image', prefixes):
        prop = normalize_whitespace(item.get('properties')
                                    .replace('cover-image', ''))
        if prop:
            item.set('properties', prop)
        else:
            # cover-image was the only property, drop the attribute
            del item.attrib['properties']
    for item in XPath('./opf:manifest/opf:item')(root):
        if item.get('href') == raster_cover_item_href:
            item.set('properties',
                     normalize_whitespace((item.get('properties')
                                           or '') + ' cover-image'))
|
|
|
|
# }}}
|
|
|
|
# Reading/setting Metadata objects {{{
|
|
|
|
|
|
def first_spine_item(root, prefixes, refines):
    """Return the href of the manifest item referenced by the first
    spine itemref, or None when the spine/manifest do not match."""
    for i in XPath('./opf:spine/opf:itemref/@idref')(root):
        for item in XPath('./opf:manifest/opf:item')(root):
            if item.get('id') == i:
                return item.get('href') or None
|
|
|
|
|
|
def set_last_modified_in_opf(root):
    """Convenience wrapper: update dcterms:modified on *root* using
    freshly read prefixes and refines."""
    prefixes, refines = read_prefixes(root), read_refines(root)
    set_last_modified(root, prefixes, refines)
|
|
|
|
|
|
def read_metadata(root, ver=None, return_extra_data=False):
    """Build a Metadata object from the parsed OPF *root*.

    When *return_extra_data* is True, returns the tuple
    (metadata, ver, raster cover href, first spine item href)
    instead of just the Metadata object."""
    ans = base.Metadata('Unknown', ['Unknown'])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.items():
        # calibre and uuid identifiers are stored on dedicated fields,
        # everything else goes into the identifiers map
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes,
                                               refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes,
                                               refines) or ans.user_categories
    for name, fm in (read_user_metadata(root, prefixes,
                                        refines) or {}).items():
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = (ans, ver, read_raster_cover(root, prefixes, refines),
               first_spine_item(root, prefixes, refines))
    return ans
|
|
|
|
|
|
def get_metadata(stream):
    """Parse the OPF document in *stream* and return its book metadata.

    Convenience wrapper: builds the XML tree with :func:`parse_opf` and
    hands it straight to :func:`read_metadata`.
    """
    return read_metadata(parse_opf(stream))
|
|
|
|
|
|
def apply_metadata(root, mi, cover_prefix='', cover_data=None,
                   apply_null=False, update_timestamp=False,
                   force_identifiers=False, add_missing_cover=True):
    """Write the metadata from the Metadata object *mi* into the parsed OPF
    tree *root*, mutating it in place.

    :param root: lxml root element of an OPF package document.
    :param mi: source Metadata object whose fields are copied into the OPF.
    :param cover_prefix: path prefix used when synthesizing a cover manifest
        entry (a trailing ``/`` is appended if missing).
    :param cover_data: raw cover image bytes; only its truthiness is used
        here, to decide whether a missing cover manifest item should be added.
    :param apply_null: when True, fields that are null in *mi* still
        overwrite (i.e. clear) the corresponding OPF values.
    :param update_timestamp: when True and ``mi.timestamp`` is set, the
        calibre timestamp metadata is updated.
    :param force_identifiers: passed through to :func:`set_identifiers`.
    :param add_missing_cover: when True, a ``cover.jpg`` manifest item is
        created if the OPF has no raster cover and *cover_data* is available.
    :return: the (possibly newly created) raster cover name, or a false-y
        value if there is none.
    """
    prefixes, refines = read_prefixes(root), read_refines(root)
    # Snapshot the existing metadata so user-defined columns already in the
    # OPF can be merged with the incoming ones below.
    current_mi = read_metadata(root)
    # ok(field) decides whether a field from mi should be written: with
    # apply_null every field is written (nulls clear existing values),
    # otherwise only non-null fields are.
    if apply_null:
        def ok(x):
            return True
    else:
        def ok(x):
            return not mi.is_null(x)
    if ok('identifiers'):
        set_identifiers(root, prefixes, refines, mi.identifiers,
                        force_identifiers=force_identifiers)
    if ok('title'):
        set_title(root, prefixes, refines, mi.title, mi.title_sort)
    if ok('languages'):
        set_languages(root, prefixes, refines, mi.languages)
    if ok('book_producer'):
        set_book_producers(root, prefixes, refines, (mi.book_producer,))
    # Pair each author with the corresponding entry from the parsed
    # author_sort string; authors beyond the sort list get a None sort.
    aus = string_to_authors(mi.author_sort or '')
    authors = []
    for i, aut in enumerate(mi.authors):
        authors.append(Author(aut, aus[i] if i < len(aus) else None))
    if authors or apply_null:
        set_authors(root, prefixes, refines, authors)
    if ok('pubdate'):
        set_pubdate(root, prefixes, refines, mi.pubdate)
    if update_timestamp and mi.timestamp is not None:
        set_timestamp(root, prefixes, refines, mi.timestamp)
    if ok('comments'):
        set_comments(root, prefixes, refines, mi.comments)
    if ok('publisher'):
        set_publisher(root, prefixes, refines, mi.publisher)
    if ok('tags'):
        set_tags(root, prefixes, refines, mi.tags)
    # Ratings below 0.1 are treated as "no rating" and never written.
    if ok('rating') and mi.rating is not None and mi.rating > 0.1:
        set_rating(root, prefixes, refines, mi.rating)
    if ok('series'):
        set_series(root, prefixes, refines, mi.series, mi.series_index or 1)
    if ok('author_link_map'):
        set_author_link_map(root, prefixes, refines,
                            getattr(mi, 'author_link_map', None))
    if ok('user_categories'):
        set_user_categories(root, prefixes, refines,
                            getattr(mi, 'user_categories', None))
    # We ignore apply_null for the next two to match the behavior with opf2.py
    if mi.application_id:
        set_application_id(root, prefixes, refines, mi.application_id)
    if mi.uuid:
        set_uuid(root, prefixes, refines, mi.uuid)

    # Merge incoming user (custom column) metadata into what the OPF already
    # has. current_user_metadata starts as the OPF's existing columns and is
    # selectively overwritten from mi, honoring apply_null per datatype.
    new_user_metadata = mi.get_all_user_metadata(True)
    current_user_metadata = current_mi.get_all_user_metadata(True)
    # Sentinel distinguishing "key absent" from "key present with value None".
    missing = object()

    for key in tuple(new_user_metadata):
        meta = new_user_metadata.get(key)
        if meta is None:
            if apply_null:
                new_user_metadata[key] = None
            continue
        dt = meta.get('datatype')
        if dt == 'text' and meta.get('is_multiple'):
            # Multi-valued text columns: write when non-empty (or clearing).
            val = mi.get(key, [])
            if val or apply_null:
                current_user_metadata[key] = meta
        elif dt in {'int', 'float', 'bool'}:
            # Numeric/bool columns: 0/False are valid values, so only the
            # missing sentinel (not falsiness) means "absent".
            val = mi.get(key, missing)
            if val is missing:
                if apply_null:
                    current_user_metadata[key] = meta
            elif apply_null or val is not None:
                current_user_metadata[key] = meta
        elif apply_null or not mi.is_null(key):
            current_user_metadata[key] = meta

    set_user_metadata(root, prefixes, refines, current_user_metadata)
    # If the OPF lacks a raster cover but we have cover bytes, register a
    # cover.jpg manifest item and mark it as the one-and-only raster cover.
    raster_cover = read_raster_cover(root, prefixes, refines)
    if not raster_cover and cover_data and add_missing_cover:
        if cover_prefix and not cover_prefix.endswith('/'):
            cover_prefix += '/'
        name = cover_prefix + 'cover.jpg'
        i = create_manifest_item(root, name, 'cover')
        if i is not None:
            ensure_is_only_raster_cover(root, prefixes, refines, name)
            raster_cover = name

    pretty_print_opf(root)
    return raster_cover
|
|
|
|
|
|
def set_metadata(stream, mi, cover_prefix='', cover_data=None,
                 apply_null=False, update_timestamp=False,
                 force_identifiers=False, add_missing_cover=True):
    """Parse the OPF in *stream* and apply the metadata from *mi* to it.

    Thin wrapper around :func:`apply_metadata`; see that function for the
    meaning of the keyword arguments and the return value (the raster
    cover name, if any).
    """
    root = parse_opf(stream)
    # BUG FIX: add_missing_cover was accepted but never forwarded, so
    # callers passing add_missing_cover=False were silently ignored and
    # apply_metadata always used its own default of True.
    return apply_metadata(
        root, mi, cover_prefix=cover_prefix, cover_data=cover_data,
        apply_null=apply_null, update_timestamp=update_timestamp,
        force_identifiers=force_identifiers,
        add_missing_cover=add_missing_cover)
# }}}
|
|
# }}}
|
|
|
|
|
|
if __name__ == '__main__':
    import sys
    # Use a context manager so the file handle is closed even if
    # get_metadata raises (the original leaked the open file object).
    with open(sys.argv[-1], 'rb') as stream:
        print(get_metadata(stream))
|