# Mirror of https://github.com/gryf/ebook-converter.git
# (synced 2026-01-06 03:04:11 +01:00; 1114 lines, 37 KiB, Python)
import json
|
|
import re
|
|
from collections import defaultdict, namedtuple
|
|
from functools import wraps
|
|
from ebook_converter.polyglot.builtins import iteritems
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter import prints
|
|
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors
|
|
from ebook_converter.ebooks.metadata.book.base import Metadata
|
|
from ebook_converter.ebooks.metadata.book.json_codec import (
|
|
decode_is_multiple, encode_is_multiple, object_to_unicode
|
|
)
|
|
from ebook_converter.ebooks.metadata.utils import (
|
|
create_manifest_item, ensure_unique, normalize_languages, parse_opf,
|
|
pretty_print_opf
|
|
)
|
|
from ebook_converter.ebooks.oeb.base import DC, OPF, OPF2_NSMAP
|
|
from ebook_converter.utils.config import from_json, to_json
|
|
from ebook_converter.utils.date import (
|
|
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow,
|
|
w3cdtf
|
|
)
|
|
from ebook_converter.utils.iso8601 import parse_iso8601
|
|
from ebook_converter.utils.localization import canonicalize_lang
|
|
|
|
|
|
# Utils {{{
|
|
# Compiled XPath objects keyed by expression string; populated by XPath().
_xpath_cache = {}
|
|
# Compiled regular expressions keyed by (pattern, flags); populated by regex().
_re_cache = {}
|
|
|
|
|
|
def uniq(vals):
    ''' Return the items of vals as a list with duplicates dropped.

    The first occurrence of every item is kept and the original order is
    preserved. A falsy vals (None, empty container) gives an empty list. '''
    result = []
    already_seen = set()
    for item in vals or ():
        if item in already_seen:
            continue
        already_seen.add(item)
        result.append(item)
    return result
|
|
|
|
|
|
def dump_dict(cats):
    ''' Serialize cats to a JSON string (non-ASCII kept as is, keys that JSON
    cannot represent skipped). A falsy cats is serialized as an empty dict. '''
    payload = object_to_unicode(cats if cats else {})
    return json.dumps(payload, ensure_ascii=False, skipkeys=True)
|
|
|
|
|
|
def XPath(x):
    ''' Return a compiled etree.XPath for the expression x, bound to the
    OPF2_NSMAP namespaces. Compiled expressions are memoized in
    _xpath_cache. '''
    compiled = _xpath_cache.get(x)
    if compiled is None:
        compiled = _xpath_cache[x] = etree.XPath(x, namespaces=OPF2_NSMAP)
    return compiled
|
|
|
|
|
|
def regex(r, flags=0):
    ''' Return the compiled regular expression for pattern r and flags,
    memoized in _re_cache under the key (r, flags). '''
    key = (r, flags)
    if key not in _re_cache:
        _re_cache[key] = re.compile(r, flags)
    return _re_cache[key]
|
|
|
|
|
|
def remove_refines(e, refines):
    ''' Detach from the tree every element listed in refines under the id of
    e, then drop that id from refines. '''
    eid = e.get('id')
    for refining in refines[eid]:
        refining.getparent().remove(refining)
    refines.pop(eid, None)
|
|
|
|
|
|
def remove_element(e, refines):
    ''' Remove e from its parent, after first removing every element that
    refines it (see remove_refines). '''
    remove_refines(e, refines)
    parent = e.getparent()
    parent.remove(e)
|
|
|
|
|
|
def properties_for_id(item_id, refines):
    ''' Return a dict mapping property name to stripped text for the elements
    stored in refines under item_id.

    Elements without a property attribute or without text are ignored; when a
    property occurs more than once the last one wins. A falsy item_id gives
    an empty dict. '''
    found = {}
    if not item_id:
        return found
    for elem in refines[item_id]:
        name = elem.get('property')
        text = (elem.text or '').strip()
        if name and text:
            found[name] = text
    return found
|
|
|
|
|
|
def properties_for_id_with_scheme(item_id, prefixes, refines):
    ''' Return a defaultdict mapping property name to a list of
    (scheme_ns, scheme, value) tuples for the elements stored in refines
    under item_id.

    A scheme attribute of the form "prefix:name" whose prefix is present in
    prefixes is split into the namespace for that prefix and the bare name;
    any other scheme is reported unchanged with a scheme_ns of None. Elements
    without a property attribute or without text are ignored. '''
    collected = defaultdict(list)
    if not item_id:
        return collected
    for elem in refines[item_id]:
        name = elem.get('property')
        if not name:
            continue
        text = (elem.text or '').strip()
        if not text:
            continue
        scheme, scheme_ns = elem.get('scheme') or None, None
        if scheme is not None:
            head, sep, tail = scheme.partition(':')
            namespace = prefixes.get(head) if head and tail else None
            if namespace:
                scheme_ns, scheme = namespace, tail
        collected[name].append((scheme_ns, scheme, text))
    return collected
|
|
|
|
|
|
def getroot(elem):
    ''' Walk up via getparent() and return the topmost ancestor of elem
    (elem itself when it has no parent). '''
    parent = elem.getparent()
    while parent is not None:
        elem = parent
        parent = elem.getparent()
    return elem
|
|
|
|
|
|
def ensure_id(elem):
    ''' Return the id attribute of elem, first assigning one (via
    ensure_unique against the ids found from the root of elem) when it has
    none. '''
    existing = elem.get('id')
    if existing:
        return existing
    taken = frozenset(XPath('//*/@id')(getroot(elem)))
    new_id = ensure_unique('id', taken)
    elem.set('id', new_id)
    return new_id
|
|
|
|
|
|
def normalize_whitespace(text):
    ''' Collapse every run of whitespace in text to a single space and strip
    the ends. Falsy input (None, empty string) is returned unchanged. '''
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return text
|
|
|
|
|
|
def simple_text(f):
    ''' Decorator: pass the return value of f through
    normalize_whitespace(). '''
    @wraps(f)
    def normalized(*args, **kw):
        raw = f(*args, **kw)
        return normalize_whitespace(raw)
    return normalized
|
|
|
|
|
|
def items_with_property(root, q, prefixes=None):
    ''' Yield every manifest item whose properties attribute contains the
    property q, comparing prefix-expanded, lower-cased names.

    When prefixes is None it is read from root. Each matching item is
    yielded once. '''
    if prefixes is None:
        prefixes = read_prefixes(root)
    # NOTE(review): the query is expanded with the module-level known_prefixes
    # while the item properties are expanded with ``prefixes`` -- confirm that
    # this asymmetry is intended.
    wanted = expand_prefix(q, known_prefixes).lower()
    for item in XPath("./opf:manifest/opf:item[@properties]")(root):
        declared = (item.get('properties') or '').lower().split()
        if any(expand_prefix(prop, prefixes) == wanted for prop in declared):
            yield item
|
|
|
|
# }}}
|
|
|
|
# Prefixes {{{
|
|
|
|
# http://www.idpf.org/epub/vocab/package/pfx/
|
|
|
|
|
|
# Prefixes that are always available (merged into every prefix map by
# read_prefixes()), mapped to their namespace URLs.
reserved_prefixes = dict(
    dcterms='http://purl.org/dc/terms/',
    epubsc='http://idpf.org/epub/vocab/sc/#',
    marc='http://id.loc.gov/vocabulary/',
    media='http://www.idpf.org/epub/vocab/overlays/#',
    onix='http://www.editeur.org/ONIX/book/codelists/current.html#',
    rendition='http://www.idpf.org/vocab/rendition/#',
    schema='http://schema.org/',
    xsd='http://www.w3.org/2001/XMLSchema#',
)

# Namespace used for calibre specific properties.
CALIBRE_PREFIX = 'https://calibre-ebook.com'
# The reserved prefixes plus the calibre one.
known_prefixes = dict(reserved_prefixes, calibre=CALIBRE_PREFIX)
|
|
|
|
|
|
def parse_prefixes(x):
    ''' Parse a string of "name: url" declarations into a dict mapping each
    name to its url. '''
    mapping = {}
    for match in re.finditer(r'(\S+): \s*(\S+)', x):
        name, url = match.groups()
        mapping[name] = url
    return mapping
|
|
|
|
|
|
def read_prefixes(root):
    ''' Return the prefix map for root: the reserved prefixes overlaid with
    whatever the prefix attribute of root declares. '''
    declared = parse_prefixes(root.get('prefix') or '')
    merged = dict(reserved_prefixes)
    merged.update(declared)
    return merged
|
|
|
|
|
|
def expand_prefix(raw, prefixes):
    ''' Replace the prefix of every "prefix:name" token in raw with its value
    from prefixes; unknown prefixes are left as they are. A falsy raw is
    treated as the empty string. '''
    def substitute(match):
        prefix, name = match.group(1), match.group(2)
        return prefixes.get(prefix, prefix) + ':' + name

    return regex(r'(\S+)\s*:\s*(\S+)').sub(substitute, raw or '')
|
|
|
|
|
|
def ensure_prefix(root, prefixes, prefix, value=None):
    ''' Record prefix in prefixes (using value, or the reserved URL for that
    prefix when value is falsy) and rewrite the prefix attribute of root.

    Only declarations that differ from the reserved ones are written; when
    none remain the attribute is removed. prefixes is read from root when it
    is None. '''
    if prefixes is None:
        prefixes = read_prefixes(root)
    prefixes[prefix] = value or reserved_prefixes[prefix]
    declarations = [
        '%s: %s' % (k, v) for k, v in iteritems(prefixes)
        if reserved_prefixes.get(k) != v
    ]
    if not declarations:
        root.attrib.pop('prefix', None)
        return
    root.set('prefix', ' '.join(declarations))
|
|
|
|
# }}}
|
|
|
|
# Refines {{{
|
|
|
|
|
|
def read_refines(root):
    ''' Return a defaultdict mapping element id to the list of metadata meta
    elements whose refines attribute is "#" followed by that id. '''
    by_target = defaultdict(list)
    for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
        target = meta.get('refines') or ''
        if target[:1] == '#':
            by_target[target[1:]].append(meta)
    return by_target
|
|
|
|
|
|
def refdef(prop, val, scheme=None):
    ''' Bundle a refinement as the (prop, val, scheme) tuple consumed by
    set_refines(). '''
    definition = prop, val, scheme
    return definition
|
|
|
|
|
|
def set_refines(elem, existing_refines, *new_refines):
    ''' Replace the refinements of elem.

    Existing refining elements are removed, then one meta element per
    (prop, val, scheme) tuple in new_refines is inserted right after elem,
    keeping the order in which they were passed. elem gets an id if it has
    none. '''
    target = '#' + ensure_id(elem)
    remove_refines(elem, existing_refines)
    # Each meta is inserted directly after elem, so walk the definitions
    # backwards to end up with them in the order given.
    for prop, val, scheme in reversed(new_refines):
        meta = elem.makeelement(OPF('meta'))
        meta.set('refines', target)
        meta.set('property', prop)
        meta.text = val.strip()
        if scheme:
            meta.set('scheme', scheme)
        parent = elem.getparent()
        parent.insert(parent.index(elem) + 1, meta)
|
|
# }}}
|
|
|
|
# Identifiers {{{
|
|
|
|
|
|
def parse_identifier(ident, val, refines):
    ''' Work out the (scheme, value) pair of a dc:identifier element.

    ident is the element and val its stripped text. refines is accepted for
    signature compatibility with the other readers but is not consulted.
    Returns (None, None) when no usable scheme/value can be determined, when
    the scheme is http/https, or when an ISBN fails check_isbn(). '''

    def finalize(scheme, val):
        if not scheme or not val:
            return None, None
        scheme = scheme.lower()
        if scheme in ('http', 'https'):
            return None, None
        if scheme.startswith('isbn'):
            scheme = 'isbn'
        if scheme == 'isbn':
            # Keep only what follows the last colon before validating.
            val = check_isbn(val.split(':')[-1])
            if val is None:
                return None, None
        return scheme, val

    lval = val.lower()

    # Try the OPF 2 style opf:scheme attribute, which will be present, for
    # example, in EPUB 3 files that have had their metadata set by an
    # application that only understands EPUB 2.
    scheme = ident.get(OPF('scheme'))
    if scheme and not lval.startswith('urn:'):
        return finalize(scheme, val)

    # Technically, refines that define the scheme should be looked at here,
    # but the specification's own examples get this wrong, so it is skipped.
    # http://www.idpf.org/epub/301/spec/epub-publications-errata/

    # Parse the value for the scheme
    if lval.startswith('urn:'):
        val = val[4:]

    prefix, rest = val.partition(':')[::2]
    return finalize(prefix, rest)
|
|
|
|
|
|
def read_identifiers(root, prefixes, refines):
    ''' Return a defaultdict mapping identifier scheme to the list of values
    found in the dc:identifier elements of root. Identifiers that
    parse_identifier() cannot resolve are skipped. '''
    found = defaultdict(list)
    for ident in XPath('./opf:metadata/dc:identifier')(root):
        text = (ident.text or '').strip()
        if not text:
            continue
        scheme, value = parse_identifier(ident, text, refines)
        if scheme and value:
            found[scheme].append(value)
    return found
|
|
|
|
|
|
def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=False):
    ''' Write new_identifiers (a scheme -> value mapping) as dc:identifier
    elements.

    The package identifier (the one named by the unique-identifier attribute
    of root) is never touched. Other identifiers are removed when they are
    empty, unparseable, have a scheme present in new_identifiers, or
    unconditionally when force_identifiers is set. New identifiers are
    inserted before the package identifier, or appended to the metadata when
    there is none. '''
    uid = root.get('unique-identifier')
    package_identifier = None
    for ident in XPath('./opf:metadata/dc:identifier')(root):
        if uid is not None and uid == ident.get('id'):
            package_identifier = ident
            continue
        text = (ident.text or '').strip()
        if not text:
            ident.getparent().remove(ident)
            continue
        scheme, value = parse_identifier(ident, text, refines)
        keep = scheme and value and not force_identifiers and scheme not in new_identifiers
        if not keep:
            remove_element(ident, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    for scheme, value in iteritems(new_identifiers):
        ident = metadata.makeelement(DC('identifier'))
        ident.text = '%s:%s' % (scheme, value)
        if package_identifier is not None:
            parent = package_identifier.getparent()
            parent.insert(parent.index(package_identifier), ident)
        else:
            metadata.append(ident)
|
|
|
|
|
|
def identifier_writer(name):
    ''' Build a writer(root, prefixes, refines, ival=None) function for the
    identifier scheme name.

    The writer removes every dc:identifier (other than the package
    identifier) whose text starts with "name:" or whose opf:scheme attribute
    equals name. When ival is truthy it then adds a "name:ival" identifier,
    before the package identifier if there is one, else at the end of the
    metadata. '''
    def writer(root, prefixes, refines, ival=None):
        uid = root.get('unique-identifier')
        package_identifier = None
        for ident in XPath('./opf:metadata/dc:identifier')(root):
            if uid is not None and uid == ident.get('id'):
                # The package identifier is remembered but never removed.
                package_identifier = ident
                continue
            text = (ident.text or '').strip()
            if text.startswith(name + ':') or ident.get(OPF('scheme')) == name:
                remove_element(ident, refines)
        metadata = XPath('./opf:metadata')(root)[0]
        if not ival:
            return
        ident = metadata.makeelement(DC('identifier'))
        ident.text = '%s:%s' % (name, ival)
        if package_identifier is not None:
            parent = package_identifier.getparent()
            parent.insert(parent.index(package_identifier), ident)
        else:
            metadata.append(ident)
    return writer
|
|
|
|
|
|
# Writer for the "calibre:" identifier (see identifier_writer).
set_application_id = identifier_writer('calibre')
|
|
# Writer for the "uuid:" identifier (see identifier_writer).
set_uuid = identifier_writer('uuid')
|
|
|
|
# }}}
|
|
|
|
# Title {{{
|
|
|
|
|
|
def find_main_title(root, refines, remove_blanks=False):
    ''' Return the dc:title element to treat as the main title, or None.

    The first non-blank title refined with title-type "main" wins; failing
    that, the first non-blank title. With remove_blanks set, blank titles
    encountered along the way are removed from the tree. '''
    fallback = None
    for title in XPath('./opf:metadata/dc:title')(root):
        if not (title.text and title.text.strip()):
            if remove_blanks:
                remove_element(title, refines)
            continue
        if fallback is None:
            fallback = title
        if properties_for_id(title.get('id'), refines).get('title-type') == 'main':
            return title
    return fallback
|
|
|
|
|
|
@simple_text
def read_title(root, prefixes, refines):
    ''' Return the whitespace-normalized text of the main title, or None
    when there is no usable title. '''
    main_title = find_main_title(root, refines)
    if main_title is None:
        return None
    return main_title.text.strip()
|
|
|
|
|
|
@simple_text
def read_title_sort(root, prefixes, refines):
    ''' Return the sort form of the title, or None.

    The file-as refinement of the main title is preferred; otherwise the
    first non-empty OPF 2.0 style calibre:title_sort meta is used. '''
    main_title = find_main_title(root, refines)
    if main_title is not None:
        file_as = properties_for_id(main_title.get('id'), refines).get('file-as')
        if file_as:
            return file_as
    # Look for OPF 2.0 style title_sort
    legacy = XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root)
    contents = (m.get('content') for m in legacy)
    return next((content for content in contents if content), None)
|
|
|
|
|
|
def set_title(root, prefixes, refines, title, title_sort=None):
    ''' Set the main title (creating a dc:title as first metadata child when
    none exists) and its refinements.

    The title is marked with title-type "main" and, when title_sort is
    truthy, a file-as refinement. Blank titles and OPF 2.0 style
    calibre:title_sort metas are removed. '''
    main_title = find_main_title(root, refines, remove_blanks=True)
    if main_title is None:
        metadata = XPath('./opf:metadata')(root)[0]
        main_title = metadata.makeelement(DC('title'))
        metadata.insert(0, main_title)
    main_title.text = title or None

    new_refines = [refdef('title-type', 'main')]
    if title_sort:
        new_refines.append(refdef('file-as', title_sort))
    set_refines(main_title, refines, *new_refines)

    for legacy in XPath('./opf:metadata/opf:meta[@name="calibre:title_sort"]')(root):
        remove_element(legacy, refines)
|
|
|
|
# }}}
|
|
|
|
# Languages {{{
|
|
|
|
|
|
def read_languages(root, prefixes, refines):
    ''' Return the canonicalized dc:language values in document order,
    without duplicates, skipping empty results and "und". '''
    codes = (
        canonicalize_lang((lang.text or '').strip())
        for lang in XPath('./opf:metadata/dc:language')(root)
    )
    return uniq([code for code in codes if code and code != 'und'])
|
|
|
|
|
|
def set_languages(root, prefixes, refines, languages):
    ''' Replace all dc:language elements with the result of
    normalize_languages(existing, languages), dropping empty and "und"
    entries; writes a single "und" when nothing is left. '''
    existing = []
    for lang in XPath('./opf:metadata/dc:language')(root):
        remove_element(lang, refines)
        text = (lang.text or '').strip()
        if text:
            existing.append(text)

    wanted = [
        code for code in normalize_languages(existing, languages)
        if code and code != 'und'
    ]
    if not wanted:
        # EPUB spec says dc:language is required
        wanted = ['und']

    metadata = XPath('./opf:metadata')(root)[0]
    for code in uniq(wanted):
        element = metadata.makeelement(DC('language'))
        element.text = code
        metadata.append(element)
|
|
# }}}
|
|
|
|
# Creator/Contributor {{{
|
|
|
|
|
|
# A creator as used by read_authors()/set_authors(): display name plus sort
# (file-as) string, which may be None.
Author = namedtuple('Author', 'name sort')
|
|
|
|
|
|
def is_relators_role(props, q):
    ''' Return True if props records the role q.

    props maps property names to lists of (scheme_ns, scheme, value) tuples,
    as built by properties_for_id_with_scheme(). A role matches when its
    value equals q case-insensitively and it either has no scheme namespace
    or uses the marc "relators" scheme. A props with no role entry at all
    simply gives False. '''
    # props.get() returns None for a missing key; treat that as "no roles"
    # instead of failing while iterating None.
    for entry in props.get('role') or ():
        if not entry:
            continue
        scheme_ns, scheme, role = entry
        if role.lower() == q and (
                scheme_ns is None or
                (scheme_ns, scheme) == (reserved_prefixes['marc'], 'relators')):
            return True
    return False
|
|
|
|
|
|
def read_authors(root, prefixes, refines):
    ''' Return the list of Author tuples read from the dc:creator elements.

    Creators explicitly marked with the role "aut" (via a role refinement
    or the opf:role attribute) are preferred; creators without any role are
    used only when no such creator exists. Creators with another role are
    ignored. Duplicates are removed. '''
    with_role, without_role = [], []

    def make_author(item, props, name):
        file_as = props.get('file-as')
        if file_as:
            # Value part of the first (scheme_ns, scheme, value) tuple.
            sort = file_as[0][-1]
        else:
            sort = item.get(OPF('file-as')) or None
        return Author(normalize_whitespace(name), normalize_whitespace(sort))

    for item in XPath('./opf:metadata/dc:creator')(root):
        name = (item.text or '').strip()
        if not name:
            continue
        props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
        refined_role = props.get('role')
        opf_role = item.get(OPF('role'))
        if refined_role:
            bucket, is_author = with_role, is_relators_role(props, 'aut')
        elif opf_role:
            bucket, is_author = with_role, opf_role.lower() == 'aut'
        else:
            bucket, is_author = without_role, True
        if is_author:
            bucket.append(make_author(item, props, name))

    return uniq(with_role or without_role)
|
|
|
|
|
|
def set_authors(root, prefixes, refines, authors):
    ''' Replace the author creators of root with authors.

    Existing dc:creator elements are removed unless they carry a role other
    than "aut". For every entry of authors with a name, a dc:creator is
    appended together with a marc:relators "aut" role refinement and, when a
    sort value is present, a file-as refinement. '''
    ensure_prefix(root, prefixes, 'marc')
    for creator in XPath('./opf:metadata/dc:creator')(root):
        props = properties_for_id_with_scheme(creator.get('id'), prefixes, refines)
        opf_role = creator.get(OPF('role'))
        if opf_role and opf_role.lower() != 'aut':
            continue
        if props.get('role') and not is_relators_role(props, 'aut'):
            continue
        remove_element(creator, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    for author in authors:
        if not author.name:
            continue
        creator = metadata.makeelement(DC('creator'))
        target = '#' + ensure_id(creator)
        creator.text = author.name
        metadata.append(creator)

        role = metadata.makeelement(
            OPF('meta'),
            attrib={'refines': target, 'property': 'role', 'scheme': 'marc:relators'})
        role.text = 'aut'
        metadata.append(role)

        if author.sort:
            file_as = metadata.makeelement(
                OPF('meta'), attrib={'refines': target, 'property': 'file-as'})
            file_as.text = author.sort
            metadata.append(file_as)
|
|
|
|
|
|
def read_book_producers(root, prefixes, refines):
    ''' Return the whitespace-normalized text of every dc:contributor whose
    role (role refinement, else opf:role attribute) is "bkp". '''
    producers = []
    for item in XPath('./opf:metadata/dc:contributor')(root):
        name = (item.text or '').strip()
        if not name:
            continue
        props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
        refined_role = props.get('role')
        opf_role = item.get(OPF('role'))
        if refined_role:
            is_producer = is_relators_role(props, 'bkp')
        else:
            is_producer = bool(opf_role) and opf_role.lower() == 'bkp'
        if is_producer:
            producers.append(normalize_whitespace(name))
    return producers
|
|
|
|
|
|
def set_book_producers(root, prefixes, refines, producers):
    ''' Replace the book producer contributors of root with producers.

    Existing dc:contributor elements are removed unless they carry a role
    other than "bkp". Each truthy entry of producers is appended as a
    dc:contributor with a marc:relators "bkp" role refinement. '''
    for contributor in XPath('./opf:metadata/dc:contributor')(root):
        props = properties_for_id_with_scheme(contributor.get('id'), prefixes, refines)
        opf_role = contributor.get(OPF('role'))
        if opf_role and opf_role.lower() != 'bkp':
            continue
        if props.get('role') and not is_relators_role(props, 'bkp'):
            continue
        remove_element(contributor, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    for producer in producers:
        if not producer:
            continue
        contributor = metadata.makeelement(DC('contributor'))
        target = '#' + ensure_id(contributor)
        contributor.text = producer
        metadata.append(contributor)
        role = metadata.makeelement(
            OPF('meta'),
            attrib={'refines': target, 'property': 'role', 'scheme': 'marc:relators'})
        role.text = 'bkp'
        metadata.append(role)
|
|
# }}}
|
|
|
|
# Dates {{{
|
|
|
|
|
|
def parse_date(raw, is_w3cdtf=False):
    ''' Parse the date string raw, assuming UTC.

    W3CDTF values go through parse_iso8601(), everything else through the
    general date parser. A value with no time part (no "T" or space, and for
    non-W3CDTF values a parsed time of 00:00:00) is passed through
    fix_only_date(). '''
    raw = raw.strip()
    date_only = 'T' not in raw and ' ' not in raw
    if is_w3cdtf:
        parsed = parse_iso8601(raw, assume_utc=True)
        needs_fix = date_only
    else:
        parsed = parse_date_(raw, assume_utc=True)
        needs_fix = date_only and (parsed.hour, parsed.minute, parsed.second) == (0, 0, 0)
    return fix_only_date(parsed) if needs_fix else parsed
|
|
|
|
|
|
def read_pubdate(root, prefixes, refines):
    ''' Return the first dc:date value that parses as a date, or None. '''
    for date in XPath('./opf:metadata/dc:date')(root):
        text = (date.text or '').strip()
        if not text:
            continue
        try:
            return parse_date(text)
        except Exception:
            # Unparseable: try the next dc:date element.
            pass
|
|
|
|
|
|
def set_pubdate(root, prefixes, refines, val):
    ''' Remove all dc:date elements and, unless val is an undefined date,
    append a new one holding isoformat(val). '''
    for date in XPath('./opf:metadata/dc:date')(root):
        remove_element(date, refines)
    if is_date_undefined(val):
        return
    text = isoformat(val)
    metadata = XPath('./opf:metadata')(root)[0]
    element = metadata.makeelement(DC('date'))
    element.text = text
    metadata.append(element)
|
|
|
|
|
|
def read_timestamp(root, prefixes, refines):
    ''' Return the calibre timestamp of the book, or None.

    The calibre:timestamp property metas are tried first (parsed as W3CDTF
    when their scheme says so), then the OPF 2.0 style calibre:timestamp
    name/content metas. Values that fail to parse are skipped. '''
    wanted_prop = '%s:timestamp' % CALIBRE_PREFIX
    w3cdtf_scheme = '%s:w3cdtf' % reserved_prefixes['dcterms']
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        text = (meta.text or '').strip()
        if not text:
            continue
        if expand_prefix(meta.get('property'), prefixes).lower() != wanted_prop:
            continue
        scheme = expand_prefix(meta.get('scheme'), prefixes).lower()
        try:
            return parse_date(text, is_w3cdtf=scheme == w3cdtf_scheme)
        except Exception:
            pass
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:timestamp"]')(root):
        content = meta.get('content')
        if not content:
            continue
        try:
            return parse_date(content, is_w3cdtf=True)
        except Exception:
            pass
|
|
|
|
|
|
def create_timestamp(root, prefixes, m, val):
    ''' Append a calibre:timestamp meta (W3CDTF formatted) for val to the
    metadata element m. Does nothing when val is an undefined date. '''
    if is_date_undefined(val):
        return
    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    ensure_prefix(root, prefixes, 'dcterms')
    text = w3cdtf(val)
    attributes = {'property': 'calibre:timestamp', 'scheme': 'dcterms:W3CDTF'}
    meta = m.makeelement(OPF('meta'), attrib=attributes)
    meta.text = text
    m.append(meta)
|
|
|
|
|
|
def set_timestamp(root, prefixes, refines, val):
    ''' Remove every existing calibre timestamp meta (property or OPF 2.0
    name style) and write val via create_timestamp(). '''
    wanted_prop = '%s:timestamp' % CALIBRE_PREFIX
    for meta in XPath('./opf:metadata/opf:meta')(root):
        is_property = expand_prefix(meta.get('property'), prefixes).lower() == wanted_prop
        if is_property or meta.get('name') == 'calibre:timestamp':
            remove_element(meta, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    create_timestamp(root, prefixes, metadata, val)
|
|
|
|
|
|
def read_last_modified(root, prefixes, refines):
    ''' Return the first dcterms:modified value that parses as a date
    (parsed as W3CDTF when its scheme says so), or None. '''
    wanted_prop = '%s:modified' % reserved_prefixes['dcterms']
    w3cdtf_scheme = '%s:w3cdtf' % reserved_prefixes['dcterms']
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        text = (meta.text or '').strip()
        if not text:
            continue
        if expand_prefix(meta.get('property'), prefixes).lower() != wanted_prop:
            continue
        scheme = expand_prefix(meta.get('scheme'), prefixes).lower()
        try:
            return parse_date(text, is_w3cdtf=scheme == w3cdtf_scheme)
        except Exception:
            pass
|
|
|
|
|
|
def set_last_modified(root, prefixes, refines, val=None):
    ''' Set the dcterms:modified value to val (now, in UTC, when val is
    falsy), formatted as W3CDTF.

    The first dcterms:modified meta that has no id, or whose id has no
    refinements, is reused; when there is none a new meta is appended. '''
    wanted_prop = '%s:modified' % reserved_prefixes['dcterms']
    stamp = w3cdtf(val or utcnow())
    target = None
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        if expand_prefix(meta.get('property'), prefixes).lower() != wanted_prop:
            continue
        iid = meta.get('id')
        if not iid or not refines[iid]:
            target = meta
            break
    if target is None:
        ensure_prefix(root, prefixes, 'dcterms')
        metadata = XPath('./opf:metadata')(root)[0]
        target = metadata.makeelement(
            OPF('meta'), attrib={'property': 'dcterms:modified', 'scheme': 'dcterms:W3CDTF'})
        metadata.append(target)
    target.text = stamp
|
|
# }}}
|
|
|
|
# Comments {{{
|
|
|
|
|
|
def read_comments(root, prefixes, refines):
    ''' Return the stripped text of all dc:description elements joined by
    newlines ('' when there are none). '''
    pieces = [
        '\n' + dc.text.strip()
        for dc in XPath('./opf:metadata/dc:description')(root)
        if dc.text
    ]
    return ''.join(pieces).strip()
|
|
|
|
|
|
def set_comments(root, prefixes, refines, val):
    ''' Remove all dc:description elements and, when val has non-whitespace
    content, append a new one holding the stripped val. '''
    for dc in XPath('./opf:metadata/dc:description')(root):
        remove_element(dc, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    text = val.strip() if val else ''
    if not text:
        return
    description = metadata.makeelement(DC('description'))
    description.text = text
    metadata.append(description)
|
|
# }}}
|
|
|
|
# Publisher {{{
|
|
|
|
|
|
@simple_text
def read_publisher(root, prefixes, refines):
    ''' Return the whitespace-normalized text of the first dc:publisher that
    has text, or None. '''
    texts = (dc.text for dc in XPath('./opf:metadata/dc:publisher')(root))
    return next((text for text in texts if text), None)
|
|
|
|
|
|
def set_publisher(root, prefixes, refines, val):
    ''' Remove all dc:publisher elements and, when val has non-whitespace
    content, append a new one holding the whitespace-normalized val. '''
    for dc in XPath('./opf:metadata/dc:publisher')(root):
        remove_element(dc, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    text = val.strip() if val else ''
    if not text:
        return
    publisher = metadata.makeelement(DC('publisher'))
    publisher.text = normalize_whitespace(text)
    metadata.append(publisher)
|
|
# }}}
|
|
|
|
# Tags {{{
|
|
|
|
|
|
def read_tags(root, prefixes, refines):
    ''' Return the tags found in the dc:subject elements.

    Each subject is split on commas; the parts are whitespace-normalized,
    empty ones dropped and duplicates removed, keeping document order. '''
    tags = [
        normalize_whitespace(part)
        for dc in XPath('./opf:metadata/dc:subject')(root) if dc.text
        for part in dc.text.split(',')
    ]
    return uniq([tag for tag in tags if tag])
|
|
|
|
|
|
def set_tags(root, prefixes, refines, val):
    ''' Remove all dc:subject elements and append one per distinct, truthy
    entry of val whose whitespace-normalized text is not empty. '''
    for dc in XPath('./opf:metadata/dc:subject')(root):
        remove_element(dc, refines)
    metadata = XPath('./opf:metadata')(root)[0]
    if not val:
        return
    for tag in uniq([entry for entry in val if entry]):
        subject = metadata.makeelement(DC('subject'))
        subject.text = normalize_whitespace(tag)
        if subject.text:
            metadata.append(subject)
|
|
# }}}
|
|
|
|
# Rating {{{
|
|
|
|
|
|
def read_rating(root, prefixes, refines):
    ''' Return the rating as a float, or None.

    The calibre:rating property metas are tried first, then the OPF 2.0
    style calibre:rating name/content metas. Values that are not valid
    floats are skipped. '''
    wanted_prop = '%s:rating' % CALIBRE_PREFIX
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        text = (meta.text or '').strip()
        if not text:
            continue
        if expand_prefix(meta.get('property'), prefixes).lower() != wanted_prop:
            continue
        try:
            return float(text)
        except Exception:
            pass
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:rating"]')(root):
        content = meta.get('content')
        if not content:
            continue
        try:
            return float(content)
        except Exception:
            pass
|
|
|
|
|
|
def create_rating(root, prefixes, val):
    ''' Append a calibre:rating meta with the text val to the metadata,
    declaring the calibre prefix on root. '''
    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    metadata = XPath('./opf:metadata')(root)[0]
    rating = metadata.makeelement(OPF('meta'), attrib={'property': 'calibre:rating'})
    rating.text = val
    metadata.append(rating)
|
|
|
|
|
|
def set_rating(root, prefixes, refines, val):
    ''' Remove every existing rating meta (OPF 2.0 name style and
    calibre:rating property) and, when val is truthy, write it formatted
    with '%.2g'. '''
    wanted_prop = '%s:rating' % CALIBRE_PREFIX
    for legacy in XPath('./opf:metadata/opf:meta[@name="calibre:rating"]')(root):
        remove_element(legacy, refines)
    for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
        if expand_prefix(meta.get('property'), prefixes).lower() == wanted_prop:
            remove_element(meta, refines)
    if not val:
        return
    create_rating(root, prefixes, '%.2g' % val)
|
|
# }}}
|
|
|
|
# Series {{{
|
|
|
|
|
|
def read_series(root, prefixes, refines):
    ''' Return (series name or None, series index as float).

    An EPUB 3 belongs-to-collection meta refined with collection-type
    "series" is preferred; its group-position refinement gives the index.
    Otherwise the OPF 2.0 style calibre:series / calibre:series_index metas
    are used. The index defaults to 1.0 when missing or not a number. '''
    series_index = 1.0
    for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection" and @id]')(root):
        val = (meta.text or '').strip()
        if not val:
            continue
        props = properties_for_id(meta.get('id'), refines)
        if props.get('collection-type') != 'series':
            continue
        try:
            series_index = float(props.get('group-position').strip())
        except Exception:
            # Missing or non-numeric group-position: keep the default index.
            pass
        return normalize_whitespace(val), series_index

    for si in XPath('./opf:metadata/opf:meta[@name="calibre:series_index"]/@content')(root):
        try:
            series_index = float(si)
        except Exception:
            # Not a number: try the next calibre:series_index meta. A bare
            # except here would also swallow KeyboardInterrupt/SystemExit.
            continue
        break

    for s in XPath('./opf:metadata/opf:meta[@name="calibre:series"]/@content')(root):
        s = normalize_whitespace(s)
        if s:
            return s, series_index
    return None, series_index
|
|
|
|
|
|
def create_series(root, refines, series, series_index):
    ''' Append a belongs-to-collection meta named series to the metadata and
    refine it with collection-type "series" and group-position
    series_index (a string). '''
    metadata = XPath('./opf:metadata')(root)[0]
    collection = metadata.makeelement(OPF('meta'), attrib={'property': 'belongs-to-collection'})
    collection.text = series
    metadata.append(collection)
    new_refines = (
        refdef('collection-type', 'series'),
        refdef('group-position', series_index),
    )
    set_refines(collection, refines, *new_refines)
|
|
|
|
|
|
def _format_series_index(series_index):
    ''' Format a series index with at most two decimals and no trailing
    zeros, e.g. 1 -> '1', 2.5 -> '2.5', 10.5 -> '10.5', 123 -> '123'. '''
    # '%.2g' would keep only two *significant* digits, turning 10.5 into '10'
    # and 123 into '1.2e+02'; fixed-point formatting keeps the value intact.
    return ('%.2f' % series_index).rstrip('0').rstrip('.')


def set_series(root, prefixes, refines, series, series_index):
    ''' Remove every existing series declaration (OPF 2.0 style
    calibre:series / calibre:series_index metas and all
    belongs-to-collection metas) and, when series is truthy, write it with
    series_index as its group-position. '''
    for meta in XPath('./opf:metadata/opf:meta[@name="calibre:series" or @name="calibre:series_index"]')(root):
        remove_element(meta, refines)
    for meta in XPath('./opf:metadata/opf:meta[@property="belongs-to-collection"]')(root):
        remove_element(meta, refines)
    if series:
        create_series(root, refines, series, _format_series_index(series_index))
|
|
# }}}
|
|
|
|
# User metadata {{{
|
|
|
|
|
|
def dict_reader(name, load=json.loads, try2=True):
    ''' Build a reader(root, prefixes, refines) for the dict-valued calibre
    property name.

    The reader returns the first value, deserialized with load, that is a
    dict: calibre:<name> property metas are tried first and, when try2 is
    set, the OPF 2.0 style calibre:<name> name/content metas afterwards.
    Values that fail to load or are not dicts are skipped; None is returned
    when nothing usable is found. '''
    wanted_prop = '%s:%s' % (CALIBRE_PREFIX, name)

    def load_dict(raw):
        # Returns the loaded dict, or None when loading fails or yields
        # something that is not a dict.
        try:
            loaded = load(raw)
        except Exception:
            return None
        return loaded if isinstance(loaded, dict) else None

    def reader(root, prefixes, refines):
        for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
            text = (meta.text or '').strip()
            if not text:
                continue
            if expand_prefix(meta.get('property'), prefixes).lower() != wanted_prop:
                continue
            result = load_dict(text)
            if result is not None:
                return result
        if not try2:
            return None
        for meta in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root):
            content = meta.get('content')
            if not content:
                continue
            result = load_dict(content)
            if result is not None:
                return result
    return reader
|
|
|
|
|
|
# Reader for the calibre user_categories dict (see dict_reader).
read_user_categories = dict_reader('user_categories')
|
|
# Reader for the calibre author_link_map dict (see dict_reader).
read_author_link_map = dict_reader('author_link_map')
|
|
|
|
|
|
def dict_writer(name, serialize=dump_dict, remove2=True):
    ''' Build a writer(root, prefixes, refines, val) for the dict-valued
    calibre property name.

    The writer removes the existing calibre:<name> property metas (and, when
    remove2 is set, the OPF 2.0 style name/content metas too) and, when val
    is truthy, appends a new property meta holding serialize(val). '''
    wanted_prop = '%s:%s' % (CALIBRE_PREFIX, name)

    def writer(root, prefixes, refines, val):
        if remove2:
            for legacy in XPath('./opf:metadata/opf:meta[@name="calibre:%s"]' % name)(root):
                remove_element(legacy, refines)
        for meta in XPath('./opf:metadata/opf:meta[@property]')(root):
            if expand_prefix(meta.get('property'), prefixes).lower() == wanted_prop:
                remove_element(meta, refines)
        if not val:
            return
        ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
        metadata = XPath('./opf:metadata')(root)[0]
        element = metadata.makeelement(OPF('meta'), attrib={'property': 'calibre:%s' % name})
        element.text = serialize(val)
        metadata.append(element)
    return writer
|
|
|
|
|
|
# Writer for the calibre user_categories dict (see dict_writer).
set_user_categories = dict_writer('user_categories')
|
|
# Writer for the calibre author_link_map dict (see dict_writer).
set_author_link_map = dict_writer('author_link_map')
|
|
|
|
|
|
def deserialize_user_metadata(val):
    ''' Load the JSON string val (using from_json as object hook) and return
    a dict of its entries, after running decode_is_multiple() on each
    value. '''
    fields = dict(iteritems(json.loads(val, object_hook=from_json)))
    for field_metadata in fields.values():
        decode_is_multiple(field_metadata)
    return fields
|
|
|
|
|
|
# Reader for the calibre:user_metadata property only (try2=False, so the OPF
# 2.0 style name/content metas are not consulted); see dict_reader.
read_user_metadata3 = dict_reader('user_metadata', load=deserialize_user_metadata, try2=False)
|
|
|
|
|
|
def read_user_metadata2(root, remove_tags=False):
    ''' Read user metadata stored in OPF 2.0 style
    calibre:user_metadata:#name metas.

    Returns a dict mapping "#name" to the decoded field metadata. Entries
    whose name does not start with "#" are ignored, entries that fail to
    decode are reported and skipped. With remove_tags set, the metas that
    are read are removed from the tree. '''
    fields = {}
    for meta in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root):
        # Everything after the "calibre:user_metadata:" prefix.
        name = meta.get('name').split(':', 2)[2]
        if not name or not name.startswith('#'):
            continue
        content = meta.get('content')
        if remove_tags:
            meta.getparent().remove(meta)
        try:
            field_metadata = json.loads(content, object_hook=from_json)
            decode_is_multiple(field_metadata)
            fields[name] = field_metadata
        except Exception:
            prints('Failed to read user metadata:', name)
            import traceback
            traceback.print_exc()
    return fields
|
|
|
|
|
|
def read_user_metadata(root, prefixes, refines):
    ''' Return the user metadata from the calibre:user_metadata property,
    falling back to the OPF 2.0 style metas when that yields nothing. '''
    from_property = read_user_metadata3(root, prefixes, refines)
    if from_property:
        return from_property
    return read_user_metadata2(root)
|
|
|
|
|
|
def serialize_user_metadata(val):
    ''' Serialize val to indented JSON with sorted keys, keeping non-ASCII
    characters and using to_json for otherwise unserializable objects. '''
    options = dict(ensure_ascii=False, default=to_json, indent=2, sort_keys=True)
    return json.dumps(object_to_unicode(val), **options)
|
|
|
|
|
|
# Writer for the calibre:user_metadata property; remove2=False leaves the OPF
# 2.0 style name/content metas alone (see dict_writer).
set_user_metadata3 = dict_writer('user_metadata', serialize=serialize_user_metadata, remove2=False)
|
|
|
|
|
|
def set_user_metadata(root, prefixes, refines, val):
    ''' Remove the OPF 2.0 style calibre:user_metadata:* metas and, when val
    is truthy, write it as the calibre:user_metadata property.

    Every field metadata dict in val is copied and passed through
    encode_is_multiple() before being written, so val is not modified. '''
    for legacy in XPath('./opf:metadata/opf:meta[starts-with(@name, "calibre:user_metadata:")]')(root):
        remove_element(legacy, refines)
    if not val:
        return
    encoded = {}
    for name, field_metadata in val.items():
        field_copy = field_metadata.copy()
        encode_is_multiple(field_copy)
        encoded[name] = field_copy
    set_user_metadata3(root, prefixes, refines, encoded)
|
|
|
|
# }}}
|
|
|
|
# Covers {{{
|
|
|
|
|
|
def read_raster_cover(root, prefixes, refines):
    ''' Return the href of the raster cover image, or None.

    Manifest items with the cover-image property are tried first, then the
    item named by an OPF 2.0 style <meta name="cover">. Items whose media
    type is missing or mentions xml or html are not accepted. '''

    def raster_href(item):
        media_type = item.get('media-type')
        if not media_type or 'xml' in media_type or 'html' in media_type:
            return None
        return item.get('href') or None

    for item in items_with_property(root, 'cover-image', prefixes):
        href = raster_href(item)
        if href:
            return href

    for cover_id in XPath('./opf:metadata/opf:meta[@name="cover"]/@content')(root):
        for item in XPath('./opf:manifest/opf:item[@id and @href and @media-type]')(root):
            if item.get('id') != cover_id:
                continue
            href = raster_href(item)
            if href:
                return href
|
|
|
|
|
|
def ensure_is_only_raster_cover(root, prefixes, refines, raster_cover_item_href):
    ''' Make the manifest item with href raster_cover_item_href the only
    cover image.

    OPF 2.0 style <meta name="cover"> elements are removed, the cover-image
    property is stripped from every item that has it, and it is then added
    to every manifest item with the given href. '''
    for meta in XPath('./opf:metadata/opf:meta[@name="cover"]')(root):
        remove_element(meta, refines)

    for item in items_with_property(root, 'cover-image', prefixes):
        without_cover = item.get('properties').replace('cover-image', '')
        remaining = normalize_whitespace(without_cover)
        if remaining:
            item.set('properties', remaining)
        else:
            del item.attrib['properties']

    for item in XPath('./opf:manifest/opf:item')(root):
        if item.get('href') != raster_cover_item_href:
            continue
        current = item.get('properties') or ''
        item.set('properties', normalize_whitespace(current + ' cover-image'))
|
|
|
|
# }}}
|
|
|
|
# Reading/setting Metadata objects {{{
|
|
|
|
|
|
def first_spine_item(root, prefixes, refines):
    ''' Return the href (None when empty) of the manifest item referenced by
    the first spine itemref that resolves to a manifest item; None when no
    itemref resolves. '''
    for idref in XPath('./opf:spine/opf:itemref/@idref')(root):
        for item in XPath('./opf:manifest/opf:item')(root):
            if item.get('id') != idref:
                continue
            return item.get('href') or None
|
|
|
|
|
|
def set_last_modified_in_opf(root):
    ''' Set the dcterms:modified value of root to the current time. '''
    prefixes = read_prefixes(root)
    refines = read_refines(root)
    set_last_modified(root, prefixes, refines)
|
|
|
|
|
|
def read_metadata(root, ver=None, return_extra_data=False):
    ''' Read the metadata of the OPF tree root into a Metadata object.

    Fields for which nothing is found keep the defaults of the freshly
    created Metadata object. With return_extra_data set, a tuple
    (metadata, ver, raster cover href, first spine item href) is returned
    instead of just the metadata. '''
    # NOTE(review): ``_`` is neither defined nor imported in this module;
    # presumably a translation function installed as a builtin -- confirm.
    mi = Metadata(_('Unknown'), [_('Unknown')])
    prefixes = read_prefixes(root)
    refines = read_refines(root)

    # The calibre and uuid identifiers have dedicated fields; for all
    # schemes only the first value is used.
    plain_ids = {}
    for scheme, values in iteritems(read_identifiers(root, prefixes, refines)):
        first = values[0]
        if scheme == 'calibre':
            mi.application_id = first
        elif scheme == 'uuid':
            mi.uuid = first
        else:
            plain_ids[scheme] = first
    mi.set_identifiers(plain_ids)

    mi.title = read_title(root, prefixes, refines) or mi.title
    mi.title_sort = read_title_sort(root, prefixes, refines) or mi.title_sort
    mi.languages = read_languages(root, prefixes, refines) or mi.languages

    authors = read_authors(root, prefixes, refines)
    names = [author.name for author in authors]
    sorts = [author.sort for author in authors]
    mi.authors = names or mi.authors
    mi.author_sort = authors_to_string(sorts) or mi.author_sort

    producers = read_book_producers(root, prefixes, refines)
    if producers and producers[0]:
        mi.book_producer = producers[0]

    date_readers = (
        ('pubdate', read_pubdate),
        ('timestamp', read_timestamp),
        ('last_modified', read_last_modified),
    )
    for field, reader in date_readers:
        value = reader(root, prefixes, refines)
        if not is_date_undefined(value):
            setattr(mi, field, value)

    mi.comments = read_comments(root, prefixes, refines) or mi.comments
    mi.publisher = read_publisher(root, prefixes, refines) or mi.publisher
    mi.tags = read_tags(root, prefixes, refines) or mi.tags
    mi.rating = read_rating(root, prefixes, refines) or mi.rating

    series, series_index = read_series(root, prefixes, refines)
    if series:
        mi.series, mi.series_index = series, series_index

    mi.author_link_map = read_author_link_map(root, prefixes, refines) or mi.author_link_map
    mi.user_categories = read_user_categories(root, prefixes, refines) or mi.user_categories
    for name, field_metadata in iteritems(read_user_metadata(root, prefixes, refines) or {}):
        mi.set_user_metadata(name, field_metadata)

    if not return_extra_data:
        return mi
    return mi, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
|
|
|
|
|
|
def get_metadata(stream):
    ''' Parse the OPF in stream and return its metadata (see
    read_metadata). '''
    return read_metadata(parse_opf(stream))
|
|
|
|
|
|
def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
    ''' Write the metadata object mi into the OPF tree root.

    Unless apply_null is set, fields for which mi.is_null() is true are left
    untouched. The timestamp is only written when update_timestamp is set.
    When the OPF has no raster cover, cover_data is truthy and
    add_missing_cover is set, a "cover.jpg" manifest item (under
    cover_prefix) is created and marked as the cover image. The tree is
    pretty printed at the end.

    Returns the href of the raster cover (existing or newly added), or a
    falsy value when there is none. '''
    prefixes = read_prefixes(root)
    refines = read_refines(root)
    current_mi = read_metadata(root)

    def ok(x):
        # With apply_null every field is written, even null ones.
        return True if apply_null else not mi.is_null(x)

    if ok('identifiers'):
        set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers)
    if ok('title'):
        set_title(root, prefixes, refines, mi.title, mi.title_sort)
    if ok('languages'):
        set_languages(root, prefixes, refines, mi.languages)
    if ok('book_producer'):
        set_book_producers(root, prefixes, refines, (mi.book_producer,))

    # Pair every author with the sort string at the same position, if any.
    sorts = string_to_authors(mi.author_sort or '')
    authors = [
        Author(name, sorts[i] if i < len(sorts) else None)
        for i, name in enumerate(mi.authors)
    ]
    if authors or apply_null:
        set_authors(root, prefixes, refines, authors)

    if ok('pubdate'):
        set_pubdate(root, prefixes, refines, mi.pubdate)
    if update_timestamp and mi.timestamp is not None:
        set_timestamp(root, prefixes, refines, mi.timestamp)
    if ok('comments'):
        set_comments(root, prefixes, refines, mi.comments)
    if ok('publisher'):
        set_publisher(root, prefixes, refines, mi.publisher)
    if ok('tags'):
        set_tags(root, prefixes, refines, mi.tags)
    if ok('rating') and mi.rating is not None and mi.rating > 0.1:
        set_rating(root, prefixes, refines, mi.rating)
    if ok('series'):
        set_series(root, prefixes, refines, mi.series, mi.series_index or 1)
    if ok('author_link_map'):
        set_author_link_map(root, prefixes, refines, getattr(mi, 'author_link_map', None))
    if ok('user_categories'):
        set_user_categories(root, prefixes, refines, getattr(mi, 'user_categories', None))

    # We ignore apply_null for the next two to match the behavior with opf2.py
    if mi.application_id:
        set_application_id(root, prefixes, refines, mi.application_id)
    if mi.uuid:
        set_uuid(root, prefixes, refines, mi.uuid)

    # Merge the user metadata of mi into what the OPF already contains.
    new_user_metadata = mi.get_all_user_metadata(True)
    current_user_metadata = current_mi.get_all_user_metadata(True)
    missing = object()
    for key in tuple(new_user_metadata):
        meta = new_user_metadata.get(key)
        if meta is None:
            if apply_null:
                new_user_metadata[key] = None
            continue
        datatype = meta.get('datatype')
        if datatype == 'text' and meta.get('is_multiple'):
            value = mi.get(key, [])
            should_apply = value or apply_null
        elif datatype in {'int', 'float', 'bool'}:
            value = mi.get(key, missing)
            if value is missing:
                should_apply = apply_null
            else:
                should_apply = apply_null or value is not None
        else:
            should_apply = apply_null or not mi.is_null(key)
        if should_apply:
            current_user_metadata[key] = meta

    set_user_metadata(root, prefixes, refines, current_user_metadata)

    raster_cover = read_raster_cover(root, prefixes, refines)
    if not raster_cover and cover_data and add_missing_cover:
        if cover_prefix and not cover_prefix.endswith('/'):
            cover_prefix += '/'
        name = cover_prefix + 'cover.jpg'
        created = create_manifest_item(root, name, 'cover')
        if created is not None:
            ensure_is_only_raster_cover(root, prefixes, refines, name)
            raster_cover = name

    pretty_print_opf(root)
    return raster_cover
|
|
|
|
|
|
def set_metadata(stream, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
    ''' Parse the OPF in stream and apply the metadata object mi to it.

    All keyword arguments are passed on to apply_metadata(), including
    add_missing_cover. Returns whatever apply_metadata() returns (the raster
    cover href). '''
    root = parse_opf(stream)
    return apply_metadata(
        root, mi, cover_prefix=cover_prefix, cover_data=cover_data,
        apply_null=apply_null, update_timestamp=update_timestamp,
        force_identifiers=force_identifiers,
        # Previously dropped, so add_missing_cover=False had no effect.
        add_missing_cover=add_missing_cover)
|
|
# }}}
|
|
|
|
|
|
if __name__ == '__main__':
    # Print the metadata of the OPF file named by the last command line
    # argument.
    import sys
    # Use a context manager so the file is closed once it has been read.
    with open(sys.argv[-1], 'rb') as opf_stream:
        print(get_metadata(opf_stream))
|