1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-19 12:43:35 +02:00

Use the real constants module.

This is an ongoing refactor of the calibre code to make it more
readable, and transform it into something more coherent.

In this patch, there are changes regarding imports for some modules:
instead of polluting the namespace of each module with other modules'
symbols, which often were themselves imported from other modules. Yuck.
This commit is contained in:
2020-05-29 17:04:53 +02:00
parent ee4801228f
commit ce89f5c9d1
54 changed files with 2383 additions and 2081 deletions

View File

@@ -1,38 +1,32 @@
"""
Read meta information from fb2 files
"""
import os, random
from functools import partial
from string import ascii_letters, digits
import functools
import os
import random
import string
from lxml import etree
from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.imghdr import identify
from ebook_converter import guess_type, guess_all_extensions, prints, force_unicode
from ebook_converter import guess_type, guess_all_extensions, prints, \
force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode
__license__ = 'GPL v3'
__copyright__ = ('2011, Roman Mukhin <ramses_ru at hotmail.com>, '
'2008, Anatoly Shipitsin <norguhtar at gmail.com>')
NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
'fb21': 'http://www.gribuser.ru/xml/fictionbook/2.1',
'xlink': 'http://www.w3.org/1999/xlink'}
NAMESPACES = {
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
'fb21' : 'http://www.gribuser.ru/xml/fictionbook/2.1',
'xlink' : 'http://www.w3.org/1999/xlink'
}
tostring = partial(etree.tostring, method='text', encoding='unicode')
tostring = functools.partial(etree.tostring, method='text', encoding='unicode')
def XLINK(tag):
return '{%s}%s'%(NAMESPACES['xlink'], tag)
return '{%s}%s' % (NAMESPACES['xlink'], tag)
class Context(object):
@@ -52,7 +46,7 @@ class Context(object):
return etree.XPath(*args, namespaces=self.namespaces)
def get_or_create(self, parent, tag, attribs={}, at_start=True):
xpathstr='./fb:'+tag
xpathstr = './fb:'+tag
for n, v in attribs.items():
xpathstr += '[@%s="%s"]' % (n, v)
ans = self.XPath(xpathstr)(parent)
@@ -73,7 +67,7 @@ class Context(object):
def clear_meta_tags(self, doc, tag):
for parent in ('title-info', 'src-title-info', 'publish-info'):
for x in self.XPath('//fb:%s/fb:%s'%(parent, tag))(doc):
for x in self.XPath('//fb:%s/fb:%s' % (parent, tag))(doc):
x.getparent().remove(x)
def text2fb2(self, parent, text):
@@ -117,42 +111,41 @@ def get_metadata(stream):
book_title = str(book_title)
else:
book_title = force_unicode(os.path.splitext(
os.path.basename(getattr(stream, 'name',
'Unknown')))[0])
os.path.basename(getattr(stream, 'name', 'Unknown')))[0])
mi = MetaInformation(book_title, authors)
try:
_parse_cover(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_comments(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_tags(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_series(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_isbn(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_publisher(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_pubdate(root, mi, ctx)
except:
except Exception:
pass
try:
_parse_language(root, mi, ctx)
except:
except Exception:
pass
return mi
@@ -160,11 +153,11 @@ def get_metadata(stream):
def _parse_authors(root, ctx):
authors = []
# pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent!
# Those are fallbacks: <src-title-info>, <document-info>
# pick up authors but only from 1 secrion <title-info>; otherwise it is
# not consistent! Those are fallbacks: <src-title-info>, <document-info>
author = None
for author_sec in ['title-info', 'src-title-info', 'document-info']:
for au in ctx.XPath('//fb:%s/fb:author'%author_sec)(root):
for au in ctx.XPath('//fb:%s/fb:author' % author_sec)(root):
author = _parse_author(au, ctx)
if author:
authors.append(author)
@@ -207,24 +200,26 @@ def _parse_book_title(root, ctx):
xp_ti = '//fb:title-info/fb:book-title/text()'
xp_pi = '//fb:publish-info/fb:book-title/text()'
xp_si = '//fb:src-title-info/fb:book-title/text()'
book_title = ctx.XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root)
book_title = ctx.XPath('normalize-space(%s|%s|%s)' %
(xp_ti, xp_pi, xp_si))(root)
return book_title
def _parse_cover(root, mi, ctx):
# pickup from <title-info>, if not exists it fallbacks to <src-title-info>
imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/@xlink:href), "#")')(root)
imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/'
'@xlink:href), "#")')(root)
if imgid:
try:
_parse_cover_data(root, imgid, mi, ctx)
except:
except Exception:
pass
def _parse_cover_data(root, imgid, mi, ctx):
from ebook_converter.ebooks.fb2 import base64_decode
elm_binary = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root)
elm_binary = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
if elm_binary:
mimetype = elm_binary[0].get('content-type', 'image/jpeg')
mime_extensions = guess_all_extensions(mimetype)
@@ -241,12 +236,13 @@ def _parse_cover_data(root, imgid, mi, ctx):
fmt = identify(cdata)[0]
mi.cover_data = (fmt, cdata)
else:
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid))
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" %
(mimetype, imgid))
def _parse_tags(root, mi, ctx):
# pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent!
# Those are fallbacks: <src-title-info>
# pick up genre but only from 1 secrion <title-info>; otherwise it is not
# consistent! Those are fallbacks: <src-title-info>
for genre_sec in ['title-info', 'src-title-info']:
# -- i18n Translations-- ?
tags = ctx.XPath('//fb:%s/fb:genre/text()' % genre_sec)(root)
@@ -267,16 +263,20 @@ def _parse_series(root, mi, ctx):
mi.series = elms_sequence[0].get('name', None)
if mi.series:
try:
mi.series_index = float('.'.join(elms_sequence[0].get('number', None).split()[:2]))
i = float('.'.join(elms_sequence[0].get('number',
None).split()[:2]))
mi.series_index = i
except Exception:
pass
def _parse_isbn(root, mi, ctx):
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
# some people try to put several isbn in this field, but it is not
# allowed. try to stick to the 1-st one in this case
isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
if isbn:
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
# some people try to put several isbn in this field, but it is not
# allowed. try to stick to the 1-st one in this case
if ',' in isbn:
isbn = isbn[:isbn.index(',')]
if check_isbn(isbn):
@@ -284,9 +284,11 @@ def _parse_isbn(root, mi, ctx):
def _parse_comments(root, mi, ctx):
# pick up annotation but only from 1 section <title-info>; fallback: <src-title-info>
# pick up annotation but only from 1 section <title-info>;
# fallback: <src-title-info>
for annotation_sec in ['title-info', 'src-title-info']:
elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % annotation_sec)(root)
elms_annotation = ctx.XPath('//fb:%s/fb:annotation' %
annotation_sec)(root)
if elms_annotation:
mi.comments = tostring(elms_annotation[0])
# TODO: tags i18n, xslt?
@@ -294,7 +296,8 @@ def _parse_comments(root, mi, ctx):
def _parse_publisher(root, mi, ctx):
publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/text())')(root)
publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/'
'text())')(root)
if publisher:
mi.publisher = publisher
@@ -315,7 +318,7 @@ def _parse_language(root, mi, ctx):
def _get_fbroot(raw):
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
root = safe_xml_fromstring(raw)
root = etree.fromstring(raw)
return ensure_namespace(root)
@@ -348,10 +351,12 @@ def _set_authors(title_info, mi, ctx):
ctx.create_tag(atag, 'first-name').text = author_parts[0]
author_parts = author_parts[1:]
if len(author_parts) > 1:
ctx.create_tag(atag, 'middle-name', at_start=False).text = author_parts[0]
ctx.create_tag(atag, 'middle-name',
at_start=False).text = author_parts[0]
author_parts = author_parts[1:]
if author_parts:
ctx.create_tag(atag, 'last-name', at_start=False).text = ' '.join(author_parts)
a = ' '.join(author_parts)
ctx.create_tag(atag, 'last-name', at_start=False).text = a
def _set_tags(title_info, mi, ctx):
@@ -368,12 +373,12 @@ def _set_series(title_info, mi, ctx):
seq = ctx.get_or_create(title_info, 'sequence')
seq.set('name', mi.series)
try:
seq.set('number', '%g'%mi.series_index)
except:
seq.set('number', '%g' % mi.series_index)
except Exception:
seq.set('number', '1')
def _rnd_name(size=8, chars=ascii_letters + digits):
def _rnd_name(size=8, chars=string.ascii_letters + string.digits):
return ''.join(random.choice(chars) for x in range(size))
@@ -396,7 +401,9 @@ def _set_cover(title_info, mi, ctx):
cim_filename = _rnd_pic_file_name('cover')
cim_tag.attrib[XLINK('href')] = '#' + cim_filename
fb2_root = cim_tag.getroottree().getroot()
cim_binary = ctx.get_or_create(fb2_root, 'binary', attribs={'id': cim_filename}, at_start=False)
cim_binary = ctx.get_or_create(fb2_root, 'binary',
attribs={'id': cim_filename},
at_start=False)
cim_binary.attrib['content-type'] = 'image/jpeg'
cim_binary.text = _encode_into_jpeg(mi.cover_data[1])
@@ -425,7 +432,8 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
# single quotes in xml declaration. Sigh. See
# https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
raw = b'<?xml version="1.0" encoding="UTF-8"?>\n'
raw += etree.tostring(root, method='xml', encoding='utf-8', xml_declaration=False)
raw += etree.tostring(root, method='xml', encoding='utf-8',
xml_declaration=False)
stream.seek(0)
stream.truncate()
@@ -449,6 +457,7 @@ def ensure_namespace(doc):
if bare_tags:
import re
raw = etree.tostring(doc, encoding='unicode')
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
doc = safe_xml_fromstring(raw)
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>',
raw)
doc = etree.fromstring(raw)
return doc

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,7 @@ from functools import wraps
from lxml import etree
from ebook_converter import constants as const
from ebook_converter import prints
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors
from ebook_converter.ebooks.metadata.book.base import Metadata
@@ -15,7 +16,6 @@ from ebook_converter.ebooks.metadata.utils import (
create_manifest_item, ensure_unique, normalize_languages, parse_opf,
pretty_print_opf
)
from ebook_converter.ebooks.oeb.base import DC, OPF, OPF2_NSMAP
from ebook_converter.utils.config import from_json, to_json
from ebook_converter.utils.date import (
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow,
@@ -46,7 +46,7 @@ def XPath(x):
try:
return _xpath_cache[x]
except KeyError:
_xpath_cache[x] = ans = etree.XPath(x, namespaces=OPF2_NSMAP)
_xpath_cache[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
return ans
@@ -213,7 +213,7 @@ def set_refines(elem, existing_refines, *new_refines):
remove_refines(elem, existing_refines)
for ref in reversed(new_refines):
prop, val, scheme = ref
r = elem.makeelement(OPF('meta'))
r = elem.makeelement(const.OPF_META)
r.set('refines', '#' + eid), r.set('property', prop)
r.text = val.strip()
if scheme:
@@ -249,7 +249,7 @@ def parse_identifier(ident, val, refines):
# Try the OPF 2 style opf:scheme attribute, which will be present, for
# example, in EPUB 3 files that have had their metadata set by an
# application that only understands EPUB 2.
scheme = ident.get(OPF('scheme'))
scheme = ident.get(const.OPF_SCHEME)
if scheme and not lval.startswith('urn:'):
return finalize(scheme, val)
@@ -294,7 +294,7 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=
continue
metadata = XPath('./opf:metadata')(root)[0]
for scheme, val in new_identifiers.items():
ident = metadata.makeelement(DC('identifier'))
ident = metadata.makeelement(const.DC_IDENT)
ident.text = '%s:%s' % (scheme, val)
if package_identifier is None:
metadata.append(ident)
@@ -312,11 +312,11 @@ def identifier_writer(name):
if is_package_id:
package_identifier = ident
val = (ident.text or '').strip()
if (val.startswith(name + ':') or ident.get(OPF('scheme')) == name) and not is_package_id:
if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id:
remove_element(ident, refines)
metadata = XPath('./opf:metadata')(root)[0]
if ival:
ident = metadata.makeelement(DC('identifier'))
ident = metadata.makeelement(const.DC_IDENT)
ident.text = '%s:%s' % (name, ival)
if package_identifier is None:
metadata.append(ident)
@@ -376,7 +376,7 @@ def set_title(root, prefixes, refines, title, title_sort=None):
main_title = find_main_title(root, refines, remove_blanks=True)
if main_title is None:
m = XPath('./opf:metadata')(root)[0]
main_title = m.makeelement(DC('title'))
main_title = m.makeelement(const.DC_TITLE)
m.insert(0, main_title)
main_title.text = title or None
ts = [refdef('file-as', title_sort)] if title_sort else ()
@@ -411,7 +411,7 @@ def set_languages(root, prefixes, refines, languages):
languages = ['und']
metadata = XPath('./opf:metadata')(root)[0]
for lang in uniq(languages):
l = metadata.makeelement(DC('language'))
l = metadata.makeelement(const.DC_LANG)
l.text = lang
metadata.append(l)
# }}}
@@ -440,7 +440,7 @@ def read_authors(root, prefixes, refines):
if file_as:
aus = file_as[0][-1]
else:
aus = item.get(OPF('file-as')) or None
aus = item.get(const.OPF_FILE_AS) or None
return Author(normalize_whitespace(val), normalize_whitespace(aus))
for item in XPath('./opf:metadata/dc:creator')(root):
@@ -448,7 +448,7 @@ def read_authors(root, prefixes, refines):
if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
role = props.get('role')
opf_role = item.get(OPF('role'))
opf_role = item.get(const.OPF_ROLE)
if role:
if is_relators_role(props, 'aut'):
roled_authors.append(author(item, props, val))
@@ -465,22 +465,22 @@ def set_authors(root, prefixes, refines, authors):
ensure_prefix(root, prefixes, 'marc')
for item in XPath('./opf:metadata/dc:creator')(root):
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
opf_role = item.get(OPF('role'))
opf_role = item.get(const.OPF_ROLE)
if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')):
continue
remove_element(item, refines)
metadata = XPath('./opf:metadata')(root)[0]
for author in authors:
if author.name:
a = metadata.makeelement(DC('creator'))
a = metadata.makeelement(const.DC_CREATOR)
aid = ensure_id(a)
a.text = author.name
metadata.append(a)
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m.text = 'aut'
metadata.append(m)
if author.sort:
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'file-as'})
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'file-as'})
m.text = author.sort
metadata.append(m)
@@ -492,7 +492,7 @@ def read_book_producers(root, prefixes, refines):
if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
role = props.get('role')
opf_role = item.get(OPF('role'))
opf_role = item.get(const.OPF_ROLE)
if role:
if is_relators_role(props, 'bkp'):
ans.append(normalize_whitespace(val))
@@ -504,18 +504,18 @@ def read_book_producers(root, prefixes, refines):
def set_book_producers(root, prefixes, refines, producers):
for item in XPath('./opf:metadata/dc:contributor')(root):
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
opf_role = item.get(OPF('role'))
opf_role = item.get(const.OPF_ROLE)
if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')):
continue
remove_element(item, refines)
metadata = XPath('./opf:metadata')(root)[0]
for bkp in producers:
if bkp:
a = metadata.makeelement(DC('contributor'))
a = metadata.makeelement(const.DC_CONTRIBUTOR)
aid = ensure_id(a)
a.text = bkp
metadata.append(a)
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m.text = 'bkp'
metadata.append(m)
# }}}
@@ -552,7 +552,7 @@ def set_pubdate(root, prefixes, refines, val):
if not is_date_undefined(val):
val = isoformat(val)
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(DC('date'))
d = m.makeelement(const.DC_DATE)
d.text = val
m.append(d)
@@ -584,7 +584,7 @@ def create_timestamp(root, prefixes, m, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
ensure_prefix(root, prefixes, 'dcterms')
val = w3cdtf(val)
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
d.text = val
m.append(d)
@@ -625,7 +625,7 @@ def set_last_modified(root, prefixes, refines, val=None):
else:
ensure_prefix(root, prefixes, 'dcterms')
m = XPath('./opf:metadata')(root)[0]
meta = m.makeelement(OPF('meta'), attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
m.append(meta)
meta.text = val
# }}}
@@ -648,7 +648,7 @@ def set_comments(root, prefixes, refines, val):
if val:
val = val.strip()
if val:
c = m.makeelement(DC('description'))
c = m.makeelement(const.DC_DESC)
c.text = val
m.append(c)
# }}}
@@ -670,7 +670,7 @@ def set_publisher(root, prefixes, refines, val):
if val:
val = val.strip()
if val:
c = m.makeelement(DC('publisher'))
c = m.makeelement(const.DC_PUBLISHER('publisher'))
c.text = normalize_whitespace(val)
m.append(c)
# }}}
@@ -693,7 +693,7 @@ def set_tags(root, prefixes, refines, val):
if val:
val = uniq(list(filter(None, val)))
for x in val:
c = m.makeelement(DC('subject'))
c = m.makeelement(const.DC_SUBJ)
c.text = normalize_whitespace(x)
if c.text:
m.append(c)
@@ -725,7 +725,7 @@ def read_rating(root, prefixes, refines):
def create_rating(root, prefixes, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:rating'})
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:rating'})
d.text = val
m.append(d)
@@ -772,7 +772,7 @@ def read_series(root, prefixes, refines):
def create_series(root, refines, series, series_index):
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(OPF('meta'), attrib={'property':'belongs-to-collection'})
d = m.makeelement(const.OPF_META, attrib={'property':'belongs-to-collection'})
d.text = series
m.append(d)
set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index))
@@ -836,7 +836,7 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
if val:
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:%s' % name})
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name})
d.text = serialize(val)
m.append(d)
return writer

View File

@@ -10,17 +10,13 @@ from lxml.builder import ElementMaker
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.polyglot.urllib import unquote
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS}
NSMAP = {None: NCX_NS, 'calibre': CALIBRE_NS}
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
@@ -30,8 +26,10 @@ def parse_html_toc(data):
from ebook_converter.utils.cleantext import clean_xml_chars
from lxml import etree
if isinstance(data, bytes):
data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
data = xml_to_unicode(data, strip_encoding_pats=True,
resolve_entities=True)[0]
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
sanitize_names=True)
for a in root.xpath('//*[@href and local-name()="a"]'):
purl = urllib.parse.urlparse(unquote(a.get('href')))
href, fragment = purl[2], purl[5]
@@ -48,8 +46,8 @@ def parse_html_toc(data):
class TOC(list):
def __init__(self, href=None, fragment=None, text=None, parent=None,
play_order=0, base_path=os.getcwd(), type='unknown', author=None,
description=None, toc_thumbnail=None):
play_order=0, base_path=os.getcwd(), type='unknown',
author=None, description=None, toc_thumbnail=None):
self.href = href
self.fragment = fragment
if not self.fragment:
@@ -64,7 +62,7 @@ class TOC(list):
self.toc_thumbnail = toc_thumbnail
def __str__(self):
lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
lines = ['TOC: %s#%s %s' % (self.href, self.fragment, self.text)]
for child in self:
c = str(child).splitlines()
for l in c:
@@ -91,12 +89,14 @@ class TOC(list):
entry.parent = None
def add_item(self, href, fragment, text, play_order=None, type='unknown',
author=None, description=None, toc_thumbnail=None):
author=None, description=None, toc_thumbnail=None):
if play_order is None:
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
play_order = (self[-1].play_order
if len(self) else self.play_order) + 1
self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
base_path=self.base_path, play_order=play_order,
type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
type=type, author=author, description=description,
toc_thumbnail=toc_thumbnail))
return self[-1]
def top_level_items(self):
@@ -121,7 +121,10 @@ class TOC(list):
@property
def abspath(self):
'Return the file this toc entry points to as a absolute path to a file on the system.'
"""
Return the file this toc entry points to as a absolute path to a file
on the system.
"""
if self.href is None:
return None
@@ -136,8 +139,9 @@ class TOC(list):
toc = toc['toc']
if toc is None:
try:
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
except:
toc = (opfreader.soup.find('guide')
.find('reference', attrs={'type': 'toc'})['href'])
except Exception:
for item in opfreader.manifest:
if 'toc' in item.href().lower():
toc = item.href()
@@ -151,13 +155,15 @@ class TOC(list):
toc = os.path.join(self.base_path, toc)
try:
if not os.path.exists(toc):
bn = os.path.basename(toc)
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
bn = os.path.basename(toc)
# Bug in BAEN OPF files
bn = bn.replace('_top.htm', '_toc.htm')
toc = os.path.join(os.path.dirname(toc), bn)
self.read_html_toc(toc)
except:
print('WARNING: Could not read Table of Contents. Continuing anyway.')
except Exception:
print('WARNING: Could not read Table of Contents. '
'Continuing anyway.')
else:
path = opfreader.manifest.item(toc.lower())
path = getattr(path, 'path', path)
@@ -177,9 +183,9 @@ class TOC(list):
self.base_path = os.path.dirname(toc)
if root is None:
with open(toc, 'rb') as f:
raw = xml_to_unicode(f.read(), assume_utf8=True,
strip_encoding_pats=True)[0]
root = safe_xml_fromstring(raw)
raw = xml_to_unicode(f.read(), assume_utf8=True,
strip_encoding_pats=True)[0]
root = etree.fromstring(raw)
xpn = {'re': 'http://exslt.org/regular-expressions'}
XPath = functools.partial(etree.XPath, namespaces=xpn)
@@ -197,7 +203,7 @@ class TOC(list):
def process_navpoint(np, dest):
try:
play_order = int(get_attr(np, 1))
except:
except Exception:
play_order = 1
href = fragment = text = None
nd = dest
@@ -207,7 +213,7 @@ class TOC(list):
text = ''
for txt in txt_path(nl):
text += etree.tostring(txt, method='text',
encoding='unicode', with_tail=False)
encoding='unicode', with_tail=False)
content = content_path(np)
if content and text:
content = content[0]
@@ -242,17 +248,14 @@ class TOC(list):
self.add_item(href, fragment, txt)
def render(self, stream, uid):
root = E.ncx(
E.head(
E.meta(name='dtb:uid', content=str(uid)),
E.meta(name='dtb:depth', content=str(self.depth())),
E.meta(name='dtb:generator', content='%s (%s)'%(__appname__,
__version__)),
E.meta(name='dtb:totalPageCount', content='0'),
E.meta(name='dtb:maxPageNumber', content='0'),
),
E.docTitle(E.text('Table of Contents')),
)
root = E.ncx(E.head(E.meta(name='dtb:uid', content=str(uid)),
E.meta(name='dtb:depth',
content=str(self.depth())),
E.meta(name='dtb:generator', content='%s (%s)' %
(__appname__, __version__)),
E.meta(name='dtb:totalPageCount', content='0'),
E.meta(name='dtb:maxPageNumber', content='0')),
E.docTitle(E.text('Table of Contents')))
navmap = E.navMap()
root.append(navmap)
root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
@@ -263,12 +266,12 @@ class TOC(list):
if not text:
text = ''
c[1] += 1
item_id = 'num_%d'%c[1]
item_id = 'num_%d' % c[1]
text = clean_xml_chars(text)
elem = E.navPoint(
E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
E.content(src=str(np.href)+(('#' + str(np.fragment))
if np.fragment else '')),
if np.fragment else '')),
id=item_id,
playOrder=str(np.play_order)
)
@@ -282,7 +285,8 @@ class TOC(list):
try:
elem.append(C.meta(desc, name='description'))
except ValueError:
elem.append(C.meta(clean_xml_chars(desc), name='description'))
elem.append(C.meta(clean_xml_chars(desc),
name='description'))
idx = getattr(np, 'toc_thumbnail', None)
if idx:
elem.append(C.meta(idx, name='toc_thumbnail'))
@@ -293,5 +297,5 @@ class TOC(list):
for np in self:
navpoint(navmap, np)
raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=True)
pretty_print=True)
stream.write(raw)

View File

@@ -1,12 +1,13 @@
from collections import namedtuple
from lxml import etree
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.oeb.base import OPF
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.spell import parse_lang_code
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.utils.xml_parse import safe_xml_fromstring
OPFVersion = namedtuple('OPFVersion', 'major minor patch')
@@ -35,23 +36,26 @@ def parse_opf(stream_or_path):
raw = stream.read()
if not raw:
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True, assume_utf8=True)
raw = raw[raw.find('<'):]
root = safe_xml_fromstring(clean_xml_chars(raw))
root = etree.fromstring(clean_xml_chars(raw))
if root is None:
raise ValueError('Not an OPF file')
return root
def normalize_languages(opf_languages, mi_languages):
' Preserve original country codes and use 2-letter lang codes where possible '
"""
Preserve original country codes and use 2-letter lang codes where possible
"""
def parse(x):
try:
return parse_lang_code(x)
except ValueError:
return None
opf_languages = filter(None, map(parse, opf_languages))
cc_map = {c.langcode:c.countrycode for c in opf_languages}
cc_map = {c.langcode: c.countrycode for c in opf_languages}
mi_languages = filter(None, map(parse, mi_languages))
def norm(x):
@@ -83,9 +87,9 @@ def create_manifest_item(root, href_template, id_template, media_type=None):
all_hrefs = frozenset(root.xpath('//*/@href'))
href = ensure_unique(href_template, all_hrefs)
item_id = ensure_unique(id_template, all_ids)
manifest = root.find(OPF('manifest'))
manifest = root.find(base.tag('opf', 'manifest'))
if manifest is not None:
i = manifest.makeelement(OPF('item'))
i = manifest.makeelement(base.tag('opf', 'item'))
i.set('href', href), i.set('id', item_id)
i.set('media-type', media_type or guess_type(href_template))
manifest.append(i)
@@ -93,6 +97,7 @@ def create_manifest_item(root, href_template, id_template, media_type=None):
def pretty_print_opf(root):
from ebook_converter.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree
from ebook_converter.ebooks.oeb.polish.pretty import pretty_opf, \
pretty_xml_tree
pretty_opf(root)
pretty_xml_tree(root)

View File

@@ -1,44 +1,43 @@
import re, sys, copy, json
from itertools import repeat
from collections import defaultdict
import collections
import copy
import itertools
import json
import re
import sys
from lxml import etree
from lxml.builder import ElementMaker
from ebook_converter import prints
from ebook_converter.ebooks.metadata import check_isbn, check_doi
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.ebooks.metadata.opf2 import dump_dict
from ebook_converter.utils.date import parse_date, isoformat, now
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
from ebook_converter.utils.localization import canonicalize_lang, \
lang_as_iso639_1
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)'
r'[\'"][^<>]*>', re.IGNORECASE)
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE)
NS_MAP = {
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'dc': 'http://purl.org/dc/elements/1.1/',
'pdf': 'http://ns.adobe.com/pdf/1.3/',
'pdfx': 'http://ns.adobe.com/pdfx/1.3/',
'xmp': 'http://ns.adobe.com/xap/1.0/',
'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/',
'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/',
'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/',
'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/',
'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/',
'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/',
'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
'crossmark': 'http://crossref.org/crossmark/1.0/',
'xml': 'http://www.w3.org/XML/1998/namespace',
'x': 'adobe:ns:meta/',
'calibre': 'http://calibre-ebook.com/xmp-namespace',
'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index',
'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns',
}
NS_MAP = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'dc': 'http://purl.org/dc/elements/1.1/',
'pdf': 'http://ns.adobe.com/pdf/1.3/',
'pdfx': 'http://ns.adobe.com/pdfx/1.3/',
'xmp': 'http://ns.adobe.com/xap/1.0/',
'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/',
'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/',
'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/',
'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/',
'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/',
'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/',
'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
'crossmark': 'http://crossref.org/crossmark/1.0/',
'xml': 'http://www.w3.org/XML/1998/namespace',
'x': 'adobe:ns:meta/',
'calibre': 'http://calibre-ebook.com/xmp-namespace',
'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index',
'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns'}
KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'}
@@ -63,7 +62,7 @@ def parse_xmp_packet(raw_bytes):
pat = r'''<?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]'''
encodings = ('8', '16-le', '16-be', '32-le', '32-be')
header = raw_bytes[:1024]
emap = {'\ufeff'.encode('utf-'+x):'utf-'+x for x in encodings}
emap = {'\ufeff'.encode('utf-'+x): 'utf-'+x for x in encodings}
emap[b''] = 'utf-8'
for q in encodings:
m = re.search(pat.encode('utf-'+q), header)
@@ -71,15 +70,19 @@ def parse_xmp_packet(raw_bytes):
enc = emap.get(m.group(1), enc)
break
if enc is None:
return safe_xml_fromstring(raw_bytes)
raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string
return safe_xml_fromstring(raw)
return etree.fromstring(raw_bytes)
# lxml barfs if encoding declaration present in unicode string
raw = _xml_declaration.sub('', raw_bytes.decode(enc))
return etree.fromstring(raw)
def serialize_xmp_packet(root, encoding='utf-8'):
root.tail = '\n' + '\n'.join(repeat(' '*100, 30)) # Adobe spec recommends inserting padding at the end of the packet
raw_bytes = etree.tostring(root, encoding=encoding, pretty_print=True, with_tail=True, method='xml')
return b'<?xpacket begin="%s" id="W5M0MpCehiHzreSzNTczkc9d"?>\n%s\n<?xpacket end="w"?>' % ('\ufeff'.encode(encoding), raw_bytes)
# Adobe spec recommends inserting padding at the end of the packet
root.tail = '\n' + '\n'.join(itertools.repeat(' '*100, 30))
raw_bytes = etree.tostring(root, encoding=encoding, pretty_print=True,
with_tail=True, method='xml')
return ('<?xpacket begin="%s" id="W5M0MpCehiHzreSzNTczkc9d"?>\n%s\n'
'<?xpacket end="w"?>' % ('\ufeff'.encode(encoding), raw_bytes))
def read_simple_property(elem):
@@ -106,14 +109,15 @@ def read_sequence(parent):
yield read_simple_property(item)
def uniq(vals, kmap=lambda x:x):
def uniq(vals, kmap=lambda x: x):
''' Remove all duplicates from vals, while preserving order. kmap must be a
callable that returns a hashable value for every item in vals '''
vals = vals or ()
lvals = (kmap(x) for x in vals)
seen = set()
seen_add = seen.add
return tuple(x for x, k in zip(vals, lvals) if k not in seen and not seen_add(k))
return tuple(x for x, k in zip(vals, lvals) if k not in seen
and not seen_add(k))
def multiple_sequences(expr, root):
@@ -170,7 +174,8 @@ def read_series(root):
def read_user_metadata(mi, root):
from ebook_converter.utils.config import from_json
from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple
from ebook_converter.ebooks.metadata.book.json_codec import \
decode_is_multiple
fields = set()
for item in XPath('//calibre:custom_metadata')(root):
for li in XPath('./rdf:Bag/rdf:li')(item):
@@ -186,7 +191,7 @@ def read_user_metadata(mi, root):
decode_is_multiple(fm)
mi.set_user_metadata(name, fm)
fields.add(name)
except:
except Exception:
prints('Failed to read user metadata:', name)
import traceback
traceback.print_exc()
@@ -194,13 +199,17 @@ def read_user_metadata(mi, root):
def read_xmp_identifers(parent):
''' For example:
<rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmp:idq><rdf:value>http://foo.com</rdf:value></rdf:li>
<rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmp:idq>
<rdf:value>http://foo.com</rdf:value></rdf:li>
or the longer form:
<rdf:li><rdf:Description><xmpidq:Scheme>URL</xmp:idq><rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li>
<rdf:li><rdf:Description><xmpidq:Scheme>URL</xmp:idq>
<rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li>
'''
for li in XPath('./rdf:Bag/rdf:li')(parent):
is_resource = li.attrib.get(expand('rdf:parseType'), None) == 'Resource'
is_resource = is_resource or (len(li) == 1 and li[0].tag == expand('rdf:Description'))
is_resource = li.attrib.get(expand('rdf:parseType'),
None) == 'Resource'
is_resource = is_resource or (len(li) == 1 and
li[0].tag == expand('rdf:Description'))
if not is_resource:
yield None, li.text or ''
value = XPath('descendant::rdf:value')(li)
@@ -241,12 +250,15 @@ def metadata_from_xmp_packet(raw_bytes):
if title.startswith(r'\376\377'):
# corrupted XMP packet generated by Nitro PDF. See
# https://bugs.launchpad.net/calibre/+bug/1541981
raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
raise ValueError('Corrupted XMP metadata packet detected, '
'probably generated by Nitro PDF')
mi.title = title
authors = multiple_sequences('//dc:creator', root)
if authors:
mi.authors = authors
tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
tags = multiple_sequences('//dc:subject',
root) or multiple_sequences('//pdf:Keywords',
root)
if tags:
mi.tags = tags
comments = first_alt('//dc:description', root)
@@ -256,8 +268,10 @@ def metadata_from_xmp_packet(raw_bytes):
if publishers:
mi.publisher = publishers[0]
try:
pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
except:
pubdate = (parse_date(first_sequence('//dc:date', root) or
first_simple('//xmp:CreateDate', root),
assume_utc=False))
except Exception:
pass
else:
mi.pubdate = pubdate
@@ -291,7 +305,7 @@ def metadata_from_xmp_packet(raw_bytes):
if val:
try:
setattr(mi, x, json.loads(val))
except:
except Exception:
pass
languages = multiple_sequences('//dc:language', root)
@@ -319,7 +333,7 @@ def metadata_from_xmp_packet(raw_bytes):
identifiers[scheme] = val
# Check Dublin Core for recognizable identifier types
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.items():
for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
if scheme not in identifiers:
val = check_func(first_simple('//dc:identifier', root))
if val:
@@ -359,17 +373,21 @@ def consolidate_metadata(info_mi, info):
else:
prefer_info = info_date > xmp_mi.metadata_date
if prefer_info:
info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
info_mi.title = info_title
info_mi.authors = info_authors
info_mi.tags = info_tags
else:
# We'll use the xmp tags/authors but fallback to the info ones if the
# xmp does not have tags/authors. smart_update() should have taken care of
# the rest
info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags
# xmp does not have tags/authors. smart_update() should have taken care
# of the rest
info_mi.authors = (info_authors if xmp_mi.is_null('authors')
else xmp_mi.authors)
info_mi.tags = xmp_mi.tags or info_tags
return info_mi
def nsmap(*args):
return {x:NS_MAP[x] for x in args}
return {x: NS_MAP[x] for x in args}
def create_simple_property(parent, tag, value):
@@ -435,7 +453,8 @@ def create_series(calibre, series, series_index):
def create_user_metadata(calibre, all_user_metadata):
from ebook_converter.utils.config import to_json
from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
from ebook_converter.ebooks.metadata.book.json_codec import \
object_to_unicode, encode_is_multiple
s = calibre.makeelement(expand('calibre:custom_metadata'))
calibre.append(s)
@@ -447,7 +466,7 @@ def create_user_metadata(calibre, all_user_metadata):
encode_is_multiple(fm)
fm = object_to_unicode(fm)
fm = json.dumps(fm, default=to_json, ensure_ascii=False)
except:
except Exception:
prints('Failed to write user metadata:', name)
import traceback
traceback.print_exc()
@@ -471,7 +490,8 @@ def metadata_to_xmp_packet(mi):
dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc'))
dc.set(expand('rdf:about'), '')
rdf.append(dc)
for prop, tag in {'title':'dc:title', 'comments':'dc:description'}.items():
for prop, tag in {'title': 'dc:title',
'comments': 'dc:description'}.items():
val = mi.get(prop) or ''
create_alt_property(dc, tag, val)
for prop, (tag, ordered) in {'authors': ('dc:creator', True),
@@ -482,18 +502,23 @@ def metadata_to_xmp_packet(mi):
val = [val]
create_sequence_property(dc, tag, val, ordered)
if not mi.is_null('pubdate'):
create_sequence_property(dc, 'dc:date', [isoformat(mi.pubdate, as_utc=False)]) # Adobe spec recommends local time
# Adobe spec recommends local time
create_sequence_property(dc, 'dc:date',
[isoformat(mi.pubdate, as_utc=False)])
if not mi.is_null('languages'):
langs = list(filter(None, map(lambda x:lang_as_iso639_1(x) or canonicalize_lang(x), mi.languages)))
langs = list(filter(None, map(lambda x: lang_as_iso639_1(x) or
canonicalize_lang(x), mi.languages)))
if langs:
create_sequence_property(dc, 'dc:language', langs, ordered=False)
xmp = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('xmp', 'xmpidq'))
xmp = rdf.makeelement(expand('rdf:Description'),
nsmap=nsmap('xmp', 'xmpidq'))
xmp.set(expand('rdf:about'), '')
rdf.append(xmp)
extra_ids = {}
for x in ('prism', 'pdfx'):
p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap(x))
p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'),
nsmap=nsmap(x))
p.set(expand('rdf:about'), '')
rdf.append(p)
@@ -503,7 +528,7 @@ def metadata_to_xmp_packet(mi):
for scheme, val in identifiers.items():
if scheme in {'isbn', 'doi'}:
for prefix, parent in extra_ids.items():
ie = parent.makeelement(expand('%s:%s'%(prefix, scheme)))
ie = parent.makeelement(expand('%s:%s' % (prefix, scheme)))
ie.text = val
parent.append(ie)
@@ -511,7 +536,8 @@ def metadata_to_xmp_packet(mi):
d.text = isoformat(now(), as_utc=False)
xmp.append(d)
calibre = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('calibre', 'calibreSI', 'calibreCC'))
calibre = rdf.makeelement(expand('rdf:Description'),
nsmap=nsmap('calibre', 'calibreSI', 'calibreCC'))
calibre.set(expand('rdf:about'), '')
rdf.append(calibre)
if not mi.is_null('rating'):
@@ -524,7 +550,8 @@ def metadata_to_xmp_packet(mi):
if not mi.is_null('series'):
create_series(calibre, mi.series, mi.series_index)
if not mi.is_null('timestamp'):
create_simple_property(calibre, 'calibre:timestamp', isoformat(mi.timestamp, as_utc=False))
create_simple_property(calibre, 'calibre:timestamp',
isoformat(mi.timestamp, as_utc=False))
for x in ('author_link_map', 'user_categories'):
val = getattr(mi, x, None)
if val:
@@ -550,10 +577,11 @@ def find_used_namespaces(elem):
def find_preferred_prefix(namespace, elems):
for elem in elems:
ans = {v:k for k, v in elem.nsmap.items()}.get(namespace, None)
ans = {v: k for k, v in elem.nsmap.items()}.get(namespace, None)
if ans is not None:
return ans
return find_preferred_prefix(namespace, elem.iterchildren(etree.Element))
return find_preferred_prefix(namespace,
elem.iterchildren(etree.Element))
def find_nsmap(elems):
@@ -562,7 +590,7 @@ def find_nsmap(elems):
used_namespaces |= find_used_namespaces(elem)
ans = {}
used_namespaces -= {NS_MAP['xml'], NS_MAP['x'], None, NS_MAP['rdf']}
rmap = {v:k for k, v in NS_MAP.items()}
rmap = {v: k for k, v in NS_MAP.items()}
i = 0
for ns in used_namespaces:
if ns in rmap:
@@ -578,7 +606,10 @@ def find_nsmap(elems):
def clone_into(parent, elem):
' Clone the element, assuming that all namespace declarations are present in parent '
"""
Clone the element, assuming that all namespace declarations are present
in parent
"""
clone = parent.makeelement(elem.tag)
parent.append(clone)
if elem.text and not elem.text.isspace():
@@ -591,28 +622,38 @@ def clone_into(parent, elem):
def merge_xmp_packet(old, new):
''' Merge metadata present in the old packet that is not present in the new
"""
Merge metadata present in the old packet that is not present in the new
one into the new one. Assumes the new packet was generated by
metadata_to_xmp_packet() '''
metadata_to_xmp_packet()
"""
old, new = parse_xmp_packet(old), parse_xmp_packet(new)
# As per the adobe spec all metadata items have to be present inside top-level rdf:Description containers
# As per the adobe spec all metadata items have to be present inside
# top-level rdf:Description containers
item_xpath = XPath('//rdf:RDF/rdf:Description/*')
# First remove all data fields that metadata_to_xmp_packet() knowns about,
# since either they will have been set or if not present, imply they have
# been cleared
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
defined_tags |= {expand('dc:' + x) for x in ('identifier', 'title', 'creator', 'date', 'description', 'language', 'publisher', 'subject')}
defined_tags |= {expand('xmp:' + x) for x in ('MetadataDate', 'Identifier')}
defined_tags = {expand(prefix + ':' + scheme)
for prefix in ('prism', 'pdfx')
for scheme in KNOWN_ID_SCHEMES}
defined_tags |= {expand('dc:' + x)
for x in ('identifier', 'title', 'creator', 'date',
'description', 'language', 'publisher',
'subject')}
defined_tags |= {expand('xmp:' + x)
for x in ('MetadataDate', 'Identifier')}
# For redundancy also remove all fields explicitly set in the new packet
defined_tags |= {x.tag for x in item_xpath(new)}
calibrens = '{%s}' % NS_MAP['calibre']
for elem in item_xpath(old):
if elem.tag in defined_tags or (elem.tag and elem.tag.startswith(calibrens)):
if elem.tag in defined_tags or (elem.tag and
elem.tag.startswith(calibrens)):
elem.getparent().remove(elem)
# Group all items into groups based on their namespaces
groups = defaultdict(list)
groups = collections.defaultdict(list)
for item in item_xpath(new):
ns = item.nsmap[item.prefix]
groups[ns].append(item)
@@ -626,9 +667,14 @@ def merge_xmp_packet(old, new):
root = A.xmpmeta(R.RDF)
rdf = root[0]
for namespace in sorted(groups, key=lambda x:{NS_MAP['dc']:'a', NS_MAP['xmp']:'b', NS_MAP['calibre']:'c'}.get(x, 'z'+x)):
for namespace in sorted(groups,
key=lambda x: {NS_MAP['dc']: 'a',
NS_MAP['xmp']: 'b',
NS_MAP['calibre']: 'c'}.get(x,
'z'+x)):
items = groups[namespace]
desc = rdf.makeelement(expand('rdf:Description'), nsmap=find_nsmap(items))
desc = rdf.makeelement(expand('rdf:Description'),
nsmap=find_nsmap(items))
desc.set(expand('rdf:about'), '')
rdf.append(desc)
for item in items: