mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-19 12:43:35 +02:00
Use the real constants module.
This is part of the ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports in some modules, so that each module's namespace is no longer polluted with symbols from other modules, which were often themselves imported from yet other modules. Yuck.
This commit is contained in:
@@ -1,38 +1,32 @@
|
||||
"""
|
||||
Read meta information from fb2 files
|
||||
"""
|
||||
import os, random
|
||||
from functools import partial
|
||||
from string import ascii_letters, digits
|
||||
import functools
|
||||
import os
|
||||
import random
|
||||
import string
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter.utils.date import parse_only_date
|
||||
from ebook_converter.utils.img import save_cover_data_to
|
||||
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
||||
from ebook_converter.utils.imghdr import identify
|
||||
from ebook_converter import guess_type, guess_all_extensions, prints, force_unicode
|
||||
from ebook_converter import guess_type, guess_all_extensions, prints, \
|
||||
force_unicode
|
||||
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.polyglot.binary import as_base64_unicode
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = ('2011, Roman Mukhin <ramses_ru at hotmail.com>, '
|
||||
'2008, Anatoly Shipitsin <norguhtar at gmail.com>')
|
||||
NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
|
||||
'fb21': 'http://www.gribuser.ru/xml/fictionbook/2.1',
|
||||
'xlink': 'http://www.w3.org/1999/xlink'}
|
||||
|
||||
|
||||
NAMESPACES = {
|
||||
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
|
||||
'fb21' : 'http://www.gribuser.ru/xml/fictionbook/2.1',
|
||||
'xlink' : 'http://www.w3.org/1999/xlink'
|
||||
}
|
||||
|
||||
tostring = partial(etree.tostring, method='text', encoding='unicode')
|
||||
tostring = functools.partial(etree.tostring, method='text', encoding='unicode')
|
||||
|
||||
|
||||
def XLINK(tag):
|
||||
return '{%s}%s'%(NAMESPACES['xlink'], tag)
|
||||
return '{%s}%s' % (NAMESPACES['xlink'], tag)
|
||||
|
||||
|
||||
class Context(object):
|
||||
@@ -52,7 +46,7 @@ class Context(object):
|
||||
return etree.XPath(*args, namespaces=self.namespaces)
|
||||
|
||||
def get_or_create(self, parent, tag, attribs={}, at_start=True):
|
||||
xpathstr='./fb:'+tag
|
||||
xpathstr = './fb:'+tag
|
||||
for n, v in attribs.items():
|
||||
xpathstr += '[@%s="%s"]' % (n, v)
|
||||
ans = self.XPath(xpathstr)(parent)
|
||||
@@ -73,7 +67,7 @@ class Context(object):
|
||||
|
||||
def clear_meta_tags(self, doc, tag):
|
||||
for parent in ('title-info', 'src-title-info', 'publish-info'):
|
||||
for x in self.XPath('//fb:%s/fb:%s'%(parent, tag))(doc):
|
||||
for x in self.XPath('//fb:%s/fb:%s' % (parent, tag))(doc):
|
||||
x.getparent().remove(x)
|
||||
|
||||
def text2fb2(self, parent, text):
|
||||
@@ -117,42 +111,41 @@ def get_metadata(stream):
|
||||
book_title = str(book_title)
|
||||
else:
|
||||
book_title = force_unicode(os.path.splitext(
|
||||
os.path.basename(getattr(stream, 'name',
|
||||
'Unknown')))[0])
|
||||
os.path.basename(getattr(stream, 'name', 'Unknown')))[0])
|
||||
mi = MetaInformation(book_title, authors)
|
||||
|
||||
try:
|
||||
_parse_cover(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_parse_comments(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_parse_tags(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_parse_series(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_parse_isbn(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_parse_publisher(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_parse_pubdate(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
_parse_language(root, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return mi
|
||||
@@ -160,11 +153,11 @@ def get_metadata(stream):
|
||||
|
||||
def _parse_authors(root, ctx):
|
||||
authors = []
|
||||
# pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent!
|
||||
# Those are fallbacks: <src-title-info>, <document-info>
|
||||
# pick up authors but only from 1 secrion <title-info>; otherwise it is
|
||||
# not consistent! Those are fallbacks: <src-title-info>, <document-info>
|
||||
author = None
|
||||
for author_sec in ['title-info', 'src-title-info', 'document-info']:
|
||||
for au in ctx.XPath('//fb:%s/fb:author'%author_sec)(root):
|
||||
for au in ctx.XPath('//fb:%s/fb:author' % author_sec)(root):
|
||||
author = _parse_author(au, ctx)
|
||||
if author:
|
||||
authors.append(author)
|
||||
@@ -207,24 +200,26 @@ def _parse_book_title(root, ctx):
|
||||
xp_ti = '//fb:title-info/fb:book-title/text()'
|
||||
xp_pi = '//fb:publish-info/fb:book-title/text()'
|
||||
xp_si = '//fb:src-title-info/fb:book-title/text()'
|
||||
book_title = ctx.XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root)
|
||||
book_title = ctx.XPath('normalize-space(%s|%s|%s)' %
|
||||
(xp_ti, xp_pi, xp_si))(root)
|
||||
|
||||
return book_title
|
||||
|
||||
|
||||
def _parse_cover(root, mi, ctx):
|
||||
# pickup from <title-info>, if not exists it fallbacks to <src-title-info>
|
||||
imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/@xlink:href), "#")')(root)
|
||||
imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/'
|
||||
'@xlink:href), "#")')(root)
|
||||
if imgid:
|
||||
try:
|
||||
_parse_cover_data(root, imgid, mi, ctx)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _parse_cover_data(root, imgid, mi, ctx):
|
||||
from ebook_converter.ebooks.fb2 import base64_decode
|
||||
elm_binary = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root)
|
||||
elm_binary = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
|
||||
if elm_binary:
|
||||
mimetype = elm_binary[0].get('content-type', 'image/jpeg')
|
||||
mime_extensions = guess_all_extensions(mimetype)
|
||||
@@ -241,12 +236,13 @@ def _parse_cover_data(root, imgid, mi, ctx):
|
||||
fmt = identify(cdata)[0]
|
||||
mi.cover_data = (fmt, cdata)
|
||||
else:
|
||||
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid))
|
||||
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" %
|
||||
(mimetype, imgid))
|
||||
|
||||
|
||||
def _parse_tags(root, mi, ctx):
|
||||
# pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent!
|
||||
# Those are fallbacks: <src-title-info>
|
||||
# pick up genre but only from 1 secrion <title-info>; otherwise it is not
|
||||
# consistent! Those are fallbacks: <src-title-info>
|
||||
for genre_sec in ['title-info', 'src-title-info']:
|
||||
# -- i18n Translations-- ?
|
||||
tags = ctx.XPath('//fb:%s/fb:genre/text()' % genre_sec)(root)
|
||||
@@ -267,16 +263,20 @@ def _parse_series(root, mi, ctx):
|
||||
mi.series = elms_sequence[0].get('name', None)
|
||||
if mi.series:
|
||||
try:
|
||||
mi.series_index = float('.'.join(elms_sequence[0].get('number', None).split()[:2]))
|
||||
i = float('.'.join(elms_sequence[0].get('number',
|
||||
None).split()[:2]))
|
||||
mi.series_index = i
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _parse_isbn(root, mi, ctx):
|
||||
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
|
||||
# some people try to put several isbn in this field, but it is not
|
||||
# allowed. try to stick to the 1-st one in this case
|
||||
isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
|
||||
if isbn:
|
||||
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
|
||||
# some people try to put several isbn in this field, but it is not
|
||||
# allowed. try to stick to the 1-st one in this case
|
||||
if ',' in isbn:
|
||||
isbn = isbn[:isbn.index(',')]
|
||||
if check_isbn(isbn):
|
||||
@@ -284,9 +284,11 @@ def _parse_isbn(root, mi, ctx):
|
||||
|
||||
|
||||
def _parse_comments(root, mi, ctx):
|
||||
# pick up annotation but only from 1 section <title-info>; fallback: <src-title-info>
|
||||
# pick up annotation but only from 1 section <title-info>;
|
||||
# fallback: <src-title-info>
|
||||
for annotation_sec in ['title-info', 'src-title-info']:
|
||||
elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % annotation_sec)(root)
|
||||
elms_annotation = ctx.XPath('//fb:%s/fb:annotation' %
|
||||
annotation_sec)(root)
|
||||
if elms_annotation:
|
||||
mi.comments = tostring(elms_annotation[0])
|
||||
# TODO: tags i18n, xslt?
|
||||
@@ -294,7 +296,8 @@ def _parse_comments(root, mi, ctx):
|
||||
|
||||
|
||||
def _parse_publisher(root, mi, ctx):
|
||||
publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/text())')(root)
|
||||
publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/'
|
||||
'text())')(root)
|
||||
if publisher:
|
||||
mi.publisher = publisher
|
||||
|
||||
@@ -315,7 +318,7 @@ def _parse_language(root, mi, ctx):
|
||||
|
||||
def _get_fbroot(raw):
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
|
||||
root = safe_xml_fromstring(raw)
|
||||
root = etree.fromstring(raw)
|
||||
return ensure_namespace(root)
|
||||
|
||||
|
||||
@@ -348,10 +351,12 @@ def _set_authors(title_info, mi, ctx):
|
||||
ctx.create_tag(atag, 'first-name').text = author_parts[0]
|
||||
author_parts = author_parts[1:]
|
||||
if len(author_parts) > 1:
|
||||
ctx.create_tag(atag, 'middle-name', at_start=False).text = author_parts[0]
|
||||
ctx.create_tag(atag, 'middle-name',
|
||||
at_start=False).text = author_parts[0]
|
||||
author_parts = author_parts[1:]
|
||||
if author_parts:
|
||||
ctx.create_tag(atag, 'last-name', at_start=False).text = ' '.join(author_parts)
|
||||
a = ' '.join(author_parts)
|
||||
ctx.create_tag(atag, 'last-name', at_start=False).text = a
|
||||
|
||||
|
||||
def _set_tags(title_info, mi, ctx):
|
||||
@@ -368,12 +373,12 @@ def _set_series(title_info, mi, ctx):
|
||||
seq = ctx.get_or_create(title_info, 'sequence')
|
||||
seq.set('name', mi.series)
|
||||
try:
|
||||
seq.set('number', '%g'%mi.series_index)
|
||||
except:
|
||||
seq.set('number', '%g' % mi.series_index)
|
||||
except Exception:
|
||||
seq.set('number', '1')
|
||||
|
||||
|
||||
def _rnd_name(size=8, chars=ascii_letters + digits):
|
||||
def _rnd_name(size=8, chars=string.ascii_letters + string.digits):
|
||||
return ''.join(random.choice(chars) for x in range(size))
|
||||
|
||||
|
||||
@@ -396,7 +401,9 @@ def _set_cover(title_info, mi, ctx):
|
||||
cim_filename = _rnd_pic_file_name('cover')
|
||||
cim_tag.attrib[XLINK('href')] = '#' + cim_filename
|
||||
fb2_root = cim_tag.getroottree().getroot()
|
||||
cim_binary = ctx.get_or_create(fb2_root, 'binary', attribs={'id': cim_filename}, at_start=False)
|
||||
cim_binary = ctx.get_or_create(fb2_root, 'binary',
|
||||
attribs={'id': cim_filename},
|
||||
at_start=False)
|
||||
cim_binary.attrib['content-type'] = 'image/jpeg'
|
||||
cim_binary.text = _encode_into_jpeg(mi.cover_data[1])
|
||||
|
||||
@@ -425,7 +432,8 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
|
||||
# single quotes in xml declaration. Sigh. See
|
||||
# https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
|
||||
raw = b'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||
raw += etree.tostring(root, method='xml', encoding='utf-8', xml_declaration=False)
|
||||
raw += etree.tostring(root, method='xml', encoding='utf-8',
|
||||
xml_declaration=False)
|
||||
|
||||
stream.seek(0)
|
||||
stream.truncate()
|
||||
@@ -449,6 +457,7 @@ def ensure_namespace(doc):
|
||||
if bare_tags:
|
||||
import re
|
||||
raw = etree.tostring(doc, encoding='unicode')
|
||||
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
|
||||
doc = safe_xml_fromstring(raw)
|
||||
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>',
|
||||
raw)
|
||||
doc = etree.fromstring(raw)
|
||||
return doc
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,6 +5,7 @@ from functools import wraps
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter import constants as const
|
||||
from ebook_converter import prints
|
||||
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors
|
||||
from ebook_converter.ebooks.metadata.book.base import Metadata
|
||||
@@ -15,7 +16,6 @@ from ebook_converter.ebooks.metadata.utils import (
|
||||
create_manifest_item, ensure_unique, normalize_languages, parse_opf,
|
||||
pretty_print_opf
|
||||
)
|
||||
from ebook_converter.ebooks.oeb.base import DC, OPF, OPF2_NSMAP
|
||||
from ebook_converter.utils.config import from_json, to_json
|
||||
from ebook_converter.utils.date import (
|
||||
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow,
|
||||
@@ -46,7 +46,7 @@ def XPath(x):
|
||||
try:
|
||||
return _xpath_cache[x]
|
||||
except KeyError:
|
||||
_xpath_cache[x] = ans = etree.XPath(x, namespaces=OPF2_NSMAP)
|
||||
_xpath_cache[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
|
||||
return ans
|
||||
|
||||
|
||||
@@ -213,7 +213,7 @@ def set_refines(elem, existing_refines, *new_refines):
|
||||
remove_refines(elem, existing_refines)
|
||||
for ref in reversed(new_refines):
|
||||
prop, val, scheme = ref
|
||||
r = elem.makeelement(OPF('meta'))
|
||||
r = elem.makeelement(const.OPF_META)
|
||||
r.set('refines', '#' + eid), r.set('property', prop)
|
||||
r.text = val.strip()
|
||||
if scheme:
|
||||
@@ -249,7 +249,7 @@ def parse_identifier(ident, val, refines):
|
||||
# Try the OPF 2 style opf:scheme attribute, which will be present, for
|
||||
# example, in EPUB 3 files that have had their metadata set by an
|
||||
# application that only understands EPUB 2.
|
||||
scheme = ident.get(OPF('scheme'))
|
||||
scheme = ident.get(const.OPF_SCHEME)
|
||||
if scheme and not lval.startswith('urn:'):
|
||||
return finalize(scheme, val)
|
||||
|
||||
@@ -294,7 +294,7 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=
|
||||
continue
|
||||
metadata = XPath('./opf:metadata')(root)[0]
|
||||
for scheme, val in new_identifiers.items():
|
||||
ident = metadata.makeelement(DC('identifier'))
|
||||
ident = metadata.makeelement(const.DC_IDENT)
|
||||
ident.text = '%s:%s' % (scheme, val)
|
||||
if package_identifier is None:
|
||||
metadata.append(ident)
|
||||
@@ -312,11 +312,11 @@ def identifier_writer(name):
|
||||
if is_package_id:
|
||||
package_identifier = ident
|
||||
val = (ident.text or '').strip()
|
||||
if (val.startswith(name + ':') or ident.get(OPF('scheme')) == name) and not is_package_id:
|
||||
if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id:
|
||||
remove_element(ident, refines)
|
||||
metadata = XPath('./opf:metadata')(root)[0]
|
||||
if ival:
|
||||
ident = metadata.makeelement(DC('identifier'))
|
||||
ident = metadata.makeelement(const.DC_IDENT)
|
||||
ident.text = '%s:%s' % (name, ival)
|
||||
if package_identifier is None:
|
||||
metadata.append(ident)
|
||||
@@ -376,7 +376,7 @@ def set_title(root, prefixes, refines, title, title_sort=None):
|
||||
main_title = find_main_title(root, refines, remove_blanks=True)
|
||||
if main_title is None:
|
||||
m = XPath('./opf:metadata')(root)[0]
|
||||
main_title = m.makeelement(DC('title'))
|
||||
main_title = m.makeelement(const.DC_TITLE)
|
||||
m.insert(0, main_title)
|
||||
main_title.text = title or None
|
||||
ts = [refdef('file-as', title_sort)] if title_sort else ()
|
||||
@@ -411,7 +411,7 @@ def set_languages(root, prefixes, refines, languages):
|
||||
languages = ['und']
|
||||
metadata = XPath('./opf:metadata')(root)[0]
|
||||
for lang in uniq(languages):
|
||||
l = metadata.makeelement(DC('language'))
|
||||
l = metadata.makeelement(const.DC_LANG)
|
||||
l.text = lang
|
||||
metadata.append(l)
|
||||
# }}}
|
||||
@@ -440,7 +440,7 @@ def read_authors(root, prefixes, refines):
|
||||
if file_as:
|
||||
aus = file_as[0][-1]
|
||||
else:
|
||||
aus = item.get(OPF('file-as')) or None
|
||||
aus = item.get(const.OPF_FILE_AS) or None
|
||||
return Author(normalize_whitespace(val), normalize_whitespace(aus))
|
||||
|
||||
for item in XPath('./opf:metadata/dc:creator')(root):
|
||||
@@ -448,7 +448,7 @@ def read_authors(root, prefixes, refines):
|
||||
if val:
|
||||
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
|
||||
role = props.get('role')
|
||||
opf_role = item.get(OPF('role'))
|
||||
opf_role = item.get(const.OPF_ROLE)
|
||||
if role:
|
||||
if is_relators_role(props, 'aut'):
|
||||
roled_authors.append(author(item, props, val))
|
||||
@@ -465,22 +465,22 @@ def set_authors(root, prefixes, refines, authors):
|
||||
ensure_prefix(root, prefixes, 'marc')
|
||||
for item in XPath('./opf:metadata/dc:creator')(root):
|
||||
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
|
||||
opf_role = item.get(OPF('role'))
|
||||
opf_role = item.get(const.OPF_ROLE)
|
||||
if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')):
|
||||
continue
|
||||
remove_element(item, refines)
|
||||
metadata = XPath('./opf:metadata')(root)[0]
|
||||
for author in authors:
|
||||
if author.name:
|
||||
a = metadata.makeelement(DC('creator'))
|
||||
a = metadata.makeelement(const.DC_CREATOR)
|
||||
aid = ensure_id(a)
|
||||
a.text = author.name
|
||||
metadata.append(a)
|
||||
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
|
||||
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
|
||||
m.text = 'aut'
|
||||
metadata.append(m)
|
||||
if author.sort:
|
||||
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'file-as'})
|
||||
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'file-as'})
|
||||
m.text = author.sort
|
||||
metadata.append(m)
|
||||
|
||||
@@ -492,7 +492,7 @@ def read_book_producers(root, prefixes, refines):
|
||||
if val:
|
||||
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
|
||||
role = props.get('role')
|
||||
opf_role = item.get(OPF('role'))
|
||||
opf_role = item.get(const.OPF_ROLE)
|
||||
if role:
|
||||
if is_relators_role(props, 'bkp'):
|
||||
ans.append(normalize_whitespace(val))
|
||||
@@ -504,18 +504,18 @@ def read_book_producers(root, prefixes, refines):
|
||||
def set_book_producers(root, prefixes, refines, producers):
|
||||
for item in XPath('./opf:metadata/dc:contributor')(root):
|
||||
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
|
||||
opf_role = item.get(OPF('role'))
|
||||
opf_role = item.get(const.OPF_ROLE)
|
||||
if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')):
|
||||
continue
|
||||
remove_element(item, refines)
|
||||
metadata = XPath('./opf:metadata')(root)[0]
|
||||
for bkp in producers:
|
||||
if bkp:
|
||||
a = metadata.makeelement(DC('contributor'))
|
||||
a = metadata.makeelement(const.DC_CONTRIBUTOR)
|
||||
aid = ensure_id(a)
|
||||
a.text = bkp
|
||||
metadata.append(a)
|
||||
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
|
||||
m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
|
||||
m.text = 'bkp'
|
||||
metadata.append(m)
|
||||
# }}}
|
||||
@@ -552,7 +552,7 @@ def set_pubdate(root, prefixes, refines, val):
|
||||
if not is_date_undefined(val):
|
||||
val = isoformat(val)
|
||||
m = XPath('./opf:metadata')(root)[0]
|
||||
d = m.makeelement(DC('date'))
|
||||
d = m.makeelement(const.DC_DATE)
|
||||
d.text = val
|
||||
m.append(d)
|
||||
|
||||
@@ -584,7 +584,7 @@ def create_timestamp(root, prefixes, m, val):
|
||||
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
||||
ensure_prefix(root, prefixes, 'dcterms')
|
||||
val = w3cdtf(val)
|
||||
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
|
||||
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
|
||||
d.text = val
|
||||
m.append(d)
|
||||
|
||||
@@ -625,7 +625,7 @@ def set_last_modified(root, prefixes, refines, val=None):
|
||||
else:
|
||||
ensure_prefix(root, prefixes, 'dcterms')
|
||||
m = XPath('./opf:metadata')(root)[0]
|
||||
meta = m.makeelement(OPF('meta'), attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
|
||||
meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
|
||||
m.append(meta)
|
||||
meta.text = val
|
||||
# }}}
|
||||
@@ -648,7 +648,7 @@ def set_comments(root, prefixes, refines, val):
|
||||
if val:
|
||||
val = val.strip()
|
||||
if val:
|
||||
c = m.makeelement(DC('description'))
|
||||
c = m.makeelement(const.DC_DESC)
|
||||
c.text = val
|
||||
m.append(c)
|
||||
# }}}
|
||||
@@ -670,7 +670,7 @@ def set_publisher(root, prefixes, refines, val):
|
||||
if val:
|
||||
val = val.strip()
|
||||
if val:
|
||||
c = m.makeelement(DC('publisher'))
|
||||
c = m.makeelement(const.DC_PUBLISHER('publisher'))
|
||||
c.text = normalize_whitespace(val)
|
||||
m.append(c)
|
||||
# }}}
|
||||
@@ -693,7 +693,7 @@ def set_tags(root, prefixes, refines, val):
|
||||
if val:
|
||||
val = uniq(list(filter(None, val)))
|
||||
for x in val:
|
||||
c = m.makeelement(DC('subject'))
|
||||
c = m.makeelement(const.DC_SUBJ)
|
||||
c.text = normalize_whitespace(x)
|
||||
if c.text:
|
||||
m.append(c)
|
||||
@@ -725,7 +725,7 @@ def read_rating(root, prefixes, refines):
|
||||
def create_rating(root, prefixes, val):
|
||||
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
||||
m = XPath('./opf:metadata')(root)[0]
|
||||
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:rating'})
|
||||
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:rating'})
|
||||
d.text = val
|
||||
m.append(d)
|
||||
|
||||
@@ -772,7 +772,7 @@ def read_series(root, prefixes, refines):
|
||||
|
||||
def create_series(root, refines, series, series_index):
|
||||
m = XPath('./opf:metadata')(root)[0]
|
||||
d = m.makeelement(OPF('meta'), attrib={'property':'belongs-to-collection'})
|
||||
d = m.makeelement(const.OPF_META, attrib={'property':'belongs-to-collection'})
|
||||
d.text = series
|
||||
m.append(d)
|
||||
set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index))
|
||||
@@ -836,7 +836,7 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
|
||||
if val:
|
||||
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
|
||||
m = XPath('./opf:metadata')(root)[0]
|
||||
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:%s' % name})
|
||||
d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name})
|
||||
d.text = serialize(val)
|
||||
m.append(d)
|
||||
return writer
|
||||
|
||||
@@ -10,17 +10,13 @@ from lxml.builder import ElementMaker
|
||||
|
||||
from ebook_converter.constants_old import __appname__, __version__
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from ebook_converter.polyglot.urllib import unquote
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
|
||||
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
|
||||
NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS}
|
||||
NSMAP = {None: NCX_NS, 'calibre': CALIBRE_NS}
|
||||
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
|
||||
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
|
||||
|
||||
@@ -30,8 +26,10 @@ def parse_html_toc(data):
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from lxml import etree
|
||||
if isinstance(data, bytes):
|
||||
data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
|
||||
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
|
||||
data = xml_to_unicode(data, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
|
||||
sanitize_names=True)
|
||||
for a in root.xpath('//*[@href and local-name()="a"]'):
|
||||
purl = urllib.parse.urlparse(unquote(a.get('href')))
|
||||
href, fragment = purl[2], purl[5]
|
||||
@@ -48,8 +46,8 @@ def parse_html_toc(data):
|
||||
class TOC(list):
|
||||
|
||||
def __init__(self, href=None, fragment=None, text=None, parent=None,
|
||||
play_order=0, base_path=os.getcwd(), type='unknown', author=None,
|
||||
description=None, toc_thumbnail=None):
|
||||
play_order=0, base_path=os.getcwd(), type='unknown',
|
||||
author=None, description=None, toc_thumbnail=None):
|
||||
self.href = href
|
||||
self.fragment = fragment
|
||||
if not self.fragment:
|
||||
@@ -64,7 +62,7 @@ class TOC(list):
|
||||
self.toc_thumbnail = toc_thumbnail
|
||||
|
||||
def __str__(self):
|
||||
lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
|
||||
lines = ['TOC: %s#%s %s' % (self.href, self.fragment, self.text)]
|
||||
for child in self:
|
||||
c = str(child).splitlines()
|
||||
for l in c:
|
||||
@@ -91,12 +89,14 @@ class TOC(list):
|
||||
entry.parent = None
|
||||
|
||||
def add_item(self, href, fragment, text, play_order=None, type='unknown',
|
||||
author=None, description=None, toc_thumbnail=None):
|
||||
author=None, description=None, toc_thumbnail=None):
|
||||
if play_order is None:
|
||||
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
|
||||
play_order = (self[-1].play_order
|
||||
if len(self) else self.play_order) + 1
|
||||
self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
|
||||
base_path=self.base_path, play_order=play_order,
|
||||
type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
|
||||
type=type, author=author, description=description,
|
||||
toc_thumbnail=toc_thumbnail))
|
||||
return self[-1]
|
||||
|
||||
def top_level_items(self):
|
||||
@@ -121,7 +121,10 @@ class TOC(list):
|
||||
|
||||
@property
|
||||
def abspath(self):
|
||||
'Return the file this toc entry points to as a absolute path to a file on the system.'
|
||||
"""
|
||||
Return the file this toc entry points to as a absolute path to a file
|
||||
on the system.
|
||||
"""
|
||||
|
||||
if self.href is None:
|
||||
return None
|
||||
@@ -136,8 +139,9 @@ class TOC(list):
|
||||
toc = toc['toc']
|
||||
if toc is None:
|
||||
try:
|
||||
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
|
||||
except:
|
||||
toc = (opfreader.soup.find('guide')
|
||||
.find('reference', attrs={'type': 'toc'})['href'])
|
||||
except Exception:
|
||||
for item in opfreader.manifest:
|
||||
if 'toc' in item.href().lower():
|
||||
toc = item.href()
|
||||
@@ -151,13 +155,15 @@ class TOC(list):
|
||||
toc = os.path.join(self.base_path, toc)
|
||||
try:
|
||||
if not os.path.exists(toc):
|
||||
bn = os.path.basename(toc)
|
||||
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
|
||||
bn = os.path.basename(toc)
|
||||
# Bug in BAEN OPF files
|
||||
bn = bn.replace('_top.htm', '_toc.htm')
|
||||
toc = os.path.join(os.path.dirname(toc), bn)
|
||||
|
||||
self.read_html_toc(toc)
|
||||
except:
|
||||
print('WARNING: Could not read Table of Contents. Continuing anyway.')
|
||||
except Exception:
|
||||
print('WARNING: Could not read Table of Contents. '
|
||||
'Continuing anyway.')
|
||||
else:
|
||||
path = opfreader.manifest.item(toc.lower())
|
||||
path = getattr(path, 'path', path)
|
||||
@@ -177,9 +183,9 @@ class TOC(list):
|
||||
self.base_path = os.path.dirname(toc)
|
||||
if root is None:
|
||||
with open(toc, 'rb') as f:
|
||||
raw = xml_to_unicode(f.read(), assume_utf8=True,
|
||||
strip_encoding_pats=True)[0]
|
||||
root = safe_xml_fromstring(raw)
|
||||
raw = xml_to_unicode(f.read(), assume_utf8=True,
|
||||
strip_encoding_pats=True)[0]
|
||||
root = etree.fromstring(raw)
|
||||
xpn = {'re': 'http://exslt.org/regular-expressions'}
|
||||
XPath = functools.partial(etree.XPath, namespaces=xpn)
|
||||
|
||||
@@ -197,7 +203,7 @@ class TOC(list):
|
||||
def process_navpoint(np, dest):
|
||||
try:
|
||||
play_order = int(get_attr(np, 1))
|
||||
except:
|
||||
except Exception:
|
||||
play_order = 1
|
||||
href = fragment = text = None
|
||||
nd = dest
|
||||
@@ -207,7 +213,7 @@ class TOC(list):
|
||||
text = ''
|
||||
for txt in txt_path(nl):
|
||||
text += etree.tostring(txt, method='text',
|
||||
encoding='unicode', with_tail=False)
|
||||
encoding='unicode', with_tail=False)
|
||||
content = content_path(np)
|
||||
if content and text:
|
||||
content = content[0]
|
||||
@@ -242,17 +248,14 @@ class TOC(list):
|
||||
self.add_item(href, fragment, txt)
|
||||
|
||||
def render(self, stream, uid):
|
||||
root = E.ncx(
|
||||
E.head(
|
||||
E.meta(name='dtb:uid', content=str(uid)),
|
||||
E.meta(name='dtb:depth', content=str(self.depth())),
|
||||
E.meta(name='dtb:generator', content='%s (%s)'%(__appname__,
|
||||
__version__)),
|
||||
E.meta(name='dtb:totalPageCount', content='0'),
|
||||
E.meta(name='dtb:maxPageNumber', content='0'),
|
||||
),
|
||||
E.docTitle(E.text('Table of Contents')),
|
||||
)
|
||||
root = E.ncx(E.head(E.meta(name='dtb:uid', content=str(uid)),
|
||||
E.meta(name='dtb:depth',
|
||||
content=str(self.depth())),
|
||||
E.meta(name='dtb:generator', content='%s (%s)' %
|
||||
(__appname__, __version__)),
|
||||
E.meta(name='dtb:totalPageCount', content='0'),
|
||||
E.meta(name='dtb:maxPageNumber', content='0')),
|
||||
E.docTitle(E.text('Table of Contents')))
|
||||
navmap = E.navMap()
|
||||
root.append(navmap)
|
||||
root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
|
||||
@@ -263,12 +266,12 @@ class TOC(list):
|
||||
if not text:
|
||||
text = ''
|
||||
c[1] += 1
|
||||
item_id = 'num_%d'%c[1]
|
||||
item_id = 'num_%d' % c[1]
|
||||
text = clean_xml_chars(text)
|
||||
elem = E.navPoint(
|
||||
E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
|
||||
E.content(src=str(np.href)+(('#' + str(np.fragment))
|
||||
if np.fragment else '')),
|
||||
if np.fragment else '')),
|
||||
id=item_id,
|
||||
playOrder=str(np.play_order)
|
||||
)
|
||||
@@ -282,7 +285,8 @@ class TOC(list):
|
||||
try:
|
||||
elem.append(C.meta(desc, name='description'))
|
||||
except ValueError:
|
||||
elem.append(C.meta(clean_xml_chars(desc), name='description'))
|
||||
elem.append(C.meta(clean_xml_chars(desc),
|
||||
name='description'))
|
||||
idx = getattr(np, 'toc_thumbnail', None)
|
||||
if idx:
|
||||
elem.append(C.meta(idx, name='toc_thumbnail'))
|
||||
@@ -293,5 +297,5 @@ class TOC(list):
|
||||
for np in self:
|
||||
navpoint(navmap, np)
|
||||
raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
||||
pretty_print=True)
|
||||
pretty_print=True)
|
||||
stream.write(raw)
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
from collections import namedtuple
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.ebooks.oeb.base import OPF
|
||||
from ebook_converter.ebooks.oeb import base
|
||||
from ebook_converter.ebooks.oeb.polish.utils import guess_type
|
||||
from ebook_converter.spell import parse_lang_code
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from ebook_converter.utils.localization import lang_as_iso639_1
|
||||
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
|
||||
OPFVersion = namedtuple('OPFVersion', 'major minor patch')
|
||||
@@ -35,23 +36,26 @@ def parse_opf(stream_or_path):
|
||||
raw = stream.read()
|
||||
if not raw:
|
||||
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
|
||||
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
|
||||
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True, assume_utf8=True)
|
||||
raw = raw[raw.find('<'):]
|
||||
root = safe_xml_fromstring(clean_xml_chars(raw))
|
||||
root = etree.fromstring(clean_xml_chars(raw))
|
||||
if root is None:
|
||||
raise ValueError('Not an OPF file')
|
||||
return root
|
||||
|
||||
|
||||
def normalize_languages(opf_languages, mi_languages):
|
||||
' Preserve original country codes and use 2-letter lang codes where possible '
|
||||
"""
|
||||
Preserve original country codes and use 2-letter lang codes where possible
|
||||
"""
|
||||
def parse(x):
|
||||
try:
|
||||
return parse_lang_code(x)
|
||||
except ValueError:
|
||||
return None
|
||||
opf_languages = filter(None, map(parse, opf_languages))
|
||||
cc_map = {c.langcode:c.countrycode for c in opf_languages}
|
||||
cc_map = {c.langcode: c.countrycode for c in opf_languages}
|
||||
mi_languages = filter(None, map(parse, mi_languages))
|
||||
|
||||
def norm(x):
|
||||
@@ -83,9 +87,9 @@ def create_manifest_item(root, href_template, id_template, media_type=None):
|
||||
all_hrefs = frozenset(root.xpath('//*/@href'))
|
||||
href = ensure_unique(href_template, all_hrefs)
|
||||
item_id = ensure_unique(id_template, all_ids)
|
||||
manifest = root.find(OPF('manifest'))
|
||||
manifest = root.find(base.tag('opf', 'manifest'))
|
||||
if manifest is not None:
|
||||
i = manifest.makeelement(OPF('item'))
|
||||
i = manifest.makeelement(base.tag('opf', 'item'))
|
||||
i.set('href', href), i.set('id', item_id)
|
||||
i.set('media-type', media_type or guess_type(href_template))
|
||||
manifest.append(i)
|
||||
@@ -93,6 +97,7 @@ def create_manifest_item(root, href_template, id_template, media_type=None):
|
||||
|
||||
|
||||
def pretty_print_opf(root):
    """
    Pass the OPF tree first to pretty_opf() and then to pretty_xml_tree().

    Nothing is returned; whatever the two helpers do, they do it to root.
    """
    # Kept as a function-level import, exactly like the original.
    from ebook_converter.ebooks.oeb.polish import pretty

    pretty.pretty_opf(root)
    pretty.pretty_xml_tree(root)
|
||||
|
||||
@@ -1,44 +1,43 @@
|
||||
import re, sys, copy, json
|
||||
from itertools import repeat
|
||||
from collections import defaultdict
|
||||
import collections
|
||||
import copy
|
||||
import itertools
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from ebook_converter import prints
|
||||
from ebook_converter.ebooks.metadata import check_isbn, check_doi
|
||||
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
||||
from ebook_converter.ebooks.metadata.book.base import Metadata
|
||||
from ebook_converter.ebooks.metadata.opf2 import dump_dict
|
||||
from ebook_converter.utils.date import parse_date, isoformat, now
|
||||
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from ebook_converter.utils.localization import canonicalize_lang, \
|
||||
lang_as_iso639_1
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)'
|
||||
r'[\'"][^<>]*>', re.IGNORECASE)
|
||||
|
||||
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE)
|
||||
|
||||
NS_MAP = {
|
||||
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
|
||||
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||
'pdf': 'http://ns.adobe.com/pdf/1.3/',
|
||||
'pdfx': 'http://ns.adobe.com/pdfx/1.3/',
|
||||
'xmp': 'http://ns.adobe.com/xap/1.0/',
|
||||
'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/',
|
||||
'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/',
|
||||
'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/',
|
||||
'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/',
|
||||
'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/',
|
||||
'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/',
|
||||
'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
|
||||
'crossmark': 'http://crossref.org/crossmark/1.0/',
|
||||
'xml': 'http://www.w3.org/XML/1998/namespace',
|
||||
'x': 'adobe:ns:meta/',
|
||||
'calibre': 'http://calibre-ebook.com/xmp-namespace',
|
||||
'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index',
|
||||
'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns',
|
||||
}
|
||||
NS_MAP = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
|
||||
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||
'pdf': 'http://ns.adobe.com/pdf/1.3/',
|
||||
'pdfx': 'http://ns.adobe.com/pdfx/1.3/',
|
||||
'xmp': 'http://ns.adobe.com/xap/1.0/',
|
||||
'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/',
|
||||
'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/',
|
||||
'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/',
|
||||
'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/',
|
||||
'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/',
|
||||
'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/',
|
||||
'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
|
||||
'crossmark': 'http://crossref.org/crossmark/1.0/',
|
||||
'xml': 'http://www.w3.org/XML/1998/namespace',
|
||||
'x': 'adobe:ns:meta/',
|
||||
'calibre': 'http://calibre-ebook.com/xmp-namespace',
|
||||
'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index',
|
||||
'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns'}
|
||||
KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'}
|
||||
|
||||
|
||||
@@ -63,7 +62,7 @@ def parse_xmp_packet(raw_bytes):
|
||||
pat = r'''<?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]'''
|
||||
encodings = ('8', '16-le', '16-be', '32-le', '32-be')
|
||||
header = raw_bytes[:1024]
|
||||
emap = {'\ufeff'.encode('utf-'+x):'utf-'+x for x in encodings}
|
||||
emap = {'\ufeff'.encode('utf-'+x): 'utf-'+x for x in encodings}
|
||||
emap[b''] = 'utf-8'
|
||||
for q in encodings:
|
||||
m = re.search(pat.encode('utf-'+q), header)
|
||||
@@ -71,15 +70,19 @@ def parse_xmp_packet(raw_bytes):
|
||||
enc = emap.get(m.group(1), enc)
|
||||
break
|
||||
if enc is None:
|
||||
return safe_xml_fromstring(raw_bytes)
|
||||
raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string
|
||||
return safe_xml_fromstring(raw)
|
||||
return etree.fromstring(raw_bytes)
|
||||
# lxml barfs if encoding declaration present in unicode string
|
||||
raw = _xml_declaration.sub('', raw_bytes.decode(enc))
|
||||
return etree.fromstring(raw)
|
||||
|
||||
|
||||
def serialize_xmp_packet(root, encoding='utf-8'):
    """
    Serialize an XMP tree into a complete, xpacket-wrapped packet.

    :param root: root element of the XMP metadata tree. Its ``tail`` is
                 overwritten with the padding that follows the XML.
    :param encoding: encoding used for the serialized XML and for the byte
                     order mark stored in the ``begin`` attribute of the
                     packet header.
    :return: the whole packet (header, XML with padding, trailer) as bytes.
    """
    # Adobe spec recommends inserting padding at the end of the packet
    root.tail = '\n' + '\n'.join(itertools.repeat(' '*100, 30))
    raw_bytes = etree.tostring(root, encoding=encoding, pretty_print=True,
                               with_tail=True, method='xml')
    # The template must be a bytes literal: both interpolated values are
    # bytes, and %-formatting them into a str template would embed their
    # repr ("b'...'") in the packet and turn the return value into text.
    return (b'<?xpacket begin="%s" id="W5M0MpCehiHzreSzNTczkc9d"?>\n%s\n'
            b'<?xpacket end="w"?>' % ('\ufeff'.encode(encoding), raw_bytes))
|
||||
|
||||
|
||||
def read_simple_property(elem):
|
||||
@@ -106,14 +109,15 @@ def read_sequence(parent):
|
||||
yield read_simple_property(item)
|
||||
|
||||
|
||||
def uniq(vals, kmap=lambda x: x):
    """
    Remove all duplicates from vals, while preserving order.

    :param vals: iterable of items; None or an empty iterable gives an
                 empty result. One-shot iterators and generators are
                 supported, since the input is walked exactly once.
    :param kmap: callable that returns a hashable value for every item in
                 vals. Two items are duplicates when their keys are equal;
                 the first one encountered is kept.
    :return: tuple with the retained items in their original order.
    """
    seen = set()
    ans = []
    # Single pass: compute the key next to the item it belongs to. Pairing
    # the input with a second, lazily evaluated pass over the same object
    # misaligns items and keys as soon as vals is an iterator.
    for item in vals or ():
        key = kmap(item)
        if key not in seen:
            seen.add(key)
            ans.append(item)
    return tuple(ans)
|
||||
|
||||
|
||||
def multiple_sequences(expr, root):
|
||||
@@ -170,7 +174,8 @@ def read_series(root):
|
||||
|
||||
def read_user_metadata(mi, root):
|
||||
from ebook_converter.utils.config import from_json
|
||||
from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple
|
||||
from ebook_converter.ebooks.metadata.book.json_codec import \
|
||||
decode_is_multiple
|
||||
fields = set()
|
||||
for item in XPath('//calibre:custom_metadata')(root):
|
||||
for li in XPath('./rdf:Bag/rdf:li')(item):
|
||||
@@ -186,7 +191,7 @@ def read_user_metadata(mi, root):
|
||||
decode_is_multiple(fm)
|
||||
mi.set_user_metadata(name, fm)
|
||||
fields.add(name)
|
||||
except:
|
||||
except Exception:
|
||||
prints('Failed to read user metadata:', name)
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
@@ -194,13 +199,17 @@ def read_user_metadata(mi, root):
|
||||
|
||||
def read_xmp_identifers(parent):
|
||||
''' For example:
|
||||
<rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmp:idq><rdf:value>http://foo.com</rdf:value></rdf:li>
|
||||
<rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmpidq:Scheme>
|
||||
<rdf:value>http://foo.com</rdf:value></rdf:li>
|
||||
or the longer form:
|
||||
<rdf:li><rdf:Description><xmpidq:Scheme>URL</xmp:idq><rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li>
|
||||
<rdf:li><rdf:Description><xmpidq:Scheme>URL</xmpidq:Scheme>
|
||||
<rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li>
|
||||
'''
|
||||
for li in XPath('./rdf:Bag/rdf:li')(parent):
|
||||
is_resource = li.attrib.get(expand('rdf:parseType'), None) == 'Resource'
|
||||
is_resource = is_resource or (len(li) == 1 and li[0].tag == expand('rdf:Description'))
|
||||
is_resource = li.attrib.get(expand('rdf:parseType'),
|
||||
None) == 'Resource'
|
||||
is_resource = is_resource or (len(li) == 1 and
|
||||
li[0].tag == expand('rdf:Description'))
|
||||
if not is_resource:
|
||||
yield None, li.text or ''
|
||||
value = XPath('descendant::rdf:value')(li)
|
||||
@@ -241,12 +250,15 @@ def metadata_from_xmp_packet(raw_bytes):
|
||||
if title.startswith(r'\376\377'):
|
||||
# corrupted XMP packet generated by Nitro PDF. See
|
||||
# https://bugs.launchpad.net/calibre/+bug/1541981
|
||||
raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
|
||||
raise ValueError('Corrupted XMP metadata packet detected, '
|
||||
'probably generated by Nitro PDF')
|
||||
mi.title = title
|
||||
authors = multiple_sequences('//dc:creator', root)
|
||||
if authors:
|
||||
mi.authors = authors
|
||||
tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
|
||||
tags = multiple_sequences('//dc:subject',
|
||||
root) or multiple_sequences('//pdf:Keywords',
|
||||
root)
|
||||
if tags:
|
||||
mi.tags = tags
|
||||
comments = first_alt('//dc:description', root)
|
||||
@@ -256,8 +268,10 @@ def metadata_from_xmp_packet(raw_bytes):
|
||||
if publishers:
|
||||
mi.publisher = publishers[0]
|
||||
try:
|
||||
pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
|
||||
except:
|
||||
pubdate = (parse_date(first_sequence('//dc:date', root) or
|
||||
first_simple('//xmp:CreateDate', root),
|
||||
assume_utc=False))
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
mi.pubdate = pubdate
|
||||
@@ -291,7 +305,7 @@ def metadata_from_xmp_packet(raw_bytes):
|
||||
if val:
|
||||
try:
|
||||
setattr(mi, x, json.loads(val))
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
languages = multiple_sequences('//dc:language', root)
|
||||
@@ -319,7 +333,7 @@ def metadata_from_xmp_packet(raw_bytes):
|
||||
identifiers[scheme] = val
|
||||
|
||||
# Check Dublin Core for recognizable identifier types
|
||||
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.items():
|
||||
for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
|
||||
if scheme not in identifiers:
|
||||
val = check_func(first_simple('//dc:identifier', root))
|
||||
if val:
|
||||
@@ -359,17 +373,21 @@ def consolidate_metadata(info_mi, info):
|
||||
else:
|
||||
prefer_info = info_date > xmp_mi.metadata_date
|
||||
if prefer_info:
|
||||
info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
|
||||
info_mi.title = info_title
|
||||
info_mi.authors = info_authors
|
||||
info_mi.tags = info_tags
|
||||
else:
|
||||
# We'll use the xmp tags/authors but fallback to the info ones if the
|
||||
# xmp does not have tags/authors. smart_update() should have taken care of
|
||||
# the rest
|
||||
info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags
|
||||
# xmp does not have tags/authors. smart_update() should have taken care
|
||||
# of the rest
|
||||
info_mi.authors = (info_authors if xmp_mi.is_null('authors')
|
||||
else xmp_mi.authors)
|
||||
info_mi.tags = xmp_mi.tags or info_tags
|
||||
return info_mi
|
||||
|
||||
|
||||
def nsmap(*args):
    """
    Return the subset of NS_MAP restricted to the prefixes given in args.

    Each prefix is looked up directly in NS_MAP, so an unknown prefix raises
    KeyError.
    """
    selected = {}
    for prefix in args:
        selected[prefix] = NS_MAP[prefix]
    return selected
|
||||
|
||||
|
||||
def create_simple_property(parent, tag, value):
|
||||
@@ -435,7 +453,8 @@ def create_series(calibre, series, series_index):
|
||||
|
||||
def create_user_metadata(calibre, all_user_metadata):
|
||||
from ebook_converter.utils.config import to_json
|
||||
from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
|
||||
from ebook_converter.ebooks.metadata.book.json_codec import \
|
||||
object_to_unicode, encode_is_multiple
|
||||
|
||||
s = calibre.makeelement(expand('calibre:custom_metadata'))
|
||||
calibre.append(s)
|
||||
@@ -447,7 +466,7 @@ def create_user_metadata(calibre, all_user_metadata):
|
||||
encode_is_multiple(fm)
|
||||
fm = object_to_unicode(fm)
|
||||
fm = json.dumps(fm, default=to_json, ensure_ascii=False)
|
||||
except:
|
||||
except Exception:
|
||||
prints('Failed to write user metadata:', name)
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
@@ -471,7 +490,8 @@ def metadata_to_xmp_packet(mi):
|
||||
dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc'))
|
||||
dc.set(expand('rdf:about'), '')
|
||||
rdf.append(dc)
|
||||
for prop, tag in {'title':'dc:title', 'comments':'dc:description'}.items():
|
||||
for prop, tag in {'title': 'dc:title',
|
||||
'comments': 'dc:description'}.items():
|
||||
val = mi.get(prop) or ''
|
||||
create_alt_property(dc, tag, val)
|
||||
for prop, (tag, ordered) in {'authors': ('dc:creator', True),
|
||||
@@ -482,18 +502,23 @@ def metadata_to_xmp_packet(mi):
|
||||
val = [val]
|
||||
create_sequence_property(dc, tag, val, ordered)
|
||||
if not mi.is_null('pubdate'):
|
||||
create_sequence_property(dc, 'dc:date', [isoformat(mi.pubdate, as_utc=False)]) # Adobe spec recommends local time
|
||||
# Adobe spec recommends local time
|
||||
create_sequence_property(dc, 'dc:date',
|
||||
[isoformat(mi.pubdate, as_utc=False)])
|
||||
if not mi.is_null('languages'):
|
||||
langs = list(filter(None, map(lambda x:lang_as_iso639_1(x) or canonicalize_lang(x), mi.languages)))
|
||||
langs = list(filter(None, map(lambda x: lang_as_iso639_1(x) or
|
||||
canonicalize_lang(x), mi.languages)))
|
||||
if langs:
|
||||
create_sequence_property(dc, 'dc:language', langs, ordered=False)
|
||||
|
||||
xmp = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('xmp', 'xmpidq'))
|
||||
xmp = rdf.makeelement(expand('rdf:Description'),
|
||||
nsmap=nsmap('xmp', 'xmpidq'))
|
||||
xmp.set(expand('rdf:about'), '')
|
||||
rdf.append(xmp)
|
||||
extra_ids = {}
|
||||
for x in ('prism', 'pdfx'):
|
||||
p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap(x))
|
||||
p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'),
|
||||
nsmap=nsmap(x))
|
||||
p.set(expand('rdf:about'), '')
|
||||
rdf.append(p)
|
||||
|
||||
@@ -503,7 +528,7 @@ def metadata_to_xmp_packet(mi):
|
||||
for scheme, val in identifiers.items():
|
||||
if scheme in {'isbn', 'doi'}:
|
||||
for prefix, parent in extra_ids.items():
|
||||
ie = parent.makeelement(expand('%s:%s'%(prefix, scheme)))
|
||||
ie = parent.makeelement(expand('%s:%s' % (prefix, scheme)))
|
||||
ie.text = val
|
||||
parent.append(ie)
|
||||
|
||||
@@ -511,7 +536,8 @@ def metadata_to_xmp_packet(mi):
|
||||
d.text = isoformat(now(), as_utc=False)
|
||||
xmp.append(d)
|
||||
|
||||
calibre = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('calibre', 'calibreSI', 'calibreCC'))
|
||||
calibre = rdf.makeelement(expand('rdf:Description'),
|
||||
nsmap=nsmap('calibre', 'calibreSI', 'calibreCC'))
|
||||
calibre.set(expand('rdf:about'), '')
|
||||
rdf.append(calibre)
|
||||
if not mi.is_null('rating'):
|
||||
@@ -524,7 +550,8 @@ def metadata_to_xmp_packet(mi):
|
||||
if not mi.is_null('series'):
|
||||
create_series(calibre, mi.series, mi.series_index)
|
||||
if not mi.is_null('timestamp'):
|
||||
create_simple_property(calibre, 'calibre:timestamp', isoformat(mi.timestamp, as_utc=False))
|
||||
create_simple_property(calibre, 'calibre:timestamp',
|
||||
isoformat(mi.timestamp, as_utc=False))
|
||||
for x in ('author_link_map', 'user_categories'):
|
||||
val = getattr(mi, x, None)
|
||||
if val:
|
||||
@@ -550,10 +577,11 @@ def find_used_namespaces(elem):
|
||||
|
||||
def find_preferred_prefix(namespace, elems):
|
||||
for elem in elems:
|
||||
ans = {v:k for k, v in elem.nsmap.items()}.get(namespace, None)
|
||||
ans = {v: k for k, v in elem.nsmap.items()}.get(namespace, None)
|
||||
if ans is not None:
|
||||
return ans
|
||||
return find_preferred_prefix(namespace, elem.iterchildren(etree.Element))
|
||||
return find_preferred_prefix(namespace,
|
||||
elem.iterchildren(etree.Element))
|
||||
|
||||
|
||||
def find_nsmap(elems):
|
||||
@@ -562,7 +590,7 @@ def find_nsmap(elems):
|
||||
used_namespaces |= find_used_namespaces(elem)
|
||||
ans = {}
|
||||
used_namespaces -= {NS_MAP['xml'], NS_MAP['x'], None, NS_MAP['rdf']}
|
||||
rmap = {v:k for k, v in NS_MAP.items()}
|
||||
rmap = {v: k for k, v in NS_MAP.items()}
|
||||
i = 0
|
||||
for ns in used_namespaces:
|
||||
if ns in rmap:
|
||||
@@ -578,7 +606,10 @@ def find_nsmap(elems):
|
||||
|
||||
|
||||
def clone_into(parent, elem):
|
||||
' Clone the element, assuming that all namespace declarations are present in parent '
|
||||
"""
|
||||
Clone the element, assuming that all namespace declarations are present
|
||||
in parent
|
||||
"""
|
||||
clone = parent.makeelement(elem.tag)
|
||||
parent.append(clone)
|
||||
if elem.text and not elem.text.isspace():
|
||||
@@ -591,28 +622,38 @@ def clone_into(parent, elem):
|
||||
|
||||
|
||||
def merge_xmp_packet(old, new):
|
||||
''' Merge metadata present in the old packet that is not present in the new
|
||||
"""
|
||||
Merge metadata present in the old packet that is not present in the new
|
||||
one into the new one. Assumes the new packet was generated by
|
||||
metadata_to_xmp_packet() '''
|
||||
metadata_to_xmp_packet()
|
||||
"""
|
||||
old, new = parse_xmp_packet(old), parse_xmp_packet(new)
|
||||
# As per the adobe spec all metadata items have to be present inside top-level rdf:Description containers
|
||||
# As per the adobe spec all metadata items have to be present inside
|
||||
# top-level rdf:Description containers
|
||||
item_xpath = XPath('//rdf:RDF/rdf:Description/*')
|
||||
|
||||
# First remove all data fields that metadata_to_xmp_packet() knows about,
|
||||
# since either they will have been set or if not present, imply they have
|
||||
# been cleared
|
||||
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
|
||||
defined_tags |= {expand('dc:' + x) for x in ('identifier', 'title', 'creator', 'date', 'description', 'language', 'publisher', 'subject')}
|
||||
defined_tags |= {expand('xmp:' + x) for x in ('MetadataDate', 'Identifier')}
|
||||
defined_tags = {expand(prefix + ':' + scheme)
|
||||
for prefix in ('prism', 'pdfx')
|
||||
for scheme in KNOWN_ID_SCHEMES}
|
||||
defined_tags |= {expand('dc:' + x)
|
||||
for x in ('identifier', 'title', 'creator', 'date',
|
||||
'description', 'language', 'publisher',
|
||||
'subject')}
|
||||
defined_tags |= {expand('xmp:' + x)
|
||||
for x in ('MetadataDate', 'Identifier')}
|
||||
# For redundancy also remove all fields explicitly set in the new packet
|
||||
defined_tags |= {x.tag for x in item_xpath(new)}
|
||||
calibrens = '{%s}' % NS_MAP['calibre']
|
||||
for elem in item_xpath(old):
|
||||
if elem.tag in defined_tags or (elem.tag and elem.tag.startswith(calibrens)):
|
||||
if elem.tag in defined_tags or (elem.tag and
|
||||
elem.tag.startswith(calibrens)):
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
# Group all items into groups based on their namespaces
|
||||
groups = defaultdict(list)
|
||||
groups = collections.defaultdict(list)
|
||||
for item in item_xpath(new):
|
||||
ns = item.nsmap[item.prefix]
|
||||
groups[ns].append(item)
|
||||
@@ -626,9 +667,14 @@ def merge_xmp_packet(old, new):
|
||||
root = A.xmpmeta(R.RDF)
|
||||
rdf = root[0]
|
||||
|
||||
for namespace in sorted(groups, key=lambda x:{NS_MAP['dc']:'a', NS_MAP['xmp']:'b', NS_MAP['calibre']:'c'}.get(x, 'z'+x)):
|
||||
for namespace in sorted(groups,
|
||||
key=lambda x: {NS_MAP['dc']: 'a',
|
||||
NS_MAP['xmp']: 'b',
|
||||
NS_MAP['calibre']: 'c'}.get(x,
|
||||
'z'+x)):
|
||||
items = groups[namespace]
|
||||
desc = rdf.makeelement(expand('rdf:Description'), nsmap=find_nsmap(items))
|
||||
desc = rdf.makeelement(expand('rdf:Description'),
|
||||
nsmap=find_nsmap(items))
|
||||
desc.set(expand('rdf:about'), '')
|
||||
rdf.append(desc)
|
||||
for item in items:
|
||||
|
||||
Reference in New Issue
Block a user