mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-21 09:15:54 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes how imports are done in some modules: instead of polluting each module's namespace with symbols that were often re-imported from other modules, imports are now explicit. Yuck.
464 lines
15 KiB
Python
464 lines
15 KiB
Python
"""
|
|
Read meta information from fb2 files
|
|
"""
|
|
import functools
|
|
import os
|
|
import random
|
|
import string
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter.utils.date import parse_only_date
|
|
from ebook_converter.utils.img import save_cover_data_to
|
|
from ebook_converter.utils.imghdr import identify
|
|
from ebook_converter import guess_type, guess_all_extensions, prints, \
|
|
force_unicode
|
|
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
from ebook_converter.polyglot.binary import as_base64_unicode
|
|
|
|
|
|
# XML namespaces used by FB2 documents: the FictionBook 2.0 / 2.1
# schemas and XLink (used for image/cover references).
NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
              'fb21': 'http://www.gribuser.ru/xml/fictionbook/2.1',
              'xlink': 'http://www.w3.org/1999/xlink'}

# Serialize an element's text content (markup stripped) as a unicode string.
tostring = functools.partial(etree.tostring, method='text', encoding='unicode')
|
|
|
|
|
|
def XLINK(tag):
    """Return *tag* qualified with the XLink namespace in Clark notation."""
    return '{' + NAMESPACES['xlink'] + '}' + tag
|
|
|
|
|
|
class Context(object):
    """Bundle the document's namespace map with small element-manipulation
    helpers used by both the FB2 metadata reader and writer."""

    def __init__(self, root):
        # Prefer the namespace the document actually declares on its root
        # element; fall back to the FB2 2.0 namespace for documents with a
        # missing or broken nsmap.
        try:
            self.fb_ns = root.nsmap[root.prefix] or NAMESPACES['fb2']
        except Exception:
            self.fb_ns = NAMESPACES['fb2']
        self.namespaces = {
            'fb': self.fb_ns,
            'fb2': self.fb_ns,
            'xlink': NAMESPACES['xlink']
        }

    def XPath(self, *args):
        """Compile an XPath expression bound to this document's namespaces."""
        return etree.XPath(*args, namespaces=self.namespaces)

    def get_or_create(self, parent, tag, attribs=None, at_start=True):
        """Return the first fb-namespaced child of *parent* matching *tag*
        and *attribs*, creating it (via create_tag) when absent."""
        # NOTE: the default used to be the mutable literal ``{}``; use None
        # to avoid sharing one dict across calls.
        if attribs is None:
            attribs = {}
        xpathstr = './fb:'+tag
        for n, v in attribs.items():
            xpathstr += '[@%s="%s"]' % (n, v)
        ans = self.XPath(xpathstr)(parent)
        if ans:
            ans = ans[0]
        else:
            ans = self.create_tag(parent, tag, attribs, at_start)
        return ans

    def create_tag(self, parent, tag, attribs=None, at_start=True):
        """Create a new fb-namespaced *tag* element under *parent* and
        return it; *at_start* controls prepend vs. append."""
        if attribs is None:
            attribs = {}
        # Pass an explicit (empty) attrib dict so this also works with
        # Element implementations that require the argument.
        ans = parent.makeelement('{%s}%s' % (self.fb_ns, tag), {})
        ans.attrib.update(attribs)
        if at_start:
            parent.insert(0, ans)
        else:
            parent.append(ans)
        return ans

    def clear_meta_tags(self, doc, tag):
        """Remove every *tag* element from all metadata sections of *doc*."""
        for parent in ('title-info', 'src-title-info', 'publish-info'):
            for x in self.XPath('//fb:%s/fb:%s' % (parent, tag))(doc):
                x.getparent().remove(x)

    def text2fb2(self, parent, text):
        """Append *text* to *parent* as a sequence of <p> elements, one per
        non-blank line; blank lines become <empty-line/>."""
        lines = text.split('\n')
        for line in lines:
            line = line.strip()
            if line:
                p = self.create_tag(parent, 'p', at_start=False)
                p.text = line
            else:
                self.create_tag(parent, 'empty-line', at_start=False)
|
|
|
|
|
|
def get_fb2_data(stream):
    """Return ``(data, zip_file_name)`` for the FB2 document in *stream*.

    Transparently unpacks zipped FB2 containers; ``zip_file_name`` is the
    archive member that was read, or None when the stream was a bare
    FB2 file.
    """
    from ebook_converter.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        archive = ZipFile(stream)
    except BadZipfile:
        # Not a zip container: treat the stream as a plain FB2 file.
        stream.seek(pos)
        return stream.read(), None
    names = archive.namelist()
    # Prefer members with an .fb2 extension; otherwise take whatever the
    # archive contains.
    fb2_names = [name for name in names if name.lower().endswith('.fb2')]
    member = (fb2_names or names)[0]
    return archive.open(member).read(), member
|
|
|
|
|
|
def get_metadata(stream):
    ''' Return fb2 metadata as a L{MetaInformation} object '''

    root = _get_fbroot(get_fb2_data(stream)[0])
    ctx = Context(root)
    book_title = _parse_book_title(root, ctx)
    authors = _parse_authors(root, ctx) or ['Unknown']

    # Fall back to the stream's file name when the document carries no
    # usable title.
    if book_title:
        book_title = str(book_title)
    else:
        book_title = force_unicode(os.path.splitext(
            os.path.basename(getattr(stream, 'name', 'Unknown')))[0])
    mi = MetaInformation(book_title, authors)

    # Each remaining field is parsed best-effort: a malformed element must
    # not prevent the other metadata fields from being read, so failures
    # are deliberately swallowed per parser.
    for parse in (_parse_cover, _parse_comments, _parse_tags, _parse_series,
                  _parse_isbn, _parse_publisher, _parse_pubdate,
                  _parse_language):
        try:
            parse(root, mi, ctx)
        except Exception:
            pass

    return mi
|
|
|
|
|
|
def _parse_authors(root, ctx):
    """Collect author names from exactly one metadata section.

    Only <title-info> is used when it yields an author; <src-title-info>
    and <document-info> are fallbacks, in that order, to keep the result
    consistent. Returns ['Unknown'] when nothing usable is found.
    """
    authors = []
    author = None
    for section in ('title-info', 'src-title-info', 'document-info'):
        for elm in ctx.XPath('//fb:%s/fb:author' % section)(root):
            author = _parse_author(elm, ctx)
            if author:
                authors.append(author)
        # Stop after the first section that produced an author.
        if author:
            break

    return authors or ['Unknown']
|
|
|
|
|
|
def _parse_author(elm_author, ctx):
    """Return the author's display name assembled from the first, middle
    and last name sub-elements, falling back to the nickname when no name
    part is present."""

    def part(tag):
        return ctx.XPath('normalize-space(fb:%s/text())' % tag)(elm_author)

    name = part('first-name')
    middle = part('middle-name')
    last = part('last-name')

    if middle:
        name = (name + ' ' + middle).strip()
    if last:
        name = (name + ' ' + last).strip()

    # No regular name parts at all: try the nickname instead.
    if not name:
        name = part('nickname') or name

    return name
|
|
|
|
|
|
def _parse_book_title(root, ctx):
    """Return the book title from the document metadata.

    <title-info> is mandatory per the FB2 spec and has priority; the
    other sections are backup sources (note the order differs from the
    FB2 documentation).
    """
    sources = ('//fb:title-info/fb:book-title/text()',
               '//fb:publish-info/fb:book-title/text()',
               '//fb:src-title-info/fb:book-title/text()')
    return ctx.XPath('normalize-space(%s|%s|%s)' % sources)(root)
|
|
|
|
|
|
def _parse_cover(root, mi, ctx):
    """Locate the coverpage image reference and load its data into *mi*.

    The XPath picks the first <coverpage>, so <title-info> takes
    precedence with <src-title-info> as the fallback.
    """
    imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/'
                      '@xlink:href), "#")')(root)
    if not imgid:
        return
    try:
        _parse_cover_data(root, imgid, mi, ctx)
    except Exception:
        # Cover extraction is best-effort; ignore broken image data.
        pass
|
|
|
|
|
|
def _parse_cover_data(root, imgid, mi, ctx):
    """Decode the base64 <binary> element with id *imgid* and store the
    result as ``(fmt, data)`` in ``mi.cover_data``."""
    from ebook_converter.ebooks.fb2 import base64_decode
    elm_binary = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
    if not elm_binary:
        return

    mimetype = elm_binary[0].get('content-type', 'image/jpeg')
    mime_extensions = guess_all_extensions(mimetype)

    if not mime_extensions and mimetype.startswith('image/'):
        # Unknown image mime-type: try guessing it from the image id.
        mimetype_fromid = guess_type(imgid)[0]
        if mimetype_fromid and mimetype_fromid.startswith('image/'):
            mime_extensions = guess_all_extensions(mimetype_fromid)

    if not mime_extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" %
               (mimetype, imgid))
        return

    pic_data = elm_binary[0].text
    if pic_data:
        cdata = base64_decode(pic_data.strip())
        # The format is detected from the decoded bytes, not the declared
        # mime-type.
        mi.cover_data = (identify(cdata)[0], cdata)
|
|
|
|
|
|
def _parse_tags(root, mi, ctx):
    """Fill ``mi.tags`` from the <genre> elements of a single section:
    <title-info> takes precedence, <src-title-info> is the fallback."""
    for section in ('title-info', 'src-title-info'):
        # -- i18n Translations-- ?
        genres = ctx.XPath('//fb:%s/fb:genre/text()' % section)(root)
        if genres:
            mi.tags = [str(g) for g in genres]
            break
|
|
|
|
|
|
def _parse_series(root, mi, ctx):
    """Set ``mi.series`` / ``mi.series_index`` from the first <sequence>
    element found in <title-info> or <publish-info>.

    calibre supports only one series, so only the first sequence is used;
    <src-title-info> is deliberately skipped.
    """
    elms_sequence = ctx.XPath('//fb:title-info/fb:sequence[1]'
                              '|//fb:publish-info/fb:sequence[1]')(root)
    if not elms_sequence:
        return
    mi.series = elms_sequence[0].get('name', None)
    if not mi.series:
        return
    try:
        number = elms_sequence[0].get('number', None)
        # Join at most the first two whitespace-separated pieces so values
        # like "1 5" become the float 1.5.
        mi.series_index = float('.'.join(number.split()[:2]))
    except Exception:
        # Missing or unparseable number: keep the default series index.
        pass
|
|
|
|
|
|
def _parse_isbn(root, mi, ctx):
    """Set ``mi.isbn`` from <publish-info>/<isbn> when it validates."""
    isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
    if not isbn:
        return
    # Some producers cram several comma-separated ISBNs into this field,
    # which is not allowed; stick to the first one in that case.
    isbn = isbn.partition(',')[0]
    if check_isbn(isbn):
        mi.isbn = isbn
|
|
|
|
|
|
def _parse_comments(root, mi, ctx):
    """Set ``mi.comments`` from the first <annotation> element found;
    <title-info> takes precedence, <src-title-info> is the fallback."""
    for section in ('title-info', 'src-title-info'):
        annotations = ctx.XPath('//fb:%s/fb:annotation' % section)(root)
        if annotations:
            # TODO: tags i18n, xslt?
            mi.comments = tostring(annotations[0])
            break
|
|
|
|
|
|
def _parse_publisher(root, mi, ctx):
    """Set ``mi.publisher`` from <publish-info>/<publisher>, if present."""
    name = ctx.XPath('string(//fb:publish-info/fb:publisher/'
                     'text())')(root)
    if name:
        mi.publisher = name
|
|
|
|
|
|
def _parse_pubdate(root, mi, ctx):
    """Set ``mi.pubdate`` from <publish-info>/<year>."""
    year = ctx.XPath('number(//fb:publish-info/fb:year/text())')(root)
    # XPath number() yields NaN when the element is missing or not
    # numeric; NaN is never an integer, so the guard also skips that case.
    if year.is_integer():
        # Only the year is available; parse_only_date fills in the rest.
        mi.pubdate = parse_only_date(str(int(year)))
|
|
|
|
|
|
def _parse_language(root, mi, ctx):
    """Copy the <title-info>/<lang> value into ``mi.language`` and
    ``mi.languages``."""
    lang = ctx.XPath('string(//fb:title-info/fb:lang/text())')(root)
    if lang:
        mi.language = lang
        mi.languages = [lang]
|
|
|
|
|
|
def _get_fbroot(raw):
    """Decode *raw* to unicode, parse it as XML and return the root
    element with broken namespace declarations repaired."""
    decoded = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    return ensure_namespace(etree.fromstring(decoded))
|
|
|
|
|
|
def _set_title(title_info, mi, ctx):
    """Write ``mi.title`` into <book-title>, replacing existing ones."""
    if mi.is_null('title'):
        return
    ctx.clear_meta_tags(title_info, 'book-title')
    ctx.get_or_create(title_info, 'book-title').text = mi.title
|
|
|
|
|
|
def _set_comments(title_info, mi, ctx):
    """Write ``mi.comments`` (converted from HTML to plain text) into the
    <annotation> element, replacing any existing annotations."""
    if mi.is_null('comments'):
        return
    from ebook_converter.utils.html2text import html2text
    ctx.clear_meta_tags(title_info, 'annotation')
    annotation = ctx.get_or_create(title_info, 'annotation')
    ctx.text2fb2(annotation, html2text(mi.comments))
|
|
|
|
|
|
def _set_authors(title_info, mi, ctx):
    """Write ``mi.authors`` as <author> elements.

    Single-word names become a <nickname>; otherwise the first word is the
    <first-name>, the second (when three or more words) the <middle-name>,
    and the remainder the <last-name>.
    """
    if mi.is_null('authors'):
        return
    ctx.clear_meta_tags(title_info, 'author')
    # Authors are inserted at the start of <title-info>, so iterate in
    # reverse to preserve the original order in the document.
    for author in reversed(mi.authors):
        parts = author.split()
        if not parts:
            continue
        atag = ctx.create_tag(title_info, 'author')
        if len(parts) == 1:
            ctx.create_tag(atag, 'nickname').text = author
            continue
        first, *rest = parts
        ctx.create_tag(atag, 'first-name').text = first
        if len(rest) > 1:
            ctx.create_tag(atag, 'middle-name',
                           at_start=False).text = rest.pop(0)
        if rest:
            ctx.create_tag(atag, 'last-name',
                           at_start=False).text = ' '.join(rest)
|
|
|
|
|
|
def _set_tags(title_info, mi, ctx):
    """Write ``mi.tags`` as <genre> elements, replacing existing ones."""
    if mi.is_null('tags'):
        return
    ctx.clear_meta_tags(title_info, 'genre')
    for tag_text in mi.tags:
        ctx.create_tag(title_info, 'genre').text = tag_text
|
|
|
|
|
|
def _set_series(title_info, mi, ctx):
    """Write ``mi.series`` / ``mi.series_index`` into <sequence>."""
    if mi.is_null('series'):
        return
    ctx.clear_meta_tags(title_info, 'sequence')
    seq = ctx.get_or_create(title_info, 'sequence')
    seq.set('name', mi.series)
    try:
        number = '%g' % mi.series_index
    except Exception:
        # series_index missing or non-numeric: default to 1.
        number = '1'
    seq.set('number', number)
|
|
|
|
|
|
def _rnd_name(size=8, chars=string.ascii_letters + string.digits):
|
|
return ''.join(random.choice(chars) for x in range(size))
|
|
|
|
|
|
def _rnd_pic_file_name(prefix='calibre_cover_', size=32, ext='jpg'):
    """Return a random image file name of the form <prefix><random>.<ext>."""
    return '%s%s.%s' % (prefix, _rnd_name(size=size), ext)
|
|
|
|
|
|
def _encode_into_jpeg(data):
    """Re-encode image *data* as JPEG and return it base64-encoded."""
    return as_base64_unicode(save_cover_data_to(data))
|
|
|
|
|
|
def _set_cover(title_info, mi, ctx):
    """Embed ``mi.cover_data`` as the <coverpage> image plus a base64
    <binary> element appended to the document root."""
    if mi.is_null('cover_data') or not mi.cover_data[1]:
        return
    coverpage = ctx.get_or_create(title_info, 'coverpage')
    image = ctx.get_or_create(coverpage, 'image')
    href = XLINK('href')
    if href in image.attrib:
        # Reuse the file name an existing cover already points at
        # (stripping the leading '#').
        filename = image.attrib[href][1:]
    else:
        filename = _rnd_pic_file_name('cover')
        image.attrib[href] = '#' + filename
    fb2_root = image.getroottree().getroot()
    binary = ctx.get_or_create(fb2_root, 'binary',
                               attribs={'id': filename},
                               at_start=False)
    binary.attrib['content-type'] = 'image/jpeg'
    binary.text = _encode_into_jpeg(mi.cover_data[1])
|
|
|
|
|
|
def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
    """Write the metadata in *mi* back into the FB2 file in *stream*.

    Handles both bare FB2 and zipped FB2 containers (the container member
    name is preserved on rewrite). ``apply_null`` and ``update_timestamp``
    are accepted for interface compatibility; they are not read here.
    """
    stream.seek(0)
    raw, zip_file_name = get_fb2_data(stream)
    root = _get_fbroot(raw)
    ctx = Context(root)
    desc = ctx.get_or_create(root, 'description')
    ti = ctx.get_or_create(desc, 'title-info')

    # Presumably the inter-element whitespace used for indentation — it is
    # re-applied to every child's tail below. TODO(review): confirm.
    indent = ti.text

    _set_comments(ti, mi, ctx)
    _set_series(ti, mi, ctx)
    _set_tags(ti, mi, ctx)
    _set_authors(ti, mi, ctx)
    _set_title(ti, mi, ctx)
    _set_cover(ti, mi, ctx)

    for child in ti:
        child.tail = indent

    # Apparently there exists FB2 reading software that chokes on the use of
    # single quotes in xml declaration. Sigh. See
    # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
    # Hence the declaration is emitted by hand with double quotes.
    raw = b'<?xml version="1.0" encoding="UTF-8"?>\n'
    raw += etree.tostring(root, method='xml', encoding='utf-8',
                          xml_declaration=False)

    # Rewrite the stream in place with the serialized document.
    stream.seek(0)
    stream.truncate()
    if zip_file_name:
        # The input was a zipped FB2: write back into a fresh zip
        # container under the original member name.
        from ebook_converter.utils.zipfile import ZipFile
        with ZipFile(stream, 'w') as zf:
            zf.writestr(zip_file_name, raw)
    else:
        stream.write(raw)
|
|
|
|
|
|
def ensure_namespace(doc):
    """Return *doc* with bare (namespace-less) <description>/<body>
    elements re-parsed into the proper default namespace.

    Workaround for broken FB2 files produced by convertonlinefree.com. See
    https://bugs.launchpad.net/bugs/1404701
    """
    # any() short-circuits at the first bare tag; the original loop
    # shadowed its own loop variable and its break only exited the inner
    # loop, needlessly scanning the remaining sections.
    bare_tags = any('{' not in elem.tag
                    for tag in ('description', 'body')
                    for elem in doc.findall(tag))
    if bare_tags:
        import re
        raw = etree.tostring(doc, encoding='unicode')
        # Strip the empty xmlns="" declarations so these elements inherit
        # the document's default namespace when re-parsed.
        raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>',
                     raw)
        doc = etree.fromstring(raw)
    return doc
|