1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-21 09:15:54 +01:00
Files
ebook-converter/ebook_converter/ebooks/metadata/fb2.py
gryf ce89f5c9d1 Use the real constants module.
This continues the ongoing refactoring of the calibre code to make it more
readable and to transform it into something more coherent.

In this patch, imports are changed in some modules so that they no longer
pollute their own namespace with symbols from other modules, which were often
themselves imported from yet other modules. Yuck.
2020-05-29 17:04:53 +02:00

464 lines
15 KiB
Python

"""
Read meta information from fb2 files
"""
import functools
import os
import random
import string
from lxml import etree
from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.imghdr import identify
from ebook_converter import guess_type, guess_all_extensions, prints, \
force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode
# XML namespace URIs used by this module, keyed by a short prefix: two
# FictionBook namespaces ('fb2' and 'fb21') plus XLink, which the cover-handling
# code below uses for the image "href" attribute.
NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
              'fb21': 'http://www.gribuser.ru/xml/fictionbook/2.1',
              'xlink': 'http://www.w3.org/1999/xlink'}
# etree.tostring pre-configured with method='text' and encoding='unicode', so
# it returns a unicode string rather than XML bytes.
tostring = functools.partial(etree.tostring, method='text', encoding='unicode')
def XLINK(tag):
    """Return ``tag`` qualified with the XLink namespace.

    The result uses the ``{namespace-uri}local-name`` form, with the URI taken
    from ``NAMESPACES['xlink']``.
    """
    return '{{{}}}{}'.format(NAMESPACES['xlink'], tag)
class Context(object):
    """Namespace-aware helper for querying and editing one FB2 tree.

    The FictionBook namespace is taken from the root element when it declares
    one, otherwise the FB2 2.0 namespace from ``NAMESPACES`` is used. XPath
    expressions compiled through this object can use the ``fb:``, ``fb2:``
    and ``xlink:`` prefixes.
    """

    def __init__(self, root):
        """Work out the FictionBook namespace used by ``root``."""
        try:
            self.fb_ns = root.nsmap[root.prefix] or NAMESPACES['fb2']
        except Exception:
            # Best effort: any problem reading the root's namespace map falls
            # back to the default namespace.
            self.fb_ns = NAMESPACES['fb2']
        self.namespaces = {
            'fb': self.fb_ns,
            'fb2': self.fb_ns,
            'xlink': NAMESPACES['xlink']
        }

    def XPath(self, *args):
        """Compile an XPath expression bound to this context's prefixes."""
        return etree.XPath(*args, namespaces=self.namespaces)

    def get_or_create(self, parent, tag, attribs=None, at_start=True):
        """Return the first ``fb:tag`` child of ``parent``, creating it if
        it does not exist.

        ``attribs`` maps attribute names to the values an existing child must
        carry; a newly created child gets the same attributes. ``at_start``
        is passed on to :meth:`create_tag` and only matters on creation.
        """
        attribs = attribs or {}
        xpathstr = './fb:' + tag
        variables = {}
        for idx, (name, value) in enumerate(attribs.items()):
            # Bind the value as an XPath variable instead of pasting it into
            # the expression text, so that quotes inside the value cannot
            # break or alter the query. str() keeps the string comparison
            # the interpolated form performed.
            var = 'v%d' % idx
            xpathstr += '[@%s=$%s]' % (name, var)
            variables[var] = str(value)
        found = self.XPath(xpathstr)(parent, **variables)
        if found:
            return found[0]
        return self.create_tag(parent, tag, attribs, at_start)

    def create_tag(self, parent, tag, attribs=None, at_start=True):
        """Create an ``fb:tag`` element under ``parent`` and return it.

        The element becomes the first child when ``at_start`` is true and the
        last child otherwise.
        """
        ans = parent.makeelement('{%s}%s' % (self.fb_ns, tag))
        if attribs:
            ans.attrib.update(attribs)
        if at_start:
            parent.insert(0, ans)
        else:
            parent.append(ans)
        return ans

    def clear_meta_tags(self, doc, tag):
        """Remove every ``fb:tag`` found directly under a ``title-info``,
        ``src-title-info`` or ``publish-info`` element of ``doc``.

        The search uses ``//`` paths, i.e. it is not limited to descendants
        of ``doc`` but covers the document ``doc`` belongs to.
        """
        for parent in ('title-info', 'src-title-info', 'publish-info'):
            for x in self.XPath('//fb:%s/fb:%s' % (parent, tag))(doc):
                x.getparent().remove(x)

    def text2fb2(self, parent, text):
        """Append ``text`` to ``parent`` as FB2 paragraphs.

        Every non-blank line becomes a ``<p>`` element holding the stripped
        line; every blank line becomes an ``<empty-line>`` element.
        """
        for line in text.split('\n'):
            line = line.strip()
            if line:
                p = self.create_tag(parent, 'p', at_start=False)
                p.text = line
            else:
                self.create_tag(parent, 'empty-line', at_start=False)
def get_fb2_data(stream):
    """Read the raw FB2 document from ``stream``.

    If the stream is a zip archive, the first member whose name ends in
    ``.fb2`` (case-insensitive) is read, or simply the first member when no
    such name exists. Otherwise the stream is rewound to where it was and
    read as-is.

    Returns a ``(data, zip_file_name)`` tuple; ``zip_file_name`` is ``None``
    when the stream was not a zip archive.
    """
    from ebook_converter.utils.zipfile import ZipFile, BadZipfile
    start = stream.tell()
    try:
        archive = ZipFile(stream)
    except BadZipfile:
        # Not an archive: hand back the plain stream contents.
        stream.seek(start)
        return stream.read(), None
    members = archive.namelist()
    fb2_members = [m for m in members if m.lower().endswith('.fb2')]
    zip_file_name = (fb2_members or members)[0]
    return archive.open(zip_file_name).read(), zip_file_name
def get_metadata(stream):
    """Return the metadata of the fb2 ``stream`` as a MetaInformation object.

    Title and authors are always set: the title falls back to the stream's
    file name without extension (or 'Unknown'), the authors to ['Unknown'].
    All remaining fields are filled in on a best-effort basis; a failure
    while parsing one of them leaves that field out and is not reported.
    """
    raw = get_fb2_data(stream)[0]
    root = _get_fbroot(raw)
    ctx = Context(root)
    book_title = _parse_book_title(root, ctx)
    authors = _parse_authors(root, ctx) or ['Unknown']
    if book_title:
        book_title = str(book_title)
    else:
        # No title in the document: derive one from the stream's name.
        stream_name = getattr(stream, 'name', 'Unknown')
        base_name = os.path.basename(stream_name)
        book_title = force_unicode(os.path.splitext(base_name)[0])
    mi = MetaInformation(book_title, authors)
    optional_parsers = (
        _parse_cover,
        _parse_comments,
        _parse_tags,
        _parse_series,
        _parse_isbn,
        _parse_publisher,
        _parse_pubdate,
        _parse_language,
    )
    for parse in optional_parsers:
        try:
            parse(root, mi, ctx)
        except Exception:
            # Optional field: skip it rather than fail the whole read.
            pass
    return mi
def _parse_authors(root, ctx):
    """Return the list of author names found in the document.

    Authors are taken from exactly one section so the result stays
    consistent: <title-info> is preferred, with <src-title-info> and
    <document-info> as fallbacks, in that order. Author elements that yield
    no name are skipped. When no section provides an author the result is
    ``['Unknown']``.
    """
    authors = []
    for author_sec in ('title-info', 'src-title-info', 'document-info'):
        for au in ctx.XPath('//fb:%s/fb:author' % author_sec)(root):
            author = _parse_author(au, ctx)
            if author:
                authors.append(author)
        # Stop at the first section that produced any author. Testing the
        # accumulated list (and not just the last parsed author) keeps a
        # trailing empty <author> from pulling in the fallback sections.
        if authors:
            break
    # if no author so far
    if not authors:
        authors.append('Unknown')
    return authors
def _parse_author(elm_author, ctx):
    """Return the display name of one <author> element as a string.

    The name is built from the first, middle and last name (in that order),
    skipping the parts that are missing. When none of them is present the
    nickname is used instead; the result is empty when that is missing too.
    """
    def field(name):
        return ctx.XPath('normalize-space(fb:%s/text())' % name)(elm_author)

    first = field('first-name')
    last = field('last-name')
    middle = field('middle-name')
    author = first
    for part in (middle, last):
        if part:
            author = (author + ' ' + part).strip()
    if author:
        return author
    # fallback to nickname
    nick = field('nickname')
    return nick if nick else author
def _parse_book_title(root, ctx):
    """Return the whitespace-normalised book title, or '' if there is none.

    Titles are looked up in <title-info>, <publish-info> and
    <src-title-info>. The three paths are combined into a single XPath union,
    so the title that is returned is the first match in document order.
    """
    sections = ('title-info', 'publish-info', 'src-title-info')
    paths = ['//fb:%s/fb:book-title/text()' % sec for sec in sections]
    expression = 'normalize-space(%s)' % '|'.join(paths)
    return ctx.XPath(expression)(root)
def _parse_cover(root, mi, ctx):
    """Store the cover image referenced by <coverpage> on ``mi``, if any.

    The image id is the part of the first cover image's ``xlink:href`` that
    follows the '#'. Errors while loading the image data are ignored.
    """
    href_expr = ('substring-after(string(//fb:coverpage/fb:image/'
                 '@xlink:href), "#")')
    imgid = ctx.XPath(href_expr)(root)
    if not imgid:
        return
    try:
        _parse_cover_data(root, imgid, mi, ctx)
    except Exception:
        # A broken cover must not prevent reading the other metadata.
        pass
def _parse_cover_data(root, imgid, mi, ctx):
    """Decode the <binary> element with id ``imgid`` into ``mi.cover_data``.

    The element's content-type (default 'image/jpeg') must map to at least
    one known file extension; for unknown image/* types the type guessed from
    ``imgid`` itself is tried as well. If no extension is found a warning is
    printed and nothing is stored. Nothing happens either when no matching
    <binary> exists or when it has no text.
    """
    from ebook_converter.ebooks.fb2 import base64_decode
    matches = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
    if not matches:
        return
    binary = matches[0]
    mimetype = binary.get('content-type', 'image/jpeg')
    extensions = guess_all_extensions(mimetype)
    if not extensions and mimetype.startswith('image/'):
        # Unknown image type: see whether the id looks like a file name.
        guessed = guess_type(imgid)[0]
        if guessed and guessed.startswith('image/'):
            extensions = guess_all_extensions(guessed)
    if not extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" %
               (mimetype, imgid))
        return
    encoded = binary.text
    if encoded:
        cdata = base64_decode(encoded.strip())
        fmt = identify(cdata)[0]
        mi.cover_data = (fmt, cdata)
def _parse_tags(root, mi, ctx):
    """Set ``mi.tags`` from the <genre> elements of the document.

    Genres are taken from one section only to keep them consistent:
    <title-info> if it has any, otherwise <src-title-info>. ``mi.tags`` is
    left untouched when neither section has a genre.
    """
    for section in ('title-info', 'src-title-info'):
        # -- i18n Translations-- ?
        genres = ctx.XPath('//fb:%s/fb:genre/text()' % section)(root)
        if genres:
            mi.tags = [str(genre) for genre in genres]
            return
def _parse_series(root, mi, ctx):
    """Set ``mi.series`` and ``mi.series_index`` from a <sequence> element.

    Only one series is supported, so the first <sequence> returned for
    <title-info> or <publish-info> is used (<src-title-info> is not
    consulted). The index comes from the ``number`` attribute, whose first
    two whitespace-separated parts are joined with a '.'; it is left unset
    when the attribute is missing or not numeric.
    """
    sections = ('title-info', 'publish-info')
    expression = '|'.join('//fb:%s/fb:sequence[1]' % sec for sec in sections)
    sequences = ctx.XPath(expression)(root)
    if not sequences:
        return
    first = sequences[0]
    mi.series = first.get('name', None)
    if not mi.series:
        return
    try:
        number_parts = first.get('number', None).split()[:2]
        mi.series_index = float('.'.join(number_parts))
    except Exception:
        # Missing or malformed number: keep the series without an index.
        pass
def _parse_isbn(root, mi, ctx):
    """Set ``mi.isbn`` from <publish-info>/<isbn> when it passes check_isbn.

    Some files list several comma-separated ISBNs in this field although
    that is not allowed; in that case only the first one is considered.
    """
    find_isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')
    isbn = find_isbn(root)
    if not isbn:
        return
    if ',' in isbn:
        isbn = isbn.split(',', 1)[0]
    if check_isbn(isbn):
        mi.isbn = isbn
def _parse_comments(root, mi, ctx):
    """Set ``mi.comments`` to the text of the first <annotation> element.

    The annotation is taken from one section only: <title-info> if it has
    one, otherwise <src-title-info>.
    """
    for section in ('title-info', 'src-title-info'):
        annotations = ctx.XPath('//fb:%s/fb:annotation' % section)(root)
        if annotations:
            mi.comments = tostring(annotations[0])
            # TODO: tags i18n, xslt?
            return
def _parse_publisher(root, mi, ctx):
    """Set ``mi.publisher`` from <publish-info>/<publisher>, if non-empty."""
    find_publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/'
                               'text())')
    publisher = find_publisher(root)
    if not publisher:
        return
    mi.publisher = publisher
def _parse_pubdate(root, mi, ctx):
    """Set ``mi.pubdate`` from the year given in <publish-info>/<year>.

    Nothing is set unless the year evaluates to a whole number (a missing or
    non-numeric year evaluates to NaN).
    """
    year = ctx.XPath('number(//fb:publish-info/fb:year/text())')(root)
    if not float.is_integer(year):
        return
    # Only the year is available; day and month are whatever
    # parse_only_date chooses for a bare year.
    mi.pubdate = parse_only_date(str(int(year)))
def _parse_language(root, mi, ctx):
    """Set ``mi.language`` and ``mi.languages`` from <title-info>/<lang>.

    Both attributes are left untouched when the element is missing or empty.
    """
    language = ctx.XPath('string(//fb:title-info/fb:lang/text())')(root)
    if not language:
        return
    mi.language = language
    mi.languages = [language]
def _get_fbroot(raw):
    """Parse raw FB2 data and return its root element.

    The data is first converted to unicode (with encoding declarations
    stripped), then parsed, and finally passed through ensure_namespace().
    """
    text = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    return ensure_namespace(etree.fromstring(text))
def _set_title(title_info, mi, ctx):
    """Replace the book title with ``mi.title`` unless that field is null.

    Existing <book-title> elements are removed first, then a single new one
    is written to ``title_info``.
    """
    if mi.is_null('title'):
        return
    ctx.clear_meta_tags(title_info, 'book-title')
    title_tag = ctx.get_or_create(title_info, 'book-title')
    title_tag.text = mi.title
def _set_comments(title_info, mi, ctx):
    """Replace the annotation with ``mi.comments`` unless that is null.

    The comments are run through html2text and written as FB2 paragraphs
    into a fresh <annotation> element of ``title_info``.
    """
    if mi.is_null('comments'):
        return
    from ebook_converter.utils.html2text import html2text
    ctx.clear_meta_tags(title_info, 'annotation')
    annotation = ctx.get_or_create(title_info, 'annotation')
    ctx.text2fb2(annotation, html2text(mi.comments))
def _set_authors(title_info, mi, ctx):
    """Replace the <author> elements with ``mi.authors`` unless null.

    Each author name is split on whitespace: a single word is stored as the
    nickname; otherwise the first word becomes the first name, the second
    word the middle name (only when at least three words exist) and the
    remaining words the last name. Blank names are skipped.
    """
    if mi.is_null('authors'):
        return
    ctx.clear_meta_tags(title_info, 'author')
    # Every author is inserted at the start of title_info, so walk the list
    # backwards to end up with the original order.
    for author in reversed(mi.authors):
        words = author.split()
        if not words:
            continue
        atag = ctx.create_tag(title_info, 'author')
        if len(words) == 1:
            ctx.create_tag(atag, 'nickname').text = author
            continue
        ctx.create_tag(atag, 'first-name').text = words[0]
        rest = words[1:]
        if len(rest) > 1:
            middle = ctx.create_tag(atag, 'middle-name', at_start=False)
            middle.text = rest[0]
            rest = rest[1:]
        if rest:
            last = ctx.create_tag(atag, 'last-name', at_start=False)
            last.text = ' '.join(rest)
def _set_tags(title_info, mi, ctx):
    """Replace the <genre> elements with ``mi.tags`` unless that is null.

    NOTE(review): each genre is inserted at the start of ``title_info``, so
    the elements end up in the reverse order of ``mi.tags`` - confirm
    whether that is intended.
    """
    if mi.is_null('tags'):
        return
    ctx.clear_meta_tags(title_info, 'genre')
    for tag_name in mi.tags:
        ctx.create_tag(title_info, 'genre').text = tag_name
def _set_series(title_info, mi, ctx):
    """Replace the <sequence> element with the series from ``mi``.

    Nothing is done when ``mi.series`` is null. The ``number`` attribute is
    ``mi.series_index`` formatted with '%g', or '1' if that fails.
    """
    if mi.is_null('series'):
        return
    ctx.clear_meta_tags(title_info, 'sequence')
    sequence = ctx.get_or_create(title_info, 'sequence')
    sequence.set('name', mi.series)
    try:
        sequence.set('number', '%g' % mi.series_index)
    except Exception:
        # No usable index: default to the first entry of the series.
        sequence.set('number', '1')
def _rnd_name(size=8, chars=string.ascii_letters + string.digits):
    """Return a string of ``size`` characters drawn at random from ``chars``.

    By default the characters are ASCII letters and digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)
def _rnd_pic_file_name(prefix='calibre_cover_', size=32, ext='jpg'):
    """Return a file name of the form ``<prefix><size random chars>.<ext>``."""
    return ''.join([prefix, _rnd_name(size=size), '.', ext])
def _encode_into_jpeg(data):
    """Return image ``data`` as base64 text after save_cover_data_to().

    NOTE(review): presumably save_cover_data_to() re-encodes the image as
    JPEG when called without a target - confirm against that helper.
    """
    return as_base64_unicode(save_cover_data_to(data))
def _set_cover(title_info, mi, ctx):
    """Write the cover image from ``mi.cover_data`` into the document.

    Nothing is done when ``mi`` carries no cover data. The <coverpage>/<image>
    element is reused if present; when it already references an id, the
    <binary> element with that id is reused or created, otherwise a new
    random id is generated. The binary is stored with content-type
    'image/jpeg'.
    """
    if mi.is_null('cover_data') or not mi.cover_data[1]:
        return
    coverpage = ctx.get_or_create(title_info, 'coverpage')
    image = ctx.get_or_create(coverpage, 'image')
    href = XLINK('href')
    if href in image.attrib:
        # Drop the first character of the reference (the leading '#').
        cim_filename = image.attrib[href][1:]
    else:
        cim_filename = _rnd_pic_file_name('cover')
        image.attrib[href] = '#' + cim_filename
    fb2_root = image.getroottree().getroot()
    binary = ctx.get_or_create(fb2_root, 'binary',
                               attribs={'id': cim_filename},
                               at_start=False)
    binary.attrib['content-type'] = 'image/jpeg'
    binary.text = _encode_into_jpeg(mi.cover_data[1])
def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
    """Write the metadata in ``mi`` into the FB2 file in ``stream``.

    The stream is read from the start, the <title-info> section is updated
    (comments, series, tags, authors, title and cover, for the fields that
    are set on ``mi``) and the stream is then overwritten with the result.
    If the input was a zip archive, a new archive with the same member name
    is written. ``apply_null`` and ``update_timestamp`` are accepted but not
    used by this function.
    """
    stream.seek(0)
    raw, zip_file_name = get_fb2_data(stream)
    root = _get_fbroot(raw)
    ctx = Context(root)
    desc = ctx.get_or_create(root, 'description')
    ti = ctx.get_or_create(desc, 'title-info')
    # Remember the whitespace that precedes the first child so it can be
    # reused as the separator after every child once the edits are done.
    indent = ti.text
    # Order matters: most setters insert at the start of <title-info>.
    setters = (_set_comments, _set_series, _set_tags, _set_authors,
               _set_title, _set_cover)
    for setter in setters:
        setter(ti, mi, ctx)
    for child in ti:
        child.tail = indent
    # Apparently there exists FB2 reading software that chokes on the use of
    # single quotes in xml declaration. Sigh. See
    # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
    declaration = b'<?xml version="1.0" encoding="UTF-8"?>\n'
    body = etree.tostring(root, method='xml', encoding='utf-8',
                          xml_declaration=False)
    raw = declaration + body
    stream.seek(0)
    stream.truncate()
    if not zip_file_name:
        stream.write(raw)
        return
    from ebook_converter.utils.zipfile import ZipFile
    with ZipFile(stream, 'w') as zf:
        zf.writestr(zip_file_name, raw)
def ensure_namespace(doc):
    """Return ``doc`` with empty namespace overrides removed.

    Workaround for broken FB2 files produced by convertonlinefree.com, see
    https://bugs.launchpad.net/bugs/1404701. If a <description> or <body>
    child of ``doc`` has a tag without a namespace, the document is
    serialised, the empty ``xmlns`` attribute is stripped from those
    opening tags and the result is parsed again. Otherwise ``doc`` is
    returned unchanged.
    """
    bare_tags = any(
        '{' not in child.tag
        for name in ('description', 'body')
        for child in doc.findall(name)
    )
    if not bare_tags:
        return doc
    import re
    raw = etree.tostring(doc, encoding='unicode')
    raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>',
                 raw)
    return etree.fromstring(raw)