diff --git a/ebook_converter/ebooks/fb2/__init__.py b/ebook_converter/ebooks/fb2/__init__.py new file mode 100644 index 0000000..e42aba2 --- /dev/null +++ b/ebook_converter/ebooks/fb2/__init__.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +def base64_decode(raw): + from io import BytesIO + from polyglot.binary import from_base64_bytes + + # First try the python implementation as it is faster + try: + return from_base64_bytes(raw) + except Exception: + pass + + # Try a more robust version (adapted from FBReader sources) + A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=') + raw = bytearray(raw) + out = BytesIO() + pos = 0 + while pos < len(raw): + tot = 0 + i = 0 + while i < 4 and pos < len(raw): + byt = raw[pos] + pos += 1 + num = 0 + if A <= byt <= Z: + num = byt - A + elif a <= byt <= z: + num = byt - a + 26 + elif zero <= byt <= nine: + num = byt - zero + 52 + else: + num = {plus:62, slash:63, equal:64}.get(byt, None) + if num is None: + # Ignore this byte + continue + tot += num << (6 * (3 - i)) + i += 1 + triple = bytearray(3) + for j in (2, 1, 0): + triple[j] = tot & 0xff + tot >>= 8 + out.write(bytes(triple)) + return out.getvalue() diff --git a/ebook_converter/ebooks/fb2/fb2ml.py b/ebook_converter/ebooks/fb2/fb2ml.py new file mode 100644 index 0000000..d8a0098 --- /dev/null +++ b/ebook_converter/ebooks/fb2/fb2ml.py @@ -0,0 +1,574 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into FB2 markup +''' + +import re, textwrap, uuid +from datetime import datetime + +from lxml import etree + +from calibre import prepare_string_for_xml +from calibre.constants import __appname__, __version__ +from calibre.utils.localization import lang_as_iso639_1 +from calibre.utils.xml_parse import safe_xml_fromstring +from calibre.utils.img import save_cover_data_to +from calibre.ebooks.oeb.base import urlnormalize +from polyglot.builtins import unicode_type, string_or_bytes, range, filter +from polyglot.binary import as_base64_unicode +from polyglot.urllib import urlparse + + +class FB2MLizer(object): + ''' + Todo: * Include more FB2 specific tags in the conversion. + * Handle notes and anchor links. + ''' + + def __init__(self, log): + self.log = log + self.reset_state() + + def reset_state(self): + # Used to ensure text and tags are always within

<p> and </p>
+        self.in_p = False
+        # Mapping of image names. OEB allows for images to have the same name but be stored
+        # in different directories. FB2 images are all in a flat layout so we rename all images
+        # into a sequential numbering system to ensure there are no collisions between image names.
+        self.image_hrefs = {}
+        # Mapping of toc item hrefs to their anchor ids and section levels
+        self.toc = {}
+        # Used to see whether a new <section> needs to be opened
+        self.section_level = 0
+
+    def extract_content(self, oeb_book, opts):
+        self.log.info('Converting XHTML to FB2 markup...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        self.reset_state()
+
+        # Used for adding <section>
s and s to allow readers + # to generate toc from the document. + if self.opts.sectionize == 'toc': + self.create_flat_toc(self.oeb_book.toc, 1) + + return self.fb2mlize_spine() + + def fb2mlize_spine(self): + output = ( + self.fb2_header(), + self.get_text(), + self.fb2mlize_images(), + self.fb2_footer(), + ) + output = self.clean_text('\n'.join(output)) + + if self.opts.pretty_print: + output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True) + + return '<?xml version="1.0" encoding="UTF-8"?>\n' + output + + def clean_text(self, text): + # Remove pointless tags, but keep their contents. + text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)</\1>', r'\2', text) + + # Clean up paragraphs endings. + text = re.sub(r'(?mu)\s+</p>', '</p>', text) + # Condense empty paragraphs into a line break. + text = re.sub(r'(?mu)(?:<p></p>\s*){3,}', '<empty-line/>', text) + # Remove empty paragraphs. + text = re.sub(r'(?mu)<p></p>\s*', '', text) + # Put the paragraph following a paragraph on a separate line. + text = re.sub(r'(?mu)</p>\s*<p>', '</p>\n<p>', text) + + if self.opts.insert_blank_line: + text = re.sub(r'(?mu)</p>', '</p><empty-line/>', text) + + # Clean up title endings. + text = re.sub(r'(?mu)\s+', '', text) + # Remove empty title elements. + text = re.sub(r'(?mu)\s*', '', text) + # Put the paragraph following a title on a separate line. + text = re.sub(r'(?mu)\s*

<p>', '\n<p>', text)
+
+        # Put line breaks between paragraphs on a separate line.
+        text = re.sub(r'(?mu)</p>\s*<empty-line/>', '</p>\n<empty-line/>', text)
+        text = re.sub(r'(?mu)<empty-line/>\s*<p>', '<empty-line/>\n<p>', text)
+
+        # Remove empty sections.
+        text = re.sub(r'(?mu)<section>\s*</section>', '', text)
+        # Clean up section starts and ends.
+        text = re.sub(r'(?mu)\s*<section>', '\n<section>', text)
+        text = re.sub(r'(?mu)<section>\s*', '<section>\n', text)
+        text = re.sub(r'(?mu)\s*</section>', '\n</section>', text)
+        text = re.sub(r'(?mu)</section>\s*', '</section>
\n', text) + + return text + + def fb2_header(self): + from calibre.ebooks.oeb.base import OPF + metadata = {} + metadata['title'] = self.oeb_book.metadata.title[0].value + metadata['appname'] = __appname__ + metadata['version'] = __version__ + metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) + if self.oeb_book.metadata.language: + lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value) + if not lc: + lc = self.oeb_book.metadata.language[0].value + metadata['lang'] = lc or 'en' + else: + metadata['lang'] = u'en' + metadata['id'] = None + metadata['cover'] = self.get_cover() + metadata['genre'] = self.opts.fb2_genre + + metadata['author'] = '' + for auth in self.oeb_book.metadata.creator: + author_first = '' + author_middle = '' + author_last = '' + author_parts = auth.value.split(' ') + if len(author_parts) == 1: + author_last = author_parts[0] + elif len(author_parts) == 2: + author_first = author_parts[0] + author_last = author_parts[1] + else: + author_first = author_parts[0] + author_middle = ' '.join(author_parts[1:-1]) + author_last = author_parts[-1] + metadata['author'] += '' + metadata['author'] += '%s' % prepare_string_for_xml(author_first) + if author_middle: + metadata['author'] += '%s' % prepare_string_for_xml(author_middle) + metadata['author'] += '%s' % prepare_string_for_xml(author_last) + metadata['author'] += '' + if not metadata['author']: + metadata['author'] = '' + + metadata['keywords'] = '' + tags = list(map(unicode_type, self.oeb_book.metadata.subject)) + if tags: + tags = ', '.join(prepare_string_for_xml(x) for x in tags) + metadata['keywords'] = '%s'%tags + + metadata['sequence'] = '' + if self.oeb_book.metadata.series: + index = '1' + if self.oeb_book.metadata.series_index: + index = self.oeb_book.metadata.series_index[0] + metadata['sequence'] = '' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index) + + year = publisher = isbn = '' + identifiers = self.oeb_book.metadata['identifier'] + for x in identifiers: + if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'): + metadata['id'] = unicode_type(x).split(':')[-1] + break + if metadata['id'] is None: + self.log.warn('No UUID identifier found') + metadata['id'] = unicode_type(uuid.uuid4()) + + try: + date = self.oeb_book.metadata['date'][0] + except IndexError: + pass + else: + year = '%s' % prepare_string_for_xml(date.value.partition('-')[0]) + + try: + publisher = self.oeb_book.metadata['publisher'][0] + except IndexError: + pass + else: + publisher = '%s' % prepare_string_for_xml(publisher.value) + + for x in identifiers: + if x.get(OPF('scheme'), None).lower() == 'isbn': + isbn = '%s' % prepare_string_for_xml(x.value) + + metadata['year'], metadata['isbn'], metadata['publisher'] = year, isbn, publisher + for key, value in metadata.items(): + if key not in ('author', 'cover', 'sequence', 'keywords', 'year', 'publisher', 'isbn'): + metadata[key] = prepare_string_for_xml(value) + + try: + comments = self.oeb_book.metadata['description'][0] + except Exception: + metadata['comments'] = '' + else: + from calibre.utils.html2text import html2text + metadata['comments'] = '

<annotation><p>{}</p></annotation>

'.format(prepare_string_for_xml(html2text(comments.value).strip())) + + # Keep the indentation level of the description the same as the body. + header = textwrap.dedent('''\ + + + + %(genre)s + %(author)s + %(title)s + %(cover)s + %(lang)s + %(keywords)s + %(sequence)s + %(comments)s + + + %(author)s + %(appname)s %(version)s + %(date)s + %(id)s + 1.0 + + + %(publisher)s + %(year)s + %(isbn)s + + ''') % metadata + + # Remove empty lines. + return '\n'.join(filter(unicode_type.strip, header.splitlines())) + + def fb2_footer(self): + return '' + + def get_cover(self): + from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES + + cover_href = None + + # Get the raster cover if it's available. + if self.oeb_book.metadata.cover and unicode_type(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + id = unicode_type(self.oeb_book.metadata.cover[0]) + cover_item = self.oeb_book.manifest.ids[id] + if cover_item.media_type in OEB_RASTER_IMAGES: + cover_href = cover_item.href + else: + # Figure out if we have a title page or a cover page + page_name = '' + if 'titlepage' in self.oeb_book.guide: + page_name = 'titlepage' + elif 'cover' in self.oeb_book.guide: + page_name = 'cover' + + if page_name: + cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href] + # Get the first image in the page + for img in cover_item.xpath('//img'): + cover_href = cover_item.abshref(img.get('src')) + break + + if cover_href: + # Only write the image tag if it is in the manifest. + if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs: + self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs) + return '' % self.image_hrefs[cover_href] + + return '' + + def get_text(self): + from calibre.ebooks.oeb.base import XHTML + from calibre.ebooks.oeb.stylizer import Stylizer + text = [''] + + # Create main section if there are no others to create + if self.opts.sectionize == 'nothing': + text.append('
<section>')
+            self.section_level += 1
+
+        for item in self.oeb_book.spine:
+            self.log.debug('Converting %s to FictionBook2 XML' % item.href)
+            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
+
+            # Start a <section> if we must sectionize each file or if the TOC references this page
+            page_section_open = False
+            if self.opts.sectionize == 'files' or None in self.toc.get(item.href, ()):
+                text.append('<section>')
+                page_section_open = True
+                self.section_level += 1
+
+            text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
+
+            if page_section_open:
+                text.append('</section>')
+                self.section_level -= 1
+
+        # Close any open sections
+        while self.section_level > 0:
+            text.append('</section>
') + self.section_level -= 1 + + text.append('') + return ''.join(text) + + def fb2mlize_images(self): + ''' + This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. + ''' + from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES + + images = [] + for item in self.oeb_book.manifest: + # Don't write the image if it's not referenced in the document's text. + if item.href not in self.image_hrefs: + continue + if item.media_type in OEB_RASTER_IMAGES: + try: + if item.media_type not in ('image/jpeg', 'image/png'): + imdata = save_cover_data_to(item.data, compression_quality=70) + raw_data = as_base64_unicode(imdata) + content_type = 'image/jpeg' + else: + raw_data = as_base64_unicode(item.data) + content_type = item.media_type + # Don't put the encoded image on a single line. + step = 72 + data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step)) + images.append('%s' % (self.image_hrefs[item.href], content_type, data)) + except Exception as e: + self.log.error('Error: Could not include file %s because ' + '%s.' % (item.href, e)) + return '\n'.join(images) + + def create_flat_toc(self, nodes, level): + for item in nodes: + href, mid, id = item.href.partition('#') + if not id: + self.toc[href] = {None: 'page'} + else: + if not self.toc.get(href, None): + self.toc[href] = {} + self.toc[href][id] = level + self.create_flat_toc(item.nodes, level + 1) + + def ensure_p(self): + if self.in_p: + return [], [] + else: + self.in_p = True + return ['

<p>'], ['p']
+
+    def close_open_p(self, tags):
+        text = ['']
+        added_p = False
+
+        if self.in_p:
+            # Close all open tags up to and including p, then reopen them.
+            closed_tags = []
+            tags.reverse()
+            for t in tags:
+                text.append('</%s>' % t)
+                closed_tags.append(t)
+                if t == 'p':
+                    break
+            closed_tags.reverse()
+            for t in closed_tags:
+                text.append('<%s>' % t)
+        else:
+            text.append('<p>

') + added_p = True + self.in_p = True + + return text, added_p + + def handle_simple_tag(self, tag, tags): + s_out = [] + s_tags = [] + if tag not in tags: + p_out, p_tags = self.ensure_p() + s_out += p_out + s_tags += p_tags + s_out.append('<%s>' % tag) + s_tags.append(tag) + return s_out, s_tags + + def dump_text(self, elem_tree, stylizer, page, tag_stack=[]): + ''' + This function is intended to be used in a recursive manner. dump_text will + run though all elements in the elem_tree and call itself on each element. + + self.image_hrefs will be populated by calling this function. + + @param elem_tree: etree representation of XHTML content to be transformed. + @param stylizer: Used to track the style of elements within the tree. + @param page: OEB page used to determine absolute urls. + @param tag_stack: List of open FB2 tags to take into account. + + @return: List of string representing the XHTML converted to FB2 markup. + ''' + from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace + elem = elem_tree + + # Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace. + if not isinstance(elem_tree.tag, string_or_bytes) or namespace(elem_tree.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [] + + style = stylizer.style(elem_tree) + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + if hasattr(elem, 'tail') and elem.tail: + return [elem.tail] + return [] + + # FB2 generated output. + fb2_out = [] + # FB2 tags in the order they are opened. This will be used to close the tags. + tags = [] + # First tag in tree + tag = barename(elem_tree.tag) + # Number of blank lines above tag + try: + ems = int(round((float(style.marginTop) / style.fontSize) - 1)) + if ems < 0: + ems = 0 + except: + ems = 0 + + # Convert TOC entries to s and add <section>s + if self.opts.sectionize == 'toc': + # A section cannot be a child of any other element than another section, + # so leave the tag alone if there are parents + if not tag_stack: + # There are two reasons to start a new section here: the TOC pointed to + # this page (then we use the first non-<body> on the page as a <title>), or + # the TOC pointed to a specific element + newlevel = 0 + toc_entry = self.toc.get(page.href, None) + if toc_entry is not None: + if None in toc_entry: + if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text: + newlevel = 1 + self.toc[page.href] = None + if not newlevel and elem_tree.attrib.get('id', None) is not None: + newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None) + + # Start a new section if necessary + if newlevel: + while newlevel <= self.section_level: + fb2_out.append('</section>') + self.section_level -= 1 + fb2_out.append('<section>') + self.section_level += 1 + fb2_out.append('<title>') + tags.append('title') + if self.section_level == 0: + # If none of the prior processing made a section, make one now to be FB2 spec compliant + fb2_out.append('<section>') + self.section_level += 1 + + # Process the XHTML tag and styles. Converted to an FB2 tag. + # Use individual if statement not if else. There can be + # only one XHTML tag but it can have multiple styles. + if tag == 'img' and elem_tree.attrib.get('src', None): + # Only write the image tag if it is in the manifest. 
+ ihref = urlnormalize(page.abshref(elem_tree.attrib['src'])) + if ihref in self.oeb_book.manifest.hrefs: + if ihref not in self.image_hrefs: + self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs) + p_txt, p_tag = self.ensure_p() + fb2_out += p_txt + tags += p_tag + fb2_out.append('<image l:href="#%s"/>' % self.image_hrefs[ihref]) + else: + self.log.warn(u'Ignoring image not in manifest: %s' % ihref) + if tag in ('br', 'hr') or ems >= 1: + if ems < 1: + multiplier = 1 + else: + multiplier = ems + if self.in_p: + closed_tags = [] + open_tags = tag_stack+tags + open_tags.reverse() + for t in open_tags: + fb2_out.append('</%s>' % t) + closed_tags.append(t) + if t == 'p': + break + fb2_out.append('<empty-line/>' * multiplier) + closed_tags.reverse() + for t in closed_tags: + fb2_out.append('<%s>' % t) + else: + fb2_out.append('<empty-line/>' * multiplier) + if tag in ('div', 'li', 'p'): + p_text, added_p = self.close_open_p(tag_stack+tags) + fb2_out += p_text + if added_p: + tags.append('p') + if tag == 'a' and elem_tree.attrib.get('href', None): + # Handle only external links for now + if urlparse(elem_tree.attrib['href']).netloc: + p_txt, p_tag = self.ensure_p() + fb2_out += p_txt + tags += p_tag + fb2_out.append('<a l:href="%s">' % urlnormalize(elem_tree.attrib['href'])) + tags.append('a') + if tag == 'b' or style['font-weight'] in ('bold', 'bolder'): + s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) + fb2_out += s_out + tags += s_tags + if tag == 'i' or style['font-style'] == 'italic': + s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags) + fb2_out += s_out + tags += s_tags + if tag in ('del', 'strike') or style['text-decoration'] == 'line-through': + s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags) + fb2_out += s_out + tags += s_tags + if tag == 'sub': + s_out, s_tags = self.handle_simple_tag('sub', tag_stack+tags) + fb2_out += s_out + tags += s_tags + if tag == 'sup': + s_out, s_tags = self.handle_simple_tag('sup', tag_stack+tags) + fb2_out += s_out + tags += s_tags + + # Process element text. + if hasattr(elem_tree, 'text') and elem_tree.text: + if not self.in_p: + fb2_out.append('<p>') + fb2_out.append(prepare_string_for_xml(elem_tree.text)) + if not self.in_p: + fb2_out.append('</p>') + + # Process sub-elements. + for item in elem_tree: + fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags) + + # Close open FB2 tags. + tags.reverse() + fb2_out += self.close_tags(tags) + + # Process element text that comes after the close of the XHTML tag but before the next XHTML tag. 
+ if hasattr(elem_tree, 'tail') and elem_tree.tail: + if not self.in_p: + fb2_out.append('<p>') + fb2_out.append(prepare_string_for_xml(elem_tree.tail)) + if not self.in_p: + fb2_out.append('</p>') + + return fb2_out + + def close_tags(self, tags): + text = [] + for tag in tags: + text.append('</%s>' % tag) + if tag == 'p': + self.in_p = False + + return text diff --git a/ebook_converter/ebooks/metadata/fb2.py b/ebook_converter/ebooks/metadata/fb2.py new file mode 100644 index 0000000..ceaf047 --- /dev/null +++ b/ebook_converter/ebooks/metadata/fb2.py @@ -0,0 +1,456 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\ + '2008, Anatoly Shipitsin <norguhtar at gmail.com>' +'''Read meta information from fb2 files''' + +import os, random +from functools import partial +from string import ascii_letters, digits + +from lxml import etree + +from calibre.utils.date import parse_only_date +from calibre.utils.img import save_cover_data_to +from calibre.utils.xml_parse import safe_xml_fromstring +from calibre.utils.imghdr import identify +from calibre import guess_type, guess_all_extensions, prints, force_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn +from calibre.ebooks.chardet import xml_to_unicode +from polyglot.builtins import unicode_type +from polyglot.binary import as_base64_unicode + + +NAMESPACES = { + 'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0', + 'fb21' : 'http://www.gribuser.ru/xml/fictionbook/2.1', + 'xlink' : 'http://www.w3.org/1999/xlink' +} + +tostring = partial(etree.tostring, method='text', encoding='unicode') + + +def XLINK(tag): + return '{%s}%s'%(NAMESPACES['xlink'], tag) + + +class Context(object): + + def __init__(self, root): + try: + self.fb_ns = root.nsmap[root.prefix] or NAMESPACES['fb2'] + except Exception: + self.fb_ns = NAMESPACES['fb2'] + self.namespaces = { + 'fb': self.fb_ns, + 'fb2': self.fb_ns, + 'xlink': NAMESPACES['xlink'] + } + + def XPath(self, *args): + return etree.XPath(*args, namespaces=self.namespaces) + + def get_or_create(self, parent, tag, attribs={}, at_start=True): + xpathstr='./fb:'+tag + for n, v in attribs.items(): + xpathstr += '[@%s="%s"]' % (n, v) + ans = self.XPath(xpathstr)(parent) + if ans: + ans = ans[0] + else: + ans = self.create_tag(parent, tag, attribs, at_start) + return ans + + def create_tag(self, parent, tag, attribs={}, at_start=True): + ans = parent.makeelement('{%s}%s' % (self.fb_ns, tag)) + ans.attrib.update(attribs) + if at_start: + parent.insert(0, ans) + else: + parent.append(ans) + return ans + + def clear_meta_tags(self, doc, tag): + for parent in ('title-info', 'src-title-info', 'publish-info'): + for x in self.XPath('//fb:%s/fb:%s'%(parent, tag))(doc): + x.getparent().remove(x) + + def text2fb2(self, parent, text): + lines = text.split('\n') + for line in lines: + line = line.strip() + if line: + p = self.create_tag(parent, 'p', at_start=False) + p.text = line + else: + self.create_tag(parent, 'empty-line', at_start=False) + + +def get_fb2_data(stream): + from calibre.utils.zipfile import ZipFile, BadZipfile + pos = stream.tell() + try: + zf = ZipFile(stream) + except BadZipfile: + stream.seek(pos) + ans = stream.read() + zip_file_name = None + else: + names = zf.namelist() + names = [x for x in names if x.lower().endswith('.fb2')] or names + zip_file_name = names[0] + ans = zf.open(zip_file_name).read() + return 
ans, zip_file_name + + +def get_metadata(stream): + ''' Return fb2 metadata as a L{MetaInformation} object ''' + + root = _get_fbroot(get_fb2_data(stream)[0]) + ctx = Context(root) + book_title = _parse_book_title(root, ctx) + authors = _parse_authors(root, ctx) or [_('Unknown')] + + # fallback for book_title + if book_title: + book_title = unicode_type(book_title) + else: + book_title = force_unicode(os.path.splitext( + os.path.basename(getattr(stream, 'name', + _('Unknown'))))[0]) + mi = MetaInformation(book_title, authors) + + try: + _parse_cover(root, mi, ctx) + except: + pass + try: + _parse_comments(root, mi, ctx) + except: + pass + try: + _parse_tags(root, mi, ctx) + except: + pass + try: + _parse_series(root, mi, ctx) + except: + pass + try: + _parse_isbn(root, mi, ctx) + except: + pass + try: + _parse_publisher(root, mi, ctx) + except: + pass + try: + _parse_pubdate(root, mi, ctx) + except: + pass + + try: + _parse_language(root, mi, ctx) + except: + pass + + return mi + + +def _parse_authors(root, ctx): + authors = [] + # pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent! + # Those are fallbacks: <src-title-info>, <document-info> + author = None + for author_sec in ['title-info', 'src-title-info', 'document-info']: + for au in ctx.XPath('//fb:%s/fb:author'%author_sec)(root): + author = _parse_author(au, ctx) + if author: + authors.append(author) + if author: + break + + # if no author so far + if not authors: + authors.append(_('Unknown')) + + return authors + + +def _parse_author(elm_author, ctx): + """ Returns a list of display author and sortable author""" + + xp_templ = 'normalize-space(fb:%s/text())' + + author = ctx.XPath(xp_templ % 'first-name')(elm_author) + lname = ctx.XPath(xp_templ % 'last-name')(elm_author) + mname = ctx.XPath(xp_templ % 'middle-name')(elm_author) + + if mname: + author = (author + ' ' + mname).strip() + if lname: + author = (author + ' ' + lname).strip() + + # fallback to nickname + if not author: + nname = ctx.XPath(xp_templ % 'nickname')(elm_author) + if nname: + author = nname + + return author + + +def _parse_book_title(root, ctx): + # <title-info> has a priority. (actually <title-info> is mandatory) + # other are backup solution (sequence is important. 
Other than in fb2-doc) + xp_ti = '//fb:title-info/fb:book-title/text()' + xp_pi = '//fb:publish-info/fb:book-title/text()' + xp_si = '//fb:src-title-info/fb:book-title/text()' + book_title = ctx.XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root) + + return book_title + + +def _parse_cover(root, mi, ctx): + # pickup from <title-info>, if not exists it fallbacks to <src-title-info> + imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/@xlink:href), "#")')(root) + if imgid: + try: + _parse_cover_data(root, imgid, mi, ctx) + except: + pass + + +def _parse_cover_data(root, imgid, mi, ctx): + from calibre.ebooks.fb2 import base64_decode + elm_binary = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root) + if elm_binary: + mimetype = elm_binary[0].get('content-type', 'image/jpeg') + mime_extensions = guess_all_extensions(mimetype) + + if not mime_extensions and mimetype.startswith('image/'): + mimetype_fromid = guess_type(imgid)[0] + if mimetype_fromid and mimetype_fromid.startswith('image/'): + mime_extensions = guess_all_extensions(mimetype_fromid) + + if mime_extensions: + pic_data = elm_binary[0].text + if pic_data: + cdata = base64_decode(pic_data.strip()) + fmt = identify(cdata)[0] + mi.cover_data = (fmt, cdata) + else: + prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid)) + + +def _parse_tags(root, mi, ctx): + # pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent! + # Those are fallbacks: <src-title-info> + for genre_sec in ['title-info', 'src-title-info']: + # -- i18n Translations-- ? + tags = ctx.XPath('//fb:%s/fb:genre/text()' % genre_sec)(root) + if tags: + mi.tags = list(map(unicode_type, tags)) + break + + +def _parse_series(root, mi, ctx): + # calibre supports only 1 series: use the 1-st one + # pick up sequence but only from 1 section in preferred order + # except <src-title-info> + xp_ti = '//fb:title-info/fb:sequence[1]' + xp_pi = '//fb:publish-info/fb:sequence[1]' + + elms_sequence = ctx.XPath('%s|%s' % (xp_ti, xp_pi))(root) + if elms_sequence: + mi.series = elms_sequence[0].get('name', None) + if mi.series: + try: + mi.series_index = float('.'.join(elms_sequence[0].get('number', None).split()[:2])) + except Exception: + pass + + +def _parse_isbn(root, mi, ctx): + # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case + isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root) + if isbn: + # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case + if ',' in isbn: + isbn = isbn[:isbn.index(',')] + if check_isbn(isbn): + mi.isbn = isbn + + +def _parse_comments(root, mi, ctx): + # pick up annotation but only from 1 section <title-info>; fallback: <src-title-info> + for annotation_sec in ['title-info', 'src-title-info']: + elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % annotation_sec)(root) + if elms_annotation: + mi.comments = tostring(elms_annotation[0]) + # TODO: tags i18n, xslt? 
+ break + + +def _parse_publisher(root, mi, ctx): + publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/text())')(root) + if publisher: + mi.publisher = publisher + + +def _parse_pubdate(root, mi, ctx): + year = ctx.XPath('number(//fb:publish-info/fb:year/text())')(root) + if float.is_integer(year): + # only year is available, so use 2nd of June + mi.pubdate = parse_only_date(unicode_type(int(year))) + + +def _parse_language(root, mi, ctx): + language = ctx.XPath('string(//fb:title-info/fb:lang/text())')(root) + if language: + mi.language = language + mi.languages = [language] + + +def _get_fbroot(raw): + raw = xml_to_unicode(raw, strip_encoding_pats=True)[0] + root = safe_xml_fromstring(raw) + return ensure_namespace(root) + + +def _set_title(title_info, mi, ctx): + if not mi.is_null('title'): + ctx.clear_meta_tags(title_info, 'book-title') + title = ctx.get_or_create(title_info, 'book-title') + title.text = mi.title + + +def _set_comments(title_info, mi, ctx): + if not mi.is_null('comments'): + from calibre.utils.html2text import html2text + ctx.clear_meta_tags(title_info, 'annotation') + title = ctx.get_or_create(title_info, 'annotation') + ctx.text2fb2(title, html2text(mi.comments)) + + +def _set_authors(title_info, mi, ctx): + if not mi.is_null('authors'): + ctx.clear_meta_tags(title_info, 'author') + for author in reversed(mi.authors): + author_parts = author.split() + if not author_parts: + continue + atag = ctx.create_tag(title_info, 'author') + if len(author_parts) == 1: + ctx.create_tag(atag, 'nickname').text = author + else: + ctx.create_tag(atag, 'first-name').text = author_parts[0] + author_parts = author_parts[1:] + if len(author_parts) > 1: + ctx.create_tag(atag, 'middle-name', at_start=False).text = author_parts[0] + author_parts = author_parts[1:] + if author_parts: + ctx.create_tag(atag, 'last-name', at_start=False).text = ' '.join(author_parts) + + +def _set_tags(title_info, mi, ctx): + if not mi.is_null('tags'): + ctx.clear_meta_tags(title_info, 'genre') + for t in mi.tags: + tag = ctx.create_tag(title_info, 'genre') + tag.text = t + + +def _set_series(title_info, mi, ctx): + if not mi.is_null('series'): + ctx.clear_meta_tags(title_info, 'sequence') + seq = ctx.get_or_create(title_info, 'sequence') + seq.set('name', mi.series) + try: + seq.set('number', '%g'%mi.series_index) + except: + seq.set('number', '1') + + +def _rnd_name(size=8, chars=ascii_letters + digits): + return ''.join(random.choice(chars) for x in range(size)) + + +def _rnd_pic_file_name(prefix='calibre_cover_', size=32, ext='jpg'): + return prefix + _rnd_name(size=size) + '.' 
+ ext + + +def _encode_into_jpeg(data): + data = save_cover_data_to(data) + return as_base64_unicode(data) + + +def _set_cover(title_info, mi, ctx): + if not mi.is_null('cover_data') and mi.cover_data[1]: + coverpage = ctx.get_or_create(title_info, 'coverpage') + cim_tag = ctx.get_or_create(coverpage, 'image') + if XLINK('href') in cim_tag.attrib: + cim_filename = cim_tag.attrib[XLINK('href')][1:] + else: + cim_filename = _rnd_pic_file_name('cover') + cim_tag.attrib[XLINK('href')] = '#' + cim_filename + fb2_root = cim_tag.getroottree().getroot() + cim_binary = ctx.get_or_create(fb2_root, 'binary', attribs={'id': cim_filename}, at_start=False) + cim_binary.attrib['content-type'] = 'image/jpeg' + cim_binary.text = _encode_into_jpeg(mi.cover_data[1]) + + +def set_metadata(stream, mi, apply_null=False, update_timestamp=False): + stream.seek(0) + raw, zip_file_name = get_fb2_data(stream) + root = _get_fbroot(raw) + ctx = Context(root) + desc = ctx.get_or_create(root, 'description') + ti = ctx.get_or_create(desc, 'title-info') + + indent = ti.text + + _set_comments(ti, mi, ctx) + _set_series(ti, mi, ctx) + _set_tags(ti, mi, ctx) + _set_authors(ti, mi, ctx) + _set_title(ti, mi, ctx) + _set_cover(ti, mi, ctx) + + for child in ti: + child.tail = indent + + # Apparently there exists FB2 reading software that chokes on the use of + # single quotes in xml declaration. Sigh. See + # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184 + raw = b'<?xml version="1.0" encoding="UTF-8"?>\n' + raw += etree.tostring(root, method='xml', encoding='utf-8', xml_declaration=False) + + stream.seek(0) + stream.truncate() + if zip_file_name: + from calibre.utils.zipfile import ZipFile + with ZipFile(stream, 'w') as zf: + zf.writestr(zip_file_name, raw) + else: + stream.write(raw) + + +def ensure_namespace(doc): + # Workaround for broken FB2 files produced by convertonlinefree.com. See + # https://bugs.launchpad.net/bugs/1404701 + bare_tags = False + for x in ('description', 'body'): + for x in doc.findall(x): + if '{' not in x.tag: + bare_tags = True + break + if bare_tags: + import re + raw = etree.tostring(doc, encoding='unicode') + raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw) + doc = safe_xml_fromstring(raw) + return doc
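
A quick sketch of how the tolerant decoder added in ebooks/fb2/__init__.py might be exercised. This is not part of the patch; the ebook_converter import path is assumed from the file location above and may need adjusting in practice.

    import base64

    from ebook_converter.ebooks.fb2 import base64_decode  # assumed import path

    payload = base64.b64encode(b'hello fb2')
    print(base64_decode(payload))  # b'hello fb2', decoded via the fast from_base64_bytes path
    # Malformed <binary> payloads (stray bytes, broken padding) fall through to the
    # slower byte-by-byte loop adapted from FBReader instead of raising.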
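The regular expressions restored in clean_text() above are easier to review with a concrete input. The following standalone illustration uses a made-up FB2 fragment and only the patterns shown in the hunk; it is not code from the patch.

    import re

    raw = '<section><p>One </p><p>Two</p><p></p><p></p><p></p></section>'

    # Collapse runs of three or more empty paragraphs into a single <empty-line/>.
    raw = re.sub(r'(?mu)(?:<p>\s*</p>\s*){3,}', '<empty-line/>', raw)
    # Strip whitespace before closing </p> tags.
    raw = re.sub(r'(?mu)\s+</p>', '</p>', raw)
    # Put each following paragraph on its own line.
    raw = re.sub(r'(?mu)</p>\s*<p>', '</p>\n<p>', raw)
    # Give <section> boundaries their own lines.
    raw = re.sub(r'(?mu)\s*<section>', '\n<section>', raw)
    raw = re.sub(r'(?mu)<section>\s*', '<section>\n', raw)
    raw = re.sub(r'(?mu)\s*</section>', '\n</section>', raw)

    print(raw)  # paragraphs split onto their own lines, empty ones collapsed to <empty-line/>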
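A sketch of the reader/writer pair in metadata/fb2.py follows; 'book.fb2' is a placeholder path and the import location is again assumed from this diff.

    from ebook_converter.ebooks.metadata.fb2 import get_metadata, set_metadata  # assumed path

    with open('book.fb2', 'rb') as stream:
        mi = get_metadata(stream)       # returns a MetaInformation object
        print(mi.title, mi.authors)

    with open('book.fb2', 'r+b') as stream:
        mi.title = 'Revised title'
        set_metadata(stream, mi)        # rewrites <description>/<title-info> in place,
                                        # re-zipping if the input was a zipped FB2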