diff --git a/ebook_converter/ebooks/fb2/__init__.py b/ebook_converter/ebooks/fb2/__init__.py
new file mode 100644
index 0000000..e42aba2
--- /dev/null
+++ b/ebook_converter/ebooks/fb2/__init__.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+
def base64_decode(raw):
    """Decode base64 data, tolerating malformed input.

    The strict decoder is tried first. If it rejects the input, a lenient
    decoder (adapted from FBReader sources) is used instead: bytes outside
    the base64 alphabet are skipped, ``=`` padding closes the current group
    and an incomplete group yields only the bytes it fully encodes.

    :param raw: base64 data as bytes, bytearray or text.
    :return: the decoded data as bytes.
    """
    from io import BytesIO
    from polyglot.binary import from_base64_bytes

    # First try the python implementation as it is faster
    try:
        return from_base64_bytes(raw)
    except Exception:
        pass

    # Try a more robust version (adapted from FBReader sources)
    A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=')
    if not isinstance(raw, (bytes, bytearray)):
        # bytearray() refuses text without an encoding. Non-ASCII characters
        # can never be part of the alphabet, so dropping them here has the
        # same effect as skipping them in the loop below.
        raw = raw.encode('ascii', 'ignore')
    data = bytearray(raw)
    end = len(data)
    out = BytesIO()
    pos = 0
    while pos < end:
        tot = 0
        i = 0  # number of valid sextets collected for the current group
        while i < 4 and pos < end:
            byt = data[pos]
            pos += 1
            if A <= byt <= Z:
                num = byt - A
            elif a <= byt <= z:
                num = byt - a + 26
            elif zero <= byt <= nine:
                num = byt - zero + 52
            elif byt == plus:
                num = 62
            elif byt == slash:
                num = 63
            elif byt == equal:
                if i:
                    # Padding ends the group early; it carries no data bits
                    break
                # Stray padding between groups
                continue
            else:
                # Ignore this byte
                continue
            tot += num << (6 * (3 - i))
            i += 1
        triple = bytearray(3)
        for j in (2, 1, 0):
            triple[j] = tot & 0xff
            tot >>= 8
        # i sextets hold i * 6 bits, i.e. (i * 6) // 8 complete bytes
        out.write(bytes(triple[:(i * 6) // 8]))
    return out.getvalue()
diff --git a/ebook_converter/ebooks/fb2/fb2ml.py b/ebook_converter/ebooks/fb2/fb2ml.py
new file mode 100644
index 0000000..d8a0098
--- /dev/null
+++ b/ebook_converter/ebooks/fb2/fb2ml.py
@@ -0,0 +1,574 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember '
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into FB2 markup
+'''
+
+import re, textwrap, uuid
+from datetime import datetime
+
+from lxml import etree
+
+from calibre import prepare_string_for_xml
+from calibre.constants import __appname__, __version__
+from calibre.utils.localization import lang_as_iso639_1
+from calibre.utils.xml_parse import safe_xml_fromstring
+from calibre.utils.img import save_cover_data_to
+from calibre.ebooks.oeb.base import urlnormalize
+from polyglot.builtins import unicode_type, string_or_bytes, range, filter
+from polyglot.binary import as_base64_unicode
+from polyglot.urllib import urlparse
+
+
+class FB2MLizer(object):
+ '''
+ Todo: * Include more FB2 specific tags in the conversion.
+ * Handle notes and anchor links.
+ '''
+
+ def __init__(self, log):  # Keep the logger and initialise the per-conversion state.
+ self.log = log
+ self.reset_state()
+
+ def reset_state(self):  # Reset every attribute that is mutated while a book is being converted.
+ # Used to ensure text and tags are always placed inside an open 'p' element
+ self.in_p = False
+ # Mapping of image names. OEB allows for images to have the same name but be stored
+ # in different directories. FB2 images are all in a flat layout so we rename all images
+ # into a sequential numbering system to ensure there are no collisions between image names.
+ self.image_hrefs = {}
+ # Mapping of TOC target href to {fragment id: level}; a None key marks a whole-page entry
+ self.toc = {}
+ # Depth of the currently open sections; used to see whether a new one needs to be opened
+ self.section_level = 0
+
+ def extract_content(self, oeb_book, opts):  # Entry point: remember the book and options, then return the converted document.
+ self.log.info('Converting XHTML to FB2 markup...')
+ self.oeb_book = oeb_book
+ self.opts = opts
+ self.reset_state()  # Start from a clean slate so the instance can be reused.
+
+ # Flatten the TOC so sections and titles can be added, allowing readers
+ # to generate toc from the document.
+ if self.opts.sectionize == 'toc':
+ self.create_flat_toc(self.oeb_book.toc, 1)  # Top-level TOC entries are level 1.
+
+ return self.fb2mlize_spine()
+
+ def fb2mlize_spine(self):  # Build header, body text, images and footer and join them into one string.
+ output = (  # Order matters: fb2_header() and get_text() fill self.image_hrefs, which fb2mlize_images() reads.
+ self.fb2_header(),
+ self.get_text(),
+ self.fb2mlize_images(),
+ self.fb2_footer(),
+ )
+ output = self.clean_text('\n'.join(output))
+
+ if self.opts.pretty_print:  # Re-parse the markup and serialise it again with indentation.
+ output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True)
+
+ return '\n' + output  # NOTE(review): only a newline is prepended here; presumably a document prologue was lost from this literal - confirm against upstream.
+
+ def clean_text(self, text):  # Regex-based tidy-up of the generated markup; returns the cleaned string. NOTE(review): several patterns and replacements below are empty or split across lines and appear to have lost their tag names - verify against upstream.
+ # Remove pointless tags, but keep their contents.
+ text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)\1>', r'\2', text)  # \2 keeps the whitespace that was between the tags.
+
+ # Clean up paragraphs endings.
+ text = re.sub(r'(?mu)\s+
', '', text)
+ # Condense empty paragraphs into a line break.
+ text = re.sub(r'(?mu)(?:\s*){3,}', '', text)
+ # Remove empty paragraphs.
+ text = re.sub(r'(?mu)\s*', '', text)
+ # Put the paragraph following a paragraph on a separate line.
+ text = re.sub(r'(?mu)\s*', '
\n', text)
+
+ if self.opts.insert_blank_line:  # Optional step controlled by the insert_blank_line option.
+ text = re.sub(r'(?mu)
', '', text)
+
+ # Clean up title endings.
+ text = re.sub(r'(?mu)\s+', '', text)
+ # Remove empty title elements.
+ text = re.sub(r'(?mu)\s*', '', text)
+ # Put the paragraph following a title on a separate line.
+ text = re.sub(r'(?mu)\s*', '\n
', text)
+
+ # Put line breaks between paragraphs on a separate line.
+ text = re.sub(r'(?mu)(p|title)>\s*', r'\1>\n', text)
+ text = re.sub(r'(?mu)\s*
', '\n
', text)
+
+ # Remove empty sections.
+ text = re.sub(r'(?mu)', '', text)
+ # Clean up sections starts and ends.
+ text = re.sub(r'(?mu)\s*', '\n', text)
+ text = re.sub(r'(?mu)\s*', '\n', text)
+ text = re.sub(r'(?mu)\s*', '\n', text)
+ text = re.sub(r'(?mu)\s*', '\n', text)
+
+ return text
+
+ def fb2_header(self):  # Collect the book metadata into a dict and render it through the header template; returns the header string.
+ from calibre.ebooks.oeb.base import OPF
+ metadata = {}
+ metadata['title'] = self.oeb_book.metadata.title[0].value
+ metadata['appname'] = __appname__
+ metadata['version'] = __version__
+ metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)  # day.month.year of the conversion time
+ if self.oeb_book.metadata.language:
+ lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
+ if not lc:
+ lc = self.oeb_book.metadata.language[0].value  # Fall back to the raw language value.
+ metadata['lang'] = lc or 'en'
+ else:
+ metadata['lang'] = u'en'
+ metadata['id'] = None  # Filled from a UUID identifier below, or generated.
+ metadata['cover'] = self.get_cover()
+ metadata['genre'] = self.opts.fb2_genre
+
+ metadata['author'] = ''
+ for auth in self.oeb_book.metadata.creator:  # Split each creator on spaces into first / middle / last name.
+ author_first = ''
+ author_middle = ''
+ author_last = ''
+ author_parts = auth.value.split(' ')
+ if len(author_parts) == 1:
+ author_last = author_parts[0]
+ elif len(author_parts) == 2:
+ author_first = author_parts[0]
+ author_last = author_parts[1]
+ else:
+ author_first = author_parts[0]
+ author_middle = ' '.join(author_parts[1:-1])
+ author_last = author_parts[-1]
+ metadata['author'] += ''  # NOTE(review): this and the literals below carry no markup here; presumably the element tags were lost - confirm.
+ metadata['author'] += '%s' % prepare_string_for_xml(author_first)
+ if author_middle:
+ metadata['author'] += '%s' % prepare_string_for_xml(author_middle)
+ metadata['author'] += '%s' % prepare_string_for_xml(author_last)
+ metadata['author'] += ''
+ if not metadata['author']:
+ metadata['author'] = ''
+
+ metadata['keywords'] = ''
+ tags = list(map(unicode_type, self.oeb_book.metadata.subject))
+ if tags:
+ tags = ', '.join(prepare_string_for_xml(x) for x in tags)
+ metadata['keywords'] = '%s'%tags
+
+ metadata['sequence'] = ''
+ if self.oeb_book.metadata.series:
+ index = '1'  # Default series index when the book does not provide one.
+ if self.oeb_book.metadata.series_index:
+ index = self.oeb_book.metadata.series_index[0]
+ metadata['sequence'] = '' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index)  # NOTE(review): the format string is empty but two values are supplied; as written this raises TypeError - literal looks truncated.
+
+ year = publisher = isbn = ''
+ identifiers = self.oeb_book.metadata['identifier']
+ for x in identifiers:
+ if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):  # NOTE(review): .get() yields None when the scheme attribute is absent, so .lower() would raise AttributeError - confirm identifiers always carry a scheme.
+ metadata['id'] = unicode_type(x).split(':')[-1]
+ break
+ if metadata['id'] is None:
+ self.log.warn('No UUID identifier found')
+ metadata['id'] = unicode_type(uuid.uuid4())  # Generate a random id instead.
+
+ try:
+ date = self.oeb_book.metadata['date'][0]
+ except IndexError:
+ pass
+ else:
+ year = '%s' % prepare_string_for_xml(date.value.partition('-')[0])  # Text before the first '-' of the date value.
+
+ try:
+ publisher = self.oeb_book.metadata['publisher'][0]
+ except IndexError:
+ pass
+ else:
+ publisher = '%s' % prepare_string_for_xml(publisher.value)
+
+ for x in identifiers:
+ if x.get(OPF('scheme'), None).lower() == 'isbn':  # NOTE(review): same None.lower() risk as in the UUID loop above.
+ isbn = '%s' % prepare_string_for_xml(x.value)
+
+ metadata['year'], metadata['isbn'], metadata['publisher'] = year, isbn, publisher
+ for key, value in metadata.items():
+ if key not in ('author', 'cover', 'sequence', 'keywords', 'year', 'publisher', 'isbn'):  # These keys were already escaped (or built) above; escape the rest here.
+ metadata[key] = prepare_string_for_xml(value)
+
+ try:
+ comments = self.oeb_book.metadata['description'][0]
+ except Exception:
+ metadata['comments'] = ''
+ else:
+ from calibre.utils.html2text import html2text
+ metadata['comments'] = '{}
'.format(prepare_string_for_xml(html2text(comments.value).strip()))
+
+ # Keep the indentation level of the description the same as the body.
+ header = textwrap.dedent('''\
+
+
+
+ %(genre)s
+ %(author)s
+ %(title)s
+ %(cover)s
+ %(lang)s
+ %(keywords)s
+ %(sequence)s
+ %(comments)s
+
+
+ %(author)s
+ %(appname)s %(version)s
+ %(date)s
+ %(id)s
+ 1.0
+
+
+ %(publisher)s
+ %(year)s
+ %(isbn)s
+
+ ''') % metadata
+
+ # Remove empty lines.
+ return '\n'.join(filter(unicode_type.strip, header.splitlines()))  # Lines that strip to '' are falsy and therefore dropped.
+
+ def fb2_footer(self):  # Return the text appended after the images by fb2mlize_spine().
+ return ''  # NOTE(review): an empty string here; presumably the closing markup was lost from this literal - confirm against upstream.
+
+ def get_cover(self):  # Find the cover image, register it in self.image_hrefs and return the cover markup ('' when nothing is registered).
+ from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+
+ cover_href = None
+
+ # Get the raster cover if it's available.
+ if self.oeb_book.metadata.cover and unicode_type(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
+ id = unicode_type(self.oeb_book.metadata.cover[0])
+ cover_item = self.oeb_book.manifest.ids[id]
+ if cover_item.media_type in OEB_RASTER_IMAGES:
+ cover_href = cover_item.href
+ else:
+ # Figure out if we have a title page or a cover page
+ page_name = ''
+ if 'titlepage' in self.oeb_book.guide:  # A title page is preferred over a cover page.
+ page_name = 'titlepage'
+ elif 'cover' in self.oeb_book.guide:
+ page_name = 'cover'
+
+ if page_name:
+ cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
+ # Get the first image in the page
+ for img in cover_item.xpath('//img'):
+ cover_href = cover_item.abshref(img.get('src'))
+ break
+
+ if cover_href:
+ # Only write the image tag if it is in the manifest.
+ if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs:
+ self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs)  # Sequential name: img_0, img_1, ...
+ return '' % self.image_hrefs[cover_href]  # NOTE(review): the format string is empty, so applying % to a value raises TypeError as written - literal looks truncated.
+
+ return ''
+
+ def get_text(self):  # Convert every spine item with dump_text() and return the joined body markup. NOTE(review): the markup literals appended below are empty or split across lines - they appear to have lost their tags.
+ from calibre.ebooks.oeb.base import XHTML
+ from calibre.ebooks.oeb.stylizer import Stylizer
+ text = ['
']
+
+ # Create main section if there are no others to create
+ if self.opts.sectionize == 'nothing':
+ text.append('')
+ self.section_level += 1
+
+ for item in self.oeb_book.spine:
+ self.log.debug('Converting %s to FictionBook2 XML' % item.href)
+ stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
+
+ # Start a section if we must sectionize each file or if the TOC references this page
+ page_section_open = False
+ if self.opts.sectionize == 'files' or None in self.toc.get(item.href, ()):  # A None key in the TOC entry marks a link to the page as a whole (see create_flat_toc).
+ text.append('')
+ page_section_open = True
+ self.section_level += 1
+
+ text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
+
+ if page_section_open:
+ text.append('')
+ self.section_level -= 1
+
+ # Close any open sections
+ while self.section_level > 0:
+ text.append('')
+ self.section_level -= 1
+
+ text.append('')
+ return ''.join(text)
+
+ def fb2mlize_images(self):  # Return the base64-encoded markup for every referenced raster image, one entry per line.
+ '''
+ This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
+ '''
+ from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+
+ images = []
+ for item in self.oeb_book.manifest:
+ # Don't write the image if it's not referenced in the document's text.
+ if item.href not in self.image_hrefs:
+ continue
+ if item.media_type in OEB_RASTER_IMAGES:
+ try:
+ if item.media_type not in ('image/jpeg', 'image/png'):  # Anything other than JPEG/PNG is re-encoded and labelled image/jpeg.
+ imdata = save_cover_data_to(item.data, compression_quality=70)
+ raw_data = as_base64_unicode(imdata)
+ content_type = 'image/jpeg'
+ else:
+ raw_data = as_base64_unicode(item.data)
+ content_type = item.media_type
+ # Don't put the encoded image on a single line.
+ step = 72
+ data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step))  # Wrap the base64 text at 72 characters.
+ images.append('%s' % (self.image_hrefs[item.href], content_type, data))  # NOTE(review): one %s but three values; as written this raises TypeError, which the handler below turns into a log entry - literal looks truncated.
+ except Exception as e:
+ self.log.error('Error: Could not include file %s because '
+ '%s.' % (item.href, e))
+ return '\n'.join(images)
+
+ def create_flat_toc(self, nodes, level):  # Recursively record TOC entries in self.toc, keyed by the href without its fragment.
+ for item in nodes:
+ href, mid, id = item.href.partition('#')  # Split 'page#fragment' into the page href and the fragment id.
+ if not id:
+ self.toc[href] = {None: 'page'}  # Whole-page entry; this replaces any fragment entries already stored for href.
+ else:
+ if not self.toc.get(href, None):
+ self.toc[href] = {}
+ self.toc[href][id] = level
+ self.create_flat_toc(item.nodes, level + 1)  # Child entries sit one level deeper.
+
+ def ensure_p(self):  # Return (markup, tags) needed to make sure a paragraph is open; both lists are empty when one already is.
+ if self.in_p:
+ return [], []
+ else:
+ self.in_p = True
+ return [''], ['p']  # NOTE(review): the markup string is empty here although 'p' is reported as opened - the opening tag looks lost.
+
+ def close_open_p(self, tags):  # Start a fresh paragraph; returns (markup list, added_p) where added_p tells the caller to track a new 'p'.
+ text = ['']
+ added_p = False
+
+ if self.in_p:
+ # Close all up to p. Close p. Reopen all closed tags including p.
+ closed_tags = []
+ tags.reverse()  # Reverses the caller's list in place; dump_text passes a fresh tag_stack+tags list.
+ for t in tags:
+ text.append('%s>' % t)  # NOTE(review): presumably a closing tag whose leading characters were lost - confirm.
+ closed_tags.append(t)
+ if t == 'p':
+ break
+ closed_tags.reverse()
+ for t in closed_tags:
+ text.append('<%s>' % t)
+ else:
+ text.append('
')
+ added_p = True
+ self.in_p = True
+
+ return text, added_p
+
+ def handle_simple_tag(self, tag, tags):  # Return (markup, opened tags) for an inline tag; both lists are empty when tag is already open in tags.
+ s_out = []
+ s_tags = []
+ if tag not in tags:  # Avoid nesting the same inline tag twice.
+ p_out, p_tags = self.ensure_p()  # Inline tags are only emitted inside a paragraph.
+ s_out += p_out
+ s_tags += p_tags
+ s_out.append('<%s>' % tag)
+ s_tags.append(tag)
+ return s_out, s_tags
+
+ def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):  # The shared default list is only read (combined via tag_stack+tags), never mutated.
+ '''
+ This function is intended to be used in a recursive manner. dump_text will
+ run though all elements in the elem_tree and call itself on each element.
+
+ self.image_hrefs will be populated by calling this function.
+
+ @param elem_tree: etree representation of XHTML content to be transformed.
+ @param stylizer: Used to track the style of elements within the tree.
+ @param page: OEB page used to determine absolute urls.
+ @param tag_stack: List of open FB2 tags to take into account.
+
+ @return: List of string representing the XHTML converted to FB2 markup.
+ '''
+ from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
+ elem = elem_tree
+
+ # Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace.
+ if not isinstance(elem_tree.tag, string_or_bytes) or namespace(elem_tree.tag) != XHTML_NS:
+ p = elem.getparent()
+ if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
+ and elem.tail:
+ return [elem.tail]  # NOTE(review): unlike the tail handling at the end of this method, this tail is returned without prepare_string_for_xml().
+ return []
+
+ style = stylizer.style(elem_tree)
+ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+ or style['visibility'] == 'hidden':
+ if hasattr(elem, 'tail') and elem.tail:
+ return [elem.tail]  # Hidden element: only the text following it is kept.
+ return []
+
+ # FB2 generated output.
+ fb2_out = []
+ # FB2 tags in the order they are opened. This will be used to close the tags.
+ tags = []
+ # First tag in tree
+ tag = barename(elem_tree.tag)
+ # Number of blank lines above tag
+ try:
+ ems = int(round((float(style.marginTop) / style.fontSize) - 1))  # Top margin measured in font sizes, minus one.
+ if ems < 0:
+ ems = 0
+ except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
+ ems = 0
+
+ # Convert TOC entries to
s and add s
+ if self.opts.sectionize == 'toc':
+ # A section cannot be a child of any other element than another section,
+ # so leave the tag alone if there are parents
+ if not tag_stack:
+ # There are two reasons to start a new section here: the TOC pointed to
+ # this page (then we use the first non- on the page as a ), or
+ # the TOC pointed to a specific element
+ newlevel = 0
+ toc_entry = self.toc.get(page.href, None)
+ if toc_entry is not None:
+ if None in toc_entry:
+ if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
+ newlevel = 1
+ self.toc[page.href] = None  # Drops the page's TOC entry so later elements of the page no longer see it.
+ if not newlevel and elem_tree.attrib.get('id', None) is not None:
+ newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
+
+ # Start a new section if necessary
+ if newlevel:
+ while newlevel <= self.section_level:
+ fb2_out.append('')
+ self.section_level -= 1
+ fb2_out.append('')
+ self.section_level += 1
+ fb2_out.append('')
+ tags.append('title')
+ if self.section_level == 0:
+ # If none of the prior processing made a section, make one now to be FB2 spec compliant
+ fb2_out.append('')
+ self.section_level += 1
+
+ # Process the XHTML tag and styles. Converted to an FB2 tag.
+ # Use individual if statement not if else. There can be
+ # only one XHTML tag but it can have multiple styles.
+ if tag == 'img' and elem_tree.attrib.get('src', None):
+ # Only write the image tag if it is in the manifest.
+ ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
+ if ihref in self.oeb_book.manifest.hrefs:
+ if ihref not in self.image_hrefs:
+ self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)  # Sequential name: img_0, img_1, ...
+ p_txt, p_tag = self.ensure_p()
+ fb2_out += p_txt
+ tags += p_tag
+ fb2_out.append('' % self.image_hrefs[ihref])  # NOTE(review): empty format string applied to a value raises TypeError as written - literal looks truncated.
+ else:
+ self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
+ if tag in ('br', 'hr') or ems >= 1:
+ if ems < 1:
+ multiplier = 1
+ else:
+ multiplier = ems
+ if self.in_p:
+ closed_tags = []
+ open_tags = tag_stack+tags  # New list, so reversing it leaves tag_stack and tags untouched.
+ open_tags.reverse()
+ for t in open_tags:
+ fb2_out.append('%s>' % t)
+ closed_tags.append(t)
+ if t == 'p':
+ break
+ fb2_out.append('' * multiplier)
+ closed_tags.reverse()
+ for t in closed_tags:
+ fb2_out.append('<%s>' % t)
+ else:
+ fb2_out.append('' * multiplier)
+ if tag in ('div', 'li', 'p'):
+ p_text, added_p = self.close_open_p(tag_stack+tags)
+ fb2_out += p_text
+ if added_p:
+ tags.append('p')
+ if tag == 'a' and elem_tree.attrib.get('href', None):
+ # Handle only external links for now
+ if urlparse(elem_tree.attrib['href']).netloc:  # A network location is what marks the link as external.
+ p_txt, p_tag = self.ensure_p()
+ fb2_out += p_txt
+ tags += p_tag
+ fb2_out.append('' % urlnormalize(elem_tree.attrib['href']))  # NOTE(review): empty format string, see the image case above.
+ tags.append('a')
+ if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
+ s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+ if tag == 'i' or style['font-style'] == 'italic':
+ s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+ if tag in ('del', 'strike') or style['text-decoration'] == 'line-through':
+ s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+ if tag == 'sub':
+ s_out, s_tags = self.handle_simple_tag('sub', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+ if tag == 'sup':
+ s_out, s_tags = self.handle_simple_tag('sup', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+
+ # Process element text.
+ if hasattr(elem_tree, 'text') and elem_tree.text:
+ if not self.in_p:
+ fb2_out.append('')
+ fb2_out.append(prepare_string_for_xml(elem_tree.text))
+ if not self.in_p:
+ fb2_out.append('
')
+
+ # Process sub-elements.
+ for item in elem_tree:
+ fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)  # Children see the tags opened by their ancestors plus this element.
+
+ # Close open FB2 tags.
+ tags.reverse()  # Close in the reverse of the order they were opened.
+ fb2_out += self.close_tags(tags)
+
+ # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
+ if hasattr(elem_tree, 'tail') and elem_tree.tail:
+ if not self.in_p:
+ fb2_out.append('')
+ fb2_out.append(prepare_string_for_xml(elem_tree.tail))
+ if not self.in_p:
+ fb2_out.append('
')
+
+ return fb2_out
+
+ def close_tags(self, tags):  # Return the closing markup for tags, in the order given, and record when the paragraph is closed.
+ text = []
+ for tag in tags:
+ text.append('%s>' % tag)  # NOTE(review): presumably a closing tag whose leading characters were lost - confirm against upstream.
+ if tag == 'p':
+ self.in_p = False
+
+ return text
diff --git a/ebook_converter/ebooks/metadata/fb2.py b/ebook_converter/ebooks/metadata/fb2.py
new file mode 100644
index 0000000..ceaf047
--- /dev/null
+++ b/ebook_converter/ebooks/metadata/fb2.py
@@ -0,0 +1,456 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Roman Mukhin , '\
+ '2008, Anatoly Shipitsin '
+'''Read meta information from fb2 files'''
+
+import os, random
+from functools import partial
+from string import ascii_letters, digits
+
+from lxml import etree
+
+from calibre.utils.date import parse_only_date
+from calibre.utils.img import save_cover_data_to
+from calibre.utils.xml_parse import safe_xml_fromstring
+from calibre.utils.imghdr import identify
+from calibre import guess_type, guess_all_extensions, prints, force_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn
+from calibre.ebooks.chardet import xml_to_unicode
+from polyglot.builtins import unicode_type
+from polyglot.binary import as_base64_unicode
+
+
+NAMESPACES = {  # Namespace URIs used in this module, keyed by short name.
+ 'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
+ 'fb21' : 'http://www.gribuser.ru/xml/fictionbook/2.1',
+ 'xlink' : 'http://www.w3.org/1999/xlink'
+}
+
+tostring = partial(etree.tostring, method='text', encoding='unicode')  # etree.tostring preset to the 'text' output method and unicode encoding; used by _parse_comments.
+
+
def XLINK(tag):
    """Return *tag* qualified with the XLink namespace, in ``{uri}tag`` form."""
    xlink_ns = NAMESPACES['xlink']
    return '{%s}%s' % (xlink_ns, tag)
+
+
class Context(object):
    """Namespace-aware helper for querying and editing an FB2 element tree.

    The FictionBook namespace is read from the root element, so the ``fb:``
    and ``fb2:`` prefixes used in this module's XPath expressions resolve to
    whatever namespace the document itself declares.
    """

    def __init__(self, root):
        """Record the namespace of *root*, defaulting to NAMESPACES['fb2']."""
        try:
            self.fb_ns = root.nsmap[root.prefix] or NAMESPACES['fb2']
        except Exception:
            # No usable namespace information on the root element
            self.fb_ns = NAMESPACES['fb2']
        self.namespaces = {
            'fb': self.fb_ns,
            'fb2': self.fb_ns,
            'xlink': NAMESPACES['xlink']
        }

    def XPath(self, *args):
        """Compile an XPath expression bound to this context's prefixes."""
        return etree.XPath(*args, namespaces=self.namespaces)

    def get_or_create(self, parent, tag, attribs=None, at_start=True):
        """Return the first ``fb:tag`` child of *parent*, creating it if absent.

        :param attribs: optional attribute name/value pairs the child must
            carry; they are also set on a newly created element.
        :param at_start: where a newly created element is inserted.
        """
        # A None default avoids sharing one dict object between calls
        attribs = {} if attribs is None else attribs
        xpathstr = './fb:' + tag
        for n, v in attribs.items():
            # NOTE(review): values are interpolated as-is, so a value that
            # contains a double quote yields an invalid expression.
            xpathstr += '[@%s="%s"]' % (n, v)
        ans = self.XPath(xpathstr)(parent)
        if ans:
            return ans[0]
        return self.create_tag(parent, tag, attribs, at_start)

    def create_tag(self, parent, tag, attribs=None, at_start=True):
        """Create ``fb:tag`` under *parent* and return the new element.

        The element becomes the first child when *at_start* is true and the
        last child otherwise.
        """
        ans = parent.makeelement('{%s}%s' % (self.fb_ns, tag))
        if attribs:
            ans.attrib.update(attribs)
        if at_start:
            parent.insert(0, ans)
        else:
            parent.append(ans)
        return ans

    def clear_meta_tags(self, doc, tag):
        """Remove every ``fb:tag`` found in the three metadata sections."""
        for parent in ('title-info', 'src-title-info', 'publish-info'):
            for x in self.XPath('//fb:%s/fb:%s'%(parent, tag))(doc):
                x.getparent().remove(x)

    def text2fb2(self, parent, text):
        """Append *text* to *parent*, one element per line.

        Non-blank lines become ``p`` elements holding the stripped line;
        blank lines become ``empty-line`` elements.
        """
        for line in text.split('\n'):
            line = line.strip()
            if line:
                p = self.create_tag(parent, 'p', at_start=False)
                p.text = line
            else:
                self.create_tag(parent, 'empty-line', at_start=False)
+
+
def get_fb2_data(stream):
    """Read the raw FB2 document from *stream*, unwrapping a ZIP container.

    :param stream: seekable binary file-like object.
    :return: ``(data, zip_file_name)``. ``zip_file_name`` is the archive
        member that was read, or None when the stream is not a ZIP file.
    """
    from calibre.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        zf = ZipFile(stream)
    except BadZipfile:
        # Not an archive: rewind and return the stream contents as they are
        stream.seek(pos)
        return stream.read(), None

    # Release the archive object once the member has been read
    with zf:
        names = zf.namelist()
        # Prefer members with an .fb2 extension, otherwise take any member.
        # NOTE(review): an archive without members raises IndexError below.
        names = [x for x in names if x.lower().endswith('.fb2')] or names
        zip_file_name = names[0]
        ans = zf.open(zip_file_name).read()
    return ans, zip_file_name
+
+
def get_metadata(stream):
    """Return fb2 metadata as a L{MetaInformation} object.

    Title and authors are required to build the result; the title falls back
    to the base name of ``stream.name``. Every other field is read on a
    best-effort basis: a failure in one field parser leaves that field unset
    and does not affect the others.
    """
    root = _get_fbroot(get_fb2_data(stream)[0])
    ctx = Context(root)
    book_title = _parse_book_title(root, ctx)
    authors = _parse_authors(root, ctx) or [_('Unknown')]

    # fallback for book_title
    if book_title:
        book_title = unicode_type(book_title)
    else:
        book_title = force_unicode(os.path.splitext(
            os.path.basename(getattr(stream, 'name',
                _('Unknown'))))[0])
    mi = MetaInformation(book_title, authors)

    optional_parsers = (
        _parse_cover,
        _parse_comments,
        _parse_tags,
        _parse_series,
        _parse_isbn,
        _parse_publisher,
        _parse_pubdate,
        _parse_language,
    )
    for parse in optional_parsers:
        try:
            parse(root, mi, ctx)
        except Exception:
            # Deliberately best effort, but KeyboardInterrupt and SystemExit
            # are no longer swallowed
            pass

    return mi
+
+
def _parse_authors(root, ctx):
    """Return the list of author names, taken from a single section.

    The sections are tried in order of preference (``title-info``,
    ``src-title-info``, ``document-info``) and the first one that yields at
    least one usable author wins. ``[_('Unknown')]`` is returned when no
    section provides an author.
    """
    authors = []
    # pick up authors but only from 1 section; otherwise it is not consistent!
    for author_sec in ['title-info', 'src-title-info', 'document-info']:
        for au in ctx.XPath('//fb:%s/fb:author'%author_sec)(root):
            author = _parse_author(au, ctx)
            if author:
                authors.append(author)
        # Test the collected list, not the last parsed author: an unusable
        # trailing author must not let a fallback section add more names.
        if authors:
            break

    # if no author so far
    if not authors:
        authors.append(_('Unknown'))

    return authors
+
+
+def _parse_author(elm_author, ctx):
+ """ Returns a list of display author and sortable author"""
+
+ xp_templ = 'normalize-space(fb:%s/text())'
+
+ author = ctx.XPath(xp_templ % 'first-name')(elm_author)
+ lname = ctx.XPath(xp_templ % 'last-name')(elm_author)
+ mname = ctx.XPath(xp_templ % 'middle-name')(elm_author)
+
+ if mname:
+ author = (author + ' ' + mname).strip()
+ if lname:
+ author = (author + ' ' + lname).strip()
+
+ # fallback to nickname
+ if not author:
+ nname = ctx.XPath(xp_templ % 'nickname')(elm_author)
+ if nname:
+ author = nname
+
+ return author
+
+
+def _parse_book_title(root, ctx):
+ # has a priority. (actually is mandatory)
+ # other are backup solution (sequence is important. Other than in fb2-doc)
+ xp_ti = '//fb:title-info/fb:book-title/text()'
+ xp_pi = '//fb:publish-info/fb:book-title/text()'
+ xp_si = '//fb:src-title-info/fb:book-title/text()'
+ book_title = ctx.XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root)
+
+ return book_title
+
+
+def _parse_cover(root, mi, ctx):
+ # pickup from , if not exists it fallbacks to
+ imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/@xlink:href), "#")')(root)
+ if imgid:
+ try:
+ _parse_cover_data(root, imgid, mi, ctx)
+ except:
+ pass
+
+
def _parse_cover_data(root, imgid, mi, ctx):
    """Decode the ``binary`` element with id *imgid* into ``mi.cover_data``.

    Nothing happens when no such element exists. A warning is printed when
    neither the declared content type nor, for ``image/*`` types, the id
    itself maps to a known file extension.
    """
    from calibre.ebooks.fb2 import base64_decode
    binaries = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root)
    if not binaries:
        return

    binary = binaries[0]
    mimetype = binary.get('content-type', 'image/jpeg')
    extensions = guess_all_extensions(mimetype)

    if not extensions and mimetype.startswith('image/'):
        # Unknown image type: try to guess one from the id instead
        guessed = guess_type(imgid)[0]
        if guessed and guessed.startswith('image/'):
            extensions = guess_all_extensions(guessed)

    if not extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid))
        return

    encoded = binary.text
    if encoded:
        cdata = base64_decode(encoded.strip())
        fmt = identify(cdata)[0]
        mi.cover_data = (fmt, cdata)
+
+
def _parse_tags(root, mi, ctx):
    """Set ``mi.tags`` from the genres of the first section that has any.

    ``title-info`` is tried before ``src-title-info``; genres from the two
    sections are never mixed.
    """
    for section in ('title-info', 'src-title-info'):
        # -- i18n Translations-- ?
        genres = ctx.XPath('//fb:%s/fb:genre/text()' % section)(root)
        if genres:
            mi.tags = [unicode_type(genre) for genre in genres]
            return
+
+
+def _parse_series(root, mi, ctx):
+ # calibre supports only 1 series: use the 1-st one
+ # pick up sequence but only from 1 section in preferred order
+ # except
+ xp_ti = '//fb:title-info/fb:sequence[1]'
+ xp_pi = '//fb:publish-info/fb:sequence[1]'
+
+ elms_sequence = ctx.XPath('%s|%s' % (xp_ti, xp_pi))(root)
+ if elms_sequence:
+ mi.series = elms_sequence[0].get('name', None)
+ if mi.series:
+ try:
+ mi.series_index = float('.'.join(elms_sequence[0].get('number', None).split()[:2]))
+ except Exception:
+ pass
+
+
+def _parse_isbn(root, mi, ctx):
+ # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
+ isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
+ if isbn:
+ # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
+ if ',' in isbn:
+ isbn = isbn[:isbn.index(',')]
+ if check_isbn(isbn):
+ mi.isbn = isbn
+
+
+def _parse_comments(root, mi, ctx):
+ # pick up annotation but only from 1 section ; fallback:
+ for annotation_sec in ['title-info', 'src-title-info']:
+ elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % annotation_sec)(root)
+ if elms_annotation:
+ mi.comments = tostring(elms_annotation[0])
+ # TODO: tags i18n, xslt?
+ break
+
+
+def _parse_publisher(root, mi, ctx):
+ publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/text())')(root)
+ if publisher:
+ mi.publisher = publisher
+
+
+def _parse_pubdate(root, mi, ctx):
+ year = ctx.XPath('number(//fb:publish-info/fb:year/text())')(root)
+ if float.is_integer(year):
+ # only year is available, so use 2nd of June
+ mi.pubdate = parse_only_date(unicode_type(int(year)))
+
+
+def _parse_language(root, mi, ctx):
+ language = ctx.XPath('string(//fb:title-info/fb:lang/text())')(root)
+ if language:
+ mi.language = language
+ mi.languages = [language]
+
+
def _get_fbroot(raw):
    """Parse raw FB2 data and return the root after ``ensure_namespace``."""
    text = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    return ensure_namespace(safe_xml_fromstring(text))
+
+
+def _set_title(title_info, mi, ctx):
+ if not mi.is_null('title'):
+ ctx.clear_meta_tags(title_info, 'book-title')
+ title = ctx.get_or_create(title_info, 'book-title')
+ title.text = mi.title
+
+
+def _set_comments(title_info, mi, ctx):
+ if not mi.is_null('comments'):
+ from calibre.utils.html2text import html2text
+ ctx.clear_meta_tags(title_info, 'annotation')
+ title = ctx.get_or_create(title_info, 'annotation')
+ ctx.text2fb2(title, html2text(mi.comments))
+
+
+def _set_authors(title_info, mi, ctx):
+ if not mi.is_null('authors'):
+ ctx.clear_meta_tags(title_info, 'author')
+ for author in reversed(mi.authors):
+ author_parts = author.split()
+ if not author_parts:
+ continue
+ atag = ctx.create_tag(title_info, 'author')
+ if len(author_parts) == 1:
+ ctx.create_tag(atag, 'nickname').text = author
+ else:
+ ctx.create_tag(atag, 'first-name').text = author_parts[0]
+ author_parts = author_parts[1:]
+ if len(author_parts) > 1:
+ ctx.create_tag(atag, 'middle-name', at_start=False).text = author_parts[0]
+ author_parts = author_parts[1:]
+ if author_parts:
+ ctx.create_tag(atag, 'last-name', at_start=False).text = ' '.join(author_parts)
+
+
+def _set_tags(title_info, mi, ctx):
+ if not mi.is_null('tags'):
+ ctx.clear_meta_tags(title_info, 'genre')
+ for t in mi.tags:
+ tag = ctx.create_tag(title_info, 'genre')
+ tag.text = t
+
+
+def _set_series(title_info, mi, ctx):
+ if not mi.is_null('series'):
+ ctx.clear_meta_tags(title_info, 'sequence')
+ seq = ctx.get_or_create(title_info, 'sequence')
+ seq.set('name', mi.series)
+ try:
+ seq.set('number', '%g'%mi.series_index)
+ except:
+ seq.set('number', '1')
+
+
+def _rnd_name(size=8, chars=ascii_letters + digits):
+ return ''.join(random.choice(chars) for x in range(size))
+
+
def _rnd_pic_file_name(prefix='calibre_cover_', size=32, ext='jpg'):
    """Return ``prefix`` + *size* random characters + ``.`` + *ext*."""
    stem = _rnd_name(size=size)
    return prefix + stem + '.' + ext
+
+
def _encode_into_jpeg(data):
    """Pass *data* through ``save_cover_data_to`` and base64-encode the result."""
    return as_base64_unicode(save_cover_data_to(data))
+
+
+def _set_cover(title_info, mi, ctx):
+ if not mi.is_null('cover_data') and mi.cover_data[1]:
+ coverpage = ctx.get_or_create(title_info, 'coverpage')
+ cim_tag = ctx.get_or_create(coverpage, 'image')
+ if XLINK('href') in cim_tag.attrib:
+ cim_filename = cim_tag.attrib[XLINK('href')][1:]
+ else:
+ cim_filename = _rnd_pic_file_name('cover')
+ cim_tag.attrib[XLINK('href')] = '#' + cim_filename
+ fb2_root = cim_tag.getroottree().getroot()
+ cim_binary = ctx.get_or_create(fb2_root, 'binary', attribs={'id': cim_filename}, at_start=False)
+ cim_binary.attrib['content-type'] = 'image/jpeg'
+ cim_binary.text = _encode_into_jpeg(mi.cover_data[1])
+
+
+def set_metadata(stream, mi, apply_null=False, update_timestamp=False):  # Rewrite stream in place with the metadata from mi; apply_null and update_timestamp are not used in this body.
+ stream.seek(0)
+ raw, zip_file_name = get_fb2_data(stream)
+ root = _get_fbroot(raw)
+ ctx = Context(root)
+ desc = ctx.get_or_create(root, 'description')
+ ti = ctx.get_or_create(desc, 'title-info')
+
+ indent = ti.text  # Text at the start of title-info, reused below as the tail of every child.
+
+ _set_comments(ti, mi, ctx)
+ _set_series(ti, mi, ctx)
+ _set_tags(ti, mi, ctx)
+ _set_authors(ti, mi, ctx)
+ _set_title(ti, mi, ctx)
+ _set_cover(ti, mi, ctx)
+
+ for child in ti:
+ child.tail = indent
+
+ # Apparently there exists FB2 reading software that chokes on the use of
+ # single quotes in xml declaration. Sigh. See
+ # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
+ raw = b'\n'  # NOTE(review): only a newline here although the comment above talks about the xml declaration - the literal looks truncated, confirm against upstream.
+ raw += etree.tostring(root, method='xml', encoding='utf-8', xml_declaration=False)
+
+ stream.seek(0)
+ stream.truncate()  # Discard the old contents before writing the new document.
+ if zip_file_name:
+ from calibre.utils.zipfile import ZipFile
+ with ZipFile(stream, 'w') as zf:  # The new archive contains only this one member.
+ zf.writestr(zip_file_name, raw)
+ else:
+ stream.write(raw)
+
+
def ensure_namespace(doc):
    """Return *doc* with namespace-less description/body elements repaired.

    Workaround for broken FB2 files produced by convertonlinefree.com, see
    https://bugs.launchpad.net/bugs/1404701. When ``doc.findall`` returns a
    ``description`` or ``body`` element whose tag carries no namespace, the
    document is serialised, the empty ``xmlns`` declarations on those tags
    are removed and the text is parsed again. Otherwise *doc* is returned
    unchanged.
    """
    bare_tags = any(
        '{' not in child.tag
        for name in ('description', 'body')
        for child in doc.findall(name)
    )
    if not bare_tags:
        return doc

    import re
    markup = etree.tostring(doc, encoding='unicode')
    markup = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', markup)
    return safe_xml_fromstring(markup)