""" Convert .fb2 files to .lrf """ import os, re import pkg_resources from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation from ebook_converter import guess_type from ebook_converter.polyglot.builtins import iteritems __license__ = 'GPL v3' __copyright__ = '2008, Anatoly Shipitsin ' FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1' class FB2Input(InputFormatPlugin): name = 'FB2 Input' author = 'Anatoly Shipitsin' description = 'Convert FB2 and FBZ files to HTML' file_types = {'fb2', 'fbz'} commit_name = 'fb2_input' recommendations = { ('level1_toc', '//h:h1', OptionRecommendation.MED), ('level2_toc', '//h:h2', OptionRecommendation.MED), ('level3_toc', '//h:h3', OptionRecommendation.MED), } options = { OptionRecommendation(name='no_inline_fb2_toc', recommended_value=False, level=OptionRecommendation.LOW, help=_('Do not insert a Table of Contents at the beginning of the book.' ) )} def convert(self, stream, options, file_ext, log, accelerators): from lxml import etree from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.meta import get_metadata from ebook_converter.ebooks.oeb.base import XLINK_NS, XHTML_NS from ebook_converter.ebooks.chardet import xml_to_unicode self.log = log log.debug('Parsing XML...') raw = get_fb2_data(stream)[0] raw = raw.replace(b'\0', b'') raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)[0] try: doc = safe_xml_fromstring(raw) except etree.XMLSyntaxError: doc = safe_xml_fromstring(raw.replace('& ', '&')) if doc is None: raise ValueError('The FB2 file is not valid XML') doc = ensure_namespace(doc) try: fb_ns = doc.nsmap[doc.prefix] except Exception: fb_ns = FB2NS NAMESPACES = {'f':fb_ns, 'l':XLINK_NS} stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') css = '' for s in stylesheets: css += etree.tostring(s, encoding='unicode', method='text', with_tail=False) + '\n\n' if css: import css_parser, logging parser = css_parser.CSSParser(fetcher=None, log=logging.getLogger('calibre.css')) XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS text = XHTML_CSS_NAMESPACE + css log.debug('Parsing stylesheet...') stylesheet = parser.parseString(text) stylesheet.namespaces['h'] = XHTML_NS css = stylesheet.cssText if isinstance(css, bytes): css = css.decode('utf-8', 'replace') css = css.replace('h|style', 'h|span') css = re.sub(r'name\s*=\s*', 'class=', css) self.extract_embedded_content(doc) log.debug('Converting XML to HTML...') with open(pkg_resources.resource_filename('ebook_converter', 'data/fb2.xsl')) as f: ss = f.read() ss = ss.replace("__FB_NS__", fb_ns) if options.no_inline_fb2_toc: log('Disabling generation of inline FB2 TOC') ss = re.compile(r'.*', re.DOTALL).sub('', ss) styledoc = safe_xml_fromstring(ss) transform = etree.XSLT(styledoc) result = transform(doc) # Handle links of type note and cite notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')} cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')} all_ids = {x for x in result.xpath('//*/@id')} for cite, a in iteritems(cites): note = notes.get(cite, None) if note: c = 1 while 'cite%d' % c in all_ids: c += 1 if not note.get('id', None): note.set('id', 'cite%d' % c) all_ids.add(note.get('id')) a.set('href', '#%s' % note.get('id')) for x in result.xpath('//*[@link_note or @link_cite]'): x.attrib.pop('link_note', None) x.attrib.pop('link_cite', None) for img in result.xpath('//img[@src]'): src = img.get('src') img.set('src', self.binary_map.get(src, src)) index = transform.tostring(result) with open('index.xhtml', 'wb') as f: f.write(index.encode('utf-8')) with open('inline-styles.css', 'wb') as f: f.write(css.encode('utf-8')) stream.seek(0) mi = get_metadata(stream, 'fb2') if not mi.title: mi.title = _('Unknown') if not mi.authors: mi.authors = [_('Unknown')] cpath = None if mi.cover_data and mi.cover_data[1]: with open('fb2_cover_calibre_mi.jpg', 'wb') as f: f.write(mi.cover_data[1]) cpath = os.path.abspath('fb2_cover_calibre_mi.jpg') else: for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) if href is not None: if href.startswith('#'): href = href[1:] cpath = os.path.abspath(href) break opf = OPFCreator(os.getcwd(), mi) entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')] opf.create_manifest(entries) opf.create_spine(['index.xhtml']) if cpath: opf.guide.set_cover(cpath) with open('metadata.opf', 'wb') as f: opf.render(f) return os.path.join(os.getcwd(), 'metadata.opf') def extract_embedded_content(self, doc): from ebook_converter.ebooks.fb2 import base64_decode self.binary_map = {} for elem in doc.xpath('./*'): if elem.text and 'binary' in elem.tag and 'id' in elem.attrib: ct = elem.get('content-type', '') fname = elem.attrib['id'] ext = ct.rpartition('/')[-1].lower() if ext in ('png', 'jpeg', 'jpg'): if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg', 'png'}: fname += '.' + ext self.binary_map[elem.get('id')] = fname raw = elem.text.strip() try: data = base64_decode(raw) except TypeError: self.log.exception('Binary data with id=%s is corrupted, ignoring'%( elem.get('id'))) else: with open(fname, 'wb') as f: f.write(data)