mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-01 15:32:26 +01:00
182 lines
7.2 KiB
Python
182 lines
7.2 KiB
Python
"""
|
|
Convert .fb2 files to .lrf
|
|
"""
|
|
import os, re
|
|
import pkg_resources
|
|
|
|
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|
from ebook_converter import guess_type
|
|
from ebook_converter.polyglot.builtins import iteritems
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
|
|
|
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
|
|
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
|
|
|
|
|
|
class FB2Input(InputFormatPlugin):
|
|
|
|
name = 'FB2 Input'
|
|
author = 'Anatoly Shipitsin'
|
|
description = 'Convert FB2 and FBZ files to HTML'
|
|
file_types = {'fb2', 'fbz'}
|
|
commit_name = 'fb2_input'
|
|
|
|
recommendations = {
|
|
('level1_toc', '//h:h1', OptionRecommendation.MED),
|
|
('level2_toc', '//h:h2', OptionRecommendation.MED),
|
|
('level3_toc', '//h:h3', OptionRecommendation.MED),
|
|
}
|
|
|
|
options = {
|
|
OptionRecommendation(name='no_inline_fb2_toc',
|
|
recommended_value=False, level=OptionRecommendation.LOW,
|
|
help=_('Do not insert a Table of Contents at the beginning of the book.'
|
|
)
|
|
)}
|
|
|
|
def convert(self, stream, options, file_ext, log,
|
|
accelerators):
|
|
from lxml import etree
|
|
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
|
from ebook_converter.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
|
|
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
|
|
from ebook_converter.ebooks.metadata.meta import get_metadata
|
|
from ebook_converter.ebooks.oeb.base import XLINK_NS, XHTML_NS
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
self.log = log
|
|
log.debug('Parsing XML...')
|
|
raw = get_fb2_data(stream)[0]
|
|
raw = raw.replace(b'\0', b'')
|
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
|
assume_utf8=True, resolve_entities=True)[0]
|
|
try:
|
|
doc = safe_xml_fromstring(raw)
|
|
except etree.XMLSyntaxError:
|
|
doc = safe_xml_fromstring(raw.replace('& ', '&'))
|
|
if doc is None:
|
|
raise ValueError('The FB2 file is not valid XML')
|
|
doc = ensure_namespace(doc)
|
|
try:
|
|
fb_ns = doc.nsmap[doc.prefix]
|
|
except Exception:
|
|
fb_ns = FB2NS
|
|
|
|
NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
|
|
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
|
|
css = ''
|
|
for s in stylesheets:
|
|
css += etree.tostring(s, encoding='unicode', method='text',
|
|
with_tail=False) + '\n\n'
|
|
if css:
|
|
import css_parser, logging
|
|
parser = css_parser.CSSParser(fetcher=None,
|
|
log=logging.getLogger('calibre.css'))
|
|
|
|
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
|
|
text = XHTML_CSS_NAMESPACE + css
|
|
log.debug('Parsing stylesheet...')
|
|
stylesheet = parser.parseString(text)
|
|
stylesheet.namespaces['h'] = XHTML_NS
|
|
css = stylesheet.cssText
|
|
if isinstance(css, bytes):
|
|
css = css.decode('utf-8', 'replace')
|
|
css = css.replace('h|style', 'h|span')
|
|
css = re.sub(r'name\s*=\s*', 'class=', css)
|
|
self.extract_embedded_content(doc)
|
|
log.debug('Converting XML to HTML...')
|
|
with open(pkg_resources.resource_filename('ebook_converter',
|
|
'data/fb2.xsl')) as f:
|
|
ss = f.read()
|
|
ss = ss.replace("__FB_NS__", fb_ns)
|
|
if options.no_inline_fb2_toc:
|
|
log('Disabling generation of inline FB2 TOC')
|
|
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
|
|
re.DOTALL).sub('', ss)
|
|
|
|
styledoc = safe_xml_fromstring(ss)
|
|
|
|
transform = etree.XSLT(styledoc)
|
|
result = transform(doc)
|
|
|
|
# Handle links of type note and cite
|
|
notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
|
|
cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
|
|
all_ids = {x for x in result.xpath('//*/@id')}
|
|
for cite, a in iteritems(cites):
|
|
note = notes.get(cite, None)
|
|
if note:
|
|
c = 1
|
|
while 'cite%d' % c in all_ids:
|
|
c += 1
|
|
if not note.get('id', None):
|
|
note.set('id', 'cite%d' % c)
|
|
all_ids.add(note.get('id'))
|
|
a.set('href', '#%s' % note.get('id'))
|
|
for x in result.xpath('//*[@link_note or @link_cite]'):
|
|
x.attrib.pop('link_note', None)
|
|
x.attrib.pop('link_cite', None)
|
|
|
|
for img in result.xpath('//img[@src]'):
|
|
src = img.get('src')
|
|
img.set('src', self.binary_map.get(src, src))
|
|
index = transform.tostring(result)
|
|
with open('index.xhtml', 'wb') as f:
|
|
f.write(index.encode('utf-8'))
|
|
with open('inline-styles.css', 'wb') as f:
|
|
f.write(css.encode('utf-8'))
|
|
stream.seek(0)
|
|
mi = get_metadata(stream, 'fb2')
|
|
if not mi.title:
|
|
mi.title = _('Unknown')
|
|
if not mi.authors:
|
|
mi.authors = [_('Unknown')]
|
|
cpath = None
|
|
if mi.cover_data and mi.cover_data[1]:
|
|
with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
|
|
f.write(mi.cover_data[1])
|
|
cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
|
|
else:
|
|
for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
|
|
href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
|
|
if href is not None:
|
|
if href.startswith('#'):
|
|
href = href[1:]
|
|
cpath = os.path.abspath(href)
|
|
break
|
|
|
|
opf = OPFCreator(os.getcwd(), mi)
|
|
entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
|
|
opf.create_manifest(entries)
|
|
opf.create_spine(['index.xhtml'])
|
|
if cpath:
|
|
opf.guide.set_cover(cpath)
|
|
with open('metadata.opf', 'wb') as f:
|
|
opf.render(f)
|
|
return os.path.join(os.getcwd(), 'metadata.opf')
|
|
|
|
def extract_embedded_content(self, doc):
|
|
from ebook_converter.ebooks.fb2 import base64_decode
|
|
self.binary_map = {}
|
|
for elem in doc.xpath('./*'):
|
|
if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
|
|
ct = elem.get('content-type', '')
|
|
fname = elem.attrib['id']
|
|
ext = ct.rpartition('/')[-1].lower()
|
|
if ext in ('png', 'jpeg', 'jpg'):
|
|
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
|
|
'png'}:
|
|
fname += '.' + ext
|
|
self.binary_map[elem.get('id')] = fname
|
|
raw = elem.text.strip()
|
|
try:
|
|
data = base64_decode(raw)
|
|
except TypeError:
|
|
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
|
|
elem.get('id')))
|
|
else:
|
|
with open(fname, 'wb') as f:
|
|
f.write(data)
|