#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, shutil, re

from ebook_converter.customize.conversion import (OutputFormatPlugin,
                                                  OptionRecommendation)
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter import CurrentDir
from ebook_converter.polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes

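# HTML block-level elements. Used by EPUBOutput.workaround_ade_quirks() to
# decide whether a <br> that gets converted into an empty paragraph should
# keep a one-line height or collapse to zero height.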
block_level_tags = (
    'address',
    'body',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
)


class EPUBOutput(OutputFormatPlugin):

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'
    commit_name = 'epub_output'
    ui_data = {'versions': ('2', '3')}

    options = {
        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'EPUB'),

        OptionRecommendation(name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                   'files are automatically split at every page break into '
                   'two files. This gives an output e-book that can be '
                   'parsed faster and with less resources. However, '
                   'splitting is slow and if your source file contains a '
                   'very large number of page breaks, you should turn off '
                   'splitting on page breaks.'
                   )
        ),

        OptionRecommendation(name='flow_size', recommended_value=260,
            help=_('Split all HTML files larger than this size (in KB). '
                   'This is necessary as most EPUB readers cannot handle large '
                   'file sizes. The default of %defaultKB is the size required '
                   'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
        ),

        OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
            help=_('Normally, if the input file has no cover and you don\'t'
                   ' specify one, a default cover is generated with the title, '
                   'authors, etc. This option disables the generation of this cover.')
        ),

        OptionRecommendation(name='no_svg_cover', recommended_value=False,
            help=_('Do not use SVG for the book cover. Use this option if '
                   'your EPUB is going to be used on a device that does not '
                   'support SVG, like the iPhone or the JetBook Lite. '
                   'Without this option, such devices will display the cover '
                   'as a blank page.')
        ),

        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False, help=_(
                'When using an SVG cover, this option will cause the cover to scale '
                'to cover the available screen area, but still preserve its aspect ratio '
                '(ratio of width to height). That means there may be white borders '
                'at the sides or top and bottom of the image, but the image will '
                'never be distorted. Without this option the image may be slightly '
                'distorted, but there will be no borders.'
                )
        ),

        OptionRecommendation(name='epub_flatten', recommended_value=False,
            help=_('This option is needed only if you intend to use the EPUB'
                   ' with FBReaderJ. It will flatten the file system inside the'
                   ' EPUB, putting all files into the top level.')
        ),

        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
        ),

        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
        ),

        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for any generated in-line table of contents.')
        ),

        OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
            help=_('The version of the EPUB file to generate. EPUB 2 is the'
                   ' most widely compatible, only use EPUB 3 if you know you'
                   ' actually need it.')
        ),

    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def workaround_webkit_quirks(self): # {{{
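        # Completely empty <pre> elements are downgraded to plain <div>s
        # below; this is presumably to work around rendering oddities in
        # WebKit based viewers (assumption inferred from the method name,
        # the exact quirk is not documented in this file).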
        from ebook_converter.ebooks.oeb.base import XPath
        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if not hasattr(body, 'xpath'):
                continue

            for pre in XPath('//h:pre')(body):
                if not pre.text and len(pre) == 0:
                    pre.tag = 'div'
    # }}}

    def upshift_markup(self): # {{{
        'Upgrade markup to comply with XHTML 1.1 where possible'
        from ebook_converter.ebooks.oeb.base import XPath, XML
        for x in self.oeb.spine:
            root = x.data
            if (not root.get(XML('lang'))) and (root.get('lang')):
                root.set(XML('lang'), root.get('lang'))
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if not hasattr(body, 'xpath'):
                continue
            for u in XPath('//h:u')(root):
                u.tag = 'span'

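            # Duplicate id/name attribute values are invalid XHTML; keep only
            # the first occurrence of each value and drop the rest.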
            seen_ids, seen_names = set(), set()
            for x in XPath('//*[@id or @name]')(root):
                eid, name = x.get('id', None), x.get('name', None)
                if eid:
                    if eid in seen_ids:
                        del x.attrib['id']
                    else:
                        seen_ids.add(eid)
                if name:
                    if name in seen_names:
                        del x.attrib['name']
                    else:
                        seen_names.add(name)

    # }}}

    def convert(self, oeb, output_path, input_plugin, opts, log):
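        '''
        Orchestrate the EPUB specific conversion steps: optional inline TOC
        insertion, filename flattening/uniquification, renderer quirk
        workarounds, image rescaling, file splitting, cover handling,
        identifier (UUID) handling and finally packaging into the EPUB
        container, with optional font obfuscation and extraction of the
        result.
        '''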
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

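        # Split HTML files at explicit page breaks and, when flow_size is
        # non-zero, whenever a file grows beyond that size, since many EPUB
        # renderers (notably Adobe Digital Editions) struggle with very large
        # spine items.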
        from ebook_converter.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                max_flow_size=self.opts.flow_size*1024
                )
        split(self.oeb, self.opts)

        from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
                no_default_cover=self.opts.no_default_epub_cover,
                no_svg_cover=self.opts.no_svg_cover,
                preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                          'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add(_('Start'), first.href)

        from ebook_converter.ebooks.oeb.base import OPF
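        # The package identifier (ideally a UUID) is used both in the OPF
        # metadata and as the key for the Adobe font obfuscation done below.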
        identifiers = oeb.metadata['identifier']
        uuid = None
        for x in identifiers:
            if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
                uuid = unicode_type(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if uuid is None:
            self.log.warn('No UUID identifier found')
            from uuid import uuid4
            uuid = unicode_type(uuid4())
            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if unicode_type(x) == uuid:
                    x.content = 'urn:uuid:'+uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from ebook_converter.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
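            # For Sony periodicals, generate the extra Atom feed and
            # META-INF/metadata.xml entries that Sony's periodical format
            # uses.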
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from ebook_converter.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                    if x.endswith('.ncx')][0])
            if self.opts.epub_version == '3':
                self.upgrade_to_epub3(tdir, opf)
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

            from ebook_converter.ebooks.epub import initialize_container
            with initialize_container(output_path, os.path.basename(opf),
                    extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                            metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from ebook_converter.utils.zipfile import ZipFile
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)

    def upgrade_to_epub3(self, tdir, opf):
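        '''
        Upgrade the EPUB 2 output in ``tdir`` to EPUB 3 in place. A temporary
        META-INF/container.xml is written so that the polish EpubContainer
        machinery can operate on the directory; it is removed again afterwards
        since the packaging step creates its own container.xml.
        '''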
        self.log.info('Upgrading to EPUB 3...')
        from ebook_converter.ebooks.epub import simple_container_xml
        from ebook_converter.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
        try:
            os.mkdir(os.path.join(tdir, 'META-INF'))
        except EnvironmentError:
            pass
        with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
            f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
        from ebook_converter.ebooks.oeb.polish.container import EpubContainer
        container = EpubContainer(tdir, self.log)
        from ebook_converter.ebooks.oeb.polish.upgrade import epub_2_to_3
        existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
        nav_href = getattr(self.opts, 'epub3_nav_href', None)
        previous_nav = (nav_href, existing_nav) if existing_nav and nav_href else None
        epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
        fix_conversion_titlepage_links_in_nav(container)
        container.commit()
        os.remove(f.name)
        try:
            os.rmdir(os.path.join(tdir, 'META-INF'))
        except EnvironmentError:
            pass

    def encrypt_fonts(self, uris, tdir, uuid): # {{{
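        '''
        Obfuscate the fonts at the given (relative) ``uris`` using the Adobe
        font obfuscation scheme: the first 1024 bytes of each font are XORed
        with a 16 byte key derived from the hex digits of the package UUID.
        Returns the contents of META-INF/encryption.xml describing the
        obfuscated fonts, or None if nothing was obfuscated.
        '''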
        from ebook_converter.polyglot.binary import from_hex_bytes

        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid'%uuid)
        key = bytearray(from_hex_bytes((key + key)[:32]))
        paths = []
        with CurrentDir(tdir):
            paths = [os.path.join(*x.split('/')) for x in uris]
            uris = dict(zip(uris, paths))
            fonts = []
            for uri in list(uris.keys()):
                path = uris[uri]
                if not os.path.exists(path):
                    uris.pop(uri)
                    continue
                self.log.debug('Encrypting font:', uri)
                with lopen(path, 'r+b') as f:
                    data = f.read(1024)
                    if len(data) >= 1024:
                        data = bytearray(data)
                        f.seek(0)
                        f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
                    else:
                        self.log.warn('Font', path, 'is invalid, ignoring')
                if not isinstance(uri, unicode_type):
                    uri = uri.decode('utf-8')
                fonts.append('''
                <enc:EncryptedData>
                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                    <enc:CipherData>
                        <enc:CipherReference URI="%s"/>
                    </enc:CipherData>
                </enc:EncryptedData>
                '''%(uri.replace('"', '\\"')))
            if fonts:
                ans = '''<encryption
                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                    '''
                ans += '\n'.join(fonts)
                ans += '\n</encryption>'
                return ans
    # }}}

    def condense_ncx(self, ncx_path): # {{{
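        # Strip insignificant whitespace from the NCX to keep it small,
        # unless the user asked for pretty printed output.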
        from lxml import etree
        if not self.opts.pretty_print:
            tree = etree.parse(ncx_path)
            for tag in tree.getroot().iter(tag=etree.Element):
                if tag.text:
                    tag.text = tag.text.strip()
                if tag.tail:
                    tag.tail = tag.tail.strip()
            compressed = etree.tostring(tree.getroot(), encoding='utf-8')
            with open(ncx_path, 'wb') as f:
                f.write(compressed)
    # }}}

    def workaround_ade_quirks(self): # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.
        '''
        from ebook_converter.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                    node.href = base

        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for x in XPath('//h:img')(body):
                    src = x.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(x)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for x in XPath('//h:a[@name]')(body):
                    if not x.get('id', False):
                        x.set('id', x.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    x.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except:
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = list(filter(None, map(lambda x: x.strip(), style)))
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

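            # Interactive forms are not useful in EPUB 2 content rendered by
            # ADE (there is no scripting): drop forms that contain real form
            # controls and turn decorative <form> wrappers into plain <div>s.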
            formchildren = XPath('./h:input|./h:button|./h:textarea|'
                    './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle & in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = XPath('ancestor::h:table')
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            special_chars = re.compile('[\u200b\u00ad]')
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from css_parser.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.'+lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')

    # }}}

    def workaround_sony_quirks(self): # {{{
        '''
        Perform toc link transforms to alleviate slow loading.
        '''
        from ebook_converter.ebooks.oeb.base import urldefrag, XPath
        from ebook_converter.ebooks.oeb.polish.toc import item_at_top

        def frag_is_at_top(root, frag):
            elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
            if elem:
                elem = elem[0]
            else:
                return False
            return item_at_top(elem)

        def simplify_toc_entry(toc):
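            # If a TOC entry points at an anchor that sits at the very top of
            # its file, the fragment is redundant; dropping it lets readers
            # open the file directly, which some devices handle noticeably
            # faster.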
            if toc.href:
                href, frag = urldefrag(toc.href)
                if frag:
                    for x in self.oeb.spine:
                        if x.href == href:
                            if frag_is_at_top(x.data, frag):
                                self.log.debug('Removing anchor from TOC href:',
                                        href+'#'+frag)
                                toc.href = href
                            break
            for x in toc:
                simplify_toc_entry(x)

        if self.oeb.toc:
            simplify_toc_entry(self.oeb.toc)

    # }}}