mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-24 22:35:46 +01:00
This is progressing refactor of the calibre code to make it more readable, and transform it to something more coherent. In this patch, there are changes regarding imports for some modules, instead of polluting namespace of each module with some other modules symbols, which often were imported from other modules. Yuck.
459 lines
19 KiB
Python
459 lines
19 KiB
Python
import hashlib
|
|
import itertools
|
|
import os
|
|
import re
|
|
import traceback
|
|
import uuid
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter.ebooks.metadata import opf2 as opf_meta
|
|
from ebook_converter.ebooks.oeb import base
|
|
from ebook_converter.customize.conversion import InputFormatPlugin
|
|
from ebook_converter.customize.conversion import OptionRecommendation
|
|
|
|
|
|
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
|
|
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
|
|
|
|
|
|
def decrypt_font_data(key, data, algorithm):
|
|
is_adobe = algorithm == ADOBE_OBFUSCATION
|
|
crypt_len = 1024 if is_adobe else 1040
|
|
crypt = bytearray(data[:crypt_len])
|
|
key = itertools.cycle(iter(bytearray(key)))
|
|
decrypt = bytes(bytearray(x ^ next(key) for x in crypt))
|
|
return decrypt + data[crypt_len:]
|
|
|
|
|
|
def decrypt_font(key, path, algorithm):
|
|
with open(path, 'r+b') as f:
|
|
data = decrypt_font_data(key, f.read(), algorithm)
|
|
f.seek(0), f.truncate(), f.write(data)
|
|
|
|
|
|
class EPUBInput(InputFormatPlugin):
|
|
|
|
name = 'EPUB Input'
|
|
author = 'Kovid Goyal'
|
|
description = 'Convert EPUB files (.epub) to HTML'
|
|
file_types = {'epub'}
|
|
output_encoding = None
|
|
commit_name = 'epub_input'
|
|
|
|
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
|
|
|
|
def process_encryption(self, encfile, opf, log):
|
|
idpf_key = opf.raw_unique_identifier
|
|
if idpf_key:
|
|
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
|
|
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
|
|
key = None
|
|
for item in opf.identifier_iter():
|
|
scheme = None
|
|
for xkey in item.attrib.keys():
|
|
if xkey.endswith('scheme'):
|
|
scheme = item.get(xkey)
|
|
if (scheme and scheme.lower() == 'uuid') or \
|
|
(item.text and item.text.startswith('urn:uuid:')):
|
|
try:
|
|
key = item.text.rpartition(':')[-1]
|
|
key = uuid.UUID(key).bytes
|
|
except Exception:
|
|
traceback.print_exc()
|
|
key = None
|
|
|
|
try:
|
|
root = etree.parse(encfile)
|
|
for em in root.xpath('descendant::*[contains(name(), '
|
|
'"EncryptionMethod")]'):
|
|
algorithm = em.get('Algorithm', '')
|
|
if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
|
|
return False
|
|
cr = em.getparent().xpath('descendant::*[contains(name(), '
|
|
'"CipherReference")]')[0]
|
|
uri = cr.get('URI')
|
|
path = os.path.abspath(os.path.join(os.path.dirname(encfile),
|
|
'..', *uri.split('/')))
|
|
tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
|
|
if (tkey and os.path.exists(path)):
|
|
self._encrypted_font_uris.append(uri)
|
|
decrypt_font(tkey, path, algorithm)
|
|
return True
|
|
except Exception:
|
|
traceback.print_exc()
|
|
return False
|
|
|
|
def set_guide_type(self, opf, gtype, href=None, title=''):
|
|
# Set the specified guide entry
|
|
for elem in list(opf.iterguide()):
|
|
if elem.get('type', '').lower() == gtype:
|
|
elem.getparent().remove(elem)
|
|
|
|
if href is not None:
|
|
t = opf.create_guide_item(gtype, title, href)
|
|
for guide in opf.root.xpath('./*[local-name()="guide"]'):
|
|
guide.append(t)
|
|
return
|
|
guide = opf.create_guide_element()
|
|
opf.root.append(guide)
|
|
guide.append(t)
|
|
return t
|
|
|
|
def rationalize_cover3(self, opf, log):
|
|
"""
|
|
If there is a reference to the cover/titlepage via manifest
|
|
properties, convert to entries in the <guide> so that the rest of the
|
|
pipeline picks it up.
|
|
"""
|
|
from ebook_converter.ebooks.metadata.opf3 import items_with_property
|
|
removed = guide_titlepage_href = guide_titlepage_id = None
|
|
|
|
# Look for titlepages incorrectly marked in the <guide> as covers
|
|
guide_cover, guide_elem = None, None
|
|
for guide_elem in opf.iterguide():
|
|
if guide_elem.get('type', '').lower() == 'cover':
|
|
guide_cover = guide_elem.get('href', '').partition('#')[0]
|
|
break
|
|
if guide_cover:
|
|
spine = list(opf.iterspine())
|
|
if spine:
|
|
idref = spine[0].get('idref', '')
|
|
for x in opf.itermanifest():
|
|
if x.get('id') == idref and x.get('href') == guide_cover:
|
|
guide_titlepage_href = guide_cover
|
|
guide_titlepage_id = idref
|
|
break
|
|
|
|
raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
|
|
if raster_cover_href:
|
|
self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
|
|
titlepage_id = titlepage_href = None
|
|
for item in items_with_property(opf.root, 'calibre:title-page'):
|
|
tid, href = item.get('id'), item.get('href')
|
|
if href and tid:
|
|
titlepage_id, titlepage_href = tid, href.partition('#')[0]
|
|
break
|
|
if titlepage_href is None:
|
|
titlepage_href = guide_titlepage_href
|
|
titlepage_id = guide_titlepage_id
|
|
if titlepage_href is not None:
|
|
self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
|
|
spine = list(opf.iterspine())
|
|
if len(spine) > 1:
|
|
for item in spine:
|
|
if item.get('idref') == titlepage_id:
|
|
log('Found HTML cover', titlepage_href)
|
|
if self.for_viewer:
|
|
item.attrib.pop('linear', None)
|
|
else:
|
|
item.getparent().remove(item)
|
|
removed = titlepage_href
|
|
return removed
|
|
|
|
def rationalize_cover2(self, opf, log):
|
|
''' Ensure that the cover information in the guide is correct. That
|
|
means, at most one entry with type="cover" that points to a raster
|
|
cover and at most one entry with type="titlepage" that points to an
|
|
HTML titlepage. '''
|
|
removed = None
|
|
from lxml import etree
|
|
guide_cover, guide_elem = None, None
|
|
for guide_elem in opf.iterguide():
|
|
if guide_elem.get('type', '').lower() == 'cover':
|
|
guide_cover = guide_elem.get('href', '').partition('#')[0]
|
|
break
|
|
if not guide_cover:
|
|
raster_cover = opf.raster_cover
|
|
if raster_cover:
|
|
if guide_elem is None:
|
|
g = opf.root.makeelement(base.tag('opf', 'guide'))
|
|
opf.root.append(g)
|
|
else:
|
|
g = guide_elem.getparent()
|
|
guide_cover = raster_cover
|
|
guide_elem = g.makeelement(base.tag('opf', 'reference'),
|
|
attrib={'href': raster_cover,
|
|
'type': 'cover'})
|
|
g.append(guide_elem)
|
|
return
|
|
spine = list(opf.iterspine())
|
|
if not spine:
|
|
return
|
|
# Check if the cover specified in the guide is also
|
|
# the first element in spine
|
|
idref = spine[0].get('idref', '')
|
|
manifest = list(opf.itermanifest())
|
|
if not manifest:
|
|
return
|
|
elem = [x for x in manifest if x.get('id', '') == idref]
|
|
if not elem or elem[0].get('href', None) != guide_cover:
|
|
return
|
|
log('Found HTML cover', guide_cover)
|
|
|
|
# Remove from spine as covers must be treated
|
|
# specially
|
|
if not self.for_viewer:
|
|
if len(spine) == 1:
|
|
log.warn('There is only a single spine item and it is marked '
|
|
'as the cover. Removing cover marking.')
|
|
for guide_elem in tuple(opf.iterguide()):
|
|
if guide_elem.get('type', '').lower() == 'cover':
|
|
guide_elem.getparent().remove(guide_elem)
|
|
return
|
|
else:
|
|
spine[0].getparent().remove(spine[0])
|
|
removed = guide_cover
|
|
else:
|
|
# Ensure the cover is displayed as the first item in the book, some
|
|
# epub files have it set with linear='no' which causes the cover to
|
|
# display in the end
|
|
spine[0].attrib.pop('linear', None)
|
|
opf.spine[0].is_linear = True
|
|
# Ensure that the guide has a cover entry pointing to a raster cover
|
|
# and a titlepage entry pointing to the html titlepage. The titlepage
|
|
# entry will be used by the epub output plugin, the raster cover entry
|
|
# by other output plugins.
|
|
|
|
# Search for a raster cover identified in the OPF
|
|
raster_cover = opf.raster_cover
|
|
|
|
# Set the cover guide entry
|
|
if raster_cover is not None:
|
|
guide_elem.set('href', raster_cover)
|
|
else:
|
|
# Render the titlepage to create a raster cover
|
|
from ebook_converter.ebooks import render_html_svg_workaround
|
|
guide_elem.set('href', 'calibre_raster_cover.jpg')
|
|
t = etree.SubElement(elem[0].getparent(), base.tag('opf', 'item'),
|
|
href=guide_elem.get('href'),
|
|
id='calibre_raster_cover')
|
|
t.set('media-type', 'image/jpeg')
|
|
if os.path.exists(guide_cover):
|
|
renderer = render_html_svg_workaround(guide_cover, log)
|
|
if renderer is not None:
|
|
with open('calibre_raster_cover.jpg', 'wb') as f:
|
|
f.write(renderer)
|
|
|
|
# Set the titlepage guide entry
|
|
self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
|
|
return removed
|
|
|
|
def find_opf(self):
|
|
def attr(n, attr):
|
|
for k, v in n.attrib.items():
|
|
if k.endswith(attr):
|
|
return v
|
|
try:
|
|
with open('META-INF/container.xml', 'rb') as f:
|
|
root = etree.fromstring(f.read())
|
|
for r in root.xpath('//*[local-name()="rootfile"]'):
|
|
if (attr(r, 'media-type') !=
|
|
"application/oebps-package+xml"):
|
|
continue
|
|
path = attr(r, 'full-path')
|
|
if not path:
|
|
continue
|
|
path = os.path.join(os.getcwd(), *path.split('/'))
|
|
if os.path.exists(path):
|
|
return path
|
|
except Exception:
|
|
traceback.print_exc()
|
|
|
|
def convert(self, stream, options, file_ext, log, accelerators):
|
|
from ebook_converter.utils.zipfile import ZipFile
|
|
from ebook_converter import walk
|
|
from ebook_converter.ebooks import DRMError
|
|
try:
|
|
zf = ZipFile(stream)
|
|
zf.extractall(os.getcwd())
|
|
except Exception:
|
|
log.exception('EPUB appears to be invalid ZIP file, trying a'
|
|
' more forgiving ZIP parser')
|
|
from ebook_converter.utils.localunzip import extractall
|
|
stream.seek(0)
|
|
extractall(stream)
|
|
encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
|
|
opf = self.find_opf()
|
|
if opf is None:
|
|
for f in walk('.'):
|
|
if f.lower().endswith('.opf') and '__MACOSX' not in f and \
|
|
not os.path.basename(f).startswith('.'):
|
|
opf = os.path.abspath(f)
|
|
break
|
|
path = getattr(stream, 'name', 'stream')
|
|
|
|
if opf is None:
|
|
raise ValueError('%s is not a valid EPUB file (could not find '
|
|
'opf)' % path)
|
|
|
|
opf = os.path.relpath(opf, os.getcwd())
|
|
# parts = os.path.split(opf)
|
|
opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))
|
|
|
|
self._encrypted_font_uris = []
|
|
if os.path.exists(encfile):
|
|
if not self.process_encryption(encfile, opf, log):
|
|
raise DRMError(os.path.basename(path))
|
|
self.encrypted_fonts = self._encrypted_font_uris
|
|
|
|
# XXX(gryf): this code would fail pretty ugly, thus, this part was
|
|
# never used.
|
|
# if len(parts) > 1 and parts[0]:
|
|
# delta = '/'.join(parts[:-1])+'/'
|
|
|
|
# def normpath(x):
|
|
# return posixpath.normpath(delta + elem.get('href'))
|
|
|
|
# for elem in opf.itermanifest():
|
|
# elem.set('href', normpath(elem.get('href')))
|
|
# for elem in opf.iterguide():
|
|
# elem.set('href', normpath(elem.get('href')))
|
|
|
|
if opf.package_version >= 3.0:
|
|
f = self.rationalize_cover3
|
|
else:
|
|
f = self.rationalize_cover2
|
|
self.removed_cover = f(opf, log)
|
|
if self.removed_cover:
|
|
self.removed_items_to_ignore = (self.removed_cover,)
|
|
epub3_nav = opf.epub3_nav
|
|
if epub3_nav is not None:
|
|
self.convert_epub3_nav(epub3_nav, opf, log, options)
|
|
|
|
for x in opf.itermanifest():
|
|
if x.get('media-type', '') == 'application/x-dtbook+xml':
|
|
raise ValueError(
|
|
'EPUB files with DTBook markup are not supported')
|
|
|
|
not_for_spine = set()
|
|
for y in opf.itermanifest():
|
|
id_ = y.get('id', None)
|
|
if id_:
|
|
mt = y.get('media-type', None)
|
|
if mt in {
|
|
'application/vnd.adobe-page-template+xml',
|
|
'application/vnd.adobe.page-template+xml',
|
|
'application/adobe-page-template+xml',
|
|
'application/adobe.page-template+xml',
|
|
'application/text'
|
|
}:
|
|
not_for_spine.add(id_)
|
|
ext = y.get('href', '').rpartition('.')[-1].lower()
|
|
if mt == 'text/plain' and ext in {'otf', 'ttf'}:
|
|
# some epub authoring software sets font mime types to
|
|
# text/plain
|
|
not_for_spine.add(id_)
|
|
y.set('media-type', 'application/font')
|
|
|
|
seen = set()
|
|
for x in list(opf.iterspine()):
|
|
ref = x.get('idref', None)
|
|
if not ref or ref in not_for_spine or ref in seen:
|
|
x.getparent().remove(x)
|
|
continue
|
|
seen.add(ref)
|
|
|
|
if len(list(opf.iterspine())) == 0:
|
|
raise ValueError('No valid entries in the spine of this EPUB')
|
|
|
|
with open('content.opf', 'wb') as nopf:
|
|
nopf.write(opf.render())
|
|
|
|
return os.path.abspath('content.opf')
|
|
|
|
def convert_epub3_nav(self, nav_path, opf, log, opts):
|
|
from lxml import etree
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
from ebook_converter.ebooks.oeb.polish.parsing import parse
|
|
from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, \
|
|
NCX, urlnormalize, urlunquote, serialize
|
|
from ebook_converter.ebooks.oeb.polish.toc import first_child
|
|
from tempfile import NamedTemporaryFile
|
|
with open(nav_path, 'rb') as f:
|
|
raw = f.read()
|
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
|
assume_utf8=True)[0]
|
|
root = parse(raw, log=log)
|
|
ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
|
|
'ncx/" version="2005-1" xml:lang="eng">'
|
|
'<navMap/></ncx>')
|
|
navmap = ncx[0]
|
|
et = '{%s}type' % EPUB_NS
|
|
bn = os.path.basename(nav_path)
|
|
|
|
def add_from_li(li, parent):
|
|
href = text = None
|
|
for x in li.iterchildren(XHTML('a'), XHTML('span')):
|
|
text = etree.tostring(x, method='text', encoding='unicode',
|
|
with_tail=False).strip() or ' '.join(
|
|
x.xpath('descendant-or-self::*/@title')).strip()
|
|
href = x.get('href')
|
|
if href:
|
|
if href.startswith('#'):
|
|
href = bn + href
|
|
break
|
|
np = parent.makeelement(NCX('navPoint'))
|
|
parent.append(np)
|
|
np.append(np.makeelement(NCX('navLabel')))
|
|
np[0].append(np.makeelement(NCX('text')))
|
|
np[0][0].text = text
|
|
if href:
|
|
np.append(np.makeelement(NCX('content'), attrib={'src': href}))
|
|
return np
|
|
|
|
def process_nav_node(node, toc_parent):
|
|
for li in node.iterchildren(XHTML('li')):
|
|
child = add_from_li(li, toc_parent)
|
|
ol = first_child(li, XHTML('ol'))
|
|
if child is not None and ol is not None:
|
|
process_nav_node(ol, child)
|
|
|
|
for nav in root.iterdescendants(XHTML('nav')):
|
|
if nav.get(et) == 'toc':
|
|
ol = first_child(nav, XHTML('ol'))
|
|
if ol is not None:
|
|
process_nav_node(ol, navmap)
|
|
break
|
|
else:
|
|
return
|
|
|
|
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path),
|
|
delete=False) as f:
|
|
f.write(etree.tostring(ncx, encoding='utf-8'))
|
|
ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
|
|
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME,
|
|
append=True).get('id')
|
|
for spine in opf.root.xpath('//*[local-name()="spine"]'):
|
|
spine.set('toc', ncx_id)
|
|
url = os.path.relpath(nav_path).replace(os.sep, '/')
|
|
opts.epub3_nav_href = urlnormalize(url)
|
|
opts.epub3_nav_parsed = root
|
|
if getattr(self, 'removed_cover', None):
|
|
changed = False
|
|
base_path = os.path.dirname(nav_path)
|
|
for elem in root.xpath('//*[@href]'):
|
|
href, frag = elem.get('href').partition('#')[::2]
|
|
link_path = os.path.relpath(os.path.join(base_path,
|
|
urlunquote(href)),
|
|
base_path)
|
|
abs_href = urlnormalize(link_path)
|
|
if abs_href == self.removed_cover:
|
|
changed = True
|
|
elem.set('data-calibre-removed-titlepage', '1')
|
|
if changed:
|
|
with open(nav_path, 'wb') as f:
|
|
f.write(serialize(root, 'application/xhtml+xml'))
|
|
|
|
def postprocess_book(self, oeb, opts, log):
|
|
rc = getattr(self, 'removed_cover', None)
|
|
if rc:
|
|
cover_toc_item = None
|
|
for item in oeb.toc.iterdescendants():
|
|
if item.href and item.href.partition('#')[0] == rc:
|
|
cover_toc_item = item
|
|
break
|
|
spine = {x.href for x in oeb.spine}
|
|
if (cover_toc_item is not None and cover_toc_item not in spine):
|
|
oeb.toc.item_that_refers_to_cover = cover_toc_item
|