1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-26 15:25:45 +01:00
Files
ebook-converter/ebook_converter/ebooks/oeb/reader.py
gryf 1465e4267f Sorted out mime initialization.
Every mime-related function in the main __init__.py has a flag check to
see whether initialization has already been done. This is nonsense, since
it should be done implicitly, early on, when the converter is starting.

This commit straightens things out, and initialization is now done in the
cli module.

Also, function guess_type was removed, since it's just a proxy for
mimetypes.guess_type function.
2020-06-14 15:41:18 +02:00

737 lines
29 KiB
Python

"""
Container-/OPF-based input OEBBook reader.
"""
import collections
import copy
import io
import mimetypes
import os
import re
import sys
import urllib.parse
import uuid
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb.writer import OEBWriter
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils.localization import get_lang
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter import xml_replace_entities
from ebook_converter.polyglot.urllib import unquote
class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.

    An instance is used as a callable: ``reader(oeb, path)`` loads the book
    found at ``path`` into the given book object and returns it.
    """
    # XPath matching <svg> elements below <body> that are the first <svg>
    # within their parent; consulted when looking for a cover.
    COVER_SVG_XP = base.XPath('h:body//svg:svg[position() = 1]')
    # XPath matching <object data="..."> elements below <body>; consulted
    # when looking for a cover.
    COVER_OBJECT_XP = base.XPath('h:body//h:object[@data][position() = 1]')
    Container = base.DirContainer
    """Container type used to access book files. Override in sub-classes."""
    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content read with this Reader."""
    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""
@classmethod
def config(cls, cfg):
    """Hook for registering book-reading options on the :class:`Config`
    object :param:`cfg`.

    This reader defines no options, so :param:`cfg` is left untouched.
    """
    return None
@classmethod
def generate(cls, opts):
    """Build a Reader instance from command-line options.

    :param:`opts` is accepted but not consulted; the class is simply
    instantiated without arguments.
    """
    reader = cls()
    return reader
def __call__(self, oeb, path):
"""Read the book at :param:`path` into the :class:`OEBBook` object
:param:`oeb`.
"""
self.oeb = oeb
self.logger = self.log = oeb.logger
oeb.container = self.Container(path, self.logger)
oeb.container.log = oeb.log
opf = self._read_opf()
self._all_from_opf(opf)
return oeb
def _clean_opf(self, opf):
    """Rebuild :param:`opf` as a fresh OPF 2.0 ``package`` element.

    Elements without a namespace (or in the OPF 1 namespace) are moved to
    the ``opf`` namespace, Dublin Core elements are renamed into the DC 1.1
    namespace with lower-cased names, and the metadata, manifest, spine,
    tours and guide sections are re-parented under the new root, which is
    returned. The elements of :param:`opf` are modified and moved in place.
    """
    # First pass: gather every namespace mapping declared in the document.
    collected_ns = {}
    for node in opf.iter(tag=etree.Element):
        collected_ns.update(node.nsmap)

    # Second pass: retag elements that have no real namespace.
    for node in opf.iter(tag=etree.Element):
        if parse_utils.namespace(node.tag) not in ('', const.OPF1_NS):
            continue
        if ':' in parse_utils.barename(node.tag):
            continue
        node.tag = base.tag('opf', parse_utils.barename(node.tag))

    collected_ns.update(const.OPF2_NSMAP)

    package = etree.Element(base.tag('opf', 'package'),
                            nsmap={None: const.OPF2_NS},
                            attrib=dict(opf.attrib))
    metadata = etree.SubElement(package, base.tag('opf', 'metadata'),
                                nsmap=collected_ns)

    # OEB 1.x wrapper elements: their children are kept, they are not.
    wrappers = (base.tag('opf', 'dc-metadata'),
                base.tag('opf', 'x-metadata'))
    for node in base.xpath(opf, 'o2:metadata//*'):
        if node.tag in wrappers:
            continue
        if parse_utils.namespace(node.tag) in const.DC_NSES:
            local = parse_utils.barename(node.tag).lower()
            node.tag = '{%s}%s' % (const.DC11_NS, local)
        if node.tag.startswith('dc:'):
            local = node.tag.partition(':')[-1].lower()
            node.tag = '{%s}%s' % (const.DC11_NS, local)
        metadata.append(node)

    for meta in base.xpath(opf, 'o2:metadata//o2:meta'):
        metadata.append(meta)

    for section in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
        for node in base.xpath(opf, section):
            package.append(node)

    return package
def _read_opf(self):
    """Read, parse and normalise the OPF of the current container.

    Parsing is attempted up to three times: on the text as read, then
    after cleaning invalid XML characters and replacing entities, and
    finally after dropping the ``<tours>`` section and declaring the
    ``dc`` prefix on ``<dc-metadata>``. An error from the last attempt
    propagates.

    :raises base.OEBError: if the root element is in an unexpected
        namespace.
    :return: the cleaned OPF root as produced by :meth:`_clean_opf`.
    """
    # NOTE(review): this parses with etree.fromstring although the module
    # imports safe_xml_fromstring -- confirm which parser is intended.
    text = self.oeb.decode(self.oeb.container.read(None))
    text = base.XMLDECL_RE.sub('', text)
    text = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                  const.OPF1_NS, text)
    try:
        root = etree.fromstring(text)
    except etree.XMLSyntaxError:
        text = xml_replace_entities(clean_xml_chars(text), encoding=None)
        try:
            root = etree.fromstring(text)
            self.logger.warn('OPF contains invalid HTML named entities')
        except etree.XMLSyntaxError:
            text = re.sub(r'(?is)<tours>.+</tours>', '', text)
            text = text.replace('<dc-metadata>',
                                '<dc-metadata xmlns:dc="'
                                'http://purl.org/metadata/dublin_core">')
            root = etree.fromstring(text)
            self.logger.warn('OPF contains invalid tours section')

    root_ns = parse_utils.namespace(root.tag)
    accepted = ('', const.OPF1_NS, const.OPF2_NS)
    if root_ns not in accepted:
        raise base.OEBError('Invalid namespace %r for OPF document' %
                            root_ns)
    return self._clean_opf(root)
def _metadata_from_opf(self, opf):
    """Populate ``self.oeb.metadata`` from the OPF tree :param:`opf`.

    The tree is serialised and parsed with the ``opf2.OPF`` metadata
    reader. Missing values are filled in: language (from ``get_lang()``),
    book producer, title and an ``aut`` creator. A fresh UUID identifier
    is always added and becomes ``self.oeb.uid``.
    """
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata

    raw = etree.tostring(opf, xml_declaration=True, encoding='utf-8')
    parsed = OPF(io.BytesIO(raw))

    writing_mode = parsed.primary_writing_mode
    if writing_mode:
        self.oeb.metadata.primary_writing_mode = writing_mode

    mi = parsed.to_book_metadata()
    if not mi.language:
        mi.language = get_lang().replace('_', '-')
    self.oeb.metadata.add('language', mi.language)
    if not mi.book_producer:
        mi.book_producer = ('%(a)s (%(v)s) [http://%(a)s-ebook.com]' %
                            {'a': __appname__, 'v': __version__})
    meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)

    metadata = self.oeb.metadata
    metadata.add('identifier', str(uuid.uuid4()), id='uuid_id',
                 scheme='uuid')
    self.oeb.uid = self.oeb.metadata.identifier[-1]

    if not metadata.title:
        metadata.add('title', self.oeb.translate('Unknown'))

    # A creator with no role, or the role 'aut', counts as an author.
    has_author = any(
        getattr(creator, 'role', '').lower() in ('', 'aut')
        for creator in metadata.creator)
    if not has_author:
        metadata.add('creator', self.oeb.translate('Unknown'), role='aut')
def _manifest_prune_invalid(self):
    '''
    Drop manifest items whose content cannot be parsed, so that a few
    corrupted files do not make the whole conversion fail.

    Only documents and stylesheets are checked. Each failure is logged
    with its traceback.

    :return: list of the items that were removed.
    '''
    parseable = base.OEB_DOCS.union(base.OEB_STYLES)
    rejected = []
    # Iterate over a snapshot: items are removed while looping.
    for entry in list(self.oeb.manifest.values()):
        if entry.media_type not in parseable:
            continue
        try:
            # Accessing .data triggers parsing of the item.
            entry.data
        except Exception:
            # KeyboardInterrupt is not an Exception subclass, so it still
            # propagates from here.
            self.logger.exception('Failed to parse content in %s' %
                                  entry.href)
            rejected.append(entry)
            self.oeb.manifest.remove(entry)
    return rejected
def _manifest_add_missing(self, invalid):
    """Add files that manifest items reference but the manifest omits.

    Links are collected from every document/XML item (through
    ``base.iterlinks``) and from every stylesheet (through
    ``css_parser.getUrls``). Each scheme-less href that is not yet known
    and exists in the container is added under a generated id with a
    guessed media type, and the new entry is scanned in turn, until no
    new references turn up. Items whose data cannot be read are removed
    from the manifest.

    NOTE(review): ``invalid`` is rebound to an empty set before it is
    read, so the value passed by the caller is ignored -- confirm whether
    that is intended.
    """
    import css_parser
    manifest = self.oeb.manifest
    # hrefs already accounted for, whether present, added or reported.
    known = set(manifest.hrefs)
    # Items whose links still have to be scanned.
    unchecked = set(manifest.values())
    cdoc = base.OEB_DOCS | base.OEB_STYLES
    invalid = set()
    while unchecked:
        new = set()
        for item in unchecked:
            data = None
            if (item.media_type in cdoc or
                    item.media_type[-4:] in ('/xml', '+xml')):
                try:
                    data = item.data
                except Exception:
                    self.oeb.log.exception('Failed to read from manifest '
                                           'entry with id: %s, ignoring' %
                                           item.id)
                    invalid.add(item)
                    continue
            if data is None:
                continue
            if (item.media_type in base.OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')):
                # The third field of each iterlinks() result is used as
                # the link target.
                hrefs = [r[2] for r in base.iterlinks(data)]
                for href in hrefs:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    href, _ = urllib.parse.urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(base.urlnormalize(href))
                        scheme = urllib.parse.urlparse(href).scheme
                    except Exception:
                        self.oeb.log.exception('Skipping invalid href: '
                                               '%r' % href)
                        continue
                    # Only scheme-less (local) references are candidates.
                    if not scheme and href not in known:
                        new.add(href)
            elif item.media_type in base.OEB_STYLES:
                try:
                    urls = list(css_parser.getUrls(data))
                except Exception:
                    urls = []
                for url in urls:
                    href, _ = urllib.parse.urldefrag(url)
                    href = item.abshref(base.urlnormalize(href))
                    scheme = urllib.parse.urlparse(href).scheme
                    if not scheme and href not in known:
                        new.add(href)
        unchecked.clear()
        warned = set()
        for href in new:
            known.add(href)
            is_invalid = False
            for item in invalid:
                if href == item.abshref(base.urlnormalize(href)):
                    is_invalid = True
                    break
            if is_invalid:
                continue
            if not self.oeb.container.exists(href):
                if href not in warned:
                    self.logger.warn('Referenced file %r not found' % href)
                    warned.add(href)
                continue
            if href not in warned:
                self.logger.warn('Referenced file %r not in manifest' %
                                 href)
                warned.add(href)
            id, _ = manifest.generate(id='added')
            guessed = mimetypes.guess_type(href)[0]
            media_type = guessed or base.BINARY_MIME
            added = manifest.add(id, href, media_type)
            # Newly added entries are scanned on the next pass.
            unchecked.add(added)
    # NOTE(review): the nesting of this clean-up loop relative to the
    # ``while`` above could not be determined from the available source
    # and is assumed to follow it -- confirm.
    for item in invalid:
        self.oeb.manifest.remove(item)
def _manifest_from_opf(self, opf):
    """Fill ``self.oeb.manifest`` from the ``<manifest>`` of :param:`opf`.

    The media type comes from ``media-type`` (or the ``mediatype``
    spelling); when it is missing or ``text/xml`` it is guessed from the
    href. Entries with a duplicate href or a file missing from the
    container are skipped with a warning; a duplicate id gets a newly
    generated id/href pair. Afterwards unparsable items are pruned and
    referenced-but-unlisted files are added.
    """
    manifest = self.oeb.manifest
    for node in base.xpath(opf, '/o2:package/o2:manifest/o2:item'):
        item_id = node.get('id')
        href = node.get('href')

        media_type = node.get('media-type', None)
        if media_type is None:
            media_type = node.get('mediatype', None)
        if not media_type or media_type == 'text/xml':
            media_type = (mimetypes.guess_type(href)[0] or media_type or
                          base.BINARY_MIME)
        if hasattr(media_type, 'lower'):
            media_type = media_type.lower()

        fallback = node.get('fallback')

        if href in manifest.hrefs:
            self.logger.warn('Duplicate manifest entry for %r' % href)
            continue
        if not self.oeb.container.exists(href):
            self.logger.warn('Manifest item %r not found' % href)
            continue
        if item_id in manifest.ids:
            self.logger.warn('Duplicate manifest id %r' % item_id)
            item_id, href = manifest.generate(item_id, href)
        manifest.add(item_id, href, media_type, fallback)

    pruned = self._manifest_prune_invalid()
    self._manifest_add_missing(pruned)
def _spine_add_extra(self):
    """Append to the spine, as non-linear items, the documents that are
    linked from spine documents but are not in the spine themselves.

    Links are followed transitively through ``<a href>`` elements. Items
    whose href is listed in ``self.oeb.removed_items_to_ignore`` (when
    that attribute exists) are left out. For OPF version 2 and above a
    warning is logged for each added item.
    """
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    link_xp = base.XPath('h:body//h:a/@href')

    extras = set()
    pending = set(spine)
    while pending:
        discovered = set()
        for source in pending:
            if source.media_type not in base.OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for link in link_xp(source.data):
                link, _ = urllib.parse.urldefrag(link)
                if not link:
                    continue
                try:
                    link = source.abshref(base.urlnormalize(link))
                except ValueError:  # Malformed URL
                    continue
                if link not in manifest.hrefs:
                    continue
                target = manifest.hrefs[link]
                if (target.media_type in base.OEB_DOCS and
                        target not in spine and target not in extras):
                    discovered.add(target)
        extras.update(discovered)
        pending = discovered

    # Only the first character of the version string is considered.
    major = int(self.oeb.version[0])
    ignorable = getattr(self.oeb, 'removed_items_to_ignore', ())
    for extra in sorted(extras):
        if extra.href in ignorable:
            continue
        if major >= 2:
            self.logger.warn(
                'Spine-referenced file %r not in spine' % extra.href)
        spine.add(extra, linear=False)
def _spine_from_opf(self, opf):
    """Fill ``self.oeb.spine`` from the ``<spine>`` of :param:`opf`.

    An ``itemref`` is accepted when its manifest item is a parsed
    document that is not an NCX, or when its parsed root is an ``html``
    element (the media type is then corrected to XHTML). Other entries
    are skipped with a warning. Documents linked from the spine are
    appended afterwards, and a valid ``page-progression-direction`` is
    copied to the spine.

    :raises base.OEBError: if no spine item could be added.
    """
    spine = self.oeb.spine
    manifest = self.oeb.manifest
    for ref in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
        idref = ref.get('idref')
        if idref not in manifest.ids:
            self.logger.warn('Spine item %r not found' % idref)
            continue
        item = manifest.ids[idref]
        is_document = (item.media_type.lower() in base.OEB_DOCS and
                       hasattr(item.data, 'xpath') and
                       not getattr(item.data, 'tag', '').endswith('}ncx'))
        if is_document:
            spine.add(item, ref.get('linear'))
        elif (hasattr(item.data, 'tag') and item.data.tag and
                item.data.tag.endswith('}html')):
            # Declared with another media type, but it parsed as HTML.
            item.media_type = base.XHTML_MIME
            spine.add(item, ref.get('linear'))
        else:
            self.oeb.log.warn('The item %s is not a XML document.'
                              ' Removing it from spine.' % item.href)

    if len(spine) == 0:
        raise base.OEBError("Spine is empty")
    self._spine_add_extra()

    directions = base.xpath(
        opf, '/o2:package/o2:spine/@page-progression-direction')
    for direction in directions:
        if direction in {'ltr', 'rtl'}:
            spine.page_progression_direction = direction
def _guide_from_opf(self, opf):
    """Fill ``self.oeb.guide`` from the ``<guide>`` of :param:`opf`.

    A reference whose path (fragment removed) is not in the manifest is
    retried case-insensitively; when that matches, the manifest's href is
    used instead. References that still cannot be resolved are skipped
    with a warning. Only the first reference of each type is kept.
    """
    guide = self.oeb.guide
    manifest = self.oeb.manifest
    for ref in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = ref.get('href')
        path = base.urlnormalize(urllib.parse.urldefrag(ref_href)[0])
        if path not in manifest.hrefs:
            wanted = path.lower()
            match = next((known for known in manifest.hrefs
                          if known.lower() == wanted), None)
            if match is None:
                self.logger.warn('Guide reference %r not found' % ref_href)
                continue
            ref_href = match
        ref_type = ref.get('type')
        if ref_type not in guide:
            guide.add(ref_type, ref.get('title'), ref_href)
def _find_ncx(self, opf):
    """Locate the NCX item, remove it from the manifest and return it.

    The ``toc`` attribute of ``<spine>`` is authoritative when present:
    ``None`` is returned if it names an unknown id. Without it, the first
    manifest item with the NCX media type is used. ``None`` is returned
    when nothing is found.
    """
    toc_ids = base.xpath(opf, '/o2:package/o2:spine/@toc')
    if toc_ids:
        ncx_id = toc_ids[0]
        if ncx_id not in self.oeb.manifest.ids:
            return None
        found = self.oeb.manifest.ids[ncx_id]
    else:
        found = next((candidate
                      for candidate in self.oeb.manifest.values()
                      if candidate.media_type == base.NCX_MIME), None)
        if found is None:
            return None
    self.oeb.manifest.remove(found)
    return found
def _toc_from_navpoint(self, item, toc, navpoint):
    """Recursively add the ``navPoint`` children of :param:`navpoint` to
    the TOC node :param:`toc`.

    :param item: manifest item of the NCX; used to resolve ``src`` values
        through ``item.abshref``.
    :param toc: TOC node that receives the new entries.
    :param navpoint: NCX element whose direct ``navPoint`` children are
        processed.
    """
    children = base.xpath(navpoint, 'ncx:navPoint')
    for child in children:
        title = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()'))
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        href = base.xpath(child, 'ncx:content/@src')
        if not title:
            # Untitled node: no entry is created for it, and its children
            # are attached to the current TOC node instead.
            self._toc_from_navpoint(item, toc, child)
            continue
        if (not href or not href[0]) and not base.xpath(child, 'ncx:navPoint'):
            # This node is useless
            continue
        if href and href[0]:
            href = item.abshref(base.urlnormalize(href[0]))
        else:
            href = ''
        path, _ = urllib.parse.urldefrag(href)
        if path and path not in self.oeb.manifest.hrefs:
            path = base.urlnormalize(path)
        if href and path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            # A dangling entry is kept only if it has children.
            gc = base.xpath(child, 'ncx:navPoint')
            if not gc:
                # This node is useless
                continue
        id = child.get('id')
        klass = child.get('class', 'chapter')
        try:
            # The default argument is evaluated eagerly, so
            # next_play_order() is called even when playOrder is present.
            po = int(child.get('playOrder',
                               self.oeb.toc.next_play_order()))
        except Exception:
            po = self.oeb.toc.next_play_order()
        # Optional calibre:meta extensions: author, description and
        # toc_thumbnail.
        authorElement = base.xpath(child,
                                   'descendant::calibre:meta[@name = "author"]')
        if authorElement:
            author = authorElement[0].text
        else:
            author = None
        descriptionElement = base.xpath(child,
                                        'descendant::calibre:meta[@name = '
                                        '"description"]')
        if descriptionElement:
            description = etree.tostring(descriptionElement[0],
                                         method='text',
                                         encoding='unicode').strip()
            if not description:
                description = None
        else:
            description = None
        index_image = base.xpath(child,
                                 'descendant::calibre:meta[@name = '
                                 '"toc_thumbnail"]')
        toc_thumbnail = (index_image[0].text if index_image else None)
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None
        node = toc.add(title, href, id=id, klass=klass,
                       play_order=po, description=description,
                       author=author, toc_thumbnail=toc_thumbnail)
        # Descend: grandchildren hang below the node just created.
        self._toc_from_navpoint(item, node, child)
def _toc_from_ncx(self, item):
    """Build the TOC from the NCX manifest item :param:`item`.

    The TOC title comes from the NCX ``docTitle``, falling back to the
    first metadata title. Every ``navMap`` is walked.

    :return: ``False`` when there is no NCX item or it has no data,
        ``True`` otherwise.
    """
    if item is None or item.data is None:
        return False
    self.log.debug('Reading TOC from NCX...')
    ncx = item.data

    doc_title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
    doc_title = base.COLLAPSE_RE.sub(' ', doc_title.strip())
    if not doc_title:
        doc_title = str(self.oeb.metadata.title[0])

    toc = self.oeb.toc
    toc.title = doc_title
    for navmap in base.xpath(ncx, 'ncx:navMap'):
        self._toc_from_navpoint(item, toc, navmap)
    return True
def _toc_from_tour(self, opf):
    """Build the TOC from the first ``<tour>`` found in :param:`opf`.

    Sites lacking a title or href are ignored; sites whose target is not
    in the manifest are skipped with a warning.

    :return: ``False`` when the OPF has no tour, ``True`` otherwise.
    """
    tours = base.xpath(opf, 'o2:tours/o2:tour')
    if not tours:
        return False
    self.log.debug('Reading TOC from tour...')
    first_tour = tours[0]
    toc = self.oeb.toc
    toc.title = first_tour.get('title')
    for site in base.xpath(first_tour, 'o2:site'):
        title, href = site.get('title'), site.get('href')
        if not (title and href):
            continue
        path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
        if path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            continue
        toc.add(title, href, id=site.get('id'))
    return True
def _toc_from_html(self, opf):
    """Build the TOC from the HTML document named by the guide's ``toc``
    reference.

    When the reference has a fragment, the search starts at the element
    with that ``id`` (or ``name``) and moves up through its ancestors
    until one containing links is found. Links to the same target are
    merged into a single entry whose title is the joined link texts;
    entries keep the order in which targets first appear.

    :return: ``False`` when the guide has no ``toc`` reference, ``True``
        otherwise.
    """
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    root = item.data
    scope = root
    if frag:
        matches = base.xpath(root, './/*[@id="%s"]' % frag)
        if not matches:
            matches = base.xpath(root, './/*[@name="%s"]' % frag)
        scope = matches[0] if matches else root
        while scope != root and not base.xpath(scope, './/h:a[@href]'):
            scope = scope.getparent()

    # Plain dicts preserve insertion order, which keeps first-seen order.
    grouped = {}
    for anchor in base.xpath(scope, './/h:a[@href]'):
        href = item.abshref(base.urlnormalize(anchor.attrib['href']))
        path, _ = urllib.parse.urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            continue
        text = base.COLLAPSE_RE.sub(' ', base.xml2text(anchor).strip())
        grouped.setdefault(href, []).append(text)

    toc = self.oeb.toc
    for href, parts in grouped.items():
        toc.add(' '.join(parts), href)
    return True
def _toc_from_spine(self, opf):
    """Generate a fallback TOC with one entry per linear spine item.

    Each entry is labelled with the document's ``<title>``. The first
    non-empty heading (``h1``-``h5``, then ``strong``) is used instead
    when the document has no title, or for every entry when titles are
    duplicated across documents. ``'(unlabled)'`` is used when no heading
    exists either.

    Labels are kept paired with the item they were read from. The
    previous implementation zipped a list built from linear (and, for
    titles, non-empty) entries against the whole spine, so a non-linear
    item or an empty title attached every later label to the wrong file.

    :param opf: unused; kept for a uniform ``_toc_from_*`` signature.
    :return: always ``True``.
    """
    self.log.warn('Generating default TOC from spine...')
    toc = self.oeb.toc
    entries = []  # (spine item, title, header) for each linear item
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(base.xpath(html, '/h:html/h:head/h:title/text()'))
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        header = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            candidate = ''.join(base.xpath(html, expr % tag))
            candidate = base.COLLAPSE_RE.sub(' ', candidate.strip())
            if candidate:
                header = candidate
                break
        entries.append((item, title, header))

    # Duplicated titles would make entries indistinguishable, so fall
    # back to headings for the whole TOC in that case.
    titles = [title for _, title, _ in entries if title]
    use_headers = len(titles) > len(set(titles))
    for item, title, header in entries:
        label = header if (use_headers or not title) else title
        toc.add(label, item.href)
    return True
def _toc_from_opf(self, opf, item):
self.oeb.auto_generated_toc = False
if self._toc_from_ncx(item):
return
# Prefer HTML to tour based TOC, since several LIT files
# have good HTML TOCs but bad tour based TOCs
if self._toc_from_html(opf):
return
if self._toc_from_tour(opf):
return
self._toc_from_spine(opf)
self.oeb.auto_generated_toc = True
def _pages_from_ncx(self, opf, item):
    """Read page targets from the ``pageList`` of the NCX item
    :param:`item` into ``self.oeb.pages``.

    Targets without a ``src`` are ignored. :param:`opf` is not used.

    :return: ``False`` when there is no NCX item, no NCX data or no page
        target; ``True`` otherwise.
    """
    ncx = None if item is None else item.data
    if ncx is None:
        return False
    targets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
    if not targets:
        return False
    pages = self.oeb.pages
    for target in targets:
        label = ''.join(base.xpath(target, 'ncx:navLabel/ncx:text/text()'))
        label = base.COLLAPSE_RE.sub(' ', label.strip())
        src = base.xpath(target, 'ncx:content/@src')
        if not src:
            continue
        pages.add(label,
                  item.abshref(base.urlnormalize(src[0])),
                  type=target.get('type', 'normal'),
                  id=target.get('id'),
                  klass=target.get('class'))
    return True
def _find_page_map(self, opf):
    """Locate the page-map item, remove it from the manifest and return
    it.

    The ``page-map`` attribute of ``<spine>`` is authoritative when
    present: ``None`` is returned if it names an unknown id. Without it,
    the first manifest item with the page-map media type is used.
    ``None`` is returned when nothing is found.
    """
    map_ids = base.xpath(opf, '/o2:package/o2:spine/@page-map')
    if map_ids:
        map_id = map_ids[0]
        if map_id not in self.oeb.manifest.ids:
            return None
        found = self.oeb.manifest.ids[map_id]
    else:
        found = next((candidate
                      for candidate in self.oeb.manifest.values()
                      if candidate.media_type == base.PAGE_MAP_MIME), None)
        if found is None:
            return None
    self.oeb.manifest.remove(found)
    return found
def _pages_from_page_map(self, opf):
    """Read pages from the page-map document into ``self.oeb.pages``.

    Each page is typed by its name: ``'special'`` when the name is empty,
    ``'front'`` when it consists only of the letters ``ivxlcdm`` (in any
    case), ``'normal'`` otherwise. Pages without an href are ignored.

    :return: ``False`` when no page map exists, ``True`` otherwise.
    """
    item = self._find_page_map(opf)
    if item is None:
        return False
    pmap = item.data
    pages = self.oeb.pages
    for page in base.xpath(pmap, 'o2:page'):
        raw_name = page.get('name', '')
        target = page.get('href')
        if not target:
            continue
        name = base.COLLAPSE_RE.sub(' ', raw_name.strip())
        target = item.abshref(base.urlnormalize(target))
        if not name:
            kind = 'special'
        elif name.lower().strip('ivxlcdm') == '':
            kind = 'front'
        else:
            kind = 'normal'
        pages.add(name, target, type=kind)
    return True
def _pages_from_opf(self, opf, item):
if self._pages_from_ncx(opf, item):
return
if self._pages_from_page_map(opf):
return
return
def _cover_from_html(self, hcover):
    """Render the HTML item :param:`hcover` to an image and add it to the
    manifest as a JPEG cover.

    The book is first written to a temporary directory so the page can
    be rendered from disk. When rendering yields nothing, the new item
    is created with empty data.

    :return: the newly added manifest item.
    """
    from ebook_converter.ebooks import render_html_svg_workaround
    with TemporaryDirectory('_html_cover') as tdir:
        OEBWriter()(self.oeb, tdir)
        page_path = os.path.join(tdir, unquote(hcover.href))
        rendered = render_html_svg_workaround(page_path, self.logger) or b''
    cover_id, cover_href = self.oeb.manifest.generate('cover', 'cover.jpg')
    return self.oeb.manifest.add(cover_id, cover_href, base.JPEG_MIME,
                                 data=rendered)
def _locate_cover_image(self):
    """Return the manifest item to use as the cover image, creating one
    when needed.

    Candidates are tried in this order:

    1. the item named by the ``cover`` metadata entry, if it is an image;
    2. the guide's ``cover`` reference, if it is an image (if it is a
       document, that document replaces the first spine item as the
       cover page);
    3. the guide's ``base.MS_COVER_TYPE`` reference, if it is an image;
    4. an ``<svg>`` inside the cover page, copied into a new SVG item;
    5. an image referenced by an ``<object data=...>`` in the cover page;
    6. a rendering of the cover page (see :meth:`_cover_from_html`).

    A guide ``cover`` reference that cannot be resolved in the manifest
    is skipped with a warning instead of raising ``KeyError``.
    """
    manifest = self.oeb.manifest
    if self.oeb.metadata.cover:
        cover_id = str(self.oeb.metadata.cover[0])
        item = manifest.ids.get(cover_id, None)
        if item is not None and item.media_type in base.OEB_IMAGES:
            return item
        self.logger.warn('Invalid cover image @id %r' % cover_id)

    hcover = self.oeb.spine[0]
    if 'cover' in self.oeb.guide:
        # Drop any fragment before the manifest lookup, as is done for
        # the guide's 'toc' reference in _toc_from_html.
        href = urllib.parse.urldefrag(self.oeb.guide['cover'].href)[0]
        item = manifest.hrefs.get(href, None)
        if item is None:
            self.logger.warn('Guide cover %r not found in manifest' % href)
        elif item.media_type in base.OEB_IMAGES:
            return item
        elif item.media_type in base.OEB_DOCS:
            hcover = item
    html = hcover.data

    if base.MS_COVER_TYPE in self.oeb.guide:
        href = self.oeb.guide[base.MS_COVER_TYPE].href
        item = manifest.hrefs.get(href, None)
        if item is not None and item.media_type in base.OEB_IMAGES:
            return item

    svgs = self.COVER_SVG_XP(html)
    if svgs:
        # Copy the element so the cover page itself is left untouched.
        svg = copy.deepcopy(svgs[0])
        href = os.path.splitext(hcover.href)[0] + '.svg'
        svg_id, href = manifest.generate(hcover.id, href)
        return manifest.add(svg_id, href, base.SVG_MIME, data=svg)

    objects = self.COVER_OBJECT_XP(html)
    if objects:
        href = hcover.abshref(objects[0].get('data'))
        item = manifest.hrefs.get(href, None)
        if item is not None and item.media_type in base.OEB_IMAGES:
            return item

    return self._cover_from_html(hcover)
def _ensure_cover_image(self):
cover = self._locate_cover_image()
if self.oeb.metadata.cover:
self.oeb.metadata.cover[0].value = cover.id
return
self.oeb.metadata.add('cover', cover.id)
def _manifest_remove_duplicates(self):
seen = set()
dups = set()
for item in self.oeb.manifest:
if item.href in seen:
dups.add(item.href)
seen.add(item.href)
for href in dups:
items = [x for x in self.oeb.manifest if x.href == href]
for x in items:
if x not in self.oeb.spine:
self.oeb.log.warn('Removing duplicate manifest item with '
'id:', x.id)
self.oeb.manifest.remove_duplicate_item(x)
def _all_from_opf(self, opf):
self.oeb.version = opf.get('version', '1.2')
self._metadata_from_opf(opf)
self._manifest_from_opf(opf)
self._spine_from_opf(opf)
self._manifest_remove_duplicates()
self._guide_from_opf(opf)
item = self._find_ncx(opf)
self._toc_from_opf(opf, item)
self._pages_from_opf(opf, item)
# self._ensure_cover_image()
def main(argv=sys.argv):
    """Read each book named on the command line and print its OPF 1 and
    OPF 2 documents.

    :param argv: argument vector; ``argv[0]`` is skipped and every other
        entry is treated as a path to read.
    :return: ``0``, for use as the process exit status.
    """
    reader = OEBReader()
    for arg in argv[1:]:
        oeb = reader(base.OEBBook(), arg)
        # Serialise to text: with the default encoding etree.tostring()
        # returns bytes, which print() would show as a b'...' literal
        # with escaped newlines.
        for _name, doc in oeb.to_opf1().values():
            print(etree.tostring(doc, pretty_print=True,
                                 encoding='unicode'))
        for _name, doc in oeb.to_opf2(page_map=True).values():
            print(etree.tostring(doc, pretty_print=True,
                                 encoding='unicode'))
    return 0


if __name__ == '__main__':
    sys.exit(main())