Initial import

2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,720 @@
+"""
+Container-/OPF-based input OEBBook reader.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
+
+import sys, os, uuid, copy, re, io
+from collections import defaultdict
+
+from lxml import etree
+
+from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
+    DC_NSES, OPF, xml2text, XHTML_MIME
+from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
+    PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
+from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
+    MS_COVER_TYPE, iterlinks
+from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
+                                    urlnormalize, BINARY_MIME, \
+                                    OEBError, OEBBook, DirContainer
+from calibre.ebooks.oeb.writer import OEBWriter
+from calibre.utils.xml_parse import safe_xml_fromstring
+from calibre.utils.cleantext import clean_xml_chars
+from calibre.utils.localization import get_lang
+from calibre.ptempfile import TemporaryDirectory
+from calibre.constants import __appname__, __version__
+from calibre import guess_type, xml_replace_entities
+from polyglot.builtins import unicode_type, zip
+from polyglot.urllib import unquote, urldefrag, urlparse
+
+__all__ = ['OEBReader']
+
+
+class OEBReader(object):
+    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
+
+    COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
+    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
+
+    Container = DirContainer
+    """Container type used to access book files.  Override in sub-classes."""
+
+    DEFAULT_PROFILE = 'PRS505'
+    """Default renderer profile for content read with this Reader."""
+
+    TRANSFORMS = []
+    """List of transforms to apply to content read with this Reader."""
+
+    @classmethod
+    def config(cls, cfg):
+        """Add any book-reading options to the :class:`Config` object
+        :param:`cfg`.
+        """
+        return
+
+    @classmethod
+    def generate(cls, opts):
+        """Generate a Reader instance from command-line options."""
+        return cls()
+
+    def __call__(self, oeb, path):
+        """Read the book at :param:`path` into the :class:`OEBBook` object
+        :param:`oeb`.
+        """
+        self.oeb = oeb
+        self.logger = self.log = oeb.logger
+        oeb.container = self.Container(path, self.logger)
+        oeb.container.log = oeb.log
+        opf = self._read_opf()
+        self._all_from_opf(opf)
+        return oeb
+
+    def _clean_opf(self, opf):
+        nsmap = {}
+        for elem in opf.iter(tag=etree.Element):
+            nsmap.update(elem.nsmap)
+        for elem in opf.iter(tag=etree.Element):
+            if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
+                elem.tag = OPF(barename(elem.tag))
+        nsmap.update(OPF2_NSMAP)
+        attrib = dict(opf.attrib)
+        nroot = etree.Element(OPF('package'),
+            nsmap={None: OPF2_NS}, attrib=attrib)
+        metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
+        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
+        for elem in xpath(opf, 'o2:metadata//*'):
+            if elem.tag in ignored:
+                continue
+            if namespace(elem.tag) in DC_NSES:
+                tag = barename(elem.tag).lower()
+                elem.tag = '{%s}%s' % (DC11_NS, tag)
+            if elem.tag.startswith('dc:'):
+                tag = elem.tag.partition(':')[-1].lower()
+                elem.tag = '{%s}%s' % (DC11_NS, tag)
+            metadata.append(elem)
+        for element in xpath(opf, 'o2:metadata//o2:meta'):
+            metadata.append(element)
+        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
+            for element in xpath(opf, tag):
+                nroot.append(element)
+        return nroot
+
+    def _read_opf(self):
+        data = self.oeb.container.read(None)
+        data = self.oeb.decode(data)
+        data = XMLDECL_RE.sub('', data)
+        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
+                OPF1_NS, data)
+        try:
+            opf = safe_xml_fromstring(data)
+        except etree.XMLSyntaxError:
+            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
+            try:
+                opf = safe_xml_fromstring(data)
+                self.logger.warn('OPF contains invalid HTML named entities')
+            except etree.XMLSyntaxError:
+                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
+                data = data.replace('<dc-metadata>',
+                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
+                opf = safe_xml_fromstring(data)
+                self.logger.warn('OPF contains invalid tours section')
+
+        ns = namespace(opf.tag)
+        if ns not in ('', OPF1_NS, OPF2_NS):
+            raise OEBError('Invalid namespace %r for OPF document' % ns)
+        opf = self._clean_opf(opf)
+        return opf
+
+    def _metadata_from_opf(self, opf):
+        from calibre.ebooks.metadata.opf2 import OPF
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
+        o = OPF(stream)
+        pwm = o.primary_writing_mode
+        if pwm:
+            self.oeb.metadata.primary_writing_mode = pwm
+        mi = o.to_book_metadata()
+        if not mi.language:
+            mi.language = get_lang().replace('_', '-')
+        self.oeb.metadata.add('language', mi.language)
+        if not mi.book_producer:
+            mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\
+                dict(a=__appname__, v=__version__)
+        meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
+        m = self.oeb.metadata
+        m.add('identifier', unicode_type(uuid.uuid4()), id='uuid_id', scheme='uuid')
+        self.oeb.uid = self.oeb.metadata.identifier[-1]
+        if not m.title:
+            m.add('title', self.oeb.translate(__('Unknown')))
+        has_aut = False
+        for x in m.creator:
+            if getattr(x, 'role', '').lower() in ('', 'aut'):
+                has_aut = True
+                break
+        if not has_aut:
+            m.add('creator', self.oeb.translate(__('Unknown')), role='aut')
+
+    def _manifest_prune_invalid(self):
+        '''
+        Remove items from manifest that contain invalid data. This prevents
+        catastrophic conversion failure, when a few files contain corrupted
+        data.
+        '''
+        bad = []
+        check = OEB_DOCS.union(OEB_STYLES)
+        for item in list(self.oeb.manifest.values()):
+            if item.media_type in check:
+                try:
+                    item.data
+                except KeyboardInterrupt:
+                    raise
+                except:
+                    self.logger.exception('Failed to parse content in %s'%
+                            item.href)
+                    bad.append(item)
+                    self.oeb.manifest.remove(item)
+        return bad
+
+    def _manifest_add_missing(self, invalid):
+        import css_parser
+        manifest = self.oeb.manifest
+        known = set(manifest.hrefs)
+        unchecked = set(manifest.values())
+        cdoc = OEB_DOCS|OEB_STYLES
+        invalid = set()
+        while unchecked:
+            new = set()
+            for item in unchecked:
+                data = None
+                if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')):
+                    try:
+                        data = item.data
+                    except:
+                        self.oeb.log.exception('Failed to read from manifest '
+                                'entry with id: %s, ignoring'%item.id)
+                        invalid.add(item)
+                        continue
+                if data is None:
+                    continue
+
+                if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')):
+                    hrefs = [r[2] for r in iterlinks(data)]
+                    for href in hrefs:
+                        if isinstance(href, bytes):
+                            href = href.decode('utf-8')
+                        href, _ = urldefrag(href)
+                        if not href:
+                            continue
+                        try:
+                            href = item.abshref(urlnormalize(href))
+                            scheme = urlparse(href).scheme
+                        except:
+                            self.oeb.log.exception(
+                                'Skipping invalid href: %r'%href)
+                            continue
+                        if not scheme and href not in known:
+                            new.add(href)
+                elif item.media_type in OEB_STYLES:
+                    try:
+                        urls = list(css_parser.getUrls(data))
+                    except:
+                        urls = []
+                    for url in urls:
+                        href, _ = urldefrag(url)
+                        href = item.abshref(urlnormalize(href))
+                        scheme = urlparse(href).scheme
+                        if not scheme and href not in known:
+                            new.add(href)
+            unchecked.clear()
+            warned = set()
+            for href in new:
+                known.add(href)
+                is_invalid = False
+                for item in invalid:
+                    if href == item.abshref(urlnormalize(href)):
+                        is_invalid = True
+                        break
+                if is_invalid:
+                    continue
+                if not self.oeb.container.exists(href):
+                    if href not in warned:
+                        self.logger.warn('Referenced file %r not found' % href)
+                        warned.add(href)
+                    continue
+                if href not in warned:
+                    self.logger.warn('Referenced file %r not in manifest' % href)
+                    warned.add(href)
+                id, _ = manifest.generate(id='added')
+                guessed = guess_type(href)[0]
+                media_type = guessed or BINARY_MIME
+                added = manifest.add(id, href, media_type)
+                unchecked.add(added)
+
+            for item in invalid:
+                self.oeb.manifest.remove(item)
+
+    def _manifest_from_opf(self, opf):
+        manifest = self.oeb.manifest
+        for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
+            id = elem.get('id')
+            href = elem.get('href')
+            media_type = elem.get('media-type', None)
+            if media_type is None:
+                media_type = elem.get('mediatype', None)
+            if not media_type or media_type == 'text/xml':
+                guessed = guess_type(href)[0]
+                media_type = guessed or media_type or BINARY_MIME
+            if hasattr(media_type, 'lower'):
+                media_type = media_type.lower()
+            fallback = elem.get('fallback')
+            if href in manifest.hrefs:
+                self.logger.warn('Duplicate manifest entry for %r' % href)
+                continue
+            if not self.oeb.container.exists(href):
+                self.logger.warn('Manifest item %r not found' % href)
+                continue
+            if id in manifest.ids:
+                self.logger.warn('Duplicate manifest id %r' % id)
+                id, href = manifest.generate(id, href)
+            manifest.add(id, href, media_type, fallback)
+        invalid = self._manifest_prune_invalid()
+        self._manifest_add_missing(invalid)
+
+    def _spine_add_extra(self):
+        manifest = self.oeb.manifest
+        spine = self.oeb.spine
+        unchecked = set(spine)
+        selector = XPath('h:body//h:a/@href')
+        extras = set()
+        while unchecked:
+            new = set()
+            for item in unchecked:
+                if item.media_type not in OEB_DOCS:
+                    # TODO: handle fallback chains
+                    continue
+                for href in selector(item.data):
+                    href, _ = urldefrag(href)
+                    if not href:
+                        continue
+                    try:
+                        href = item.abshref(urlnormalize(href))
+                    except ValueError:  # Malformed URL
+                        continue
+                    if href not in manifest.hrefs:
+                        continue
+                    found = manifest.hrefs[href]
+                    if found.media_type not in OEB_DOCS or \
+                       found in spine or found in extras:
+                        continue
+                    new.add(found)
+            extras.update(new)
+            unchecked = new
+        version = int(self.oeb.version[0])
+        removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
+        for item in sorted(extras):
+            if item.href in removed_items_to_ignore:
+                continue
+            if version >= 2:
+                self.logger.warn(
+                    'Spine-referenced file %r not in spine' % item.href)
+            spine.add(item, linear=False)
+
+    def _spine_from_opf(self, opf):
+        spine = self.oeb.spine
+        manifest = self.oeb.manifest
+        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
+            idref = elem.get('idref')
+            if idref not in manifest.ids:
+                self.logger.warn('Spine item %r not found' % idref)
+                continue
+            item = manifest.ids[idref]
+            if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'):
+                spine.add(item, elem.get('linear'))
+            else:
+                if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'):
+                    item.media_type = XHTML_MIME
+                    spine.add(item, elem.get('linear'))
+                else:
+                    self.oeb.log.warn('The item %s is not a XML document.'
+                        ' Removing it from spine.'%item.href)
+        if len(spine) == 0:
+            raise OEBError("Spine is empty")
+        self._spine_add_extra()
+        for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
+            if val in {'ltr', 'rtl'}:
+                spine.page_progression_direction = val
+
+    def _guide_from_opf(self, opf):
+        guide = self.oeb.guide
+        manifest = self.oeb.manifest
+        for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
+            ref_href = elem.get('href')
+            path = urlnormalize(urldefrag(ref_href)[0])
+            if path not in manifest.hrefs:
+                corrected_href = None
+                for href in manifest.hrefs:
+                    if href.lower() == path.lower():
+                        corrected_href = href
+                        break
+                if corrected_href is None:
+                    self.logger.warn('Guide reference %r not found' % ref_href)
+                    continue
+                ref_href = corrected_href
+            typ = elem.get('type')
+            if typ not in guide:
+                guide.add(typ, elem.get('title'), ref_href)
+
+    def _find_ncx(self, opf):
+        result = xpath(opf, '/o2:package/o2:spine/@toc')
+        if result:
+            id = result[0]
+            if id not in self.oeb.manifest.ids:
+                return None
+            item = self.oeb.manifest.ids[id]
+            self.oeb.manifest.remove(item)
+            return item
+        for item in self.oeb.manifest.values():
+            if item.media_type == NCX_MIME:
+                self.oeb.manifest.remove(item)
+                return item
+        return None
+
+    def _toc_from_navpoint(self, item, toc, navpoint):
+        children = xpath(navpoint, 'ncx:navPoint')
+        for child in children:
+            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
+            title = COLLAPSE_RE.sub(' ', title.strip())
+            href = xpath(child, 'ncx:content/@src')
+            if not title:
+                self._toc_from_navpoint(item, toc, child)
+                continue
+            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
+                # This node is useless
+                continue
+            href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
+            path, _ = urldefrag(href)
+            if path and path not in self.oeb.manifest.hrefs:
+                path = urlnormalize(path)
+            if href and path not in self.oeb.manifest.hrefs:
+                self.logger.warn('TOC reference %r not found' % href)
+                gc = xpath(child, 'ncx:navPoint')
+                if not gc:
+                    # This node is useless
+                    continue
+            id = child.get('id')
+            klass = child.get('class', 'chapter')
+
+            try:
+                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
+            except:
+                po = self.oeb.toc.next_play_order()
+
+            authorElement = xpath(child,
+                    'descendant::calibre:meta[@name = "author"]')
+            if authorElement:
+                author = authorElement[0].text
+            else:
+                author = None
+
+            descriptionElement = xpath(child,
+                    'descendant::calibre:meta[@name = "description"]')
+            if descriptionElement:
+                description = etree.tostring(descriptionElement[0],
+                method='text', encoding='unicode').strip()
+                if not description:
+                    description = None
+            else:
+                description = None
+
+            index_image = xpath(child,
+                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
+            toc_thumbnail = (index_image[0].text if index_image else None)
+            if not toc_thumbnail or not toc_thumbnail.strip():
+                toc_thumbnail = None
+
+            node = toc.add(title, href, id=id, klass=klass,
+                    play_order=po, description=description, author=author,
+                           toc_thumbnail=toc_thumbnail)
+
+            self._toc_from_navpoint(item, node, child)
+
+    def _toc_from_ncx(self, item):
+        if (item is None) or (item.data is None):
+            return False
+        self.log.debug('Reading TOC from NCX...')
+        ncx = item.data
+        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
+        title = COLLAPSE_RE.sub(' ', title.strip())
+        title = title or unicode_type(self.oeb.metadata.title[0])
+        toc = self.oeb.toc
+        toc.title = title
+        navmaps = xpath(ncx, 'ncx:navMap')
+        for navmap in navmaps:
+            self._toc_from_navpoint(item, toc, navmap)
+        return True
+
+    def _toc_from_tour(self, opf):
+        result = xpath(opf, 'o2:tours/o2:tour')
+        if not result:
+            return False
+        self.log.debug('Reading TOC from tour...')
+        tour = result[0]
+        toc = self.oeb.toc
+        toc.title = tour.get('title')
+        sites = xpath(tour, 'o2:site')
+        for site in sites:
+            title = site.get('title')
+            href = site.get('href')
+            if not title or not href:
+                continue
+            path, _ = urldefrag(urlnormalize(href))
+            if path not in self.oeb.manifest.hrefs:
+                self.logger.warn('TOC reference %r not found' % href)
+                continue
+            id = site.get('id')
+            toc.add(title, href, id=id)
+        return True
+
+    def _toc_from_html(self, opf):
+        if 'toc' not in self.oeb.guide:
+            return False
+        self.log.debug('Reading TOC from HTML...')
+        itempath, frag = urldefrag(self.oeb.guide['toc'].href)
+        item = self.oeb.manifest.hrefs[itempath]
+        html = item.data
+        if frag:
+            elems = xpath(html, './/*[@id="%s"]' % frag)
+            if not elems:
+                elems = xpath(html, './/*[@name="%s"]' % frag)
+            elem = elems[0] if elems else html
+            while elem != html and not xpath(elem, './/h:a[@href]'):
+                elem = elem.getparent()
+            html = elem
+        titles = defaultdict(list)
+        order = []
+        for anchor in xpath(html, './/h:a[@href]'):
+            href = anchor.attrib['href']
+            href = item.abshref(urlnormalize(href))
+            path, frag = urldefrag(href)
+            if path not in self.oeb.manifest.hrefs:
+                continue
+            title = xml2text(anchor)
+            title = COLLAPSE_RE.sub(' ', title.strip())
+            if href not in titles:
+                order.append(href)
+            titles[href].append(title)
+        toc = self.oeb.toc
+        for href in order:
+            toc.add(' '.join(titles[href]), href)
+        return True
+
+    def _toc_from_spine(self, opf):
+        self.log.warn('Generating default TOC from spine...')
+        toc = self.oeb.toc
+        titles = []
+        headers = []
+        for item in self.oeb.spine:
+            if not item.linear:
+                continue
+            html = item.data
+            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
+            title = COLLAPSE_RE.sub(' ', title.strip())
+            if title:
+                titles.append(title)
+            headers.append('(unlabled)')
+            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
+                expr = '/h:html/h:body//h:%s[position()=1]/text()'
+                header = ''.join(xpath(html, expr % tag))
+                header = COLLAPSE_RE.sub(' ', header.strip())
+                if header:
+                    headers[-1] = header
+                    break
+        use = titles
+        if len(titles) > len(set(titles)):
+            use = headers
+        for title, item in zip(use, self.oeb.spine):
+            if not item.linear:
+                continue
+            toc.add(title, item.href)
+        return True
+
+    def _toc_from_opf(self, opf, item):
+        self.oeb.auto_generated_toc = False
+        if self._toc_from_ncx(item):
+            return
+        # Prefer HTML to tour based TOC, since several LIT files
+        # have good HTML TOCs but bad tour based TOCs
+        if self._toc_from_html(opf):
+            return
+        if self._toc_from_tour(opf):
+            return
+        self._toc_from_spine(opf)
+        self.oeb.auto_generated_toc = True
+
+    def _pages_from_ncx(self, opf, item):
+        if item is None:
+            return False
+        ncx = item.data
+        if ncx is None:
+            return False
+        ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
+        if not ptargets:
+            return False
+        pages = self.oeb.pages
+        for ptarget in ptargets:
+            name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
+            name = COLLAPSE_RE.sub(' ', name.strip())
+            href = xpath(ptarget, 'ncx:content/@src')
+            if not href:
+                continue
+            href = item.abshref(urlnormalize(href[0]))
+            id = ptarget.get('id')
+            type = ptarget.get('type', 'normal')
+            klass = ptarget.get('class')
+            pages.add(name, href, type=type, id=id, klass=klass)
+        return True
+
+    def _find_page_map(self, opf):
+        result = xpath(opf, '/o2:package/o2:spine/@page-map')
+        if result:
+            id = result[0]
+            if id not in self.oeb.manifest.ids:
+                return None
+            item = self.oeb.manifest.ids[id]
+            self.oeb.manifest.remove(item)
+            return item
+        for item in self.oeb.manifest.values():
+            if item.media_type == PAGE_MAP_MIME:
+                self.oeb.manifest.remove(item)
+                return item
+        return None
+
+    def _pages_from_page_map(self, opf):
+        item = self._find_page_map(opf)
+        if item is None:
+            return False
+        pmap = item.data
+        pages = self.oeb.pages
+        for page in xpath(pmap, 'o2:page'):
+            name = page.get('name', '')
+            href = page.get('href')
+            if not href:
+                continue
+            name = COLLAPSE_RE.sub(' ', name.strip())
+            href = item.abshref(urlnormalize(href))
+            type = 'normal'
+            if not name:
+                type = 'special'
+            elif name.lower().strip('ivxlcdm') == '':
+                type = 'front'
+            pages.add(name, href, type=type)
+        return True
+
+    def _pages_from_opf(self, opf, item):
+        if self._pages_from_ncx(opf, item):
+            return
+        if self._pages_from_page_map(opf):
+            return
+        return
+
+    def _cover_from_html(self, hcover):
+        from calibre.ebooks import render_html_svg_workaround
+        with TemporaryDirectory('_html_cover') as tdir:
+            writer = OEBWriter()
+            writer(self.oeb, tdir)
+            path = os.path.join(tdir, unquote(hcover.href))
+            data = render_html_svg_workaround(path, self.logger)
+            if not data:
+                data = b''
+        id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
+        item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
+        return item
+
+    def _locate_cover_image(self):
+        if self.oeb.metadata.cover:
+            id = unicode_type(self.oeb.metadata.cover[0])
+            item = self.oeb.manifest.ids.get(id, None)
+            if item is not None and item.media_type in OEB_IMAGES:
+                return item
+            else:
+                self.logger.warn('Invalid cover image @id %r' % id)
+        hcover = self.oeb.spine[0]
+        if 'cover' in self.oeb.guide:
+            href = self.oeb.guide['cover'].href
+            item = self.oeb.manifest.hrefs[href]
+            media_type = item.media_type
+            if media_type in OEB_IMAGES:
+                return item
+            elif media_type in OEB_DOCS:
+                hcover = item
+        html = hcover.data
+        if MS_COVER_TYPE in self.oeb.guide:
+            href = self.oeb.guide[MS_COVER_TYPE].href
+            item = self.oeb.manifest.hrefs.get(href, None)
+            if item is not None and item.media_type in OEB_IMAGES:
+                return item
+        if self.COVER_SVG_XP(html):
+            svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
+            href = os.path.splitext(hcover.href)[0] + '.svg'
+            id, href = self.oeb.manifest.generate(hcover.id, href)
+            item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
+            return item
+        if self.COVER_OBJECT_XP(html):
+            object = self.COVER_OBJECT_XP(html)[0]
+            href = hcover.abshref(object.get('data'))
+            item = self.oeb.manifest.hrefs.get(href, None)
+            if item is not None and item.media_type in OEB_IMAGES:
+                return item
+        return self._cover_from_html(hcover)
+
+    def _ensure_cover_image(self):
+        cover = self._locate_cover_image()
+        if self.oeb.metadata.cover:
+            self.oeb.metadata.cover[0].value = cover.id
+            return
+        self.oeb.metadata.add('cover', cover.id)
+
+    def _manifest_remove_duplicates(self):
+        seen = set()
+        dups = set()
+        for item in self.oeb.manifest:
+            if item.href in seen:
+                dups.add(item.href)
+            seen.add(item.href)
+
+        for href in dups:
+            items = [x for x in self.oeb.manifest if x.href == href]
+            for x in items:
+                if x not in self.oeb.spine:
+                    self.oeb.log.warn('Removing duplicate manifest item with id:', x.id)
+                    self.oeb.manifest.remove_duplicate_item(x)
+
+    def _all_from_opf(self, opf):
+        self.oeb.version = opf.get('version', '1.2')
+        self._metadata_from_opf(opf)
+        self._manifest_from_opf(opf)
+        self._spine_from_opf(opf)
+        self._manifest_remove_duplicates()
+        self._guide_from_opf(opf)
+        item = self._find_ncx(opf)
+        self._toc_from_opf(opf, item)
+        self._pages_from_opf(opf, item)
+        # self._ensure_cover_image()
+
+
+def main(argv=sys.argv):
+    reader = OEBReader()
+    for arg in argv[1:]:
+        oeb = reader(OEBBook(), arg)
+        for name, doc in oeb.to_opf1().values():
+            print(etree.tostring(doc, pretty_print=True))
+        for name, doc in oeb.to_opf2(page_map=True).values():
+            print(etree.tostring(doc, pretty_print=True))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())