Initial import

2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,202 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+''' CHM File decoding support '''
+__license__ = 'GPL v3'
+__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
+                 ' and Alex Bramley <a.bramley at gmail.com>.'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from calibre.constants import filesystem_encoding
+from polyglot.builtins import unicode_type, as_bytes
+
+
+class CHMInput(InputFormatPlugin):
+    '''
+    Input format plugin for CHM files.
+
+    convert() is the entry point: it extracts the CHM, generates an HTML
+    file from the extracted .hhc data and passes that file on to the HTML
+    input plugin.
+    '''
+
+    # Descriptive attributes of the plugin.
+    name        = 'CHM Input'
+    author      = 'Kovid Goyal and Alex Bramley'
+    description = 'Convert CHM files to OEB'
+    # File extensions this plugin is declared for.
+    file_types  = {'chm'}
+    # NOTE(review): not used in this file; presumably read by the plugin
+    # framework -- confirm against InputFormatPlugin.
+    commit_name = 'chm_input'
+
+    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
+        from calibre.ebooks.chm.reader import CHMReader
+        log.debug('Opening CHM file')
+        rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
+        log.debug('Extracting CHM to %s' % output_dir)
+        rdr.extract_content(output_dir, debug_dump=debug_dump)
+        self._chm_reader = rdr
+        return rdr.hhc_path
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''
+        Convert the CHM file behind stream and return the resulting book.
+
+        The CHM is extracted into a temporary directory, an HTML file is
+        generated from the file at the reader's hhc_path, and that HTML file
+        is run through the HTML input plugin. stream is closed by this
+        method; only its name is used. file_ext and accelerators are not
+        used here.
+
+        Side effects on options: every option declared by the HTML input
+        plugin is set to its recommended value and input_encoding is set to
+        'utf-8'.
+        '''
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
+        from calibre.customize.ui import plugin_for_input_format
+        # _chmtohtml() reads the input encoding from self.opts
+        self.opts = options
+
+        log.debug('Processing CHM...')
+        with TemporaryDirectory('_chm2oeb') as tdir:
+            if not isinstance(tdir, unicode_type):
+                tdir = tdir.decode(filesystem_encoding)
+            # Apply the recommended value of every option the HTML input
+            # plugin declares, since that plugin does the actual conversion
+            html_input = plugin_for_input_format('html')
+            for opt in html_input.options:
+                setattr(options, opt.option.name, opt.recommended_value)
+            no_images = False  # options.no_images
+            chm_name = stream.name
+            # chm_data = stream.read()
+
+            # closing stream so CHM can be opened by external library
+            stream.close()
+            log.debug('tdir=%s' % tdir)
+            log.debug('stream.name=%s' % stream.name)
+            # Remember the debug_pipeline setting: it is cleared for the HTML
+            # conversion below and put back afterwards
+            debug_dump = False
+            odi = options.debug_pipeline
+            if odi:
+                debug_dump = os.path.join(odi, 'input')
+            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
+                    debug_dump=debug_dump)
+            mainpath = os.path.join(tdir, mainname)
+
+            try:
+                metadata = get_metadata_from_reader(self._chm_reader)
+            except Exception:
+                # Best effort: fall back to metadata built from the file name
+                log.exception('Failed to read metadata, using filename')
+                from calibre.ebooks.metadata.book.base import Metadata
+                metadata = Metadata(os.path.basename(chm_name))
+            # Encoding reported by the reader first, then the one from the
+            # options, then cp1252 as the last resort
+            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
+            self._chm_reader.CloseCHM()
+            # print((tdir, mainpath))
+            # from calibre import ipython
+            # ipython()
+
+            options.debug_pipeline = None
+            options.input_encoding = 'utf-8'
+            # Files listed in re_encoded_files are read as UTF-8 instead of
+            # the detected encoding (presumably the reader re-encoded them)
+            uenc = encoding
+            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
+                uenc = 'utf-8'
+            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
+            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
+            options.debug_pipeline = odi
+            # With more than one TOC entry _create_html_root() wrote a
+            # generated tree of links; the first spine item is presumably that
+            # page, so build the TOC from it and drop it from the book
+            if toc.count() > 1:
+                oeb.toc = self.parse_html_toc(oeb.spine[0])
+                oeb.manifest.remove(oeb.spine[0])
+                oeb.auto_generated_toc = False
+        return oeb
+
+    def parse_html_toc(self, item):
+        from calibre.ebooks.oeb.base import TOC, XPath
+        dx = XPath('./h:div')
+        ax = XPath('./h:a[1]')
+
+        def do_node(parent, div):
+            for child in dx(div):
+                a = ax(child)[0]
+                c = parent.add(a.text, a.attrib['href'])
+                do_node(c, child)
+
+        toc = TOC()
+        root = XPath('//h:div[1]')(item.data)[0]
+        do_node(toc, root)
+        return toc
+
+    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
+        # use HTMLInput plugin to generate book
+        from calibre.customize.builtins import HTMLInput
+        opts.breadth_first = True
+        htmlinput = HTMLInput(None)
+        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
+        return oeb
+
+    def _create_html_root(self, hhcpath, log, encoding):
+        '''
+        Generate the HTML entry file from the .hhc file at hhcpath.
+
+        The file is decoded with encoding, parsed, and turned into a TOC by
+        _process_nodes(). An HTML file with the same name as hhcpath but a
+        .html extension is then written: if the TOC has more than one entry
+        it contains a tree of nested <div><a> elements mirroring the TOC,
+        otherwise the decoded .hhc data is written as is.
+
+        Returns the tuple (htmlpath, toc).
+        '''
+        from lxml import html
+        from polyglot.urllib import unquote as _unquote
+        from calibre.ebooks.oeb.base import urlquote
+        from calibre.ebooks.chardet import xml_to_unicode
+        hhcdata = self._read_file(hhcpath)
+        hhcdata = hhcdata.decode(encoding)
+        hhcdata = xml_to_unicode(hhcdata, verbose=True,
+                            strip_encoding_pats=True, resolve_entities=True)[0]
+        hhcroot = html.fromstring(hhcdata)
+        toc = self._process_nodes(hhcroot)
+        # print("=============================")
+        # print("Printing hhcroot")
+        # print(etree.tostring(hhcroot, pretty_print=True))
+        # print("=============================")
+        log.debug('Found %d section nodes' % toc.count())
+        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
+        base = os.path.dirname(os.path.abspath(htmlpath))
+
+        # Percent-decode x, going through UTF-8 bytes, and return text
+        def unquote(x):
+            if isinstance(x, unicode_type):
+                x = x.encode('utf-8')
+            return _unquote(x).decode('utf-8')
+
+        # Return the unquoted form of x only when the name as given does not
+        # exist under base but the unquoted one does; otherwise return x
+        def unquote_path(x):
+            y = unquote(x)
+            if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
+                x = y
+            return x
+
+        # Append one <div><a> per titled child of item to parent, recursing
+        # into the children. DIV and A are only imported further down, before
+        # the first call to donode()
+        def donode(item, parent, base, subpath):
+            for child in item:
+                title = child.title
+                if not title:
+                    continue
+                raw = unquote_path(child.href or '')
+                # Look for the file inside subpath first and fall back to the
+                # href as given if only that one exists
+                rsrcname = os.path.basename(raw)
+                rsrcpath = os.path.join(subpath, rsrcname)
+                if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
+                    rsrcpath = raw
+
+                # A path containing '%' is left alone (presumably it is
+                # already quoted)
+                if '%' not in rsrcpath:
+                    rsrcpath = urlquote(rsrcpath)
+                # Entries without an href get an empty link
+                if not raw:
+                    rsrcpath = ''
+                c = DIV(A(title, href=rsrcpath))
+                donode(child, c, base, subpath)
+                parent.append(c)
+
+        with open(htmlpath, 'wb') as f:
+            if toc.count() > 1:
+                from lxml.html.builder import HTML, BODY, DIV, A
+                # The directory of the first TOC entry is used as the
+                # directory for all entries
+                path0 = toc[0].href
+                path0 = unquote_path(path0)
+                subpath = os.path.dirname(path0)
+                # NOTE: this rebinds base, which unquote_path() also reads
+                # through its closure
+                base = os.path.dirname(f.name)
+                root = DIV()
+                donode(toc, root, base, subpath)
+                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
+                                   pretty_print=True)
+                f.write(raw)
+            else:
+                f.write(as_bytes(hhcdata))
+        return htmlpath, toc
+
+    def _read_file(self, name):
+        with lopen(name, 'rb') as f:
+            data = f.read()
+        return data
+
+    def add_node(self, node, toc, ancestor_map):
+        from calibre.ebooks.chm.reader import match_string
+        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
+            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
+            parent = p[0] if p else None
+            toc = ancestor_map.get(parent, toc)
+            title = href = ''
+            for param in node.xpath('./param'):
+                if match_string(param.attrib['name'], 'name'):
+                    title = param.attrib['value']
+                elif match_string(param.attrib['name'], 'local'):
+                    href = param.attrib['value']
+            child = toc.add(title or _('Unknown'), href)
+            ancestor_map[node] = child
+
+    def _process_nodes(self, root):
+        from calibre.ebooks.oeb.base import TOC
+        toc = TOC()
+        ancestor_map = {}
+        for node in root.xpath('//object'):
+            self.add_node(node, toc, ancestor_map)
+        return toc