Initial import

2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,7 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
@@ -0,0 +1,258 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+'''
+Input plugin for HTML or OPF ebooks.
+'''
+
+import os, re, sys,  errno as gerrno
+
+from calibre.ebooks.oeb.base import urlunquote
+from calibre.ebooks.chardet import detect_xml_encoding
+from calibre.constants import iswindows
+from calibre import unicode_path, as_unicode, replace_entities
+from polyglot.builtins import is_py3, unicode_type
+from polyglot.urllib import urlparse, urlunparse
+
+
+class Link(object):
+
+    '''
+    Represents a link in a HTML file.
+    '''
+
+    @classmethod
+    def url_to_local_path(cls, url, base):
+        path = url.path
+        isabs = False
+        if iswindows and path.startswith('/'):
+            path = path[1:]
+            isabs = True
+        path = urlunparse(('', '', path, url.params, url.query, ''))
+        path = urlunquote(path)
+        if isabs or os.path.isabs(path):
+            return path
+        return os.path.abspath(os.path.join(base, path))
+
+    def __init__(self, url, base):
+        '''
+        :param url:  The url this link points to. Must be an unquoted unicode string.
+        :param base: The base directory that relative URLs are with respect to.
+                     Must be a unicode string.
+        '''
+        assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
+        self.url         = url
+        self.parsed_url  = urlparse(self.url)
+        self.is_local    = self.parsed_url.scheme in ('', 'file')
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
+        self.path        = None
+        self.fragment    = urlunquote(self.parsed_url.fragment)
+        if self.is_local and not self.is_internal:
+            self.path = self.url_to_local_path(self.parsed_url, base)
+
+    def __hash__(self):
+        if self.path is None:
+            return hash(self.url)
+        return hash(self.path)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __str__(self):
+        return 'Link: %s --> %s'%(self.url, self.path)
+
+    if not is_py3:
+        __unicode__ = __str__
+
+
+class IgnoreFile(Exception):
+
+    def __init__(self, msg, errno):
+        Exception.__init__(self, msg)
+        self.doesnt_exist = errno == gerrno.ENOENT
+        self.errno = errno
+
+
+class HTMLFile(object):
+
+    '''
+    Contains basic information about an HTML file. This
+    includes a list of links to other files as well as
+    the encoding of each file. Also tries to detect if the file is not a HTML
+    file in which case :member:`is_binary` is set to True.
+
+    The encoding of the file is available as :member:`encoding`.
+    '''
+
+    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
+    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
+    LINK_PAT  = re.compile(
+    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
+    re.DOTALL|re.IGNORECASE)
+
+    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
+        '''
+        :param level: The level of this file. Should be 0 for the root file.
+        :param encoding: Use `encoding` to decode HTML.
+        :param referrer: The :class:`HTMLFile` that first refers to this file.
+        '''
+        self.path     = unicode_path(path_to_html_file, abs=True)
+        self.title    = os.path.splitext(os.path.basename(self.path))[0]
+        self.base     = os.path.dirname(self.path)
+        self.level    = level
+        self.referrer = referrer
+        self.links    = []
+
+        try:
+            with open(self.path, 'rb') as f:
+                src = header = f.read(4096)
+                encoding = detect_xml_encoding(src)[1]
+                if encoding:
+                    try:
+                        header = header.decode(encoding)
+                    except ValueError:
+                        pass
+                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
+                if not self.is_binary:
+                    src += f.read()
+        except IOError as err:
+            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
+            if level == 0:
+                raise IOError(msg)
+            raise IgnoreFile(msg, err.errno)
+
+        if not src:
+            if level == 0:
+                raise ValueError('The file %s is empty'%self.path)
+            self.is_binary = True
+
+        if not self.is_binary:
+            if not encoding:
+                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
+                self.encoding = encoding
+            else:
+                self.encoding = encoding
+
+            src = src.decode(encoding, 'replace')
+            match = self.TITLE_PAT.search(src)
+            self.title = match.group(1) if match is not None else self.title
+            self.find_links(src)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __hash__(self):
+        return hash(self.path)
+
+    def __str__(self):
+        return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
+
+    def __repr__(self):
+        return unicode_type(self)
+
+    def find_links(self, src):
+        for match in self.LINK_PAT.finditer(src):
+            url = None
+            for i in ('url1', 'url2', 'url3'):
+                url = match.group(i)
+                if url:
+                    break
+            url = replace_entities(url)
+            try:
+                link = self.resolve(url)
+            except ValueError:
+                # Unparseable URL, ignore
+                continue
+            if link not in self.links:
+                self.links.append(link)
+
+    def resolve(self, url):
+        return Link(url, self.base)
+
+
+def depth_first(root, flat, visited=None):
+    yield root
+    if visited is None:
+        visited = set()
+    visited.add(root)
+    for link in root.links:
+        if link.path is not None and link not in visited:
+            try:
+                index = flat.index(link)
+            except ValueError:  # Can happen if max_levels is used
+                continue
+            hf = flat[index]
+            if hf not in visited:
+                yield hf
+                visited.add(hf)
+                for hf in depth_first(hf, flat, visited):
+                    if hf not in visited:
+                        yield hf
+                        visited.add(hf)
+
+
+def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
+    '''
+    Recursively traverse all links in the HTML file.
+
+    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
+                       implies that no links in the root HTML file are followed.
+    :param encoding:   Specify character encoding of HTML files. If `None` it is
+                       auto-detected.
+    :return:           A pair of lists (breadth_first, depth_first). Each list contains
+                       :class:`HTMLFile` objects.
+    '''
+    assert max_levels >= 0
+    level = 0
+    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    next_level = list(flat)
+    while level < max_levels and len(next_level) > 0:
+        level += 1
+        nl = []
+        for hf in next_level:
+            rejects = []
+            for link in hf.links:
+                if link.path is None or link.path in flat:
+                    continue
+                try:
+                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
+                    if nf.is_binary:
+                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
+                    nl.append(nf)
+                    flat.append(nf)
+                except IgnoreFile as err:
+                    rejects.append(link)
+                    if not err.doesnt_exist or verbose > 1:
+                        print(repr(err))
+            for link in rejects:
+                hf.links.remove(link)
+
+        next_level = list(nl)
+    orec = sys.getrecursionlimit()
+    sys.setrecursionlimit(500000)
+    try:
+        return flat, list(depth_first(flat[0], flat))
+    finally:
+        sys.setrecursionlimit(orec)
+
+
+def get_filelist(htmlfile, dir, opts, log):
+    '''
+    Build list of files referenced by html file or try to detect and use an
+    OPF file instead.
+    '''
+    log.info('Building file list...')
+    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                        verbose=opts.verbose,
+                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
+    if opts.verbose:
+        log.debug('\tFound files...')
+        for f in filelist:
+            log.debug('\t\t', f)
+    return filelist
@@ -0,0 +1,122 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import textwrap, os, glob
+
+from calibre.customize import FileTypePlugin
+from calibre.constants import numeric_version
+from polyglot.builtins import unicode_type
+
+
+class HTML2ZIP(FileTypePlugin):
+    name = 'HTML to ZIP'
+    author = 'Kovid Goyal'
+    description = textwrap.dedent(_('''\
+Follow all local links in an HTML file and create a ZIP \
+file containing all linked files. This plugin is run \
+every time you add an HTML file to the library.\
+'''))
+    version = numeric_version
+    file_types = {'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
+    supported_platforms = ['windows', 'osx', 'linux']
+    on_import = True
+
+    def run(self, htmlfile):
+        import codecs
+        from calibre import prints
+        from calibre.ptempfile import TemporaryDirectory
+        from calibre.gui2.convert.gui_conversion import gui_convert
+        from calibre.customize.conversion import OptionRecommendation
+        from calibre.ebooks.epub import initialize_container
+
+        with TemporaryDirectory('_plugin_html2zip') as tdir:
+            recs =[('debug_pipeline', tdir, OptionRecommendation.HIGH)]
+            recs.append(['keep_ligatures', True, OptionRecommendation.HIGH])
+            if self.site_customization and self.site_customization.strip():
+                sc = self.site_customization.strip()
+                enc, _, bf = sc.partition('|')
+                if enc:
+                    try:
+                        codecs.lookup(enc)
+                    except Exception:
+                        prints('Ignoring invalid input encoding for HTML:', enc)
+                    else:
+                        recs.append(['input_encoding', enc, OptionRecommendation.HIGH])
+                if bf == 'bf':
+                    recs.append(['breadth_first', True,
+                        OptionRecommendation.HIGH])
+            gui_convert(htmlfile, tdir, recs, abort_after_input_dump=True)
+            of = self.temporary_file('_plugin_html2zip.zip')
+            tdir = os.path.join(tdir, 'input')
+            opf = glob.glob(os.path.join(tdir, '*.opf'))[0]
+            ncx = glob.glob(os.path.join(tdir, '*.ncx'))
+            if ncx:
+                os.remove(ncx[0])
+            epub = initialize_container(of.name, os.path.basename(opf))
+            epub.add_dir(tdir)
+            epub.close()
+
+        return of.name
+
+    def customization_help(self, gui=False):
+        return _('Character encoding for the input HTML files. Common choices '
+        'include: cp1252, cp1251, latin1 and utf-8.')
+
+    def do_user_config(self, parent=None):
+        '''
+        This method shows a configuration dialog for this plugin. It returns
+        True if the user clicks OK, False otherwise. The changes are
+        automatically applied.
+        '''
+        from PyQt5.Qt import (QDialog, QDialogButtonBox, QVBoxLayout,
+                QLabel, Qt, QLineEdit, QCheckBox)
+
+        config_dialog = QDialog(parent)
+        button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
+        v = QVBoxLayout(config_dialog)
+
+        def size_dialog():
+            config_dialog.resize(config_dialog.sizeHint())
+
+        button_box.accepted.connect(config_dialog.accept)
+        button_box.rejected.connect(config_dialog.reject)
+        config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
+        from calibre.customize.ui import (plugin_customization,
+                customize_plugin)
+        help_text = self.customization_help(gui=True)
+        help_text = QLabel(help_text, config_dialog)
+        help_text.setWordWrap(True)
+        help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse | Qt.LinksAccessibleByKeyboard)
+        help_text.setOpenExternalLinks(True)
+        v.addWidget(help_text)
+        bf = QCheckBox(_('Add linked files in breadth first order'))
+        bf.setToolTip(_('Normally, when following links in HTML files'
+            ' calibre does it depth first, i.e. if file A links to B and '
+            ' C, but B links to D, the files are added in the order A, B, D, C. '
+            ' With this option, they will instead be added as A, B, C, D'))
+        sc = plugin_customization(self)
+        if not sc:
+            sc = ''
+        sc = sc.strip()
+        enc = sc.partition('|')[0]
+        bfs = sc.partition('|')[-1]
+        bf.setChecked(bfs == 'bf')
+        sc = QLineEdit(enc, config_dialog)
+        v.addWidget(sc)
+        v.addWidget(bf)
+        v.addWidget(button_box)
+        size_dialog()
+        config_dialog.exec_()
+
+        if config_dialog.result() == QDialog.Accepted:
+            sc = unicode_type(sc.text()).strip()
+            if bf.isChecked():
+                sc += '|bf'
+            customize_plugin(self, sc)
+
+        return config_dialog.result()