mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-06 19:44:12 +01:00
Initial import
This commit is contained in:
7
ebook_converter/ebooks/html/__init__.py
Normal file
7
ebook_converter/ebooks/html/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
258
ebook_converter/ebooks/html/input.py
Normal file
258
ebook_converter/ebooks/html/input.py
Normal file
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
'''
|
||||
Input plugin for HTML or OPF ebooks.
|
||||
'''
|
||||
|
||||
import os, re, sys, errno as gerrno
|
||||
|
||||
from calibre.ebooks.oeb.base import urlunquote
|
||||
from calibre.ebooks.chardet import detect_xml_encoding
|
||||
from calibre.constants import iswindows
|
||||
from calibre import unicode_path, as_unicode, replace_entities
|
||||
from polyglot.builtins import is_py3, unicode_type
|
||||
from polyglot.urllib import urlparse, urlunparse
|
||||
|
||||
|
||||
class Link(object):

    '''
    A hyperlink found in a HTML file, together with the local filesystem
    path it resolves to (if it has one).

    Equality is decided by :member:`path` alone, so a link compares equal to
    any object exposing the same ``path`` and to a plain path string.
    '''
    # NOTE(review): links without a local path all compare equal to each other
    # (``None == None``) while their hashes are derived from the URL, so the
    # two disagree for such links. Confirm whether that is intended.

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Turn a parsed URL into a local filesystem path.

        :param url: The parsed URL (an object with ``path``, ``params`` and
                    ``query`` attributes, as produced by ``urlparse``).
        :param base: Directory that relative paths are resolved against.
        :return: The unquoted path itself when it is absolute, otherwise the
                 absolute form of that path joined onto `base`.
        '''
        raw_path = url.path
        # On windows a leading slash is dropped and the remainder is taken
        # to be absolute already.
        slash_stripped = iswindows and raw_path.startswith('/')
        if slash_stripped:
            raw_path = raw_path[1:]
        rebuilt = urlunparse(('', '', raw_path, url.params, url.query, ''))
        local = urlunquote(rebuilt)
        if slash_stripped or os.path.isabs(local):
            return local
        return os.path.abspath(os.path.join(base, local))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode
                    string.
        :param base: The base directory that relative URLs are with respect
                     to. Must be a unicode string.
        '''
        assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
        self.url = url
        parsed = urlparse(self.url)
        self.parsed_url = parsed
        local = parsed.scheme in ('', 'file')
        self.is_local = local
        # A local link with an empty path points inside the referring file
        self.is_internal = local and not bool(parsed.path)
        self.path = None
        self.fragment = urlunquote(parsed.fragment)
        if local and not self.is_internal:
            self.path = self.url_to_local_path(parsed, base)

    def __hash__(self):
        # Fall back to the URL when there is no local path to hash
        return hash(self.url if self.path is None else self.path)

    def __eq__(self, other):
        # Objects without a ``path`` attribute are compared directly
        other_path = getattr(other, 'path', other)
        return self.path == other_path

    def __str__(self):
        return 'Link: %s --> %s' % (self.url, self.path)

    if not is_py3:
        __unicode__ = __str__
|
||||
|
||||
|
||||
class IgnoreFile(Exception):

    '''
    Raised for a file that should be skipped.

    :member:`errno` stores the error number handed to the constructor and
    :member:`doesnt_exist` is True when that number equals ``ENOENT``.
    '''

    def __init__(self, msg, errno):
        '''
        :param msg: Human readable description of the problem.
        :param errno: Error number describing why the file is ignored.
        '''
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (errno == gerrno.ENOENT)
|
||||
|
||||
|
||||
class HTMLFile(object):

    '''
    Contains basic information about an HTML file. This includes a list of
    links to other files as well as the encoding of the file. Also tries to
    detect if the file is not a HTML file, in which case :member:`is_binary`
    is set to True.

    The encoding of the file is available as :member:`encoding`; it is
    ``None`` for files flagged as binary.
    '''

    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    # Bytes twin of HTML_PAT, used when the header could not be decoded
    HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
        re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param path_to_html_file: Path of the file to read.
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML. When empty or `None`
                         the encoding is auto-detected.
        :param verbose: Passed on to the encoding detection.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        :raises IOError: If the root (level 0) file cannot be read.
        :raises IgnoreFile: If a non-root file cannot be read.
        :raises ValueError: If the root file is empty.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []
        # Always defined, so binary files expose ``encoding`` as well
        self.encoding = None

        try:
            with open(self.path, 'rb') as f:
                src = header = f.read(4096)
                # Sniff into a separate name: re-using ``encoding`` here used
                # to throw away the encoding requested by the caller.
                header_encoding = encoding or detect_xml_encoding(src)[1]
                if header_encoding:
                    try:
                        header = header.decode(header_encoding)
                    except (ValueError, LookupError):
                        # Undecodable header or unknown codec: sniff the
                        # raw bytes instead.
                        pass
                # A text pattern cannot be applied to bytes on python 3
                pat = self.HTML_PAT_BIN if isinstance(header, bytes) else self.HTML_PAT
                # The root file is never treated as binary
                self.is_binary = level > 0 and not bool(pat.search(header))
                if not self.is_binary:
                    src += f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty'%self.path)
            self.is_binary = True

        if not self.is_binary:
            if encoding:
                try:
                    text = src.decode(encoding, 'replace')
                except LookupError:
                    # Unknown codec name supplied by the caller: fall back
                    # to auto-detection instead of failing.
                    encoding = None
            if not encoding:
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
                text = src.decode(encoding, 'replace')
            self.encoding = encoding

            match = self.TITLE_PAT.search(text)
            if match is not None:
                self.title = match.group(1)
            self.find_links(text)

    def __eq__(self, other):
        # Compares by path; objects without ``path`` are compared directly
        return self.path == getattr(other, 'path', other)

    def __hash__(self):
        return hash(self.path)

    def __str__(self):
        return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return unicode_type(self)

    def find_links(self, src):
        '''
        Append a :class:`Link` to :member:`links` for every ``<a href>``
        found in `src`, skipping links equal to one already recorded.

        :param src: The decoded text of the file.
        '''
        for match in self.LINK_PAT.finditer(src):
            # Double quoted, single quoted or bare attribute value
            url = match.group('url1') or match.group('url2') or match.group('url3')
            url = replace_entities(url)
            try:
                link = self.resolve(url)
            except ValueError:
                # Unparseable URL, ignore
                continue
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        '''
        Return a :class:`Link` for `url`, relative to this file's directory.
        '''
        return Link(url, self.base)
|
||||
|
||||
|
||||
def depth_first(root, flat, visited=None):
    '''
    Yield `root` followed by every file reachable from it through local
    links, in depth first (pre-order) order. Each file is yielded once.

    The walk is iterative, so its depth is not bounded by the interpreter
    recursion limit.

    :param root: The file to start from. It is always yielded first.
    :param flat: List of known files. Links whose path matches no entry in
                 `flat` are skipped.
    :param visited: Optional set of files to treat as already seen. It is
                    updated in place with every file that is yielded.
    '''
    if visited is None:
        visited = set()
    # path -> first entry of flat with that path; avoids a linear
    # flat.index() search for every link.
    by_path = {}
    for known in flat:
        by_path.setdefault(known.path, known)

    yield root
    visited.add(root)
    # Stack of link iterators, one per file on the current branch
    pending = [iter(root.links)]
    while pending:
        for link in pending[-1]:
            if link.path is None:
                continue
            hf = by_path.get(link.path)
            # Missing from flat can happen if max_levels is used
            if hf is None or hf in visited:
                continue
            yield hf
            visited.add(hf)
            # Descend into hf before looking at its siblings
            pending.append(iter(hf.links))
            break
        else:
            # All links of the file on top of the stack are done
            pending.pop()
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
    '''
    Follow the links of a HTML file level by level and collect every HTML
    file that is reached.

    Links that cannot be read, or that lead to a binary file, are removed
    from the ``links`` of the file that contains them.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are
                       followed.
    :param verbose: Verbosity. Skipped files that do not exist are only
                    reported when this is greater than 1.
    :param encoding: Specify character encoding of HTML files. If `None` it
                     is auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    depth = 0
    flat = [HTMLFile(path_to_html_file, depth, encoding, verbose)]
    frontier = list(flat)
    while depth < max_levels and frontier:
        depth += 1
        discovered = []
        for parent in frontier:
            dead_links = []
            for link in parent.links:
                target = link.path
                if target is None or target in flat:
                    continue
                try:
                    child = HTMLFile(target, depth, encoding, verbose, referrer=parent)
                    if child.is_binary:
                        raise IgnoreFile('%s is a binary file'%child.path, -1)
                except IgnoreFile as err:
                    dead_links.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print(repr(err))
                else:
                    discovered.append(child)
                    flat.append(child)
            # Removal is deferred so the list is not changed while iterating
            for link in dead_links:
                parent.links.remove(link)
        frontier = discovered

    # The recursion limit is raised only while the depth first order is built
    saved_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(saved_limit)
|
||||
|
||||
|
||||
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build the list of files referenced by a html file.

    :param htmlfile: Path to the root HTML file.
    :param dir: Not used by this function.
    :param opts: Options object; ``max_levels``, ``verbose``,
                 ``input_encoding`` and ``breadth_first`` are read from it.
    :param log: Logger used for progress and debug output.
    :return: The files found by :func:`traverse`, in breadth first order if
             ``opts.breadth_first`` is set, otherwise in depth first order.
    '''
    log.info('Building file list...')
    orderings = traverse(
        htmlfile,
        max_levels=int(opts.max_levels),
        verbose=opts.verbose,
        encoding=opts.input_encoding,
    )
    filelist = orderings[0] if opts.breadth_first else orderings[1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)
    return filelist
|
||||
122
ebook_converter/ebooks/html/to_zip.py
Normal file
122
ebook_converter/ebooks/html/to_zip.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import textwrap, os, glob
|
||||
|
||||
from calibre.customize import FileTypePlugin
|
||||
from calibre.constants import numeric_version
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HTML2ZIP(FileTypePlugin):

    '''
    File type plugin that packs a HTML file and the files it links to into
    a single ZIP file.

    The customization string has the form ``encoding`` or ``encoding|bf``,
    where ``bf`` asks for linked files to be added in breadth first order.
    '''

    name = 'HTML to ZIP'
    author = 'Kovid Goyal'
    description = textwrap.dedent(_('''\
    Follow all local links in an HTML file and create a ZIP \
    file containing all linked files. This plugin is run \
    every time you add an HTML file to the library.\
    '''))
    version = numeric_version
    file_types = {'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, htmlfile):
        '''
        Run the input stage of a conversion on `htmlfile` and pack the
        resulting ``input`` directory into a ZIP container.

        :param htmlfile: Path to the HTML file.
        :return: Path of the ZIP file that was created.
        '''
        import codecs
        from calibre import prints
        from calibre.ptempfile import TemporaryDirectory
        from calibre.gui2.convert.gui_conversion import gui_convert
        from calibre.customize.conversion import OptionRecommendation
        from calibre.ebooks.epub import initialize_container

        with TemporaryDirectory('_plugin_html2zip') as tdir:
            high = OptionRecommendation.HIGH
            recs = [
                ('debug_pipeline', tdir, high),
                ['keep_ligatures', True, high],
            ]
            raw = self.site_customization
            customization = raw.strip() if raw else ''
            if customization:
                enc, _sep, order = customization.partition('|')
                if enc:
                    try:
                        codecs.lookup(enc)
                    except Exception:
                        prints('Ignoring invalid input encoding for HTML:', enc)
                    else:
                        recs.append(['input_encoding', enc, high])
                if order == 'bf':
                    recs.append(['breadth_first', True, high])
            gui_convert(htmlfile, tdir, recs, abort_after_input_dump=True)
            of = self.temporary_file('_plugin_html2zip.zip')
            input_dir = os.path.join(tdir, 'input')
            opf = glob.glob(os.path.join(input_dir, '*.opf'))[0]
            # Only the first .ncx file found (if any) is deleted
            for ncx in glob.glob(os.path.join(input_dir, '*.ncx'))[:1]:
                os.remove(ncx)
            epub = initialize_container(of.name, os.path.basename(opf))
            epub.add_dir(input_dir)
            epub.close()

        return of.name

    def customization_help(self, gui=False):
        '''
        Return the help text shown for the customization string.
        '''
        help_text = _('Character encoding for the input HTML files. Common choices '
                      'include: cp1252, cp1251, latin1 and utf-8.')
        return help_text

    def do_user_config(self, parent=None):
        '''
        This method shows a configuration dialog for this plugin. It returns
        the result code of the dialog. If the user accepts the dialog the
        new customization is stored.
        '''
        from PyQt5.Qt import (QDialog, QDialogButtonBox, QVBoxLayout,
                              QLabel, Qt, QLineEdit, QCheckBox)
        from calibre.customize.ui import (plugin_customization,
                                          customize_plugin)

        dialog = QDialog(parent)
        buttons = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
        layout = QVBoxLayout(dialog)

        buttons.accepted.connect(dialog.accept)
        buttons.rejected.connect(dialog.reject)
        dialog.setWindowTitle(_('Customize') + ' ' + self.name)

        help_label = QLabel(self.customization_help(gui=True), dialog)
        help_label.setWordWrap(True)
        help_label.setTextInteractionFlags(Qt.LinksAccessibleByMouse | Qt.LinksAccessibleByKeyboard)
        help_label.setOpenExternalLinks(True)
        layout.addWidget(help_label)

        bf_box = QCheckBox(_('Add linked files in breadth first order'))
        bf_box.setToolTip(_('Normally, when following links in HTML files'
                ' calibre does it depth first, i.e. if file A links to B and '
                ' C, but B links to D, the files are added in the order A, B, D, C. '
                ' With this option, they will instead be added as A, B, C, D'))

        # Stored customization has the form "encoding" or "encoding|bf"
        stored = plugin_customization(self)
        if not stored:
            stored = ''
        enc, _sep, order = stored.strip().partition('|')
        bf_box.setChecked(order == 'bf')
        enc_edit = QLineEdit(enc, dialog)

        for widget in (enc_edit, bf_box, buttons):
            layout.addWidget(widget)
        dialog.resize(dialog.sizeHint())
        dialog.exec_()

        if dialog.result() == QDialog.Accepted:
            new_value = unicode_type(enc_edit.text()).strip()
            if bf_box.isChecked():
                new_value += '|bf'
            customize_plugin(self, new_value)

        return dialog.result()
|
||||
Reference in New Issue
Block a user