mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-26 07:05:45 +01:00
Every mime related function in main __init__.py has a flag check for the check if initialization has already done. This is nonsense, since it should be done implicitly early on the converter is starting. This commit straight the things out, and initialization is done in cli module. Also, function guess_type was removed, since it's just a proxy for mimetypes.guess_type function.
309 lines
12 KiB
Python
309 lines
12 KiB
Python
import functools
|
|
import mimetypes
|
|
import os
|
|
import re
|
|
import tempfile
|
|
import urllib.parse
|
|
|
|
from ebook_converter.constants_old import islinux, isbsd
|
|
from ebook_converter.customize.conversion import InputFormatPlugin
|
|
from ebook_converter.customize.conversion import OptionRecommendation
|
|
from ebook_converter.utils.localization import get_lang
|
|
from ebook_converter.utils.filenames import ascii_filename
|
|
from ebook_converter.utils.imghdr import what
|
|
from ebook_converter.polyglot.builtins import as_unicode
|
|
|
|
|
|
def sanitize_file_name(x):
|
|
ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
|
|
ans, ext = ans.rpartition('.')[::2]
|
|
return (ans.strip() + '.' + ext.strip()).rstrip('.')
|
|
|
|
|
|
class HTMLInput(InputFormatPlugin):
|
|
|
|
name = 'HTML Input'
|
|
author = 'Kovid Goyal'
|
|
description = 'Convert HTML and OPF files to an OEB'
|
|
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
|
|
commit_name = 'html_input'
|
|
|
|
options = {
|
|
OptionRecommendation(name='breadth_first',
|
|
recommended_value=False, level=OptionRecommendation.LOW,
|
|
help='Traverse links in HTML files breadth first. Normally, '
|
|
'they are traversed depth first.'
|
|
),
|
|
|
|
OptionRecommendation(name='max_levels',
|
|
recommended_value=5, level=OptionRecommendation.LOW,
|
|
help='Maximum levels of recursion when following links in '
|
|
'HTML files. Must be non-negative. 0 implies that no '
|
|
'links in the root HTML file are followed. Default is '
|
|
'%default.'
|
|
),
|
|
|
|
OptionRecommendation(name='dont_package',
|
|
recommended_value=False, level=OptionRecommendation.LOW,
|
|
help='Normally this input plugin re-arranges all the input '
|
|
'files into a standard folder hierarchy. Only use this option '
|
|
'if you know what you are doing as it can result in various '
|
|
'nasty side effects in the rest of the conversion pipeline.'
|
|
),
|
|
|
|
}
|
|
|
|
def convert(self, stream, opts, file_ext, log,
|
|
accelerators):
|
|
self._is_case_sensitive = None
|
|
basedir = os.getcwd()
|
|
self.opts = opts
|
|
|
|
fname = None
|
|
if hasattr(stream, 'name'):
|
|
basedir = os.path.dirname(stream.name)
|
|
fname = os.path.basename(stream.name)
|
|
|
|
if file_ext != 'opf':
|
|
if opts.dont_package:
|
|
raise ValueError('The --dont-package option is not supported for an HTML input file')
|
|
from ebook_converter.ebooks.metadata.html import get_metadata
|
|
mi = get_metadata(stream)
|
|
if fname:
|
|
from ebook_converter.ebooks.metadata.meta import metadata_from_filename
|
|
fmi = metadata_from_filename(fname)
|
|
fmi.smart_update(mi)
|
|
mi = fmi
|
|
oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
|
|
return oeb
|
|
|
|
from ebook_converter.ebooks.conversion.plumber import create_oebbook
|
|
return create_oebbook(log, stream.name, opts,
|
|
encoding=opts.input_encoding)
|
|
|
|
def is_case_sensitive(self, path):
|
|
if getattr(self, '_is_case_sensitive', None) is not None:
|
|
return self._is_case_sensitive
|
|
if not path or not os.path.exists(path):
|
|
return islinux or isbsd
|
|
self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
|
|
return self._is_case_sensitive
|
|
|
|
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
|
|
import uuid
|
|
from ebook_converter.ebooks.conversion.plumber import create_oebbook
|
|
from ebook_converter.ebooks.oeb.base import (DirContainer,
|
|
rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES,
|
|
xpath, urlquote)
|
|
from ebook_converter.ebooks.oeb.transforms.metadata import \
|
|
meta_info_to_oeb_metadata
|
|
from ebook_converter.ebooks.html.input import get_filelist
|
|
from ebook_converter.ebooks.metadata import string_to_authors
|
|
from ebook_converter.utils.localization import canonicalize_lang
|
|
import css_parser, logging
|
|
css_parser.log.setLevel(logging.WARN)
|
|
self.OEB_STYLES = OEB_STYLES
|
|
oeb = create_oebbook(log, None, opts, self,
|
|
encoding=opts.input_encoding, populate=False)
|
|
self.oeb = oeb
|
|
|
|
metadata = oeb.metadata
|
|
meta_info_to_oeb_metadata(mi, metadata, log)
|
|
if not metadata.language:
|
|
l = canonicalize_lang(getattr(opts, 'language', None))
|
|
if not l:
|
|
oeb.logger.warn('Language not specified')
|
|
l = get_lang().replace('_', '-')
|
|
metadata.add('language', l)
|
|
if not metadata.creator:
|
|
a = getattr(opts, 'authors', None)
|
|
if a:
|
|
a = string_to_authors(a)
|
|
if not a:
|
|
oeb.logger.warn('Creator not specified')
|
|
a = [self.oeb.translate('Unknown')]
|
|
for aut in a:
|
|
metadata.add('creator', aut)
|
|
if not metadata.title:
|
|
oeb.logger.warn('Title not specified')
|
|
metadata.add('title', self.oeb.translate('Unknown'))
|
|
bookid = str(uuid.uuid4())
|
|
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
|
for ident in metadata.identifier:
|
|
if 'id' in ident.attrib:
|
|
self.oeb.uid = metadata.identifier[0]
|
|
break
|
|
|
|
filelist = get_filelist(htmlpath, basedir, opts, log)
|
|
filelist = [f for f in filelist if not f.is_binary]
|
|
htmlfile_map = {}
|
|
for f in filelist:
|
|
path = f.path
|
|
oeb.container = DirContainer(os.path.dirname(path), log,
|
|
ignore_opf=True)
|
|
bname = os.path.basename(path)
|
|
id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
|
|
htmlfile_map[path] = href
|
|
item = oeb.manifest.add(id, href, 'text/html')
|
|
if path == htmlpath and '%' in path:
|
|
bname = urlquote(bname)
|
|
item.html_input_href = bname
|
|
oeb.spine.add(item, True)
|
|
|
|
self.added_resources = {}
|
|
self.log = log
|
|
self.log('Normalizing filename cases')
|
|
for path, href in htmlfile_map.items():
|
|
if not self.is_case_sensitive(path):
|
|
path = path.lower()
|
|
self.added_resources[path] = href
|
|
self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
|
|
self.urldefrag = urllib.parse.urldefrag
|
|
self.BINARY_MIME = BINARY_MIME
|
|
|
|
self.log('Rewriting HTML links')
|
|
for f in filelist:
|
|
path = f.path
|
|
dpath = os.path.dirname(path)
|
|
oeb.container = DirContainer(dpath, log, ignore_opf=True)
|
|
href = htmlfile_map[path]
|
|
try:
|
|
item = oeb.manifest.hrefs[href]
|
|
except KeyError:
|
|
item = oeb.manifest.hrefs[urlnormalize(href)]
|
|
rewrite_links(item.data,
|
|
functools.partial(self.resource_adder, base=dpath))
|
|
|
|
for item in oeb.manifest.values():
|
|
if item.media_type in self.OEB_STYLES:
|
|
dpath = None
|
|
for path, href in self.added_resources.items():
|
|
if href == item.href:
|
|
dpath = os.path.dirname(path)
|
|
break
|
|
css_parser.replaceUrls(item.data,
|
|
functools.partial(self.resource_adder, base=dpath))
|
|
|
|
toc = self.oeb.toc
|
|
self.oeb.auto_generated_toc = True
|
|
titles = []
|
|
headers = []
|
|
for item in self.oeb.spine:
|
|
if not item.linear:
|
|
continue
|
|
html = item.data
|
|
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
|
|
title = re.sub(r'\s+', ' ', title.strip())
|
|
if title:
|
|
titles.append(title)
|
|
headers.append('(unlabled)')
|
|
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
|
|
expr = '/h:html/h:body//h:%s[position()=1]/text()'
|
|
header = ''.join(xpath(html, expr % tag))
|
|
header = re.sub(r'\s+', ' ', header.strip())
|
|
if header:
|
|
headers[-1] = header
|
|
break
|
|
use = titles
|
|
if len(titles) > len(set(titles)):
|
|
use = headers
|
|
for title, item in zip(use, self.oeb.spine):
|
|
if not item.linear:
|
|
continue
|
|
toc.add(title, item.href)
|
|
|
|
oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
|
|
return oeb
|
|
|
|
def link_to_local_path(self, link_, base=None):
|
|
from ebook_converter.ebooks.html.input import Link
|
|
if not isinstance(link_, str):
|
|
try:
|
|
link_ = link_.decode('utf-8', 'error')
|
|
except:
|
|
self.log.warn('Failed to decode link %r. Ignoring'%link_)
|
|
return None, None
|
|
try:
|
|
l = Link(link_, base if base else os.getcwd())
|
|
except:
|
|
self.log.exception('Failed to process link: %r'%link_)
|
|
return None, None
|
|
if l.path is None:
|
|
# Not a local resource
|
|
return None, None
|
|
link = l.path.replace('/', os.sep).strip()
|
|
frag = l.fragment
|
|
if not link:
|
|
return None, None
|
|
return link, frag
|
|
|
|
def resource_adder(self, link_, base=None):
|
|
link, frag = self.link_to_local_path(link_, base=base)
|
|
if link is None:
|
|
return link_
|
|
try:
|
|
if base and not os.path.isabs(link):
|
|
link = os.path.join(base, link)
|
|
link = os.path.abspath(link)
|
|
except:
|
|
return link_
|
|
if not os.access(link, os.R_OK):
|
|
return link_
|
|
if os.path.isdir(link):
|
|
self.log.warn(link_, 'is a link to a directory. Ignoring.')
|
|
return link_
|
|
if not self.is_case_sensitive(tempfile.gettempdir()):
|
|
link = link.lower()
|
|
if link not in self.added_resources:
|
|
bhref = os.path.basename(link)
|
|
id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
|
|
guessed = mimetypes.guess_type(href)[0]
|
|
media_type = guessed or self.BINARY_MIME
|
|
if media_type == 'text/plain':
|
|
self.log.warn('Ignoring link to text file %r'%link_)
|
|
return None
|
|
if media_type == self.BINARY_MIME:
|
|
# Check for the common case, images
|
|
try:
|
|
img = what(link)
|
|
except EnvironmentError:
|
|
pass
|
|
else:
|
|
if img:
|
|
media_type = mimetypes.guess_type('dummy.'+img)[0] or self.BINARY_MIME
|
|
|
|
self.oeb.log.debug('Added', link)
|
|
self.oeb.container = self.DirContainer(os.path.dirname(link),
|
|
self.oeb.log, ignore_opf=True)
|
|
# Load into memory
|
|
item = self.oeb.manifest.add(id, href, media_type)
|
|
# bhref refers to an already existing file. The read() method of
|
|
# DirContainer will call unquote on it before trying to read the
|
|
# file, therefore we quote it here.
|
|
if isinstance(bhref, str):
|
|
bhref = bhref.encode('utf-8')
|
|
item.html_input_href = as_unicode(urllib.parse.quote(bhref))
|
|
if guessed in self.OEB_STYLES:
|
|
item.override_css_fetch = functools.partial(
|
|
self.css_import_handler, os.path.dirname(link))
|
|
item.data
|
|
self.added_resources[link] = href
|
|
|
|
nlink = self.added_resources[link]
|
|
if frag:
|
|
nlink = '#'.join((nlink, frag))
|
|
return nlink
|
|
|
|
def css_import_handler(self, base, href):
|
|
link, frag = self.link_to_local_path(href, base=base)
|
|
if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
|
|
return (None, None)
|
|
try:
|
|
with open(link, 'rb') as f:
|
|
raw = f.read().decode('utf-8', 'replace')
|
|
raw = self.oeb.css_preprocessor(raw, add_namespace=False)
|
|
except:
|
|
self.log.exception('Failed to read CSS file: %r'%link)
|
|
return (None, None)
|
|
return (None, raw)
|