mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 08:32:26 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
317 lines
13 KiB
Python
317 lines
13 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import re, tempfile, os
|
|
from functools import partial
|
|
|
|
from ebook_converter.constants import islinux, isbsd
|
|
from ebook_converter.customize.conversion import (InputFormatPlugin,
|
|
OptionRecommendation)
|
|
from ebook_converter.utils.localization import get_lang
|
|
from ebook_converter.utils.filenames import ascii_filename
|
|
from ebook_converter.utils.imghdr import what
|
|
from ebook_converter.polyglot.builtins import unicode_type, zip, getcwd, as_unicode
|
|
|
|
|
|
def sanitize_file_name(x):
    """Return a cleaned-up version of the file name *x* for use as an href.

    The name is first passed through ``ascii_filename`` (presumably an ASCII
    transliteration helper -- defined elsewhere in the project). Then:

    * the URL-significant characters ``? & = ; #`` are replaced by ``_``;
    * runs of whitespace are collapsed to a single space;
    * surrounding whitespace and trailing dots are removed;
    * whitespace around the last ``.`` (the extension separator) is removed.

    A name without any dot is returned as-is after the cleanup above.
    """
    ans = re.sub(r'[?&=;#]', '_', ascii_filename(x))
    ans = re.sub(r'\s+', ' ', ans).strip().rstrip('.')
    stem, dot, ext = ans.rpartition('.')
    if not dot:
        # No extension separator: rpartition() put the whole name into the
        # tail. Gluing '' + '.' + name together would turn "index" into the
        # dot-file ".index", so return the bare name instead.
        return ext.strip()
    return (stem.strip() + '.' + ext.strip()).rstrip('.')
|
|
|
|
|
|
class HTMLInput(InputFormatPlugin):
    """Input plugin that builds an OEB book from an HTML file or an OPF.

    For HTML input the plugin collects the root file plus the HTML files
    returned by ``get_filelist``, adds them to the manifest and spine,
    rewrites links so that every referenced local resource is added to the
    manifest, and generates a table of contents from titles or headings.
    For OPF input the work is delegated to the plumber's ``create_oebbook``.
    """

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                   'files into a standard folder hierarchy. Only use this option '
                   'if you know what you are doing as it can result in various '
                   'nasty side effects in the rest of the conversion pipeline.'
                   )
        ),

    }

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        """Convert the input *stream* into an OEB book and return it.

        :param stream: open input file; its ``name`` attribute is used as the
            path of the root document.
        :param opts: conversion options (``dont_package``, ``input_encoding``
            and others are read here or further down the pipeline).
        :param file_ext: extension of the input; ``'opf'`` selects the OPF
            code path, anything else is treated as HTML.
        :param log: logger passed on to the OEB machinery.
        :param accelerators: not used by this plugin.
        :raises ValueError: if ``opts.dont_package`` is set for HTML input.
        """
        # Reset the cached file system case sensitivity for this run.
        self._is_case_sensitive = None
        basedir = getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError('The --dont-package option is not supported for an HTML input file')
            from ebook_converter.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                # Start from metadata guessed from the file name and let the
                # metadata read from the document itself update it.
                from ebook_converter.ebooks.metadata.meta import metadata_from_filename
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from ebook_converter.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                              encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        """Return whether the file system holding *path* is case sensitive.

        The answer is cached on the instance once it has been determined from
        an existing path. If *path* is empty or does not exist, the platform
        default (``islinux or isbsd``) is returned without being cached.
        """
        if getattr(self, '_is_case_sensitive', None) is not None:
            return self._is_case_sensitive
        if not path or not os.path.exists(path):
            return islinux or isbsd
        # If both the lower- and upper-cased spellings resolve to an existing
        # entry, the file system is taken to be case insensitive.
        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build and return an OEB book rooted at the HTML file *htmlpath*.

        :param htmlpath: path of the root HTML document.
        :param basedir: directory passed to ``get_filelist`` as the base.
        :param opts: conversion options (``input_encoding``, ``language`` and
            ``authors`` are read here).
        :param log: logger used for progress messages and by the containers.
        :param mi: metadata object copied into the book's OEB metadata.
        """
        import uuid
        from ebook_converter.ebooks.conversion.plumber import create_oebbook
        from ebook_converter.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
            urlquote)
        from ebook_converter import guess_type
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from ebook_converter.ebooks.html.input import get_filelist
        from ebook_converter.ebooks.metadata import string_to_authors
        from ebook_converter.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                             encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # --- Metadata: fill in language, creator, title and a fresh uuid ---
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            authors = getattr(opts, 'authors', None)
            if authors:
                authors = string_to_authors(authors)
            if not authors:
                oeb.logger.warn('Creator not specified')
                authors = [self.oeb.translate(__('Unknown'))]
            for aut in authors:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries an id attribute;
                # the first identifier in the list need not have one.
                self.oeb.uid = ident
                break

        # --- Manifest/spine: one entry per (non-binary) HTML file ---
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                                         ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(
                id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        # Maps the on-disk path of every resource added so far to its href.
        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Stash the lazily imported helpers so resource_adder() can use them.
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # Rewrite url() references inside the stylesheets pulled in above,
        # resolving them relative to the directory the stylesheet came from.
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        partial(self.resource_adder, base=dpath))

        self._populate_toc()

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def _populate_toc(self):
        """Add one TOC entry per linear spine item of ``self.oeb``.

        Each entry is labelled with the document's ``<title>``. If any linear
        document has no title, or two documents share a title, every entry is
        labelled with the first non-empty h1-h5/strong text of its document
        instead (``'(unlabled)'`` when there is none).
        """
        from ebook_converter.ebooks.oeb.base import xpath

        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        # Keep each label together with its spine item so that skipped
        # (non-linear) items and untitled documents cannot shift labels onto
        # the wrong file.
        entries = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            header = '(unlabled)'
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                text = ''.join(xpath(html, expr % tag))
                text = re.sub(r'\s+', ' ', text.strip())
                if text:
                    header = text
                    break
            entries.append((item, title, header))

        titles = [title for item, title, header in entries]
        use_headers = not all(titles) or len(titles) > len(set(titles))
        for item, title, header in entries:
            toc.add(header if use_headers else title, item.href)

    def link_to_local_path(self, link_, base=None):
        """Resolve *link_* to a local file system path.

        :param link_: the link as text, or as UTF-8 encoded bytes.
        :param base: directory used to resolve the link; defaults to the
            current working directory.
        :return: ``(path, fragment)``, or ``(None, None)`` if the link cannot
            be decoded or parsed, is not a local resource, or is empty.
        """
        from ebook_converter.ebooks.html.input import Link
        if not isinstance(link_, unicode_type):
            try:
                # Strict decoding: undecodable links are reported and skipped.
                link_ = link_.decode('utf-8')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring'%link_)
                return None, None
        try:
            parsed = Link(link_, base if base else getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r'%link_)
            return None, None
        if parsed.path is None:
            # Not a local resource
            return None, None
        link = parsed.path.replace('/', os.sep).strip()
        frag = parsed.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        """Link-rewriting callback: add the target of *link_* to the manifest.

        :param link_: the link exactly as it appears in the document.
        :param base: directory relative links are resolved against.
        :return: the manifest href of the resource (with the original
            fragment re-attached); *link_* unchanged when it does not point
            to a readable local file; ``None`` for links to plain text files.
        """
        from ebook_converter.polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(
                id='added', href=sanitize_file_name(bhref))
            guessed = self.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r'%link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                    self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, unicode_type):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = partial(
                    self.css_import_handler, os.path.dirname(link))
            # Accessing .data forces the item to be loaded now, while the
            # container still points at the resource's directory.
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        """Fetch callback used for stylesheets referenced from CSS.

        :param base: directory of the stylesheet containing the reference.
        :param href: the referenced location.
        :return: ``(None, text)`` with the preprocessed CSS text, or
            ``(None, None)`` if the target is not a readable local file or
            reading/preprocessing it fails.
        """
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
            return (None, None)
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except Exception:
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
|