mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-24 15:11:30 +02:00
Added first portion of logging adaptation.
Things may be broken at this point - there are still several modules to be adapted.
This commit is contained in:
@@ -12,7 +12,7 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
from ebook_converter.utils.config import OptionParser
|
from ebook_converter.utils.config import OptionParser
|
||||||
from ebook_converter.utils.logging import Log
|
from ebook_converter.utils import logging
|
||||||
from ebook_converter.customize.conversion import OptionRecommendation
|
from ebook_converter.customize.conversion import OptionRecommendation
|
||||||
|
|
||||||
|
|
||||||
@@ -66,7 +66,7 @@ def check_command_line_options(parser, args, log):
|
|||||||
if (not input_file.endswith('.recipe') and
|
if (not input_file.endswith('.recipe') and
|
||||||
not os.access(input_file, os.R_OK) and
|
not os.access(input_file, os.R_OK) and
|
||||||
not ('-h' in args or '--help' in args)):
|
not ('-h' in args or '--help' in args)):
|
||||||
log.error('Cannot read from', input_file)
|
log.error('Cannot read from %s', input_file)
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
if input_file.endswith('.recipe') and not os.access(input_file, os.R_OK):
|
if input_file.endswith('.recipe') and not os.access(input_file, os.R_OK):
|
||||||
input_file = args[1]
|
input_file = args[1]
|
||||||
@@ -267,7 +267,7 @@ class ProgressBar(object):
|
|||||||
def __call__(self, frac, msg=''):
|
def __call__(self, frac, msg=''):
|
||||||
if msg:
|
if msg:
|
||||||
percent = int(frac*100)
|
percent = int(frac*100)
|
||||||
self.log('%d%% %s' % (percent, msg))
|
self.log.info('%d%% %s' % (percent, msg))
|
||||||
|
|
||||||
|
|
||||||
def create_option_parser(args, log):
|
def create_option_parser(args, log):
|
||||||
@@ -275,20 +275,18 @@ def create_option_parser(args, log):
|
|||||||
from ebook_converter.constants_old import __appname__
|
from ebook_converter.constants_old import __appname__
|
||||||
from ebook_converter.constants_old import __author__
|
from ebook_converter.constants_old import __author__
|
||||||
from ebook_converter.constants_old import __version__
|
from ebook_converter.constants_old import __version__
|
||||||
log(os.path.basename(args[0]), '('+__appname__, __version__+')')
|
log.info("%s (%s, %s)", os.path.basename(args[0]), __appname__,
|
||||||
log('Created by:', __author__)
|
__version__)
|
||||||
|
log.info('Created by: %s', __author__)
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
if '--list-recipes' in args:
|
if '--list-recipes' in args:
|
||||||
from ebook_converter.web.feeds.recipes.collection import \
|
from ebook_converter.web.feeds.recipes.collection import \
|
||||||
get_builtin_recipe_titles
|
get_builtin_recipe_titles
|
||||||
log('Available recipes:')
|
log.info('Available recipes:')
|
||||||
titles = sorted(get_builtin_recipe_titles())
|
titles = sorted(get_builtin_recipe_titles())
|
||||||
for title in titles:
|
for title in titles:
|
||||||
try:
|
log.info('\t%s', title)
|
||||||
log('\t'+title)
|
log.info('%d recipes available', len(titles))
|
||||||
except Exception:
|
|
||||||
log('\t'+repr(title))
|
|
||||||
log('%d recipes available' % len(titles))
|
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
|
|
||||||
parser = option_parser()
|
parser = option_parser()
|
||||||
@@ -352,7 +350,7 @@ def read_sr_patterns(path, log=None):
|
|||||||
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
def main(args=sys.argv):
|
||||||
log = Log()
|
log = logging.default_log
|
||||||
mimetypes.init([pkg_resources.resource_filename('ebook_converter',
|
mimetypes.init([pkg_resources.resource_filename('ebook_converter',
|
||||||
'data/mime.types')])
|
'data/mime.types')])
|
||||||
parser, plumber = create_option_parser(args, log)
|
parser, plumber = create_option_parser(args, log)
|
||||||
@@ -386,7 +384,7 @@ def main(args=sys.argv):
|
|||||||
|
|
||||||
plumber.run()
|
plumber.run()
|
||||||
|
|
||||||
log('Output saved to', ' ', plumber.output)
|
log.info('Output saved to %s', plumber.output)
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|||||||
@@ -146,7 +146,7 @@ class EPUBInput(InputFormatPlugin):
|
|||||||
if len(spine) > 1:
|
if len(spine) > 1:
|
||||||
for item in spine:
|
for item in spine:
|
||||||
if item.get('idref') == titlepage_id:
|
if item.get('idref') == titlepage_id:
|
||||||
log('Found HTML cover', titlepage_href)
|
log.info('Found HTML cover %s', titlepage_href)
|
||||||
if self.for_viewer:
|
if self.for_viewer:
|
||||||
item.attrib.pop('linear', None)
|
item.attrib.pop('linear', None)
|
||||||
else:
|
else:
|
||||||
@@ -192,7 +192,7 @@ class EPUBInput(InputFormatPlugin):
|
|||||||
elem = [x for x in manifest if x.get('id', '') == idref]
|
elem = [x for x in manifest if x.get('id', '') == idref]
|
||||||
if not elem or elem[0].get('href', None) != guide_cover:
|
if not elem or elem[0].get('href', None) != guide_cover:
|
||||||
return
|
return
|
||||||
log('Found HTML cover', guide_cover)
|
log.info('Found HTML cover %s', guide_cover)
|
||||||
|
|
||||||
# Remove from spine as covers must be treated
|
# Remove from spine as covers must be treated
|
||||||
# specially
|
# specially
|
||||||
|
|||||||
@@ -214,7 +214,7 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
self.workaround_sony_quirks()
|
self.workaround_sony_quirks()
|
||||||
|
|
||||||
if self.oeb.toc.count() == 0:
|
if self.oeb.toc.count() == 0:
|
||||||
self.log.warn('This EPUB file has no Table of Contents. '
|
self.log.warning('This EPUB file has no Table of Contents. '
|
||||||
'Creating a default TOC')
|
'Creating a default TOC')
|
||||||
first = next(iter(self.oeb.spine))
|
first = next(iter(self.oeb.spine))
|
||||||
self.oeb.toc.add('Start', first.href)
|
self.oeb.toc.add('Start', first.href)
|
||||||
@@ -229,7 +229,7 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
|
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
|
||||||
|
|
||||||
if _uuid is None:
|
if _uuid is None:
|
||||||
self.log.warn('No UUID identifier found')
|
self.log.warning('No UUID identifier found')
|
||||||
_uuid = str(uuid.uuid4())
|
_uuid = str(uuid.uuid4())
|
||||||
oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)
|
oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)
|
||||||
|
|
||||||
@@ -281,7 +281,7 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
os.mkdir(opts.extract_to)
|
os.mkdir(opts.extract_to)
|
||||||
with ZipFile(output_path) as zf:
|
with ZipFile(output_path) as zf:
|
||||||
zf.extractall(path=opts.extract_to)
|
zf.extractall(path=opts.extract_to)
|
||||||
self.log.info('EPUB extracted to', opts.extract_to)
|
self.log.info('EPUB extracted to %s', opts.extract_to)
|
||||||
|
|
||||||
def upgrade_to_epub3(self, tdir, opf):
|
def upgrade_to_epub3(self, tdir, opf):
|
||||||
self.log.info('Upgrading to EPUB 3...')
|
self.log.info('Upgrading to EPUB 3...')
|
||||||
@@ -323,7 +323,7 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
uris.pop(uri)
|
uris.pop(uri)
|
||||||
continue
|
continue
|
||||||
self.log.debug('Encrypting font:', uri)
|
self.log.debug('Encrypting font: %s', uri)
|
||||||
with open(path, 'r+b') as f:
|
with open(path, 'r+b') as f:
|
||||||
data = f.read(1024)
|
data = f.read(1024)
|
||||||
if len(data) >= 1024:
|
if len(data) >= 1024:
|
||||||
@@ -332,7 +332,7 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
f.write(bytes(bytearray(data[i] ^ key[i%16]
|
f.write(bytes(bytearray(data[i] ^ key[i%16]
|
||||||
for i in range(1024))))
|
for i in range(1024))))
|
||||||
else:
|
else:
|
||||||
self.log.warn('Font', path, 'is invalid, ignoring')
|
self.log.warning('Font %s is invalid, ignoring', path)
|
||||||
if not isinstance(uri, str):
|
if not isinstance(uri, str):
|
||||||
uri = uri.decode('utf-8')
|
uri = uri.decode('utf-8')
|
||||||
fonts.append('''
|
fonts.append('''
|
||||||
@@ -385,8 +385,9 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
_base, _, frag = href.partition('#')
|
_base, _, frag = href.partition('#')
|
||||||
frag = urllib.parse.unquote(frag)
|
frag = urllib.parse.unquote(frag)
|
||||||
if frag and frag_pat.match(frag) is None:
|
if frag and frag_pat.match(frag) is None:
|
||||||
self.log.warn(
|
self.log.warning('Removing fragment identifier %r from '
|
||||||
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
|
'TOC as Adobe Digital Editions cannot '
|
||||||
|
'handle it', frag)
|
||||||
node.href = _base
|
node.href = _base
|
||||||
|
|
||||||
for x in self.oeb.spine:
|
for x in self.oeb.spine:
|
||||||
@@ -530,8 +531,8 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
for x in self.oeb.spine:
|
for x in self.oeb.spine:
|
||||||
if x.href == href:
|
if x.href == href:
|
||||||
if frag_is_at_top(x.data, frag):
|
if frag_is_at_top(x.data, frag):
|
||||||
self.log.debug('Removing anchor from TOC href:',
|
self.log.debug('Removing anchor from TOC '
|
||||||
href+'#'+frag)
|
'href: %s#%s', href, frag)
|
||||||
toc.href = href
|
toc.href = href
|
||||||
break
|
break
|
||||||
for x in toc:
|
for x in toc:
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ class LRFInput(InputFormatPlugin):
|
|||||||
from ebook_converter.ebooks.lrf.input import MediaType, Styles, \
|
from ebook_converter.ebooks.lrf.input import MediaType, Styles, \
|
||||||
TextBlock, Canvas, ImageBlock, RuledLine
|
TextBlock, Canvas, ImageBlock, RuledLine
|
||||||
self.log = log
|
self.log = log
|
||||||
self.log('Generating XML')
|
self.log.info('Generating XML')
|
||||||
from ebook_converter.ebooks.lrf.lrfparser import LRFDocument
|
from ebook_converter.ebooks.lrf.lrfparser import LRFDocument
|
||||||
d = LRFDocument(stream)
|
d = LRFDocument(stream)
|
||||||
d.parse()
|
d.parse()
|
||||||
@@ -50,7 +50,7 @@ class LRFInput(InputFormatPlugin):
|
|||||||
if imgstr:
|
if imgstr:
|
||||||
plot_map[ro] = imgstr[0].get('file')
|
plot_map[ro] = imgstr[0].get('file')
|
||||||
|
|
||||||
self.log('Converting XML to HTML...')
|
self.log.info('Converting XML to HTML...')
|
||||||
|
|
||||||
with open(pkg_resources.
|
with open(pkg_resources.
|
||||||
resource_filename('ebook_converter',
|
resource_filename('ebook_converter',
|
||||||
|
|||||||
@@ -854,8 +854,8 @@ OptionRecommendation(name='search_replace',
|
|||||||
try:
|
try:
|
||||||
val = float(val)
|
val = float(val)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.log.warn('Values of series index and rating must'
|
self.log.warning('Values of series index and rating '
|
||||||
' be numbers. Ignoring', val)
|
'must be numbers. Ignoring %s', val)
|
||||||
continue
|
continue
|
||||||
elif x in ('timestamp', 'pubdate'):
|
elif x in ('timestamp', 'pubdate'):
|
||||||
try:
|
try:
|
||||||
@@ -882,7 +882,7 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.opts_to_mi(mi)
|
self.opts_to_mi(mi)
|
||||||
if mi.cover:
|
if mi.cover:
|
||||||
if mi.cover.startswith('http:') or mi.cover.startswith('https:'):
|
if mi.cover.startswith('http:') or mi.cover.startswith('https:'):
|
||||||
self.log.warn("TODO: Cover image is on remote server, "
|
self.log.warning("TODO: Cover image is on remote server, "
|
||||||
"implement downloading using requests")
|
"implement downloading using requests")
|
||||||
ext = mi.cover.rpartition('.')[-1].lower().strip()
|
ext = mi.cover.rpartition('.')[-1].lower().strip()
|
||||||
if ext not in ('png', 'jpg', 'jpeg', 'gif'):
|
if ext not in ('png', 'jpg', 'jpeg', 'gif'):
|
||||||
@@ -909,8 +909,8 @@ OptionRecommendation(name='search_replace',
|
|||||||
if x.short_name == sval:
|
if x.short_name == sval:
|
||||||
setattr(self.opts, attr, x)
|
setattr(self.opts, attr, x)
|
||||||
return
|
return
|
||||||
self.log.warn(
|
self.log.warning('Profile (%s) %r is no longer available, using '
|
||||||
'Profile (%s) %r is no longer available, using default'%(which, sval))
|
'default', which, sval)
|
||||||
for x in profiles():
|
for x in profiles():
|
||||||
if x.short_name == 'default':
|
if x.short_name == 'default':
|
||||||
setattr(self.opts, attr, x)
|
setattr(self.opts, attr, x)
|
||||||
@@ -925,14 +925,16 @@ OptionRecommendation(name='search_replace',
|
|||||||
if self.opts.verbose:
|
if self.opts.verbose:
|
||||||
self.log.filter_level = self.log.DEBUG
|
self.log.filter_level = self.log.DEBUG
|
||||||
if self.changed_options:
|
if self.changed_options:
|
||||||
self.log('Conversion options changed from defaults:')
|
self.log.info('Conversion options changed from defaults:')
|
||||||
for rec in self.changed_options:
|
for rec in self.changed_options:
|
||||||
if rec.option.name not in ('username', 'password'):
|
if rec.option.name not in ('username', 'password'):
|
||||||
self.log(' ', '%s:' % rec.option.name, repr(rec.recommended_value))
|
# Log one changed option as "<name>: <repr of value>".
# Every positional argument needs its own placeholder: the format string
# previously had a single %s for two arguments, so the value was never
# rendered (and stdlib-style lazy %-formatting reports a formatting error
# for the surplus argument).
# NOTE(review): presumably self.log follows the stdlib logging call
# convention, as the other converted calls in this commit do -- confirm
# against ebook_converter.utils.logging.
self.log.info(' %s: %s', rec.option.name,
              repr(rec.recommended_value))
|
||||||
if self.opts.verbose > 1:
|
if self.opts.verbose > 1:
|
||||||
self.log.debug('Resolved conversion options')
|
self.log.debug('Resolved conversion options')
|
||||||
try:
|
try:
|
||||||
self.log.debug('ebook_converter version:', constants.VERSION)
|
self.log.debug('ebook_converter version: %s',
|
||||||
|
constants.VERSION)
|
||||||
odict = dict(self.opts.__dict__)
|
odict = dict(self.opts.__dict__)
|
||||||
for x in ('username', 'password'):
|
for x in ('username', 'password'):
|
||||||
odict.pop(x, None)
|
odict.pop(x, None)
|
||||||
@@ -968,7 +970,7 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.input_plugin.save_download(zf)
|
self.input_plugin.save_download(zf)
|
||||||
zf.close()
|
zf.close()
|
||||||
|
|
||||||
self.log.info('Input debug saved to:', out_dir)
|
self.log.info('Input debug saved to: %s', out_dir)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
'''
|
'''
|
||||||
@@ -1022,7 +1024,8 @@ OptionRecommendation(name='search_replace',
|
|||||||
from ebook_converter.ebooks.azw4.reader import unwrap
|
from ebook_converter.ebooks.azw4.reader import unwrap
|
||||||
unwrap(stream, self.output)
|
unwrap(stream, self.output)
|
||||||
self.ui_reporter(1.)
|
self.ui_reporter(1.)
|
||||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
self.log.info('%s output written to %s', self.output_fmt.upper(),
|
||||||
|
self.output)
|
||||||
self.flush()
|
self.flush()
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -1056,7 +1059,7 @@ OptionRecommendation(name='search_replace',
|
|||||||
if self.opts.debug_pipeline is not None:
|
if self.opts.debug_pipeline is not None:
|
||||||
out_dir = os.path.join(self.opts.debug_pipeline, 'parsed')
|
out_dir = os.path.join(self.opts.debug_pipeline, 'parsed')
|
||||||
self.dump_oeb(self.oeb, out_dir)
|
self.dump_oeb(self.oeb, out_dir)
|
||||||
self.log('Parsed HTML written to:', out_dir)
|
self.log.info('Parsed HTML written to: %s', out_dir)
|
||||||
self.input_plugin.specialize(self.oeb, self.opts, self.log,
|
self.input_plugin.specialize(self.oeb, self.opts, self.log,
|
||||||
self.output_fmt)
|
self.output_fmt)
|
||||||
|
|
||||||
@@ -1105,13 +1108,13 @@ OptionRecommendation(name='search_replace',
|
|||||||
try:
|
try:
|
||||||
fkey = list(map(float, fkey.split(',')))
|
fkey = list(map(float, fkey.split(',')))
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.error('Invalid font size key: %r ignoring'%fkey)
|
self.log.error('Invalid font size key: %s ignoring', fkey)
|
||||||
fkey = self.opts.dest.fkey
|
fkey = self.opts.dest.fkey
|
||||||
|
|
||||||
if self.opts.debug_pipeline is not None:
|
if self.opts.debug_pipeline is not None:
|
||||||
out_dir = os.path.join(self.opts.debug_pipeline, 'structure')
|
out_dir = os.path.join(self.opts.debug_pipeline, 'structure')
|
||||||
self.dump_oeb(self.oeb, out_dir)
|
self.dump_oeb(self.oeb, out_dir)
|
||||||
self.log('Structured HTML written to:', out_dir)
|
self.log.info('Structured HTML written to: %s', out_dir)
|
||||||
|
|
||||||
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
|
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
|
||||||
with open(self.opts.extra_css, 'rb') as f:
|
with open(self.opts.extra_css, 'rb') as f:
|
||||||
@@ -1187,9 +1190,9 @@ OptionRecommendation(name='search_replace',
|
|||||||
if self.opts.debug_pipeline is not None:
|
if self.opts.debug_pipeline is not None:
|
||||||
out_dir = os.path.join(self.opts.debug_pipeline, 'processed')
|
out_dir = os.path.join(self.opts.debug_pipeline, 'processed')
|
||||||
self.dump_oeb(self.oeb, out_dir)
|
self.dump_oeb(self.oeb, out_dir)
|
||||||
self.log('Processed HTML written to:', out_dir)
|
self.log.info('Processed HTML written to: %s', out_dir)
|
||||||
|
|
||||||
self.log.info('Creating %s...'%self.output_plugin.name)
|
self.log.info('Creating %s...', self.output_plugin.name)
|
||||||
our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
|
our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
|
||||||
self.output_plugin.report_progress = our
|
self.output_plugin.report_progress = our
|
||||||
our(0., 'Running %s plugin' % self.output_plugin.name)
|
our(0., 'Running %s plugin' % self.output_plugin.name)
|
||||||
@@ -1200,7 +1203,8 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.ui_reporter(1.)
|
self.ui_reporter(1.)
|
||||||
run_plugins_on_postprocess(self.output, self.output_fmt)
|
run_plugins_on_postprocess(self.output, self.output_fmt)
|
||||||
|
|
||||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
self.log.info('%s output written to %s', self.output_fmt.upper(),
|
||||||
|
self.output)
|
||||||
self.flush()
|
self.flush()
|
||||||
|
|
||||||
|
|
||||||
@@ -1230,7 +1234,7 @@ def create_oebbook(log, path_or_stream, opts, reader=None,
|
|||||||
if specialize is not None:
|
if specialize is not None:
|
||||||
oeb = specialize(oeb) or oeb
|
oeb = specialize(oeb) or oeb
|
||||||
# Read OEB Book into OEBBook
|
# Read OEB Book into OEBBook
|
||||||
log('Parsing all content...')
|
log.info('Parsing all content...')
|
||||||
oeb.removed_items_to_ignore = removed_items
|
oeb.removed_items_to_ignore = removed_items
|
||||||
if reader is None:
|
if reader is None:
|
||||||
from ebook_converter.ebooks.oeb.reader import OEBReader
|
from ebook_converter.ebooks.oeb.reader import OEBReader
|
||||||
@@ -1241,11 +1245,11 @@ def create_oebbook(log, path_or_stream, opts, reader=None,
|
|||||||
|
|
||||||
|
|
||||||
def create_dummy_plumber(input_format, output_format):
|
def create_dummy_plumber(input_format, output_format):
|
||||||
from ebook_converter.utils.logging import Log
|
from ebook_converter.utils import logging
|
||||||
input_format = input_format.lower()
|
input_format = input_format.lower()
|
||||||
output_format = output_format.lower()
|
output_format = output_format.lower()
|
||||||
output_path = 'dummy.'+output_format
|
output_path = 'dummy.'+output_format
|
||||||
log = Log()
|
log = logging.default_log
|
||||||
log.outputs = []
|
log.outputs = []
|
||||||
input_file = 'dummy.'+input_format
|
input_file = 'dummy.'+input_format
|
||||||
if input_format in ARCHIVE_FMTS:
|
if input_format in ARCHIVE_FMTS:
|
||||||
|
|||||||
@@ -5,11 +5,6 @@ from ebook_converter.utils.logging import default_log
|
|||||||
from ebook_converter.utils.wordcount import get_wordcount_obj
|
from ebook_converter.utils.wordcount import get_wordcount_obj
|
||||||
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
|
|
||||||
class HeuristicProcessor(object):
|
class HeuristicProcessor(object):
|
||||||
|
|
||||||
def __init__(self, extra_opts=None, log=None):
|
def __init__(self, extra_opts=None, log=None):
|
||||||
@@ -50,8 +45,8 @@ class HeuristicProcessor(object):
|
|||||||
title = match.group('title')
|
title = match.group('title')
|
||||||
if not title:
|
if not title:
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
self.log.debug("marked %s chapters. - %s",
|
||||||
" chapters. - " + str(chap))
|
self.html_preprocess_sections, str(chap))
|
||||||
return '<h2>'+chap+'</h2>\n'
|
return '<h2>'+chap+'</h2>\n'
|
||||||
else:
|
else:
|
||||||
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
|
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
|
||||||
@@ -59,16 +54,16 @@ class HeuristicProcessor(object):
|
|||||||
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
|
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
|
||||||
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
|
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
self.log.debug("marked %s chapters & titles. - %s, %s",
|
||||||
" chapters & titles. - " + str(chap) + ", " + str(title))
|
self.html_preprocess_sections, chap, title)
|
||||||
return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
|
return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
|
||||||
|
|
||||||
def chapter_break(self, match):
|
def chapter_break(self, match):
|
||||||
chap = match.group('section')
|
chap = match.group('section')
|
||||||
styles = match.group('styles')
|
styles = match.group('styles')
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
self.log.debug("marked %s section markers based on punctuation. - %s",
|
||||||
" section markers based on punctuation. - " + str(chap))
|
self.html_preprocess_sections, chap)
|
||||||
return '<'+styles+' style="page-break-before:always">'+chap
|
return '<'+styles+' style="page-break-before:always">'+chap
|
||||||
|
|
||||||
def analyze_title_matches(self, match):
|
def analyze_title_matches(self, match):
|
||||||
@@ -111,8 +106,6 @@ class HeuristicProcessor(object):
|
|||||||
line_end = line_end_ere.findall(raw)
|
line_end = line_end_ere.findall(raw)
|
||||||
tot_htm_ends = len(htm_end)
|
tot_htm_ends = len(htm_end)
|
||||||
tot_ln_fds = len(line_end)
|
tot_ln_fds = len(line_end)
|
||||||
# self.log.debug("There are " + str(tot_ln_fds) + " total Line feeds, and " +
|
|
||||||
# str(tot_htm_ends) + " marked up endings")
|
|
||||||
|
|
||||||
if percent > 1:
|
if percent > 1:
|
||||||
percent = 1
|
percent = 1
|
||||||
@@ -120,7 +113,6 @@ class HeuristicProcessor(object):
|
|||||||
percent = 0
|
percent = 0
|
||||||
|
|
||||||
min_lns = tot_ln_fds * percent
|
min_lns = tot_ln_fds * percent
|
||||||
# self.log.debug("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
|
|
||||||
return min_lns > tot_htm_ends
|
return min_lns > tot_htm_ends
|
||||||
|
|
||||||
def dump(self, raw, where):
|
def dump(self, raw, where):
|
||||||
@@ -148,7 +140,6 @@ class HeuristicProcessor(object):
|
|||||||
return wordcount.words
|
return wordcount.words
|
||||||
|
|
||||||
def markup_italicis(self, html):
|
def markup_italicis(self, html):
|
||||||
# self.log.debug("\n\n\nitalicize debugging \n\n\n")
|
|
||||||
ITALICIZE_WORDS = [
|
ITALICIZE_WORDS = [
|
||||||
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
||||||
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
||||||
@@ -178,7 +169,6 @@ class HeuristicProcessor(object):
|
|||||||
for pat in ITALICIZE_STYLE_PATS:
|
for pat in ITALICIZE_STYLE_PATS:
|
||||||
for match in re.finditer(pat, search_text):
|
for match in re.finditer(pat, search_text):
|
||||||
ital_string = str(match.group('words'))
|
ital_string = str(match.group('words'))
|
||||||
# self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
|
|
||||||
try:
|
try:
|
||||||
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
|
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
|
||||||
except OverflowError:
|
except OverflowError:
|
||||||
@@ -205,10 +195,11 @@ class HeuristicProcessor(object):
|
|||||||
if wordcount > 200000:
|
if wordcount > 200000:
|
||||||
typical_chapters = 15000.
|
typical_chapters = 15000.
|
||||||
self.min_chapters = int(ceil(wordcount / typical_chapters))
|
self.min_chapters = int(ceil(wordcount / typical_chapters))
|
||||||
self.log.debug("minimum chapters required are: "+str(self.min_chapters))
|
self.log.debug("minimum chapters required are: %s", self.min_chapters)
|
||||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||||
self.html_preprocess_sections = len(heading.findall(html))
|
self.html_preprocess_sections = len(heading.findall(html))
|
||||||
self.log.debug("found " + str(self.html_preprocess_sections) + " pre-existing headings")
|
self.log.debug("found %s pre-existing headings",
|
||||||
|
self.html_preprocess_sections)
|
||||||
|
|
||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
init_lookahead = "(?=<(p|div))"
|
init_lookahead = "(?=<(p|div))"
|
||||||
@@ -298,7 +289,8 @@ class HeuristicProcessor(object):
|
|||||||
if n_lookahead_req:
|
if n_lookahead_req:
|
||||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||||
if not analyze:
|
if not analyze:
|
||||||
self.log.debug("Marked " + str(self.html_preprocess_sections) + " headings, " + log_message)
|
self.log.debug("Marked %s headings, %s",
|
||||||
|
self.html_preprocess_sections, log_message)
|
||||||
|
|
||||||
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
|
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
|
||||||
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||||
@@ -311,11 +303,12 @@ class HeuristicProcessor(object):
|
|||||||
if float(self.chapters_with_title) / float(hits) > .5:
|
if float(self.chapters_with_title) / float(hits) > .5:
|
||||||
title_req = True
|
title_req = True
|
||||||
strict_title = False
|
strict_title = False
|
||||||
self.log.debug(
|
self.log.debug('%s had %s hits %s chapters with no '
|
||||||
str(type_name)+" had "+str(hits)+
|
'title, %s chapters with titles, %s '
|
||||||
" hits - "+str(self.chapters_no_title)+" chapters with no title, "+
|
'percent.', type_name, hits,
|
||||||
str(self.chapters_with_title)+" chapters with titles, "+
|
self.chapters_no_title,
|
||||||
str(float(self.chapters_with_title) / float(hits))+" percent. ")
|
self.chapters_with_title,
|
||||||
|
self.chapters_with_title / hits)
|
||||||
if type_name == 'common':
|
if type_name == 'common':
|
||||||
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
|
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
|
||||||
@@ -332,8 +325,9 @@ class HeuristicProcessor(object):
|
|||||||
words_per_chptr = wordcount
|
words_per_chptr = wordcount
|
||||||
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
||||||
words_per_chptr = wordcount // self.html_preprocess_sections
|
words_per_chptr = wordcount // self.html_preprocess_sections
|
||||||
self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+
|
self.log.debug("Total wordcount is: %s, Average words per section "
|
||||||
str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
"is: %s, Marked up %s chapters", wordcount,
|
||||||
|
words_per_chptr, self.html_preprocess_sections)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def punctuation_unwrap(self, length, content, format):
|
def punctuation_unwrap(self, length, content, format):
|
||||||
@@ -427,7 +421,8 @@ class HeuristicProcessor(object):
|
|||||||
txtindent = re.compile(str(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
|
txtindent = re.compile(str(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
|
||||||
html = txtindent.sub(self.insert_indent, html)
|
html = txtindent.sub(self.insert_indent, html)
|
||||||
if self.found_indents > 1:
|
if self.found_indents > 1:
|
||||||
self.log.debug("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
|
self.log.debug("replaced %s nbsp indents with inline styles",
|
||||||
|
self.found_indents)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def cleanup_markup(self, html):
|
def cleanup_markup(self, html):
|
||||||
@@ -475,8 +470,8 @@ class HeuristicProcessor(object):
|
|||||||
blanklines = self.blankreg.findall(html)
|
blanklines = self.blankreg.findall(html)
|
||||||
lines = self.linereg.findall(html)
|
lines = self.linereg.findall(html)
|
||||||
if len(lines) > 1:
|
if len(lines) > 1:
|
||||||
self.log.debug("There are " + str(len(blanklines)) + " blank lines. " +
|
self.log.debug("There are %s blank lines. %s percent blank",
|
||||||
str(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
len(blanklines), len(blanklines) / len(lines))
|
||||||
|
|
||||||
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||||
return True
|
return True
|
||||||
@@ -600,7 +595,7 @@ class HeuristicProcessor(object):
|
|||||||
width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
|
width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
|
||||||
except:
|
except:
|
||||||
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
|
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
|
||||||
self.log.warn('Invalid replacement scene break'
|
self.log.warning('Invalid replacement scene break'
|
||||||
' expression, using default')
|
' expression, using default')
|
||||||
else:
|
else:
|
||||||
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
|
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
|
||||||
@@ -702,20 +697,23 @@ class HeuristicProcessor(object):
|
|||||||
blockquote_open_loop = blockquote_open
|
blockquote_open_loop = blockquote_open
|
||||||
if debugabby:
|
if debugabby:
|
||||||
self.log.debug('\n\n******\n')
|
self.log.debug('\n\n******\n')
|
||||||
self.log.debug('padding top is: '+str(setting[0]))
|
self.log.debug('padding top is: %s', setting[0])
|
||||||
self.log.debug('padding right is:' +str(setting[1]))
|
self.log.debug('padding right is: %s', setting[1])
|
||||||
self.log.debug('padding bottom is: ' + str(setting[2]))
|
self.log.debug('padding bottom is: %s', setting[2])
|
||||||
self.log.debug('padding left is: ' +str(setting[3]))
|
self.log.debug('padding left is: %s', setting[3])
|
||||||
|
|
||||||
# print "text-align is: "+str(text_align)
|
# print "text-align is: "+str(text_align)
|
||||||
# print "\n***\nline is:\n "+str(match.group(0))+'\n'
|
# print "\n***\nline is:\n "+str(match.group(0))+'\n'
|
||||||
if debugabby:
|
if debugabby:
|
||||||
# print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
|
# print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
|
||||||
self.log.debug("styles for this line were:", styles)
|
self.log.debug("styles for this line were: %s", styles)
|
||||||
self.log.debug('newline is:')
|
self.log.debug('newline is: %s', blockquote_open_loop +
|
||||||
self.log.debug(blockquote_open_loop+blockquote_close_loop+
|
blockquote_close_loop +
|
||||||
paragraph_before+'<p style="'+text_indent+text_align+
|
paragraph_before +
|
||||||
'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
|
'<p style="%s">%s</p>' %
|
||||||
|
(text_indent + text_align, content) +
|
||||||
|
paragraph_after +
|
||||||
|
'\n\n\n\n\n')
|
||||||
# print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
|
# print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
|
||||||
self.previous_was_paragraph = is_paragraph
|
self.previous_was_paragraph = is_paragraph
|
||||||
# print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
|
# print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
|
||||||
@@ -731,10 +729,10 @@ class HeuristicProcessor(object):
|
|||||||
try:
|
try:
|
||||||
self.totalwords = self.get_word_count(html)
|
self.totalwords = self.get_word_count(html)
|
||||||
except:
|
except:
|
||||||
self.log.warn("Can't get wordcount")
|
self.log.warning("Can't get wordcount")
|
||||||
|
|
||||||
if self.totalwords < 50:
|
if self.totalwords < 50:
|
||||||
self.log.warn("flow is too short, not running heuristics")
|
self.log.warning("flow is too short, not running heuristics")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
is_abbyy = self.is_abbyy(html)
|
is_abbyy = self.is_abbyy(html)
|
||||||
@@ -801,12 +799,13 @@ class HeuristicProcessor(object):
|
|||||||
# more of the lines break in the same region of the document then unwrapping is required
|
# more of the lines break in the same region of the document then unwrapping is required
|
||||||
docanalysis = DocAnalysis(format, html)
|
docanalysis = DocAnalysis(format, html)
|
||||||
hardbreaks = docanalysis.line_histogram(.50)
|
hardbreaks = docanalysis.line_histogram(.50)
|
||||||
self.log.debug("Hard line breaks check returned "+str(hardbreaks))
|
self.log.debug("Hard line breaks check returned %s", hardbreaks)
|
||||||
|
|
||||||
# Calculate Length
|
# Calculate Length
|
||||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||||
length = docanalysis.line_length(unwrap_factor)
|
length = docanalysis.line_length(unwrap_factor)
|
||||||
self.log.debug("Median line length is " + str(length) + ", calculated with " + format + " format")
|
self.log.debug("Median line length is %s, calculated with %s format",
|
||||||
|
length, format)
|
||||||
|
|
||||||
# ##### Unwrap lines ######
|
# ##### Unwrap lines ######
|
||||||
if getattr(self.extra_opts, 'unwrap_lines', False):
|
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||||
@@ -827,8 +826,9 @@ class HeuristicProcessor(object):
|
|||||||
|
|
||||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
self.log.debug("Looking for more split points based on punctuation,"
|
self.log.debug("Looking for more split points based on "
|
||||||
" currently have " + str(self.html_preprocess_sections))
|
"punctuation, currently have %s",
|
||||||
|
self.html_preprocess_sections)
|
||||||
chapdetect3 = re.compile(
|
chapdetect3 = re.compile(
|
||||||
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
|
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
|
||||||
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
|
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
|
||||||
|
|||||||
@@ -188,7 +188,7 @@ class FB2MLizer(object):
|
|||||||
metadata['id'] = str(x).split(':')[-1]
|
metadata['id'] = str(x).split(':')[-1]
|
||||||
break
|
break
|
||||||
if metadata['id'] is None:
|
if metadata['id'] is None:
|
||||||
self.log.warn('No UUID identifier found')
|
self.log.warning('No UUID identifier found')
|
||||||
metadata['id'] = str(uuid.uuid4())
|
metadata['id'] = str(uuid.uuid4())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -311,7 +311,7 @@ class FB2MLizer(object):
|
|||||||
self.section_level += 1
|
self.section_level += 1
|
||||||
|
|
||||||
for item in self.oeb_book.spine:
|
for item in self.oeb_book.spine:
|
||||||
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
|
self.log.debug('Converting %s to FictionBook2 XML', item.href)
|
||||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts,
|
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts,
|
||||||
self.opts.output_profile)
|
self.opts.output_profile)
|
||||||
|
|
||||||
@@ -369,7 +369,7 @@ class FB2MLizer(object):
|
|||||||
content_type, data))
|
content_type, data))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error('Error: Could not include file %s because '
|
self.log.error('Error: Could not include file %s because '
|
||||||
'%s.' % (item.href, e))
|
'%s.', item.href, e)
|
||||||
return '\n'.join(images)
|
return '\n'.join(images)
|
||||||
|
|
||||||
def create_flat_toc(self, nodes, level):
|
def create_flat_toc(self, nodes, level):
|
||||||
@@ -528,7 +528,7 @@ class FB2MLizer(object):
|
|||||||
fb2_out.append('<image l:href="#%s"/>' %
|
fb2_out.append('<image l:href="#%s"/>' %
|
||||||
self.image_hrefs[ihref])
|
self.image_hrefs[ihref])
|
||||||
else:
|
else:
|
||||||
self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
|
self.log.warning('Ignoring image not in manifest: %s', ihref)
|
||||||
if tag in ('br', 'hr') or ems >= 1:
|
if tag in ('br', 'hr') or ems >= 1:
|
||||||
if ems < 1:
|
if ems < 1:
|
||||||
multiplier = 1
|
multiplier = 1
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ class OEB2HTML(object):
|
|||||||
'<title>%s</title></head>'
|
'<title>%s</title></head>'
|
||||||
'<body>' % entities.prepare_string_for_xml(self.book_title)]
|
'<body>' % entities.prepare_string_for_xml(self.book_title)]
|
||||||
for item in oeb_book.spine:
|
for item in oeb_book.spine:
|
||||||
self.log.debug('Converting %s to HTML...' % item.href)
|
self.log.debug('Converting %s to HTML...', item.href)
|
||||||
self.rewrite_ids(item.data, item)
|
self.rewrite_ids(item.data, item)
|
||||||
base.rewrite_links(item.data, partial(self.rewrite_link,
|
base.rewrite_links(item.data, partial(self.rewrite_link,
|
||||||
page=item))
|
page=item))
|
||||||
@@ -342,7 +342,7 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
|
|||||||
def mlize_spine(self, oeb_book):
|
def mlize_spine(self, oeb_book):
|
||||||
output = []
|
output = []
|
||||||
for item in oeb_book.spine:
|
for item in oeb_book.spine:
|
||||||
self.log.debug('Converting %s to HTML...' % item.href)
|
self.log.debug('Converting %s to HTML...', item.href)
|
||||||
self.rewrite_ids(item.data, item)
|
self.rewrite_ids(item.data, item)
|
||||||
base.rewrite_links(item.data, partial(self.rewrite_link,
|
base.rewrite_links(item.data, partial(self.rewrite_link,
|
||||||
page=item))
|
page=item))
|
||||||
|
|||||||
@@ -331,9 +331,9 @@ class HTMLConverter(object):
|
|||||||
if link['path'] == path:
|
if link['path'] == path:
|
||||||
self.links.remove(link)
|
self.links.remove(link)
|
||||||
break
|
break
|
||||||
self.log.warn('Could not process '+path)
|
self.log.warning('Could not process %s', path)
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
self.log.exception(' ')
|
self.log.exception(' ') # WAT
|
||||||
self.links = self.process_links()
|
self.links = self.process_links()
|
||||||
self.link_level += 1
|
self.link_level += 1
|
||||||
paths = [link['path'] for link in self.links]
|
paths = [link['path'] for link in self.links]
|
||||||
@@ -400,7 +400,7 @@ class HTMLConverter(object):
|
|||||||
with open(os.path.join(tdir,
|
with open(os.path.join(tdir,
|
||||||
'html2lrf-verbose.html'), 'wb') as f:
|
'html2lrf-verbose.html'), 'wb') as f:
|
||||||
f.write(str(soup).encode('utf-8'))
|
f.write(str(soup).encode('utf-8'))
|
||||||
self.log.info('Written preprocessed HTML to '+f.name)
|
self.log.info('Written preprocessed HTML to %s', f.name)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -416,8 +416,8 @@ class HTMLConverter(object):
|
|||||||
self.css[selector] = self.override_css[selector]
|
self.css[selector] = self.override_css[selector]
|
||||||
|
|
||||||
self.file_name = os.path.basename(path)
|
self.file_name = os.path.basename(path)
|
||||||
self.log.info('Processing %s' % (path if self.verbose else
|
self.log.info('Processing %s', path if self.verbose else
|
||||||
self.file_name))
|
self.file_name)
|
||||||
|
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
# convertlit replaces & with %26 in file names
|
# convertlit replaces & with %26 in file names
|
||||||
@@ -589,7 +589,7 @@ class HTMLConverter(object):
|
|||||||
try:
|
try:
|
||||||
index = self.book.pages().index(opage)
|
index = self.book.pages().index(opage)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.log.warning('%s is an empty file' % self.file_name)
|
self.log.warning('%s is an empty file', self.file_name)
|
||||||
tb = self.book.create_text_block()
|
tb = self.book.create_text_block()
|
||||||
self.current_page.append(tb)
|
self.current_page.append(tb)
|
||||||
return tb
|
return tb
|
||||||
@@ -656,7 +656,7 @@ class HTMLConverter(object):
|
|||||||
hasattr(target.parent, 'objId'):
|
hasattr(target.parent, 'objId'):
|
||||||
self.book.addTocEntry(ascii_text, tb)
|
self.book.addTocEntry(ascii_text, tb)
|
||||||
else:
|
else:
|
||||||
self.log.debug("Cannot add link %s to TOC" % ascii_text)
|
self.log.debug("Cannot add link %s to TOC", ascii_text)
|
||||||
|
|
||||||
def get_target_block(fragment, targets):
|
def get_target_block(fragment, targets):
|
||||||
'''Return the correct block for the <a name> element'''
|
'''Return the correct block for the <a name> element'''
|
||||||
@@ -1617,7 +1617,7 @@ class HTMLConverter(object):
|
|||||||
tag[key]] = self.current_block
|
tag[key]] = self.current_block
|
||||||
self.current_block.must_append = True
|
self.current_block.must_append = True
|
||||||
else:
|
else:
|
||||||
self.log.debug('Could not follow link to ',
|
self.log.debug('Could not follow link to %s',
|
||||||
tag['href'])
|
tag['href'])
|
||||||
self.process_children(tag, tag_css, tag_pseudo_css)
|
self.process_children(tag, tag_css, tag_pseudo_css)
|
||||||
elif tag.has_attr('name') or tag.has_attr('id'):
|
elif tag.has_attr('name') or tag.has_attr('id'):
|
||||||
@@ -1642,7 +1642,8 @@ class HTMLConverter(object):
|
|||||||
self.process_image(path, tag_css, width, height,
|
self.process_image(path, tag_css, width, height,
|
||||||
dropcaps=dropcaps, rescale=True)
|
dropcaps=dropcaps, rescale=True)
|
||||||
elif not urllib.parse.urlparse(tag['src'])[0]:
|
elif not urllib.parse.urlparse(tag['src'])[0]:
|
||||||
self.log.warn('Could not find image: '+tag['src'])
|
self.log.warning('Could not find image: %s',
|
||||||
|
tag['src'])
|
||||||
else:
|
else:
|
||||||
self.log.debug("Failed to process: %s", tag)
|
self.log.debug("Failed to process: %s", tag)
|
||||||
elif tagname in ['style', 'link']:
|
elif tagname in ['style', 'link']:
|
||||||
@@ -1665,7 +1666,7 @@ class HTMLConverter(object):
|
|||||||
self.page_break_found = True
|
self.page_break_found = True
|
||||||
ncss, npcss = self.parse_css(src)
|
ncss, npcss = self.parse_css(src)
|
||||||
except IOError:
|
except IOError:
|
||||||
self.log.warn('Could not read stylesheet: %s',
|
self.log.warning('Could not read stylesheet: %s',
|
||||||
tag['href'])
|
tag['href'])
|
||||||
if ncss:
|
if ncss:
|
||||||
update_css(ncss, self.css)
|
update_css(ncss, self.css)
|
||||||
@@ -1876,10 +1877,10 @@ class HTMLConverter(object):
|
|||||||
self.process_table(tag, tag_css)
|
self.process_table(tag, tag_css)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.log.warning('An error occurred while processing a '
|
self.log.warning('An error occurred while processing a '
|
||||||
'table: %s. Ignoring table markup.' %
|
'table: %s. Ignoring table markup.',
|
||||||
repr(err))
|
repr(err))
|
||||||
self.log.exception('')
|
self.log.exception('') # WAT
|
||||||
self.log.debug('Bad table:\n%s' % str(tag)[:300])
|
self.log.debug('Bad table:\n%s', str(tag)[:300])
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.process_children(tag, tag_css, tag_pseudo_css)
|
self.process_children(tag, tag_css, tag_pseudo_css)
|
||||||
finally:
|
finally:
|
||||||
@@ -1977,7 +1978,7 @@ def process_file(path, options, logger):
|
|||||||
tpath = tf.name
|
tpath = tf.name
|
||||||
# PIL sometimes fails, for example on interlaced PNG files
|
# PIL sometimes fails, for example on interlaced PNG files
|
||||||
except IOError as err:
|
except IOError as err:
|
||||||
logger.warn('Could not read cover image: %s', err)
|
logger.warning('Could not read cover image: %s', err)
|
||||||
options.cover = None
|
options.cover = None
|
||||||
else:
|
else:
|
||||||
raise ConversionError('Cannot read from: %s', options.cover)
|
raise ConversionError('Cannot read from: %s', options.cover)
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ class Canvas(etree.XSLTExtension):
|
|||||||
table.append(tr)
|
table.append(tr)
|
||||||
for obj, x, y in self.get_objects(canvas):
|
for obj, x, y in self.get_objects(canvas):
|
||||||
if obj.tag != 'TextBlock':
|
if obj.tag != 'TextBlock':
|
||||||
self.log.warn(obj.tag, 'elements in Canvas not supported')
|
self.log.warning('%s elements in Canvas not supported',
|
||||||
|
obj.tag)
|
||||||
continue
|
continue
|
||||||
td = table.makeelement('td')
|
td = table.makeelement('td')
|
||||||
self.text_block.render_block(obj, td)
|
self.text_block.render_block(obj, td)
|
||||||
@@ -168,7 +169,7 @@ class TextBlock(etree.XSLTExtension):
|
|||||||
if deepest < 500:
|
if deepest < 500:
|
||||||
return
|
return
|
||||||
|
|
||||||
self.log.warn('Found deeply nested spans. Flattening.')
|
self.log.warning('Found deeply nested spans. Flattening.')
|
||||||
# with open('/t/before.xml', 'wb') as f:
|
# with open('/t/before.xml', 'wb') as f:
|
||||||
# f.write(etree.tostring(node, method='xml'))
|
# f.write(etree.tostring(node, method='xml'))
|
||||||
|
|
||||||
@@ -270,7 +271,7 @@ class TextBlock(etree.XSLTExtension):
|
|||||||
self.add_text_to = (img, 'tail')
|
self.add_text_to = (img, 'tail')
|
||||||
self.add_text(child.tail)
|
self.add_text(child.tail)
|
||||||
else:
|
else:
|
||||||
self.log.warn('Unhandled Text element:', child.tag)
|
self.log.warning('Unhandled Text element: %s', child.tag)
|
||||||
|
|
||||||
|
|
||||||
class Styles(etree.XSLTExtension):
|
class Styles(etree.XSLTExtension):
|
||||||
|
|||||||
@@ -24,7 +24,8 @@ class Extract(ODF2XHTML):
|
|||||||
if not os.path.exists('Pictures'):
|
if not os.path.exists('Pictures'):
|
||||||
os.makedirs('Pictures')
|
os.makedirs('Pictures')
|
||||||
for name in zf.namelist():
|
for name in zf.namelist():
|
||||||
if name.startswith('Pictures') and name not in {'Pictures', 'Pictures/'}:
|
if (name.startswith('Pictures') and
|
||||||
|
name not in {'Pictures', 'Pictures/'}):
|
||||||
data = zf.read(name)
|
data = zf.read(name)
|
||||||
with open(name, 'wb') as f:
|
with open(name, 'wb') as f:
|
||||||
f.write(data)
|
f.write(data)
|
||||||
@@ -46,13 +47,13 @@ class Extract(ODF2XHTML):
|
|||||||
self.extract_css(root, log)
|
self.extract_css(root, log)
|
||||||
self.epubify_markup(root, log)
|
self.epubify_markup(root, log)
|
||||||
self.apply_list_starts(root, log)
|
self.apply_list_starts(root, log)
|
||||||
html = etree.tostring(root, encoding='utf-8',
|
html = etree.tostring(root, encoding='utf-8', xml_declaration=True)
|
||||||
xml_declaration=True)
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def extract_css(self, root, log):
|
def extract_css(self, root, log):
|
||||||
ans = []
|
ans = []
|
||||||
for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
|
for s in root.xpath('//*[local-name() = "style" and '
|
||||||
|
'@type="text/css"]'):
|
||||||
ans.append(s.text)
|
ans.append(s.text)
|
||||||
s.getparent().remove(s)
|
s.getparent().remove(s)
|
||||||
|
|
||||||
@@ -63,11 +64,11 @@ class Extract(ODF2XHTML):
|
|||||||
if ns:
|
if ns:
|
||||||
ns = '{%s}'%ns
|
ns = '{%s}'%ns
|
||||||
etree.SubElement(head, ns+'link', {'type':'text/css',
|
etree.SubElement(head, ns+'link', {'type':'text/css',
|
||||||
'rel':'stylesheet', 'href':'odfpy.css'})
|
'rel':'stylesheet',
|
||||||
|
'href':'odfpy.css'})
|
||||||
|
|
||||||
css = u'\n\n'.join(ans)
|
css = u'\n\n'.join(ans)
|
||||||
parser = CSSParser(loglevel=logging.WARNING,
|
parser = CSSParser(loglevel=logging.WARNING, log=_css_logger)
|
||||||
log=_css_logger)
|
|
||||||
self.css = parser.parseString(css, validate=False)
|
self.css = parser.parseString(css, validate=False)
|
||||||
|
|
||||||
with open('odfpy.css', 'wb') as f:
|
with open('odfpy.css', 'wb') as f:
|
||||||
@@ -209,7 +210,8 @@ class Extract(ODF2XHTML):
|
|||||||
for frm in self.document.topnode.getElementsByType(odFrame):
|
for frm in self.document.topnode.getElementsByType(odFrame):
|
||||||
try:
|
try:
|
||||||
if frm.getAttrNS(odTEXTNS,u'anchor-type') == 'page':
|
if frm.getAttrNS(odTEXTNS,u'anchor-type') == 'page':
|
||||||
log.warn('Document has Pictures anchored to Page, will all end up before first page!')
|
log.warning('Document has Pictures anchored to Page, will '
|
||||||
|
'all end up before first page!')
|
||||||
break
|
break
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
@@ -234,7 +236,8 @@ class Extract(ODF2XHTML):
|
|||||||
# now it should be safe to remove the text:p
|
# now it should be safe to remove the text:p
|
||||||
parent = para.parentNode
|
parent = para.parentNode
|
||||||
parent.removeChild(para)
|
parent.removeChild(para)
|
||||||
log("Removed cover image paragraph from document...")
|
log.info("Removed cover image paragraph from "
|
||||||
|
"document...")
|
||||||
break
|
break
|
||||||
|
|
||||||
def filter_load(self, odffile, mi, log):
|
def filter_load(self, odffile, mi, log):
|
||||||
@@ -267,7 +270,7 @@ class Extract(ODF2XHTML):
|
|||||||
if not os.path.exists(odir):
|
if not os.path.exists(odir):
|
||||||
os.makedirs(odir)
|
os.makedirs(odir)
|
||||||
with directory.CurrentDir(odir):
|
with directory.CurrentDir(odir):
|
||||||
log('Extracting ODT file...')
|
log.info('Extracting ODT file...')
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
mi = get_metadata(stream, 'odt')
|
mi = get_metadata(stream, 'odt')
|
||||||
if not mi.title:
|
if not mi.title:
|
||||||
|
|||||||
@@ -904,7 +904,7 @@ class Manifest(object):
|
|||||||
def _parse_xhtml(self, data):
|
def _parse_xhtml(self, data):
|
||||||
orig_data = data
|
orig_data = data
|
||||||
fname = urllib.parse.unquote(self.href)
|
fname = urllib.parse.unquote(self.href)
|
||||||
self.oeb.log.debug('Parsing', fname, '...')
|
self.oeb.log.debug('Parsing %s ...', fname)
|
||||||
self.oeb.html_preprocessor.current_href = self.href
|
self.oeb.html_preprocessor.current_href = self.href
|
||||||
try:
|
try:
|
||||||
data = parse_utils.parse_html(data, log=self.oeb.log,
|
data = parse_utils.parse_html(data, log=self.oeb.log,
|
||||||
@@ -924,7 +924,7 @@ class Manifest(object):
|
|||||||
if has_html in data:
|
if has_html in data:
|
||||||
return self._parse_xhtml(data)
|
return self._parse_xhtml(data)
|
||||||
|
|
||||||
self.oeb.log.debug('Converting', self.href, '...')
|
self.oeb.log.debug('Converting %s ...', self.href)
|
||||||
|
|
||||||
from ebook_converter.ebooks.txt.processor import convert_markdown
|
from ebook_converter.ebooks.txt.processor import convert_markdown
|
||||||
|
|
||||||
@@ -941,7 +941,7 @@ class Manifest(object):
|
|||||||
from css_parser.css import CSSRule
|
from css_parser.css import CSSRule
|
||||||
log.setLevel(logging.WARN)
|
log.setLevel(logging.WARN)
|
||||||
log.raiseExceptions = False
|
log.raiseExceptions = False
|
||||||
self.oeb.log.debug('Parsing', self.href, '...')
|
self.oeb.log.debug('Parsing %s ...', self.href)
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
data = self.oeb.css_preprocessor(data, add_namespace=False)
|
data = self.oeb.css_preprocessor(data, add_namespace=False)
|
||||||
parser = CSSParser(loglevel=logging.WARNING,
|
parser = CSSParser(loglevel=logging.WARNING,
|
||||||
@@ -957,11 +957,11 @@ class Manifest(object):
|
|||||||
def _fetch_css(self, path):
|
def _fetch_css(self, path):
|
||||||
hrefs = self.oeb.manifest.hrefs
|
hrefs = self.oeb.manifest.hrefs
|
||||||
if path not in hrefs:
|
if path not in hrefs:
|
||||||
self.oeb.logger.warn('CSS import of missing file %r' % path)
|
self.oeb.logger.warning('CSS import of missing file %s', path)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
item = hrefs[path]
|
item = hrefs[path]
|
||||||
if item.media_type not in OEB_STYLES:
|
if item.media_type not in OEB_STYLES:
|
||||||
self.oeb.logger.warn('CSS import of non-CSS file %r' % path)
|
self.oeb.logger.warning('CSS import of non-CSS file %s', path)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
data = item.data.cssText
|
data = item.data.cssText
|
||||||
enc = None if isinstance(data, str) else 'utf-8'
|
enc = None if isinstance(data, str) else 'utf-8'
|
||||||
@@ -1002,8 +1002,8 @@ class Manifest(object):
|
|||||||
elif mt in OEB_STYLES:
|
elif mt in OEB_STYLES:
|
||||||
data = self._parse_css(data)
|
data = self._parse_css(data)
|
||||||
elif mt == 'text/plain':
|
elif mt == 'text/plain':
|
||||||
self.oeb.log.warn('%s contains data in TXT format' % self.href,
|
self.oeb.log.warning('%s contains data in TXT format. '
|
||||||
'converting to HTML')
|
'Converting to HTML', self.href)
|
||||||
data = self._parse_txt(data)
|
data = self._parse_txt(data)
|
||||||
self.media_type = XHTML_MIME
|
self.media_type = XHTML_MIME
|
||||||
self._data = data
|
self._data = data
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ def merge_multiple_html_heads_and_bodies(root, log=None):
|
|||||||
body.append(x)
|
body.append(x)
|
||||||
tuple(map(root.append, (head, body)))
|
tuple(map(root.append, (head, body)))
|
||||||
if log is not None:
|
if log is not None:
|
||||||
log.warn('Merging multiple <head> and <body> sections')
|
log.warning('Merging multiple <head> and <body> sections')
|
||||||
return root
|
return root
|
||||||
|
|
||||||
|
|
||||||
@@ -122,7 +122,7 @@ def clean_word_doc(data, log):
|
|||||||
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
|
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
|
||||||
prefixes.append(match.group(1))
|
prefixes.append(match.group(1))
|
||||||
if prefixes:
|
if prefixes:
|
||||||
log.warn('Found microsoft markup, cleaning...')
|
log.warning('Found microsoft markup, cleaning...')
|
||||||
# Remove empty tags as they are not rendered by browsers
|
# Remove empty tags as they are not rendered by browsers
|
||||||
# but can become renderable HTML tags like <p/> if the
|
# but can become renderable HTML tags like <p/> if the
|
||||||
# document is parsed by an HTML parser
|
# document is parsed by an HTML parser
|
||||||
@@ -214,13 +214,13 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
check_for_html5(pre, data)
|
check_for_html5(pre, data)
|
||||||
except (HTML5Doc, etree.XMLSyntaxError):
|
except (HTML5Doc, etree.XMLSyntaxError):
|
||||||
log.debug('Parsing %s as HTML' % filename)
|
log.debug('Parsing %s as HTML', filename)
|
||||||
data = raw
|
data = raw
|
||||||
try:
|
try:
|
||||||
data = html5_parse(data)
|
data = html5_parse(data)
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception(
|
log.exception('HTML 5 parsing failed, falling back to older '
|
||||||
'HTML 5 parsing failed, falling back to older parsers')
|
'parsers')
|
||||||
data = _html4_parse(data)
|
data = _html4_parse(data)
|
||||||
|
|
||||||
if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
|
if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
|
||||||
@@ -239,7 +239,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
if barename(data.tag) != 'html':
|
if barename(data.tag) != 'html':
|
||||||
if barename(data.tag) in non_html_file_tags:
|
if barename(data.tag) in non_html_file_tags:
|
||||||
raise NotHTML(data.tag)
|
raise NotHTML(data.tag)
|
||||||
log.warn('File %r does not appear to be (X)HTML'%filename)
|
log.warning('File %s does not appear to be (X)HTML', filename)
|
||||||
nroot = etree.fromstring('<html></html>')
|
nroot = etree.fromstring('<html></html>')
|
||||||
has_body = False
|
has_body = False
|
||||||
for child in list(data):
|
for child in list(data):
|
||||||
@@ -248,7 +248,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
break
|
break
|
||||||
parent = nroot
|
parent = nroot
|
||||||
if not has_body:
|
if not has_body:
|
||||||
log.warn('File %r appears to be a HTML fragment'%filename)
|
log.warning('File %s appears to be a HTML fragment', filename)
|
||||||
nroot = etree.fromstring('<html><body/></html>')
|
nroot = etree.fromstring('<html><body/></html>')
|
||||||
parent = nroot[0]
|
parent = nroot[0]
|
||||||
for child in list(data.iter()):
|
for child in list(data.iter()):
|
||||||
@@ -260,7 +260,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
|
|
||||||
# Force into the XHTML namespace
|
# Force into the XHTML namespace
|
||||||
if not namespace(data.tag):
|
if not namespace(data.tag):
|
||||||
log.warn('Forcing', filename, 'into XHTML namespace')
|
log.warning('Forcing %s into XHTML namespace', filename)
|
||||||
data.attrib['xmlns'] = const.XHTML_NS
|
data.attrib['xmlns'] = const.XHTML_NS
|
||||||
data = etree.tostring(data, encoding='unicode')
|
data = etree.tostring(data, encoding='unicode')
|
||||||
|
|
||||||
@@ -272,10 +272,8 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
log.warn('Stripping comments from %s'%
|
log.warning('Stripping comments from %s', filename)
|
||||||
filename)
|
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
|
||||||
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
|
|
||||||
data)
|
|
||||||
data = data.replace(
|
data = data.replace(
|
||||||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||||
'')
|
'')
|
||||||
@@ -283,7 +281,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
log.warn('Stripping meta tags from %s'% filename)
|
log.warning('Stripping meta tags from %s', filename)
|
||||||
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
elif namespace(data.tag) != const.XHTML_NS:
|
elif namespace(data.tag) != const.XHTML_NS:
|
||||||
@@ -308,7 +306,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
head = xpath(data, '/h:html/h:head')
|
head = xpath(data, '/h:html/h:head')
|
||||||
head = head[0] if head else None
|
head = head[0] if head else None
|
||||||
if head is None:
|
if head is None:
|
||||||
log.warn('File %s missing <head/> element' % filename)
|
log.warning('File %s missing <head/> element', filename)
|
||||||
head = etree.Element(XHTML('head'))
|
head = etree.Element(XHTML('head'))
|
||||||
data.insert(0, head)
|
data.insert(0, head)
|
||||||
title = etree.SubElement(head, XHTML('title'))
|
title = etree.SubElement(head, XHTML('title'))
|
||||||
@@ -335,7 +333,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
body.getparent().remove(body)
|
body.getparent().remove(body)
|
||||||
data.append(body)
|
data.append(body)
|
||||||
else:
|
else:
|
||||||
log.warn('File %s missing <body/> element' % filename)
|
log.warning('File %s missing <body/> element', filename)
|
||||||
etree.SubElement(data, XHTML('body'))
|
etree.SubElement(data, XHTML('body'))
|
||||||
|
|
||||||
# Remove microsoft office markup
|
# Remove microsoft office markup
|
||||||
|
|||||||
@@ -1141,8 +1141,8 @@ class EpubContainer(Container):
|
|||||||
zf = ZipFile(stream)
|
zf = ZipFile(stream)
|
||||||
zf.extractall(tdir)
|
zf.extractall(tdir)
|
||||||
except:
|
except:
|
||||||
log.exception('EPUB appears to be invalid ZIP file, trying a'
|
log.exception('EPUB appears to be invalid ZIP file, '
|
||||||
' more forgiving ZIP parser')
|
'trying a more forgiving ZIP parser')
|
||||||
from ebook_converter.utils.localunzip import extractall
|
from ebook_converter.utils.localunzip import extractall
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
extractall(stream, path=tdir)
|
extractall(stream, path=tdir)
|
||||||
@@ -1481,7 +1481,7 @@ class AZW3Container(Container):
|
|||||||
'ebook_converter.ebooks.oeb.polish.container', 'do_explode',
|
'ebook_converter.ebooks.oeb.polish.container', 'do_explode',
|
||||||
args=(pathtoazw3, tdir), no_output=True)['result']
|
args=(pathtoazw3, tdir), no_output=True)['result']
|
||||||
except WorkerError as e:
|
except WorkerError as e:
|
||||||
log(e.orig_tb)
|
log.error(e.orig_tb)
|
||||||
raise InvalidMobi('Failed to explode MOBI')
|
raise InvalidMobi('Failed to explode MOBI')
|
||||||
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
||||||
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
||||||
|
|||||||
@@ -111,14 +111,14 @@ class OEBReader(object):
|
|||||||
encoding=None)
|
encoding=None)
|
||||||
try:
|
try:
|
||||||
opf = etree.fromstring(data)
|
opf = etree.fromstring(data)
|
||||||
self.logger.warn('OPF contains invalid HTML named entities')
|
self.logger.warning('OPF contains invalid HTML named entities')
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = re.sub(r'(?is)<tours>.+</tours>', '', data)
|
data = re.sub(r'(?is)<tours>.+</tours>', '', data)
|
||||||
data = data.replace('<dc-metadata>',
|
data = data.replace('<dc-metadata>',
|
||||||
'<dc-metadata xmlns:dc="'
|
'<dc-metadata xmlns:dc="'
|
||||||
'http://purl.org/metadata/dublin_core">')
|
'http://purl.org/metadata/dublin_core">')
|
||||||
opf = etree.fromstring(data)
|
opf = etree.fromstring(data)
|
||||||
self.logger.warn('OPF contains invalid tours section')
|
self.logger.warning('OPF contains invalid tours section')
|
||||||
|
|
||||||
ns = parse_utils.namespace(opf.tag)
|
ns = parse_utils.namespace(opf.tag)
|
||||||
if ns not in ('', const.OPF1_NS, const.OPF2_NS):
|
if ns not in ('', const.OPF1_NS, const.OPF2_NS):
|
||||||
@@ -172,7 +172,7 @@ class OEBReader(object):
|
|||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
raise
|
raise
|
||||||
except Exception:
|
except Exception:
|
||||||
self.logger.exception('Failed to parse content in %s' %
|
self.logger.exception('Failed to parse content in %s',
|
||||||
item.href)
|
item.href)
|
||||||
bad.append(item)
|
bad.append(item)
|
||||||
self.oeb.manifest.remove(item)
|
self.oeb.manifest.remove(item)
|
||||||
@@ -195,7 +195,7 @@ class OEBReader(object):
|
|||||||
data = item.data
|
data = item.data
|
||||||
except Exception:
|
except Exception:
|
||||||
self.oeb.log.exception('Failed to read from manifest '
|
self.oeb.log.exception('Failed to read from manifest '
|
||||||
'entry with id: %s, ignoring' %
|
'entry with id: %s, ignoring',
|
||||||
item.id)
|
item.id)
|
||||||
invalid.add(item)
|
invalid.add(item)
|
||||||
continue
|
continue
|
||||||
@@ -216,7 +216,7 @@ class OEBReader(object):
|
|||||||
scheme = urllib.parse.urlparse(href).scheme
|
scheme = urllib.parse.urlparse(href).scheme
|
||||||
except Exception:
|
except Exception:
|
||||||
self.oeb.log.exception('Skipping invalid href: '
|
self.oeb.log.exception('Skipping invalid href: '
|
||||||
'%r' % href)
|
'%s', href)
|
||||||
continue
|
continue
|
||||||
if not scheme and href not in known:
|
if not scheme and href not in known:
|
||||||
new.add(href)
|
new.add(href)
|
||||||
@@ -244,11 +244,12 @@ class OEBReader(object):
|
|||||||
continue
|
continue
|
||||||
if not self.oeb.container.exists(href):
|
if not self.oeb.container.exists(href):
|
||||||
if href not in warned:
|
if href not in warned:
|
||||||
self.logger.warn('Referenced file %r not found' % href)
|
self.logger.warning('Referenced file %s not found',
|
||||||
|
href)
|
||||||
warned.add(href)
|
warned.add(href)
|
||||||
continue
|
continue
|
||||||
if href not in warned:
|
if href not in warned:
|
||||||
self.logger.warn('Referenced file %r not in manifest' %
|
self.logger.warning('Referenced file %s not in manifest',
|
||||||
href)
|
href)
|
||||||
warned.add(href)
|
warned.add(href)
|
||||||
id, _ = manifest.generate(id='added')
|
id, _ = manifest.generate(id='added')
|
||||||
@@ -275,13 +276,13 @@ class OEBReader(object):
|
|||||||
media_type = media_type.lower()
|
media_type = media_type.lower()
|
||||||
fallback = elem.get('fallback')
|
fallback = elem.get('fallback')
|
||||||
if href in manifest.hrefs:
|
if href in manifest.hrefs:
|
||||||
self.logger.warn('Duplicate manifest entry for %r' % href)
|
self.logger.warning('Duplicate manifest entry for %s', href)
|
||||||
continue
|
continue
|
||||||
if not self.oeb.container.exists(href):
|
if not self.oeb.container.exists(href):
|
||||||
self.logger.warn('Manifest item %r not found' % href)
|
self.logger.warning('Manifest item %s not found', href)
|
||||||
continue
|
continue
|
||||||
if id in manifest.ids:
|
if id in manifest.ids:
|
||||||
self.logger.warn('Duplicate manifest id %r' % id)
|
self.logger.warning('Duplicate manifest id %s', id)
|
||||||
id, href = manifest.generate(id, href)
|
id, href = manifest.generate(id, href)
|
||||||
manifest.add(id, href, media_type, fallback)
|
manifest.add(id, href, media_type, fallback)
|
||||||
invalid = self._manifest_prune_invalid()
|
invalid = self._manifest_prune_invalid()
|
||||||
@@ -323,8 +324,8 @@ class OEBReader(object):
|
|||||||
if item.href in removed_items_to_ignore:
|
if item.href in removed_items_to_ignore:
|
||||||
continue
|
continue
|
||||||
if version >= 2:
|
if version >= 2:
|
||||||
self.logger.warn(
|
self.logger.warning('Spine-referenced file %s not in spine',
|
||||||
'Spine-referenced file %r not in spine' % item.href)
|
item.href)
|
||||||
spine.add(item, linear=False)
|
spine.add(item, linear=False)
|
||||||
|
|
||||||
def _spine_from_opf(self, opf):
|
def _spine_from_opf(self, opf):
|
||||||
@@ -333,7 +334,7 @@ class OEBReader(object):
|
|||||||
for elem in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
|
for elem in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
|
||||||
idref = elem.get('idref')
|
idref = elem.get('idref')
|
||||||
if idref not in manifest.ids:
|
if idref not in manifest.ids:
|
||||||
self.logger.warn('Spine item %r not found' % idref)
|
self.logger.warning('Spine item %s not found', idref)
|
||||||
continue
|
continue
|
||||||
item = manifest.ids[idref]
|
item = manifest.ids[idref]
|
||||||
if (item.media_type.lower() in base.OEB_DOCS and
|
if (item.media_type.lower() in base.OEB_DOCS and
|
||||||
@@ -346,8 +347,8 @@ class OEBReader(object):
|
|||||||
item.media_type = base.XHTML_MIME
|
item.media_type = base.XHTML_MIME
|
||||||
spine.add(item, elem.get('linear'))
|
spine.add(item, elem.get('linear'))
|
||||||
else:
|
else:
|
||||||
self.oeb.log.warn('The item %s is not a XML document.'
|
self.oeb.log.warning('The item %s is not a XML document.'
|
||||||
' Removing it from spine.' % item.href)
|
' Removing it from spine.', item.href)
|
||||||
if len(spine) == 0:
|
if len(spine) == 0:
|
||||||
raise base.OEBError("Spine is empty")
|
raise base.OEBError("Spine is empty")
|
||||||
self._spine_add_extra()
|
self._spine_add_extra()
|
||||||
@@ -369,7 +370,8 @@ class OEBReader(object):
|
|||||||
corrected_href = href
|
corrected_href = href
|
||||||
break
|
break
|
||||||
if corrected_href is None:
|
if corrected_href is None:
|
||||||
self.logger.warn('Guide reference %r not found' % ref_href)
|
self.logger.warning('Guide reference %s not found',
|
||||||
|
ref_href)
|
||||||
continue
|
continue
|
||||||
ref_href = corrected_href
|
ref_href = corrected_href
|
||||||
typ = elem.get('type')
|
typ = elem.get('type')
|
||||||
@@ -411,7 +413,7 @@ class OEBReader(object):
|
|||||||
if path and path not in self.oeb.manifest.hrefs:
|
if path and path not in self.oeb.manifest.hrefs:
|
||||||
path = base.urlnormalize(path)
|
path = base.urlnormalize(path)
|
||||||
if href and path not in self.oeb.manifest.hrefs:
|
if href and path not in self.oeb.manifest.hrefs:
|
||||||
self.logger.warn('TOC reference %r not found' % href)
|
self.logger.warning('TOC reference %s not found', href)
|
||||||
gc = base.xpath(child, 'ncx:navPoint')
|
gc = base.xpath(child, 'ncx:navPoint')
|
||||||
if not gc:
|
if not gc:
|
||||||
# This node is useless
|
# This node is useless
|
||||||
@@ -488,7 +490,7 @@ class OEBReader(object):
|
|||||||
continue
|
continue
|
||||||
path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
|
path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
|
||||||
if path not in self.oeb.manifest.hrefs:
|
if path not in self.oeb.manifest.hrefs:
|
||||||
self.logger.warn('TOC reference %r not found' % href)
|
self.logger.warning('TOC reference %s not found', href)
|
||||||
continue
|
continue
|
||||||
id = site.get('id')
|
id = site.get('id')
|
||||||
toc.add(title, href, id=id)
|
toc.add(title, href, id=id)
|
||||||
@@ -528,7 +530,7 @@ class OEBReader(object):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def _toc_from_spine(self, opf):
|
def _toc_from_spine(self, opf):
|
||||||
self.log.warn('Generating default TOC from spine...')
|
self.log.warning('Generating default TOC from spine...')
|
||||||
toc = self.oeb.toc
|
toc = self.oeb.toc
|
||||||
titles = []
|
titles = []
|
||||||
headers = []
|
headers = []
|
||||||
@@ -656,7 +658,7 @@ class OEBReader(object):
|
|||||||
if item is not None and item.media_type in base.OEB_IMAGES:
|
if item is not None and item.media_type in base.OEB_IMAGES:
|
||||||
return item
|
return item
|
||||||
else:
|
else:
|
||||||
self.logger.warn('Invalid cover image @id %r' % id)
|
self.logger.warning('Invalid cover image @id %s', id)
|
||||||
hcover = self.oeb.spine[0]
|
hcover = self.oeb.spine[0]
|
||||||
if 'cover' in self.oeb.guide:
|
if 'cover' in self.oeb.guide:
|
||||||
href = self.oeb.guide['cover'].href
|
href = self.oeb.guide['cover'].href
|
||||||
@@ -705,8 +707,8 @@ class OEBReader(object):
|
|||||||
items = [x for x in self.oeb.manifest if x.href == href]
|
items = [x for x in self.oeb.manifest if x.href == href]
|
||||||
for x in items:
|
for x in items:
|
||||||
if x not in self.oeb.spine:
|
if x not in self.oeb.spine:
|
||||||
self.oeb.log.warn('Removing duplicate manifest item with '
|
self.oeb.log.warning('Removing duplicate manifest item '
|
||||||
'id:', x.id)
|
'with id: %s', x.id)
|
||||||
self.oeb.manifest.remove_duplicate_item(x)
|
self.oeb.manifest.remove_duplicate_item(x)
|
||||||
|
|
||||||
def _all_from_opf(self, opf):
|
def _all_from_opf(self, opf):
|
||||||
|
|||||||
@@ -241,11 +241,14 @@ class Stylizer(object):
|
|||||||
continue
|
continue
|
||||||
hrefs = self.oeb.manifest.hrefs
|
hrefs = self.oeb.manifest.hrefs
|
||||||
if ihref not in hrefs:
|
if ihref not in hrefs:
|
||||||
self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
|
self.logger.warning('Ignoring missing '
|
||||||
|
'stylesheet in @import '
|
||||||
|
'rule: %s', rule.href)
|
||||||
continue
|
continue
|
||||||
sitem = hrefs[ihref]
|
sitem = hrefs[ihref]
|
||||||
if sitem.media_type not in base.OEB_STYLES:
|
if sitem.media_type not in base.OEB_STYLES:
|
||||||
self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
|
self.logger.warning('CSS @import of non-CSS '
|
||||||
|
'file %s', rule.href)
|
||||||
continue
|
continue
|
||||||
stylesheets.append(sitem.data)
|
stylesheets.append(sitem.data)
|
||||||
# Make links to resources absolute, since these rules will
|
# Make links to resources absolute, since these rules will
|
||||||
@@ -261,14 +264,12 @@ class Stylizer(object):
|
|||||||
path = item.abshref(href)
|
path = item.abshref(href)
|
||||||
sitem = oeb.manifest.hrefs.get(path, None)
|
sitem = oeb.manifest.hrefs.get(path, None)
|
||||||
if sitem is None:
|
if sitem is None:
|
||||||
self.logger.warn(
|
self.logger.warning('Stylesheet %s referenced by file %s '
|
||||||
'Stylesheet %r referenced by file %r not in manifest' %
|
'not in manifest', path, item.href)
|
||||||
(path, item.href))
|
|
||||||
continue
|
continue
|
||||||
if not hasattr(sitem.data, 'cssRules'):
|
if not hasattr(sitem.data, 'cssRules'):
|
||||||
self.logger.warn(
|
self.logger.warning('Stylesheet %s referenced by file %s '
|
||||||
'Stylesheet %r referenced by file %r is not CSS'%(path,
|
'is not CSS', path, item.href)
|
||||||
item.href))
|
|
||||||
continue
|
continue
|
||||||
stylesheets.append(sitem.data)
|
stylesheets.append(sitem.data)
|
||||||
csses = {'extra_css':extra_css, 'user_css':user_css}
|
csses = {'extra_css':extra_css, 'user_css':user_css}
|
||||||
@@ -280,9 +281,8 @@ class Stylizer(object):
|
|||||||
validate=False)
|
validate=False)
|
||||||
stylesheets.append(stylesheet)
|
stylesheets.append(stylesheet)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.logger.exception('Failed to parse %s, ignoring.'%w)
|
self.logger.exception('Failed to parse %s, ignoring.', w)
|
||||||
self.logger.debug('Bad css: ')
|
self.logger.debug('Bad css: %s', x)
|
||||||
self.logger.debug(x)
|
|
||||||
|
|
||||||
# using oeb to store the rules, page rule and font face rules
|
# using oeb to store the rules, page rule and font face rules
|
||||||
# and generating them again if opts, profile or stylesheets are different
|
# and generating them again if opts, profile or stylesheets are different
|
||||||
@@ -303,7 +303,8 @@ class Stylizer(object):
|
|||||||
try:
|
try:
|
||||||
matches = tuple(select(text))
|
matches = tuple(select(text))
|
||||||
except SelectorError as err:
|
except SelectorError as err:
|
||||||
self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, err))
|
self.logger.error('Ignoring CSS rule with invalid selector: '
|
||||||
|
'%s (%s)', text, err)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if fl is not None:
|
if fl is not None:
|
||||||
@@ -367,11 +368,11 @@ class Stylizer(object):
|
|||||||
def _fetch_css_file(self, path):
|
def _fetch_css_file(self, path):
|
||||||
hrefs = self.oeb.manifest.hrefs
|
hrefs = self.oeb.manifest.hrefs
|
||||||
if path not in hrefs:
|
if path not in hrefs:
|
||||||
self.logger.warn('CSS import of missing file %r' % path)
|
self.logger.warning('CSS import of missing file %s', path)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
item = hrefs[path]
|
item = hrefs[path]
|
||||||
if item.media_type not in base.OEB_STYLES:
|
if item.media_type not in base.OEB_STYLES:
|
||||||
self.logger.warn('CSS import of non-CSS file %r' % path)
|
self.logger.warning('CSS import of non-CSS file %r', path)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
data = item.data.cssText
|
data = item.data.cssText
|
||||||
if not isinstance(data, bytes):
|
if not isinstance(data, bytes):
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class DataURL(object):
|
|||||||
data = polyglot.as_bytes(data)
|
data = polyglot.as_bytes(data)
|
||||||
fmt = what(None, data)
|
fmt = what(None, data)
|
||||||
if not fmt:
|
if not fmt:
|
||||||
self.log.warn('Image encoded as data URL has unknown '
|
self.log.warning('Image encoded as data URL has unknown '
|
||||||
'format, ignoring')
|
'format, ignoring')
|
||||||
continue
|
continue
|
||||||
img.set('src',
|
img.set('src',
|
||||||
@@ -44,8 +44,8 @@ class DataURL(object):
|
|||||||
oeb)))
|
oeb)))
|
||||||
|
|
||||||
def convert_image_data_uri(self, data, fmt, oeb):
|
def convert_image_data_uri(self, data, fmt, oeb):
|
||||||
self.log('Found image encoded as data URI converting it to normal '
|
self.log.info('Found image encoded as data URI converting it to '
|
||||||
'image')
|
'normal image')
|
||||||
item_id, item_href = oeb.manifest.generate('data-url-image',
|
item_id, item_href = oeb.manifest.generate('data-url-image',
|
||||||
'data-url-image.' + fmt)
|
'data-url-image.' + fmt)
|
||||||
oeb.manifest.add(item_id, item_href,
|
oeb.manifest.add(item_id, item_href,
|
||||||
|
|||||||
@@ -117,8 +117,9 @@ class UniqueFilenames(object): # {{{
|
|||||||
self.seen_filenames.add(fname)
|
self.seen_filenames.add(fname)
|
||||||
|
|
||||||
if self.rename_map:
|
if self.rename_map:
|
||||||
self.log('Found non-unique filenames, renaming to support broken'
|
self.log.info('Found non-unique filenames, renaming to support '
|
||||||
' EPUB readers like FBReader, Aldiko and Stanza...')
|
'broken EPUB readers like FBReader, Aldiko and '
|
||||||
|
'Stanza...')
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
self.log.debug(pformat(self.rename_map))
|
self.log.debug(pformat(self.rename_map))
|
||||||
|
|
||||||
@@ -173,8 +174,8 @@ class FlatFilenames(object): # {{{
|
|||||||
oeb.spine.insert(isp, nitem, item.linear)
|
oeb.spine.insert(isp, nitem, item.linear)
|
||||||
|
|
||||||
if self.rename_map:
|
if self.rename_map:
|
||||||
self.log('Found non-flat filenames, renaming to support broken'
|
self.log.info('Found non-flat filenames, renaming to support '
|
||||||
' EPUB readers like FBReader...')
|
'broken EPUB readers like FBReader...')
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
self.log.debug(pformat(self.rename_map))
|
self.log.debug(pformat(self.rename_map))
|
||||||
self.log.debug(pformat(self.renamed_items_map))
|
self.log.debug(pformat(self.renamed_items_map))
|
||||||
|
|||||||
@@ -182,7 +182,7 @@ class CSSFlattener(object):
|
|||||||
else:
|
else:
|
||||||
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css
|
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css
|
||||||
self.filter_css = frozenset(normalize_filter_css(self.filter_css))
|
self.filter_css = frozenset(normalize_filter_css(self.filter_css))
|
||||||
self.oeb.log.debug('Filtering CSS properties: %s'%
|
self.oeb.log.debug('Filtering CSS properties: %s',
|
||||||
', '.join(self.filter_css))
|
', '.join(self.filter_css))
|
||||||
|
|
||||||
for item in oeb.manifest.values():
|
for item in oeb.manifest.values():
|
||||||
@@ -231,13 +231,13 @@ class CSSFlattener(object):
|
|||||||
msg = ('No embeddable fonts found for family: %r'%family)
|
msg = ('No embeddable fonts found for family: %r'%family)
|
||||||
if failure_critical:
|
if failure_critical:
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
self.oeb.log.warn(msg)
|
self.oeb.log.warning(msg)
|
||||||
return body_font_family, efi
|
return body_font_family, efi
|
||||||
if not faces:
|
if not faces:
|
||||||
msg = ('No embeddable fonts found for family: %r'%family)
|
msg = ('No embeddable fonts found for family: %r'%family)
|
||||||
if failure_critical:
|
if failure_critical:
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
self.oeb.log.warn(msg)
|
self.oeb.log.warning(msg)
|
||||||
return body_font_family, efi
|
return body_font_family, efi
|
||||||
|
|
||||||
for i, font in enumerate(faces):
|
for i, font in enumerate(faces):
|
||||||
@@ -258,7 +258,7 @@ class CSSFlattener(object):
|
|||||||
if i == 0:
|
if i == 0:
|
||||||
generic_family = panose_to_css_generic_family(font['panose'])
|
generic_family = panose_to_css_generic_family(font['panose'])
|
||||||
body_font_family = "'%s',%s"%(font['font-family'], generic_family)
|
body_font_family = "'%s',%s"%(font['font-family'], generic_family)
|
||||||
self.oeb.log('Embedding font: %s'%font['font-family'])
|
self.oeb.log.info('Embedding font: %s', font['font-family'])
|
||||||
for k in ('font-weight', 'font-style', 'font-stretch'):
|
for k in ('font-weight', 'font-style', 'font-stretch'):
|
||||||
if font[k] != 'normal':
|
if font[k] != 'normal':
|
||||||
cfont[k] = font[k]
|
cfont[k] = font[k]
|
||||||
@@ -323,8 +323,7 @@ class CSSFlattener(object):
|
|||||||
sbase = max(list(sizes.items()), key=operator.itemgetter(1))[0]
|
sbase = max(list(sizes.items()), key=operator.itemgetter(1))[0]
|
||||||
except:
|
except:
|
||||||
sbase = 12.0
|
sbase = 12.0
|
||||||
self.oeb.logger.info(
|
self.oeb.logger.info("Source base font size is %0.05fpt", sbase)
|
||||||
"Source base font size is %0.05fpt" % sbase)
|
|
||||||
return sbase
|
return sbase
|
||||||
|
|
||||||
def clean_edges(self, cssdict, style, fsize):
|
def clean_edges(self, cssdict, style, fsize):
|
||||||
@@ -346,8 +345,7 @@ class CSSFlattener(object):
|
|||||||
try:
|
try:
|
||||||
value = round(value / slineh) * dlineh
|
value = round(value / slineh) * dlineh
|
||||||
except:
|
except:
|
||||||
self.oeb.logger.warning(
|
self.oeb.logger.warning('Invalid length: %s', value)
|
||||||
'Invalid length:', value)
|
|
||||||
value = 0.0
|
value = 0.0
|
||||||
cssdict[property] = "%0.5fem" % (value / fsize)
|
cssdict[property] = "%0.5fem" % (value / fsize)
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,3 @@
|
|||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
|
|
||||||
class Clean(object):
|
class Clean(object):
|
||||||
'''Clean up guide, leaving only known values '''
|
'''Clean up guide, leaving only known values '''
|
||||||
|
|
||||||
@@ -28,7 +23,8 @@ class Clean(object):
|
|||||||
if covers:
|
if covers:
|
||||||
ref = covers[0][0]
|
ref = covers[0][0]
|
||||||
if len(covers) > 1:
|
if len(covers) > 1:
|
||||||
self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
|
self.log.info('Choosing %s:%s as the cover', ref.type,
|
||||||
|
ref.href)
|
||||||
ref.type = 'cover'
|
ref.type = 'cover'
|
||||||
self.oeb.guide.refs['cover'] = ref
|
self.oeb.guide.refs['cover'] = ref
|
||||||
|
|
||||||
|
|||||||
@@ -34,19 +34,19 @@ class RemoveFirstImage:
|
|||||||
continue
|
continue
|
||||||
removed = self.remove_images(item)
|
removed = self.remove_images(item)
|
||||||
if removed > 0:
|
if removed > 0:
|
||||||
self.log('Removed first image')
|
self.log.info('Removed first image')
|
||||||
body = XPath('//h:body')(item.data)
|
body = XPath('//h:body')(item.data)
|
||||||
if body:
|
if body:
|
||||||
raw = xml2text(body[0]).strip()
|
raw = xml2text(body[0]).strip()
|
||||||
imgs = XPath('//h:img|//svg:svg')(item.data)
|
imgs = XPath('//h:img|//svg:svg')(item.data)
|
||||||
if not raw and not imgs:
|
if not raw and not imgs:
|
||||||
self.log('Removing %s as it has no content' %
|
self.log.info('Removing %s as it has no content',
|
||||||
item.href)
|
item.href)
|
||||||
self.oeb.manifest.remove(item)
|
self.oeb.manifest.remove(item)
|
||||||
deleted_item = item
|
deleted_item = item
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
self.log.warn('Could not find first image to remove')
|
self.log.warning('Could not find first image to remove')
|
||||||
if deleted_item is not None:
|
if deleted_item is not None:
|
||||||
for item in list(self.oeb.toc):
|
for item in list(self.oeb.toc):
|
||||||
href = urllib.parse.urldefrag(item.href)[0]
|
href = urllib.parse.urldefrag(item.href)[0]
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class MergeMetadata(object):
|
|||||||
_oim = override_input_metadata
|
_oim = override_input_metadata
|
||||||
self.oeb, self.log = oeb, oeb.log
|
self.oeb, self.log = oeb, oeb.log
|
||||||
m = self.oeb.metadata
|
m = self.oeb.metadata
|
||||||
self.log('Merging user specified metadata...')
|
self.log.info('Merging user specified metadata...')
|
||||||
meta_info_to_oeb_metadata(mi, m, oeb.log,
|
meta_info_to_oeb_metadata(mi, m, oeb.log,
|
||||||
override_input_metadata=_oim)
|
override_input_metadata=_oim)
|
||||||
cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
|
cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
|
||||||
@@ -210,8 +210,8 @@ class MergeMetadata(object):
|
|||||||
text = ''
|
text = ''
|
||||||
text = re.sub(r'\s+', '', text)
|
text = re.sub(r'\s+', '', text)
|
||||||
if not text and not XPath('//h:img|//svg:svg')(item.data):
|
if not text and not XPath('//h:img|//svg:svg')(item.data):
|
||||||
self.log('Removing %s as it is a wrapper around the cover '
|
self.log.info('Removing %s as it is a wrapper around the '
|
||||||
'image' % item.href)
|
'cover image', item.href)
|
||||||
self.oeb.spine.remove(item)
|
self.oeb.spine.remove(item)
|
||||||
self.oeb.manifest.remove(item)
|
self.oeb.manifest.remove(item)
|
||||||
self.oeb.guide.remove_by_href(item.href)
|
self.oeb.guide.remove_by_href(item.href)
|
||||||
|
|||||||
@@ -5,11 +5,6 @@ from ebook_converter.ebooks.oeb import parse_utils
|
|||||||
from ebook_converter.ebooks.oeb.base import XPath
|
from ebook_converter.ebooks.oeb.base import XPath
|
||||||
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
|
|
||||||
class RemoveAdobeMargins(object):
|
class RemoveAdobeMargins(object):
|
||||||
'''
|
'''
|
||||||
Remove margins specified in Adobe's page templates.
|
Remove margins specified in Adobe's page templates.
|
||||||
@@ -19,11 +14,12 @@ class RemoveAdobeMargins(object):
|
|||||||
self.oeb, self.opts, self.log = oeb, opts, log
|
self.oeb, self.opts, self.log = oeb, opts, log
|
||||||
|
|
||||||
for item in self.oeb.manifest:
|
for item in self.oeb.manifest:
|
||||||
if item.media_type in {
|
if (item.media_type in {'application/vnd.adobe-page-template+xml',
|
||||||
'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml',
|
'application/vnd.adobe.page-template+xml',
|
||||||
'application/adobe-page-template+xml', 'application/adobe.page-template+xml',
|
'application/adobe-page-template+xml',
|
||||||
} and hasattr(item.data, 'xpath'):
|
'application/adobe.page-template+xml'} and
|
||||||
self.log('Removing page margins specified in the'
|
hasattr(item.data, 'xpath')):
|
||||||
|
self.log.info('Removing page margins specified in the '
|
||||||
'Adobe page template')
|
'Adobe page template')
|
||||||
for elem in item.data.xpath(
|
for elem in item.data.xpath(
|
||||||
'//*[@margin-bottom or @margin-top '
|
'//*[@margin-bottom or @margin-top '
|
||||||
@@ -59,7 +55,7 @@ class RemoveFakeMargins(object):
|
|||||||
if stylesheet is None:
|
if stylesheet is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
self.log('Removing fake margins...')
|
self.log.info('Removing fake margins...')
|
||||||
|
|
||||||
stylesheet = stylesheet.data
|
stylesheet = stylesheet.data
|
||||||
|
|
||||||
@@ -73,8 +69,8 @@ class RemoveFakeMargins(object):
|
|||||||
try:
|
try:
|
||||||
self.process_level(level)
|
self.process_level(level)
|
||||||
except NegativeTextIndent:
|
except NegativeTextIndent:
|
||||||
self.log.debug('Negative text indent detected at level '
|
self.log.debug('Negative text indent detected at level %s, '
|
||||||
' %s, ignoring this level'%level)
|
'ignoring this level', level)
|
||||||
|
|
||||||
def get_margins(self, elem):
|
def get_margins(self, elem):
|
||||||
cls = elem.get('class', None)
|
cls = elem.get('class', None)
|
||||||
@@ -102,19 +98,21 @@ class RemoveFakeMargins(object):
|
|||||||
self.stats[level+'_left'][lm] += 1
|
self.stats[level+'_left'][lm] += 1
|
||||||
self.stats[level+'_right'][rm] += 1
|
self.stats[level+'_right'][rm] += 1
|
||||||
|
|
||||||
self.log.debug(level, ' left margin stats:', self.stats[level+'_left'])
|
self.log.debug('%s left margin stats: %s', level,
|
||||||
self.log.debug(level, ' right margin stats:', self.stats[level+'_right'])
|
self.stats[level+'_left'])
|
||||||
|
self.log.debug('%s right margin stats: %s', level,
|
||||||
|
self.stats[level+'_right'])
|
||||||
|
|
||||||
remove_left = self.analyze_stats(self.stats[level+'_left'])
|
remove_left = self.analyze_stats(self.stats[level+'_left'])
|
||||||
remove_right = self.analyze_stats(self.stats[level+'_right'])
|
remove_right = self.analyze_stats(self.stats[level+'_right'])
|
||||||
|
|
||||||
if remove_left:
|
if remove_left:
|
||||||
mcl = self.stats[level+'_left'].most_common(1)[0][0]
|
mcl = self.stats[level+'_left'].most_common(1)[0][0]
|
||||||
self.log('Removing level %s left margin of:'%level, mcl)
|
self.log.info('Removing level %s left margin of: %s', level, mcl)
|
||||||
|
|
||||||
if remove_right:
|
if remove_right:
|
||||||
mcr = self.stats[level+'_right'].most_common(1)[0][0]
|
mcr = self.stats[level+'_right'].most_common(1)[0][0]
|
||||||
self.log('Removing level %s right margin of:'%level, mcr)
|
self.log.info('Removing level %s right margin of: %s', level, mcr)
|
||||||
|
|
||||||
if remove_left or remove_right:
|
if remove_left or remove_right:
|
||||||
for elem in elems:
|
for elem in elems:
|
||||||
@@ -151,7 +149,7 @@ class RemoveFakeMargins(object):
|
|||||||
remove = set()
|
remove = set()
|
||||||
for k, v in self.levels.items():
|
for k, v in self.levels.items():
|
||||||
num = len(v)
|
num = len(v)
|
||||||
self.log.debug('Found %d items of level:'%num, k)
|
self.log.debug('Found %s items of level: %s', num, k)
|
||||||
level = int(k.split('_')[-1])
|
level = int(k.split('_')[-1])
|
||||||
tag = k.split('_')[0]
|
tag = k.split('_')[0]
|
||||||
if tag == 'p' and num < 25:
|
if tag == 'p' and num < 25:
|
||||||
@@ -169,7 +167,7 @@ class RemoveFakeMargins(object):
|
|||||||
|
|
||||||
for k in remove:
|
for k in remove:
|
||||||
self.levels.pop(k)
|
self.levels.pop(k)
|
||||||
self.log.debug('Ignoring level', k)
|
self.log.debug('Ignoring level %s', k)
|
||||||
|
|
||||||
def analyze_stats(self, stats):
|
def analyze_stats(self, stats):
|
||||||
if not stats:
|
if not stats:
|
||||||
|
|||||||
@@ -45,12 +45,14 @@ class RescaleImages(object):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
if self.check_colorspaces and img.mode == 'CMYK':
|
if self.check_colorspaces and img.mode == 'CMYK':
|
||||||
self.log.warn(
|
self.log.warning('The image %s is in the CMYK '
|
||||||
'The image %s is in the CMYK colorspace, converting it '
|
'colorspace, converting it to RGB as '
|
||||||
'to RGB as Adobe Digital Editions cannot display CMYK' % item.href)
|
'Adobe Digital Editions cannot '
|
||||||
|
'display CMYK', item.href)
|
||||||
img = img.convert('RGB')
|
img = img.convert('RGB')
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.exception('Failed to convert image %s from CMYK to RGB' % item.href)
|
self.log.exception('Failed to convert image %s from CMYK '
|
||||||
|
'to RGB', item.href)
|
||||||
|
|
||||||
scaled, new_width, new_height = uimg.fit_image(width, height,
|
scaled, new_width, new_height = uimg.fit_image(width, height,
|
||||||
page_width,
|
page_width,
|
||||||
@@ -58,18 +60,20 @@ class RescaleImages(object):
|
|||||||
if scaled:
|
if scaled:
|
||||||
new_width = max(1, new_width)
|
new_width = max(1, new_width)
|
||||||
new_height = max(1, new_height)
|
new_height = max(1, new_height)
|
||||||
self.log('Rescaling image from %dx%d to %dx%d'%(
|
self.log('Rescaling image from %sx%s to %sx%s %s', width,
|
||||||
width, height, new_width, new_height), item.href)
|
height, new_width, new_height, item.href)
|
||||||
try:
|
try:
|
||||||
img = img.resize((new_width, new_height))
|
img = img.resize((new_width, new_height))
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.exception('Failed to rescale image: %s' % item.href)
|
self.log.exception('Failed to rescale image: %s',
|
||||||
|
item.href)
|
||||||
continue
|
continue
|
||||||
buf = BytesIO()
|
buf = BytesIO()
|
||||||
try:
|
try:
|
||||||
img.save(buf, ext)
|
img.save(buf, ext)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.exception('Failed to rescale image: %s' % item.href)
|
self.log.exception('Failed to rescale image: %s',
|
||||||
|
item.href)
|
||||||
else:
|
else:
|
||||||
item.data = buf.getvalue()
|
item.data = buf.getvalue()
|
||||||
item.unload_data_from_memory()
|
item.unload_data_from_memory()
|
||||||
|
|||||||
@@ -59,7 +59,8 @@ class Split(object):
|
|||||||
def __call__(self, oeb, opts):
|
def __call__(self, oeb, opts):
|
||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
self.log = oeb.log
|
self.log = oeb.log
|
||||||
self.log('Splitting markup on page breaks and flow limits, if any...')
|
self.log.info('Splitting markup on page breaks and flow limits, if '
|
||||||
|
'any...')
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.map = {}
|
self.map = {}
|
||||||
for item in list(self.oeb.manifest.items):
|
for item in list(self.oeb.manifest.items):
|
||||||
@@ -127,8 +128,7 @@ class Split(object):
|
|||||||
page_breaks.add(elem)
|
page_breaks.add(elem)
|
||||||
except SelectorError as err:
|
except SelectorError as err:
|
||||||
self.log.warn('Ignoring page breaks specified with invalid '
|
self.log.warn('Ignoring page breaks specified with invalid '
|
||||||
'CSS selector: %r (%s)' %
|
'CSS selector: %s (%s)', selector, err)
|
||||||
(selector, err))
|
|
||||||
|
|
||||||
for i, elem in enumerate(item.data.iter('*')):
|
for i, elem in enumerate(item.data.iter('*')):
|
||||||
try:
|
try:
|
||||||
@@ -221,13 +221,13 @@ class FlowSplitter(object):
|
|||||||
|
|
||||||
if self.max_flow_size > 0:
|
if self.max_flow_size > 0:
|
||||||
lt_found = False
|
lt_found = False
|
||||||
self.log('\tLooking for large trees in %s...' % item.href)
|
self.log.info('\tLooking for large trees in %s...', item.href)
|
||||||
trees = list(self.trees)
|
trees = list(self.trees)
|
||||||
self.tree_map = {}
|
self.tree_map = {}
|
||||||
for i, tree in enumerate(trees):
|
for i, tree in enumerate(trees):
|
||||||
size = len(tostring(tree.getroot()))
|
size = len(tostring(tree.getroot()))
|
||||||
if size > self.max_flow_size:
|
if size > self.max_flow_size:
|
||||||
self.log('\tFound large tree #%d' % i)
|
self.log.info('\tFound large tree #%s', i)
|
||||||
lt_found = True
|
lt_found = True
|
||||||
self.split_trees = []
|
self.split_trees = []
|
||||||
self.split_to_size(tree)
|
self.split_to_size(tree)
|
||||||
@@ -240,7 +240,7 @@ class FlowSplitter(object):
|
|||||||
|
|
||||||
self.was_split = len(self.trees) > 1
|
self.was_split = len(self.trees) > 1
|
||||||
if self.was_split:
|
if self.was_split:
|
||||||
self.log('\tSplit into %d parts' % len(self.trees))
|
self.log('\tSplit into %s parts', len(self.trees))
|
||||||
self.commit()
|
self.commit()
|
||||||
|
|
||||||
def split_on_page_breaks(self, orig_tree):
|
def split_on_page_breaks(self, orig_tree):
|
||||||
@@ -259,7 +259,7 @@ class FlowSplitter(object):
|
|||||||
tree = self.trees[i]
|
tree = self.trees[i]
|
||||||
elem = pattern(tree)
|
elem = pattern(tree)
|
||||||
if elem:
|
if elem:
|
||||||
self.log.debug('\t\tSplitting on page-break at id=%s' %
|
self.log.debug('\t\tSplitting on page-break at id=%s',
|
||||||
elem[0].get('id'))
|
elem[0].get('id'))
|
||||||
before_tree, after_tree = self.do_split(tree, elem[0],
|
before_tree, after_tree = self.do_split(tree, elem[0],
|
||||||
before)
|
before)
|
||||||
@@ -322,10 +322,10 @@ class FlowSplitter(object):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def split_text(self, text, root, size):
|
def split_text(self, text, root, size):
|
||||||
self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
|
self.log.debug('\t\t\tSplitting text of length: %d', len(text))
|
||||||
rest = text.replace('\r', '')
|
rest = text.replace('\r', '')
|
||||||
parts = re.split('\n\n', rest)
|
parts = re.split('\n\n', rest)
|
||||||
self.log.debug('\t\t\t\tFound %d parts' % len(parts))
|
self.log.debug('\t\t\t\tFound %d parts', len(parts))
|
||||||
if max(map(len, parts)) > size:
|
if max(map(len, parts)) > size:
|
||||||
raise SplitError('Cannot split as file contains a <pre> tag '
|
raise SplitError('Cannot split as file contains a <pre> tag '
|
||||||
'with a very large paragraph', root)
|
'with a very large paragraph', root)
|
||||||
@@ -364,7 +364,7 @@ class FlowSplitter(object):
|
|||||||
split_point, before = self.find_split_point(root)
|
split_point, before = self.find_split_point(root)
|
||||||
if split_point is None:
|
if split_point is None:
|
||||||
raise SplitError(self.item.href, root)
|
raise SplitError(self.item.href, root)
|
||||||
self.log.debug('\t\t\tSplit point:', split_point.tag,
|
self.log.debug('\t\t\tSplit point: %s %s', split_point.tag,
|
||||||
tree.getpath(split_point))
|
tree.getpath(split_point))
|
||||||
|
|
||||||
trees = self.do_split(tree, split_point, before)
|
trees = self.do_split(tree, split_point, before)
|
||||||
@@ -380,10 +380,10 @@ class FlowSplitter(object):
|
|||||||
continue
|
continue
|
||||||
elif size <= self.max_flow_size:
|
elif size <= self.max_flow_size:
|
||||||
self.split_trees.append(t)
|
self.split_trees.append(t)
|
||||||
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)' %
|
self.log.debug('\t\t\tCommitted sub-tree #%s (%s KB)',
|
||||||
(len(self.split_trees), size/1024.))
|
len(self.split_trees), size/1024.)
|
||||||
else:
|
else:
|
||||||
self.log.debug('\t\t\tSplit tree still too large: %d KB' %
|
self.log.debug('\t\t\tSplit tree still too large: %d KB',
|
||||||
size/1024)
|
size/1024)
|
||||||
self.split_to_size(t)
|
self.split_to_size(t)
|
||||||
|
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class DetectStructure(object):
|
|||||||
self.log = oeb.log
|
self.log = oeb.log
|
||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.log('Detecting structure...')
|
self.log.info('Detecting structure...')
|
||||||
|
|
||||||
self.detect_chapters()
|
self.detect_chapters()
|
||||||
if self.oeb.auto_generated_toc or opts.use_auto_toc:
|
if self.oeb.auto_generated_toc or opts.use_auto_toc:
|
||||||
@@ -67,15 +67,15 @@ class DetectStructure(object):
|
|||||||
self.oeb.toc = orig_toc
|
self.oeb.toc = orig_toc
|
||||||
else:
|
else:
|
||||||
self.oeb.auto_generated_toc = True
|
self.oeb.auto_generated_toc = True
|
||||||
self.log('Auto generated TOC with %d entries.' %
|
self.log.info('Auto generated TOC with %s entries.',
|
||||||
self.oeb.toc.count())
|
self.oeb.toc.count())
|
||||||
|
|
||||||
if opts.toc_filter is not None:
|
if opts.toc_filter is not None:
|
||||||
regexp = re.compile(opts.toc_filter)
|
regexp = re.compile(opts.toc_filter)
|
||||||
for node in list(self.oeb.toc.iter()):
|
for node in list(self.oeb.toc.iter()):
|
||||||
if not node.title or regexp.search(node.title) is not None:
|
if not node.title or regexp.search(node.title) is not None:
|
||||||
self.log('Filtering', node.title if node.title else
|
self.log.info('Filtering %s from TOC', node.title if
|
||||||
'empty node', 'from TOC')
|
node.title else 'empty node')
|
||||||
self.oeb.toc.remove(node)
|
self.oeb.toc.remove(node)
|
||||||
|
|
||||||
if opts.page_breaks_before is not None:
|
if opts.page_breaks_before is not None:
|
||||||
@@ -112,8 +112,8 @@ class DetectStructure(object):
|
|||||||
try:
|
try:
|
||||||
expr = XPath(expr)
|
expr = XPath(expr)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.warn('Invalid start reading at XPath expression, '
|
self.log.warning('Invalid start reading at XPath expression, '
|
||||||
'ignoring: %s' % expr)
|
'ignoring: %s', expr)
|
||||||
return
|
return
|
||||||
for item in self.oeb.spine:
|
for item in self.oeb.spine:
|
||||||
if not hasattr(item.data, 'xpath'):
|
if not hasattr(item.data, 'xpath'):
|
||||||
@@ -129,10 +129,10 @@ class DetectStructure(object):
|
|||||||
if 'text' in self.oeb.guide:
|
if 'text' in self.oeb.guide:
|
||||||
self.oeb.guide.remove('text')
|
self.oeb.guide.remove('text')
|
||||||
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
|
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
|
||||||
self.log('Setting start reading at position to %s in %s' %
|
self.log.info('Setting start reading at position to %s in %s',
|
||||||
(self.opts.start_reading_at, item.href))
|
self.opts.start_reading_at, item.href)
|
||||||
return
|
return
|
||||||
self.log.warn("Failed to find start reading at position: %s" %
|
self.log.warning("Failed to find start reading at position: %s",
|
||||||
self.opts.start_reading_at)
|
self.opts.start_reading_at)
|
||||||
|
|
||||||
def get_toc_parts_for_xpath(self, expr):
|
def get_toc_parts_for_xpath(self, expr):
|
||||||
@@ -155,7 +155,7 @@ class DetectStructure(object):
|
|||||||
len(ans)
|
len(ans)
|
||||||
return ans
|
return ans
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.warn('Invalid chapter expression, ignoring: %s' %
|
self.log.warning('Invalid chapter expression, ignoring: %s',
|
||||||
expr)
|
expr)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@@ -175,7 +175,7 @@ class DetectStructure(object):
|
|||||||
c[item] += 1
|
c[item] += 1
|
||||||
text = base.xml2text(elem).strip()
|
text = base.xml2text(elem).strip()
|
||||||
text = re.sub(r'\s+', ' ', text.strip())
|
text = re.sub(r'\s+', ' ', text.strip())
|
||||||
self.log('\tDetected chapter:', text[:50])
|
self.log.info('\tDetected chapter: %s', text[:50])
|
||||||
if chapter_mark == 'none':
|
if chapter_mark == 'none':
|
||||||
continue
|
continue
|
||||||
if chapter_mark == 'rule':
|
if chapter_mark == 'rule':
|
||||||
@@ -221,7 +221,7 @@ class DetectStructure(object):
|
|||||||
try:
|
try:
|
||||||
purl = urllib.parse.urlparse(href)
|
purl = urllib.parse.urlparse(href)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.log.warning('Ignoring malformed URL:', href)
|
self.log.warning('Ignoring malformed URL: %s', href)
|
||||||
continue
|
continue
|
||||||
if not purl[0] or purl[0] == 'file':
|
if not purl[0] or purl[0] == 'file':
|
||||||
href, frag = purl.path, purl.fragment
|
href, frag = purl.path, purl.fragment
|
||||||
@@ -240,13 +240,14 @@ class DetectStructure(object):
|
|||||||
play_order=self.oeb.toc.next_play_order())
|
play_order=self.oeb.toc.next_play_order())
|
||||||
num += 1
|
num += 1
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.oeb.log.exception('Failed to process link: '
|
self.oeb.log.critical('Failed to process link: %s',
|
||||||
'%r' % href)
|
href)
|
||||||
# Most likely an incorrectly URL encoded link
|
# Most likely an incorrectly URL encoded link
|
||||||
continue
|
continue
|
||||||
if self.opts.max_toc_links > 0 and \
|
if self.opts.max_toc_links > 0 and \
|
||||||
num >= self.opts.max_toc_links:
|
num >= self.opts.max_toc_links:
|
||||||
self.log('Maximum TOC links reached, stopping.')
|
self.log.info('Maximum TOC links reached, '
|
||||||
|
'stopping.')
|
||||||
return
|
return
|
||||||
|
|
||||||
def elem_to_link(self, item, elem, title_attribute, counter):
|
def elem_to_link(self, item, elem, title_attribute, counter):
|
||||||
@@ -277,7 +278,7 @@ class DetectStructure(object):
|
|||||||
len(ans)
|
len(ans)
|
||||||
return ans
|
return ans
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.warn('Invalid ToC expression, ignoring: %s' % expr)
|
self.log.warning('Invalid ToC expression, ignoring: %s', expr)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for document in self.oeb.spine:
|
for document in self.oeb.spine:
|
||||||
|
|||||||
@@ -5,11 +5,6 @@ from ebook_converter.utils.fonts.sfnt.subset import subset, NoGlyphs, Unsupporte
|
|||||||
from ebook_converter.tinycss.fonts3 import parse_font_family
|
from ebook_converter.tinycss.fonts3 import parse_font_family
|
||||||
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
|
|
||||||
def get_font_properties(rule, default=None):
|
def get_font_properties(rule, default=None):
|
||||||
'''
|
'''
|
||||||
Given a CSS rule, extract normalized font properties from
|
Given a CSS rule, extract normalized font properties from
|
||||||
@@ -149,18 +144,19 @@ class SubsetFonts(object):
|
|||||||
|
|
||||||
for font in fonts.values():
|
for font in fonts.values():
|
||||||
if not font['chars']:
|
if not font['chars']:
|
||||||
self.log('The font %s is unused. Removing it.'%font['src'])
|
self.log('The font %s is unused. Removing it.', font['src'])
|
||||||
remove(font)
|
remove(font)
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
|
raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
|
||||||
except NoGlyphs:
|
except NoGlyphs:
|
||||||
self.log('The font %s has no used glyphs. Removing it.'%font['src'])
|
self.log('The font %s has no used glyphs. Removing it.',
|
||||||
|
font['src'])
|
||||||
remove(font)
|
remove(font)
|
||||||
continue
|
continue
|
||||||
except UnsupportedFont as e:
|
except UnsupportedFont as e:
|
||||||
self.log.warn('The font %s is unsupported for subsetting. %s'%(
|
self.log.warning('The font %s is unsupported for subsetting. '
|
||||||
font['src'], e))
|
'%s', font['src'], e)
|
||||||
sz = len(font['item'].data)
|
sz = len(font['item'].data)
|
||||||
totals[0] += sz
|
totals[0] += sz
|
||||||
totals[1] += sz
|
totals[1] += sz
|
||||||
@@ -168,16 +164,16 @@ class SubsetFonts(object):
|
|||||||
font['item'].data = raw
|
font['item'].data = raw
|
||||||
nlen = sum(new_stats.values())
|
nlen = sum(new_stats.values())
|
||||||
olen = sum(old_stats.values())
|
olen = sum(old_stats.values())
|
||||||
self.log('Decreased the font %s to %.1f%% of its original size'%
|
self.log('Decreased the font %s to %.1f%% of its original '
|
||||||
(font['src'], nlen/olen *100))
|
'size', font['src'], nlen/olen * 100)
|
||||||
totals[0] += nlen
|
totals[0] += nlen
|
||||||
totals[1] += olen
|
totals[1] += olen
|
||||||
|
|
||||||
font['item'].unload_data_from_memory()
|
font['item'].unload_data_from_memory()
|
||||||
|
|
||||||
if totals[0]:
|
if totals[0]:
|
||||||
self.log('Reduced total font size to %.1f%% of original'%
|
self.log('Reduced total font size to %.1f%% of original',
|
||||||
(totals[0]/totals[1] * 100))
|
totals[0]/totals[1] * 100)
|
||||||
|
|
||||||
def find_embedded_fonts(self):
|
def find_embedded_fonts(self):
|
||||||
'''
|
'''
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ class Reader132(FormatReader):
|
|||||||
|
|
||||||
pml = ''
|
pml = ''
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
self.log.debug('Extracting text page %i' % i)
|
self.log.debug('Extracting text page %s', i)
|
||||||
pml += self.get_text_page(i)
|
pml += self.get_text_page(i)
|
||||||
hizer = PML_HTMLizer()
|
hizer = PML_HTMLizer()
|
||||||
html += hizer.parse_pml(pml, 'index.html')
|
html += hizer.parse_pml(pml, 'index.html')
|
||||||
@@ -123,7 +123,7 @@ class Reader132(FormatReader):
|
|||||||
footnoteids = re.findall(
|
footnoteids = re.findall(
|
||||||
'\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
'\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||||
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
|
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
|
||||||
self.log.debug('Extracting footnote page %i' % i)
|
self.log.debug('Extracting footnote page %s', i)
|
||||||
if fid < len(footnoteids):
|
if fid < len(footnoteids):
|
||||||
fid = footnoteids[fid]
|
fid = footnoteids[fid]
|
||||||
else:
|
else:
|
||||||
@@ -135,7 +135,7 @@ class Reader132(FormatReader):
|
|||||||
sidebarids = re.findall(
|
sidebarids = re.findall(
|
||||||
'\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
'\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||||
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
|
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
|
||||||
self.log.debug('Extracting sidebar page %i' % i)
|
self.log.debug('Extracting sidebar page %s', i)
|
||||||
if sid < len(sidebarids):
|
if sid < len(sidebarids):
|
||||||
sid = sidebarids[sid]
|
sid = sidebarids[sid]
|
||||||
else:
|
else:
|
||||||
@@ -157,7 +157,7 @@ class Reader132(FormatReader):
|
|||||||
name, img = self.get_image(self.header_record.image_data_offset + i)
|
name, img = self.get_image(self.header_record.image_data_offset + i)
|
||||||
images.append(name)
|
images.append(name)
|
||||||
with open(name, 'wb') as imgf:
|
with open(name, 'wb') as imgf:
|
||||||
self.log.debug('Writing image %s to images/' % name)
|
self.log.debug('Writing image %s to images/', name)
|
||||||
imgf.write(img)
|
imgf.write(img)
|
||||||
|
|
||||||
opf_path = self.create_opf(output_dir, images, toc)
|
opf_path = self.create_opf(output_dir, images, toc)
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ class Reader202(FormatReader):
|
|||||||
|
|
||||||
pml = ''
|
pml = ''
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
self.log.debug('Extracting text page %i' % i)
|
self.log.debug('Extracting text page %s', i)
|
||||||
pml += self.get_text_page(i)
|
pml += self.get_text_page(i)
|
||||||
|
|
||||||
title = self.mi.title
|
title = self.mi.title
|
||||||
@@ -111,7 +111,7 @@ class Reader202(FormatReader):
|
|||||||
if name:
|
if name:
|
||||||
images.append(name)
|
images.append(name)
|
||||||
with open(name, 'wb') as imgf:
|
with open(name, 'wb') as imgf:
|
||||||
self.log.debug('Writing image %s to images/' % name)
|
self.log.debug('Writing image %s to images/', name)
|
||||||
imgf.write(img)
|
imgf.write(img)
|
||||||
|
|
||||||
opf_path = self.create_opf(output_dir, images)
|
opf_path = self.create_opf(output_dir, images)
|
||||||
|
|||||||
@@ -116,9 +116,9 @@ class Reader(FormatReader):
|
|||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
txt = ''
|
txt = ''
|
||||||
|
|
||||||
self.log.info(u'Decompressing text...')
|
self.log.info('Decompressing text...')
|
||||||
for i in range(1, self.header_record.num_records + 1):
|
for i in range(1, self.header_record.num_records + 1):
|
||||||
self.log.debug(u'\tDecompressing text section %i' % i)
|
self.log.debug('\tDecompressing text section %s', i)
|
||||||
title = self.header_record.chapter_titles[i-1]
|
title = self.header_record.chapter_titles[i-1]
|
||||||
lines = []
|
lines = []
|
||||||
title_added = False
|
title_added = False
|
||||||
@@ -135,7 +135,7 @@ class Reader(FormatReader):
|
|||||||
lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
|
lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
|
||||||
txt += '\n'.join(lines)
|
txt += '\n'.join(lines)
|
||||||
|
|
||||||
self.log.info(u'Converting text to OEB...')
|
self.log.info('Converting text to OEB...')
|
||||||
html = HTML_TEMPLATE % (self.header_record.title, txt)
|
html = HTML_TEMPLATE % (self.header_record.title, txt)
|
||||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||||
index.write(html.encode('utf-8'))
|
index.write(html.encode('utf-8'))
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class Reader(FormatReader):
|
|||||||
|
|
||||||
self.log.info('Decompressing text...')
|
self.log.info('Decompressing text...')
|
||||||
for i in range(1, self.header_record.num_records + 1):
|
for i in range(1, self.header_record.num_records + 1):
|
||||||
self.log.debug('\tDecompressing text section %i' % i)
|
self.log.debug('\tDecompressing text section %s', i)
|
||||||
raw_txt += self.decompress_text(i)
|
raw_txt += self.decompress_text(i)
|
||||||
|
|
||||||
self.log.info('Converting text to OEB...')
|
self.log.info('Converting text to OEB...')
|
||||||
|
|||||||
@@ -360,7 +360,8 @@ class Reader(FormatReader):
|
|||||||
# plugin assemble the order based on hyperlinks.
|
# plugin assemble the order based on hyperlinks.
|
||||||
with directory.CurrentDir(output_dir):
|
with directory.CurrentDir(output_dir):
|
||||||
for uid, num in self.uid_text_secion_number.items():
|
for uid, num in self.uid_text_secion_number.items():
|
||||||
self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
|
self.log.debug('Writing record with uid: %s as %s.html',
|
||||||
|
uid, uid)
|
||||||
with open('%s.html' % uid, 'wb') as htmlf:
|
with open('%s.html' % uid, 'wb') as htmlf:
|
||||||
html = u'<html><body>'
|
html = u'<html><body>'
|
||||||
section_header, section_data = self.sections[num]
|
section_header, section_data = self.sections[num]
|
||||||
@@ -393,11 +394,14 @@ class Reader(FormatReader):
|
|||||||
try:
|
try:
|
||||||
save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
|
save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
|
||||||
images.add(uid)
|
images.add(uid)
|
||||||
self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
|
self.log.debug('Wrote image with uid %s to '
|
||||||
|
'images/%s.jpg', uid, uid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error('Failed to write image with uid %s: %s' % (uid, e))
|
self.log.error('Failed to write image with uid %s: %s',
|
||||||
|
uid, e)
|
||||||
else:
|
else:
|
||||||
self.log.error('Failed to write image with uid %s: No data.' % uid)
|
self.log.error('Failed to write image with uid %s: '
|
||||||
|
'No data.', uid)
|
||||||
# Composite images.
|
# Composite images.
|
||||||
# We're going to use the already compressed .jpg images here.
|
# We're going to use the already compressed .jpg images here.
|
||||||
for uid, num in self.uid_composite_image_section_number.items():
|
for uid, num in self.uid_composite_image_section_number.items():
|
||||||
@@ -436,9 +440,11 @@ class Reader(FormatReader):
|
|||||||
y_off += largest_height
|
y_off += largest_height
|
||||||
with open('%s.jpg' % uid) as out:
|
with open('%s.jpg' % uid) as out:
|
||||||
out.write(canvas.export(compression_quality=70))
|
out.write(canvas.export(compression_quality=70))
|
||||||
self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
|
self.log.debug('Wrote composite image with uid %s to '
|
||||||
|
'images/%s.jpg', uid, uid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))
|
self.log.error('Failed to write composite image with '
|
||||||
|
'uid %s: %s', uid, e)
|
||||||
|
|
||||||
# Run the HTML through the html processing plugin.
|
# Run the HTML through the html processing plugin.
|
||||||
from ebook_converter.customize.ui import plugin_for_input_format
|
from ebook_converter.customize.ui import plugin_for_input_format
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class Reader(FormatReader):
|
|||||||
if (self.header_record.flags & 0x01) == 0:
|
if (self.header_record.flags & 0x01) == 0:
|
||||||
raise zTXTError('Only compression method 1 (random access) is supported')
|
raise zTXTError('Only compression method 1 (random access) is supported')
|
||||||
|
|
||||||
self.log.debug('Foud ztxt version: %i.%i' % (vmajor, vminor))
|
self.log.debug('Foud ztxt version: %s.%s', vmajor, vminor)
|
||||||
|
|
||||||
# Initalize the decompressor
|
# Initalize the decompressor
|
||||||
self.uncompressor = zlib.decompressobj()
|
self.uncompressor = zlib.decompressobj()
|
||||||
@@ -73,7 +73,7 @@ class Reader(FormatReader):
|
|||||||
|
|
||||||
self.log.info('Decompressing text...')
|
self.log.info('Decompressing text...')
|
||||||
for i in range(1, self.header_record.num_records + 1):
|
for i in range(1, self.header_record.num_records + 1):
|
||||||
self.log.debug('\tDecompressing text section %i' % i)
|
self.log.debug('\tDecompressing text section %s', i)
|
||||||
raw_txt += self.decompress_text(i)
|
raw_txt += self.decompress_text(i)
|
||||||
|
|
||||||
self.log.info('Converting text to OEB...')
|
self.log.info('Converting text to OEB...')
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ class MarkdownMLizer(OEB2HTML):
|
|||||||
def mlize_spine(self, oeb_book):
|
def mlize_spine(self, oeb_book):
|
||||||
output = ['']
|
output = ['']
|
||||||
for item in oeb_book.spine:
|
for item in oeb_book.spine:
|
||||||
self.log.debug('Converting %s to Markdown formatted TXT...' % item.href)
|
self.log.debug('Converting %s to Markdown formatted TXT...',
|
||||||
|
item.href)
|
||||||
self.rewrite_ids(item.data, item)
|
self.rewrite_ids(item.data, item)
|
||||||
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
def mlize_spine(self, oeb_book):
|
def mlize_spine(self, oeb_book):
|
||||||
output = ['']
|
output = ['']
|
||||||
for item in oeb_book.spine:
|
for item in oeb_book.spine:
|
||||||
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
|
self.log.debug('Converting %s to Textile formatted TXT...', item.href)
|
||||||
self.rewrite_ids(item.data, item)
|
self.rewrite_ids(item.data, item)
|
||||||
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ class TXTMLizer(object):
|
|||||||
output = [u'']
|
output = [u'']
|
||||||
output.append(self.get_toc())
|
output.append(self.get_toc())
|
||||||
for item in self.oeb_book.spine:
|
for item in self.oeb_book.spine:
|
||||||
self.log.debug('Converting %s to TXT...' % item.href)
|
self.log.debug('Converting %s to TXT...', item.href)
|
||||||
for x in item.data.iterdescendants(etree.Comment):
|
for x in item.data.iterdescendants(etree.Comment):
|
||||||
if x.text and '--' in x.text:
|
if x.text and '--' in x.text:
|
||||||
x.text = x.text.replace('--', '__')
|
x.text = x.text.replace('--', '__')
|
||||||
|
|||||||
Reference in New Issue
Block a user