mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-17 19:43:34 +02:00
Added first portion of logging adaptation.
Things may be broken at this point - there are still several modules to be adapted.
This commit is contained in:
@@ -12,7 +12,7 @@ import re
|
||||
import sys
|
||||
|
||||
from ebook_converter.utils.config import OptionParser
|
||||
from ebook_converter.utils.logging import Log
|
||||
from ebook_converter.utils import logging
|
||||
from ebook_converter.customize.conversion import OptionRecommendation
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ def check_command_line_options(parser, args, log):
|
||||
if (not input_file.endswith('.recipe') and
|
||||
not os.access(input_file, os.R_OK) and
|
||||
not ('-h' in args or '--help' in args)):
|
||||
log.error('Cannot read from', input_file)
|
||||
log.error('Cannot read from %s', input_file)
|
||||
raise SystemExit(1)
|
||||
if input_file.endswith('.recipe') and not os.access(input_file, os.R_OK):
|
||||
input_file = args[1]
|
||||
@@ -267,7 +267,7 @@ class ProgressBar(object):
|
||||
def __call__(self, frac, msg=''):
|
||||
if msg:
|
||||
percent = int(frac*100)
|
||||
self.log('%d%% %s' % (percent, msg))
|
||||
self.log.info('%d%% %s' % (percent, msg))
|
||||
|
||||
|
||||
def create_option_parser(args, log):
|
||||
@@ -275,20 +275,18 @@ def create_option_parser(args, log):
|
||||
from ebook_converter.constants_old import __appname__
|
||||
from ebook_converter.constants_old import __author__
|
||||
from ebook_converter.constants_old import __version__
|
||||
log(os.path.basename(args[0]), '('+__appname__, __version__+')')
|
||||
log('Created by:', __author__)
|
||||
log.info("%s (%s, %s)", os.path.basename(args[0]), __appname__,
|
||||
__version__)
|
||||
log.info('Created by: %s', __author__)
|
||||
raise SystemExit(0)
|
||||
if '--list-recipes' in args:
|
||||
from ebook_converter.web.feeds.recipes.collection import \
|
||||
get_builtin_recipe_titles
|
||||
log('Available recipes:')
|
||||
log.info('Available recipes:')
|
||||
titles = sorted(get_builtin_recipe_titles())
|
||||
for title in titles:
|
||||
try:
|
||||
log('\t'+title)
|
||||
except Exception:
|
||||
log('\t'+repr(title))
|
||||
log('%d recipes available' % len(titles))
|
||||
log.info('\t%s', title)
|
||||
log.info('%d recipes available', len(titles))
|
||||
raise SystemExit(0)
|
||||
|
||||
parser = option_parser()
|
||||
@@ -352,7 +350,7 @@ def read_sr_patterns(path, log=None):
|
||||
|
||||
|
||||
def main(args=sys.argv):
|
||||
log = Log()
|
||||
log = logging.default_log
|
||||
mimetypes.init([pkg_resources.resource_filename('ebook_converter',
|
||||
'data/mime.types')])
|
||||
parser, plumber = create_option_parser(args, log)
|
||||
@@ -386,7 +384,7 @@ def main(args=sys.argv):
|
||||
|
||||
plumber.run()
|
||||
|
||||
log('Output saved to', ' ', plumber.output)
|
||||
log.info('Output saved to %s', plumber.output)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
@@ -146,7 +146,7 @@ class EPUBInput(InputFormatPlugin):
|
||||
if len(spine) > 1:
|
||||
for item in spine:
|
||||
if item.get('idref') == titlepage_id:
|
||||
log('Found HTML cover', titlepage_href)
|
||||
log.info('Found HTML cover %s', titlepage_href)
|
||||
if self.for_viewer:
|
||||
item.attrib.pop('linear', None)
|
||||
else:
|
||||
@@ -192,7 +192,7 @@ class EPUBInput(InputFormatPlugin):
|
||||
elem = [x for x in manifest if x.get('id', '') == idref]
|
||||
if not elem or elem[0].get('href', None) != guide_cover:
|
||||
return
|
||||
log('Found HTML cover', guide_cover)
|
||||
log.info('Found HTML cover %s', guide_cover)
|
||||
|
||||
# Remove from spine as covers must be treated
|
||||
# specially
|
||||
@@ -272,8 +272,8 @@ class EPUBInput(InputFormatPlugin):
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall(os.getcwd())
|
||||
except Exception:
|
||||
log.exception('EPUB appears to be invalid ZIP file, trying a'
|
||||
' more forgiving ZIP parser')
|
||||
log.exception('EPUB appears to be invalid ZIP file, trying a '
|
||||
'more forgiving ZIP parser')
|
||||
from ebook_converter.utils.localunzip import extractall
|
||||
stream.seek(0)
|
||||
extractall(stream)
|
||||
|
||||
@@ -214,8 +214,8 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
self.workaround_sony_quirks()
|
||||
|
||||
if self.oeb.toc.count() == 0:
|
||||
self.log.warn('This EPUB file has no Table of Contents. '
|
||||
'Creating a default TOC')
|
||||
self.log.warning('This EPUB file has no Table of Contents. '
|
||||
'Creating a default TOC')
|
||||
first = next(iter(self.oeb.spine))
|
||||
self.oeb.toc.add('Start', first.href)
|
||||
|
||||
@@ -229,7 +229,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
|
||||
|
||||
if _uuid is None:
|
||||
self.log.warn('No UUID identifier found')
|
||||
self.log.warning('No UUID identifier found')
|
||||
_uuid = str(uuid.uuid4())
|
||||
oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)
|
||||
|
||||
@@ -281,7 +281,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
os.mkdir(opts.extract_to)
|
||||
with ZipFile(output_path) as zf:
|
||||
zf.extractall(path=opts.extract_to)
|
||||
self.log.info('EPUB extracted to', opts.extract_to)
|
||||
self.log.info('EPUB extracted to %s', opts.extract_to)
|
||||
|
||||
def upgrade_to_epub3(self, tdir, opf):
|
||||
self.log.info('Upgrading to EPUB 3...')
|
||||
@@ -323,7 +323,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
if not os.path.exists(path):
|
||||
uris.pop(uri)
|
||||
continue
|
||||
self.log.debug('Encrypting font:', uri)
|
||||
self.log.debug('Encrypting font: %s', uri)
|
||||
with open(path, 'r+b') as f:
|
||||
data = f.read(1024)
|
||||
if len(data) >= 1024:
|
||||
@@ -332,7 +332,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
f.write(bytes(bytearray(data[i] ^ key[i%16]
|
||||
for i in range(1024))))
|
||||
else:
|
||||
self.log.warn('Font', path, 'is invalid, ignoring')
|
||||
self.log.warning('Font %s is invalid, ignoring', path)
|
||||
if not isinstance(uri, str):
|
||||
uri = uri.decode('utf-8')
|
||||
fonts.append('''
|
||||
@@ -385,8 +385,9 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
_base, _, frag = href.partition('#')
|
||||
frag = urllib.parse.unquote(frag)
|
||||
if frag and frag_pat.match(frag) is None:
|
||||
self.log.warn(
|
||||
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
|
||||
self.log.warning('Removing fragment identifier %r from '
|
||||
'TOC as Adobe Digital Editions cannot '
|
||||
'handle it', frag)
|
||||
node.href = _base
|
||||
|
||||
for x in self.oeb.spine:
|
||||
@@ -530,8 +531,8 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
for x in self.oeb.spine:
|
||||
if x.href == href:
|
||||
if frag_is_at_top(x.data, frag):
|
||||
self.log.debug('Removing anchor from TOC href:',
|
||||
href+'#'+frag)
|
||||
self.log.debug('Removing anchor from TOC '
|
||||
'href: %s#%s', href, frag)
|
||||
toc.href = href
|
||||
break
|
||||
for x in toc:
|
||||
|
||||
@@ -20,7 +20,7 @@ class LRFInput(InputFormatPlugin):
|
||||
from ebook_converter.ebooks.lrf.input import MediaType, Styles, \
|
||||
TextBlock, Canvas, ImageBlock, RuledLine
|
||||
self.log = log
|
||||
self.log('Generating XML')
|
||||
self.log.info('Generating XML')
|
||||
from ebook_converter.ebooks.lrf.lrfparser import LRFDocument
|
||||
d = LRFDocument(stream)
|
||||
d.parse()
|
||||
@@ -50,7 +50,7 @@ class LRFInput(InputFormatPlugin):
|
||||
if imgstr:
|
||||
plot_map[ro] = imgstr[0].get('file')
|
||||
|
||||
self.log('Converting XML to HTML...')
|
||||
self.log.info('Converting XML to HTML...')
|
||||
|
||||
with open(pkg_resources.
|
||||
resource_filename('ebook_converter',
|
||||
|
||||
@@ -854,8 +854,8 @@ OptionRecommendation(name='search_replace',
|
||||
try:
|
||||
val = float(val)
|
||||
except ValueError:
|
||||
self.log.warn('Values of series index and rating must'
|
||||
' be numbers. Ignoring', val)
|
||||
self.log.warning('Values of series index and rating '
|
||||
'must be numbers. Ignoring %s', val)
|
||||
continue
|
||||
elif x in ('timestamp', 'pubdate'):
|
||||
try:
|
||||
@@ -882,8 +882,8 @@ OptionRecommendation(name='search_replace',
|
||||
self.opts_to_mi(mi)
|
||||
if mi.cover:
|
||||
if mi.cover.startswith('http:') or mi.cover.startswith('https:'):
|
||||
self.log.warn("TODO: Cover image is on remote server, "
|
||||
"implement downloading using requests")
|
||||
self.log.warning("TODO: Cover image is on remote server, "
|
||||
"implement downloading using requests")
|
||||
ext = mi.cover.rpartition('.')[-1].lower().strip()
|
||||
if ext not in ('png', 'jpg', 'jpeg', 'gif'):
|
||||
ext = 'jpg'
|
||||
@@ -909,8 +909,8 @@ OptionRecommendation(name='search_replace',
|
||||
if x.short_name == sval:
|
||||
setattr(self.opts, attr, x)
|
||||
return
|
||||
self.log.warn(
|
||||
'Profile (%s) %r is no longer available, using default'%(which, sval))
|
||||
self.log.warning('Profile (%s) %r is no longer available, using '
|
||||
'default', which, sval)
|
||||
for x in profiles():
|
||||
if x.short_name == 'default':
|
||||
setattr(self.opts, attr, x)
|
||||
@@ -925,14 +925,16 @@ OptionRecommendation(name='search_replace',
|
||||
if self.opts.verbose:
|
||||
self.log.filter_level = self.log.DEBUG
|
||||
if self.changed_options:
|
||||
self.log('Conversion options changed from defaults:')
|
||||
self.log.info('Conversion options changed from defaults:')
|
||||
for rec in self.changed_options:
|
||||
if rec.option.name not in ('username', 'password'):
|
||||
self.log(' ', '%s:' % rec.option.name, repr(rec.recommended_value))
|
||||
self.log.info(' %s: %s', rec.option.name,
|
||||
repr(rec.recommended_value))
|
||||
if self.opts.verbose > 1:
|
||||
self.log.debug('Resolved conversion options')
|
||||
try:
|
||||
self.log.debug('ebook_converter version:', constants.VERSION)
|
||||
self.log.debug('ebook_converter version: %s',
|
||||
constants.VERSION)
|
||||
odict = dict(self.opts.__dict__)
|
||||
for x in ('username', 'password'):
|
||||
odict.pop(x, None)
|
||||
@@ -968,7 +970,7 @@ OptionRecommendation(name='search_replace',
|
||||
self.input_plugin.save_download(zf)
|
||||
zf.close()
|
||||
|
||||
self.log.info('Input debug saved to:', out_dir)
|
||||
self.log.info('Input debug saved to: %s', out_dir)
|
||||
|
||||
def run(self):
|
||||
'''
|
||||
@@ -1022,7 +1024,8 @@ OptionRecommendation(name='search_replace',
|
||||
from ebook_converter.ebooks.azw4.reader import unwrap
|
||||
unwrap(stream, self.output)
|
||||
self.ui_reporter(1.)
|
||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||
self.log.info('%s output written to %s', self.output_fmt.upper(),
|
||||
self.output)
|
||||
self.flush()
|
||||
return
|
||||
|
||||
@@ -1056,7 +1059,7 @@ OptionRecommendation(name='search_replace',
|
||||
if self.opts.debug_pipeline is not None:
|
||||
out_dir = os.path.join(self.opts.debug_pipeline, 'parsed')
|
||||
self.dump_oeb(self.oeb, out_dir)
|
||||
self.log('Parsed HTML written to:', out_dir)
|
||||
self.log.info('Parsed HTML written to: %s', out_dir)
|
||||
self.input_plugin.specialize(self.oeb, self.opts, self.log,
|
||||
self.output_fmt)
|
||||
|
||||
@@ -1105,13 +1108,13 @@ OptionRecommendation(name='search_replace',
|
||||
try:
|
||||
fkey = list(map(float, fkey.split(',')))
|
||||
except Exception:
|
||||
self.log.error('Invalid font size key: %r ignoring'%fkey)
|
||||
self.log.error('Invalid font size key: %r ignoring', fkey)
|
||||
fkey = self.opts.dest.fkey
|
||||
|
||||
if self.opts.debug_pipeline is not None:
|
||||
out_dir = os.path.join(self.opts.debug_pipeline, 'structure')
|
||||
self.dump_oeb(self.oeb, out_dir)
|
||||
self.log('Structured HTML written to:', out_dir)
|
||||
self.log.info('Structured HTML written to: %s', out_dir)
|
||||
|
||||
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
|
||||
with open(self.opts.extra_css, 'rb') as f:
|
||||
@@ -1187,9 +1190,9 @@ OptionRecommendation(name='search_replace',
|
||||
if self.opts.debug_pipeline is not None:
|
||||
out_dir = os.path.join(self.opts.debug_pipeline, 'processed')
|
||||
self.dump_oeb(self.oeb, out_dir)
|
||||
self.log('Processed HTML written to:', out_dir)
|
||||
self.log.info('Processed HTML written to: %s', out_dir)
|
||||
|
||||
self.log.info('Creating %s...'%self.output_plugin.name)
|
||||
self.log.info('Creating %s...', self.output_plugin.name)
|
||||
our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
|
||||
self.output_plugin.report_progress = our
|
||||
our(0., 'Running %s plugin' % self.output_plugin.name)
|
||||
@@ -1200,7 +1203,8 @@ OptionRecommendation(name='search_replace',
|
||||
self.ui_reporter(1.)
|
||||
run_plugins_on_postprocess(self.output, self.output_fmt)
|
||||
|
||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||
self.log.info('%s output written to %s', self.output_fmt.upper(),
|
||||
self.output)
|
||||
self.flush()
|
||||
|
||||
|
||||
@@ -1230,7 +1234,7 @@ def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||
if specialize is not None:
|
||||
oeb = specialize(oeb) or oeb
|
||||
# Read OEB Book into OEBBook
|
||||
log('Parsing all content...')
|
||||
log.info('Parsing all content...')
|
||||
oeb.removed_items_to_ignore = removed_items
|
||||
if reader is None:
|
||||
from ebook_converter.ebooks.oeb.reader import OEBReader
|
||||
@@ -1241,11 +1245,11 @@ def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||
|
||||
|
||||
def create_dummy_plumber(input_format, output_format):
|
||||
from ebook_converter.utils.logging import Log
|
||||
from ebook_converter.utils import logging
|
||||
input_format = input_format.lower()
|
||||
output_format = output_format.lower()
|
||||
output_path = 'dummy.'+output_format
|
||||
log = Log()
|
||||
log = logging.default_log
|
||||
log.outputs = []
|
||||
input_file = 'dummy.'+input_format
|
||||
if input_format in ARCHIVE_FMTS:
|
||||
|
||||
@@ -5,11 +5,6 @@ from ebook_converter.utils.logging import default_log
|
||||
from ebook_converter.utils.wordcount import get_wordcount_obj
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
class HeuristicProcessor(object):
|
||||
|
||||
def __init__(self, extra_opts=None, log=None):
|
||||
@@ -50,8 +45,8 @@ class HeuristicProcessor(object):
|
||||
title = match.group('title')
|
||||
if not title:
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
||||
" chapters. - " + str(chap))
|
||||
self.log.debug("marked %s chapters. - %s",
|
||||
self.html_preprocess_sections, str(chap))
|
||||
return '<h2>'+chap+'</h2>\n'
|
||||
else:
|
||||
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
|
||||
@@ -59,16 +54,16 @@ class HeuristicProcessor(object):
|
||||
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
|
||||
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
||||
" chapters & titles. - " + str(chap) + ", " + str(title))
|
||||
self.log.debug("marked %s chapters & titles. - %s, %s",
|
||||
self.html_preprocess_sections, chap, title)
|
||||
return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
|
||||
|
||||
def chapter_break(self, match):
|
||||
chap = match.group('section')
|
||||
styles = match.group('styles')
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
||||
" section markers based on punctuation. - " + str(chap))
|
||||
self.log.debug("marked %s section markers based on punctuation. - %s",
|
||||
self.html_preprocess_sections, chap)
|
||||
return '<'+styles+' style="page-break-before:always">'+chap
|
||||
|
||||
def analyze_title_matches(self, match):
|
||||
@@ -111,8 +106,6 @@ class HeuristicProcessor(object):
|
||||
line_end = line_end_ere.findall(raw)
|
||||
tot_htm_ends = len(htm_end)
|
||||
tot_ln_fds = len(line_end)
|
||||
# self.log.debug("There are " + str(tot_ln_fds) + " total Line feeds, and " +
|
||||
# str(tot_htm_ends) + " marked up endings")
|
||||
|
||||
if percent > 1:
|
||||
percent = 1
|
||||
@@ -120,7 +113,6 @@ class HeuristicProcessor(object):
|
||||
percent = 0
|
||||
|
||||
min_lns = tot_ln_fds * percent
|
||||
# self.log.debug("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
|
||||
return min_lns > tot_htm_ends
|
||||
|
||||
def dump(self, raw, where):
|
||||
@@ -148,7 +140,6 @@ class HeuristicProcessor(object):
|
||||
return wordcount.words
|
||||
|
||||
def markup_italicis(self, html):
|
||||
# self.log.debug("\n\n\nitalicize debugging \n\n\n")
|
||||
ITALICIZE_WORDS = [
|
||||
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
||||
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
||||
@@ -178,7 +169,6 @@ class HeuristicProcessor(object):
|
||||
for pat in ITALICIZE_STYLE_PATS:
|
||||
for match in re.finditer(pat, search_text):
|
||||
ital_string = str(match.group('words'))
|
||||
# self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
|
||||
try:
|
||||
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
|
||||
except OverflowError:
|
||||
@@ -205,10 +195,11 @@ class HeuristicProcessor(object):
|
||||
if wordcount > 200000:
|
||||
typical_chapters = 15000.
|
||||
self.min_chapters = int(ceil(wordcount / typical_chapters))
|
||||
self.log.debug("minimum chapters required are: "+str(self.min_chapters))
|
||||
self.log.debug("minimum chapters required are: %s", self.min_chapters)
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
self.log.debug("found " + str(self.html_preprocess_sections) + " pre-existing headings")
|
||||
self.log.debug("found %s pre-existing headings",
|
||||
self.html_preprocess_sections)
|
||||
|
||||
# Build the Regular Expressions in pieces
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
@@ -298,7 +289,8 @@ class HeuristicProcessor(object):
|
||||
if n_lookahead_req:
|
||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||
if not analyze:
|
||||
self.log.debug("Marked " + str(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
self.log.debug("Marked %s headings, %s",
|
||||
self.html_preprocess_sections, log_message)
|
||||
|
||||
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
|
||||
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||
@@ -311,11 +303,12 @@ class HeuristicProcessor(object):
|
||||
if float(self.chapters_with_title) / float(hits) > .5:
|
||||
title_req = True
|
||||
strict_title = False
|
||||
self.log.debug(
|
||||
str(type_name)+" had "+str(hits)+
|
||||
" hits - "+str(self.chapters_no_title)+" chapters with no title, "+
|
||||
str(self.chapters_with_title)+" chapters with titles, "+
|
||||
str(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||
self.log.debug('%s had %s hits - %s chapters with no '
|
||||
'title, %s chapters with titles, %s '
|
||||
'percent.', type_name, hits,
|
||||
self.chapters_no_title,
|
||||
self.chapters_with_title,
|
||||
self.chapters_with_title / hits)
|
||||
if type_name == 'common':
|
||||
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
|
||||
@@ -332,8 +325,9 @@ class HeuristicProcessor(object):
|
||||
words_per_chptr = wordcount
|
||||
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
||||
words_per_chptr = wordcount // self.html_preprocess_sections
|
||||
self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+
|
||||
str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||
self.log.debug("Total wordcount is: %s, Average words per section "
|
||||
"is: %s, Marked up %s chapters", wordcount,
|
||||
words_per_chptr, self.html_preprocess_sections)
|
||||
return html
|
||||
|
||||
def punctuation_unwrap(self, length, content, format):
|
||||
@@ -427,7 +421,8 @@ class HeuristicProcessor(object):
|
||||
txtindent = re.compile(str(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
|
||||
html = txtindent.sub(self.insert_indent, html)
|
||||
if self.found_indents > 1:
|
||||
self.log.debug("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
|
||||
self.log.debug("replaced %s nbsp indents with inline styles",
|
||||
self.found_indents)
|
||||
return html
|
||||
|
||||
def cleanup_markup(self, html):
|
||||
@@ -475,8 +470,8 @@ class HeuristicProcessor(object):
|
||||
blanklines = self.blankreg.findall(html)
|
||||
lines = self.linereg.findall(html)
|
||||
if len(lines) > 1:
|
||||
self.log.debug("There are " + str(len(blanklines)) + " blank lines. " +
|
||||
str(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||
self.log.debug("There are %s blank lines. %s percent blank",
|
||||
len(blanklines), len(blanklines) / len(lines))
|
||||
|
||||
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||
return True
|
||||
@@ -600,8 +595,8 @@ class HeuristicProcessor(object):
|
||||
width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
|
||||
except:
|
||||
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
|
||||
self.log.warn('Invalid replacement scene break'
|
||||
' expression, using default')
|
||||
self.log.warning('Invalid replacement scene break'
|
||||
' expression, using default')
|
||||
else:
|
||||
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
|
||||
divpercent = (100 - width) // 2
|
||||
@@ -702,20 +697,23 @@ class HeuristicProcessor(object):
|
||||
blockquote_open_loop = blockquote_open
|
||||
if debugabby:
|
||||
self.log.debug('\n\n******\n')
|
||||
self.log.debug('padding top is: '+str(setting[0]))
|
||||
self.log.debug('padding right is:' +str(setting[1]))
|
||||
self.log.debug('padding bottom is: ' + str(setting[2]))
|
||||
self.log.debug('padding left is: ' +str(setting[3]))
|
||||
self.log.debug('padding top is: %s', setting[0])
|
||||
self.log.debug('padding right is: %s', setting[1])
|
||||
self.log.debug('padding bottom is: %s', setting[2])
|
||||
self.log.debug('padding left is: %s', setting[3])
|
||||
|
||||
# print "text-align is: "+str(text_align)
|
||||
# print "\n***\nline is:\n "+str(match.group(0))+'\n'
|
||||
if debugabby:
|
||||
# print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
|
||||
self.log.debug("styles for this line were:", styles)
|
||||
self.log.debug('newline is:')
|
||||
self.log.debug(blockquote_open_loop+blockquote_close_loop+
|
||||
paragraph_before+'<p style="'+text_indent+text_align+
|
||||
'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
|
||||
self.log.debug("styles for this line were: %s", styles)
|
||||
self.log.debug('newline is: %s', blockquote_open_loop +
|
||||
blockquote_close_loop +
|
||||
paragraph_before +
|
||||
'<p style="%s">%s</p>' %
|
||||
(text_indent + text_align, content) +
|
||||
paragraph_after +
|
||||
'\n\n\n\n\n')
|
||||
# print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
|
||||
self.previous_was_paragraph = is_paragraph
|
||||
# print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
|
||||
@@ -731,10 +729,10 @@ class HeuristicProcessor(object):
|
||||
try:
|
||||
self.totalwords = self.get_word_count(html)
|
||||
except:
|
||||
self.log.warn("Can't get wordcount")
|
||||
self.log.warning("Can't get wordcount")
|
||||
|
||||
if self.totalwords < 50:
|
||||
self.log.warn("flow is too short, not running heuristics")
|
||||
self.log.warning("flow is too short, not running heuristics")
|
||||
return html
|
||||
|
||||
is_abbyy = self.is_abbyy(html)
|
||||
@@ -801,12 +799,13 @@ class HeuristicProcessor(object):
|
||||
# more of the lines break in the same region of the document then unwrapping is required
|
||||
docanalysis = DocAnalysis(format, html)
|
||||
hardbreaks = docanalysis.line_histogram(.50)
|
||||
self.log.debug("Hard line breaks check returned "+str(hardbreaks))
|
||||
self.log.debug("Hard line breaks check returned %s", hardbreaks)
|
||||
|
||||
# Calculate Length
|
||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||
length = docanalysis.line_length(unwrap_factor)
|
||||
self.log.debug("Median line length is " + str(length) + ", calculated with " + format + " format")
|
||||
self.log.debug("Median line length is %s, calculated with %s format",
|
||||
length, format)
|
||||
|
||||
# ##### Unwrap lines ######
|
||||
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||
@@ -827,8 +826,9 @@ class HeuristicProcessor(object):
|
||||
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||
self.log.debug("Looking for more split points based on punctuation,"
|
||||
" currently have " + str(self.html_preprocess_sections))
|
||||
self.log.debug("Looking for more split points based on "
|
||||
"punctuation, currently have %s",
|
||||
self.html_preprocess_sections)
|
||||
chapdetect3 = re.compile(
|
||||
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
|
||||
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
|
||||
|
||||
Reference in New Issue
Block a user