Added first portion of logging adaptation.

The code may be broken at this point: several modules still have to be adapted.
2021-06-22 22:04:43 +02:00
parent 6f898ab23e
commit 546cc26652
36 changed files with 326 additions and 316 deletions
@@ -5,11 +5,6 @@ from ebook_converter.utils.logging import default_log
 from ebook_converter.utils.wordcount import get_wordcount_obj


-__license__ = 'GPL v3'
-__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-
 class HeuristicProcessor(object):

    def __init__(self, extra_opts=None, log=None):
@@ -50,8 +45,8 @@ class HeuristicProcessor(object):
        title = match.group('title')
        if not title:
            self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log.debug("marked " + str(self.html_preprocess_sections) +
-                    " chapters. - " + str(chap))
+            self.log.debug("marked %s chapters. - %s",
+                           self.html_preprocess_sections, str(chap))
            return '<h2>'+chap+'</h2>\n'
        else:
            delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
@@ -59,16 +54,16 @@ class HeuristicProcessor(object):
            txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
            txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
            self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log.debug("marked " + str(self.html_preprocess_sections) +
-                    " chapters & titles. - " + str(chap) + ", " + str(title))
+            self.log.debug("marked %s chapters & titles. - %s, %s",
+                           self.html_preprocess_sections, chap, title)
            return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'

    def chapter_break(self, match):
        chap = match.group('section')
        styles = match.group('styles')
        self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log.debug("marked " + str(self.html_preprocess_sections) +
-                " section markers based on punctuation. - " + str(chap))
+        self.log.debug("marked %s section markers based on punctuation. - %s",
+                       self.html_preprocess_sections, chap)
        return '<'+styles+' style="page-break-before:always">'+chap

    def analyze_title_matches(self, match):
@@ -111,8 +106,6 @@ class HeuristicProcessor(object):
        line_end = line_end_ere.findall(raw)
        tot_htm_ends = len(htm_end)
        tot_ln_fds = len(line_end)
-        # self.log.debug("There are " + str(tot_ln_fds) + " total Line feeds, and " +
-        #        str(tot_htm_ends) + " marked up endings")

        if percent > 1:
            percent = 1
@@ -120,7 +113,6 @@ class HeuristicProcessor(object):
            percent = 0

        min_lns = tot_ln_fds * percent
-        # self.log.debug("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
        return min_lns > tot_htm_ends

    def dump(self, raw, where):
@@ -148,7 +140,6 @@ class HeuristicProcessor(object):
        return wordcount.words

    def markup_italicis(self, html):
-        # self.log.debug("\n\n\nitalicize debugging \n\n\n")
        ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
@@ -178,7 +169,6 @@ class HeuristicProcessor(object):
        for pat in ITALICIZE_STYLE_PATS:
            for match in re.finditer(pat, search_text):
                ital_string = str(match.group('words'))
-                # self.log.debug("italicising "+str(match.group(0))+"    with <i>"+ital_string+"</i>")
                try:
                    html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
                except OverflowError:
@@ -205,10 +195,11 @@ class HeuristicProcessor(object):
            if wordcount > 200000:
                typical_chapters = 15000.
            self.min_chapters = int(ceil(wordcount / typical_chapters))
-        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
+        self.log.debug("minimum chapters required are: %s", self.min_chapters)
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
-        self.log.debug("found " + str(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found %s pre-existing headings",
+                       self.html_preprocess_sections)

        # Build the Regular Expressions in pieces
        init_lookahead = "(?=<(p|div))"
@@ -298,7 +289,8 @@ class HeuristicProcessor(object):
                if n_lookahead_req:
                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
                if not analyze:
-                    self.log.debug("Marked " + str(self.html_preprocess_sections) + " headings, " + log_message)
+                    self.log.debug("Marked %s headings, %s",
+                                   self.html_preprocess_sections, log_message)

                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
                    lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
@@ -311,11 +303,12 @@ class HeuristicProcessor(object):
                        if float(self.chapters_with_title) / float(hits) > .5:
                            title_req = True
                            strict_title = False
-                        self.log.debug(
-                                str(type_name)+" had "+str(hits)+
-                                " hits - "+str(self.chapters_no_title)+" chapters with no title, "+
-                                str(self.chapters_with_title)+" chapters with titles, "+
-                                str(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        self.log.debug('%s had %s hits %s chapters with no '
+                                       'title, %s chapters with titles, %s '
+                                       'percent.', type_name, hits,
+                                       self.chapters_no_title,
+                                       self.chapters_with_title,
+                                       self.chapters_with_title / hits)
                        if type_name == 'common':
                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                        elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
@@ -332,8 +325,9 @@ class HeuristicProcessor(object):
        words_per_chptr = wordcount
        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
            words_per_chptr = wordcount // self.html_preprocess_sections
-        self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+
-                       str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        self.log.debug("Total wordcount is: %s, Average words per section "
+                       "is: %s, Marked up %s chapters", wordcount,
+                       words_per_chptr, self.html_preprocess_sections)
        return html

    def punctuation_unwrap(self, length, content, format):
@@ -427,7 +421,8 @@ class HeuristicProcessor(object):
        txtindent = re.compile(str(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
-            self.log.debug("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+            self.log.debug("replaced %s nbsp indents with inline styles",
+                           self.found_indents)
        return html

    def cleanup_markup(self, html):
@@ -475,8 +470,8 @@ class HeuristicProcessor(object):
        blanklines = self.blankreg.findall(html)
        lines = self.linereg.findall(html)
        if len(lines) > 1:
-            self.log.debug("There are " + str(len(blanklines)) + " blank lines. " +
-                    str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            self.log.debug("There are %s blank lines. %s percent blank",
+                           len(blanklines), len(blanklines) / len(lines))

            if float(len(blanklines)) / float(len(lines)) > 0.40:
                return True
@@ -600,8 +595,8 @@ class HeuristicProcessor(object):
                        width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
                    except:
                        scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
-                        self.log.warn('Invalid replacement scene break'
-                                ' expression, using default')
+                        self.log.warning('Invalid replacement scene break'
+                                         ' expression, using default')
                    else:
                        replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
                        divpercent = (100 - width) // 2
@@ -702,20 +697,23 @@ class HeuristicProcessor(object):
                            blockquote_open_loop = blockquote_open
                        if debugabby:
                            self.log.debug('\n\n******\n')
-                            self.log.debug('padding top is: '+str(setting[0]))
-                            self.log.debug('padding right is:' +str(setting[1]))
-                            self.log.debug('padding bottom is: ' + str(setting[2]))
-                            self.log.debug('padding left is: ' +str(setting[3]))
+                            self.log.debug('padding top is: %s', setting[0])
+                            self.log.debug('padding right is: %s', setting[1])
+                            self.log.debug('padding bottom is: %s', setting[2])
+                            self.log.debug('padding left is: %s', setting[3])

                # print "text-align is: "+str(text_align)
                # print "\n***\nline is:\n     "+str(match.group(0))+'\n'
                if debugabby:
                    # print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
-                    self.log.debug("styles for this line were:", styles)
-                    self.log.debug('newline is:')
-                    self.log.debug(blockquote_open_loop+blockquote_close_loop+
-                            paragraph_before+'<p style="'+text_indent+text_align+
-                            '">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
+                    self.log.debug("styles for this line were: %s", styles)
+                    self.log.debug('newline is: %s', blockquote_open_loop +
+                                   blockquote_close_loop +
+                                   paragraph_before +
+                                   '<p style="%s">%s</p>' %
+                                   (text_indent + text_align, content) +
+                                   paragraph_after +
+                                   '\n\n\n\n\n')
                # print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
                self.previous_was_paragraph = is_paragraph
                # print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
@@ -731,10 +729,10 @@ class HeuristicProcessor(object):
        try:
            self.totalwords = self.get_word_count(html)
        except:
-            self.log.warn("Can't get wordcount")
+            self.log.warning("Can't get wordcount")

        if self.totalwords < 50:
-            self.log.warn("flow is too short, not running heuristics")
+            self.log.warning("flow is too short, not running heuristics")
            return html

        is_abbyy = self.is_abbyy(html)
@@ -801,12 +799,13 @@ class HeuristicProcessor(object):
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
-        self.log.debug("Hard line breaks check returned "+str(hardbreaks))
+        self.log.debug("Hard line breaks check returned %s", hardbreaks)

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
-        self.log.debug("Median line length is " + str(length) + ", calculated with " + format + " format")
+        self.log.debug("Median line length is %s, calculated with %s format",
+                       length, format)

        # ##### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
@@ -827,8 +826,9 @@ class HeuristicProcessor(object):

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
-            self.log.debug("Looking for more split points based on punctuation,"
-                    " currently have " + str(self.html_preprocess_sections))
+            self.log.debug("Looking for more split points based on "
+                           "punctuation, currently have %s",
+                           self.html_preprocess_sections)
            chapdetect3 = re.compile(
                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
                r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'