mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-16 22:25:54 +01:00

Removed as_unicode function

2020-06-14 19:02:23 +02:00
parent add7a8ca56
commit fdd531f6e0
7 changed files with 412 additions and 366 deletions
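The removal is mechanical: under Python 3, formatting an exception with %s or str() already yields text, so the as_unicode() helper imported from ebook_converter is no longer needed. A minimal sketch of the before/after pattern taken from the error-logging call in this file; the standalone try/except below is only illustrative:

# Before:  self.log.error('Failed to parse %r regexp because %s' %
#                         (search, as_unicode(e)))
# After:   self.log.error('Failed to parse %r regexp because %s' %
#                         (search, e))
import re
try:
    re.compile('(unbalanced')               # deliberately invalid pattern
except Exception as e:
    print('Failed to parse regexp because %s' % e)   # str() on the exception suffices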


@@ -1,43 +1,37 @@
import functools, re, json
from math import ceil
import functools
import json
import math
import re
from ebook_converter import entity_to_unicode, as_unicode
from ebook_converter import entity_to_unicode
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode,
result_exceptions={
'<' : '&lt;',
'>' : '&gt;',
"'" : '&apos;',
'"' : '&quot;',
'&' : '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
result_exceptions={'<': '&lt;',
'>': '&gt;',
"'": '&apos;',
'"': '&quot;',
'&': '&amp;'})
_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)
LIGATURES = {
# '\u00c6': 'AE',
# '\u00e6': 'ae',
# '\u0152': 'OE',
# '\u0153': 'oe',
# '\u0132': 'IJ',
# '\u0133': 'ij',
# '\u1D6B': 'ue',
'\uFB00': 'ff',
'\uFB01': 'fi',
'\uFB02': 'fl',
'\uFB03': 'ffi',
'\uFB04': 'ffl',
'\uFB05': 'ft',
'\uFB06': 'st',
}
LIGATURES = {'\uFB00': 'ff',
'\uFB01': 'fi',
'\uFB02': 'fl',
'\uFB03': 'ffi',
'\uFB04': 'ffl',
'\uFB05': 'ft',
'\uFB06': 'st'}
# '\u00c6': 'AE',
# '\u00e6': 'ae',
# '\u0152': 'OE',
# '\u0153': 'oe',
# '\u0132': 'IJ',
# '\u0133': 'ij',
# '\u1D6B': 'ue',
_ligpat = re.compile('|'.join(LIGATURES))
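For context (not part of the diff): both tables above are consumed later in this file, convert_entities through the '&(\S+?);' rule in html_preprocess_rules(), and LIGATURES through _ligpat in HTMLPreProcessor.__call__() when keep_ligatures is off. A minimal sketch of the ligature expansion, assuming a subset of the table above:

import re

LIGATURES = {'\uFB01': 'fi', '\uFB02': 'fl'}        # subset for illustration
_ligpat = re.compile('|'.join(LIGATURES))

def expand_ligatures(text):
    # Replace each single-codepoint ligature with its plain letter pair,
    # mirroring _ligpat.sub(lambda m: LIGATURES[m.group()], html) below.
    return _ligpat.sub(lambda m: LIGATURES[m.group()], text)

print(expand_ligatures('\uFB01nal \uFB02ow'))       # -> final flow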
@@ -83,17 +77,18 @@ def smarten_punctuation(html, log=None):
class DocAnalysis(object):
'''
Provides various text analysis functions to determine how the document is structured.
format is the type of document analysis will be done against.
"""
Provides various text analysis functions to determine how the document is
structured. format is the type of document the analysis will be done against.
raw is the raw text to determine the line length to use for wrapping.
Blank lines are excluded from analysis
'''
"""
def __init__(self, format='html', raw=''):
raw = raw.replace('&nbsp;', ' ')
if format == 'html':
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)',
re.DOTALL)
elif format == 'pdf':
linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
@@ -103,13 +98,13 @@ class DocAnalysis(object):
self.lines = linere.findall(raw)
def line_length(self, percent):
'''
"""
Analyses the document to find the median line length.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
'''
"""
lengths = []
for line in self.lines:
if len(line) > 0:
@@ -121,7 +116,7 @@ class DocAnalysis(object):
lengths = list(set(lengths))
total = sum(lengths)
avg = total / len(lengths)
max_line = ceil(avg * 2)
max_line = math.ceil(avg * 2)
lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
@@ -138,31 +133,32 @@ class DocAnalysis(object):
return lengths[index]
def line_histogram(self, percent):
'''
Creates a broad histogram of the document to determine whether it incorporates hard
line breaks. Lines are sorted into 20 'buckets' based on length.
percent is the percentage of lines that should be in a single bucket to return true
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
'''
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
"""
Creates a broad histogram of the document to determine whether it
incorporates hard line breaks. Lines are sorted into 20 'buckets'
based on length. percent is the percentage of lines that should be in
a single bucket to return true. The majority of the lines will exist in
1-2 buckets in typical docs with hard line breaks
"""
minLineLength = 20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength = 1900 # Discard larger than this to stay in range
buckets = 20 # Each line is divided into a bucket based on length
# print("there are "+str(len(lines))+" lines")
# max = 0
# for line in self.lines:
# l = len(line)
# if l > max:
# max = l
# _l = len(line)
# if _l > max:
# max = _l
# print("max line found is "+str(max))
# Build the line length histogram
hRaw = [0 for i in range(0,buckets)]
hRaw = [0 for i in range(0, buckets)]
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
l = int(l // 100)
# print("adding "+str(l))
hRaw[l]+=1
_l = len(line)
if _l > minLineLength and _l < maxLineLength:
_l = int(_l // 100)
# print("adding "+str(_l))
hRaw[_l] += 1
# Normalize the histogram into percents
totalLines = len(self.lines)
@@ -175,7 +171,7 @@ class DocAnalysis(object):
# Find the biggest bucket
maxValue = 0
for i in range(0,len(h)):
for i in range(0, len(h)):
if h[i] > maxValue:
maxValue = h[i]
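For context (not part of the diff): DocAnalysis is what HTMLPreProcessor uses further down to pick an unwrap length for pdftohtml output, via DocAnalysis('pdf', html).line_length(unwrap_factor). A minimal usage sketch, assuming the class as shown above; the sample text and the 0.5/0.85 percentages are illustrative only:

sample = '<p>' + 'word ' * 40 + '</p>\n<p>a short trailing line</p>'
docanalysis = DocAnalysis('html', sample)

# Median-style wrap length: 0.5 indexes the middle of the sorted,
# de-duplicated line lengths (values above twice the average are dropped).
length = docanalysis.line_length(0.5)

# True-ish result when enough lines land in a single 100-character bucket,
# i.e. the text looks like it carries hard line breaks.
looks_hard_wrapped = docanalysis.line_histogram(0.85)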
@@ -188,36 +184,42 @@ class DocAnalysis(object):
class Dehyphenator(object):
'''
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
scientific words. The primary disadvantage is that words appearing only once in the document
retain hyphens.
'''
"""
Analyzes words to determine whether hyphens should be retained/removed.
Uses the document itself as a dictionary. This method handles all
languages along with uncommon, made-up, and scientific words. The primary
disadvantage is that words appearing only once in the document retain
hyphens.
"""
def __init__(self, verbose=0, log=None):
self.log = log
self.verbose = verbose
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
# only remove if it's not already the point of hyphenation
self.suffix_string = (
"((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
# Add common suffixes to the regex below to increase the likelihood of
# a match - don't add suffixes which are also complete words, such as
# 'able' or 'sex' only remove if it's not already the point of
# hyphenation
self.suffix_string = ("((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?"
"|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|"
"istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier"
"|al|ex|ian)$")
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.removesuffixes = re.compile(r"%s" % self.suffix_string,
re.IGNORECASE)
# remove prefixes if the prefix was not already the point of
# hyphenation
self.prefix_string = '^(dis|re|un|in|ex)'
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
self.removeprefix = re.compile(r'%s' % self.prefix_string,
re.IGNORECASE)
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
try:
wraptags = match.group('wraptags')
except:
except Exception:
wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
@@ -231,65 +233,84 @@ class Dehyphenator(object):
self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
except Exception:
return hyphenated
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2:
self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
self.log(" Cleanup:returned dehyphenated word: " +
dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
if self.verbose > 2:
self.log(" Cleanup:returned hyphenated word: " + hyphenated)
self.log(" Cleanup:returned hyphenated word: " +
hyphenated)
return hyphenated
else:
if self.verbose > 2:
self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
self.log(" Cleanup:returning original text " +
firsthalf + " + linefeed " + secondhalf)
return firsthalf+'\u2014'+wraptags+secondhalf
else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
if (self.format == 'individual_words' and
len(firsthalf) + len(secondhalf) <= 6):
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated)
self.log("too short, returned hyphenated word: " +
hyphenated)
return hyphenated
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated)
self.log("too short, returned hyphenated word: " +
hyphenated)
return hyphenated
if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2:
self.log(" returned dehyphenated word: " + dehyphenated)
self.log(" returned dehyphenated word: " +
dehyphenated)
return dehyphenated
else:
if self.verbose > 2:
self.log(" returned hyphenated word: " + hyphenated)
self.log(" returned hyphenated word: " +
hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
self.html = html
self.format = format
if format == 'html':
intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)'
r'(-|)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}'
r'(?P<up2threeblanks><(p|div)[^>]*>\s*'
r'(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+)'
r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}'
r'(<span[^>]*>)?)\s*(?P<secondpart>'
r'[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|'
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)'
r'\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*'
r'<[iub]>)\s*(?P<secondpart>[\w\d]+)' %
length)
elif format == 'txt':
intextmatch = re.compile(
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
intextmatch = re.compile('(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)'
'(\u0020|\u0009)*(?P<wraptags>'
'(\n(\u0020|\u0009)*)+)(?P<secondpart>'
'[\\w\\d]+)' % length)
elif format == 'individual_words':
intextmatch = re.compile(
r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
intextmatch = re.compile(r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*'
r'(?P<secondpart>\w+)(?![^<]*?>)',
re.UNICODE)
elif format == 'html_cleanup':
intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)'
r'(?P<wraptags></span>\s*(</[iubp]>\s*'
r'<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>'
r'\s*<[iubp][^>]*>)?\s*(?P<secondpart>'
r'[\w\d]+)')
elif format == 'txt_cleanup':
intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|)'
r'(?P<wraptags>\s+)(?P<secondpart>'
r'[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
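For context (not part of the diff): HTMLPreProcessor drives this class later in the file as Dehyphenator(self.extra_opts.verbose, self.log)(html, 'html', length). A minimal sketch, assuming the class above; the two-paragraph snippet is made up:

# 'dependable' also occurs unhyphenated in the text, so the
# document-as-dictionary lookup accepts the joined form and the
# intervening wrap markup is dropped along with the hyphen.
html = ('<p>He proved to be depend-</p>\n'
        '<p>able in every case, a dependable man.</p>')

dehyphenator = Dehyphenator(verbose=0, log=None)
unwrapped = dehyphenator(html, 'html', length=1)
# -> '<p>He proved to be dependable in every case, a dependable man.</p>'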
@@ -299,18 +320,18 @@ class CSSPreProcessor(object):
# Remove some of the broken CSS Microsoft products
# create
MS_PAT = re.compile(r'''
MS_PAT = re.compile(r'''
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
(%s).+? # The invalid selectors
(?P<end>$|;|\}) # The end of the declaration
'''%'mso-|panose-|text-underline|tab-interval',
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
''' % 'mso-|panose-|text-underline|tab-interval',
re.MULTILINE | re.IGNORECASE | re.VERBOSE)
def ms_sub(self, match):
end = match.group('end')
try:
start = match.group('start')
except:
except Exception:
start = ''
if end == ';':
end = ''
@@ -332,7 +353,7 @@ class CSSPreProcessor(object):
for line in data.splitlines():
ll = line.lstrip()
if not (namespaced or ll.startswith('@import') or not ll or
ll.startswith('@charset')):
ll.startswith('@charset')):
ans.append(XHTML_CSS_NAMESPACE.strip())
namespaced = True
ans.append(line)
@@ -359,7 +380,8 @@ def accent_regex(accent_maps, letter_before=False):
args = ''.join(accent_cat), ''.join(letters)
accent_group, letter_group = 1, 2
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args),
re.UNICODE)
def sub(m):
lmap = accent_maps[m.group(accent_group)]
@@ -371,83 +393,96 @@ def accent_regex(accent_maps, letter_before=False):
def html_preprocess_rules():
ans = getattr(html_preprocess_rules, 'ans', None)
if ans is None:
ans = html_preprocess_rules.ans = [
# Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly
(re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
]
ans = [
# Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly
(re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
re.IGNORECASE | re.DOTALL), sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling,
# MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
'')]
html_preprocess_rules.ans = ans
return ans
def pdftohtml_rules():
ans = getattr(pdftohtml_rules, 'ans', None)
if ans is None:
ans = pdftohtml_rules.ans = [
accent_regex({
'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
'¸': 'cC:çÇ',
'˛': 'aAeE:ąĄęĘ',
'˙': 'zZ:żŻ',
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
'°': 'uU:ůŮ',
}),
ans = [accent_regex({'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚ'
'úÚźŹ',
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
'¸': 'cC:çÇ',
'˛': 'aAeE:ąĄęĘ',
'˙': 'zZ:żŻ',
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
'°': 'uU:ůŮ'}),
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'},
letter_before=True),
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
# If pdf printed from a browser then the header/footer has a
# reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?'
r'[A-Z].*<br>(?=\s*<hr>))',
re.IGNORECASE), lambda match: ''),
# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'),
lambda match: '<p>\n<p style="text-align:center">' +
match.group('break') + '</p>'),
# Center separator lines
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),
# Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), '</p>\n</body>'),
# Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), '</p>\n</body>'),
# Clean up spaces
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> '),
]
# Clean up spaces
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> ')]
pdftohtml_rules.ans = ans
return ans
def book_designer_rules():
ans = getattr(book_designer_rules, 'ans', None)
if ans is None:
ans = book_designer_rules.ans = [
# HR
(re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'),
# Create header tags
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
ans = [(re.compile('<hr>', re.IGNORECASE),
lambda match: '<span style="page-break-after:always"> '
'</span>'),
# Create header tags
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)'
r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' %
(match.group(2) if match.group(2) else 'center',
match.group(3))),
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)'
r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' %
(match.group(2) if match.group(2) else 'center',
match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
re.IGNORECASE | re.DOTALL),
lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
re.IGNORECASE | re.DOTALL),
lambda match: '<h3 class="subtitle">%s</h3>' %
(match.group(1),))]
book_designer_rules.ans = ans
return ans
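For context (not part of the diff): all three rule builders above compile their patterns once and cache the list on a function attribute, and HTMLPreProcessor later applies them as (pattern, replacement) pairs with 'for rule in html_preprocess_rules() + start_rules: html = rule[0].sub(rule[1], html)'. A minimal sketch of that cache-and-apply pattern with a made-up rule:

import re

def demo_rules():
    # Compile on first call, reuse the cached list afterwards.
    ans = getattr(demo_rules, 'ans', None)
    if ans is None:
        ans = [(re.compile(r'\s{2,}'), ' ')]        # illustrative rule only
        demo_rules.ans = ans
    return ans

def apply_rules(html):
    for rule in demo_rules():
        html = rule[0].sub(rule[1], html)
    return html

print(apply_rules('too   many    spaces'))          # -> 'too many spaces'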
@@ -470,7 +505,7 @@ class HTMLPreProcessor(object):
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False):
get_preprocess_html=False):
if remove_special_chars is not None:
html = remove_special_chars.sub('', html)
html = html.replace('\0', '')
@@ -487,13 +522,14 @@ class HTMLPreProcessor(object):
start_rules = []
if not getattr(self.extra_opts, 'keep_ligatures', False):
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)
user_sr_rules = {}
# Function for processing search and replace
def do_search_replace(search_pattern, replace_txt):
from ebook_converter.ebooks.conversion.search_replace import compile_regular_expression
from ebook_converter.ebooks.conversion.search_replace import \
compile_regular_expression
try:
search_re = compile_regular_expression(search_pattern)
if not replace_txt:
@@ -502,11 +538,11 @@ class HTMLPreProcessor(object):
user_sr_rules[(search_re, replace_txt)] = search_pattern
except Exception as e:
self.log.error('Failed to parse %r regexp because %s' %
(search, as_unicode(e)))
(search, e))
# search / replace using the sr?_search / sr?_replace options
for i in range(1, 4):
search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
search_pattern = getattr(self.extra_opts, search, '')
replace_txt = getattr(self.extra_opts, replace, '')
if search_pattern:
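For context (not part of the diff): the loop above reads the sr1_search/sr1_replace through sr3_search/sr3_replace conversion options from extra_opts, hands each non-empty pattern to do_search_replace(), and records the original pattern in user_sr_rules so a failing user rule can be reported and skipped later. A hedged sketch of the option shape; the SimpleNamespace stand-in for the real options object is hypothetical:

from types import SimpleNamespace

extra_opts = SimpleNamespace(sr1_search=r'\bcolour\b', sr1_replace='color',
                             sr2_search='', sr2_replace='',
                             sr3_search='', sr3_replace='')

for i in range(1, 4):
    search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
    search_pattern = getattr(extra_opts, search, '')
    replace_txt = getattr(extra_opts, replace, '')
    if search_pattern:
        # The real code calls do_search_replace(search_pattern, replace_txt).
        print('would register:', search_pattern, '->', replace_txt)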
@@ -520,31 +556,35 @@ class HTMLPreProcessor(object):
do_search_replace(search_pattern, replace_txt)
end_rules = []
# delete soft hyphens - moved here so it's executed after header/footer removal
# delete soft hyphens - moved here so it's executed after
# header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
end_rules.append((re.compile(
r'[­](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
end_rules.append((re.compile(r'[­](</p>\s*<p>\s*)+\s*'
r'(?=[\[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(
r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
end_rules.append((re.compile(r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+'
r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'),
lambda match: ''))
length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html)
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
length = docanalysis.line_length(getattr(self.extra_opts,
'unwrap_factor'))
if length:
# print("The pdf line length returned is " + str(length))
# unwrap em/en dashes
end_rules.append((re.compile(
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*<p>\s*'
r'(?=[\[a-z\d])' % length),
lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile((
r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
)
(re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą'
r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4})'
r';))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*'
r'<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
length, re.UNICODE), wrap_lines))
for rule in html_preprocess_rules() + start_rules:
html = rule[0].sub(rule[1], html)
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
name, i = None, 0
while not name or os.path.exists(os.path.join(odir, name)):
i += 1
name = '%04d.html'%i
name = '%04d.html' % i
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
html = rule[0].sub(rule[1], html)
except Exception as e:
if rule in user_sr_rules:
self.log.error(
'User supplied search & replace rule: %s -> %s '
'failed with error: %s, ignoring.'%(
user_sr_rules[rule], rule[1], e))
self.log.error('User supplied search & replace rule: %s '
'-> %s failed with error: %s, ignoring.' %
(user_sr_rules[rule], rule[1], e))
else:
raise
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
html = dehyphenator(html, 'html', length)
if is_pdftohtml:
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
from ebook_converter.ebooks.conversion.utils import \
HeuristicProcessor
pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0
if pdf_markup.get_word_count(html) > 7000:
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
from ebook_converter.utils.localization import get_udc
from ebook_converter.utils.mreplace import MReplace
unihandecoder = get_udc()
mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
mr = MReplace(data={'«': '&lt;' * 3, '»': '&gt;' * 3})
html = mr.mreplace(html)
html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
from ebook_converter.ebooks.conversion.utils import \
HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)
if is_pdftohtml:
html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
html = html.replace('<!-- created by ebook-converter\'s '
'pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log)
try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
unsupported_unicode_chars = (self.extra_opts.output_profile
.unsupported_unicode_chars)
except AttributeError:
unsupported_unicode_chars = ''
if unsupported_unicode_chars: