Removed polyglot's unicode_type usage

2026-03-28 06:23:32 +01:00 · 2020-04-20 19:25:28 +02:00
parent ef7e2b10be
commit 128705f258
130 changed files with 657 additions and 716 deletions
--- a/ebook_converter/ebooks/conversion/preprocess.py
+++ b/ebook_converter/ebooks/conversion/preprocess.py
@@ -2,7 +2,6 @@ import functools, re, json
 from math import ceil

 from ebook_converter import entity_to_unicode, as_unicode
-from ebook_converter.polyglot.builtins import unicode_type


 __license__ = 'GPL v3'
@@ -72,8 +71,8 @@ def smarten_punctuation(html, log=None):
    from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
-    start = 'calibre-smartypants-'+unicode_type(uuid4())
-    stop = 'calibre-smartypants-'+unicode_type(uuid4())
+    start = 'calibre-smartypants-'+str(uuid4())
+    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
@@ -149,20 +148,20 @@ class DocAnalysis(object):
        maxLineLength=1900  # Discard larger than this to stay in range
        buckets=20  # Each line is divided into a bucket based on length

-        # print("there are "+unicode_type(len(lines))+" lines")
+        # print("there are "+str(len(lines))+" lines")
        # max = 0
        # for line in self.lines:
        #    l = len(line)
        #    if l > max:
        #        max = l
-        # print("max line found is "+unicode_type(max))
+        # print("max line found is "+str(max))
        # Build the line length histogram
        hRaw = [0 for i in range(0,buckets)]
        for line in self.lines:
            l = len(line)
            if l > minLineLength and l < maxLineLength:
                l = int(l // 100)
-                # print("adding "+unicode_type(l))
+                # print("adding "+str(l))
                hRaw[l]+=1

        # Normalize the histogram into percents
@@ -171,8 +170,8 @@ class DocAnalysis(object):
            h = [float(count)/totalLines for count in hRaw]
        else:
            h = []
-        # print("\nhRaw histogram lengths are: "+unicode_type(hRaw))
-        # print("              percents are: "+unicode_type(h)+"\n")
+        # print("\nhRaw histogram lengths are: "+str(hRaw))
+        # print("              percents are: "+str(h)+"\n")

        # Find the biggest bucket
        maxValue = 0
@@ -184,7 +183,7 @@ class DocAnalysis(object):
            # print("Line lengths are too variable. Not unwrapping.")
            return False
        else:
-            # print(unicode_type(maxValue)+" of the lines were in one bucket")
+            # print(str(maxValue)+" of the lines were in one bucket")
            return True


@@ -220,8 +219,8 @@ class Dehyphenator(object):
            wraptags = match.group('wraptags')
        except:
            wraptags = ''
-        hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
-        dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
+        hyphenated = str(firsthalf) + "-" + str(secondhalf)
+        dehyphenated = str(firsthalf) + str(secondhalf)
        if self.suffixes.match(secondhalf) is None:
            lookupword = self.removesuffixes.sub('', dehyphenated)
        else:
@@ -327,7 +326,7 @@ class CSSPreProcessor(object):
        # are commented lines before the first @import or @charset rule. Since
        # the conversion will remove all stylesheets anyway, we don't lose
        # anything
-        data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
+        data = re.sub(str(r'/\*.*?\*/'), '', data, flags=re.DOTALL)

        ans, namespaced = [], False
        for line in data.splitlines():
@@ -535,7 +534,7 @@ class HTMLPreProcessor(object):
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
-                # print("The pdf line length returned is " + unicode_type(length))
+                # print("The pdf line length returned is " + str(length))
                # unwrap em/en dashes
                end_rules.append((re.compile(
                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))