1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-15 06:03:38 +01:00

Removed polyglot's unicode_type usage

This commit is contained in:
2020-04-20 19:25:28 +02:00
parent ef7e2b10be
commit 128705f258
130 changed files with 657 additions and 716 deletions

View File

@@ -2,7 +2,6 @@ import functools, re, json
from math import ceil
from ebook_converter import entity_to_unicode, as_unicode
from ebook_converter.polyglot.builtins import unicode_type
__license__ = 'GPL v3'
@@ -72,8 +71,8 @@ def smarten_punctuation(html, log=None):
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
preprocessor = HeuristicProcessor(log=log)
from uuid import uuid4
start = 'calibre-smartypants-'+unicode_type(uuid4())
stop = 'calibre-smartypants-'+unicode_type(uuid4())
start = 'calibre-smartypants-'+str(uuid4())
stop = 'calibre-smartypants-'+str(uuid4())
html = html.replace('<!--', start)
html = html.replace('-->', stop)
html = preprocessor.fix_nbsp_indents(html)
@@ -149,20 +148,20 @@ class DocAnalysis(object):
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
# print("there are "+unicode_type(len(lines))+" lines")
# print("there are "+str(len(lines))+" lines")
# max = 0
# for line in self.lines:
# l = len(line)
# if l > max:
# max = l
# print("max line found is "+unicode_type(max))
# print("max line found is "+str(max))
# Build the line length histogram
hRaw = [0 for i in range(0,buckets)]
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
l = int(l // 100)
# print("adding "+unicode_type(l))
# print("adding "+str(l))
hRaw[l]+=1
# Normalize the histogram into percents
@@ -171,8 +170,8 @@ class DocAnalysis(object):
h = [float(count)/totalLines for count in hRaw]
else:
h = []
# print("\nhRaw histogram lengths are: "+unicode_type(hRaw))
# print(" percents are: "+unicode_type(h)+"\n")
# print("\nhRaw histogram lengths are: "+str(hRaw))
# print(" percents are: "+str(h)+"\n")
# Find the biggest bucket
maxValue = 0
@@ -184,7 +183,7 @@ class DocAnalysis(object):
# print("Line lengths are too variable. Not unwrapping.")
return False
else:
# print(unicode_type(maxValue)+" of the lines were in one bucket")
# print(str(maxValue)+" of the lines were in one bucket")
return True
@@ -220,8 +219,8 @@ class Dehyphenator(object):
wraptags = match.group('wraptags')
except:
wraptags = ''
hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
if self.suffixes.match(secondhalf) is None:
lookupword = self.removesuffixes.sub('', dehyphenated)
else:
@@ -327,7 +326,7 @@ class CSSPreProcessor(object):
# are commented lines before the first @import or @charset rule. Since
# the conversion will remove all stylesheets anyway, we don't lose
# anything
data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
data = re.sub(str(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
ans, namespaced = [], False
for line in data.splitlines():
@@ -535,7 +534,7 @@ class HTMLPreProcessor(object):
docanalysis = DocAnalysis('pdf', html)
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
if length:
# print("The pdf line length returned is " + unicode_type(length))
# print("The pdf line length returned is " + str(length))
# unwrap em/en dashes
end_rules.append((re.compile(
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))