mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-04 11:43:33 +02:00
Removed polyglots unicode_type usage
This commit is contained in:
@@ -10,7 +10,7 @@ from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.customize.conversion import InputFormatPlugin
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.constants import filesystem_encoding
|
||||
from ebook_converter.polyglot.builtins import unicode_type, as_bytes
|
||||
from ebook_converter.polyglot.builtins import as_bytes
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = ('2008, Kovid Goyal <kovid at kovidgoyal.net>, '
|
||||
@@ -41,7 +41,7 @@ class CHMInput(InputFormatPlugin):
|
||||
|
||||
log.debug('Processing CHM...')
|
||||
with TemporaryDirectory('_chm2oeb') as tdir:
|
||||
if not isinstance(tdir, unicode_type):
|
||||
if not isinstance(tdir, str):
|
||||
tdir = tdir.decode(filesystem_encoding)
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
@@ -129,7 +129,7 @@ class CHMInput(InputFormatPlugin):
|
||||
base = os.path.dirname(os.path.abspath(htmlpath))
|
||||
|
||||
def unquote(x):
|
||||
if isinstance(x, unicode_type):
|
||||
if isinstance(x, str):
|
||||
x = x.encode('utf-8')
|
||||
return _unquote(x).decode('utf-8')
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from ebook_converter.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter import CurrentDir
|
||||
from ebook_converter.polyglot.builtins import unicode_type, as_bytes
|
||||
from ebook_converter.polyglot.builtins import as_bytes
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -225,15 +225,15 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
identifiers = oeb.metadata['identifier']
|
||||
uuid = None
|
||||
for x in identifiers:
|
||||
if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
|
||||
uuid = unicode_type(x).split(':')[-1]
|
||||
if x.get(OPF('scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:'):
|
||||
uuid = str(x).split(':')[-1]
|
||||
break
|
||||
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
|
||||
|
||||
if uuid is None:
|
||||
self.log.warn('No UUID identifier found')
|
||||
from uuid import uuid4
|
||||
uuid = unicode_type(uuid4())
|
||||
uuid = str(uuid4())
|
||||
oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
|
||||
|
||||
if encrypted_fonts and not uuid.startswith('urn:uuid:'):
|
||||
@@ -241,7 +241,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
# for some absurd reason, or it will throw a hissy fit and refuse
|
||||
# to use the obfuscated fonts.
|
||||
for x in identifiers:
|
||||
if unicode_type(x) == uuid:
|
||||
if str(x) == uuid:
|
||||
x.content = 'urn:uuid:'+uuid
|
||||
|
||||
with TemporaryDirectory('_epub_output') as tdir:
|
||||
@@ -336,7 +336,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
|
||||
else:
|
||||
self.log.warn('Font', path, 'is invalid, ignoring')
|
||||
if not isinstance(uri, unicode_type):
|
||||
if not isinstance(uri, str):
|
||||
uri = uri.decode('utf-8')
|
||||
fonts.append('''
|
||||
<enc:EncryptedData>
|
||||
|
||||
@@ -10,7 +10,7 @@ from ebook_converter.customize.conversion import (InputFormatPlugin,
|
||||
from ebook_converter.utils.localization import get_lang
|
||||
from ebook_converter.utils.filenames import ascii_filename
|
||||
from ebook_converter.utils.imghdr import what
|
||||
from ebook_converter.polyglot.builtins import unicode_type, getcwd, as_unicode
|
||||
from ebook_converter.polyglot.builtins import getcwd, as_unicode
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -135,7 +135,7 @@ class HTMLInput(InputFormatPlugin):
|
||||
if not metadata.title:
|
||||
oeb.logger.warn('Title not specified')
|
||||
metadata.add('title', self.oeb.translate(__('Unknown')))
|
||||
bookid = unicode_type(uuid.uuid4())
|
||||
bookid = str(uuid.uuid4())
|
||||
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
||||
for ident in metadata.identifier:
|
||||
if 'id' in ident.attrib:
|
||||
@@ -225,7 +225,7 @@ class HTMLInput(InputFormatPlugin):
|
||||
|
||||
def link_to_local_path(self, link_, base=None):
|
||||
from ebook_converter.ebooks.html.input import Link
|
||||
if not isinstance(link_, unicode_type):
|
||||
if not isinstance(link_, str):
|
||||
try:
|
||||
link_ = link_.decode('utf-8', 'error')
|
||||
except:
|
||||
@@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
|
||||
# bhref refers to an already existing file. The read() method of
|
||||
# DirContainer will call unquote on it before trying to read the
|
||||
# file, therefore we quote it here.
|
||||
if isinstance(bhref, unicode_type):
|
||||
if isinstance(bhref, str):
|
||||
bhref = bhref.encode('utf-8')
|
||||
item.html_input_href = as_unicode(urllib.parse.quote(bhref))
|
||||
if guessed in self.OEB_STYLES:
|
||||
|
||||
@@ -8,7 +8,6 @@ from lxml import etree
|
||||
from ebook_converter import CurrentDir
|
||||
from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from ebook_converter.ebooks.oeb.base import element
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
from ebook_converter.polyglot.urllib import unquote
|
||||
from ebook_converter.ptempfile import PersistentTemporaryDirectory
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
@@ -155,7 +154,7 @@ class HTMLOutput(OutputFormatPlugin):
|
||||
toc=html_toc, meta=meta, nextLink=nextLink,
|
||||
tocUrl=tocUrl, cssLink=cssLink,
|
||||
firstContentPageLink=nextLink)
|
||||
if isinstance(t, unicode_type):
|
||||
if isinstance(t, str):
|
||||
t = t.encode('utf-8')
|
||||
f.write(t)
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
from ebook_converter.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -78,9 +77,9 @@ class HTMLZOutput(OutputFormatPlugin):
|
||||
fname = u'index'
|
||||
if opts.htmlz_title_filename:
|
||||
from ebook_converter.utils.filenames import shorten_components_to
|
||||
fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
|
||||
fname = shorten_components_to(100, (ascii_filename(str(oeb_book.metadata.title[0])),))[0]
|
||||
with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
|
||||
if isinstance(html, unicode_type):
|
||||
if isinstance(html, str):
|
||||
html = html.encode('utf-8')
|
||||
tf.write(html)
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ import sys, os
|
||||
|
||||
from ebook_converter.customize.conversion import OutputFormatPlugin
|
||||
from ebook_converter.customize.conversion import OptionRecommendation
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -15,7 +14,7 @@ class LRFOptions(object):
|
||||
def __init__(self, output, opts, oeb):
|
||||
def f2s(f):
|
||||
try:
|
||||
return unicode_type(f[0])
|
||||
return str(f[0])
|
||||
except:
|
||||
return ''
|
||||
m = oeb.metadata
|
||||
@@ -29,13 +28,13 @@ class LRFOptions(object):
|
||||
self.title_sort = self.author_sort = ''
|
||||
for x in m.creator:
|
||||
if x.role == 'aut':
|
||||
self.author = unicode_type(x)
|
||||
fa = unicode_type(getattr(x, 'file_as', ''))
|
||||
self.author = str(x)
|
||||
fa = str(getattr(x, 'file_as', ''))
|
||||
if fa:
|
||||
self.author_sort = fa
|
||||
for x in m.title:
|
||||
if unicode_type(x.file_as):
|
||||
self.title_sort = unicode_type(x.file_as)
|
||||
if str(x.file_as):
|
||||
self.title_sort = str(x.file_as)
|
||||
self.freetext = f2s(m.description)
|
||||
self.category = f2s(m.subject)
|
||||
self.cover = None
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import os
|
||||
|
||||
from ebook_converter.customize.conversion import InputFormatPlugin
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -50,7 +49,7 @@ class MOBIInput(InputFormatPlugin):
|
||||
|
||||
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
|
||||
if raw:
|
||||
if isinstance(raw, unicode_type):
|
||||
if isinstance(raw, str):
|
||||
raw = raw.encode('utf-8')
|
||||
with lopen('debug-raw.html', 'wb') as f:
|
||||
f.write(raw)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from ebook_converter.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -119,7 +118,7 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
if not found:
|
||||
from ebook_converter.ebooks import generate_masthead
|
||||
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
|
||||
raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
|
||||
raw = generate_masthead(str(self.oeb.metadata['title'][0]))
|
||||
id, href = self.oeb.manifest.generate('masthead', 'masthead')
|
||||
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
|
||||
self.oeb.guide.add('masthead', 'Masthead Image', href)
|
||||
@@ -163,7 +162,7 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
sec.nodes.remove(a)
|
||||
|
||||
root = TOC(klass='periodical', href=self.oeb.spine[0].href,
|
||||
title=unicode_type(self.oeb.metadata.title[0]))
|
||||
title=str(self.oeb.metadata.title[0]))
|
||||
|
||||
for s in sections:
|
||||
if articles[id(s)]:
|
||||
|
||||
@@ -6,7 +6,7 @@ import glob, os
|
||||
from ebook_converter.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.polyglot.builtins import iteritems, unicode_type
|
||||
from ebook_converter.polyglot.builtins import iteritems
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -190,8 +190,8 @@ class PDFOutput(OutputFormatPlugin):
|
||||
|
||||
def get_cover_data(self):
|
||||
oeb = self.oeb
|
||||
if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
|
||||
cover_id = unicode_type(oeb.metadata.cover[0])
|
||||
if (oeb.metadata.cover and str(oeb.metadata.cover[0]) in oeb.manifest.ids):
|
||||
cover_id = str(oeb.metadata.cover[0])
|
||||
item = oeb.manifest.ids[cover_id]
|
||||
self.cover_data = item.data
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ import os, io
|
||||
from ebook_converter.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -40,7 +39,7 @@ class PMLOutput(OutputFormatPlugin):
|
||||
|
||||
with TemporaryDirectory('_pmlz_output') as tdir:
|
||||
pmlmlizer = PMLMLizer(log)
|
||||
pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
|
||||
pml = str(pmlmlizer.extract_content(oeb_book, opts))
|
||||
with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
|
||||
out.write(pml.encode(opts.pml_output_encoding, 'replace'))
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from ebook_converter.constants import numeric_version
|
||||
from ebook_converter import walk
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -161,6 +160,6 @@ class RecipeInput(InputFormatPlugin):
|
||||
|
||||
def save_download(self, zf):
|
||||
raw = self.recipe_source
|
||||
if isinstance(raw, unicode_type):
|
||||
if isinstance(raw, str):
|
||||
raw = raw.encode('utf-8')
|
||||
zf.writestr('download.recipe', raw)
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
from ebook_converter.customize.conversion import InputFormatPlugin
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.utils.filenames import ascii_filename
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -73,7 +72,7 @@ class SNBInput(InputFormatPlugin):
|
||||
if d['cover'] != '':
|
||||
oeb.guide.add('cover', 'Cover', d['cover'])
|
||||
|
||||
bookid = unicode_type(uuid.uuid4())
|
||||
bookid = str(uuid.uuid4())
|
||||
oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
||||
for ident in oeb.metadata.identifier:
|
||||
if 'id' in ident.attrib:
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.constants import __appname__, __version__
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -73,20 +72,20 @@ class SNBOutput(OutputFormatPlugin):
|
||||
# Process Meta data
|
||||
meta = oeb_book.metadata
|
||||
if meta.title:
|
||||
title = unicode_type(meta.title[0])
|
||||
title = str(meta.title[0])
|
||||
else:
|
||||
title = ''
|
||||
authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
|
||||
authors = [str(x) for x in meta.creator if x.role == 'aut']
|
||||
if meta.publisher:
|
||||
publishers = unicode_type(meta.publisher[0])
|
||||
publishers = str(meta.publisher[0])
|
||||
else:
|
||||
publishers = ''
|
||||
if meta.language:
|
||||
lang = unicode_type(meta.language[0]).upper()
|
||||
lang = str(meta.language[0]).upper()
|
||||
else:
|
||||
lang = ''
|
||||
if meta.description:
|
||||
abstract = unicode_type(meta.description[0])
|
||||
abstract = str(meta.description[0])
|
||||
else:
|
||||
abstract = ''
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from ebook_converter.utils.zipfile import ZipFile
|
||||
from ebook_converter import (extract, walk, isbytestring, filesystem_encoding,
|
||||
get_types_map)
|
||||
from ebook_converter.constants import __version__
|
||||
from ebook_converter.polyglot.builtins import unicode_type, string_or_bytes
|
||||
from ebook_converter.polyglot.builtins import string_or_bytes
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
@@ -795,7 +795,7 @@ OptionRecommendation(name='search_replace',
|
||||
def unarchive(self, path, tdir):
|
||||
extract(path, tdir)
|
||||
files = list(walk(tdir))
|
||||
files = [f if isinstance(f, unicode_type) else f.decode(filesystem_encoding)
|
||||
files = [f if isinstance(f, str) else f.decode(filesystem_encoding)
|
||||
for f in files]
|
||||
from ebook_converter.customize.ui import available_input_formats
|
||||
fmts = set(available_input_formats())
|
||||
@@ -848,7 +848,7 @@ OptionRecommendation(name='search_replace',
|
||||
rec = self.get_option_by_name(name)
|
||||
help = getattr(rec, 'help', None)
|
||||
if help is not None:
|
||||
return help.replace('%default', unicode_type(rec.recommended_value))
|
||||
return help.replace('%default', str(rec.recommended_value))
|
||||
|
||||
def get_all_help(self):
|
||||
ans = {}
|
||||
@@ -916,7 +916,7 @@ OptionRecommendation(name='search_replace',
|
||||
try:
|
||||
val = parse_date(val, assume_utc=x=='timestamp')
|
||||
except:
|
||||
self.log.exception(_('Failed to parse date/time') + ' ' + unicode_type(val))
|
||||
self.log.exception(_('Failed to parse date/time') + ' ' + str(val))
|
||||
continue
|
||||
setattr(mi, x, val)
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ import functools, re, json
|
||||
from math import ceil
|
||||
|
||||
from ebook_converter import entity_to_unicode, as_unicode
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -72,8 +71,8 @@ def smarten_punctuation(html, log=None):
|
||||
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
|
||||
preprocessor = HeuristicProcessor(log=log)
|
||||
from uuid import uuid4
|
||||
start = 'calibre-smartypants-'+unicode_type(uuid4())
|
||||
stop = 'calibre-smartypants-'+unicode_type(uuid4())
|
||||
start = 'calibre-smartypants-'+str(uuid4())
|
||||
stop = 'calibre-smartypants-'+str(uuid4())
|
||||
html = html.replace('<!--', start)
|
||||
html = html.replace('-->', stop)
|
||||
html = preprocessor.fix_nbsp_indents(html)
|
||||
@@ -149,20 +148,20 @@ class DocAnalysis(object):
|
||||
maxLineLength=1900 # Discard larger than this to stay in range
|
||||
buckets=20 # Each line is divided into a bucket based on length
|
||||
|
||||
# print("there are "+unicode_type(len(lines))+" lines")
|
||||
# print("there are "+str(len(lines))+" lines")
|
||||
# max = 0
|
||||
# for line in self.lines:
|
||||
# l = len(line)
|
||||
# if l > max:
|
||||
# max = l
|
||||
# print("max line found is "+unicode_type(max))
|
||||
# print("max line found is "+str(max))
|
||||
# Build the line length histogram
|
||||
hRaw = [0 for i in range(0,buckets)]
|
||||
for line in self.lines:
|
||||
l = len(line)
|
||||
if l > minLineLength and l < maxLineLength:
|
||||
l = int(l // 100)
|
||||
# print("adding "+unicode_type(l))
|
||||
# print("adding "+str(l))
|
||||
hRaw[l]+=1
|
||||
|
||||
# Normalize the histogram into percents
|
||||
@@ -171,8 +170,8 @@ class DocAnalysis(object):
|
||||
h = [float(count)/totalLines for count in hRaw]
|
||||
else:
|
||||
h = []
|
||||
# print("\nhRaw histogram lengths are: "+unicode_type(hRaw))
|
||||
# print(" percents are: "+unicode_type(h)+"\n")
|
||||
# print("\nhRaw histogram lengths are: "+str(hRaw))
|
||||
# print(" percents are: "+str(h)+"\n")
|
||||
|
||||
# Find the biggest bucket
|
||||
maxValue = 0
|
||||
@@ -184,7 +183,7 @@ class DocAnalysis(object):
|
||||
# print("Line lengths are too variable. Not unwrapping.")
|
||||
return False
|
||||
else:
|
||||
# print(unicode_type(maxValue)+" of the lines were in one bucket")
|
||||
# print(str(maxValue)+" of the lines were in one bucket")
|
||||
return True
|
||||
|
||||
|
||||
@@ -220,8 +219,8 @@ class Dehyphenator(object):
|
||||
wraptags = match.group('wraptags')
|
||||
except:
|
||||
wraptags = ''
|
||||
hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
|
||||
dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
|
||||
hyphenated = str(firsthalf) + "-" + str(secondhalf)
|
||||
dehyphenated = str(firsthalf) + str(secondhalf)
|
||||
if self.suffixes.match(secondhalf) is None:
|
||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||
else:
|
||||
@@ -327,7 +326,7 @@ class CSSPreProcessor(object):
|
||||
# are commented lines before the first @import or @charset rule. Since
|
||||
# the conversion will remove all stylesheets anyway, we don't lose
|
||||
# anything
|
||||
data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
|
||||
data = re.sub(str(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
|
||||
|
||||
ans, namespaced = [], False
|
||||
for line in data.splitlines():
|
||||
@@ -535,7 +534,7 @@ class HTMLPreProcessor(object):
|
||||
docanalysis = DocAnalysis('pdf', html)
|
||||
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
|
||||
if length:
|
||||
# print("The pdf line length returned is " + unicode_type(length))
|
||||
# print("The pdf line length returned is " + str(length))
|
||||
# unwrap em/en dashes
|
||||
end_rules.append((re.compile(
|
||||
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
|
||||
|
||||
@@ -3,7 +3,6 @@ from math import ceil
|
||||
from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from ebook_converter.utils.logging import default_log
|
||||
from ebook_converter.utils.wordcount import get_wordcount_obj
|
||||
from ebook_converter.polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
@@ -51,8 +50,8 @@ class HeuristicProcessor(object):
|
||||
title = match.group('title')
|
||||
if not title:
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
|
||||
" chapters. - " + unicode_type(chap))
|
||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
||||
" chapters. - " + str(chap))
|
||||
return '<h2>'+chap+'</h2>\n'
|
||||
else:
|
||||
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
|
||||
@@ -60,16 +59,16 @@ class HeuristicProcessor(object):
|
||||
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
|
||||
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
|
||||
" chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
|
||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
||||
" chapters & titles. - " + str(chap) + ", " + str(title))
|
||||
return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
|
||||
|
||||
def chapter_break(self, match):
|
||||
chap = match.group('section')
|
||||
styles = match.group('styles')
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
|
||||
" section markers based on punctuation. - " + unicode_type(chap))
|
||||
self.log.debug("marked " + str(self.html_preprocess_sections) +
|
||||
" section markers based on punctuation. - " + str(chap))
|
||||
return '<'+styles+' style="page-break-before:always">'+chap
|
||||
|
||||
def analyze_title_matches(self, match):
|
||||
@@ -112,8 +111,8 @@ class HeuristicProcessor(object):
|
||||
line_end = line_end_ere.findall(raw)
|
||||
tot_htm_ends = len(htm_end)
|
||||
tot_ln_fds = len(line_end)
|
||||
# self.log.debug("There are " + unicode_type(tot_ln_fds) + " total Line feeds, and " +
|
||||
# unicode_type(tot_htm_ends) + " marked up endings")
|
||||
# self.log.debug("There are " + str(tot_ln_fds) + " total Line feeds, and " +
|
||||
# str(tot_htm_ends) + " marked up endings")
|
||||
|
||||
if percent > 1:
|
||||
percent = 1
|
||||
@@ -121,7 +120,7 @@ class HeuristicProcessor(object):
|
||||
percent = 0
|
||||
|
||||
min_lns = tot_ln_fds * percent
|
||||
# self.log.debug("There must be fewer than " + unicode_type(min_lns) + " unmarked lines to add markup")
|
||||
# self.log.debug("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
|
||||
return min_lns > tot_htm_ends
|
||||
|
||||
def dump(self, raw, where):
|
||||
@@ -158,17 +157,17 @@ class HeuristicProcessor(object):
|
||||
]
|
||||
|
||||
ITALICIZE_STYLE_PATS = [
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/'),
|
||||
unicode_type(r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/'),
|
||||
str(r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'),
|
||||
]
|
||||
|
||||
for word in ITALICIZE_WORDS:
|
||||
@@ -178,10 +177,10 @@ class HeuristicProcessor(object):
|
||||
search_text = re.sub(r'<[^>]*>', '', search_text)
|
||||
for pat in ITALICIZE_STYLE_PATS:
|
||||
for match in re.finditer(pat, search_text):
|
||||
ital_string = unicode_type(match.group('words'))
|
||||
# self.log.debug("italicising "+unicode_type(match.group(0))+" with <i>"+ital_string+"</i>")
|
||||
ital_string = str(match.group('words'))
|
||||
# self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
|
||||
try:
|
||||
html = re.sub(re.escape(unicode_type(match.group(0))), '<i>%s</i>' % ital_string, html)
|
||||
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
|
||||
except OverflowError:
|
||||
# match.group(0) was too large to be compiled into a regex
|
||||
continue
|
||||
@@ -206,10 +205,10 @@ class HeuristicProcessor(object):
|
||||
if wordcount > 200000:
|
||||
typical_chapters = 15000.
|
||||
self.min_chapters = int(ceil(wordcount / typical_chapters))
|
||||
self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
|
||||
self.log.debug("minimum chapters required are: "+str(self.min_chapters))
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")
|
||||
self.log.debug("found " + str(self.html_preprocess_sections) + " pre-existing headings")
|
||||
|
||||
# Build the Regular Expressions in pieces
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
@@ -299,7 +298,7 @@ class HeuristicProcessor(object):
|
||||
if n_lookahead_req:
|
||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||
if not analyze:
|
||||
self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
self.log.debug("Marked " + str(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
|
||||
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
|
||||
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||
@@ -313,10 +312,10 @@ class HeuristicProcessor(object):
|
||||
title_req = True
|
||||
strict_title = False
|
||||
self.log.debug(
|
||||
unicode_type(type_name)+" had "+unicode_type(hits)+
|
||||
" hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
|
||||
unicode_type(self.chapters_with_title)+" chapters with titles, "+
|
||||
unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||
str(type_name)+" had "+str(hits)+
|
||||
" hits - "+str(self.chapters_no_title)+" chapters with no title, "+
|
||||
str(self.chapters_with_title)+" chapters with titles, "+
|
||||
str(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||
if type_name == 'common':
|
||||
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
|
||||
@@ -333,8 +332,8 @@ class HeuristicProcessor(object):
|
||||
words_per_chptr = wordcount
|
||||
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
||||
words_per_chptr = wordcount // self.html_preprocess_sections
|
||||
self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
|
||||
unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
|
||||
self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+
|
||||
str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||
return html
|
||||
|
||||
def punctuation_unwrap(self, length, content, format):
|
||||
@@ -364,8 +363,8 @@ class HeuristicProcessor(object):
|
||||
|
||||
# define the pieces of the regex
|
||||
# (?<!\&\w{4});) is a semicolon not part of an entity
|
||||
lookahead = "(?<=.{"+unicode_type(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
|
||||
em_en_lookahead = "(?<=.{"+unicode_type(length)+"}[\u2013\u2014])"
|
||||
lookahead = "(?<=.{"+str(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
|
||||
em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
|
||||
soft_hyphen = "\xad"
|
||||
line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
|
||||
blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
|
||||
@@ -425,18 +424,18 @@ class HeuristicProcessor(object):
|
||||
return html
|
||||
|
||||
def fix_nbsp_indents(self, html):
|
||||
txtindent = re.compile(unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
|
||||
txtindent = re.compile(str(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
|
||||
html = txtindent.sub(self.insert_indent, html)
|
||||
if self.found_indents > 1:
|
||||
self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
|
||||
self.log.debug("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
|
||||
return html
|
||||
|
||||
def cleanup_markup(self, html):
|
||||
# remove remaining non-breaking spaces
|
||||
html = re.sub(unicode_type(r'\u00a0'), ' ', html)
|
||||
html = re.sub(str(r'\u00a0'), ' ', html)
|
||||
# Get rid of various common microsoft specific tags which can cause issues later
|
||||
# Get rid of empty <o:p> tags to simplify other processing
|
||||
html = re.sub(unicode_type(r'\s*<o:p>\s*</o:p>'), ' ', html)
|
||||
html = re.sub(str(r'\s*<o:p>\s*</o:p>'), ' ', html)
|
||||
# Delete microsoft 'smart' tags
|
||||
html = re.sub('(?i)</?st1:\\w+>', '', html)
|
||||
# Re-open self closing paragraph tags
|
||||
@@ -476,8 +475,8 @@ class HeuristicProcessor(object):
|
||||
blanklines = self.blankreg.findall(html)
|
||||
lines = self.linereg.findall(html)
|
||||
if len(lines) > 1:
|
||||
self.log.debug("There are " + unicode_type(len(blanklines)) + " blank lines. " +
|
||||
unicode_type(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||
self.log.debug("There are " + str(len(blanklines)) + " blank lines. " +
|
||||
str(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||
|
||||
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||
return True
|
||||
@@ -499,11 +498,11 @@ class HeuristicProcessor(object):
|
||||
lines = float(len(self.single_blank.findall(to_merge))) - 1.
|
||||
em = base_em + (em_per_line * lines)
|
||||
if to_merge.find('whitespace'):
|
||||
newline = self.any_multi_blank.sub('\n<p class="whitespace'+unicode_type(int(em * 10))+
|
||||
'" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
|
||||
newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+
|
||||
'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
|
||||
else:
|
||||
newline = self.any_multi_blank.sub('\n<p class="softbreak'+unicode_type(int(em * 10))+
|
||||
'" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
|
||||
newline = self.any_multi_blank.sub('\n<p class="softbreak'+str(int(em * 10))+
|
||||
'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
|
||||
return newline
|
||||
|
||||
html = self.any_multi_blank.sub(merge_matches, html)
|
||||
@@ -527,9 +526,9 @@ class HeuristicProcessor(object):
|
||||
top_margin = ''
|
||||
bottom_margin = ''
|
||||
if initblanks is not None:
|
||||
top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(initblanks)))+'em;'
|
||||
top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
|
||||
if endblanks is not None:
|
||||
bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(endblanks)))+'em;'
|
||||
bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
|
||||
|
||||
if initblanks is None and endblanks is None:
|
||||
return content
|
||||
@@ -606,7 +605,7 @@ class HeuristicProcessor(object):
|
||||
else:
|
||||
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
|
||||
divpercent = (100 - width) // 2
|
||||
hr_open = re.sub('45', unicode_type(divpercent), hr_open)
|
||||
hr_open = re.sub('45', str(divpercent), hr_open)
|
||||
scene_break = hr_open+replacement_break+'</div>'
|
||||
else:
|
||||
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
|
||||
@@ -666,12 +665,12 @@ class HeuristicProcessor(object):
|
||||
else:
|
||||
styles = match.group('styles').split(';')
|
||||
is_paragraph = self.check_paragraph(content)
|
||||
# print "styles for this line are: "+unicode_type(styles)
|
||||
# print "styles for this line are: "+str(styles)
|
||||
split_styles = []
|
||||
for style in styles:
|
||||
# print "style is: "+unicode_type(style)
|
||||
# print "style is: "+str(style)
|
||||
newstyle = style.split(':')
|
||||
# print "newstyle is: "+unicode_type(newstyle)
|
||||
# print "newstyle is: "+str(newstyle)
|
||||
split_styles.append(newstyle)
|
||||
styles = split_styles
|
||||
for style, setting in styles:
|
||||
@@ -682,7 +681,7 @@ class HeuristicProcessor(object):
|
||||
if 9 < setting < 14:
|
||||
text_indent = indented_text
|
||||
else:
|
||||
text_indent = style+':'+unicode_type(setting)+'pt;'
|
||||
text_indent = style+':'+str(setting)+'pt;'
|
||||
if style == 'padding':
|
||||
setting = re.sub('pt', '', setting).split(' ')
|
||||
if int(setting[1]) < 16 and int(setting[3]) < 16:
|
||||
@@ -703,23 +702,23 @@ class HeuristicProcessor(object):
|
||||
blockquote_open_loop = blockquote_open
|
||||
if debugabby:
|
||||
self.log.debug('\n\n******\n')
|
||||
self.log.debug('padding top is: '+unicode_type(setting[0]))
|
||||
self.log.debug('padding right is:' +unicode_type(setting[1]))
|
||||
self.log.debug('padding bottom is: ' + unicode_type(setting[2]))
|
||||
self.log.debug('padding left is: ' +unicode_type(setting[3]))
|
||||
self.log.debug('padding top is: '+str(setting[0]))
|
||||
self.log.debug('padding right is:' +str(setting[1]))
|
||||
self.log.debug('padding bottom is: ' + str(setting[2]))
|
||||
self.log.debug('padding left is: ' +str(setting[3]))
|
||||
|
||||
# print "text-align is: "+unicode_type(text_align)
|
||||
# print "\n***\nline is:\n "+unicode_type(match.group(0))+'\n'
|
||||
# print "text-align is: "+str(text_align)
|
||||
# print "\n***\nline is:\n "+str(match.group(0))+'\n'
|
||||
if debugabby:
|
||||
# print "this line is a paragraph = "+unicode_type(is_paragraph)+", previous line was "+unicode_type(self.previous_was_paragraph)
|
||||
# print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
|
||||
self.log.debug("styles for this line were:", styles)
|
||||
self.log.debug('newline is:')
|
||||
self.log.debug(blockquote_open_loop+blockquote_close_loop+
|
||||
paragraph_before+'<p style="'+text_indent+text_align+
|
||||
'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
|
||||
# print "is_paragraph is "+unicode_type(is_paragraph)+", previous_was_paragraph is "+unicode_type(self.previous_was_paragraph)
|
||||
# print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
|
||||
self.previous_was_paragraph = is_paragraph
|
||||
# print "previous_was_paragraph is now set to "+unicode_type(self.previous_was_paragraph)+"\n\n\n"
|
||||
# print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
|
||||
return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
|
||||
|
||||
html = abbyy_line.sub(convert_styles, html)
|
||||
@@ -802,12 +801,12 @@ class HeuristicProcessor(object):
|
||||
# more of the lines break in the same region of the document then unwrapping is required
|
||||
docanalysis = DocAnalysis(format, html)
|
||||
hardbreaks = docanalysis.line_histogram(.50)
|
||||
self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))
|
||||
self.log.debug("Hard line breaks check returned "+str(hardbreaks))
|
||||
|
||||
# Calculate Length
|
||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||
length = docanalysis.line_length(unwrap_factor)
|
||||
self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")
|
||||
self.log.debug("Median line length is " + str(length) + ", calculated with " + format + " format")
|
||||
|
||||
# ##### Unwrap lines ######
|
||||
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||
@@ -829,7 +828,7 @@ class HeuristicProcessor(object):
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||
self.log.debug("Looking for more split points based on punctuation,"
|
||||
" currently have " + unicode_type(self.html_preprocess_sections))
|
||||
" currently have " + str(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(
|
||||
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
|
||||
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
|
||||
|
||||
Reference in New Issue
Block a user