Removed polyglot's unicode_type usage

2026-04-11 07:33:35 +02:00 · 2020-04-20 19:25:28 +02:00
parent ef7e2b10be
commit 128705f258
130 changed files with 657 additions and 716 deletions
--- a/ebook_converter/ebooks/lrf/html/convert_from.py
+++ b/ebook_converter/ebooks/lrf/html/convert_from.py
@@ -37,7 +37,7 @@ from ebook_converter.ebooks.lrf.pylrs.pylrs import (
    RuledLine, Span, Sub, Sup, TextBlock
 )
 from ebook_converter.ptempfile import PersistentTemporaryFile
-from ebook_converter.polyglot.builtins import getcwd, itervalues, string_or_bytes, unicode_type
+from ebook_converter.polyglot.builtins import getcwd, itervalues, string_or_bytes
 from ebook_converter.polyglot.urllib import unquote

 from PIL import Image as PILImage
@@ -276,7 +276,7 @@ class HTMLConverter(object):
                update_css(npcss, self.override_pcss)

        paths = [os.path.abspath(path) for path in paths]
-        paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, unicode_type) else path for path in paths]
+        paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, str) else path for path in paths]

        while len(paths) > 0 and self.link_level <= self.link_levels:
            for path in paths:
@@ -356,7 +356,7 @@ class HTMLConverter(object):
                os.makedirs(tdir)
            try:
                with open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') as f:
-                    f.write(unicode_type(soup).encode('utf-8'))
+                    f.write(str(soup).encode('utf-8'))
                    self.log.info(_('Written preprocessed HTML to ')+f.name)
            except:
                pass
@@ -389,7 +389,7 @@ class HTMLConverter(object):
        self.log.info(_('\tConverting to BBeB...'))
        self.current_style = {}
        self.page_break_found = False
-        if not isinstance(path, unicode_type):
+        if not isinstance(path, str):
            path = path.decode(sys.getfilesystemencoding())
        self.target_prefix = path
        self.previous_text = '\n'
@@ -399,7 +399,7 @@ class HTMLConverter(object):
    def parse_css(self, style):
        """
        Parse the contents of a <style> tag or .css file.
-        @param style: C{unicode_type(style)} should be the CSS to parse.
+        @param style: C{str(style)} should be the CSS to parse.
        @return: A dictionary with one entry per selector where the key is the
        selector name and the value is a dictionary of properties
        """
@@ -587,7 +587,7 @@ class HTMLConverter(object):
            if isinstance(c, HTMLConverter.IGNORED_TAGS):
                continue
            if isinstance(c, bs4.NavigableString):
-                text += unicode_type(c)
+                text += str(c)
            elif isinstance(c, bs4.Tag):
                if c.name.lower() == 'img' and c.has_attr('alt'):
                    alt_text += c['alt']
@@ -642,7 +642,7 @@ class HTMLConverter(object):
            para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
            ascii_text = text

-            if not isinstance(path, unicode_type):
+            if not isinstance(path, str):
                path = path.decode(sys.getfilesystemencoding())
            if path in self.processed_files:
                if path+fragment in self.targets.keys():
@@ -1085,7 +1085,7 @@ class HTMLConverter(object):

        s1, s2 = get('margin'), get('padding')

-        bl = unicode_type(self.current_block.blockStyle.attrs['blockwidth'])+'px'
+        bl = str(self.current_block.blockStyle.attrs['blockwidth'])+'px'

        def set(default, one, two):
            fval = None
@@ -1214,7 +1214,7 @@ class HTMLConverter(object):
                    ans = 120
            if ans is not None:
                ans += int(self.font_delta * 20)
-                ans = unicode_type(ans)
+                ans = str(ans)
            return ans

        family, weight, style, variant = 'serif', 'normal', 'normal', None
@@ -1320,10 +1320,10 @@ class HTMLConverter(object):
    def text_properties(self, tag_css):
        indent = self.book.defaultTextStyle.attrs['parindent']
        if 'text-indent' in tag_css:
-            bl = unicode_type(self.current_block.blockStyle.attrs['blockwidth'])+'px'
+            bl = str(self.current_block.blockStyle.attrs['blockwidth'])+'px'
            if 'em' in tag_css['text-indent']:
                bl = '10pt'
-            indent = self.unit_convert(unicode_type(tag_css['text-indent']), pts=True, base_length=bl)
+            indent = self.unit_convert(str(tag_css['text-indent']), pts=True, base_length=bl)
            if not indent:
                indent = 0
            if indent > 0 and indent < 10 * self.minimum_indent:
@@ -1518,11 +1518,11 @@ class HTMLConverter(object):
                    elif not urllib.parse.urlparse(tag['src'])[0]:
                        self.log.warn('Could not find image: '+tag['src'])
                else:
-                    self.log.debug("Failed to process: %s"%unicode_type(tag))
+                    self.log.debug("Failed to process: %s"%str(tag))
            elif tagname in ['style', 'link']:
                ncss, npcss = {}, {}
                if tagname == 'style':
-                    text = ''.join([unicode_type(i) for i in tag.findAll(text=True)])
+                    text = ''.join([str(i) for i in tag.findAll(text=True)])
                    css, pcss = self.parse_css(text)
                    ncss.update(css)
                    npcss.update(pcss)
@@ -1554,7 +1554,7 @@ class HTMLConverter(object):
                if tag.contents:
                    c = tag.contents[0]
                    if isinstance(c, bs4.NavigableString):
-                        c = unicode_type(c).replace('\r\n', '\n').replace('\r', '\n')
+                        c = str(c).replace('\r\n', '\n').replace('\r', '\n')
                        if c.startswith('\n'):
                            c = c[1:]
                            tag.contents[0] = bs4.NavigableString(c)
@@ -1612,7 +1612,7 @@ class HTMLConverter(object):
                            in_ol = parent.name.lower() == 'ol'
                            break
                        parent = parent.parent
-                    prepend = unicode_type(self.list_counter)+'. ' if in_ol else '\u2022' + ' '
+                    prepend = str(self.list_counter)+'. ' if in_ol else '\u2022' + ' '
                    self.current_para.append(Span(prepend))
                    self.process_children(tag, tag_css, tag_pseudo_css)
                    if in_ol:
@@ -1655,7 +1655,7 @@ class HTMLConverter(object):

                if (self.anchor_ids and tag.has_attr('id')) or (self.book_designer and tag.get('class') in ('title', ['title'])):
                    if not tag.has_attr('id'):
-                        tag['id'] = __appname__+'_id_'+unicode_type(self.id_counter)
+                        tag['id'] = __appname__+'_id_'+str(self.id_counter)
                        self.id_counter += 1

                    tkey = self.target_prefix+tag['id']
@@ -1728,7 +1728,7 @@ class HTMLConverter(object):
                except Exception as err:
                    self.log.warning(_('An error occurred while processing a table: %s. Ignoring table markup.')%repr(err))
                    self.log.exception('')
-                    self.log.debug(_('Bad table:\n%s')%unicode_type(tag)[:300])
+                    self.log.debug(_('Bad table:\n%s')%str(tag)[:300])
                    self.in_table = False
                    self.process_children(tag, tag_css, tag_pseudo_css)
                finally:
@@ -1824,9 +1824,9 @@ def process_file(path, options, logger):

    for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
        val = getattr(options, prop, None)
-        if val and not isinstance(val, unicode_type):
+        if val and not isinstance(val, str):
            soup = html5_parser(val)
-            setattr(options, prop, unicode_type(soup))
+            setattr(options, prop, str(soup))

    title = (options.title, options.title_sort)
    author = (options.author, options.author_sort)
@@ -1870,7 +1870,7 @@ def process_file(path, options, logger):
    options.force_page_break = fpb
    options.link_exclude = le
    options.page_break = pb
-    if not isinstance(options.chapter_regex, unicode_type):
+    if not isinstance(options.chapter_regex, str):
        options.chapter_regex = options.chapter_regex.decode(preferred_encoding)
    options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
    fpba = options.force_page_break_attr.split(',')