Moved entity_to_unicode function to utils.entities module.

2021-01-03 18:52:13 +01:00
parent 3152c52839
commit 839cc3c79a
6 changed files with 20 additions and 76 deletions
@@ -1,11 +1,10 @@
-import html
 import os
 import re

 from functools import partial

 from ebook_converter import constants_old
-from ebook_converter.ebooks.html_entities import html5_entities
+from ebook_converter.utils import entities


 class CurrentDir(object):
@@ -27,69 +26,8 @@ class CurrentDir(object):
            pass


-def entity_to_unicode(match, exceptions=[], encoding='cp1252',
-                      result_exceptions={}):
-    """
-    :param match: A match object such that '&'+match.group(1)';' is the entity.
-
-    :param exceptions: A list of entities to not convert (Each entry is the
-                       name of the entity, for e.g. 'apos' or '#1234'
-
-    :param encoding: The encoding to use to decode numeric entities between
-                     128 and 256. If None, the Unicode UCS encoding is used.
-                     A common encoding is cp1252.
-
-    :param result_exceptions: A mapping of characters to entities. If the
-                              result is in result_exceptions,
-                              result_exception[result] is returned instead.
-                              Convenient way to specify exception for things
-                              like < or > that can be specified by various
-                              actual entities.
-    """
-
-    def my_unichr(num):
-        try:
-            return chr(num)
-        except (ValueError, OverflowError):
-            return '?'
-
-    def check(ch):
-        return result_exceptions.get(ch, ch)
-
-    ent = match.group(1)
-    if ent in exceptions:
-        return '&'+ent+';'
-    # squot is generated by some broken CMS software
-    if ent in {'apos', 'squot'}:
-        return check("'")
-    if ent == 'hellips':
-        ent = 'hellip'
-    if ent.startswith('#'):
-        try:
-            if ent[1] in ('x', 'X'):
-                num = int(ent[2:], 16)
-            else:
-                num = int(ent[1:])
-        except Exception:
-            return '&'+ent+';'
-        if encoding is None or num > 255:
-            return check(my_unichr(num))
-        try:
-            return check(bytes(bytearray((num,))).decode(encoding))
-        except UnicodeDecodeError:
-            return check(my_unichr(num))
-    try:
-        return check(html5_entities[ent])
-    except KeyError:
-        pass
-    try:
-        return check(my_unichr(html.entities.name2codepoint[ent]))
-    except KeyError:
-        return '&'+ent+';'
-
-
 _ent_pat = re.compile(r'&(\S+?);')
-xml_entity_to_unicode = partial(entity_to_unicode,
+xml_entity_to_unicode = partial(entities.entity_to_unicode,
                                result_exceptions={'"': '&quot;',
                                                   "'": '&apos;',
                                                   '<': '&lt;',
@@ -98,7 +36,8 @@ xml_entity_to_unicode = partial(entity_to_unicode,


 def replace_entities(raw, encoding='cp1252'):
-    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)
+    return _ent_pat.sub(partial(entities.entity_to_unicode, encoding=encoding),
+                        raw)


 def xml_replace_entities(raw, encoding='cp1252'):
@@ -106,7 +45,7 @@ def xml_replace_entities(raw, encoding='cp1252'):


 def prepare_string_for_xml(raw, attribute=False):
-    raw = _ent_pat.sub(entity_to_unicode, raw)
+    raw = _ent_pat.sub(entities.entity_to_unicode, raw)
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        raw = raw.replace('"', '&quot;').replace("'", '&apos;')