From 839cc3c79a7bc9ad98e9c60c1fb3bd396b6c7cf8 Mon Sep 17 00:00:00 2001
From: gryf <git@vimja.com>
Date: Sun, 3 Jan 2021 18:52:13 +0100
Subject: [PATCH] Moved entity_to_unicode function to utils.entities module.

---
 ebook_converter/__init__.py                   | 71 ++-----------------
 .../ebooks/conversion/preprocess.py           |  4 +-
 .../ebooks/lrf/html/convert_from.py           |  4 +-
 ebook_converter/ebooks/lrf/objects.py         |  6 +-
 ebook_converter/ebooks/lrf/pylrs/pylrs.py     |  5 +-
 ebook_converter/ebooks/mobi/reader/mobi6.py   |  6 +-
 6 files changed, 20 insertions(+), 76 deletions(-)

diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py
index 07a972e..edfaa45 100644
--- a/ebook_converter/__init__.py
+++ b/ebook_converter/__init__.py
@@ -1,11 +1,10 @@
-import html
 import os
 import re
 
 from functools import partial
 
 from ebook_converter import constants_old
-from ebook_converter.ebooks.html_entities import html5_entities
+from ebook_converter.utils import entities
 
 
 class CurrentDir(object):
@@ -27,69 +26,8 @@ class CurrentDir(object):
             pass
 
 
-def entity_to_unicode(match, exceptions=[], encoding='cp1252',
-                      result_exceptions={}):
-    """
-    :param match: A match object such that '&'+match.group(1)';' is the entity.
-
-    :param exceptions: A list of entities to not convert (Each entry is the
-                       name of the entity, for e.g. 'apos' or '#1234'
-
-    :param encoding: The encoding to use to decode numeric entities between
-                     128 and 256. If None, the Unicode UCS encoding is used.
-                     A common encoding is cp1252.
-
-    :param result_exceptions: A mapping of characters to entities. If the
-                              result is in result_exceptions,
-                              result_exception[result] is returned instead.
-                              Convenient way to specify exception for things
-                              like < or > that can be specified by various
-                              actual entities.
-    """
-
-    def my_unichr(num):
-        try:
-            return chr(num)
-        except (ValueError, OverflowError):
-            return '?'
-
-    def check(ch):
-        return result_exceptions.get(ch, ch)
-
-    ent = match.group(1)
-    if ent in exceptions:
-        return '&'+ent+';'
-    # squot is generated by some broken CMS software
-    if ent in {'apos', 'squot'}:
-        return check("'")
-    if ent == 'hellips':
-        ent = 'hellip'
-    if ent.startswith('#'):
-        try:
-            if ent[1] in ('x', 'X'):
-                num = int(ent[2:], 16)
-            else:
-                num = int(ent[1:])
-        except Exception:
-            return '&'+ent+';'
-        if encoding is None or num > 255:
-            return check(my_unichr(num))
-        try:
-            return check(bytes(bytearray((num,))).decode(encoding))
-        except UnicodeDecodeError:
-            return check(my_unichr(num))
-    try:
-        return check(html5_entities[ent])
-    except KeyError:
-        pass
-    try:
-        return check(my_unichr(html.entities.name2codepoint[ent]))
-    except KeyError:
-        return '&'+ent+';'
-
-
 _ent_pat = re.compile(r'&(\S+?);')
-xml_entity_to_unicode = partial(entity_to_unicode,
+xml_entity_to_unicode = partial(entities.entity_to_unicode,
                                 result_exceptions={'"': '&quot;',
                                                    "'": '&apos;',
                                                    '<': '&lt;',
@@ -98,7 +36,8 @@ xml_entity_to_unicode = partial(entity_to_unicode,
 
 
 def replace_entities(raw, encoding='cp1252'):
-    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)
+    decode = partial(entities.entity_to_unicode, encoding=encoding)
+    return _ent_pat.sub(decode, raw)
 
 
 def xml_replace_entities(raw, encoding='cp1252'):
@@ -106,7 +45,7 @@ def xml_replace_entities(raw, encoding='cp1252'):
 
 
 def prepare_string_for_xml(raw, attribute=False):
-    raw = _ent_pat.sub(entity_to_unicode, raw)
+    raw = _ent_pat.sub(entities.entity_to_unicode, raw)
     raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
     if attribute:
         raw = raw.replace('"', '&quot;').replace("'", '&apos;')
diff --git a/ebook_converter/ebooks/conversion/preprocess.py b/ebook_converter/ebooks/conversion/preprocess.py
index 9f8bfcf..212821a 100644
--- a/ebook_converter/ebooks/conversion/preprocess.py
+++ b/ebook_converter/ebooks/conversion/preprocess.py
@@ -3,14 +3,14 @@ import json
 import math
 import re
 
-from ebook_converter import entity_to_unicode
+from ebook_converter.utils import entities
 
 
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'
 
-convert_entities = functools.partial(entity_to_unicode,
+convert_entities = functools.partial(entities.entity_to_unicode,
                                      result_exceptions={'<': '&lt;',
                                                         '>': '&gt;',
                                                         "'": '&apos;',
diff --git a/ebook_converter/ebooks/lrf/html/convert_from.py b/ebook_converter/ebooks/lrf/html/convert_from.py
index f8b9d75..813d8a1 100644
--- a/ebook_converter/ebooks/lrf/html/convert_from.py
+++ b/ebook_converter/ebooks/lrf/html/convert_from.py
@@ -21,7 +21,6 @@ import math
 import bs4
 from PIL import Image as PILImage
 
-from ebook_converter import entity_to_unicode
 from ebook_converter.constants_old import __appname__, filesystem_encoding, \
         preferred_encoding
 from ebook_converter.devices.interface import DevicePlugin as Device
@@ -39,6 +38,7 @@ from ebook_converter.ebooks.lrf.pylrs.pylrs import (
 from ebook_converter.ptempfile import PersistentTemporaryFile
 from ebook_converter.utils import encoding as uenc
 from ebook_converter.utils import img as uimg
+from ebook_converter.utils import entities
 
 
 def strip_style_comments(match):
@@ -90,7 +90,7 @@ MARKUP_MASSAGE = [  # Close <a /> tags
 
                   # Replace entities
                   (re.compile(r'&(\S+?);'),
-                   functools.partial(entity_to_unicode,
+                   functools.partial(entities.entity_to_unicode,
                                      exceptions=['lt', 'gt', 'amp', 'quot'])),
 
                   # Remove comments from within style tags as they can mess up
diff --git a/ebook_converter/ebooks/lrf/objects.py b/ebook_converter/ebooks/lrf/objects.py
index 306300f..81c93ba 100644
--- a/ebook_converter/ebooks/lrf/objects.py
+++ b/ebook_converter/ebooks/lrf/objects.py
@@ -6,8 +6,9 @@ import struct
 import zlib
 
 from ebook_converter.ebooks.lrf import LRFParseError, PRS500_PROFILE
-from ebook_converter import entity_to_unicode, prepare_string_for_xml
+from ebook_converter import prepare_string_for_xml
 from ebook_converter.ebooks.lrf.tags import Tag
+from ebook_converter.utils import entities
 
 ruby_tags = {0xF575: ['rubyAlignAndAdjust', 'W'],
              0xF576: ['rubyoverhang', 'W', {0: 'none', 1: 'auto'}],
@@ -713,7 +714,8 @@ class Text(LRFStream):
         s = str(text, "utf-16-le")
         if s:
             s = s.translate(self.text_map)
-            self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
+            self.content.append(self.entity_pattern
+                                .sub(entities.entity_to_unicode, s))
 
     def end_container(self, tag, stream):
         self.content.append(None)
diff --git a/ebook_converter/ebooks/lrf/pylrs/pylrs.py b/ebook_converter/ebooks/lrf/pylrs/pylrs.py
index bb30f9d..7fc4d22 100644
--- a/ebook_converter/ebooks/lrf/pylrs/pylrs.py
+++ b/ebook_converter/ebooks/lrf/pylrs/pylrs.py
@@ -50,7 +50,7 @@ DEFAULT_SOURCE_ENCODING = "cp1252"      # default is us-windows character set
 DEFAULT_GENREADING      = "fs"          # default is yes to both lrf and lrs
 
 from ebook_converter.constants_old import __appname__, __version__
-from ebook_converter import entity_to_unicode
+from ebook_converter.utils import entities
 
 
 class LrsError(Exception):
@@ -737,7 +737,8 @@ class TableOfContents(object):
 class TocLabel(object):
 
     def __init__(self, label, textBlock):
-        self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label))
+        decoded = re.sub(r'&(\S+?);', entities.entity_to_unicode, label)
+        self.label = escape(decoded)
         self.textBlock = textBlock
 
     def toElement(self, se):
diff --git a/ebook_converter/ebooks/mobi/reader/mobi6.py b/ebook_converter/ebooks/mobi/reader/mobi6.py
index 014b846..413a753 100644
--- a/ebook_converter/ebooks/mobi/reader/mobi6.py
+++ b/ebook_converter/ebooks/mobi/reader/mobi6.py
@@ -9,7 +9,7 @@ import textwrap
 
 from lxml import html, etree
 
-from ebook_converter import xml_entity_to_unicode, entity_to_unicode
+from ebook_converter import xml_entity_to_unicode
 from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from ebook_converter.ebooks import DRMError, unit_convert
 from ebook_converter.ebooks.chardet import strip_encoding_declarations
@@ -20,6 +20,7 @@ from ebook_converter.ebooks.metadata import MetaInformation
 from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
 from ebook_converter.ebooks.metadata.toc import TOC
 from ebook_converter.ebooks.mobi.reader.headers import BookHeader
+from ebook_converter.utils import entities
 from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
 from ebook_converter.utils.img import AnimatedGIF
 from ebook_converter.utils.imghdr import what
@@ -759,7 +760,8 @@ class MobiReader(object):
                                                          ':text()')])
                             except Exception:
                                 text = ''
-                            text = ent_pat.sub(entity_to_unicode, text)
+                            text = ent_pat.sub(entities.entity_to_unicode,
+                                               text)
                             item = tocobj.add_item(toc.partition('#')[0],
                                                    href[1:], text)
                             item.left_space = int(self.get_left_whitespace(x))