From 839cc3c79a7bc9ad98e9c60c1fb3bd396b6c7cf8 Mon Sep 17 00:00:00 2001 From: gryf Date: Sun, 3 Jan 2021 18:52:13 +0100 Subject: [PATCH] Moved entity_to_unicode function to utils.entities module. --- ebook_converter/__init__.py | 71 ++----------------- .../ebooks/conversion/preprocess.py | 4 +- .../ebooks/lrf/html/convert_from.py | 4 +- ebook_converter/ebooks/lrf/objects.py | 6 +- ebook_converter/ebooks/lrf/pylrs/pylrs.py | 5 +- ebook_converter/ebooks/mobi/reader/mobi6.py | 6 +- 6 files changed, 20 insertions(+), 76 deletions(-) diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py index 07a972e..edfaa45 100644 --- a/ebook_converter/__init__.py +++ b/ebook_converter/__init__.py @@ -1,11 +1,10 @@ -import html import os import re from functools import partial from ebook_converter import constants_old -from ebook_converter.ebooks.html_entities import html5_entities +from ebook_converter.utils import entities class CurrentDir(object): @@ -27,69 +26,8 @@ class CurrentDir(object): pass -def entity_to_unicode(match, exceptions=[], encoding='cp1252', - result_exceptions={}): - """ - :param match: A match object such that '&'+match.group(1)';' is the entity. - - :param exceptions: A list of entities to not convert (Each entry is the - name of the entity, for e.g. 'apos' or '#1234' - - :param encoding: The encoding to use to decode numeric entities between - 128 and 256. If None, the Unicode UCS encoding is used. - A common encoding is cp1252. - - :param result_exceptions: A mapping of characters to entities. If the - result is in result_exceptions, - result_exception[result] is returned instead. - Convenient way to specify exception for things - like < or > that can be specified by various - actual entities. - """ - - def my_unichr(num): - try: - return chr(num) - except (ValueError, OverflowError): - return '?' 
- - def check(ch): - return result_exceptions.get(ch, ch) - - ent = match.group(1) - if ent in exceptions: - return '&'+ent+';' - # squot is generated by some broken CMS software - if ent in {'apos', 'squot'}: - return check("'") - if ent == 'hellips': - ent = 'hellip' - if ent.startswith('#'): - try: - if ent[1] in ('x', 'X'): - num = int(ent[2:], 16) - else: - num = int(ent[1:]) - except Exception: - return '&'+ent+';' - if encoding is None or num > 255: - return check(my_unichr(num)) - try: - return check(bytes(bytearray((num,))).decode(encoding)) - except UnicodeDecodeError: - return check(my_unichr(num)) - try: - return check(html5_entities[ent]) - except KeyError: - pass - try: - return check(my_unichr(html.entities.name2codepoint[ent])) - except KeyError: - return '&'+ent+';' - - _ent_pat = re.compile(r'&(\S+?);') -xml_entity_to_unicode = partial(entity_to_unicode, +xml_entity_to_unicode = partial(entities.entity_to_unicode, result_exceptions={'"': '&quot;', "'": '&apos;', '<': '&lt;', @@ -98,7 +36,8 @@ xml_entity_to_unicode = partial(entity_to_unicode, def replace_entities(raw, encoding='cp1252'): - return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw) + return _ent_pat.sub(partial(entities.entity_to_unicode, encoding=encoding), + raw) def xml_replace_entities(raw, encoding='cp1252'): @@ -106,7 +45,7 @@ def xml_replace_entities(raw, encoding='cp1252'): def prepare_string_for_xml(raw, attribute=False): - raw = _ent_pat.sub(entity_to_unicode, raw) + raw = _ent_pat.sub(entities.entity_to_unicode, raw) raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') if attribute: raw = raw.replace('"', '&quot;').replace("'", '&apos;') diff --git a/ebook_converter/ebooks/conversion/preprocess.py b/ebook_converter/ebooks/conversion/preprocess.py index 9f8bfcf..212821a 100644 --- a/ebook_converter/ebooks/conversion/preprocess.py +++ b/ebook_converter/ebooks/conversion/preprocess.py @@ -3,14 +3,14 @@ import json import math import re -from ebook_converter import 
entity_to_unicode +from ebook_converter.utils import entities XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' -convert_entities = functools.partial(entity_to_unicode, +convert_entities = functools.partial(entities.entity_to_unicode, result_exceptions={'<': '&lt;', '>': '&gt;', "'": '&apos;', diff --git a/ebook_converter/ebooks/lrf/html/convert_from.py b/ebook_converter/ebooks/lrf/html/convert_from.py index f8b9d75..813d8a1 100644 --- a/ebook_converter/ebooks/lrf/html/convert_from.py +++ b/ebook_converter/ebooks/lrf/html/convert_from.py @@ -21,7 +21,6 @@ import math import bs4 from PIL import Image as PILImage -from ebook_converter import entity_to_unicode from ebook_converter.constants_old import __appname__, filesystem_encoding, \ preferred_encoding from ebook_converter.devices.interface import DevicePlugin as Device @@ -39,6 +38,7 @@ from ebook_converter.ebooks.lrf.pylrs.pylrs import ( from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.utils import encoding as uenc from ebook_converter.utils import img as uimg +from ebook_converter.utils import entities def strip_style_comments(match): @@ -90,7 +90,7 @@ MARKUP_MASSAGE = [ # Close tags # Replace entities (re.compile(r'&(\S+?);'), - functools.partial(entity_to_unicode, + functools.partial(entities.entity_to_unicode, exceptions=['lt', 'gt', 'amp', 'quot'])), # Remove comments from within style tags as they can mess up diff --git a/ebook_converter/ebooks/lrf/objects.py b/ebook_converter/ebooks/lrf/objects.py index 306300f..81c93ba 100644 --- a/ebook_converter/ebooks/lrf/objects.py +++ b/ebook_converter/ebooks/lrf/objects.py @@ -6,8 +6,9 @@ import struct import zlib from ebook_converter.ebooks.lrf import LRFParseError, PRS500_PROFILE -from ebook_converter import entity_to_unicode, prepare_string_for_xml +from ebook_converter import prepare_string_for_xml from ebook_converter.ebooks.lrf.tags import Tag +from 
ebook_converter.utils import entities ruby_tags = {0xF575: ['rubyAlignAndAdjust', 'W'], 0xF576: ['rubyoverhang', 'W', {0: 'none', 1: 'auto'}], @@ -713,7 +714,8 @@ class Text(LRFStream): s = str(text, "utf-16-le") if s: s = s.translate(self.text_map) - self.content.append(self.entity_pattern.sub(entity_to_unicode, s)) + self.content.append(self.entity_pattern + .sub(entities.entity_to_unicode, s)) def end_container(self, tag, stream): self.content.append(None) diff --git a/ebook_converter/ebooks/lrf/pylrs/pylrs.py b/ebook_converter/ebooks/lrf/pylrs/pylrs.py index bb30f9d..7fc4d22 100644 --- a/ebook_converter/ebooks/lrf/pylrs/pylrs.py +++ b/ebook_converter/ebooks/lrf/pylrs/pylrs.py @@ -50,7 +50,7 @@ DEFAULT_SOURCE_ENCODING = "cp1252" # default is us-windows character set DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs from ebook_converter.constants_old import __appname__, __version__ -from ebook_converter import entity_to_unicode +from ebook_converter.utils import entities class LrsError(Exception): @@ -737,7 +737,8 @@ class TableOfContents(object): class TocLabel(object): def __init__(self, label, textBlock): - self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label)) + self.label = escape(re.sub(r'&(\S+?);', entities.entity_to_unicode, + label)) self.textBlock = textBlock def toElement(self, se): diff --git a/ebook_converter/ebooks/mobi/reader/mobi6.py b/ebook_converter/ebooks/mobi/reader/mobi6.py index 014b846..413a753 100644 --- a/ebook_converter/ebooks/mobi/reader/mobi6.py +++ b/ebook_converter/ebooks/mobi/reader/mobi6.py @@ -9,7 +9,7 @@ import textwrap from lxml import html, etree -from ebook_converter import xml_entity_to_unicode, entity_to_unicode +from ebook_converter import xml_entity_to_unicode from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from ebook_converter.ebooks import DRMError, unit_convert from ebook_converter.ebooks.chardet import strip_encoding_declarations @@ -20,6 +20,7 @@ from 
ebook_converter.ebooks.metadata import MetaInformation from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.mobi.reader.headers import BookHeader +from ebook_converter.utils import entities from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data from ebook_converter.utils.img import AnimatedGIF from ebook_converter.utils.imghdr import what @@ -759,7 +760,8 @@ class MobiReader(object): ':text()')]) except Exception: text = '' - text = ent_pat.sub(entity_to_unicode, text) + text = ent_pat.sub(entities.entity_to_unicode, + text) item = tocobj.add_item(toc.partition('#')[0], href[1:], text) item.left_space = int(self.get_left_whitespace(x))