diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py index 86292fd..8fec1d1 100644 --- a/ebook_converter/__init__.py +++ b/ebook_converter/__init__.py @@ -29,11 +29,6 @@ class CurrentDir(object): _ent_pat = re.compile(r'&(\S+?);') -def xml_replace_entities(raw, encoding='cp1252'): - return _ent_pat.sub(partial(entities.xml_entity_to_unicode, - encoding=encoding), raw) - - def prepare_string_for_xml(raw, attribute=False): raw = _ent_pat.sub(entities.entity_to_unicode, raw) raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') diff --git a/ebook_converter/ebooks/oeb/parse_utils.py b/ebook_converter/ebooks/oeb/parse_utils.py index 6b84eab..3d0a119 100644 --- a/ebook_converter/ebooks/oeb/parse_utils.py +++ b/ebook_converter/ebooks/oeb/parse_utils.py @@ -5,10 +5,10 @@ from lxml import etree from lxml import html from ebook_converter import constants as const -from ebook_converter import xml_replace_entities from ebook_converter.constants_old import filesystem_encoding from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from ebook_converter.utils import encoding as uenc +from ebook_converter.utils import entities RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, @@ -208,9 +208,8 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, data = etree.fromstring(data) check_for_html5(pre, data) except (HTML5Doc, etree.XMLSyntaxError): - log.debug('Initial parse failed, using more' - ' forgiving parsers') - raw = data = xml_replace_entities(raw) + log.debug('Initial parse failed, using more forgiving parsers') + raw = data = entities.xml_replace_entities(raw) try: data = etree.fromstring(data) check_for_html5(pre, data) diff --git a/ebook_converter/ebooks/oeb/polish/parsing.py b/ebook_converter/ebooks/oeb/polish/parsing.py index 99cbd5e..4dfeac1 100644 --- a/ebook_converter/ebooks/oeb/polish/parsing.py +++ b/ebook_converter/ebooks/oeb/polish/parsing.py @@ -4,10 +4,10 @@ from lxml import 
etree import html5_parser from ebook_converter import constants as const -from ebook_converter import xml_replace_entities from ebook_converter.ebooks.chardet import strip_encoding_declarations from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.utils.cleantext import clean_xml_chars +from ebook_converter.utils import entities def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, @@ -16,7 +16,7 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) if replace_entities: - raw = xml_replace_entities(raw) + raw = entities.xml_replace_entities(raw) if fix_newlines: raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = clean_xml_chars(raw) @@ -61,7 +61,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = handle_private_entities(raw) if replace_entities: - raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0; + # Handle &#0; + raw = entities.xml_replace_entities(raw).replace('\0', '') raw = raw.replace('\r\n', '\n').replace('\r', '\n') # Remove any preamble before the opening html tag as it can cause problems, diff --git a/ebook_converter/ebooks/oeb/reader.py b/ebook_converter/ebooks/oeb/reader.py index d92403f..da1a9a9 100644 --- a/ebook_converter/ebooks/oeb/reader.py +++ b/ebook_converter/ebooks/oeb/reader.py @@ -23,7 +23,7 @@ from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.localization import get_lang from ebook_converter.ptempfile import TemporaryDirectory from ebook_converter.constants_old import __appname__, __version__ -from ebook_converter import xml_replace_entities +from ebook_converter.utils import entities from ebook_converter.polyglot.urllib import unquote @@ -107,7 +107,8 @@ class OEBReader(object): try: opf = etree.fromstring(data) except etree.XMLSyntaxError: - data = 
xml_replace_entities(clean_xml_chars(data), encoding=None) + data = entities.xml_replace_entities(clean_xml_chars(data), + encoding=None) try: opf = etree.fromstring(data) self.logger.warn('OPF contains invalid HTML named entities') diff --git a/ebook_converter/ebooks/pdf/pdftohtml.py b/ebook_converter/ebooks/pdf/pdftohtml.py index 8c8b03f..af10f7b 100644 --- a/ebook_converter/ebooks/pdf/pdftohtml.py +++ b/ebook_converter/ebooks/pdf/pdftohtml.py @@ -6,11 +6,12 @@ import subprocess from lxml import etree -from ebook_converter import CurrentDir, xml_replace_entities +from ebook_converter import CurrentDir from ebook_converter.ebooks import ConversionError, DRMError from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.utils.cleantext import clean_xml_chars +from ebook_converter.utils import entities from ebook_converter.utils.ipc import eintr_retry_call @@ -84,7 +85,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): flags=re.I) raw = re.sub(r'