1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2025-12-18 13:10:17 +01:00

Moved xml_replace_entities to utils.entities.

This commit is contained in:
2021-01-03 19:37:03 +01:00
parent dc4352fd4c
commit 8be21d878d
5 changed files with 13 additions and 16 deletions

View File

@@ -29,11 +29,6 @@ class CurrentDir(object):
_ent_pat = re.compile(r'&(\S+?);')
def xml_replace_entities(raw, encoding='cp1252'):
return _ent_pat.sub(partial(entities.xml_entity_to_unicode,
encoding=encoding), raw)
def prepare_string_for_xml(raw, attribute=False):
raw = _ent_pat.sub(entities.entity_to_unicode, raw)
raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

View File

@@ -5,10 +5,10 @@ from lxml import etree
from lxml import html
from ebook_converter import constants as const
from ebook_converter import xml_replace_entities
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from ebook_converter.utils import encoding as uenc
from ebook_converter.utils import entities
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
@@ -208,9 +208,8 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = etree.fromstring(data)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more'
' forgiving parsers')
raw = data = xml_replace_entities(raw)
log.debug('Initial parse failed, using more forgiving parsers')
raw = data = entities.xml_replace_entities(raw)
try:
data = etree.fromstring(data)
check_for_html5(pre, data)

View File

@@ -4,10 +4,10 @@ from lxml import etree
import html5_parser
from ebook_converter import constants as const
from ebook_converter import xml_replace_entities
from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils import entities
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
@@ -16,7 +16,7 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities:
raw = xml_replace_entities(raw)
raw = entities.xml_replace_entities(raw)
if fix_newlines:
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
raw = clean_xml_chars(raw)
@@ -61,7 +61,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True,
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = handle_private_entities(raw)
if replace_entities:
raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0;
# Handle &#0;
raw = entities.xml_replace_entities(raw).replace('\0', '')
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
# Remove any preamble before the opening html tag as it can cause problems,

View File

@@ -23,7 +23,7 @@ from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils.localization import get_lang
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter import xml_replace_entities
from ebook_converter.utils import entities
from ebook_converter.polyglot.urllib import unquote
@@ -107,7 +107,8 @@ class OEBReader(object):
try:
opf = etree.fromstring(data)
except etree.XMLSyntaxError:
data = xml_replace_entities(clean_xml_chars(data), encoding=None)
data = entities.xml_replace_entities(clean_xml_chars(data),
encoding=None)
try:
opf = etree.fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities')

View File

@@ -6,11 +6,12 @@ import subprocess
from lxml import etree
from ebook_converter import CurrentDir, xml_replace_entities
from ebook_converter import CurrentDir
from ebook_converter.ebooks import ConversionError, DRMError
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ptempfile import PersistentTemporaryFile
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils import entities
from ebook_converter.utils.ipc import eintr_retry_call
@@ -84,7 +85,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"',
raw, flags=re.I)
raw = xml_replace_entities(raw)
raw = entities.xml_replace_entities(raw)
raw = raw.replace('\u00a0', ' ')
i.write(raw.encode('utf-8'))