mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-18 13:10:17 +01:00
Moved xml_replace_entities to utils.entities.
This commit is contained in:
@@ -29,11 +29,6 @@ class CurrentDir(object):
|
||||
_ent_pat = re.compile(r'&(\S+?);')
|
||||
|
||||
|
||||
def xml_replace_entities(raw, encoding='cp1252'):
|
||||
return _ent_pat.sub(partial(entities.xml_entity_to_unicode,
|
||||
encoding=encoding), raw)
|
||||
|
||||
|
||||
def prepare_string_for_xml(raw, attribute=False):
|
||||
raw = _ent_pat.sub(entities.entity_to_unicode, raw)
|
||||
raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||
|
||||
@@ -5,10 +5,10 @@ from lxml import etree
|
||||
from lxml import html
|
||||
|
||||
from ebook_converter import constants as const
|
||||
from ebook_converter import xml_replace_entities
|
||||
from ebook_converter.constants_old import filesystem_encoding
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
from ebook_converter.utils import encoding as uenc
|
||||
from ebook_converter.utils import entities
|
||||
|
||||
|
||||
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
|
||||
@@ -208,9 +208,8 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
data = etree.fromstring(data)
|
||||
check_for_html5(pre, data)
|
||||
except (HTML5Doc, etree.XMLSyntaxError):
|
||||
log.debug('Initial parse failed, using more'
|
||||
' forgiving parsers')
|
||||
raw = data = xml_replace_entities(raw)
|
||||
log.debug('Initial parse failed, using more forgiving parsers')
|
||||
raw = data = entities.xml_replace_entities(raw)
|
||||
try:
|
||||
data = etree.fromstring(data)
|
||||
check_for_html5(pre, data)
|
||||
|
||||
@@ -4,10 +4,10 @@ from lxml import etree
|
||||
import html5_parser
|
||||
|
||||
from ebook_converter import constants as const
|
||||
from ebook_converter import xml_replace_entities
|
||||
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from ebook_converter.utils import entities
|
||||
|
||||
|
||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
|
||||
@@ -16,7 +16,7 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
|
||||
if isinstance(raw, bytes):
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
if replace_entities:
|
||||
raw = xml_replace_entities(raw)
|
||||
raw = entities.xml_replace_entities(raw)
|
||||
if fix_newlines:
|
||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||
raw = clean_xml_chars(raw)
|
||||
@@ -61,7 +61,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True,
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
raw = handle_private_entities(raw)
|
||||
if replace_entities:
|
||||
raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0;
|
||||
# Handle &#0;
|
||||
raw = entities.xml_replace_entities(raw).replace('\0', '')
|
||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
# Remove any preamble before the opening html tag as it can cause problems,
|
||||
|
||||
@@ -23,7 +23,7 @@ from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from ebook_converter.utils.localization import get_lang
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.constants_old import __appname__, __version__
|
||||
from ebook_converter import xml_replace_entities
|
||||
from ebook_converter.utils import entities
|
||||
from ebook_converter.polyglot.urllib import unquote
|
||||
|
||||
|
||||
@@ -107,7 +107,8 @@ class OEBReader(object):
|
||||
try:
|
||||
opf = etree.fromstring(data)
|
||||
except etree.XMLSyntaxError:
|
||||
data = xml_replace_entities(clean_xml_chars(data), encoding=None)
|
||||
data = entities.xml_replace_entities(clean_xml_chars(data),
|
||||
encoding=None)
|
||||
try:
|
||||
opf = etree.fromstring(data)
|
||||
self.logger.warn('OPF contains invalid HTML named entities')
|
||||
|
||||
@@ -6,11 +6,12 @@ import subprocess
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter import CurrentDir, xml_replace_entities
|
||||
from ebook_converter import CurrentDir
|
||||
from ebook_converter.ebooks import ConversionError, DRMError
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.ptempfile import PersistentTemporaryFile
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from ebook_converter.utils import entities
|
||||
from ebook_converter.utils.ipc import eintr_retry_call
|
||||
|
||||
|
||||
@@ -84,7 +85,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
||||
flags=re.I)
|
||||
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"',
|
||||
raw, flags=re.I)
|
||||
raw = xml_replace_entities(raw)
|
||||
raw = entities.xml_replace_entities(raw)
|
||||
raw = raw.replace('\u00a0', ' ')
|
||||
|
||||
i.write(raw.encode('utf-8'))
|
||||
|
||||
Reference in New Issue
Block a user