mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-18 21:20:17 +01:00
Moved xml_replace_entities to utils.entities.
This commit is contained in:
@@ -29,11 +29,6 @@ class CurrentDir(object):
|
|||||||
_ent_pat = re.compile(r'&(\S+?);')
|
_ent_pat = re.compile(r'&(\S+?);')
|
||||||
|
|
||||||
|
|
||||||
def xml_replace_entities(raw, encoding='cp1252'):
|
|
||||||
return _ent_pat.sub(partial(entities.xml_entity_to_unicode,
|
|
||||||
encoding=encoding), raw)
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_string_for_xml(raw, attribute=False):
|
def prepare_string_for_xml(raw, attribute=False):
|
||||||
raw = _ent_pat.sub(entities.entity_to_unicode, raw)
|
raw = _ent_pat.sub(entities.entity_to_unicode, raw)
|
||||||
raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ from lxml import etree
|
|||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter import xml_replace_entities
|
|
||||||
from ebook_converter.constants_old import filesystem_encoding
|
from ebook_converter.constants_old import filesystem_encoding
|
||||||
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||||
from ebook_converter.utils import encoding as uenc
|
from ebook_converter.utils import encoding as uenc
|
||||||
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
|
|
||||||
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
|
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
|
||||||
@@ -208,9 +208,8 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
check_for_html5(pre, data)
|
check_for_html5(pre, data)
|
||||||
except (HTML5Doc, etree.XMLSyntaxError):
|
except (HTML5Doc, etree.XMLSyntaxError):
|
||||||
log.debug('Initial parse failed, using more'
|
log.debug('Initial parse failed, using more forgiving parsers')
|
||||||
' forgiving parsers')
|
raw = data = entities.xml_replace_entities(raw)
|
||||||
raw = data = xml_replace_entities(raw)
|
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
check_for_html5(pre, data)
|
check_for_html5(pre, data)
|
||||||
|
|||||||
@@ -4,10 +4,10 @@ from lxml import etree
|
|||||||
import html5_parser
|
import html5_parser
|
||||||
|
|
||||||
from ebook_converter import constants as const
|
from ebook_converter import constants as const
|
||||||
from ebook_converter import xml_replace_entities
|
|
||||||
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
||||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||||
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
|
|
||||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
|
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
|
||||||
@@ -16,7 +16,7 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
|
|||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
if replace_entities:
|
if replace_entities:
|
||||||
raw = xml_replace_entities(raw)
|
raw = entities.xml_replace_entities(raw)
|
||||||
if fix_newlines:
|
if fix_newlines:
|
||||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
raw = clean_xml_chars(raw)
|
raw = clean_xml_chars(raw)
|
||||||
@@ -61,7 +61,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True,
|
|||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
raw = handle_private_entities(raw)
|
raw = handle_private_entities(raw)
|
||||||
if replace_entities:
|
if replace_entities:
|
||||||
raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0;
|
# Handle &#0;
|
||||||
|
raw = entities.xml_replace_entities(raw).replace('\0', '')
|
||||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
|
|
||||||
# Remove any preamble before the opening html tag as it can cause problems,
|
# Remove any preamble before the opening html tag as it can cause problems,
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ from ebook_converter.utils.cleantext import clean_xml_chars
|
|||||||
from ebook_converter.utils.localization import get_lang
|
from ebook_converter.utils.localization import get_lang
|
||||||
from ebook_converter.ptempfile import TemporaryDirectory
|
from ebook_converter.ptempfile import TemporaryDirectory
|
||||||
from ebook_converter.constants_old import __appname__, __version__
|
from ebook_converter.constants_old import __appname__, __version__
|
||||||
from ebook_converter import xml_replace_entities
|
from ebook_converter.utils import entities
|
||||||
from ebook_converter.polyglot.urllib import unquote
|
from ebook_converter.polyglot.urllib import unquote
|
||||||
|
|
||||||
|
|
||||||
@@ -107,7 +107,8 @@ class OEBReader(object):
|
|||||||
try:
|
try:
|
||||||
opf = etree.fromstring(data)
|
opf = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = xml_replace_entities(clean_xml_chars(data), encoding=None)
|
data = entities.xml_replace_entities(clean_xml_chars(data),
|
||||||
|
encoding=None)
|
||||||
try:
|
try:
|
||||||
opf = etree.fromstring(data)
|
opf = etree.fromstring(data)
|
||||||
self.logger.warn('OPF contains invalid HTML named entities')
|
self.logger.warn('OPF contains invalid HTML named entities')
|
||||||
|
|||||||
@@ -6,11 +6,12 @@ import subprocess
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from ebook_converter import CurrentDir, xml_replace_entities
|
from ebook_converter import CurrentDir
|
||||||
from ebook_converter.ebooks import ConversionError, DRMError
|
from ebook_converter.ebooks import ConversionError, DRMError
|
||||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||||
from ebook_converter.ptempfile import PersistentTemporaryFile
|
from ebook_converter.ptempfile import PersistentTemporaryFile
|
||||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||||
|
from ebook_converter.utils import entities
|
||||||
from ebook_converter.utils.ipc import eintr_retry_call
|
from ebook_converter.utils.ipc import eintr_retry_call
|
||||||
|
|
||||||
|
|
||||||
@@ -84,7 +85,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
|||||||
flags=re.I)
|
flags=re.I)
|
||||||
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"',
|
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"',
|
||||||
raw, flags=re.I)
|
raw, flags=re.I)
|
||||||
raw = xml_replace_entities(raw)
|
raw = entities.xml_replace_entities(raw)
|
||||||
raw = raw.replace('\u00a0', ' ')
|
raw = raw.replace('\u00a0', ' ')
|
||||||
|
|
||||||
i.write(raw.encode('utf-8'))
|
i.write(raw.encode('utf-8'))
|
||||||
|
|||||||
Reference in New Issue
Block a user