mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-18 13:10:17 +01:00
Added missing entities module
This commit is contained in:
98
ebook_converter/utils/entities.py
Normal file
98
ebook_converter/utils/entities.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Misc util functions for converting HTML/XML entites to unicode.
|
||||
"""
|
||||
import html
|
||||
import functools
|
||||
import re
|
||||
|
||||
from ebook_converter.ebooks.html_entities import html5_entities
|
||||
|
||||
|
||||
ENT_PAT = re.compile(r'&(\S+?);')
|
||||
|
||||
|
||||
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
|
||||
result_exceptions={}):
|
||||
"""
|
||||
:param match: A match object such that '&'+match.group(1)';' is the entity.
|
||||
|
||||
:param exceptions: A list of entities to not convert (Each entry is the
|
||||
name of the entity, for e.g. 'apos' or '#1234'
|
||||
|
||||
:param encoding: The encoding to use to decode numeric entities between
|
||||
128 and 256. If None, the Unicode UCS encoding is used.
|
||||
A common encoding is cp1252.
|
||||
|
||||
:param result_exceptions: A mapping of characters to entities. If the
|
||||
result is in result_exceptions,
|
||||
result_exception[result] is returned instead.
|
||||
Convenient way to specify exception for things
|
||||
like < or > that can be specified by various
|
||||
actual entities.
|
||||
"""
|
||||
|
||||
def my_unichr(num):
|
||||
try:
|
||||
return chr(num)
|
||||
except (ValueError, OverflowError):
|
||||
return '?'
|
||||
|
||||
def check(ch):
|
||||
return result_exceptions.get(ch, ch)
|
||||
|
||||
ent = match.group(1)
|
||||
if ent in exceptions:
|
||||
return '&'+ent+';'
|
||||
# squot is generated by some broken CMS software
|
||||
if ent in {'apos', 'squot'}:
|
||||
return check("'")
|
||||
if ent == 'hellips':
|
||||
ent = 'hellip'
|
||||
if ent.startswith('#'):
|
||||
try:
|
||||
if ent[1] in ('x', 'X'):
|
||||
num = int(ent[2:], 16)
|
||||
else:
|
||||
num = int(ent[1:])
|
||||
except Exception:
|
||||
return '&'+ent+';'
|
||||
if encoding is None or num > 255:
|
||||
return check(my_unichr(num))
|
||||
try:
|
||||
return check(bytes(bytearray((num,))).decode(encoding))
|
||||
except UnicodeDecodeError:
|
||||
return check(my_unichr(num))
|
||||
try:
|
||||
return check(html5_entities[ent])
|
||||
except KeyError:
|
||||
pass
|
||||
try:
|
||||
return check(my_unichr(html.entities.name2codepoint[ent]))
|
||||
except KeyError:
|
||||
return '&'+ent+';'
|
||||
|
||||
|
||||
xml_entity_to_unicode = functools.partial(entity_to_unicode,
|
||||
result_exceptions={'"': '"',
|
||||
"'": ''',
|
||||
'<': '<',
|
||||
'>': '>',
|
||||
'&': '&'})
|
||||
|
||||
|
||||
def replace_entities(raw, encoding='cp1252'):
|
||||
return ENT_PAT.sub(functools.partial(entity_to_unicode, encoding=encoding),
|
||||
raw)
|
||||
|
||||
|
||||
def xml_replace_entities(raw, encoding='cp1252'):
|
||||
return ENT_PAT.sub(functools.partial(xml_entity_to_unicode,
|
||||
encoding=encoding), raw)
|
||||
|
||||
|
||||
def prepare_string_for_xml(raw, attribute=False):
|
||||
raw = ENT_PAT.sub(entity_to_unicode, raw)
|
||||
raw = raw.replace('&', '&').replace('<', '<').replace('>', '>')
|
||||
if attribute:
|
||||
raw = raw.replace('"', '"').replace("'", ''')
|
||||
return raw
|
||||
Reference in New Issue
Block a user