mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-11 12:05:45 +01:00
Clean up cleantext module
This commit is contained in:
@@ -1,8 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
import html.entities
|
import html.entities
|
||||||
|
|
||||||
from ebook_converter.constants_old import plugins, preferred_encoding
|
|
||||||
|
|
||||||
|
|
||||||
def ascii_pat(for_binary=False):
|
def ascii_pat(for_binary=False):
|
||||||
attr = 'binary' if for_binary else 'text'
|
attr = 'binary' if for_binary else 'text'
|
||||||
@@ -39,7 +37,8 @@ def clean_ascii_chars(txt, charlist=None):
|
|||||||
|
|
||||||
def allowed(x):
|
def allowed(x):
|
||||||
x = ord(x)
|
x = ord(x)
|
||||||
return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
|
return ((x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or
|
||||||
|
(0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff))
|
||||||
|
|
||||||
|
|
||||||
def py_clean_xml_chars(unicode_string):
|
def py_clean_xml_chars(unicode_string):
|
||||||
@@ -49,12 +48,6 @@ def py_clean_xml_chars(unicode_string):
|
|||||||
clean_xml_chars = py_clean_xml_chars
|
clean_xml_chars = py_clean_xml_chars
|
||||||
|
|
||||||
|
|
||||||
def test_clean_xml_chars():
|
|
||||||
raw = 'asd\x02a\U00010437x\ud801b\udffe\ud802'
|
|
||||||
if native_clean_xml_chars(raw) != 'asda\U00010437xb':
|
|
||||||
raise ValueError('Failed to XML clean: %r' % raw)
|
|
||||||
|
|
||||||
|
|
||||||
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||||
# Removes HTML or XML character references and entities from a text string.
|
# Removes HTML or XML character references and entities from a text string.
|
||||||
#
|
#
|
||||||
@@ -76,8 +69,7 @@ def unescape(text, rm=False, rchar=''):
|
|||||||
else:
|
else:
|
||||||
# named entity
|
# named entity
|
||||||
try:
|
try:
|
||||||
text = chr(html.entities
|
text = chr(html.entities.name2codepoint[text[1:-1]])
|
||||||
.name2codepoint[text[1:-1]])
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
if rm:
|
if rm:
|
||||||
|
|||||||
Reference in New Issue
Block a user