mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-19 19:14:11 +01:00
Clean up cleantext module
This commit is contained in:
@@ -1,8 +1,6 @@
|
||||
import re
|
||||
import html.entities
|
||||
|
||||
from ebook_converter.constants_old import plugins, preferred_encoding
|
||||
|
||||
|
||||
def ascii_pat(for_binary=False):
|
||||
attr = 'binary' if for_binary else 'text'
|
||||
@@ -39,7 +37,8 @@ def clean_ascii_chars(txt, charlist=None):
|
||||
|
||||
def allowed(x):
|
||||
x = ord(x)
|
||||
return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
|
||||
return ((x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or
|
||||
(0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff))
|
||||
|
||||
|
||||
def py_clean_xml_chars(unicode_string):
|
||||
@@ -49,12 +48,6 @@ def py_clean_xml_chars(unicode_string):
|
||||
clean_xml_chars = py_clean_xml_chars
|
||||
|
||||
|
||||
def test_clean_xml_chars():
|
||||
raw = 'asd\x02a\U00010437x\ud801b\udffe\ud802'
|
||||
if native_clean_xml_chars(raw) != 'asda\U00010437xb':
|
||||
raise ValueError('Failed to XML clean: %r' % raw)
|
||||
|
||||
|
||||
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||
# Removes HTML or XML character references and entities from a text string.
|
||||
#
|
||||
@@ -76,8 +69,7 @@ def unescape(text, rm=False, rchar=''):
|
||||
else:
|
||||
# named entity
|
||||
try:
|
||||
text = chr(html.entities
|
||||
.name2codepoint[text[1:-1]])
|
||||
text = chr(html.entities.name2codepoint[text[1:-1]])
|
||||
except KeyError:
|
||||
pass
|
||||
if rm:
|
||||
|
||||
Reference in New Issue
Block a user