mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-28 20:42:27 +01:00
99 lines
2.9 KiB
Python
99 lines
2.9 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
# License: GPLv3 Copyright: 2010, Kovid Goyal <kovid at kovidgoyal.net>
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
import re
|
|
from polyglot.builtins import codepoint_to_chr, map, range, filter
|
|
from polyglot.html_entities import name2codepoint
|
|
from calibre.constants import plugins, preferred_encoding
|
|
|
|
# Native clean_xml_chars implementation, looked up on the first element of the
# 'speedup' entry of calibre's plugins mapping.
# NOTE(review): presumably each plugins entry is a (module, load_error) pair,
# so this line fails at import time if the plugin did not load -- confirm.
_ncxc = plugins['speedup'][0].clean_xml_chars
|
|
|
|
|
|
def native_clean_xml_chars(x):
    '''
    Clean ``x`` with the native ``clean_xml_chars`` implementation.

    Byte strings are first decoded with ``preferred_encoding``; any other
    input is handed to the native function unchanged.
    '''
    text = x.decode(preferred_encoding) if isinstance(x, bytes) else x
    return _ncxc(text)
|
|
|
|
|
|
def ascii_pat(for_binary=False):
    '''
    Return a compiled regex matching a single ASCII control character.

    The pattern matches code points 0-31 and 127, except tab (9), line feed
    (10) and carriage return (13). A bytes pattern is returned when
    ``for_binary`` is true, a text pattern otherwise. Each variant is compiled
    once and then cached as an attribute on this function.
    '''
    cache_key = 'binary' if for_binary else 'text'
    cached = getattr(ascii_pat, cache_key, None)
    if cached is not None:
        return cached

    control_codes = set(range(32)) - {9, 10, 13}
    control_codes.add(127)
    source = '|'.join(map(codepoint_to_chr, control_codes))
    if for_binary:
        source = source.encode('ascii')
    compiled = re.compile(source)
    setattr(ascii_pat, cache_key, compiled)
    return compiled
|
|
|
|
|
|
def clean_ascii_chars(txt, charlist=None):
    r'''
    Remove ASCII control chars.

    By default this is all control chars except \t, \n and \r.

    :param txt: The text to clean, either a unicode string or bytes.
    :param charlist: Optional iterable of integer code points. When given,
        exactly these characters are removed instead of the default set.
    :return: The cleaned text, of the same type as ``txt``. Falsy input
        yields an empty string (``b''`` for bytes input).
    '''
    is_binary = isinstance(txt, bytes)
    empty = b'' if is_binary else ''
    if not txt:
        return empty

    if charlist is None:
        pat = ascii_pat(is_binary)
    else:
        # Escape every character so regex metacharacters in charlist (for
        # example '.' or '*') are matched literally.
        pat = '|'.join(re.escape(codepoint_to_chr(c)) for c in charlist)
        if is_binary:
            pat = pat.encode('utf-8')
        # The joined source must be compiled: a plain str/bytes object has no
        # sub() method.
        pat = re.compile(pat)
    return pat.sub(empty, txt)
|
|
|
|
|
|
def allowed(x):
    '''
    Return True if the single character ``x`` may appear in XML text.

    The accepted ranges follow the XML 1.0 ``Char`` production: tab, line
    feed, carriage return, U+0020-U+D7FF, U+E000-U+FFFD and
    U+10000-U+10FFFF. DEL (127) is rejected in addition. All range
    endpoints are inclusive, so characters such as U+FFFD (the replacement
    character) are kept.
    '''
    x = ord(x)
    return (
        x in (9, 10, 13)
        or (0x20 <= x <= 0xd7ff and x != 127)
        or 0xe000 <= x <= 0xfffd
        or 0x10000 <= x <= 0x10ffff
    )
|
|
|
|
|
|
def py_clean_xml_chars(unicode_string):
    '''
    Return ``unicode_string`` with every character that ``allowed()``
    rejects dropped.
    '''
    kept = (ch for ch in unicode_string if allowed(ch))
    return ''.join(kept)
|
|
|
|
|
|
# Public name for XML character cleaning. A function object is always truthy,
# so this `or` always selects native_clean_xml_chars; py_clean_xml_chars is
# never chosen by this expression.
clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars
|
|
|
|
|
|
def test_clean_xml_chars():
    '''
    Self-check for ``native_clean_xml_chars``.

    Feeds it a string containing a control character, a non-BMP character
    and several lone surrogates, and raises ValueError if the result is not
    the expected cleaned string.
    '''
    raw = 'asd\x02a\U00010437x\ud801b\udffe\ud802'
    expected = 'asda\U00010437xb'
    cleaned = native_clean_xml_chars(raw)
    if cleaned == expected:
        return
    raise ValueError('Failed to XML clean: %r' % raw)
|
|
|
|
|
|
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
|
# Removes HTML or XML character references and entities from a text string.
|
|
#
|
|
# @param text The HTML (or XML) source text.
|
|
# @return The plain text, as a Unicode string, if necessary.
|
|
|
|
def unescape(text, rm=False, rchar=''):
    '''
    Replace HTML/XML character references and named entities in ``text``.

    Numeric references (``&#65;``, ``&#x41;``) are replaced by the character
    they denote. Named entities are looked up in ``name2codepoint``.

    :param text: The HTML (or XML) source text.
    :param rm: If true, every match that is not a successfully decoded
        numeric reference is replaced by ``rchar`` instead of being kept.
    :param rchar: The replacement used when ``rm`` is true.
    :return: The text with the references substituted.
    '''
    def fixup(m, rm=rm, rchar=rchar):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return codepoint_to_chr(int(text[3:-1], 16))
                return codepoint_to_chr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a valid number, or outside the Unicode
                # range. OverflowError: the number is too large for the
                # character conversion (e.g. &#99999999999999999999;).
                # Either way the reference is treated as unresolvable.
                pass
        else:
            # named entity
            try:
                text = codepoint_to_chr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        # NOTE(review): with rm=True a successfully resolved named entity is
        # also replaced by rchar, since only numeric references return early.
        # Kept as-is because callers may rely on it -- confirm intent.
        if rm:
            return rchar  # replace by char
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)
|