# Mirror of https://github.com/gryf/ebook-converter.git
# (synced 2026-01-29 17:55:45 +01:00; 140 lines, 4.3 KiB, Python)
import html
|
|
import os
|
|
import re
|
|
|
|
from functools import partial
|
|
|
|
from ebook_converter import constants_old
|
|
from ebook_converter.ebooks.html_entities import html5_entities
|
|
|
|
|
|
class CurrentDir:
    """Context manager that switches the process working directory.

    On entry the current working directory is remembered and the process
    changes into ``path``; the remembered directory is what the ``with``
    statement binds. On exit the process changes back to it.
    """

    def __init__(self, path):
        # Directory to enter; the directory to restore is recorded on entry.
        self.path = path
        self.cwd = None

    def __enter__(self, *args):
        # Record the directory before leaving it so __exit__ can go back.
        self.cwd = os.getcwd()
        os.chdir(self.path)
        return self.cwd

    def __exit__(self, *args):
        try:
            os.chdir(self.cwd)
        except OSError:
            # The previous CWD no longer exists
            pass
|
|
|
|
|
|
def walk(dir):
    """Yield the path of every file found by ``os.walk(dir)``.

    Each yielded value is the directory reported by ``os.walk`` joined with
    one of that directory's file names. Directories themselves are not
    yielded.
    """
    for dirpath, _subdirs, filenames in os.walk(dir):
        for filename in filenames:
            yield os.path.join(dirpath, filename)
|
|
|
|
|
|
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                      result_exceptions={}):
    """
    Convert one matched entity reference into the text it stands for.

    Suitable as the replacement callback of ``re.sub``. Entities that cannot
    be resolved are returned unchanged as ``'&' + name + ';'``. Numeric
    entities whose value is not a valid code point (negative or too large)
    are converted to ``'?'``.

    The default ``exceptions`` and ``result_exceptions`` containers are only
    read here, never mutated.

    :param match: A match object such that '&'+match.group(1)+';' is the
                  entity.

    :param exceptions: A list of entities to not convert (each entry is the
                       name of the entity, e.g. 'apos' or '#1234').

    :param encoding: The encoding to use to decode numeric entities between
                     128 and 256. If None, the Unicode UCS encoding is used.
                     A common encoding is cp1252.

    :param result_exceptions: A mapping of characters to entities. If the
                              result is in result_exceptions,
                              result_exceptions[result] is returned instead.
                              Convenient way to specify exceptions for things
                              like < or > that can be specified by various
                              actual entities.

    :return: The replacement string for the matched entity.
    """

    def my_unichr(num):
        # chr() raises for negative values and for values beyond the Unicode
        # range; substitute a placeholder instead of failing.
        try:
            return chr(num)
        except (ValueError, OverflowError):
            return '?'

    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&'+ent+';'
    # squot is generated by some broken CMS software
    if ent in {'apos', 'squot'}:
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (IndexError, ValueError):
            # A bare '#' or a non-numeric payload: leave the entity as is.
            return '&'+ent+';'
        # Only values that fit in a single byte can be decoded with
        # ``encoding``. Everything else, including negative numbers such as
        # '#-5' which would make bytearray() raise ValueError, is treated as
        # a code point.
        if encoding is None or not 0 <= num <= 255:
            return check(my_unichr(num))
        try:
            return check(bytes(bytearray((num,))).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    try:
        return check(my_unichr(html.entities.name2codepoint[ent]))
    except KeyError:
        return '&'+ent+';'
|
|
|
|
|
|
# Matches one entity reference: '&', the shortest possible run of
# non-whitespace characters (captured as group 1), then ';'.
_ent_pat = re.compile(r'&(\S+?);')

# Variant of entity_to_unicode for XML output: when an entity resolves to one
# of the five characters that are special in XML, the corresponding
# predefined entity is returned instead of the raw character.
xml_entity_to_unicode = partial(entity_to_unicode,
                                result_exceptions={'"': '&quot;',
                                                   "'": '&apos;',
                                                   '<': '&lt;',
                                                   '>': '&gt;',
                                                   '&': '&amp;'})
|
|
|
|
|
|
def replace_entities(raw, encoding='cp1252'):
    """Return ``raw`` with every ``_ent_pat`` match replaced by the result
    of ``entity_to_unicode``, called with the given ``encoding``."""
    convert = partial(entity_to_unicode, encoding=encoding)
    return _ent_pat.sub(convert, raw)
|
|
|
|
|
|
def xml_replace_entities(raw, encoding='cp1252'):
    """Return ``raw`` with every ``_ent_pat`` match replaced by the result
    of ``xml_entity_to_unicode``, called with the given ``encoding``."""
    convert = partial(xml_entity_to_unicode, encoding=encoding)
    return _ent_pat.sub(convert, raw)
|
|
|
|
|
|
def prepare_string_for_xml(raw, attribute=False):
    """Escape ``raw`` so it can be embedded in XML.

    Existing entity references are first converted to plain characters with
    ``entity_to_unicode``, then ``&``, ``<`` and ``>`` are escaped.

    :param raw: The text to escape.
    :param attribute: If true, double and single quotes are escaped as well.
    :return: The escaped text.
    """
    raw = _ent_pat.sub(entity_to_unicode, raw)
    # '&' has to be escaped first; otherwise the ampersands introduced by the
    # following replacements would themselves be escaped.
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        raw = raw.replace('"', '&quot;').replace("'", '&apos;')
    return raw
|
|
|
|
|
|
def force_unicode(obj, enc=constants_old.preferred_encoding):
    """Return ``obj`` as text if it is a ``bytes`` object.

    Decoding is attempted with ``enc``, then with a fallback encoding, then
    with UTF-8. If all attempts fail, ``repr(obj)`` is returned. Objects that
    are not ``bytes`` are returned unchanged.

    :param obj: The object to convert.
    :param enc: The first encoding to try.
    :return: A ``str`` for ``bytes`` input, otherwise ``obj`` itself.
    """
    if not isinstance(obj, bytes):
        return obj
    try:
        return obj.decode(enc)
    except Exception:
        pass
    try:
        # Fall back to the filesystem encoding when the preferred encoding
        # was already tried, otherwise to the preferred encoding.
        return obj.decode(constants_old.filesystem_encoding
                          if enc == constants_old.preferred_encoding
                          else constants_old.preferred_encoding)
    except Exception:
        pass
    try:
        return obj.decode('utf-8')
    except Exception:
        # repr() of bytes is already a str, so no further decoding is needed.
        return repr(obj)
|