ebook-converter/ebook_converter/__init__.py

import html
import os
import re

from functools import partial

from ebook_converter import constants_old
from ebook_converter.ebooks.html_entities import html5_entities


class CurrentDir:
    """
    Context manager that temporarily changes the process working directory.

    On entry the current working directory is remembered in ``self.cwd`` and
    the process changes into ``self.path``; the ``with`` statement receives
    the remembered (previous) directory. On exit the process changes back to
    the remembered directory.
    """

    def __init__(self, path):
        self.path, self.cwd = path, None

    def __enter__(self, *args):
        previous = self.cwd = os.getcwd()
        os.chdir(self.path)
        return previous

    def __exit__(self, *args):
        try:
            os.chdir(self.cwd)
        except OSError:
            # The directory we came from has disappeared in the meantime;
            # restoring it is best-effort, so stay where we are.
            pass


def walk(dir):
    """
    Yield the path of every file found below ``dir``.

    Thin wrapper around :func:`os.walk`: each yielded value is the directory
    reported by ``os.walk`` joined with one of the file names in it.
    Directories themselves are not yielded.
    """
    for dirpath, _subdirs, filenames in os.walk(dir):
        for name in filenames:
            yield os.path.join(dirpath, name)


def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                      result_exceptions={}):
    """
    Convert a single matched entity into the character it stands for.

    Intended for use as the replacement callback of a regular expression
    substitution. Entities that cannot be resolved are returned unchanged
    (as '&' + name + ';').

    :param match: A match object such that '&'+match.group(1)+';' is the
                  entity.

    :param exceptions: A list of entities to not convert (each entry is the
                       name of the entity, e.g. 'apos' or '#1234'). It is
                       only read, never modified.

    :param encoding: The encoding to use to decode numeric entities between
                     0 and 255. If None, the code point is used directly.
                     A common encoding is cp1252.

    :param result_exceptions: A mapping of characters to entities. If the
                              result is in result_exceptions,
                              result_exceptions[result] is returned instead.
                              Convenient way to specify exceptions for things
                              like < or > that can be specified by various
                              actual entities. It is only read, never
                              modified.

    :return: The replacement text for the entity.
    """

    def my_unichr(num):
        # chr() rejects values outside the Unicode range (including
        # negative numbers); substitute a placeholder for those.
        try:
            return chr(num)
        except (ValueError, OverflowError):
            return '?'

    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&'+ent+';'
    # squot is generated by some broken CMS software
    if ent in {'apos', 'squot'}:
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (IndexError, ValueError):
            # A bare '#' (IndexError) or non-numeric digits (ValueError):
            # not a valid numeric entity, leave it untouched.
            return '&'+ent+';'
        # Only values that fit in a single byte can be decoded with
        # ``encoding``. Everything else -- including negative numbers, which
        # would make bytearray() raise ValueError -- is treated as a code
        # point.
        if encoding is None or not 0 <= num <= 255:
            return check(my_unichr(num))
        try:
            return check(bytes(bytearray((num,))).decode(encoding))
        except UnicodeDecodeError:
            # The byte is undefined in ``encoding``; fall back to the code
            # point with the same number.
            return check(my_unichr(num))
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    try:
        return check(my_unichr(html.entities.name2codepoint[ent]))
    except KeyError:
        return '&'+ent+';'


# Matches one entity reference: an ampersand, the shortest possible run of
# non-whitespace characters (captured as group 1), and a closing semicolon.
_ent_pat = re.compile(r'&(\S+?);')
# Variant of entity_to_unicode for XML output: whenever an entity resolves to
# one of the five characters below, the listed entity is returned instead of
# the raw character.
xml_entity_to_unicode = partial(entity_to_unicode,
                                result_exceptions={'"': '&quot;',
                                                   "'": '&apos;',
                                                   '<': '&lt;',
                                                   '>': '&gt;',
                                                   '&': '&amp;'})


def replace_entities(raw, encoding='cp1252'):
    """
    Return ``raw`` with every entity reference replaced by its character.

    :param raw: The text to process.

    :param encoding: Passed through to entity_to_unicode as its ``encoding``
                     argument.
    """
    convert = partial(entity_to_unicode, encoding=encoding)
    return _ent_pat.sub(convert, raw)


def xml_replace_entities(raw, encoding='cp1252'):
    """
    Return ``raw`` with entity references replaced, keeping XML specials.

    Works like a plain entity replacement, except that the conversion is done
    by xml_entity_to_unicode.

    :param raw: The text to process.

    :param encoding: Passed through to xml_entity_to_unicode as its
                     ``encoding`` argument.
    """
    convert = partial(xml_entity_to_unicode, encoding=encoding)
    return _ent_pat.sub(convert, raw)


def prepare_string_for_xml(raw, attribute=False):
    """
    Escape ``raw`` so that it can be embedded in XML.

    Existing entity references are first resolved through
    entity_to_unicode, then '&', '<' and '>' are escaped.

    :param raw: The text to escape.

    :param attribute: If true, double and single quotes are escaped as well.

    :return: The escaped text.
    """
    text = _ent_pat.sub(entity_to_unicode, raw)
    # Order matters: '&' has to be handled first so that the ampersands
    # introduced by the later replacements are not escaped a second time.
    escapes = [('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')]
    if attribute:
        escapes += [('"', '&quot;'), ("'", '&apos;')]
    for char, entity in escapes:
        text = text.replace(char, entity)
    return text


def force_unicode(obj, enc=constants_old.preferred_encoding):
    """
    Return ``obj`` as text, decoding it if it is a bytes object.

    Objects that are not bytes are returned unchanged. Bytes are decoded by
    trying, in order: ``enc``; a fallback encoding
    (``constants_old.filesystem_encoding`` when ``enc`` equals
    ``constants_old.preferred_encoding``, otherwise
    ``constants_old.preferred_encoding``); and finally UTF-8. If every
    attempt fails, ``repr(obj)`` is returned, so this function never raises
    because of undecodable data.

    :param obj: The object to convert.

    :param enc: The first encoding to try.

    :return: A str for bytes input, otherwise ``obj`` itself.
    """
    if not isinstance(obj, bytes):
        return obj

    try:
        return obj.decode(enc)
    except Exception:
        pass

    try:
        fallback = (constants_old.filesystem_encoding
                    if enc == constants_old.preferred_encoding
                    else constants_old.preferred_encoding)
        return obj.decode(fallback)
    except Exception:
        pass

    try:
        return obj.decode('utf-8')
    except Exception:
        # Last resort. repr() always returns str, so no further decoding
        # is needed.
        return repr(obj)