mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-01 06:05:55 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
232 lines
7.7 KiB
Python
232 lines
7.7 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import re, os
|
|
from bisect import bisect
|
|
|
|
from ebook_converter import guess_type as _guess_type, replace_entities
|
|
from ebook_converter.polyglot.builtins import filter
|
|
|
|
|
|
def guess_type(x):
    """Return the media type guessed for *x*, falling back to the generic
    binary type when no guess is possible."""
    media_type = _guess_type(x)[0]
    return media_type if media_type else 'application/octet-stream'
|
|
|
|
|
|
def setup_css_parser_serialization(tab_width=2):
    """Configure css_parser's global serializer preferences.

    Indent declarations by *tab_width* spaces, leave closing braces
    unindented and always emit the final semicolon of a rule."""
    import css_parser
    serializer_prefs = css_parser.ser.prefs
    serializer_prefs.omitLastSemicolon = False
    serializer_prefs.indentClosingBrace = False
    serializer_prefs.indent = ' ' * tab_width
|
|
|
|
|
|
def actual_case_for_name(container, name):
    """Return *name* with every path component spelled in the actual case
    used on the filesystem.

    :param container: book container providing exists()/name_to_abspath()
    :param name: a '/'-separated name that must exist in the container
    :raises ValueError: if *name* does not exist in the container
    :raises RuntimeError: if a component matches no directory entry
    """
    from ebook_converter.utils.filenames import samefile
    if not container.exists(name):
        raise ValueError('Cannot get actual case for %s as it does not exist' % name)
    parts = name.split('/')
    ans = []
    for x in parts:
        base = '/'.join(ans + [x])
        path = container.name_to_abspath(base)
        pdir = os.path.dirname(path)
        # Compare bare entry names. The original built a set of *absolute*
        # paths here, so the fast-path test `x in candidates` could never
        # succeed and every component fell through to the samefile() scan.
        candidates = set(os.listdir(pdir))
        if x in candidates:
            correctx = x
        else:
            for q in candidates:
                if samefile(os.path.join(pdir, q), path):
                    correctx = q
                    break
            else:
                raise RuntimeError('Something bad happened')
        ans.append(correctx)
    return '/'.join(ans)
|
|
|
|
|
|
def corrected_case_for_name(container, name):
    """Return *name* with each component's case corrected (by
    case-insensitive comparison) so that it refers to an existing file.

    :param container: book container providing exists()/name_to_abspath()
    :param name: a '/'-separated name, possibly with wrong case
    :return: the corrected name, or None when no match exists (including
             when a non-terminal component turns out to be a file)
    """
    parts = name.split('/')
    ans = []
    for x in parts:
        base = '/'.join(ans + [x])
        if container.exists(base):
            correctx = x
        else:
            try:
                candidates = set(os.listdir(os.path.dirname(container.name_to_abspath(base))))
            except EnvironmentError:
                # One of the non-terminal components of name is a file
                # instead of a directory
                return None
            for q in candidates:
                if q.lower() == x.lower():
                    correctx = q
                    break
            else:
                return None
        ans.append(correctx)
    return '/'.join(ans)
|
|
|
|
|
|
class PositionFinder(object):

    """Translate a character offset in a string (or bytes) into a
    (line_number, column) pair; lines are 1-based, columns 0-based."""

    def __init__(self, raw):
        # Record the offset of the first character of every line after the
        # first, i.e. the position immediately past each newline.
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        """Return (line, column) for the character at offset *pos*."""
        lnum = bisect(self.new_lines, pos)
        if lnum:
            # bisect guarantees new_lines[lnum - 1] <= pos, so no abs() needed
            offset = pos - self.new_lines[lnum - 1]
        else:
            # pos is on the first line. The original code indexed
            # new_lines[lnum - 1] == new_lines[-1] here, which silently
            # used the *last* newline and reported a wrong column whenever
            # the text contained any newline at all.
            offset = pos
        return (lnum + 1, offset)
|
|
|
|
|
|
class CommentFinder(object):

    """Answer whether a character offset in *raw* falls inside any span
    matching *pat* (by default, CSS block comments)."""

    def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
        # Parallel lists of (start, end) offsets for every match, in order.
        self.starts, self.ends = [], []
        for match in re.finditer(pat, raw):
            s, e = match.span()
            self.starts.append(s)
            self.ends.append(e)

    def __call__(self, offset):
        """Return True when *offset* lies within a matched span
        (end offset inclusive)."""
        if not self.starts:
            return False
        idx = bisect(self.starts, offset) - 1
        if idx < 0:
            return False
        return self.starts[idx] <= offset <= self.ends[idx]
|
|
|
|
|
|
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    """Ensure every file in *names* links to every stylesheet in *sheets*.

    :param container: the book container whose files are modified
    :param names: iterable of (X)HTML file names to process
    :param sheets: stylesheet names that should be linked from each file
    :param remove: when True, first strip all existing <link> tags whose
                   type is *mtype* (or missing/empty, which is treated as
                   *mtype*)
    :param mtype: the media type used to recognise and create stylesheet
                  links
    :return: the set of names whose markup was changed
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML
    changed_names = set()
    snames = set(sheets)
    lp = XPath('//h:link[@href]')
    hp = XPath('//h:head')
    for name in names:
        root = container.parsed(name)
        if remove:
            # Strip existing stylesheet links; a missing or empty type
            # attribute counts as mtype.
            for link in lp(root):
                if (link.get('type', mtype) or mtype) == mtype:
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)
        # Stylesheets this file already links to (after any removal above)
        existing = {container.href_to_name(l.get('href'), name) for l in lp(root) if (l.get('type', mtype) or mtype) == mtype}
        extra = snames - existing
        if extra:
            changed_names.add(name)
            try:
                parent = hp(root)[0]
            except (TypeError, IndexError):
                # No <head> element; create one as the first child of root
                parent = root.makeelement(XHTML('head'))
                container.insert_into_xml(root, parent, index=0)
            # Iterate over *sheets* rather than *extra* so the links are
            # inserted in the caller's requested order.
            for sheet in sheets:
                if sheet in extra:
                    container.insert_into_xml(
                        parent, parent.makeelement(XHTML('link'), rel='stylesheet', type=mtype,
                                                   href=container.name_to_href(sheet, name)))
            container.dirty(name)

    return changed_names
|
|
|
|
|
|
def lead_text(top_elem, num_words=10):
    ''' Return the leading text contained in top_elem (including descendants)
    up to a maximum of num_words words. More efficient than using
    etree.tostring(method='text') as it does not have to serialize the entire
    sub-tree rooted at top_elem.'''
    whitespace = re.compile(r'\s+', flags=re.UNICODE)
    collected = []

    def harvest(node, which):
        # Append the non-empty words from node.text or node.tail
        chunk = getattr(node, which)
        if chunk:
            collected.extend(tok for tok in whitespace.split(chunk) if tok)

    # Depth-first walk using an explicit stack of (element, attribute) pairs
    pending = [(top_elem, 'text')]
    while pending and len(collected) < num_words:
        node, which = pending.pop()
        harvest(node, which)
        if which == 'text':
            # The tail of top_elem itself lies outside its subtree
            if node is not top_elem:
                pending.append((node, 'tail'))
            children = [(child, 'text') for child in node.iterchildren('*')]
            pending.extend(reversed(children))
    return ' '.join(collected[:num_words])
|
|
|
|
|
|
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    """Parse CSS source with css_parser and return the parsed object.

    :param data: CSS source as text or bytes (bytes are decoded as UTF-8
                 unless *decode* is supplied)
    :param fname: href recorded on the parsed stylesheet
    :param is_declaration: parse as a bare style declaration rather than a
                           full stylesheet
    :param decode: optional callable used to decode bytes input
    :param log_level: level for the css_parser log (defaults to WARNING)
    :param css_preprocessor: optional callable applied to the text before
                             parsing
    """
    from css_parser import CSSParser, log
    from ebook_converter.ebooks.oeb.base import _css_logger
    if log_level is None:
        import logging
        log_level = logging.WARNING
    log.setLevel(log_level)
    log.raiseExceptions = False
    data = data or ''
    if isinstance(data, bytes):
        data = decode(data) if decode is not None else data.decode('utf-8')
    if css_preprocessor is not None:
        data = css_preprocessor(data)
    # @import rules are deliberately never fetched
    parser = CSSParser(loglevel=log_level,
                       fetcher=lambda x: (None, None), log=_css_logger)
    if is_declaration:
        return parser.parseStyle(data, validate=False)
    return parser.parseString(data, href=fname, validate=False)
|
|
|
|
|
|
def handle_entities(text, func):
    """Resolve HTML entities in *text*, then apply *func* to the result."""
    resolved = replace_entities(text)
    return func(resolved)
|
|
|
|
|
|
def apply_func_to_match_groups(match, func=str.upper, handle_entities=handle_entities):
    '''Apply the specified function to individual groups in the match object (the result of re.search() or
    the whole match if no groups were defined). Returns the replaced string.

    NOTE(review): the original default was ``icu_upper``, a name that is
    neither defined nor imported anywhere in this module (in calibre it is
    injected into builtins at startup) and so raised NameError at import
    time here; ``str.upper`` is the closest stdlib equivalent — confirm.
    '''
    found_groups = False
    i = 0
    parts, pos = [], match.start()

    def f(text):
        # Resolve HTML entities before applying the transformation
        return handle_entities(text, func)

    while True:
        i += 1
        try:
            start, end = match.span(i)
        except IndexError:
            # No more groups
            break
        found_groups = True
        if start > -1:  # span is (-1, -1) for groups that did not participate
            # Untransformed text between the previous group (or match
            # start) and this group, then the transformed group itself
            parts.append(match.string[pos:start])
            parts.append(f(match.string[start:end]))
            pos = end
    if not found_groups:
        return f(match.group())
    # Trailing untransformed text after the last group
    parts.append(match.string[pos:match.end()])
    return ''.join(parts)
|
|
|
|
|
|
def apply_func_to_html_text(match, func=str.upper, handle_entities=handle_entities):
    ''' Apply the specified function only to text between HTML tag definitions.

    NOTE(review): the original default was ``icu_upper``, a name that is
    neither defined nor imported anywhere in this module (in calibre it is
    injected into builtins at startup) and so raised NameError at import
    time here; ``str.upper`` is the closest stdlib equivalent — confirm.
    '''
    def f(text):
        # Resolve HTML entities before applying the transformation
        return handle_entities(text, func)
    # re.split with a capturing group keeps the tag chunks in the result,
    # alternating text, tag, text, tag, ...
    parts = re.split(r'(<[^>]+>)', match.group())
    return ''.join(x if x.startswith('<') else f(x) for x in parts)
|
|
|
|
|
|
def extract(elem):
    ''' Remove an element from the tree, keeping elem.tail '''
    parent = elem.getparent()
    if parent is None:
        # Not attached to a tree; nothing to do
        return
    idx = parent.index(elem)
    parent.remove(elem)
    if not elem.tail:
        return
    # Re-attach the tail text so no document text is lost
    if idx > 0:
        prev = parent[idx - 1]
        prev.tail = (prev.tail or '') + elem.tail
    else:
        parent.text = (parent.text or '') + elem.tail
|