mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-24 15:11:30 +02:00
Initial import
This commit is contained in:
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re, os
|
||||
from bisect import bisect
|
||||
|
||||
from calibre import guess_type as _guess_type, replace_entities
|
||||
from polyglot.builtins import filter
|
||||
|
||||
|
||||
def guess_type(x):
    ''' Guess the MIME type for the file name x, falling back to
    application/octet-stream when no type can be determined. '''
    mime_type = _guess_type(x)[0]
    return mime_type if mime_type else 'application/octet-stream'
|
||||
|
||||
|
||||
def setup_css_parser_serialization(tab_width=2):
    ''' Configure the global css_parser serializer preferences: indent rules
    with tab_width spaces, leave closing braces unindented, and always emit
    the final semicolon of a declaration block. '''
    import css_parser
    serializer_prefs = css_parser.ser.prefs
    serializer_prefs.indent = ' ' * tab_width
    serializer_prefs.indentClosingBrace = False
    serializer_prefs.omitLastSemicolon = False
|
||||
|
||||
|
||||
def actual_case_for_name(container, name):
    ''' Return name with each path component replaced by the case it actually
    has on disk.  Raises ValueError if name does not exist in the container,
    and RuntimeError if a component cannot be matched to a directory entry.

    :param container: object providing exists() and name_to_abspath()
    :param name: a '/'-separated container-internal path
    '''
    from calibre.utils.filenames import samefile
    if not container.exists(name):
        raise ValueError('Cannot get actual case for %s as it does not exist' % name)
    ans = []
    for x in name.split('/'):
        base = '/'.join(ans + [x])
        path = container.name_to_abspath(base)
        pdir = os.path.dirname(path)
        # Compare against bare directory-entry names.  The previous code
        # built a set of ABSOLUTE paths and tested the bare component x
        # against it, so this fast path could never match and every
        # component fell through to the samefile() scan below.
        candidates = set(os.listdir(pdir))
        if x in candidates:
            # Exact-case entry exists on disk, nothing to correct.
            correctx = x
        else:
            # Case differs (e.g. case-insensitive filesystem): find the
            # directory entry that refers to the same file.
            for q in candidates:
                if samefile(os.path.join(pdir, q), path):
                    correctx = q
                    break
            else:
                raise RuntimeError('Something bad happened')
        ans.append(correctx)
    return '/'.join(ans)
|
||||
|
||||
|
||||
def corrected_case_for_name(container, name):
    ''' Return name with the case of each path component corrected to match
    an entry that actually exists on disk (matching case-insensitively), or
    None when some component has no such match.

    :param container: object providing exists() and name_to_abspath()
    :param name: a '/'-separated container-internal path
    :return: the corrected '/'-separated name, or None
    '''
    ans = []
    for x in name.split('/'):
        base = '/'.join(ans + [x])
        if container.exists(base):
            # The component already matches an existing entry as-is.
            correctx = x
        else:
            try:
                # set() directly instead of the redundant set comprehension
                candidates = set(os.listdir(os.path.dirname(container.name_to_abspath(base))))
            except EnvironmentError:
                return None  # one of the non-terminal components of name is a file instead of a directory
            for q in candidates:
                if q.lower() == x.lower():
                    correctx = q
                    break
            else:
                return None
        ans.append(correctx)
    return '/'.join(ans)
|
||||
|
||||
|
||||
class PositionFinder(object):
    ''' Map a character offset in a string (or bytes) to a 1-based
    (line_number, column_offset) pair. '''

    def __init__(self, raw):
        # Keep the pattern type consistent with the input type.
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        # Offsets of the first character of every line after the first.
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        ''' Return (line, offset-within-line) for character position pos. '''
        lnum = bisect(self.new_lines, pos)
        if lnum == 0:
            # pos is on the first line, so its column is pos itself.  The
            # previous code indexed self.new_lines[lnum - 1] here, which for
            # lnum == 0 silently read the LAST entry ([-1]) and produced a
            # wrong column whenever the text contained any newline; the old
            # IndexError fallback only triggered for newline-free input.
            offset = pos
        else:
            offset = pos - self.new_lines[lnum - 1]
        return (lnum + 1, offset)
|
||||
|
||||
|
||||
class CommentFinder(object):
    ''' Record the spans of all comments (C-style block comments by default)
    in a string, so that later queries can tell whether a given offset lies
    inside a comment. '''

    def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
        spans = [m.span() for m in re.finditer(pat, raw)]
        self.starts = [s for s, e in spans]
        self.ends = [e for s, e in spans]

    def __call__(self, offset):
        ''' Return True if offset falls within any recorded comment span. '''
        if not self.starts:
            return False
        i = bisect(self.starts, offset) - 1
        return i >= 0 and self.starts[i] <= offset <= self.ends[i]
|
||||
|
||||
|
||||
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    ''' Ensure every HTML file in names links to every stylesheet in sheets,
    optionally removing all existing CSS <link> tags first.  Returns the set
    of names whose markup was changed. '''
    from calibre.ebooks.oeb.base import XPath, XHTML
    modified = set()
    wanted = set(sheets)
    find_links = XPath('//h:link[@href]')
    find_head = XPath('//h:head')

    def is_css_link(link):
        # A missing or empty type attribute counts as mtype.
        return (link.get('type', mtype) or mtype) == mtype

    for name in names:
        root = container.parsed(name)
        if remove:
            for link in find_links(root):
                if is_css_link(link):
                    container.remove_from_xml(link)
                    modified.add(name)
                    container.dirty(name)
        linked = {container.href_to_name(l.get('href'), name)
                  for l in find_links(root) if is_css_link(l)}
        missing = wanted - linked
        if missing:
            modified.add(name)
            try:
                head = find_head(root)[0]
            except (TypeError, IndexError):
                # No <head>: create one at the top of the document.
                head = root.makeelement(XHTML('head'))
                container.insert_into_xml(root, head, index=0)
            # Insert in the caller-specified sheet order.
            for sheet in sheets:
                if sheet in missing:
                    link = head.makeelement(
                        XHTML('link'), rel='stylesheet', type=mtype,
                        href=container.name_to_href(sheet, name))
                    container.insert_into_xml(head, link)
            container.dirty(name)

    return modified
|
||||
|
||||
|
||||
def lead_text(top_elem, num_words=10):
    ''' Return the leading text contained in top_elem (including descendants)
    up to a maximum of num_words words. More efficient than using
    etree.tostring(method='text') as it does not have to serialize the entire
    sub-tree rooted at top_elem.'''
    whitespace = re.compile(r'\s+', flags=re.UNICODE)
    collected = []

    def collect(node, attr='text'):
        # Split the node's text/tail on whitespace, dropping empty tokens.
        chunk = getattr(node, attr)
        if chunk:
            collected.extend(tok for tok in whitespace.split(chunk) if tok)

    pending = [(top_elem, 'text')]
    while pending and len(collected) < num_words:
        node, attr = pending.pop()
        collect(node, attr)
        if attr == 'text':
            # Tail text belongs to the parent's flow, so skip it for the root.
            if node is not top_elem:
                pending.append((node, 'tail'))
            # Reverse so children are popped in document order.
            pending.extend(reversed([(child, 'text') for child in node.iterchildren('*')]))
    return ' '.join(collected[:num_words])
|
||||
|
||||
|
||||
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    ''' Parse CSS source into a css_parser stylesheet object (or a style
    declaration when is_declaration is True).

    :param data: CSS source, str or bytes (empty/None treated as '')
    :param fname: href recorded on the parsed stylesheet
    :param is_declaration: parse as a bare property-declaration block
    :param decode: optional callable to decode bytes input; utf-8 otherwise
    :param log_level: level for the css_parser log (defaults to WARNING)
    :param css_preprocessor: optional callable applied to the text first
    '''
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from css_parser import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False
    css = data or ''
    if isinstance(css, bytes):
        css = css.decode('utf-8') if decode is None else decode(css)
    if css_preprocessor is not None:
        css = css_preprocessor(css)
    # The no-op fetcher means we dont care about @import rules.
    parser = CSSParser(loglevel=log_level, fetcher=lambda url: (None, None),
                       log=_css_logger)
    if is_declaration:
        return parser.parseStyle(css, validate=False)
    return parser.parseString(css, href=fname, validate=False)
|
||||
|
||||
|
||||
def handle_entities(text, func):
    ''' Decode HTML entities in text, then apply func to the result. '''
    unescaped = replace_entities(text)
    return func(unescaped)
|
||||
|
||||
|
||||
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
    '''Apply the specified function to individual groups in the match object
    (the result of re.search()) or to the whole match if no groups were
    defined. Returns the replaced string.'''
    transform = lambda text: handle_entities(text, func)
    pieces = []
    cursor = match.start()
    have_groups = False
    group_num = 0
    while True:
        group_num += 1
        try:
            gstart, gend = match.span(group_num)
        except IndexError:
            # No more groups in the pattern.
            break
        have_groups = True
        if gstart > -1:
            # Keep the text before the group verbatim, transform the group.
            pieces.append(match.string[cursor:gstart])
            pieces.append(transform(match.string[gstart:gend]))
            cursor = gend
    if not have_groups:
        return transform(match.group())
    pieces.append(match.string[cursor:match.end()])
    return ''.join(pieces)
|
||||
|
||||
|
||||
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
    ''' Apply the specified function only to text between HTML tag definitions. '''
    transform = lambda text: handle_entities(text, func)
    pieces = re.split(r'(<[^>]+>)', match.group())
    # Odd-indexed pieces are the captured tags: pass them through untouched.
    return ''.join(p if p.startswith('<') else transform(p) for p in pieces)
|
||||
|
||||
|
||||
def extract(elem):
    ''' Remove an element from the tree, keeping elem.tail '''
    parent = elem.getparent()
    if parent is None:
        return
    pos = parent.index(elem)
    parent.remove(elem)
    tail = elem.tail
    if tail:
        if pos > 0:
            # Attach the tail to the preceding sibling's tail.
            prev = parent[pos - 1]
            prev.tail = (prev.tail or '') + tail
        else:
            # No preceding sibling: the tail becomes part of parent.text.
            parent.text = (parent.text or '') + tail
|
||||
Reference in New Issue
Block a user