Initial import

2026-04-23 22:51:30 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,235 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+from polyglot.builtins import itervalues, range
+
+NBSP = '\xa0'
+
+
+def mergeable(previous, current):
+    if previous.tail or current.tail:
+        return False
+    if previous.get('class', None) != current.get('class', None):
+        return False
+    if current.get('id', False):
+        return False
+    for attr in ('style', 'lang', 'dir'):
+        if previous.get(attr) != current.get(attr):
+            return False
+    try:
+        return next(previous.itersiblings()) is current
+    except StopIteration:
+        return False
+
+
+def append_text(parent, text):
+    if len(parent) > 0:
+        parent[-1].tail = (parent[-1].tail or '') + text
+    else:
+        parent.text = (parent.text or '') + text
+
+
+def merge(parent, span):
+    if span.text:
+        append_text(parent, span.text)
+    for child in span:
+        parent.append(child)
+    if span.tail:
+        append_text(parent, span.tail)
+    span.getparent().remove(span)
+
+
+def merge_run(run):
+    parent = run[0]
+    for span in run[1:]:
+        merge(parent, span)
+
+
+def liftable(css):
+    # A <span> is liftable if all its styling would work just as well if it is
+    # specified on the parent element.
+    prefixes = {x.partition('-')[0] for x in css}
+    return not (prefixes - {'text', 'font', 'letter', 'color', 'background'})
+
+
+def add_text(elem, attr, text):
+    old = getattr(elem, attr) or ''
+    setattr(elem, attr, old + text)
+
+
+def lift(span):
+    # Replace an element by its content (text, children and tail)
+    parent = span.getparent()
+    idx = parent.index(span)
+    try:
+        last_child = span[-1]
+    except IndexError:
+        last_child = None
+
+    if span.text:
+        if idx == 0:
+            add_text(parent, 'text', span.text)
+        else:
+            add_text(parent[idx - 1], 'tail', span.text)
+
+    for child in reversed(span):
+        parent.insert(idx, child)
+    parent.remove(span)
+
+    if span.tail:
+        if last_child is None:
+            if idx == 0:
+                add_text(parent, 'text', span.tail)
+            else:
+                add_text(parent[idx - 1], 'tail', span.tail)
+        else:
+            add_text(last_child, 'tail', span.tail)
+
+
+def before_count(root, tag, limit=10):
+    body = root.xpath('//body[1]')
+    if not body:
+        return limit
+    ans = 0
+    for elem in body[0].iterdescendants():
+        if elem is tag:
+            return ans
+        ans += 1
+        if ans > limit:
+            return limit
+
+
+def wrap_contents(tag_name, elem):
+    wrapper = elem.makeelement(tag_name)
+    wrapper.text, elem.text = elem.text, ''
+    for child in elem:
+        elem.remove(child)
+        wrapper.append(child)
+    elem.append(wrapper)
+
+
+def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
+    # Apply vertical-align
+    for span in root.xpath('//span[@data-docx-vert]'):
+        wrap_contents(span.attrib.pop('data-docx-vert'), span)
+
+    # Move <hr>s outside paragraphs, if possible.
+    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
+    for hr in root.xpath('//span/hr'):
+        p = pancestor(hr)
+        if p:
+            p = p[0]
+            descendants = tuple(p.iterdescendants())
+            if descendants[-1] is hr:
+                parent = p.getparent()
+                idx = parent.index(p)
+                parent.insert(idx+1, hr)
+                hr.tail = '\n\t'
+
+    # Merge consecutive spans that have the same styling
+    current_run = []
+    for span in root.xpath('//span'):
+        if not current_run:
+            current_run.append(span)
+        else:
+            last = current_run[-1]
+            if mergeable(last, span):
+                current_run.append(span)
+            else:
+                if len(current_run) > 1:
+                    merge_run(current_run)
+                current_run = [span]
+
+    # Process dir attributes
+    class_map = dict(itervalues(styles.classes))
+    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
+    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
+        # Ensure that children of rtl parents that are not rtl have an
+        # explicit dir set. Also, remove dir from children if it is the same as
+        # that of the parent.
+        if len(parent):
+            parent_dir = parent.get('dir')
+            for child in parent.iterchildren('span'):
+                child_dir = child.get('dir')
+                if parent_dir == 'rtl' and child_dir != 'rtl':
+                    child_dir = 'ltr'
+                    child.set('dir', child_dir)
+                if child_dir and child_dir == parent_dir:
+                    child.attrib.pop('dir')
+
+    # Remove unnecessary span tags that are the only child of a parent block
+    # element
+    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
+        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
+            # We have a block whose contents are entirely enclosed in a <span>
+            span = parent[0]
+            span_class = span.get('class', None)
+            span_css = class_map.get(span_class, {})
+            span_dir = span.get('dir')
+            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
+                pclass = parent.get('class', None)
+                if span_class:
+                    pclass = (pclass + ' ' + span_class) if pclass else span_class
+                    parent.set('class', pclass)
+                parent.text = span.text
+                parent.remove(span)
+                if span.get('lang'):
+                    parent.set('lang', span.get('lang'))
+                if span.get('dir'):
+                    parent.set('dir', span.get('dir'))
+                for child in span:
+                    parent.append(child)
+
+    # Make spans whose only styling is bold or italic into <b> and <i> tags
+    for span in root.xpath('//span[@class and not(@style)]'):
+        css = class_map.get(span.get('class', None), {})
+        if len(css) == 1:
+            if css == {'font-style':'italic'}:
+                span.tag = 'i'
+                del span.attrib['class']
+            elif css == {'font-weight':'bold'}:
+                span.tag = 'b'
+                del span.attrib['class']
+
+    # Get rid of <span>s that have no styling
+    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
+        lift(span)
+
+    # Convert <p><br style="page-break-after:always"> </p> style page breaks
+    # into something the viewer will render as a page break
+    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
+        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
+            p.remove(p[0])
+            prefix = p.get('style', '')
+            if prefix:
+                prefix += '; '
+            p.set('style', prefix + 'page-break-after:always')
+            p.text = NBSP if not p.text else p.text
+
+    if detect_cover:
+        # Check if the first image in the document is possibly a cover
+        img = root.xpath('//img[@src][1]')
+        if img:
+            img = img[0]
+            path = os.path.join(dest_dir, img.get('src'))
+            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
+                from calibre.utils.imghdr import identify
+                try:
+                    with lopen(path, 'rb') as imf:
+                        fmt, width, height = identify(imf)
+                except:
+                    width, height, fmt = 0, 0, None  # noqa
+                del fmt
+                try:
+                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
+                except ZeroDivisionError:
+                    is_cover = False
+                if is_cover:
+                    log.debug('Detected an image that looks like a cover')
+                    img.getparent().remove(img)
+                    return path