Initial import

2026-04-23 22:51:30 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,343 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+
+from lxml.html.builder import IMG, HR
+
+from calibre.constants import iswindows
+from calibre.ebooks.docx.names import barename
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.img import resize_to_fit, image_to_data
+from calibre.utils.imghdr import what
+from polyglot.builtins import iteritems, itervalues
+
+
+class LinkedImageNotFound(ValueError):
+
+    def __init__(self, fname):
+        ValueError.__init__(self, fname)
+        self.fname = fname
+
+
+def image_filename(x):
+    return ascii_filename(x).replace(' ', '_').replace('#', '_')
+
+
+def emu_to_pt(x):
+    return x / 12700
+
+
+def pt_to_emu(x):
+    return int(x * 12700)
+
+
+def get_image_properties(parent, XPath, get):
+    width = height = None
+    for extent in XPath('./wp:extent')(parent):
+        try:
+            width = emu_to_pt(int(extent.get('cx')))
+        except (TypeError, ValueError):
+            pass
+        try:
+            height = emu_to_pt(int(extent.get('cy')))
+        except (TypeError, ValueError):
+            pass
+    ans = {}
+    if width is not None:
+        ans['width'] = '%.3gpt' % width
+    if height is not None:
+        ans['height'] = '%.3gpt' % height
+
+    alt = None
+    title = None
+    for docPr in XPath('./wp:docPr')(parent):
+        alt = docPr.get('descr') or alt
+        title = docPr.get('title') or title
+        if docPr.get('hidden', None) in {'true', 'on', '1'}:
+            ans['display'] = 'none'
+
+    return ans, alt, title
+
+
+def get_image_margins(elem):
+    ans = {}
+    for w, css in iteritems({'L':'left', 'T':'top', 'R':'right', 'B':'bottom'}):
+        val = elem.get('dist%s' % w, None)
+        if val is not None:
+            try:
+                val = emu_to_pt(val)
+            except (TypeError, ValueError):
+                continue
+            ans['padding-%s' % css] = '%.3gpt' % val
+    return ans
+
+
+def get_hpos(anchor, page_width, XPath, get, width_frac):
+    for ph in XPath('./wp:positionH')(anchor):
+        rp = ph.get('relativeFrom', None)
+        if rp == 'leftMargin':
+            return 0 + width_frac
+        if rp == 'rightMargin':
+            return 1 + width_frac
+        al = None
+        almap = {'left':0, 'center':0.5, 'right':1}
+        for align in XPath('./wp:align')(ph):
+            al = almap.get(align.text)
+            if al is not None:
+                if rp == 'page':
+                    return al
+                return al + width_frac
+        for po in XPath('./wp:posOffset')(ph):
+            try:
+                pos = emu_to_pt(int(po.text))
+            except (TypeError, ValueError):
+                continue
+            return pos/page_width + width_frac
+
+    for sp in XPath('./wp:simplePos')(anchor):
+        try:
+            x = emu_to_pt(sp.get('x', None))
+        except (TypeError, ValueError):
+            continue
+        return x/page_width + width_frac
+
+    return 0
+
+
+class Images(object):
+
+    def __init__(self, namespace, log):
+        self.namespace = namespace
+        self.rid_map = {}
+        self.used = {}
+        self.resized = {}
+        self.names = set()
+        self.all_images = set()
+        self.links = []
+        self.log = log
+
+    def __call__(self, relationships_by_id):
+        self.rid_map = relationships_by_id
+
+    def read_image_data(self, fname, base=None):
+        if fname.startswith('file://'):
+            src = fname[len('file://'):]
+            if iswindows and src and src[0] == '/':
+                src = src[1:]
+            if not src or not os.path.exists(src):
+                raise LinkedImageNotFound(src)
+            with open(src, 'rb') as rawsrc:
+                raw = rawsrc.read()
+        else:
+            try:
+                raw = self.docx.read(fname)
+            except KeyError:
+                raise LinkedImageNotFound(fname)
+        base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
+        ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
+        if ext == 'emf':
+            # For an example, see: https://bugs.launchpad.net/bugs/1224849
+            self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
+            from calibre.utils.wmf.emf import emf_unwrap
+            try:
+                raw = emf_unwrap(raw)
+            except Exception:
+                self.log.exception('Failed to extract embedded raster image from EMF')
+            else:
+                ext = 'png'
+        base = base.rpartition('.')[0]
+        if not base:
+            base = 'image'
+        base += '.' + ext
+        return raw, base
+
+    def unique_name(self, base):
+        exists = frozenset(itervalues(self.used))
+        c = 1
+        name = base
+        while name in exists:
+            n, e = base.rpartition('.')[0::2]
+            name = '%s-%d.%s' % (n, c, e)
+            c += 1
+        return name
+
+    def resize_image(self, raw, base, max_width, max_height):
+        resized, img = resize_to_fit(raw, max_width, max_height)
+        if resized:
+            base, ext = os.path.splitext(base)
+            base = base + '-%dx%d%s' % (max_width, max_height, ext)
+            raw = image_to_data(img, fmt=ext[1:])
+        return raw, base, resized
+
+    def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
+        rid_map = self.rid_map if rid_map is None else rid_map
+        fname = rid_map[rid]
+        key = (fname, max_width, max_height)
+        ans = self.used.get(key)
+        if ans is not None:
+            return ans
+        raw, base = self.read_image_data(fname, base=base)
+        resized = False
+        if max_width is not None and max_height is not None:
+            raw, base, resized = self.resize_image(raw, base, max_width, max_height)
+        name = self.unique_name(base)
+        self.used[key] = name
+        if max_width is not None and max_height is not None and not resized:
+            okey = (fname, None, None)
+            if okey in self.used:
+                return self.used[okey]
+            self.used[okey] = name
+        with open(os.path.join(self.dest_dir, name), 'wb') as f:
+            f.write(raw)
+        self.all_images.add('images/' + name)
+        return name
+
+    def pic_to_img(self, pic, alt, parent, title):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        name = None
+        link = None
+        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
+            link = {'id':get(hl, 'r:id')}
+            tgt = hl.get('tgtFrame', None)
+            if tgt:
+                link['target'] = tgt
+            title = hl.get('tooltip', None)
+            if title:
+                link['title'] = title
+
+        for pr in XPath('descendant::pic:cNvPr')(pic):
+            name = pr.get('name', None)
+            if name:
+                name = image_filename(name)
+            alt = pr.get('descr') or alt
+            for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
+                rid = get(a, 'r:embed')
+                if not rid:
+                    rid = get(a, 'r:link')
+                if rid and rid in self.rid_map:
+                    try:
+                        src = self.generate_filename(rid, name)
+                    except LinkedImageNotFound as err:
+                        self.log.warn('Linked image: %s not found, ignoring' % err.fname)
+                        continue
+                    img = IMG(src='images/%s' % src)
+                    img.set('alt', alt or 'Image')
+                    if title:
+                        img.set('title', title)
+                    if link is not None:
+                        self.links.append((img, link, self.rid_map))
+                    return img
+
+    def drawing_to_html(self, drawing, page):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # First process the inline pictures
+        for inline in XPath('./wp:inline')(drawing):
+            style, alt, title = get_image_properties(inline, XPath, get)
+            for pic in XPath('descendant::pic:pic')(inline):
+                ans = self.pic_to_img(pic, alt, inline, title)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
+                    yield ans
+
+        # Now process the floats
+        for anchor in XPath('./wp:anchor')(drawing):
+            style, alt, title = get_image_properties(anchor, XPath, get)
+            self.get_float_properties(anchor, style, page)
+            for pic in XPath('descendant::pic:pic')(anchor):
+                ans = self.pic_to_img(pic, alt, anchor, title)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
+                    yield ans
+
+    def pict_to_html(self, pict, page):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # First see if we have an <hr>
+        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
+        if is_hr:
+            style = {}
+            hr = HR()
+            try:
+                pct = float(get(pict[0], 'o:hrpct'))
+            except (ValueError, TypeError, AttributeError):
+                pass
+            else:
+                if pct > 0:
+                    style['width'] = '%.3g%%' % pct
+            align = get(pict[0], 'o:hralign', 'center')
+            if align in {'left', 'right'}:
+                style['margin-left'] = '0' if align == 'left' else 'auto'
+                style['margin-right'] = 'auto' if align == 'left' else '0'
+            if style:
+                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in iteritems(style))))
+            yield hr
+
+        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
+            rid = get(imagedata, 'r:id')
+            if rid in self.rid_map:
+                try:
+                    src = self.generate_filename(rid)
+                except LinkedImageNotFound as err:
+                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
+                    continue
+                img = IMG(src='images/%s' % src, style="display:block")
+                alt = get(imagedata, 'o:title')
+                img.set('alt', alt or 'Image')
+                yield img
+
+    def get_float_properties(self, anchor, style, page):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        if 'display' not in style:
+            style['display'] = 'block'
+        padding = get_image_margins(anchor)
+        width = float(style.get('width', '100pt')[:-2])
+
+        page_width = page.width - page.margin_left - page.margin_right
+        if page_width <= 0:
+            # Ignore margins
+            page_width = page.width
+
+        hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))
+
+        wrap_elem = None
+        dofloat = False
+
+        for child in reversed(anchor):
+            bt = barename(child.tag)
+            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
+                wrap_elem = child
+                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
+                break
+
+        if wrap_elem is not None:
+            padding.update(get_image_margins(wrap_elem))
+            wt = wrap_elem.get('wrapText', None)
+            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
+            if dofloat:
+                style['float'] = 'left' if hpos < 0.65 else 'right'
+            else:
+                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
+                if ml is not None:
+                    style['margin-left'] = ml
+                if mr is not None:
+                    style['margin-right'] = mr
+
+        style.update(padding)
+
+    def to_html(self, elem, page, docx, dest_dir):
+        dest = os.path.join(dest_dir, 'images')
+        if not os.path.exists(dest):
+            os.mkdir(dest)
+        self.dest_dir, self.docx = dest, docx
+        if elem.tag.endswith('}drawing'):
+            for tag in self.drawing_to_html(elem, page):
+                yield tag
+        else:
+            for tag in self.pict_to_html(elem, page):
+                yield tag