mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-19 12:43:35 +02:00
Initial import
This commit is contained in:
343
ebook_converter/ebooks/docx/images.py
Normal file
343
ebook_converter/ebooks/docx/images.py
Normal file
@@ -0,0 +1,343 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os
|
||||
|
||||
from lxml.html.builder import IMG, HR
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ebooks.docx.names import barename
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.img import resize_to_fit, image_to_data
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import iteritems, itervalues
|
||||
|
||||
|
||||
class LinkedImageNotFound(ValueError):
    """Raised when the data for an image referenced by the document cannot
    be read.

    The name of the missing image is passed on to ``ValueError`` and is also
    available as the ``fname`` attribute.
    """

    def __init__(self, fname):
        super(LinkedImageNotFound, self).__init__(fname)
        self.fname = fname
|
||||
|
||||
|
||||
def image_filename(x):
    """Return ``x`` passed through ``ascii_filename()`` with every space and
    ``#`` character replaced by an underscore."""
    cleaned = ascii_filename(x)
    for unwanted in (' ', '#'):
        cleaned = cleaned.replace(unwanted, '_')
    return cleaned
|
||||
|
||||
|
||||
def emu_to_pt(x):
    """Convert a length in EMU to points (12700 EMU make up one point)."""
    emu_per_pt = 12700
    return x / emu_per_pt
|
||||
|
||||
|
||||
def pt_to_emu(x):
    """Convert a length in points to EMU, truncated to an integer."""
    scaled = x * 12700
    return int(scaled)
|
||||
|
||||
|
||||
def get_image_properties(parent, XPath, get):
    """Collect size, visibility, alt text and title for an image container.

    :param parent: element whose ``wp:extent`` and ``wp:docPr`` children are
        inspected.
    :param XPath: callable that takes an expression and returns a callable
        which evaluates it against an element.
    :param get: accepted for signature compatibility; not used here.
    :return: ``(style, alt, title)`` where ``style`` is a dict of CSS
        properties (``width``/``height`` in pt formatted with three
        significant digits, ``display: none`` for hidden images) and
        ``alt``/``title`` are strings or None.
    """
    size = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for css_name, attr in (('width', 'cx'), ('height', 'cy')):
            try:
                size[css_name] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # Missing or non-numeric attribute: keep whatever we had
                pass

    ans = {}
    for css_name in ('width', 'height'):
        if size[css_name] is not None:
            ans[css_name] = '%.3gpt' % size[css_name]

    alt = title = None
    for doc_pr in XPath('./wp:docPr')(parent):
        descr = doc_pr.get('descr')
        if descr:
            alt = descr
        doc_title = doc_pr.get('title')
        if doc_title:
            title = doc_title
        if doc_pr.get('hidden', None) in ('true', 'on', '1'):
            ans['display'] = 'none'

    return ans, alt, title
|
||||
|
||||
|
||||
def get_image_margins(elem):
    """Translate the ``distL``/``distT``/``distR``/``distB`` attributes of
    ``elem`` into CSS padding declarations.

    :param elem: element (anything with a dict-like ``get(name, default)``)
        carrying the distance attributes, expressed in EMU.
    :return: dict mapping ``padding-left``/``-top``/``-right``/``-bottom`` to
        a length in pt with three significant digits. Attributes that are
        missing or not integers are left out.
    """
    ans = {}
    sides = (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom'))
    for suffix, side in sides:
        raw = elem.get('dist%s' % suffix, None)
        if raw is None:
            continue
        try:
            # Attribute values are text; they have to be converted to a
            # number before scaling, otherwise the division in emu_to_pt()
            # raises TypeError and every margin is silently discarded.
            emu = int(raw)
        except (TypeError, ValueError):
            continue
        ans['padding-%s' % side] = '%.3gpt' % emu_to_pt(emu)
    return ans
|
||||
|
||||
|
||||
def get_hpos(anchor, page_width, XPath, get, width_frac):
    """Estimate the horizontal position of an anchored image as a fraction
    of the page width.

    :param anchor: element whose ``wp:positionH`` and ``wp:simplePos``
        children are inspected.
    :param page_width: width, in pt, that offsets are divided by.
    :param XPath: callable that takes an expression and returns a callable
        which evaluates it against an element.
    :param get: accepted for signature compatibility; not used here.
    :param width_frac: value added to every position except a page-relative
        alignment (the caller passes half the image width as a fraction of
        the page width).
    :return: a number; 0 when no usable position information is found.
    """
    almap = {'left': 0, 'center': 0.5, 'right': 1}
    for ph in XPath('./wp:positionH')(anchor):
        rp = ph.get('relativeFrom', None)
        if rp == 'leftMargin':
            return 0 + width_frac
        if rp == 'rightMargin':
            return 1 + width_frac
        for align in XPath('./wp:align')(ph):
            al = almap.get(align.text)
            if al is not None:
                if rp == 'page':
                    return al
                return al + width_frac
        for po in XPath('./wp:posOffset')(ph):
            try:
                offset = int(po.text)
            except (TypeError, ValueError):
                continue
            return emu_to_pt(offset) / page_width + width_frac

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # The attribute is text: convert it before scaling, as is done
            # for posOffset above. Without int() the division in emu_to_pt()
            # raises TypeError and this fallback can never produce a value.
            x = int(sp.get('x', None))
        except (TypeError, ValueError):
            continue
        return emu_to_pt(x) / page_width + width_frac

    return 0
|
||||
|
||||
|
||||
class Images(object):
    """Copy the images a DOCX document refers to and build HTML tags for
    them.

    An instance is called with a mapping of relationship ids to image names
    (stored as ``rid_map``). ``to_html()`` then converts a drawing or pict
    element into ``<img>``/``<hr>`` tags, writing the image data into the
    ``images`` sub-directory of the destination directory.
    """

    def __init__(self, namespace, log):
        """
        :param namespace: object providing the ``XPath`` and ``get`` helpers
            used to query the document tree.
        :param log: logger; it is called directly and through its
            ``warn()``/``exception()`` methods.
        """
        self.namespace = namespace
        self.rid_map = {}
        # (image name, max_width, max_height) -> name of the written file
        self.used = {}
        self.resized = {}
        self.names = set()
        # 'images/<name>' for every file written by generate_filename()
        self.all_images = set()
        # (img element, link description, rid_map) for hyperlinked images
        self.links = []
        self.log = log
        # Both are set by to_html() before any image is read or written
        self.dest_dir = None
        self.docx = None

    def __call__(self, relationships_by_id):
        """Install ``relationships_by_id`` as the current ``rid_map``."""
        self.rid_map = relationships_by_id

    def read_image_data(self, fname, base=None):
        """Read the raw bytes of an image and work out a file name for it.

        :param fname: ``file://`` names are read from the local filesystem,
            anything else is read through ``self.docx.read()``.
        :param base: preferred file name; derived from ``fname`` when empty.
        :return: ``(raw, name)`` where ``name`` is ``<stem>.<ext>``.
        :raises LinkedImageNotFound: if the image cannot be found.
        """
        if fname.startswith('file://'):
            src = fname[len('file://'):]
            if iswindows and src and src[0] == '/':
                src = src[1:]
            if not src or not os.path.exists(src):
                raise LinkedImageNotFound(src)
            with open(src, 'rb') as rawsrc:
                raw = rawsrc.read()
        else:
            try:
                raw = self.docx.read(fname)
            except KeyError:
                raise LinkedImageNotFound(fname)

        base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
        stem, dot, suffix = base.rpartition('.')
        # Without a '.', rpartition() puts the whole name in the last slot;
        # that must not be mistaken for a file extension.
        ext = what(None, raw) or (suffix if dot else '') or 'jpeg'
        if ext == 'emf':
            # For an example, see: https://bugs.launchpad.net/bugs/1224849
            self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
            from calibre.utils.wmf.emf import emf_unwrap
            try:
                raw = emf_unwrap(raw)
            except Exception:
                self.log.exception('Failed to extract embedded raster image from EMF')
            else:
                ext = 'png'
        base = (stem or 'image') + '.' + ext
        return raw, base

    def unique_name(self, base):
        """Return ``base``, or ``<stem>-<n>.<ext>`` with the smallest
        ``n >= 1`` that is not already a value of ``self.used``."""
        exists = frozenset(self.used.values())
        stem, _, ext = base.rpartition('.')
        name = base
        c = 1
        while name in exists:
            name = '%s-%d.%s' % (stem, c, ext)
            c += 1
        return name

    def resize_image(self, raw, base, max_width, max_height):
        """Fit the image into ``max_width`` x ``max_height``.

        :return: ``(raw, base, resized)``. When ``resize_to_fit()`` reports a
            resize, ``raw`` is the re-encoded image and ``base`` gains a
            ``-<width>x<height>`` suffix before the extension; otherwise both
            are returned unchanged.
        """
        resized, img = resize_to_fit(raw, max_width, max_height)
        if resized:
            base, ext = os.path.splitext(base)
            base = base + '-%dx%d%s' % (max_width, max_height, ext)
            raw = image_to_data(img, fmt=ext[1:])
        return raw, base, resized

    def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
        """Write the image behind ``rid`` to ``self.dest_dir`` and return the
        name of the written file.

        Results are cached in ``self.used`` per (image, max_width,
        max_height), so every variant is written only once.

        :param rid: relationship id, looked up in ``rid_map``.
        :param base: preferred file name, see ``read_image_data()``.
        :param rid_map: mapping to use instead of ``self.rid_map``.
        :param max_width: maximum width; a resize is only attempted when
            ``max_height`` is given as well.
        :param max_height: maximum height, see ``max_width``.
        :raises LinkedImageNotFound: if the image data cannot be read.
        """
        rid_map = self.rid_map if rid_map is None else rid_map
        fname = rid_map[rid]
        key = (fname, max_width, max_height)
        ans = self.used.get(key)
        if ans is not None:
            return ans
        raw, base = self.read_image_data(fname, base=base)
        resized = False
        if max_width is not None and max_height is not None:
            raw, base, resized = self.resize_image(raw, base, max_width, max_height)
        # A size limit that needed no resize is the same file as the
        # unconstrained image, so both cache keys must share one name.
        same_as_original = max_width is not None and max_height is not None and not resized
        okey = (fname, None, None)
        if same_as_original and okey in self.used:
            # Cache the name of the file that exists on disk, not a fresh
            # unique name for which nothing would ever be written.
            ans = self.used[key] = self.used[okey]
            return ans
        name = self.unique_name(base)
        self.used[key] = name
        if same_as_original:
            self.used[okey] = name
        with open(os.path.join(self.dest_dir, name), 'wb') as f:
            f.write(raw)
        self.all_images.add('images/' + name)
        return name

    def pic_to_img(self, pic, alt, parent, title):
        """Build an ``<img>`` element for ``pic``.

        :param pic: picture element to convert.
        :param alt: fallback alt text; a ``descr`` attribute on the picture
            properties takes precedence and ``'Image'`` is the last resort.
        :param parent: element searched for a hyperlink on the image.
        :param title: value for the ``title`` attribute, skipped when empty.
        :return: the ``<img>`` element for the first image that could be
            written, or None. Hyperlinked images are also recorded in
            ``self.links``.
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        name = None
        link = None
        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
            link = {'id': get(hl, 'r:id')}
            tgt = hl.get('tgtFrame', None)
            if tgt:
                link['target'] = tgt
            # Kept in its own variable so the link tooltip does not replace
            # (or erase) the title of the image itself.
            tooltip = hl.get('tooltip', None)
            if tooltip:
                link['title'] = tooltip

        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = image_filename(name)
            alt = pr.get('descr') or alt
            for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
                rid = get(a, 'r:embed')
                if not rid:
                    rid = get(a, 'r:link')
                if rid and rid in self.rid_map:
                    try:
                        src = self.generate_filename(rid, name)
                    except LinkedImageNotFound as err:
                        self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                        continue
                    img = IMG(src='images/%s' % src)
                    img.set('alt', alt or 'Image')
                    if title:
                        img.set('title', title)
                    if link is not None:
                        self.links.append((img, link, self.rid_map))
                    return img

    def drawing_to_html(self, drawing, page):
        """Yield an ``<img>`` element for every picture in ``drawing``:
        inline pictures first, then anchored (floating) ones."""
        XPath, get = self.namespace.XPath, self.namespace.get
        # First process the inline pictures
        for inline in XPath('./wp:inline')(drawing):
            style, alt, title = get_image_properties(inline, XPath, get)
            for pic in XPath('descendant::pic:pic')(inline):
                ans = self.pic_to_img(pic, alt, inline, title)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.items()))
                    yield ans

        # Now process the floats
        for anchor in XPath('./wp:anchor')(drawing):
            style, alt, title = get_image_properties(anchor, XPath, get)
            self.get_float_properties(anchor, style, page)
            for pic in XPath('descendant::pic:pic')(anchor):
                ans = self.pic_to_img(pic, alt, anchor, title)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.items()))
                    yield ans

    def pict_to_html(self, pict, page):
        """Yield an ``<hr>`` if ``pict`` consists of a single horizontal
        rule, followed by an ``<img>`` for every ``v:imagedata`` descendant
        whose relationship id is known."""
        XPath, get = self.namespace.XPath, self.namespace.get
        # First see if we have an <hr>
        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
        if is_hr:
            style = {}
            hr = HR()
            try:
                pct = float(get(pict[0], 'o:hrpct'))
            except (ValueError, TypeError, AttributeError):
                pass
            else:
                if pct > 0:
                    style['width'] = '%.3g%%' % pct
            align = get(pict[0], 'o:hralign', 'center')
            if align in {'left', 'right'}:
                style['margin-left'] = '0' if align == 'left' else 'auto'
                style['margin-right'] = 'auto' if align == 'left' else '0'
            if style:
                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in style.items())))
            yield hr

        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
            rid = get(imagedata, 'r:id')
            if rid in self.rid_map:
                try:
                    src = self.generate_filename(rid)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src, style="display:block")
                alt = get(imagedata, 'o:title')
                img.set('alt', alt or 'Image')
                yield img

    def get_float_properties(self, anchor, style, page):
        """Add display, float/margin and padding declarations for an
        anchored image to ``style``, in place.

        :param anchor: the anchor element of the image.
        :param style: dict of CSS properties; a ``width`` entry, if present,
            is expected to be a number followed by a two character unit.
        :param page: object with ``width``, ``margin_left`` and
            ``margin_right`` attributes.
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        if 'display' not in style:
            style['display'] = 'block'
        padding = get_image_margins(anchor)
        # Strip the two character unit suffix from the CSS length
        width = float(style.get('width', '100pt')[:-2])

        page_width = page.width - page.margin_left - page.margin_right
        if page_width <= 0:
            # Ignore margins
            page_width = page.width

        hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))

        wrap_elem = None
        dofloat = False

        # The last wrap* child of the anchor decides how text flows
        for child in reversed(anchor):
            bt = barename(child.tag)
            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
                wrap_elem = child
                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
                break

        if wrap_elem is not None:
            padding.update(get_image_margins(wrap_elem))
            wt = wrap_elem.get('wrapText', None)
            # An explicit wrap side overrides the computed position: text
            # on the right puts the image at the left edge and vice versa
            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
            if dofloat:
                style['float'] = 'left' if hpos < 0.65 else 'right'
            else:
                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
                if ml is not None:
                    style['margin-left'] = ml
                if mr is not None:
                    style['margin-right'] = mr

        style.update(padding)

    def to_html(self, elem, page, docx, dest_dir):
        """Yield the HTML elements for ``elem``.

        Creates ``<dest_dir>/images`` if needed and remembers it, together
        with ``docx``, for reading and writing image data. Elements whose
        tag ends in ``}drawing`` go through ``drawing_to_html()``, all others
        through ``pict_to_html()``.
        """
        dest = os.path.join(dest_dir, 'images')
        if not os.path.exists(dest):
            os.mkdir(dest)
        self.dest_dir, self.docx = dest, docx
        if elem.tag.endswith('}drawing'):
            for tag in self.drawing_to_html(elem, page):
                yield tag
        else:
            for tag in self.pict_to_html(elem, page):
                yield tag
|
||||
Reference in New Issue
Block a user