# Mirror of https://github.com/gryf/ebook-converter.git
# Synced 2026-01-08 04:34:12 +01:00
# 341 lines, 12 KiB, Python
import os
|
|
|
|
from lxml.html.builder import IMG, HR
|
|
|
|
from ebook_converter.constants import iswindows
|
|
from ebook_converter.ebooks.docx.names import barename
|
|
from ebook_converter.utils.filenames import ascii_filename
|
|
from ebook_converter.utils.img import resize_to_fit, image_to_data
|
|
from ebook_converter.utils.imghdr import what
|
|
from ebook_converter.polyglot.builtins import iteritems, itervalues
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
|
|
class LinkedImageNotFound(ValueError):
    """Signal that an image referenced by the document could not be read.

    The offending file name is passed to ``ValueError`` as the sole argument
    and is also kept on the instance as ``fname`` so handlers can report it.
    """

    def __init__(self, fname):
        super(LinkedImageNotFound, self).__init__(fname)
        self.fname = fname
|
|
|
|
|
|
def image_filename(x):
    """Return *x* passed through ``ascii_filename`` with spaces and ``#``
    characters replaced by underscores."""
    cleaned = ascii_filename(x)
    for unwanted in (' ', '#'):
        cleaned = cleaned.replace(unwanted, '_')
    return cleaned
|
|
|
|
|
|
def emu_to_pt(x):
    """Convert a length in EMU to points (12700 EMU per point).

    *x* may be a number or a string/bytes holding an integer, which is how
    element attribute values arrive. Several callers in this module pass the
    raw attribute value; without the conversion ``str / int`` raises
    ``TypeError``, which those callers swallow, silently dropping the value.

    Raises ``ValueError`` for a non-numeric string and ``TypeError`` for
    ``None`` or other unsupported types.
    """
    if isinstance(x, (str, bytes)):
        x = int(x)
    return x / 12700
|
|
|
|
|
|
def pt_to_emu(x):
    """Convert a length in points to EMU, truncated to an integer."""
    emu_per_point = 12700
    scaled = x * emu_per_point
    return int(scaled)
|
|
|
|
|
|
def get_image_properties(parent, XPath, get):
    """Collect CSS size/visibility plus alt and title text for an image.

    :param parent: element whose ``wp:extent`` and ``wp:docPr`` children are
        inspected.
    :param XPath: factory that turns an expression into a callable returning
        the matching elements.
    :param get: unused here; kept for interface compatibility.
    :return: ``(style, alt, title)`` where *style* is a dict of CSS
        properties and *alt*/*title* are strings or ``None``.
    """
    sizes = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for css_name, attr in (('width', 'cx'), ('height', 'cy')):
            try:
                sizes[css_name] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # Missing or malformed attribute: keep any earlier value.
                continue

    ans = {}
    for css_name in ('width', 'height'):
        if sizes[css_name] is not None:
            ans[css_name] = '%.3gpt' % sizes[css_name]

    alt = title = None
    for docPr in XPath('./wp:docPr')(parent):
        descr = docPr.get('descr')
        if descr:
            alt = descr
        caption = docPr.get('title')
        if caption:
            title = caption
        if docPr.get('hidden', None) in {'true', 'on', '1'}:
            ans['display'] = 'none'

    return ans, alt, title
|
|
|
|
|
|
def get_image_margins(elem):
    """Translate the ``distL``/``distT``/``distR``/``distB`` attributes of
    *elem* into CSS padding declarations.

    :param elem: element (anything with a ``get(name, default)`` method)
        carrying the distance attributes, expressed in EMU.
    :return: dict mapping ``padding-<side>`` to a ``'<n>pt'`` string for each
        attribute that is present and parses as an integer.
    """
    ans = {}
    sides = (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom'))
    for suffix, side in sides:
        val = elem.get('dist%s' % suffix, None)
        if val is None:
            continue
        try:
            # Attribute values are strings; convert before the EMU
            # arithmetic, as the other call sites in this module do.
            # Passing the raw string raised TypeError, which was swallowed
            # below, so no padding was ever produced.
            val = emu_to_pt(int(val))
        except (TypeError, ValueError):
            continue
        ans['padding-%s' % side] = '%.3gpt' % val
    return ans
|
|
|
|
|
|
def get_hpos(anchor, page_width, XPath, get, width_frac):
    """Estimate the horizontal position of an anchored image as a fraction
    of the page width.

    :param anchor: the anchor element to inspect.
    :param page_width: usable page width in points; used as the divisor for
        absolute offsets.
    :param XPath: factory that turns an expression into a callable returning
        the matching elements.
    :param get: unused here; kept for interface compatibility.
    :param width_frac: value added to most results (the caller passes half
        the image width divided by the page width).
    :return: a number; ``0`` when no usable position information is found.
    """
    almap = {'left': 0, 'center': 0.5, 'right': 1}
    for ph in XPath('./wp:positionH')(anchor):
        rp = ph.get('relativeFrom', None)
        if rp == 'leftMargin':
            return 0 + width_frac
        if rp == 'rightMargin':
            return 1 + width_frac
        al = None
        for align in XPath('./wp:align')(ph):
            al = almap.get(align.text)
        if al is not None:
            if rp == 'page':
                # Alignment relative to the page is returned without the
                # width_frac adjustment.
                return al
            return al + width_frac
        for po in XPath('./wp:posOffset')(ph):
            try:
                pos = emu_to_pt(int(po.text))
            except (TypeError, ValueError):
                continue
            return pos/page_width + width_frac

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # The attribute value is a string; convert it like the posOffset
            # branch above. Passing the raw string raised TypeError, which
            # was swallowed, so this fallback could never produce a value.
            x = emu_to_pt(int(sp.get('x', None)))
        except (TypeError, ValueError):
            continue
        return x/page_width + width_frac

    return 0
|
|
|
|
|
|
class Images(object):
    """Extract the images referenced by a DOCX document and build the HTML
    elements (``<img>``/``<hr>``) that represent them.

    Image files are written into the ``images`` sub-directory of the
    destination directory handed to :meth:`to_html`.
    """

    def __init__(self, namespace, log):
        """Store the namespace helper and logger and reset all bookkeeping.

        :param namespace: object providing ``XPath`` and ``get`` helpers.
        :param log: logger; it is called directly and also through its
            ``warn`` and ``exception`` methods.
        """
        self.namespace = namespace
        # Relationship id -> image target; replaced through __call__.
        self.rid_map = {}
        # (target, max_width, max_height) -> output file name.
        self.used = {}
        self.resized = {}
        self.names = set()
        # 'images/<name>' for every file written by generate_filename.
        self.all_images = set()
        # (img element, link description, rid_map) tuples from pic_to_img.
        self.links = []
        self.log = log

    def __call__(self, relationships_by_id):
        """Install *relationships_by_id* as the current relationship map."""
        self.rid_map = relationships_by_id

    def read_image_data(self, fname, base=None):
        """Load the bytes of image *fname* and derive an output file name.

        *fname* is either a ``file://`` URL naming a file on disk or a name
        that is read through ``self.docx.read``.

        :param base: preferred output name; when falsy it is derived from the
            last path component of *fname*.
        :return: ``(raw, name)`` where *name* carries an extension taken from
            the detected image type, falling back to the extension of the
            base name and then to ``jpeg``.
        :raises LinkedImageNotFound: if the file does not exist on disk or
            the document read raises ``KeyError``.
        """
        if fname.startswith('file://'):
            src = fname[len('file://'):]
            # On Windows drop the leading slash left over from the URL.
            if iswindows and src and src[0] == '/':
                src = src[1:]
            if not src or not os.path.exists(src):
                raise LinkedImageNotFound(src)
            with open(src, 'rb') as rawsrc:
                raw = rawsrc.read()
        else:
            try:
                raw = self.docx.read(fname)
            except KeyError:
                raise LinkedImageNotFound(fname)
        base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
        ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
        if ext == 'emf':
            # For an example, see: https://bugs.launchpad.net/bugs/1224849
            self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
            from ebook_converter.utils.wmf.emf import emf_unwrap
            try:
                raw = emf_unwrap(raw)
            except Exception:
                # Best effort: keep the original EMF data and extension.
                self.log.exception('Failed to extract embedded raster image from EMF')
            else:
                ext = 'png'
        base = base.rpartition('.')[0]
        if not base:
            base = 'image'
        base += '.' + ext
        return raw, base

    def unique_name(self, base):
        """Return *base*, or ``<stem>-<n>.<ext>`` with the smallest ``n >= 1``
        that is not already a value of ``self.used``."""
        exists = frozenset(self.used.values())
        name = base
        if name in exists:
            # The stem and extension do not change between attempts.
            stem, ext = base.rpartition('.')[0::2]
            c = 1
            while name in exists:
                name = '%s-%d.%s' % (stem, c, ext)
                c += 1
        return name

    def resize_image(self, raw, base, max_width, max_height):
        """Fit the image into ``max_width`` x ``max_height``.

        :return: ``(raw, base, resized)``. When ``resize_to_fit`` reports a
            resize, *raw* is the re-encoded data and *base* gains a
            ``-<w>x<h>`` suffix before its extension; otherwise both are
            returned unchanged.
        """
        resized, img = resize_to_fit(raw, max_width, max_height)
        if resized:
            base, ext = os.path.splitext(base)
            base = base + '-%dx%d%s' % (max_width, max_height, ext)
            raw = image_to_data(img, fmt=ext[1:])
        return raw, base, resized

    def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
        """Write the image for relationship *rid* to ``self.dest_dir`` and
        return its file name, reusing earlier output where possible.

        :param rid: relationship id to look up.
        :param base: preferred output name, forwarded to read_image_data.
        :param rid_map: mapping to use instead of ``self.rid_map``.
        :param max_width: with *max_height*, the bounding box to fit into.
        :param max_height: see *max_width*.
        :return: name of the file inside ``self.dest_dir``.
        :raises LinkedImageNotFound: propagated from read_image_data.
        :raises KeyError: if *rid* is not in the relationship map.
        """
        rid_map = self.rid_map if rid_map is None else rid_map
        fname = rid_map[rid]
        key = (fname, max_width, max_height)
        ans = self.used.get(key)
        if ans is not None:
            return ans
        raw, base = self.read_image_data(fname, base=base)
        resized = False
        sized = max_width is not None and max_height is not None
        if sized:
            raw, base, resized = self.resize_image(raw, base, max_width, max_height)
        if sized and not resized:
            # No resize was needed, so this is the same data as the unsized
            # image; share one file between the two cache keys.
            okey = (fname, None, None)
            if okey in self.used:
                # Cache the existing name under the sized key as well.
                # Previously a fresh, never-written name was cached here, so
                # a repeated request returned a file that did not exist.
                self.used[key] = self.used[okey]
                return self.used[key]
            name = self.unique_name(base)
            self.used[okey] = name
        else:
            name = self.unique_name(base)
        self.used[key] = name
        with open(os.path.join(self.dest_dir, name), 'wb') as f:
            f.write(raw)
        self.all_images.add('images/' + name)
        return name

    def pic_to_img(self, pic, alt, parent, title):
        """Build an ``<img>`` element for the picture element *pic*.

        :param pic: picture element searched for name, description and blip.
        :param alt: fallback alt text.
        :param parent: element searched for a hyperlink on the picture.
        :param title: title text for the image, or a falsy value.
        :return: the ``<img>`` element for the first blip whose image can be
            generated, or ``None`` if there is none.
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        name = None
        link = None
        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
            link = {'id': get(hl, 'r:id')}
            tgt = hl.get('tgtFrame', None)
            if tgt:
                link['target'] = tgt
            tooltip = hl.get('tooltip', None)
            if tooltip:
                link['title'] = tooltip
                # As before, a link tooltip also becomes the image title.
                # A link without a tooltip no longer erases the title that
                # the caller passed in.
                title = tooltip

        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = image_filename(name)
            alt = pr.get('descr') or alt
        for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
            rid = get(a, 'r:embed')
            if not rid:
                rid = get(a, 'r:link')
            if rid and rid in self.rid_map:
                try:
                    src = self.generate_filename(rid, name)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if title:
                    img.set('title', title)
                if link is not None:
                    self.links.append((img, link, self.rid_map))
                return img
        return None

    def drawing_to_html(self, drawing, page):
        """Yield ``<img>`` elements for the inline and then the anchored
        (floating) pictures found in *drawing*."""
        XPath, get = self.namespace.XPath, self.namespace.get
        # First process the inline pictures
        for inline in XPath('./wp:inline')(drawing):
            style, alt, title = get_image_properties(inline, XPath, get)
            for pic in XPath('descendant::pic:pic')(inline):
                ans = self.pic_to_img(pic, alt, inline, title)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.items()))
                    yield ans

        # Now process the floats
        for anchor in XPath('./wp:anchor')(drawing):
            style, alt, title = get_image_properties(anchor, XPath, get)
            # Adds display/float/margin/padding entries to style in place.
            self.get_float_properties(anchor, style, page)
            for pic in XPath('descendant::pic:pic')(anchor):
                ans = self.pic_to_img(pic, alt, anchor, title)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.items()))
                    yield ans

    def pict_to_html(self, pict, page):
        """Yield an ``<hr>`` if *pict* is a horizontal rule, followed by an
        ``<img>`` for every ``v:imagedata`` element whose image can be
        generated."""
        XPath, get = self.namespace.XPath, self.namespace.get
        # First see if we have an <hr>
        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
        if is_hr:
            style = {}
            hr = HR()
            try:
                pct = float(get(pict[0], 'o:hrpct'))
            except (ValueError, TypeError, AttributeError):
                # No usable percentage: leave the width unset.
                pass
            else:
                if pct > 0:
                    style['width'] = '%.3g%%' % pct
            align = get(pict[0], 'o:hralign', 'center')
            if align in {'left', 'right'}:
                style['margin-left'] = '0' if align == 'left' else 'auto'
                style['margin-right'] = 'auto' if align == 'left' else '0'
            if style:
                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in style.items())))
            yield hr

        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
            rid = get(imagedata, 'r:id')
            if rid in self.rid_map:
                try:
                    src = self.generate_filename(rid)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src, style="display:block")
                alt = get(imagedata, 'o:title')
                img.set('alt', alt or 'Image')
                yield img

    def get_float_properties(self, anchor, style, page):
        """Update *style* in place with display, float or margin, and padding
        properties derived from the anchor element.

        :param anchor: anchor element; its children select the wrap mode.
        :param style: CSS dict to update; its ``width`` entry (a ``'<n>pt'``
            string, default ``'100pt'``) is read to estimate the position.
        :param page: object providing ``width``, ``margin_left`` and
            ``margin_right``.
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        if 'display' not in style:
            style['display'] = 'block'
        padding = get_image_margins(anchor)
        # Strip the trailing 'pt' unit.
        width = float(style.get('width', '100pt')[:-2])

        page_width = page.width - page.margin_left - page.margin_right
        if page_width <= 0:
            # Ignore margins
            page_width = page.width

        hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))

        wrap_elem = None
        dofloat = False

        # The last wrap* child of the anchor decides the wrap mode.
        for child in reversed(anchor):
            bt = barename(child.tag)
            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
                wrap_elem = child
                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
                break

        if wrap_elem is not None:
            padding.update(get_image_margins(wrap_elem))
            wt = wrap_elem.get('wrapText', None)
            # Text wrapping on one side pins the image to the other side.
            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
            if dofloat:
                style['float'] = 'left' if hpos < 0.65 else 'right'
            else:
                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
                if ml is not None:
                    style['margin-left'] = ml
                if mr is not None:
                    style['margin-right'] = mr

        style.update(padding)

    def to_html(self, elem, page, docx, dest_dir):
        """Yield the HTML elements for *elem*, writing image files into the
        ``images`` directory below *dest_dir*.

        *elem* is handled by drawing_to_html when its tag ends with
        ``}drawing`` and by pict_to_html otherwise. This is a generator, so
        the directory is created and ``self.dest_dir``/``self.docx`` are set
        only once iteration starts.
        """
        dest = os.path.join(dest_dir, 'images')
        if not os.path.exists(dest):
            os.mkdir(dest)
        self.dest_dir, self.docx = dest, docx
        if elem.tag.endswith('}drawing'):
            for tag in self.drawing_to_html(elem, page):
                yield tag
        else:
            for tag in self.pict_to_html(elem, page):
                yield tag
|