mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-25 23:05:44 +01:00
Every MIME-related function in the main __init__.py checks a flag to see whether initialization has already been done. This is nonsense, since initialization should happen implicitly, early on, when the converter starts. This commit straightens things out: initialization is now done in the cli module. Also, the guess_type function was removed, since it was just a proxy for the mimetypes.guess_type function.
396 lines
14 KiB
Python
396 lines
14 KiB
Python
import copy
|
|
import mimetypes
|
|
import operator
|
|
import textwrap
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter.polyglot.builtins import as_bytes
|
|
|
|
|
|
class Canvas(etree.XSLTExtension):
    """XSLT extension element that renders ``Canvas`` elements as HTML.

    A canvas whose only placed object is an ``ImageBlock`` becomes a
    ``<div class="image_page">`` wrapping an ``<img>``.  Any other canvas
    is merged with the ``Canvas`` siblings that directly follow it into a
    single ``<table>``: one ``<tr>`` per canvas and one ``<td>`` per
    ``TextBlock`` placed on it.
    """

    def __init__(self, doc, styles, text_block, log):
        """Store the collaborators used while rendering.

        :param doc: tree that is searched (via ``xpath``) for ``Canvas``
            and ``ImageStream`` elements.
        :param styles: object providing ``to_num`` for numeric attributes.
        :param text_block: object providing ``render_block(node, root)``.
        :param log: logger providing ``warn``.
        """
        # Initialise the base class explicitly, as the other extension
        # classes in this module do.
        etree.XSLTExtension.__init__(self)
        self.doc = doc
        self.styles = styles
        self.text_block = text_block
        self.log = log
        # objids of canvases already emitted, so that canvases merged into
        # an earlier table are not rendered a second time.
        self.processed = set()

    def execute(self, context, self_node, input_node, output_parent):
        """Render the canvas identified by ``input_node``'s ``objid``.

        Does nothing when the node has no ``objid`` or when that canvas
        has already been rendered.  The result is appended to
        ``output_parent``.
        """
        cid = input_node.get('objid', None)
        if cid is None or cid in self.processed:
            return
        self.processed.add(cid)
        # Work on the element that lives in self.doc rather than on the
        # node handed over by the XSLT engine.
        input_node = self.doc.xpath('//Canvas[@objid="%s"]' % cid)[0]

        objects = list(self.get_objects(input_node))
        if len(objects) == 1 and objects[0][0].tag == 'ImageBlock':
            self.image_page(input_node, objects[0][0], output_parent)
            return

        # Collect this canvas plus the run of Canvas siblings that follow
        # it; the run ends at the first sibling that is not a Canvas.
        canvases = [input_node]
        for sibling in input_node.itersiblings():
            if sibling.tag != 'Canvas':
                break
            oid = sibling.get('objid', None)
            if oid is not None:
                canvases.append(sibling)
                self.processed.add(oid)

        table = etree.Element('table')
        table.text = '\n\t'
        for canvas in canvases:
            row = table.makeelement('tr')
            row.set('id', canvas.get('objid'))
            row.tail = '\n\t'
            table.append(row)
            for obj, _x, _y in self.get_objects(canvas):
                if obj.tag != 'TextBlock':
                    self.log.warn(obj.tag,
                                  'elements in Canvas not supported')
                    continue
                cell = table.makeelement('td')
                self.text_block.render_block(obj, cell)
                row.append(cell)
        output_parent.append(table)

    def image_page(self, input_node, block, output_parent):
        """Append a ``<div class="image_page"><img/></div>`` to the output.

        :param input_node: element supplying the ``id`` of the ``<div>``
            (``'scuzzy'`` is used when it has no ``objid``).
        :param block: element supplying ``xsize``, ``ysize`` and
            ``refstream`` for the image.
        :param output_parent: element that receives the ``<div>``.
        """
        div = etree.Element('div')
        div.set('id', input_node.get('objid', 'scuzzy'))
        div.set('class', 'image_page')
        width = self.styles.to_num(block.get("xsize", None))
        height = self.styles.to_num(block.get("ysize", None))
        img = div.makeelement('img')
        if width is not None:
            img.set('width', str(int(width)))
        if height is not None:
            img.set('height', str(int(height)))
        ref = block.get('refstream', None)
        if ref is not None:
            # The image source is the 'file' attribute of the referenced
            # ImageStream, when such a stream exists.
            streams = self.doc.xpath('//ImageStream[@objid="%s"]' % ref)
            if streams:
                src = streams[0].get('file', None)
                if src:
                    img.set('src', src)
        div.append(img)
        output_parent.append(div)

    def get_objects(self, node):
        """Yield ``(element, x, y)`` for every object placed on ``node``.

        Each ``PutObj`` descendant carrying ``refobj``, ``x1`` and ``y1``
        is resolved to the first element in the document whose ``objid``
        equals ``refobj``.  Placements whose target is missing or whose
        coordinates are not numeric are skipped.
        """
        placements = node.xpath(
            'descendant::PutObj[@refobj and @x1 and @y1]')
        for put_obj in placements:
            objs = node.xpath('//*[@objid="%s"]' % put_obj.get('refobj'))
            x = self.styles.to_num(put_obj.get('x1'))
            y = self.styles.to_num(put_obj.get('y1'))
            if objs and x is not None and y is not None:
                yield objs[0], int(x), int(y)
|
|
|
|
|
|
class MediaType(etree.XSLTExtension):
    """XSLT extension element that outputs the MIME type of a file."""

    def execute(self, context, self_node, input_node, output_parent):
        """Set ``output_parent.text`` to the MIME type guessed from the
        ``file`` attribute of ``input_node``.

        ``application/octet-stream`` is used when the attribute is missing
        or empty, or when the type cannot be guessed from the name.
        """
        name = input_node.get('file', None)
        # mimetypes.guess_type() raises TypeError for None, so only call it
        # when there is a name to inspect.
        typ = mimetypes.guess_type(name)[0] if name else None
        if not typ:
            typ = 'application/octet-stream'
        output_parent.text = typ
|
|
|
|
|
|
class ImageBlock(etree.XSLTExtension):
    """XSLT extension element that renders a stand-alone image block.

    Rendering is delegated to the ``image_page`` method of the canvas
    object supplied at construction time.
    """

    def __init__(self, canvas):
        """Remember ``canvas``, the object whose ``image_page`` is used."""
        super().__init__()
        self.canvas = canvas

    def execute(self, context, self_node, input_node, output_parent):
        """Render ``input_node`` as an image page under ``output_parent``.

        The node acts both as the source of the element id and as the
        image block itself.
        """
        render = self.canvas.image_page
        render(input_node, input_node, output_parent)
|
|
|
|
|
|
class RuledLine(etree.XSLTExtension):
    """XSLT extension element that renders a ruled line as ``<hr>``."""

    def execute(self, context, self_node, input_node, output_parent):
        """Append a new, empty ``<hr>`` element to ``output_parent``."""
        output_parent.append(etree.Element('hr'))
|
|
|
|
|
|
class TextBlock(etree.XSLTExtension):
    """XSLT extension element that renders a ``TextBlock`` as HTML.

    The block is walked recursively and its children are translated into
    ``<p>``, ``<br>``, ``<span>``, ``<sup>``, ``<sub>``, ``<i>``, ``<a>``
    and ``<img>`` elements.  While rendering, three attributes track the
    current position in the output tree:

    * ``self.root`` - the element that receives the whole block,
    * ``self.parent`` - the element new children are appended to,
    * ``self.add_text_to`` - an ``(element, 'text' | 'tail')`` pair naming
      where the next run of text has to be appended.
    """

    # Source tags that map directly onto an HTML container element.
    _SIMPLE_CONTAINERS = {'Sup': 'sup', 'Sub': 'sub', 'Italic': 'i'}
    # Source tags rendered as a <span>.
    _SPAN_TAGS = ('P', 'Span', 'EmpLine', 'NoBR')
    # Nesting depth from which Span elements get flattened.
    _MAX_DEPTH = 500

    def __init__(self, styles, char_button_map, plot_map, log):
        """Store the lookup tables used while rendering.

        :param styles: object providing ``block_style_map``,
            ``text_style_map``, ``get_text_styles`` and ``to_num``.
        :param char_button_map: maps a ``CharButton`` ``refobj`` to the
            ``href`` of the generated link.
        :param plot_map: maps a ``Plot`` ``refobj`` to the ``src`` of the
            generated image.
        :param log: logger providing ``warn``.
        """
        etree.XSLTExtension.__init__(self)
        self.styles = styles
        self.log = log
        self.char_button_map = char_button_map
        self.plot_map = plot_map

    def execute(self, context, self_node, input_node, output_parent):
        """Render ``input_node`` into a new ``<div>`` under
        ``output_parent``.

        Rendering may restructure the node (see ``fix_deep_nesting``), so
        it is performed on a deep copy.
        """
        input_node = copy.deepcopy(input_node)
        div = etree.Element('div')
        self.render_block(input_node, div)
        output_parent.append(div)

    def render_block(self, node, root):
        """Render the text block ``node`` into the existing element
        ``root``.

        Sets ``class`` (from the block and text style maps) and ``id``
        (from ``objid``) on ``root``, copies the leading text and then
        processes every child of ``node``.  Note that ``node`` itself may
        be modified by ``fix_deep_nesting``.
        """
        ts = node.get('textstyle', None)
        bs = node.get('blockstyle')
        classes = []
        if bs in self.styles.block_style_map:
            classes.append('bs%d' % self.styles.block_style_map[bs])
        if ts in self.styles.text_style_map:
            classes.append('ts%d' % self.styles.text_style_map[ts])
        if classes:
            root.set('class', ' '.join(classes))
        objid = node.get('objid', None)
        if objid:
            root.set('id', objid)
        root.text = node.text
        self.root = root
        self.parent = root
        self.add_text_to = (self.parent, 'text')
        self.fix_deep_nesting(node)
        for child in node:
            self.process_child(child)

    @staticmethod
    def _depth(node):
        """Return the number of elements from ``node`` up to the tree root,
        counting ``node`` itself."""
        ans = 1
        parent = node.getparent()
        while parent is not None:
            ans += 1
            parent = parent.getparent()
        return ans

    def fix_deep_nesting(self, node):
        """Flatten ``Span`` elements under ``node`` that are nested too
        deeply.

        Nothing happens unless some ``Span`` reaches a depth of
        ``_MAX_DEPTH``.  Otherwise every ``Span`` at depth three or more
        is, deepest first, moved together with its following siblings out
        of its parent and re-inserted right after that parent.  When the
        parent is a ``Span`` too, its attributes are copied onto the moved
        ``Span`` elements (attributes already set on them win).
        """
        deepest = 1
        for span in node.xpath('descendant::Span'):
            d = self._depth(span)
            if d > deepest:
                deepest = d
                if d > self._MAX_DEPTH:
                    # Deep enough to decide; no need to scan the rest.
                    break

        if deepest < self._MAX_DEPTH:
            return

        self.log.warn('Found deeply nested spans. Flattening.')

        spans = [(self._depth(span), span)
                 for span in node.xpath('descendant::Span')]
        spans.sort(key=operator.itemgetter(0), reverse=True)

        for span_depth, span in spans:
            if span_depth < 3:
                continue
            p = span.getparent()
            gp = p.getparent()
            idx = p.index(span)
            pidx = gp.index(p)
            children = list(p)[idx:]
            # The text that followed the parent now has to follow the last
            # of the moved children instead.
            t = children[-1].tail
            t = t if t else ''
            children[-1].tail = t + (p.tail if p.tail else '')
            p.tail = ''
            pattrib = dict(**p.attrib) if p.tag == 'Span' else {}
            for child in children:
                p.remove(child)
                if pattrib and child.tag == "Span":
                    attrib = copy.copy(pattrib)
                    attrib.update(child.attrib)
                    child.attrib.update(attrib)

            # Inserting in reverse at a fixed index keeps the original
            # order of the moved children.
            for child in reversed(children):
                gp.insert(pidx+1, child)

    def add_text(self, text):
        """Append ``text`` to the location named by ``self.add_text_to``.

        Empty or ``None`` text is ignored.
        """
        if not text:
            return
        target, attr = self.add_text_to
        existing = getattr(target, attr)
        if existing is None:
            existing = ''
        setattr(target, attr, existing + text)

    def process_container(self, child, tgt):
        """Render the source element ``child`` into the new element
        ``tgt``.

        ``tgt`` is appended to the current parent, receives a ``ts<N>``
        class when ``child`` carries text style attributes, and then takes
        the text and the rendered children of ``child``.  Afterwards the
        previous parent is restored and the tail text of ``child`` is
        appended after ``tgt``.
        """
        idx = self.styles.get_text_styles(child)
        if idx is not None:
            tgt.set('class', 'ts%d' % idx)
        self.parent.append(tgt)
        orig_parent = self.parent
        self.parent = tgt
        self.add_text_to = (self.parent, 'text')
        self.add_text(child.text)
        for gchild in child:
            self.process_child(gchild)
        self.parent = orig_parent
        self.add_text_to = (tgt, 'tail')
        self.add_text(child.tail)

    def process_child(self, child):
        """Translate one source element into HTML at the current position.

        Unhandled elements are logged and skipped, but the text that
        follows them is still emitted.
        """
        tag = child.tag
        if tag == 'CR':
            if self.parent == self.root or self.parent.tag == 'p':
                # At block level a line break starts a new paragraph.
                self.parent = self.root.makeelement('p')
                self.root.append(self.parent)
                self.add_text_to = (self.parent, 'text')
            else:
                br = self.parent.makeelement('br')
                self.parent.append(br)
                self.add_text_to = (br, 'tail')
            self.add_text(child.tail)
        elif tag in self._SPAN_TAGS:
            span = self.root.makeelement('span')
            if tag == 'EmpLine':
                td = 'underline' if child.get('emplineposition', 'before') == 'before' else 'overline'
                span.set('style', 'text-decoration: '+td)
            self.process_container(child, span)
        elif tag in self._SIMPLE_CONTAINERS:
            container = self.root.makeelement(self._SIMPLE_CONTAINERS[tag])
            self.process_container(child, container)
        elif tag == 'CharButton':
            a = self.root.makeelement('a')
            oid = child.get('refobj', None)
            if oid in self.char_button_map:
                a.set('href', self.char_button_map[oid])
            self.process_container(child, a)
        elif tag == 'Plot':
            xsize = self.styles.to_num(child.get('xsize', None), 166/720)
            ysize = self.styles.to_num(child.get('ysize', None), 166/720)
            img = self.root.makeelement('img')
            if xsize is not None:
                img.set('width', str(int(xsize)))
            if ysize is not None:
                img.set('height', str(int(ysize)))
            ro = child.get('refobj', None)
            if ro in self.plot_map:
                img.set('src', self.plot_map[ro])
            self.parent.append(img)
            self.add_text_to = (img, 'tail')
            self.add_text(child.tail)
        else:
            self.log.warn('Unhandled Text element:', tag)
            # The tail is text of the enclosing element, not of the
            # skipped child, so it must not be lost with it.
            self.add_text(child.tail)
|
|
|
|
|
|
class Styles(etree.XSLTExtension):
    """XSLT extension element that collects text and block styles.

    Every distinct set of CSS declarations is stored once, in
    ``text_styles`` or ``block_styles``; its index in that list is used to
    build the CSS class name (``ts<N>`` / ``bs<N>``).  ``text_style_map``
    and ``block_style_map`` map the ``objid`` of a style element to that
    index.
    """

    def __init__(self):
        """Create empty style tables and the fixed part of the style
        sheet."""
        etree.XSLTExtension.__init__(self)
        self.text_styles, self.block_styles = [], []
        self.text_style_map, self.block_style_map = {}, {}
        self.CSS = textwrap.dedent('''
        .image_page { text-align:center }
        ''')

    def write(self, name='styles.css'):
        """Write the collected styles as a CSS file.

        :param name: path of the file to create; it is overwritten.
        """

        def join(style):
            """Return the declarations of ``style`` as CSS text, one per
            line and without a semicolon after the last one."""
            ans = ['%s : %s;' % (k, v) for k, v in style.items()]
            if ans:
                ans[-1] = ans[-1][:-1]
            return '\n\t'.join(ans)

        with open(name, 'wb') as f:
            f.write(as_bytes(self.CSS))
            for styles, prefix in ((self.text_styles, 'ts'),
                                   (self.block_styles, 'bs')):
                for i, style in enumerate(styles):
                    if not style:
                        # Empty styles still occupy an index but need no
                        # rule of their own.
                        continue
                    rsel = '.%s%d' % (prefix, i)
                    f.write(as_bytes(
                        rsel + ' {\n\t' + join(style) + '\n}\n\n'))

    def execute(self, context, self_node, input_node, output_parent):
        """Register the style described by ``input_node``.

        A ``TextStyle`` element is recorded in ``text_style_map`` (unless
        it yields no declarations); any other element is treated as a
        block style and recorded in ``block_style_map``.
        """
        if input_node.tag == 'TextStyle':
            idx = self.get_text_styles(input_node)
            if idx is not None:
                self.text_style_map[input_node.get('objid')] = idx
        else:
            idx = self.get_block_styles(input_node)
            self.block_style_map[input_node.get('objid')] = idx

    def px_to_pt(self, px):
        """Convert a length in pixels to points, assuming 166 pixels per
        inch.

        :param px: a number or a numeric string.
        :returns: the length in points as a float, or ``None`` when ``px``
            is ``None`` or not numeric.
        """
        try:
            # Callers in this class pass raw attribute values, so convert
            # to a number first: multiplying a string would repeat it
            # instead of scaling it.
            return float(px) * 72 / 166
        except (TypeError, ValueError):
            return None

    def color(self, val):
        """Convert a hexadecimal colour value to a CSS colour.

        The lowest byte of the parsed number is used as red, the next as
        green, the next as blue and the highest byte as transparency.

        :param val: string holding a base-16 number.
        :returns: ``rgb(r,g,b)`` when the transparency byte is 0,
            ``rgba(r,g,b,alpha)`` for partial transparency, and ``None``
            when the colour is fully transparent (255) or ``val`` cannot
            be parsed.
        """
        try:
            val = int(val, 16)
        except (TypeError, ValueError):
            return None
        r, g, b, a = (val & 0xFF, (val >> 8) & 0xFF, (val >> 16) & 0xFF,
                      (val >> 24) & 0xFF)
        if a == 255:
            return None
        if a == 0:
            return 'rgb(%d,%d,%d)' % (r, g, b)
        return 'rgba(%d,%d,%d,%f)' % (r, g, b, 1.-a/255.)

    def get_block_styles(self, node):
        """Return the index in ``block_styles`` of the style described by
        the attributes of ``node``, registering the style if it is new.

        An element without usable attributes maps to the empty style.
        """
        ans = {}
        sm = self.px_to_pt(node.get('sidemargin', None))
        if sm is not None:
            ans['margin-left'] = ans['margin-right'] = '%fpt' % sm
        ts = self.px_to_pt(node.get('topskip', None))
        if ts is not None:
            ans['margin-top'] = '%fpt' % ts
        fs = self.px_to_pt(node.get('footskip', None))
        if fs is not None:
            ans['margin-bottom'] = '%fpt' % fs
        fw = self.px_to_pt(node.get('framewidth', None))
        if fw is not None:
            ans['border-width'] = '%fpt' % fw
            ans['border-style'] = 'solid'
        fc = self.color(node.get('framecolor', None))
        if fc is not None:
            ans['border-color'] = fc
        bc = self.color(node.get('bgcolor', None))
        if bc is not None:
            ans['background-color'] = bc
        if ans not in self.block_styles:
            self.block_styles.append(ans)
        return self.block_styles.index(ans)

    def to_num(self, val, factor=1.):
        """Return ``float(val) * factor``, or ``None`` when ``val`` is
        ``None`` or not numeric."""
        try:
            return float(val)*factor
        except (TypeError, ValueError):
            return None

    def get_text_styles(self, node):
        """Return the index in ``text_styles`` of the style described by
        the attributes of ``node``, registering the style if it is new.

        :returns: the index, or ``None`` when ``node`` carries none of the
            recognised style attributes.
        """
        ans = {}
        fs = self.to_num(node.get('fontsize', None), 0.1)
        if fs is not None:
            ans['font-size'] = '%fpt' % fs
        fw = self.to_num(node.get('fontweight', None))
        if fw is not None:
            ans['font-weight'] = ('bold' if fw >= 700 else 'normal')
        fg = self.color(node.get('textcolor', None))
        if fg is not None:
            ans['color'] = fg
        bg = self.color(node.get('textbgcolor', None))
        if bg is not None:
            ans['background-color'] = bg
        al = node.get('align', None)
        if al is not None:
            # Unknown alignment values fall back to left alignment.
            alignments = dict(head='left', center='center', foot='right')
            ans['text-align'] = alignments.get(al, 'left')
        pi = self.to_num(node.get('parindent', None), 0.1)
        if pi is not None:
            ans['text-indent'] = '%fpt' % pi
        if not ans:
            return None
        if ans not in self.text_styles:
            self.text_styles.append(ans)
        return self.text_styles.index(ans)
|