mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-02 02:13:38 +02:00
Added LRF input format support.
This commit is contained in:
394
ebook_converter/ebooks/lrf/input.py
Normal file
394
ebook_converter/ebooks/lrf/input.py
Normal file
@@ -0,0 +1,394 @@
|
||||
import textwrap, operator
|
||||
from copy import deepcopy, copy
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter import guess_type
|
||||
from ebook_converter.polyglot.builtins import as_bytes
|
||||
|
||||
|
||||
class Canvas(etree.XSLTExtension):
|
||||
|
||||
def __init__(self, doc, styles, text_block, log):
|
||||
self.doc = doc
|
||||
self.styles = styles
|
||||
self.text_block = text_block
|
||||
self.log = log
|
||||
self.processed = set()
|
||||
|
||||
def execute(self, context, self_node, input_node, output_parent):
|
||||
cid = input_node.get('objid', None)
|
||||
if cid is None or cid in self.processed:
|
||||
return
|
||||
self.processed.add(cid)
|
||||
input_node = self.doc.xpath('//Canvas[@objid="%s"]'%cid)[0]
|
||||
|
||||
objects = list(self.get_objects(input_node))
|
||||
if len(objects) == 1 and objects[0][0].tag == 'ImageBlock':
|
||||
self.image_page(input_node, objects[0][0], output_parent)
|
||||
else:
|
||||
canvases = [input_node]
|
||||
for x in input_node.itersiblings():
|
||||
if x.tag == 'Canvas':
|
||||
oid = x.get('objid', None)
|
||||
if oid is not None:
|
||||
canvases.append(x)
|
||||
self.processed.add(oid)
|
||||
else:
|
||||
break
|
||||
|
||||
table = etree.Element('table')
|
||||
table.text = '\n\t'
|
||||
for canvas in canvases:
|
||||
oid = canvas.get('objid')
|
||||
tr = table.makeelement('tr')
|
||||
tr.set('id', oid)
|
||||
tr.tail = '\n\t'
|
||||
table.append(tr)
|
||||
for obj, x, y in self.get_objects(canvas):
|
||||
if obj.tag != 'TextBlock':
|
||||
self.log.warn(obj.tag, 'elements in Canvas not supported')
|
||||
continue
|
||||
td = table.makeelement('td')
|
||||
self.text_block.render_block(obj, td)
|
||||
tr.append(td)
|
||||
output_parent.append(table)
|
||||
|
||||
def image_page(self, input_node, block, output_parent):
|
||||
div = etree.Element('div')
|
||||
div.set('id', input_node.get('objid', 'scuzzy'))
|
||||
div.set('class', 'image_page')
|
||||
width = self.styles.to_num(block.get("xsize", None))
|
||||
height = self.styles.to_num(block.get("ysize", None))
|
||||
img = div.makeelement('img')
|
||||
if width is not None:
|
||||
img.set('width', str(int(width)))
|
||||
if height is not None:
|
||||
img.set('height', str(int(height)))
|
||||
ref = block.get('refstream', None)
|
||||
if ref is not None:
|
||||
imstr = self.doc.xpath('//ImageStream[@objid="%s"]'%ref)
|
||||
if imstr:
|
||||
src = imstr[0].get('file', None)
|
||||
if src:
|
||||
img.set('src', src)
|
||||
div.append(img)
|
||||
output_parent.append(div)
|
||||
|
||||
def get_objects(self, node):
|
||||
for x in node.xpath('descendant::PutObj[@refobj and @x1 and @y1]'):
|
||||
objs = node.xpath('//*[@objid="%s"]'%x.get('refobj'))
|
||||
x, y = map(self.styles.to_num, (x.get('x1'), x.get('y1')))
|
||||
if objs and x is not None and y is not None:
|
||||
yield objs[0], int(x), int(y)
|
||||
|
||||
|
||||
class MediaType(etree.XSLTExtension):
|
||||
|
||||
def execute(self, context, self_node, input_node, output_parent):
|
||||
name = input_node.get('file', None)
|
||||
typ = guess_type(name)[0]
|
||||
if not typ:
|
||||
typ = 'application/octet-stream'
|
||||
output_parent.text = typ
|
||||
|
||||
|
||||
class ImageBlock(etree.XSLTExtension):
|
||||
|
||||
def __init__(self, canvas):
|
||||
etree.XSLTExtension.__init__(self)
|
||||
self.canvas = canvas
|
||||
|
||||
def execute(self, context, self_node, input_node, output_parent):
|
||||
self.canvas.image_page(input_node, input_node, output_parent)
|
||||
|
||||
|
||||
class RuledLine(etree.XSLTExtension):
|
||||
|
||||
def execute(self, context, self_node, input_node, output_parent):
|
||||
hr = etree.Element('hr')
|
||||
output_parent.append(hr)
|
||||
|
||||
|
||||
class TextBlock(etree.XSLTExtension):
|
||||
|
||||
def __init__(self, styles, char_button_map, plot_map, log):
|
||||
etree.XSLTExtension.__init__(self)
|
||||
self.styles = styles
|
||||
self.log = log
|
||||
self.char_button_map = char_button_map
|
||||
self.plot_map = plot_map
|
||||
|
||||
def execute(self, context, self_node, input_node, output_parent):
|
||||
input_node = deepcopy(input_node)
|
||||
div = etree.Element('div')
|
||||
self.render_block(input_node, div)
|
||||
output_parent.append(div)
|
||||
|
||||
def render_block(self, node, root):
|
||||
ts = node.get('textstyle', None)
|
||||
classes = []
|
||||
bs = node.get('blockstyle')
|
||||
if bs in self.styles.block_style_map:
|
||||
classes.append('bs%d'%self.styles.block_style_map[bs])
|
||||
if ts in self.styles.text_style_map:
|
||||
classes.append('ts%d'%self.styles.text_style_map[ts])
|
||||
if classes:
|
||||
root.set('class', ' '.join(classes))
|
||||
objid = node.get('objid', None)
|
||||
if objid:
|
||||
root.set('id', objid)
|
||||
root.text = node.text
|
||||
self.root = root
|
||||
self.parent = root
|
||||
self.add_text_to = (self.parent, 'text')
|
||||
self.fix_deep_nesting(node)
|
||||
for child in node:
|
||||
self.process_child(child)
|
||||
|
||||
def fix_deep_nesting(self, node):
|
||||
deepest = 1
|
||||
|
||||
def depth(node):
|
||||
parent = node.getparent()
|
||||
ans = 1
|
||||
while parent is not None:
|
||||
ans += 1
|
||||
parent = parent.getparent()
|
||||
return ans
|
||||
|
||||
for span in node.xpath('descendant::Span'):
|
||||
d = depth(span)
|
||||
if d > deepest:
|
||||
deepest = d
|
||||
if d > 500:
|
||||
break
|
||||
|
||||
if deepest < 500:
|
||||
return
|
||||
|
||||
self.log.warn('Found deeply nested spans. Flattening.')
|
||||
# with open('/t/before.xml', 'wb') as f:
|
||||
# f.write(etree.tostring(node, method='xml'))
|
||||
|
||||
spans = [(depth(span), span) for span in node.xpath('descendant::Span')]
|
||||
spans.sort(key=operator.itemgetter(0), reverse=True)
|
||||
|
||||
for depth, span in spans:
|
||||
if depth < 3:
|
||||
continue
|
||||
p = span.getparent()
|
||||
gp = p.getparent()
|
||||
idx = p.index(span)
|
||||
pidx = gp.index(p)
|
||||
children = list(p)[idx:]
|
||||
t = children[-1].tail
|
||||
t = t if t else ''
|
||||
children[-1].tail = t + (p.tail if p.tail else '')
|
||||
p.tail = ''
|
||||
pattrib = dict(**p.attrib) if p.tag == 'Span' else {}
|
||||
for child in children:
|
||||
p.remove(child)
|
||||
if pattrib and child.tag == "Span":
|
||||
attrib = copy(pattrib)
|
||||
attrib.update(child.attrib)
|
||||
child.attrib.update(attrib)
|
||||
|
||||
for child in reversed(children):
|
||||
gp.insert(pidx+1, child)
|
||||
|
||||
# with open('/t/after.xml', 'wb') as f:
|
||||
# f.write(etree.tostring(node, method='xml'))
|
||||
|
||||
def add_text(self, text):
|
||||
if text:
|
||||
if getattr(self.add_text_to[0], self.add_text_to[1]) is None:
|
||||
setattr(self.add_text_to[0], self.add_text_to[1], '')
|
||||
setattr(self.add_text_to[0], self.add_text_to[1],
|
||||
getattr(self.add_text_to[0], self.add_text_to[1])+ text)
|
||||
|
||||
def process_container(self, child, tgt):
|
||||
idx = self.styles.get_text_styles(child)
|
||||
if idx is not None:
|
||||
tgt.set('class', 'ts%d'%idx)
|
||||
self.parent.append(tgt)
|
||||
orig_parent = self.parent
|
||||
self.parent = tgt
|
||||
self.add_text_to = (self.parent, 'text')
|
||||
self.add_text(child.text)
|
||||
for gchild in child:
|
||||
self.process_child(gchild)
|
||||
self.parent = orig_parent
|
||||
self.add_text_to = (tgt, 'tail')
|
||||
self.add_text(child.tail)
|
||||
|
||||
def process_child(self, child):
|
||||
if child.tag == 'CR':
|
||||
if self.parent == self.root or self.parent.tag == 'p':
|
||||
self.parent = self.root.makeelement('p')
|
||||
self.root.append(self.parent)
|
||||
self.add_text_to = (self.parent, 'text')
|
||||
else:
|
||||
br = self.parent.makeelement('br')
|
||||
self.parent.append(br)
|
||||
self.add_text_to = (br, 'tail')
|
||||
self.add_text(child.tail)
|
||||
elif child.tag in ('P', 'Span', 'EmpLine', 'NoBR'):
|
||||
span = self.root.makeelement('span')
|
||||
if child.tag == 'EmpLine':
|
||||
td = 'underline' if child.get('emplineposition', 'before') == 'before' else 'overline'
|
||||
span.set('style', 'text-decoration: '+td)
|
||||
self.process_container(child, span)
|
||||
elif child.tag == 'Sup':
|
||||
sup = self.root.makeelement('sup')
|
||||
self.process_container(child, sup)
|
||||
elif child.tag == 'Sub':
|
||||
sub = self.root.makeelement('sub')
|
||||
self.process_container(child, sub)
|
||||
elif child.tag == 'Italic':
|
||||
sup = self.root.makeelement('i')
|
||||
self.process_container(child, sup)
|
||||
elif child.tag == 'CharButton':
|
||||
a = self.root.makeelement('a')
|
||||
oid = child.get('refobj', None)
|
||||
if oid in self.char_button_map:
|
||||
a.set('href', self.char_button_map[oid])
|
||||
self.process_container(child, a)
|
||||
elif child.tag == 'Plot':
|
||||
xsize = self.styles.to_num(child.get('xsize', None), 166/720)
|
||||
ysize = self.styles.to_num(child.get('ysize', None), 166/720)
|
||||
img = self.root.makeelement('img')
|
||||
if xsize is not None:
|
||||
img.set('width', str(int(xsize)))
|
||||
if ysize is not None:
|
||||
img.set('height', str(int(ysize)))
|
||||
ro = child.get('refobj', None)
|
||||
if ro in self.plot_map:
|
||||
img.set('src', self.plot_map[ro])
|
||||
self.parent.append(img)
|
||||
self.add_text_to = (img, 'tail')
|
||||
self.add_text(child.tail)
|
||||
else:
|
||||
self.log.warn('Unhandled Text element:', child.tag)
|
||||
|
||||
|
||||
class Styles(etree.XSLTExtension):
|
||||
|
||||
def __init__(self):
|
||||
etree.XSLTExtension.__init__(self)
|
||||
self.text_styles, self.block_styles = [], []
|
||||
self.text_style_map, self.block_style_map = {}, {}
|
||||
self.CSS = textwrap.dedent('''
|
||||
.image_page { text-align:center }
|
||||
''')
|
||||
|
||||
def write(self, name='styles.css'):
|
||||
|
||||
def join(style):
|
||||
ans = ['%s : %s;'%(k, v) for k, v in style.items()]
|
||||
if ans:
|
||||
ans[-1] = ans[-1][:-1]
|
||||
return '\n\t'.join(ans)
|
||||
|
||||
with open(name, 'wb') as f:
|
||||
f.write(as_bytes(self.CSS))
|
||||
for (w, sel) in [(self.text_styles, 'ts'), (self.block_styles,
|
||||
'bs')]:
|
||||
for i, s in enumerate(w):
|
||||
if not s:
|
||||
continue
|
||||
rsel = '.%s%d'%(sel, i)
|
||||
s = join(s)
|
||||
f.write(as_bytes(rsel + ' {\n\t' + s + '\n}\n\n'))
|
||||
|
||||
def execute(self, context, self_node, input_node, output_parent):
|
||||
if input_node.tag == 'TextStyle':
|
||||
idx = self.get_text_styles(input_node)
|
||||
if idx is not None:
|
||||
self.text_style_map[input_node.get('objid')] = idx
|
||||
else:
|
||||
idx = self.get_block_styles(input_node)
|
||||
self.block_style_map[input_node.get('objid')] = idx
|
||||
|
||||
def px_to_pt(self, px):
|
||||
try:
|
||||
return px * 72/166
|
||||
except:
|
||||
return None
|
||||
|
||||
def color(self, val):
|
||||
try:
|
||||
val = int(val, 16)
|
||||
r, g, b, a = val & 0xFF, (val>>8)&0xFF, (val>>16)&0xFF, (val>>24)&0xFF
|
||||
if a == 255:
|
||||
return None
|
||||
if a == 0:
|
||||
return 'rgb(%d,%d,%d)'%(r,g,b)
|
||||
return 'rgba(%d,%d,%d,%f)'%(r,g,b,1.-a/255.)
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_block_styles(self, node):
|
||||
ans = {}
|
||||
sm = self.px_to_pt(node.get('sidemargin', None))
|
||||
if sm is not None:
|
||||
ans['margin-left'] = ans['margin-right'] = '%fpt'%sm
|
||||
ts = self.px_to_pt(node.get('topskip', None))
|
||||
if ts is not None:
|
||||
ans['margin-top'] = '%fpt'%ts
|
||||
fs = self.px_to_pt(node.get('footskip', None))
|
||||
if fs is not None:
|
||||
ans['margin-bottom'] = '%fpt'%fs
|
||||
fw = self.px_to_pt(node.get('framewidth', None))
|
||||
if fw is not None:
|
||||
ans['border-width'] = '%fpt'%fw
|
||||
ans['border-style'] = 'solid'
|
||||
fc = self.color(node.get('framecolor', None))
|
||||
if fc is not None:
|
||||
ans['border-color'] = fc
|
||||
bc = self.color(node.get('bgcolor', None))
|
||||
if bc is not None:
|
||||
ans['background-color'] = bc
|
||||
if ans not in self.block_styles:
|
||||
self.block_styles.append(ans)
|
||||
return self.block_styles.index(ans)
|
||||
|
||||
def to_num(self, val, factor=1.):
|
||||
try:
|
||||
return float(val)*factor
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_text_styles(self, node):
|
||||
ans = {}
|
||||
fs = self.to_num(node.get('fontsize', None), 0.1)
|
||||
if fs is not None:
|
||||
ans['font-size'] = '%fpt'%fs
|
||||
fw = self.to_num(node.get('fontweight', None))
|
||||
if fw is not None:
|
||||
ans['font-weight'] = ('bold' if fw >= 700 else 'normal')
|
||||
# fn = getattr(obj, 'fontfacename', None)
|
||||
# if fn is not None:
|
||||
# fn = cls.FONT_MAP[fn]
|
||||
# item('font-family: %s;'%fn)
|
||||
fg = self.color(node.get('textcolor', None))
|
||||
if fg is not None:
|
||||
ans['color'] = fg
|
||||
bg = self.color(node.get('textbgcolor', None))
|
||||
if bg is not None:
|
||||
ans['background-color'] = bg
|
||||
al = node.get('align', None)
|
||||
if al is not None:
|
||||
all = dict(head='left', center='center', foot='right')
|
||||
ans['text-align'] = all.get(al, 'left')
|
||||
# lh = self.to_num(node.get('linespace', None), 0.1)
|
||||
# if lh is not None:
|
||||
# ans['line-height'] = '%fpt'%lh
|
||||
pi = self.to_num(node.get('parindent', None), 0.1)
|
||||
if pi is not None:
|
||||
ans['text-indent'] = '%fpt'%pi
|
||||
if not ans:
|
||||
return None
|
||||
if ans not in self.text_styles:
|
||||
self.text_styles.append(ans)
|
||||
return self.text_styles.index(ans)
|
||||
Reference in New Issue
Block a user