mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-18 10:14:12 +01:00
Added LRF input format support.
This commit is contained in:
@@ -54,6 +54,7 @@ Currently, I've tested following input formats:
|
||||
- fb2
|
||||
- html
|
||||
- pdf
|
||||
- lrf
|
||||
|
||||
Note, that old Microsoft doc format is not supported, although old documents
|
||||
can be fairly easily converted using text processor programs, like Word or
|
||||
@@ -65,7 +66,7 @@ Output formats
|
||||
|
||||
Currently, following formats are supported:
|
||||
|
||||
- lrf (for Sony readers)
|
||||
- lrf
|
||||
- epub
|
||||
- mobi
|
||||
- docx
|
||||
|
||||
394
ebook_converter/ebooks/lrf/input.py
Normal file
394
ebook_converter/ebooks/lrf/input.py
Normal file
@@ -0,0 +1,394 @@
|
||||
import textwrap, operator
|
||||
from copy import deepcopy, copy
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter import guess_type
|
||||
from ebook_converter.polyglot.builtins import as_bytes
|
||||
|
||||
|
||||
class Canvas(etree.XSLTExtension):
    """XSLT extension rendering LRF <Canvas> elements as HTML.

    A canvas that holds a single ImageBlock becomes an image-page <div>;
    any other canvas (together with any immediately following sibling
    canvases) is laid out as a <table>, one row per canvas.
    """

    def __init__(self, doc, styles, text_block, log):
        self.doc = doc                # root of the parsed LRF XML document
        self.styles = styles          # Styles extension (unit conversion helpers)
        self.text_block = text_block  # TextBlock extension, renders text cells
        self.log = log
        self.processed = set()        # objids already rendered, to skip duplicates

    def execute(self, context, self_node, input_node, output_parent):
        # Entry point invoked by the XSLT processor for each Canvas node.
        cid = input_node.get('objid', None)
        if cid is None or cid in self.processed:
            return
        self.processed.add(cid)
        # Re-resolve the node in the full document so sibling traversal works.
        input_node = self.doc.xpath('//Canvas[@objid="%s"]'%cid)[0]

        objects = list(self.get_objects(input_node))
        if len(objects) == 1 and objects[0][0].tag == 'ImageBlock':
            # A canvas holding exactly one image is treated as an image page.
            self.image_page(input_node, objects[0][0], output_parent)
        else:
            # Group this canvas with any directly following sibling canvases
            # and render the whole group as a single table.
            canvases = [input_node]
            for x in input_node.itersiblings():
                if x.tag == 'Canvas':
                    oid = x.get('objid', None)
                    if oid is not None:
                        canvases.append(x)
                        self.processed.add(oid)
                else:
                    break

            table = etree.Element('table')
            table.text = '\n\t'
            for canvas in canvases:
                oid = canvas.get('objid')
                tr = table.makeelement('tr')
                tr.set('id', oid)
                tr.tail = '\n\t'
                table.append(tr)
                for obj, x, y in self.get_objects(canvas):
                    if obj.tag != 'TextBlock':
                        # Only text blocks are rendered inside table cells.
                        self.log.warn(obj.tag, 'elements in Canvas not supported')
                        continue
                    td = table.makeelement('td')
                    self.text_block.render_block(obj, td)
                    tr.append(td)
            output_parent.append(table)

    def image_page(self, input_node, block, output_parent):
        """Render ``block`` (an ImageBlock) as an image-page <div>."""
        div = etree.Element('div')
        div.set('id', input_node.get('objid', 'scuzzy'))
        div.set('class', 'image_page')
        width = self.styles.to_num(block.get("xsize", None))
        height = self.styles.to_num(block.get("ysize", None))
        img = div.makeelement('img')
        if width is not None:
            img.set('width', str(int(width)))
        if height is not None:
            img.set('height', str(int(height)))
        ref = block.get('refstream', None)
        if ref is not None:
            # Follow the refstream reference to locate the image file name.
            imstr = self.doc.xpath('//ImageStream[@objid="%s"]'%ref)
            if imstr:
                src = imstr[0].get('file', None)
                if src:
                    img.set('src', src)
        div.append(img)
        output_parent.append(div)

    def get_objects(self, node):
        """Yield ``(element, x, y)`` for every object placed on ``node``.

        Resolves PutObj references; entries with a missing target or
        non-numeric coordinates are skipped.
        """
        for x in node.xpath('descendant::PutObj[@refobj and @x1 and @y1]'):
            objs = node.xpath('//*[@objid="%s"]'%x.get('refobj'))
            x, y = map(self.styles.to_num, (x.get('x1'), x.get('y1')))
            if objs and x is not None and y is not None:
                yield objs[0], int(x), int(y)
|
||||
|
||||
|
||||
class MediaType(etree.XSLTExtension):
    """XSLT extension emitting the MIME type of a referenced file."""

    def execute(self, context, self_node, input_node, output_parent):
        # Fall back to the generic binary type when guessing fails.
        name = input_node.get('file', None)
        media_type = guess_type(name)[0] or 'application/octet-stream'
        output_parent.text = media_type
|
||||
|
||||
|
||||
class ImageBlock(etree.XSLTExtension):
    """Delegates rendering of a standalone ImageBlock to the Canvas helper."""

    def __init__(self, canvas):
        etree.XSLTExtension.__init__(self)
        # Canvas instance whose image_page() does the actual rendering.
        self.canvas = canvas

    def execute(self, context, self_node, input_node, output_parent):
        # The image block acts as both the id-carrying node and the block.
        self.canvas.image_page(input_node, input_node, output_parent)
|
||||
|
||||
|
||||
class RuledLine(etree.XSLTExtension):
    """Renders an LRF RuledLine as an HTML horizontal rule."""

    def execute(self, context, self_node, input_node, output_parent):
        output_parent.append(etree.Element('hr'))
|
||||
|
||||
|
||||
class TextBlock(etree.XSLTExtension):
    """XSLT extension that renders an LRF TextBlock into an HTML <div>.

    Maintains a small cursor (``self.parent`` / ``self.add_text_to``)
    while walking the block's children so that text and tail fragments
    land on the correct output node.
    """

    def __init__(self, styles, char_button_map, plot_map, log):
        etree.XSLTExtension.__init__(self)
        self.styles = styles                    # Styles extension (style maps, unit conversion)
        self.log = log
        self.char_button_map = char_button_map  # CharButton objid -> href
        self.plot_map = plot_map                # Plot objid -> image src

    def execute(self, context, self_node, input_node, output_parent):
        # Work on a copy: fix_deep_nesting mutates the tree.
        input_node = deepcopy(input_node)
        div = etree.Element('div')
        self.render_block(input_node, div)
        output_parent.append(div)

    def render_block(self, node, root):
        """Render ``node`` (a TextBlock) into the element ``root``."""
        ts = node.get('textstyle', None)
        classes = []
        bs = node.get('blockstyle')
        if bs in self.styles.block_style_map:
            classes.append('bs%d'%self.styles.block_style_map[bs])
        if ts in self.styles.text_style_map:
            classes.append('ts%d'%self.styles.text_style_map[ts])
        if classes:
            root.set('class', ' '.join(classes))
        objid = node.get('objid', None)
        if objid:
            root.set('id', objid)
        root.text = node.text
        # Cursor state consumed by process_child/process_container/add_text.
        self.root = root
        self.parent = root
        self.add_text_to = (self.parent, 'text')
        self.fix_deep_nesting(node)
        for child in node:
            self.process_child(child)

    def fix_deep_nesting(self, node):
        # Extremely deep Span nesting can break serialisation, so spans
        # nested deeper than ~500 levels are hoisted next to their parents.
        deepest = 1

        def depth(node):
            # Number of ancestors of ``node`` plus one.
            parent = node.getparent()
            ans = 1
            while parent is not None:
                ans += 1
                parent = parent.getparent()
            return ans

        for span in node.xpath('descendant::Span'):
            d = depth(span)
            if d > deepest:
                deepest = d
                if d > 500:
                    break

        if deepest < 500:
            return  # nothing pathological; leave the tree alone

        self.log.warn('Found deeply nested spans. Flattening.')
        # with open('/t/before.xml', 'wb') as f:
        #     f.write(etree.tostring(node, method='xml'))

        # Process the deepest spans first so sibling indices stay valid.
        spans = [(depth(span), span) for span in node.xpath('descendant::Span')]
        spans.sort(key=operator.itemgetter(0), reverse=True)

        for depth, span in spans:
            if depth < 3:
                continue
            p = span.getparent()
            gp = p.getparent()
            idx = p.index(span)
            pidx = gp.index(p)
            # Move ``span`` and all its following siblings up one level,
            # merging the parent span's attributes so formatting is kept.
            children = list(p)[idx:]
            t = children[-1].tail
            t = t if t else ''
            children[-1].tail = t + (p.tail if p.tail else '')
            p.tail = ''
            pattrib = dict(**p.attrib) if p.tag == 'Span' else {}
            for child in children:
                p.remove(child)
                if pattrib and child.tag == "Span":
                    # The child's own attributes win over inherited ones.
                    attrib = copy(pattrib)
                    attrib.update(child.attrib)
                    child.attrib.update(attrib)

            for child in reversed(children):
                gp.insert(pidx+1, child)

        # with open('/t/after.xml', 'wb') as f:
        #     f.write(etree.tostring(node, method='xml'))

    def add_text(self, text):
        # Append ``text`` at the current insertion point, which is either
        # an element's .text or its .tail attribute.
        if text:
            if getattr(self.add_text_to[0], self.add_text_to[1]) is None:
                setattr(self.add_text_to[0], self.add_text_to[1], '')
            setattr(self.add_text_to[0], self.add_text_to[1],
                    getattr(self.add_text_to[0], self.add_text_to[1])+ text)

    def process_container(self, child, tgt):
        # Render container element ``child`` into the new output element
        # ``tgt``, recurse into its children, then restore the cursor.
        idx = self.styles.get_text_styles(child)
        if idx is not None:
            tgt.set('class', 'ts%d'%idx)
        self.parent.append(tgt)
        orig_parent = self.parent
        self.parent = tgt
        self.add_text_to = (self.parent, 'text')
        self.add_text(child.text)
        for gchild in child:
            self.process_child(gchild)
        self.parent = orig_parent
        self.add_text_to = (tgt, 'tail')
        self.add_text(child.tail)

    def process_child(self, child):
        # Translate a single LRF child element into its HTML equivalent.
        if child.tag == 'CR':
            # A CR starts a new paragraph at top level, otherwise a <br>.
            if self.parent == self.root or self.parent.tag == 'p':
                self.parent = self.root.makeelement('p')
                self.root.append(self.parent)
                self.add_text_to = (self.parent, 'text')
            else:
                br = self.parent.makeelement('br')
                self.parent.append(br)
                self.add_text_to = (br, 'tail')
            self.add_text(child.tail)
        elif child.tag in ('P', 'Span', 'EmpLine', 'NoBR'):
            span = self.root.makeelement('span')
            if child.tag == 'EmpLine':
                td = 'underline' if child.get('emplineposition', 'before') == 'before' else 'overline'
                span.set('style', 'text-decoration: '+td)
            self.process_container(child, span)
        elif child.tag == 'Sup':
            sup = self.root.makeelement('sup')
            self.process_container(child, sup)
        elif child.tag == 'Sub':
            sub = self.root.makeelement('sub')
            self.process_container(child, sub)
        elif child.tag == 'Italic':
            sup = self.root.makeelement('i')
            self.process_container(child, sup)
        elif child.tag == 'CharButton':
            # CharButtons become hyperlinks via the char_button_map.
            a = self.root.makeelement('a')
            oid = child.get('refobj', None)
            if oid in self.char_button_map:
                a.set('href', self.char_button_map[oid])
            self.process_container(child, a)
        elif child.tag == 'Plot':
            # NOTE(review): 166/720 scaling factor taken as-is — confirm units.
            xsize = self.styles.to_num(child.get('xsize', None), 166/720)
            ysize = self.styles.to_num(child.get('ysize', None), 166/720)
            img = self.root.makeelement('img')
            if xsize is not None:
                img.set('width', str(int(xsize)))
            if ysize is not None:
                img.set('height', str(int(ysize)))
            ro = child.get('refobj', None)
            if ro in self.plot_map:
                img.set('src', self.plot_map[ro])
            self.parent.append(img)
            self.add_text_to = (img, 'tail')
            self.add_text(child.tail)
        else:
            self.log.warn('Unhandled Text element:', child.tag)
|
||||
|
||||
|
||||
class Styles(etree.XSLTExtension):
    """XSLT extension collecting LRF text/block styles and emitting CSS.

    Each distinct style dict is assigned an index; output elements
    reference the generated classes ``tsN`` (text) and ``bsN`` (block).
    """

    def __init__(self):
        etree.XSLTExtension.__init__(self)
        self.text_styles, self.block_styles = [], []
        self.text_style_map, self.block_style_map = {}, {}
        self.CSS = textwrap.dedent('''
        .image_page { text-align:center }
        ''')

    def write(self, name='styles.css'):
        """Write all collected styles to ``name`` as a CSS file."""

        def join(style):
            # Serialise a style dict as CSS declarations; the last
            # declaration drops its trailing ';' for neat output.
            ans = ['%s : %s;'%(k, v) for k, v in style.items()]
            if ans:
                ans[-1] = ans[-1][:-1]
            return '\n\t'.join(ans)

        with open(name, 'wb') as f:
            f.write(as_bytes(self.CSS))
            for (w, sel) in [(self.text_styles, 'ts'), (self.block_styles,
                'bs')]:
                for i, s in enumerate(w):
                    if not s:
                        continue
                    rsel = '.%s%d'%(sel, i)
                    s = join(s)
                    f.write(as_bytes(rsel + ' {\n\t' + s + '\n}\n\n'))

    def execute(self, context, self_node, input_node, output_parent):
        # Record the style definition and remember its index by objid.
        if input_node.tag == 'TextStyle':
            idx = self.get_text_styles(input_node)
            if idx is not None:
                self.text_style_map[input_node.get('objid')] = idx
        else:
            idx = self.get_block_styles(input_node)
            self.block_style_map[input_node.get('objid')] = idx

    def px_to_pt(self, px):
        """Convert a pixel value (string or number) to points.

        Returns None for missing/invalid input.

        BUG FIX: attribute values arrive as strings; in the original,
        ``px * 72`` repeated the string instead of scaling it, so the
        division raised TypeError and every call returned None. Convert
        to float first.
        """
        try:
            return float(px) * 72/166
        except (TypeError, ValueError):
            return None

    def color(self, val):
        """Convert an LRF 32-bit hex colour value to a CSS colour string.

        Returns None for invalid input or an alpha of 255 (which the
        alpha formula below treats as fully transparent).
        """
        try:
            val = int(val, 16)
            r, g, b, a = val & 0xFF, (val>>8)&0xFF, (val>>16)&0xFF, (val>>24)&0xFF
            if a == 255:
                return None
            if a == 0:
                return 'rgb(%d,%d,%d)'%(r,g,b)
            return 'rgba(%d,%d,%d,%f)'%(r,g,b,1.-a/255.)
        except (TypeError, ValueError):
            return None

    def get_block_styles(self, node):
        """Return the index of the block style for ``node`` in
        ``self.block_styles``, appending it if not seen before."""
        ans = {}
        sm = self.px_to_pt(node.get('sidemargin', None))
        if sm is not None:
            ans['margin-left'] = ans['margin-right'] = '%fpt'%sm
        ts = self.px_to_pt(node.get('topskip', None))
        if ts is not None:
            ans['margin-top'] = '%fpt'%ts
        fs = self.px_to_pt(node.get('footskip', None))
        if fs is not None:
            ans['margin-bottom'] = '%fpt'%fs
        fw = self.px_to_pt(node.get('framewidth', None))
        if fw is not None:
            ans['border-width'] = '%fpt'%fw
            ans['border-style'] = 'solid'
        fc = self.color(node.get('framecolor', None))
        if fc is not None:
            ans['border-color'] = fc
        bc = self.color(node.get('bgcolor', None))
        if bc is not None:
            ans['background-color'] = bc
        if ans not in self.block_styles:
            self.block_styles.append(ans)
        return self.block_styles.index(ans)

    def to_num(self, val, factor=1.):
        """Parse ``val`` as a float scaled by ``factor``; None if invalid."""
        try:
            return float(val)*factor
        except (TypeError, ValueError):
            return None

    def get_text_styles(self, node):
        """Return the index of the text style for ``node`` in
        ``self.text_styles``, or None when the node defines no
        recognised style properties."""
        ans = {}
        fs = self.to_num(node.get('fontsize', None), 0.1)
        if fs is not None:
            ans['font-size'] = '%fpt'%fs
        fw = self.to_num(node.get('fontweight', None))
        if fw is not None:
            ans['font-weight'] = ('bold' if fw >= 700 else 'normal')
        # fn = getattr(obj, 'fontfacename', None)
        # if fn is not None:
        #     fn = cls.FONT_MAP[fn]
        #     item('font-family: %s;'%fn)
        fg = self.color(node.get('textcolor', None))
        if fg is not None:
            ans['color'] = fg
        bg = self.color(node.get('textbgcolor', None))
        if bg is not None:
            ans['background-color'] = bg
        al = node.get('align', None)
        if al is not None:
            # Renamed from ``all`` in the original (shadowed the builtin).
            align_map = dict(head='left', center='center', foot='right')
            ans['text-align'] = align_map.get(al, 'left')
        # lh = self.to_num(node.get('linespace', None), 0.1)
        # if lh is not None:
        #     ans['line-height'] = '%fpt'%lh
        pi = self.to_num(node.get('parindent', None), 0.1)
        if pi is not None:
            ans['text-indent'] = '%fpt'%pi
        if not ans:
            return None
        if ans not in self.text_styles:
            self.text_styles.append(ans)
        return self.text_styles.index(ans)
|
||||
171
ebook_converter/ebooks/lrf/lrfparser.py
Normal file
171
ebook_converter/ebooks/lrf/lrfparser.py
Normal file
@@ -0,0 +1,171 @@
|
||||
import sys, array, os, re, codecs, logging
|
||||
from itertools import chain
|
||||
|
||||
from ebook_converter import setup_cli_handlers
|
||||
from ebook_converter.utils.config import OptionParser
|
||||
from ebook_converter.utils.filenames import ascii_filename
|
||||
from ebook_converter.ebooks.lrf.meta import LRFMetaFile
|
||||
from ebook_converter.ebooks.lrf.objects import get_object, PageTree, StyleObject, \
|
||||
Font, Text, TOCObject, BookAttr, ruby_tags
|
||||
|
||||
|
||||
class LRFDocument(LRFMetaFile):
    """Parsed representation of a complete LRF file.

    Extends LRFMetaFile with full object parsing and serialisation of
    the document as LRS XML.
    """

    class temp(object):
        # Plain attribute bag used for metadata / doc_info / device_info.
        pass

    def __init__(self, stream):
        LRFMetaFile.__init__(self, stream)
        self.scramble_key = self.xor_key
        self.page_trees = []
        self.font_map = {}
        self.image_map = {}
        self.toc = ''
        self.keep_parsing = True

    def parse(self):
        """Parse all objects and populate metadata/doc_info/device_info."""
        self._parse_objects()
        self.metadata = LRFDocument.temp()
        for a in ('title', 'title_reading', 'author', 'author_reading', 'book_id',
                  'classification', 'free_text', 'publisher', 'label', 'category'):
            setattr(self.metadata, a, getattr(self, a))
        self.doc_info = LRFDocument.temp()
        for a in ('thumbnail', 'language', 'creator', 'producer', 'page'):
            setattr(self.doc_info, a, getattr(self, a))
        # NOTE: 'thumbail' (sic) matches the method name defined in LRFMetaFile.
        self.doc_info.thumbnail_extension = self.thumbail_extension()
        self.device_info = LRFDocument.temp()
        for a in ('dpi', 'width', 'height'):
            setattr(self.device_info, a, getattr(self, a))

    def _parse_objects(self):
        # Read the object index (objid, offset, size, reserved) quadruples
        # and parse every object listed in it.
        self.objects = {}
        self._file.seek(self.object_index_offset)
        obj_array = array.array("I", self._file.read(4*4*self.number_of_objects))
        # BUG FIX: array.tostring() was removed in Python 3.9; use
        # sys.byteorder to detect a big-endian host instead. The index is
        # stored little-endian.
        if sys.byteorder == 'big':
            obj_array.byteswap()
        for i in range(self.number_of_objects):
            if not self.keep_parsing:
                break
            objid, objoff, objsize = obj_array[i*4:i*4+3]
            self._parse_object(objid, objoff, objsize)
        for obj in self.objects.values():
            if not self.keep_parsing:
                break
            if hasattr(obj, 'initialize'):
                obj.initialize()

    def _parse_object(self, objid, objoff, objsize):
        # Parse a single object and file it into the relevant collection.
        obj = get_object(self, self._file, objid, objoff, objsize, self.scramble_key)
        self.objects[objid] = obj
        if isinstance(obj, PageTree):
            self.page_trees.append(obj)
        elif isinstance(obj, TOCObject):
            self.toc = obj
        elif isinstance(obj, BookAttr):
            self.ruby_tags = {}
            for h in ruby_tags.values():
                attr = h[0]
                if hasattr(obj, attr):
                    self.ruby_tags[attr] = getattr(obj, attr)

    def __iter__(self):
        # Iterate over page trees, main tree first (parse order).
        for pt in self.page_trees:
            yield pt

    def write_files(self):
        """Write all embedded images and fonts to the current directory."""
        for obj in chain(self.image_map.values(), self.font_map.values()):
            with open(obj.file, 'wb') as f:
                f.write(obj.stream)

    def to_xml(self, write_files=True):
        """Serialise the parsed document as an LRS XML string.

        :param write_files: when True, also write embedded resources
            (images, fonts, thumbnail) to disk.
        """
        bookinfo = '<BookInformation>\n<Info version="1.1">\n<BookInfo>\n'
        bookinfo += '<Title reading="%s">%s</Title>\n'%(self.metadata.title_reading, self.metadata.title)
        bookinfo += '<Author reading="%s">%s</Author>\n'%(self.metadata.author_reading, self.metadata.author)
        bookinfo += '<BookID>%s</BookID>\n'%(self.metadata.book_id,)
        bookinfo += '<Publisher reading="">%s</Publisher>\n'%(self.metadata.publisher,)
        bookinfo += '<Label reading="">%s</Label>\n'%(self.metadata.label,)
        bookinfo += '<Category reading="">%s</Category>\n'%(self.metadata.category,)
        bookinfo += '<Classification reading="">%s</Classification>\n'%(self.metadata.classification,)
        bookinfo += '<FreeText reading="">%s</FreeText>\n</BookInfo>\n<DocInfo>\n'%(self.metadata.free_text,)
        th = self.doc_info.thumbnail
        if th:
            prefix = ascii_filename(self.metadata.title)
            bookinfo += '<CThumbnail file="%s" />\n'%(prefix+'_thumbnail.'+self.doc_info.thumbnail_extension,)
            if write_files:
                with open(prefix+'_thumbnail.'+self.doc_info.thumbnail_extension, 'wb') as f:
                    f.write(th)
        bookinfo += '<Language reading="">%s</Language>\n'%(self.doc_info.language,)
        bookinfo += '<Creator reading="">%s</Creator>\n'%(self.doc_info.creator,)
        bookinfo += '<Producer reading="">%s</Producer>\n'%(self.doc_info.producer,)
        bookinfo += '<SumPage>%s</SumPage>\n</DocInfo>\n</Info>\n%s</BookInformation>\n'%(self.doc_info.page,self.toc)
        pages = ''
        done_main = False
        pt_id = -1
        for page_tree in self:
            # The first page tree is the main content; the rest keep
            # their own <PageTree> wrapper.
            if not done_main:
                done_main = True
                pages += '<Main>\n'
                close = '</Main>\n'
                pt_id = page_tree.id
            else:
                pages += '<PageTree objid="%d">\n'%(page_tree.id,)
                close = '</PageTree>\n'
            for page in page_tree:
                pages += str(page)
            pages += close
        # Objects already serialised inside the pages must not be
        # duplicated in the <Objects>/<Style> sections.
        traversed_objects = [int(i) for i in re.findall(r'objid="(\w+)"', pages)] + [pt_id]

        objects = '\n<Objects>\n'
        styles = '\n<Style>\n'
        for obj in self.objects:
            obj = self.objects[obj]
            if obj.id in traversed_objects:
                continue
            if isinstance(obj, (Font, Text, TOCObject)):
                continue
            if isinstance(obj, StyleObject):
                styles += str(obj)
            else:
                objects += str(obj)
        styles += '</Style>\n'
        objects += '</Objects>\n'
        if write_files:
            self.write_files()
        return '<BBeBXylog version="1.0">\n' + bookinfo + pages + styles + objects + '</BBeBXylog>'
|
||||
|
||||
|
||||
def option_parser():
    """Build the command-line option parser for LRF -> LRS conversion."""
    parser = OptionParser(usage=_('%prog book.lrf\nConvert an LRF file into an LRS (XML UTF-8 encoded) file'))
    parser.add_option('--output', '-o', dest='out', default=None,
                      help=_('Output LRS file'))
    # store_false on a True default: passing the flag disables resources.
    parser.add_option('--dont-output-resources', dest='output_resources',
                      default=True, action='store_false',
                      help=_('Do not save embedded image and font files to disk'))
    parser.add_option('--verbose', dest='verbose', default=False,
                      action='store_true', help=_('Be more verbose'))
    return parser
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
    """CLI entry point: parse an LRF file and write it out as LRS XML.

    Returns 0 on success, 1 on usage error.
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if logger is None:
        logger = logging.getLogger('lrf2lrs')
        setup_cli_handlers(logger,
                           logging.DEBUG if opts.verbose else logging.INFO)
    if len(args) != 2:
        parser.print_help()
        return 1
    if opts.out is None:
        # Default: same directory and base name, .lrs extension.
        base = os.path.splitext(os.path.basename(args[1]))[0]
        opts.out = os.path.join(os.path.dirname(args[1]), base + ".lrs")
    logger.info(_('Parsing LRF...'))
    document = LRFDocument(open(args[1], 'rb'))
    document.parse()
    logger.info(_('Creating XML...'))
    target = os.path.abspath(os.path.expanduser(opts.out))
    with codecs.open(target, 'wb', 'utf-8') as out:
        out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        out.write(document.to_xml(write_files=opts.output_resources))
    logger.info(_('LRS written to ')+opts.out)
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
766
ebook_converter/ebooks/lrf/meta.py
Normal file
766
ebook_converter/ebooks/lrf/meta.py
Normal file
@@ -0,0 +1,766 @@
|
||||
"""
|
||||
This module presents an easy to use interface for getting and setting
|
||||
meta information in LRF files.
|
||||
Just create an L{LRFMetaFile} object and use its properties
|
||||
to get and set meta information. For example:
|
||||
|
||||
>>> lrf = LRFMetaFile("mybook.lrf")
|
||||
>>> print(lrf.title, lrf.author)
|
||||
>>> lrf.category = "History"
|
||||
"""
|
||||
|
||||
import functools
|
||||
import io
|
||||
import os
|
||||
import shutil
|
||||
import struct
|
||||
import sys
|
||||
from xml.dom import minidom
|
||||
import zlib
|
||||
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.ebooks.metadata import MetaInformation, string_to_authors
|
||||
|
||||
BYTE = "<B"   #: Unsigned char little endian encoded in 1 byte
WORD = "<H"   #: Unsigned short little endian encoded in 2 bytes
DWORD = "<I"  #: Unsigned integer little endian encoded in 4 bytes
QWORD = "<Q"  #: Unsigned long long little endian encoded in 8 bytes


class field(object):
    """A descriptor that implements access to protocol packets in a
    human readable way.
    """

    def __init__(self, start=16, fmt=DWORD):
        """
        @param start: The byte at which this field is stored in the buffer
        @param fmt: The packing format for this field.
            See U{struct<http://docs.python.org/lib/module-struct.html>}.
        """
        self._fmt, self._start = fmt, start

    def __get__(self, obj, typ=None):
        # Delegate to the owner's unpack(); returns the single value.
        return obj.unpack(start=self._start, fmt=self._fmt)[0]

    def __set__(self, obj, val):
        obj.pack(val, start=self._start, fmt=self._fmt)

    def __repr__(self):
        # BUG FIX: the original used the string literal 'QWORD' as a dict
        # key instead of the QWORD format constant, so 8-byte fields were
        # described with an empty type name.
        typ = {DWORD: 'unsigned int', QWORD: 'unsigned long long',
               BYTE: 'unsigned char',
               WORD: 'unsigned short'}.get(self._fmt, '')
        return ("An " + typ + " stored in " +
                str(struct.calcsize(self._fmt)) +
                " bytes starting at byte " + str(self._start))
|
||||
|
||||
|
||||
class versioned_field(field):
    """A field that is only present when a version field in the same
    buffer exceeds a given threshold."""

    def __init__(self, vfield, version, start=0, fmt=WORD):
        field.__init__(self, start=start, fmt=fmt)
        # vfield: the field descriptor holding the format version.
        # version: minimum (exclusive) version at which this field exists.
        self.vfield, self.version = vfield, version

    def enabled(self, obj):
        # The field exists only for strictly newer versions.
        return self.vfield.__get__(obj) > self.version

    def __get__(self, obj, typ=None):
        return field.__get__(self, obj, typ=typ) if self.enabled(obj) else None

    def __set__(self, obj, val):
        if not self.enabled(obj):
            raise LRFException("Trying to set disabled field")
        field.__set__(self, obj, val)
|
||||
|
||||
|
||||
class LRFException(Exception):
    """Raised for errors while reading or writing LRF meta information."""
    pass
|
||||
|
||||
|
||||
class fixed_stringfield(object):
    """ A field storing a string of fixed length. """
    # NOTE(review): the original docstring said "variable length", but the
    # setter below rejects any value whose encoded length != self._length.

    def __init__(self, length=8, start=0):
        """
        @param length: Size of this string
        @param start: The byte at which this field is stored in the buffer
        """
        self._length = length
        self._start = start

    def __get__(self, obj, typ=None):
        length = str(self._length)
        return obj.unpack(start=self._start, fmt="<"+length+"s")[0]

    def __set__(self, obj, val):
        # Coerce to UTF-8 bytes before length-checking and packing.
        if not isinstance(val, (str, bytes)):
            val = str(val)
        if isinstance(val, str):
            val = val.encode('utf-8')
        if len(val) != self._length:
            raise LRFException("Trying to set fixed_stringfield with a " +
                               "string of incorrect length")
        obj.pack(val, start=self._start, fmt="<"+str(len(val))+"s")

    def __repr__(self):
        return "A string of length " + str(self._length) + \
               " starting at byte " + str(self._start)
|
||||
|
||||
|
||||
class xml_attr_field(object):
    """Descriptor exposing one attribute of an XML element found in
    ``obj.info`` (a DOM document)."""

    def __init__(self, tag_name, attr, parent='BookInfo'):
        self.tag_name = tag_name
        self.parent = parent
        self.attr = attr

    def _find(self, document):
        # Return the last element named tag_name whose parent is self.parent,
        # or None if there is no such element.
        found = None
        for candidate in document.getElementsByTagName(self.tag_name):
            if candidate.parentNode.nodeName == self.parent:
                found = candidate
        return found

    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
        elem = self._find(obj.info)
        if elem and elem.hasAttribute(self.attr):
            return elem.getAttribute(self.attr)
        return ''

    def __set__(self, obj, val):
        if val is None:
            val = ""
        document = obj.info
        elem = self._find(document)
        if elem:
            elem.setAttribute(self.attr, val)
        # Reassign so owners with an ``info`` property persist the change.
        obj.info = document

    def __repr__(self):
        return "XML Attr Field: " + self.tag_name + " in " + self.parent

    def __str__(self):
        return self.tag_name+'.'+self.attr
|
||||
|
||||
|
||||
class xml_field(object):
    """
    Descriptor that gets and sets XML based meta information from an LRF file.
    Works for simple XML fields of the form <tagname>data</tagname>
    """

    def __init__(self, tag_name, parent="BookInfo"):
        """
        @param tag_name: The XML tag whose data we operate on
        @param parent: The tagname of the parent element of C{tag_name}
        """
        self.tag_name = tag_name
        self.parent = parent

    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
        document = obj.info

        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
            elem = None
            for candidate in elems:
                if candidate.parentNode.nodeName == self.parent:
                    elem = candidate
            if elem:
                elem.normalize()
                if elem.hasChildNodes():
                    return elem.firstChild.data.strip()
        return ''

    def __set__(self, obj, val):
        # Normalise the value once (the original repeated this check twice).
        if not val:
            val = ''
        if not isinstance(val, str):
            val = val.decode('utf-8')
        document = obj.info

        def create_elem():
            # Create the element under the first matching parent tag.
            elem = document.createElement(self.tag_name)
            parent = document.getElementsByTagName(self.parent)[0]
            parent.appendChild(elem)
            return elem

        elems = document.getElementsByTagName(self.tag_name)
        elem = None
        if len(elems):
            for candidate in elems:
                if candidate.parentNode.nodeName == self.parent:
                    elem = candidate
            if not elem:
                elem = create_elem()
            else:
                # Replace any existing text content.
                elem.normalize()
                while elem.hasChildNodes():
                    elem.removeChild(elem.lastChild)
        else:
            elem = create_elem()
        elem.appendChild(document.createTextNode(val))

        # Reassign so owners with an ``info`` property persist the change.
        obj.info = document

    def __str__(self):
        return self.tag_name

    def __repr__(self):
        return "XML Field: " + self.tag_name + " in " + self.parent
|
||||
|
||||
|
||||
def insert_into_file(fileobj, data, start, end):
    """
    Insert data into fileobj at position C{start}.

    This function inserts data into a file, overwriting all data between start
    and end. If end == start no data is overwritten. Do not use this function
    to append data to a file.

    @param fileobj: file like object
    @param data: data to be inserted into fileobj
    @param start: The position at which to start inserting data
    @param end: The position in fileobj of data that must not be overwritten
    @return: C{start + len(data) - end}
    """
    with io.BytesIO() as preserved:
        # Stash everything from ``end`` onwards before overwriting.
        fileobj.seek(end)
        shutil.copyfileobj(fileobj, preserved, -1)
        preserved.seek(0)
        # Overwrite [start, end) with the new data and drop the old tail.
        fileobj.seek(start)
        fileobj.write(data)
        fileobj.flush()
        fileobj.truncate()
        delta = fileobj.tell() - end  # < 0 if len(data) < end-start
        # Re-append the preserved tail after the inserted data.
        shutil.copyfileobj(preserved, fileobj, -1)
        fileobj.flush()
    return delta
|
||||
|
||||
|
||||
def get_metadata(stream):
    """
    Return basic meta-data about the LRF file in C{stream} as a
    L{MetaInformation} object.
    @param stream: A file like object or an instance of L{LRFMetaFile}
    """
    if isinstance(stream, LRFMetaFile):
        lrf = stream
    else:
        lrf = LRFMetaFile(stream)
    mi = MetaInformation(lrf.title.strip(), string_to_authors(lrf.author))
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()
    mi.category = lrf.category.strip()+', '+lrf.classification.strip()
    tags = [t.strip() for t in mi.category.split(',') if t.strip()]
    if tags:
        mi.tags = tags
    if mi.category.strip() == ',':
        # Both category and classification were empty.
        mi.category = None
    mi.publisher = lrf.publisher.strip()
    mi.cover_data = lrf.get_cover()
    try:
        mi.title_sort = lrf.title_reading.strip() or None
    except Exception:
        pass
    try:
        mi.author_sort = lrf.author_reading.strip() or None
    except Exception:
        pass
    # Normalise obvious placeholder values to None.
    if not mi.title or 'unknown' in mi.title.lower():
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if not mi.author or 'unknown' in mi.author.lower():
        mi.author = None
    if not mi.category or 'unknown' in mi.category.lower():
        mi.category = None
    publisher = (mi.publisher or '').lower()
    if not mi.publisher or 'unknown' in publisher or \
            'some publisher' in publisher:
        mi.publisher = None

    return mi
|
||||
|
||||
|
||||
class LRFMetaFile(object):
    """Has properties to read and write all Meta information in a LRF file."""
    #: The first 6 bytes of all valid LRF files
    LRF_HEADER = 'LRF'.encode('utf-16le')

    # Fixed-position fields of the binary LRF header.
    lrf_header = fixed_stringfield(length=6, start=0x0)
    version = field(fmt=WORD, start=0x8)
    xor_key = field(fmt=WORD, start=0xa)
    root_object_id = field(fmt=DWORD, start=0xc)
    number_of_objects = field(fmt=QWORD, start=0x10)
    object_index_offset = field(fmt=QWORD, start=0x18)
    binding = field(fmt=BYTE, start=0x24)
    dpi = field(fmt=WORD, start=0x26)
    width = field(fmt=WORD, start=0x2a)
    height = field(fmt=WORD, start=0x2c)
    color_depth = field(fmt=BYTE, start=0x2e)
    toc_object_id = field(fmt=DWORD, start=0x44)
    toc_object_offset = field(fmt=DWORD, start=0x48)
    compressed_info_size = field(fmt=WORD, start=0x4c)
    # Thumbnail fields only exist in files with version > 800.
    thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e)
    thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50)
    uncompressed_info_size = versioned_field(compressed_info_size, 0,
                                             fmt=DWORD, start=0x54)

    # Fields stored inside the zlib-compressed XML meta info block.
    title = xml_field("Title", parent="BookInfo")
    title_reading = xml_attr_field("Title", 'reading', parent="BookInfo")
    author = xml_field("Author", parent="BookInfo")
    author_reading = xml_attr_field("Author", 'reading', parent="BookInfo")
    # 16 characters. First two chars should be FB for personal use ebooks.
    book_id = xml_field("BookID", parent="BookInfo")
    publisher = xml_field("Publisher", parent="BookInfo")
    label = xml_field("Label", parent="BookInfo")
    category = xml_field("Category", parent="BookInfo")
    classification = xml_field("Classification", parent="BookInfo")
    free_text = xml_field("FreeText", parent="BookInfo")
    # Should use ISO 639 language codes
    language = xml_field("Language", parent="DocInfo")
    creator = xml_field("Creator", parent="DocInfo")
    # Format is %Y-%m-%d
    creation_date = xml_field("CreationDate", parent="DocInfo")
    producer = xml_field("Producer", parent="DocInfo")
    page = xml_field("SumPage", parent="DocInfo")

    def safe(func):
        """
        Decorator that ensures that function calls leave the pos
        in the underlying file unchanged
        """
        @functools.wraps(func)
        def restore_pos(*args, **kwargs):
            obj = args[0]
            pos = obj._file.tell()
            res = func(*args, **kwargs)
            obj._file.seek(0, 2)
            # Only restore the position if it is still within the file
            # (the wrapped call may have shrunk the file).
            if obj._file.tell() >= pos:
                obj._file.seek(pos)
            return res
        return restore_pos

    def safe_property(func):
        """
        Decorator that ensures that read or writing a property leaves
        the position in the underlying file unchanged
        """
        def decorator(f):
            def restore_pos(*args, **kwargs):
                obj = args[0]
                pos = obj._file.tell()
                res = f(*args, **kwargs)
                obj._file.seek(0, 2)
                if obj._file.tell() >= pos:
                    obj._file.seek(pos)
                return res
            return restore_pos
        # func() returns a dict of property arguments (fget/fset/doc);
        # wrap the accessors so they preserve the file position.
        locals_ = func()
        if 'fget' in locals_:
            locals_["fget"] = decorator(locals_["fget"])
        if 'fset' in locals_:
            locals_["fset"] = decorator(locals_["fset"])
        return property(**locals_)

    @safe_property
    def info():
        doc = """\
        Document meta information as a minidom Document object.
        To set use a minidom document object.
        """

        def fget(self):
            if self.compressed_info_size == 0:
                raise LRFException("This document has no meta info")
            # The stored size includes 4 extra bytes beyond the zlib
            # stream; mirrored by the "+ 4" in fset below.
            size = self.compressed_info_size - 4
            self._file.seek(self.info_start)
            try:
                src = zlib.decompress(self._file.read(size))
                if len(src) != self.uncompressed_info_size:
                    raise LRFException("Decompression of document meta info\
 yielded unexpected results")

                src = xml_to_unicode(src, strip_encoding_pats=True,
                                     resolve_entities=True,
                                     assume_utf8=True)[0]
                return minidom.parseString(src)
            except zlib.error:
                raise LRFException("Unable to decompress document meta "
                                   "information")

        def fset(self, document):
            info = document.toxml('utf-8')
            self.uncompressed_info_size = len(info)
            stream = zlib.compress(info)
            orig_size = self.compressed_info_size
            self.compressed_info_size = len(stream) + 4
            delta = insert_into_file(self._file, stream, self.info_start,
                                     self.info_start + orig_size - 4)

            # Everything after the info block moved by delta bytes.
            if self.toc_object_offset > 0:
                self.toc_object_offset += delta
            self.object_index_offset += delta
            self.update_object_offsets(delta)

        return {"fget": fget, "fset": fset, "doc": doc}

    @safe_property
    def thumbnail_pos():
        doc = """The position of the thumbnail in the LRF file"""

        def fget(self):
            return self.info_start + self.compressed_info_size-4
        return {"fget": fget, "doc": doc}

    @classmethod
    def _detect_thumbnail_type(cls, slice):
        """ @param slice: The first 16 bytes of the thumbnail """
        ttype = 0x14  # GIF
        # BUGFIX: *slice* is a bytes object (it comes from file reads /
        # binary data), so membership tests must use bytes literals.
        # Comparing against str literals raises TypeError on Python 3.
        if b"PNG" in slice:
            ttype = 0x12
        if b"BM" in slice:
            ttype = 0x13
        if b"JFIF" in slice:
            ttype = 0x11
        return ttype

    @safe_property
    def thumbnail():
        doc = """\
        The thumbnail.
        Represented as a string.
        The string you would get from the file read function.
        """

        def fget(self):
            size = self.thumbnail_size
            if size:
                self._file.seek(self.thumbnail_pos)
                return self._file.read(size)

        def fset(self, data):
            if self.version <= 800:
                raise LRFException("Cannot store thumbnails in LRF files "
                                   "of version <= 800")
            slice = data[0:16]
            orig_size = self.thumbnail_size
            self.thumbnail_size = len(data)
            delta = insert_into_file(self._file, data, self.thumbnail_pos,
                                     self.thumbnail_pos + orig_size)
            self.toc_object_offset += delta
            self.object_index_offset += delta
            self.thumbnail_type = self._detect_thumbnail_type(slice)
            self.update_object_offsets(delta)

        return {"fget": fget, "fset": fset, "doc": doc}

    def __init__(self, file):
        """ @param file: A file object opened in the r+b mode """
        file.seek(0, 2)
        self.size = file.tell()
        self._file = file
        if self.lrf_header != LRFMetaFile.LRF_HEADER:
            raise LRFException(file.name + " has an invalid LRF header. Are "
                               "you sure it is an LRF file?")
        # Byte at which the compressed meta information starts
        self.info_start = 0x58 if self.version > 800 else 0x53

    @safe
    def update_object_offsets(self, delta):
        """
        Run through the LRF Object index changing the offset by C{delta}.
        """
        self._file.seek(self.object_index_offset)
        count = self.number_of_objects
        while count > 0:
            # Each index entry is 16 bytes; bytes 4-8 hold the offset.
            raw = self._file.read(8)
            new_offset = struct.unpack(DWORD, raw[4:8])[0] + delta
            if new_offset >= (2**8)**4 or new_offset < 0x4C:
                raise LRFException('Invalid LRF file. Could not set metadata.')
            self._file.seek(-4, os.SEEK_CUR)
            self._file.write(struct.pack(DWORD, new_offset))
            self._file.seek(8, os.SEEK_CUR)
            count -= 1
        self._file.flush()

    @safe
    def unpack(self, fmt=DWORD, start=0):
        """
        Return decoded data from file.

        @param fmt: See http://docs.python.org/lib/module-struct.html
        @param start: Position in file from which to decode
        """
        end = start + struct.calcsize(fmt)
        self._file.seek(start)
        ret = struct.unpack(fmt, self._file.read(end-start))
        return ret

    @safe
    def pack(self, *args, **kwargs):
        """
        Encode C{args} and write them to file.
        C{kwargs} must contain the keywords C{fmt} and C{start}

        @param args: The values to pack
        @param fmt: See http://docs.python.org/lib/module-struct.html
        @param start: Position in file at which to write encoded data
        """
        encoded = struct.pack(kwargs["fmt"], *args)
        self._file.seek(kwargs["start"])
        self._file.write(encoded)
        self._file.flush()

    def thumbail_extension(self):
        """
        Return the extension for the thumbnail image type as specified
        by L{self.thumbnail_type}. If the LRF file was created by buggy
        software, the extension maye be incorrect. See
        L{self.fix_thumbnail_type}.
        """
        # NOTE(review): method name is missing an 'n' (thumbail);
        # kept as-is because external callers use this spelling.
        ext = "gif"
        ttype = self.thumbnail_type
        if ttype == 0x11:
            ext = "jpeg"
        elif ttype == 0x12:
            ext = "png"
        elif ttype == 0x13:
            ext = "bmp"
        return ext

    def fix_thumbnail_type(self):
        """
        Attempt to guess the thumbnail image format and set
        L{self.thumbnail_type} accordingly.
        """
        slice = self.thumbnail[0:16]
        self.thumbnail_type = self._detect_thumbnail_type(slice)

    def seek(self, *args):
        """ See L{file.seek} """
        return self._file.seek(*args)

    def tell(self):
        """ See L{file.tell} """
        return self._file.tell()

    def read(self):
        """ See L{file.read} """
        return self._file.read()

    def write(self, val):
        """ See L{file.write} """
        self._file.write(val)

    def _objects(self):
        """Yield (id, offset, size) for every entry in the object index.

        Each index entry is 16 bytes (four DWORDs); the fourth value is
        discarded.
        """
        self._file.seek(self.object_index_offset)
        c = self.number_of_objects
        while c > 0:
            c -= 1
            raw = self._file.read(16)
            pos = self._file.tell()
            yield struct.unpack('<IIII', raw)[:3]
            self._file.seek(pos)

    def get_objects_by_type(self, type):
        """Return [(id, offset, size)] for all objects of the given type."""
        from ebook_converter.ebooks.lrf.tags import Tag
        objects = []
        for id, offset, size in self._objects():
            self._file.seek(offset)
            tag = Tag(self._file)
            if tag.id == 0xF500:  # *ObjectStart
                obj_id, obj_type = struct.unpack("<IH", tag.contents)
                if obj_type == type:
                    objects.append((obj_id, offset, size))
        return objects

    def get_object_by_id(self, tid):
        """Return (id, offset, size, type) for the object with id *tid*,
        or (False, False, False, False) if it is not in the index."""
        from ebook_converter.ebooks.lrf.tags import Tag
        for id, offset, size in self._objects():
            self._file.seek(offset)
            tag = Tag(self._file)
            if tag.id == 0xF500:
                obj_id, obj_type = struct.unpack("<IH", tag.contents)
                if obj_id == tid:
                    return obj_id, offset, size, obj_type
        return (False, False, False, False)

    @safe
    def get_cover(self):
        """Heuristically extract a cover image.

        @return: (extension, raw image data) for the first image object
            (type 0x0C) found, or None when there is none.
        """
        from ebook_converter.ebooks.lrf.objects import get_object

        for id, offset, size in self.get_objects_by_type(0x0C):
            image = get_object(None, self._file, id, offset, size,
                               self.xor_key)
            id, offset, size = self.get_object_by_id(image.refstream)[:3]
            image_stream = get_object(None, self._file, id, offset, size,
                                      self.xor_key)
            return image_stream.file.rpartition('.')[-1], image_stream.stream
        return None
|
||||
|
||||
|
||||
def option_parser():
    """Build the option parser for the lrf-meta command line tool.

    @return: an OptionParser pre-populated with all supported options.
    """
    # Imported lazily so importing this module does not pull in the
    # config machinery.
    from ebook_converter.utils.config import OptionParser
    from ebook_converter.constants import __appname__, __version__
    parser = OptionParser(usage=('''%prog [options] mybook.lrf


Show/edit the metadata in an LRF file.\n\n'''),
                          version=__appname__+' '+__version__,
                          epilog='Created by Kovid Goyal')
    parser.add_option("-t", "--title", action="store", type="string",
                      dest="title", help="Set the book title")
    parser.add_option('--title-sort', action='store', type='string',
                      default=None, dest='title_reading',
                      help='Set sort key for the title')
    parser.add_option("-a", "--author", action="store", type="string",
                      dest="author", help="Set the author")
    parser.add_option('--author-sort', action='store', type='string',
                      default=None, dest='author_reading',
                      help='Set sort key for the author')
    parser.add_option("-c", "--category", action="store", type="string",
                      dest="category", help="The category this book belongs "
                      "to. E.g.: History")
    parser.add_option("--thumbnail", action="store", type="string",
                      dest="thumbnail", help="Path to a graphic that will be "
                      "set as this files' thumbnail")
    parser.add_option("--comment", action="store", type="string",
                      dest="comment", help="Path to a TXT file containing the "
                      "comment to be stored in the LRF file.")
    parser.add_option("--get-thumbnail", action="store_true",
                      dest="get_thumbnail", default=False,
                      help="Extract thumbnail from LRF file")
    parser.add_option('--publisher', default=None, help='Set the publisher')
    parser.add_option('--classification', default=None,
                      help='Set the book classification')
    parser.add_option('--creator', default=None, help='Set the book creator')
    parser.add_option('--producer', default=None, help='Set the book '
                      'producer')
    parser.add_option('--get-cover', action='store_true', default=False,
                      help='Extract cover from LRF file. Note that the LRF '
                      'format has no defined cover, so we use some heuristics '
                      'to guess the cover.')
    parser.add_option('--bookid', action='store', type='string', default=None,
                      dest='book_id', help='Set book ID')
    # The SumPage element specifies the number of "View"s (visible pages for
    # the BookSetting element conditions) of the content.
    # Basically, the total pages per the page size, font size, etc. when the
    # LRF is first created. Since this will change as the book is reflowed, it
    # is probably not worth using.
    # parser.add_option("-p", "--page", action="store", type="string", \
    #                   dest="page", help=_("Don't know what this is for"))

    return parser
|
||||
|
||||
|
||||
def set_metadata(stream, mi):
    """Write the fields of the MetaInformation object *mi* into the LRF
    file underlying *stream*.  Falsy fields leave the file untouched.
    """
    lrf = LRFMetaFile(stream)
    # (condition, LRF attribute, value) triples in application order.
    # Note the two 'category' entries: an explicit mi.category wins over
    # the first tag, exactly as the original sequential assignments did.
    updates = (
        (mi.title, 'title', mi.title),
        (mi.authors, 'author', ', '.join(mi.authors or [])),
        (mi.tags, 'category', mi.tags[0] if mi.tags else None),
        (getattr(mi, 'category', False), 'category',
         getattr(mi, 'category', None)),
        (mi.comments, 'free_text', mi.comments),
        (mi.author_sort, 'author_reading', mi.author_sort),
        (mi.publisher, 'publisher', mi.publisher),
    )
    for wanted, attr, value in updates:
        if wanted:
            setattr(lrf, attr, value)
|
||||
|
||||
|
||||
def main(args=sys.argv):
    """Command line entry point: show or edit the metadata of an LRF file.

    @param args: sys.argv-style argument list; args[1] must be the path
        of the LRF file to operate on.
    @return: 1 when no file was specified, otherwise None (exit code 0).
    """
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print()
        print('No lrf file specified')
        return 1
    # r+b: the same handle is used both to read fields and write edits.
    # The with block also fixes a file handle leak in the original code.
    with open(args[1], "r+b") as stream:
        lrf = LRFMetaFile(stream)

        if options.title:
            lrf.title = options.title
        if options.title_reading is not None:
            lrf.title_reading = options.title_reading
        if options.author_reading is not None:
            lrf.author_reading = options.author_reading
        if options.author:
            lrf.author = options.author
        if options.publisher:
            lrf.publisher = options.publisher
        if options.classification:
            lrf.classification = options.classification
        if options.category:
            lrf.category = options.category
        if options.creator:
            lrf.creator = options.creator
        if options.producer:
            lrf.producer = options.producer
        if options.thumbnail:
            path = os.path.expanduser(os.path.expandvars(options.thumbnail))
            with open(path, "rb") as f:
                lrf.thumbnail = f.read()
        if options.book_id is not None:
            lrf.book_id = options.book_id
        if options.comment:
            path = os.path.expanduser(os.path.expandvars(options.comment))
            with open(path, 'rb') as f:
                lrf.free_text = f.read().decode('utf-8', 'replace')
        if options.get_thumbnail:
            t = lrf.thumbnail
            td = "None"
            if t and len(t) > 0:
                td = (os.path.basename(args[1]) + "_thumbnail." +
                      lrf.thumbail_extension())
                with open(td, "wb") as f:
                    f.write(t)

        # BUGFIX: dict.__dict__.items() returns a view in Python 3 and
        # has no .sort() method; build a sorted list instead.
        fields = sorted(LRFMetaFile.__dict__.items())
        for f in fields:
            # Only the xml_field descriptors (their repr contains "XML")
            # represent printable metadata values.
            if "XML" in str(f):
                # BUGFIX: print the str value directly; .encode('utf-8')
                # would print a bytes repr (b'...') under Python 3.
                print(str(f[1]) + ":", getattr(lrf, f[0]))
        if options.get_thumbnail:
            print("Thumbnail:", td)
        if options.get_cover:
            try:
                ext, data = lrf.get_cover()
            except Exception:  # Fails on books created by LRFCreator 1.0
                ext, data = None, None
            if data:
                cover = (os.path.splitext(os.path.basename(args[1]))[0] +
                         "_cover." + ext)
                with open(cover, 'wb') as f:
                    f.write(data)
                print('Cover:', cover)
            else:
                print('Could not find cover in the LRF file')
|
||||
|
||||
|
||||
# Run the metadata tool when this module is executed as a script.
if __name__ == '__main__':
    sys.exit(main())
|
||||
1279
ebook_converter/ebooks/lrf/objects.py
Normal file
1279
ebook_converter/ebooks/lrf/objects.py
Normal file
File diff suppressed because it is too large
Load Diff
255
ebook_converter/ebooks/lrf/tags.py
Normal file
255
ebook_converter/ebooks/lrf/tags.py
Normal file
@@ -0,0 +1,255 @@
|
||||
import struct
|
||||
|
||||
from ebook_converter.ebooks.lrf import LRFParseError
|
||||
|
||||
|
||||
class Tag(object):
    """One tag read from an LRF stream.

    On disk a tag is a two byte identifier (the second byte is always
    0xF5) followed by a payload whose layout depends on the identifier.
    """

    # Maps the low byte of a tag id to (payload size, name).  The size is
    # either a byte count, or the name of a *_parser method that reads a
    # variable-size payload.
    tags = {0x00: (6, "*ObjectStart"),
            0x01: (0, "*ObjectEnd"),
            0x02: (4, "*ObjectInfoLink"),
            0x03: (4, "*Link"),
            0x04: (4, "*StreamSize"),
            0x05: (0, "*StreamStart"),
            0x06: (0, "*StreamEnd"),
            0x07: (4, None),
            0x08: (4, None),
            0x09: (4, None),
            0x0A: (4, None),
            0x0B: ("type_one", "*ContainedObjectsList"),
            0x0D: (2, None),
            0x0E: (2, None),
            0x11: (2, None),
            0x12: (2, None),
            0x13: (2, None),
            0x14: (2, None),
            0x15: (2, None),
            0x16: ("string", None),
            0x17: (4, None),
            0x18: (4, None),
            0x19: (2, None),
            0x1A: (2, None),
            0x1B: (2, None),
            0x1C: (2, None),
            0x1D: (2, None),
            0x1E: (2, None),
            0x21: (2, None),
            0x22: (2, None),
            0x23: (2, None),
            0x24: (2, None),
            0x25: (2, None),
            0x26: (2, None),
            0x27: (2, None),
            0x28: (2, None),
            0x29: (6, None),
            0x2A: (2, None),
            0x2B: (2, None),
            0x2C: (2, None),
            0x2D: (4, None),
            0x2E: (2, None),
            0x31: (2, None),
            0x32: (2, None),
            0x33: (2, None),
            0x34: (4, None),
            0x35: (2, None),
            0x36: (2, None),
            0x37: (4, None),
            0x38: (2, None),
            0x39: (2, None),
            0x3A: (2, None),
            0x3C: (2, None),
            0x3D: (2, None),
            0x3E: (2, None),
            0x41: (2, None),
            0x42: (2, None),
            0x44: (4, None),
            0x45: (4, None),
            0x46: (2, None),
            0x47: (2, None),
            0x48: (2, None),
            0x49: (8, None),
            0x4A: (8, None),
            0x4B: (4, None),
            0x4C: (4, None),
            0x4D: (0, None),
            0x4E: (12, None),
            0x51: (2, None),
            0x52: (2, None),
            0x53: (4, None),
            0x54: (2, "*StreamFlags"),
            0x55: ("string", None),
            0x56: (2, None),
            0x57: (2, None),
            0x58: (2, None),
            0x59: ("string", None),
            0x5A: ("string", None),
            0x5B: (4, None),
            0x5C: ("type_one", None),
            0x5D: ("string", None),
            0x5E: (2, None),
            0x61: (2, None),
            0x62: (0, None),
            0x63: (0, None),
            0x64: (0, None),
            0x65: (0, None),
            0x66: (0, None),
            0x67: (0, None),
            0x68: (0, None),
            0x69: (0, None),
            0x6A: (0, None),
            0x6B: (0, None),
            0x6C: (8, None),
            0x6D: (2, None),
            0x6E: (0, None),
            0x71: (0, None),
            0x72: (0, None),
            0x73: (10, None),
            0x75: (2, None),
            0x76: (2, None),
            0x77: (2, None),
            0x78: ("tag_78", None),
            0x79: (2, None),
            0x7A: (2, None),
            0x7B: (4, None),
            0x7C: (4, "*ParentPageTree"),
            0x81: (0, None),
            0x82: (0, None),
            0xA1: (4, None),
            0xA2: (0, None),
            0xA5: ("unknown", None),
            0xA6: (0, None),
            0xA7: (4, None),
            0xA8: (0, None),
            0xA9: (0, None),
            0xAA: (0, None),
            0xAB: (0, None),
            0xAC: (0, None),
            0xAD: (0, None),
            0xAE: (0, None),
            0xB1: (0, None),
            0xB2: (0, None),
            0xB3: (0, None),
            0xB4: (0, None),
            0xB5: (0, None),
            0xB6: (0, None),
            0xB7: (0, None),
            0xB8: (0, None),
            0xB9: (0, None),
            0xBA: (0, None),
            0xBB: (0, None),
            0xBC: (0, None),
            0xBD: (0, None),
            0xBE: (0, None),
            0xC1: (0, None),
            0xC2: (0, None),
            0xC3: (2, None),
            0xC4: (0, None),
            0xC5: (2, None),
            0xC6: (2, None),
            0xC7: (0, None),
            0xC8: (2, None),
            0xC9: (0, None),
            0xCA: (2, None),
            0xCB: ("unknown", None),
            0xCC: (2, None),
            0xD1: (12, None),
            0xD2: (0, None),
            0xD4: (2, None),
            0xD6: (0, None),
            0xD7: (14, None),
            0xD8: (4, None),
            0xD9: (8, None),
            0xDA: (2, None),
            0xDB: (2, None),
            0xDC: (2, None),
            0xDD: (2, None),
            0xF1: (2, None),
            0xF2: (4, None),
            0xF3: (4, None),
            0xF4: (2, None),
            0xF5: (4, None),
            0xF6: (4, None),
            0xF7: (4, None),
            0xF8: (4, None),
            0xF9: (6, None)}
    # Tag id low byte -> human readable name, for named tags only.
    name_map = {low_byte: name for low_byte, (_, name) in tags.items()
                if name is not None}

    def __init__(self, stream):
        """Read one tag from *stream*, positioned at the tag id.

        Raises LRFParseError when the id is malformed or unknown.
        """
        self.offset = stream.tell()
        tag_id = struct.unpack("<BB", stream.read(2))
        if tag_id[1] != 0xF5:
            raise LRFParseError("Bad tag ID %02X at %d" % (tag_id[1],
                                                           self.offset))
        if tag_id[0] not in self.__class__.tags:
            raise LRFParseError("Unknown tag ID: F5%02X" % tag_id[0])

        self.id = 0xF500 + tag_id[0]

        size, self.name = self.__class__.tags[tag_id[0]]
        if isinstance(size, str):
            # Variable-size payload: dispatch to the named parser method.
            parser = getattr(self, size + '_parser')
            self.contents = parser(stream)
        else:
            self.contents = stream.read(size)

    def __str__(self):
        s = "Tag %04X " % self.id
        if self.name:
            s += self.name
        s += " at %08X, contents: %s" % (self.offset, repr(self.contents))
        return s

    def _scalar(self, fmt, width):
        """Unpack the payload as a single *fmt* value of *width* bytes."""
        if len(self.contents) != width:
            raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)
        return struct.unpack(fmt, self.contents)[0]

    @property
    def byte(self):
        """Payload as an unsigned 8 bit integer."""
        return self._scalar("<B", 1)

    @property
    def word(self):
        """Payload as an unsigned 16 bit integer."""
        return self._scalar("<H", 2)

    @property
    def sword(self):
        """Payload as a signed 16 bit integer."""
        return self._scalar("<h", 2)

    @property
    def dword(self):
        """Payload as an unsigned 32 bit integer."""
        return self._scalar("<I", 4)

    def unknown_parser(self, stream):
        """Parser for tags marked "unknown" (0xA5, 0xCB): their payload
        layout is not known, so encountering one is a parse error.

        BUGFIX: this method was named dummy_parser, so the size-based
        dispatch in __init__ (size + '_parser') raised AttributeError
        instead of the intended LRFParseError for these tags.
        """
        raise LRFParseError("Unknown tag at %08X" % stream.tell())

    # Backwards-compatible alias for the historical name.
    dummy_parser = unknown_parser

    @classmethod
    def string_parser(cls, stream):
        """Read a length-prefixed (WORD) UTF-16 string payload."""
        size = struct.unpack("<H", stream.read(2))[0]
        return str(stream.read(size), "utf_16")

    def type_one_parser(self, stream):
        """Read a count-prefixed (WORD) list of DWORD values."""
        cnt = struct.unpack("<H", stream.read(2))[0]
        return [struct.unpack("<I", stream.read(4))[0] for _ in range(cnt)]

    def tag_78_parser(self, stream):
        """Read tag 0x78's composite payload: a DWORD, the contents of a
        nested F516 string tag, and a trailing WORD."""
        pos = stream.tell()
        res = [struct.unpack("<I", stream.read(4))[0]]
        tag = Tag(stream)
        if tag.id != 0xF516:
            raise LRFParseError("Bad tag 78 at %08X" % pos)
        res.append(tag.contents)
        res.append(struct.unpack("<H", stream.read(2))[0])
        return res
|
||||
Reference in New Issue
Block a user