1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-18 10:14:12 +01:00

Added LRF input format support.

This commit is contained in:
2020-05-24 12:43:33 +02:00
parent 17c52a14a4
commit 399456d9ad
6 changed files with 2867 additions and 1 deletions

View File

@@ -54,6 +54,7 @@ Currently, I've tested following input formats:
- fb2
- html
- pdf
- lrf
Note that the old Microsoft doc format is not supported, although old documents
can be fairly easily converted using text processing programs, like Word or
@@ -65,7 +66,7 @@ Output formats
Currently, following formats are supported:
- lrf (for Sony readers)
- lrf
- epub
- mobi
- docx

View File

@@ -0,0 +1,394 @@
import textwrap, operator
from copy import deepcopy, copy
from lxml import etree
from ebook_converter import guess_type
from ebook_converter.polyglot.builtins import as_bytes
class Canvas(etree.XSLTExtension):
    """XSLT extension element that turns LRF ``Canvas`` nodes into HTML.

    A canvas that places exactly one ``ImageBlock`` is rendered as an image
    page (a ``<div>`` holding an ``<img>``).  Any other canvas is rendered,
    together with the ``Canvas`` siblings that directly follow it, as one
    ``<table>`` with one row per canvas and one cell per placed ``TextBlock``.
    """

    def __init__(self, doc, styles, text_block, log):
        """Store the collaborators used while rendering.

        @param doc: parsed document that is queried with XPath
        @param styles: object providing ``to_num``
        @param text_block: object providing ``render_block``
        @param log: logger providing ``warn``
        """
        # Object ids of canvases that were already emitted.
        self.processed = set()
        self.log = log
        self.text_block = text_block
        self.styles = styles
        self.doc = doc

    def execute(self, context, self_node, input_node, output_parent):
        """Render the canvas referenced by ``input_node`` into
        ``output_parent``; canvases already handled are skipped."""
        canvas_id = input_node.get('objid', None)
        if canvas_id is None or canvas_id in self.processed:
            return
        self.processed.add(canvas_id)
        # Continue with the node found in self.doc rather than the XSLT copy.
        input_node = self.doc.xpath('//Canvas[@objid="%s"]'%canvas_id)[0]

        placed = list(self.get_objects(input_node))
        if len(placed) == 1 and placed[0][0].tag == 'ImageBlock':
            self.image_page(input_node, placed[0][0], output_parent)
            return

        # Gather the run of Canvas siblings that immediately follows.
        group = [input_node]
        for sibling in input_node.itersiblings():
            if sibling.tag != 'Canvas':
                break
            sibling_id = sibling.get('objid', None)
            if sibling_id is not None:
                group.append(sibling)
                self.processed.add(sibling_id)

        table = etree.Element('table')
        table.text = '\n\t'
        for canvas in group:
            row = table.makeelement('tr')
            row.set('id', canvas.get('objid'))
            row.tail = '\n\t'
            table.append(row)
            for obj, _x, _y in self.get_objects(canvas):
                if obj.tag == 'TextBlock':
                    cell = table.makeelement('td')
                    self.text_block.render_block(obj, cell)
                    row.append(cell)
                else:
                    self.log.warn(obj.tag, 'elements in Canvas not supported')
        output_parent.append(table)

    def image_page(self, input_node, block, output_parent):
        """Append a ``<div class="image_page">`` with an ``<img>`` built from
        ``block`` (size from ``xsize``/``ysize``, source from the
        ``ImageStream`` named by ``refstream``) to ``output_parent``."""
        page = etree.Element('div')
        page.set('id', input_node.get('objid', 'scuzzy'))
        page.set('class', 'image_page')
        image = page.makeelement('img')
        for attr, source in (('width', "xsize"), ('height', "ysize")):
            value = self.styles.to_num(block.get(source, None))
            if value is not None:
                image.set(attr, str(int(value)))
        stream_id = block.get('refstream', None)
        if stream_id is not None:
            streams = self.doc.xpath('//ImageStream[@objid="%s"]'%stream_id)
            if streams:
                src = streams[0].get('file', None)
                if src:
                    image.set('src', src)
        page.append(image)
        output_parent.append(page)

    def get_objects(self, node):
        """Yield ``(object, x, y)`` for every ``PutObj`` below ``node`` whose
        target object exists and whose ``x1``/``y1`` are numeric."""
        for put in node.xpath('descendant::PutObj[@refobj and @x1 and @y1]'):
            targets = node.xpath('//*[@objid="%s"]'%put.get('refobj'))
            x, y = map(self.styles.to_num, (put.get('x1'), put.get('y1')))
            if targets and x is not None and y is not None:
                yield targets[0], int(x), int(y)
class MediaType(etree.XSLTExtension):
    """XSLT extension element that writes the guessed MIME type of the
    input node's ``file`` attribute as the text of the output parent."""

    def execute(self, context, self_node, input_node, output_parent):
        guessed = guess_type(input_node.get('file', None))[0]
        # Fall back to a generic binary type when nothing could be guessed.
        output_parent.text = guessed if guessed else 'application/octet-stream'
class ImageBlock(etree.XSLTExtension):
    """XSLT extension element that renders an image block by delegating to
    the ``image_page`` method of the supplied canvas object."""

    def __init__(self, canvas):
        super().__init__()
        self.canvas = canvas

    def execute(self, context, self_node, input_node, output_parent):
        # The block serves both as the page node and as the image source.
        renderer = self.canvas
        renderer.image_page(input_node, input_node, output_parent)
class RuledLine(etree.XSLTExtension):
    """XSLT extension element that emits a horizontal rule (``<hr>``)."""

    def execute(self, context, self_node, input_node, output_parent):
        output_parent.append(etree.Element('hr'))
class TextBlock(etree.XSLTExtension):
    """XSLT extension element that converts an LRF ``TextBlock`` into HTML.

    While a block is rendered the instance keeps three pieces of state:
    ``root`` (the output container), ``parent`` (the element new children
    are appended to) and ``add_text_to`` (an ``(element, 'text'|'tail')``
    pair naming where the next run of text is written).
    """

    # LRF tags that map directly onto an HTML element of another name.
    _SIMPLE_CONTAINERS = {'Sup': 'sup', 'Sub': 'sub', 'Italic': 'i'}

    def __init__(self, styles, char_button_map, plot_map, log):
        """
        @param styles: style registry (provides the style maps, ``to_num``
                       and ``get_text_styles``)
        @param char_button_map: maps ``CharButton`` refobj ids to hrefs
        @param plot_map: maps ``Plot`` refobj ids to image sources
        @param log: logger providing ``warn``
        """
        super().__init__()
        self.log = log
        self.styles = styles
        self.plot_map = plot_map
        self.char_button_map = char_button_map

    def execute(self, context, self_node, input_node, output_parent):
        """Render a copy of ``input_node`` into a new ``<div>``."""
        # Work on a copy: rendering may restructure deeply nested spans.
        block_copy = deepcopy(input_node)
        container = etree.Element('div')
        self.render_block(block_copy, container)
        output_parent.append(container)

    def render_block(self, node, root):
        """Render the text block ``node`` into the existing element ``root``,
        setting its ``class`` and ``id`` from the block's style references
        and object id."""
        registry = self.styles
        text_style = node.get('textstyle', None)
        block_style = node.get('blockstyle')
        classes = []
        if block_style in registry.block_style_map:
            classes.append('bs%d'%registry.block_style_map[block_style])
        if text_style in registry.text_style_map:
            classes.append('ts%d'%registry.text_style_map[text_style])
        if classes:
            root.set('class', ' '.join(classes))
        objid = node.get('objid', None)
        if objid:
            root.set('id', objid)
        root.text = node.text
        self.root = self.parent = root
        self.add_text_to = (root, 'text')
        self.fix_deep_nesting(node)
        for child in node:
            self.process_child(child)

    def fix_deep_nesting(self, node):
        """Flatten chains of nested ``Span`` elements once the nesting depth
        reaches 500; shallower trees are left untouched."""

        def depth(element):
            # Number of elements on the path from ``element`` up to the root.
            levels = 1
            ancestor = element.getparent()
            while ancestor is not None:
                levels += 1
                ancestor = ancestor.getparent()
            return levels

        deepest = 1
        for span in node.xpath('descendant::Span'):
            current = depth(span)
            if current > deepest:
                deepest = current
                if current > 500:
                    break
        if deepest < 500:
            return

        self.log.warn('Found deeply nested spans. Flattening.')

        # Deepest spans first, so inner spans are hoisted before outer ones.
        ranked = [(depth(span), span) for span in node.xpath('descendant::Span')]
        ranked.sort(key=operator.itemgetter(0), reverse=True)

        for span_depth, span in ranked:
            if span_depth < 3:
                continue
            parent = span.getparent()
            grandparent = parent.getparent()
            span_index = parent.index(span)
            parent_index = grandparent.index(parent)
            # The span and everything after it moves up one level.
            moved = list(parent)[span_index:]
            last = moved[-1]
            last.tail = (last.tail if last.tail else '') + (parent.tail if parent.tail else '')
            parent.tail = ''
            inherited = dict(**parent.attrib) if parent.tag == 'Span' else {}
            for child in moved:
                parent.remove(child)
                if inherited and child.tag == "Span":
                    # The child's own attributes win over inherited ones.
                    merged = copy(inherited)
                    merged.update(child.attrib)
                    child.attrib.update(merged)
            for child in reversed(moved):
                grandparent.insert(parent_index+1, child)

    def add_text(self, text):
        """Append ``text`` to the current text target; empty text is ignored."""
        if not text:
            return
        target, slot = self.add_text_to
        existing = getattr(target, slot)
        if existing is None:
            existing = ''
        setattr(target, slot, existing + text)

    def process_container(self, child, tgt):
        """Attach ``tgt`` to the current parent and render the text and
        children of ``child`` inside it; the tail of ``child`` is added
        after ``tgt``."""
        style_index = self.styles.get_text_styles(child)
        if style_index is not None:
            tgt.set('class', 'ts%d'%style_index)
        enclosing = self.parent
        enclosing.append(tgt)
        self.parent = tgt
        self.add_text_to = (tgt, 'text')
        self.add_text(child.text)
        for grandchild in child:
            self.process_child(grandchild)
        self.parent = enclosing
        self.add_text_to = (tgt, 'tail')
        self.add_text(child.tail)

    def process_child(self, child):
        """Translate one child element of a text block into HTML."""
        tag = child.tag
        if tag == 'CR':
            if self.parent == self.root or self.parent.tag == 'p':
                # A line break at paragraph level starts a new paragraph.
                self.parent = self.root.makeelement('p')
                self.root.append(self.parent)
                self.add_text_to = (self.parent, 'text')
            else:
                br = self.parent.makeelement('br')
                self.parent.append(br)
                self.add_text_to = (br, 'tail')
            self.add_text(child.tail)
        elif tag in ('P', 'Span', 'EmpLine', 'NoBR'):
            span = self.root.makeelement('span')
            if tag == 'EmpLine':
                if child.get('emplineposition', 'before') == 'before':
                    decoration = 'underline'
                else:
                    decoration = 'overline'
                span.set('style', 'text-decoration: '+decoration)
            self.process_container(child, span)
        elif tag in self._SIMPLE_CONTAINERS:
            html = self.root.makeelement(self._SIMPLE_CONTAINERS[tag])
            self.process_container(child, html)
        elif tag == 'CharButton':
            link = self.root.makeelement('a')
            target_id = child.get('refobj', None)
            if target_id in self.char_button_map:
                link.set('href', self.char_button_map[target_id])
            self.process_container(child, link)
        elif tag == 'Plot':
            xsize = self.styles.to_num(child.get('xsize', None), 166/720)
            ysize = self.styles.to_num(child.get('ysize', None), 166/720)
            img = self.root.makeelement('img')
            if xsize is not None:
                img.set('width', str(int(xsize)))
            if ysize is not None:
                img.set('height', str(int(ysize)))
            plot_id = child.get('refobj', None)
            if plot_id in self.plot_map:
                img.set('src', self.plot_map[plot_id])
            self.parent.append(img)
            self.add_text_to = (img, 'tail')
            self.add_text(child.tail)
        else:
            self.log.warn('Unhandled Text element:', tag)
class Styles(etree.XSLTExtension):
    """XSLT extension element that collects LRF text and block styles.

    Every distinct style is stored once as a dict of CSS properties in
    ``text_styles`` / ``block_styles``; its list index is used to build the
    CSS class name (``ts<N>`` / ``bs<N>``).  ``text_style_map`` and
    ``block_style_map`` map LRF object ids to those indices.
    """

    def __init__(self):
        etree.XSLTExtension.__init__(self)
        self.text_styles, self.block_styles = [], []
        self.text_style_map, self.block_style_map = {}, {}
        self.CSS = textwrap.dedent('''
        .image_page { text-align:center }
        ''')

    def write(self, name='styles.css'):
        """Write the static CSS plus one rule per non-empty collected style
        to the file ``name``."""

        def join(style):
            # "k : v" declarations separated by ";", none after the last one.
            return ';\n\t'.join('%s : %s' % (k, v) for k, v in style.items())

        with open(name, 'wb') as f:
            f.write(as_bytes(self.CSS))
            for styles, prefix in ((self.text_styles, 'ts'),
                                   (self.block_styles, 'bs')):
                for i, style in enumerate(styles):
                    if not style:
                        continue
                    selector = '.%s%d' % (prefix, i)
                    f.write(as_bytes(
                        selector + ' {\n\t' + join(style) + '\n}\n\n'))

    def execute(self, context, self_node, input_node, output_parent):
        """Register the style described by ``input_node``: ``TextStyle``
        nodes go to the text style map, all other nodes to the block style
        map."""
        if input_node.tag == 'TextStyle':
            idx = self.get_text_styles(input_node)
            if idx is not None:
                self.text_style_map[input_node.get('objid')] = idx
        else:
            idx = self.get_block_styles(input_node)
            self.block_style_map[input_node.get('objid')] = idx

    def px_to_pt(self, px):
        """Convert ``px`` (a number or numeric string) by the factor 72/166.

        Returns ``None`` when ``px`` is missing or not numeric.
        """
        try:
            # Attribute values arrive as strings; without the conversion
            # ``'10' * 72 / 166`` raises TypeError and every length was lost.
            return float(px) * 72 / 166
        except (TypeError, ValueError):
            return None

    def color(self, val):
        """Translate a hexadecimal colour string into a CSS colour.

        The lowest three bytes are read as r, g, b and the highest byte as
        an inverted alpha value: 255 yields ``None``, 0 an opaque
        ``rgb(...)`` and anything else ``rgba(...)``.  Unparsable input
        yields ``None``.
        """
        try:
            val = int(val, 16)
        except (TypeError, ValueError):
            return None
        r, g, b, a = (val & 0xFF, (val >> 8) & 0xFF,
                      (val >> 16) & 0xFF, (val >> 24) & 0xFF)
        if a == 255:
            return None
        if a == 0:
            return 'rgb(%d,%d,%d)' % (r, g, b)
        return 'rgba(%d,%d,%d,%f)' % (r, g, b, 1. - a/255.)

    def get_block_styles(self, node):
        """Return the index in ``block_styles`` of the CSS properties derived
        from the block style attributes of ``node``, registering them first
        if they are new."""
        ans = {}
        sm = self.px_to_pt(node.get('sidemargin', None))
        if sm is not None:
            ans['margin-left'] = ans['margin-right'] = '%fpt' % sm
        ts = self.px_to_pt(node.get('topskip', None))
        if ts is not None:
            ans['margin-top'] = '%fpt' % ts
        fs = self.px_to_pt(node.get('footskip', None))
        if fs is not None:
            ans['margin-bottom'] = '%fpt' % fs
        fw = self.px_to_pt(node.get('framewidth', None))
        if fw is not None:
            ans['border-width'] = '%fpt' % fw
            # NOTE(review): the nesting of border-style under the frame
            # width check is reconstructed - confirm against upstream.
            ans['border-style'] = 'solid'
        fc = self.color(node.get('framecolor', None))
        if fc is not None:
            ans['border-color'] = fc
        bc = self.color(node.get('bgcolor', None))
        if bc is not None:
            ans['background-color'] = bc
        if ans not in self.block_styles:
            self.block_styles.append(ans)
        return self.block_styles.index(ans)

    def to_num(self, val, factor=1.):
        """Return ``float(val) * factor`` or ``None`` if ``val`` is missing
        or not numeric."""
        try:
            return float(val) * factor
        except (TypeError, ValueError):
            return None

    def get_text_styles(self, node):
        """Return the index in ``text_styles`` of the CSS properties derived
        from the text style attributes of ``node`` (registering them if
        new), or ``None`` when the node carries no such attributes."""
        ans = {}
        fs = self.to_num(node.get('fontsize', None), 0.1)
        if fs is not None:
            ans['font-size'] = '%fpt' % fs
        fw = self.to_num(node.get('fontweight', None))
        if fw is not None:
            ans['font-weight'] = ('bold' if fw >= 700 else 'normal')
        fg = self.color(node.get('textcolor', None))
        if fg is not None:
            ans['color'] = fg
        bg = self.color(node.get('textbgcolor', None))
        if bg is not None:
            ans['background-color'] = bg
        al = node.get('align', None)
        if al is not None:
            alignments = dict(head='left', center='center', foot='right')
            ans['text-align'] = alignments.get(al, 'left')
        pi = self.to_num(node.get('parindent', None), 0.1)
        if pi is not None:
            ans['text-indent'] = '%fpt' % pi
        if not ans:
            return None
        if ans not in self.text_styles:
            self.text_styles.append(ans)
        return self.text_styles.index(ans)

View File

@@ -0,0 +1,171 @@
import sys, array, os, re, codecs, logging
from itertools import chain
from ebook_converter import setup_cli_handlers
from ebook_converter.utils.config import OptionParser
from ebook_converter.utils.filenames import ascii_filename
from ebook_converter.ebooks.lrf.meta import LRFMetaFile
from ebook_converter.ebooks.lrf.objects import get_object, PageTree, StyleObject, \
Font, Text, TOCObject, BookAttr, ruby_tags
class LRFDocument(LRFMetaFile):
    """An LRF file parsed into objects, able to serialise itself as LRS XML.

    Call ``parse()`` first; afterwards ``metadata``, ``doc_info`` and
    ``device_info`` hold snapshots of the header values and ``to_xml()``
    can be used.  Iterating the document yields its page trees.
    """

    class temp(object):
        """Plain attribute container."""

    def __init__(self, stream):
        """@param stream: file object passed on to ``LRFMetaFile``"""
        LRFMetaFile.__init__(self, stream)
        self.scramble_key = self.xor_key
        self.page_trees = []
        self.font_map = {}
        self.image_map = {}
        self.toc = ''
        # Checked between objects by the parsing loops; set it to False to
        # stop parsing early.
        self.keep_parsing = True

    def parse(self):
        """Parse all objects and copy the header values into ``metadata``,
        ``doc_info`` and ``device_info``."""
        self._parse_objects()
        self.metadata = LRFDocument.temp()
        for a in ('title', 'title_reading', 'author', 'author_reading',
                  'book_id', 'classification', 'free_text', 'publisher',
                  'label', 'category'):
            setattr(self.metadata, a, getattr(self, a))
        self.doc_info = LRFDocument.temp()
        for a in ('thumbnail', 'language', 'creator', 'producer', 'page'):
            setattr(self.doc_info, a, getattr(self, a))
        self.doc_info.thumbnail_extension = self.thumbail_extension()
        self.device_info = LRFDocument.temp()
        for a in ('dpi', 'width', 'height'):
            setattr(self.device_info, a, getattr(self, a))

    def _parse_objects(self):
        """Read the object index (four unsigned ints per object, of which
        id, offset and size are used) and parse every object it lists."""
        self.objects = {}
        self._file.seek(self.object_index_offset)
        obj_array = array.array(
            "I", self._file.read(4*4*self.number_of_objects))
        # The index is read in native byte order; swap on big-endian hosts.
        # (array.tostring(), used for this check before, was removed in
        # Python 3.9.)
        if sys.byteorder == 'big':
            obj_array.byteswap()
        for i in range(self.number_of_objects):
            if not self.keep_parsing:
                break
            objid, objoff, objsize = obj_array[i*4:i*4+3]
            self._parse_object(objid, objoff, objsize)
        for obj in self.objects.values():
            if not self.keep_parsing:
                break
            if hasattr(obj, 'initialize'):
                obj.initialize()

    def _parse_object(self, objid, objoff, objsize):
        """Parse one object, store it in ``self.objects`` and record page
        trees, the TOC and the ruby tag attributes of a ``BookAttr``."""
        obj = get_object(self, self._file, objid, objoff, objsize,
                         self.scramble_key)
        self.objects[objid] = obj
        if isinstance(obj, PageTree):
            self.page_trees.append(obj)
        elif isinstance(obj, TOCObject):
            self.toc = obj
        elif isinstance(obj, BookAttr):
            self.ruby_tags = {}
            for h in ruby_tags.values():
                attr = h[0]
                if hasattr(obj, attr):
                    self.ruby_tags[attr] = getattr(obj, attr)

    def __iter__(self):
        """Iterate over the page trees in the order they were parsed."""
        for pt in self.page_trees:
            yield pt

    def write_files(self):
        """Write every image and font stream to the file named by its
        ``file`` attribute."""
        for obj in chain(self.image_map.values(), self.font_map.values()):
            with open(obj.file, 'wb') as f:
                f.write(obj.stream)

    def to_xml(self, write_files=True):
        """Return the document as an LRS XML string.

        @param write_files: when true, the thumbnail and the embedded image
                            and font streams are written to disk as a side
                            effect
        """
        bookinfo = '<BookInformation>\n<Info version="1.1">\n<BookInfo>\n'
        bookinfo += '<Title reading="%s">%s</Title>\n'%(self.metadata.title_reading, self.metadata.title)
        bookinfo += '<Author reading="%s">%s</Author>\n'%(self.metadata.author_reading, self.metadata.author)
        bookinfo += '<BookID>%s</BookID>\n'%(self.metadata.book_id,)
        bookinfo += '<Publisher reading="">%s</Publisher>\n'%(self.metadata.publisher,)
        bookinfo += '<Label reading="">%s</Label>\n'%(self.metadata.label,)
        bookinfo += '<Category reading="">%s</Category>\n'%(self.metadata.category,)
        bookinfo += '<Classification reading="">%s</Classification>\n'%(self.metadata.classification,)
        bookinfo += '<FreeText reading="">%s</FreeText>\n</BookInfo>\n<DocInfo>\n'%(self.metadata.free_text,)
        th = self.doc_info.thumbnail
        if th:
            thumb_name = (ascii_filename(self.metadata.title) +
                          '_thumbnail.' + self.doc_info.thumbnail_extension)
            bookinfo += '<CThumbnail file="%s" />\n'%(thumb_name,)
            if write_files:
                with open(thumb_name, 'wb') as f:
                    f.write(th)
        bookinfo += '<Language reading="">%s</Language>\n'%(self.doc_info.language,)
        bookinfo += '<Creator reading="">%s</Creator>\n'%(self.doc_info.creator,)
        bookinfo += '<Producer reading="">%s</Producer>\n'%(self.doc_info.producer,)
        bookinfo += '<SumPage>%s</SumPage>\n</DocInfo>\n</Info>\n%s</BookInformation>\n'%(self.doc_info.page,self.toc)

        # The first page tree is emitted as <Main>, the others as <PageTree>.
        page_parts = []
        pt_id = -1
        for index, page_tree in enumerate(self):
            if index == 0:
                page_parts.append('<Main>\n')
                close = '</Main>\n'
                pt_id = page_tree.id
            else:
                page_parts.append('<PageTree objid="%d">\n'%(page_tree.id,))
                close = '</PageTree>\n'
            page_parts.extend(str(page) for page in page_tree)
            page_parts.append(close)
        pages = ''.join(page_parts)

        # Objects already serialised as part of the pages are not repeated.
        traversed_objects = {int(i) for i in re.findall(r'objid="(\w+)"', pages)}
        traversed_objects.add(pt_id)

        object_parts = ['\n<Objects>\n']
        style_parts = ['\n<Style>\n']
        for obj in self.objects.values():
            if obj.id in traversed_objects:
                continue
            if isinstance(obj, (Font, Text, TOCObject)):
                continue
            if isinstance(obj, StyleObject):
                style_parts.append(str(obj))
            else:
                object_parts.append(str(obj))
        style_parts.append('</Style>\n')
        object_parts.append('</Objects>\n')
        styles = ''.join(style_parts)
        objects = ''.join(object_parts)
        if write_files:
            self.write_files()
        return '<BBeBXylog version="1.0">\n' + bookinfo + pages + styles + objects + '</BBeBXylog>'
def option_parser():
    """Build the command line parser of the LRF to LRS converter."""
    cli = OptionParser(
        usage=_('%prog book.lrf\nConvert an LRF file into an LRS (XML UTF-8 encoded) file'))
    cli.add_option(
        '--output', '-o', dest='out', default=None,
        help=_('Output LRS file'))
    # Resources are written unless this flag switches the option off.
    cli.add_option(
        '--dont-output-resources', dest='output_resources',
        action='store_false', default=True,
        help=_('Do not save embedded image and font files to disk'))
    cli.add_option(
        '--verbose', dest='verbose', action='store_true', default=False,
        help=_('Be more verbose'))
    return cli
def main(args=sys.argv, logger=None):
    """Convert the LRF file named on the command line into an LRS file.

    @param args: argument list; ``args[0]`` is the program name and
                 ``args[1]`` the LRF file
    @param logger: logger to use; when ``None`` a CLI logger named
                   ``lrf2lrs`` is set up
    @return: 0 on success, 1 when the arguments are wrong
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if logger is None:
        level = logging.DEBUG if opts.verbose else logging.INFO
        logger = logging.getLogger('lrf2lrs')
        setup_cli_handlers(logger, level)
    if len(args) != 2:
        parser.print_help()
        return 1
    if opts.out is None:
        # Default: same directory and base name as the input, ".lrs" suffix.
        base = os.path.splitext(os.path.basename(args[1]))[0]
        opts.out = os.path.join(os.path.dirname(args[1]), base + ".lrs")
    logger.info(_('Parsing LRF...'))
    # Keep the input open until the XML has been produced, then close it
    # (the handle was previously leaked).
    with open(args[1], 'rb') as stream:
        d = LRFDocument(stream)
        d.parse()
        logger.info(_('Creating XML...'))
        out_path = os.path.abspath(os.path.expanduser(opts.out))
        with codecs.open(out_path, 'wb', 'utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write(d.to_xml(write_files=opts.output_resources))
    logger.info(_('LRS written to ')+opts.out)
    return 0
# Script entry point: exit with the status returned by main().
if __name__ == '__main__':
    raise SystemExit(main())

View File

@@ -0,0 +1,766 @@
"""
This module presents an easy to use interface for getting and setting
meta information in LRF files.
Just create an L{LRFMetaFile} object and use its properties
to get and set meta information. For example:
>>> lrf = LRFMetaFile(open("mybook.lrf", "r+b"))
>>> print(lrf.title, lrf.author)
>>> lrf.category = "History"
"""
import functools
import io
import os
import shutil
import struct
import sys
from xml.dom import minidom
import zlib
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.metadata import MetaInformation, string_to_authors
# ``struct`` format strings used as the ``fmt`` of the header field
# descriptors defined below.
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
DWORD = "<I" #: Unsigned integer little endian encoded in 4 bytes
QWORD = "<Q" #: Unsigned long long little endian encoded in 8 bytes
class field(object):
    """ A U{Descriptor<http://www.cafepy.com/article/python_attributes_
    and_methods/python_attributes_and_methods.html>},
    that implements access to protocol packets in a human readable way.

    Reading the attribute calls ``obj.unpack`` and writing it calls
    ``obj.pack`` on the owning instance, with this field's offset and
    format.
    """

    def __init__(self, start=16, fmt=DWORD):
        """
        @param start: The byte at which this field is stored in the buffer
        @param fmt: The packing format for this field.
        See U{struct<http://docs.python.org/lib/module-struct.html>}.
        """
        self._fmt, self._start = fmt, start

    def __get__(self, obj, typ=None):
        return obj.unpack(start=self._start, fmt=self._fmt)[0]

    def __set__(self, obj, val):
        obj.pack(val, start=self._start, fmt=self._fmt)

    def __repr__(self):
        # Keyed by the format constants; the QWORD entry used to be keyed
        # by the string 'QWORD' and therefore never matched.
        typ = {DWORD: 'unsigned int', QWORD: 'unsigned long long',
               BYTE: 'unsigned char',
               WORD: 'unsigned short'}.get(self._fmt, '')
        return ("An " + typ + " stored in " +
                str(struct.calcsize(self._fmt)) +
                " bytes starting at byte " + str(self._start))
class versioned_field(field):
    """A ``field`` that is only available while another field of the same
    object holds a value greater than a given threshold."""

    def __init__(self, vfield, version, start=0, fmt=WORD):
        """
        @param vfield: descriptor whose value is compared with C{version}
        @param version: threshold that the value of C{vfield} must exceed
        @param start: The byte at which this field is stored in the buffer
        @param fmt: The packing format for this field
        """
        super().__init__(start=start, fmt=fmt)
        self.version = version
        self.vfield = vfield

    def enabled(self, obj):
        """Return whether the controlling field of ``obj`` exceeds the
        threshold."""
        current = self.vfield.__get__(obj)
        return current > self.version

    def __get__(self, obj, typ=None):
        # A disabled field reads as None.
        if not self.enabled(obj):
            return None
        return super().__get__(obj, typ=typ)

    def __set__(self, obj, val):
        if not self.enabled(obj):
            raise LRFException("Trying to set disabled field")
        super().__set__(obj, val)
class LRFException(Exception):
    """Raised by this module for invalid LRF data or unsupported changes."""
class fixed_stringfield(object):
    """ A field storing a string of fixed length. """

    def __init__(self, length=8, start=0):
        """
        @param length: Size of this string
        @param start: The byte at which this field is stored in the buffer
        """
        self._start = start
        self._length = length

    def __get__(self, obj, typ=None):
        fmt = "<%ss" % self._length
        return obj.unpack(start=self._start, fmt=fmt)[0]

    def __set__(self, obj, val):
        # Anything that is neither text nor bytes is stringified first;
        # text is then stored UTF-8 encoded.
        if not isinstance(val, (str, bytes)):
            val = str(val)
        raw = val.encode('utf-8') if isinstance(val, str) else val
        if len(raw) != self._length:
            raise LRFException(
                "Trying to set fixed_stringfield with a "
                "string of incorrect length")
        obj.pack(raw, start=self._start, fmt="<%ds" % len(raw))

    def __repr__(self):
        return "A string of length %s starting at byte %s" % (
            self._length, self._start)
class xml_attr_field(object):
    """
    Descriptor that gets and sets one attribute of an XML element in the
    meta information document (C{obj.info}) of an LRF file.
    """

    def __init__(self, tag_name, attr, parent='BookInfo'):
        """
        @param tag_name: The XML tag whose attribute we operate on
        @param attr: The name of the attribute
        @param parent: The tagname of the parent element of C{tag_name}
        """
        self.attr = attr
        self.parent = parent
        self.tag_name = tag_name

    def _locate(self, document):
        # The last element with the expected parent wins, or None.
        found = None
        for node in document.getElementsByTagName(self.tag_name):
            if node.parentNode.nodeName == self.parent:
                found = node
        return found

    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
        elem = self._locate(obj.info)
        if elem is not None and elem.hasAttribute(self.attr):
            return elem.getAttribute(self.attr)
        return ''

    def __set__(self, obj, val):
        document = obj.info
        elem = self._locate(document)
        if elem is not None:
            elem.setAttribute(self.attr, "" if val is None else val)
        # The document is written back even when nothing was changed.
        obj.info = document

    def __repr__(self):
        return "XML Attr Field: %s in %s" % (self.tag_name, self.parent)

    def __str__(self):
        return '.'.join((self.tag_name, self.attr))
class xml_field(object):
    """
    Descriptor that gets and sets XML based meta information from an LRF file.
    Works for simple XML fields of the form <tagname>data</tagname>
    """

    def __init__(self, tag_name, parent="BookInfo"):
        """
        @param tag_name: The XML tag whose data we operate on
        @param parent: The tagname of the parent element of C{tag_name}
        """
        self.parent = parent
        self.tag_name = tag_name

    def _locate(self, document):
        # The last element with the expected parent wins, or None.
        found = None
        for node in document.getElementsByTagName(self.tag_name):
            if node.parentNode.nodeName == self.parent:
                found = node
        return found

    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
        elem = self._locate(obj.info)
        if elem is not None:
            elem.normalize()
            if elem.hasChildNodes():
                return elem.firstChild.data.strip()
        return ''

    def __set__(self, obj, val):
        """ Replace the content of the element with C{val}, creating the
        element below the first parent element if it does not exist """
        if not val:
            val = ''
        document = obj.info
        if not isinstance(val, str):
            val = val.decode('utf-8')

        elem = self._locate(document)
        if elem is None:
            elem = document.createElement(self.tag_name)
            document.getElementsByTagName(self.parent)[0].appendChild(elem)
        else:
            # Drop the old content before the new text is added.
            elem.normalize()
            while elem.hasChildNodes():
                elem.removeChild(elem.lastChild)
        elem.appendChild(document.createTextNode(val))
        obj.info = document

    def __str__(self):
        return self.tag_name

    def __repr__(self):
        return "XML Field: %s in %s" % (self.tag_name, self.parent)
def insert_into_file(fileobj, data, start, end):
    """
    Insert data into fileobj at position C{start}.
    The bytes between C{start} and C{end} are replaced by C{data}; everything
    from C{end} onwards is kept and moved behind the new data. If
    end == start no data is overwritten. Do not use this function to append
    data to a file.
    @param fileobj: file like object
    @param data: data to be inserted into fileobj
    @param start: The position at which to start inserting data
    @param end: The position in fileobj of data that must not be overwritten
    @return: C{start + len(data) - end}
    """
    # Save everything that has to survive behind the inserted data.
    fileobj.seek(end)
    trailing = fileobj.read()

    fileobj.seek(start)
    fileobj.write(data)
    fileobj.flush()
    fileobj.truncate()
    shift = fileobj.tell() - end  # negative if len(data) < end-start

    fileobj.write(trailing)
    fileobj.flush()
    return shift
def get_metadata(stream):
    """
    Return basic meta-data about the LRF file in C{stream} as a
    L{MetaInformation} object. Empty values and values containing
    'unknown' are reported as C{None}.
    @param stream: A file like object or an instance of L{LRFMetaFile}
    """
    if isinstance(stream, LRFMetaFile):
        lrf = stream
    else:
        lrf = LRFMetaFile(stream)

    mi = MetaInformation(lrf.title.strip(), string_to_authors(lrf.author))
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()

    # Category and classification are merged; both also feed the tags.
    mi.category = ', '.join(
        (lrf.category.strip(), lrf.classification.strip()))
    tags = [part.strip() for part in mi.category.split(',') if part.strip()]
    if tags:
        mi.tags = tags
    if mi.category.strip() == ',':
        mi.category = None

    mi.publisher = lrf.publisher.strip()
    mi.cover_data = lrf.get_cover()

    # The reading attributes are optional; failures are ignored on purpose.
    try:
        mi.title_sort = lrf.title_reading.strip()
        if not mi.title_sort:
            mi.title_sort = None
    except Exception:
        pass
    try:
        mi.author_sort = lrf.author_reading.strip()
        if not mi.author_sort:
            mi.author_sort = None
    except Exception:
        pass

    def placeholder(value):
        # Missing values and "unknown" markers are treated alike.
        return not value or 'unknown' in value.lower()

    if placeholder(mi.title):
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if placeholder(mi.author):
        mi.author = None
    if placeholder(mi.category):
        mi.category = None
    if placeholder(mi.publisher) or \
            'some publisher' in mi.publisher.lower():
        mi.publisher = None
    return mi
class LRFMetaFile(object):
"""Has properties to read and write all Meta information in a LRF file."""
#: The first 6 bytes of all valid LRF files
LRF_HEADER = 'LRF'.encode('utf-16le')
# Header values: each descriptor below reads/writes a struct-packed value
# at the byte offset given by ``start`` through unpack()/pack().
lrf_header = fixed_stringfield(length=6, start=0x0)
version = field(fmt=WORD, start=0x8)
xor_key = field(fmt=WORD, start=0xa)
root_object_id = field(fmt=DWORD, start=0xc)
number_of_objects = field(fmt=QWORD, start=0x10)
object_index_offset = field(fmt=QWORD, start=0x18)
binding = field(fmt=BYTE, start=0x24)
dpi = field(fmt=WORD, start=0x26)
width = field(fmt=WORD, start=0x2a)
height = field(fmt=WORD, start=0x2c)
color_depth = field(fmt=BYTE, start=0x2e)
toc_object_id = field(fmt=DWORD, start=0x44)
toc_object_offset = field(fmt=DWORD, start=0x48)
compressed_info_size = field(fmt=WORD, start=0x4c)
# The thumbnail fields read as None (and cannot be set) unless
# ``version`` is greater than 800.
thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e)
thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50)
# Only available when ``compressed_info_size`` is greater than 0.
uncompressed_info_size = versioned_field(compressed_info_size, 0,
fmt=DWORD, start=0x54)
# Values stored in the XML meta information document (``self.info``);
# every access goes through the ``info`` property.
title = xml_field("Title", parent="BookInfo")
title_reading = xml_attr_field("Title", 'reading', parent="BookInfo")
author = xml_field("Author", parent="BookInfo")
author_reading = xml_attr_field("Author", 'reading', parent="BookInfo")
# 16 characters. First two chars should be FB for personal use ebooks.
book_id = xml_field("BookID", parent="BookInfo")
publisher = xml_field("Publisher", parent="BookInfo")
label = xml_field("Label", parent="BookInfo")
category = xml_field("Category", parent="BookInfo")
classification = xml_field("Classification", parent="BookInfo")
free_text = xml_field("FreeText", parent="BookInfo")
# Should use ISO 639 language codes
language = xml_field("Language", parent="DocInfo")
creator = xml_field("Creator", parent="DocInfo")
# Format is %Y-%m-%d
creation_date = xml_field("CreationDate", parent="DocInfo")
producer = xml_field("Producer", parent="DocInfo")
page = xml_field("SumPage", parent="DocInfo")
def safe(func):
    """
    Decorator for functions whose first argument has a C{_file} attribute:
    after the call the file position is put back to where it was before,
    unless the file has become shorter than that position (the position
    then stays at the end of the file).
    """
    @functools.wraps(func)
    def restore_pos(*args, **kwargs):
        owner = args[0]
        saved = owner._file.tell()
        result = func(*args, **kwargs)
        # Measure the current file size by seeking to the end.
        owner._file.seek(0, 2)
        if saved <= owner._file.tell():
            owner._file.seek(saved)
        return result
    return restore_pos
def safe_property(func):
    """
    Build a property from the dict returned by C{func} (keys as accepted by
    C{property}: C{fget}, C{fset}, C{doc}, ...). The getter and the setter
    are wrapped so that reading or writing the property leaves the position
    in the underlying file unchanged.
    """
    def keep_position(accessor):
        def restore_pos(*args, **kwargs):
            owner = args[0]
            saved = owner._file.tell()
            result = accessor(*args, **kwargs)
            # Only seek back if the file is still long enough.
            owner._file.seek(0, 2)
            if saved <= owner._file.tell():
                owner._file.seek(saved)
            return result
        return restore_pos

    spec = func()
    for key in ("fget", "fset"):
        if key in spec:
            spec[key] = keep_position(spec[key])
    return property(**spec)
@safe_property
def info():
    doc = ("Document meta information as a minidom Document object.\n"
           "To set use a minidom document object.\n")

    def fget(self):
        """Decompress and parse the meta information.

        Raises LRFException when the file has no meta information, when it
        cannot be decompressed or when the decompressed size does not match
        the size recorded in the header.
        """
        if self.compressed_info_size == 0:
            raise LRFException("This document has no meta info")
        # compressed_info_size is 4 larger than the compressed payload
        # (the setter stores len(stream) + 4).
        size = self.compressed_info_size - 4
        self._file.seek(self.info_start)
        try:
            src = zlib.decompress(self._file.read(size))
        except zlib.error as err:
            raise LRFException("Unable to decompress document meta "
                               "information") from err
        if len(src) != self.uncompressed_info_size:
            # The message used to be split with a backslash inside the
            # literal, which fused "info" and "yielded".
            raise LRFException("Decompression of document meta info "
                               "yielded unexpected results")
        src = xml_to_unicode(src, strip_encoding_pats=True,
                             resolve_entities=True,
                             assume_utf8=True)[0]
        return minidom.parseString(src)

    def fset(self, document):
        """Store ``document`` as the new meta information and shift every
        offset that lies behind it."""
        info = document.toxml('utf-8')
        self.uncompressed_info_size = len(info)
        stream = zlib.compress(info)
        orig_size = self.compressed_info_size
        self.compressed_info_size = len(stream) + 4
        delta = insert_into_file(self._file, stream, self.info_start,
                                 self.info_start + orig_size - 4)
        # An offset of 0 is left untouched.
        if self.toc_object_offset > 0:
            self.toc_object_offset += delta
        self.object_index_offset += delta
        self.update_object_offsets(delta)

    return {"fget": fget, "fset": fset, "doc": doc}
@safe_property
def thumbnail_pos():
    def fget(self):
        # The thumbnail follows the compressed meta information.
        compressed_payload = self.compressed_info_size - 4
        return self.info_start + compressed_payload

    return dict(fget=fget,
                doc="""The position of the thumbnail in the LRF file""")
@classmethod
def _detect_thumbnail_type(cls, slice):
    """
    Guess the thumbnail type code from the start of the image data.
    @param slice: The first 16 bytes of the thumbnail (bytes; a str is
                  accepted as well)
    @return: 0x11 (JFIF), 0x12 (PNG), 0x13 (BM) or 0x14 (GIF, the default)
    """
    # Thumbnail data read from the file is bytes; testing a str marker
    # against it raises TypeError on Python 3, so compare bytes with bytes.
    if isinstance(slice, str):
        slice = slice.encode('latin-1', 'replace')
    else:
        slice = bytes(slice)
    ttype = 0x14  # GIF
    # Later matches override earlier ones, as before.
    if b"PNG" in slice:
        ttype = 0x12
    if b"BM" in slice:
        ttype = 0x13
    if b"JFIF" in slice:
        ttype = 0x11
    return ttype
@safe_property
def thumbnail():
    doc = ("The thumbnail.\n"
           "Represented as a string.\n"
           "The string you would get from the file read function.\n")

    def fget(self):
        """Return the raw thumbnail data, or None when the recorded
        thumbnail size is empty."""
        size = self.thumbnail_size
        if size:
            self._file.seek(self.thumbnail_pos)
            return self._file.read(size)

    def fset(self, data):
        """Replace the thumbnail with ``data`` and shift every offset that
        lies behind it.

        Raises LRFException for files of version <= 800.
        """
        if self.version <= 800:
            raise LRFException("Cannot store thumbnails in LRF files "
                               "of version <= 800")
        slice = data[0:16]
        orig_size = self.thumbnail_size
        self.thumbnail_size = len(data)
        delta = insert_into_file(self._file, data, self.thumbnail_pos,
                                 self.thumbnail_pos + orig_size)
        # Same guard as the ``info`` setter: an offset of 0 must not be
        # shifted (a negative delta would not even fit the unsigned field).
        if self.toc_object_offset > 0:
            self.toc_object_offset += delta
        self.object_index_offset += delta
        self.thumbnail_type = self._detect_thumbnail_type(slice)
        self.update_object_offsets(delta)

    return {"fget": fget, "fset": fset, "doc": doc}
def __init__(self, file):
    """
    @param file: A file object opened in the r+b mode
    Raises LRFException when the file does not start with the LRF header.
    """
    file.seek(0, 2)
    self.size = file.tell()
    self._file = file
    if self.lrf_header != LRFMetaFile.LRF_HEADER:
        # In-memory streams have no ``name``; fall back to their repr so
        # the intended exception is raised instead of an AttributeError.
        name = getattr(file, 'name', None)
        label = str(name) if name else repr(file)
        raise LRFException(label + " has an invalid LRF header. Are "
                           "you sure it is an LRF file?")
    # Byte at which the compressed meta information starts
    self.info_start = 0x58 if self.version > 800 else 0x53
@safe
def update_object_offsets(self, delta):
    """
    Add C{delta} to the offset stored in every entry of the LRF object
    index. Raises LRFException if a resulting offset is out of range.
    """
    stream = self._file
    stream.seek(self.object_index_offset)
    for _ in range(self.number_of_objects):
        # The offset occupies bytes 4-8 of each 16 byte index entry.
        head = stream.read(8)
        shifted = struct.unpack(DWORD, head[4:8])[0] + delta
        if shifted < 0x4C or shifted >= (2**8)**4:
            raise LRFException('Invalid LRF file. Could not set metadata.')
        stream.seek(-4, os.SEEK_CUR)
        stream.write(struct.pack(DWORD, shifted))
        # Skip the remaining 8 bytes of the entry.
        stream.seek(8, os.SEEK_CUR)
    stream.flush()
@safe
def unpack(self, fmt=DWORD, start=0):
    """
    Return decoded data from file.
    @param fmt: See http://docs.python.org/lib/module-struct.html
    @param start: Position in file from which to decode
    @return: the tuple produced by C{struct.unpack}
    """
    width = struct.calcsize(fmt)
    self._file.seek(start)
    raw = self._file.read(width)
    return struct.unpack(fmt, raw)
@safe
def pack(self, *args, **kwargs):
    """
    Encode C{args} and write them to file.
    C{kwargs} must contain the keywords C{fmt} and C{start}
    @param args: The values to pack
    @param fmt: See http://docs.python.org/lib/module-struct.html
    @param start: Position in file at which to write encoded data
    """
    payload = struct.pack(kwargs["fmt"], *args)
    stream = self._file
    stream.seek(kwargs["start"])
    stream.write(payload)
    stream.flush()
def thumbail_extension(self):
    """
    Map L{self.thumbnail_type} to a file extension for the thumbnail.

    0x11, 0x12 and 0x13 give "jpeg", "png" and "bmp"; every other value
    gives "gif". If the LRF file was created by buggy software, the
    extension may be incorrect. See L{self.fix_thumbnail_type}.
    """
    by_type = {0x11: "jpeg", 0x12: "png", 0x13: "bmp"}
    return by_type.get(self.thumbnail_type, "gif")
def fix_thumbnail_type(self):
    """
    Guess the thumbnail image format from the first 16 bytes of
    L{self.thumbnail} and store the result in L{self.thumbnail_type}.
    """
    leading_bytes = self.thumbnail[:16]
    detected = self._detect_thumbnail_type(leading_bytes)
    self.thumbnail_type = detected
def seek(self, *args):
    """ Delegate to the wrapped file object. See L{file.seek} """
    stream = self._file
    return stream.seek(*args)
def tell(self):
    """ Report the position of the wrapped file object. See L{file.tell} """
    position = self._file.tell()
    return position
def read(self):
    """
    Read from the current position of the wrapped file object up to its
    end. See L{file.read}
    """
    remainder = self._file.read()
    return remainder
def write(self, val):
    """
    Write C{val} to the wrapped file object; nothing is returned.
    See L{file.write}
    """
    stream = self._file
    stream.write(val)
def _objects(self):
    """
    Generate the first three little-endian 32 bit values of every 16 byte
    entry in the object index. The callers in this class unpack them as
    C{(id, offset, size)}.
    """
    stream = self._file
    stream.seek(self.object_index_offset)
    for _ in range(self.number_of_objects):
        entry = stream.read(16)
        resume_at = stream.tell()
        first, second, third, _fourth = struct.unpack('<IIII', entry)
        yield first, second, third
        # The consumer may have moved the file pointer while this generator
        # was suspended, so return to the next index entry.
        stream.seek(resume_at)
def get_objects_by_type(self, type):
    """
    Collect the objects whose leading 0xF500 tag declares the given type.

    @param type: Object type code to look for
    @return: List of C{(object id, offset, size)} tuples. The id is read
             from the tag, offset and size come from the object index.
    """
    from ebook_converter.ebooks.lrf.tags import Tag
    matches = []
    for _index_id, offset, size in self._objects():
        self._file.seek(offset)
        start_tag = Tag(self._file)
        if start_tag.id != 0xF500:
            continue
        obj_id, obj_type = struct.unpack("<IH", start_tag.contents)
        if obj_type == type:
            matches.append((obj_id, offset, size))
    return matches
def get_object_by_id(self, tid):
    """
    Find the first object whose leading 0xF500 tag carries the id C{tid}.

    @param tid: Object id to look for
    @return: C{(object id, offset, size, object type)}, or four C{False}
             values when no object matches
    """
    from ebook_converter.ebooks.lrf.tags import Tag
    for _index_id, offset, size in self._objects():
        self._file.seek(offset)
        start_tag = Tag(self._file)
        if start_tag.id != 0xF500:
            continue
        obj_id, obj_type = struct.unpack("<IH", start_tag.contents)
        if obj_id == tid:
            return obj_id, offset, size, obj_type
    return (False,) * 4
@safe
def get_cover(self):
    """
    Return the data behind the first object of type 0x0C.

    @return: C{(extension, data)}, where the extension is whatever follows
             the last dot of the referenced stream object's C{file}
             attribute, or C{None} when the file holds no object of type
             0x0C
    """
    from ebook_converter.ebooks.lrf.objects import get_object
    candidates = self.get_objects_by_type(0x0C)
    if not candidates:
        return None
    # Only the first candidate is ever considered
    obj_id, obj_offset, obj_size = candidates[0]
    image = get_object(None, self._file, obj_id, obj_offset, obj_size,
                       self.xor_key)
    ref_id, ref_offset, ref_size = self.get_object_by_id(image.refstream)[:3]
    image_stream = get_object(None, self._file, ref_id, ref_offset,
                              ref_size, self.xor_key)
    extension = image_stream.file.rpartition('.')[-1]
    return extension, image_stream.stream
# Build the command line option parser of the LRF metadata tool.
# Every option is registered on an OptionParser instance, which is returned
# to the caller; main() reads the parsed values through the ``dest`` names
# given below.
def option_parser():
from ebook_converter.utils.config import OptionParser
from ebook_converter.constants import __appname__, __version__
parser = OptionParser(usage=('''%prog [options] mybook.lrf
Show/edit the metadata in an LRF file.\n\n'''),
version=__appname__+' '+__version__,
epilog='Created by Kovid Goyal')
# Options that set textual metadata fields
parser.add_option("-t", "--title", action="store", type="string",
dest="title", help="Set the book title")
parser.add_option('--title-sort', action='store', type='string',
default=None, dest='title_reading',
help='Set sort key for the title')
parser.add_option("-a", "--author", action="store", type="string",
dest="author", help="Set the author")
parser.add_option('--author-sort', action='store', type='string',
default=None, dest='author_reading',
help='Set sort key for the author')
parser.add_option("-c", "--category", action="store", type="string",
dest="category", help="The category this book belongs "
"to. E.g.: History")
# These two options take a path to a file rather than the value itself
parser.add_option("--thumbnail", action="store", type="string",
dest="thumbnail", help="Path to a graphic that will be "
"set as this files' thumbnail")
parser.add_option("--comment", action="store", type="string",
dest="comment", help="Path to a TXT file containing the "
"comment to be stored in the LRF file.")
parser.add_option("--get-thumbnail", action="store_true",
dest="get_thumbnail", default=False,
help="Extract thumbnail from LRF file")
parser.add_option('--publisher', default=None, help='Set the publisher')
parser.add_option('--classification', default=None,
help='Set the book classification')
parser.add_option('--creator', default=None, help='Set the book creator')
parser.add_option('--producer', default=None, help='Set the book '
'producer')
parser.add_option('--get-cover', action='store_true', default=False,
help='Extract cover from LRF file. Note that the LRF '
'format has no defined cover, so we use some heuristics '
'to guess the cover.')
parser.add_option('--bookid', action='store', type='string', default=None,
dest='book_id', help='Set book ID')
# The SumPage element specifies the number of "View"s (visible pages for
# the BookSetting element conditions) of the content.
# Basically, the total pages per the page size, font size, etc. when the
# LRF is first created. Since this will change as the book is reflowed, it
# is probably not worth using.
# parser.add_option("-p", "--page", action="store", type="string", \
# dest="page", help=_("Don't know what this is for"))
return parser
def set_metadata(stream, mi):
    """
    Copy the populated fields of C{mi} into the LRF file in C{stream}.

    Empty (falsy) fields are skipped. The first tag is used as category
    unless C{mi} carries a truthy C{category} attribute, which then wins.

    @param stream: File object that is handed to L{LRFMetaFile}
    @param mi: Metadata object providing C{title}, C{authors}, C{tags},
               C{comments}, C{author_sort} and C{publisher}
    """
    lrf = LRFMetaFile(stream)
    title = mi.title
    if title:
        lrf.title = title
    authors = mi.authors
    if authors:
        lrf.author = ', '.join(authors)
    tags = mi.tags
    if tags:
        lrf.category = tags[0]
    category = getattr(mi, 'category', False)
    if category:
        lrf.category = category
    comments = mi.comments
    if comments:
        lrf.free_text = comments
    author_sort = mi.author_sort
    if author_sort:
        lrf.author_reading = author_sort
    publisher = mi.publisher
    if publisher:
        lrf.publisher = publisher
def main(args=sys.argv):
    """
    Command line entry point: edit and/or show the metadata of an LRF file.

    Requested changes are applied first, then every metadata field whose
    description contains "XML" is printed, and finally the thumbnail and
    cover are written to the current directory when asked for.

    @param args: Argument vector, program name first and the LRF path last
    @return: 1 when not exactly one LRF file was given, otherwise C{None}
    """
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print()
        print('No lrf file specified')
        return 1
    lrf_path = args[1]
    # The context manager makes sure the file is closed even if one of the
    # updates below fails.
    with open(lrf_path, "r+b") as stream:
        lrf = LRFMetaFile(stream)
        if options.title:
            lrf.title = options.title
        if options.title_reading is not None:
            lrf.title_reading = options.title_reading
        if options.author_reading is not None:
            lrf.author_reading = options.author_reading
        if options.author:
            lrf.author = options.author
        if options.publisher:
            lrf.publisher = options.publisher
        if options.classification:
            lrf.classification = options.classification
        if options.category:
            lrf.category = options.category
        if options.creator:
            lrf.creator = options.creator
        if options.producer:
            lrf.producer = options.producer
        if options.thumbnail:
            path = os.path.expanduser(os.path.expandvars(options.thumbnail))
            with open(path, "rb") as f:
                lrf.thumbnail = f.read()
        if options.book_id is not None:
            lrf.book_id = options.book_id
        if options.comment:
            path = os.path.expanduser(os.path.expandvars(options.comment))
            with open(path, 'rb') as f:
                lrf.free_text = f.read().decode('utf-8', 'replace')
        if options.get_thumbnail:
            t = lrf.thumbnail
            td = "None"
            if t:
                td = (os.path.basename(lrf_path) + "_thumbnail." +
                      lrf.thumbail_extension())
                with open(td, "wb") as f:
                    f.write(t)

        # dict.items() is a view without sort() on Python 3, hence sorted().
        # Attribute names are unique, so ordering by name alone gives the
        # same result as ordering the (name, value) pairs.
        fields = sorted(LRFMetaFile.__dict__.items(),
                        key=lambda item: item[0])
        for field in fields:
            if "XML" in str(field):
                name, descriptor = field
                # Print the text itself; encoding it first would show the
                # repr of a bytes object on Python 3.
                print(str(descriptor) + ":", getattr(lrf, name))
        if options.get_thumbnail:
            print("Thumbnail:", td)
        if options.get_cover:
            # Deliberately best effort: cover extraction fails on books
            # created by LRFCreator 1.0 and yields None when no cover
            # candidate exists.
            try:
                ext, data = lrf.get_cover()
            except Exception:
                ext, data = None, None
            if data:
                cover = (os.path.splitext(os.path.basename(lrf_path))[0] +
                         "_cover." + ext)
                with open(cover, 'wb') as f:
                    f.write(data)
                print('Cover:', cover)
            else:
                print('Could not find cover in the LRF file')
# Run the command line tool and hand its return value to the interpreter as
# the exit status.
if __name__ == '__main__':
    raise SystemExit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,255 @@
import struct
from ebook_converter.ebooks.lrf import LRFParseError
class Tag(object):
    """
    A single tag read from an LRF stream.

    A tag starts with two bytes: the tag number followed by the marker byte
    0xF5. The payload that follows is described by L{tags}.

    Instance attributes set by the constructor:
      - offset:   stream position at which the tag starts
      - id:       0xF500 + tag number
      - name:     name from L{tags}, or None
      - contents: raw payload bytes, or the value produced by a parser
    """

    # tag number -> (payload, name). ``payload`` is either the payload size
    # in bytes or the prefix of the ``<prefix>_parser`` method that reads a
    # variable-length payload. ``name`` is None for unnamed tags.
    tags = {
        0x00: (6, "*ObjectStart"),
        0x01: (0, "*ObjectEnd"),
        0x02: (4, "*ObjectInfoLink"),
        0x03: (4, "*Link"),
        0x04: (4, "*StreamSize"),
        0x05: (0, "*StreamStart"),
        0x06: (0, "*StreamEnd"),
        0x07: (4, None),
        0x08: (4, None),
        0x09: (4, None),
        0x0A: (4, None),
        0x0B: ("type_one", "*ContainedObjectsList"),
        0x0D: (2, None),
        0x0E: (2, None),
        0x11: (2, None),
        0x12: (2, None),
        0x13: (2, None),
        0x14: (2, None),
        0x15: (2, None),
        0x16: ("string", None),
        0x17: (4, None),
        0x18: (4, None),
        0x19: (2, None),
        0x1A: (2, None),
        0x1B: (2, None),
        0x1C: (2, None),
        0x1D: (2, None),
        0x1E: (2, None),
        0x21: (2, None),
        0x22: (2, None),
        0x23: (2, None),
        0x24: (2, None),
        0x25: (2, None),
        0x26: (2, None),
        0x27: (2, None),
        0x28: (2, None),
        0x29: (6, None),
        0x2A: (2, None),
        0x2B: (2, None),
        0x2C: (2, None),
        0x2D: (4, None),
        0x2E: (2, None),
        0x31: (2, None),
        0x32: (2, None),
        0x33: (2, None),
        0x34: (4, None),
        0x35: (2, None),
        0x36: (2, None),
        0x37: (4, None),
        0x38: (2, None),
        0x39: (2, None),
        0x3A: (2, None),
        0x3C: (2, None),
        0x3D: (2, None),
        0x3E: (2, None),
        0x41: (2, None),
        0x42: (2, None),
        0x44: (4, None),
        0x45: (4, None),
        0x46: (2, None),
        0x47: (2, None),
        0x48: (2, None),
        0x49: (8, None),
        0x4A: (8, None),
        0x4B: (4, None),
        0x4C: (4, None),
        0x4D: (0, None),
        0x4E: (12, None),
        0x51: (2, None),
        0x52: (2, None),
        0x53: (4, None),
        0x54: (2, "*StreamFlags"),
        0x55: ("string", None),
        0x56: (2, None),
        0x57: (2, None),
        0x58: (2, None),
        0x59: ("string", None),
        0x5A: ("string", None),
        0x5B: (4, None),
        0x5C: ("type_one", None),
        0x5D: ("string", None),
        0x5E: (2, None),
        0x61: (2, None),
        0x62: (0, None),
        0x63: (0, None),
        0x64: (0, None),
        0x65: (0, None),
        0x66: (0, None),
        0x67: (0, None),
        0x68: (0, None),
        0x69: (0, None),
        0x6A: (0, None),
        0x6B: (0, None),
        0x6C: (8, None),
        0x6D: (2, None),
        0x6E: (0, None),
        0x71: (0, None),
        0x72: (0, None),
        0x73: (10, None),
        0x75: (2, None),
        0x76: (2, None),
        0x77: (2, None),
        0x78: ("tag_78", None),
        0x79: (2, None),
        0x7A: (2, None),
        0x7B: (4, None),
        0x7C: (4, "*ParentPageTree"),
        0x81: (0, None),
        0x82: (0, None),
        0xA1: (4, None),
        0xA2: (0, None),
        0xA5: ("unknown", None),
        0xA6: (0, None),
        0xA7: (4, None),
        0xA8: (0, None),
        0xA9: (0, None),
        0xAA: (0, None),
        0xAB: (0, None),
        0xAC: (0, None),
        0xAD: (0, None),
        0xAE: (0, None),
        0xB1: (0, None),
        0xB2: (0, None),
        0xB3: (0, None),
        0xB4: (0, None),
        0xB5: (0, None),
        0xB6: (0, None),
        0xB7: (0, None),
        0xB8: (0, None),
        0xB9: (0, None),
        0xBA: (0, None),
        0xBB: (0, None),
        0xBC: (0, None),
        0xBD: (0, None),
        0xBE: (0, None),
        0xC1: (0, None),
        0xC2: (0, None),
        0xC3: (2, None),
        0xC4: (0, None),
        0xC5: (2, None),
        0xC6: (2, None),
        0xC7: (0, None),
        0xC8: (2, None),
        0xC9: (0, None),
        0xCA: (2, None),
        0xCB: ("unknown", None),
        0xCC: (2, None),
        0xD1: (12, None),
        0xD2: (0, None),
        0xD4: (2, None),
        0xD6: (0, None),
        0xD7: (14, None),
        0xD8: (4, None),
        0xD9: (8, None),
        0xDA: (2, None),
        0xDB: (2, None),
        0xDC: (2, None),
        0xDD: (2, None),
        0xF1: (2, None),
        0xF2: (4, None),
        0xF3: (4, None),
        0xF4: (2, None),
        0xF5: (4, None),
        0xF6: (4, None),
        0xF7: (4, None),
        0xF8: (4, None),
        0xF9: (6, None),
    }

    # tag number -> name, for the named tags only. The outermost iterable of
    # a comprehension is evaluated in the class scope, so ``tags`` is
    # reachable here and no helper names leak into the class namespace.
    name_map = {number: entry[1] for number, entry in tags.items()
                if entry[1] is not None}

    def __init__(self, stream):
        """
        Read one tag from ``stream`` at its current position.

        Raises LRFParseError if the marker byte is not 0xF5, if the tag
        number is not listed in ``tags``, or if the tag has no real payload
        parser (the "unknown" entries).
        """
        self.offset = stream.tell()
        number, marker = struct.unpack("<BB", stream.read(2))
        if marker != 0xF5:
            raise LRFParseError("Bad tag ID %02X at %d" % (marker,
                                                           self.offset))
        if number not in self.__class__.tags:
            raise LRFParseError("Unknown tag ID: F5%02X" % number)

        self.id = 0xF500 + number
        size, self.name = self.__class__.tags[number]
        if isinstance(size, str):
            # Entries without a matching ``*_parser`` method (the "unknown"
            # ones) fall back to dummy_parser, so the caller gets an
            # LRFParseError rather than an AttributeError.
            parser = getattr(self, size + '_parser', self.dummy_parser)
            self.contents = parser(stream)
        else:
            self.contents = stream.read(size)

    def __str__(self):
        """Return a one-line description: id, name, offset and contents."""
        s = "Tag %04X " % self.id
        if self.name:
            s += self.name
        s += " at %08X, contents: %s" % (self.offset, repr(self.contents))
        return s

    def _decode(self, fmt, size):
        """Decode ``contents`` with ``fmt`` after checking its length."""
        if len(self.contents) != size:
            raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)
        return struct.unpack(fmt, self.contents)[0]

    @property
    def byte(self):
        """The one byte payload as an unsigned integer."""
        return self._decode("<B", 1)

    @property
    def word(self):
        """The two byte payload as an unsigned little-endian integer."""
        return self._decode("<H", 2)

    @property
    def sword(self):
        """The two byte payload as a signed little-endian integer."""
        return self._decode("<h", 2)

    @property
    def dword(self):
        """The four byte payload as an unsigned little-endian integer."""
        return self._decode("<I", 4)

    def dummy_parser(self, stream):
        """Reject a tag whose payload layout is not known."""
        raise LRFParseError("Unknown tag at %08X" % stream.tell())

    @classmethod
    def string_parser(cls, stream):
        """Read a two byte length and that many bytes of UTF-16 text."""
        size = struct.unpack("<H", stream.read(2))[0]
        return str(stream.read(size), "utf_16")

    def type_one_parser(self, stream):
        """Read a two byte count followed by that many 32 bit integers."""
        cnt = struct.unpack("<H", stream.read(2))[0]
        return [struct.unpack("<I", stream.read(4))[0] for _ in range(cnt)]

    def tag_78_parser(self, stream):
        """
        Read the payload of tag 0x78: a 32 bit integer, an embedded 0xF516
        (string) tag and a 16 bit integer, returned as a list of three
        items.
        """
        pos = stream.tell()
        res = [struct.unpack("<I", stream.read(4))[0]]
        tag = Tag(stream)
        if tag.id != 0xF516:
            raise LRFParseError("Bad tag 78 at %08X" % pos)
        res.append(tag.contents)
        res.append(struct.unpack("<H", stream.read(2))[0])
        return res