1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-13 04:55:49 +01:00

Added LRF input format support.

This commit is contained in:
2020-05-24 12:43:33 +02:00
parent 17c52a14a4
commit 399456d9ad
6 changed files with 2867 additions and 1 deletions

View File

@@ -54,6 +54,7 @@ Currently, I've tested following input formats:
- fb2 - fb2
- html - html
- pdf - pdf
- lrf
Note that the old Microsoft doc format is not supported, although old documents
can be fairly easily converted using word-processing programs, like Word or
@@ -65,7 +66,7 @@ Output formats
Currently, following formats are supported: Currently, following formats are supported:
- lrf (for Sony readers) - lrf
- epub - epub
- mobi - mobi
- docx - docx

View File

@@ -0,0 +1,394 @@
import textwrap, operator
from copy import deepcopy, copy
from lxml import etree
from ebook_converter import guess_type
from ebook_converter.polyglot.builtins import as_bytes
class Canvas(etree.XSLTExtension):
    """XSLT extension element that renders LRF ``Canvas`` objects as HTML.

    A canvas that places exactly one ``ImageBlock`` is emitted as an image
    page ``div``.  Any other canvas is grouped with the ``Canvas`` siblings
    that directly follow it and emitted as one ``table`` with a ``tr`` per
    canvas and a ``td`` per placed ``TextBlock``.
    """

    def __init__(self, doc, styles, text_block, log):
        """Store the source document and the helpers used while rendering.

        @param doc: parsed document searched with XPath for referenced objects
        @param styles: object providing ``to_num``
        @param text_block: object providing ``render_block``
        @param log: logger providing ``warn``
        """
        self.doc = doc
        self.styles = styles
        self.text_block = text_block
        self.log = log
        # objids of canvases that have already been written out
        self.processed = set()

    def execute(self, context, self_node, input_node, output_parent):
        """Render ``input_node`` into ``output_parent`` unless already done."""
        canvas_id = input_node.get('objid', None)
        if canvas_id is None or canvas_id in self.processed:
            return
        self.processed.add(canvas_id)
        # Work on the node from the stored document, looked up by objid.
        input_node = self.doc.xpath('//Canvas[@objid="%s"]'%canvas_id)[0]
        placed = list(self.get_objects(input_node))
        if len(placed) == 1 and placed[0][0].tag == 'ImageBlock':
            self.image_page(input_node, placed[0][0], output_parent)
            return

        # Collect the run of Canvas siblings that immediately follows; the
        # run ends at the first sibling that is not a Canvas.
        group = [input_node]
        for sibling in input_node.itersiblings():
            if sibling.tag != 'Canvas':
                break
            sibling_id = sibling.get('objid', None)
            if sibling_id is not None:
                group.append(sibling)
                self.processed.add(sibling_id)

        table = etree.Element('table')
        table.text = '\n\t'
        for canvas in group:
            row = table.makeelement('tr')
            row.set('id', canvas.get('objid'))
            row.tail = '\n\t'
            table.append(row)
            for obj, _x, _y in self.get_objects(canvas):
                if obj.tag == 'TextBlock':
                    cell = table.makeelement('td')
                    self.text_block.render_block(obj, cell)
                    row.append(cell)
                else:
                    self.log.warn(obj.tag, 'elements in Canvas not supported')
        output_parent.append(table)

    def image_page(self, input_node, block, output_parent):
        """Append a ``div.image_page`` holding one ``img`` to ``output_parent``.

        ``width``/``height`` come from the block's ``xsize``/``ysize`` and
        ``src`` from the ``ImageStream`` referenced by ``refstream``; each is
        only set when it can be resolved.
        """
        div = etree.Element('div')
        div.set('id', input_node.get('objid', 'scuzzy'))
        div.set('class', 'image_page')
        img = div.makeelement('img')
        for html_attr, lrf_attr in (('width', 'xsize'), ('height', 'ysize')):
            size = self.styles.to_num(block.get(lrf_attr, None))
            if size is not None:
                img.set(html_attr, str(int(size)))
        stream_id = block.get('refstream', None)
        if stream_id is not None:
            streams = self.doc.xpath('//ImageStream[@objid="%s"]'%stream_id)
            if streams:
                src = streams[0].get('file', None)
                if src:
                    img.set('src', src)
        div.append(img)
        output_parent.append(div)

    def get_objects(self, node):
        """Yield ``(object, x, y)`` for every resolvable ``PutObj`` in ``node``.

        Entries whose target cannot be found, or whose ``x1``/``y1`` are not
        numeric, are skipped.
        """
        for put in node.xpath('descendant::PutObj[@refobj and @x1 and @y1]'):
            targets = node.xpath('//*[@objid="%s"]'%put.get('refobj'))
            x = self.styles.to_num(put.get('x1'))
            y = self.styles.to_num(put.get('y1'))
            if targets and x is not None and y is not None:
                yield targets[0], int(x), int(y)
class MediaType(etree.XSLTExtension):
    """XSLT extension element that writes the MIME type of a ``file`` attribute."""

    def execute(self, context, self_node, input_node, output_parent):
        """Set ``output_parent.text`` to the guessed type of the input file.

        Falls back to ``application/octet-stream`` when no type is guessed.
        """
        mime = guess_type(input_node.get('file', None))[0]
        output_parent.text = mime if mime else 'application/octet-stream'
class ImageBlock(etree.XSLTExtension):
    """XSLT extension element that renders a standalone ``ImageBlock``."""

    def __init__(self, canvas):
        """@param canvas: object whose ``image_page`` does the rendering"""
        super().__init__()
        self.canvas = canvas

    def execute(self, context, self_node, input_node, output_parent):
        """Render the block as an image page; the block is its own container."""
        block = input_node
        self.canvas.image_page(block, block, output_parent)
class RuledLine(etree.XSLTExtension):
    """XSLT extension element that renders a ruled line as an ``hr``."""

    def execute(self, context, self_node, input_node, output_parent):
        """Append an empty ``hr`` element to ``output_parent``."""
        output_parent.append(etree.Element('hr'))
class TextBlock(etree.XSLTExtension):
    """XSLT extension element that converts an LRF ``TextBlock`` to HTML.

    Rendering is stateful: ``root`` is the element being filled, ``parent``
    the element new children are appended to, and ``add_text_to`` an
    ``(element, 'text' | 'tail')`` pair naming where the next run of text
    must be stored.
    """

    # LRF inline tags that map directly onto an HTML wrapper element.
    _SIMPLE_WRAPPERS = {'Sup': 'sup', 'Sub': 'sub', 'Italic': 'i'}

    def __init__(self, styles, char_button_map, plot_map, log):
        """
        @param styles: object providing the style maps, ``to_num`` and
                       ``get_text_styles``
        @param char_button_map: maps ``CharButton`` refobj ids to hrefs
        @param plot_map: maps ``Plot`` refobj ids to image sources
        @param log: logger providing ``warn``
        """
        etree.XSLTExtension.__init__(self)
        self.styles = styles
        self.log = log
        self.char_button_map = char_button_map
        self.plot_map = plot_map

    def execute(self, context, self_node, input_node, output_parent):
        """Render ``input_node`` into a new ``div`` under ``output_parent``."""
        # Work on a copy: fix_deep_nesting() may restructure the node.
        block = deepcopy(input_node)
        div = etree.Element('div')
        self.render_block(block, div)
        output_parent.append(div)

    def render_block(self, node, root):
        """Fill ``root`` with the HTML rendering of the text block ``node``."""
        text_style = node.get('textstyle', None)
        block_style = node.get('blockstyle')
        classes = []
        if block_style in self.styles.block_style_map:
            classes.append('bs%d'%self.styles.block_style_map[block_style])
        if text_style in self.styles.text_style_map:
            classes.append('ts%d'%self.styles.text_style_map[text_style])
        if classes:
            root.set('class', ' '.join(classes))
        objid = node.get('objid', None)
        if objid:
            root.set('id', objid)
        root.text = node.text
        self.root = self.parent = root
        self.add_text_to = (self.parent, 'text')
        self.fix_deep_nesting(node)
        for child in node:
            self.process_child(child)

    def fix_deep_nesting(self, node):
        """Flatten ``Span`` chains in ``node`` when they nest 500+ levels deep.

        Does nothing unless some ``Span`` has at least 500 ancestors-plus-self.
        Otherwise every ``Span`` at depth 3 or more, deepest first, is moved
        (with its following siblings) to just after its parent, inheriting
        the parent's attributes when the parent is itself a ``Span``.
        """
        def depth_of(elem):
            # Number of elements on the path from elem up to the tree root.
            count = 1
            ancestor = elem.getparent()
            while ancestor is not None:
                count += 1
                ancestor = ancestor.getparent()
            return count

        deepest = 1
        for span in node.xpath('descendant::Span'):
            d = depth_of(span)
            if d > deepest:
                deepest = d
                if d > 500:
                    break
        if deepest < 500:
            return

        self.log.warn('Found deeply nested spans. Flattening.')
        # with open('/t/before.xml', 'wb') as f:
        #     f.write(etree.tostring(node, method='xml'))
        ranked = [(depth_of(span), span)
                  for span in node.xpath('descendant::Span')]
        ranked.sort(key=operator.itemgetter(0), reverse=True)

        for level, span in ranked:
            if level < 3:
                continue
            p = span.getparent()
            gp = p.getparent()
            idx = p.index(span)
            pidx = gp.index(p)
            # The span and everything after it inside the parent move out.
            moved = list(p)[idx:]
            last_tail = moved[-1].tail
            last_tail = last_tail if last_tail else ''
            # The parent's tail text must still follow the moved content.
            moved[-1].tail = last_tail + (p.tail if p.tail else '')
            p.tail = ''
            inherited = dict(**p.attrib) if p.tag == 'Span' else {}
            for child in moved:
                p.remove(child)
                if inherited and child.tag == "Span":
                    # Child attributes win over inherited ones.
                    merged = copy(inherited)
                    merged.update(child.attrib)
                    child.attrib.update(merged)
            # Inserting in reverse at a fixed index keeps document order.
            for child in reversed(moved):
                gp.insert(pidx+1, child)
        # with open('/t/after.xml', 'wb') as f:
        #     f.write(etree.tostring(node, method='xml'))

    def add_text(self, text):
        """Append ``text`` to the slot currently named by ``add_text_to``."""
        if not text:
            return
        target, slot = self.add_text_to
        if getattr(target, slot) is None:
            setattr(target, slot, '')
        setattr(target, slot, getattr(target, slot) + text)

    def process_container(self, child, tgt):
        """Render the inline container ``child`` into the new element ``tgt``.

        ``tgt`` is appended to the current parent, receives the child's text
        and rendered children, and afterwards the child's tail is stored as
        ``tgt``'s tail.
        """
        style_index = self.styles.get_text_styles(child)
        if style_index is not None:
            tgt.set('class', 'ts%d'%style_index)
        outer = self.parent
        outer.append(tgt)
        self.parent = tgt
        self.add_text_to = (tgt, 'text')
        self.add_text(child.text)
        for grandchild in child:
            self.process_child(grandchild)
        self.parent = outer
        self.add_text_to = (tgt, 'tail')
        self.add_text(child.tail)

    def process_child(self, child):
        """Render one child element of a text block according to its tag."""
        tag = child.tag
        if tag == 'CR':
            if self.parent == self.root or self.parent.tag == 'p':
                # At block level a line break starts a new paragraph.
                self.parent = self.root.makeelement('p')
                self.root.append(self.parent)
                self.add_text_to = (self.parent, 'text')
            else:
                br = self.parent.makeelement('br')
                self.parent.append(br)
                self.add_text_to = (br, 'tail')
            self.add_text(child.tail)
        elif tag in ('P', 'Span', 'EmpLine', 'NoBR'):
            span = self.root.makeelement('span')
            if tag == 'EmpLine':
                if child.get('emplineposition', 'before') == 'before':
                    decoration = 'underline'
                else:
                    decoration = 'overline'
                span.set('style', 'text-decoration: '+decoration)
            self.process_container(child, span)
        elif tag in self._SIMPLE_WRAPPERS:
            wrapper = self.root.makeelement(self._SIMPLE_WRAPPERS[tag])
            self.process_container(child, wrapper)
        elif tag == 'CharButton':
            a = self.root.makeelement('a')
            target = child.get('refobj', None)
            if target in self.char_button_map:
                a.set('href', self.char_button_map[target])
            self.process_container(child, a)
        elif tag == 'Plot':
            xsize = self.styles.to_num(child.get('xsize', None), 166/720)
            ysize = self.styles.to_num(child.get('ysize', None), 166/720)
            img = self.root.makeelement('img')
            if xsize is not None:
                img.set('width', str(int(xsize)))
            if ysize is not None:
                img.set('height', str(int(ysize)))
            ref = child.get('refobj', None)
            if ref in self.plot_map:
                img.set('src', self.plot_map[ref])
            self.parent.append(img)
            self.add_text_to = (img, 'tail')
            self.add_text(child.tail)
        else:
            self.log.warn('Unhandled Text element:', tag)
class Styles(etree.XSLTExtension):
    """XSLT extension element that collects LRF styles and emits them as CSS.

    Every distinct text/block style is stored once; its list index is used
    to build the CSS class names ``ts<N>`` and ``bs<N>``.
    ``text_style_map`` and ``block_style_map`` map style ``objid`` values to
    those indices.
    """

    def __init__(self):
        etree.XSLTExtension.__init__(self)
        self.text_styles, self.block_styles = [], []
        self.text_style_map, self.block_style_map = {}, {}
        self.CSS = textwrap.dedent('''
        .image_page { text-align:center }
        ''')

    def write(self, name='styles.css'):
        """Write the base CSS and every non-empty collected style to ``name``."""
        def join(style):
            # "prop : value" declarations; the last one has no semicolon.
            return ';\n\t'.join('%s : %s'%(k, v) for k, v in style.items())

        with open(name, 'wb') as f:
            f.write(as_bytes(self.CSS))
            for styles, prefix in ((self.text_styles, 'ts'),
                                   (self.block_styles, 'bs')):
                for i, style in enumerate(styles):
                    if not style:
                        continue
                    selector = '.%s%d'%(prefix, i)
                    f.write(as_bytes(selector + ' {\n\t' + join(style) +
                                     '\n}\n\n'))

    def execute(self, context, self_node, input_node, output_parent):
        """Register the style described by ``input_node`` under its objid.

        ``TextStyle`` nodes are registered as text styles (only when they
        yield at least one CSS property); every other node is treated as a
        block style.
        """
        if input_node.tag == 'TextStyle':
            idx = self.get_text_styles(input_node)
            if idx is not None:
                self.text_style_map[input_node.get('objid')] = idx
        else:
            idx = self.get_block_styles(input_node)
            self.block_style_map[input_node.get('objid')] = idx

    def px_to_pt(self, px):
        """Convert ``px`` to points using a 72/166 scale factor.

        ``px`` may be a number or a numeric string (XML attribute values are
        strings).  Returns ``None`` when ``px`` is ``None`` or not numeric.
        """
        try:
            # Convert first: multiplying a str by 72 repeats the string and
            # the division then fails, which used to discard every value.
            return float(px) * 72/166
        except (TypeError, ValueError):
            return None

    def color(self, val):
        """Translate a hexadecimal colour string into a CSS colour.

        The top byte is treated as transparency: 255 yields ``None``, 0 an
        opaque ``rgb()`` value, anything else an ``rgba()`` value.  Returns
        ``None`` when ``val`` is ``None`` or not valid hexadecimal.
        """
        try:
            val = int(val, 16)
        except (TypeError, ValueError):
            return None
        r, g, b, a = val & 0xFF, (val>>8)&0xFF, (val>>16)&0xFF, (val>>24)&0xFF
        if a == 255:
            return None
        if a == 0:
            return 'rgb(%d,%d,%d)'%(r,g,b)
        return 'rgba(%d,%d,%d,%f)'%(r,g,b,1.-a/255.)

    def get_block_styles(self, node):
        """Return the index of the block style described by ``node``.

        The style is added to ``block_styles`` if it is not already there.
        """
        ans = {}
        sm = self.px_to_pt(node.get('sidemargin', None))
        if sm is not None:
            ans['margin-left'] = ans['margin-right'] = '%fpt'%sm
        ts = self.px_to_pt(node.get('topskip', None))
        if ts is not None:
            ans['margin-top'] = '%fpt'%ts
        fs = self.px_to_pt(node.get('footskip', None))
        if fs is not None:
            ans['margin-bottom'] = '%fpt'%fs
        fw = self.px_to_pt(node.get('framewidth', None))
        if fw is not None:
            ans['border-width'] = '%fpt'%fw
            ans['border-style'] = 'solid'
        fc = self.color(node.get('framecolor', None))
        if fc is not None:
            ans['border-color'] = fc
        bc = self.color(node.get('bgcolor', None))
        if bc is not None:
            ans['background-color'] = bc
        if ans not in self.block_styles:
            self.block_styles.append(ans)
        return self.block_styles.index(ans)

    def to_num(self, val, factor=1.):
        """Return ``float(val) * factor``, or ``None`` if ``val`` is not numeric."""
        try:
            return float(val)*factor
        except (TypeError, ValueError):
            return None

    def get_text_styles(self, node):
        """Return the index of the text style described by ``node``.

        Returns ``None`` when ``node`` carries no recognised style attribute;
        otherwise the style is added to ``text_styles`` if new.
        """
        ans = {}
        fs = self.to_num(node.get('fontsize', None), 0.1)
        if fs is not None:
            ans['font-size'] = '%fpt'%fs
        fw = self.to_num(node.get('fontweight', None))
        if fw is not None:
            ans['font-weight'] = ('bold' if fw >= 700 else 'normal')
        # fn = getattr(obj, 'fontfacename', None)
        # if fn is not None:
        #     fn = cls.FONT_MAP[fn]
        #     item('font-family: %s;'%fn)
        fg = self.color(node.get('textcolor', None))
        if fg is not None:
            ans['color'] = fg
        bg = self.color(node.get('textbgcolor', None))
        if bg is not None:
            ans['background-color'] = bg
        al = node.get('align', None)
        if al is not None:
            alignments = dict(head='left', center='center', foot='right')
            ans['text-align'] = alignments.get(al, 'left')
        # lh = self.to_num(node.get('linespace', None), 0.1)
        # if lh is not None:
        #     ans['line-height'] = '%fpt'%lh
        pi = self.to_num(node.get('parindent', None), 0.1)
        if pi is not None:
            ans['text-indent'] = '%fpt'%pi
        if not ans:
            return None
        if ans not in self.text_styles:
            self.text_styles.append(ans)
        return self.text_styles.index(ans)

View File

@@ -0,0 +1,171 @@
import sys, array, os, re, codecs, logging
from itertools import chain
from ebook_converter import setup_cli_handlers
from ebook_converter.utils.config import OptionParser
from ebook_converter.utils.filenames import ascii_filename
from ebook_converter.ebooks.lrf.meta import LRFMetaFile
from ebook_converter.ebooks.lrf.objects import get_object, PageTree, StyleObject, \
Font, Text, TOCObject, BookAttr, ruby_tags
class LRFDocument(LRFMetaFile):
    """An LRF file parsed into objects, serialisable as LRS XML.

    Call ``parse()`` first; it fills ``objects``, ``page_trees``, ``toc``,
    ``metadata``, ``doc_info`` and ``device_info``.  Iterating the document
    yields its page trees.
    """

    class temp(object):
        """Plain attribute container for the metadata snapshots."""
        pass

    def __init__(self, stream):
        """@param stream: binary file object positioned anywhere in an LRF file"""
        LRFMetaFile.__init__(self, stream)
        self.scramble_key = self.xor_key
        self.page_trees = []
        self.font_map = {}
        self.image_map = {}
        self.toc = ''
        # Checked between objects while parsing; clearing it stops the parse.
        self.keep_parsing = True

    def parse(self):
        """Parse all objects and snapshot the metadata into plain attributes."""
        self._parse_objects()
        self.metadata = LRFDocument.temp()
        for a in ('title', 'title_reading', 'author', 'author_reading',
                  'book_id', 'classification', 'free_text', 'publisher',
                  'label', 'category'):
            setattr(self.metadata, a, getattr(self, a))
        self.doc_info = LRFDocument.temp()
        for a in ('thumbnail', 'language', 'creator', 'producer', 'page'):
            setattr(self.doc_info, a, getattr(self, a))
        self.doc_info.thumbnail_extension = self.thumbail_extension()
        self.device_info = LRFDocument.temp()
        for a in ('dpi', 'width', 'height'):
            setattr(self.device_info, a, getattr(self, a))

    def _parse_objects(self):
        """Read the object index and parse, then initialise, every object."""
        self.objects = {}
        self._file.seek(self.object_index_offset)
        # Each index entry is four unsigned ints; the first three are
        # (object id, offset, size).
        obj_array = array.array(
            "I", self._file.read(4*4*self.number_of_objects))
        # The index is byte-swapped on big-endian hosts.  sys.byteorder
        # replaces the array.tostring() probe, which was removed in
        # Python 3.9.
        if sys.byteorder == 'big':
            obj_array.byteswap()
        for i in range(self.number_of_objects):
            if not self.keep_parsing:
                break
            objid, objoff, objsize = obj_array[i*4:i*4+3]
            self._parse_object(objid, objoff, objsize)
        for obj in self.objects.values():
            if not self.keep_parsing:
                break
            if hasattr(obj, 'initialize'):
                obj.initialize()

    def _parse_object(self, objid, objoff, objsize):
        """Parse one object and record it by kind."""
        obj = get_object(self, self._file, objid, objoff, objsize,
                         self.scramble_key)
        self.objects[objid] = obj
        if isinstance(obj, PageTree):
            self.page_trees.append(obj)
        elif isinstance(obj, TOCObject):
            self.toc = obj
        elif isinstance(obj, BookAttr):
            self.ruby_tags = {}
            for h in ruby_tags.values():
                attr = h[0]
                if hasattr(obj, attr):
                    self.ruby_tags[attr] = getattr(obj, attr)

    def __iter__(self):
        """Iterate over the page trees in the order they were parsed."""
        for pt in self.page_trees:
            yield pt

    def write_files(self):
        """Write every embedded image and font stream to its ``file`` path."""
        for obj in chain(self.image_map.values(), self.font_map.values()):
            with open(obj.file, 'wb') as f:
                f.write(obj.stream)

    def to_xml(self, write_files=True):
        """Return the document as an LRS XML string.

        @param write_files: when true, also write the thumbnail and the
                            embedded image/font files to disk
        """
        bookinfo = '<BookInformation>\n<Info version="1.1">\n<BookInfo>\n'
        bookinfo += '<Title reading="%s">%s</Title>\n'%(self.metadata.title_reading, self.metadata.title)
        bookinfo += '<Author reading="%s">%s</Author>\n'%(self.metadata.author_reading, self.metadata.author)
        bookinfo += '<BookID>%s</BookID>\n'%(self.metadata.book_id,)
        bookinfo += '<Publisher reading="">%s</Publisher>\n'%(self.metadata.publisher,)
        bookinfo += '<Label reading="">%s</Label>\n'%(self.metadata.label,)
        bookinfo += '<Category reading="">%s</Category>\n'%(self.metadata.category,)
        bookinfo += '<Classification reading="">%s</Classification>\n'%(self.metadata.classification,)
        bookinfo += '<FreeText reading="">%s</FreeText>\n</BookInfo>\n<DocInfo>\n'%(self.metadata.free_text,)
        th = self.doc_info.thumbnail
        if th:
            prefix = ascii_filename(self.metadata.title)
            thumb_name = prefix+'_thumbnail.'+self.doc_info.thumbnail_extension
            bookinfo += '<CThumbnail file="%s" />\n'%(thumb_name,)
            if write_files:
                with open(thumb_name, 'wb') as f:
                    f.write(th)
        bookinfo += '<Language reading="">%s</Language>\n'%(self.doc_info.language,)
        bookinfo += '<Creator reading="">%s</Creator>\n'%(self.doc_info.creator,)
        bookinfo += '<Producer reading="">%s</Producer>\n'%(self.doc_info.producer,)
        bookinfo += '<SumPage>%s</SumPage>\n</DocInfo>\n</Info>\n%s</BookInformation>\n'%(self.doc_info.page,self.toc)

        # The first page tree is the main flow; the others keep their objid.
        parts = []
        pt_id = -1
        for index, page_tree in enumerate(self):
            if index == 0:
                parts.append('<Main>\n')
                close = '</Main>\n'
                pt_id = page_tree.id
            else:
                parts.append('<PageTree objid="%d">\n'%(page_tree.id,))
                close = '</PageTree>\n'
            parts.extend(str(page) for page in page_tree)
            parts.append(close)
        pages = ''.join(parts)

        # Objects already serialised as part of the pages must not be
        # emitted again; a set keeps the membership test O(1) per object.
        traversed_objects = {int(i)
                             for i in re.findall(r'objid="(\w+)"', pages)}
        traversed_objects.add(pt_id)

        objects = ['\n<Objects>\n']
        styles = ['\n<Style>\n']
        for obj in self.objects.values():
            if obj.id in traversed_objects:
                continue
            if isinstance(obj, (Font, Text, TOCObject)):
                continue
            if isinstance(obj, StyleObject):
                styles.append(str(obj))
            else:
                objects.append(str(obj))
        styles.append('</Style>\n')
        objects.append('</Objects>\n')
        if write_files:
            self.write_files()
        return ('<BBeBXylog version="1.0">\n' + bookinfo + pages +
                ''.join(styles) + ''.join(objects) + '</BBeBXylog>')
def option_parser():
    """Build the command line option parser for the LRF to LRS converter."""
    usage = _('%prog book.lrf\nConvert an LRF file into an LRS (XML UTF-8 encoded) file')
    cli = OptionParser(usage=usage)
    cli.add_option(
        '--output', '-o',
        dest='out',
        default=None,
        help=_('Output LRS file'),
    )
    cli.add_option(
        '--dont-output-resources',
        dest='output_resources',
        action='store_false',
        default=True,
        help=_('Do not save embedded image and font files to disk'),
    )
    cli.add_option(
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        help=_('Be more verbose'),
    )
    return cli
def main(args=sys.argv, logger=None):
    """Convert the LRF file named on the command line into an LRS file.

    @param args: argument vector; ``args[1]`` must be the input LRF path
    @param logger: logger to report progress on; when ``None`` a
                   ``lrf2lrs`` logger is created and configured
    @return: 0 on success, 1 when the arguments are invalid
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if logger is None:
        level = logging.DEBUG if opts.verbose else logging.INFO
        logger = logging.getLogger('lrf2lrs')
        setup_cli_handlers(logger, level)
    if len(args) != 2:
        parser.print_help()
        return 1
    if opts.out is None:
        # Default: same directory and base name as the input, .lrs suffix.
        base = os.path.splitext(os.path.basename(args[1]))[0]
        opts.out = os.path.join(os.path.dirname(args[1]), base+".lrs")
    logger.info(_('Parsing LRF...'))
    # Keep the input open for the whole conversion, and close it afterwards
    # instead of leaking the handle.
    with open(args[1], 'rb') as stream:
        d = LRFDocument(stream)
        d.parse()
        logger.info(_('Creating XML...'))
        out_path = os.path.abspath(os.path.expanduser(opts.out))
        with codecs.open(out_path, 'wb', 'utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write(d.to_xml(write_files=opts.output_resources))
    logger.info(_('LRS written to ')+opts.out)
    return 0
# Script entry point: exit with the status code returned by main().
if __name__ == '__main__':
    raise SystemExit(main())

View File

@@ -0,0 +1,766 @@
"""
This module presents an easy to use interface for getting and setting
meta information in LRF files.
Just create an L{LRFMetaFile} object and use its properties
to get and set meta information. For example:
>>> lrf = LRFMetaFile("mybook.lrf")
>>> print(lrf.title, lrf.author)
>>> lrf.category = "History"
"""
import functools
import io
import os
import shutil
import struct
import sys
from xml.dom import minidom
import zlib
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.metadata import MetaInformation, string_to_authors
BYTE = "<B"  #: Unsigned char little endian encoded in 1 byte
WORD = "<H"  #: Unsigned short little endian encoded in 2 bytes
DWORD = "<I"  #: Unsigned integer little endian encoded in 4 bytes
QWORD = "<Q"  #: Unsigned long long little endian encoded in 8 bytes


class field(object):
    """Descriptor exposing one fixed-position binary value of the owner.

    Reads and writes are delegated to the owning object's ``unpack`` and
    ``pack`` methods, so the value is always taken from, and stored to, the
    underlying buffer.
    """

    def __init__(self, start=16, fmt=DWORD):
        """
        @param start: The byte at which this field is stored in the buffer
        @param fmt: The packing format for this field.
        See U{struct<http://docs.python.org/lib/module-struct.html>}.
        """
        self._fmt, self._start = fmt, start

    def __get__(self, obj, typ=None):
        """Return the value unpacked from ``obj`` at this field's position."""
        return obj.unpack(start=self._start, fmt=self._fmt)[0]

    def __set__(self, obj, val):
        """Pack ``val`` into ``obj`` at this field's position."""
        obj.pack(val, start=self._start, fmt=self._fmt)

    def __repr__(self):
        # Keyed by the format constants themselves; a quoted 'QWORD' key
        # would never match the "<Q" format string.
        typ = {DWORD: 'unsigned int', QWORD: 'unsigned long long',
               BYTE: 'unsigned char',
               WORD: 'unsigned short'}.get(self._fmt, '')
        return ("An " + typ + " stored in " +
                str(struct.calcsize(self._fmt)) +
                " bytes starting at byte " + str(self._start))
class versioned_field(field):
    """A ``field`` that only exists when another field exceeds a threshold.

    The field is enabled for an object when the value of ``vfield`` on that
    object is strictly greater than ``version``.
    """

    def __init__(self, vfield, version, start=0, fmt=WORD):
        """
        @param vfield: descriptor whose value gates this field
        @param version: threshold the gating value must exceed
        @param start: The byte at which this field is stored in the buffer
        @param fmt: The packing format for this field
        """
        field.__init__(self, start=start, fmt=fmt)
        self.vfield = vfield
        self.version = version

    def enabled(self, obj):
        """Return True when this field is available on ``obj``."""
        return self.vfield.__get__(obj) > self.version

    def __get__(self, obj, typ=None):
        """Return the field value, or ``None`` when the field is disabled."""
        if not self.enabled(obj):
            return None
        return field.__get__(self, obj, typ=typ)

    def __set__(self, obj, val):
        """Store ``val``; raise ``LRFException`` when the field is disabled."""
        if not self.enabled(obj):
            raise LRFException("Trying to set disabled field")
        field.__set__(self, obj, val)
class LRFException(Exception):
    """Raised when an LRF file is invalid or a requested change is not possible."""
class fixed_stringfield(object):
    """ A field storing a fixed length byte string. """

    def __init__(self, length=8, start=0):
        """
        @param length: Size of this string
        @param start: The byte at which this field is stored in the buffer
        """
        self._length = length
        self._start = start

    def __get__(self, obj, typ=None):
        """Return the ``length`` bytes stored at ``start`` in ``obj``."""
        fmt = "<" + str(self._length) + "s"
        return obj.unpack(start=self._start, fmt=fmt)[0]

    def __set__(self, obj, val):
        """Store ``val``, which must encode to exactly ``length`` bytes.

        Non-bytes values are converted with ``str()`` and UTF-8 encoded.
        Raises ``LRFException`` when the encoded length does not match.
        """
        raw = val if isinstance(val, bytes) else str(val).encode('utf-8')
        if len(raw) != self._length:
            raise LRFException("Trying to set fixed_stringfield with a "
                               "string of incorrect length")
        obj.pack(raw, start=self._start, fmt="<" + str(len(raw)) + "s")

    def __repr__(self):
        return ("A string of length " + str(self._length) +
                " starting at byte " + str(self._start))
class xml_attr_field(object):
    """
    Descriptor that gets and sets one attribute of an XML element in the
    meta information of an LRF file.
    """

    def __init__(self, tag_name, attr, parent='BookInfo'):
        """
        @param tag_name: The XML tag whose attribute we operate on
        @param attr: The name of the attribute
        @param parent: The tagname of the parent element of C{tag_name}
        """
        self.tag_name = tag_name
        self.parent = parent
        self.attr = attr

    def _find(self, document):
        """Return the last ``tag_name`` element under ``parent``, or None."""
        match = None
        for candidate in document.getElementsByTagName(self.tag_name):
            if candidate.parentNode.nodeName == self.parent:
                match = candidate
        return match

    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
        elem = self._find(obj.info)
        if elem is not None and elem.hasAttribute(self.attr):
            return elem.getAttribute(self.attr)
        return ''

    def __set__(self, obj, val):
        """Set the attribute to ``val`` (``None`` is stored as '').

        The document is written back to ``obj.info`` even when no matching
        element exists.
        """
        value = "" if val is None else val
        document = obj.info
        elem = self._find(document)
        if elem is not None:
            elem.setAttribute(self.attr, value)
        obj.info = document

    def __repr__(self):
        return "XML Attr Field: " + self.tag_name + " in " + self.parent

    def __str__(self):
        return self.tag_name+'.'+self.attr
class xml_field(object):
    """
    Descriptor that gets and sets XML based meta information from an LRF file.
    Works for simple XML fields of the form <tagname>data</tagname>
    """

    def __init__(self, tag_name, parent="BookInfo"):
        """
        @param tag_name: The XML tag whose data we operate on
        @param parent: The tagname of the parent element of C{tag_name}
        """
        self.tag_name = tag_name
        self.parent = parent

    def _find(self, document):
        """Return the last ``tag_name`` element under ``parent``, or None."""
        match = None
        for candidate in document.getElementsByTagName(self.tag_name):
            if candidate.parentNode.nodeName == self.parent:
                match = candidate
        return match

    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
        elem = self._find(obj.info)
        if elem is None:
            return ''
        elem.normalize()
        if not elem.hasChildNodes():
            return ''
        return elem.firstChild.data.strip()

    def __set__(self, obj, val):
        """Replace the element's content with ``val``.

        Falsy values are stored as ''; bytes are decoded as UTF-8.  The
        element is created under the first ``parent`` element when missing.
        The document is written back to ``obj.info``.
        """
        text = val if val else ''
        if not isinstance(text, str):
            text = text.decode('utf-8')
        document = obj.info
        elem = self._find(document)
        if elem is None:
            elem = document.createElement(self.tag_name)
            document.getElementsByTagName(self.parent)[0].appendChild(elem)
        else:
            # Drop the existing content before writing the new text.
            elem.normalize()
            while elem.hasChildNodes():
                elem.removeChild(elem.lastChild)
        elem.appendChild(document.createTextNode(text))
        obj.info = document

    def __str__(self):
        return self.tag_name

    def __repr__(self):
        return "XML Field: " + self.tag_name + " in " + self.parent
def insert_into_file(fileobj, data, start, end):
    """
    Insert data into fileobj at position C{start}.
    This function inserts data into a file, overwriting all data between start
    and end. If end == start no data is overwritten. Do not use this function
    to append data to a file.
    @param fileobj: file like object
    @param data: data to be inserted into fileobj
    @param start: The position at which to start inserting data
    @param end: The position in fileobj of data that must not be overwritten
    @return: C{start + len(data) - end}
    """
    # Save everything from `end` onwards so it can be re-appended.
    fileobj.seek(end)
    tail = fileobj.read()
    # Write the new data and cut the file right after it.
    fileobj.seek(start)
    fileobj.write(data)
    fileobj.flush()
    fileobj.truncate()
    shift = fileobj.tell() - end  # < 0 if len(data) < end-start
    fileobj.write(tail)
    fileobj.flush()
    return shift
def get_metadata(stream):
    """
    Return basic meta-data about the LRF file in C{stream} as a
    L{MetaInformation} object.
    @param stream: A file like object or an instance of L{LRFMetaFile}
    """
    if isinstance(stream, LRFMetaFile):
        lrf = stream
    else:
        lrf = LRFMetaFile(stream)
    authors = string_to_authors(lrf.author)
    mi = MetaInformation(lrf.title.strip(), authors)
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()
    mi.category = lrf.category.strip()+', '+lrf.classification.strip()
    tags = [part.strip() for part in mi.category.split(',') if part.strip()]
    if tags:
        mi.tags = tags
    # Both category and classification were empty: only the separator is left.
    if mi.category.strip() == ',':
        mi.category = None
    mi.publisher = lrf.publisher.strip()
    mi.cover_data = lrf.get_cover()

    # The "reading" attributes are used as sort keys; failures to read them
    # are ignored and leave the sort key untouched.
    for target, source in (('title_sort', 'title_reading'),
                           ('author_sort', 'author_reading')):
        try:
            setattr(mi, target, getattr(lrf, source).strip())
            if not getattr(mi, target):
                setattr(mi, target, None)
        except Exception:
            pass

    # Blank out empty values and values containing placeholder text.
    if not mi.title or 'unknown' in mi.title.lower():
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if not mi.author or 'unknown' in mi.author.lower():
        mi.author = None
    if not mi.category or 'unknown' in mi.category.lower():
        mi.category = None
    if (not mi.publisher or 'unknown' in mi.publisher.lower() or
            'some publisher' in mi.publisher.lower()):
        mi.publisher = None
    return mi
# Descriptor-based view of an LRF file: the binary header values are read
# from fixed byte positions and the textual metadata from the XML document
# returned by the ``info`` property.
class LRFMetaFile(object):
"""Has properties to read and write all Meta information in a LRF file."""
#: The first 6 bytes of all valid LRF files
LRF_HEADER = 'LRF'.encode('utf-16le')
# Binary header fields; ``start`` is the byte position of each value.
lrf_header = fixed_stringfield(length=6, start=0x0)
version = field(fmt=WORD, start=0x8)
xor_key = field(fmt=WORD, start=0xa)
root_object_id = field(fmt=DWORD, start=0xc)
number_of_objects = field(fmt=QWORD, start=0x10)
object_index_offset = field(fmt=QWORD, start=0x18)
binding = field(fmt=BYTE, start=0x24)
dpi = field(fmt=WORD, start=0x26)
width = field(fmt=WORD, start=0x2a)
height = field(fmt=WORD, start=0x2c)
color_depth = field(fmt=BYTE, start=0x2e)
toc_object_id = field(fmt=DWORD, start=0x44)
toc_object_offset = field(fmt=DWORD, start=0x48)
compressed_info_size = field(fmt=WORD, start=0x4c)
# The thumbnail fields are only enabled when ``version`` is greater than 800.
thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e)
thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50)
# Only enabled when ``compressed_info_size`` is greater than 0.
uncompressed_info_size = versioned_field(compressed_info_size, 0,
fmt=DWORD, start=0x54)
# XML-backed fields: children of <BookInfo> in the ``info`` document.
title = xml_field("Title", parent="BookInfo")
title_reading = xml_attr_field("Title", 'reading', parent="BookInfo")
author = xml_field("Author", parent="BookInfo")
author_reading = xml_attr_field("Author", 'reading', parent="BookInfo")
# 16 characters. First two chars should be FB for personal use ebooks.
book_id = xml_field("BookID", parent="BookInfo")
publisher = xml_field("Publisher", parent="BookInfo")
label = xml_field("Label", parent="BookInfo")
category = xml_field("Category", parent="BookInfo")
classification = xml_field("Classification", parent="BookInfo")
free_text = xml_field("FreeText", parent="BookInfo")
# XML-backed fields: children of <DocInfo> in the ``info`` document.
# Should use ISO 639 language codes
language = xml_field("Language", parent="DocInfo")
creator = xml_field("Creator", parent="DocInfo")
# Format is %Y-%m-%d
creation_date = xml_field("CreationDate", parent="DocInfo")
producer = xml_field("Producer", parent="DocInfo")
page = xml_field("SumPage", parent="DocInfo")
def safe(func):
    """
    Decorator that ensures that function calls leave the pos
    in the underlying file unchanged
    """
    @functools.wraps(func)
    def restore_pos(*args, **kwargs):
        owner = args[0]
        saved = owner._file.tell()
        result = func(*args, **kwargs)
        # Only seek back when the saved position still lies inside the
        # file; otherwise the position stays at the end of the file.
        owner._file.seek(0, os.SEEK_END)
        if owner._file.tell() >= saved:
            owner._file.seek(saved)
        return result
    return restore_pos
def safe_property(func):
    """
    Decorator that ensures that read or writing a property leaves
    the position in the underlying file unchanged
    """
    def preserving(accessor):
        # Wrap one accessor so the file position is restored after it runs.
        def restore_pos(*args, **kwargs):
            owner = args[0]
            saved = owner._file.tell()
            result = accessor(*args, **kwargs)
            owner._file.seek(0, os.SEEK_END)
            if owner._file.tell() >= saved:
                owner._file.seek(saved)
            return result
        return restore_pos

    # ``func`` returns the keyword arguments for property().
    spec = func()
    for key in ("fget", "fset"):
        if key in spec:
            spec[key] = preserving(spec[key])
    return property(**spec)
@safe_property
def info():
    """Build the ``info`` property: the XML metadata of the file."""
    doc = ("Document meta information as a minidom Document object.\n"
           "To set use a minidom document object.\n")

    def fget(self):
        """Decompress and parse the metadata block.

        Raises ``LRFException`` when the file has no metadata, when the
        block cannot be decompressed, or when its size does not match the
        size recorded in the header.
        """
        if self.compressed_info_size == 0:
            raise LRFException("This document has no meta info")
        # The recorded size is 4 larger than the compressed stream itself.
        size = self.compressed_info_size - 4
        self._file.seek(self.info_start)
        try:
            src = zlib.decompress(self._file.read(size))
        except zlib.error as err:
            raise LRFException("Unable to decompress document meta "
                               "information") from err
        if len(src) != self.uncompressed_info_size:
            raise LRFException("Decompression of document meta info "
                               "yielded unexpected results")
        src = xml_to_unicode(src, strip_encoding_pats=True,
                             resolve_entities=True,
                             assume_utf8=True)[0]
        return minidom.parseString(src)

    def fset(self, document):
        """Compress ``document`` into the file and fix up dependent offsets."""
        info = document.toxml('utf-8')
        self.uncompressed_info_size = len(info)
        stream = zlib.compress(info)
        orig_size = self.compressed_info_size
        self.compressed_info_size = len(stream) + 4
        delta = insert_into_file(self._file, stream, self.info_start,
                                 self.info_start + orig_size - 4)
        # Everything stored after the metadata block moved by ``delta``.
        if self.toc_object_offset > 0:
            self.toc_object_offset += delta
        self.object_index_offset += delta
        self.update_object_offsets(delta)

    return {"fget": fget, "fset": fset, "doc": doc}
@safe_property
def thumbnail_pos():
    """Build the read-only ``thumbnail_pos`` property."""
    def fget(self):
        # The thumbnail is stored right after the compressed metadata.
        return self.info_start + self.compressed_info_size-4

    return {
        "fget": fget,
        "doc": """The position of the thumbnail in the LRF file""",
    }
@classmethod
def _detect_thumbnail_type(cls, slice):
""" @param slice: The first 16 bytes of the thumbnail """
ttype = 0x14 # GIF
if "PNG" in slice:
ttype = 0x12
if "BM" in slice:
ttype = 0x13
if "JFIF" in slice:
ttype = 0x11
return ttype
@safe_property
def thumbnail():
    """Build the ``thumbnail`` property: the raw thumbnail image data."""
    doc = ("The thumbnail.\n"
           "Represented as a string.\n"
           "The string you would get from the file read function.\n")

    def fget(self):
        # Returns None when no thumbnail size is recorded.
        size = self.thumbnail_size
        if not size:
            return None
        self._file.seek(self.thumbnail_pos)
        return self._file.read(size)

    def fset(self, data):
        if self.version <= 800:
            raise LRFException("Cannot store thumbnails in LRF files "
                               "of version <= 800")
        head = data[0:16]
        previous_size = self.thumbnail_size
        self.thumbnail_size = len(data)
        delta = insert_into_file(self._file, data, self.thumbnail_pos,
                                 self.thumbnail_pos + previous_size)
        # Everything stored after the thumbnail moved by ``delta``.
        self.toc_object_offset += delta
        self.object_index_offset += delta
        self.thumbnail_type = self._detect_thumbnail_type(head)
        self.update_object_offsets(delta)

    return {"fget": fget, "fset": fset, "doc": doc}
def __init__(self, file):
    """ @param file: A file object opened in the r+b mode

    Raises ``LRFException`` when the file does not start with the LRF
    header.
    """
    file.seek(0, 2)
    self.size = file.tell()
    self._file = file
    if self.lrf_header != LRFMetaFile.LRF_HEADER:
        # In-memory streams have no ``name``; fall back to their repr so
        # the caller still gets an LRFException, not an AttributeError.
        name = str(getattr(file, 'name', None) or repr(file))
        raise LRFException(name + " has an invalid LRF header. Are "
                           "you sure it is an LRF file?")
    # Byte at which the compressed meta information starts
    self.info_start = 0x58 if self.version > 800 else 0x53
@safe
def update_object_offsets(self, delta):
    """
    Run through the LRF Object index changing the offset by C{delta}.
    """
    stream = self._file
    stream.seek(self.object_index_offset)
    for _ in range(self.number_of_objects):
        # The offset is the second 4-byte value of each index entry.
        entry_head = stream.read(8)
        shifted = struct.unpack(DWORD, entry_head[4:8])[0] + delta
        if shifted >= 2**32 or shifted < 0x4C:
            raise LRFException('Invalid LRF file. Could not set metadata.')
        # Step back over the offset, overwrite it, then skip the remaining
        # 8 bytes of the entry.
        stream.seek(-4, os.SEEK_CUR)
        stream.write(struct.pack(DWORD, shifted))
        stream.seek(8, os.SEEK_CUR)
    stream.flush()
@safe
def unpack(self, fmt=DWORD, start=0):
    """
    Return decoded data from file.
    @param fmt: See http://docs.python.org/lib/module-struct.html
    @param start: Position in file from which to decode
    """
    self._file.seek(start)
    raw = self._file.read(struct.calcsize(fmt))
    return struct.unpack(fmt, raw)
@safe
def pack(self, *args, **kwargs):
    """
    Encode C{args} and write them to file.
    C{kwargs} must contain the keywords C{fmt} and C{start}
    @param args: The values to pack
    @param fmt: See http://docs.python.org/lib/module-struct.html
    @param start: Position in file at which to write encoded data
    """
    payload = struct.pack(kwargs["fmt"], *args)
    stream = self._file
    stream.seek(kwargs["start"])
    stream.write(payload)
    stream.flush()
def thumbail_extension(self):
    """
    Map L{self.thumbnail_type} to a file extension for the thumbnail
    image. Any type code other than 0x11, 0x12 or 0x13 yields C{"gif"}.

    If the LRF file was created by buggy software, the extension may be
    incorrect. See L{self.fix_thumbnail_type}.
    """
    known_types = {0x11: "jpeg", 0x12: "png", 0x13: "bmp"}
    return known_types.get(self.thumbnail_type, "gif")
def fix_thumbnail_type(self):
    """
    Re-derive L{self.thumbnail_type} from the thumbnail data itself by
    passing its first 16 bytes to C{self._detect_thumbnail_type}.
    """
    leading_bytes = self.thumbnail[:16]
    self.thumbnail_type = self._detect_thumbnail_type(leading_bytes)
def seek(self, *args):
    """ Reposition the underlying file. See L{file.seek} """
    target = self._file
    return target.seek(*args)
def tell(self):
    """ Report the position of the underlying file. See L{file.tell} """
    position = self._file.tell()
    return position
def read(self, size=-1):
    """
    Read from the underlying file. See L{file.read}

    @param size: Maximum number of bytes to read; the default of -1
    reads everything up to the end of the file, which is what this
    method did before it accepted a size.
    @return: The data returned by the underlying file's C{read}
    """
    return self._file.read(size)
def write(self, val):
    """
    Write C{val} to the underlying file. See L{file.write}

    @param val: The data to write
    @return: Whatever the underlying file's C{write} returns (the number
    of bytes written for standard binary files). Previously the result
    was discarded, unlike the sibling wrappers, which return theirs.
    """
    return self._file.write(val)
def _objects(self):
    """
    Walk the object index, yielding the first three of the four 32-bit
    little-endian values in each 16-byte entry. Callers in this class
    unpack them as C{(id, offset, size)}.

    The file position is restored after every C{yield}, so the consumer
    may seek freely between iterations.
    """
    stream = self._file
    stream.seek(self.object_index_offset)
    for _ in range(self.number_of_objects):
        entry = stream.read(16)
        resume_at = stream.tell()
        yield struct.unpack('<IIII', entry)[:3]
        stream.seek(resume_at)
def get_objects_by_type(self, type):
    """
    Collect every indexed object whose start tag declares type C{type}.

    @param type: Object type code to match against the second field of
    each object's 0xF500 tag
    @return: A list of C{(obj_id, offset, size)} tuples; C{obj_id} comes
    from the object's own tag, C{offset} and C{size} from the index
    """
    from ebook_converter.ebooks.lrf.tags import Tag
    matches = []
    for _index_id, offset, size in self._objects():
        self._file.seek(offset)
        header = Tag(self._file)
        if header.id != 0xF500:
            # Only entries that begin with a 0xF500 tag are considered.
            continue
        obj_id, obj_type = struct.unpack("<IH", header.contents)
        if obj_type == type:
            matches.append((obj_id, offset, size))
    return matches
def get_object_by_id(self, tid):
    """
    Find the first indexed object whose 0xF500 tag carries id C{tid}.

    @param tid: Object id to look for
    @return: C{(obj_id, offset, size, obj_type)} for the match, or a
    tuple of four C{False} values when no object matches
    """
    from ebook_converter.ebooks.lrf.tags import Tag
    for _index_id, offset, size in self._objects():
        self._file.seek(offset)
        header = Tag(self._file)
        if header.id != 0xF500:
            continue
        obj_id, obj_type = struct.unpack("<IH", header.contents)
        if obj_id == tid:
            return obj_id, offset, size, obj_type
    return (False,) * 4
@safe
def get_cover(self):
    """
    Return the data of the first object of type 0x0C (treated here as
    an image) as a candidate cover.

    @return: C{(extension, data)} where the extension is whatever follows
    the last dot of the referenced stream object's C{file} attribute, or
    C{None} when the file contains no object of type 0x0C
    """
    from ebook_converter.ebooks.lrf.objects import get_object
    first = next(iter(self.get_objects_by_type(0x0C)), None)
    if first is None:
        return None
    obj_id, offset, size = first
    image = get_object(None, self._file, obj_id, offset, size,
                       self.xor_key)
    # The image object only references the stream that holds the data.
    obj_id, offset, size = self.get_object_by_id(image.refstream)[:3]
    image_stream = get_object(None, self._file, obj_id, offset, size,
                              self.xor_key)
    extension = image_stream.file.rpartition('.')[-1]
    return extension, image_stream.stream
def option_parser():
    """
    Build the command line parser for the LRF metadata tool.

    The options are declared in a table and registered in order, so the
    help output lists them exactly as they appear below.

    @return: The configured C{OptionParser}
    """
    from ebook_converter.utils.config import OptionParser
    from ebook_converter.constants import __appname__, __version__

    parser = OptionParser(
        usage=('''%prog [options] mybook.lrf
Show/edit the metadata in an LRF file.\n\n'''),
        version=__appname__+' '+__version__,
        epilog='Created by Kovid Goyal')

    # (option strings, keyword arguments for add_option)
    option_table = (
        (("-t", "--title"),
         dict(action="store", type="string", dest="title",
              help="Set the book title")),
        (('--title-sort',),
         dict(action='store', type='string', default=None,
              dest='title_reading', help='Set sort key for the title')),
        (("-a", "--author"),
         dict(action="store", type="string", dest="author",
              help="Set the author")),
        (('--author-sort',),
         dict(action='store', type='string', default=None,
              dest='author_reading', help='Set sort key for the author')),
        (("-c", "--category"),
         dict(action="store", type="string", dest="category",
              help="The category this book belongs "
                   "to. E.g.: History")),
        (("--thumbnail",),
         dict(action="store", type="string", dest="thumbnail",
              help="Path to a graphic that will be "
                   "set as this files' thumbnail")),
        (("--comment",),
         dict(action="store", type="string", dest="comment",
              help="Path to a TXT file containing the "
                   "comment to be stored in the LRF file.")),
        (("--get-thumbnail",),
         dict(action="store_true", dest="get_thumbnail", default=False,
              help="Extract thumbnail from LRF file")),
        (('--publisher',),
         dict(default=None, help='Set the publisher')),
        (('--classification',),
         dict(default=None, help='Set the book classification')),
        (('--creator',),
         dict(default=None, help='Set the book creator')),
        (('--producer',),
         dict(default=None, help='Set the book '
                                 'producer')),
        (('--get-cover',),
         dict(action='store_true', default=False,
              help='Extract cover from LRF file. Note that the LRF '
                   'format has no defined cover, so we use some heuristics '
                   'to guess the cover.')),
        (('--bookid',),
         dict(action='store', type='string', default=None,
              dest='book_id', help='Set book ID')),
    )
    for option_strings, settings in option_table:
        parser.add_option(*option_strings, **settings)

    # No "-p/--page" option is offered on purpose. The SumPage element
    # specifies the number of "View"s (visible pages for the BookSetting
    # element conditions) of the content, i.e. the total pages per the page
    # size, font size, etc. when the LRF is first created. Since this will
    # change as the book is reflowed, it is probably not worth using.
    return parser
def set_metadata(stream, mi):
    """
    Copy the populated fields of C{mi} into the LRF file C{stream}.

    Empty or falsy fields are skipped. The category is first taken from
    the first tag and then overridden by C{mi.category} when that
    attribute exists and is truthy.

    @param stream: File object handed to L{LRFMetaFile}
    @param mi: Metadata object providing title, authors, tags, comments,
    author_sort and publisher
    """
    lrf = LRFMetaFile(stream)

    title = mi.title
    if title:
        lrf.title = title

    authors = mi.authors
    if authors:
        lrf.author = ', '.join(authors)

    tags = mi.tags
    if tags:
        lrf.category = tags[0]

    category = getattr(mi, 'category', False)
    if category:
        lrf.category = category

    comments = mi.comments
    if comments:
        lrf.free_text = comments

    author_sort = mi.author_sort
    if author_sort:
        lrf.author_reading = author_sort

    publisher = mi.publisher
    if publisher:
        lrf.publisher = publisher
def _apply_cli_metadata(lrf, options):
    """Store every metadata value given on the command line in *lrf*."""
    if options.title:
        lrf.title = options.title
    if options.title_reading is not None:
        lrf.title_reading = options.title_reading
    if options.author_reading is not None:
        lrf.author_reading = options.author_reading
    if options.author:
        lrf.author = options.author
    if options.publisher:
        lrf.publisher = options.publisher
    if options.classification:
        lrf.classification = options.classification
    if options.category:
        lrf.category = options.category
    if options.creator:
        lrf.creator = options.creator
    if options.producer:
        lrf.producer = options.producer
    if options.thumbnail:
        path = os.path.expanduser(os.path.expandvars(options.thumbnail))
        with open(path, "rb") as f:
            lrf.thumbnail = f.read()
    if options.book_id is not None:
        lrf.book_id = options.book_id
    if options.comment:
        path = os.path.expanduser(os.path.expandvars(options.comment))
        with open(path, 'rb') as f:
            lrf.free_text = f.read().decode('utf-8', 'replace')


def _print_xml_fields(lrf):
    """Print each XML-backed field of *lrf*, ordered by attribute name."""
    # dict.items() is a view in Python 3 and has no sort() method, so use
    # sorted(). Attribute names are unique, so sorting on the name alone
    # gives the same order as sorting the (name, value) pairs.
    fields = sorted(LRFMetaFile.__dict__.items(), key=lambda item: item[0])
    for name, descriptor in fields:
        if "XML" in str((name, descriptor)):
            # Print the text itself; encoding it first would make print()
            # show a b'...' literal.
            print(str(descriptor) + ":", getattr(lrf, name))


def _export_cover(lrf, lrf_path):
    """Write the guessed cover of *lrf* next to the cwd and report it."""
    try:
        ext, data = lrf.get_cover() or (None, None)
    except Exception:  # Fails on books created by LRFCreator 1.0
        ext, data = None, None
    if data:
        cover = (os.path.splitext(os.path.basename(lrf_path))[0] +
                 "_cover." + ext)
        with open(cover, 'wb') as f:
            f.write(data)
        print('Cover:', cover)
    else:
        print('Could not find cover in the LRF file')


def main(args=sys.argv):
    """
    Command line entry point: show and/or edit the metadata of an LRF file.

    @param args: Argument vector, program name first
    @return: 1 when no LRF file was specified, 0 otherwise
    """
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print()
        print('No lrf file specified')
        return 1
    lrf_path = args[1]
    # The context manager closes the LRF file even if an edit fails.
    with open(lrf_path, "r+b") as stream:
        lrf = LRFMetaFile(stream)
        _apply_cli_metadata(lrf, options)
        td = "None"
        if options.get_thumbnail:
            t = lrf.thumbnail
            if t:
                td = (os.path.basename(lrf_path) + "_thumbnail." +
                      lrf.thumbail_extension())
                with open(td, "wb") as f:
                    f.write(t)
        _print_xml_fields(lrf)
        if options.get_thumbnail:
            print("Thumbnail:", td)
        if options.get_cover:
            _export_cover(lrf, lrf_path)
    return 0


if __name__ == '__main__':
    sys.exit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,255 @@
import struct
from ebook_converter.ebooks.lrf import LRFParseError
class Tag(object):
    """
    A single tag read from an LRF stream.

    A tag starts with two bytes: the tag number followed by the marker
    byte 0xF5. The payload that follows is described by L{tags}: either a
    fixed number of bytes, or the name of a parser method
    (C{<name>_parser}) that knows how to read a variable-length payload.

    After construction the instance exposes C{offset} (stream position of
    the tag), C{id} (0xF500 + tag number), C{name} (may be C{None}) and
    C{contents} (raw bytes, or whatever the parser returned).
    """

    # Tag number -> (payload size in bytes | parser name, tag name | None)
    tags = {0x00: (6, "*ObjectStart"),
            0x01: (0, "*ObjectEnd"),
            0x02: (4, "*ObjectInfoLink"),
            0x03: (4, "*Link"),
            0x04: (4, "*StreamSize"),
            0x05: (0, "*StreamStart"),
            0x06: (0, "*StreamEnd"),
            0x07: (4, None),
            0x08: (4, None),
            0x09: (4, None),
            0x0A: (4, None),
            0x0B: ("type_one", "*ContainedObjectsList"),
            0x0D: (2, None),
            0x0E: (2, None),
            0x11: (2, None),
            0x12: (2, None),
            0x13: (2, None),
            0x14: (2, None),
            0x15: (2, None),
            0x16: ("string", None),
            0x17: (4, None),
            0x18: (4, None),
            0x19: (2, None),
            0x1A: (2, None),
            0x1B: (2, None),
            0x1C: (2, None),
            0x1D: (2, None),
            0x1E: (2, None),
            0x21: (2, None),
            0x22: (2, None),
            0x23: (2, None),
            0x24: (2, None),
            0x25: (2, None),
            0x26: (2, None),
            0x27: (2, None),
            0x28: (2, None),
            0x29: (6, None),
            0x2A: (2, None),
            0x2B: (2, None),
            0x2C: (2, None),
            0x2D: (4, None),
            0x2E: (2, None),
            0x31: (2, None),
            0x32: (2, None),
            0x33: (2, None),
            0x34: (4, None),
            0x35: (2, None),
            0x36: (2, None),
            0x37: (4, None),
            0x38: (2, None),
            0x39: (2, None),
            0x3A: (2, None),
            0x3C: (2, None),
            0x3D: (2, None),
            0x3E: (2, None),
            0x41: (2, None),
            0x42: (2, None),
            0x44: (4, None),
            0x45: (4, None),
            0x46: (2, None),
            0x47: (2, None),
            0x48: (2, None),
            0x49: (8, None),
            0x4A: (8, None),
            0x4B: (4, None),
            0x4C: (4, None),
            0x4D: (0, None),
            0x4E: (12, None),
            0x51: (2, None),
            0x52: (2, None),
            0x53: (4, None),
            0x54: (2, "*StreamFlags"),
            0x55: ("string", None),
            0x56: (2, None),
            0x57: (2, None),
            0x58: (2, None),
            0x59: ("string", None),
            0x5A: ("string", None),
            0x5B: (4, None),
            0x5C: ("type_one", None),
            0x5D: ("string", None),
            0x5E: (2, None),
            0x61: (2, None),
            0x62: (0, None),
            0x63: (0, None),
            0x64: (0, None),
            0x65: (0, None),
            0x66: (0, None),
            0x67: (0, None),
            0x68: (0, None),
            0x69: (0, None),
            0x6A: (0, None),
            0x6B: (0, None),
            0x6C: (8, None),
            0x6D: (2, None),
            0x6E: (0, None),
            0x71: (0, None),
            0x72: (0, None),
            0x73: (10, None),
            0x75: (2, None),
            0x76: (2, None),
            0x77: (2, None),
            0x78: ("tag_78", None),
            0x79: (2, None),
            0x7A: (2, None),
            0x7B: (4, None),
            0x7C: (4, "*ParentPageTree"),
            0x81: (0, None),
            0x82: (0, None),
            0xA1: (4, None),
            0xA2: (0, None),
            0xA5: ("unknown", None),
            0xA6: (0, None),
            0xA7: (4, None),
            0xA8: (0, None),
            0xA9: (0, None),
            0xAA: (0, None),
            0xAB: (0, None),
            0xAC: (0, None),
            0xAD: (0, None),
            0xAE: (0, None),
            0xB1: (0, None),
            0xB2: (0, None),
            0xB3: (0, None),
            0xB4: (0, None),
            0xB5: (0, None),
            0xB6: (0, None),
            0xB7: (0, None),
            0xB8: (0, None),
            0xB9: (0, None),
            0xBA: (0, None),
            0xBB: (0, None),
            0xBC: (0, None),
            0xBD: (0, None),
            0xBE: (0, None),
            0xC1: (0, None),
            0xC2: (0, None),
            0xC3: (2, None),
            0xC4: (0, None),
            0xC5: (2, None),
            0xC6: (2, None),
            0xC7: (0, None),
            0xC8: (2, None),
            0xC9: (0, None),
            0xCA: (2, None),
            0xCB: ("unknown", None),
            0xCC: (2, None),
            0xD1: (12, None),
            0xD2: (0, None),
            0xD4: (2, None),
            0xD6: (0, None),
            0xD7: (14, None),
            0xD8: (4, None),
            0xD9: (8, None),
            0xDA: (2, None),
            0xDB: (2, None),
            0xDC: (2, None),
            0xDD: (2, None),
            0xF1: (2, None),
            0xF2: (4, None),
            0xF3: (4, None),
            0xF4: (2, None),
            0xF5: (4, None),
            0xF6: (4, None),
            0xF7: (4, None),
            0xF8: (4, None),
            0xF9: (6, None)}

    # Tag number -> name, for the tags that have one. A comprehension is
    # used so no loop variables are left behind as class attributes.
    name_map = {number: spec[1] for number, spec in tags.items()
                if spec[1] is not None}

    def __init__(self, stream):
        """
        Read one tag from C{stream} at its current position.

        Raises LRFParseError when the marker byte is not 0xF5 or the tag
        number is not listed in L{tags}.
        """
        self.offset = stream.tell()
        number, marker = struct.unpack("<BB", stream.read(2))
        if marker != 0xF5:
            raise LRFParseError("Bad tag ID %02X at %d" % (marker,
                                                           self.offset))
        table = self.__class__.tags
        if number not in table:
            raise LRFParseError("Unknown tag ID: F5%02X" % number)
        self.id = 0xF500 + number
        size, self.name = table[number]
        if isinstance(size, str):
            # Variable-length payload: delegate to "<size>_parser".
            self.contents = getattr(self, size + '_parser')(stream)
        else:
            self.contents = stream.read(size)

    def __str__(self):
        return "Tag %04X %s at %08X, contents: %r" % (
            self.id, self.name or "", self.offset, self.contents)

    def _scalar(self, fmt):
        """Decode the payload as the single value described by C{fmt}."""
        if len(self.contents) != struct.calcsize(fmt):
            raise LRFParseError("Bad parameter for tag ID: %04X" % self.id)
        return struct.unpack(fmt, self.contents)[0]

    @property
    def byte(self):
        """Payload as an unsigned 8-bit integer."""
        return self._scalar("<B")

    @property
    def word(self):
        """Payload as an unsigned little-endian 16-bit integer."""
        return self._scalar("<H")

    @property
    def sword(self):
        """Payload as a signed little-endian 16-bit integer."""
        return self._scalar("<h")

    @property
    def dword(self):
        """Payload as an unsigned little-endian 32-bit integer."""
        return self._scalar("<I")

    def dummy_parser(self, stream):
        """Reject a tag whose payload layout is not known."""
        raise LRFParseError("Unknown tag at %08X" % stream.tell())

    # Tags 0xA5 and 0xCB name the "unknown" parser in the table above.
    # Without this alias the lookup in __init__ failed with AttributeError
    # instead of reporting an LRFParseError.
    unknown_parser = dummy_parser

    @classmethod
    def string_parser(cls, stream):
        """Read a 16-bit byte count, then that many bytes of UTF-16 text."""
        size = struct.unpack("<H", stream.read(2))[0]
        # NOTE(review): "utf_16" picks the byte order from a BOM when one
        # is present; confirm which order is intended for BOM-less data.
        return str(stream.read(size), "utf_16")

    def type_one_parser(self, stream):
        """Read a 16-bit count, then that many 32-bit unsigned integers."""
        cnt = struct.unpack("<H", stream.read(2))[0]
        return [struct.unpack("<I", stream.read(4))[0] for _ in range(cnt)]

    def tag_78_parser(self, stream):
        """
        Read the payload of tag 0x78: a 32-bit integer, a nested 0xF516
        (string) tag and a 16-bit integer, returned as a 3-item list.
        """
        pos = stream.tell()
        res = [struct.unpack("<I", stream.read(4))[0]]
        tag = Tag(stream)
        if tag.id != 0xF516:
            raise LRFParseError("Bad tag 78 at %08X" % pos)
        res.append(tag.contents)
        res.append(struct.unpack("<H", stream.read(2))[0])
        return res