From 399456d9ad4abe307f51e6cbe58b80be0835dbb6 Mon Sep 17 00:00:00 2001 From: gryf Date: Sun, 24 May 2020 12:43:33 +0200 Subject: [PATCH] Added LRF input format support. --- README.rst | 3 +- ebook_converter/ebooks/lrf/input.py | 394 +++++++ ebook_converter/ebooks/lrf/lrfparser.py | 171 +++ ebook_converter/ebooks/lrf/meta.py | 766 ++++++++++++++ ebook_converter/ebooks/lrf/objects.py | 1279 +++++++++++++++++++++++ ebook_converter/ebooks/lrf/tags.py | 255 +++++ 6 files changed, 2867 insertions(+), 1 deletion(-) create mode 100644 ebook_converter/ebooks/lrf/input.py create mode 100644 ebook_converter/ebooks/lrf/lrfparser.py create mode 100644 ebook_converter/ebooks/lrf/meta.py create mode 100644 ebook_converter/ebooks/lrf/objects.py create mode 100644 ebook_converter/ebooks/lrf/tags.py diff --git a/README.rst b/README.rst index d0059f7..ec7b5f5 100644 --- a/README.rst +++ b/README.rst @@ -54,6 +54,7 @@ Currently, I've tested following input formats: - fb2 - html - pdf +- lrf Note, that old Microsoft doc format is not supported, although old documents can be fairly easy converted using text processors programs, lik Word or @@ -65,7 +66,7 @@ Output formats Currently, following formats are supported: -- lrf (for Sony readers) +- lrf - epub - mobi - docx diff --git a/ebook_converter/ebooks/lrf/input.py b/ebook_converter/ebooks/lrf/input.py new file mode 100644 index 0000000..d9ed86f --- /dev/null +++ b/ebook_converter/ebooks/lrf/input.py @@ -0,0 +1,394 @@ +import textwrap, operator +from copy import deepcopy, copy + +from lxml import etree + +from ebook_converter import guess_type +from ebook_converter.polyglot.builtins import as_bytes + + +class Canvas(etree.XSLTExtension): + + def __init__(self, doc, styles, text_block, log): + self.doc = doc + self.styles = styles + self.text_block = text_block + self.log = log + self.processed = set() + + def execute(self, context, self_node, input_node, output_parent): + cid = input_node.get('objid', None) + if cid is None or cid in self.processed: + return + self.processed.add(cid) + input_node = self.doc.xpath('//Canvas[@objid="%s"]'%cid)[0] + + objects = list(self.get_objects(input_node)) + if len(objects) == 1 and objects[0][0].tag == 'ImageBlock': + self.image_page(input_node, objects[0][0], output_parent) + else: + canvases = [input_node] + for x in input_node.itersiblings(): + if x.tag == 'Canvas': + oid = x.get('objid', None) + if oid is not None: + canvases.append(x) + self.processed.add(oid) + else: + break + + table = etree.Element('table') + table.text = '\n\t' + for canvas in canvases: + oid = canvas.get('objid') + tr = table.makeelement('tr') + tr.set('id', oid) + tr.tail = '\n\t' + table.append(tr) + for obj, x, y in self.get_objects(canvas): + if obj.tag != 'TextBlock': + self.log.warn(obj.tag, 'elements in Canvas not supported') + continue + td = table.makeelement('td') + self.text_block.render_block(obj, td) + tr.append(td) + output_parent.append(table) + + def image_page(self, input_node, block, output_parent): + div = etree.Element('div') + div.set('id', input_node.get('objid', 'scuzzy')) + div.set('class', 'image_page') + width = self.styles.to_num(block.get("xsize", None)) + height = self.styles.to_num(block.get("ysize", None)) + img = div.makeelement('img') + if width is not None: + img.set('width', str(int(width))) + if height is not None: + img.set('height', str(int(height))) + ref = block.get('refstream', None) + if ref is not None: + imstr = self.doc.xpath('//ImageStream[@objid="%s"]'%ref) + if imstr: + src = imstr[0].get('file', None) + if src: + img.set('src', src) + div.append(img) + output_parent.append(div) + + def get_objects(self, node): + for x in node.xpath('descendant::PutObj[@refobj and @x1 and @y1]'): + objs = node.xpath('//*[@objid="%s"]'%x.get('refobj')) + x, y = map(self.styles.to_num, (x.get('x1'), x.get('y1'))) + if objs and x is not None and y is not None: + yield objs[0], int(x), int(y) + + +class MediaType(etree.XSLTExtension): + + def execute(self, context, self_node, input_node, output_parent): + name = input_node.get('file', None) + typ = guess_type(name)[0] + if not typ: + typ = 'application/octet-stream' + output_parent.text = typ + + +class ImageBlock(etree.XSLTExtension): + + def __init__(self, canvas): + etree.XSLTExtension.__init__(self) + self.canvas = canvas + + def execute(self, context, self_node, input_node, output_parent): + self.canvas.image_page(input_node, input_node, output_parent) + + +class RuledLine(etree.XSLTExtension): + + def execute(self, context, self_node, input_node, output_parent): + hr = etree.Element('hr') + output_parent.append(hr) + + +class TextBlock(etree.XSLTExtension): + + def __init__(self, styles, char_button_map, plot_map, log): + etree.XSLTExtension.__init__(self) + self.styles = styles + self.log = log + self.char_button_map = char_button_map + self.plot_map = plot_map + + def execute(self, context, self_node, input_node, output_parent): + input_node = deepcopy(input_node) + div = etree.Element('div') + self.render_block(input_node, div) + output_parent.append(div) + + def render_block(self, node, root): + ts = node.get('textstyle', None) + classes = [] + bs = node.get('blockstyle') + if bs in self.styles.block_style_map: + classes.append('bs%d'%self.styles.block_style_map[bs]) + if ts in self.styles.text_style_map: + classes.append('ts%d'%self.styles.text_style_map[ts]) + if classes: + root.set('class', ' '.join(classes)) + objid = node.get('objid', None) + if objid: + root.set('id', objid) + root.text = node.text + self.root = root + self.parent = root + self.add_text_to = (self.parent, 'text') + self.fix_deep_nesting(node) + for child in node: + self.process_child(child) + + def fix_deep_nesting(self, node): + deepest = 1 + + def depth(node): + parent = node.getparent() + ans = 1 + while parent is not None: + ans += 1 + parent = parent.getparent() + return ans + + for span in node.xpath('descendant::Span'): + d = depth(span) + if d > deepest: + deepest = d + if d > 500: + break + + if deepest < 500: + return + + self.log.warn('Found deeply nested spans. Flattening.') + # with open('/t/before.xml', 'wb') as f: + # f.write(etree.tostring(node, method='xml')) + + spans = [(depth(span), span) for span in node.xpath('descendant::Span')] + spans.sort(key=operator.itemgetter(0), reverse=True) + + for depth, span in spans: + if depth < 3: + continue + p = span.getparent() + gp = p.getparent() + idx = p.index(span) + pidx = gp.index(p) + children = list(p)[idx:] + t = children[-1].tail + t = t if t else '' + children[-1].tail = t + (p.tail if p.tail else '') + p.tail = '' + pattrib = dict(**p.attrib) if p.tag == 'Span' else {} + for child in children: + p.remove(child) + if pattrib and child.tag == "Span": + attrib = copy(pattrib) + attrib.update(child.attrib) + child.attrib.update(attrib) + + for child in reversed(children): + gp.insert(pidx+1, child) + + # with open('/t/after.xml', 'wb') as f: + # f.write(etree.tostring(node, method='xml')) + + def add_text(self, text): + if text: + if getattr(self.add_text_to[0], self.add_text_to[1]) is None: + setattr(self.add_text_to[0], self.add_text_to[1], '') + setattr(self.add_text_to[0], self.add_text_to[1], + getattr(self.add_text_to[0], self.add_text_to[1])+ text) + + def process_container(self, child, tgt): + idx = self.styles.get_text_styles(child) + if idx is not None: + tgt.set('class', 'ts%d'%idx) + self.parent.append(tgt) + orig_parent = self.parent + self.parent = tgt + self.add_text_to = (self.parent, 'text') + self.add_text(child.text) + for gchild in child: + self.process_child(gchild) + self.parent = orig_parent + self.add_text_to = (tgt, 'tail') + self.add_text(child.tail) + + def process_child(self, child): + if child.tag == 'CR': + if self.parent == self.root or self.parent.tag == 'p': + self.parent = self.root.makeelement('p') + self.root.append(self.parent) + self.add_text_to = (self.parent, 'text') + else: + br = self.parent.makeelement('br') + self.parent.append(br) + self.add_text_to = (br, 'tail') + self.add_text(child.tail) + elif child.tag in ('P', 'Span', 'EmpLine', 'NoBR'): + span = self.root.makeelement('span') + if child.tag == 'EmpLine': + td = 'underline' if child.get('emplineposition', 'before') == 'before' else 'overline' + span.set('style', 'text-decoration: '+td) + self.process_container(child, span) + elif child.tag == 'Sup': + sup = self.root.makeelement('sup') + self.process_container(child, sup) + elif child.tag == 'Sub': + sub = self.root.makeelement('sub') + self.process_container(child, sub) + elif child.tag == 'Italic': + sup = self.root.makeelement('i') + self.process_container(child, sup) + elif child.tag == 'CharButton': + a = self.root.makeelement('a') + oid = child.get('refobj', None) + if oid in self.char_button_map: + a.set('href', self.char_button_map[oid]) + self.process_container(child, a) + elif child.tag == 'Plot': + xsize = self.styles.to_num(child.get('xsize', None), 166/720) + ysize = self.styles.to_num(child.get('ysize', None), 166/720) + img = self.root.makeelement('img') + if xsize is not None: + img.set('width', str(int(xsize))) + if ysize is not None: + img.set('height', str(int(ysize))) + ro = child.get('refobj', None) + if ro in self.plot_map: + img.set('src', self.plot_map[ro]) + self.parent.append(img) + self.add_text_to = (img, 'tail') + self.add_text(child.tail) + else: + self.log.warn('Unhandled Text element:', child.tag) + + +class Styles(etree.XSLTExtension): + + def __init__(self): + etree.XSLTExtension.__init__(self) + self.text_styles, self.block_styles = [], [] + self.text_style_map, self.block_style_map = {}, {} + self.CSS = textwrap.dedent(''' + .image_page { text-align:center } + ''') + + def write(self, name='styles.css'): + + def join(style): + ans = ['%s : %s;'%(k, v) for k, v in style.items()] + if ans: + ans[-1] = ans[-1][:-1] + return '\n\t'.join(ans) + + with open(name, 'wb') as f: + f.write(as_bytes(self.CSS)) + for (w, sel) in [(self.text_styles, 'ts'), (self.block_styles, + 'bs')]: + for i, s in enumerate(w): + if not s: + continue + rsel = '.%s%d'%(sel, i) + s = join(s) + f.write(as_bytes(rsel + ' {\n\t' + s + '\n}\n\n')) + + def execute(self, context, self_node, input_node, output_parent): + if input_node.tag == 'TextStyle': + idx = self.get_text_styles(input_node) + if idx is not None: + self.text_style_map[input_node.get('objid')] = idx + else: + idx = self.get_block_styles(input_node) + self.block_style_map[input_node.get('objid')] = idx + + def px_to_pt(self, px): + try: + return px * 72/166 + except: + return None + + def color(self, val): + try: + val = int(val, 16) + r, g, b, a = val & 0xFF, (val>>8)&0xFF, (val>>16)&0xFF, (val>>24)&0xFF + if a == 255: + return None + if a == 0: + return 'rgb(%d,%d,%d)'%(r,g,b) + return 'rgba(%d,%d,%d,%f)'%(r,g,b,1.-a/255.) + except: + return None + + def get_block_styles(self, node): + ans = {} + sm = self.px_to_pt(node.get('sidemargin', None)) + if sm is not None: + ans['margin-left'] = ans['margin-right'] = '%fpt'%sm + ts = self.px_to_pt(node.get('topskip', None)) + if ts is not None: + ans['margin-top'] = '%fpt'%ts + fs = self.px_to_pt(node.get('footskip', None)) + if fs is not None: + ans['margin-bottom'] = '%fpt'%fs + fw = self.px_to_pt(node.get('framewidth', None)) + if fw is not None: + ans['border-width'] = '%fpt'%fw + ans['border-style'] = 'solid' + fc = self.color(node.get('framecolor', None)) + if fc is not None: + ans['border-color'] = fc + bc = self.color(node.get('bgcolor', None)) + if bc is not None: + ans['background-color'] = bc + if ans not in self.block_styles: + self.block_styles.append(ans) + return self.block_styles.index(ans) + + def to_num(self, val, factor=1.): + try: + return float(val)*factor + except: + return None + + def get_text_styles(self, node): + ans = {} + fs = self.to_num(node.get('fontsize', None), 0.1) + if fs is not None: + ans['font-size'] = '%fpt'%fs + fw = self.to_num(node.get('fontweight', None)) + if fw is not None: + ans['font-weight'] = ('bold' if fw >= 700 else 'normal') + # fn = getattr(obj, 'fontfacename', None) + # if fn is not None: + # fn = cls.FONT_MAP[fn] + # item('font-family: %s;'%fn) + fg = self.color(node.get('textcolor', None)) + if fg is not None: + ans['color'] = fg + bg = self.color(node.get('textbgcolor', None)) + if bg is not None: + ans['background-color'] = bg + al = node.get('align', None) + if al is not None: + all = dict(head='left', center='center', foot='right') + ans['text-align'] = all.get(al, 'left') + # lh = self.to_num(node.get('linespace', None), 0.1) + # if lh is not None: + # ans['line-height'] = '%fpt'%lh + pi = self.to_num(node.get('parindent', None), 0.1) + if pi is not None: + ans['text-indent'] = '%fpt'%pi + if not ans: + return None + if ans not in self.text_styles: + self.text_styles.append(ans) + return self.text_styles.index(ans) diff --git a/ebook_converter/ebooks/lrf/lrfparser.py b/ebook_converter/ebooks/lrf/lrfparser.py new file mode 100644 index 0000000..6b876a5 --- /dev/null +++ b/ebook_converter/ebooks/lrf/lrfparser.py @@ -0,0 +1,171 @@ +import sys, array, os, re, codecs, logging +from itertools import chain + +from ebook_converter import setup_cli_handlers +from ebook_converter.utils.config import OptionParser +from ebook_converter.utils.filenames import ascii_filename +from ebook_converter.ebooks.lrf.meta import LRFMetaFile +from ebook_converter.ebooks.lrf.objects import get_object, PageTree, StyleObject, \ + Font, Text, TOCObject, BookAttr, ruby_tags + + +class LRFDocument(LRFMetaFile): + + class temp(object): + pass + + def __init__(self, stream): + LRFMetaFile.__init__(self, stream) + self.scramble_key = self.xor_key + self.page_trees = [] + self.font_map = {} + self.image_map = {} + self.toc = '' + self.keep_parsing = True + + def parse(self): + self._parse_objects() + self.metadata = LRFDocument.temp() + for a in ('title', 'title_reading', 'author', 'author_reading', 'book_id', + 'classification', 'free_text', 'publisher', 'label', 'category'): + setattr(self.metadata, a, getattr(self, a)) + self.doc_info = LRFDocument.temp() + for a in ('thumbnail', 'language', 'creator', 'producer', 'page'): + setattr(self.doc_info, a, getattr(self, a)) + self.doc_info.thumbnail_extension = self.thumbail_extension() + self.device_info = LRFDocument.temp() + for a in ('dpi', 'width', 'height'): + setattr(self.device_info, a, getattr(self, a)) + + def _parse_objects(self): + self.objects = {} + self._file.seek(self.object_index_offset) + obj_array = array.array("I", self._file.read(4*4*self.number_of_objects)) + if ord(array.array("i",[1]).tostring()[0:1])==0: # big-endian + obj_array.byteswap() + for i in range(self.number_of_objects): + if not self.keep_parsing: + break + objid, objoff, objsize = obj_array[i*4:i*4+3] + self._parse_object(objid, objoff, objsize) + for obj in self.objects.values(): + if not self.keep_parsing: + break + if hasattr(obj, 'initialize'): + obj.initialize() + + def _parse_object(self, objid, objoff, objsize): + obj = get_object(self, self._file, objid, objoff, objsize, self.scramble_key) + self.objects[objid] = obj + if isinstance(obj, PageTree): + self.page_trees.append(obj) + elif isinstance(obj, TOCObject): + self.toc = obj + elif isinstance(obj, BookAttr): + self.ruby_tags = {} + for h in ruby_tags.values(): + attr = h[0] + if hasattr(obj, attr): + self.ruby_tags[attr] = getattr(obj, attr) + + def __iter__(self): + for pt in self.page_trees: + yield pt + + def write_files(self): + for obj in chain(self.image_map.values(), self.font_map.values()): + with open(obj.file, 'wb') as f: + f.write(obj.stream) + + def to_xml(self, write_files=True): + bookinfo = '\n\n\n' + bookinfo += '%s\n'%(self.metadata.title_reading, self.metadata.title) + bookinfo += '%s\n'%(self.metadata.author_reading, self.metadata.author) + bookinfo += '%s\n'%(self.metadata.book_id,) + bookinfo += '%s\n'%(self.metadata.publisher,) + bookinfo += '\n'%(self.metadata.label,) + bookinfo += '%s\n'%(self.metadata.category,) + bookinfo += '%s\n'%(self.metadata.classification,) + bookinfo += '%s\n\n\n'%(self.metadata.free_text,) + th = self.doc_info.thumbnail + if th: + prefix = ascii_filename(self.metadata.title) + bookinfo += '\n'%(prefix+'_thumbnail.'+self.doc_info.thumbnail_extension,) + if write_files: + with open(prefix+'_thumbnail.'+self.doc_info.thumbnail_extension, 'wb') as f: + f.write(th) + bookinfo += '%s\n'%(self.doc_info.language,) + bookinfo += '%s\n'%(self.doc_info.creator,) + bookinfo += '%s\n'%(self.doc_info.producer,) + bookinfo += '%s\n\n\n%s\n'%(self.doc_info.page,self.toc) + pages = '' + done_main = False + pt_id = -1 + for page_tree in self: + if not done_main: + done_main = True + pages += '
\n' + close = '
\n' + pt_id = page_tree.id + else: + pages += '\n'%(page_tree.id,) + close = '\n' + for page in page_tree: + pages += str(page) + pages += close + traversed_objects = [int(i) for i in re.findall(r'objid="(\w+)"', pages)] + [pt_id] + + objects = '\n\n' + styles = '\n\n' + objects += '\n' + if write_files: + self.write_files() + return '\n' + bookinfo + pages + styles + objects + '' + + +def option_parser(): + parser = OptionParser(usage=_('%prog book.lrf\nConvert an LRF file into an LRS (XML UTF-8 encoded) file')) + parser.add_option('--output', '-o', default=None, help=_('Output LRS file'), dest='out') + parser.add_option('--dont-output-resources', default=True, action='store_false', + help=_('Do not save embedded image and font files to disk'), + dest='output_resources') + parser.add_option('--verbose', default=False, action='store_true', dest='verbose', help=_('Be more verbose')) + return parser + + +def main(args=sys.argv, logger=None): + parser = option_parser() + opts, args = parser.parse_args(args) + if logger is None: + level = logging.DEBUG if opts.verbose else logging.INFO + logger = logging.getLogger('lrf2lrs') + setup_cli_handlers(logger, level) + if len(args) != 2: + parser.print_help() + return 1 + if opts.out is None: + opts.out = os.path.join(os.path.dirname(args[1]), os.path.splitext(os.path.basename(args[1]))[0]+".lrs") + logger.info(_('Parsing LRF...')) + d = LRFDocument(open(args[1], 'rb')) + d.parse() + logger.info(_('Creating XML...')) + with codecs.open(os.path.abspath(os.path.expanduser(opts.out)), 'wb', 'utf-8') as f: + f.write('\n') + f.write(d.to_xml(write_files=opts.output_resources)) + logger.info(_('LRS written to ')+opts.out) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/ebook_converter/ebooks/lrf/meta.py b/ebook_converter/ebooks/lrf/meta.py new file mode 100644 index 0000000..248e3b7 --- /dev/null +++ b/ebook_converter/ebooks/lrf/meta.py @@ -0,0 +1,766 @@ +""" +This module presents an easy to use interface for getting and setting +meta information in LRF files. +Just create an L{LRFMetaFile} object and use its properties +to get and set meta information. For example: + +>>> lrf = LRFMetaFile("mybook.lrf") +>>> print(lrf.title, lrf.author) +>>> lrf.category = "History" +""" + +import functools +import io +import os +import shutil +import struct +import sys +from xml.dom import minidom +import zlib + +from ebook_converter.ebooks.chardet import xml_to_unicode +from ebook_converter.ebooks.metadata import MetaInformation, string_to_authors + +BYTE = "}, + that implements access to protocol packets in a human readable way. + """ + + def __init__(self, start=16, fmt=DWORD): + """ + @param start: The byte at which this field is stored in the buffer + @param fmt: The packing format for this field. + See U{struct}. + """ + self._fmt, self._start = fmt, start + + def __get__(self, obj, typ=None): + return obj.unpack(start=self._start, fmt=self._fmt)[0] + + def __set__(self, obj, val): + obj.pack(val, start=self._start, fmt=self._fmt) + + def __repr__(self): + typ = {DWORD: 'unsigned int', 'QWORD': 'unsigned long long', + BYTE: 'unsigned char', + WORD: 'unsigned short'}.get(self._fmt, '') + return ("An " + typ + " stored in " + + str(struct.calcsize(self._fmt)) + + " bytes starting at byte " + str(self._start)) + + +class versioned_field(field): + + def __init__(self, vfield, version, start=0, fmt=WORD): + field.__init__(self, start=start, fmt=fmt) + self.vfield, self.version = vfield, version + + def enabled(self, obj): + return self.vfield.__get__(obj) > self.version + + def __get__(self, obj, typ=None): + if self.enabled(obj): + return field.__get__(self, obj, typ=typ) + else: + return None + + def __set__(self, obj, val): + if not self.enabled(obj): + raise LRFException("Trying to set disabled field") + else: + field.__set__(self, obj, val) + + +class LRFException(Exception): + pass + + +class fixed_stringfield(object): + """ A field storing a variable length string. """ + + def __init__(self, length=8, start=0): + """ + @param length: Size of this string + @param start: The byte at which this field is stored in the buffer + """ + self._length = length + self._start = start + + def __get__(self, obj, typ=None): + length = str(self._length) + return obj.unpack(start=self._start, fmt="<"+length+"s")[0] + + def __set__(self, obj, val): + if not isinstance(val, (str, bytes)): + val = str(val) + if isinstance(val, str): + val = val.encode('utf-8') + if len(val) != self._length: + raise LRFException("Trying to set fixed_stringfield with a " + + "string of incorrect length") + obj.pack(val, start=self._start, fmt="<"+str(len(val))+"s") + + def __repr__(self): + return "A string of length " + str(self._length) + \ + " starting at byte " + str(self._start) + + +class xml_attr_field(object): + + def __init__(self, tag_name, attr, parent='BookInfo'): + self.tag_name = tag_name + self.parent = parent + self.attr = attr + + def __get__(self, obj, typ=None): + """ Return the data in this field or '' if the field is empty """ + document = obj.info + elems = document.getElementsByTagName(self.tag_name) + if len(elems): + elem = None + for candidate in elems: + if candidate.parentNode.nodeName == self.parent: + elem = candidate + if elem and elem.hasAttribute(self.attr): + return elem.getAttribute(self.attr) + return '' + + def __set__(self, obj, val): + if val is None: + val = "" + document = obj.info + elems = document.getElementsByTagName(self.tag_name) + if len(elems): + elem = None + for candidate in elems: + if candidate.parentNode.nodeName == self.parent: + elem = candidate + if elem: + elem.setAttribute(self.attr, val) + obj.info = document + + def __repr__(self): + return "XML Attr Field: " + self.tag_name + " in " + self.parent + + def __str__(self): + return self.tag_name+'.'+self.attr + + +class xml_field(object): + """ + Descriptor that gets and sets XML based meta information from an LRF file. + Works for simple XML fields of the form data + """ + + def __init__(self, tag_name, parent="BookInfo"): + """ + @param tag_name: The XML tag whose data we operate on + @param parent: The tagname of the parent element of C{tag_name} + """ + self.tag_name = tag_name + self.parent = parent + + def __get__(self, obj, typ=None): + """ Return the data in this field or '' if the field is empty """ + document = obj.info + + elems = document.getElementsByTagName(self.tag_name) + if len(elems): + elem = None + for candidate in elems: + if candidate.parentNode.nodeName == self.parent: + elem = candidate + if elem: + elem.normalize() + if elem.hasChildNodes(): + return elem.firstChild.data.strip() + return '' + + def __set__(self, obj, val): + if not val: + val = '' + document = obj.info + + def create_elem(): + elem = document.createElement(self.tag_name) + parent = document.getElementsByTagName(self.parent)[0] + parent.appendChild(elem) + return elem + + if not val: + val = '' + if not isinstance(val, str): + val = val.decode('utf-8') + + elems = document.getElementsByTagName(self.tag_name) + elem = None + if len(elems): + for candidate in elems: + if candidate.parentNode.nodeName == self.parent: + elem = candidate + if not elem: + elem = create_elem() + else: + elem.normalize() + while elem.hasChildNodes(): + elem.removeChild(elem.lastChild) + else: + elem = create_elem() + elem.appendChild(document.createTextNode(val)) + + obj.info = document + + def __str__(self): + return self.tag_name + + def __repr__(self): + return "XML Field: " + self.tag_name + " in " + self.parent + + +def insert_into_file(fileobj, data, start, end): + """ + Insert data into fileobj at position C{start}. + + This function inserts data into a file, overwriting all data between start + and end. If end == start no data is overwritten. Do not use this function + to append data to a file. + + @param fileobj: file like object + @param data: data to be inserted into fileobj + @param start: The position at which to start inserting data + @param end: The position in fileobj of data that must not be overwritten + @return: C{start + len(data) - end} + """ + buffer = io.BytesIO() + fileobj.seek(end) + shutil.copyfileobj(fileobj, buffer, -1) + buffer.flush() + buffer.seek(0) + fileobj.seek(start) + fileobj.write(data) + fileobj.flush() + fileobj.truncate() + delta = fileobj.tell() - end # < 0 if len(data) < end-start + shutil.copyfileobj(buffer, fileobj, -1) + fileobj.flush() + buffer.close() + return delta + + +def get_metadata(stream): + """ + Return basic meta-data about the LRF file in C{stream} as a + L{MetaInformation} object. + @param stream: A file like object or an instance of L{LRFMetaFile} + """ + lrf = stream if isinstance(stream, LRFMetaFile) else LRFMetaFile(stream) + authors = string_to_authors(lrf.author) + mi = MetaInformation(lrf.title.strip(), authors) + mi.author = lrf.author.strip() + mi.comments = lrf.free_text.strip() + mi.category = lrf.category.strip()+', '+lrf.classification.strip() + tags = [x.strip() for x in mi.category.split(',') if x.strip()] + if tags: + mi.tags = tags + if mi.category.strip() == ',': + mi.category = None + mi.publisher = lrf.publisher.strip() + mi.cover_data = lrf.get_cover() + try: + mi.title_sort = lrf.title_reading.strip() + if not mi.title_sort: + mi.title_sort = None + except Exception: + pass + + try: + mi.author_sort = lrf.author_reading.strip() + if not mi.author_sort: + mi.author_sort = None + except Exception: + pass + + if not mi.title or 'unknown' in mi.title.lower(): + mi.title = None + if not mi.authors: + mi.authors = None + if not mi.author or 'unknown' in mi.author.lower(): + mi.author = None + if not mi.category or 'unknown' in mi.category.lower(): + mi.category = None + if not mi.publisher or 'unknown' in mi.publisher.lower() or \ + 'some publisher' in mi.publisher.lower(): + mi.publisher = None + + return mi + + +class LRFMetaFile(object): + """Has properties to read and write all Meta information in a LRF file.""" + #: The first 6 bytes of all valid LRF files + LRF_HEADER = 'LRF'.encode('utf-16le') + + lrf_header = fixed_stringfield(length=6, start=0x0) + version = field(fmt=WORD, start=0x8) + xor_key = field(fmt=WORD, start=0xa) + root_object_id = field(fmt=DWORD, start=0xc) + number_of_objects = field(fmt=QWORD, start=0x10) + object_index_offset = field(fmt=QWORD, start=0x18) + binding = field(fmt=BYTE, start=0x24) + dpi = field(fmt=WORD, start=0x26) + width = field(fmt=WORD, start=0x2a) + height = field(fmt=WORD, start=0x2c) + color_depth = field(fmt=BYTE, start=0x2e) + toc_object_id = field(fmt=DWORD, start=0x44) + toc_object_offset = field(fmt=DWORD, start=0x48) + compressed_info_size = field(fmt=WORD, start=0x4c) + thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e) + thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50) + uncompressed_info_size = versioned_field(compressed_info_size, 0, + fmt=DWORD, start=0x54) + + title = xml_field("Title", parent="BookInfo") + title_reading = xml_attr_field("Title", 'reading', parent="BookInfo") + author = xml_field("Author", parent="BookInfo") + author_reading = xml_attr_field("Author", 'reading', parent="BookInfo") + # 16 characters. First two chars should be FB for personal use ebooks. + book_id = xml_field("BookID", parent="BookInfo") + publisher = xml_field("Publisher", parent="BookInfo") + label = xml_field("Label", parent="BookInfo") + category = xml_field("Category", parent="BookInfo") + classification = xml_field("Classification", parent="BookInfo") + free_text = xml_field("FreeText", parent="BookInfo") + # Should use ISO 639 language codes + language = xml_field("Language", parent="DocInfo") + creator = xml_field("Creator", parent="DocInfo") + # Format is %Y-%m-%d + creation_date = xml_field("CreationDate", parent="DocInfo") + producer = xml_field("Producer", parent="DocInfo") + page = xml_field("SumPage", parent="DocInfo") + + def safe(func): + """ + Decorator that ensures that function calls leave the pos + in the underlying file unchanged + """ + @functools.wraps(func) + def restore_pos(*args, **kwargs): + obj = args[0] + pos = obj._file.tell() + res = func(*args, **kwargs) + obj._file.seek(0, 2) + if obj._file.tell() >= pos: + obj._file.seek(pos) + return res + return restore_pos + + def safe_property(func): + """ + Decorator that ensures that read or writing a property leaves + the position in the underlying file unchanged + """ + def decorator(f): + def restore_pos(*args, **kwargs): + obj = args[0] + pos = obj._file.tell() + res = f(*args, **kwargs) + obj._file.seek(0, 2) + if obj._file.tell() >= pos: + obj._file.seek(pos) + return res + return restore_pos + locals_ = func() + if 'fget' in locals_: + locals_["fget"] = decorator(locals_["fget"]) + if 'fset' in locals_: + locals_["fset"] = decorator(locals_["fset"]) + return property(**locals_) + + @safe_property + def info(): + doc = """\ + Document meta information as a minidom Document object. + To set use a minidom document object. + """ + + def fget(self): + if self.compressed_info_size == 0: + raise LRFException("This document has no meta info") + size = self.compressed_info_size - 4 + self._file.seek(self.info_start) + try: + src = zlib.decompress(self._file.read(size)) + if len(src) != self.uncompressed_info_size: + raise LRFException("Decompression of document meta info\ + yielded unexpected results") + + src = xml_to_unicode(src, strip_encoding_pats=True, + resolve_entities=True, + assume_utf8=True)[0] + return minidom.parseString(src) + except zlib.error: + raise LRFException("Unable to decompress document meta " + "information") + + def fset(self, document): + info = document.toxml('utf-8') + self.uncompressed_info_size = len(info) + stream = zlib.compress(info) + orig_size = self.compressed_info_size + self.compressed_info_size = len(stream) + 4 + delta = insert_into_file(self._file, stream, self.info_start, + self.info_start + orig_size - 4) + + if self.toc_object_offset > 0: + self.toc_object_offset += delta + self.object_index_offset += delta + self.update_object_offsets(delta) + + return {"fget": fget, "fset": fset, "doc": doc} + + @safe_property + def thumbnail_pos(): + doc = """The position of the thumbnail in the LRF file""" + + def fget(self): + return self.info_start + self.compressed_info_size-4 + return {"fget": fget, "doc": doc} + + @classmethod + def _detect_thumbnail_type(cls, slice): + """ @param slice: The first 16 bytes of the thumbnail """ + ttype = 0x14 # GIF + if "PNG" in slice: + ttype = 0x12 + if "BM" in slice: + ttype = 0x13 + if "JFIF" in slice: + ttype = 0x11 + return ttype + + @safe_property + def thumbnail(): + doc = """\ + The thumbnail. + Represented as a string. + The string you would get from the file read function. + """ + + def fget(self): + size = self.thumbnail_size + if size: + self._file.seek(self.thumbnail_pos) + return self._file.read(size) + + def fset(self, data): + if self.version <= 800: + raise LRFException("Cannot store thumbnails in LRF files " + "of version <= 800") + slice = data[0:16] + orig_size = self.thumbnail_size + self.thumbnail_size = len(data) + delta = insert_into_file(self._file, data, self.thumbnail_pos, + self.thumbnail_pos + orig_size) + self.toc_object_offset += delta + self.object_index_offset += delta + self.thumbnail_type = self._detect_thumbnail_type(slice) + self.update_object_offsets(delta) + + return {"fget": fget, "fset": fset, "doc": doc} + + def __init__(self, file): + """ @param file: A file object opened in the r+b mode """ + file.seek(0, 2) + self.size = file.tell() + self._file = file + if self.lrf_header != LRFMetaFile.LRF_HEADER: + raise LRFException(file.name + " has an invalid LRF header. Are " + "you sure it is an LRF file?") + # Byte at which the compressed meta information starts + self.info_start = 0x58 if self.version > 800 else 0x53 + + @safe + def update_object_offsets(self, delta): + """ + Run through the LRF Object index changing the offset by C{delta}. + """ + self._file.seek(self.object_index_offset) + count = self.number_of_objects + while count > 0: + raw = self._file.read(8) + new_offset = struct.unpack(DWORD, raw[4:8])[0] + delta + if new_offset >= (2**8)**4 or new_offset < 0x4C: + raise LRFException('Invalid LRF file. Could not set metadata.') + self._file.seek(-4, os.SEEK_CUR) + self._file.write(struct.pack(DWORD, new_offset)) + self._file.seek(8, os.SEEK_CUR) + count -= 1 + self._file.flush() + + @safe + def unpack(self, fmt=DWORD, start=0): + """ + Return decoded data from file. + + @param fmt: See http://docs.python.org/lib/module-struct.html + @param start: Position in file from which to decode + """ + end = start + struct.calcsize(fmt) + self._file.seek(start) + ret = struct.unpack(fmt, self._file.read(end-start)) + return ret + + @safe + def pack(self, *args, **kwargs): + """ + Encode C{args} and write them to file. + C{kwargs} must contain the keywords C{fmt} and C{start} + + @param args: The values to pack + @param fmt: See http://docs.python.org/lib/module-struct.html + @param start: Position in file at which to write encoded data + """ + encoded = struct.pack(kwargs["fmt"], *args) + self._file.seek(kwargs["start"]) + self._file.write(encoded) + self._file.flush() + + def thumbail_extension(self): + """ + Return the extension for the thumbnail image type as specified + by L{self.thumbnail_type}. If the LRF file was created by buggy + software, the extension maye be incorrect. See + L{self.fix_thumbnail_type}. + """ + ext = "gif" + ttype = self.thumbnail_type + if ttype == 0x11: + ext = "jpeg" + elif ttype == 0x12: + ext = "png" + elif ttype == 0x13: + ext = "bmp" + return ext + + def fix_thumbnail_type(self): + """ + Attempt to guess the thumbnail image format and set + L{self.thumbnail_type} accordingly. + """ + slice = self.thumbnail[0:16] + self.thumbnail_type = self._detect_thumbnail_type(slice) + + def seek(self, *args): + """ See L{file.seek} """ + return self._file.seek(*args) + + def tell(self): + """ See L{file.tell} """ + return self._file.tell() + + def read(self): + """ See L{file.read} """ + return self._file.read() + + def write(self, val): + """ See L{file.write} """ + self._file.write(val) + + def _objects(self): + self._file.seek(self.object_index_offset) + c = self.number_of_objects + while c > 0: + c -= 1 + raw = self._file.read(16) + pos = self._file.tell() + yield struct.unpack(' 0: + td = (os.path.basename(args[1]) + "_thumbnail." + + lrf.thumbail_extension()) + with open(td, "wb") as f: + f.write(t) + + fields = LRFMetaFile.__dict__.items() + fields.sort() + for f in fields: + if "XML" in str(f): + print(str(f[1]) + ":", + getattr(lrf, f[0]).encode('utf-8')) + if options.get_thumbnail: + print("Thumbnail:", td) + if options.get_cover: + try: + ext, data = lrf.get_cover() + except Exception: # Fails on books created by LRFCreator 1.0 + ext, data = None, None + if data: + cover = (os.path.splitext(os.path.basename(args[1]))[0] + + "_cover." + ext) + with open(cover, 'wb') as f: + f.write(data) + print('Cover:', cover) + else: + print('Could not find cover in the LRF file') + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/ebook_converter/ebooks/lrf/objects.py b/ebook_converter/ebooks/lrf/objects.py new file mode 100644 index 0000000..306300f --- /dev/null +++ b/ebook_converter/ebooks/lrf/objects.py @@ -0,0 +1,1279 @@ +import array +import collections +import io +import re +import struct +import zlib + +from ebook_converter.ebooks.lrf import LRFParseError, PRS500_PROFILE +from ebook_converter import entity_to_unicode, prepare_string_for_xml +from ebook_converter.ebooks.lrf.tags import Tag + +ruby_tags = {0xF575: ['rubyAlignAndAdjust', 'W'], + 0xF576: ['rubyoverhang', 'W', {0: 'none', 1: 'auto'}], + 0xF577: ['empdotsposition', 'W', {1: 'before', 2: 'after'}], + 0xF578: ['', 'parse_empdots'], + 0xF579: ['emplineposition', 'W', {1: 'before', 2: 'after'}], + 0xF57A: ['emplinetype', 'W', {0: 'none', 0x10: 'solid', + 0x20: 'dashed', 0x30: 'double', + 0x40: 'dotted'}]} + + +class LRFObject(object): + + tag_map = { + 0xF500: ['', ''], + 0xF502: ['infoLink', 'D'], + 0xF501: ['', ''], + } + + @classmethod + def descramble_buffer(cls, buf, l, xorKey): + i = 0 + a = array.array('B', buf) + while l > 0: + a[i] ^= xorKey + i += 1 + l -= 1 + return a.tostring() + + @classmethod + def parse_empdots(self, tag, f): + self.refEmpDotsFont, self.empDotsFontName, self.empDotsCode = tag.contents + + @staticmethod + def tag_to_val(h, obj, tag, stream): + val = None + if h[1] == 'D': + val = tag.dword + elif h[1] == 'W': + val = tag.word + elif h[1] == 'w': + val = tag.word + if val > 0x8000: + val -= 0x10000 + elif h[1] == 'B': + val = tag.byte + elif h[1] == 'P': + val = tag.contents + elif h[1] != '': + val = getattr(obj, h[1])(tag, stream) + if len(h) > 2: + val = h[2](val) if callable(h[2]) else h[2][val] + return val + + def __init__(self, document, stream, id, scramble_key, boundary): + self._scramble_key = scramble_key + self._document = document + self.id = id + + while stream.tell() < boundary: + tag = Tag(stream) + self.handle_tag(tag, stream) + + def parse_bg_image(self, tag, f): + self.bg_image_mode, self.bg_image_id = struct.unpack(" 0x400 and (isinstance(self, ImageStream) or isinstance(self, Font) or isinstance(self, SoundStream)): + l = 0x400 + self.stream = self.descramble_buffer(self.stream, l, key) + if self.stream_flags & 0x100 !=0: + decomp_size = struct.unpack(">8)&0xFF, (val>>16)&0xFF, (val>>24)&0xFF + + def __str__(self): + return '0x%02x%02x%02x%02x'%(self.a, self.r, self.g, self.b) + + def __len__(self): + return 4 + + def __getitem__(self, i): # Qt compatible ordering and values + return (self.r, self.g, self.b, 0xff-self.a)[i] # In Qt 0xff is opaque while in LRS 0x00 is opaque + + def to_html(self): + return 'rgb(%d, %d, %d)'%(self.r, self.g, self.b) + + +class EmptyPageElement(object): + + def __iter__(self): + for i in range(0): + yield i + + def __str__(self): + return str(self) + + +class PageDiv(EmptyPageElement): + + def __init__(self, pain, spacesize, linewidth, linecolor): + self.pain, self.spacesize, self.linewidth = pain, spacesize, linewidth + self.linecolor = Color(linecolor) + + def __str__(self): + return '\n\n'%\ + (self.pain, self.spacesize, self.linewidth, self.color) + + +class RuledLine(EmptyPageElement): + + linetype_map = {0x00: 'none', 0x10: 'solid', 0x20: 'dashed', 0x30: 'double', 0x40: 'dotted', 0x13: 'unknown13'} + + def __init__(self, linelength, linetype, linewidth, linecolor): + self.linelength, self.linewidth = linelength, linewidth + self.linetype = self.linetype_map[linetype] + self.linecolor = Color(linecolor) + self.id = -1 + + def __str__(self): + return '\n\n'%\ + (self.linelength, self.linetype, self.linewidth, self.linecolor) + + +class Wait(EmptyPageElement): + + def __init__(self, time): + self.time = time + + def __str__(self): + return '\n\n'%(self.time) + + +class Locate(EmptyPageElement): + + pos_map = {1:'bottomleft', 2:'bottomright', 3:'topright', 4:'topleft', 5:'base'} + + def __init__(self, pos): + self.pos = self.pos_map[pos] + + def __str__(self): + return '\n\n'%(self.pos) + + +class BlockSpace(EmptyPageElement): + + def __init__(self, xspace, yspace): + self.xspace, self.yspace = xspace, yspace + + def __str__(self): + return '\n\n'%\ + (self.xspace, self.yspace) + + +class Page(LRFStream): + tag_map = { + 0xF503: ['style_id', 'D'], + 0xF50B: ['obj_list', 'P'], + 0xF571: ['', ''], + 0xF57C: ['parent_page_tree', 'D'], + } + tag_map.update(PageAttr.tag_map) + tag_map.update(LRFStream.tag_map) + style = property(fget=lambda self : self._document.objects[self.style_id]) + evenheader = property(fget=lambda self : self._document.objects[self.style.evenheaderid]) + evenfooter = property(fget=lambda self : self._document.objects[self.style.evenfooterid]) + oddheader = property(fget=lambda self : self._document.objects[self.style.oddheaderid]) + oddfooter = property(fget=lambda self : self._document.objects[self.style.oddfooterid]) + + class Content(LRFContentObject): + tag_map = { + 0xF503: 'link', + 0xF54E: 'page_div', + 0xF547: 'x_space', + 0xF546: 'y_space', + 0xF548: 'do_pos', + 0xF573: 'ruled_line', + 0xF5D4: 'wait', + 0xF5D6: 'sound_stop', + } + + def __init__(self, byts, objects): + self.in_blockspace = False + LRFContentObject.__init__(self, byts, objects) + + def link(self, tag): + self.close_blockspace() + self._contents.append(self.objects[tag.dword]) + + def page_div(self, tag): + self.close_blockspace() + pars = struct.unpack("= 700 else 'normal')) + fn = getattr(obj, 'fontfacename', None) + if fn is not None: + fn = cls.FONT_MAP[fn] + ans += item('font-family: %s;'%fn) + fg = getattr(obj, 'textcolor', None) + if fg is not None: + fg = fg.to_html() + ans += item('color: %s;'%fg) + bg = getattr(obj, 'textbgcolor', None) + if bg is not None: + bg = bg.to_html() + ans += item('background-color: %s;'%bg) + al = getattr(obj, 'align', None) + if al is not None: + al = dict(head='left', center='center', foot='right') + ans += item('text-align: %s;'%al) + lh = getattr(obj, 'linespace', None) + if lh is not None: + ans += item('text-align: %fpt;'%(int(lh)/10)) + pi = getattr(obj, 'parindent', None) + if pi is not None: + ans += item('text-indent: %fpt;'%(int(pi)/10)) + + return ans + + +class TextAttr(StyleObject, LRFObject, TextCSS): + + FONT_MAP = collections.defaultdict(lambda : 'serif') + for key, value in PRS500_PROFILE.default_fonts.items(): + FONT_MAP[value] = key + + tag_map = { + 0xF511: ['fontsize', 'w'], + 0xF512: ['fontwidth', 'w'], + 0xF513: ['fontescapement', 'w'], + 0xF514: ['fontorientation', 'w'], + 0xF515: ['fontweight', 'W'], + 0xF516: ['fontfacename', 'P'], + 0xF517: ['textcolor', 'D', Color], + 0xF518: ['textbgcolor', 'D', Color], + 0xF519: ['wordspace', 'w'], + 0xF51A: ['letterspace', 'w'], + 0xF51B: ['baselineskip', 'w'], + 0xF51C: ['linespace', 'w'], + 0xF51D: ['parindent', 'w'], + 0xF51E: ['parskip', 'w'], + 0xF53C: ['align', 'W', {1: 'head', 4: 'center', 8: 'foot'}], + 0xF53D: ['column', 'W'], + 0xF53E: ['columnsep', 'W'], + 0xF5DD: ['charspace', 'w'], + 0xF5F1: ['textlinewidth', 'W'], + 0xF5F2: ['linecolor', 'D', Color], + } + tag_map.update(ruby_tags) + tag_map.update(LRFObject.tag_map) + + +class Block(LRFStream, TextCSS): + tag_map = { + 0xF503: ['style_id', 'D'], + } + tag_map.update(BlockAttr.tag_map) + tag_map.update(TextAttr.tag_map) + tag_map.update(LRFStream.tag_map) + extra_attrs = [i[0] for i in BlockAttr.tag_map.values()] + extra_attrs.extend([i[0] for i in TextAttr.tag_map.values()]) + + style = property(fget=lambda self : self._document.objects[self.style_id]) + textstyle = property(fget=lambda self : self._document.objects[self.textstyle_id]) + + def initialize(self): + self.attrs = {} + stream = io.BytesIO(self.stream) + tag = Tag(stream) + if tag.id != 0xF503: + raise LRFParseError("Bad block content") + obj = self._document.objects[tag.dword] + if isinstance(obj, SimpleText): + self.name = 'SimpleTextBlock' + self.textstyle_id = obj.style_id + elif isinstance(obj, Text): + self.name = 'TextBlock' + self.textstyle_id = obj.style_id + elif isinstance(obj, Image): + self.name = 'ImageBlock' + for attr in ('x0', 'x1', 'y0', 'y1', 'xsize', 'ysize', 'refstream'): + self.attrs[attr] = getattr(obj, attr) + self.refstream = self._document.objects[self.attrs['refstream']] + elif isinstance(obj, Button): + self.name = 'ButtonBlock' + else: + raise LRFParseError("Unexpected block type: "+obj.__class__.__name__) + + self.content = obj + + for attr in self.extra_attrs: + if hasattr(self, attr): + self.attrs[attr] = getattr(self, attr) + + def __str__(self): + s = '\n<%s objid="%d" blockstyle="%d" '%(self.name, self.id, self.style_id) + if hasattr(self, 'textstyle_id'): + s += 'textstyle="%d" '%(self.textstyle_id,) + for attr in self.attrs: + s += '%s="%s" '%(attr, self.attrs[attr]) + if self.name != 'ImageBlock': + s = s.rstrip()+'>\n' + s += str(self.content) + s += '\n'%(self.name,) + return s + return s.rstrip() + ' />\n' + + def to_html(self): + if self.name == 'TextBlock': + return '
%s
'%(self.style_id, self.textstyle_id, self.content.to_html()) + return '' + + +class MiniPage(LRFStream): + tag_map = { + 0xF541: ['minipagewidth', 'W'], + 0xF542: ['minipageheight', 'W'], + } + tag_map.update(LRFStream.tag_map) + tag_map.update(BlockAttr.tag_map) + + +class Text(LRFStream): + tag_map = { + 0xF503: ['style_id', 'D'], + } + tag_map.update(TextAttr.tag_map) + tag_map.update(LRFStream.tag_map) + + style = property(fget=lambda self : self._document.objects[self.style_id]) + + text_map = {0x22: '"', 0x26: '&', 0x27: '\'', 0x3c: '<', 0x3e: '>'} + entity_pattern = re.compile(r'&(\S+?);') + + text_tags = { + 0xF581: ['simple_container', 'Italic'], + 0xF582: 'end_container', + 0xF5B1: ['simple_container', 'Yoko'], + 0xF5B2: 'end_container', + 0xF5B3: ['simple_container', 'Tate'], + 0xF5B4: 'end_container', + 0xF5B5: ['simple_container', 'Nekase'], + 0xF5B6: 'end_container', + 0xF5A1: 'start_para', + 0xF5A2: 'end_para', + 0xF5A7: 'char_button', + 0xF5A8: 'end_container', + 0xF5A9: ['simple_container', 'Rubi'], + 0xF5AA: 'end_container', + 0xF5AB: ['simple_container', 'Oyamoji'], + 0xF5AC: 'end_container', + 0xF5AD: ['simple_container', 'Rubimoji'], + 0xF5AE: 'end_container', + 0xF5B7: ['simple_container', 'Sup'], + 0xF5B8: 'end_container', + 0xF5B9: ['simple_container', 'Sub'], + 0xF5BA: 'end_container', + 0xF5BB: ['simple_container', 'NoBR'], + 0xF5BC: 'end_container', + 0xF5BD: ['simple_container', 'EmpDots'], + 0xF5BE: 'end_container', + 0xF5C1: 'empline', + 0xF5C2: 'end_container', + 0xF5C3: 'draw_char', + 0xF5C4: 'end_container', + 0xF5C6: 'box', + 0xF5C7: 'end_container', + 0xF5CA: 'space', + 0xF5D1: 'plot', + 0xF5D2: 'cr', + } + + class TextTag(object): + + def __init__(self, name, attrs={}, self_closing=False): + self.name = name + self.attrs = attrs + self.self_closing = self_closing + + def __str__(self): + s = '<%s '%(self.name,) + for name, val in self.attrs.items(): + s += '%s="%s" '%(name, val) + return s.rstrip() + (' />' if self.self_closing else '>') + + def to_html(self): + s = '' + return s + + def close_html(self): + return '' + + class Span(TextTag): + pass + + linetype_map = {0: 'none', 0x10: 'solid', 0x20: 'dashed', 0x30: 'double', 0x40: 'dotted'} + adjustment_map = {1: 'top', 2: 'center', 3: 'baseline', 4: 'bottom'} + lineposition_map = {1:'before', 2:'after'} + + def add_text(self, text): + s = str(text, "utf-16-le") + if s: + s = s.translate(self.text_map) + self.content.append(self.entity_pattern.sub(entity_to_unicode, s)) + + def end_container(self, tag, stream): + self.content.append(None) + + def start_para(self, tag, stream): + self.content.append(self.__class__.TextTag('P')) + + def close_containers(self, start=0): + if len(self.content) == 0: + return + open_containers = 0 + if len(self.content) > 0 and isinstance(self.content[-1], self.__class__.Span): + self.content.pop() + while start < len(self.content): + c = self.content[start] + if c is None: + open_containers -= 1 + elif isinstance(c, self.__class__.TextTag) and not c.self_closing: + open_containers += 1 + start += 1 + self.content.extend(None for i in range(open_containers)) + + def end_para(self, tag, stream): + i = len(self.content)-1 + while i > -1: + if isinstance(self.content[i], Text.TextTag) and self.content[i].name == 'P': + break + i -= 1 + self.close_containers(start=i) + + def cr(self, tag, stream): + self.content.append(self.__class__.TextTag('CR', self_closing=True)) + + def char_button(self, tag, stream): + self.content.append(self.__class__.TextTag( + 'CharButton', attrs={'refobj':tag.dword})) + + def simple_container(self, tag, name): + self.content.append(self.__class__.TextTag(name)) + + def empline(self, tag, stream): + def invalid(op): + stream.seek(op) + # self.simple_container(None, 'EmpLine') + + oldpos = stream.tell() + try: + t = Tag(stream) + if t.id not in (0xF579, 0xF57A): + raise LRFParseError + except LRFParseError: + invalid(oldpos) + return + h = TextAttr.tag_map[t.id] + attrs = {} + attrs[h[0]] = TextAttr.tag_to_val(h, None, t, None) + oldpos = stream.tell() + try: + t = Tag(stream) + if t.id not in (0xF579, 0xF57A): + raise LRFParseError + h = TextAttr.tag_map[t.id] + attrs[h[0]] = TextAttr.tag_to_val(h, None, t, None) + except LRFParseError: + stream.seek(oldpos) + + if attrs: + self.content.append(self.__class__.TextTag( + 'EmpLine', attrs=attrs)) + + def space(self, tag, stream): + self.content.append(self.__class__.TextTag('Space', + attrs={'xsize':tag.sword}, + self_closing=True)) + + def plot(self, tag, stream): + xsize, ysize, refobj, adjustment = struct.unpack("= start_pos: + if tag_pos > start_pos: + self.add_text(self.stream[start_pos:tag_pos]) + stream.seek(tag_pos) + else: # No tags in this stream + self.add_text(self.stream) + stream.seek(0, 2) + break + + tag = Tag(stream) + + if tag.id == 0xF5CC: + self.add_text(stream.read(tag.word)) + elif tag.id in self.__class__.text_tags: # A Text tag + action = self.__class__.text_tags[tag.id] + if isinstance(action, str): + getattr(self, action)(tag, stream) + else: + getattr(self, action[0])(tag, action[1]) + elif tag.id in TextAttr.tag_map: # A Span attribute + action = TextAttr.tag_map[tag.id] + if len(self.content) == 0: + current_style = style.copy() + name, val = action[0], LRFObject.tag_to_val(action, self, tag, None) + if name and (name not in current_style or current_style[name] != val): + # No existing Span + if len(self.content) > 0 and isinstance(self.content[-1], self.__class__.Span): + self.content[-1].attrs[name] = val + else: + self.content.append(self.__class__.Span('Span', {name:val})) + current_style[name] = val + if len(self.content) > 0: + self.close_containers() + self.stream = None + + def __str__(self): + s = '' + open_containers = collections.deque() + for c in self.content: + if isinstance(c, str): + s += prepare_string_for_xml(c).replace('\0', '') + elif c is None: + if open_containers: + p = open_containers.pop() + s += ''%(p.name,) + else: + s += str(c) + if not c.self_closing: + open_containers.append(c) + + if len(open_containers) > 0: + if len(open_containers) == 1: + s += ''%(open_containers[0].name,) + else: + raise LRFParseError('Malformed text stream %s'%([i.name for i in open_containers if isinstance(i, Text.TextTag)],)) + return s + + def to_html(self): + s = '' + open_containers = collections.deque() + in_p = False + for c in self.content: + if isinstance(c, str): + s += c + elif c is None: + p = open_containers.pop() + s += p.close_html() + else: + if c.name == 'P': + in_p = True + elif c.name == 'CR': + s += '
' if in_p else '

' + else: + s += c.to_html() + if not c.self_closing: + open_containers.append(c) + + if len(open_containers) > 0: + raise LRFParseError('Malformed text stream %s'%([i.name for i in open_containers if isinstance(i, Text.TextTag)],)) + return s + + +class Image(LRFObject): + tag_map = { + 0xF54A: ['', 'parse_image_rect'], + 0xF54B: ['', 'parse_image_size'], + 0xF54C: ['refstream', 'D'], + 0xF555: ['comment', 'P'], + } + + def parse_image_rect(self, tag, f): + self.x0, self.y0, self.x1, self.y1 = struct.unpack("\n'%\ + (self.id, self.x0, self.y0, self.x1, self.y1, self.xsize, self.ysize, self.refstream) + + +class PutObj(EmptyPageElement): + + def __init__(self, objects, x1, y1, refobj): + self.x1, self.y1, self.refobj = x1, y1, refobj + self.object = objects[refobj] + + def __str__(self): + return ''%(self.x1, self.y1, self.refobj) + + +class Canvas(LRFStream): + tag_map = { + 0xF551: ['canvaswidth', 'W'], + 0xF552: ['canvasheight', 'W'], + 0xF5DA: ['', 'parse_waits'], + 0xF533: ['blockrule', 'W', {0x44: "block-fixed", 0x22: "block-adjustable"}], + 0xF534: ['bgcolor', 'D', Color], + 0xF535: ['layout', 'W', {0x41: 'TbRl', 0x34: 'LrTb'}], + 0xF536: ['framewidth', 'W'], + 0xF537: ['framecolor', 'D', Color], + 0xF52E: ['framemode', 'W', {0: 'none', 2: 'curve', 1:'square'}], + } + tag_map.update(LRFStream.tag_map) + extra_attrs = ['canvaswidth', 'canvasheight', 'blockrule', 'layout', + 'framewidth', 'framecolor', 'framemode'] + + def parse_waits(self, tag, f): + val = tag.word + self.setwaitprop = val&0xF + self.setwaitsync = val&0xF0 + + def initialize(self): + self.attrs = {} + for attr in self.extra_attrs: + if hasattr(self, attr): + self.attrs[attr] = getattr(self, attr) + self._contents = [] + stream = io.BytesIO(self.stream) + while stream.tell() < len(self.stream): + tag = Tag(stream) + try: + self._contents.append( + PutObj(self._document.objects, + *struct.unpack("\n' + for po in self: + s += str(po) + '\n' + s += '\n'%(self.__class__.__name__,) + return s + + def __iter__(self): + for i in self._contents: + yield i + + +class Header(Canvas): + pass + + +class Footer(Canvas): + pass + + +class ESound(LRFObject): + pass + + +class ImageStream(LRFStream): + tag_map = { + 0xF555: ['comment', 'P'], + } + imgext = {0x11: 'jpeg', 0x12: 'png', 0x13: 'bmp', 0x14: 'gif'} + + tag_map.update(LRFStream.tag_map) + + encoding = property(fget=lambda self : self.imgext[self.stream_flags & 0xFF].upper()) + + def end_stream(self, *args): + LRFStream.end_stream(self, *args) + self.file = str(self.id) + '.' + self.encoding.lower() + if self._document is not None: + self._document.image_map[self.id] = self + + def __str__(self): + return '\n'%\ + (self.id, self.encoding, self.file) + + +class Import(LRFStream): + pass + + +class Button(LRFObject): + tag_map = { + 0xF503: ['', 'do_ref_image'], + 0xF561: ['button_flags','W'], # \n' + return s + + refpage = property(fget=lambda self : self.jump_action(2)[0]) + refobj = property(fget=lambda self : self.jump_action(2)[1]) + + +class Window(LRFObject): + pass + + +class PopUpWin(LRFObject): + pass + + +class Sound(LRFObject): + pass + + +class SoundStream(LRFObject): + pass + + +class Font(LRFStream): + tag_map = { + 0xF559: ['fontfilename', 'P'], + 0xF55D: ['fontfacename', 'P'], + } + tag_map.update(LRFStream.tag_map) + data = property(fget=lambda self: self.stream) + + def end_stream(self, *args): + LRFStream.end_stream(self, *args) + self._document.font_map[self.fontfacename] = self + self.file = self.fontfacename + '.ttf' + + def __unicode__(self): + s = '\n'%\ + (self.id, self.fontfilename, self.fontfacename, self.file) + return s + + +class ObjectInfo(LRFStream): + pass + + +class BookAttr(StyleObject, LRFObject): + tag_map = { + 0xF57B: ['page_tree_id', 'D'], + 0xF5D8: ['', 'add_font'], + 0xF5DA: ['setwaitprop', 'W', {1: 'replay', 2: 'noreplay'}], + } + tag_map.update(ruby_tags) + tag_map.update(LRFObject.tag_map) + binding_map = {1: 'Lr', 16 : 'Rl'} + + def __init__(self, document, stream, id, scramble_key, boundary): + self.font_link_list = [] + LRFObject.__init__(self, document, stream, id, scramble_key, boundary) + + def add_font(self, tag, f): + self.font_link_list.append(tag.dword) + + def __str__(self): + s = '\n'%(self.id, self.id) + s += '\n'%(self._tags_to_xml(),) + doc = self._document + s += '\n'%\ + (self.binding_map[doc.binding], doc.dpi, doc.width, doc.height, doc.color_depth) + for font in self._document.font_map.values(): + s += str(font) + s += '\n' + return s + + +class SimpleText(Text): + pass + + +class TocLabel(object): + + def __init__(self, refpage, refobject, label): + self.refpage, self.refobject, self.label = refpage, refobject, label + + def __str__(self): + return '%s\n'%(self.refpage, self.refobject, self.label) + + +class TOCObject(LRFStream): + + def initialize(self): + stream = io.BytesIO(self.stream) + c = struct.unpack(" 0: + refpage = struct.unpack("\n' + + +object_map = [None, # 00 + PageTree, # 01 + Page, # 02 + Header, # 03 + Footer, # 04 + PageAttr, # 05 + Block, # 06 + BlockAttr, # 07 + MiniPage, # 08 + None, # 09 + Text, # 0A + TextAttr, # 0B + Image, # 0C + Canvas, # 0D + ESound, # 0E + None, # 0F + None, # 10 + ImageStream, # 11 + Import, # 12 + Button, # 13 + Window, # 14 + PopUpWin, # 15 + Sound, # 16 + SoundStream, # 17 + None, # 18 + Font, # 19 + ObjectInfo, # 1A + None, # 1B + BookAttr, # 1C + SimpleText, # 1D + TOCObject] # 1E + + +def get_object(document, stream, id, offset, size, scramble_key): + stream.seek(offset) + start_tag = Tag(stream) + if start_tag.id != 0xF500: + raise LRFParseError('Bad object start') + obj_id, obj_type = struct.unpack(" 0: + res.append(struct.unpack("