""" Read content from ereader pdb file with a 132 byte header created by Dropbook. """ import os import re import struct import zlib from ebook_converter.ebooks import DRMError from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.pdb.ereader import EreaderError from ebook_converter.ebooks.pdb.formatreader import FormatReader from ebook_converter.utils import directory class HeaderRecord(object): ''' The first record in the file is always the header record. It holds information related to the location of text, images, and so on in the file. This is used in conjunction with the sections defined in the file header. ''' def __init__(self, raw): self.compression, = struct.unpack('>H', raw[0:2]) self.non_text_offset, = struct.unpack('>H', raw[12:14]) self.chapter_count, = struct.unpack('>H', raw[14:16]) self.image_count, = struct.unpack('>H', raw[20:22]) self.link_count, = struct.unpack('>H', raw[22:24]) self.has_metadata, = struct.unpack('>H', raw[24:26]) self.footnote_count, = struct.unpack('>H', raw[28:30]) self.sidebar_count, = struct.unpack('>H', raw[30:32]) self.chapter_offset, = struct.unpack('>H', raw[32:34]) self.small_font_page_offset, = struct.unpack('>H', raw[36:38]) self.large_font_page_offset, = struct.unpack('>H', raw[38:40]) self.image_data_offset, = struct.unpack('>H', raw[40:42]) self.link_offset, = struct.unpack('>H', raw[42:44]) self.metadata_offset, = struct.unpack('>H', raw[44:46]) self.footnote_offset, = struct.unpack('>H', raw[48:50]) self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.last_data_offset, = struct.unpack('>H', raw[52:54]) self.num_text_pages = self.non_text_offset - 1 self.num_image_pages = self.metadata_offset - self.image_data_offset class Reader132(FormatReader): def __init__(self, header, stream, log, options): self.log = log self.encoding = options.input_encoding self.log.debug('132 byte header version found.') self.sections = [] for i in range(header.num_sections): self.sections.append(header.section_data(i)) self.header_record = HeaderRecord(self.section_data(0)) if self.header_record.compression not in (2, 10): if self.header_record.compression in (260, 272): raise DRMError('eReader DRM is not supported.') else: raise EreaderError('Unknown book compression %i.' % self.header_record.compression) from ebook_converter.ebooks.metadata.pdb import get_metadata self.mi = get_metadata(stream, False) def section_data(self, number): return self.sections[number] def decompress_text(self, number): if self.header_record.compression == 2: from ebook_converter.ebooks.compression.palmdoc import decompress_doc return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') if self.header_record.compression == 10: return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') def get_image(self, number): if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: return 'empty', b'' data = self.section_data(number) name = data[4:4 + 32].strip(b'\x00').decode(self.encoding or 'cp1252') img = data[62:] return name, img def get_text_page(self, number): ''' Only palmdoc and zlib compressed are supported. The text is assumed to be encoded as Windows-1252. The encoding is part of the eReader file spec and should always be this encoding. ''' if not (1 <= number <= self.header_record.num_text_pages): return '' return self.decompress_text(number) def extract_content(self, output_dir): from ebook_converter.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html from ebook_converter.ebooks.pml.pmlconverter import PML_HTMLizer output_dir = os.path.abspath(output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) title = self.mi.title if not isinstance(title, str): title = title.decode('utf-8', 'replace') html = '