mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-30 00:33:32 +02:00
Initial import
This commit is contained in:
106
ebook_converter/ebooks/pdb/__init__.py
Normal file
106
ebook_converter/ebooks/pdb/__init__.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
class PDBError(Exception):
    """Base exception raised for errors while handling PDB e-book files."""
    pass
|
||||
|
||||
|
||||
# Maps a PDB identity string (type+creator, bytes 60-67 of the header) to a
# reader class.  Populated lazily by _import_readers() so that the reader
# modules are only imported when a PDB file is actually opened.
FORMAT_READERS = None
|
||||
|
||||
|
||||
def _import_readers():
    """Populate FORMAT_READERS, importing the reader modules on first use."""
    global FORMAT_READERS
    from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
    from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
    from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
    from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader
    from calibre.ebooks.pdb.plucker.reader import Reader as plucker_reader
    from calibre.ebooks.pdb.haodoo.reader import Reader as haodoo_reader

    # Keys are the 8-byte identity strings found in the PDB file header.
    FORMAT_READERS = {
        'PNPdPPrs': ereader_reader,
        'PNRdPPrs': ereader_reader,
        'zTXTGPlm': ztxt_reader,
        'TEXtREAd': palmdoc_reader,
        '.pdfADBE': pdf_reader,
        'DataPlkr': plucker_reader,
        'BOOKMTIT': haodoo_reader,
        'BOOKMTIU': haodoo_reader,
    }
|
||||
|
||||
|
||||
# Output format names a PDB file can be written as.
ALL_FORMAT_WRITERS = {'doc', 'ztxt', 'ereader'}
# Maps an extension string to a writer class; populated lazily by
# _import_writers(), mirroring FORMAT_READERS above.
FORMAT_WRITERS = None
|
||||
|
||||
|
||||
def _import_writers():
    """Populate FORMAT_WRITERS, importing the writer modules on first use."""
    global FORMAT_WRITERS
    from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
    from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
    from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer

    # Keys match the names in ALL_FORMAT_WRITERS.
    FORMAT_WRITERS = {
        'doc': palmdoc_writer,
        'ztxt': ztxt_writer,
        'ereader': ereader_writer,
    }
|
||||
|
||||
|
||||
# Human-readable name for every known PDB identity string, used for
# reporting/diagnostics.  The first group are the formats this package can
# read (see FORMAT_READERS); the second group are other known PDB-based
# formats that are recognised but not convertible here.
IDENTITY_TO_NAME = {
    'PNPdPPrs': 'eReader',
    'PNRdPPrs': 'eReader',
    'zTXTGPlm': 'zTXT',
    'TEXtREAd': 'PalmDOC',
    '.pdfADBE': 'Adobe Reader',
    'DataPlkr': 'Plucker',
    'BOOKMTIT': 'Haodoo.net',
    'BOOKMTIU': 'Haodoo.net',

    'BVokBDIC': 'BDicty',
    'DB99DBOS': 'DB (Database program)',
    'vIMGView': 'FireViewer (ImageViewer)',
    'PmDBPmDB': 'HanDBase',
    'InfoINDB': 'InfoView',
    'ToGoToGo': 'iSilo',
    'SDocSilX': 'iSilo 3',
    'JbDbJBas': 'JFile',
    'JfDbJFil': 'JFile Pro',
    'DATALSdb': 'LIST',
    'Mdb1Mdb1': 'MobileDB',
    'BOOKMOBI': 'MobiPocket',
    'DataSprd': 'QuickSheet',
    'SM01SMem': 'SuperMemo',
    'TEXtTlDc': 'TealDoc',
    'InfoTlIf': 'TealInfo',
    'DataTlMl': 'TealMeal',
    'DataTlPt': 'TealPaint',
    'dataTDBP': 'ThinkDB',
    'TdatTide': 'Tides',
    'ToRaTRPW': 'TomeRaider',
    'BDOCWrdS': 'WordSmith',
}
|
||||
|
||||
|
||||
def get_reader(identity):
    '''
    Return the reader class registered for *identity*, or None if no
    reader is found for the identity.
    '''
    global FORMAT_READERS
    # Import reader modules lazily, only on the first lookup.
    if FORMAT_READERS is None:
        _import_readers()
    return FORMAT_READERS.get(identity)
|
||||
|
||||
|
||||
def get_writer(extension):
    '''
    Return the writer class registered for *extension*, or None if no
    writer is found for extension.
    '''
    global FORMAT_WRITERS
    # Import writer modules lazily, only on the first lookup.
    if FORMAT_WRITERS is None:
        _import_writers()
    return FORMAT_WRITERS.get(extension)
|
||||
30
ebook_converter/ebooks/pdb/ereader/__init__.py
Normal file
30
ebook_converter/ebooks/pdb/ereader/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
|
||||
class EreaderError(Exception):
    """Raised for unsupported or malformed eReader PDB files."""
    pass
|
||||
|
||||
|
||||
def image_name(name, taken_names=()):
    """Return a unique, 32-character image record name derived from *name*.

    eReader stores image names in a fixed 32-byte field, so the result is
    NUL-padded (and, if necessary, truncated) to exactly 32 characters.
    *taken_names* is a collection of names already in use; a numeric
    suffix is appended until the name is unique.
    """
    name = os.path.basename(name)

    # Shorten over-long names: keep the first 10 characters, drop the
    # overflow from the middle, and force a .png suffix.
    overflow = len(name) - 32
    if overflow > 0:
        name = '%s%s.png' % (name[:10], name[10 + overflow:])

    stem, suffix = os.path.splitext(name)
    counter = 0
    while name in taken_names:
        counter += 1
        name = '%s%d%s' % (stem, counter, suffix)

    # Pad/truncate to the fixed 32-byte record-name width.
    return name.ljust(32, '\x00')[:32]
|
||||
37
ebook_converter/ebooks/pdb/ereader/reader.py
Normal file
37
ebook_converter/ebooks/pdb/ereader/reader.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from ereader pdb file.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.ebooks.pdb.ereader import EreaderError
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.pdb.ereader.reader132 import Reader132
|
||||
from calibre.ebooks.pdb.ereader.reader202 import Reader202
|
||||
|
||||
|
||||
class Reader(FormatReader):
    """Dispatching reader for eReader PDB files.

    eReader files exist in two header flavours, distinguished by the size
    of the first record: a 132-byte header (files made by Dropbook) and a
    116/202-byte header (files made by Makebook).  This class selects the
    matching concrete reader and delegates all work to it.

    Raises EreaderError for any other record-0 size.
    """

    def __init__(self, header, stream, log, options):
        # The size of record 0 identifies which header format this file uses.
        record0_size = len(header.section_data(0))

        if record0_size == 132:
            self.reader = Reader132(header, stream, log, options)
        elif record0_size in (116, 202):
            self.reader = Reader202(header, stream, log, options)
        else:
            # BUG FIX: record0_size is a byte count (132/116/202), not KB;
            # the old message reported the wrong unit.
            raise EreaderError('Size mismatch. eReader header record size %s bytes is not supported.' % record0_size)

    def extract_content(self, output_dir):
        # Delegate to the concrete reader chosen in __init__.
        return self.reader.extract_content(output_dir)

    def dump_pml(self):
        return self.reader.dump_pml()

    def dump_images(self, out_dir):
        return self.reader.dump_images(out_dir)
|
||||
221
ebook_converter/ebooks/pdb/ereader/reader132.py
Normal file
221
ebook_converter/ebooks/pdb/ereader/reader132.py
Normal file
@@ -0,0 +1,221 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from ereader pdb file with a 132 byte header created by Dropbook.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import zlib
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.ebooks import DRMError
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.pdb.ereader import EreaderError
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from polyglot.builtins import unicode_type, range
|
||||
|
||||
|
||||
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Every field is a big-endian unsigned 16-bit integer at a fixed
        # byte offset within the 132-byte header record.
        def u16(offset):
            return struct.unpack('>H', raw[offset:offset + 2])[0]

        self.compression = u16(0)
        self.non_text_offset = u16(12)
        self.chapter_count = u16(14)
        self.image_count = u16(20)
        self.link_count = u16(22)
        self.has_metadata = u16(24)
        self.footnote_count = u16(28)
        self.sidebar_count = u16(30)
        self.chapter_offset = u16(32)
        self.small_font_page_offset = u16(36)
        self.large_font_page_offset = u16(38)
        self.image_data_offset = u16(40)
        self.link_offset = u16(42)
        self.metadata_offset = u16(44)
        self.footnote_offset = u16(48)
        self.sidebar_offset = u16(50)
        self.last_data_offset = u16(52)

        # Text pages occupy sections 1 .. non_text_offset - 1; image
        # sections run from image_data_offset up to metadata_offset.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
|
||||
|
||||
|
||||
class Reader132(FormatReader):
    """Reader for eReader PDB files with the 132-byte (Dropbook) header.

    Extracts the PML text, footnotes, sidebars and image records and
    converts them into an OEB directory (index.html + images/ + OPF).
    """

    def __init__(self, header, stream, log, options):
        self.log = log
        self.encoding = options.input_encoding

        self.log.debug('132 byte header version found.')

        # Cache all section payloads up front; everything below indexes them.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

        # Only PalmDoc (2) and zlib (10) compression are supported;
        # 260 and 272 indicate DRM-protected content.
        if self.header_record.compression not in (2, 10):
            if self.header_record.compression in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            else:
                raise EreaderError('Unknown book compression %i.' % self.header_record.compression)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def section_data(self, number):
        """Return the raw bytes of section *number*."""
        return self.sections[number]

    def decompress_text(self, number):
        """Decompress and decode section *number*; falls back to cp1252
        when no input encoding was given (per the eReader spec)."""
        if self.header_record.compression == 2:
            from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
        if self.header_record.compression == 10:
            return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')

    def get_image(self, number):
        """Return (name, image_bytes) for image section *number*, or
        ('empty', b'') when the section is outside the image range."""
        if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
            return 'empty', b''
        data = self.section_data(number)
        # Image sections: 32-byte NUL-padded name at offset 4, data at 62.
        name = data[4:4 + 32].strip(b'\x00').decode(self.encoding or 'cp1252')
        img = data[62:]
        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        # Valid text pages are sections 1 .. num_text_pages.
        if not (1 <= number <= self.header_record.num_text_pages):
            return ''

        return self.decompress_text(number)

    def extract_content(self, output_dir):
        """Convert the book into output_dir (index.html, images/, OPF)
        and return the path to the generated metadata.opf."""
        from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        title = self.mi.title
        if not isinstance(title, unicode_type):
            title = title.decode('utf-8', 'replace')
        html = '<html><head><title>%s</title></head><body>' % title

        # Concatenate all PML text pages, then convert to HTML in one pass.
        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)
        hizer = PML_HTMLizer()
        html += hizer.parse_pml(pml, 'index.html')
        toc = hizer.get_toc()

        if self.header_record.footnote_count > 0:
            html += '<br /><h1>%s</h1>' % _('Footnotes')
            # The section at footnote_offset is a NUL-separated id list;
            # the footnote bodies follow in subsequent sections.
            footnoteids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
                self.log.debug('Extracting footnote page %i' % i)
                if fid < len(footnoteids):
                    fid = footnoteids[fid]
                else:
                    fid = ''
                html += footnote_to_html(fid, self.decompress_text(i))

        if self.header_record.sidebar_count > 0:
            html += '<br /><h1>%s</h1>' % _('Sidebar')
            # Same id-list layout as footnotes, starting at sidebar_offset.
            sidebarids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
                self.log.debug('Extracting sidebar page %i' % i)
                if sid < len(sidebarids):
                    sid = sidebarids[sid]
                else:
                    sid = ''
                html += sidebar_to_html(sid, self.decompress_text(i))

        html += '</body></html>'

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images, toc)

        return opf_path

    def create_opf(self, output_dir, images, toc):
        """Write metadata.opf and toc.ncx into output_dir and return the
        OPF path. *images* is the list of image file names written."""
        with CurrentDir(output_dir):
            # A record named cover.png, when present, is used as the cover.
            if 'cover.png' in images:
                self.mi.cover = os.path.join('images', 'cover.png')

            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            opf.set_toc(toc)
            with open('metadata.opf', 'wb') as opffile:
                with open('toc.ncx', 'wb') as tocfile:
                    opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the plm markup that comprises the text in the file.
        '''
        pml = ''

        for i in range(1, self.header_record.num_text_pages + 1):
            pml += self.get_text_page(i)

        return pml

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                with open(name, 'wb') as imgf:
                    imgf.write(img)
|
||||
169
ebook_converter/ebooks/pdb/ereader/reader202.py
Normal file
169
ebook_converter/ebooks/pdb/ereader/reader202.py
Normal file
@@ -0,0 +1,169 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from ereader pdb file with a 116 and 202 byte header created by Makebook.
|
||||
'''
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import struct
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.pdb.ereader import EreaderError
|
||||
from polyglot.builtins import unicode_type, range
|
||||
|
||||
|
||||
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Both fields are big-endian unsigned 16-bit integers.
        self.version = struct.unpack('>H', raw[0:2])[0]
        self.non_text_offset = struct.unpack('>H', raw[8:10])[0]

        # Text pages occupy sections 1 .. non_text_offset - 1.
        self.num_text_pages = self.non_text_offset - 1
|
||||
|
||||
|
||||
class Reader202(FormatReader):
    """Reader for eReader PDB files with the 116/202-byte (Makebook) header.

    Text sections are XORed with 0xA5 and PalmDoc-compressed; image
    sections carry a 'PNG' magic, a 32-byte name and the image data.
    """

    def __init__(self, header, stream, log, options):
        self.log = log
        self.encoding = options.input_encoding

        self.log.debug('202 byte header version found.')

        # Cache all section payloads up front; everything below indexes them.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

        # Only book versions 2 and 4 are known to this reader.
        if self.header_record.version not in (2, 4):
            raise EreaderError('Unknown book version %i.' % self.header_record.version)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def section_data(self, number):
        """Return the raw bytes of section *number*."""
        return self.sections[number]

    def decompress_text(self, number):
        """Un-XOR (0xA5), PalmDoc-decompress and decode section *number*."""
        from calibre.ebooks.compression.palmdoc import decompress_doc
        data = bytearray(self.section_data(number))
        # Every byte of the stored text is XORed with 0xA5.
        data = bytes(bytearray(x ^ 0xA5 for x in data))
        return decompress_doc(data).decode(self.encoding or 'cp1252', 'replace')

    def get_image(self, number):
        """Return (name, image_bytes) for section *number*, or
        (None, None) when the section is not an image ('PNG' magic)."""
        name = None
        img = None

        data = self.section_data(number)
        if data.startswith(b'PNG'):
            # 32-byte NUL-padded name at offset 4, image data at 62.
            name = data[4:4 + 32].strip(b'\x00')
            img = data[62:]

        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc compression is supported. The text is xored with 0xA5 and
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        # Valid text pages are sections 1 .. num_text_pages.
        if not (1 <= number <= self.header_record.num_text_pages):
            return ''

        return self.decompress_text(number)

    def extract_content(self, output_dir):
        """Convert the book into output_dir (index.html, images/, OPF)
        and return the path to the generated metadata.opf."""
        from calibre.ebooks.pml.pmlconverter import pml_to_html

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Concatenate all PML text pages before converting to HTML.
        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)

        title = self.mi.title
        if not isinstance(title, unicode_type):
            title = title.decode('utf-8', 'replace')

        html = '<html><head><title>%s</title></head><body>%s</body></html>' % \
            (title, pml_to_html(pml))

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        # Everything after the text sections may be an image record.
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(self.header_record.non_text_offset, len(self.sections)):
                name, img = self.get_image(i)
                if name:
                    images.append(name)
                    with open(name, 'wb') as imgf:
                        self.log.debug('Writing image %s to images/' % name)
                        imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path

    def create_opf(self, output_dir, images):
        """Write metadata.opf into output_dir and return its path."""
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images/', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the plm markup that comprises the text in the file.
        '''
        pml = ''

        for i in range(1, self.header_record.num_text_pages + 1):
            pml += self.get_text_page(i)

        return pml

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        # NOTE(review): this HeaderRecord (202-byte variant) defines neither
        # num_image_pages nor image_data_offset, so this method raises
        # AttributeError as written — it appears copied from Reader132.
        # Confirm and rework against the section-scan used in
        # extract_content() above.
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                with open(name, 'wb') as imgf:
                    imgf.write(img)
|
||||
19
ebook_converter/ebooks/pdb/formatreader.py
Normal file
19
ebook_converter/ebooks/pdb/formatreader.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Interface defining the necessary public functions for a pdb format reader.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
class FormatReader(object):
    """Abstract base class defining the interface every PDB format reader
    must implement (see the format-specific reader modules)."""

    def __init__(self, header, stream, log, options):
        # header: parsed PdbHeaderReader; stream: the open PDB file.
        raise NotImplementedError()

    def extract_content(self, output_dir):
        # Must convert the book into output_dir and return the OPF path.
        raise NotImplementedError()
|
||||
10
ebook_converter/ebooks/pdb/haodoo/__init__.py
Normal file
10
ebook_converter/ebooks/pdb/haodoo/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
157
ebook_converter/ebooks/pdb/haodoo/reader.py
Normal file
157
ebook_converter/ebooks/pdb/haodoo/reader.py
Normal file
@@ -0,0 +1,157 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from Haodoo.net pdb file.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
import struct
|
||||
import os
|
||||
|
||||
from calibre import prepare_string_for_xml
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
|
||||
from polyglot.builtins import range, map
|
||||
|
||||
BPDB_IDENT = b'BOOKMTIT'
|
||||
UPDB_IDENT = b'BOOKMTIU'
|
||||
|
||||
# Vertical-layout CJK punctuation -> horizontal equivalents.  Haodoo books
# are typeset for vertical reading; these forms are rotated back for
# horizontal HTML output.
punct_table = {
    u"︵": u"(",
    u"︶": u")",
    u"︷": u"{",
    u"︸": u"}",
    u"︹": u"〔",
    u"︺": u"〕",
    u"︻": u"【",
    u"︼": u"】",
    u"︗": u"〖",
    u"︘": u"〗",
    u"﹇": u"[]",
    u"﹈": u"[]",
    u"︽": u"《",
    u"︾": u"》",
    u"︿": u"〈",
    u"﹀": u"〉",
    u"﹁": u"「",
    u"﹂": u"」",
    u"﹃": u"『",
    u"﹄": u"』",
    u"|": u"—",
    u"︙": u"…",
    u"ⸯ": u"~",
    u"│": u"…",
    u"¦": u"…",
    u" ": u" ",
}

# Precomputed translation table: every key above is a single character, so
# str.translate performs the whole substitution in one C-level pass instead
# of one str.replace() scan per table entry (the previous implementation).
_PUNCT_TRANSLATION = str.maketrans(punct_table)


def fix_punct(line):
    """Return *line* with vertical CJK punctuation replaced by the
    horizontal forms defined in punct_table."""
    return line.translate(_PUNCT_TRANSLATION)
|
||||
|
||||
|
||||
class LegacyHeaderRecord(object):
    """Header record of a legacy (BOOKMTIT) Haodoo book: ESC-separated
    fields encoded in cp950 (Big5)."""

    def __init__(self, raw):
        # Fields are separated by 0x1b; a tripled 0x1b is an escaped
        # separator and collapses to a single one.
        fields = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b').split(b'\x1b')
        # Layout: title, record count, then one chapter title per field.
        self.title = fix_punct(fields[0].decode('cp950', 'replace'))
        self.num_records = int(fields[1])
        self.chapter_titles = list(map(
            lambda x: fix_punct(x.decode('cp950', 'replace').rstrip('\x00')),
            fields[2:]))
|
||||
|
||||
|
||||
class UnicodeHeaderRecord(object):
    """Header record of a Unicode (BOOKMTIU) Haodoo book: ESC-separated
    fields encoded in UTF-16-LE."""

    def __init__(self, raw):
        # Separator is 0x1b 0x00 (UTF-16-LE ESC); a tripled separator is
        # an escape and collapses to a single one.
        fields = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
                                      b'\x1b\x00').split(b'\x1b\x00')
        self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(fields[1])
        # Unlike the legacy format, all chapter titles live in one field,
        # separated by UTF-16-LE CRLF.
        self.chapter_titles = list(map(
            lambda x: fix_punct(x.decode('utf_16_le', 'replace').rstrip('\x00')),
            fields[2].split(b'\r\x00\n\x00')))
|
||||
|
||||
|
||||
class Reader(FormatReader):
    """Reader for Haodoo.net PDB books (identities BOOKMTIT/BOOKMTIU)."""

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log

        # Cache all section payloads up front.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        # The identity selects the header flavour and text encoding.
        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        """Return the author string, or 'Unknown' when not stored.

        NOTE(review): presumably the byte at offset 35 is a format version
        marker and version-2 files store the author in the first 35 bytes
        (overlapping the PDB name field) — confirm against the Haodoo spec.
        """
        self.stream.seek(35)
        version = struct.unpack('>b', self.stream.read(1))[0]
        if version == 2:
            self.stream.seek(0)
            author = self.stream.read(35).rstrip(b'\x00').decode(self.encoding, 'replace')
            return author
        else:
            return 'Unknown'

    def get_metadata(self):
        """Build a MetaInformation from the header title and author."""
        mi = MetaInformation(self.header_record.title,
                             [self.author()])
        mi.language = 'zh-tw'

        return mi

    def section_data(self, number):
        """Return the raw bytes of section *number*."""
        return self.sections[number]

    def decompress_text(self, number):
        # Text sections are stored uncompressed, NUL-padded.
        return self.section_data(number).decode(self.encoding,
                                                'replace').rstrip('\x00')

    def extract_content(self, output_dir):
        """Convert the book to HTML + OPF in output_dir and return the
        path to metadata.opf."""
        txt = ''

        self.log.info(u'Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug(u'\tDecompressing text section %i' % i)
            title = self.header_record.chapter_titles[i-1]
            lines = []
            title_added = False
            for line in self.decompress_text(i).splitlines():
                line = fix_punct(line)
                line = line.strip()
                # Promote the first line containing the chapter title to a
                # heading; escape everything else for XML.
                if not title_added and title in line:
                    line = '<h1 class="chapter">' + line + '</h1>\n'
                    title_added = True
                else:
                    line = prepare_string_for_xml(line)
                lines.append('<p>%s</p>' % line)
            # Ensure every chapter gets a heading even if the title never
            # appeared in the body text.
            if not title_added:
                lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
            txt += '\n'.join(lines)

        self.log.info(u'Converting text to OEB...')
        html = HTML_TEMPLATE % (self.header_record.title, txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')
|
||||
91
ebook_converter/ebooks/pdb/header.py
Normal file
91
ebook_converter/ebooks/pdb/header.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read the header data from a pdb file.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
import struct
|
||||
import time
|
||||
from polyglot.builtins import long_type
|
||||
|
||||
|
||||
class PdbHeaderReader(object):
|
||||
|
||||
def __init__(self, stream):
|
||||
self.stream = stream
|
||||
self.ident = self.identity()
|
||||
self.num_sections = self.section_count()
|
||||
self.title = self.name()
|
||||
|
||||
def identity(self):
|
||||
self.stream.seek(60)
|
||||
ident = self.stream.read(8)
|
||||
return ident.decode('utf-8')
|
||||
|
||||
def section_count(self):
|
||||
self.stream.seek(76)
|
||||
return struct.unpack('>H', self.stream.read(2))[0]
|
||||
|
||||
def name(self):
|
||||
self.stream.seek(0)
|
||||
return re.sub(b'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))
|
||||
|
||||
def full_section_info(self, number):
|
||||
if not (0 <= number < self.num_sections):
|
||||
raise ValueError('Not a valid section number %i' % number)
|
||||
|
||||
self.stream.seek(78 + number * 8)
|
||||
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0]
|
||||
flags, val = a1, a2 << 16 | a3 << 8 | a4
|
||||
return (offset, flags, val)
|
||||
|
||||
def section_offset(self, number):
|
||||
if not (0 <= number < self.num_sections):
|
||||
raise ValueError('Not a valid section number %i' % number)
|
||||
|
||||
self.stream.seek(78 + number * 8)
|
||||
return struct.unpack('>LBBBB', self.stream.read(8))[0]
|
||||
|
||||
def section_data(self, number):
|
||||
if not (0 <= number < self.num_sections):
|
||||
raise ValueError('Not a valid section number %i' % number)
|
||||
|
||||
start = self.section_offset(number)
|
||||
if number == self.num_sections -1:
|
||||
self.stream.seek(0, 2)
|
||||
end = self.stream.tell()
|
||||
else:
|
||||
end = self.section_offset(number + 1)
|
||||
self.stream.seek(start)
|
||||
return self.stream.read(end - start)
|
||||
|
||||
|
||||
class PdbHeaderBuilder(object):
    """Serialise a Palm Database (PDB) file header.

    *identity* is the 8-character type+creator string; *title* becomes the
    31-byte sanitised database name (plus a terminating NUL).
    """

    def __init__(self, identity, title):
        # BUG FIX: the type+creator field is 8 bytes; the old code padded
        # short identities to only 3 (ljust(3)), producing a truncated
        # field for any identity shorter than 8 characters.
        self.identity = identity.ljust(8, '\x00')[:8].encode('utf-8')
        # Name field: unsafe characters replaced, padded/cut to 31 bytes,
        # NUL terminated (32 bytes total).
        self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')

    def build_header(self, section_lengths, out_stream):
        '''
        section_lengths = Length of each section in file.
        '''
        now = int(time.time())
        nrecords = len(section_lengths)

        # 32-byte name, attributes/version, created/modified/backup dates,
        # modification number, app-info and sort-info ids (all zero).
        out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
        # identity, next-record-list id, record count.
        out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))

        # Record list: one (offset, attr, id0, id1, id2) entry per section.
        # Data starts after the 78-byte header, the record list and a
        # 2-byte gap.
        offset = 78 + (8 * nrecords) + 2
        for id, record in enumerate(section_lengths):
            out_stream.write(struct.pack('>LBBBB', offset, 0, 0, 0, 0))
            offset += record
        out_stream.write(b'\x00\x00')
|
||||
0
ebook_converter/ebooks/pdb/palmdoc/__init__.py
Normal file
0
ebook_converter/ebooks/pdb/palmdoc/__init__.py
Normal file
74
ebook_converter/ebooks/pdb/palmdoc/reader.py
Normal file
74
ebook_converter/ebooks/pdb/palmdoc/reader.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from palmdoc pdb file.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, io
|
||||
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
|
||||
|
||||
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Both fields are big-endian unsigned 16-bit integers: the
        # compression scheme at offset 0 and the text record count at 8.
        self.compression = struct.unpack('>H', raw[0:2])[0]
        self.num_records = struct.unpack('>H', raw[8:10])[0]
|
||||
|
||||
|
||||
class Reader(FormatReader):
    """Reader for PalmDOC (TEXtREAd) PDB files.

    Decompresses all text records and hands the concatenated plain text to
    the TXT input plugin for conversion to OEB.
    """

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
        self.options = options

        # Cache all section payloads up front.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

    def section_data(self, number):
        """Return the raw bytes of section *number*."""
        return self.sections[number]

    def decompress_text(self, number):
        """Return the decompressed bytes of text section *number*.

        Compression 1 is stored (uncompressed); 2 and 258 are PalmDoc
        compression.  Any other scheme yields b''.
        """
        if self.header_record.compression == 1:
            return self.section_data(number)
        if self.header_record.compression == 2 or self.header_record.compression == 258:
            from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(self.section_data(number))
        return b''

    def extract_content(self, output_dir):
        """Decompress the text and convert it via the TXT input plugin;
        returns whatever the plugin's convert() returns."""
        raw_txt = b''

        self.log.info('Decompressing text...')
        # Text records are sections 1 .. num_records.
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            raw_txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
        stream = io.BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        # Fill in any TXT plugin options the caller did not supply with
        # the plugin's recommended defaults.
        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
|
||||
0
ebook_converter/ebooks/pdb/pdf/__init__.py
Normal file
0
ebook_converter/ebooks/pdb/pdf/__init__.py
Normal file
43
ebook_converter/ebooks/pdb/pdf/reader.py
Normal file
43
ebook_converter/ebooks/pdb/pdf/reader.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from palmdoc pdb file.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from polyglot.builtins import range
|
||||
|
||||
|
||||
class Reader(FormatReader):
    '''
    Extract the embedded PDF from a .pdfADBE PDB file and hand it to the
    PDF input plugin for conversion.
    '''

    def __init__(self, header, stream, log, options):
        self.header = header
        self.stream = stream
        self.log = log
        self.options = options

    def extract_content(self, output_dir):
        '''Concatenate all PDB sections into a temp .pdf and convert it.'''
        self.log.info('Extracting PDF...')

        pdf = PersistentTemporaryFile('.pdf')
        pdf.close()
        # BUG FIX: the original passed the closed file object to open()
        # instead of its path (open(pdf, ...)), which raises a TypeError.
        # Reopen by name and ensure the handle is closed via the context
        # manager.
        with open(pdf.name, 'wb') as out:
            for x in range(self.header.section_count()):
                out.write(self.header.section_data(x))

        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
        # Fill in any conversion options the caller did not supply.
        for opt in pdf_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        return pdf_plugin.convert(open(pdf.name, 'rb'), self.options, 'pdf', self.log, {})
|
||||
0
ebook_converter/ebooks/pdb/plucker/__init__.py
Normal file
0
ebook_converter/ebooks/pdb/plucker/__init__.py
Normal file
737
ebook_converter/ebooks/pdb/plucker/reader.py
Normal file
737
ebook_converter/ebooks/pdb/plucker/reader.py
Normal file
@@ -0,0 +1,737 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '20011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import struct
|
||||
import zlib
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||
from calibre.utils.imghdr import identify
|
||||
from calibre.utils.img import save_cover_data_to, Canvas, image_from_data
|
||||
from polyglot.builtins import codepoint_to_chr, range
|
||||
|
||||
# Plucker record data types: the value of the "type" byte in every
# section header (see SectionHeader below).
DATATYPE_PHTML = 0
DATATYPE_PHTML_COMPRESSED = 1
DATATYPE_TBMP = 2
DATATYPE_TBMP_COMPRESSED = 3
DATATYPE_MAILTO = 4
DATATYPE_LINK_INDEX = 5
DATATYPE_LINKS = 6
DATATYPE_LINKS_COMPRESSED = 7
DATATYPE_BOOKMARKS = 8
DATATYPE_CATEGORY = 9
DATATYPE_METADATA = 10
DATATYPE_STYLE_SHEET = 11
DATATYPE_FONT_PAGE = 12
DATATYPE_TABLE = 13
DATATYPE_TABLE_COMPRESSED = 14
DATATYPE_COMPOSITE_IMAGE = 15
DATATYPE_PAGELIST_METADATA = 16
DATATYPE_SORTED_URL_INDEX = 17
DATATYPE_SORTED_URL = 18
DATATYPE_SORTED_URL_COMPRESSED = 19
DATATYPE_EXT_ANCHOR_INDEX = 20
DATATYPE_EXT_ANCHOR = 21
DATATYPE_EXT_ANCHOR_COMPRESSED = 22

# IETF IANA MIBenum value for the character set.
# See the http://www.iana.org/assignments/character-sets for valid values.
# Not all character sets are handled by Python. This is a small subset that
# the MIBenum maps to Python standard encodings
# from http://docs.python.org/library/codecs.html#standard-encodings
MIBNUM_TO_NAME = {
    3: 'ascii',
    4: 'latin_1',
    5: 'iso8859_2',
    6: 'iso8859_3',
    7: 'iso8859_4',
    8: 'iso8859_5',
    9: 'iso8859_6',
    10: 'iso8859_7',
    11: 'iso8859_8',
    12: 'iso8859_9',
    13: 'iso8859_10',
    17: 'shift_jis',
    18: 'euc_jp',
    27: 'utf_7',
    36: 'euc_kr',
    37: 'iso2022_kr',
    38: 'euc_kr',
    39: 'iso2022_jp',
    40: 'iso2022_jp_2',
    106: 'utf-8',
    109: 'iso8859_13',
    110: 'iso8859_14',
    111: 'iso8859_15',
    112: 'iso8859_16',
    1013: 'utf_16_be',
    1014: 'utf_16_le',
    1015: 'utf_16',
    2009: 'cp850',
    2010: 'cp852',
    2011: 'cp437',
    2013: 'cp862',
    2025: 'gb2312',
    2026: 'big5',
    2028: 'cp037',
    2043: 'cp424',
    2044: 'cp500',
    2046: 'cp855',
    2047: 'cp857',
    2048: 'cp860',
    2049: 'cp861',
    2050: 'cp863',
    2051: 'cp864',
    2052: 'cp865',
    2054: 'cp869',
    2063: 'cp1026',
    2085: 'hz',
    2086: 'cp866',
    2087: 'cp775',
    2089: 'cp858',
    2091: 'cp1140',
    2102: 'big5hkscs',
    2250: 'cp1250',
    2251: 'cp1251',
    2252: 'cp1252',
    2253: 'cp1253',
    2254: 'cp1254',
    2255: 'cp1255',
    2256: 'cp1256',
    2257: 'cp1257',
    2258: 'cp1258',
}
|
||||
|
||||
|
||||
class HeaderRecord(object):
    '''
    Plucker header. PDB record 0.
    '''

    def __init__(self, raw):
        self.uid, = struct.unpack('>H', raw[0:2])
        # Labelled "version" in the spec: 2 means ZLIB compressed,
        # 1 means DOC compressed.
        self.compression, = struct.unpack('>H', raw[2:4])
        self.records, = struct.unpack('>H', raw[4:6])
        # uid of the first html record, which should link to the other
        # records (directly or transitively).
        self.home_html = None

        # Reserved-record table: maps record id -> reserved name code.
        self.reserved = {}
        for rec in range(self.records):
            pos = 6 + 4 * rec
            name, = struct.unpack('>H', raw[pos:pos + 2])
            id, = struct.unpack('>H', raw[pos + 2:pos + 4])
            self.reserved[id] = name
            # Name code 0 marks the home (first) html record.
            if name == 0:
                self.home_html = id
|
||||
|
||||
|
||||
class SectionHeader(object):
    '''
    Every section (record) has this 8 byte header. It gives
    details about the section such as its uid.
    '''

    def __init__(self, raw):
        self.uid, = struct.unpack('>H', raw[0:2])
        self.paragraphs, = struct.unpack('>H', raw[2:4])
        self.size, = struct.unpack('>H', raw[4:6])
        # BUG FIX: indexing a bytes object (raw[6]) yields an int on
        # Python 3, which struct.unpack() rejects -- it needs a one-byte
        # slice.
        self.type, = struct.unpack('>B', raw[6:7])
        self.flags, = struct.unpack('>B', raw[7:8])
|
||||
|
||||
|
||||
class SectionHeaderText(object):
    '''
    Sub header for text records.
    '''

    def __init__(self, section_header, raw):
        # Uncompressed size of each paragraph.
        self.sizes = []
        # Paragraph attribute words.
        self.attributes = []
        # Uncompressed offset of each paragraph, measured from the
        # beginning of the PHTML.
        self.paragraph_offsets = []

        # Each paragraph contributes one 4-byte entry: size then attributes.
        for num in range(section_header.paragraphs):
            pos = 4 * num
            size, attr = struct.unpack('>HH', raw[pos:pos + 4])
            self.sizes.append(size)
            self.attributes.append(attr)

        total = 0
        for size in self.sizes:
            total += size
            self.paragraph_offsets.append(total)
|
||||
|
||||
|
||||
class SectionMetadata(object):
    '''
    Document wide metadata.

    This does not store metadata such as title, or author.
    That metadata would be best retrieved with the PDB (plucker)
    metadata reader.

    This stores document specific information such as the
    text encoding.

    Note: There is a default encoding but each text section
    can be assigned a different encoding.
    '''

    def __init__(self, raw):
        self.default_encoding = 'latin-1'
        # uid -> encoding name, for text records that override the default.
        self.exceptional_uid_encodings = {}
        self.owner_id = None

        record_count, = struct.unpack('>H', raw[0:2])

        adv = 0
        for i in range(record_count):
            try:
                rec_type, length = struct.unpack_from('>HH', raw, 2 + adv)
            except struct.error:
                # Truncated metadata record; keep whatever was parsed.
                break

            # CharSet
            if rec_type == 1:
                val, = struct.unpack('>H', raw[6+adv:8+adv])
                self.default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
            # ExceptionalCharSets
            elif rec_type == 2:
                ii_adv = 0
                # BUG FIX: use integer division; on Python 3, length / 2 is
                # a float and range() raises TypeError.
                for ii in range(length // 2):
                    uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv])
                    mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv])
                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'latin-1')
                    ii_adv += 4
            # OwnerID
            elif rec_type == 3:
                # BUG FIX: unpack the value; the original stored the whole
                # 1-tuple returned by struct.unpack, so owner_id was always
                # truthy even when the id was 0.
                self.owner_id, = struct.unpack('>I', raw[6+adv:10+adv])
            # Author, Title, PubDate
            # Ignored here. The metadata reader plugin
            # will get this info because if it's missing
            # the metadata reader plugin will use fall
            # back data from elsewhere in the file.
            elif rec_type in (4, 5, 6):
                pass
            # Linked Documents
            elif rec_type == 7:
                pass

            # Record lengths are counted in 16-bit words.
            adv += 2*length
|
||||
|
||||
|
||||
class SectionText(object):
    '''
    Text data. Stores a text section header and the PHTML.
    '''

    def __init__(self, section_header, raw):
        # The per-paragraph sub header occupies 4 bytes per paragraph;
        # everything after it is the PHTML payload.
        self.header = SectionHeaderText(section_header, raw)
        self.data = raw[4 * section_header.paragraphs:]
|
||||
|
||||
|
||||
class SectionCompositeImage(object):
    '''
    A composite image is a 2D grid of rows and columns. The entries in
    the grid are uids of the image records making up each tile.
    '''

    def __init__(self, raw):
        self.columns, = struct.unpack('>H', raw[0:2])
        self.rows, = struct.unpack('>H', raw[2:4])

        # layout[row][col] is the uid of the image record that belongs at
        # that position in the assembled composite.
        self.layout = []
        pos = 4
        for _ in range(self.rows):
            row = []
            for _ in range(self.columns):
                row.append(struct.unpack('>H', raw[pos:pos + 2])[0])
                pos += 2
            self.layout.append(row)
|
||||
|
||||
|
||||
class Reader(FormatReader):
    '''
    Convert a plucker archive into HTML.

    TODO:
          * UTF 16 and 32 characters.
          * Margins.
          * Alignment.
          * Font color.
          * DATATYPE_MAILTO
          * DATATYPE_TABLE(_COMPRESSED)
          * DATATYPE_EXT_ANCHOR_INDEX
          * DATATYPE_EXT_ANCHOR(_COMPRESSED)
    '''

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
        self.options = options

        # Mapping of section uid to our internal
        # list of sections.
        self.uid_section_number = OrderedDict()
        self.uid_text_secion_number = OrderedDict()
        self.uid_text_secion_encoding = {}
        self.uid_image_section_number = {}
        self.uid_composite_image_section_number = {}
        self.metadata_section_number = None
        self.default_encoding = 'latin-1'
        self.owner_id = None
        self.sections = []

        # The Plucker record0 header
        self.header_record = HeaderRecord(header.section_data(0))

        for i in range(1, header.num_sections):
            section_number = len(self.sections)
            # The length of the section header.
            # Where the actual data in the section starts.
            start = 8
            section = None

            raw_data = header.section_data(i)
            # Every section has a section header.
            section_header = SectionHeader(raw_data)

            # Store sections we care about.
            if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
                self.uid_text_secion_number[section_header.uid] = section_number
                section = SectionText(section_header, raw_data[start:])
            elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED):
                self.uid_image_section_number[section_header.uid] = section_number
                section = raw_data[start:]
            elif section_header.type == DATATYPE_METADATA:
                self.metadata_section_number = section_number
                section = SectionMetadata(raw_data[start:])
            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
                self.uid_composite_image_section_number[section_header.uid] = section_number
                section = SectionCompositeImage(raw_data[start:])

            # Store the section.
            if section:
                self.uid_section_number[section_header.uid] = section_number
                self.sections.append((section_header, section))

        # Store useful information from the metadata section locally
        # to make access easier.
        # BUG FIX: the metadata record can legitimately be stored at index
        # 0 (which is falsy); test explicitly against None.
        if self.metadata_section_number is not None:
            mdata_section = self.sections[self.metadata_section_number][1]
            for k, v in mdata_section.exceptional_uid_encodings.items():
                self.uid_text_secion_encoding[k] = v
            self.default_encoding = mdata_section.default_encoding
            self.owner_id = mdata_section.owner_id

        # Get the metadata (title, author, ...) with the metadata reader.
        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def extract_content(self, output_dir):
        '''Write the records out as HTML + images and convert them to OEB.'''
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text record into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
                with open('%s.html' % uid, 'wb') as htmlf:
                    html = u'<html><body>'
                    section_header, section_data = self.sections[num]
                    if section_header.type == DATATYPE_PHTML:
                        html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                        d = self.decompress_phtml(section_data.data)
                        # BUG FIX: process_phtml() already returns a unicode
                        # string; calling .decode() on it raised an
                        # AttributeError on Python 3.
                        # NOTE(review): per-record encodings from
                        # get_text_uid_encoding() are consequently not
                        # applied here -- TODO confirm intended handling.
                        html += self.process_phtml(d, section_data.header.paragraph_offsets)
                    html += '</body></html>'
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Cache the image sizes in case they are used by a composite image.
        images = set()
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        with CurrentDir(os.path.join(output_dir, 'images/')):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                        images.add(uid)
                        self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                    except Exception as e:
                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
                else:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        col_height = 0
                        for col in row:
                            if col not in images:
                                raise Exception('Image with uid: %s missing.' % col)
                            w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:]
                            row_width += w
                            if col_height < h:
                                col_height = h
                        if width < row_width:
                            width = row_width
                        height += col_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    with Canvas(width, height) as canvas:
                        y_off = 0
                        for row in section_data.layout:
                            x_off = 0
                            largest_height = 0
                            for col in row:
                                im = image_from_data(lopen('%s.jpg' % col, 'rb').read())
                                canvas.compose(im, x_off, y_off)
                                w, h = im.width(), im.height()
                                x_off += w
                                if largest_height < h:
                                    largest_height = h
                            y_off += largest_height
                    # BUG FIX: the composite must be opened for binary
                    # writing; the original opened it in the default read
                    # mode, so write() always failed.
                    with lopen('%s.jpg' % uid, 'wb') as out:
                        out.write(canvas.export(compression_quality=70))
                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                # BUG FIX: dict views are not subscriptable on Python 3;
                # take the first inserted uid from the OrderedDict.
                home_html = next(iter(self.uid_text_secion_number))
        except Exception:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        # NOTE(review): assumes the cwd is output_dir here, since the html
        # files were written under output_dir -- TODO confirm against the
        # conversion pipeline.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
        self.options.debug_pipeline = odi

        return oeb

    def decompress_phtml(self, data):
        '''Decompress a PHTML record per the document's compression scheme.'''
        if self.header_record.compression == 2:
            if self.owner_id:
                # Owner-id keyed zlib obfuscation is not supported.
                raise NotImplementedError
            return zlib.decompress(data)
        elif self.header_record.compression == 1:
            from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(data)

    def process_phtml(self, d, paragraph_offsets=[]):
        '''
        Translate a raw PHTML byte stream *d* into an HTML fragment.

        paragraph_offsets are the uncompressed offsets at which new
        paragraphs (with ids for paragraph links) begin.
        '''
        html = u'<p id="p0">'
        offset = 0
        paragraph_open = True
        link_open = False
        need_set_p_id = False
        p_num = 1
        font_specifier_close = ''

        while offset < len(d):
            if not paragraph_open:
                if need_set_p_id:
                    html += u'<p id="p%s">' % p_num
                    p_num += 1
                    need_set_p_id = False
                else:
                    html += u'<p>'
                paragraph_open = True

            c = ord(d[offset:offset+1])
            # PHTML "functions"
            if c == 0x0:
                offset += 1
                c = ord(d[offset:offset+1])
                # Page link begins
                # 2 Bytes
                # record ID
                if c == 0x0a:
                    offset += 1
                    id = struct.unpack('>H', d[offset:offset+2])[0]
                    if id in self.uid_text_secion_number:
                        html += '<a href="%s.html">' % id
                        link_open = True
                    offset += 1
                # Targeted page link begins
                # 3 Bytes
                # record ID, target
                elif c == 0x0b:
                    offset += 3
                # Paragraph link begins
                # 4 Bytes
                # record ID, paragraph number
                elif c == 0x0c:
                    offset += 1
                    id = struct.unpack('>H', d[offset:offset+2])[0]
                    offset += 2
                    pid = struct.unpack('>H', d[offset:offset+2])[0]
                    if id in self.uid_text_secion_number:
                        html += '<a href="%s.html#p%s">' % (id, pid)
                        link_open = True
                    offset += 1
                # Targeted paragraph link begins
                # 5 Bytes
                # record ID, paragraph number, target
                elif c == 0x0d:
                    offset += 5
                # Link ends
                # 0 Bytes
                elif c == 0x08:
                    if link_open:
                        html += '</a>'
                        link_open = False
                # Set font
                # 1 Bytes
                # font specifier
                elif c == 0x11:
                    offset += 1
                    # BUG FIX (consistency): read the specifier the same way
                    # as every other byte -- d[offset] is an int on Python 3
                    # but a str on Python 2.
                    specifier = ord(d[offset:offset+1])
                    html += font_specifier_close
                    # Regular text
                    if specifier == 0:
                        font_specifier_close = ''
                    # h1
                    elif specifier == 1:
                        html += '<h1>'
                        font_specifier_close = '</h1>'
                    # h2
                    elif specifier == 2:
                        html += '<h2>'
                        font_specifier_close = '</h2>'
                    # h3
                    elif specifier == 3:
                        # BUG FIX: the original emitted '<h13>' (typo),
                        # which never matched the '</h3>' close tag.
                        html += '<h3>'
                        font_specifier_close = '</h3>'
                    # h4
                    elif specifier == 4:
                        html += '<h4>'
                        font_specifier_close = '</h4>'
                    # h5
                    elif specifier == 5:
                        html += '<h5>'
                        font_specifier_close = '</h5>'
                    # h6
                    elif specifier == 6:
                        html += '<h6>'
                        font_specifier_close = '</h6>'
                    # Bold
                    elif specifier == 7:
                        html += '<b>'
                        font_specifier_close = '</b>'
                    # Fixed-width
                    elif specifier == 8:
                        html += '<tt>'
                        font_specifier_close = '</tt>'
                    # Small
                    elif specifier == 9:
                        html += '<small>'
                        font_specifier_close = '</small>'
                    # Subscript
                    elif specifier == 10:
                        html += '<sub>'
                        font_specifier_close = '</sub>'
                    # Superscript
                    elif specifier == 11:
                        html += '<sup>'
                        font_specifier_close = '</sup>'
                # Embedded image
                # 2 Bytes
                # image record ID
                elif c == 0x1a:
                    offset += 1
                    uid = struct.unpack('>H', d[offset:offset+2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Set margin
                # 2 Bytes
                # left margin, right margin
                elif c == 0x22:
                    offset += 2
                # Alignment of text
                # 1 Bytes
                # alignment
                elif c == 0x29:
                    offset += 1
                # Horizontal rule
                # 3 Bytes
                # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
                elif c == 0x33:
                    offset += 3
                    if paragraph_open:
                        html += u'</p>'
                        paragraph_open = False
                    html += u'<hr />'
                # New line
                # 0 Bytes
                elif c == 0x38:
                    if paragraph_open:
                        html += u'</p>\n'
                        paragraph_open = False
                # Italic text begins
                # 0 Bytes
                elif c == 0x40:
                    html += u'<i>'
                # Italic text ends
                # 0 Bytes
                elif c == 0x48:
                    html += u'</i>'
                # Set text color
                # 3 Bytes
                # 8-bit red, 8-bit green, 8-bit blue
                elif c == 0x53:
                    offset += 3
                # Multiple embedded image
                # 4 Bytes
                # alternate image record ID, image record ID
                elif c == 0x5c:
                    offset += 3
                    uid = struct.unpack('>H', d[offset:offset+2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Underline text begins
                # 0 Bytes
                elif c == 0x60:
                    html += u'<u>'
                # Underline text ends
                # 0 Bytes
                elif c == 0x68:
                    html += u'</u>'
                # Strike-through text begins
                # 0 Bytes
                elif c == 0x70:
                    html += u'<s>'
                # Strike-through text ends
                # 0 Bytes
                elif c == 0x78:
                    html += u'</s>'
                # 16-bit Unicode character
                # 3 Bytes
                # alternate text length, 16-bit unicode character
                elif c == 0x83:
                    offset += 3
                # 32-bit Unicode character
                # 5 Bytes
                # alternate text length, 32-bit unicode character
                elif c == 0x85:
                    offset += 5
                # Begin custom font span
                # 6 Bytes
                # font page record ID, X page position, Y page position
                elif c == 0x8e:
                    offset += 6
                # Adjust custom font glyph position
                # 4 Bytes
                # X page position, Y page position
                elif c == 0x8c:
                    offset += 4
                # Change font page
                # 2 Bytes
                # font record ID
                elif c == 0x8a:
                    offset += 2
                # End custom font span
                # 0 Bytes
                elif c == 0x88:
                    pass
                # Begin new table row
                # 0 Bytes
                elif c == 0x90:
                    pass
                # Insert table (or table link)
                # 2 Bytes
                # table record ID
                elif c == 0x92:
                    offset += 2
                # Table cell data
                # 7 Bytes
                # 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length
                elif c == 0x97:
                    offset += 7
                # Exact link modifier
                # 2 Bytes
                # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or
                # Targeted Paragraph Link function to specify an exact byte offset within
                # the paragraph. This function must be followed immediately by the
                # function it modifies).
                elif c == 0x9a:
                    offset += 2
            elif c == 0xa0:
                # Non-breaking space byte.
                # NOTE(review): the replacement literal may originally have
                # been '&nbsp;' or U+00A0; the extracted source shows a
                # plain space -- verify against upstream.
                html += ' '
            else:
                html += codepoint_to_chr(c)
            offset += 1
            if offset in paragraph_offsets:
                need_set_p_id = True
                if paragraph_open:
                    html += u'</p>\n'
                    paragraph_open = False

        if paragraph_open:
            html += u'</p>'

        return html

    def get_text_uid_encoding(self, uid):
        '''Return the text encoding to use for record *uid*.'''
        # Return the user specified input encoding,
        # otherwise return the alternate encoding specified for the uid,
        # otherwise return the default encoding for the document.
        return self.options.input_encoding if self.options.input_encoding else self.uid_text_secion_encoding.get(uid, self.default_encoding)
|
||||
10
ebook_converter/ebooks/pdb/ztxt/__init__.py
Normal file
10
ebook_converter/ebooks/pdb/ztxt/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
class zTXTError(Exception):
|
||||
pass
|
||||
94
ebook_converter/ebooks/pdb/ztxt/reader.py
Normal file
94
ebook_converter/ebooks/pdb/ztxt/reader.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Read content from ztxt pdb file.
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct
|
||||
import zlib
|
||||
import io
|
||||
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.pdb.ztxt import zTXTError
|
||||
|
||||
# Minimum zTXT format version (major, minor) this reader accepts.
SUPPORTED_VERSION = (1, 40)
|
||||
|
||||
|
||||
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Version word is 0xMMmm (major in the high byte, minor in the low).
        self.version = struct.unpack_from('>H', raw, 0)[0]
        self.num_records = struct.unpack_from('>H', raw, 2)[0]
        # Total uncompressed size of the text.
        self.size = struct.unpack_from('>L', raw, 4)[0]
        self.record_size = struct.unpack_from('>H', raw, 8)[0]
        self.flags = struct.unpack_from('>B', raw, 18)[0]
|
||||
|
||||
|
||||
class Reader(FormatReader):
    '''Decompress a zTXT PDB file and convert it via the TXT input plugin.'''

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
        self.options = options

        # Cache every raw section so record data can be fetched by index.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

        # Version is packed as 0xMMmm: major in the high byte, minor low.
        vmajor = (self.header_record.version & 0x0000FF00) >> 8
        vminor = self.header_record.version & 0x000000FF
        if vmajor < 1 or (vmajor == 1 and vminor < 40):
            raise zTXTError('Unsupported ztxt version (%i.%i). Only versions newer than %i.%i are supported.' %
                            (vmajor, vminor, SUPPORTED_VERSION[0], SUPPORTED_VERSION[1]))

        # Flag bit 0 set means the text was compressed for random access
        # (zlib full flushes between records); the only scheme supported.
        if (self.header_record.flags & 0x01) == 0:
            raise zTXTError('Only compression method 1 (random access) is supported')

        # BUG FIX: typo in the log message ('Foud' -> 'Found').
        self.log.debug('Found ztxt version: %i.%i' % (vmajor, vminor))

        # Initialize the decompressor and prime it with the first text
        # record so its stream state is ready for sequential reads.
        self.uncompressor = zlib.decompressobj()
        self.uncompressor.decompress(self.section_data(1))

    def section_data(self, number):
        '''Return the raw bytes of PDB section *number*.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''Decompress text section *number* (1-based).'''
        # Restart the zlib stream when reading from the first record again
        # (it was already consumed while priming in __init__).
        if number == 1:
            self.uncompressor = zlib.decompressobj()
        return self.uncompressor.decompress(self.section_data(number))

    def extract_content(self, output_dir):
        '''Decompress all text records and convert them with the TXT plugin.'''
        raw_txt = b''

        self.log.info('Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            raw_txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
        stream = io.BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        # Fill in any conversion options the caller did not supply.
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
|
||||
Reference in New Issue
Block a user