1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-30 00:33:32 +02:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
class PDBError(Exception):
    """Base exception raised for errors while handling PDB files."""
# Lazily-populated map of 8-byte PDB identity string -> reader class.
# Stays None until get_reader() triggers _import_readers().
FORMAT_READERS = None


def _import_readers():
    # Import the reader implementations on demand so that merely importing
    # this module does not pull in every format's dependencies.
    global FORMAT_READERS
    from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
    from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
    from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
    from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader
    from calibre.ebooks.pdb.plucker.reader import Reader as plucker_reader
    from calibre.ebooks.pdb.haodoo.reader import Reader as haodoo_reader

    FORMAT_READERS = {
        'PNPdPPrs': ereader_reader,
        'PNRdPPrs': ereader_reader,
        'zTXTGPlm': ztxt_reader,
        'TEXtREAd': palmdoc_reader,
        '.pdfADBE': pdf_reader,
        'DataPlkr': plucker_reader,
        'BOOKMTIT': haodoo_reader,
        'BOOKMTIU': haodoo_reader,
    }
# Output extensions for which a writer implementation exists.
ALL_FORMAT_WRITERS = {'doc', 'ztxt', 'ereader'}

# Lazily-populated map of output extension -> writer class.
# Stays None until get_writer() triggers _import_writers().
FORMAT_WRITERS = None


def _import_writers():
    # Import the writer implementations on demand, mirroring _import_readers.
    global FORMAT_WRITERS
    from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
    from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
    from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer

    FORMAT_WRITERS = {
        'doc': palmdoc_writer,
        'ztxt': ztxt_writer,
        'ereader': ereader_writer,
    }
# Human-readable format names keyed by the 8-byte PDB type/creator identity.
# Deliberately wider than FORMAT_READERS: formats listed here can be
# identified even when no reader for them exists.
IDENTITY_TO_NAME = {
    'PNPdPPrs': 'eReader',
    'PNRdPPrs': 'eReader',
    'zTXTGPlm': 'zTXT',
    'TEXtREAd': 'PalmDOC',
    '.pdfADBE': 'Adobe Reader',
    'DataPlkr': 'Plucker',
    'BOOKMTIT': 'Haodoo.net',
    'BOOKMTIU': 'Haodoo.net',

    'BVokBDIC': 'BDicty',
    'DB99DBOS': 'DB (Database program)',
    'vIMGView': 'FireViewer (ImageViewer)',
    'PmDBPmDB': 'HanDBase',
    'InfoINDB': 'InfoView',
    'ToGoToGo': 'iSilo',
    'SDocSilX': 'iSilo 3',
    'JbDbJBas': 'JFile',
    'JfDbJFil': 'JFile Pro',
    'DATALSdb': 'LIST',
    'Mdb1Mdb1': 'MobileDB',
    'BOOKMOBI': 'MobiPocket',
    'DataSprd': 'QuickSheet',
    'SM01SMem': 'SuperMemo',
    'TEXtTlDc': 'TealDoc',
    'InfoTlIf': 'TealInfo',
    'DataTlMl': 'TealMeal',
    'DataTlPt': 'TealPaint',
    'dataTDBP': 'ThinkDB',
    'TdatTide': 'Tides',
    'ToRaTRPW': 'TomeRaider',
    'BDOCWrdS': 'WordSmith',
}
def get_reader(identity):
    '''
    Return the reader class registered for the given 8-byte PDB identity
    string, or None when the format is unsupported.
    '''
    if FORMAT_READERS is None:
        _import_readers()
    return FORMAT_READERS.get(identity)
def get_writer(extension):
    '''
    Return the writer class registered for the given output extension, or
    None when no writer supports it.
    '''
    if FORMAT_WRITERS is None:
        _import_writers()
    return FORMAT_WRITERS.get(extension)

View File

@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
class EreaderError(Exception):
    """Raised for problems specific to eReader PDB files."""
def image_name(name, taken_names=()):
    """Return a 32-byte, NUL-padded image name derived from *name*.

    Over-long names are shrunk by keeping the first 10 characters and the
    tail, then suffixing '.png'. Collisions with *taken_names* are resolved
    by appending an increasing counter before the extension. The result is
    always exactly 32 characters, padded with NULs.
    """
    candidate = os.path.basename(name)

    if len(candidate) > 32:
        overflow = len(candidate) - 32
        candidate = '%s%s.png' % (candidate[:10], candidate[10 + overflow:])

    stem, ext = os.path.splitext(candidate)
    counter = 0
    while candidate in taken_names:
        counter += 1
        candidate = '%s%s%s' % (stem, counter, ext)

    return candidate.ljust(32, '\x00')[:32]

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from ereader pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader.reader132 import Reader132
from calibre.ebooks.pdb.ereader.reader202 import Reader202
class Reader(FormatReader):

    '''
    Dispatching reader for eReader PDB files: picks the concrete reader
    implementation based on the size of header record 0.
    '''

    def __init__(self, header, stream, log, options):
        # Record 0's size identifies the producing tool: 132 bytes ->
        # Dropbook (Reader132), 116/202 bytes -> Makebook (Reader202).
        record0_size = len(header.section_data(0))

        if record0_size == 132:
            self.reader = Reader132(header, stream, log, options)
        elif record0_size in (116, 202):
            self.reader = Reader202(header, stream, log, options)
        else:
            # NOTE(review): record0_size is a byte count; the 'KB' in this
            # message looks wrong — confirm before changing user-facing text.
            raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size)

    def extract_content(self, output_dir):
        # Delegate to the concrete reader chosen in __init__.
        return self.reader.extract_content(output_dir)

    def dump_pml(self):
        return self.reader.dump_pml()

    def dump_images(self, out_dir):
        return self.reader.dump_images(out_dir)

View File

@@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from ereader pdb file with a 132 byte header created by Dropbook.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import re
import struct
import zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.formatreader import FormatReader
from polyglot.builtins import unicode_type, range
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    # (attribute name, byte offset) pairs; every field is a big-endian
    # unsigned 16-bit integer.
    _LAYOUT = (
        ('compression', 0),
        ('non_text_offset', 12),
        ('chapter_count', 14),
        ('image_count', 20),
        ('link_count', 22),
        ('has_metadata', 24),
        ('footnote_count', 28),
        ('sidebar_count', 30),
        ('chapter_offset', 32),
        ('small_font_page_offset', 36),
        ('large_font_page_offset', 38),
        ('image_data_offset', 40),
        ('link_offset', 42),
        ('metadata_offset', 44),
        ('footnote_offset', 48),
        ('sidebar_offset', 50),
        ('last_data_offset', 52),
    )

    def __init__(self, raw):
        for attr, offset in self._LAYOUT:
            setattr(self, attr, struct.unpack_from('>H', raw, offset)[0])

        # Text records occupy sections 1 .. non_text_offset - 1.
        self.num_text_pages = self.non_text_offset - 1
        # Image records sit between image_data_offset and metadata_offset.
        self.num_image_pages = self.metadata_offset - self.image_data_offset
class Reader132(FormatReader):

    '''
    Reader for eReader PDB files whose record 0 is the 132-byte header
    produced by Dropbook.
    '''

    def __init__(self, header, stream, log, options):
        self.log = log
        # May be None; cp1252 is then assumed, per the eReader spec.
        self.encoding = options.input_encoding

        self.log.debug('132 byte header version found.')

        # Cache every PDB record up front; later access is index-based.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

        # 2 = PalmDoc compression, 10 = zlib; 260 and 272 indicate DRM.
        if self.header_record.compression not in (2, 10):
            if self.header_record.compression in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            else:
                raise EreaderError('Unknown book compression %i.' % self.header_record.compression)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def section_data(self, number):
        # Raw bytes of PDB record `number`.
        return self.sections[number]

    def decompress_text(self, number):
        # Decompress a text record per the header's compression field and
        # decode it (eReader text is cp1252 unless overridden).
        if self.header_record.compression == 2:
            from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
        if self.header_record.compression == 10:
            return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')

    def get_image(self, number):
        '''
        Return (name, data) for the image stored in record `number`, or
        ('empty', b'') when the record is outside the image range.
        '''
        if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
            return 'empty', b''
        data = self.section_data(number)
        # 32-byte NUL-padded name at offset 4; image payload at offset 62.
        name = data[4:4 + 32].strip(b'\x00').decode(self.encoding or 'cp1252')
        img = data[62:]
        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        if not (1 <= number <= self.header_record.num_text_pages):
            return ''

        return self.decompress_text(number)

    def extract_content(self, output_dir):
        from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        title = self.mi.title
        if not isinstance(title, unicode_type):
            title = title.decode('utf-8', 'replace')

        html = '<html><head><title>%s</title></head><body>' % title

        # Concatenate all text pages into a single PML document, then
        # convert that to HTML in one pass.
        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)
        hizer = PML_HTMLizer()
        html += hizer.parse_pml(pml, 'index.html')
        toc = hizer.get_toc()

        if self.header_record.footnote_count > 0:
            # _ is presumably calibre's globally-installed gettext function.
            html += '<br /><h1>%s</h1>' % _('Footnotes')
            # Footnote ids are NUL-terminated words stored in the record at
            # footnote_offset; the footnote bodies follow in later records.
            footnoteids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
            # fid is deliberately rebound from index to id string below.
            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
                self.log.debug('Extracting footnote page %i' % i)
                if fid < len(footnoteids):
                    fid = footnoteids[fid]
                else:
                    fid = ''
                html += footnote_to_html(fid, self.decompress_text(i))

        if self.header_record.sidebar_count > 0:
            html += '<br /><h1>%s</h1>' % _('Sidebar')
            # Same id/body layout as footnotes.
            sidebarids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
                self.log.debug('Extracting sidebar page %i' % i)
                if sid < len(sidebarids):
                    sid = sidebarids[sid]
                else:
                    sid = ''
                html += sidebar_to_html(sid, self.decompress_text(i))

        html += '</body></html>'

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images, toc)

        return opf_path

    def create_opf(self, output_dir, images, toc):
        # Build metadata.opf + toc.ncx describing index.html and the images.
        with CurrentDir(output_dir):
            if 'cover.png' in images:
                self.mi.cover = os.path.join('images', 'cover.png')

            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            opf.set_toc(toc)

            with open('metadata.opf', 'wb') as opffile:
                with open('toc.ncx', 'wb') as tocfile:
                    opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        pml = ''

        for i in range(1, self.header_record.num_text_pages + 1):
            pml += self.get_text_page(i)

        return pml

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                with open(name, 'wb') as imgf:
                    imgf.write(img)

View File

@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from ereader pdb file with a 116 and 202 byte header created by Makebook.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import struct
from calibre import CurrentDir
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
from polyglot.builtins import unicode_type, range
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Both fields are big-endian unsigned 16-bit integers.
        version, = struct.unpack_from('>H', raw, 0)
        non_text, = struct.unpack_from('>H', raw, 8)

        self.version = version
        self.non_text_offset = non_text
        # Text records occupy sections 1 .. non_text_offset - 1.
        self.num_text_pages = non_text - 1
class Reader202(FormatReader):

    '''
    Reader for eReader PDB files whose record 0 is the 116- or 202-byte
    header written by Makebook.
    '''

    def __init__(self, header, stream, log, options):
        self.log = log
        # May be None; the eReader spec fixes cp1252, used as the fallback.
        self.encoding = options.input_encoding

        self.log.debug('202 byte header version found.')

        # Cache every PDB record up front; later access is index-based.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

        if self.header_record.version not in (2, 4):
            raise EreaderError('Unknown book version %i.' % self.header_record.version)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def section_data(self, number):
        # Raw bytes of PDB record `number`.
        return self.sections[number]

    def decompress_text(self, number):
        # Text records are XORed with 0xA5 and then PalmDoc compressed.
        from calibre.ebooks.compression.palmdoc import decompress_doc
        data = bytearray(self.section_data(number))
        data = bytes(bytearray(x ^ 0xA5 for x in data))
        return decompress_doc(data).decode(self.encoding or 'cp1252', 'replace')

    def get_image(self, number):
        '''
        Return (name, data) for the image in record `number`, or
        (None, None) when the record does not hold a PNG image.
        '''
        name = None
        img = None

        data = self.section_data(number)
        if data.startswith(b'PNG'):
            # 32-byte NUL-padded name at offset 4; payload at offset 62.
            name = data[4:4 + 32].strip(b'\x00')
            img = data[62:]

        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc compression is supported. The text is xored with 0xA5 and
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        if not (1 <= number <= self.header_record.num_text_pages):
            return ''

        return self.decompress_text(number)

    def extract_content(self, output_dir):
        '''Write index.html plus images into output_dir; return OPF path.'''
        from calibre.ebooks.pml.pmlconverter import pml_to_html

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)

        title = self.mi.title
        if not isinstance(title, unicode_type):
            title = title.decode('utf-8', 'replace')

        html = '<html><head><title>%s</title></head><body>%s</body></html>' % \
            (title, pml_to_html(pml))

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            # Every record from non_text_offset onwards may hold an image.
            for i in range(self.header_record.non_text_offset, len(self.sections)):
                name, img = self.get_image(i)
                if name:
                    images.append(name)
                    with open(name, 'wb') as imgf:
                        self.log.debug('Writing image %s to images/' % name)
                        imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path

    def create_opf(self, output_dir, images):
        # Build metadata.opf describing index.html and the images.
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images/', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        pml = ''

        for i in range(1, self.header_record.num_text_pages + 1):
            pml += self.get_text_page(i)

        return pml

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            # BUG FIX: the old code iterated over
            # self.header_record.num_image_pages starting at
            # self.header_record.image_data_offset — attributes that belong
            # to the 132-byte header and are never set by this file's
            # HeaderRecord, so dump_images always raised AttributeError.
            # Iterate the candidate image sections exactly as
            # extract_content does instead.
            for i in range(self.header_record.non_text_offset, len(self.sections)):
                name, img = self.get_image(i)
                if name:
                    with open(name, 'wb') as imgf:
                        imgf.write(img)

View File

@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Interface defining the necessary public functions for a pdb format reader.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
class FormatReader(object):

    '''
    Interface defining the necessary public functions for a pdb format
    reader. Concrete readers must override both methods.
    '''

    def __init__(self, header, stream, log, options):
        # header: parsed PDB header; stream: open file; log: logger;
        # options: conversion options object.
        raise NotImplementedError()

    def extract_content(self, output_dir):
        # Must write the book's content into output_dir and return the
        # path of an OPF file describing it.
        raise NotImplementedError()

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from Haodoo.net pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
__docformat__ = 'restructuredtext en'
import struct
import os
from calibre import prepare_string_for_xml
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
from polyglot.builtins import range, map
# 8-byte PDB identity codes for Haodoo books: Big5-encoded (BPDB) and
# UTF-16LE-encoded (UPDB) variants.
BPDB_IDENT = b'BOOKMTIT'
UPDB_IDENT = b'BOOKMTIU'

# Punctuation substitution table applied by fix_punct().
# NOTE(review): nearly all keys and values below render as empty strings —
# the original CJK punctuation characters (presumably vertical-form to
# horizontal-form mappings) appear to have been lost in transcription; only
# u"︿" survived. Confirm against the upstream source before relying on
# this table.
punct_table = {
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"︿": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u"": u"",
    u" ": u" ",
}
def fix_punct(line):
    """Return *line* with every mapping in punct_table applied to it."""
    for src, dst in punct_table.items():
        line = line.replace(src, dst)
    return line
class LegacyHeaderRecord(object):

    '''Header record (record 0) of a Big5-encoded (BOOKMTIT) Haodoo file.'''

    def __init__(self, raw):
        # Fields are separated by ESC (0x1b); a tripled ESC is collapsed to
        # a single one before splitting. Layout: title, record count, then
        # one chapter title per remaining field.
        fields = raw.lstrip().replace(b'\x1b\x1b\x1b',
                                      b'\x1b').split(b'\x1b')
        self.title = fix_punct(fields[0].decode('cp950', 'replace'))
        self.num_records = int(fields[1])
        self.chapter_titles = list(map(
            lambda x: fix_punct(x.decode('cp950', 'replace').rstrip('\x00')),
            fields[2:]))
class UnicodeHeaderRecord(object):

    '''Header record (record 0) of a UTF-16LE (BOOKMTIU) Haodoo file.'''

    def __init__(self, raw):
        # Fields are separated by UTF-16LE ESC; a tripled ESC is collapsed
        # first. Unlike the legacy variant, all chapter titles live in
        # fields[2], separated by UTF-16LE CRLF.
        fields = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
                                      b'\x1b\x00').split(b'\x1b\x00')
        self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(fields[1])
        self.chapter_titles = list(map(
            lambda x: fix_punct(x.decode('utf_16_le', 'replace').rstrip('\x00')),
            fields[2].split(b'\r\x00\n\x00')))
class Reader(FormatReader):

    '''
    Reader for Haodoo.net PDB/UPDB files: converts the chapter records to
    a single HTML file plus an OPF.
    '''

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log

        # Cache every PDB record up front; later access is index-based.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        # The identity code determines header layout and text encoding.
        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        # Byte 35 appears to be a format version marker; version 2 files
        # store the author in the first 35 bytes — TODO confirm against the
        # Haodoo format documentation.
        self.stream.seek(35)
        version = struct.unpack('>b', self.stream.read(1))[0]
        if version == 2:
            self.stream.seek(0)
            author = self.stream.read(35).rstrip(b'\x00').decode(self.encoding, 'replace')
            return author
        else:
            return 'Unknown'

    def get_metadata(self):
        mi = MetaInformation(self.header_record.title,
                             [self.author()])
        mi.language = 'zh-tw'

        return mi

    def section_data(self, number):
        # Raw bytes of PDB record `number`.
        return self.sections[number]

    def decompress_text(self, number):
        # Despite the name, records are not compressed — this only decodes
        # and strips trailing NUL padding.
        return self.section_data(number).decode(self.encoding,
                                                'replace').rstrip('\x00')

    def extract_content(self, output_dir):
        txt = ''

        self.log.info(u'Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug(u'\tDecompressing text section %i' % i)
            title = self.header_record.chapter_titles[i-1]
            lines = []
            title_added = False
            for line in self.decompress_text(i).splitlines():
                line = fix_punct(line)
                line = line.strip()
                # Mark the first line containing the chapter title as a
                # heading; all other lines are escaped for XML.
                if not title_added and title in line:
                    line = '<h1 class="chapter">' + line + '</h1>\n'
                    title_added = True
                else:
                    line = prepare_string_for_xml(line)
                lines.append('<p>%s</p>' % line)
            # Fall back to a synthetic heading when no line matched.
            if not title_added:
                lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
            txt += '\n'.join(lines)

        self.log.info(u'Converting text to OEB...')
        html = HTML_TEMPLATE % (self.header_record.title, txt)

        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')

View File

@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read the header data from a pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
import struct
import time
from polyglot.builtins import long_type
class PdbHeaderReader(object):

    '''
    Parse the fixed 78-byte Palm Database (PDB) header and the record list
    that follows it, giving access to individual record payloads.
    '''

    def __init__(self, stream):
        # stream must be a seekable binary file-like object.
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        self.title = self.name()

    def identity(self):
        '''Return the 8-byte type/creator code stored at offset 60.'''
        self.stream.seek(60)
        ident = self.stream.read(8)
        return ident.decode('utf-8')

    def section_count(self):
        '''Return the record count (big-endian uint16 at offset 76).'''
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def name(self):
        '''Return the 32-byte database name with unsafe bytes replaced.'''
        self.stream.seek(0)
        return re.sub(b'[^-A-Za-z0-9 ]+', b'_', self.stream.read(32).replace(b'\x00', b''))

    def full_section_info(self, number):
        '''Return (offset, flags, unique_id) for record `number`.'''
        if not (0 <= number < self.num_sections):
            raise ValueError('Not a valid section number %i' % number)

        self.stream.seek(78 + number * 8)
        # BUG FIX: the old code applied [0] to the unpacked 5-tuple and then
        # tried to tuple-unpack five names from a single int, which always
        # raised TypeError. Unpack the whole tuple instead.
        offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))
        # Byte layout per the PDB spec: 1 attribute byte, then a 3-byte
        # unique id.
        flags, val = a1, a2 << 16 | a3 << 8 | a4
        return (offset, flags, val)

    def section_offset(self, number):
        '''Return the file offset of record `number`'s data.'''
        if not (0 <= number < self.num_sections):
            raise ValueError('Not a valid section number %i' % number)

        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def section_data(self, number):
        '''Return the raw bytes of record `number`.

        A record extends from its offset to the next record's offset, or to
        end-of-file for the last record.
        '''
        if not (0 <= number < self.num_sections):
            raise ValueError('Not a valid section number %i' % number)

        start = self.section_offset(number)
        if number == self.num_sections - 1:
            self.stream.seek(0, 2)
            end = self.stream.tell()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        return self.stream.read(end - start)
class PdbHeaderBuilder(object):
def __init__(self, identity, title):
self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')
def build_header(self, section_lengths, out_stream):
'''
section_lengths = Lenght of each section in file.
'''
now = int(time.time())
nrecords = len(section_lengths)
out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))
offset = 78 + (8 * nrecords) + 2
for id, record in enumerate(section_lengths):
out_stream.write(struct.pack('>LBBBB', long_type(offset), 0, 0, 0, 0))
offset += record
out_stream.write(b'\x00\x00')

View File

@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from palmdoc pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import struct, io
from calibre.ebooks.pdb.formatreader import FormatReader
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Both fields are big-endian unsigned 16-bit integers.
        self.compression = struct.unpack_from('>H', raw, 0)[0]
        self.num_records = struct.unpack_from('>H', raw, 8)[0]
class Reader(FormatReader):

    '''
    Reader for PalmDoc (TEXtREAd) PDB files: decompresses the text records
    and hands the concatenated text to the TXT input plugin.
    '''

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
        self.options = options

        # Cache every PDB record up front; later access is index-based.
        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

    def section_data(self, number):
        # Raw bytes of PDB record `number`.
        return self.sections[number]

    def decompress_text(self, number):
        # 1 = uncompressed; 2 and 258 = PalmDoc LZ77 compression.
        if self.header_record.compression == 1:
            return self.section_data(number)
        if self.header_record.compression == 2 or self.header_record.compression == 258:
            from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(self.section_data(number))
        # Unknown compression: contribute nothing rather than crash.
        return b''

    def extract_content(self, output_dir):
        raw_txt = b''

        self.log.info('Decompressing text...')

        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            raw_txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
        stream = io.BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format
        txt_plugin = plugin_for_input_format('txt')
        # Fill in any TXT-plugin options the caller did not supply.
        for opt in txt_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from palmdoc pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2010, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ptempfile import PersistentTemporaryFile
from polyglot.builtins import range
class Reader(FormatReader):

    '''
    Reader for PDF-in-PDB (.pdfADBE) files: concatenates the records into a
    temporary .pdf file and hands it to the PDF input plugin.
    '''

    def __init__(self, header, stream, log, options):
        self.header = header
        self.stream = stream
        self.log = log
        self.options = options

    def extract_content(self, output_dir):
        self.log.info('Extracting PDF...')

        pdf_file = PersistentTemporaryFile('.pdf')
        pdf_file.close()
        # BUG FIX: the old code passed the (closed) file object itself to
        # open(), which raises TypeError — open() needs the file's path.
        with open(pdf_file.name, 'wb') as pdf:
            for x in range(self.header.section_count()):
                pdf.write(self.header.section_data(x))

        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
        # Fill in any PDF-plugin options the caller did not supply.
        for opt in pdf_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        return pdf_plugin.convert(open(pdf_file.name, 'rb'), self.options, 'pdf', self.log, {})

View File

@@ -0,0 +1,737 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
# Typo fix: was '20011'.
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import struct
import zlib
from collections import OrderedDict
from calibre import CurrentDir
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.utils.imghdr import identify
from calibre.utils.img import save_cover_data_to, Canvas, image_from_data
from polyglot.builtins import codepoint_to_chr, range
# Record data-type codes used in each Plucker section header.
DATATYPE_PHTML = 0
DATATYPE_PHTML_COMPRESSED = 1
DATATYPE_TBMP = 2
DATATYPE_TBMP_COMPRESSED = 3
DATATYPE_MAILTO = 4
DATATYPE_LINK_INDEX = 5
DATATYPE_LINKS = 6
DATATYPE_LINKS_COMPRESSED = 7
DATATYPE_BOOKMARKS = 8
DATATYPE_CATEGORY = 9
DATATYPE_METADATA = 10
DATATYPE_STYLE_SHEET = 11
DATATYPE_FONT_PAGE = 12
DATATYPE_TABLE = 13
DATATYPE_TABLE_COMPRESSED = 14
DATATYPE_COMPOSITE_IMAGE = 15
DATATYPE_PAGELIST_METADATA = 16
DATATYPE_SORTED_URL_INDEX = 17
DATATYPE_SORTED_URL = 18
DATATYPE_SORTED_URL_COMPRESSED = 19
DATATYPE_EXT_ANCHOR_INDEX = 20
DATATYPE_EXT_ANCHOR = 21
DATATYPE_EXT_ANCHOR_COMPRESSED = 22

# IETF IANA MIBenum value for the character set.
# See the http://www.iana.org/assignments/character-sets for valid values.
# Not all character sets are handled by Python. This is a small subset that
# the MIBenum maps to Python standard encodings
# from http://docs.python.org/library/codecs.html#standard-encodings
MIBNUM_TO_NAME = {
    3: 'ascii',
    4: 'latin_1',
    5: 'iso8859_2',
    6: 'iso8859_3',
    7: 'iso8859_4',
    8: 'iso8859_5',
    9: 'iso8859_6',
    10: 'iso8859_7',
    11: 'iso8859_8',
    12: 'iso8859_9',
    13: 'iso8859_10',
    17: 'shift_jis',
    18: 'euc_jp',
    27: 'utf_7',
    36: 'euc_kr',
    37: 'iso2022_kr',
    38: 'euc_kr',
    39: 'iso2022_jp',
    40: 'iso2022_jp_2',
    106: 'utf-8',
    109: 'iso8859_13',
    110: 'iso8859_14',
    111: 'iso8859_15',
    112: 'iso8859_16',
    1013: 'utf_16_be',
    1014: 'utf_16_le',
    1015: 'utf_16',
    2009: 'cp850',
    2010: 'cp852',
    2011: 'cp437',
    2013: 'cp862',
    2025: 'gb2312',
    2026: 'big5',
    2028: 'cp037',
    2043: 'cp424',
    2044: 'cp500',
    2046: 'cp855',
    2047: 'cp857',
    2048: 'cp860',
    2049: 'cp861',
    2050: 'cp863',
    2051: 'cp864',
    2052: 'cp865',
    2054: 'cp869',
    2063: 'cp1026',
    2085: 'hz',
    2086: 'cp866',
    2087: 'cp775',
    2089: 'cp858',
    2091: 'cp1140',
    2102: 'big5hkscs',
    2250: 'cp1250',
    2251: 'cp1251',
    2252: 'cp1252',
    2253: 'cp1253',
    2254: 'cp1254',
    2255: 'cp1255',
    2256: 'cp1256',
    2257: 'cp1257',
    2258: 'cp1258',
}
class HeaderRecord(object):
    '''
    Plucker header. PDB record 0.
    '''

    def __init__(self, raw):
        # uid, compression ("version" in the spec: 2 = zlib, 1 = PalmDoc)
        # and the count of reserved-record entries that follow.
        self.uid, self.compression, self.records = struct.unpack_from('>HHH', raw, 0)

        # uid of the first html file. This should link
        # to other files which in turn may link to others.
        self.home_html = None

        # Reserved-record table: maps record uid -> reserved name code;
        # name code 0 marks the home HTML document.
        self.reserved = {}
        for rec in range(self.records):
            name, rid = struct.unpack_from('>HH', raw, 6 + 4 * rec)
            self.reserved[rid] = name
            if name == 0:
                self.home_html = rid
class SectionHeader(object):
    '''
    Every section (record) has this header. It gives details about the
    section such as its uid.
    '''

    def __init__(self, raw):
        self.uid, = struct.unpack('>H', raw[0:2])
        self.paragraphs, = struct.unpack('>H', raw[2:4])
        self.size, = struct.unpack('>H', raw[4:6])
        # BUG FIX: slice (raw[6:7]) rather than index (raw[6]) — indexing a
        # bytes object on Python 3 yields an int, which struct.unpack
        # rejects with TypeError.
        self.type, = struct.unpack('>B', raw[6:7])
        self.flags, = struct.unpack('>B', raw[7:8])
class SectionHeaderText(object):
    '''
    Sub header for text records: per-paragraph sizes, attributes and
    cumulative offsets into the uncompressed PHTML.
    '''

    def __init__(self, section_header, raw):
        # The uncompressed size of each paragraph.
        self.sizes = []
        # Paragraph attributes.
        self.attributes = []
        for n in range(section_header.paragraphs):
            size, attr = struct.unpack_from('>HH', raw, 4 * n)
            self.sizes.append(size)
            self.attributes.append(attr)

        # Uncompressed offset of each paragraph, measured from the start
        # of the PHTML (running total of the sizes).
        self.paragraph_offsets = []
        total = 0
        for size in self.sizes:
            total += size
            self.paragraph_offsets.append(total)
class SectionMetadata(object):
    '''
    Metadata.

    This does not store metadata such as title, or author.
    That metadata would be best retrieved with the PDB (plucker)
    metadata reader.

    This stores document specific information such as the
    text encoding.

    Note: There is a default encoding but each text section
    can be assigned a different encoding.
    '''

    def __init__(self, raw):
        self.default_encoding = 'latin-1'
        self.exceptional_uid_encodings = {}
        self.owner_id = None

        record_count, = struct.unpack('>H', raw[0:2])

        # Each record is: type (uint16), length in 16-bit words (uint16),
        # followed by `length` words of payload.
        adv = 0
        for i in range(record_count):
            try:
                type, length = struct.unpack_from('>HH', raw, 2 + adv)
            except struct.error:
                # Truncated metadata record; keep what we have so far.
                break

            # CharSet
            if type == 1:
                val, = struct.unpack('>H', raw[6+adv:8+adv])
                self.default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
            # ExceptionalCharSets
            elif type == 2:
                ii_adv = 0
                # BUG FIX: `length / 2` is a float under `from __future__
                # import division`, which range() rejects on Python 3; use
                # floor division. Each (uid, MIBenum) pair is 2 words.
                for ii in range(length // 2):
                    uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv])
                    mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv])
                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'latin-1')
                    ii_adv += 4
            # OwnerID
            elif type == 3:
                # BUG FIX: unpack with a trailing comma (as everywhere else
                # in this class) so owner_id is an int, not a 1-tuple.
                self.owner_id, = struct.unpack('>I', raw[6+adv:10+adv])
            # Author, Title, PubDate
            # Ignored here. The metadata reader plugin
            # will get this info because if it's missing
            # the metadata reader plugin will use fall
            # back data from elsewhere in the file.
            elif type in (4, 5, 6):
                pass
            # Linked Documents
            elif type == 7:
                pass

            # Advance past this record's payload (length is in words).
            adv += 2*length
class SectionText(object):
    '''
    Text data. Stores a text section header and the PHTML.
    '''

    def __init__(self, section_header, raw):
        # The record starts with a 4-byte (size, attributes) entry per
        # paragraph; the PHTML stream follows immediately after it.
        phtml_start = section_header.paragraphs * 4
        self.header = SectionHeaderText(section_header, raw)
        self.data = raw[phtml_start:]
class SectionCompositeImage(object):
    '''
    A composite image consists of a 2D array
    of rows and columns. The entries in the array
    are uid's.
    '''

    def __init__(self, raw):
        self.columns, self.rows = struct.unpack('>HH', raw[0:4])

        # self.layout is a row-major grid:
        #     [[uid, uid, uid, ...],
        #      [uid, uid, uid, ...],
        #      ...]
        # Each entry sits at its final position in the composite and is
        # the uid of the image record to place there.
        self.layout = []
        pos = 4
        for _row in range(self.rows):
            row_uids = []
            for _col in range(self.columns):
                row_uids.append(struct.unpack_from('>H', raw, pos)[0])
                pos += 2
            self.layout.append(row_uids)
class Reader(FormatReader):
    '''
    Convert a plucker archive into HTML.

    TODO:
          * UTF 16 and 32 characters.
          * Margins.
          * Alignment.
          * Font color.
          * DATATYPE_MAILTO
          * DATATYPE_TABLE(_COMPRESSED)
          * DATATYPE_EXT_ANCHOR_INDEX
          * DATATYPE_EXT_ANCHOR(_COMPRESSED)
    '''

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
        self.options = options

        # Mapping of section uid to our internal
        # list of sections.
        self.uid_section_number = OrderedDict()
        self.uid_text_secion_number = OrderedDict()
        self.uid_text_secion_encoding = {}
        self.uid_image_section_number = {}
        self.uid_composite_image_section_number = {}
        self.metadata_section_number = None
        self.default_encoding = 'latin-1'
        self.owner_id = None
        self.sections = []

        # The Plucker record0 header
        self.header_record = HeaderRecord(header.section_data(0))

        for i in range(1, header.num_sections):
            section_number = len(self.sections)
            # The length of the section header.
            # Where the actual data in the section starts.
            start = 8
            section = None

            raw_data = header.section_data(i)
            # Every section has a section header.
            section_header = SectionHeader(raw_data)

            # Store the sections we care about.
            if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
                self.uid_text_secion_number[section_header.uid] = section_number
                section = SectionText(section_header, raw_data[start:])
            elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED):
                self.uid_image_section_number[section_header.uid] = section_number
                section = raw_data[start:]
            elif section_header.type == DATATYPE_METADATA:
                self.metadata_section_number = section_number
                section = SectionMetadata(raw_data[start:])
            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
                self.uid_composite_image_section_number[section_header.uid] = section_number
                section = SectionCompositeImage(raw_data[start:])

            # Store the section.
            if section:
                self.uid_section_number[section_header.uid] = section_number
                self.sections.append((section_header, section))

        # Store useful information from the metadata section locally
        # to make access easier.  Compare against None explicitly: the
        # metadata section can legitimately be section number 0, which
        # is falsy.
        if self.metadata_section_number is not None:
            mdata_section = self.sections[self.metadata_section_number][1]
            for k, v in mdata_section.exceptional_uid_encodings.items():
                self.uid_text_secion_encoding[k] = v
            self.default_encoding = mdata_section.default_encoding
            self.owner_id = mdata_section.owner_id

        # Get the metadata (title, author, ...) with the metadata reader.
        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def extract_content(self, output_dir):
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text record into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
                with open('%s.html' % uid, 'wb') as htmlf:
                    html = u'<html><body>'
                    section_header, section_data = self.sections[num]
                    if section_header.type == DATATYPE_PHTML:
                        html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                        d = self.decompress_phtml(section_data.data)
                        # NOTE(review): process_phtml returns text, so
                        # .decode() on its result looks wrong on Python 3 —
                        # confirm intent before changing encoding handling.
                        html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
                    html += '</body></html>'
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Cache the image sizes in case they are used by a composite image.
        images = set()
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        with CurrentDir(os.path.join(output_dir, 'images/')):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                        images.add(uid)
                        self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                    except Exception as e:
                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
                else:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        col_height = 0
                        for col in row:
                            if col not in images:
                                raise Exception('Image with uid: %s missing.' % col)
                            w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:]
                            row_width += w
                            if col_height < h:
                                col_height = h
                        if width < row_width:
                            width = row_width
                        height += col_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    with Canvas(width, height) as canvas:
                        y_off = 0
                        for row in section_data.layout:
                            x_off = 0
                            largest_height = 0
                            for col in row:
                                im = image_from_data(lopen('%s.jpg' % col, 'rb').read())
                                canvas.compose(im, x_off, y_off)
                                w, h = im.width(), im.height()
                                x_off += w
                                if largest_height < h:
                                    largest_height = h
                            y_off += largest_height
                    # Open for binary writing; the default mode is
                    # read-only and out.write() would fail.
                    with lopen('%s.jpg' % uid, 'wb') as out:
                        out.write(canvas.export(compression_quality=70))
                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                # First uid in insertion order.  dict views are not
                # subscriptable, so .items()[0][0] would raise here.
                home_html = next(iter(self.uid_text_secion_number))
        except Exception:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
        self.options.debug_pipeline = odi

        return oeb

    def decompress_phtml(self, data):
        '''Decompress a PHTML record per the document's compression type.'''
        if self.header_record.compression == 2:
            # zlib compression; DRM (owner id) protected streams are not
            # supported.
            if self.owner_id:
                raise NotImplementedError
            return zlib.decompress(data)
        elif self.header_record.compression == 1:
            from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(data)

    def process_phtml(self, d, paragraph_offsets=()):
        '''
        Convert a raw PHTML byte stream into an HTML fragment.

        paragraph_offsets lists the uncompressed offsets at which new
        paragraphs (with stable "p<N>" ids used as link targets) begin.
        '''
        html = u'<p id="p0">'
        offset = 0
        paragraph_open = True
        link_open = False
        need_set_p_id = False
        p_num = 1
        font_specifier_close = ''

        while offset < len(d):
            if not paragraph_open:
                if need_set_p_id:
                    html += u'<p id="p%s">' % p_num
                    p_num += 1
                    need_set_p_id = False
                else:
                    html += u'<p>'
                paragraph_open = True

            c = ord(d[offset:offset+1])
            # PHTML "functions"
            if c == 0x0:
                offset += 1
                c = ord(d[offset:offset+1])
                # Page link begins
                # 2 Bytes
                # record ID
                if c == 0x0a:
                    offset += 1
                    rid = struct.unpack('>H', d[offset:offset+2])[0]
                    if rid in self.uid_text_secion_number:
                        html += '<a href="%s.html">' % rid
                        link_open = True
                    offset += 1
                # Targeted page link begins
                # 3 Bytes
                # record ID, target
                elif c == 0x0b:
                    offset += 3
                # Paragraph link begins
                # 4 Bytes
                # record ID, paragraph number
                elif c == 0x0c:
                    offset += 1
                    rid = struct.unpack('>H', d[offset:offset+2])[0]
                    offset += 2
                    pid = struct.unpack('>H', d[offset:offset+2])[0]
                    if rid in self.uid_text_secion_number:
                        html += '<a href="%s.html#p%s">' % (rid, pid)
                        link_open = True
                    offset += 1
                # Targeted paragraph link begins
                # 5 Bytes
                # record ID, paragraph number, target
                elif c == 0x0d:
                    offset += 5
                # Link ends
                # 0 Bytes
                elif c == 0x08:
                    if link_open:
                        html += '</a>'
                        link_open = False
                # Set font
                # 1 Bytes
                # font specifier
                elif c == 0x11:
                    offset += 1
                    # Use the same ord(slice) idiom as the rest of this
                    # loop: on Python 2, d[offset] is a 1-char str and
                    # would never compare equal to an int.
                    specifier = ord(d[offset:offset+1])
                    html += font_specifier_close
                    # Regular text
                    if specifier == 0:
                        font_specifier_close = ''
                    # h1
                    elif specifier == 1:
                        html += '<h1>'
                        font_specifier_close = '</h1>'
                    # h2
                    elif specifier == 2:
                        html += '<h2>'
                        font_specifier_close = '</h2>'
                    # h3
                    elif specifier == 3:
                        # Fixed typo: was the invalid tag '<h13>'.
                        html += '<h3>'
                        font_specifier_close = '</h3>'
                    # h4
                    elif specifier == 4:
                        html += '<h4>'
                        font_specifier_close = '</h4>'
                    # h5
                    elif specifier == 5:
                        html += '<h5>'
                        font_specifier_close = '</h5>'
                    # h6
                    elif specifier == 6:
                        html += '<h6>'
                        font_specifier_close = '</h6>'
                    # Bold
                    elif specifier == 7:
                        html += '<b>'
                        font_specifier_close = '</b>'
                    # Fixed-width
                    elif specifier == 8:
                        html += '<tt>'
                        font_specifier_close = '</tt>'
                    # Small
                    elif specifier == 9:
                        html += '<small>'
                        font_specifier_close = '</small>'
                    # Subscript
                    elif specifier == 10:
                        html += '<sub>'
                        font_specifier_close = '</sub>'
                    # Superscript
                    elif specifier == 11:
                        html += '<sup>'
                        font_specifier_close = '</sup>'
                # Embedded image
                # 2 Bytes
                # image record ID
                elif c == 0x1a:
                    offset += 1
                    uid = struct.unpack('>H', d[offset:offset+2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Set margin
                # 2 Bytes
                # left margin, right margin
                elif c == 0x22:
                    offset += 2
                # Alignment of text
                # 1 Bytes
                # alignment
                elif c == 0x29:
                    offset += 1
                # Horizontal rule
                # 3 Bytes
                # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
                elif c == 0x33:
                    offset += 3
                    if paragraph_open:
                        html += u'</p>'
                        paragraph_open = False
                    html += u'<hr />'
                # New line
                # 0 Bytes
                elif c == 0x38:
                    if paragraph_open:
                        html += u'</p>\n'
                        paragraph_open = False
                # Italic text begins
                # 0 Bytes
                elif c == 0x40:
                    html += u'<i>'
                # Italic text ends
                # 0 Bytes
                elif c == 0x48:
                    html += u'</i>'
                # Set text color
                # 3 Bytes
                # 8-bit red, 8-bit green, 8-bit blue
                elif c == 0x53:
                    offset += 3
                # Multiple embedded image
                # 4 Bytes
                # alternate image record ID, image record ID
                elif c == 0x5c:
                    # Skip the alternate image ID; use the image ID.
                    offset += 3
                    uid = struct.unpack('>H', d[offset:offset+2])[0]
                    html += '<img src="images/%s.jpg" />' % uid
                    offset += 1
                # Underline text begins
                # 0 Bytes
                elif c == 0x60:
                    html += u'<u>'
                # Underline text ends
                # 0 Bytes
                elif c == 0x68:
                    html += u'</u>'
                # Strike-through text begins
                # 0 Bytes
                elif c == 0x70:
                    html += u'<s>'
                # Strike-through text ends
                # 0 Bytes
                elif c == 0x78:
                    html += u'</s>'
                # 16-bit Unicode character
                # 3 Bytes
                # alternate text length, 16-bit unicode character
                elif c == 0x83:
                    offset += 3
                # 32-bit Unicode character
                # 5 Bytes
                # alternate text length, 32-bit unicode character
                elif c == 0x85:
                    offset += 5
                # Begin custom font span
                # 6 Bytes
                # font page record ID, X page position, Y page position
                elif c == 0x8e:
                    offset += 6
                # Adjust custom font glyph position
                # 4 Bytes
                # X page position, Y page position
                elif c == 0x8c:
                    offset += 4
                # Change font page
                # 2 Bytes
                # font record ID
                elif c == 0x8a:
                    offset += 2
                # End custom font span
                # 0 Bytes
                elif c == 0x88:
                    pass
                # Begin new table row
                # 0 Bytes
                elif c == 0x90:
                    pass
                # Insert table (or table link)
                # 2 Bytes
                # table record ID
                elif c == 0x92:
                    offset += 2
                # Table cell data
                # 7 Bytes
                # 8-bit alignment, 16-bit image record ID, 8-bit columns, 8-bit rows, 16-bit text length
                elif c == 0x97:
                    offset += 7
                # Exact link modifier
                # 2 Bytes
                # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or
                # Targeted Paragraph Link function to specify an exact byte offset within
                # the paragraph. This function must be followed immediately by the
                # function it modifies).
                elif c == 0x9a:
                    offset += 2
            elif c == 0xa0:
                html += '&nbsp;'
            else:
                html += codepoint_to_chr(c)
            offset += 1
            if offset in paragraph_offsets:
                need_set_p_id = True
                if paragraph_open:
                    html += u'</p>\n'
                    paragraph_open = False

        if paragraph_open:
            html += u'</p>'

        return html

    def get_text_uid_encoding(self, uid):
        # Return the user specified input encoding,
        # otherwise return the alternate encoding specified for the uid,
        # otherwise return the default encoding for the document.
        return self.options.input_encoding if self.options.input_encoding else self.uid_text_secion_encoding.get(uid, self.default_encoding)

View File

@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
class zTXTError(Exception):
    # Raised for any problem encountered while handling zTXT PDB files.
    pass

View File

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from ztxt pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import struct
import zlib
import io
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError
SUPPORTED_VERSION = (1, 40)
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        # Version word packs major in the high byte, minor in the low
        # byte (see the vmajor/vminor split in the reader).
        self.version, self.num_records = struct.unpack('>HH', raw[0:4])
        # Total uncompressed size of the text.
        self.size, = struct.unpack('>L', raw[4:8])
        self.record_size, = struct.unpack('>H', raw[8:10])
        # Compression/random-access flags byte.
        self.flags, = struct.unpack('>B', raw[18:19])
class Reader(FormatReader):
    '''
    Read content from a ztxt PDB file and hand it to the TXT input plugin.
    '''

    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
        self.options = options

        self.sections = []
        for i in range(header.num_sections):
            self.sections.append(header.section_data(i))

        self.header_record = HeaderRecord(self.section_data(0))

        # The version word packs major in the high byte, minor in the
        # low byte.  Only 1.40 and newer is supported.
        vmajor = (self.header_record.version & 0x0000FF00) >> 8
        vminor = self.header_record.version & 0x000000FF
        if vmajor < 1 or (vmajor == 1 and vminor < 40):
            raise zTXTError('Unsupported ztxt version (%i.%i). Only versions newer than %i.%i are supported.' %
                            (vmajor, vminor, SUPPORTED_VERSION[0], SUPPORTED_VERSION[1]))

        if (self.header_record.flags & 0x01) == 0:
            raise zTXTError('Only compression method 1 (random access) is supported')

        # Fixed log typo: was 'Foud'.
        self.log.debug('Found ztxt version: %i.%i' % (vmajor, vminor))

        # Initialize the decompressor
        self.uncompressor = zlib.decompressobj()
        self.uncompressor.decompress(self.section_data(1))

    def section_data(self, number):
        '''Return the raw bytes of PDB section *number*.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''Decompress text record *number* (records start at 1).'''
        # The zlib stream restarts at the first text record, so reset
        # the decompressor when it is requested.
        if number == 1:
            self.uncompressor = zlib.decompressobj()
        return self.uncompressor.decompress(self.section_data(number))

    def extract_content(self, output_dir):
        '''Decompress all text records and convert them via the TXT plugin.'''
        raw_txt = b''

        self.log.info('Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug('\tDecompressing text section %i' % i)
            raw_txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
        stream = io.BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format
        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            # Only fill in options the caller has not already set.
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})