1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-23 10:35:49 +01:00
Files
ebook-converter/ebook_converter/ebooks/pdb/haodoo/reader.py
gryf 0f9792df36 Convert calibre modules to ebook_converter.
Here is the first batch of modules, which are needed for converting
several formats to LRF. Some of the logic has been change, more cleanups
will follow.
2020-04-19 15:16:48 +02:00

158 lines
4.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from Haodoo.net pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
__docformat__ = 'restructuredtext en'
import struct
import os
from ebook_converter import prepare_string_for_xml
from ebook_converter.ebooks.pdb.formatreader import FormatReader
from ebook_converter.ebooks.metadata import MetaInformation
from ebook_converter.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
from ebook_converter.polyglot.builtins import range, map
BPDB_IDENT = b'BOOKMTIT'
UPDB_IDENT = b'BOOKMTIU'
punct_table = {
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"︿": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u" ": u" ",
}
def fix_punct(line):
for (key, value) in punct_table.items():
line = line.replace(key, value)
return line
class LegacyHeaderRecord(object):
def __init__(self, raw):
fields = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b').split(b'\x1b')
self.title = fix_punct(fields[0].decode('cp950', 'replace'))
self.num_records = int(fields[1])
self.chapter_titles = list(map(
lambda x: fix_punct(x.decode('cp950', 'replace').rstrip('\x00')),
fields[2:]))
class UnicodeHeaderRecord(object):
def __init__(self, raw):
fields = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
b'\x1b\x00').split(b'\x1b\x00')
self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
self.num_records = int(fields[1])
self.chapter_titles = list(map(
lambda x: fix_punct(x.decode('utf_16_le', 'replace').rstrip('\x00')),
fields[2].split(b'\r\x00\n\x00')))
class Reader(FormatReader):
def __init__(self, header, stream, log, options):
self.stream = stream
self.log = log
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
if header.ident == BPDB_IDENT:
self.header_record = LegacyHeaderRecord(self.section_data(0))
self.encoding = 'cp950'
else:
self.header_record = UnicodeHeaderRecord(self.section_data(0))
self.encoding = 'utf_16_le'
def author(self):
self.stream.seek(35)
version = struct.unpack('>b', self.stream.read(1))[0]
if version == 2:
self.stream.seek(0)
author = self.stream.read(35).rstrip(b'\x00').decode(self.encoding, 'replace')
return author
else:
return 'Unknown'
def get_metadata(self):
mi = MetaInformation(self.header_record.title,
[self.author()])
mi.language = 'zh-tw'
return mi
def section_data(self, number):
return self.sections[number]
def decompress_text(self, number):
return self.section_data(number).decode(self.encoding,
'replace').rstrip('\x00')
def extract_content(self, output_dir):
txt = ''
self.log.info(u'Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug(u'\tDecompressing text section %i' % i)
title = self.header_record.chapter_titles[i-1]
lines = []
title_added = False
for line in self.decompress_text(i).splitlines():
line = fix_punct(line)
line = line.strip()
if not title_added and title in line:
line = '<h1 class="chapter">' + line + '</h1>\n'
title_added = True
else:
line = prepare_string_for_xml(line)
lines.append('<p>%s</p>' % line)
if not title_added:
lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
txt += '\n'.join(lines)
self.log.info(u'Converting text to OEB...')
html = HTML_TEMPLATE % (self.header_record.title, txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
mi = self.get_metadata()
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf')