1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-06 01:05:52 +01:00
Files
ebook-converter/ebook_converter/ebooks/pdb/haodoo/reader.py
2020-04-11 19:27:06 +02:00

158 lines
4.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from Haodoo.net pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
__docformat__ = 'restructuredtext en'
import struct
import os
from calibre import prepare_string_for_xml
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
from polyglot.builtins import range, map
BPDB_IDENT = b'BOOKMTIT'
UPDB_IDENT = b'BOOKMTIU'
punct_table = {
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"︿": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u" ": u" ",
}
def fix_punct(line):
for (key, value) in punct_table.items():
line = line.replace(key, value)
return line
class LegacyHeaderRecord(object):
def __init__(self, raw):
fields = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b').split(b'\x1b')
self.title = fix_punct(fields[0].decode('cp950', 'replace'))
self.num_records = int(fields[1])
self.chapter_titles = list(map(
lambda x: fix_punct(x.decode('cp950', 'replace').rstrip('\x00')),
fields[2:]))
class UnicodeHeaderRecord(object):
def __init__(self, raw):
fields = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
b'\x1b\x00').split(b'\x1b\x00')
self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
self.num_records = int(fields[1])
self.chapter_titles = list(map(
lambda x: fix_punct(x.decode('utf_16_le', 'replace').rstrip('\x00')),
fields[2].split(b'\r\x00\n\x00')))
class Reader(FormatReader):
def __init__(self, header, stream, log, options):
self.stream = stream
self.log = log
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
if header.ident == BPDB_IDENT:
self.header_record = LegacyHeaderRecord(self.section_data(0))
self.encoding = 'cp950'
else:
self.header_record = UnicodeHeaderRecord(self.section_data(0))
self.encoding = 'utf_16_le'
def author(self):
self.stream.seek(35)
version = struct.unpack('>b', self.stream.read(1))[0]
if version == 2:
self.stream.seek(0)
author = self.stream.read(35).rstrip(b'\x00').decode(self.encoding, 'replace')
return author
else:
return 'Unknown'
def get_metadata(self):
mi = MetaInformation(self.header_record.title,
[self.author()])
mi.language = 'zh-tw'
return mi
def section_data(self, number):
return self.sections[number]
def decompress_text(self, number):
return self.section_data(number).decode(self.encoding,
'replace').rstrip('\x00')
def extract_content(self, output_dir):
txt = ''
self.log.info(u'Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug(u'\tDecompressing text section %i' % i)
title = self.header_record.chapter_titles[i-1]
lines = []
title_added = False
for line in self.decompress_text(i).splitlines():
line = fix_punct(line)
line = line.strip()
if not title_added and title in line:
line = '<h1 class="chapter">' + line + '</h1>\n'
title_added = True
else:
line = prepare_string_for_xml(line)
lines.append('<p>%s</p>' % line)
if not title_added:
lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
txt += '\n'.join(lines)
self.log.info(u'Converting text to OEB...')
html = HTML_TEMPLATE % (self.header_record.title, txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
mi = self.get_metadata()
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf')