mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-19 16:25:55 +01:00
153 lines
4.6 KiB
Python
153 lines
4.6 KiB
Python
"""
|
||
Read content from Haodoo.net pdb file.
|
||
"""
|
||
import struct
|
||
import os
|
||
|
||
from ebook_converter import prepare_string_for_xml
|
||
from ebook_converter.ebooks.pdb.formatreader import FormatReader
|
||
from ebook_converter.ebooks.metadata import MetaInformation
|
||
from ebook_converter.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
|
||
|
||
|
||
__license__ = 'GPL v3'
|
||
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
|
||
__docformat__ = 'restructuredtext en'
|
||
|
||
# PDB ident of the legacy variant: when the database header carries this
# value, Reader parses section 0 with LegacyHeaderRecord and decodes text as
# cp950.
BPDB_IDENT = b'BOOKMTIT'
# Companion ident. Reader treats every ident other than BPDB_IDENT as the
# UTF-16-LE variant, so this constant is never compared against in the code
# of this module.
UPDB_IDENT = b'BOOKMTIU'
|
||
|
||
# Substitution table consumed by fix_punct(): each key found in a line of
# text is replaced by its value.  Most keys are Unicode presentation forms
# for vertical layout; the values are their horizontal counterparts.
punct_table = {
    u"︵": u"(",
    u"︶": u")",
    u"︷": u"{",
    u"︸": u"}",
    u"︹": u"〔",
    u"︺": u"〕",
    u"︻": u"【",
    u"︼": u"】",
    u"︗": u"〖",
    u"︘": u"〗",
    # U+FE47 / U+FE48 are the vertical LEFT / RIGHT square brackets.  Each
    # maps to its own single bracket (previously both expanded to the pair
    # "[]", which turned "﹇x﹈" into "[]x[]").
    u"﹇": u"[",
    u"﹈": u"]",
    u"︽": u"《",
    u"︾": u"》",
    u"︿": u"〈",
    u"﹀": u"〉",
    u"﹁": u"「",
    u"﹂": u"」",
    u"﹃": u"『",
    u"﹄": u"』",
    u"|": u"—",
    u"︙": u"…",
    u"ⸯ": u"~",
    u"│": u"…",
    u"¦": u"…",
    # NOTE(review): key and value render identically here; the key is
    # presumably a non-ASCII space variant -- confirm against the upstream
    # file before relying on this entry.
    u" ": u" ",
}
|
||
|
||
|
||
def fix_punct(line):
    """Return *line* with every key of ``punct_table`` replaced by its value.

    The substitutions are applied one after another, in the table's
    iteration order, each operating on the result of the previous one.
    """
    for original in punct_table:
        line = line.replace(original, punct_table[original])
    return line
|
||
|
||
|
||
class LegacyHeaderRecord(object):
    """Header record of a cp950-encoded book, parsed from ESC-delimited bytes.

    Attributes set by the constructor:

    * ``title`` -- first field, decoded as cp950 and passed through
      ``fix_punct``.
    * ``num_records`` -- second field converted with ``int()``.
    * ``chapter_titles`` -- every remaining field, decoded as cp950, with
      trailing NUL characters removed and passed through ``fix_punct``.
    """

    def __init__(self, raw):
        # A run of three ESC bytes is collapsed to one so that all fields
        # end up separated by a single ESC.
        collapsed = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b')
        fields = collapsed.split(b'\x1b')

        self.title = fix_punct(fields[0].decode('cp950', 'replace'))
        self.num_records = int(fields[1])
        self.chapter_titles = [
            fix_punct(chunk.decode('cp950', 'replace').rstrip('\x00'))
            for chunk in fields[2:]
        ]
|
||
|
||
|
||
class UnicodeHeaderRecord(object):
    """Header record of a UTF-16-LE book, parsed from ESC-delimited bytes.

    Attributes set by the constructor:

    * ``title`` -- first field, decoded as UTF-16-LE (undecodable data is
      ignored) and passed through ``fix_punct``.
    * ``num_records`` -- second field converted with ``int()``.
    * ``chapter_titles`` -- third field split on UTF-16-LE CR LF; each piece
      is decoded, stripped of trailing NUL characters and passed through
      ``fix_punct``.
    """

    def __init__(self, raw):
        # A run of three UTF-16-LE ESC units is collapsed to one so that
        # all fields end up separated by a single ESC unit.
        collapsed = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
                                         b'\x1b\x00')
        fields = collapsed.split(b'\x1b\x00')

        self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(fields[1])
        self.chapter_titles = [
            fix_punct(entry.decode('utf_16_le', 'replace').rstrip('\x00'))
            for entry in fields[2].split(b'\r\x00\n\x00')
        ]
|
||
|
||
|
||
class Reader(FormatReader):
    """Reader for Haodoo.net PDB books.

    Two variants are handled: databases whose ident equals ``BPDB_IDENT``
    are decoded as cp950, every other ident is decoded as UTF-16-LE.
    """

    def __init__(self, header, stream, log, options):
        """Load all sections and parse the header record (section 0).

        :param header: PDB header object; ``ident``, ``num_sections`` and
            ``section_data(i)`` are read from it.
        :param stream: seekable binary stream of the PDB file, kept for
            ``author()``.
        :param log: logger exposing ``info()`` and ``debug()``.
        :param options: accepted for interface compatibility; unused here.
        """
        # The base class initialiser is intentionally not called, exactly
        # as in the original implementation.
        self.stream = stream
        self.log = log

        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]

        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        """Return the author stored at the start of the stream.

        The signed byte at offset 35 is read as a version marker.  Only
        when it equals 2 are the first 35 bytes of the stream decoded as
        the author name; otherwise ``'Unknown'`` is returned.
        """
        self.stream.seek(35)
        version = struct.unpack('>b', self.stream.read(1))[0]
        if version != 2:
            return 'Unknown'

        self.stream.seek(0)
        field = self.stream.read(35)
        raw = field.rstrip(b'\x00')
        # Stripping NUL *bytes* from UTF-16-LE data can also remove the zero
        # high byte of the final code unit (e.g. a trailing ASCII letter),
        # which would then decode to U+FFFD.  Put that byte back, but only
        # if a byte really was stripped.
        if (self.encoding == 'utf_16_le' and len(raw) % 2
                and len(raw) < len(field)):
            raw += b'\x00'
        return raw.decode(self.encoding, 'replace')

    def get_metadata(self):
        """Return a ``MetaInformation`` with title, author and language.

        The language is always set to ``'zh-tw'``.
        """
        mi = MetaInformation(self.header_record.title,
                             [self.author()])
        mi.language = 'zh-tw'

        return mi

    def section_data(self, number):
        """Return the raw bytes of section *number*."""
        return self.sections[number]

    def decompress_text(self, number):
        """Return section *number* decoded with the book's encoding.

        Despite the name, no decompression happens here: the section bytes
        are decoded (undecodable data is replaced) and trailing NUL
        characters are removed.
        """
        return self.section_data(number).decode(self.encoding,
                                                'replace').rstrip('\x00')

    @staticmethod
    def _chapter_heading(text):
        """Return the chapter ``<h1>`` markup for *text*, XML-escaped."""
        return ('<h1 class="chapter">' + prepare_string_for_xml(text) +
                '</h1>\n')

    def extract_content(self, output_dir):
        """Write ``index.html`` and ``metadata.opf`` into *output_dir*.

        Every text section becomes one chapter.  The first line containing
        the chapter title is emitted as an ``<h1 class="chapter">`` heading;
        if no line contains it, a heading built from the title is put in
        front of the chapter.  All other lines become ``<p>`` paragraphs.
        Heading and paragraph text are XML-escaped, and headings are written
        as standalone elements rather than nested inside ``<p>``.

        :param output_dir: directory receiving the generated files.
        :returns: path of the written ``metadata.opf``.
        """
        chapters = []

        self.log.info(u'Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug(u'\tDecompressing text section %i' % i)
            title = self.header_record.chapter_titles[i-1]
            lines = []
            title_added = False
            for line in self.decompress_text(i).splitlines():
                line = fix_punct(line).strip()
                if not title_added and title in line:
                    lines.append(self._chapter_heading(line))
                    title_added = True
                else:
                    lines.append('<p>%s</p>' % prepare_string_for_xml(line))
            if not title_added:
                lines.insert(0, self._chapter_heading(title))
            chapters.append('\n'.join(lines))
        # Chapters are concatenated without a separator, as before.
        txt = ''.join(chapters)

        self.log.info(u'Converting text to OEB...')
        # NOTE(review): the book title is interpolated unescaped; whether
        # that is safe depends on HTML_TEMPLATE, which is defined elsewhere
        # -- confirm.
        html = HTML_TEMPLATE % (self.header_record.title, txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')
|