Initial import

2026-04-16 11:03:33 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/pdb/haodoo/reader.py
+++ b/ebook_converter/ebooks/pdb/haodoo/reader.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+'''
+Read content from Haodoo.net pdb file.
+'''
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
+__docformat__ = 'restructuredtext en'
+
+
+import struct
+import os
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.pdb.formatreader import FormatReader
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
+from polyglot.builtins import range, map
+
+# Database identifiers. Reader compares header.ident against BPDB_IDENT to
+# select the legacy cp950 header parser; any other ident is parsed as
+# UTF-16LE. UPDB_IDENT is not referenced by the code visible in this module.
+BPDB_IDENT = b'BOOKMTIT'
+UPDB_IDENT = b'BOOKMTIU'
+
+# Replacement table used by fix_punct(): every occurrence of a key is replaced
+# by its value. The keys are mostly vertical presentation forms of CJK
+# punctuation, mapped to their horizontal counterparts; the ideographic space
+# is mapped to two plain spaces.
+# NOTE(review): the two vertical square brackets (U+FE47 / U+FE48) both map to
+# the full pair "［］", unlike every other open/close pair in the table --
+# confirm whether "［" and "］" were intended.
+punct_table = {
+    u"︵": u"（",
+    u"︶": u"）",
+    u"︷": u"｛",
+    u"︸": u"｝",
+    u"︹": u"〔",
+    u"︺": u"〕",
+    u"︻": u"【",
+    u"︼": u"】",
+    u"︗": u"〖",
+    u"︘": u"〗",
+    u"﹇": u"［］",
+    u"﹈": u"［］",
+    u"︽": u"《",
+    u"︾": u"》",
+    u"︿": u"〈",
+    u"﹀": u"〉",
+    u"﹁": u"「",
+    u"﹂": u"」",
+    u"﹃": u"『",
+    u"﹄": u"』",
+    u"｜": u"—",
+    u"︙": u"…",
+    u"ⸯ": u"～",
+    u"│": u"…",
+    u"￤": u"…",
+    u"　": u"  ",
+    }
+
+
+def fix_punct(line):
+    '''
+    Return ``line`` with every key of ``punct_table`` replaced by its value.
+
+    The replacements are applied one after another, in the table's iteration
+    order, each on the result of the previous one.
+    '''
+    fixed = line
+    for original in punct_table:
+        fixed = fixed.replace(original, punct_table[original])
+    return fixed
+
+
+class LegacyHeaderRecord(object):
+    '''
+    Parsed header record of a cp950-encoded book.
+
+    After stripping leading whitespace bytes, the raw record is split on the
+    ESC byte (a run of three ESC bytes counts as a single separator):
+    field 0 is the book title, field 1 the number of text records and every
+    remaining field is one chapter title.
+    '''
+
+    def __init__(self, raw):
+        '''
+        :param raw: Bytes of the header record.
+
+        Sets ``title``, ``num_records`` and ``chapter_titles``. Undecodable
+        bytes are replaced, and trailing NUL characters are removed from
+        each chapter title.
+        '''
+        collapsed = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b')
+        parts = collapsed.split(b'\x1b')
+
+        self.title = fix_punct(parts[0].decode('cp950', 'replace'))
+        self.num_records = int(parts[1])
+        self.chapter_titles = [
+            fix_punct(chunk.decode('cp950', 'replace').rstrip('\x00'))
+            for chunk in parts[2:]
+        ]
+
+
+class UnicodeHeaderRecord(object):
+    '''
+    Parsed header record of a UTF-16LE-encoded book.
+
+    After stripping leading whitespace bytes, the raw record is split on the
+    two-byte sequence ESC NUL (a run of three such sequences counts as a
+    single separator): field 0 is the book title, field 1 the number of text
+    records and field 2 holds the chapter titles, separated by CR LF encoded
+    as UTF-16LE.
+    '''
+
+    def __init__(self, raw):
+        '''
+        :param raw: Bytes of the header record.
+
+        Sets ``title``, ``num_records`` and ``chapter_titles``. Undecodable
+        bytes are dropped from the book title but replaced in chapter
+        titles; trailing NUL characters are removed from each chapter title.
+        '''
+        separator = b'\x1b\x00'
+        collapsed = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00', separator)
+        parts = collapsed.split(separator)
+
+        self.title = fix_punct(parts[0].decode('utf_16_le', 'ignore'))
+        self.num_records = int(parts[1])
+        self.chapter_titles = [
+            fix_punct(chunk.decode('utf_16_le', 'replace').rstrip('\x00'))
+            for chunk in parts[2].split(b'\r\x00\n\x00')
+        ]
+
+
+class Reader(FormatReader):
+    '''
+    Reader for Haodoo.net PDB books.
+
+    Section 0 is parsed as the header record (book title, number of text
+    records, chapter titles); sections 1..num_records are decoded as the
+    chapter texts and written out as a single HTML file plus an OPF.
+    '''
+
+    def __init__(self, header, stream, log, options):
+        '''
+        :param header: PDB header object; ``num_sections``, ``ident`` and
+                       ``section_data(index)`` are used.
+        :param stream: Seekable binary stream of the PDB file (read again by
+                       :meth:`author`).
+        :param log: Logger providing ``info`` and ``debug``.
+        :param options: Accepted for interface compatibility; not used here.
+        '''
+        self.stream = stream
+        self.log = log
+
+        # Load every section up front so later look-ups are plain indexing.
+        self.sections = [header.section_data(i)
+                         for i in range(header.num_sections)]
+
+        if header.ident == BPDB_IDENT:
+            self.header_record = LegacyHeaderRecord(self.section_data(0))
+            self.encoding = 'cp950'
+        else:
+            self.header_record = UnicodeHeaderRecord(self.section_data(0))
+            self.encoding = 'utf_16_le'
+
+    def author(self):
+        '''
+        Return the author name stored at the start of the file.
+
+        The byte at offset 35 is read as a signed version number. Only when
+        it equals 2 are the first 35 bytes of the stream, stripped of
+        trailing NUL bytes, decoded with the book encoding and returned;
+        otherwise ``'Unknown'`` is returned.
+        '''
+        self.stream.seek(35)
+        version = struct.unpack('>b', self.stream.read(1))[0]
+        if version == 2:
+            self.stream.seek(0)
+            author = self.stream.read(35).rstrip(b'\x00').decode(self.encoding, 'replace')
+            return author
+        else:
+            return 'Unknown'
+
+    def get_metadata(self):
+        '''
+        Return a MetaInformation built from the header title and
+        :meth:`author`, with the language set to ``'zh-tw'``.
+        '''
+        mi = MetaInformation(self.header_record.title,
+                             [self.author()])
+        mi.language = 'zh-tw'
+
+        return mi
+
+    def section_data(self, number):
+        '''Return the raw bytes of section ``number`` (0-based).'''
+        return self.sections[number]
+
+    def decompress_text(self, number):
+        '''
+        Return section ``number`` as text.
+
+        No decompression takes place: the section bytes are decoded with the
+        book encoding (undecodable bytes replaced) and trailing NUL
+        characters are stripped.
+        '''
+        return self.section_data(number).decode(self.encoding,
+                'replace').rstrip('\x00')
+
+    def _chapter_html(self, number):
+        '''
+        Return the HTML markup for the chapter stored in section ``number``.
+
+        The first line containing the chapter title becomes the chapter
+        heading; if no line contains it, a heading with the title is
+        prepended. All other lines become paragraphs. All text is escaped
+        for XML.
+        '''
+        heading = '<h1 class="chapter">%s</h1>\n'
+        title = self.header_record.chapter_titles[number - 1]
+        lines = []
+        title_added = False
+        for line in self.decompress_text(number).splitlines():
+            line = fix_punct(line).strip()
+            # The title match is done on the raw text; escaping happens only
+            # when the markup is produced.
+            escaped = prepare_string_for_xml(line)
+            if not title_added and title in line:
+                # Headings are emitted on their own: <h1> may not be nested
+                # inside <p>.
+                lines.append(heading % escaped)
+                title_added = True
+            else:
+                lines.append('<p>%s</p>' % escaped)
+        if not title_added:
+            lines.insert(0, heading % prepare_string_for_xml(title))
+        return '\n'.join(lines)
+
+    def extract_content(self, output_dir):
+        '''
+        Write the book as ``index.html`` and ``metadata.opf`` into
+        ``output_dir``.
+
+        :param output_dir: Existing directory receiving the output files.
+        :return: Path of the written ``metadata.opf``.
+        '''
+        chapters = []
+
+        self.log.info(u'Decompressing text...')
+        for i in range(1, self.header_record.num_records + 1):
+            self.log.debug(u'\tDecompressing text section %i' % i)
+            chapters.append(self._chapter_html(i))
+        txt = ''.join(chapters)
+
+        self.log.info(u'Converting text to OEB...')
+        # The book title ends up inside the document markup, so it must be
+        # escaped like any other text.
+        html = HTML_TEMPLATE % (
+            prepare_string_for_xml(self.header_record.title), txt)
+        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
+            index.write(html.encode('utf-8'))
+
+        mi = self.get_metadata()
+        manifest = [('index.html', None)]
+        spine = ['index.html']
+        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
+
+        return os.path.join(output_dir, 'metadata.opf')