1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-10 07:03:35 +02:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from Haodoo.net pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen <kanru@kanru.info>'
__docformat__ = 'restructuredtext en'
import struct
import os
from calibre import prepare_string_for_xml
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE
from polyglot.builtins import range, map
# Header ident for which Reader parses record 0 with LegacyHeaderRecord and
# decodes all text as cp950.
BPDB_IDENT = b'BOOKMTIT'
# Companion ident. Reader never compares against it directly: every ident
# other than BPDB_IDENT is handled as the UTF-16LE variant.
UPDB_IDENT = b'BOOKMTIU'
punct_table = {
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"︿": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u"": u"",
u" ": u" ",
}
def fix_punct(line):
    '''
    Apply every substitution listed in ``punct_table`` to ``line``.

    The entries are applied one after another, in the table's iteration
    order, each replacing all occurrences of its key with its value.

    :param line: text to normalise.
    :return: the text after all replacements.
    '''
    for old in punct_table:
        line = line.replace(old, punct_table[old])
    return line
class LegacyHeaderRecord(object):
    '''
    Parsed form of the header record of a cp950-encoded book.

    The raw record is treated as ESC (0x1b) separated fields: the book
    title, the number of text records, then one field per chapter title.
    A run of three ESC bytes is collapsed to a single separator first.

    Attributes set by the constructor:

    * ``title`` -- book title, decoded from cp950 and passed through
      ``fix_punct``.
    * ``num_records`` -- integer parsed from the second field.
    * ``chapter_titles`` -- list of decoded chapter titles with trailing
      NUL characters removed.
    '''

    def __init__(self, raw):
        '''
        :param raw: bytes of the header record.
        :raises IndexError: if fewer than two fields are present.
        :raises ValueError: if the second field is not an integer.
        '''
        collapsed = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b')
        parts = collapsed.split(b'\x1b')

        self.title = fix_punct(parts[0].decode('cp950', 'replace'))
        self.num_records = int(parts[1])
        self.chapter_titles = [
            fix_punct(chunk.decode('cp950', 'replace').rstrip('\x00'))
            for chunk in parts[2:]
        ]
class UnicodeHeaderRecord(object):
    '''
    Parsed form of the header record of a UTF-16LE encoded book.

    Fields are separated by a UTF-16LE ESC (``1b 00``); a run of three such
    separators is collapsed to one before splitting. The first field is the
    book title, the second the number of text records, and the third holds
    all chapter titles separated by a UTF-16LE CR LF pair.

    Attributes set by the constructor:

    * ``title`` -- book title; undecodable bytes are dropped (``ignore``).
    * ``num_records`` -- integer parsed from the second field.
    * ``chapter_titles`` -- list of chapter titles; undecodable bytes are
      replaced (``replace``) and trailing NUL characters removed.
    '''

    def __init__(self, raw):
        '''
        :param raw: bytes of the header record.
        :raises IndexError: if fewer than three fields are present.
        :raises ValueError: if the second field is not an integer.
        '''
        collapsed = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
                                         b'\x1b\x00')
        parts = collapsed.split(b'\x1b\x00')

        self.title = fix_punct(parts[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(parts[1])
        self.chapter_titles = [
            fix_punct(name.decode('utf_16_le', 'replace').rstrip('\x00'))
            for name in parts[2].split(b'\r\x00\n\x00')
        ]
class Reader(FormatReader):
    '''
    Reader that turns a Haodoo.net pdb book into ``index.html`` plus
    ``metadata.opf``.

    Section 0 is the header record (book title, record count, chapter
    titles); sections ``1..num_records`` hold the text of one chapter each.
    The text encoding is chosen from the header ident: cp950 for
    ``BPDB_IDENT``, UTF-16LE for anything else.
    '''

    def __init__(self, header, stream, log, options):
        '''
        :param header: object exposing ``num_sections``, ``ident`` and
            ``section_data(index)``.
        :param stream: seekable binary stream of the whole file; re-read by
            :meth:`author`.
        :param log: logger providing ``info`` and ``debug``.
        :param options: accepted for interface compatibility; not used here.
        '''
        self.stream = stream
        self.log = log
        # Load every section up front so later access is a plain list lookup.
        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]

        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        '''
        Return the author name stored at the start of the file.

        The byte at stream offset 35 is read as a signed version marker.
        Only when it equals 2 are the first 35 bytes of the stream (with
        trailing NUL bytes stripped) decoded as the author; otherwise the
        literal ``'Unknown'`` is returned.
        '''
        self.stream.seek(35)
        version = struct.unpack('>b', self.stream.read(1))[0]
        if version != 2:
            return 'Unknown'

        self.stream.seek(0)
        raw_author = self.stream.read(35).rstrip(b'\x00')
        return raw_author.decode(self.encoding, 'replace')

    def get_metadata(self):
        '''
        Build a ``MetaInformation`` from the header title and
        :meth:`author`, with the language fixed to ``zh-tw``.
        '''
        mi = MetaInformation(self.header_record.title,
                             [self.author()])
        mi.language = 'zh-tw'
        return mi

    def section_data(self, number):
        '''Return the raw bytes of section ``number`` (0-based).'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Return section ``number`` decoded with the book's encoding, with
        undecodable bytes replaced and trailing NUL characters removed.
        '''
        return self.section_data(number).decode(self.encoding,
                                                'replace').rstrip('\x00')

    def extract_content(self, output_dir):
        '''
        Write ``index.html`` and ``metadata.opf`` into ``output_dir``.

        Each text record becomes one chapter: the first line containing the
        chapter title is emitted as ``<h1 class="chapter">``, every other
        line as a ``<p>``. When no line contains the title, a heading with
        the title is inserted at the top of the chapter. All text, headings
        included, is XML-escaped before being placed in markup.

        :param output_dir: directory that receives the generated files.
        :return: path of the written ``metadata.opf``.
        '''
        titles = self.header_record.chapter_titles
        chapters = []

        self.log.info(u'Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug(u'\tDecompressing text section %i' % i)
            # The header may list fewer titles than records; such chapters
            # simply get no heading instead of raising IndexError.
            title = titles[i-1] if i <= len(titles) else ''
            lines = []
            title_added = False
            for line in self.decompress_text(i).splitlines():
                line = fix_punct(line).strip()
                escaped = prepare_string_for_xml(line)
                # An empty title is contained in every string, so it must be
                # excluded explicitly or the first line would always match.
                if title and not title_added and title in line:
                    # Headings are block-level: emit them on their own, not
                    # nested inside a <p>.
                    lines.append('<h1 class="chapter">' + escaped + '</h1>\n')
                    title_added = True
                else:
                    lines.append('<p>%s</p>' % escaped)
            if title and not title_added:
                lines.insert(0, '<h1 class="chapter">' +
                             prepare_string_for_xml(title) + '</h1>\n')
            chapters.append('\n'.join(lines))
        # Chapters are concatenated without a separator, as before.
        txt = ''.join(chapters)

        self.log.info(u'Converting text to OEB...')
        # The book title is raw text too, so escape it before it lands in
        # the HTML template.
        html = HTML_TEMPLATE % (
            prepare_string_for_xml(self.header_record.title), txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')