mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-29 01:25:46 +01:00
Here is the first batch of modules needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
356 lines
14 KiB
Python
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct, re, os

from ebook_converter import replace_entities
from ebook_converter.utils.date import parse_date
from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.utils.localization import canonicalize_lang
from ebook_converter.utils.config_base import tweaks
from ebook_converter.polyglot.builtins import unicode_type

NULL_INDEX = 0xffffffff

def uniq(vals):
    ''' Remove all duplicates from vals, while preserving order. '''
    vals = vals or ()
    seen = set()
    seen_add = seen.add
    return list(x for x in vals if x not in seen and not seen_add(x))
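
# For example, uniq(['a', 'b', 'a', 'c', 'b']) returns ['a', 'b', 'c']; the
# seen_add alias lets the generator expression both test membership and
# record each new value in a single pass.
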
class EXTHHeader(object):  # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x: clean_ascii_chars(x.decode(codec, 'replace'))
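
        # Each EXTH record is a (type, length, payload) triple: a 4-byte
        # big-endian record type followed by a 4-byte record length that
        # includes the 8-byte record header itself, which is why the payload
        # below is raw[pos + 8:pos + size] and pos advances by the full
        # record size.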
        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # A cdetype of EBSP marks an Amazon sample book
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact, when
                # MOBI files are sent through Amazon's email service, titles
                # containing non-ASCII or non-filename-safe characters get
                # mangled in the PDB header.
                try:
                    title = self.decode(content)
                except Exception:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except Exception:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #     print('unknown record', idx, repr(content))
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
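
    # EXTH records 100-199 carry standard book metadata; the ids handled
    # below map to author (100), publisher (101), description (103), ISBN
    # (104), subject tags (105), publication date (106), contributor (108),
    # rights (109), source (112), ASIN (113), start-reading offset (116)
    # and the KF8 header index (121).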
    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon MOBI files are usually in LN, FN
            # format; try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
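            # For example, 'Doyle, Arthur Conan' is appended as
            # 'Arthur Conan Doyle' (unless the author_sort_copy_method
            # tweak is 'copy'), while author_sort, if unset, keeps the
            # original 'Doyle, Arthur Conan' form.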
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in
                                 clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = uniq(self.mi.tags)
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
            except Exception:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except Exception:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except Exception:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
        # else:
        #     print('unhandled metadata record', idx, repr(content))
# }}}


class BookHeader(object):

    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == b'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])
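
            # The offsets below are into the first PDB record (raw): 0x54
            # holds the full-title offset and length, 0x5C the locale code,
            # 0x68 the MOBI version, 0x6C the first image record index,
            # 0x80 the EXTH flags and 0xF4 the NCX index.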
            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                }[self.codepage]
            except (IndexError, KeyError):
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (
                    self.codepage, self.codec))
            # Some KF8 files have header length == 264 (generated by
            # kindlegen 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 to allow for future versions of kindlegen

            if (ident == b'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
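
            # A compression_type of b'DH' (0x4448) marks HUFF/CDIC
            # (Huffman) compressed text, so record the location and count
            # of the HUFF/CDIC records.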
            if self.compression_type == b'DH':
                self.huff_offset, self.huff_number = struct.unpack(
                    '>LL', raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
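            # The locale field packs the primary language id into the low
            # byte and the dialect/sub-language id into bits 10-17, hence
            # the mask and shift below.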
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode_type):
                self.title = self.title.decode(self.codec, 'replace')
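            # Bit 0x40 of the EXTH flags signals that an EXTH metadata
            # block follows this MOBI header.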
            if self.exth_flag & 0x40:
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:],
                                           self.codec, self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

        self.ncxidx = NULL_INDEX
        if len(raw) >= 0xF8:
            self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
            self.dividx, self.skelidx, self.datpidx, self.othidx = \
                struct.unpack_from(b'>4L', raw, 0xF8)

            # The FDST record is needed to properly unpack the raw_ml into
            # its flow pieces: it is simply a table of start and end
            # locations for each flow piece.
            self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
            # If the count is 1 or less, the FDST section number can be
            # garbage
            if self.fdstcnt <= 1:
                self.fdstidx = NULL_INDEX
        else:  # Null values
            self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                NULL_INDEX


class MetadataHeader(BookHeader):

    def __init__(self, stream, log):
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            self.exth = None

    @property
    def kf8_type(self):
        if (self.mobi_version == 8 and
                getattr(self, 'skelidx', NULL_INDEX) != NULL_INDEX):
            return 'standalone'
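
        # A joint MOBI6/KF8 file separates the two books with a 'BOUNDARY'
        # record placed immediately before the KF8 header, which is what
        # the check below looks for.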
        kf8_header_index = getattr(self.exth, 'kf8_header', None)
        if kf8_header_index is None:
            return None
        try:
            if self.section_data(kf8_header_index - 1) == b'BOUNDARY':
                return 'joint'
        except Exception:
            pass
        return None
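
    # The PalmDB container header stores the 4-byte type and creator codes
    # at offset 60, the 2-byte record count at offset 76, and the record
    # info list (8 bytes per record: a 4-byte offset plus attribute/id
    # bytes) from offset 78 onwards; the three methods below read those
    # fields directly from the stream.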
    def identity(self):
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        section_headers = []
        # First section with the metadata
        section_headers.append(self.section_offset(0))
        # Second section used to get the length of the first
        section_headers.append(self.section_offset(1))

        end_off = section_headers[1]
        off = section_headers[0]
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def section_data(self, number):
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            end = os.stat(self.stream.name).st_size
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        try:
            return self.stream.read(end - start)
        except OverflowError:
            self.stream.seek(start)
            return self.stream.read()
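
# Illustrative usage sketch (assumptions: a MOBI file at 'book.mobi' and a
# log object providing warn()/exception(), as BookHeader expects):
#
#     with open('book.mobi', 'rb') as stream:
#         mh = MetadataHeader(stream, log)
#         print(mh.ident, mh.num_sections, mh.kf8_type)
#         if mh.exth is not None:
#             print(mh.exth.mi.title, mh.exth.mi.authors)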