mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-15 06:03:38 +01:00
Initial import
This commit is contained in:
355
ebook_converter/ebooks/mobi/reader/headers.py
Normal file
355
ebook_converter/ebooks/mobi/reader/headers.py
Normal file
@@ -0,0 +1,355 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, re, os
|
||||
|
||||
from calibre import replace_entities
|
||||
from calibre.utils.date import parse_date
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
|
||||
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
from calibre.utils.config_base import tweaks
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
NULL_INDEX = 0xffffffff
|
||||
|
||||
|
||||
def uniq(vals):
|
||||
''' Remove all duplicates from vals, while preserving order. '''
|
||||
vals = vals or ()
|
||||
seen = set()
|
||||
seen_add = seen.add
|
||||
return list(x for x in vals if x not in seen and not seen_add(x))
|
||||
|
||||
|
||||
class EXTHHeader(object): # {{{
|
||||
|
||||
def __init__(self, raw, codec, title):
|
||||
self.doctype = raw[:4]
|
||||
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
|
||||
raw = raw[12:]
|
||||
pos = 0
|
||||
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
|
||||
self.has_fake_cover = True
|
||||
self.start_offset = None
|
||||
left = self.num_items
|
||||
self.kf8_header = None
|
||||
self.uuid = self.cdetype = None
|
||||
self.page_progression_direction = None
|
||||
self.primary_writing_mode = None
|
||||
|
||||
self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))
|
||||
|
||||
while left > 0:
|
||||
left -= 1
|
||||
idx, size = struct.unpack('>LL', raw[pos:pos + 8])
|
||||
content = raw[pos + 8:pos + size]
|
||||
pos += size
|
||||
if idx >= 100 and idx < 200:
|
||||
self.process_metadata(idx, content, codec)
|
||||
elif idx == 203:
|
||||
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
|
||||
elif idx == 201:
|
||||
co, = struct.unpack('>L', content)
|
||||
if co < NULL_INDEX:
|
||||
self.cover_offset = co
|
||||
elif idx == 202:
|
||||
self.thumbnail_offset, = struct.unpack('>L', content)
|
||||
elif idx == 501:
|
||||
try:
|
||||
self.cdetype = content.decode('ascii')
|
||||
except UnicodeDecodeError:
|
||||
self.cdetype = None
|
||||
# cdetype
|
||||
if content == b'EBSP':
|
||||
if not self.mi.tags:
|
||||
self.mi.tags = []
|
||||
self.mi.tags.append(_('Sample Book'))
|
||||
elif idx == 502:
|
||||
# last update time
|
||||
pass
|
||||
elif idx == 503: # Long title
|
||||
# Amazon seems to regard this as the definitive book title
|
||||
# rather than the title from the PDB header. In fact when
|
||||
# sending MOBI files through Amazon's email service if the
|
||||
# title contains non ASCII chars or non filename safe chars
|
||||
# they are messed up in the PDB header
|
||||
try:
|
||||
title = self.decode(content)
|
||||
except Exception:
|
||||
pass
|
||||
elif idx == 524: # Lang code
|
||||
try:
|
||||
lang = content.decode(codec)
|
||||
lang = canonicalize_lang(lang)
|
||||
if lang:
|
||||
self.mi.language = lang
|
||||
except Exception:
|
||||
pass
|
||||
elif idx == 525:
|
||||
try:
|
||||
pwm = content.decode(codec)
|
||||
if pwm:
|
||||
self.primary_writing_mode = pwm
|
||||
except Exception:
|
||||
pass
|
||||
elif idx == 527:
|
||||
try:
|
||||
ppd = content.decode(codec)
|
||||
if ppd:
|
||||
self.page_progression_direction = ppd
|
||||
except Exception:
|
||||
pass
|
||||
# else:
|
||||
# print 'unknown record', idx, repr(content)
|
||||
if title:
|
||||
self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
|
||||
|
||||
def process_metadata(self, idx, content, codec):
|
||||
if idx == 100:
|
||||
if self.mi.is_null('authors'):
|
||||
self.mi.authors = []
|
||||
au = clean_xml_chars(self.decode(content).strip())
|
||||
# Author names in Amazon MOBI files are usually in LN, FN format,
|
||||
# try to detect and auto-correct that.
|
||||
m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
|
||||
if m is not None:
|
||||
if tweaks['author_sort_copy_method'] != 'copy':
|
||||
self.mi.authors.append(m.group(2) + ' ' + m.group(1))
|
||||
else:
|
||||
self.mi.authors.append(m.group())
|
||||
if self.mi.is_null('author_sort'):
|
||||
self.mi.author_sort = m.group()
|
||||
else:
|
||||
self.mi.authors.append(au)
|
||||
elif idx == 101:
|
||||
self.mi.publisher = clean_xml_chars(self.decode(content).strip())
|
||||
if self.mi.publisher in {'Unknown', _('Unknown')}:
|
||||
self.mi.publisher = None
|
||||
elif idx == 103:
|
||||
self.mi.comments = clean_xml_chars(self.decode(content).strip())
|
||||
elif idx == 104:
|
||||
raw = check_isbn(self.decode(content).strip().replace('-', ''))
|
||||
if raw:
|
||||
self.mi.isbn = raw
|
||||
elif idx == 105:
|
||||
if not self.mi.tags:
|
||||
self.mi.tags = []
|
||||
self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
|
||||
self.mi.tags = uniq(self.mi.tags)
|
||||
elif idx == 106:
|
||||
try:
|
||||
self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
|
||||
except Exception:
|
||||
pass
|
||||
elif idx == 108:
|
||||
self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
|
||||
elif idx == 109:
|
||||
self.mi.rights = clean_xml_chars(self.decode(content).strip())
|
||||
elif idx == 112: # dc:source set in some EBSP amazon samples
|
||||
try:
|
||||
content = content.decode(codec).strip()
|
||||
isig = 'urn:isbn:'
|
||||
if content.lower().startswith(isig):
|
||||
raw = check_isbn(content[len(isig):])
|
||||
if raw and not self.mi.isbn:
|
||||
self.mi.isbn = raw
|
||||
elif content.startswith('calibre:'):
|
||||
# calibre book uuid is stored here by recent calibre
|
||||
# releases
|
||||
cid = content[len('calibre:'):]
|
||||
if cid:
|
||||
self.mi.application_id = self.mi.uuid = cid
|
||||
except:
|
||||
pass
|
||||
elif idx == 113: # ASIN or other id
|
||||
try:
|
||||
self.uuid = content.decode('ascii')
|
||||
self.mi.set_identifier('mobi-asin', self.uuid)
|
||||
except Exception:
|
||||
self.uuid = None
|
||||
elif idx == 116:
|
||||
self.start_offset, = struct.unpack(b'>L', content)
|
||||
elif idx == 121:
|
||||
self.kf8_header, = struct.unpack(b'>L', content)
|
||||
if self.kf8_header == NULL_INDEX:
|
||||
self.kf8_header = None
|
||||
# else:
|
||||
# print 'unhandled metadata record', idx, repr(content)
|
||||
# }}}
|
||||
|
||||
|
||||
class BookHeader(object):
|
||||
|
||||
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
|
||||
self.log = log
|
||||
self.compression_type = raw[:2]
|
||||
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
|
||||
self.encryption_type, = struct.unpack('>H', raw[12:14])
|
||||
if ident == b'TEXTREAD':
|
||||
self.codepage = 1252
|
||||
if len(raw) <= 16:
|
||||
self.codec = 'cp1252'
|
||||
self.extra_flags = 0
|
||||
self.title = _('Unknown')
|
||||
self.language = 'ENGLISH'
|
||||
self.sublanguage = 'NEUTRAL'
|
||||
self.exth_flag, self.exth = 0, None
|
||||
self.ancient = True
|
||||
self.first_image_index = -1
|
||||
self.mobi_version = 1
|
||||
else:
|
||||
self.ancient = False
|
||||
self.doctype = raw[16:20]
|
||||
self.length, self.type, self.codepage, self.unique_id, \
|
||||
self.version = struct.unpack('>LLLLL', raw[20:40])
|
||||
|
||||
try:
|
||||
self.codec = {
|
||||
1252: 'cp1252',
|
||||
65001: 'utf-8',
|
||||
}[self.codepage]
|
||||
except (IndexError, KeyError):
|
||||
self.codec = 'cp1252' if not user_encoding else user_encoding
|
||||
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
|
||||
self.codec))
|
||||
# Some KF8 files have header length == 264 (generated by kindlegen
|
||||
# 2.9?). See https://bugs.launchpad.net/bugs/1179144
|
||||
max_header_length = 500 # We choose 500 for future versions of kindlegen
|
||||
|
||||
if (ident == b'TEXTREAD' or self.length < 0xE4 or
|
||||
self.length > max_header_length or
|
||||
(try_extra_data_fix and self.length == 0xE4)):
|
||||
self.extra_flags = 0
|
||||
else:
|
||||
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
|
||||
|
||||
if self.compression_type == b'DH':
|
||||
self.huff_offset, self.huff_number = struct.unpack('>LL',
|
||||
raw[0x70:0x78])
|
||||
|
||||
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
|
||||
tend = toff + tlen
|
||||
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
|
||||
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
|
||||
langid = langcode & 0xFF
|
||||
sublangid = (langcode >> 10) & 0xFF
|
||||
self.language = main_language.get(langid, 'ENGLISH')
|
||||
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
|
||||
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
|
||||
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
|
||||
|
||||
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
|
||||
self.exth = None
|
||||
if not isinstance(self.title, unicode_type):
|
||||
self.title = self.title.decode(self.codec, 'replace')
|
||||
if self.exth_flag & 0x40:
|
||||
try:
|
||||
self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
|
||||
self.title)
|
||||
self.exth.mi.uid = self.unique_id
|
||||
if self.exth.mi.is_null('language'):
|
||||
try:
|
||||
self.exth.mi.language = mobi2iana(langid, sublangid)
|
||||
except:
|
||||
self.log.exception('Unknown language code')
|
||||
except:
|
||||
self.log.exception('Invalid EXTH header')
|
||||
self.exth_flag = 0
|
||||
|
||||
self.ncxidx = NULL_INDEX
|
||||
if len(raw) >= 0xF8:
|
||||
self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
|
||||
|
||||
# Ancient PRC files from Baen can have random values for
|
||||
# mobi_version, so be conservative
|
||||
if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
|
||||
self.dividx, self.skelidx, self.datpidx, self.othidx = \
|
||||
struct.unpack_from(b'>4L', raw, 0xF8)
|
||||
|
||||
# need to use the FDST record to find out how to properly
|
||||
# unpack the raw_ml into pieces it is simply a table of start
|
||||
# and end locations for each flow piece
|
||||
self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
|
||||
# if cnt is 1 or less, fdst section number can be garbage
|
||||
if self.fdstcnt <= 1:
|
||||
self.fdstidx = NULL_INDEX
|
||||
else: # Null values
|
||||
self.skelidx = self.dividx = self.othidx = self.fdstidx = \
|
||||
NULL_INDEX
|
||||
|
||||
|
||||
class MetadataHeader(BookHeader):
|
||||
|
||||
def __init__(self, stream, log):
|
||||
self.stream = stream
|
||||
self.ident = self.identity()
|
||||
self.num_sections = self.section_count()
|
||||
if self.num_sections >= 2:
|
||||
header = self.header()
|
||||
BookHeader.__init__(self, header, self.ident, None, log)
|
||||
else:
|
||||
self.exth = None
|
||||
|
||||
@property
|
||||
def kf8_type(self):
|
||||
if (self.mobi_version == 8 and getattr(self, 'skelidx', NULL_INDEX) !=
|
||||
NULL_INDEX):
|
||||
return 'standalone'
|
||||
|
||||
kf8_header_index = getattr(self.exth, 'kf8_header', None)
|
||||
if kf8_header_index is None:
|
||||
return None
|
||||
try:
|
||||
if self.section_data(kf8_header_index-1) == b'BOUNDARY':
|
||||
return 'joint'
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
def identity(self):
|
||||
self.stream.seek(60)
|
||||
ident = self.stream.read(8).upper()
|
||||
if ident not in (b'BOOKMOBI', b'TEXTREAD'):
|
||||
raise MobiError('Unknown book type: %s' % ident)
|
||||
return ident
|
||||
|
||||
def section_count(self):
|
||||
self.stream.seek(76)
|
||||
return struct.unpack('>H', self.stream.read(2))[0]
|
||||
|
||||
def section_offset(self, number):
|
||||
self.stream.seek(78 + number * 8)
|
||||
return struct.unpack('>LBBBB', self.stream.read(8))[0]
|
||||
|
||||
def header(self):
|
||||
section_headers = []
|
||||
# First section with the metadata
|
||||
section_headers.append(self.section_offset(0))
|
||||
# Second section used to get the length of the first
|
||||
section_headers.append(self.section_offset(1))
|
||||
|
||||
end_off = section_headers[1]
|
||||
off = section_headers[0]
|
||||
self.stream.seek(off)
|
||||
return self.stream.read(end_off - off)
|
||||
|
||||
def section_data(self, number):
|
||||
start = self.section_offset(number)
|
||||
if number == self.num_sections -1:
|
||||
end = os.stat(self.stream.name).st_size
|
||||
else:
|
||||
end = self.section_offset(number + 1)
|
||||
self.stream.seek(start)
|
||||
try:
|
||||
return self.stream.read(end - start)
|
||||
except OverflowError:
|
||||
self.stream.seek(start)
|
||||
return self.stream.read()
|
||||
Reference in New Issue
Block a user