mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-15 18:43:30 +02:00
Initial import
This commit is contained in:
277
ebook_converter/ebooks/mobi/reader/index.py
Normal file
277
ebook_converter/ebooks/mobi/reader/index.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct
|
||||
from collections import OrderedDict, namedtuple
|
||||
|
||||
from calibre.ebooks.mobi.utils import (decint, count_set_bits,
|
||||
decode_string)
|
||||
from polyglot.builtins import iteritems, range, zip
|
||||
|
||||
# A raw TAGX table entry: tag id, number of values per entry, the bitmask
# used to extract this tag's presence/count from the control bytes, and an
# eof flag (0x01 marks the end-of-control-byte sentinel entry).
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
# A resolved TAGX entry for a single index entry; exactly one of
# value_count / value_bytes is non-None (see get_tag_map()).
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
# Names, in order, of the 45 big-endian unsigned 32-bit fields that follow
# the 4-byte b'INDX' signature in an INDX header (see parse_indx_header()).
INDEX_HEADER_FIELDS = (
    'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
    'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
) + tuple('unknown%d'%i for i in range(27)) + ('ocnt', 'oentries',
    'ordt1', 'ordt2', 'tagx')
|
||||
|
||||
|
||||
class InvalidFile(ValueError):
    """Base error for malformed MOBI index data (e.g. a bad section signature)."""
    pass
|
||||
|
||||
|
||||
def check_signature(data, signature):
    """Raise :class:`InvalidFile` unless *data* starts with the bytes *signature*."""
    prefix = data[:len(signature)]
    if prefix == signature:
        return
    raise InvalidFile('Not a valid %r section'%signature)
|
||||
|
||||
|
||||
class NotAnINDXRecord(InvalidFile):
    # Signals data lacking an INDX record signature.
    # NOTE(review): not raised in this module's visible code; presumably
    # used by callers elsewhere — confirm before removing.
    pass
|
||||
|
||||
|
||||
class NotATAGXSection(InvalidFile):
    # Signals data lacking a TAGX section signature.
    # NOTE(review): not raised in this module's visible code; presumably
    # used by callers elsewhere — confirm before removing.
    pass
|
||||
|
||||
|
||||
def format_bytes(byts):
    """Render *byts* as space-separated lowercase hex digits (no 0x prefix, no zero padding)."""
    return ' '.join('%x' % octet for octet in bytearray(byts))
|
||||
|
||||
|
||||
def parse_indx_header(data):
    """Parse the fixed-size INDX header at the start of *data*.

    Returns a dict mapping each name in INDEX_HEADER_FIELDS to its
    big-endian 32-bit value, plus three derived keys: 'ordt1_raw',
    'ordt2_raw' (raw ORDT table bytes, empty lists when absent) and
    'ordt_map' (a unicode lookup string, '' when absent).

    Raises InvalidFile if *data* does not begin with b'INDX'.
    """
    check_signature(data, b'INDX')
    words = INDEX_HEADER_FIELDS
    num = len(words)
    # 4-byte signature, then `num` consecutive big-endian unsigned longs.
    values = struct.unpack('>%dL' % num, data[4:4*(num+1)])
    ans = dict(zip(words, values))
    # ordt1/ordt2 are byte offsets into this record; 0 means not present.
    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
    ans['ordt_map'] = ''

    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
        # I dont know what this is, but using it seems to be unnecessary, so
        # just leave it as the raw bytestring
        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
        # ORDT2 holds two bytes per entry ('oentries' entries).
        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
        if ans['code'] == 65002:
            # This appears to be EBCDIC-UTF (65002) encoded. I can't be
            # bothered to write a decoder for this (see
            # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
            # Instead, we use a weird hack that seems to do the trick for all
            # the books with this type of ORDT record that I have come across.
            # Some EBSP book samples in KF8 format from Amazon have this type
            # of encoding.
            # Basically we try to interpret every second byte as a printable
            # ascii character. If we cannot, we map to the ? char.

            parsed = bytearray(ans['oentries'])
            for i in range(0, 2*ans['oentries'], 2):
                # Keep the odd byte when it is printable ASCII, else '?'.
                parsed[i//2] = raw[i+1] if 0x20 < raw[i+1] < 0x7f else ord(b'?')
            ans['ordt_map'] = bytes(parsed).decode('ascii')
        else:
            # Unknown encoding: map every entry to '?'.
            ans['ordt_map'] = '?'*ans['oentries']

    return ans
|
||||
|
||||
|
||||
class CNCX(object):  # {{{

    '''
    Parses the records that contain the compiled NCX (all strings from the
    NCX). Presents a simple offset : string mapping interface to access the
    data.
    '''

    def __init__(self, records, codec):
        """Decode every CNCX *records* blob with *codec*.

        Each record contributes entries keyed by its internal offset plus a
        per-record base that advances by 0x10000, so offsets are global
        across records.
        """
        self.records = OrderedDict()
        record_offset = 0
        for raw in records:
            pos = 0
            while pos < len(raw):
                # Each entry is a decint length followed by `length` bytes
                # of encoded text.
                length, consumed = decint(raw[pos:])
                if length > 0:
                    try:
                        self.records[pos+record_offset] = raw[
                            pos+consumed:pos+consumed+length].decode(codec)
                    except Exception:
                        # FIX: was a bare ``except:`` which also swallowed
                        # KeyboardInterrupt/SystemExit. Undecodable entries
                        # are stored as a hex dump and the rest of this
                        # record is skipped.
                        byts = raw[pos:]
                        r = format_bytes(byts)
                        print('CNCX entry at offset %d has unknown format %s'%(
                            pos+record_offset, r))
                        self.records[pos+record_offset] = r
                        pos = len(raw)
                pos += consumed+length
            record_offset += 0x10000

    def __getitem__(self, offset):
        # Missing offsets yield None rather than raising KeyError.
        return self.records.get(offset)

    def get(self, offset, default=None):
        """Return the string at *offset*, or *default* if absent."""
        return self.records.get(offset, default)

    def __bool__(self):
        # Truthy when at least one string was parsed.
        return bool(self.records)
    __nonzero__ = __bool__  # Python 2 compatibility.

    def iteritems(self):
        return iteritems(self.records)

    def items(self):
        return iteritems(self.records)
# }}}
|
||||
|
||||
|
||||
def parse_tagx_section(data):
    """Parse a TAGX section.

    Returns ``(control_byte_count, tags)`` where *tags* is a list of TagX
    tuples, one per 4-byte entry. Raises InvalidFile if *data* does not
    start with b'TAGX'.
    """
    check_signature(data, b'TAGX')

    # Offsets 4 and 8 hold the first-entry offset and the control byte
    # count, both big-endian unsigned longs.
    first_entry_offset, control_byte_count = struct.unpack_from(b'>2L', data, 4)

    # Entries are packed 4 bytes apiece starting at offset 12.
    tags = [TagX(*bytearray(data[off:off+4]))
            for off in range(12, first_entry_offset, 4)]
    return control_byte_count, tags
|
||||
|
||||
|
||||
def get_tag_map(control_byte_count, tagx, data, strict=False):
    """Decode a single index entry's tag values.

    *data* starts with `control_byte_count` control bytes followed by the
    variable-width (decint) encoded values. *tagx* is the list of TagX
    entries from parse_tagx_section(). Returns a dict mapping tag id to a
    list of integer values. When *strict* is true, length mismatches and
    trailing non-zero bytes raise ValueError instead of being printed.
    """
    ptags = []
    ans = {}
    control_bytes = list(bytearray(data[:control_byte_count]))
    data = data[control_byte_count:]

    # Pass 1: use the control bytes + bitmasks to work out, per tag, either
    # an explicit value count or a byte length of variable-width values.
    for x in tagx:
        if x.eof == 0x01:
            # End-of-control-byte marker: advance to the next control byte.
            control_bytes = control_bytes[1:]
            continue
        value = control_bytes[0] & x.bitmask
        if value != 0:
            value_count = value_bytes = None
            if value == x.bitmask:
                if count_set_bits(x.bitmask) > 1:
                    # If all bits of masked value are set and the mask has more
                    # than one bit, a variable width value will follow after
                    # the control bytes which defines the length of bytes (NOT
                    # the value count!) which will contain the corresponding
                    # variable width values.
                    value_bytes, consumed = decint(data)
                    data = data[consumed:]
                else:
                    value_count = 1
            else:
                # Shift bits to get the masked value.
                mask = x.bitmask
                while mask & 0b1 == 0:
                    mask >>= 1
                    value >>= 1
                value_count = value
            ptags.append(PTagX(x.tag, value_count, value_bytes,
                               x.num_of_values))

    # Pass 2: read the actual decint-encoded values for each resolved tag.
    for x in ptags:
        values = []
        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
            for _ in range(x.value_count * x.num_of_values):
                byts, consumed = decint(data)
                data = data[consumed:]
                values.append(byts)
        else:  # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
                byts, consumed = decint(data)
                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
            if total_consumed != x.value_bytes:
                err = ("Error: Should consume %s bytes, but consumed %s" %
                       (x.value_bytes, total_consumed))
                if strict:
                    raise ValueError(err)
                else:
                    print(err)
        ans[x.tag] = values
    # Test that all bytes have been processed (zero padding is allowed).
    if data.replace(b'\0', b''):
        err = ("Warning: There are unprocessed index bytes left: %s" %
               format_bytes(data))
        if strict:
            raise ValueError(err)
        else:
            print(err)

    return ans
|
||||
|
||||
|
||||
def parse_index_record(table, data, control_byte_count, tags, codec,
                       ordt_map, strict=False):
    """Parse one INDX index record, adding ``ident -> tag_map`` entries to *table*.

    *data* is the raw record; *control_byte_count* and *tags* come from
    parse_tagx_section(); *codec* and *ordt_map* drive string decoding.
    Returns the parsed INDX header dict. When *strict* is true, tag-map
    inconsistencies raise ValueError (see get_tag_map()).
    """
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
        print('WARNING: Invalid INDX record')
    entry_count = header['count']

    # loop through to build up the IDXT position starts
    idx_positions = []
    for j in range(entry_count):
        pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
        idx_positions.append(pos)
    # The last entry ends before the IDXT tag (but there might be zero fill
    # bytes we need to ignore!)
    idx_positions.append(idxt_pos)

    # For each entry in the IDXT build up the tag map and any associated
    # text
    for j in range(entry_count):
        start, end = idx_positions[j:j+2]
        rec = data[start:end]
        # Sometimes (in the guide table if the type attribute has non ascii
        # values) the ident is UTF-16 encoded. Try to handle that.
        try:
            ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
        except UnicodeDecodeError:
            ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map)
        if u'\x00' in ident:
            try:
                ident, consumed = decode_string(rec, codec='utf-16',
                                                ordt_map=ordt_map)
            except UnicodeDecodeError:
                # FIX: was ident.replace('u\x00', u'') — stripping the
                # two-character sequence "u" + NUL instead of bare NUL
                # characters, which the enclosing `if` just detected.
                ident = ident.replace(u'\x00', u'')
        rec = rec[consumed:]
        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
        table[ident] = tag_map
    return header
|
||||
|
||||
|
||||
def read_index(sections, idx, codec):
    """Read a complete index starting at section *idx*.

    *sections* is a sequence of ``(raw_bytes, ...)`` tuples. Returns
    ``(table, cncx)``: an OrderedDict mapping ident -> tag map, and the
    CNCX string lookup for the index.
    """
    table = OrderedDict()

    header_data = sections[idx][0]
    header = parse_indx_header(header_data)
    record_count = header['count']

    # The CNCX records (if any) follow the index records.
    ncncx = header['ncncx']
    if ncncx > 0:
        first = idx + record_count + 1
        cncx = CNCX([sec[0] for sec in sections[first:first+ncncx]], codec)
    else:
        cncx = CNCX([], codec)

    control_byte_count, tags = parse_tagx_section(header_data[header['tagx']:])

    # Each of the next `record_count` sections is one index record.
    for sec_num in range(idx + 1, idx + 1 + record_count):
        parse_index_record(table, sections[sec_num][0], control_byte_count,
                           tags, codec, header['ordt_map'])
    return table, cncx
|
||||
Reference in New Issue
Block a user