mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-07 12:14:11 +01:00
644 lines
19 KiB
Python
644 lines
19 KiB
Python
import struct, string, zlib, os
|
|
from collections import OrderedDict
|
|
from io import BytesIO
|
|
|
|
from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
|
|
from ebook_converter.utils.imghdr import what
|
|
from ebook_converter.ebooks import normalize
|
|
from ebook_converter.polyglot.builtins import as_bytes
|
|
from ebook_converter.tinycss.color3 import parse_color_string
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
# Default upper bound, in bytes, for the maxsizeb argument of rescale_image()
IMAGE_MAX_SIZE = 10 * 1024 * 1024
# Number of bytes create_text_record() reads for a single text record
RECORD_SIZE = 0x1000  # 4096 (Text record size (uncompressed))
|
|
|
|
|
|
class PolyglotDict(dict):
    '''
    A dict whose item access treats a text key and its UTF-8 encoded byte
    form as the same key. Text keys are stored as bytes.

    Only ``d[key] = val``, ``d[key]`` and ``key in d`` perform the
    conversion; every other dict method (``get``, ``pop``, ``update``, the
    constructor, ...) is inherited unchanged and sees keys exactly as given.
    '''

    @staticmethod
    def _as_stored(key):
        # str keys are kept as UTF-8 bytes, any other key type is left alone
        return key.encode('utf-8') if isinstance(key, str) else key

    def __setitem__(self, key, val):
        super().__setitem__(self._as_stored(key), val)

    def __getitem__(self, key):
        return super().__getitem__(self._as_stored(key))

    def __contains__(self, key):
        return super().__contains__(self._as_stored(key))
|
|
|
|
|
|
def decode_string(raw, codec='utf-8', ordt_map=None):
    '''
    Decode a length prefixed string from the start of raw.

    The first byte of raw is the number of payload bytes that follow it.

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the payload when no ordt_map is given
    :param ordt_map: Optional mapping from byte value to text. When it is
        truthy, every payload byte is looked up in it instead of decoding
        with codec.

    :return: The decoded text and the number of bytes consumed (always the
        declared length plus one for the length byte).
    '''
    (size,) = struct.unpack(b'>B', raw[0:1])
    payload = raw[1:size + 1]
    used = size + 1
    if not ordt_map:
        return payload.decode(codec), used
    chars = [ordt_map[value] for value in bytearray(payload)]
    return ''.join(chars), used
|
|
|
|
|
|
def decode_hex_number(raw, codec='utf-8'):
    '''
    Read a variable length, hexadecimal encoded number from the start of raw.

    The encoding is a length prefixed string (see decode_string()): one byte
    giving the number of bytes that follow, then that many bytes holding the
    hexadecimal digits of the number as text.

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the digit text

    :return: The number and the number of bytes from raw that the number
             occupies.
    '''
    hex_digits, used = decode_string(raw, codec=codec)
    number = int(hex_digits, 16)
    return number, used
|
|
|
|
|
|
def encode_string(raw):
    '''
    Return raw (converted with as_bytes()) as a length prefixed bytestring:
    a single byte holding the payload length followed by the payload.

    Raises ValueError if the payload is longer than 255 bytes, since the
    length has to fit into one byte.
    '''
    payload = bytearray(as_bytes(raw))
    prefix = bytearray((len(payload),))
    return bytes(prefix + payload)
|
|
|
|
|
|
def encode_number_as_hex(num):
    '''
    Encode num as a variable length, hexadecimal encoded number and return
    the resulting bytestring.

    The result is a length prefixed string (see encode_string()): the first
    byte gives the number of bytes that follow and those bytes are the upper
    case hexadecimal digits of num, left padded with a zero so that there is
    always an even number of digits.
    '''
    hex_digits = hex(num)[2:].upper().encode('ascii')
    if len(hex_digits) % 2:
        # Odd number of digits, pad to a whole number of digit pairs
        hex_digits = b'0' + hex_digits
    return encode_string(hex_digits)
|
|
|
|
|
|
def encint(value, forward=True):
    '''
    Encode the non-negative integer ``value`` as a Mobipocket variable-width
    integer and return the corresponding bytestring.

    The integer is stored big-endian, 7 bits per byte. The high bit of
    exactly one byte is set to mark where the number ends:

    * forward encoding sets it on the last byte, so a reader scanning from
      the start knows where to stop;
    * backward encoding sets it on the first byte, so a reader scanning from
      the end knows where to stop.

    For example, the number 0x11111 = 0b10001000100010001 is forward-encoded
    as:

        0x04 0x22 0x91 = 0b100 0b100010 0b10010001

    And backward-encoded as:

        0x84 0x22 0x11 = 0b10000100 0b100010 0b10001

    decint() is the inverse of this function.

    :raises ValueError: if value is negative
    '''
    if value < 0:
        raise ValueError('Cannot encode negative numbers as vwi')

    # Collect the 7 bit groups, least significant group first. Zero still
    # produces a single (all zero) group.
    groups = bytearray((value & 0x7f,))
    remaining = value >> 7
    while remaining:
        groups.append(remaining & 0x7f)
        remaining >>= 7

    # groups[0] ends up as the last output byte and groups[-1] as the first
    marker_at = 0 if forward else -1
    groups[marker_at] |= 0x80
    return bytes(groups[::-1])
|
|
|
|
|
|
def decint(raw, forward=True):
    '''
    Decode a variable width integer from the bytestring or bytearray raw.

    If forward is True the bytes are consumed from the start of raw,
    otherwise from the end of raw. Reading stops after the first byte that
    has its high bit set, or when raw is exhausted.

    This is the inverse of encint(), see its documentation for a
    description of the encoding.

    :return: The integer and the number of bytes that were read
    '''
    stream = bytearray(raw)
    if not forward:
        stream.reverse()

    groups = []
    for octet in stream:
        groups.append(octet & 0x7f)
        if octet & 0x80:
            # Marker bit found, this was the final byte of the number
            break

    if not forward:
        # The groups were gathered back to front, restore big-endian order
        groups.reverse()

    number = 0
    for group in groups:
        number = (number << 7) | group
    return number, len(groups)
|
|
|
|
|
|
def test_decint(num):
    '''
    Check that num survives an encint()/decint() round trip in both the
    forward and the backward encoding.

    :raises ValueError: if decoding does not give back num together with the
        length of the encoded bytestring
    '''
    for direction in (True, False):
        encoded = encint(num, forward=direction)
        expected = (num, len(encoded))
        decoded = decint(encoded, forward=direction)
        if expected != decoded:
            raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
                num, direction, expected, decoded))
|
|
|
|
|
|
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Shrink encoded image data until it is no larger than maxsizeb bytes.

    If dimen is not None the image is first scaled to a thumbnail: dimen is
    either a single number used for both width and height, or a
    (width, height) pair.

    Data that already fits is returned as is. Otherwise the image is
    re-encoded at progressively lower compression quality (90 down to 5 in
    steps of 5) and, if that is still not enough, repeatedly resized to a
    fraction of its current dimensions.

    NOTE: the step that replaced transparent pixels with white and converted
    the image to JPEG (save_cover_data_to) is currently disabled.

    Returns the image as a bytestring. It can still be larger than maxsizeb
    if none of the attempts made it small enough.
    '''
    if dimen is not None:
        if hasattr(dimen, '__len__'):
            thumb_w, thumb_h = dimen
        else:
            thumb_w = thumb_h = dimen
        data = scale_image(data, width=thumb_w, height=thumb_h,
                           compression_quality=90)[-1]

    if len(data) <= maxsizeb:
        return data

    # Stage 1: keep the dimensions and lower the quality. Every attempt is
    # encoded from the same source bytes.
    source = data
    quality = 90
    while len(data) > maxsizeb and quality >= 5:
        data = image_to_data(image_from_data(source),
                             compression_quality=quality)
        quality -= 5
    if len(data) <= maxsizeb:
        return data

    # Stage 2: shrink the dimensions. Each pass resizes the output of the
    # previous pass and reuses the quality value left over from stage 1.
    factor = 0.9
    while len(data) > maxsizeb and factor >= 0.05:
        picture = image_from_data(data)
        cur_w, cur_h = picture.width(), picture.height()
        picture = resize_image(picture, int(factor*cur_w), int(factor*cur_h))
        data = image_to_data(picture, compression_quality=quality)
        factor -= 0.05
    return data
|
|
|
|
|
|
def get_trailing_data(record, extra_data_flags):
    '''
    Split the trailing data entries off the end of a text record.

    :param record: The text record as a bytestring
    :param extra_data_flags: The extra data flags from the MOBI header. Each
        set bit above bit 0 announces one trailing entry terminated by a
        backward encoded vwi holding the total size of the entry. Bit 0
        announces trailing multibyte character bytes.

    :return: Trailing data, record - trailing data. The trailing data is an
        ordered dictionary mapping bit number to data as bytestring (entries
        that consist of nothing but their size are left out).
    '''
    trailing = OrderedDict()
    pending = extra_data_flags >> 1

    bit = 0
    while pending:
        bit += 1
        if pending & 0b1:
            size, size_len = decint(record, forward=False)
            if size > size_len:
                trailing[bit] = record[-size:-size_len]
            record = record[:-size]
        pending >>= 1

    # Read multibyte chars if any
    if extra_data_flags & 0b1:
        # Only the first two bits are used for the size since there can
        # never be more than 3 trailing multibyte chars
        size = (ord(record[-1:]) & 0b11) + 1
        if size > 1:
            trailing[0] = record[-size:-1]
        record = record[:-size]
    return trailing, record
|
|
|
|
|
|
def encode_trailing_data(raw):
    '''
    Wrap the bytestring raw as a trailing data entry of the form

        <data><size>

    where data is raw and size is a backwards encoded vwi whose value is the
    length of the entire returned bytestring (data plus the size itself).

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size counts its own bytes, so grow the assumed width until the
    # encoded size is exactly that wide
    width = 1
    suffix = encint(len(raw) + width, forward=False)
    while len(suffix) != width:
        width += 1
        suffix = encint(len(raw) + width, forward=False)
    return raw + suffix
|
|
|
|
|
|
def encode_fvwi(val, flags, flag_size=4):
    '''
    Pack val together with the lowest flag_size bits of flags into a single
    number (val above, flag bits below) and return it as a forward encoded
    vwi bytestring. This encoding is used in the trailing byte sequences for
    indexing.
    '''
    flag_mask = (1 << flag_size) - 1
    return encint((val << flag_size) | (flags & flag_mask))
|
|
|
|
|
|
def decode_fvwi(byts, flag_size=4):
    '''
    Decode an fvwi from the start of byts: a forward encoded vwi whose
    lowest flag_size bits are flags and whose remaining bits are the number.

    Returns number, flags, consumed
    '''
    packed, consumed = decint(bytes(byts))
    flag_mask = (1 << flag_size) - 1
    return packed >> flag_size, packed & flag_mask, consumed
|
|
|
|
|
|
def decode_tbs(byts, flag_size=4):
    '''
    Decode one entry of a trailing byte sequence used for indexing.

    An entry starts with an fvwi (see decode_fvwi()). Its flag bits say which
    additional values follow it, in this order:

        0b0010 - a vwi
        0b0100 - a single byte
        0b0001 - a vwi

    The flag 0b1000 carries no data of its own, it is recorded as True (and
    only looked at when flag_size is larger than 3).

    Returns the fvwi number, a dictionary mapping flags bits to the
    associated data and the number of bytes consumed.
    '''
    stream = bytes(byts)
    val, flags, consumed = decode_fvwi(stream, flag_size=flag_size)
    stream = stream[consumed:]
    extra = {}

    if flag_size > 3 and flags & 0b1000:
        extra[0b1000] = True

    if flags & 0b0010:
        number, used = decint(stream)
        extra[0b0010] = number
        stream = stream[used:]
        consumed += used

    if flags & 0b0100:
        extra[0b0100] = ord(stream[0:1])
        stream = stream[1:]
        consumed += 1

    if flags & 0b0001:
        number, used = decint(stream)
        extra[0b0001] = number
        stream = stream[used:]
        consumed += used

    return val, extra, consumed
|
|
|
|
|
|
def encode_tbs(val, extra, flag_size=4):
    '''
    Encode the number val and the extra data in the extra dict as one entry
    of a trailing byte sequence. The keys of extra are the flag bits; the
    values for 0b0010, 0b0100 and 0b0001 are written after the fvwi in that
    order. See decode_tbs() for the layout.
    '''
    flags = 0
    for bit in extra:
        flags |= bit

    pieces = [encode_fvwi(val, flags, flag_size=flag_size)]
    if 0b0010 in extra:
        pieces.append(encint(extra[0b0010]))
    if 0b0100 in extra:
        # Stored as a single raw byte, not as a vwi
        pieces.append(bytes(bytearray([extra[0b0100]])))
    if 0b0001 in extra:
        pieces.append(encint(extra[0b0001]))
    return b''.join(pieces)
|
|
|
|
|
|
def utf8_text(text):
    '''
    Return text as a stripped, normalized, UTF-8 encoded bytestring.

    text can be a string or a bytestring (bytes are decoded as UTF-8 with
    undecodable input replaced). If it is empty, None or only whitespace
    the translated word 'Unknown' is used, so the result is never taken from
    empty input.
    '''
    stripped = text.strip() if text else text
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, str):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
|
|
|
|
|
|
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of ``multiple`` (4 by default). raw is returned unchanged if it
    is already aligned.
    '''
    remainder = len(raw) % multiple
    if remainder:
        raw = raw + pad*(multiple - remainder)
    return raw
|
|
|
|
|
|
def detect_periodical(toc, log=None):
    '''
    Return True if the TOC object toc has the structure required by
    kindlegen to generate a periodical, False otherwise.

    The first entry of toc must have class "periodical" and every descendant
    must have the class that goes with its depth: "article" for depth 1,
    "section" for depth 2 and "periodical" for depth 3. Nothing may be
    deeper than 3. If log is given, the reason for rejecting a descendant is
    reported with log.debug().
    '''
    if toc.count() < 1 or not toc[0].klass == 'periodical':
        return False

    required_klass = {1: 'article', 2: 'section', 3: 'periodical'}
    complaint = {
        1: 'Not a periodical: Deepest node does not have class="article"',
        2: 'Not a periodical: Second deepest node does not have'
           ' class="section"',
        3: 'Not a periodical: Third deepest node'
           ' does not have class="periodical"',
    }

    for node in toc.iterdescendants():
        depth = node.depth()
        if depth > 3:
            problem = 'Not a periodical: Has nodes of depth > 3'
        elif depth in required_klass and node.klass != required_klass[depth]:
            problem = complaint[depth]
        else:
            continue
        if log is not None:
            log.debug(problem)
        return False
    return True
|
|
|
|
|
|
def count_set_bits(num):
    '''
    Return the number of bits set to 1 in the binary representation of the
    integer num. For a negative num the bits of its absolute value are
    counted.
    '''
    return bin(abs(num)).count('1')
|
|
|
|
|
|
def to_base(num, base=32, min_num_digits=None):
    '''
    Return the integer num written in the given base as a string.

    The digits are 0-9 followed by A-Z, so bases from 2 to 36 are supported.

    :param num: The integer to convert. Negative numbers get a leading '-'.
    :param base: The base to convert to
    :param min_num_digits: If not None, the digits are left padded with '0'
        to at least this many digits (the sign is not counted).

    :raises ValueError: if num is non-zero and base is outside 2..36
    '''
    digits = string.digits + string.ascii_uppercase
    if num == 0:
        # Zero is written the same way in every base
        return ('0' if min_num_digits is None else '0'*min_num_digits)
    if not 2 <= base <= len(digits):
        # Without this check base 1 never terminates, base 0 divides by zero
        # and bases above 36 run out of digit characters
        raise ValueError('base must be between 2 and %d, not %r' % (
            len(digits), base))

    negative = num < 0
    num = abs(num)
    ans = []  # digits, least significant first
    while num:
        num, remainder = divmod(num, base)
        ans.append(digits[remainder])
    if min_num_digits is not None and len(ans) < min_num_digits:
        ans.extend('0'*(min_num_digits - len(ans)))
    if negative:
        ans.append('-')
    ans.reverse()
    return ''.join(ans)
|
|
|
|
|
|
def mobify_image(data):
    '''
    Return data converted to GIF if it is a PNG image (the Kindle cannot
    display some PNG images). Any other data is returned unchanged.
    '''
    if what(None, data) == 'png':
        return png_data_to_gif_data(data)
    return data
|
|
|
|
# Font records {{{
|
|
|
|
|
|
def read_font_record(data, extent=1040):
    '''
    Return the font encoded in the MOBI FONT record represented by data.

    :param data: The raw FONT record as a bytestring
    :param extent: The number of obfuscated bytes. So far I have only
        encountered files with 1040 obfuscated bytes. If you encounter an
        obfuscated record for which this function fails, try different
        extent values (easily automated). None means the whole font data is
        obfuscated.

    :return: A dict with the fields:

        raw_data is the raw data in the font record
        font_data is the decoded font_data or None if an error occurred
        err is not None if some error occurred
        ext is the font type (ttf for TrueType, otf for OpenType, dat for
        unknown and failed if an error occurred)
        headers is the dict of decoded header fields from the font record
        or None if decoding failed
        encrypted is True if the record is flagged as XOR obfuscated

    Malformed records are reported through err, not by raising.
    '''
    # Format:
    # bytes 0 - 3: 'FONT'
    # bytes 4 - 7: Uncompressed size
    # bytes 8 - 11: flags
    #   bit 1 - zlib compression
    #   bit 2 - XOR obfuscated
    # bytes 12 - 15: offset to start of compressed data
    # bytes 16 - 19: length of XOR string
    # bytes 20 - 23: offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data': data, 'font_data': None, 'err': None, 'ext': 'failed',
           'headers': None, 'encrypted': False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
            b'>LLLLL', data, 4)
    except (struct.error, TypeError):
        # Record too short for the header, or not a bytes-like object
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize': usize, 'flags': bin(flags), 'xor_len': xor_len,
                      'xor_start': xor_start, 'dstart': dstart}

    if flags & 0b10:
        # De-obfuscate the data
        ans['encrypted'] = True
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))

        # The loop below indexes key with n % xor_len for every n < extent,
        # so an empty or truncated key cannot be used
        if extent > 0 and (xor_len == 0 or len(key) < min(extent, xor_len)):
            ans['err'] = ('Font record is flagged as obfuscated but its XOR '
                          'key is missing or truncated')
            return ans

        for n in range(extent):
            buf[n] ^= key[n % xor_len]  # XOR of buf and key

        font_data = bytes(buf)

    if flags & 0b1:
        # ZLIB compressed data
        try:
            font_data = zlib.decompress(font_data)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)' % e
            return ans

    if len(font_data) != usize:
        ans['err'] = 'Uncompressed font size mismatch'
        return ans

    ans['font_data'] = font_data
    sig = font_data[:4]
    ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
                  else 'otf' if sig == b'OTTO' else 'dat')

    return ans
|
|
|
|
|
|
def write_font_record(data, obfuscate=True, compress=True):
    '''
    Build a MOBI FONT record from the ttf/otf font represented by data. See
    read_font_record() for details on the format of the record.

    :param data: The font as a bytestring
    :param obfuscate: If True and the (possibly compressed) payload is at
        least 1040 bytes long, its first 1040 bytes are XORed with a random
        20 byte key that is stored in the record.
    :param compress: If True the font is zlib compressed (level 9) first

    :return: The record as a bytestring
    '''
    original_size = len(data)
    flag_bits = 0
    payload = data

    if compress:
        payload = zlib.compress(payload, 9)
        flag_bits |= 0b1

    xor_key = b''
    if obfuscate and len(payload) >= 1040:
        flag_bits |= 0b10
        xor_key = os.urandom(20)
        scrambled = bytearray(payload)
        for pos in range(1040):
            scrambled[pos] ^= xor_key[pos % 20]
        payload = bytes(scrambled)

    # Layout: b'FONT', five header fields, the XOR key (if any), the payload
    key_offset = 4 + struct.calcsize(b'>5L')
    payload_offset = key_offset + len(xor_key)
    fields = struct.pack(b'>5L', original_size, flag_bits, payload_offset,
                         len(xor_key), key_offset)

    return b'FONT' + fields + xor_key + payload
|
|
|
|
# }}}
|
|
|
|
|
|
def create_text_record(text):
    '''
    Return a Palmdoc record of size RECORD_SIZE from the text file object.
    In case the record ends in the middle of a multibyte character return
    the overlap as well.

    The record starts at the current position of text and holds at most
    RECORD_SIZE bytes (fewer at the end of the file). On return text is
    positioned at the start of the next record.

    :param text: A seekable binary file object holding the text
        (presumably UTF-8 encoded, since the record boundary is examined by
        decoding as UTF-8)

    Returns data, overlap: where both are byte strings. overlap is the
    extra bytes needed to complete the truncated multibyte character.
    '''
    opos = text.tell()
    # Seek to the end of the file to find out how much data there is
    text.seek(0, 2)
    # npos is the position of the next record
    npos = min((opos + RECORD_SIZE, text.tell()))
    # Number of bytes from the next record needed to complete the last
    # character in this record
    extra = 0

    last = b''
    # Grow a window backwards from the record boundary, one byte at a time,
    # until it contains at least one complete character
    while not last.decode('utf-8', 'ignore'):
        # last contains no valid utf-8 characters
        size = len(last) + 1
        text.seek(npos - size)
        last = text.read(size)

    # last now has one valid utf-8 char and possibly some bytes that belong
    # to a truncated char

    try:
        last.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        # There are some truncated bytes in last
        prev = len(last)
        # Extend the window forwards, past the record boundary, one byte at
        # a time until it decodes cleanly
        while True:
            text.seek(npos - prev)
            last = text.read(len(last) + 1)
            try:
                last.decode('utf-8')
            except UnicodeDecodeError:
                pass
            else:
                break
        # Bytes read beyond the record boundary to finish the character
        extra = len(last) - prev

    text.seek(opos)
    data = text.read(RECORD_SIZE)
    overlap = text.read(extra)
    # The overlap bytes belong to the next record as well, so rewind to npos
    text.seek(npos)

    return data, overlap
|
|
|
|
|
|
class CNCX(object):  # {{{

    '''
    Build the CNCX records, which hold all the strings from an index. Every
    string is stored as <vwi string size><utf-8 encoded string> and the
    strings are packed into records that are padded with align_block().

    After construction, self.records is the list of records and
    self.strings maps every string to its offset. Indexing the object with a
    string returns that offset; its length and truth value are those of the
    list of records.
    '''

    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        self.strings = OrderedDict.fromkeys(strings, 0)
        self.records = []

        # kindlegen appears to use 1024, PDB limit is 0x10000
        record_limit = 0x10000 - 1024
        pending = bytearray()  # contents of the record being filled
        offset = 0
        for key in self.strings:
            encoded = utf8_text(key[:self.MAX_STRING_LENGTH])
            entry = encint(len(encoded)) + encoded
            if len(pending) + len(entry) > record_limit:
                # The entry does not fit any more, start a new record.
                # Offsets in record number N start at N * 0x10000.
                self.records.append(align_block(bytes(pending)))
                del pending[:]
                offset = len(self.records) * 0x10000
            pending += entry
            self.strings[key] = offset
            offset += len(entry)

        if pending:
            self.records.append(align_block(bytes(pending)))

    def __getitem__(self, string):
        return self.strings[string]

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def __len__(self):
        return len(self.records)
|
|
|
|
# }}}
|
|
|
|
|
|
def is_guide_ref_start(ref):
    '''
    Return True if the guide reference ref marks the start of the book:
    either its title is 'start' or its type is one of 'start',
    'other.start' or 'text' (both compared case insensitively).

    A missing (None or empty) title or type simply does not match.
    '''
    title = (ref.title or '').lower()
    ref_type = (ref.type or '').lower()
    return title == 'start' or ref_type in {'start', 'other.start', 'text'}
|
|
|
|
|
|
def convert_color_for_font_tag(val):
    '''
    Convert the CSS color val into the #rrggbb form.

    val is returned unchanged if parse_color_string() cannot parse it or
    reports it as 'currentColor'. Otherwise the first three components of
    the parsed color are clamped to the range 0 to 1 and written as two
    hexadecimal digits each; any fourth (alpha) component is dropped.
    '''
    rgba = parse_color_string(str(val or ''))
    if rgba is None or rgba == 'currentColor':
        return val
    # Clamp from both sides: without the lower bound a negative component
    # would be formatted with a minus sign, e.g. '#-7f0000'
    channels = (min(max(0, c), 1) for c in rgba[:3])
    return '#' + ''.join('%02x' % int(c * 255) for c in channels)
|