mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-17 14:45:46 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
647 lines
20 KiB
Python
647 lines
20 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import struct, string, zlib, os
|
|
from collections import OrderedDict
|
|
from io import BytesIO
|
|
|
|
from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
|
|
from ebook_converter.utils.imghdr import what
|
|
from ebook_converter.ebooks import normalize
|
|
from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map
|
|
from tinycss.color3 import parse_color_string
|
|
|
|
# Maximum size in bytes allowed for a single image; larger images are
# recompressed/scaled down by rescale_image()
IMAGE_MAX_SIZE = 10 * 1024 * 1024

RECORD_SIZE = 0x1000  # 4096 (Text record size (uncompressed))
|
|
|
|
|
|
class PolyglotDict(dict):

    '''
    A dict that transparently encodes unicode keys to UTF-8 bytes, so that
    byte and unicode spellings of the same key address the same entry.
    '''

    @staticmethod
    def _byte_key(key):
        # Normalize unicode keys to their UTF-8 byte form
        if isinstance(key, unicode_type):
            key = key.encode('utf-8')
        return key

    def __setitem__(self, key, val):
        dict.__setitem__(self, self._byte_key(key), val)

    def __getitem__(self, key):
        return dict.__getitem__(self, self._byte_key(key))

    def __contains__(self, key):
        return dict.__contains__(self, self._byte_key(key))
|
|
|
|
|
|
def decode_string(raw, codec='utf-8', ordt_map=None):
    '''
    Decode a length-prefixed string: the first byte of raw is the number of
    bytes of string data that follow. Returns the decoded string and the
    total number of bytes consumed (length byte included).

    If ordt_map is truthy, each raw byte is looked up in it instead of being
    decoded with codec.
    '''
    size, = struct.unpack(b'>B', raw[0:1])
    body = raw[1:1 + size]
    consumed = size + 1
    if ordt_map:
        return ''.join(ordt_map[b] for b in bytearray(body)), consumed
    return body.decode(codec), consumed
|
|
|
|
|
|
def decode_hex_number(raw, codec='utf-8'):
    '''
    Return a variable length number encoded using hexadecimal encoding. The
    first byte gives the count of bytes that follow; those bytes are the
    hexadecimal digits of the number.

    :param raw: Raw binary data as a bytestring

    :return: The number and the number of bytes from raw that the number
    occupies.
    '''
    digits, consumed = decode_string(raw, codec=codec)
    return int(digits, 16), consumed
|
|
|
|
|
|
def encode_string(raw):
    '''
    Return raw as a length-prefixed bytestring: a single length byte followed
    by the data itself (the inverse of decode_string).
    '''
    buf = bytearray(as_bytes(raw))
    buf.insert(0, len(buf))
    return bytes(buf)
|
|
|
|
|
|
def encode_number_as_hex(num):
    '''
    Encode num as a variable length encoded hexadecimal number. Returns the
    bytestring containing the encoded number. These numbers have a first byte
    that tells the number of bytes that follow. The bytes that follow are
    simply the hexadecimal representation of the number (zero padded to an
    even number of digits).
    '''
    digits = hex(num)[2:].upper()
    if len(digits) % 2:
        digits = '0' + digits
    return encode_string(digits.encode('ascii'))
|
|
|
|
|
|
def encint(value, forward=True):
    '''
    Some parts of the Mobipocket format encode data as variable-width integers.
    These integers are represented big-endian with 7 bits per byte in bits 1-7.
    They may be either forward-encoded, in which case only the last byte has
    bit 8 set, or backward-encoded, in which case only the first byte has bit 8
    set. (The docstring previously had these two swapped; the code and the
    examples below set the stop bit on the last byte for forward encoding.)
    For example, the number 0x11111 = 0b10001000100010001 would be represented
    forward-encoded as:

    0x04 0x22 0x91 = 0b100 0b100010 0b10010001

    And backward-encoded as:

    0x84 0x22 0x11 = 0b10000100 0b100010 0b10001

    This function encodes the integer ``value`` as a variable width integer and
    returns the bytestring corresponding to it.

    If forward is True the bytes returned are suitable for prepending to the
    output buffer, otherwise they must be appended to the output buffer.

    :raises ValueError: if value is negative.
    '''
    if value < 0:
        raise ValueError('Cannot encode negative numbers as vwi')
    # Emit 7 bits per byte, least-significant group first
    byts = bytearray()
    while True:
        b = value & 0b01111111
        value >>= 7  # shift value to the right by 7 bits

        byts.append(b)
        if value == 0:
            break
    # byts is LSB-first here, so byts[0] becomes the LAST output byte after
    # the reverse below (the stop bit for forward encoding)
    byts[0 if forward else -1] |= 0b10000000
    byts.reverse()
    return bytes(byts)
|
|
|
|
|
|
def decint(raw, forward=True):
|
|
'''
|
|
Read a variable width integer from the bytestring or bytearray raw and return the
|
|
integer and the number of bytes read. If forward is True bytes are read
|
|
from the start of raw, otherwise from the end of raw.
|
|
|
|
This function is the inverse of encint above, see its docs for more
|
|
details.
|
|
'''
|
|
val = 0
|
|
byts = bytearray()
|
|
src = bytearray(raw)
|
|
if not forward:
|
|
src.reverse()
|
|
for bnum in src:
|
|
byts.append(bnum & 0b01111111)
|
|
if bnum & 0b10000000:
|
|
break
|
|
if not forward:
|
|
byts.reverse()
|
|
for byte in byts:
|
|
val <<= 7 # Shift value to the left by 7 bits
|
|
val |= byte
|
|
|
|
return val, len(byts)
|
|
|
|
|
|
def test_decint(num):
    '''Round-trip num through encint/decint in both directions, raising
    ValueError on any mismatch.'''
    for forward in (True, False):
        encoded = encint(num, forward=forward)
        expected = (num, len(encoded))
        actual = decint(encoded, forward=forward)
        if expected != actual:
            raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
                num, forward, expected, actual))
|
|
|
|
|
|
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Convert image setting all transparent pixels to white and changing format
    to JPEG. Ensure the resultant image has a byte size less than
    maxsizeb.

    If dimen is not None, generate a thumbnail of
    width=dimen, height=dimen or width, height = dimen (depending on the type
    of dimen)

    :param data: image data as a bytestring
    :param maxsizeb: maximum allowed output size in bytes
    :param dimen: None, a single dimension, or a (width, height) pair

    Returns the image as a bytestring
    '''
    if dimen is not None:
        if hasattr(dimen, '__len__'):
            width, height = dimen
        else:
            width = height = dimen
        data = scale_image(data, width=width, height=height, compression_quality=90)[-1]
    else:
        # Replace transparent pixels with white pixels and convert to JPEG
        data = save_cover_data_to(data)
    if len(data) <= maxsizeb:
        return data
    orig_data = data  # save it in case compression fails
    quality = 90
    # First pass: keep the dimensions, lower JPEG quality in steps of 5
    while len(data) > maxsizeb and quality >= 5:
        data = image_to_data(image_from_data(orig_data), compression_quality=quality)
        quality -= 5
        if len(data) <= maxsizeb:
            return data
        orig_data = data

    # Second pass: still too big, shrink the dimensions by 10% per iteration.
    # NOTE(review): quality here is whatever the first loop left it at
    # (possibly as low as 0) -- confirm that is intended.
    scale = 0.9
    while len(data) > maxsizeb and scale >= 0.05:
        img = image_from_data(data)
        w, h = img.width(), img.height()
        img = resize_image(img, int(scale*w), int(scale*h))
        data = image_to_data(img, compression_quality=quality)
        scale -= 0.05
    return data
|
|
|
|
|
|
def get_trailing_data(record, extra_data_flags):
    '''
    Given a text record as a bytestring and the extra data flags from the MOBI
    header, return the trailing data as a dictionary, mapping bit number to
    data as bytestring. Also returns the record - all trailing data.

    :param record: raw text record, trailing entries included
    :param extra_data_flags: bit field from the MOBI header; bit 0 marks the
        multibyte-overlap bytes, higher bits mark vwi-sized trailing entries

    :return: Trailing data, record - trailing data
    '''
    data = OrderedDict()
    # Bit 0 (multibyte chars) is handled separately at the end
    flags = extra_data_flags >> 1

    num = 0
    while flags:
        num += 1
        if flags & 0b1:
            # Each entry ends in a backward-encoded vwi giving the total
            # size of the entry (payload plus the vwi itself)
            sz, consumed = decint(record, forward=False)
            if sz > consumed:
                # Payload is the part of the entry before the size vwi
                data[num] = record[-sz:-consumed]
            record = record[:-sz]
        flags >>= 1
    # Read multibyte chars if any
    if extra_data_flags & 0b1:
        # Only the first two bits are used for the size since there can
        # never be more than 3 trailing multibyte chars
        sz = (ord(record[-1:]) & 0b11) + 1
        consumed = 1
        if sz > consumed:
            data[0] = record[-sz:-consumed]
        record = record[:-sz]
    return data, record
|
|
|
|
|
|
def encode_trailing_data(raw):
    '''
    Given some data in the bytestring raw, return a bytestring of the form

    <data><size>

    where size is a backwards encoded vwi whose value is the length of the
    entire returned bytestring. data is the bytestring passed in as raw.

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size vwi counts itself, so grow the assumed width until the
    # encoding is exactly that wide
    width = 1
    encoded = encint(len(raw) + width, forward=False)
    while len(encoded) != width:
        width += 1
        encoded = encint(len(raw) + width, forward=False)
    return raw + encoded
|
|
|
|
|
|
def encode_fvwi(val, flags, flag_size=4):
    '''
    Encode the value val and the flag_size low bits of flags as a fvwi. This
    encoding is used in the trailing byte sequences for indexing. Returns the
    encoded bytestring.
    '''
    mask = (1 << flag_size) - 1
    return encint((val << flag_size) | (flags & mask))
|
|
|
|
|
|
def decode_fvwi(byts, flag_size=4):
    '''
    Decode encoded fvwi. Returns number, flags, consumed
    '''
    raw, consumed = decint(bytes(byts))
    mask = (1 << flag_size) - 1
    return raw >> flag_size, raw & mask, consumed
|
|
|
|
|
|
def decode_tbs(byts, flag_size=4):
    '''
    Trailing byte sequences for indexing consists of series of fvwi numbers.
    This function reads the fvwi number and its associated flags. It then uses
    the flags to read any more numbers that belong to the series. The flags are
    the lowest 4 bits of the vwi (see the encode_fvwi function above).

    Returns the fvwi number, a dictionary mapping flags bits to the associated
    data and the number of bytes consumed.
    '''
    data = bytes(byts)
    val, flags, consumed = decode_fvwi(data, flag_size=flag_size)
    extra = {}
    data = data[consumed:]
    if flag_size > 3 and flags & 0b1000:
        extra[0b1000] = True
    # The remaining payloads follow the fvwi in this fixed order
    if flags & 0b0010:
        num, used = decint(data)
        data = data[used:]
        extra[0b0010] = num
        consumed += used
    if flags & 0b0100:
        extra[0b0100] = ord(data[0:1])
        data = data[1:]
        consumed += 1
    if flags & 0b0001:
        num, used = decint(data)
        data = data[used:]
        extra[0b0001] = num
        consumed += used
    return val, extra, consumed
|
|
|
|
|
|
def encode_tbs(val, extra, flag_size=4):
    '''
    Encode the number val and the extra data in the extra dict as an fvwi. See
    decode_tbs above.
    '''
    flags = 0
    for bit in extra:
        flags |= bit
    parts = [encode_fvwi(val, flags, flag_size=flag_size)]

    # Payloads must follow the fvwi in this fixed order
    if 0b0010 in extra:
        parts.append(encint(extra[0b0010]))
    if 0b0100 in extra:
        parts.append(bytes(bytearray([extra[0b0100]])))
    if 0b0001 in extra:
        parts.append(encint(extra[0b0001]))
    return b''.join(parts)
|
|
|
|
|
|
def utf8_text(text):
    '''
    Convert a possibly null string to utf-8 bytes, guaranteeing to return a non
    empty, normalized bytestring.
    '''
    if not text or not text.strip():
        # NOTE(review): _ is presumably installed as a gettext builtin by the
        # application -- it is not defined in this module
        return _('Unknown').encode('utf-8')
    stripped = text.strip()
    if not isinstance(stripped, unicode_type):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
|
|
|
|
|
|
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough pad bytes appended to ensure its length is a
    multiple of ``multiple`` (4 by default). The old docstring hard-coded
    "multiple of 4" even though the boundary is a parameter.

    :param raw: the bytestring to pad
    :param multiple: the alignment boundary
    :param pad: the single-byte pad to append
    '''
    extra = len(raw) % multiple
    if extra == 0:
        return raw
    return raw + pad*(multiple - extra)
|
|
|
|
|
|
def detect_periodical(toc, log=None):
    '''
    Detect if the TOC object toc contains a periodical that conforms to the
    structure required by kindlegen to generate a periodical.
    '''
    if toc.count() < 1 or toc[0].klass != 'periodical':
        return False
    # Required class attribute for each node depth, and the complaint logged
    # when the requirement is not met
    expected = {1: 'article', 2: 'section', 3: 'periodical'}
    complaints = {
        1: 'Not a periodical: Deepest node does not have class="article"',
        2: 'Not a periodical: Second deepest node does not have'
           ' class="section"',
        3: 'Not a periodical: Third deepest node'
           ' does not have class="periodical"',
    }
    for node in toc.iterdescendants():
        depth = node.depth()
        if depth > 3:
            if log is not None:
                log.debug('Not a periodical: Has nodes of depth > 3')
            return False
        if depth in expected and node.klass != expected[depth]:
            if log is not None:
                log.debug(complaints[depth])
            return False
    return True
|
|
|
|
|
|
def count_set_bits(num):
    '''
    Return the number of set bits (population count) in the absolute value of
    num. Uses bin().count() instead of the previous hand-rolled shift loop;
    behavior is identical, including treating negative numbers by magnitude.
    '''
    return bin(abs(num)).count('1')
|
|
|
|
|
|
def to_base(num, base=32, min_num_digits=None):
    '''
    Render num in the given base (up to 36) using digits 0-9A-Z, optionally
    zero-padded on the left to at least min_num_digits digits. Negative
    numbers get a leading minus sign.
    '''
    alphabet = string.digits + string.ascii_uppercase
    if num == 0:
        return '0' if min_num_digits is None else '0' * min_num_digits
    negative = num < 0
    if negative:
        num = -num
    chunks = []
    while num:
        num, rem = divmod(num, base)
        chunks.append(alphabet[rem])
    if min_num_digits is not None and len(chunks) < min_num_digits:
        chunks.extend('0' * (min_num_digits - len(chunks)))
    if negative:
        chunks.append('-')
    return ''.join(reversed(chunks))
|
|
|
|
|
|
def mobify_image(data):
    'Convert PNG images to GIF as the idiotic Kindle cannot display some PNG'
    if what(None, data) == 'png':
        return png_data_to_gif_data(data)
    return data
|
|
|
|
# Font records {{{
|
|
|
|
|
|
def read_font_record(data, extent=1040):
    '''
    Return the font encoded in the MOBI FONT record represented by data.
    The return value in a dict with fields raw_data, font_data, err, ext,
    headers.

    :param extent: The number of obfuscated bytes. So far I have only
    encountered files with 1040 obfuscated bytes. If you encounter an
    obfuscated record for which this function fails, try different extent
    values (easily automated).

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, dat for unknown and failed if an
    error occurred)
    headers is the list of decoded headers from the font record or None if
    decoding failed
    '''
    # Format:
    # bytes 0 - 3: 'FONT'
    # bytes 4 - 7: Uncompressed size
    # bytes 8 - 11: flags
    # bit 1 - zlib compression
    # bit 2 - XOR obfuscated
    # bytes 12 - 15: offset to start of compressed data
    # bytes 16 - 19: length of XOR string
    # bytes 19 - 23: offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except struct.error:
        # Narrowed from a bare except: struct.error is what unpack_from
        # raises when the record is too short to hold the header
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))

        for n in range(extent):
            buf[n] ^= key[n%xor_len]  # XOR of buf and key

        font_data = bytes(buf)
        ans['encrypted'] = True

    if flags & 0b1:
        # ZLIB compressed data
        try:
            font_data = zlib.decompress(font_data)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

    if len(font_data) != usize:
        ans['err'] = 'Uncompressed font size mismatch'
        return ans

    ans['font_data'] = font_data
    sig = font_data[:4]
    ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
                    else 'otf' if sig == b'OTTO' else 'dat')

    return ans
|
|
|
|
|
|
def write_font_record(data, obfuscate=True, compress=True):
    '''
    Write the ttf/otf font represented by data into a font record. See
    read_font_record() for details on the format of the record.
    '''
    usize = len(data)
    flags = 0
    xor_key = b''
    key_len = 20

    if compress:
        flags |= 0b1
        data = zlib.compress(data, 9)

    # Only obfuscate records large enough to cover the full 1040-byte extent
    if obfuscate and len(data) >= 1040:
        flags |= 0b10
        xor_key = os.urandom(key_len)
        key = bytearray(xor_key)
        scrambled = bytearray(data)
        for i in range(1040):
            scrambled[i] ^= key[i % key_len]
        data = bytes(scrambled)

    key_start = struct.calcsize(b'>5L') + 4
    data_start = key_start + len(xor_key)

    header = b'FONT' + struct.pack(b'>5L', usize, flags, data_start,
            len(xor_key), key_start)

    return header + xor_key + data
|
|
|
|
# }}}
|
|
|
|
|
|
def create_text_record(text):
    '''
    Return a Palmdoc record of size RECORD_SIZE from the text file object.
    In case the record ends in the middle of a multibyte character return
    the overlap as well.

    :param text: a seekable binary file-like object positioned at the start
        of the record to extract; on return it is positioned at npos

    Returns data, overlap: where both are byte strings. overlap is the
    extra bytes needed to complete the truncated multibyte character.
    '''
    opos = text.tell()
    text.seek(0, 2)  # seek to EOF so tell() gives the total size
    # npos is the position of the next record
    npos = min((opos + RECORD_SIZE, text.tell()))
    # Number of bytes from the next record needed to complete the last
    # character in this record
    extra = 0

    last = b''
    # Grow a window backwards from npos one byte at a time until it contains
    # at least one complete utf-8 character
    while not last.decode('utf-8', 'ignore'):
        # last contains no valid utf-8 characters
        size = len(last) + 1
        text.seek(npos - size)
        last = text.read(size)

    # last now has one valid utf-8 char and possibly some bytes that belong
    # to a truncated char

    try:
        last.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        # There are some truncated bytes in last; read one more byte per
        # iteration (past npos) until the window decodes cleanly
        prev = len(last)
        while True:
            text.seek(npos - prev)
            last = text.read(len(last) + 1)
            try:
                last.decode('utf-8')
            except UnicodeDecodeError:
                pass
            else:
                break
        # Bytes beyond npos that were needed to finish the truncated char
        extra = len(last) - prev

    text.seek(opos)
    data = text.read(RECORD_SIZE)
    overlap = text.read(extra)
    text.seek(npos)

    return data, overlap
|
|
|
|
|
|
class CNCX(object):  # {{{

    '''
    Create the CNCX records. These are records containing all the strings from
    an index. Each record is of the form: <vwi string size><utf-8 encoded
    string>
    '''

    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        self.strings = OrderedDict((s, 0) for s in strings)

        self.records = []
        # kindlegen appears to use 1024 bytes of headroom, PDB limit is 0x10000
        record_limit = 0x10000 - 1024
        offset = 0
        buf = BytesIO()
        for key in self.strings:
            encoded = utf8_text(key[:self.MAX_STRING_LENGTH])
            entry = encint(len(encoded)) + encoded
            if buf.tell() + len(entry) > record_limit:
                # Current record is full: pad it out and start a new one
                self.records.append(align_block(buf.getvalue()))
                buf.seek(0), buf.truncate(0)
                # Offsets are record_index * 0x10000 + position in record
                offset = len(self.records) * 0x10000
            buf.write(entry)
            self.strings[key] = offset
            offset += len(entry)

        tail = buf.getvalue()
        if tail:
            self.records.append(align_block(tail))

    def __getitem__(self, string):
        return self.strings[string]

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def __len__(self):
        return len(self.records)
|
|
|
|
# }}}
|
|
|
|
|
|
def is_guide_ref_start(ref):
    '''
    Return a truthy value if the guide reference ref marks the start-reading
    position (title "start", or a type of start/other.start/text).
    '''
    if ref.title.lower() == 'start':
        return True
    return ref.type and ref.type.lower() in {'start', 'other.start', 'text'}
|
|
|
|
|
|
def convert_color_for_font_tag(val):
    '''
    Convert a CSS color value into a #rrggbb string suitable for the color
    attribute of a <font> tag. Returns val unchanged if it cannot be parsed
    or is the currentColor keyword.
    '''
    rgba = parse_color_string(unicode_type(val or ''))
    if rgba is None or rgba == 'currentColor':
        return val

    def clamp(x):
        # Clamp each channel into [0, 1]. The previous expression
        # min(x, max(0, x), 1) let negative values through unchanged,
        # which produced invalid '%02x' output for negative channels.
        return max(0, min(x, 1))

    rgb = map(clamp, rgba[:3])
    return '#' + ''.join(map(lambda x:'%02x' % int(x * 255), rgb))
|