1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-17 11:43:30 +02:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
class MobiError(Exception):
    """Error raised when MOBI data cannot be processed.

    Within this package it is raised for invalid HUFF/CDIC headers and for
    unknown book type identifiers.
    """
# Upper limits for thumbnail images. These might be a bit small on the PW, but
# Amazon/KG 2.5 still uses these values, even when delivered to a PW.
# NOTE(review): presumably MAX_THUMB_SIZE is in bytes and MAX_THUMB_DIMEN is
# (width, height) in pixels -- confirm against the code that consumes them.
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)

View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
and igorsk.
'''
import struct
from calibre.ebooks.mobi import MobiError
from polyglot.builtins import map
class Reader(object):
    """Decoder for MOBI text compressed with the Huff/cdic scheme.

    Usage: call :meth:`load_huff` once with the HUFF record, then
    :meth:`load_cdic` once per CDIC record, then :meth:`unpack` for every
    compressed text section.
    """

    def __init__(self):
        # Pre-bound reader for a single big-endian unsigned 64 bit integer.
        self.q = struct.Struct(b'>Q').unpack_from

    def load_huff(self, huff):
        """Load the code length tables from a HUFF record.

        Raises MobiError if the record does not start with the expected
        HUFF header.
        """
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # Low 5 bits: code length, bit 7: terminal flag, rest: max code.
            codelen = v & 0x1f
            term = v & 0x80
            assert codelen != 0
            if codelen <= 8:
                assert term
            maxcode = (((v >> 8) + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)

        # Lookup table indexed by the top 8 bits of the 32 bit code window.
        self.dict1 = tuple(
            dict1_unpack(v)
            for v in struct.unpack_from(b'>256L', huff, off1))

        # dict2 holds interleaved (mincode, maxcode) pairs for code lengths
        # 1..32; index 0 is padded so the tuples can be indexed by length.
        dict2 = struct.unpack_from(b'>64L', huff, off2)
        lower_bounds = (0,) + dict2[0::2]
        upper_bounds = (0,) + dict2[1::2]
        self.mincode = tuple(
            low << (32 - codelen)
            for codelen, low in enumerate(lower_bounds))
        self.maxcode = tuple(
            ((high + 1) << (32 - codelen)) - 1
            for codelen, high in enumerate(upper_bounds))

        self.dictionary = []

    def load_cdic(self, cdic):
        """Append the phrases of one CDIC record to the dictionary.

        Each dictionary entry is a ``(bytes, flag)`` pair; a zero flag means
        the bytes are themselves compressed (see :meth:`unpack`).
        Raises MobiError if the CDIC header is invalid.
        """
        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
            raise MobiError('Invalid CDIC header')
        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
        n = min(1 << bits, phrases - len(self.dictionary))
        read_u16 = struct.Struct(b'>H').unpack_from
        for off in struct.unpack_from(b'>%dH' % n, cdic, 16):
            # Offsets are relative to the end of the 16 byte header.
            blen, = read_u16(cdic, 16 + off)
            phrase = cdic[18 + off:18 + off + (blen & 0x7fff)]
            self.dictionary.append((phrase, blen & 0x8000))

    def unpack(self, data):
        """Return the decompressed bytes for the compressed ``data``."""
        read_u64 = self.q

        bitsleft = len(data) * 8
        # Pad so the 64 bit window can always be read near the end.
        data += b'\x00\x00\x00\x00\x00\x00\x00\x00'
        pos = 0
        window, = read_u64(data, pos)
        n = 32

        pieces = []
        while True:
            if n <= 0:
                # Slide the 64 bit window forward by 32 bits.
                pos += 4
                window, = read_u64(data, pos)
                n += 32
            code = (window >> n) & ((1 << 32) - 1)

            codelen, term, maxcode = self.dict1[code >> 24]
            if not term:
                # Non terminal entry: search for the real code length.
                while code < self.mincode[codelen]:
                    codelen += 1
                maxcode = self.maxcode[codelen]

            n -= codelen
            bitsleft -= codelen
            if bitsleft < 0:
                break

            index = (maxcode - code) >> (32 - codelen)
            phrase, flag = self.dictionary[index]
            if not flag:
                # The phrase is itself compressed: expand it once and cache
                # the result. The slot is cleared while recursing.
                self.dictionary[index] = None
                phrase = self.unpack(phrase)
                self.dictionary[index] = (phrase, 1)
            pieces.append(phrase)
        return b''.join(pieces)
class HuffReader(object):
    """Convenience wrapper that builds a fully loaded :class:`Reader`.

    ``huffs`` is a sequence whose first item is the HUFF record and whose
    remaining items are the CDIC records.
    """

    def __init__(self, huffs):
        reader = Reader()
        reader.load_huff(huffs[0])
        for cdic_record in huffs[1:]:
            reader.load_cdic(cdic_record)
        self.reader = reader

    def unpack(self, section):
        """Decompress one text section using the loaded tables."""
        return self.reader.unpack(section)

View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from struct import pack
from calibre.utils.localization import lang_as_iso639_1
# Empty mapping; it is not populated or read anywhere in the visible source.
lang_codes = {
}
# Names of primary languages keyed by integer language id. BookHeader looks
# ids up here using the low byte of the MOBI header language code
# (``langcode & 0xFF``) and falls back to 'ENGLISH' for unknown ids.
# CROATIAN is commented out because it shares id 26 with SERBIAN.
main_language = {
0 : "NEUTRAL",
54 : "AFRIKAANS",
28 : "ALBANIAN",
1 : "ARABIC",
43 : "ARMENIAN",
77 : "ASSAMESE",
44 : "AZERI",
45 : "BASQUE",
35 : "BELARUSIAN",
69 : "BENGALI",
2 : "BULGARIAN",
3 : "CATALAN",
4 : "CHINESE",
# 26 : "CROATIAN",
5 : "CZECH",
6 : "DANISH",
19 : "DUTCH",
9 : "ENGLISH",
37 : "ESTONIAN",
56 : "FAEROESE",
41 : "FARSI",
11 : "FINNISH",
12 : "FRENCH",
55 : "GEORGIAN",
7 : "GERMAN",
8 : "GREEK",
71 : "GUJARATI",
13 : "HEBREW",
57 : "HINDI",
14 : "HUNGARIAN",
15 : "ICELANDIC",
33 : "INDONESIAN",
16 : "ITALIAN",
17 : "JAPANESE",
75 : "KANNADA",
63 : "KAZAK",
87 : "KONKANI",
18 : "KOREAN",
38 : "LATVIAN",
39 : "LITHUANIAN",
47 : "MACEDONIAN",
62 : "MALAY",
76 : "MALAYALAM",
58 : "MALTESE",
78 : "MARATHI",
97 : "NEPALI",
20 : "NORWEGIAN",
72 : "ORIYA",
21 : "POLISH",
22 : "PORTUGUESE",
70 : "PUNJABI",
23 : "RHAETOROMANIC",
24 : "ROMANIAN",
25 : "RUSSIAN",
59 : "SAMI",
79 : "SANSKRIT",
26 : "SERBIAN",
27 : "SLOVAK",
36 : "SLOVENIAN",
46 : "SORBIAN",
10 : "SPANISH",
48 : "SUTU",
65 : "SWAHILI",
29 : "SWEDISH",
73 : "TAMIL",
68 : "TATAR",
74 : "TELUGU",
30 : "THAI",
49 : "TSONGA",
50 : "TSWANA",
31 : "TURKISH",
34 : "UKRAINIAN",
32 : "URDU",
67 : "UZBEK",
42 : "VIETNAMESE",
52 : "XHOSA",
53 : "ZULU",
}
# Names of sub-languages (regional variants) keyed by integer sub-language
# id. BookHeader looks ids up here using ``(langcode >> 10) & 0xFF`` and falls
# back to 'NEUTRAL' for unknown ids.
# The ids are only unique within one primary language, so a flat dict can
# hold just one name per id: the commented out entries below reuse ids that
# are claimed by another (active) entry.
sub_language = {
0 : "NEUTRAL",
# 1 : "ARABIC_SAUDI_ARABIA",
# 2 : "ARABIC_IRAQ",
# 3 : "ARABIC_EGYPT",
# 4 : "ARABIC_LIBYA",
# 5 : "ARABIC_ALGERIA",
# 6 : "ARABIC_MOROCCO",
# 7 : "ARABIC_TUNISIA",
# 8 : "ARABIC_OMAN",
# 9 : "ARABIC_YEMEN",
# 10 : "ARABIC_SYRIA",
# 11 : "ARABIC_JORDAN",
# 12 : "ARABIC_LEBANON",
# 13 : "ARABIC_KUWAIT",
# 14 : "ARABIC_UAE",
# 15 : "ARABIC_BAHRAIN",
# 16 : "ARABIC_QATAR",
# 1 : "AZERI_LATIN",
# 2 : "AZERI_CYRILLIC",
# 1 : "CHINESE_TRADITIONAL",
# 2 : "CHINESE_SIMPLIFIED",
# 3 : "CHINESE_HONGKONG",
# 4 : "CHINESE_SINGAPORE",
# 1 : "DUTCH",
# 2 : "DUTCH_BELGIAN",
# 1 : "FRENCH",
# 2 : "FRENCH_BELGIAN",
# 3 : "FRENCH_CANADIAN",
# 4 : "FRENCH_SWISS",
# 5 : "FRENCH_LUXEMBOURG",
# 6 : "FRENCH_MONACO",
# 1 : "GERMAN",
# 2 : "GERMAN_SWISS",
# 3 : "GERMAN_AUSTRIAN",
# 4 : "GERMAN_LUXEMBOURG",
# 5 : "GERMAN_LIECHTENSTEIN",
# 1 : "ITALIAN",
# 2 : "ITALIAN_SWISS",
# 1 : "KOREAN",
# 1 : "LITHUANIAN",
# 1 : "MALAY_MALAYSIA",
# 2 : "MALAY_BRUNEI_DARUSSALAM",
# 1 : "NORWEGIAN_BOKMAL",
# 2 : "NORWEGIAN_NYNORSK",
# 2 : "PORTUGUESE",
# 1 : "PORTUGUESE_BRAZILIAN",
# 2 : "SERBIAN_LATIN",
3 : "SERBIAN_CYRILLIC",
# 1 : "SPANISH",
# 2 : "SPANISH_MEXICAN",
4 : "SPANISH_GUATEMALA",
5 : "SPANISH_COSTA_RICA",
6 : "SPANISH_PANAMA",
7 : "SPANISH_DOMINICAN_REPUBLIC",
8 : "SPANISH_VENEZUELA",
9 : "SPANISH_COLOMBIA",
10 : "SPANISH_PERU",
11 : "SPANISH_ARGENTINA",
12 : "SPANISH_ECUADOR",
13 : "SPANISH_CHILE",
14 : "SPANISH_URUGUAY",
15 : "SPANISH_PARAGUAY",
16 : "SPANISH_BOLIVIA",
17 : "SPANISH_EL_SALVADOR",
18 : "SPANISH_HONDURAS",
19 : "SPANISH_NICARAGUA",
20 : "SPANISH_PUERTO_RICO",
# 1 : "SWEDISH",
# 2 : "SWEDISH_FINLAND",
1 : "UZBEK_LATIN",
2 : "UZBEK_CYRILLIC",
}
# Mapping used by iana2mobi() and mobi2iana():
#   language tag (or None) -> {region subtag (or None) -> (main id, sub id)}
# The None region holds the default code for the language. iana2mobi() packs
# the pair as two single bytes (sub id first, then main id).
# NOTE(review): every non-zero sub id here is a multiple of 4, i.e. it looks
# pre-shifted by two bits compared with the ``(langcode >> 10)`` value that
# BookHeader passes to mobi2iana() -- confirm which convention is intended.
IANA_MOBI = \
{None: {None: (0, 0)},
'af': {None: (54, 0)},
'ar': {None: (1, 0),
'AE': (1, 56),
'BH': (1, 60),
'DZ': (1, 20),
'EG': (1, 12),
'JO': (1, 44),
'KW': (1, 52),
'LB': (1, 48),
'MA': (1, 24),
'OM': (1, 32),
'QA': (1, 64),
'SA': (1, 4),
'SY': (1, 40),
'TN': (1, 28),
'YE': (1, 36)},
'as': {None: (77, 0)},
'az': {None: (44, 0)},
'be': {None: (35, 0)},
'bg': {None: (2, 0)},
'bn': {None: (69, 0)},
'ca': {None: (3, 0)},
'cs': {None: (5, 0)},
'da': {None: (6, 0)},
'de': {None: (7, 0),
'AT': (7, 12),
'CH': (7, 8),
'LI': (7, 20),
'LU': (7, 16)},
'el': {None: (8, 0)},
'en': {None: (9, 0),
'AU': (9, 12),
'BZ': (9, 40),
'CA': (9, 16),
'GB': (9, 8),
'IE': (9, 24),
'JM': (9, 32),
'NZ': (9, 20),
'PH': (9, 52),
'TT': (9, 44),
'US': (9, 4),
'ZA': (9, 28),
'ZW': (9, 48)},
'es': {None: (10, 0),
'AR': (10, 44),
'BO': (10, 64),
'CL': (10, 52),
'CO': (10, 36),
'CR': (10, 20),
'DO': (10, 28),
'EC': (10, 48),
'ES': (10, 4),
'GT': (10, 16),
'HN': (10, 72),
'MX': (10, 8),
'NI': (10, 76),
'PA': (10, 24),
'PE': (10, 40),
'PR': (10, 80),
'PY': (10, 60),
'SV': (10, 68),
'UY': (10, 56),
'VE': (10, 32)},
'et': {None: (37, 0)},
'eu': {None: (45, 0)},
'fa': {None: (41, 0)},
'fi': {None: (11, 0)},
'fo': {None: (56, 0)},
'fr': {None: (12, 0),
'BE': (12, 8),
'CA': (12, 12),
'CH': (12, 16),
'FR': (12, 4),
'LU': (12, 20),
'MC': (12, 24)},
'gu': {None: (71, 0)},
'he': {None: (13, 0)},
'hi': {None: (57, 0)},
'hr': {None: (26, 0)},
'hu': {None: (14, 0)},
'hy': {None: (43, 0)},
'id': {None: (33, 0)},
'is': {None: (15, 0)},
'it': {None: (16, 0),
'CH': (16, 8),
'IT': (16, 4)},
'ja': {None: (17, 0)},
'ka': {None: (55, 0)},
'kk': {None: (63, 0)},
'kn': {None: (75, 0)},
'ko': {None: (18, 0)},
'kok': {None: (87, 0)},
'lt': {None: (39, 0)},
'lv': {None: (38, 0)},
'mk': {None: (47, 0)},
'ml': {None: (76, 0)},
'mr': {None: (78, 0)},
'ms': {None: (62, 0)},
'mt': {None: (58, 0)},
'ne': {None: (97, 0)},
'nl': {None: (19, 0),
'BE': (19, 8)},
'no': {None: (20, 0)},
'or': {None: (72, 0)},
'pa': {None: (70, 0)},
'pl': {None: (21, 0)},
'pt': {None: (22, 0),
'BR': (22, 4),
'PT': (22, 8)},
'rm': {None: (23, 0)},
'ro': {None: (24, 0)},
'ru': {None: (25, 0)},
'sa': {None: (79, 0)},
'se': {None: (59, 0)},
'sk': {None: (27, 0)},
'sl': {None: (36, 0)},
'sq': {None: (28, 0)},
'sr': {None: (26, 12),
'RS': (26, 12)},
'st': {None: (48, 0)},
'sv': {None: (29, 0),
'FI': (29, 8)},
'sw': {None: (65, 0)},
'ta': {None: (73, 0)},
'te': {None: (74, 0)},
'th': {None: (30, 0)},
'tn': {None: (50, 0)},
'tr': {None: (31, 0)},
'ts': {None: (49, 0)},
'tt': {None: (68, 0)},
'uk': {None: (34, 0)},
'ur': {None: (32, 0)},
'uz': {None: (67, 0),
'UZ': (67, 8)},
'vi': {None: (42, 0)},
'wen': {None: (46, 0)},
'xh': {None: (52, 0)},
'zh': {None: (4, 0),
'CN': (4, 8),
'HK': (4, 12),
'SG': (4, 16),
'TW': (4, 4)},
'zu': {None: (53, 0)}}
def iana2mobi(icode):
    """Convert an IANA style language tag into a packed MOBI language code.

    ``icode`` is a '-' separated tag such as ``'en-GB'`` (or a false value).
    The first subtag that resolves (via ``lang_as_iso639_1``) to a language
    present in ``IANA_MOBI`` selects the language; the first following subtag
    that names a known region (tried as given, title-cased, then upper-cased)
    selects the region. Unknown input yields the neutral code.

    Returns four bytes: ``pack('>HBB', 0, sub_id, main_id)``.
    """
    region_table = IANA_MOBI[None]
    pieces = icode.split('-') if icode else []

    # Find the language; ``next_idx`` ends up just past the consumed subtags.
    next_idx = 0
    while next_idx < len(pieces):
        candidate = lang_as_iso639_1(pieces[next_idx].lower())
        next_idx += 1
        if candidate and candidate in IANA_MOBI:
            region_table = IANA_MOBI[candidate]
            break

    mcode = region_table[None]
    found = False
    for raw_subtag in pieces[next_idx:]:
        titled = raw_subtag.title()
        for variant in (raw_subtag, titled, titled.upper()):
            if variant in region_table:
                mcode = region_table[variant]
                found = True
                break
        if found:
            break
    return pack('>HBB', 0, mcode[1], mcode[0])
def mobi2iana(langcode, sublangcode):
    """Convert MOBI (language id, sub-language id) into an IANA style tag.

    An entry of ``IANA_MOBI`` that matches both ids wins, giving
    ``'<lang>-<region>'`` (region lower-cased) or just ``'<lang>'`` when the
    match is the language's default entry. If only the language id is known,
    the first language with that id is returned without a region. Returns
    ``'und'`` when the language id is unknown.

    The region is only ever taken from an entry whose language id also
    matches; previously a sub id found under an unrelated language could be
    appended to the result.
    """
    fallback = None
    for code, regions in IANA_MOBI.items():
        if code is None:
            # The neutral entry never produces a tag.
            continue
        for subcode, (cc, cl) in regions.items():
            if cc != langcode:
                continue
            if cl == sublangcode:
                if subcode is None:
                    return code
                return code + '-' + subcode.lower()
            if fallback is None:
                # Remember the first language with this id in case no
                # entry matches the sub id as well.
                fallback = code
    return 'und' if fallback is None else fallback

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from struct import unpack_from, error
from calibre.utils.imghdr import what
def find_imgtype(data):
    """Return the image type reported by ``what`` for ``data``.

    Falls back to the string ``'unknown'`` when ``what`` returns a false
    value.
    """
    detected = what(None, data)
    if detected:
        return detected
    return 'unknown'
class Container(object):
    """State for loading resources out of a container record.

    ``is_image_container`` is True when the record's EXTH block has a record
    with id 539 whose payload is ``b'application/image'``.
    ``resource_index`` counts the calls made to :meth:`load_image`.
    """

    def __init__(self, data):
        self.is_image_container = False
        self.resource_index = 0
        has_exth = len(data) > 60 and data[48:52] == b'EXTH'
        if not has_exth:
            return
        length, num_items = unpack_from(b'>LL', data, 52)
        limit = 60 + length - 8
        pos = 60
        while pos < limit:
            try:
                idx, size = unpack_from(b'>LL', data, pos)
            except error:
                # Truncated record header.
                break
            # ``size`` includes the 8 byte record header itself.
            pos += 8
            size -= 8
            if size < 0:
                break
            if idx == 539:
                payload = data[pos:pos+size]
                self.is_image_container = payload == b'application/image'
                break
            pos += size

    def load_image(self, data):
        """Return ``(image_bytes, image_type)`` or ``(None, None)``.

        For image containers the first 12 bytes of ``data`` are skipped
        before the image type is detected.
        """
        self.resource_index += 1
        if self.is_image_container:
            data = data[12:]
            imgtype = find_imgtype(data)
            if imgtype != 'unknown':
                return data, imgtype
        return None, None

View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from calibre import replace_entities
from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.localization import canonicalize_lang
from calibre.utils.config_base import tweaks
from polyglot.builtins import unicode_type
NULL_INDEX = 0xffffffff
def uniq(vals):
    ''' Return a list of the items of vals without duplicates, keeping the
    order of first occurrence. A false vals (e.g. None) gives []. '''
    seen = set()
    ordered = []
    for item in vals or ():
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered
class EXTHHeader(object):  # {{{
    '''
    Parser for the EXTH metadata block of a MOBI header.

    The parsed metadata is collected in ``self.mi`` (a MetaInformation
    object). A few values are stored directly on the instance:
    ``has_fake_cover``, ``start_offset``, ``kf8_header``, ``uuid``,
    ``cdetype``, ``page_progression_direction`` and ``primary_writing_mode``.
    ``cover_offset`` and ``thumbnail_offset`` are only set when the
    corresponding records (201/202) are present.
    '''

    def __init__(self, raw, codec, title):
        '''
        :param raw: bytes starting at the EXTH block: 4 bytes of doctype,
            then the block length and the number of records.
        :param codec: text encoding used to decode string records.
        :param title: fallback title, used unless a record 503 (long title)
            can be decoded.
        '''
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            # Each record: 4 byte id, 4 byte size (header included), payload.
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except Exception:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except Exception:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        '''
        Store the record ``idx`` (100 <= idx < 200) with payload ``content``
        on ``self.mi`` (or on ``self`` for offsets and ids). Unknown record
        ids are ignored.
        '''
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = uniq(self.mi.tags)
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
            except Exception:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except Exception:
                # Best effort only: a malformed source record is ignored.
                # (Not a bare except, so KeyboardInterrupt/SystemExit still
                # propagate.)
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except Exception:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
        # else:
        #    print 'unhandled metadata record', idx, repr(content)
# }}}
class BookHeader(object):
    '''
    Parser for the first record (PalmDOC + MOBI header) of a MOBI file.

    Headers of 16 bytes or less are treated as "ancient": defaults are
    used and no MOBI specific fields are read. Otherwise the MOBI header
    fields (codec, title, language, EXTH metadata, index record numbers)
    are decoded from fixed offsets in ``raw``.
    '''

    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        '''
        :param raw: bytes of the header record.
        :param ident: book type identifier, e.g. ``b'BOOKMOBI'`` or
            ``b'TEXTREAD'``.
        :param user_encoding: codec to assume when the header codepage is
            not recognised (cp1252 is used if this is false).
        :param log: logger; ``warn()`` and ``exception()`` are used.
        :param try_extra_data_fix: when true, a header length of 0xE4 is
            treated as having no extra data flags.
        '''
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == b'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                }[self.codepage]
            except (IndexError, KeyError):
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            # Some KF8 files have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 for future versions of kindlegen

            if (ident == b'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == b'DH':
                self.huff_offset, self.huff_number = struct.unpack('>LL',
                        raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode_type):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                            self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    # A broken EXTH block is logged and then ignored rather
                    # than failing the whole header. Not a bare except, so
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = \
                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else:  # Null values
                self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                        NULL_INDEX
class MetadataHeader(BookHeader):
    '''
    A BookHeader that reads itself directly from a seekable binary stream.

    If the file has fewer than two sections the header is not parsed and
    only ``exth`` (None), ``ident`` and ``num_sections`` are set.
    '''

    def __init__(self, stream, log):
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()

        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            self.exth = None

    @property
    def kf8_type(self):
        '''
        ``'standalone'`` for a pure KF8 book, ``'joint'`` when the section
        before the EXTH ``kf8_header`` index is a BOUNDARY marker, otherwise
        None.
        '''
        # mobi_version is missing when the header was never parsed (fewer
        # than two sections), so it is read defensively like skelidx.
        if (getattr(self, 'mobi_version', None) == 8 and
                getattr(self, 'skelidx', NULL_INDEX) != NULL_INDEX):
            return 'standalone'

        kf8_header_index = getattr(self.exth, 'kf8_header', None)
        if kf8_header_index is None:
            return None
        try:
            if self.section_data(kf8_header_index-1) == b'BOUNDARY':
                return 'joint'
        except Exception:
            pass
        return None

    def identity(self):
        '''Read and validate the 8 byte book type at offset 60.

        Raises MobiError for anything but BOOKMOBI or TEXTREAD.'''
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        '''Number of sections, read as a 16 bit value at offset 76.'''
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        '''File offset of section ``number``, read from its 8 byte entry in
        the section table that starts at offset 78.'''
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        '''Bytes of the first section (which holds the book header).'''
        # The start of the second section gives the end of the first.
        off = self.section_offset(0)
        end_off = self.section_offset(1)
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def _stream_size(self):
        '''Total size of the stream in bytes.'''
        try:
            return os.stat(self.stream.name).st_size
        except (AttributeError, TypeError, ValueError, OSError):
            # No usable file name (e.g. an in-memory stream): measure the
            # stream itself instead. Callers seek before reading.
            self.stream.seek(0, os.SEEK_END)
            return self.stream.tell()

    def section_data(self, number):
        '''Bytes of section ``number``; the last section extends to the end
        of the stream.'''
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            end = self._stream_size()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        try:
            return self.stream.read(end - start)
        except OverflowError:
            self.stream.seek(start)
            return self.stream.read()

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict, namedtuple
from calibre.ebooks.mobi.utils import (decint, count_set_bits,
decode_string)
from polyglot.builtins import iteritems, range, zip
# One entry of a TAGX section (four single byte fields).
TagX = namedtuple('TagX', ['tag', 'num_of_values', 'bitmask', 'eof'])
# A TagX resolved against an entry's control bytes; exactly one of
# value_count / value_bytes is set by get_tag_map().
PTagX = namedtuple(
    'PTagX', ['tag', 'value_count', 'value_bytes', 'num_of_values'])

# Names of the consecutive 32 bit fields that follow the 4 byte signature of
# an INDX header.
_KNOWN_LEADING_FIELDS = (
    'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
    'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx',
)
_KNOWN_TRAILING_FIELDS = ('ocnt', 'oentries', 'ordt1', 'ordt2', 'tagx')
INDEX_HEADER_FIELDS = (
    _KNOWN_LEADING_FIELDS +
    tuple('unknown%d' % i for i in range(27)) +
    _KNOWN_TRAILING_FIELDS
)
class InvalidFile(ValueError):
    """Raised when a section does not start with the expected signature."""
def check_signature(data, signature):
    """Raise InvalidFile unless ``data`` starts with ``signature``."""
    prefix = data[:len(signature)]
    if prefix == signature:
        return
    raise InvalidFile('Not a valid %r section'%signature)
class NotAnINDXRecord(InvalidFile):
    """InvalidFile subclass for data that is not an INDX record."""
class NotATAGXSection(InvalidFile):
    """InvalidFile subclass for data that is not a TAGX section."""
def format_bytes(byts):
    """Return the bytes as space separated lowercase hex numbers.

    Values are not zero padded, e.g. ``b'\\x0a\\xff'`` gives ``'a ff'``.
    """
    return ' '.join('%x' % value for value in bytearray(byts))
def parse_indx_header(data):
    """Parse the header of an INDX record into a dict.

    The dict has one integer entry per name in ``INDEX_HEADER_FIELDS`` plus:

    * ``ordt1_raw`` / ``ordt2_raw``: raw contents of the two ORDT tables
      (empty lists when a table is absent).
    * ``ordt_map``: a string with one character per ORDT entry, only filled
      in when the second ORDT table is present.

    Raises InvalidFile if ``data`` does not start with ``b'INDX'``.
    """
    check_signature(data, b'INDX')
    field_count = len(INDEX_HEADER_FIELDS)
    raw_values = struct.unpack(
        '>%dL' % field_count, data[4:4*(field_count+1)])
    ans = dict(zip(INDEX_HEADER_FIELDS, raw_values))
    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
    ans['ordt_map'] = ''
    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
    oentries = ans['oentries']

    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
        # I dont know what this is, but using it seems to be unnecessary, so
        # just leave it as the raw bytestring
        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+oentries]

    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*oentries])
        if ans['code'] != 65002:
            ans['ordt_map'] = '?'*oentries
        else:
            # This appears to be EBCDIC-UTF (65002) encoded. I can't be
            # bothered to write a decoder for this (see
            # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
            # Instead, we use a weird hack that seems to do the trick for all
            # the books with this type of ORDT record that I have come across.
            # Some EBSP book samples in KF8 format from Amazon have this type
            # of encoding.
            # Basically we try to interpret every second byte as a printable
            # ascii character. If we cannot, we map to the ? char.
            parsed = bytearray(oentries)
            for slot in range(oentries):
                low_byte = raw[2*slot + 1]
                if 0x20 < low_byte < 0x7f:
                    parsed[slot] = low_byte
                else:
                    parsed[slot] = ord(b'?')
            ans['ordt_map'] = bytes(parsed).decode('ascii')

    return ans
class CNCX(object):  # {{{

    '''
    Parses the records that contain the compiled NCX (all strings from the
    NCX). Presents a simple offset : string mapping interface to access the
    data.

    Every record contributes offsets starting at a multiple of 0x10000
    (record number * 0x10000 + position inside the record).
    '''

    def __init__(self, records, codec):
        '''
        :param records: iterable of raw CNCX records (bytes).
        :param codec: text encoding of the strings.
        '''
        self.records = OrderedDict()
        record_offset = 0
        for raw in records:
            pos = 0
            while pos < len(raw):
                # Each string is prefixed with its variable width length.
                length, consumed = decint(raw[pos:])
                if length > 0:
                    try:
                        self.records[pos+record_offset] = raw[
                            pos+consumed:pos+consumed+length].decode(codec)
                    except Exception:
                        # Undecodable entry: store a hex dump of the rest of
                        # the record and stop parsing this record.
                        byts = raw[pos:]
                        r = format_bytes(byts)
                        print('CNCX entry at offset %d has unknown format %s'%(
                            pos+record_offset, r))
                        self.records[pos+record_offset] = r
                        pos = len(raw)
                pos += consumed+length
            record_offset += 0x10000

    def __getitem__(self, offset):
        # Unlike a dict, a missing offset gives None instead of KeyError.
        return self.records.get(offset)

    def get(self, offset, default=None):
        return self.records.get(offset, default)

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def iteritems(self):
        return iteritems(self.records)

    def items(self):
        return iteritems(self.records)
# }}}
def parse_tagx_section(data):
    """Parse a TAGX section.

    Returns ``(control_byte_count, tags)`` where ``tags`` is a list of TagX
    tuples, one for every 4 bytes between offset 12 and the first entry
    offset stored in the section.

    Raises InvalidFile if ``data`` does not start with ``b'TAGX'``.
    """
    check_signature(data, b'TAGX')
    first_entry_offset, control_byte_count = struct.unpack_from(
        b'>LL', data, 4)
    tags = [
        TagX(*bytearray(data[start:start+4]))
        for start in range(12, first_entry_offset, 4)
    ]
    return control_byte_count, tags
def get_tag_map(control_byte_count, tagx, data, strict=False):
    """Decode the tag values of one index entry.

    :param control_byte_count: number of control bytes at the start of
        ``data``.
    :param tagx: TagX tuples, as returned by parse_tagx_section().
    :param data: bytes of the entry, after its identifier string.
    :param strict: if true raise ValueError on inconsistencies instead of
        printing a message.
    :return: dict mapping tag number to the list of decoded values.
    """
    ptags = []
    ans = {}
    control_bytes = list(bytearray(data[:control_byte_count]))
    data = data[control_byte_count:]

    # First pass: work out from the control bytes which tags are present
    # and how their values are counted.
    for x in tagx:
        if x.eof == 0x01:
            # End marker for the current control byte: move to the next one.
            control_bytes = control_bytes[1:]
            continue
        value = control_bytes[0] & x.bitmask
        if value == 0:
            continue
        value_count = value_bytes = None
        if value != x.bitmask:
            # Shift bits to get the masked value.
            mask = x.bitmask
            while mask & 0b1 == 0:
                mask >>= 1
                value >>= 1
            value_count = value
        elif count_set_bits(x.bitmask) > 1:
            # If all bits of masked value are set and the mask has more
            # than one bit, a variable width value will follow after
            # the control bytes which defines the length of bytes (NOT
            # the value count!) which will contain the corresponding
            # variable width values.
            value_bytes, consumed = decint(data)
            data = data[consumed:]
        else:
            value_count = 1
        ptags.append(PTagX(x.tag, value_count, value_bytes,
            x.num_of_values))

    # Second pass: read the values themselves.
    for x in ptags:
        values = []
        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
            for _unused in range(x.value_count * x.num_of_values):
                byts, consumed = decint(data)
                data = data[consumed:]
                values.append(byts)
        else:  # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
                byts, consumed = decint(data)
                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
            if total_consumed != x.value_bytes:
                err = ("Error: Should consume %s bytes, but consumed %s" %
                        (x.value_bytes, total_consumed))
                if strict:
                    raise ValueError(err)
                else:
                    print(err)
        ans[x.tag] = values

    # Test that all bytes have been processed
    if data.replace(b'\0', b''):
        err = ("Warning: There are unprocessed index bytes left: %s" %
                format_bytes(data))
        if strict:
            raise ValueError(err)
        else:
            print(err)

    return ans
def parse_index_record(table, data, control_byte_count, tags, codec,
        ordt_map, strict=False):
    """Parse one INDX data record and add its entries to ``table``.

    :param table: mapping that receives ``ident -> tag map`` for every entry.
    :param data: raw bytes of the INDX record.
    :param control_byte_count: as returned by parse_tagx_section().
    :param tags: TagX tuples, as returned by parse_tagx_section().
    :param codec: text encoding of the entry identifiers.
    :param ordt_map: ``ordt_map`` value of the main index header.
    :param strict: passed through to get_tag_map().
    :return: the parsed header dict of this record.
    """
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
        print('WARNING: Invalid INDX record')
    entry_count = header['count']

    # loop through to build up the IDXT position starts
    idx_positions = [
        struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))[0]
        for j in range(entry_count)
    ]
    # The last entry ends before the IDXT tag (but there might be zero fill
    # bytes we need to ignore!)
    idx_positions.append(idxt_pos)

    # For each entry in the IDXT build up the tag map and any associated
    # text
    for j in range(entry_count):
        start, end = idx_positions[j:j+2]
        rec = data[start:end]
        # Sometimes (in the guide table if the type attribute has non ascii
        # values) the ident is UTF-16 encoded. Try to handle that.
        try:
            ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
        except UnicodeDecodeError:
            ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map)
        if u'\x00' in ident:
            try:
                ident, consumed = decode_string(rec, codec='utf-16',
                        ordt_map=ordt_map)
            except UnicodeDecodeError:
                # Not UTF-16 after all: just drop the NUL characters. (This
                # used to search for the two characters 'u' + NUL because of
                # a misplaced string prefix, so NULs were never removed.)
                ident = ident.replace(u'\x00', u'')
        rec = rec[consumed:]
        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
        table[ident] = tag_map
    return header
def read_index(sections, idx, codec):
    """Read the index whose main INDX record is ``sections[idx]``.

    ``sections`` is a sequence of items whose first element is the raw
    section data. The ``count`` data records directly after the main record
    are parsed, and the ``ncncx`` records after those are loaded as CNCX
    string records.

    :return: ``(table, cncx)``: an OrderedDict of ``ident -> tag map`` and a
        CNCX object (empty if the index has no CNCX records).
    """
    table = OrderedDict()
    cncx = CNCX([], codec)

    main_record = sections[idx][0]
    indx_header = parse_indx_header(main_record)
    record_count = indx_header['count']
    cncx_count = indx_header['ncncx']

    if cncx_count > 0:
        first_cncx = idx + record_count + 1
        cncx_records = [
            section[0]
            for section in sections[first_cncx:first_cncx+cncx_count]]
        cncx = CNCX(cncx_records, codec)

    control_byte_count, tags = parse_tagx_section(
        main_record[indx_header['tagx']:])

    first_record = idx + 1
    for section_number in range(first_record, first_record + record_count):
        # Index record
        parse_index_record(
            table, sections[section_number][0], control_byte_count, tags,
            codec, indx_header['ordt_map'])
    return table, cncx

View File

@@ -0,0 +1,373 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os
from calibre.ebooks.chardet import strip_encoding_declarations
from polyglot.builtins import unicode_type, range
def update_internal_links(mobi8_reader, log):
    """Rewrite ``kindle:pos:fid`` links into ordinary file#id links.

    Reads ``mobi8_reader.parts`` (byte strings) and returns a new list with
    the same parts decoded to unicode using ``mobi8_reader.header.codec``.
    Links that cannot be resolved (ValueError) are replaced by ``"#"`` and a
    warning is logged.
    """
    # need to update all links that are internal which
    # are based on positions within the xhtml files **BEFORE**
    # cutting and pasting any pieces into the xhtml text files

    #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
    #       XXXX is the offset in records into divtbl
    #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

    mr = mobi8_reader

    # pos:fid pattern
    posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
    posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

    parts = []
    for part in mr.parts:
        # Odd indices of the split result are the captured <a ...> tags.
        srcpieces = posfid_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<'):
                for m in posfid_index_pattern.finditer(tag):
                    posfid = m.group(1)
                    offset = m.group(2)
                    try:
                        filename, idtag = mr.get_id_tag_by_pos_fid(
                            int(posfid, 32), int(offset, 32))
                    except ValueError:
                        log.warn('Invalid link, points to nowhere, ignoring')
                        replacement = b'#'
                    else:
                        suffix = (b'#' + idtag) if idtag else b''
                        replacement = filename.split('/')[-1].encode(
                                mr.header.codec) + suffix
                        replacement = replacement.replace(b'"', b'&quot;')
                    quoted = b'"' + replacement + b'"'
                    # Use a function as replacement so that backslashes in
                    # the file name or id are inserted literally instead of
                    # being interpreted as regular expression escapes.
                    tag = posfid_index_pattern.sub(
                        lambda _match, quoted=quoted: quoted, tag, 1)
                srcpieces[j] = tag
        raw = b''.join(srcpieces)
        try:
            parts.append(raw.decode(mr.header.codec))
        except UnicodeDecodeError:
            log.warn('Failed to decode text in KF8 part, replacing bad bytes')
            parts.append(raw.decode(mr.header.codec, 'replace'))

    # All parts are now unicode and have no internal links
    return parts
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
    """Strip ``aid``/``cid`` and ``data-AmznPageBreak`` attributes in place.

    An ``aid``/``cid`` attribute whose value is in *linked_aids* is turned
    into ``id="<aid>-<aid_anchor_suffix>"``; all others are removed.
    ``data-AmznPageBreak="X"`` becomes ``style="page-break-after:X"``.

    :param parts: list of unicode markup strings, modified in place.
    :param aid_anchor_suffix: text appended (after ``-``) to kept ids.
    :param linked_aids: container of aid values that must be kept as ids.
    """
    # we can safely remove all of the Kindlegen generated aid attributes and
    # calibre generated cid attributes
    find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\s[ac]id\s*=[^>]*>)''',
            re.IGNORECASE)
    within_tag_aid_position_pattern = re.compile(r'''\s[ac]id\s*=['"]([^'"]*)['"]''')

    def replace_aid(match):
        """Keep linked aids as ids, drop every other aid/cid attribute."""
        aid = match.group(1)
        if aid in linked_aids:
            return ' id="%s"' % (aid + '-' + aid_anchor_suffix)
        return ''

    for i, part in enumerate(parts):
        srcpieces = find_tag_with_aid_pattern.split(part)
        for j, piece in enumerate(srcpieces):
            if piece.startswith('<'):
                # Callable replacement: the id text is inserted literally,
                # so backslashes in an aid cannot be misread as template
                # escapes or group references.
                srcpieces[j] = within_tag_aid_position_pattern.sub(
                    replace_aid, piece)
        parts[i] = ''.join(srcpieces)

    # we can safely remove all of the Kindlegen generated data-AmznPageBreak
    # attributes
    find_tag_with_AmznPageBreak_pattern = re.compile(
        r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
    within_tag_AmznPageBreak_position_pattern = re.compile(
        r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    for i, part in enumerate(parts):
        srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
        for j, piece in enumerate(srcpieces):
            if piece.startswith('<'):
                srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                    lambda m: ' style="page-break-after:%s"' % m.group(1), piece)
        parts[i] = ''.join(srcpieces)
def update_flow_links(mobi8_reader, resource_map, log):
    """Decode the reader's flows and resolve the kindle: links inside them.

    Handles ``kindle:embed`` references in image tags and in CSS ``url()``
    expressions (images and fonts) as well as ``kindle:flow`` references,
    both inside ``url()`` and in ordinary tags.

    :param mobi8_reader: object providing ``flows``, ``flowinfo`` and
        ``header.codec``.
    :param resource_map: sequence mapping (1-based index - 1) to the href
        of an extracted resource, or a false value for unknown resources.
    :param log: logger providing ``warn()`` and ``error()``.
    :return: new list of flows; ``None`` entries are passed through and all
        other entries are unicode.
    """
    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    # kindle:embed:XXXX (used for fonts)

    mr = mobi8_reader
    flows = []

    img_pattern = re.compile(r'''(<[img\s|image\s|svg:image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)

    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
    url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
    font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
    url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

    def lookup_resource(match):
        """Return ``(label, href)`` for an embed match; href is None if unresolvable."""
        ref = match.group(1)
        try:
            num = int(ref, 32)
        except ValueError:
            # The index character class also admits '|', which is not base 32
            return ref, None
        if num < 1:
            # Indices are 1-based: 0 must not wrap around to resource_map[-1]
            return num, None
        try:
            return num, resource_map[num - 1]
        except IndexError:
            return num, None

    # Each replacer below resolves exactly the reference it is called for
    # and returns the match unchanged on failure, so an unresolved
    # reference can never shift the replacement of a later one.
    def image_replacer(context):
        def repl(match):
            num, href = lookup_resource(match)
            if href:
                return '"%s"' % ('../' + href)
            log.warn('Referenced image %s was not recognized '
                     'as a valid image in %s' % (num, context))
            return match.group()
        return repl

    def font_replacer(context):
        def repl(match):
            num, href = lookup_resource(match)
            if href is None:
                log.warn('Referenced font %s was not recognized as a '
                         'valid font in %s' % (num, context))
                return match.group()
            if href.endswith('.failed'):
                return '"%s"' % ('failed-' + href)
            return '"%s"' % ('../' + href)
        return repl

    def css_replacer(context):
        def repl(match):
            try:
                fi = mr.flowinfo[int(match.group(1), 32)]
            except (IndexError, ValueError):
                log.warn('Ignoring invalid flow reference in tag', context)
                return match.group()
            return '"../' + fi.dir + '/' + fi.fname + '"'
        return repl

    for flow in mr.flows:
        if flow is None:  # 0th flow is None
            flows.append(flow)
            continue

        if not isinstance(flow, unicode_type):
            try:
                flow = flow.decode(mr.header.codec)
            except UnicodeDecodeError:
                log.error('Flow part has invalid %s encoded bytes'%mr.header.codec)
                flow = flow.decode(mr.header.codec, 'replace')

        # links to raster image files from image tags
        # image_pattern
        srcpieces = img_pattern.split(flow)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith('<im') or tag.startswith('<svg:image'):
                srcpieces[j] = img_index_pattern.sub(image_replacer(tag), tag)
        flow = "".join(srcpieces)

        # replacements inside css url():
        srcpieces = url_pattern.split(flow)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            # process links to raster image files
            tag = url_img_index_pattern.sub(image_replacer(tag), tag)
            # process links to fonts
            tag = font_index_pattern.sub(font_replacer(tag), tag)
            # process links to other css pieces
            tag = url_css_index_pattern.sub(css_replacer(tag), tag)
            srcpieces[j] = tag
        flow = "".join(srcpieces)

        # flow pattern not inside url()
        srcpieces = tag_pattern.split(flow)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith('<'):
                for m in flow_pattern.finditer(tag):
                    try:
                        num = int(m.group(1), 32)
                        fi = mr.flowinfo[num]
                    except (IndexError, ValueError):
                        log.warn('Ignoring invalid flow reference in tag', tag)
                        tag = ''
                    else:
                        if fi.format == 'inline':
                            # The whole tag is replaced by the referenced flow.
                            # NOTE(review): this reads the reader's own flow,
                            # which may not have been decoded yet -- confirm
                            # inline flows are already unicode here.
                            tag = mr.flows[num]
                        else:
                            replacement = '"../' + fi.dir + '/' + fi.fname + '"'
                            tag = flow_pattern.sub(
                                lambda _m, text=replacement: text, tag, 1)
                srcpieces[j] = tag
        flow = "".join(srcpieces)

        flows.append(flow)

    # All flows are now unicode and have links resolved
    return flows
def insert_flows_into_markup(parts, flows, mobi8_reader, log):
    """Expand ``kindle:flow`` references found in the tags of every part.

    A reference to an ``inline`` flow replaces the whole tag with that flow;
    any other reference is rewritten to ``"../<dir>/<fname>"``. A reference
    whose index is not in ``mobi8_reader.flowinfo`` empties the tag and is
    logged. *parts* is modified in place.
    """
    reader = mobi8_reader

    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    any_tag = re.compile(r'''(<[^>]*>)''')
    flow_ref = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    def expand(tag):
        # The matches are taken from the tag as it was on entry, even
        # though ``tag`` is rebound while they are processed
        for ref in flow_ref.finditer(tag):
            num = int(ref.group(1), 32)
            try:
                info = reader.flowinfo[num]
            except IndexError:
                log.warn('Ignoring invalid flow reference: %s'%ref.group())
                tag = ''
                continue
            if info.format == 'inline':
                tag = flows[num]
            else:
                tag = flow_ref.sub(
                    '"../' + info.dir + '/' + info.fname + '"', tag, 1)
        return tag

    for idx, markup in enumerate(parts):
        pieces = any_tag.split(markup)
        # splitting on a capturing pattern leaves the tags at odd indices
        for pos in range(1, len(pieces), 2):
            if pieces[pos].startswith('<'):
                pieces[pos] = expand(pieces[pos])
        # store away modified version
        parts[idx] = "".join(pieces)
def insert_images_into_markup(parts, resource_map, log):
    """Resolve ``kindle:embed`` image references in the markup, in place.

    First pass: references inside image tags are replaced by the double
    quoted href ``"../<href>"``. Second pass: references inside tags that
    carry a ``style`` attribute are replaced by ``../<href>`` while keeping
    the delimiters (quote or parenthesis) that surrounded the reference.
    Unknown references are logged and left untouched.

    :param parts: list of unicode markup strings, modified in place.
    :param resource_map: sequence mapping (1-based index - 1) to the href
        of an extracted resource, or a false value for unknown resources.
    :param log: logger providing ``warn()``.
    """
    # Handle any embedded raster images links in the xhtml text
    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')

    style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
            re.IGNORECASE)

    def lookup_resource(match):
        """Return ``(label, href)`` for an embed match; href is None if unresolvable."""
        ref = match.group(1)
        try:
            num = int(ref, 32)
        except ValueError:
            # The index character class also admits '|', which is not base 32
            return ref, None
        if num < 1:
            # Indices are 1-based: 0 must not wrap around to resource_map[-1]
            return num, None
        try:
            return num, resource_map[num - 1]
        except IndexError:
            return num, None

    def make_replacer(context, keep_delimiters):
        """Build a per-match replacement callback for the tag *context*."""
        def repl(match):
            num, href = lookup_resource(match)
            if not href:
                log.warn('Referenced image %s was not recognized as '
                         'a valid image in %s' % (num, context))
                # Leave this reference alone; later references in the same
                # tag are still resolved independently.
                return match.group()
            if keep_delimiters:
                ref = match.group()
                return '%s%s%s' % (ref[0], '../' + href, ref[-1])
            return '"%s"' % ('../' + href)
        return repl

    for i, part in enumerate(parts):
        srcpieces = img_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith('<im'):
                srcpieces[j] = img_index_pattern.sub(
                    make_replacer(tag, False), tag)
        # store away modified version
        parts[i] = "".join(srcpieces)

    # Replace urls used in style attributes
    for i, part in enumerate(parts):
        srcpieces = style_pattern.split(part)
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if 'kindle:embed' in tag:
                srcpieces[j] = img_index_pattern.sub(
                    make_replacer(tag, True), tag)
        # store away modified version
        parts[i] = "".join(srcpieces)
def upshift_markup(parts):
    """Restore the camelCase SVG attribute names in every part, in place.

    Only opening ``<svg ...>`` tags are touched: ``preserveaspectratio``
    becomes ``preserveAspectRatio`` and ``viewbox`` becomes ``viewBox``.
    """
    svg_open_tag = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)

    def fix_case(match):
        tag = match.group(1)
        tag = tag.replace('preserveaspectratio','preserveAspectRatio')
        return tag.replace('viewbox','viewBox')

    for idx, markup in enumerate(parts):
        # store away modified version
        parts[idx] = svg_open_tag.sub(fix_case, markup)
def expand_mobi8_markup(mobi8_reader, resource_map, log):
    """Run all KF8 markup fix-ups and write the parts and file flows to disk.

    The processed parts and flows are stored back on *mobi8_reader*. Parts
    are written as UTF-8 below ``<partinfo.type>/`` and flows whose format
    is ``'file'`` below ``<flowinfo.dir>/``, all relative to the current
    working directory.

    :return: list with the path of every written part, in order.
    """
    reader = mobi8_reader

    # Offset based internal links have to be resolved before any text is
    # cut from or pasted into the markup
    parts = update_internal_links(reader, log)

    # Drop the pointless markup inserted by kindlegen
    remove_kindlegen_markup(parts, reader.aid_anchor_suffix, reader.linked_aids)

    # Flows are fixed up first because they may get inlined into the parts
    flows = update_flow_links(reader, resource_map, log)
    insert_flows_into_markup(parts, flows, reader, log)

    # Raster images, then general markup cleanups
    insert_images_into_markup(parts, resource_map, log)
    upshift_markup(parts)

    # Update the parts and flows stored in the reader
    reader.parts = parts
    reader.flows = flows

    # write out the parts and file flows
    os.mkdir('text')  # directory containing all parts
    spine = []
    for idx, markup in enumerate(parts):
        info = reader.partinfo[idx]
        destination = os.path.join(info.type, info.filename)
        with open(destination, 'wb') as out:
            markup = strip_encoding_declarations(markup)
            markup = markup.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
            out.write(markup.encode('utf-8'))
            spine.append(out.name)

    for idx, flow in enumerate(flows):
        info = reader.flowinfo[idx]
        if info.format != 'file':
            continue
        if not os.path.exists(info.dir):
            os.mkdir(info.dir)
        with open(os.path.join(info.dir, info.fname), 'wb') as out:
            out.write(flow.encode('utf-8'))

    return spine

View File

@@ -0,0 +1,935 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil, os, re, struct, textwrap, io
from lxml import html, etree
from calibre import xml_entity_to_unicode, entity_to_unicode, guess_type
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import BookHeader
from calibre.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, unicode_type, range, map
class TopazError(ValueError):
    """Raised when the input turns out to be an Amazon Topaz book."""
class KFXError(ValueError):
    """Raised for Amazon KFX books, which cannot be processed."""

    def __init__(self):
        # ``_`` is the translation function; it is not defined in this
        # module (presumably installed as a builtin -- confirm).
        message = _(
            'This is an Amazon KFX book. It cannot be processed.'
            ' See {} for information on how to handle KFX books.'
        ).format('https://www.mobileread.com/forums/showthread.php?t=283371')
        super(KFXError, self).__init__(message)
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
             try_extra_data_fix=False):
    """Read the whole file, split it into sections and parse the header.

    :param filename_or_stream: path of the file, or an object with
        ``read()``/``seek()``. A stream passed in is rewound but left open;
        a file opened here from a path is closed again.
    :param log: logger used by the reader.
    :param user_encoding: passed through to ``BookHeader``.
    :param debug: stored as ``self.debug``.
    :param try_extra_data_fix: passed through to ``BookHeader`` for the
        primary header.
    :raises TopazError: if the data starts with ``TPZ``.
    :raises KFXError: if the data starts with the KFX signature.
    :raises MobiError: if the identifier is not BOOKMOBI or TEXTREAD.
    """
    self.log = log
    self.debug = debug
    self.embedded_mi = None
    self.warned_about_trailing_entry_corruption = False
    self.base_css_rules = textwrap.dedent('''
        body { text-align: justify }
        blockquote { margin: 0em 0em 0em 2em; }
        p { margin: 0em; text-indent: 1.5em }
        .bold { font-weight: bold }
        .italic { font-style: italic }
        .underline { text-decoration: underline }
        .mbp_pagebreak {
        page-break-after: always; margin: 0; display: block
        }
        ''')
    self.tag_css_rules = {}
    self.left_margins = {}
    self.text_indents = {}

    if hasattr(filename_or_stream, 'read'):
        stream = filename_or_stream
        stream.seek(0)
        raw = stream.read()
    else:
        # The file is opened here, so it is also closed here
        with open(filename_or_stream, 'rb') as stream:
            raw = stream.read()

    if raw.startswith(b'TPZ'):
        raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
    if raw.startswith(b'\xeaDRMION\xee'):
        raise KFXError()

    self.header = raw[0:72]
    self.name = self.header[:32].replace(b'\x00', b'')
    self.num_sections, = struct.unpack('>H', raw[76:78])

    self.ident = self.header[0x3C:0x3C + 8].upper()
    if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
        raise MobiError('Unknown book type: %s' % repr(self.ident))

    self.sections = []
    self.section_headers = []
    for i in range(self.num_sections):
        # Each 8 byte record: 4 byte offset, 1 byte flags, 3 byte value
        offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
        flags, val = a1, a2 << 16 | a3 << 8 | a4
        self.section_headers.append((offset, flags, val))

    def section(section_number):
        # A section runs up to the start of the next one (or end of file)
        if section_number == self.num_sections - 1:
            end_off = len(raw)
        else:
            end_off = self.section_headers[section_number + 1][0]
        off = self.section_headers[section_number][0]
        return raw[off:end_off]

    for i in range(self.num_sections):
        self.sections.append((section(i), self.section_headers[i]))

    self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
        user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
    self.name = self.name.decode(self.book_header.codec, 'replace')
    self.kf8_type = None
    k8i = getattr(self.book_header.exth, 'kf8_header', None)

    # Ancient PRC files from Baen can have random values for
    # mobi_version, so be conservative
    if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
            'skelidx')):
        self.kf8_type = 'standalone'
    elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
        try:
            boundary = self.sections[k8i-1][0]
        except Exception:
            boundary = None
        if boundary == b'BOUNDARY':
            try:
                self.book_header = BookHeader(self.sections[k8i][0],
                        self.ident, user_encoding, self.log)
                self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
                self.book_header.mobi6_records = bh.records

                # Need the first_image_index from the mobi 6 header as well
                for x in ('first_image_index',):
                    setattr(self.book_header, x, getattr(bh, x))

                # We need to do this because the MOBI 6 text extract code
                # does not know anything about the kf8 offset
                if hasattr(self.book_header, 'huff_offset'):
                    self.book_header.huff_offset += k8i

                self.kf8_type = 'joint'
                self.kf8_boundary = k8i-1
            except Exception:
                # Best effort: fall back to the MOBI 6 header
                self.book_header = bh
def check_for_drm(self):
    """Raise ``DRMError`` if the header reports an encryption type other than 0.

    The error carries the title from the EXTH metadata when available and
    non-empty, otherwise ``self.name``.
    """
    if self.book_header.encryption_type == 0:
        return
    try:
        name = self.book_header.exth.mi.title
    except Exception:
        # No usable EXTH metadata (e.g. exth is None)
        name = self.name
    if not name:
        name = self.name
    raise DRMError(name)
def extract_content(self, output_dir, parse_cache):
    """Extract text and images and build index.html, styles.css, OPF and NCX.

    The parsed HTML tree is stored in ``parse_cache`` under the path of
    ``index.html`` inside *output_dir*; that path is also remembered as
    ``self.htmlfile`` and the OPF path as ``self.created_opf_path``.

    :param output_dir: directory the book is extracted into.
    :param parse_cache: dict receiving the parsed tree (and the raw markup
        under ``'calibre_raw_mobi_markup'`` when ``self.debug`` is set).
    :raises DRMError: via ``check_for_drm`` for encrypted books.
    """
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec,
        'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
            self.processed_html)
    self.processed_html = self.processed_html.replace('\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
            self.processed_html)

    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
        self.processed_html)
    image_name_map = self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()

    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c',
                '').replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        # Nested paragraphs: re-parse with the html5 parser
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(self.processed_html)
        # These trip up the html5 parser causing all content to be placed
        # under the <guide> tag
        self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
        self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
        try:
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>',
                    '')
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot

    htmls = list(root.xpath('//html'))

    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head)
        root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)

    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'

    self.upshift_markup(root, image_name_map)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    if guide is not None:
        # Guide references are fragments; make them point into index.html
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']

    def write_as_utf8(path, data):
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    # Use a context manager so the OPF file is flushed and closed
    with lopen(self.created_opf_path, 'wb') as opf_stream:
        opf.render(opf_stream, ncx,
            ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    # NOTE(review): written relative to the current working directory, not
    # output_dir -- presumably the caller has changed into it; confirm.
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        with open(os.path.splitext(htmlfile)[0] + '.opf', 'wb') as opf_stream:
            opf.render(opf_stream, ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
def read_embedded_metadata(self, root, elem, guide):
    """Parse the embedded ``<metadata>`` element into ``self.embedded_mi``.

    If *guide* has a reference whose type contains ``cover``, the first
    ``<img>`` at or after the referenced anchor (in document order) is
    recorded as the cover and removed from the tree. Only the first cover
    reference is considered.

    :param root: root of the parsed HTML tree.
    :param elem: the ``<metadata>`` element to read.
    :param guide: the ``<guide>`` element or ``None``.
    """
    raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
            html.tostring(elem, encoding='utf-8') + b'</package>'
    stream = io.BytesIO(raw)
    opf = OPF(stream)
    self.embedded_mi = opf.to_book_metadata()
    if guide is None:
        return
    for ref in guide.xpath('descendant::reference'):
        if 'cover' not in ref.get('type', '').lower():
            continue
        href = ref.get('href', '')
        if href.startswith('#'):
            href = href[1:]
        # Pass the id as an XPath variable so quotes in it cannot break
        # the expression
        anchors = root.xpath('//*[@id=$anchor]', anchor=href)
        if anchors:
            cpos = anchors[0]
            reached = False
            for node in root.iter():
                if node is cpos:
                    reached = True
                if reached and node.tag == 'img':
                    cover = node.get('src', None)
                    self.embedded_mi.cover = cover
                    node.getparent().remove(node)
                    break
        break
def cleanup_html(self):
    """Apply textual fix-ups to ``self.processed_html`` before parsing.

    Removes zero-height divs, XML declarations and ``o:p`` tags, wraps
    ancient markup without an ``<html`` tag, reorders inline and block
    level tags, and drops all ``</body>`` (respectively ``</html>``)
    closing tags when more than one of them is present.
    """
    self.log.debug('Cleaning up HTML...')
    self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
    if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
        self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
    self.processed_html = self.processed_html.replace('\r\n', '\n')
    self.processed_html = self.processed_html.replace('> <', '>\n<')
    self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
    self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
    self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
    # Swap inline and block level elements, and order block level elements according to priority
    # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
    self.processed_html = re.sub(
        r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
    self.processed_html = re.sub(
        r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
    self.processed_html = re.sub(
        r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
    self.processed_html = re.sub(
        r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
    # Count each closing tag separately; comparing a match object with a
    # string is always False and would count every tag as </html>.
    bods = self.processed_html.count('</body>')
    htmls = self.processed_html.count('</html>')
    if bods > 1:
        self.processed_html = self.processed_html.replace('</body>', '')
    if htmls > 1:
        self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html):
    """Return *html* with a fixed set of stray characters removed.

    NOTE(review): the set includes U+00EC and U+00EF, which are ordinary
    letters in decoded text -- confirm that stripping them is intended.
    """
    junk = '\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07'
    # Splitting on the junk characters and re-joining drops them
    return ''.join(re.split(junk, html))
def ensure_unit(self, raw, unit='px'):
    """Return *raw* with *unit* appended when *raw* ends in digits."""
    trailing_digits = re.search(r'\d+$', raw)
    return raw if trailing_digits is None else raw + unit
def upshift_markup(self, root, image_name_map=None):
    """Convert presentational MOBI markup in *root* to classes and CSS.

    Presentational attributes (``style``, ``height``, ``width``, ``align``,
    ``color``, ``bgcolor``) are collected into rules stored in
    ``self.tag_css_rules`` and referenced through ``class`` attributes.
    ``filepos`` attributes become ids/hrefs, image record indices become
    ``src`` attributes and ``<svg>`` wrappers are replaced by the images
    they contain. The tree is modified in place.

    :param root: root element of the parsed markup.
    :param image_name_map: optional mapping of image record index to file
        name; missing entries fall back to ``'%05d.jpg'``.
    """
    self.log.debug('Converting style information to CSS...')
    image_name_map = image_name_map or {}
    size_map = {
        'xx-small': '0.5',
        'x-small': '1',
        'small': '2',
        'medium': '3',
        'large': '4',
        'x-large': '5',
        'xx-large': '6',
    }

    def barename(x):
        return x.rpartition(':')[-1]

    mobi_version = self.book_header.mobi_version
    for x in root.xpath('//ncx'):
        x.getparent().remove(x)
    svg_tags = []
    forwardable_anchors = []
    pagebreak_anchors = []
    BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p'}
    for i, tag in enumerate(root.iter(etree.Element)):
        tag.attrib.pop('xmlns', '')
        # Iterate over a snapshot because attributes are deleted on the way
        for x in list(tag.attrib):
            if ':' in x:
                del tag.attrib[x]
        if tag.tag and barename(tag.tag) == 'svg':
            svg_tags.append(tag)
        if tag.tag and barename(tag.tag.lower()) in \
            ('country-region', 'place', 'placetype', 'placename',
                'state', 'city', 'street', 'address', 'content', 'form'):
            tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
            for key in list(tag.attrib.keys()):
                tag.attrib.pop(key)
            continue
        styles, attrib = [], tag.attrib
        if 'style' in attrib:
            style = attrib.pop('style').strip()
            if style:
                styles.append(style)
        if 'height' in attrib:
            height = attrib.pop('height').strip()
            if (
                    height and '<' not in height and '>' not in height and
                    re.search(r'\d+', height)):
                if tag.tag in ('table', 'td', 'tr'):
                    pass
                elif tag.tag == 'img':
                    tag.set('height', height)
                else:
                    if tag.tag == 'div' and not tag.text and \
                            (not tag.tail or not tag.tail.strip()) and \
                            not len(list(tag.iterdescendants())):
                        # Paragraph spacer
                        # Insert nbsp so that the element is never
                        # discarded by a renderer
                        tag.text = '\u00a0'  # nbsp
                        styles.append('height: %s' %
                                self.ensure_unit(height))
                    else:
                        styles.append('margin-top: %s' % self.ensure_unit(height))
        if 'width' in attrib:
            width = attrib.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tag.tag in ('table', 'td', 'tr'):
                    pass
                elif tag.tag == 'img':
                    tag.set('width', width)
                else:
                    ewidth = self.ensure_unit(width)
                    styles.append('text-indent: %s' % ewidth)
                    try:
                        ewidth_val = unit_convert(ewidth, 12, 500, 166)
                        self.text_indents[tag] = ewidth_val
                    except Exception:
                        # Best effort: an unconvertible width is not recorded
                        pass
                    if width.startswith('-'):
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
                        try:
                            ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
                            self.left_margins[tag] = ewidth_val
                        except Exception:
                            pass

        if 'align' in attrib:
            align = attrib.pop('align').strip()
            if align:
                align = align.lower()
                if align == 'baseline':
                    styles.append('vertical-align: '+align)
                else:
                    styles.append('text-align: %s' % align)
        if tag.tag == 'hr':
            if mobi_version == 1:
                tag.tag = 'div'
                styles.append('page-break-before: always')
                styles.append('display: block')
                styles.append('margin: 0')
        elif tag.tag == 'i':
            tag.tag = 'span'
            tag.attrib['class'] = 'italic'
        elif tag.tag == 'u':
            tag.tag = 'span'
            tag.attrib['class'] = 'underline'
        elif tag.tag == 'b':
            tag.tag = 'span'
            tag.attrib['class'] = 'bold'
        elif tag.tag == 'font':
            sz = tag.get('size', '').lower()
            try:
                float(sz)
            except ValueError:
                # Named sizes are translated to numeric font sizes
                if sz in size_map:
                    attrib['size'] = size_map[sz]
        elif tag.tag == 'img':
            recindex = None
            for attr in self.IMAGE_ATTRS:
                recindex = attrib.pop(attr, None) or recindex
            if recindex is not None:
                try:
                    recindex = int(recindex)
                except Exception:
                    pass
                else:
                    attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
            for attr in ('width', 'height'):
                if attr in attrib:
                    val = attrib[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                            attrib[attr] = "%dpx"%int(nval)
                        except Exception:
                            del attrib[attr]
                    elif val.lower().endswith('%'):
                        del attrib[attr]
        elif tag.tag == 'pre':
            if not tag.text:
                tag.tag = 'div'

        if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
                'div' and 'filepos-id' in attrib):
            pagebreak_anchors.append(tag)

        if 'color' in attrib:
            styles.append('color: ' + attrib.pop('color'))
        if 'bgcolor' in attrib:
            styles.append('background-color: ' + attrib.pop('bgcolor'))

        if 'filepos-id' in attrib:
            attrib['id'] = attrib.pop('filepos-id')
            if 'name' in attrib and attrib['name'] != attrib['id']:
                attrib['name'] = attrib['id']
        if 'filepos' in attrib:
            filepos = attrib.pop('filepos')
            try:
                attrib['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                pass
        if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
                not tag.text and len(tag) == 0 and (tag.tail is None or not
                    tag.tail.strip()) and getattr(tag.getnext(), 'tag',
                        None) in BLOCK_TAGS):
            # This is an empty anchor immediately before a block tag, move
            # the id onto the block tag instead
            forwardable_anchors.append(tag)

        if styles:
            ncls = None
            rule = '; '.join(styles)
            # Reuse the class of an identical, already registered rule
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                ncls = 'calibre_%d' % i
                self.tag_css_rules[ncls] = rule
            cls = attrib.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            attrib['class'] = cls

    for tag in svg_tags:
        images = tag.xpath('descendant::img[@src]')
        parent = tag.getparent()

        if images and hasattr(parent, 'find'):
            index = parent.index(tag)
            for img in images:
                img.getparent().remove(img)
                img.tail = img.text = None
                parent.insert(index, img)

        if hasattr(parent, 'remove'):
            parent.remove(tag)

    for tag in pagebreak_anchors:
        anchor = tag.attrib['id']
        del tag.attrib['id']
        if 'name' in tag.attrib:
            del tag.attrib['name']
        p = tag.getparent()
        a = p.makeelement('a')
        a.attrib['id'] = anchor
        p.insert(p.index(tag)+1, a)
        if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
            forwardable_anchors.append(a)

    for tag in forwardable_anchors:
        block = tag.getnext()
        tag.getparent().remove(tag)

        if 'id' in block.attrib:
            # The block already has an id: keep the anchor as first child
            tag.tail = block.text
            block.text = None
            block.insert(0, tag)
        else:
            block.attrib['id'] = tag.attrib['id']

    # WebKit fails to navigate to anchors located on <br> tags
    # NOTE(review): '/body/br[@id]' is an absolute path that only matches
    # when the document element itself is <body> -- confirm whether
    # '//body/br[@id]' was intended.
    for br in root.xpath('/body/br[@id]'):
        br.tag = 'div'
def get_left_whitespace(self, tag):
    """Return the summed left margin and text indent of *tag* and its ancestors.

    Values recorded in ``self.left_margins`` / ``self.text_indents`` take
    precedence over the defaults used for ``<p>`` (1.5em text indent) and
    ``<blockquote>`` (2em left margin). Values that cannot be converted to
    ``float`` count as 0.
    """
    def as_float(value):
        try:
            return float(value)
        except Exception:
            return 0.0

    def whitespace(node):
        lm = ti = 0.0
        if node.tag == 'p':
            ti = unit_convert('1.5em', 12, 500, 166)
        if node.tag == 'blockquote':
            lm = unit_convert('2em', 12, 500, 166)
        lm = self.left_margins.get(node, lm)
        ti = self.text_indents.get(node, ti)
        return as_float(lm) + as_float(ti)

    parent = tag
    ans = 0.0
    while parent is not None:
        ans += whitespace(parent)
        parent = parent.getparent()
    return ans
def create_opf(self, htmlfile, guide=None, root=None):
    """Build the OPF (manifest, spine, guide and table of contents).

    :param htmlfile: path of the generated HTML file; its directory is the
        base directory of the OPF.
    :param guide: the ``<guide>`` element or ``None``.
    :param root: root of the parsed HTML tree, used to locate the TOC when
        the guide has a ``toc`` reference.
    :return: tuple ``(opf, ncx_manifest_entry)`` where the second item is
        ``'toc.ncx'`` if the guide references a TOC and ``None`` otherwise.
    """
    mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
    if mi is None:
        mi = MetaInformation(self.book_header.title, [_('Unknown')])
    opf = OPFCreator(os.path.dirname(htmlfile), mi)
    if hasattr(self.book_header.exth, 'cover_offset'):
        opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
    elif mi.cover is not None:
        opf.cover = mi.cover
    else:
        # Guess that the first image is the cover, if it exists
        opf.cover = 'images/%05d.jpg' % 1
        if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                *opf.cover.split('/'))):
            opf.cover = None

    cover = opf.cover
    cover_copied = None
    if cover is not None:
        # NOTE(review): the cover paths below are relative to the current
        # working directory -- presumably the output directory; confirm.
        cover = cover.replace('/', os.sep)
        if os.path.exists(cover):
            ncover = 'images'+os.sep+'calibre_cover.jpg'
            if os.path.exists(ncover):
                os.remove(ncover)
            shutil.copyfile(cover, ncover)
            cover_copied = os.path.abspath(ncover)
            opf.cover = ncover.replace(os.sep, '/')

    manifest = [(htmlfile, 'application/xhtml+xml'),
        (os.path.abspath('styles.css'), 'text/css')]
    bp = os.path.dirname(htmlfile)
    for i in getattr(self, 'image_names', []):
        path = os.path.join(bp, 'images', i)
        manifest.append((path, guess_type(path)[0] or 'image/jpeg'))
    if cover_copied is not None:
        manifest.append((cover_copied, 'image/jpeg'))

    opf.create_manifest(manifest)
    opf.create_spine([os.path.basename(htmlfile)])
    toc = None
    if guide is not None:
        opf.create_guide(guide)
        for ref in opf.guide:
            if ref.type.lower() == 'toc':
                toc = ref.href()

    ncx_manifest_entry = None
    if toc:
        ncx_manifest_entry = 'toc.ncx'
        # Pass the id as an XPath variable so quotes in it cannot break
        # the expression
        elems = root.xpath('//*[@id=$anchor]', anchor=toc.partition('#')[-1])
        tocobj = None
        ent_pat = re.compile(r'&(\S+?);')
        if elems:
            tocobj = TOC()
            found = False
            reached = False
            for x in root.iter():
                if x == elems[-1]:
                    reached = True
                    continue
                if reached and x.tag == 'a':
                    href = x.get('href', '')
                    # Only local links (no URL scheme) become TOC entries
                    if href and re.match(r'\w+://', href) is None:
                        try:
                            text = ' '.join([t.strip() for t in
                                x.xpath('descendant::text()')])
                        except Exception:
                            text = ''
                        text = ent_pat.sub(entity_to_unicode, text)
                        item = tocobj.add_item(toc.partition('#')[0], href[1:],
                            text)
                        item.left_space = int(self.get_left_whitespace(x))
                        found = True
                # The TOC ends at the first page break after an entry
                if reached and found and x.get('class', None) == 'mbp_pagebreak':
                    break
        if tocobj is not None:
            tocobj = self.structure_toc(tocobj)
            opf.set_toc(tocobj)

    return opf, ncx_manifest_entry
def structure_toc(self, toc):
    """Nest the flat *toc* according to the ``left_space`` of its items.

    Every distinct ``left_space`` value is one nesting level (smallest
    value = outermost). *toc* itself is returned unchanged when there are
    fewer than 2 or more than 6 distinct levels.
    """
    distinct = {entry.left_space for entry in toc}
    if not 2 <= len(distinct) <= 6:
        # Too many or too few levels, give up
        return toc

    level_of = {space: depth for depth, space in enumerate(sorted(distinct))}
    newest = [None] * len(level_of)
    nested = TOC()

    for entry in toc:
        depth = level_of[entry.left_space]
        # Parent: the most recent item of the nearest shallower level that
        # has one, else the new root
        parent = next(
            (node for node in reversed(newest[:depth]) if node is not None),
            nested)
        newest[depth] = parent.add_item(entry.href, entry.fragment,
                entry.text)
    return nested
def sizeof_trailing_entries(self, data):
    """Return the number of trailing-entry bytes at the end of *data*.

    ``book_header.extra_flags`` selects which entries are present: every
    set bit above bit 0 stands for one entry whose size is stored as a
    backwards encoded variable width integer, bit 0 for an entry whose
    size is held in the low two bits of its last byte (plus one).

    :return: total size in bytes, or 0 (after logging a warning once) when
        the data is too short to hold the sizes.
    """
    def sizeof_trailing_entry(ptr, psize):
        bitpos, result = 0, 0
        while True:
            # Slicing keeps this working for bytes; an exhausted buffer
            # yields an empty slice, for which ord() raises TypeError
            v = ord(ptr[psize-1:psize])
            result |= (v & 0x7F) << bitpos
            bitpos += 7
            psize -= 1
            if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                return result

    num = 0
    size = len(data)
    flags = self.book_header.extra_flags >> 1
    try:
        while flags:
            if flags & 1:
                num += sizeof_trailing_entry(data, size - num)
            flags >>= 1
        if self.book_header.extra_flags & 1:
            off = size - num - 1
            num += (ord(data[off:off+1]) & 0x3) + 1
    except (IndexError, TypeError):
        # Slices never raise IndexError, so TypeError must be caught too
        self.warn_about_trailing_entry_corruption()
        return 0
    return num
def warn_about_trailing_entry_corruption(self):
    """Log the trailing entry corruption warning, at most once.

    ``self.warned_about_trailing_entry_corruption`` remembers whether the
    warning has already been emitted.
    """
    if self.warned_about_trailing_entry_corruption:
        return
    self.warned_about_trailing_entry_corruption = True
    self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output')
def text_section(self, index):
    """Return the data of section *index* with its trailing entries cut off.

    The number of bytes to drop is whatever ``sizeof_trailing_entries``
    reports for that section's data.
    """
    record = self.sections[index][0]
    keep = len(record) - self.sizeof_trailing_entries(record)
    return record[:keep]
def extract_text(self, offset=1):
    """Decompress the text records into ``self.mobi_html``.

    :param offset: Index of the first text record in ``self.sections``.
    :return: List of the section indices consumed: the record just before
        *offset*, the text records, and the HUFF/CDIC records when the
        book uses Huff/cdic compression.
    :raises MobiError: If the compression type is not recognised.
    """
    self.log.debug('Extracting text...')
    header = self.book_header
    past_last_record = header.records + offset
    text_sections = [
        self.text_section(i)
        for i in range(offset, min(past_last_record, len(self.sections)))]
    processed_records = list(range(offset - 1, past_last_record))

    self.mobi_html = b''

    compression = header.compression_type
    if compression == b'DH':
        huff_records = list(range(header.huff_offset,
                                  header.huff_offset + header.huff_number))
        huffs = [self.sections[i][0] for i in huff_records]
        processed_records += huff_records
        unpack = HuffReader(huffs).unpack
    elif compression == b'\x00\x02':
        unpack = decompress_doc
    elif compression == b'\x00\x01':
        # Records are stored uncompressed
        def unpack(x):
            return x
    else:
        raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)

    html = b''.join(unpack(section) for section in text_sections)
    if html.endswith(b'#'):
        html = html[:-1]

    if header.ancient and b'<html' not in html[:300].lower():
        html = html.replace(b'\r ', b'\n\n ')
    html = html.replace(b'\0', b'')
    if header.codec == 'cp1252':
        html = html.replace(b'\x1e', b'')  # record separator
        html = html.replace(b'\x02', b'')  # start of text
    self.mobi_html = html
    return processed_records
def replace_page_breaks(self):
    """Turn every match of ``PAGE_BREAK_PAT`` in ``self.processed_html``
    into a ``<div>`` of class ``mbp_pagebreak``.

    Whatever the pattern captures in its first group is carried over into
    the replacement tag.
    """
    replacement = r'<div \1 class="mbp_pagebreak" />'
    markup = self.processed_html
    self.processed_html = self.PAGE_BREAK_PAT.sub(replacement, markup)
def add_anchors(self):
    """Insert link targets for every ``filepos`` reference in the markup.

    Scans ``self.mobi_html`` for tags carrying a numeric ``filepos``
    attribute and marks each referenced offset, either with an empty
    ``<a id="fileposN"></a>`` element or with a ``filepos-id="fileposN"``
    attribute added to the tag found at that offset. The result is stored
    in ``self.processed_html``; ``self.mobi_html`` is only read.
    """
    self.log.debug('Adding anchors...')
    positions = set()
    link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
                              re.IGNORECASE)
    for match in link_pattern.finditer(self.mobi_html):
        positions.add(int(match.group(1)))
    pos = 0  # offset up to which mobi_html has already been copied
    processed_html = []
    end_tag_re = re.compile(br'<\s*/')
    for end in sorted(positions):
        if end == 0:
            continue
        oend = end  # the referenced offset, used to name the target
        l = self.mobi_html.find(b'<', end)
        r = self.mobi_html.find(b'>', end)
        anchor = b'<a id="filepos%d"></a>'
        # A '>' comes before the next '<', or a '<' sits exactly at end:
        # the offset points into, or at the start of, a tag
        if r > -1 and (r < l or l == end or l == -1):
            p = self.mobi_html.rfind(b'<', 0, end + 1)
            if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
                    not self.mobi_html[p:r + 1].endswith(b'/>')):
                # Neither a closing nor a self-closing tag: add an
                # attribute just before its '>'
                anchor = b' filepos-id="filepos%d"'
                end = r
            else:
                # Otherwise put the anchor element right after the tag
                end = r + 1
        processed_html.append(self.mobi_html[pos:end] + (anchor % oend))
        pos = end
    processed_html.append(self.mobi_html[pos:])
    processed_html = b''.join(processed_html)
    # Remove anchors placed inside entities
    self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
                                 br'&\1\3;\2', processed_html)
def extract_images(self, processed_records, output_dir):
    """Write the image records of the book to ``<output_dir>/images``.

    :param processed_records: List of section indices that were already
        consumed. Those sections are skipped, and every section examined
        here is appended to the list.
    :param output_dir: Directory in which the ``images`` folder is created
        if it does not exist yet.
    :return: Dict mapping the 1-based image index to the file name chosen
        for it. ``self.image_names`` lists the files actually written.
    """
    self.log.debug('Extracting images...')
    output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    image_index = 0
    self.image_names = []
    image_name_map = {}
    start = getattr(self.book_header, 'first_image_index', -1)
    if start > self.num_sections or start < 0:
        # BAEN PRC files have bad headers
        start = 0
    for i in range(start, self.num_sections):
        if i in processed_records:
            continue
        processed_records.append(i)
        data = self.sections[i][0]
        # Counted for every examined record, image or not
        image_index += 1
        if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                        b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
            # This record is a known non image type, no need to try to
            # load the image
            continue
        try:
            imgfmt = what(None, data)
        except Exception:
            continue
        if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}:
            continue
        if imgfmt == 'jpeg':
            imgfmt = 'jpg'
        if imgfmt == 'gif':
            # GIFs are converted to PNG, except animated ones which are
            # kept as they are
            try:
                data = gif_data_to_png_data(data)
                imgfmt = 'png'
            except AnimatedGIF:
                pass
        path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
        # The name is recorded before the file is written, so it stays in
        # the map even when saving fails below
        image_name_map[image_index] = os.path.basename(path)
        if imgfmt == 'png':
            with open(path, 'wb') as f:
                f.write(data)
        else:
            try:
                save_cover_data_to(data, path, minify_to=(10000, 10000))
            except Exception:
                continue
        self.image_names.append(os.path.basename(path))
    return image_name_map
def test_mbp_regex():
    """Check ``MobiReader.PAGE_BREAK_PAT`` against sample page break markup.

    Each sample is substituted with the pattern's first group; an
    ``Exception`` is raised for the first sample whose result differs from
    the expected text.
    """
    expected_by_markup = {
        '<mbp:pagebreak></mbp:pagebreak>':'',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>':'',
        '<mbp:pagebreak>xxx':'xxx',
        '<mbp:pagebreak/>xxx':'xxx',
        '<mbp:pagebreak sdf/ >xxx':' sdfxxx',
        '<mbp:pagebreak / >':' ',
        '</mbp:pagebreak>':'',
        '</mbp:pagebreak sdf>':' sdf',
        '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':'xxx',
    }
    pattern = MobiReader.PAGE_BREAK_PAT
    for markup, expected in iteritems(expected_by_markup):
        actual = pattern.sub(r'\1', markup)
        if actual != expected:
            raise Exception('%r != %r for %r'%(actual, expected, markup))

View File

@@ -0,0 +1,590 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from collections import namedtuple
from itertools import repeat
from uuid import uuid4
from lxml import etree
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.mobi.reader.containers import Container, find_imgtype
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.oeb.parse_utils import parse_html
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
from polyglot.builtins import range, zip, unicode_type, getcwd, as_unicode
from polyglot.urllib import urldefrag
# One reconstructed text file of the book. Mobi8Reader.build_parts() creates
# one per skeleton, with type 'text' and start/end offsets into the raw markup
Part = namedtuple('Part',
    'num type filename start end aid')

# One row of the div (fragment) index, filled in by
# Mobi8Reader.read_indices()
Elem = namedtuple('Elem',
    'insert_pos toc_text file_number sequence_number start_pos '
    'length')

# Describes how one flow is stored: typ is 'svg' or 'css', format is 'inline'
# or 'file'. The entry for flow 0 holds None in every field
FlowInfo = namedtuple('FlowInfo',
    'type format dir fname')
# locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid):
    """Locate the first tag in *ml* that carries the given ``aid`` attribute.

    :param ml: Markup to search, as a bytestring.
    :param aid: Value of the ``aid`` attribute, as a bytestring. It is
        matched literally (case-insensitively), never as a regular
        expression.
    :return: ``(start, end)`` where *start* is the offset of the ``<`` of
        the tag and *end* the offset of the first ``>`` after it, or
        ``(0, 0)`` when no tag matches.
    """
    # Escape the aid so that characters such as '.' or '+' cannot change
    # the meaning of the pattern
    pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % re.escape(aid)
    m = re.search(pattern, ml, re.IGNORECASE)
    if m is None:
        return 0, 0
    plt = m.start()
    pgt = ml.find(b'>', plt+1)
    return plt, pgt
def reverse_tag_iter(block):
    ''' Yield every tag found in block, starting with the last one and
    ending with the first one. A tag is the text from a ``<`` up to and
    including the closest following ``>``. '''
    limit = len(block)
    closing = block.rfind(b'>', 0, limit)
    while closing != -1:
        opening = block.rfind(b'<', 0, closing)
        if opening == -1:
            return
        yield block[opening:closing + 1]
        # Continue the search in front of the tag just produced
        limit = opening
        closing = block.rfind(b'>', 0, limit)
def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
    """Return the index of the first resource record.

    *first_image_index* is used as is unless it is ``-1`` or
    ``NULL_INDEX``; in that case the index right after the text records
    (*num_of_text_records* + *first_text_record_number*) is returned.
    """
    if first_image_index in {-1, NULL_INDEX}:
        return num_of_text_records + first_text_record_number
    return first_image_index
class Mobi8Reader(object):

    def __init__(self, mobi6_reader, log, for_tweak=False):
        """Prepare a KF8 reader on top of an already opened MOBI 6 reader.

        :param mobi6_reader: Reader whose ``book_header`` is kept as
            ``self.header``.
        :param log: Logger used for warnings and debug output.
        :param for_tweak: Stored as ``self.for_tweak``.
        """
        self.mobi6_reader = mobi6_reader
        self.log = log
        self.for_tweak = for_tweak
        self.header = mobi6_reader.book_header
        self.encrypted_fonts = []
        # Patterns capturing the value of the id, name and aid attribute of
        # a tag; they are applied to the start of a single tag
        self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
        self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
        self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
def __call__(self):
    """Convert the KF8 book into an exploded set of files in the current
    working directory.

    Extracts the text, reads the indices, rebuilds the parts, creates the
    guide and the NCX, extracts the resources and expands the markup. The
    raw markup is also dumped to ``debug-raw.html``.

    :return: Whatever ``write_opf`` returns.
    """
    reader = self.mobi6_reader
    reader.check_for_drm()
    self.aid_anchor_suffix = uuid4().hex.encode('utf-8')
    bh = reader.book_header
    total_sections = len(reader.sections)
    if reader.kf8_type == 'joint':
        # Joint file: MOBI 6 resources come first, KF8 ones after the
        # boundary
        offset = reader.kf8_boundary + 2
        mobi6_resources = (
            get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1),
            offset - 2)
        kf8_resources = (
            get_first_resource_index(bh.kf8_first_image_index, bh.records, offset),
            total_sections)
        self.resource_offsets = [mobi6_resources, kf8_resources]
    else:
        offset = 1
        first_resource = get_first_resource_index(bh.first_image_index, bh.records, offset)
        self.resource_offsets = [(first_resource, total_sections)]

    self.processed_records = reader.extract_text(offset=offset)
    self.raw_ml = reader.mobi_html
    with open('debug-raw.html', 'wb') as dump:
        dump.write(self.raw_ml)

    self.kf8_sections = reader.sections[offset-1:]
    self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
    self.linked_aids = set()

    self.read_indices()
    self.build_parts()
    guide = self.create_guide()
    ncx = self.create_ncx()
    resource_map = self.extract_resources(reader.sections)
    spine = self.expand_text(resource_map)
    return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self):
    """Read the FDST, skeleton, div and guide indices of the KF8 book.

    Fills in ``self.flow_table`` (pairs of start/end offsets),
    ``self.files`` (one ``File`` per skeleton), ``self.elems`` (one
    ``Elem`` per div entry) and ``self.guide`` (one ``Item`` per guide
    entry). An index whose header field is ``NULL_INDEX`` leaves the
    corresponding attribute empty.

    :raises ValueError: If the FDST record does not start with ``FDST``.
    """
    self.flow_table = ()

    if self.header.fdstidx != NULL_INDEX:
        header = self.kf8_sections[self.header.fdstidx][0]
        if header[:4] != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')
        sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
        secs = struct.unpack_from(b'>%dL' % (num_sections*2),
                                  header, sec_start)
        # The values alternate between start and end offsets
        self.flow_table = tuple(zip(secs[::2], secs[1::2]))

    self.files = []
    if self.header.skelidx != NULL_INDEX:
        table = read_index(self.kf8_sections, self.header.skelidx,
                           self.header.codec)[0]
        File = namedtuple('File',
            'file_number name divtbl_count start_position length')

        for i, text in enumerate(table):
            tag_map = table[text]
            self.files.append(File(i, text, tag_map[1][0],
                                   tag_map[6][0], tag_map[6][1]))

    self.elems = []
    if self.header.dividx != NULL_INDEX:
        table, cncx = read_index(self.kf8_sections, self.header.dividx,
                                 self.header.codec)
        for text in table:
            tag_map = table[text]
            toc_text = cncx[tag_map[2][0]]
            self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
                                   tag_map[4][0], tag_map[6][0], tag_map[6][1]))

    self.guide = []
    if self.header.othidx != NULL_INDEX:
        table, cncx = read_index(self.kf8_sections, self.header.othidx,
                                 self.header.codec)
        Item = namedtuple('Item',
            'type title pos_fid')

        for ref_type in table:
            tag_map = table[ref_type]
            # ref_type, ref_title, div/frag number
            title = cncx[tag_map[1][0]]
            fileno = None
            if 3 in tag_map:
                fileno = tag_map[3][0]
            # Tag 6, when present, takes precedence over tag 3 and is kept
            # whole rather than reduced to its first value
            if 6 in tag_map:
                fileno = tag_map[6]
            if isinstance(ref_type, bytes):
                ref_type = ref_type.decode(self.header.codec)
            self.guide.append(Item(ref_type, title, fileno))
def build_parts(self):
    """Split the raw markup into flows and rebuild the text files.

    Uses ``self.flow_table``, ``self.files`` and ``self.elems`` as read by
    ``read_indices`` and fills in:

    * ``self.flows``: the pieces of the raw markup; entry 0 (the text) is
      replaced by an empty bytestring,
    * ``self.parts`` / ``self.partinfo``: the rebuilt text files and one
      ``Part`` describing each of them,
    * ``self.flowinfo``: one ``FlowInfo`` per flow.
    """
    raw_ml = self.mobi6_reader.mobi_html
    self.flows = []
    self.flowinfo = []
    ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]

    # now split the raw_ml into its flow pieces
    for start, end in ft:
        self.flows.append(raw_ml[start:end])

    # the first piece represents the xhtml text
    text = self.flows[0]
    self.flows[0] = b''

    # walk the <skeleton> and <div> tables to build original source xhtml
    # files *without* destroying any file position information needed for
    # later href processing and create final list of file separation start:
    # stop points and etc in partinfo
    self.parts = []
    self.partinfo = []
    divptr = 0
    baseptr = 0
    for skelnum, skelname, divcnt, skelpos, skellen in self.files:
        baseptr = skelpos + skellen
        skeleton = text[skelpos:baseptr]
        inspos_warned = False
        for i in range(divcnt):
            insertpos, idtext, filenum, seqnum, startpos, length = \
                self.elems[divptr]
            if i == 0:
                # The first div of a file provides its aid and file name
                aidtext = idtext[12:-2]
                filename = 'part%04d.html' % filenum
            part = text[baseptr:baseptr + length]
            insertpos = insertpos - skelpos
            head = skeleton[:insertpos]
            tail = skeleton[insertpos:]
            if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
                    head.rfind(b'<')):
                # There is an incomplete tag in either the head or tail.
                # This can happen for some badly formed KF8 files, see for
                # example, https://bugs.launchpad.net/bugs/1082669
                if not inspos_warned:
                    self.log.warn(
                        'The div table for %s has incorrect insert '
                        'positions. Calculating manually.'%skelname)
                    inspos_warned = True
                bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
                    isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
                if bp != ep:
                    insertpos = ep + 1 + startpos

            skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
            baseptr = baseptr + length
            divptr += 1
        self.parts.append(skeleton)
        if divcnt < 1:
            # Empty file
            aidtext = unicode_type(uuid4())
            filename = aidtext + '.html'
        self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
                                  baseptr, aidtext))

    # The primary css style sheet is typically stored next followed by any
    # snippets of code that were previously inlined in the
    # original xhtml but have been stripped out and placed here.
    # This can include local CDATA snippets and svg sections.

    # The problem is that for most browsers and ereaders, you can not
    # use <img src="imageXXXX.svg" /> to import any svg image that itself
    # properly uses an <image/> tag to import some raster image - it
    # should work according to the spec but does not for almost all browsers
    # and ereaders and causes epub validation issues because those raster
    # images are in manifest but not in xhtml text - since they are only
    # referenced from an svg image

    # So we need to check the remaining flow pieces to see if they are css
    # or svg images. if svg images, we must check if they have an <image/>
    # and if so inline them into the xhtml text pieces.

    # there may be other sorts of pieces stored here but until we see one
    # in the wild to reverse engineer we won't be able to tell

    self.flowinfo.append(FlowInfo(None, None, None, None))
    svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
    image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
    for j in range(1, len(self.flows)):
        flowpart = self.flows[j]
        nstr = '%04d' % j
        m = svg_tag_pattern.search(flowpart)
        if m is not None:
            # svg
            typ = 'svg'
            start = m.start()
            m2 = image_tag_pattern.search(flowpart)
            if m2 is not None:
                fmt = 'inline'
                dirname = None
                fname = None
                # strip off anything before <svg if inlining
                # (the replacement has to be bytes, like the pattern and
                # the markup, or re.sub raises TypeError on Python 3)
                flowpart = re.sub(br'(</?)svg:', br'\1', flowpart[start:])
            else:
                fmt = 'file'
                dirname = "images"
                fname = 'svgimg' + nstr + '.svg'
        else:
            # search for CDATA and if exists inline it
            if flowpart.find(b'[CDATA[') >= 0:
                typ = 'css'
                flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
                fmt = 'inline'
                dirname = None
                fname = None
            else:
                # css - assume as standalone css file
                typ = 'css'
                fmt = 'file'
                dirname = "styles"
                fname = nstr + '.css'

        self.flows[j] = flowpart
        self.flowinfo.append(FlowInfo(typ, fmt, dirname, fname))
def get_file_info(self, pos):
    ''' Return the Part (file) of the raw markup that contains pos, i.e.
    the first one with start <= pos < end. When no part contains pos, a
    Part whose fields are all None is returned. '''
    for candidate in self.partinfo:
        if candidate.start <= pos < candidate.end:
            return candidate
    blank = [None] * len(Part._fields)
    return Part(*blank)
def get_id_tag_by_pos_fid(self, posfid, offset):
    """Resolve a ``kindle:pos:fid`` style reference to a file and an anchor.

    :param posfid: Index into ``self.elems``.
    :param offset: Offset added to the insert position of that element.
    :return: ``('<type>/<filename>', idtext)`` where *idtext* is what
        ``get_id_tag`` finds for the resulting position.
    """
    # first convert kindle:pos:fid and offset info to position in file
    position = self.elems[posfid][0] + offset
    part = self.get_file_info(position)
    # Links only work when the original xhtml has an "id=", so look in the
    # file itself for the closest one before the position. (Amazon adds
    # "aid=" attributes of its own, which get_id_tag can fall back to.)
    anchor = self.get_id_tag(position)
    return '%s/%s'%(part.type, part.filename), anchor
def get_id_tag(self, pos):
    """Return the anchor of the closest tag at or before *pos*.

    :param pos: Position in the raw markup.
    :return: The value of the ``id`` (or ``name``) attribute as a
        bytestring; for a tag that only has an ``aid``, that aid followed
        by ``-`` and ``self.aid_anchor_suffix`` (the aid is also recorded
        in ``self.linked_aids``); ``b''`` when no tag qualifies.
    :raises ValueError: If no file contains *pos*.
    """
    # Find the first tag with a named anchor (name or id attribute) before
    # pos
    fi = self.get_file_info(pos)
    if fi.num is None and fi.start is None:
        raise ValueError('No file contains pos: %d'%pos)
    textblock = self.parts[fi.num]
    npos = pos - fi.start
    pgt = textblock.find(b'>', npos)
    plt = textblock.find(b'<', npos)
    # if npos is inside a tag then search all the text up to the end of
    # that tag, otherwise only the text before npos
    if plt == npos or pgt < plt:
        npos = pgt + 1
    textblock = textblock[0:npos]
    for tag in reverse_tag_iter(textblock):
        m = self.id_re.match(tag) or self.name_re.match(tag)
        if m is not None:
            return m.group(1)
        # For some files, kindlegen apparently creates links to tags
        # without HTML anchors, using the AID instead. See
        # https://www.mobileread.com/forums/showthread.php?t=259557
        m = self.aid_re.match(tag)
        if m is not None:
            self.linked_aids.add(m.group(1))
            return m.group(1) + b'-' + self.aid_anchor_suffix
    # No tag found, link to start of file
    return b''
def create_guide(self):
    """Build the OPF guide from the entries read into ``self.guide``.

    Entries whose ``pos_fid`` is not a pair are ignored. When no entry is
    titled ``start`` or typed ``text`` and the EXTH header has a usable
    start offset, a ``start`` reference pointing at that offset is added.

    :return: The populated ``Guide``.
    """
    guide = Guide()
    has_start = False
    codec = self.header.codec
    for ref_type, ref_title, pos_fid in self.guide:
        try:
            usable = len(pos_fid) == 2
        except TypeError:
            usable = False  # thumbnailstandard record, ignore it
        if not usable:
            continue
        target, anchor = self.get_id_tag_by_pos_fid(*pos_fid)
        if anchor:
            if isinstance(anchor, bytes):
                anchor = anchor.decode(codec)
            target += '#' + anchor
        ref = Guide.Reference(target, getcwd())
        ref.title, ref.type = ref_title, ref_type
        if ref.title == 'start' or ref.type == 'text':
            has_start = True
        guide.append(ref)

    so = self.header.exth.start_offset
    if so in {None, NULL_INDEX} or has_start:
        return guide

    fi = self.get_file_info(so)
    if fi.filename is not None:
        anchor = self.get_id_tag(so).decode(codec)
        target = fi.filename
        if anchor:
            target += '#' + anchor
        ref = Guide.Reference('%s/%s'%(fi.type, target), getcwd())
        ref.title, ref.type = 'start', 'text'
        guide.append(ref)
    return guide
def create_ncx(self):
    """Read the NCX index and turn it into a ToC.

    Every index entry gets an ``href`` and an ``idtag``. Entries whose
    ``pos_fid`` cannot be resolved are logged and dropped.

    :return: The result of ``build_toc`` for the remaining entries.
    :raises ValueError: If an entry without ``pos_fid`` has a ``pos`` that
        no file contains.
    """
    index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
                             self.header.codec)
    invalid = []

    # Add href and anchor info to the index entries
    for entry in index_entries:
        pos_fid = entry['pos_fid']
        if pos_fid is not None:
            try:
                href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
            except ValueError:
                self.log.warn('Invalid entry in NCX (title: %s), ignoring'
                              %entry['text'])
                invalid.append(entry)
                continue
        else:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos)
            href = '%s/%s'%(fi.type, fi.filename)

        entry['href'] = href
        entry['idtag'] = as_unicode(idtag, self.header.codec or 'utf-8')

    for entry in invalid:
        index_entries.remove(entry)

    # Build the TOC object
    return build_toc(index_entries)
def extract_resources(self, sections):
    """Write the fonts and images of the book into ``fonts/`` and
    ``images/`` below the current working directory.

    Both directories are created with ``os.mkdir``, so they must not exist
    yet.

    :param sections: All the sections of the file; only the ranges listed
        in ``self.resource_offsets`` are examined.
    :return: List with one entry per examined record: the href of the file
        written for it, or ``None`` when nothing was written. A
        ``CONTBOUNDARY`` record adds no entry at all.
    """
    from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
    resource_map = []
    container = None
    for x in ('fonts', 'images'):
        os.mkdir(x)

    for start, end in self.resource_offsets:
        for i, sec in enumerate(sections[start:end]):
            # File names are numbered from 1 within each range
            fname_idx = i+1
            data = sec[0]
            typ = data[:4]
            href = None
            if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
                       b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
                pass  # Ignore these records
            elif typ == b'FONT':
                font = read_font_record(data)
                href = "fonts/%05d.%s" % (fname_idx, font['ext'])
                if font['err']:
                    self.log.warn('Reading font record %d failed: %s'%(
                        fname_idx, font['err']))
                    if font['headers']:
                        self.log.debug('Font record headers: %s'%font['headers'])
                # Fall back to the raw record when no font data was decoded
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(font['font_data'] if font['font_data'] else
                            font['raw_data'])
                if font['encrypted']:
                    self.encrypted_fonts.append(href)
            elif typ == b'CONT':
                if data == b'CONTBOUNDARY':
                    container = None
                    continue
                container = Container(data)
            elif typ == b'CRES':
                data, imgtype = container.load_image(data)
                if data is not None:
                    href = 'images/%05d.%s'%(container.resource_index, imgtype)
                    with open(href.replace('/', os.sep), 'wb') as f:
                        f.write(data)
            elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
                container.resource_index += 1
            elif container is None:
                # Outside of a container, any other record is saved as an
                # image unless it is the placeholder GIF
                if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
                    imgtype = find_imgtype(data)
                    href = 'images/%05d.%s'%(fname_idx, imgtype)
                    with open(href.replace('/', os.sep), 'wb') as f:
                        f.write(data)

            resource_map.append(href)

    return resource_map
def expand_text(self, resource_map):
    """Expand the KF8 markup of this book with ``expand_mobi8_markup``.

    :param resource_map: The list produced by ``extract_resources``.
    :return: The result of ``expand_mobi8_markup``, which ``__call__``
        passes on to ``write_opf`` as the spine.
    """
    spine = expand_mobi8_markup(self, resource_map, self.log)
    return spine
def write_opf(self, guide, toc, spine, resource_map):
    """Write ``metadata.opf`` and ``toc.ncx`` into the current directory.

    :param guide: Guide to store in the OPF.
    :param toc: ToC built from the NCX index. When it has fewer than two
        entries, the inline ToC referenced by the guide is tried instead.
    :param spine: Spine passed to ``OPFCreator.create_spine``.
    :param resource_map: List of resource hrefs, indexed by
        ``self.cover_offset`` to find the cover.
    :return: The string ``'metadata.opf'``.
    """
    mi = self.header.exth.mi
    if (self.cover_offset is not None and self.cover_offset <
            len(resource_map)):
        mi.cover = resource_map[self.cover_offset]

    if len(list(toc)) < 2:
        self.log.warn('KF8 has no metadata Table of Contents')

        for ref in guide:
            if ref.type == 'toc':
                href = ref.href()
                href, frag = urldefrag(href)
                if os.path.exists(href.replace('/', os.sep)):
                    try:
                        toc = self.read_inline_toc(href, frag)
                    except Exception:
                        # Keep the ToC we have; do not swallow
                        # KeyboardInterrupt or SystemExit
                        self.log.exception('Failed to read inline ToC')

    opf = OPFCreator(getcwd(), mi)
    opf.guide = guide

    def exclude(path):
        return os.path.basename(path) == 'debug-raw.html'

    # If there are no images then the azw3 input plugin dumps all
    # binary records as .unknown images, remove them
    if self.for_tweak and os.path.isdir('images'):
        files = os.listdir('images')
        unknown = [x for x in files if x.endswith('.unknown')]
        if len(files) == len(unknown):
            for f in files:
                os.remove('images/'+f)

    if self.for_tweak:
        try:
            os.remove('debug-raw.html')
        except OSError:
            # Best effort: the debug dump may not exist
            pass

    opf.create_manifest_from_files_in([getcwd()], exclude=exclude)
    for entry in opf.manifest:
        if entry.mime_type == 'text/html':
            entry.mime_type = 'application/xhtml+xml'
    opf.create_spine(spine)
    opf.set_toc(toc)
    ppd = getattr(self.header.exth, 'page_progression_direction', None)
    if ppd in {'ltr', 'rtl', 'default'}:
        opf.page_progression_direction = ppd
    pwm = getattr(self.header.exth, 'primary_writing_mode', None)
    if pwm is not None:
        opf.primary_writing_mode = pwm

    with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx')
    return 'metadata.opf'
def read_inline_toc(self, href, frag):
    """Build a ToC from the links of an HTML table of contents.

    :param href: Path of the HTML file, using ``/`` as separator. Link
        targets are resolved against its directory.
    :param frag: Optional id of the element after which links are
        collected; without it (or when it is not found) collection starts
        after ``<body>``.
    :return: A ``TOC`` whose nesting follows the relative depth of the
        links in the HTML tree.
    """
    ans = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)
    body = XPath('//h:body')(root)
    reached = False
    if body:
        start = body[0]
    else:
        # No body element: take links from the whole document
        start = None
        reached = True
    if frag:
        elems = XPath('//*[@id="%s"]'%frag)(root)
        if elems:
            start = elems[0]

    def node_depth(elem):
        # Number of ancestors of elem
        ans = 0
        parent = elem.getparent()
        while parent is not None:
            parent = parent.getparent()
            ans += 1
        return ans

    # Layer the ToC based on nesting order in the source HTML
    current_depth = None
    parent = ans
    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == XHTML('a') and elem.get('href',
                False):
            href = elem.get('href')
            href, frag = urldefrag(href)
            href = base_href + '/' + href
            text = xml2text(elem).strip()
            # Identical links are only taken once
            if (text, href, frag) in seen:
                continue
            seen.add((text, href, frag))
            links.append((text, href, frag, node_depth(elem)))
        elif elem is start:
            reached = True

    # Map the tree depths that occur to consecutive levels 0, 1, 2, ...
    depths = sorted(set(x[-1] for x in links))
    depth_map = {x:i for i, x in enumerate(depths)}
    for text, href, frag, depth in links:
        depth = depth_map[depth]
        if current_depth is None:
            current_depth = 0
            parent.add_item(href, frag, text)
        elif current_depth == depth:
            parent.add_item(href, frag, text)
        elif current_depth < depth:
            # Deeper: nest under the last added item, one level at a time
            parent = parent[-1] if len(parent) > 0 else parent
            parent.add_item(href, frag, text)
            current_depth += 1
        else:
            # Shallower: climb back up by the difference in levels
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            parent.add_item(href, frag, text)
            current_depth = depth
    return ans

View File

@@ -0,0 +1,100 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre import replace_entities
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from polyglot.builtins import iteritems, getcwd
# Maps an index tag number to [name of the entry field, position of the
# wanted value among the values of the tag]. Used by read_ncx()
tag_fieldname_map = {
        1: ['pos',0],
        2: ['len',0],
        3: ['noffs',0],
        4: ['hlvl',0],
        5: ['koffs',0],
        6: ['pos_fid',0],
        21: ['parent',0],
        22: ['child1',0],
        23: ['childn',0],
        69: ['image_index',0],
        70 : ['desc_offset', 0],  # 'Description offset in cncx'
        71 : ['author_offset', 0],  # 'Author offset in cncx'
        72 : ['image_caption_offset', 0],  # 'Image caption offset in cncx',
        73 : ['image_attr_offset', 0],  # 'Image attribution offset in cncx',
}

# Values every NCX entry starts out with; read_ncx() copies this dict and
# overrides the fields for which the index has a tag
default_entry = {
        'pos': -1,
        'len': 0,
        'noffs': -1,
        'text' : "Unknown Text",
        'hlvl' : -1,
        'kind' : "Unknown Class",
        'pos_fid' : None,
        'parent' : -1,
        'child1' : -1,
        'childn' : -1,
        'description': None,
        'author': None,
        'image_caption': None,
        'image_attribution': None,
}
def read_ncx(sections, index, codec):
    """Read the NCX index into a list of entry dicts.

    :param sections: Sections handed to ``read_index``.
    :param index: Number of the index record; ``NULL_INDEX`` yields an
        empty list.
    :param codec: Codec handed to ``read_index``.
    :return: One dict per index entry, based on ``default_entry`` and
        extended with ``name`` and ``num``.
    """
    entries = []
    if index == NULL_INDEX:
        return entries

    # Tags whose value is looked up in the CNCX, and the entry field that
    # receives the text found there
    cncx_fields = {3:'text', 5:'kind', 70:'description',
                   71:'author', 72:'image_caption',
                   73:'image_attribution'}

    table, cncx = read_index(sections, index, codec)
    for num, (text, tag_map) in enumerate(iteritems(table)):
        entry = default_entry.copy()
        entry['name'] = text
        entry['num'] = num

        for tag, (fieldname, i) in iteritems(tag_fieldname_map):
            if tag not in tag_map:
                continue
            fieldvalue = tag_map[tag][i]
            if tag == 6:
                # Appears to be an idx into the KF8 elems table with an
                # offset
                fieldvalue = tuple(tag_map[tag])
            entry[fieldname] = fieldvalue
            if tag in cncx_fields:
                name = cncx_fields[tag]
                entry[name] = cncx.get(fieldvalue, default_entry[name])
        entries.append(entry)

    return entries
def build_toc(index_entries):
    """Build a ``TOC`` out of NCX index entries.

    Entries are added level by level (ascending ``hlvl``), each below the
    node created for the entry whose ``num`` equals its ``parent``; a
    ``parent`` of -1 denotes the root.

    :param index_entries: Entry dicts providing ``hlvl``, ``parent``,
        ``num``, ``href``, ``idtag`` and ``text``.
    :return: The ``TOC``, with play orders set in depth first order.
    """
    ans = TOC(base_path=getcwd())

    # Group the entries per level, keeping their original order
    by_level = {}
    for entry in index_entries:
        by_level.setdefault(entry['hlvl'], []).append(entry)

    nodes = {-1: ans}
    for lvl in sorted(by_level):
        for item in by_level[lvl]:
            parent = nodes[item['parent']]
            title = replace_entities(item['text'], encoding=None)
            nodes[item['num']] = parent.add_item(item['href'], item['idtag'],
                                                 title)

    # Set play orders in depth first order
    for order, node in enumerate(ans.flat()):
        node.play_order = order

    return ans

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, glob
from calibre import CurrentDir
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.utils.logging import default_log
from calibre.ebooks import DRMError
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
from calibre.customize.ui import (plugin_for_input_format,
plugin_for_output_format)
from calibre.utils.ipc.simple_worker import fork_job
class BadFormat(ValueError):
    """Raised by explode() for a file that cannot be tweaked: a Topaz
    file, a file that is not a MOBI file, or one without a KF8 book."""
def do_explode(path, dest):
    """Explode the KF8 book in the MOBI file *path* into directory *dest*.

    :param path: Path of the MOBI file.
    :param dest: Directory that is made current while the book is
        exploded.
    :return: Absolute path of the OPF file that was written.
    """
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())
            try:
                os.remove('debug-raw.html')
            except OSError:
                # Best effort: the debug dump may be missing or not
                # removable, which must not fail the whole operation
                pass

    return opf
def explode(path, dest, question=lambda x:True):
    """Check that *path* is a tweakable MOBI file and explode it in a
    worker process.

    :param path: Path of the MOBI file.
    :param dest: Directory to explode the book into.
    :param question: Callable that receives a confirmation message for
        files holding both KF8 and Mobi6 data; a false result aborts.
    :return: The ``result`` of the ``do_explode`` job, or ``None`` when
        *question* declined.
    :raises BadFormat: For Topaz files, non MOBI files and MOBI files
        without KF8 data.
    :raises DRMError: If the file is encrypted.
    """
    with open(path, 'rb') as stream:
        magic = stream.read(3)
        stream.seek(0)
        if magic == b'TPZ':
            raise BadFormat(_('This is not a MOBI file. It is a Topaz file.'))

        try:
            header = MetadataHeader(stream, default_log)
        except MobiError:
            raise BadFormat(_('This is not a MOBI file.'))

        if header.encryption_type != 0:
            raise DRMError(_('This file is locked with DRM. It cannot be tweaked.'))

        kf8_type = header.kf8_type

        if kf8_type is None:
            raise BadFormat(_('This MOBI file does not contain a KF8 format '
                    'book. KF8 is the new format from Amazon. calibre can '
                    'only tweak MOBI files that contain KF8 books. Older '
                    'MOBI files without KF8 are not tweakable.'))

        if kf8_type == 'joint' and not question(_('This MOBI file contains both KF8 and '
                'older Mobi6 data. Tweaking it will remove the Mobi6 data, which '
                'means the file will not be usable on older Kindles. Are you '
                'sure?')):
            return None

    job = fork_job('calibre.ebooks.mobi.tweak', 'do_explode',
                   args=(path, dest), no_output=True)
    return job['result']
def set_cover(oeb):
    """Use the cover referenced by the guide as the metadata cover.

    Nothing happens when the guide has no ``cover`` entry, when the
    metadata already has a cover, or when the guide's cover is not in the
    manifest.
    """
    if 'cover' not in oeb.guide:
        return
    if oeb.metadata['cover']:
        return
    href = oeb.guide['cover'].href
    hrefs = oeb.manifest.hrefs
    if href in hrefs:
        item = hrefs[href]
        oeb.metadata.clear('cover')
        oeb.metadata.add('cover', item.id)
def do_rebuild(opf, dest_path):
    """Build an AZW3 file at *dest_path* from the exploded book described
    by the OPF file *opf*, with the ``mobi_passthrough`` option turned on.
    """
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    input_plugin = plugin_for_input_format('azw3')
    output_plugin = plugin_for_output_format('azw3')
    options = plumber.opts
    options.mobi_passthrough = True
    oeb = create_oebbook(default_log, opf, options)
    set_cover(oeb)
    output_plugin.convert(oeb, dest_path, input_plugin, options, default_log)
def rebuild(src_dir, dest_path):
    """Rebuild a book from the exploded files in *src_dir* in a worker
    process, using the first ``*.opf`` file found there.

    :raises ValueError: If *src_dir* contains no OPF file.
    """
    candidates = glob.glob(os.path.join(src_dir, '*.opf'))
    if not candidates:
        raise ValueError('No OPF file found in %s'%src_dir)
    opf = candidates[0]
    # For debugging, uncomment the following two lines
    # def fork_job(a, b, args=None, no_output=True):
    #     do_rebuild(*args)
    fork_job('calibre.ebooks.mobi.tweak', 'do_rebuild', args=(opf, dest_path),
             no_output=True)

View File

@@ -0,0 +1,646 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, string, zlib, os
from collections import OrderedDict
from io import BytesIO
from calibre.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
from calibre.utils.imghdr import what
from calibre.ebooks import normalize
from polyglot.builtins import unicode_type, range, as_bytes, map
from tinycss.color3 import parse_color_string
# 10 MiB, in bytes; the default value of rescale_image()'s maxsizeb
IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
class PolyglotDict(dict):
    """A dict that stores text keys as UTF-8 encoded bytes.

    A text key and its UTF-8 encoded form therefore address the same item
    for item access, membership tests, deletion, ``get``, ``pop`` and
    ``setdefault``. Keys passed to the constructor or to ``update`` are
    stored as given.
    """

    @staticmethod
    def _as_key(key):
        # Text keys are encoded, anything else is used unchanged
        if isinstance(key, unicode_type):
            key = key.encode('utf-8')
        return key

    def __setitem__(self, key, val):
        dict.__setitem__(self, self._as_key(key), val)

    def __getitem__(self, key):
        return dict.__getitem__(self, self._as_key(key))

    def __contains__(self, key):
        return dict.__contains__(self, self._as_key(key))

    def __delitem__(self, key):
        dict.__delitem__(self, self._as_key(key))

    def get(self, key, default=None):
        return dict.get(self, self._as_key(key), default)

    def pop(self, key, *default):
        return dict.pop(self, self._as_key(key), *default)

    def setdefault(self, key, default=None):
        return dict.setdefault(self, self._as_key(key), default)
def decode_string(raw, codec='utf-8', ordt_map=None):
    """Decode a string that is prefixed by its length in one byte.

    :param raw: Bytestring starting with the length byte.
    :param codec: Codec used to decode the string when *ordt_map* is not
        given.
    :param ordt_map: Optional lookup from byte value to character, used
        instead of *codec*.
    :return: The decoded text and the number of bytes it occupies in
        *raw* according to the length byte (length + 1).
    """
    (length,) = struct.unpack(b'>B', raw[:1])
    consumed = 1 + length
    payload = raw[1:consumed]
    if not ordt_map:
        return payload.decode(codec), consumed
    chars = [ordt_map[b] for b in bytearray(payload)]
    return ''.join(chars), consumed
def decode_hex_number(raw, codec='utf-8'):
    '''
    Decode a number stored as a length prefixed string of hexadecimal
    digits: the first byte gives the number of bytes that follow, and
    those bytes are the hexadecimal representation of the number.

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the digits

    :return: The number and the number of bytes from raw that the number
        occupies.
    '''
    digits, consumed = decode_string(raw, codec=codec)
    number = int(digits, 16)
    return number, consumed
def encode_string(raw):
    """Return *raw*, converted with ``as_bytes``, prefixed by a single
    byte holding its length in bytes."""
    payload = bytearray(as_bytes(raw))
    return bytes(bytearray([len(payload)]) + payload)
def encode_number_as_hex(num):
    '''
    Encode num as a length prefixed string of upper case hexadecimal
    digits, padded with a leading zero to an even number of digits, and
    return the resulting bytestring. The first byte gives the number of
    bytes that follow; those bytes are the hexadecimal representation of
    the number.
    '''
    digits = hex(num)[2:].upper().encode('ascii')
    if len(digits) % 2:
        digits = b'0' + digits
    return encode_string(digits)
def encint(value, forward=True):
    '''
    Encode the non-negative integer ``value`` as a Mobipocket variable-width
    integer (vwi) and return it as a bytestring.

    A vwi stores the number big-endian, 7 bits per byte (bits 1-7). Bit 8 is
    set on exactly one byte to mark where the number stops:

      * forward encoded: only the last (least significant) byte has bit 8 set
      * backward encoded: only the first (most significant) byte has bit 8 set

    For example, the number 0x11111 = 0b10001000100010001 is forward encoded
    as:

        0x04 0x22 0x91 = 0b100 0b100010 0b10010001

    and backward encoded as:

        0x84 0x22 0x11 = 0b10000100 0b100010 0b10001

    A forward encoded value is what decint(raw, forward=True) reads from the
    start of a buffer, a backward encoded value is what
    decint(raw, forward=False) reads from the end of a buffer.

    :param value: The integer to encode
    :param forward: Select forward (True) or backward (False) encoding
    :raises ValueError: If value is negative
    '''
    if value < 0:
        raise ValueError('Cannot encode negative numbers as vwi')
    # Collect the 7 bit groups, least significant group first
    septets = bytearray([value & 0x7f])
    value >>= 7
    while value:
        septets.append(value & 0x7f)
        value >>= 7
    # Flag the least significant group for forward encoding, the most
    # significant one otherwise, then switch to big-endian order
    septets[0 if forward else -1] |= 0x80
    septets.reverse()
    return bytes(septets)
def decint(raw, forward=True):
    '''
    Read a variable width integer from the bytestring or bytearray raw.

    This is the inverse of encint(). Bytes are read until one with bit 8 set
    has been read, or until raw is exhausted.

    :param raw: The data to read from
    :param forward: If True bytes are read from the start of raw, otherwise
                    from the end of raw
    :return: The integer and the number of bytes read
    '''
    data = bytearray(raw)
    stream = data if forward else reversed(data)
    septets = []
    for octet in stream:
        septets.append(octet & 0x7f)
        if octet & 0x80:
            # The flagged byte is the last one that belongs to the number
            break
    if not forward:
        # The groups were collected back to front, restore big-endian order
        septets.reverse()
    result = 0
    for septet in septets:
        result = (result << 7) | septet
    return result, len(septets)
def test_decint(num):
    '''
    Check that num survives an encint()/decint() round trip, both forward
    and backward encoded. Raises ValueError if it does not.
    '''
    for forward in (True, False):
        encoded = encint(num, forward=forward)
        expected = (num, len(encoded))
        actual = decint(encoded, forward=forward)
        if actual != expected:
            raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
                num, forward, expected, actual))
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Convert image setting all transparent pixels to white and changing format
    to JPEG, then try to bring the result down to at most maxsizeb bytes.

    The size is reduced in two stages: first the image is re-encoded at
    decreasing compression quality, then it is repeatedly shrunk. If neither
    stage succeeds the last attempt is returned, even though it is larger
    than maxsizeb.

    :param data: The image as a bytestring
    :param maxsizeb: The byte size the result should not exceed
    :param dimen: If not None a thumbnail is generated. Either a single
                  number (used as both width and height) or a
                  (width, height) pair.
    :return: The image as a bytestring
    '''
    if dimen is None:
        # Replace transparent pixels with white pixels and convert to JPEG
        data = save_cover_data_to(data)
    else:
        if hasattr(dimen, '__len__'):
            width, height = dimen
        else:
            width = height = dimen
        data = scale_image(data, width=width, height=height, compression_quality=90)[-1]
    if len(data) <= maxsizeb:
        return data

    # Stage 1: re-encode the converted image at progressively lower quality.
    # Every attempt starts from the same source data.
    source = data
    quality = 90
    while len(data) > maxsizeb and quality >= 5:
        data = image_to_data(image_from_data(source), compression_quality=quality)
        quality -= 5
    if len(data) <= maxsizeb:
        return data

    # Stage 2: shrink the image. Every pass resizes the output of the previous
    # pass, so the reduction compounds.
    # NOTE(review): quality is 0 at this point as stage 1 ran to exhaustion
    scale = 0.9
    while len(data) > maxsizeb and scale >= 0.05:
        img = image_from_data(data)
        w, h = img.width(), img.height()
        img = resize_image(img, int(scale * w), int(scale * h))
        data = image_to_data(img, compression_quality=quality)
        scale -= 0.05
    return data
def get_trailing_data(record, extra_data_flags):
    '''
    Split the trailing data entries off the end of a text record.

    Bit 0 of extra_data_flags announces trailing multibyte character bytes,
    every higher bit that is set announces one entry terminated by a
    backward encoded vwi holding the size of the entry (see
    encode_trailing_data()). Entries are removed from the end of the record,
    lowest of the higher bits first, the multibyte bytes last.

    :param record: The text record as a bytestring
    :param extra_data_flags: The extra data flags from the MOBI header
    :return: Trailing data, record - trailing data. The trailing data is an
             ordered dictionary mapping bit number to the entry as a
             bytestring, entries without a payload are left out.
    '''
    trailing = OrderedDict()
    bit = 0
    pending = extra_data_flags >> 1
    while pending:
        bit += 1
        has_entry = pending & 0b1
        pending >>= 1
        if not has_entry:
            continue
        size, size_len = decint(record, forward=False)
        if size > size_len:
            # The entry holds more than just its own size field
            trailing[bit] = record[-size:-size_len]
        record = record[:-size]
    # Read multibyte chars if any
    if extra_data_flags & 0b1:
        # Only the first two bits are used for the size since there can
        # never be more than 3 trailing multibyte chars
        size = (ord(record[-1:]) & 0b11) + 1
        if size > 1:
            trailing[0] = record[-size:-1]
        record = record[:-size]
    return trailing, record
def encode_trailing_data(raw):
    '''
    Return raw followed by its trailing data size field, i.e. a bytestring of
    the form

        <data><size>

    where data is raw and size is a backward encoded vwi whose value is the
    length of the entire returned bytestring (size field included).

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size counts its own bytes, so grow the assumed width of the size
    # field until the encoded size really has that width
    size_len = 1
    encoded = encint(len(raw) + size_len, forward=False)
    while len(encoded) != size_len:
        size_len += 1
        encoded = encint(len(raw) + size_len, forward=False)
    return raw + encoded
def encode_fvwi(val, flags, flag_size=4):
    '''
    Encode val together with the lowest flag_size bits of flags as a fvwi.

    The flag bits occupy the lowest flag_size bits of the number that is
    passed to encint(), val is stored above them. This encoding is used in
    the trailing byte sequences for indexing.

    :return: The encoded bytestring
    '''
    flag_mask = (1 << flag_size) - 1
    combined = (val << flag_size) | (flags & flag_mask)
    return encint(combined)
def decode_fvwi(byts, flag_size=4):
    '''
    Decode a fvwi from the start of byts, the inverse of encode_fvwi().

    :return: number, flags, consumed. flags are the lowest flag_size bits of
             the decoded vwi, number is whatever is stored above them and
             consumed is the number of bytes that were read.
    '''
    arg, consumed = decint(bytes(byts))
    val = arg >> flag_size
    flags = arg & ((1 << flag_size) - 1)
    return val, flags, consumed
def decode_tbs(byts, flag_size=4):
    '''
    Decode one entry of a trailing byte sequence (used for indexing).

    An entry starts with a fvwi (see encode_fvwi()). Its flags decide which
    extra values follow it, they are read in this order:

      * 0b1000: no extra value, only recorded as True (and only when
        flag_size is larger than 3)
      * 0b0010: a forward encoded vwi
      * 0b0100: a single byte
      * 0b0001: a forward encoded vwi

    :return: The fvwi number, a dictionary mapping flag bits to the
             associated data and the number of bytes consumed.
    '''
    byts = bytes(byts)
    val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
    extra = {}
    # consumed doubles as the read position inside byts
    if flag_size > 3 and flags & 0b1000:
        extra[0b1000] = True
    if flags & 0b0010:
        number, used = decint(byts[consumed:])
        extra[0b0010] = number
        consumed += used
    if flags & 0b0100:
        extra[0b0100] = ord(byts[consumed:consumed + 1])
        consumed += 1
    if flags & 0b0001:
        number, used = decint(byts[consumed:])
        extra[0b0001] = number
        consumed += used
    return val, extra, consumed
def encode_tbs(val, extra, flag_size=4):
    '''
    Encode the number val and the data in the extra dict as one trailing byte
    sequence entry, the inverse of decode_tbs().

    The keys of extra are the flag bits, all of them are OR-ed together to
    form the flags of the leading fvwi. The values stored under 0b0010,
    0b0100 (a single byte value) and 0b0001 are appended in that order.
    '''
    flags = 0
    for bit in extra:
        flags |= bit
    parts = [encode_fvwi(val, flags, flag_size=flag_size)]
    if 0b0010 in extra:
        parts.append(encint(extra[0b0010]))
    if 0b0100 in extra:
        parts.append(bytes(bytearray([extra[0b0100]])))
    if 0b0001 in extra:
        parts.append(encint(extra[0b0001]))
    return b''.join(parts)
def utf8_text(text):
    '''
    Convert a possibly null string to utf-8 bytes.

    The text is stripped, decoded as utf-8 (replacing undecodable bytes) if
    it is not unicode already, passed through normalize() and encoded as
    utf-8. If text is empty or consists only of whitespace the translated
    string 'Unknown' is used instead.
    '''
    stripped = text.strip() if text else text
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, unicode_type):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of ``multiple``. raw itself is returned if it is aligned
    already.
    '''
    remainder = len(raw) % multiple
    return raw + pad * (multiple - remainder) if remainder else raw
def detect_periodical(toc, log=None):
    '''
    Detect if the TOC object toc contains a periodical that conforms to the
    structure required by kindlegen to generate a periodical.

    The first entry of toc must have the class 'periodical' and every
    descendant must have the class that belongs to its depth(): 'article'
    for 1, 'section' for 2 and 'periodical' for 3. Nodes with a depth above
    3 are not allowed.

    :param log: If not None the reason for a rejection found while walking
                the descendants is reported via log.debug()
    :return: True if toc has the structure of a periodical, False otherwise
    '''
    if toc.count() < 1:
        return False
    if not toc[0].klass == 'periodical':
        return False

    # depth -> (required class, message logged if the class is different)
    required = {
        1: ('article',
            'Not a periodical: Deepest node does not have class="article"'),
        2: ('section',
            'Not a periodical: Second deepest node does not have'
            ' class="section"'),
        3: ('periodical',
            'Not a periodical: Third deepest node'
            ' does not have class="periodical"'),
    }
    for node in toc.iterdescendants():
        depth = node.depth()
        reason = None
        if depth > 3:
            reason = 'Not a periodical: Has nodes of depth > 3'
        elif depth in required:
            klass, message = required[depth]
            if node.klass != klass:
                reason = message
        if reason is not None:
            if log is not None:
                log.debug(reason)
            return False
    return True
def count_set_bits(num):
    '''
    Return the number of bits that are set in the binary representation of
    the absolute value of the integer num.
    '''
    return bin(abs(num)).count('1')
def to_base(num, base=32, min_num_digits=None):
    '''
    Return the integer num written in the given base, using the digits 0-9
    followed by the upper case letters A-Z.

    :param base: The base to convert to
    :param min_num_digits: If not None the digits are left padded with
                           zeros to at least this many. A minus sign is
                           placed in front of the padding and not counted.
    '''
    if num == 0:
        return '0' if min_num_digits is None else '0' * min_num_digits
    alphabet = string.digits + string.ascii_uppercase
    negative = not num >= 0
    magnitude = -num if negative else num
    out = []  # Least significant digit first
    while magnitude:
        magnitude, remainder = divmod(magnitude, base)
        out.append(alphabet[remainder])
    if min_num_digits is not None and len(out) < min_num_digits:
        out.extend('0' * (min_num_digits - len(out)))
    if negative:
        out.append('-')
    return ''.join(reversed(out))
def mobify_image(data):
    '''
    Return data converted to GIF if it is detected as a PNG image, as the
    Kindle cannot display some PNG images. Any other data is returned
    unchanged.
    '''
    if what(None, data) != 'png':
        return data
    return png_data_to_gif_data(data)
# Font records {{{
def read_font_record(data, extent=1040):
    '''
    Return the font encoded in the MOBI FONT record represented by data.

    Problems with the record are reported via the ``err`` field of the
    returned dict, not by raising an exception.

    :param data: The FONT record as a bytestring
    :param extent: The number of obfuscated bytes. So far I have only
                   encountered files with 1040 obfuscated bytes. If you
                   encounter an obfuscated record for which this function
                   fails, try different extent values (easily automated).
                   None means that all the font data is obfuscated.
    :return: A dict with the fields:

        raw_data is the raw data in the font record
        font_data is the decoded font_data or None if an error occurred
        err is not None if some error occurred
        ext is the font type (ttf for TrueType, otf for OpenType, dat for
        unknown and failed if an error occurred)
        headers is a dict of the decoded header fields from the font record
        or None if decoding them failed
        encrypted is True if the font data was XOR obfuscated
    '''
    # Format:
    # bytes 0 - 3: 'FONT'
    # bytes 4 - 7: Uncompressed size
    # bytes 8 - 11: flags
    #   bit 1 - zlib compression
    #   bit 2 - XOR obfuscated
    # bytes 12 - 15: offset to start of compressed data
    # bytes 16 - 19: length of XOR string
    # bytes 20 - 23: offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except Exception:
        # Too short (or not binary data at all), report instead of raising
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))
        try:
            for n in range(extent):
                buf[n] ^= key[n%xor_len]  # XOR of buf and key
        except (ZeroDivisionError, IndexError):
            # A malformed record: the key has zero length or lies (partly)
            # outside of the record
            ans['err'] = 'Invalid XOR key in font record'
            return ans
        font_data = bytes(buf)
        ans['encrypted'] = True

    if flags & 0b1:
        # ZLIB compressed data
        try:
            font_data = zlib.decompress(font_data)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

        if len(font_data) != usize:
            ans['err'] = 'Uncompressed font size mismatch'
            return ans

    ans['font_data'] = font_data
    # The font type is detected from the first four bytes of the font
    sig = font_data[:4]
    ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
                    else 'otf' if sig == b'OTTO' else 'dat')

    return ans
def write_font_record(data, obfuscate=True, compress=True):
    '''
    Write the ttf/otf font represented by data into a font record. See
    read_font_record() for details on the format of the record.

    :param data: The font as a bytestring
    :param obfuscate: If True, and there are at least 1040 bytes of (possibly
                      compressed) data, the first 1040 bytes are XOR-ed with
                      a random 20 byte key that is stored in the record
    :param compress: If True the font is zlib compressed
    :return: The font record as a bytestring
    '''
    key_len = 20
    usize = len(data)  # The header stores the size before compression
    flags = 0
    xor_key = b''
    if compress:
        data = zlib.compress(data, 9)
        flags |= 0b1
    if obfuscate and len(data) >= 1040:
        xor_key = os.urandom(key_len)
        flags |= 0b10
        key = bytearray(xor_key)
        scrambled = bytearray(data)
        for pos in range(1040):
            scrambled[pos] ^= key[pos % key_len]
        data = bytes(scrambled)

    # Layout: b'FONT', five header fields, the XOR key (if any), the data
    header_format = b'>5L'
    key_start = 4 + struct.calcsize(header_format)
    data_start = key_start + len(xor_key)
    fields = struct.pack(header_format, usize, flags, data_start,
            len(xor_key), key_start)
    return b'FONT' + fields + xor_key + data
# }}}
def create_text_record(text):
    '''
    Return a Palmdoc record of size RECORD_SIZE from the text file object.

    The record starts at the current position of text. In case the record
    ends in the middle of a multibyte character the bytes needed to complete
    that character are returned as well. On return text is positioned at the
    start of the next record.

    :param text: A seekable file object containing utf-8 encoded bytes
    :return: data, overlap: where both are byte strings. overlap is the
             extra bytes needed to complete the truncated multibyte
             character.
    '''
    start = text.tell()
    text.seek(0, 2)
    # next_pos is the position of the next record
    next_pos = min((start + RECORD_SIZE, text.tell()))
    # Number of bytes from the next record needed to complete the last
    # character in this record
    overlap_len = 0

    # Walk backwards from the end of the record, one byte at a time, until
    # the bytes read contain at least one valid utf-8 character
    tail = b''
    while not tail.decode('utf-8', 'ignore'):
        width = len(tail) + 1
        text.seek(next_pos - width)
        tail = text.read(width)
    # tail now has one valid utf-8 char and possibly some bytes that belong
    # to a truncated char

    try:
        tail.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        # There are some truncated bytes in tail, take bytes from the next
        # record, one at a time, until the character is complete
        in_record = len(tail)
        while True:
            text.seek(next_pos - in_record)
            tail = text.read(len(tail) + 1)
            try:
                tail.decode('utf-8')
            except UnicodeDecodeError:
                continue
            break
        overlap_len = len(tail) - in_record

    text.seek(start)
    data = text.read(RECORD_SIZE)
    overlap = text.read(overlap_len)
    text.seek(next_pos)

    return data, overlap
class CNCX(object):  # {{{

    '''
    Create the CNCX records. These are records containing all the strings from
    an index. Each record is of the form: <vwi string size><utf-8 encoded
    string>

    After construction ``records`` is the list of (4 byte aligned) records
    and ``strings`` maps every string to its offset. Offsets of strings in
    the n-th record (counting from zero) start at n * 0x10000.
    '''

    # Strings are cut to this many characters before they are encoded
    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        self.strings = OrderedDict((s, 0) for s in strings)
        self.records = []

        # kindlegen appears to use 1024, PDB limit is 0x10000
        record_limit = 0x10000 - 1024
        offset = 0
        buf = BytesIO()
        for key in self.strings:
            encoded = utf8_text(key[:self.MAX_STRING_LENGTH])
            entry = encint(len(encoded)) + encoded
            if buf.tell() + len(entry) > record_limit:
                # The entry does not fit, finish this record and start a new
                # one
                self.records.append(align_block(buf.getvalue()))
                buf.seek(0)
                buf.truncate(0)
                offset = len(self.records) * 0x10000
            buf.write(entry)
            self.strings[key] = offset
            offset += len(entry)

        pending = buf.getvalue()
        if pending:
            self.records.append(align_block(pending))

    def __getitem__(self, string):
        '''Return the offset of string'''
        return self.strings[string]

    def __bool__(self):
        '''True if there is at least one record'''
        return bool(self.records)
    __nonzero__ = __bool__

    def __len__(self):
        '''The number of records'''
        return len(self.records)

# }}}
def is_guide_ref_start(ref):
    '''
    Test whether the guide reference ref marks the start of the book.

    That is the case if its title is 'start' or its type is one of 'start',
    'other.start' and 'text' (both compared ignoring case). If the title does
    not match and ref.type is falsy, ref.type itself is returned.
    '''
    if ref.title.lower() == 'start':
        return True
    kind = ref.type
    return kind and kind.lower() in {'start', 'other.start', 'text'}
def convert_color_for_font_tag(val):
    '''
    Convert the CSS color val into the #rrggbb form.

    val is parsed with parse_color_string(). If it cannot be parsed, or is
    currentColor, val is returned unchanged. Otherwise the first three
    components of the parsed color are clamped to the range 0 to 1 and
    written as two hexadecimal digits each; a fourth component is ignored.
    '''
    rgba = parse_color_string(unicode_type(val or ''))
    if rgba is None or rgba == 'currentColor':
        return val
    # Components may lie outside of [0, 1]. Clamp on both sides, otherwise
    # a negative component would be formatted with a minus sign.
    rgb = (min(1, max(0, channel)) for channel in rgba[:3])
    return '#' + ''.join('%02x' % int(channel * 255) for channel in rgb)

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# NOTE(review): these look like the compression type identifiers stored in the
# PalmDOC/MOBI header -- confirm against the writer code that consumes them
UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480  # 0x4448, i.e. the ASCII bytes 'DH'
# 63 KiB; presumably the largest size allowed for an image record -- TODO confirm
PALM_MAX_IMAGE_SIZE = 63 * 1024

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
write_font_record)
from calibre.ebooks import generate_masthead
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, unicode_type
# A 1x1 GIF89a image carrying the comment 'calibre-placeholder-gif-for-azw3'.
# Resources.serialize() stores it in place of image records that are not used.
PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\xf0\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00!\xfe calibre-placeholder-gif-for-azw3\x00,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;' # noqa
class Resources(object):

    '''
    Collect the raster images (and optionally the fonts) of an OEB book as a
    list of records.

    The records are gathered by add_resources(), which is called from the
    constructor, and by add_extra_images(). serialize() appends them to the
    list of records of the book.
    '''

    def __init__(self, oeb, opts, is_periodical, add_fonts=False,
            process_images=True):
        '''
        :param oeb: The OEB book whose manifest is scanned for resources
        :param opts: Conversion options, ``mobi_keep_original_images`` is
                     read from it
        :param is_periodical: If True a default masthead image is generated
                              when the guide of the book has none
        :param add_fonts: If True ttf/otf fonts are added as font records
        :param process_images: If False image data is stored unmodified
        '''
        self.oeb, self.log, self.opts = oeb, oeb.log, opts
        self.is_periodical = is_periodical
        self.process_images = process_images

        # href -> one based number of the record that holds the resource
        self.item_map = {}
        self.records = []
        # href -> mime type of the image data actually stored
        self.mime_map = {}
        self.masthead_offset = 0
        # Zero based indices into self.records of the images that are always
        # kept by serialize(): masthead, cover and thumbnail
        self.used_image_indices = set()
        # Zero based indices into self.records of all the images stored by
        # add_resources()
        self.image_indices = set()
        self.cover_offset = self.thumbnail_offset = None
        self.has_fonts = False

        self.add_resources(add_fonts)

    def process_image(self, data):
        '''
        Return the image data in the form it should be stored in.

        Depending on ``opts.mobi_keep_original_images`` this is the result
        of mobify_image() or of rescale_image(). If that fails for a PNG
        image, the image is run through optimize_png() and the conversion is
        tried once more. data is returned unchanged if image processing is
        turned off.
        '''
        if not self.process_images:
            return data
        func = mobify_image if self.opts.mobi_keep_original_images else rescale_image
        try:
            return func(data)
        except Exception:
            if 'png' != what(None, data):
                raise
            # optimize_png() works on files, so go via a temporary file
            with PersistentTemporaryFile(suffix='.png') as pt:
                pt.write(data)
            try:
                from calibre.utils.img import optimize_png
                optimize_png(pt.name)
                # Close the file before it is removed below
                with lopen(pt.name, 'rb') as f:
                    data = f.read()
            finally:
                os.remove(pt.name)
            return func(data)

    def add_resources(self, add_fonts):
        '''
        Fill self.records with the masthead, all raster images from the
        manifest, a thumbnail of the cover and, if add_fonts is True, the
        ttf/otf fonts from the manifest.
        '''
        oeb = self.oeb
        oeb.logger.info('Serializing resources...')
        # index is the one based number of the next record to be added
        index = 1

        mh_href = None
        if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
            mh_href = oeb.guide['masthead'].href
            # Reserve the first record, it is filled in when the masthead
            # image is reached in the loop over the manifest below
            self.records.append(None)
            index += 1
            self.used_image_indices.add(0)
            self.image_indices.add(0)
        elif self.is_periodical:
            # Generate a default masthead
            data = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
            self.records.append(data)
            self.used_image_indices.add(0)
            self.image_indices.add(0)
            index += 1

        cover_href = self.cover_offset = self.thumbnail_offset = None
        if (oeb.metadata.cover and
                unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode_type(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            cover_href = item.href

        for item in self.oeb.manifest.values():
            if item.media_type not in OEB_RASTER_IMAGES:
                continue
            try:
                data = self.process_image(item.data)
            except Exception:
                # A bad image is skipped, it must not abort the conversion
                self.log.warn('Bad image file %r' % item.href)
                continue
            else:
                if mh_href and item.href == mh_href:
                    self.records[0] = data
                    continue

                self.image_indices.add(len(self.records))
                self.records.append(data)
                self.item_map[item.href] = index
                self.mime_map[item.href] = 'image/%s'%what(None, data)
                index += 1

                if cover_href and item.href == cover_href:
                    self.cover_offset = self.item_map[item.href] - 1
                    self.used_image_indices.add(self.cover_offset)
                    try:
                        data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
                            maxsizeb=MAX_THUMB_SIZE)
                    except Exception:
                        self.log.warn('Failed to generate thumbnail')
                    else:
                        # The thumbnail is stored directly after the cover
                        self.image_indices.add(len(self.records))
                        self.records.append(data)
                        self.thumbnail_offset = index - 1
                        self.used_image_indices.add(self.thumbnail_offset)
                        index += 1
            finally:
                item.unload_data_from_memory()

        if add_fonts:
            for item in self.oeb.manifest.values():
                if item.href and item.href.rpartition('.')[-1].lower() in {
                        'ttf', 'otf'} and isinstance(item.data, bytes):
                    self.records.append(write_font_record(item.data))
                    self.item_map[item.href] = len(self.records)
                    self.has_fonts = True

    def add_extra_images(self):
        '''
        Add any images that were created after the call to add_resources()
        '''
        for item in self.oeb.manifest.values():
            if (item.media_type not in OEB_RASTER_IMAGES or item.href in self.item_map):
                continue
            try:
                data = self.process_image(item.data)
            except Exception:
                self.log.warn('Bad image file %r' % item.href)
            else:
                self.records.append(data)
                self.item_map[item.href] = len(self.records)
            finally:
                item.unload_data_from_memory()

    def serialize(self, records, used_images):
        '''
        Append the resource records to the list records.

        Images stored by add_resources() that are neither listed in
        used_images (by href) nor always kept (masthead, cover, thumbnail)
        are replaced with PLACEHOLDER_GIF first. The replacement is made in
        self.records.
        '''
        used_image_indices = self.used_image_indices | {
            v-1 for k, v in iteritems(self.item_map) if k in used_images}
        for i in self.image_indices-used_image_indices:
            self.records[i] = PLACEHOLDER_GIF
        records.extend(self.records)

    def __bool__(self):
        '''True if there is at least one resource record'''
        return bool(self.records)
    __nonzero__ = __bool__