mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-17 11:43:30 +02:00
Initial import
This commit is contained in:
15
ebook_converter/ebooks/mobi/__init__.py
Normal file
15
ebook_converter/ebooks/mobi/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
|
||||
class MobiError(Exception):
    """Error raised when MOBI data cannot be processed."""
|
||||
|
||||
|
||||
# That might be a bit small on the PW, but Amazon/KG 2.5 still uses these values, even when delivered to a PW
|
||||
MAX_THUMB_SIZE = 16 * 1024
|
||||
MAX_THUMB_DIMEN = (180, 240)
|
||||
108
ebook_converter/ebooks/mobi/huffcdic.py
Normal file
108
ebook_converter/ebooks/mobi/huffcdic.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
|
||||
and igorsk.
|
||||
'''
|
||||
|
||||
import struct
|
||||
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from polyglot.builtins import map
|
||||
|
||||
|
||||
class Reader(object):
    """Decoder for MOBI text compressed with the Huff/CDIC scheme.

    Call ``load_huff`` once with the HUFF record, then ``load_cdic`` for
    every CDIC record, and finally ``unpack`` for each compressed section.
    """

    def __init__(self):
        # Bound unpacker for one big-endian unsigned 64-bit integer.
        self.q = struct.Struct(b'>Q').unpack_from

    def load_huff(self, huff):
        """Build the code-length tables from the HUFF record ``huff``.

        Raises MobiError if the record does not start with the expected
        ``HUFF`` signature and header length.
        """
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # Low 5 bits: code length, bit 7: terminal flag, rest: max code.
            codelen = v & 0x1f
            term = v & 0x80
            maxcode = v >> 8
            assert codelen != 0
            if codelen <= 8:
                assert term
            # Left-align the max code in a 32-bit window, low bits all set.
            maxcode = ((maxcode + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)

        self.dict1 = tuple(
            dict1_unpack(v)
            for v in struct.unpack_from(b'>256L', huff, off1))

        # dict2 interleaves (mincode, maxcode) pairs for code lengths 1..32;
        # a dummy entry is prepended so the index equals the code length.
        dict2 = struct.unpack_from(b'>64L', huff, off2)
        self.mincode = tuple(
            low << (32 - codelen)
            for codelen, low in enumerate((0,) + dict2[0::2]))
        self.maxcode = tuple(
            ((high + 1) << (32 - codelen)) - 1
            for codelen, high in enumerate((0,) + dict2[1::2]))

        self.dictionary = []

    def load_cdic(self, cdic):
        """Append the phrases stored in the CDIC record ``cdic``.

        Raises MobiError if the record does not start with the expected
        ``CDIC`` signature and header length.
        """
        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
            raise MobiError('Invalid CDIC header')
        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
        # Never read more entries than the total phrase count still allows.
        n = min(1 << bits, phrases - len(self.dictionary))
        read_u16 = struct.Struct(b'>H').unpack_from

        def getslice(off):
            # Each entry is a 16-bit length word followed by the phrase; the
            # top bit of the length word is kept as the entry's flag.
            blen, = read_u16(cdic, 16 + off)
            start = 18 + off
            phrase = cdic[start:start + (blen & 0x7fff)]
            return (phrase, blen & 0x8000)

        offsets = struct.unpack_from(b'>%dH' % n, cdic, 16)
        self.dictionary.extend(getslice(off) for off in offsets)

    def unpack(self, data):
        """Return the decompressed bytes for the compressed ``data``."""
        read_u64 = self.q
        mask32 = (1 << 32) - 1

        bitsleft = len(data) * 8
        # Pad so that the 64-bit reads near the end never run off the buffer.
        data += b'\x00' * 8
        pos = 0
        window, = read_u64(data, pos)
        shift = 32

        pieces = []
        while True:
            if shift <= 0:
                # Slide the 64-bit window forward by four bytes.
                pos += 4
                window, = read_u64(data, pos)
                shift += 32
            code = (window >> shift) & mask32

            codelen, term, maxcode = self.dict1[code >> 24]
            if not term:
                # Not resolved by the first byte: lengthen the code until it
                # falls inside the range for that length.
                while code < self.mincode[codelen]:
                    codelen += 1
                maxcode = self.maxcode[codelen]

            shift -= codelen
            bitsleft -= codelen
            if bitsleft < 0:
                break

            index = (maxcode - code) >> (32 - codelen)
            phrase, flag = self.dictionary[index]
            if not flag:
                # The phrase is itself compressed: expand it once and cache
                # the result. The slot is None while the expansion runs.
                self.dictionary[index] = None
                phrase = self.unpack(phrase)
                self.dictionary[index] = (phrase, 1)
            pieces.append(phrase)
        return b''.join(pieces)
|
||||
|
||||
|
||||
class HuffReader(object):
    """Wrapper that configures a Reader from a list of HUFF/CDIC records."""

    def __init__(self, huffs):
        # The first record is loaded as the HUFF table, all remaining
        # records are loaded as CDIC dictionaries, in order.
        self.reader = reader = Reader()
        reader.load_huff(huffs[0])
        for cdic in huffs[1:]:
            reader.load_cdic(cdic)

    def unpack(self, section):
        """Decompress ``section`` using the configured reader."""
        return self.reader.unpack(section)
|
||||
355
ebook_converter/ebooks/mobi/langcodes.py
Normal file
355
ebook_converter/ebooks/mobi/langcodes.py
Normal file
@@ -0,0 +1,355 @@
|
||||
#!/usr/bin/env python2
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from struct import pack
|
||||
from calibre.utils.localization import lang_as_iso639_1
|
||||
|
||||
lang_codes = {
|
||||
}
|
||||
|
||||
main_language = {
|
||||
0 : "NEUTRAL",
|
||||
54 : "AFRIKAANS",
|
||||
28 : "ALBANIAN",
|
||||
1 : "ARABIC",
|
||||
43 : "ARMENIAN",
|
||||
77 : "ASSAMESE",
|
||||
44 : "AZERI",
|
||||
45 : "BASQUE",
|
||||
35 : "BELARUSIAN",
|
||||
69 : "BENGALI",
|
||||
2 : "BULGARIAN",
|
||||
3 : "CATALAN",
|
||||
4 : "CHINESE",
|
||||
# 26 : "CROATIAN",
|
||||
5 : "CZECH",
|
||||
6 : "DANISH",
|
||||
19 : "DUTCH",
|
||||
9 : "ENGLISH",
|
||||
37 : "ESTONIAN",
|
||||
56 : "FAEROESE",
|
||||
41 : "FARSI",
|
||||
11 : "FINNISH",
|
||||
12 : "FRENCH",
|
||||
55 : "GEORGIAN",
|
||||
7 : "GERMAN",
|
||||
8 : "GREEK",
|
||||
71 : "GUJARATI",
|
||||
13 : "HEBREW",
|
||||
57 : "HINDI",
|
||||
14 : "HUNGARIAN",
|
||||
15 : "ICELANDIC",
|
||||
33 : "INDONESIAN",
|
||||
16 : "ITALIAN",
|
||||
17 : "JAPANESE",
|
||||
75 : "KANNADA",
|
||||
63 : "KAZAK",
|
||||
87 : "KONKANI",
|
||||
18 : "KOREAN",
|
||||
38 : "LATVIAN",
|
||||
39 : "LITHUANIAN",
|
||||
47 : "MACEDONIAN",
|
||||
62 : "MALAY",
|
||||
76 : "MALAYALAM",
|
||||
58 : "MALTESE",
|
||||
78 : "MARATHI",
|
||||
97 : "NEPALI",
|
||||
20 : "NORWEGIAN",
|
||||
72 : "ORIYA",
|
||||
21 : "POLISH",
|
||||
22 : "PORTUGUESE",
|
||||
70 : "PUNJABI",
|
||||
23 : "RHAETOROMANIC",
|
||||
24 : "ROMANIAN",
|
||||
25 : "RUSSIAN",
|
||||
59 : "SAMI",
|
||||
79 : "SANSKRIT",
|
||||
26 : "SERBIAN",
|
||||
27 : "SLOVAK",
|
||||
36 : "SLOVENIAN",
|
||||
46 : "SORBIAN",
|
||||
10 : "SPANISH",
|
||||
48 : "SUTU",
|
||||
65 : "SWAHILI",
|
||||
29 : "SWEDISH",
|
||||
73 : "TAMIL",
|
||||
68 : "TATAR",
|
||||
74 : "TELUGU",
|
||||
30 : "THAI",
|
||||
49 : "TSONGA",
|
||||
50 : "TSWANA",
|
||||
31 : "TURKISH",
|
||||
34 : "UKRAINIAN",
|
||||
32 : "URDU",
|
||||
67 : "UZBEK",
|
||||
42 : "VIETNAMESE",
|
||||
52 : "XHOSA",
|
||||
53 : "ZULU",
|
||||
}
|
||||
|
||||
sub_language = {
|
||||
0 : "NEUTRAL",
|
||||
# 1 : "ARABIC_SAUDI_ARABIA",
|
||||
# 2 : "ARABIC_IRAQ",
|
||||
# 3 : "ARABIC_EGYPT",
|
||||
# 4 : "ARABIC_LIBYA",
|
||||
# 5 : "ARABIC_ALGERIA",
|
||||
# 6 : "ARABIC_MOROCCO",
|
||||
# 7 : "ARABIC_TUNISIA",
|
||||
# 8 : "ARABIC_OMAN",
|
||||
# 9 : "ARABIC_YEMEN",
|
||||
# 10 : "ARABIC_SYRIA",
|
||||
# 11 : "ARABIC_JORDAN",
|
||||
# 12 : "ARABIC_LEBANON",
|
||||
# 13 : "ARABIC_KUWAIT",
|
||||
# 14 : "ARABIC_UAE",
|
||||
# 15 : "ARABIC_BAHRAIN",
|
||||
# 16 : "ARABIC_QATAR",
|
||||
# 1 : "AZERI_LATIN",
|
||||
# 2 : "AZERI_CYRILLIC",
|
||||
# 1 : "CHINESE_TRADITIONAL",
|
||||
# 2 : "CHINESE_SIMPLIFIED",
|
||||
# 3 : "CHINESE_HONGKONG",
|
||||
# 4 : "CHINESE_SINGAPORE",
|
||||
# 1 : "DUTCH",
|
||||
# 2 : "DUTCH_BELGIAN",
|
||||
# 1 : "FRENCH",
|
||||
# 2 : "FRENCH_BELGIAN",
|
||||
# 3 : "FRENCH_CANADIAN",
|
||||
# 4 : "FRENCH_SWISS",
|
||||
# 5 : "FRENCH_LUXEMBOURG",
|
||||
# 6 : "FRENCH_MONACO",
|
||||
# 1 : "GERMAN",
|
||||
# 2 : "GERMAN_SWISS",
|
||||
# 3 : "GERMAN_AUSTRIAN",
|
||||
# 4 : "GERMAN_LUXEMBOURG",
|
||||
# 5 : "GERMAN_LIECHTENSTEIN",
|
||||
# 1 : "ITALIAN",
|
||||
# 2 : "ITALIAN_SWISS",
|
||||
# 1 : "KOREAN",
|
||||
# 1 : "LITHUANIAN",
|
||||
# 1 : "MALAY_MALAYSIA",
|
||||
# 2 : "MALAY_BRUNEI_DARUSSALAM",
|
||||
# 1 : "NORWEGIAN_BOKMAL",
|
||||
# 2 : "NORWEGIAN_NYNORSK",
|
||||
# 2 : "PORTUGUESE",
|
||||
# 1 : "PORTUGUESE_BRAZILIAN",
|
||||
# 2 : "SERBIAN_LATIN",
|
||||
3 : "SERBIAN_CYRILLIC",
|
||||
# 1 : "SPANISH",
|
||||
# 2 : "SPANISH_MEXICAN",
|
||||
4 : "SPANISH_GUATEMALA",
|
||||
5 : "SPANISH_COSTA_RICA",
|
||||
6 : "SPANISH_PANAMA",
|
||||
7 : "SPANISH_DOMINICAN_REPUBLIC",
|
||||
8 : "SPANISH_VENEZUELA",
|
||||
9 : "SPANISH_COLOMBIA",
|
||||
10 : "SPANISH_PERU",
|
||||
11 : "SPANISH_ARGENTINA",
|
||||
12 : "SPANISH_ECUADOR",
|
||||
13 : "SPANISH_CHILE",
|
||||
14 : "SPANISH_URUGUAY",
|
||||
15 : "SPANISH_PARAGUAY",
|
||||
16 : "SPANISH_BOLIVIA",
|
||||
17 : "SPANISH_EL_SALVADOR",
|
||||
18 : "SPANISH_HONDURAS",
|
||||
19 : "SPANISH_NICARAGUA",
|
||||
20 : "SPANISH_PUERTO_RICO",
|
||||
# 1 : "SWEDISH",
|
||||
# 2 : "SWEDISH_FINLAND",
|
||||
1 : "UZBEK_LATIN",
|
||||
2 : "UZBEK_CYRILLIC",
|
||||
}
|
||||
|
||||
IANA_MOBI = \
|
||||
{None: {None: (0, 0)},
|
||||
'af': {None: (54, 0)},
|
||||
'ar': {None: (1, 0),
|
||||
'AE': (1, 56),
|
||||
'BH': (1, 60),
|
||||
'DZ': (1, 20),
|
||||
'EG': (1, 12),
|
||||
'JO': (1, 44),
|
||||
'KW': (1, 52),
|
||||
'LB': (1, 48),
|
||||
'MA': (1, 24),
|
||||
'OM': (1, 32),
|
||||
'QA': (1, 64),
|
||||
'SA': (1, 4),
|
||||
'SY': (1, 40),
|
||||
'TN': (1, 28),
|
||||
'YE': (1, 36)},
|
||||
'as': {None: (77, 0)},
|
||||
'az': {None: (44, 0)},
|
||||
'be': {None: (35, 0)},
|
||||
'bg': {None: (2, 0)},
|
||||
'bn': {None: (69, 0)},
|
||||
'ca': {None: (3, 0)},
|
||||
'cs': {None: (5, 0)},
|
||||
'da': {None: (6, 0)},
|
||||
'de': {None: (7, 0),
|
||||
'AT': (7, 12),
|
||||
'CH': (7, 8),
|
||||
'LI': (7, 20),
|
||||
'LU': (7, 16)},
|
||||
'el': {None: (8, 0)},
|
||||
'en': {None: (9, 0),
|
||||
'AU': (9, 12),
|
||||
'BZ': (9, 40),
|
||||
'CA': (9, 16),
|
||||
'GB': (9, 8),
|
||||
'IE': (9, 24),
|
||||
'JM': (9, 32),
|
||||
'NZ': (9, 20),
|
||||
'PH': (9, 52),
|
||||
'TT': (9, 44),
|
||||
'US': (9, 4),
|
||||
'ZA': (9, 28),
|
||||
'ZW': (9, 48)},
|
||||
'es': {None: (10, 0),
|
||||
'AR': (10, 44),
|
||||
'BO': (10, 64),
|
||||
'CL': (10, 52),
|
||||
'CO': (10, 36),
|
||||
'CR': (10, 20),
|
||||
'DO': (10, 28),
|
||||
'EC': (10, 48),
|
||||
'ES': (10, 4),
|
||||
'GT': (10, 16),
|
||||
'HN': (10, 72),
|
||||
'MX': (10, 8),
|
||||
'NI': (10, 76),
|
||||
'PA': (10, 24),
|
||||
'PE': (10, 40),
|
||||
'PR': (10, 80),
|
||||
'PY': (10, 60),
|
||||
'SV': (10, 68),
|
||||
'UY': (10, 56),
|
||||
'VE': (10, 32)},
|
||||
'et': {None: (37, 0)},
|
||||
'eu': {None: (45, 0)},
|
||||
'fa': {None: (41, 0)},
|
||||
'fi': {None: (11, 0)},
|
||||
'fo': {None: (56, 0)},
|
||||
'fr': {None: (12, 0),
|
||||
'BE': (12, 8),
|
||||
'CA': (12, 12),
|
||||
'CH': (12, 16),
|
||||
'FR': (12, 4),
|
||||
'LU': (12, 20),
|
||||
'MC': (12, 24)},
|
||||
'gu': {None: (71, 0)},
|
||||
'he': {None: (13, 0)},
|
||||
'hi': {None: (57, 0)},
|
||||
'hr': {None: (26, 0)},
|
||||
'hu': {None: (14, 0)},
|
||||
'hy': {None: (43, 0)},
|
||||
'id': {None: (33, 0)},
|
||||
'is': {None: (15, 0)},
|
||||
'it': {None: (16, 0),
|
||||
'CH': (16, 8),
|
||||
'IT': (16, 4)},
|
||||
'ja': {None: (17, 0)},
|
||||
'ka': {None: (55, 0)},
|
||||
'kk': {None: (63, 0)},
|
||||
'kn': {None: (75, 0)},
|
||||
'ko': {None: (18, 0)},
|
||||
'kok': {None: (87, 0)},
|
||||
'lt': {None: (39, 0)},
|
||||
'lv': {None: (38, 0)},
|
||||
'mk': {None: (47, 0)},
|
||||
'ml': {None: (76, 0)},
|
||||
'mr': {None: (78, 0)},
|
||||
'ms': {None: (62, 0)},
|
||||
'mt': {None: (58, 0)},
|
||||
'ne': {None: (97, 0)},
|
||||
'nl': {None: (19, 0),
|
||||
'BE': (19, 8)},
|
||||
'no': {None: (20, 0)},
|
||||
'or': {None: (72, 0)},
|
||||
'pa': {None: (70, 0)},
|
||||
'pl': {None: (21, 0)},
|
||||
'pt': {None: (22, 0),
|
||||
'BR': (22, 4),
|
||||
'PT': (22, 8)},
|
||||
'rm': {None: (23, 0)},
|
||||
'ro': {None: (24, 0)},
|
||||
'ru': {None: (25, 0)},
|
||||
'sa': {None: (79, 0)},
|
||||
'se': {None: (59, 0)},
|
||||
'sk': {None: (27, 0)},
|
||||
'sl': {None: (36, 0)},
|
||||
'sq': {None: (28, 0)},
|
||||
'sr': {None: (26, 12),
|
||||
'RS': (26, 12)},
|
||||
'st': {None: (48, 0)},
|
||||
'sv': {None: (29, 0),
|
||||
'FI': (29, 8)},
|
||||
'sw': {None: (65, 0)},
|
||||
'ta': {None: (73, 0)},
|
||||
'te': {None: (74, 0)},
|
||||
'th': {None: (30, 0)},
|
||||
'tn': {None: (50, 0)},
|
||||
'tr': {None: (31, 0)},
|
||||
'ts': {None: (49, 0)},
|
||||
'tt': {None: (68, 0)},
|
||||
'uk': {None: (34, 0)},
|
||||
'ur': {None: (32, 0)},
|
||||
'uz': {None: (67, 0),
|
||||
'UZ': (67, 8)},
|
||||
'vi': {None: (42, 0)},
|
||||
'wen': {None: (46, 0)},
|
||||
'xh': {None: (52, 0)},
|
||||
'zh': {None: (4, 0),
|
||||
'CN': (4, 8),
|
||||
'HK': (4, 12),
|
||||
'SG': (4, 16),
|
||||
'TW': (4, 4)},
|
||||
'zu': {None: (53, 0)}}
|
||||
|
||||
|
||||
def iana2mobi(icode):
    """Return the packed 4-byte MOBI language field for a language tag.

    ``icode`` is split on '-'. The first subtag that maps (through
    ``lang_as_iso639_1``) to a key of IANA_MOBI selects the language; the
    first following subtag found in that language's table selects the
    sub-language. Anything unrecognised falls back to the neutral entries.
    """
    langdict = IANA_MOBI[None]
    # One shared iterator: the second loop resumes where the first stopped.
    subtags = iter(icode.split('-') if icode else ())

    for tag in subtags:
        lang = lang_as_iso639_1(tag.lower())
        if lang and lang in IANA_MOBI:
            langdict = IANA_MOBI[lang]
            break

    mcode = langdict[None]
    for subtag in subtags:
        # Try the subtag as given, then title-cased, then upper-cased.
        if subtag not in langdict:
            subtag = subtag.title()
            if subtag not in langdict:
                subtag = subtag.upper()
        if subtag in langdict:
            mcode = langdict[subtag]
            break

    main_id, sub_id = mcode
    return pack('>HBB', 0, sub_id, main_id)
|
||||
|
||||
|
||||
def mobi2iana(langcode, sublangcode):
    """Return the language tag for a MOBI (language, sub-language) pair.

    :param langcode: MOBI main language id (first item of an IANA_MOBI value).
    :param sublangcode: MOBI sub-language id (second item of an IANA_MOBI
        value).
    :return: ``'lang'`` or ``'lang-region'`` (region lower-cased), or
        ``'und'`` when no language in IANA_MOBI uses ``langcode``.

    An exact (langcode, sublangcode) match is preferred over a match on the
    main language alone. This matters because more than one language can
    share a main code: Croatian is (26, 0) and Serbian is (26, 12), so
    stopping at the first language with code 26 would report Serbian books
    as Croatian.
    """
    fallback = None
    for code, variants in IANA_MOBI.items():
        if code is None:
            # The neutral placeholder entry never yields a language tag.
            continue
        for subcode, (cc, cl) in variants.items():
            if cc != langcode:
                continue
            if cl == sublangcode:
                if subcode:
                    return code + '-' + subcode.lower()
                return code
            if fallback is None:
                # Remember the first language that uses this main code in
                # case no entry matches the sub-language as well.
                fallback = code
    if fallback is None:
        return 'und'
    return fallback
|
||||
10
ebook_converter/ebooks/mobi/reader/__init__.py
Normal file
10
ebook_converter/ebooks/mobi/reader/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
49
ebook_converter/ebooks/mobi/reader/containers.py
Normal file
49
ebook_converter/ebooks/mobi/reader/containers.py
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from struct import unpack_from, error
|
||||
|
||||
from calibre.utils.imghdr import what
|
||||
|
||||
|
||||
def find_imgtype(data):
    """Return the image type reported by ``what`` for ``data``, or 'unknown'."""
    imgtype = what(None, data)
    if imgtype:
        return imgtype
    return 'unknown'
|
||||
|
||||
|
||||
class Container(object):
    """State for a resource container record.

    ``is_image_container`` is True when the record's EXTH block has a
    record of type 539 whose content is ``application/image``.
    ``resource_index`` counts the calls made to ``load_image``.
    """

    def __init__(self, data):
        self.is_image_container = False
        self.resource_index = 0

        if len(data) <= 60 or data[48:52] != b'EXTH':
            return

        exth_length, _num_items = unpack_from(b'>LL', data, 52)
        limit = 60 + exth_length - 8
        pos = 60
        while pos < limit:
            try:
                rec_type, rec_size = unpack_from(b'>LL', data, pos)
            except error:
                # Truncated record header: stop scanning.
                break
            pos += 8
            # The stored size includes the 8-byte record header.
            payload_len = rec_size - 8
            if payload_len < 0:
                break
            if rec_type == 539:
                payload = data[pos:pos + payload_len]
                self.is_image_container = payload == b'application/image'
                break
            pos += payload_len

    def load_image(self, data):
        """Return ``(image_bytes, image_type)`` or ``(None, None)``.

        Only image containers yield images; for those the first 12 bytes
        of ``data`` are skipped before the image type is detected.
        """
        self.resource_index += 1
        if not self.is_image_container:
            return None, None
        data = data[12:]
        imgtype = find_imgtype(data)
        if imgtype == 'unknown':
            return None, None
        return data, imgtype
|
||||
|
||||
|
||||
355
ebook_converter/ebooks/mobi/reader/headers.py
Normal file
355
ebook_converter/ebooks/mobi/reader/headers.py
Normal file
@@ -0,0 +1,355 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, re, os
|
||||
|
||||
from calibre import replace_entities
|
||||
from calibre.utils.date import parse_date
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
|
||||
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
from calibre.utils.config_base import tweaks
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
NULL_INDEX = 0xffffffff
|
||||
|
||||
|
||||
def uniq(vals):
    ''' Return a list of the items of vals with duplicates dropped, keeping
    the first occurrence of each item in its original position. A false
    vals (e.g. None) gives an empty list. '''
    seen = set()
    ordered = []
    for item in vals or ():
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered
|
||||
|
||||
|
||||
class EXTHHeader(object):  # {{{
    """Parse an EXTH block into a MetaInformation object.

    The metadata is collected in ``self.mi``. A few records are also
    exposed directly as attributes: ``has_fake_cover``, ``start_offset``,
    ``kf8_header``, ``uuid``, ``cdetype``, ``primary_writing_mode`` and
    ``page_progression_direction``. ``cover_offset`` and
    ``thumbnail_offset`` are only set when the corresponding record (201 /
    202) is present, so callers must not assume they exist.
    """

    def __init__(self, raw, codec, title):
        """Parse ``raw``.

        :param raw: bytes of the EXTH block: 4 identifier bytes, then the
            big-endian block length and record count, then the records.
        :param codec: codec used to decode textual records.
        :param title: fallback title; replaced by record 503 when that
            record is present and decodes successfully.
        :raises struct.error: if a record header or a fixed-size record
            payload is truncated.
        """
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            # Each record is: type (4 bytes), total size including this
            # 8-byte header (4 bytes), then the payload.
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                # NULL_INDEX means "no cover"; leave the attribute unset.
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except Exception:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except Exception:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #     print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        """Store the metadata record of type ``idx`` (100-199) in ``self.mi``.

        :param idx: EXTH record type.
        :param content: raw record payload (bytes).
        :param codec: codec used for records decoded strictly (112).
        """
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = uniq(self.mi.tags)
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
            except Exception:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except Exception:
                # Best effort only: a malformed dc:source record is ignored,
                # but KeyboardInterrupt/SystemExit must still propagate.
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except Exception:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
        # else:
        #     print 'unhandled metadata record', idx, repr(content)
# }}}
|
||||
|
||||
|
||||
class BookHeader(object):
    """Parsed contents of record 0 (PalmDOC header plus MOBI header).

    For records of 16 bytes or fewer only PalmDOC defaults are set and
    ``ancient`` is True. Otherwise the MOBI header fields are read and, if
    the EXTH flag is set, ``exth`` holds the parsed EXTHHeader (or stays
    None when the EXTH block is invalid).
    """

    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        """Parse ``raw``.

        :param raw: bytes of record 0.
        :param ident: book type identifier, e.g. ``b'BOOKMOBI'`` or
            ``b'TEXTREAD'``.
        :param user_encoding: codec to assume when the codepage in the
            header is not recognised; ``cp1252`` is used if it is false.
        :param log: logger providing ``warn`` and ``exception``.
        :param try_extra_data_fix: when true, a header length of 0xE4 is
            treated as having no extra data flags.
        """
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == b'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # No MOBI header at all: fall back to fixed defaults.
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                }[self.codepage]
            except (IndexError, KeyError):
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            # Some KF8 files have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 for future versions of kindlegen

            if (ident == b'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == b'DH':
                self.huff_offset, self.huff_number = struct.unpack('>LL',
                        raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode_type):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # EXTH parsing is best effort: failures are logged and the
                # flag is cleared. Only Exception is caught so that
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                            self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = \
                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else:  # Null values
                self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                        NULL_INDEX
|
||||
|
||||
|
||||
class MetadataHeader(BookHeader):
    """BookHeader that reads record 0 directly from a seekable stream.

    When the file declares fewer than two sections no header is parsed and
    only ``exth`` (set to None) is defined.
    """

    def __init__(self, stream, log):
        """Read the section table of ``stream`` and parse record 0.

        :param stream: seekable binary stream positioned anywhere.
        :param log: logger passed on to BookHeader.
        :raises MobiError: if the book type identifier is not recognised.
        """
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        if self.num_sections >= 2:
            header = self.header()
            BookHeader.__init__(self, header, self.ident, None, log)
        else:
            self.exth = None

    @property
    def kf8_type(self):
        """Return 'standalone', 'joint' or None."""
        # mobi_version is not set when no header was parsed (fewer than two
        # sections), so it is read defensively, like skelidx.
        if (getattr(self, 'mobi_version', None) == 8 and
                getattr(self, 'skelidx', NULL_INDEX) != NULL_INDEX):
            return 'standalone'

        kf8_header_index = getattr(self.exth, 'kf8_header', None)
        if kf8_header_index is None:
            return None
        try:
            # The section just before the KF8 header must be the boundary
            # marker for this to be a joint file.
            if self.section_data(kf8_header_index-1) == b'BOUNDARY':
                return 'joint'
        except Exception:
            pass
        return None

    def identity(self):
        """Return the 8-byte, upper-cased book type identifier."""
        self.stream.seek(60)
        ident = self.stream.read(8).upper()
        if ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % ident)
        return ident

    def section_count(self):
        """Return the number of sections declared at offset 76."""
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def section_offset(self, number):
        """Return the stream offset of section ``number``."""
        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def header(self):
        """Return the raw bytes of the first section."""
        # The start of the second section marks the end of the first.
        off = self.section_offset(0)
        end_off = self.section_offset(1)
        self.stream.seek(off)
        return self.stream.read(end_off - off)

    def section_data(self, number):
        """Return the raw bytes of section ``number``."""
        start = self.section_offset(number)
        if number == self.num_sections - 1:
            # The last section runs to the end of the stream. Measure it by
            # seeking instead of os.stat(stream.name), so that streams
            # without a usable ``name`` (e.g. io.BytesIO) also work.
            self.stream.seek(0, os.SEEK_END)
            end = self.stream.tell()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        try:
            return self.stream.read(end - start)
        except OverflowError:
            self.stream.seek(start)
            return self.stream.read()
|
||||
277
ebook_converter/ebooks/mobi/reader/index.py
Normal file
277
ebook_converter/ebooks/mobi/reader/index.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct
|
||||
from collections import OrderedDict, namedtuple
|
||||
|
||||
from calibre.ebooks.mobi.utils import (decint, count_set_bits,
|
||||
decode_string)
|
||||
from polyglot.builtins import iteritems, range, zip
|
||||
|
||||
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
|
||||
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
|
||||
INDEX_HEADER_FIELDS = (
|
||||
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
|
||||
) + tuple('unknown%d'%i for i in range(27)) + ('ocnt', 'oentries',
|
||||
'ordt1', 'ordt2', 'tagx')
|
||||
|
||||
|
||||
class InvalidFile(ValueError):
    """Raised when a section does not have the expected signature."""
|
||||
|
||||
|
||||
def check_signature(data, signature):
    """Raise InvalidFile unless ``data`` begins with ``signature``."""
    prefix = data[:len(signature)]
    if prefix == signature:
        return
    raise InvalidFile('Not a valid %r section'%signature)
|
||||
|
||||
|
||||
class NotAnINDXRecord(InvalidFile):
    """InvalidFile subclass for data that is not an INDX record."""
|
||||
|
||||
|
||||
class NotATAGXSection(InvalidFile):
    """InvalidFile subclass for data that is not a TAGX section."""
|
||||
|
||||
|
||||
def format_bytes(byts):
    """Return the bytes of ``byts`` as space separated lowercase hex.

    Values are not zero padded, so the byte 0x0a is rendered as ``a``.
    """
    return ' '.join('%x' % value for value in bytearray(byts))
|
||||
|
||||
|
||||
def parse_indx_header(data):
    """Parse the header of an INDX record into a dict.

    The dict maps every name in INDEX_HEADER_FIELDS to the matching
    big-endian 32-bit value that follows the ``INDX`` signature, plus
    ``ordt1_raw``, ``ordt2_raw`` and ``ordt_map`` derived from the optional
    ORDT tables. Raises InvalidFile (via check_signature) if ``data`` does
    not start with ``INDX``.
    """
    check_signature(data, b'INDX')
    field_count = len(INDEX_HEADER_FIELDS)
    field_values = struct.unpack(
        '>%dL' % field_count, data[4:4*(field_count+1)])
    ans = dict(zip(INDEX_HEADER_FIELDS, field_values))
    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
    oentries = ans['oentries']
    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
    ans['ordt_map'] = ''

    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
        # I don't know what this is, but using it seems to be unnecessary,
        # so it is just kept as the raw bytestring.
        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+oentries]

    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
        raw = bytearray(data[ordt2+4:ordt2+4+2*oentries])
        ans['ordt2_raw'] = raw
        if ans['code'] == 65002:
            # This appears to be EBCDIC-UTF (65002) encoded (see
            # http://www.unicode.org/reports/tr16/). No real decoder is
            # implemented; instead a hack is used that has worked for all
            # the books with this type of ORDT record seen so far (some
            # EBSP book samples in KF8 format from Amazon use it): every
            # second byte is interpreted as a printable ASCII character and
            # anything else is mapped to '?'.
            parsed = bytearray(oentries)
            for entry in range(oentries):
                char = raw[2*entry + 1]
                if 0x20 < char < 0x7f:
                    parsed[entry] = char
                else:
                    parsed[entry] = ord(b'?')
            ans['ordt_map'] = bytes(parsed).decode('ascii')
        else:
            ans['ordt_map'] = '?'*oentries

    return ans
|
||||
|
||||
|
||||
class CNCX(object):  # {{{

    '''
    Parses the records that contain the compiled NCX (all strings from the
    NCX). Presents a simple offset : string mapping interface to access the
    data.

    The key of every string is its byte position inside its record plus
    0x10000 times the index of that record. Looking up an unknown offset
    returns None (or the supplied default) instead of raising.
    '''

    def __init__(self, records, codec):
        '''
        :param records: iterable of raw CNCX record bytestrings
        :param codec: name of the codec used to decode the strings
        '''
        self.records = OrderedDict()
        record_offset = 0
        for raw in records:
            pos = 0
            while pos < len(raw):
                # Every entry is a variable width length followed by that
                # many bytes of encoded text
                length, consumed = decint(raw[pos:])
                if length > 0:
                    try:
                        self.records[pos+record_offset] = raw[
                            pos+consumed:pos+consumed+length].decode(codec)
                    except Exception:
                        # Best effort: store a hex dump of the rest of the
                        # record and stop parsing this record. Only real
                        # errors are caught, so KeyboardInterrupt and
                        # SystemExit still propagate.
                        byts = raw[pos:]
                        r = format_bytes(byts)
                        print('CNCX entry at offset %d has unknown format %s'%(
                            pos+record_offset, r))
                        self.records[pos+record_offset] = r
                        pos = len(raw)
                pos += consumed+length
            record_offset += 0x10000

    def __getitem__(self, offset):
        # Deliberately lenient: unknown offsets give None, not KeyError
        return self.records.get(offset)

    def get(self, offset, default=None):
        return self.records.get(offset, default)

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def iteritems(self):
        return iteritems(self.records)

    def items(self):
        return iteritems(self.records)
    # }}}
|
||||
|
||||
|
||||
def parse_tagx_section(data):
    """Parse a TAGX section.

    Returns ``(control_byte_count, tags)``. The two values after the
    signature are big-endian unsigned 32-bit integers: the offset of the
    first entry (byte 4) and the control byte count (byte 8). Every four
    bytes from offset 12 up to the first entry offset become one TagX.
    """
    check_signature(data, b'TAGX')
    first_entry_offset, control_byte_count = struct.unpack_from(
        b'>LL', data, 4)
    tags = [TagX(*bytearray(data[pos:pos+4]))
            for pos in range(12, first_entry_offset, 4)]
    return control_byte_count, tags
|
||||
|
||||
|
||||
def get_tag_map(control_byte_count, tagx, data, strict=False):
    """Decode the tag values of a single index entry.

    :param control_byte_count: number of control bytes at the start of *data*
    :param tagx: the tag definitions; objects with ``tag``,
        ``num_of_values``, ``bitmask`` and ``eof`` attributes
    :param data: the entry bytes, starting at the control bytes
    :param strict: when True inconsistencies raise ValueError, otherwise
        they are only printed
    :return: dict mapping tag number to the list of its decoded values
    """

    def complain(message):
        # Inconsistencies are fatal only in strict mode
        if strict:
            raise ValueError(message)
        print(message)

    control = list(bytearray(data[:control_byte_count]))
    data = data[control_byte_count:]

    # First pass: use the control bytes to find out which tags are present
    # and how their values are stored.
    pending = []
    for entry in tagx:
        if entry.eof == 0x01:
            # End of the current control byte, continue with the next one
            control = control[1:]
            continue
        masked = control[0] & entry.bitmask
        if masked == 0:
            continue
        count = nbytes = None
        if masked != entry.bitmask:
            # Shift bits to get the masked value.
            mask = entry.bitmask
            while mask & 0b1 == 0:
                mask >>= 1
                masked >>= 1
            count = masked
        elif count_set_bits(entry.bitmask) > 1:
            # If all bits of masked value are set and the mask has more
            # than one bit, a variable width value will follow after
            # the control bytes which defines the length of bytes (NOT
            # the value count!) which will contain the corresponding
            # variable width values.
            nbytes, used = decint(data)
            data = data[used:]
        else:
            count = 1
        pending.append(PTagX(entry.tag, count, nbytes, entry.num_of_values))

    # Second pass: read the values themselves, in the same order.
    tag_map = {}
    for ptag in pending:
        values = []
        if ptag.value_count is not None:
            # Read value_count * values_per_entry variable width values.
            for _ in range(ptag.value_count * ptag.num_of_values):
                number, used = decint(data)
                data = data[used:]
                values.append(number)
        else:  # value_bytes is not None
            # Convert value_bytes to variable width values.
            seen = 0
            while seen < ptag.value_bytes:
                # Does this work for values_per_entry != 1?
                number, used = decint(data)
                data = data[used:]
                seen += used
                values.append(number)
            if seen != ptag.value_bytes:
                complain("Error: Should consume %s bytes, but consumed %s" %
                        (ptag.value_bytes, seen))
        tag_map[ptag.tag] = values

    # Test that all bytes have been processed (zero padding is allowed)
    if data.replace(b'\0', b''):
        complain("Warning: There are unprocessed index bytes left: %s" %
                format_bytes(data))

    return tag_map
|
||||
|
||||
|
||||
def parse_index_record(table, data, control_byte_count, tags, codec,
        ordt_map, strict=False):
    """Parse one INDX record and add its entries to *table*.

    :param table: mapping that is updated in place; every entry ident is
        mapped to the tag map returned by get_tag_map()
    :param data: raw bytes of the INDX record
    :param control_byte_count: control byte count from the TAGX section
    :param tags: tag definitions from the TAGX section
    :param codec: codec used to decode the entry idents
    :param ordt_map: passed through to decode_string()
    :param strict: passed through to get_tag_map()
    :return: the parsed header of this record
    """
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
        print('WARNING: Invalid INDX record')
    entry_count = header['count']

    # Build up the IDXT position starts: one big-endian 16-bit offset per
    # entry, stored directly after the IDXT tag
    idx_positions = [
        struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))[0]
        for j in range(entry_count)]
    # The last entry ends before the IDXT tag (but there might be zero fill
    # bytes we need to ignore!)
    idx_positions.append(idxt_pos)

    # For each entry in the IDXT build up the tag map and any associated
    # text
    for j in range(entry_count):
        start, end = idx_positions[j:j+2]
        rec = data[start:end]
        # Sometimes (in the guide table if the type attribute has non ascii
        # values) the ident is UTF-16 encoded. Try to handle that.
        try:
            ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
        except UnicodeDecodeError:
            ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map)
        if u'\x00' in ident:
            try:
                ident, consumed = decode_string(rec, codec='utf-16',
                        ordt_map=ordt_map)
            except UnicodeDecodeError:
                # Not UTF-16 after all: drop the NUL characters. (This used
                # to replace 'u\x00', i.e. the letter "u" followed by NUL,
                # which left the NULs in place.)
                ident = ident.replace(u'\x00', u'')
        rec = rec[consumed:]
        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
        table[ident] = tag_map
    return header
|
||||
|
||||
|
||||
def read_index(sections, idx, codec):
    """Read the index whose primary INDX record is ``sections[idx]``.

    The header of the primary record gives the number of index records that
    directly follow it (``count``), the number of CNCX records that follow
    those (``ncncx``) and the position of the TAGX section inside the
    primary record (``tagx``).

    :param sections: sequence of section tuples whose first item is the raw
        section data
    :return: ``(table, cncx)``: an OrderedDict filled by
        parse_index_record() and the CNCX built from the CNCX records (empty
        when the header reports none)
    """
    table = OrderedDict()
    cncx = CNCX([], codec)

    primary = sections[idx][0]
    indx_header = parse_indx_header(primary)
    record_count = indx_header['count']
    first_record = idx + 1

    cncx_count = indx_header['ncncx']
    if cncx_count > 0:
        # The CNCX records come right after the index records
        first_cncx = first_record + record_count
        cncx = CNCX(
            [sec[0] for sec in sections[first_cncx:first_cncx+cncx_count]],
            codec)

    control_byte_count, tags = parse_tagx_section(
        primary[indx_header['tagx']:])

    for number in range(first_record, first_record + record_count):
        # Index record
        parse_index_record(table, sections[number][0], control_byte_count,
                tags, codec, indx_header['ordt_map'])
    return table, cncx
|
||||
373
ebook_converter/ebooks/mobi/reader/markup.py
Normal file
373
ebook_converter/ebooks/mobi/reader/markup.py
Normal file
@@ -0,0 +1,373 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, os
|
||||
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
from polyglot.builtins import unicode_type, range
|
||||
|
||||
|
||||
def update_internal_links(mobi8_reader, log):
    """Rewrite ``kindle:pos:fid`` links in all parts and decode the parts.

    Internal links are based on positions within the xhtml files, so they
    have to be updated **BEFORE** any pieces are cut and pasted into the
    xhtml text files.

    Link format: ``kindle:pos:fid:XXXX:off:YYYYYYYYYY`` where XXXX is the
    offset in records into divtbl and YYYYYYYYYY is a base32 number that is
    added to the divtbl insertpos to get the final position.

    :param mobi8_reader: object providing ``parts`` (bytestrings),
        ``header.codec`` and ``get_id_tag_by_pos_fid(fid, offset)``
    :param log: logger, only ``warn`` is used
    :return: list of the parts as unicode text with the links replaced by
        ``"filename#id"`` style targets
    """
    mr = mobi8_reader

    # pos:fid pattern
    posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
    posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

    parts = []
    for part in mr.parts:
        srcpieces = posfid_pattern.split(part)
        # The pattern has one capture group, so the odd pieces are the tags
        for j in range(1, len(srcpieces), 2):
            tag = srcpieces[j]
            if tag.startswith(b'<'):
                for m in posfid_index_pattern.finditer(tag):
                    posfid = m.group(1)
                    offset = m.group(2)
                    try:
                        filename, idtag = mr.get_id_tag_by_pos_fid(
                            int(posfid, 32), int(offset, 32))
                    except ValueError:
                        log.warn('Invalid link, points to nowhere, ignoring')
                        replacement = b'#'
                    else:
                        suffix = (b'#' + idtag) if idtag else b''
                        replacement = filename.split('/')[-1].encode(
                            mr.header.codec) + suffix
                        # The target is always wrapped in double quotes
                        # below, so embedded double quotes must be escaped
                        replacement = replacement.replace(b'"', b'&quot;')
                    quoted = b'"' + replacement + b'"'
                    # A callable is used so that backslashes in the target
                    # are inserted literally instead of being interpreted as
                    # escapes of a replacement template
                    tag = posfid_index_pattern.sub(
                        lambda match, quoted=quoted: quoted, tag, 1)
                srcpieces[j] = tag
        raw = b''.join(srcpieces)
        try:
            parts.append(raw.decode(mr.header.codec))
        except UnicodeDecodeError:
            log.warn('Failed to decode text in KF8 part, replacing bad bytes')
            parts.append(raw.decode(mr.header.codec, 'replace'))

    # All parts are now unicode and have no internal links
    return parts
|
||||
|
||||
|
||||
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
    """Strip Kindlegen/calibre bookkeeping attributes from *parts* in place.

    * ``aid`` (Kindlegen) and ``cid`` (calibre) attributes are removed. When
      the attribute value is in *linked_aids* it is replaced by
      ``id="<value>-<aid_anchor_suffix>"`` instead.
    * ``data-AmznPageBreak="X"`` attributes are replaced by
      ``style="page-break-after:X"``.

    :param parts: list of unicode strings, modified in place
    """
    aid_tag_pattern = re.compile(r'''(<[^>]*\s[ac]id\s*=[^>]*>)''',
            re.IGNORECASE)
    aid_attr_pattern = re.compile(r'''\s[ac]id\s*=['"]([^'"]*)['"]''')

    for idx, part in enumerate(parts):
        pieces = aid_tag_pattern.split(part)
        for pos, piece in enumerate(pieces):
            if not piece.startswith('<'):
                continue
            # One substitution per match, always applied to the first
            # attribute that is still present in the piece
            for match in aid_attr_pattern.finditer(piece):
                aid = match.group(1)
                if aid in linked_aids:
                    new_attr = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
                else:
                    new_attr = ''
                piece = aid_attr_pattern.sub(new_attr, piece, 1)
            pieces[pos] = piece
        parts[idx] = "".join(pieces)

    pagebreak_tag_pattern = re.compile(
        r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
    pagebreak_attr_pattern = re.compile(
        r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    def as_style(match):
        return ' style="page-break-after:%s"'%match.group(1)

    for idx, part in enumerate(parts):
        pieces = pagebreak_tag_pattern.split(part)
        parts[idx] = "".join(
            pagebreak_attr_pattern.sub(as_style, piece)
            if piece.startswith('<') else piece
            for piece in pieces)
|
||||
|
||||
|
||||
def update_flow_links(mobi8_reader, resource_map, log):
    """Resolve ``kindle:`` links inside the flows of *mobi8_reader*.

    Link formats handled:

    * ``kindle:embed:XXXX?mime=image/gif`` (png, jpeg, etc) (used for images)
    * ``kindle:flow:XXXX?mime=YYYY/ZZZ`` (used for style sheets, svg images,
      etc)
    * ``kindle:embed:XXXX`` (used for fonts)

    XXXX is a base32 number. Embed numbers are one-based indices into
    *resource_map*, flow numbers index ``mobi8_reader.flowinfo``.

    :return: new list of flows as unicode text with the links resolved.
        ``None`` flows (the 0th flow) are passed through unchanged.
    """
    mr = mobi8_reader

    img_pattern = re.compile(r'''(<[img\s|image\s|svg:image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)

    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
    url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
    font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
    url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

    def relink_images(pattern, tag):
        # Replace the image references matched by pattern in tag, one
        # substitution per match
        for match in pattern.finditer(tag):
            num = int(match.group(1), 32)
            href = resource_map[num-1]
            if href:
                tag = pattern.sub('"%s"'%('../'+ href), tag, 1)
            else:
                log.warn('Referenced image %s was not recognized as a '
                        'valid image in %s' % (num, tag))
        return tag

    resolved = []
    for flow in mr.flows:
        if flow is None:  # 0th flow is None
            resolved.append(flow)
            continue

        if not isinstance(flow, unicode_type):
            try:
                flow = flow.decode(mr.header.codec)
            except UnicodeDecodeError:
                log.error('Flow part has invalid %s encoded bytes'%mr.header.codec)
                flow = flow.decode(mr.header.codec, 'replace')

        # links to raster image files from image tags
        pieces = img_pattern.split(flow)
        for pos in range(1, len(pieces), 2):
            if pieces[pos].startswith(('<im', '<svg:image')):
                pieces[pos] = relink_images(img_index_pattern, pieces[pos])
        flow = "".join(pieces)

        # replacements inside css url():
        pieces = url_pattern.split(flow)
        for pos in range(1, len(pieces), 2):
            # process links to raster image files
            tag = relink_images(url_img_index_pattern, pieces[pos])

            # process links to fonts
            for match in font_index_pattern.finditer(tag):
                num = int(match.group(1), 32)
                href = resource_map[num-1]
                if href is None:
                    log.warn('Referenced font %s was not recognized as a '
                            'valid font in %s' % (num, tag))
                    continue
                if href.endswith('.failed'):
                    replacement = '"%s"'%('failed-'+href)
                else:
                    replacement = '"%s"'%('../'+ href)
                tag = font_index_pattern.sub(replacement, tag, 1)

            # process links to other css pieces
            for match in url_css_index_pattern.finditer(tag):
                num = int(match.group(1), 32)
                info = mr.flowinfo[num]
                tag = url_css_index_pattern.sub(
                    '"../' + info.dir + '/' + info.fname + '"', tag, 1)

            pieces[pos] = tag
        flow = "".join(pieces)

        # flow pattern not inside url()
        pieces = tag_pattern.split(flow)
        for pos in range(1, len(pieces), 2):
            tag = pieces[pos]
            if not tag.startswith('<'):
                continue
            for match in flow_pattern.finditer(tag):
                try:
                    num = int(match.group(1), 32)
                    info = mr.flowinfo[num]
                except IndexError:
                    log.warn('Ignoring invalid flow reference in tag', tag)
                    tag = ''
                    continue
                if info.format == 'inline':
                    # The whole tag is replaced by the referenced flow as
                    # stored on the reader
                    tag = mr.flows[num]
                else:
                    tag = flow_pattern.sub(
                        '"../' + info.dir + '/' + info.fname + '"', tag, 1)
            pieces[pos] = tag
        flow = "".join(pieces)

        resolved.append(flow)

    # All flows are now unicode and have links resolved
    return resolved
|
||||
|
||||
|
||||
def insert_flows_into_markup(parts, flows, mobi8_reader, log):
    """Resolve ``kindle:flow`` references inside the tags of *parts*.

    Reference format: ``kindle:flow:XXXX?mime=YYYY/ZZZ`` (used for style
    sheets, svg images, etc) where XXXX is a base32 index into
    ``mobi8_reader.flowinfo``.

    * A tag referencing an inline flow is replaced entirely by ``flows[XXXX]``.
    * Otherwise the reference becomes ``"../<dir>/<fname>"``.
    * A tag with an out of range index is dropped after a warning.

    :param parts: list of unicode strings, modified in place
    """
    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    for idx, part in enumerate(parts):
        pieces = tag_pattern.split(part)
        # The pattern has one capture group, so the odd pieces are the tags
        for pos in range(1, len(pieces), 2):
            tag = pieces[pos]
            if not tag.startswith('<'):
                continue
            for ref in flow_pattern.finditer(tag):
                num = int(ref.group(1), 32)
                try:
                    info = mobi8_reader.flowinfo[num]
                except IndexError:
                    log.warn('Ignoring invalid flow reference: %s'%ref.group())
                    tag = ''
                    continue
                if info.format == 'inline':
                    tag = flows[num]
                else:
                    tag = flow_pattern.sub(
                        '"../' + info.dir + '/' + info.fname + '"', tag, 1)
            pieces[pos] = tag
        # store away modified version
        parts[idx] = "".join(pieces)
|
||||
|
||||
|
||||
def insert_images_into_markup(parts, resource_map, log):
    """Resolve embedded raster image links in the xhtml text of *parts*.

    Link format: ``kindle:embed:XXXX?mime=image/gif`` (png, jpeg, etc)
    where XXXX is a base32, one-based index into *resource_map*. Resolved
    links become ``../<href>``. Two passes are made over all parts: first
    the image tags, then tags with a ``style`` attribute (where the
    original delimiters, e.g. the parentheses of ``url(...)``, are kept).

    :param parts: list of unicode strings, modified in place
    """
    img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')

    style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
            re.IGNORECASE)

    def relink(tag, keep_delimiters):
        # One substitution per match, always applied to the first reference
        # that is still present in the tag
        for match in img_index_pattern.finditer(tag):
            num = int(match.group(1), 32)
            href = resource_map[num-1]
            if keep_delimiters:
                opener, closer = match.group()[0], match.group()[-1]
            else:
                opener = closer = '"'
            if href:
                tag = img_index_pattern.sub(
                    '%s%s%s'%(opener, '../' + href, closer), tag, 1)
            else:
                log.warn('Referenced image %s was not recognized as '
                        'a valid image in %s' % (num, tag))
        return tag

    for idx, part in enumerate(parts):
        pieces = img_pattern.split(part)
        for pos in range(1, len(pieces), 2):
            if pieces[pos].startswith('<im'):
                pieces[pos] = relink(pieces[pos], False)
        # store away modified version
        parts[idx] = "".join(pieces)

    # Replace urls used in style attributes
    for idx, part in enumerate(parts):
        pieces = style_pattern.split(part)
        for pos in range(1, len(pieces), 2):
            if 'kindle:embed' in pieces[pos]:
                pieces[pos] = relink(pieces[pos], True)
        # store away modified version
        parts[idx] = "".join(pieces)
|
||||
|
||||
|
||||
def upshift_markup(parts):
    """Restore the camelCase SVG attribute names in opening svg tags.

    Inside every tag starting with ``<svg`` (any case) the lowercase
    ``preserveaspectratio`` and ``viewbox`` are replaced by
    ``preserveAspectRatio`` and ``viewBox``.

    :param parts: list of unicode strings, modified in place
    """
    svg_tag_pattern = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)

    for idx, part in enumerate(parts):
        pieces = svg_tag_pattern.split(part)
        for pos in range(1, len(pieces), 2):
            tag = pieces[pos]
            if tag[:4].lower() == '<svg':
                pieces[pos] = tag.replace(
                    'preserveaspectratio', 'preserveAspectRatio').replace(
                    'viewbox', 'viewBox')
        # store away modified version
        parts[idx] = "".join(pieces)
|
||||
|
||||
|
||||
def expand_mobi8_markup(mobi8_reader, resource_map, log):
    """Resolve all KF8 specific markup and write the result to disk.

    The processed parts and flows are stored back on *mobi8_reader*. A
    ``text`` directory is created relative to the current working
    directory, every part is written (UTF-8) to ``<partinfo.type>/
    <partinfo.filename>`` and every flow whose flowinfo format is ``file``
    to ``<flowinfo.dir>/<flowinfo.fname>``.

    :return: list with the names of the written part files, in order
    """
    reader = mobi8_reader

    # First update all internal links that are based on offsets
    parts = update_internal_links(reader, log)

    # Remove pointless markup inserted by kindlegen
    remove_kindlegen_markup(parts, reader.aid_anchor_suffix, reader.linked_aids)

    # Handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
    flows = update_flow_links(reader, resource_map, log)

    # Insert inline flows into the markup
    insert_flows_into_markup(parts, flows, reader, log)

    # Insert raster images into markup
    insert_images_into_markup(parts, resource_map, log)

    # Perform general markup cleanups
    upshift_markup(parts)

    # Update the parts and flows stored in the reader
    reader.parts, reader.flows = parts, flows

    # write out the parts and file flows
    os.mkdir('text')  # directory containing all parts
    spine = []
    for idx, part in enumerate(parts):
        info = reader.partinfo[idx]
        with open(os.path.join(info.type, info.filename), 'wb') as out:
            text = strip_encoding_declarations(part)
            # The file is written as UTF-8, so declare that in the markup
            text = text.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
            out.write(text.encode('utf-8'))
            spine.append(out.name)

    for idx, flow in enumerate(flows):
        info = reader.flowinfo[idx]
        if info.format != 'file':
            continue
        if not os.path.exists(info.dir):
            os.mkdir(info.dir)
        with open(os.path.join(info.dir, info.fname), 'wb') as out:
            out.write(flow.encode('utf-8'))

    return spine
|
||||
935
ebook_converter/ebooks/mobi/reader/mobi6.py
Normal file
935
ebook_converter/ebooks/mobi/reader/mobi6.py
Normal file
@@ -0,0 +1,935 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import shutil, os, re, struct, textwrap, io
|
||||
|
||||
from lxml import html, etree
|
||||
|
||||
from calibre import xml_entity_to_unicode, entity_to_unicode, guess_type
|
||||
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||
from calibre.ebooks import DRMError, unit_convert
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.mobi.huffcdic import HuffReader
|
||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.mobi.reader.headers import BookHeader
|
||||
from calibre.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import iteritems, unicode_type, range, map
|
||||
|
||||
|
||||
class TopazError(ValueError):
    """ValueError subclass used for Amazon Topaz books, which cannot be processed."""
|
||||
|
||||
|
||||
class KFXError(ValueError):
    """ValueError subclass for Amazon KFX books.

    Takes no arguments; the (translated) message points to a forum thread
    with information on how to handle KFX books.
    """

    def __init__(self):
        template = _(
            'This is an Amazon KFX book. It cannot be processed.'
            ' See {} for information on how to handle KFX books.'
        )
        message = template.format(
            'https://www.mobileread.com/forums/showthread.php?t=283371')
        ValueError.__init__(self, message)
|
||||
|
||||
|
||||
class MobiReader(object):
|
||||
PAGE_BREAK_PAT = re.compile(
|
||||
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
|
||||
re.IGNORECASE)
|
||||
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
||||
|
||||
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
|
||||
try_extra_data_fix=False):
|
||||
self.log = log
|
||||
self.debug = debug
|
||||
self.embedded_mi = None
|
||||
self.warned_about_trailing_entry_corruption = False
|
||||
self.base_css_rules = textwrap.dedent('''
|
||||
body { text-align: justify }
|
||||
|
||||
blockquote { margin: 0em 0em 0em 2em; }
|
||||
|
||||
p { margin: 0em; text-indent: 1.5em }
|
||||
|
||||
.bold { font-weight: bold }
|
||||
|
||||
.italic { font-style: italic }
|
||||
|
||||
.underline { text-decoration: underline }
|
||||
|
||||
.mbp_pagebreak {
|
||||
page-break-after: always; margin: 0; display: block
|
||||
}
|
||||
''')
|
||||
self.tag_css_rules = {}
|
||||
self.left_margins = {}
|
||||
self.text_indents = {}
|
||||
|
||||
if hasattr(filename_or_stream, 'read'):
|
||||
stream = filename_or_stream
|
||||
stream.seek(0)
|
||||
else:
|
||||
stream = open(filename_or_stream, 'rb')
|
||||
|
||||
raw = stream.read()
|
||||
if raw.startswith(b'TPZ'):
|
||||
raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
|
||||
if raw.startswith(b'\xeaDRMION\xee'):
|
||||
raise KFXError()
|
||||
|
||||
self.header = raw[0:72]
|
||||
self.name = self.header[:32].replace(b'\x00', b'')
|
||||
self.num_sections, = struct.unpack('>H', raw[76:78])
|
||||
|
||||
self.ident = self.header[0x3C:0x3C + 8].upper()
|
||||
if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
|
||||
raise MobiError('Unknown book type: %s' % repr(self.ident))
|
||||
|
||||
self.sections = []
|
||||
self.section_headers = []
|
||||
for i in range(self.num_sections):
|
||||
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
|
||||
flags, val = a1, a2 << 16 | a3 << 8 | a4
|
||||
self.section_headers.append((offset, flags, val))
|
||||
|
||||
def section(section_number):
|
||||
if section_number == self.num_sections - 1:
|
||||
end_off = len(raw)
|
||||
else:
|
||||
end_off = self.section_headers[section_number + 1][0]
|
||||
off = self.section_headers[section_number][0]
|
||||
return raw[off:end_off]
|
||||
|
||||
for i in range(self.num_sections):
|
||||
self.sections.append((section(i), self.section_headers[i]))
|
||||
|
||||
self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
|
||||
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
|
||||
self.name = self.name.decode(self.book_header.codec, 'replace')
|
||||
self.kf8_type = None
|
||||
k8i = getattr(self.book_header.exth, 'kf8_header', None)
|
||||
|
||||
# Ancient PRC files from Baen can have random values for
|
||||
# mobi_version, so be conservative
|
||||
if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
|
||||
'skelidx')):
|
||||
self.kf8_type = 'standalone'
|
||||
elif k8i is not None: # Check for joint mobi 6 and kf 8 file
|
||||
try:
|
||||
raw = self.sections[k8i-1][0]
|
||||
except:
|
||||
raw = None
|
||||
if raw == b'BOUNDARY':
|
||||
try:
|
||||
self.book_header = BookHeader(self.sections[k8i][0],
|
||||
self.ident, user_encoding, self.log)
|
||||
self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
|
||||
self.book_header.mobi6_records = bh.records
|
||||
|
||||
# Need the first_image_index from the mobi 6 header as well
|
||||
for x in ('first_image_index',):
|
||||
setattr(self.book_header, x, getattr(bh, x))
|
||||
|
||||
# We need to do this because the MOBI 6 text extract code
|
||||
# does not know anything about the kf8 offset
|
||||
if hasattr(self.book_header, 'huff_offset'):
|
||||
self.book_header.huff_offset += k8i
|
||||
|
||||
self.kf8_type = 'joint'
|
||||
self.kf8_boundary = k8i-1
|
||||
except:
|
||||
self.book_header = bh
|
||||
|
||||
def check_for_drm(self):
|
||||
if self.book_header.encryption_type != 0:
|
||||
try:
|
||||
name = self.book_header.exth.mi.title
|
||||
except:
|
||||
name = self.name
|
||||
if not name:
|
||||
name = self.name
|
||||
raise DRMError(name)
|
||||
|
||||
def extract_content(self, output_dir, parse_cache):
|
||||
output_dir = os.path.abspath(output_dir)
|
||||
self.check_for_drm()
|
||||
processed_records = self.extract_text()
|
||||
if self.debug is not None:
|
||||
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
|
||||
self.add_anchors()
|
||||
self.processed_html = self.processed_html.decode(self.book_header.codec,
|
||||
'ignore')
|
||||
self.processed_html = self.processed_html.replace('</</', '</')
|
||||
self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
|
||||
self.processed_html)
|
||||
self.processed_html = self.processed_html.replace('\ufeff', '')
|
||||
# Remove tags of the form <xyz: ...> as they can cause issues further
|
||||
# along the pipeline
|
||||
self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
|
||||
self.processed_html)
|
||||
|
||||
self.processed_html = strip_encoding_declarations(self.processed_html)
|
||||
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
|
||||
self.processed_html)
|
||||
image_name_map = self.extract_images(processed_records, output_dir)
|
||||
self.replace_page_breaks()
|
||||
self.cleanup_html()
|
||||
|
||||
self.log.debug('Parsing HTML...')
|
||||
self.processed_html = clean_xml_chars(self.processed_html)
|
||||
try:
|
||||
root = html.fromstring(self.processed_html)
|
||||
if len(root.xpath('//html')) > 5:
|
||||
root = html.fromstring(self.processed_html.replace('\x0c',
|
||||
'').replace('\x14', ''))
|
||||
except Exception:
|
||||
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
||||
self.processed_html = self.remove_random_bytes(self.processed_html)
|
||||
root = html.fromstring(self.processed_html)
|
||||
if root.xpath('descendant::p/descendant::p'):
|
||||
from html5_parser import parse
|
||||
self.log.warning('Malformed markup, parsing using html5-parser')
|
||||
self.processed_html = strip_encoding_declarations(self.processed_html)
|
||||
# These trip up the html5 parser causing all content to be placed
|
||||
# under the <guide> tag
|
||||
self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
|
||||
self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
|
||||
try:
|
||||
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
|
||||
except Exception:
|
||||
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
||||
self.processed_html = self.remove_random_bytes(self.processed_html)
|
||||
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
|
||||
if len(root.xpath('body/descendant::*')) < 1:
|
||||
# There are probably stray </html>s in the markup
|
||||
self.processed_html = self.processed_html.replace('</html>',
|
||||
'')
|
||||
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
|
||||
|
||||
if root.tag != 'html':
|
||||
self.log.warn('File does not have opening <html> tag')
|
||||
nroot = html.fromstring('<html><head></head><body></body></html>')
|
||||
bod = nroot.find('body')
|
||||
for child in list(root):
|
||||
child.getparent().remove(child)
|
||||
bod.append(child)
|
||||
root = nroot
|
||||
|
||||
htmls = list(root.xpath('//html'))
|
||||
|
||||
if len(htmls) > 1:
|
||||
self.log.warn('Markup contains multiple <html> tags, merging.')
|
||||
# Merge all <head> and <body> sections
|
||||
for h in htmls:
|
||||
p = h.getparent()
|
||||
if hasattr(p, 'remove'):
|
||||
p.remove(h)
|
||||
bodies, heads = root.xpath('//body'), root.xpath('//head')
|
||||
for x in root:
|
||||
root.remove(x)
|
||||
head, body = map(root.makeelement, ('head', 'body'))
|
||||
for h in heads:
|
||||
for x in h:
|
||||
h.remove(x)
|
||||
head.append(x)
|
||||
for b in bodies:
|
||||
for x in b:
|
||||
b.remove(x)
|
||||
body.append(x)
|
||||
root.append(head), root.append(body)
|
||||
for x in root.xpath('//script'):
|
||||
x.getparent().remove(x)
|
||||
|
||||
head = root.xpath('//head')
|
||||
if head:
|
||||
head = head[0]
|
||||
else:
|
||||
head = root.makeelement('head', {})
|
||||
root.insert(0, head)
|
||||
head.text = '\n\t'
|
||||
link = head.makeelement('link', {'type':'text/css',
|
||||
'href':'styles.css', 'rel':'stylesheet'})
|
||||
head.insert(0, link)
|
||||
link.tail = '\n\t'
|
||||
title = head.xpath('descendant::title')
|
||||
m = head.makeelement('meta', {'http-equiv':'Content-Type',
|
||||
'content':'text/html; charset=utf-8'})
|
||||
head.insert(0, m)
|
||||
if not title:
|
||||
title = head.makeelement('title', {})
|
||||
try:
|
||||
title.text = self.book_header.title
|
||||
except ValueError:
|
||||
title.text = clean_ascii_chars(self.book_header.title)
|
||||
title.tail = '\n\t'
|
||||
head.insert(0, title)
|
||||
head.text = '\n\t'
|
||||
|
||||
self.upshift_markup(root, image_name_map)
|
||||
guides = root.xpath('//guide')
|
||||
guide = guides[0] if guides else None
|
||||
metadata_elems = root.xpath('//metadata')
|
||||
if metadata_elems and self.book_header.exth is None:
|
||||
self.read_embedded_metadata(root, metadata_elems[0], guide)
|
||||
for elem in guides + metadata_elems:
|
||||
elem.getparent().remove(elem)
|
||||
htmlfile = os.path.join(output_dir, 'index.html')
|
||||
try:
|
||||
for ref in guide.xpath('descendant::reference'):
|
||||
if 'href' in ref.attrib:
|
||||
ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
def write_as_utf8(path, data):
|
||||
if isinstance(data, unicode_type):
|
||||
data = data.encode('utf-8')
|
||||
with lopen(path, 'wb') as f:
|
||||
f.write(data)
|
||||
|
||||
parse_cache[htmlfile] = root
|
||||
self.htmlfile = htmlfile
|
||||
ncx = io.BytesIO()
|
||||
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
|
||||
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
|
||||
opf.render(lopen(self.created_opf_path, 'wb'), ncx,
|
||||
ncx_manifest_entry=ncx_manifest_entry)
|
||||
ncx = ncx.getvalue()
|
||||
if ncx:
|
||||
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
|
||||
write_as_utf8(ncx_path, ncx)
|
||||
|
||||
css = [self.base_css_rules, '\n\n']
|
||||
for cls, rule in self.tag_css_rules.items():
|
||||
css.append('.%s { %s }\n\n' % (cls, rule))
|
||||
write_as_utf8('styles.css', ''.join(css))
|
||||
|
||||
if self.book_header.exth is not None or self.embedded_mi is not None:
|
||||
self.log.debug('Creating OPF...')
|
||||
ncx = io.BytesIO()
|
||||
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
|
||||
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
|
||||
ncx_manifest_entry)
|
||||
ncx = ncx.getvalue()
|
||||
if ncx:
|
||||
write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
|
||||
|
||||
def read_embedded_metadata(self, root, elem, guide):
    '''
    Parse an embedded ``<metadata>`` element and store it in ``self.embedded_mi``.

    :param root: Parsed document tree; searched for the cover anchor and image.
    :param elem: The metadata element. It is serialized, wrapped in a
        ``<package>`` element and handed to ``OPF`` for parsing.
    :param guide: The ``<guide>`` element, or ``None``. When present, the
        first reference whose ``type`` contains ``cover`` is used to find
        the cover image.

    Side effects: sets ``self.embedded_mi``; when a cover image is found, sets
    ``self.embedded_mi.cover`` to its ``src`` and removes that ``<img>`` from
    the tree.
    '''
    raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
        html.tostring(elem, encoding='utf-8') + b'</package>'
    stream = io.BytesIO(raw)
    opf = OPF(stream)
    self.embedded_mi = opf.to_book_metadata()
    if guide is not None:
        for ref in guide.xpath('descendant::reference'):
            if 'cover' in ref.get('type', '').lower():
                href = ref.get('href', '')
                # The href is used as an element id, so drop the leading '#'
                if href.startswith('#'):
                    href = href[1:]
                anchors = root.xpath('//*[@id="%s"]' % href)
                if anchors:
                    cpos = anchors[0]
                    reached = False
                    # Walk the tree in document order; the first <img> at or
                    # after the anchor element is taken as the cover.
                    # Note: the loop variable reuses (shadows) the ``elem``
                    # parameter, which is no longer needed at this point.
                    for elem in root.iter():
                        if elem is cpos:
                            reached = True
                        if reached and elem.tag == 'img':
                            cover = elem.get('src', None)
                            self.embedded_mi.cover = cover
                            elem.getparent().remove(elem)
                            break
                # Only the first cover reference is considered
                break
|
||||
|
||||
def cleanup_html(self):
    '''
    Normalize ``self.processed_html`` (text) before it is parsed.

    Reads ``self.book_header.ancient`` and ``self.mobi_html`` (bytes) to
    decide whether a plain-text "ancient" book has to be wrapped in minimal
    HTML. All other steps are textual clean-ups applied in order: remove
    zero-height spacer divs, normalize line endings, drop XML declarations
    and ``o:p`` tags, reorder inline/block tags so that they nest properly,
    and finally drop closing ``</body>``/``</html>`` tags when either one
    occurs more than once.
    '''
    self.log.debug('Cleaning up HTML...')
    self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
    if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
        # No <html> tag near the start: treat blank lines as paragraph breaks
        self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
    self.processed_html = self.processed_html.replace('\r\n', '\n')
    self.processed_html = self.processed_html.replace('> <', '>\n<')
    self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
    self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
    self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
    # Swap inline and block level elements, and order block level elements according to priority
    # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
    self.processed_html = re.sub(
        r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
    self.processed_html = re.sub(
        r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
    self.processed_html = re.sub(
        r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
    self.processed_html = re.sub(
        r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
    # Several closing </body> or </html> tags mean multiple documents were
    # concatenated; drop the repeated kind of closing tag and let the parser
    # close the document. Each tag is counted on its own (the previous code
    # compared a match object to a string, so </body> was never counted and
    # </html> was removed even when it occurred exactly once).
    if self.processed_html.count('</body>') > 1:
        self.processed_html = self.processed_html.replace('</body>', '')
    if self.processed_html.count('</html>') > 1:
        self.processed_html = self.processed_html.replace('</html>', '')
|
||||
|
||||
def remove_random_bytes(self, html):
    '''
    Return *html* with every occurrence of a fixed set of stray control
    and high characters removed.
    '''
    unwanted = '\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07'
    cleaned = re.sub(unwanted, '', html)
    return cleaned
|
||||
|
||||
def ensure_unit(self, raw, unit='px'):
    '''
    Return *raw* with *unit* appended when *raw* ends in a digit, i.e. when
    the length has no unit of its own. Otherwise return *raw* unchanged.
    '''
    ends_in_digit = re.search(r'\d+$', raw) is not None
    return raw + unit if ends_in_digit else raw
|
||||
|
||||
def upshift_markup(self, root, image_name_map=None):
    '''
    Rewrite presentational MOBI markup in *root* into CSS based markup.

    The tree is modified in place. Presentational attributes (``style``,
    ``height``, ``width``, ``align``, ``color``, ``bgcolor``) are removed
    from each element and turned into CSS declarations. Every distinct
    declaration string is stored once in ``self.tag_css_rules`` and the
    element gets the corresponding class name.

    :param root: Parsed document tree.
    :param image_name_map: Optional mapping of image record index to file
        name, used to build ``src`` for ``<img>`` tags. Indices missing from
        the map fall back to ``'%05d.jpg'``.

    Also records text indents and left margins per element in
    ``self.text_indents`` / ``self.left_margins``.
    '''
    self.log.debug('Converting style information to CSS...')
    image_name_map = image_name_map or {}
    # Maps CSS size keywords to the numeric values stored in <font size>
    size_map = {
        'xx-small': '0.5',
        'x-small': '1',
        'small': '2',
        'medium': '3',
        'large': '4',
        'x-large': '5',
        'xx-large': '6',
    }

    def barename(x):
        # Tag name without a "prefix:" part
        return x.rpartition(':')[-1]

    mobi_version = self.book_header.mobi_version
    for x in root.xpath('//ncx'):
        x.getparent().remove(x)
    # Elements collected during the main pass and fixed up afterwards, so
    # that the tree structure is not changed while it is being iterated
    svg_tags = []
    forwardable_anchors = []
    pagebreak_anchors = []
    BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p'}
    for i, tag in enumerate(root.iter(etree.Element)):
        tag.attrib.pop('xmlns', '')
        # NOTE(review): deletes from tag.attrib while iterating over it;
        # confirm that lxml iterates over a snapshot of the attribute names
        for x in tag.attrib:
            if ':' in x:
                del tag.attrib[x]
        if tag.tag and barename(tag.tag) == 'svg':
            svg_tags.append(tag)
        if tag.tag and barename(tag.tag.lower()) in \
                ('country-region', 'place', 'placetype', 'placename',
                 'state', 'city', 'street', 'address', 'content', 'form'):
            # NOTE(review): the membership test above uses the lower-cased
            # bare name, but this comparison uses the raw tag name, so a
            # prefixed or upper-case content/form tag becomes a span
            tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
            for key in tag.attrib.keys():
                tag.attrib.pop(key)
            continue
        # CSS declarations collected for this element
        styles, attrib = [], tag.attrib
        if 'style' in attrib:
            style = attrib.pop('style').strip()
            if style:
                styles.append(style)
        if 'height' in attrib:
            height = attrib.pop('height').strip()
            if (
                    height and '<' not in height and '>' not in height and
                    re.search(r'\d+', height)):
                if tag.tag in ('table', 'td', 'tr'):
                    pass
                elif tag.tag == 'img':
                    # Images keep their height attribute
                    tag.set('height', height)
                else:
                    if tag.tag == 'div' and not tag.text and \
                            (not tag.tail or not tag.tail.strip()) and \
                            not len(list(tag.iterdescendants())):
                        # Paragraph spacer
                        # Insert nbsp so that the element is never
                        # discarded by a renderer
                        tag.text = '\u00a0' # nbsp
                        styles.append('height: %s' %
                                      self.ensure_unit(height))
                    else:
                        # On any other element, height becomes top margin
                        styles.append('margin-top: %s' % self.ensure_unit(height))
        if 'width' in attrib:
            width = attrib.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tag.tag in ('table', 'td', 'tr'):
                    pass
                elif tag.tag == 'img':
                    # Images keep their width attribute
                    tag.set('width', width)
                else:
                    # On any other element, width becomes the text indent
                    ewidth = self.ensure_unit(width)
                    styles.append('text-indent: %s' % ewidth)
                    try:
                        ewidth_val = unit_convert(ewidth, 12, 500, 166)
                        self.text_indents[tag] = ewidth_val
                    except:
                        pass
                    if width.startswith('-'):
                        # A negative indent additionally gets a left margin
                        # of the same magnitude
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
                        try:
                            ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
                            self.left_margins[tag] = ewidth_val
                        except:
                            pass

        if 'align' in attrib:
            align = attrib.pop('align').strip()
            if align:
                align = align.lower()
                if align == 'baseline':
                    styles.append('vertical-align: '+align)
                else:
                    styles.append('text-align: %s' % align)
        if tag.tag == 'hr':
            if mobi_version == 1:
                # In version 1 files an <hr> is converted to a page break
                tag.tag = 'div'
                styles.append('page-break-before: always')
                styles.append('display: block')
                styles.append('margin: 0')
        elif tag.tag == 'i':
            tag.tag = 'span'
            tag.attrib['class'] = 'italic'
        elif tag.tag == 'u':
            tag.tag = 'span'
            tag.attrib['class'] = 'underline'
        elif tag.tag == 'b':
            tag.tag = 'span'
            tag.attrib['class'] = 'bold'
        elif tag.tag == 'font':
            sz = tag.get('size', '').lower()
            try:
                float(sz)
            except ValueError:
                # Not numeric: translate a CSS size keyword, if it is one
                if sz in list(size_map.keys()):
                    attrib['size'] = size_map[sz]
        elif tag.tag == 'img':
            recindex = None
            # The last non-empty attribute listed in IMAGE_ATTRS wins
            for attr in self.IMAGE_ATTRS:
                recindex = attrib.pop(attr, None) or recindex
            if recindex is not None:
                try:
                    recindex = int(recindex)
                except Exception:
                    pass
                else:
                    attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
            for attr in ('width', 'height'):
                if attr in attrib:
                    val = attrib[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
                            attrib[attr] = "%dpx"%int(nval)
                        except:
                            del attrib[attr]
                    elif val.lower().endswith('%'):
                        del attrib[attr]
        elif tag.tag == 'pre':
            if not tag.text:
                tag.tag = 'div'

        if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
                'div' and 'filepos-id' in attrib):
            pagebreak_anchors.append(tag)

        if 'color' in attrib:
            styles.append('color: ' + attrib.pop('color'))
        if 'bgcolor' in attrib:
            styles.append('background-color: ' + attrib.pop('bgcolor'))

        if 'filepos-id' in attrib:
            attrib['id'] = attrib.pop('filepos-id')
            if 'name' in attrib and attrib['name'] != attrib['id']:
                attrib['name'] = attrib['id']
        if 'filepos' in attrib:
            filepos = attrib.pop('filepos')
            try:
                attrib['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                pass
        if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
                not tag.text and len(tag) == 0 and (tag.tail is None or not
                    tag.tail.strip()) and getattr(tag.getnext(), 'tag',
                        None) in BLOCK_TAGS):
            # This is an empty anchor immediately before a block tag, move
            # the id onto the block tag instead
            forwardable_anchors.append(tag)

        if styles:
            ncls = None
            rule = '; '.join(styles)
            # Reuse an existing class if some element already produced the
            # same rule text
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                ncls = 'calibre_%d' % i
                self.tag_css_rules[ncls] = rule
            cls = attrib.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            attrib['class'] = cls

    # Replace each <svg> by the <img src> elements it contains
    for tag in svg_tags:
        images = tag.xpath('descendant::img[@src]')
        parent = tag.getparent()

        if images and hasattr(parent, 'find'):
            index = parent.index(tag)
            for img in images:
                img.getparent().remove(img)
                img.tail = img.text = None
                parent.insert(index, img)

        if hasattr(parent, 'remove'):
            parent.remove(tag)

    # Move the id of a page break div onto a new empty <a> placed right
    # after the div
    for tag in pagebreak_anchors:
        anchor = tag.attrib['id']
        del tag.attrib['id']
        if 'name' in tag.attrib:
            del tag.attrib['name']
        p = tag.getparent()
        a = p.makeelement('a')
        a.attrib['id'] = anchor
        p.insert(p.index(tag)+1, a)
        if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
            forwardable_anchors.append(a)

    for tag in forwardable_anchors:
        block = tag.getnext()
        tag.getparent().remove(tag)

        if 'id' in block.attrib:
            # The block already has an id: keep the anchor, but move it
            # inside the block as its first child
            tag.tail = block.text
            block.text = None
            block.insert(0, tag)
        else:
            block.attrib['id'] = tag.attrib['id']

    # WebKit fails to navigate to anchors located on <br> tags
    # NOTE(review): '/body/br' is an absolute path, so this presumably only
    # matches when <body> is the document element -- confirm '//body/br'
    # was not intended
    for br in root.xpath('/body/br[@id]'):
        br.tag = 'div'
|
||||
|
||||
def get_left_whitespace(self, tag):
    '''
    Return the total left offset (left margin plus text indent) of *tag*,
    summed over the tag itself and all of its ancestors.

    Values recorded in ``self.left_margins`` / ``self.text_indents`` take
    precedence over the built-in defaults for ``<p>`` and ``<blockquote>``.
    Anything that cannot be converted to a float counts as 0.
    '''

    def to_float(value):
        try:
            return float(value)
        except:
            return 0.0

    def own_offset(node):
        margin = indent = 0.0
        if node.tag == 'p':
            indent = unit_convert('1.5em', 12, 500, 166)
        if node.tag == 'blockquote':
            margin = unit_convert('2em', 12, 500, 166)
        margin = self.left_margins.get(node, margin)
        indent = self.text_indents.get(node, indent)
        return to_float(margin) + to_float(indent)

    total = 0.0
    node = tag
    while node is not None:
        total += own_offset(node)
        node = node.getparent()
    return total
|
||||
|
||||
def create_opf(self, htmlfile, guide=None, root=None):
    '''
    Build the OPF object (metadata, manifest, spine, guide and TOC).

    :param htmlfile: Path of the generated HTML file. Its directory is the
        base directory of the OPF and of the ``images`` folder.
    :param guide: The ``<guide>`` element from the book, or ``None``.
    :param root: Parsed document tree; only used to build the TOC when the
        guide contains a ``toc`` reference.
    :return: ``(opf, ncx_manifest_entry)``; the second item is ``'toc.ncx'``
        when the guide has a TOC reference and ``None`` otherwise.
    '''
    # Prefer EXTH metadata; fall back to metadata embedded in the markup
    mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
    if mi is None:
        mi = MetaInformation(self.book_header.title, [_('Unknown')])
    opf = OPFCreator(os.path.dirname(htmlfile), mi)
    if hasattr(self.book_header.exth, 'cover_offset'):
        opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
    elif mi.cover is not None:
        opf.cover = mi.cover
    else:
        # Guess that the first image is the cover, if that file exists
        opf.cover = 'images/%05d.jpg' % 1
        if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                                           * opf.cover.split('/'))):
            opf.cover = None

    cover = opf.cover
    cover_copied = None
    if cover is not None:
        cover = cover.replace('/', os.sep)
        # NOTE(review): this and the paths below are relative, so they are
        # resolved against the current working directory -- presumably the
        # caller runs with the output directory as cwd; TODO confirm
        if os.path.exists(cover):
            ncover = 'images'+os.sep+'calibre_cover.jpg'
            if os.path.exists(ncover):
                os.remove(ncover)
            shutil.copyfile(cover, ncover)
            cover_copied = os.path.abspath(ncover)
            opf.cover = ncover.replace(os.sep, '/')

    manifest = [(htmlfile, 'application/xhtml+xml'),
                (os.path.abspath('styles.css'), 'text/css')]
    bp = os.path.dirname(htmlfile)
    # Note: ``added`` is filled in below but never read in this method
    added = set()
    for i in getattr(self, 'image_names', []):
        path = os.path.join(bp, 'images', i)
        added.add(path)
        manifest.append((path, guess_type(path)[0] or 'image/jpeg'))
    if cover_copied is not None:
        manifest.append((cover_copied, 'image/jpeg'))

    opf.create_manifest(manifest)
    opf.create_spine([os.path.basename(htmlfile)])
    toc = None
    if guide is not None:
        opf.create_guide(guide)
        # If there are several toc references, the last one wins
        for ref in opf.guide:
            if ref.type.lower() == 'toc':
                toc = ref.href()

    ncx_manifest_entry = None
    if toc:
        ncx_manifest_entry = 'toc.ncx'
        elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
        tocobj = None
        ent_pat = re.compile(r'&(\S+?);')
        if elems:
            tocobj = TOC()
            found = False
            reached = False
            # Collect the links that follow the TOC anchor in document
            # order, stopping at the first page break after at least one
            # link has been found
            for x in root.iter():
                if x == elems[-1]:
                    reached = True
                    continue
                if reached and x.tag == 'a':
                    href = x.get('href', '')
                    # Skip links that have a URL scheme (external links)
                    if href and re.match(r'\w+://', href) is None:
                        try:
                            text = ' '.join([t.strip() for t in
                                             x.xpath('descendant::text()')])
                        except:
                            text = ''
                        text = ent_pat.sub(entity_to_unicode, text)
                        # href[1:] drops the first character of the href,
                        # presumably the leading '#' of an internal link
                        item = tocobj.add_item(toc.partition('#')[0], href[1:],
                                               text)
                        # Remembered so that structure_toc() can derive
                        # nesting levels from the indentation
                        item.left_space = int(self.get_left_whitespace(x))
                        found = True
                if reached and found and x.get('class', None) == 'mbp_pagebreak':
                    break
        if tocobj is not None:
            tocobj = self.structure_toc(tocobj)
            opf.set_toc(tocobj)

    return opf, ncx_manifest_entry
|
||||
|
||||
def structure_toc(self, toc):
    '''
    Turn a flat TOC into a nested one, using the ``left_space`` of every
    item as its nesting level (more space means deeper).

    The TOC is returned unchanged when the items use fewer than 2 or more
    than 6 distinct ``left_space`` values. Otherwise a new ``TOC`` is
    built and returned.
    '''
    spaces = {entry.left_space for entry in toc}
    if not 2 <= len(spaces) <= 6:
        # Too many or too few levels, give up
        return toc
    levels = sorted(spaces)

    # For every level, the most recently added node at that level
    latest = [None] * len(levels)
    newtoc = TOC()

    for entry in toc:
        depth = levels.index(entry.left_space)
        # The parent is the most recent node at the nearest shallower level
        # that has one, or the new TOC root when there is none
        parent = newtoc
        for node in reversed(latest[:depth]):
            if node is not None:
                parent = node
                break
        latest[depth] = parent.add_item(entry.href, entry.fragment,
                                        entry.text)

    return newtoc
|
||||
|
||||
def sizeof_trailing_entries(self, data):
    '''
    Return the number of bytes of trailing entries at the end of the text
    record *data*.

    ``self.book_header.extra_flags`` selects which trailing entries are
    present: every set bit above bit 0 stands for one entry whose size is
    stored as a backward-encoded variable width integer at the end of the
    record, and bit 0 stands for the multibyte overlap entry, whose size is
    stored in the low two bits of its last byte.

    If the record is too short for the entries the flags announce, a
    warning is issued via ``warn_about_trailing_entry_corruption()`` and 0
    is returned, so the caller keeps the whole record.
    '''
    def sizeof_trailing_entry(ptr, psize):
        bitpos, result = 0, 0
        while True:
            # A one byte slice is used instead of indexing so that ord()
            # works on both Python 2 and 3. When psize has run off the
            # start of the record the slice is empty and ord() raises
            # TypeError.
            v = ord(ptr[psize-1:psize])
            result |= (v & 0x7F) << bitpos
            bitpos += 7
            psize -= 1
            if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                return result

    num = 0
    size = len(data)
    flags = self.book_header.extra_flags >> 1
    while flags:
        if flags & 1:
            try:
                num += sizeof_trailing_entry(data, size - num)
            except (IndexError, TypeError):
                # TypeError: ord() of an empty slice, see above
                self.warn_about_trailing_entry_corruption()
                return 0
        flags >>= 1
    if self.book_header.extra_flags & 1:
        off = size - num - 1
        try:
            num += (ord(data[off:off+1]) & 0x3) + 1
        except TypeError:
            # No byte left for the multibyte entry: same kind of corruption
            self.warn_about_trailing_entry_corruption()
            return 0
    return num
|
||||
|
||||
def warn_about_trailing_entry_corruption(self):
    ''' Log the trailing entry corruption warning, but only on the first call. '''
    if self.warned_about_trailing_entry_corruption:
        return
    self.warned_about_trailing_entry_corruption = True
    self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output')
|
||||
|
||||
def text_section(self, index):
    '''
    Return the data of section *index* without its trailing entries, whose
    total size is given by ``sizeof_trailing_entries()``.
    '''
    record = self.sections[index][0]
    keep = len(record) - self.sizeof_trailing_entries(record)
    return record[:keep]
|
||||
|
||||
def extract_text(self, offset=1):
    '''
    Decompress the text records and store the result in ``self.mobi_html``.

    :param offset: Index of the first text record in ``self.sections``.
    :return: List of the record indices that have been consumed: the record
        before the first text record, the text records, and the HUFF/CDIC
        records when Huffman compression is in use.
    :raises MobiError: If the compression type is not known.
    '''
    self.log.debug('Extracting text...')
    header = self.book_header
    stop = header.records + offset
    text_sections = []
    # Never read past the sections that actually exist
    for idx in range(offset, min(stop, len(self.sections))):
        text_sections.append(self.text_section(idx))
    processed_records = list(range(offset-1, stop))

    self.mobi_html = b''

    compression = header.compression_type
    if compression == b'DH':
        huff_records = list(range(header.huff_offset,
                                  header.huff_offset + header.huff_number))
        huffs = [self.sections[idx][0] for idx in huff_records]
        processed_records += huff_records
        unpack = HuffReader(huffs).unpack
    elif compression == b'\x00\x02':
        unpack = decompress_doc
    elif compression == b'\x00\x01':
        # Uncompressed text
        def unpack(x):
            return x
    else:
        raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)

    markup = b''.join([unpack(section) for section in text_sections])
    if markup.endswith(b'#'):
        markup = markup[:-1]

    if header.ancient and b'<html' not in markup[:300].lower():
        markup = markup.replace(b'\r ', b'\n\n ')
    markup = markup.replace(b'\0', b'')
    if header.codec == 'cp1252':
        # Drop the record separator and start of text control characters
        markup = markup.replace(b'\x1e', b'').replace(b'\x02', b'')
    self.mobi_html = markup
    return processed_records
|
||||
|
||||
def replace_page_breaks(self):
    '''
    Replace every match of ``PAGE_BREAK_PAT`` in ``self.processed_html`` by
    a ``<div class="mbp_pagebreak">`` that carries over the first group of
    the match.
    '''
    replacement = r'<div \1 class="mbp_pagebreak" />'
    converted = self.PAGE_BREAK_PAT.sub(replacement, self.processed_html)
    self.processed_html = converted
|
||||
|
||||
def add_anchors(self):
    '''
    Insert anchors at every position that is the target of a ``filepos``
    link and store the result in ``self.processed_html``.

    ``self.mobi_html`` (bytes) is scanned for tags with a ``filepos=N``
    attribute. For every target offset N > 0, either an empty
    ``<a id="fileposN"></a>`` is inserted, or, when the offset falls inside
    an opening tag, a ``filepos-id="fileposN"`` attribute is added to that
    tag.
    '''
    self.log.debug('Adding anchors...')
    raw = self.mobi_html
    link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
                              re.IGNORECASE)
    targets = {int(m.group(1)) for m in link_pattern.finditer(raw)}
    end_tag_re = re.compile(br'<\s*/')
    pieces = []
    cursor = 0
    for target in sorted(targets):
        if target == 0:
            continue
        cut = target
        next_lt = raw.find(b'<', target)
        next_gt = raw.find(b'>', target)
        anchor = b'<a id="filepos%d"></a>'
        # True when the target offset lies inside a tag or exactly at its
        # opening bracket
        in_tag = next_gt > -1 and (
            next_gt < next_lt or next_lt == target or next_lt == -1)
        if in_tag:
            tag_start = raw.rfind(b'<', 0, target + 1)
            is_plain_open_tag = (
                cursor < target and tag_start > -1 and
                not end_tag_re.match(raw[tag_start:next_gt]) and
                not raw[tag_start:next_gt + 1].endswith(b'/>'))
            if is_plain_open_tag:
                # Add the anchor as an attribute, just before the closing '>'
                anchor = b' filepos-id="filepos%d"'
                cut = next_gt
            else:
                # Closing or self closing tag: put the anchor after it
                cut = next_gt + 1
        pieces.append(raw[cursor:cut] + (anchor % target))
        cursor = cut
    pieces.append(raw[cursor:])
    joined = b''.join(pieces)

    # Remove anchors placed inside entities
    self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
                                 br'&\1\3;\2', joined)
|
||||
|
||||
def extract_images(self, processed_records, output_dir):
    '''
    Write all image records to ``<output_dir>/images``.

    :param processed_records: List of record indices that have already been
        consumed. Those records are skipped, and the index of every record
        examined here is appended to the list (it is modified in place).
    :param output_dir: Base output directory; the ``images`` sub-directory
        is created if needed.
    :return: Dict mapping the image index to the file name written for it.

    Also sets ``self.image_names`` to the list of file names written.
    '''
    self.log.debug('Extracting images...')
    output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    image_index = 0
    self.image_names = []
    image_name_map = {}
    start = getattr(self.book_header, 'first_image_index', -1)
    if start > self.num_sections or start < 0:
        # BAEN PRC files have bad headers
        start = 0
    for i in range(start, self.num_sections):
        if i in processed_records:
            continue
        processed_records.append(i)
        data = self.sections[i][0]
        # Incremented for every unprocessed record, whether or not it turns
        # out to be an image, so the file names reflect the position of the
        # record among them
        image_index += 1
        if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                        b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
            # This record is a known non image type, no need to try to
            # load the image
            continue

        try:
            imgfmt = what(None, data)
        except Exception:
            continue
        if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}:
            continue
        if imgfmt == 'jpeg':
            imgfmt = 'jpg'
        if imgfmt == 'gif':
            # GIFs are converted to PNG; if AnimatedGIF is raised the data
            # is kept as GIF
            try:
                data = gif_data_to_png_data(data)
                imgfmt = 'png'
            except AnimatedGIF:
                pass
        path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
        image_name_map[image_index] = os.path.basename(path)
        if imgfmt == 'png':
            with open(path, 'wb') as f:
                f.write(data)
        else:
            try:
                save_cover_data_to(data, path, minify_to=(10000, 10000))
            except Exception:
                # Note: the entry added to image_name_map above is kept even
                # though no file could be written
                continue
        self.image_names.append(os.path.basename(path))
    return image_name_map
|
||||
|
||||
|
||||
def test_mbp_regex():
    '''
    Check ``MobiReader.PAGE_BREAK_PAT`` against a table of sample inputs.

    Every input is substituted with the first group of the pattern and the
    result compared to the expected text; the first mismatch raises.
    '''
    expected_by_input = {
        '<mbp:pagebreak></mbp:pagebreak>':'',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>':'',
        '<mbp:pagebreak>xxx':'xxx',
        '<mbp:pagebreak/>xxx':'xxx',
        '<mbp:pagebreak sdf/ >xxx':' sdfxxx',
        '<mbp:pagebreak / >':' ',
        '</mbp:pagebreak>':'',
        '</mbp:pagebreak sdf>':' sdf',
        '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':'xxx',
    }
    pattern = MobiReader.PAGE_BREAK_PAT
    for raw, m in iteritems(expected_by_input):
        ans = pattern.sub(r'\1', raw)
        if ans == m:
            continue
        raise Exception('%r != %r for %r'%(ans, m, raw))
|
||||
590
ebook_converter/ebooks/mobi/reader/mobi8.py
Normal file
590
ebook_converter/ebooks/mobi/reader/mobi8.py
Normal file
@@ -0,0 +1,590 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, re, os
|
||||
from collections import namedtuple
|
||||
from itertools import repeat
|
||||
from uuid import uuid4
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||
from calibre.ebooks.mobi.reader.index import read_index
|
||||
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
|
||||
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
|
||||
from calibre.ebooks.mobi.reader.containers import Container, find_imgtype
|
||||
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.mobi.utils import read_font_record
|
||||
from calibre.ebooks.oeb.parse_utils import parse_html
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
|
||||
from polyglot.builtins import range, zip, unicode_type, getcwd, as_unicode
|
||||
from polyglot.urllib import urldefrag
|
||||
|
||||
# One part (output file) of the book
Part = namedtuple('Part',
    'num type filename start end aid')

# One row of the div table: a chunk of text that is inserted into a
# skeleton at insert_pos. start_pos and length locate the chunk.
Elem = namedtuple('Elem',
    'insert_pos toc_text file_number sequence_number start_pos '
    'length')

# Describes one flow
FlowInfo = namedtuple('FlowInfo',
    'type format dir fname')
|
||||
|
||||
# locate beginning and ending positions of tag with specific aid attribute
|
||||
|
||||
|
||||
def locate_beg_end_of_tag(ml, aid):
    '''
    Locate the first tag in *ml* that has the attribute ``aid="<aid>"``.

    :param ml: Markup to search (bytes).
    :param aid: Value of the ``aid`` attribute to look for (bytes). It is
        matched literally.
    :return: ``(start, end)``, the offsets of the ``<`` and the ``>`` of the
        tag, or ``(0, 0)`` when there is no such tag.
    '''
    # Escape the aid, so that any regular expression metacharacters in it
    # cannot change what the pattern matches
    pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % re.escape(aid)
    m = re.search(pattern, ml, re.IGNORECASE)
    if m is None:
        return 0, 0
    plt = m.start()
    pgt = ml.find(b'>', plt+1)
    return plt, pgt
|
||||
|
||||
|
||||
def reverse_tag_iter(block):
    ''' Yield every tag in block (bytes), from the last tag back to the
    first one. A tag is the text from a ``<`` up to the next ``>``
    found when scanning backwards. '''
    limit = len(block)
    close = block.rfind(b'>', 0, limit)
    while close != -1:
        opening = block.rfind(b'<', 0, close)
        if opening == -1:
            return
        yield block[opening:close + 1]
        # Continue the search in front of the tag just produced
        limit = opening
        close = block.rfind(b'>', 0, limit)
|
||||
|
||||
|
||||
def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
    '''
    Return the index of the first resource record.

    This is *first_image_index*, unless that is -1 or ``NULL_INDEX``, in
    which case the record right after the text records is used.
    '''
    if first_image_index in {-1, NULL_INDEX}:
        return num_of_text_records + first_text_record_number
    return first_image_index
|
||||
|
||||
|
||||
class Mobi8Reader(object):
|
||||
|
||||
def __init__(self, mobi6_reader, log, for_tweak=False):
|
||||
self.for_tweak = for_tweak
|
||||
self.mobi6_reader, self.log = mobi6_reader, log
|
||||
self.header = mobi6_reader.book_header
|
||||
self.encrypted_fonts = []
|
||||
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
||||
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
||||
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
||||
|
||||
def __call__(self):
    '''
    Run the KF8 conversion and return the result of ``write_opf()``.

    Checks for DRM, extracts the raw text through the MOBI 6 reader, reads
    the indices, builds the parts, guide and NCX, extracts the resources
    and expands the text. The raw markup is also written to
    ``debug-raw.html`` in the current directory.
    '''
    reader = self.mobi6_reader
    reader.check_for_drm()
    self.aid_anchor_suffix = uuid4().hex.encode('utf-8')
    bh = reader.book_header
    if reader.kf8_type == 'joint':
        # Joint file: the KF8 part starts after the boundary record, and
        # there are two resource ranges, one for each part
        offset = reader.kf8_boundary + 2
        mobi6_range = (
            get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1),
            offset - 2)
        kf8_range = (
            get_first_resource_index(bh.kf8_first_image_index, bh.records, offset),
            len(reader.sections))
        self.resource_offsets = [mobi6_range, kf8_range]
    else:
        offset = 1
        only_range = (
            get_first_resource_index(bh.first_image_index, bh.records, offset),
            len(reader.sections))
        self.resource_offsets = [only_range]

    self.processed_records = reader.extract_text(offset=offset)
    self.raw_ml = reader.mobi_html
    with open('debug-raw.html', 'wb') as f:
        f.write(self.raw_ml)

    self.kf8_sections = reader.sections[offset-1:]

    self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
    self.linked_aids = set()

    self.read_indices()
    self.build_parts()
    guide = self.create_guide()
    ncx = self.create_ncx()
    resource_map = self.extract_resources(reader.sections)
    spine = self.expand_text(resource_map)
    return self.write_opf(guide, ncx, spine, resource_map)
|
||||
|
||||
def read_indices(self):
    '''
    Read the KF8 index records named in the header.

    Sets ``self.flow_table`` (pairs of offsets from the FDST record),
    ``self.files`` (skeleton index), ``self.elems`` (div index) and
    ``self.guide`` (guide index). An index whose header field is
    ``NULL_INDEX`` is left empty.

    :raises ValueError: If the FDST record does not start with ``FDST``.
    '''
    hdr = self.header
    sections = self.kf8_sections

    self.flow_table = ()
    if hdr.fdstidx != NULL_INDEX:
        fdst = sections[hdr.fdstidx][0]
        if fdst[:4] != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')
        sec_start, num_sections = struct.unpack_from(b'>LL', fdst, 4)
        fmt = b'>%dL' % (num_sections*2)
        secs = struct.unpack_from(fmt, fdst, sec_start)
        # The values are consecutive (start, end) pairs
        self.flow_table = tuple(zip(secs[::2], secs[1::2]))

    self.files = []
    if hdr.skelidx != NULL_INDEX:
        table = read_index(sections, hdr.skelidx, hdr.codec)[0]
        File = namedtuple('File',
            'file_number name divtbl_count start_position length')

        for number, name in enumerate(table):
            tags = table[name]
            self.files.append(
                File(number, name, tags[1][0], tags[6][0], tags[6][1]))

    self.elems = []
    if hdr.dividx != NULL_INDEX:
        table, cncx = read_index(sections, hdr.dividx, hdr.codec)
        for key in table:
            tags = table[key]
            toc_text = cncx[tags[2][0]]
            self.elems.append(
                Elem(int(key), toc_text, tags[3][0], tags[4][0],
                     tags[6][0], tags[6][1]))

    self.guide = []
    if hdr.othidx != NULL_INDEX:
        table, cncx = read_index(sections, hdr.othidx, hdr.codec)
        Item = namedtuple('Item',
            'type title pos_fid')

        for key in table:
            tags = table[key]
            # ref_type, ref_title, div/frag number
            title = cncx[tags[1][0]]
            present = list(tags.keys())
            fileno = None
            if 3 in present:
                fileno = tags[3][0]
            if 6 in present:
                # Tag 6 takes precedence over tag 3 and is stored whole
                fileno = tags[6]
            ref_type = key
            if isinstance(ref_type, bytes):
                ref_type = ref_type.decode(hdr.codec)
            self.guide.append(Item(ref_type, title, fileno))
|
||||
|
||||
def build_parts(self):
    '''
    Split the raw KF8 markup into flows and rebuild the individual XHTML
    files from the skeleton and div tables.

    Sets ``flows``, ``flowinfo``, ``parts`` and ``partinfo``. ``flows[0]``
    (the xhtml text) is replaced by an empty bytestring once it has been
    split into ``parts``; the remaining flows are classified as svg or css
    and described by a ``FlowInfo`` each.
    '''
    raw_ml = self.mobi6_reader.mobi_html
    self.flows = []
    self.flowinfo = []
    ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]

    # now split the raw_ml into its flow pieces
    for start, end in ft:
        self.flows.append(raw_ml[start:end])

    # the first piece represents the xhtml text
    text = self.flows[0]
    self.flows[0] = b''

    # walk the <skeleton> and <div> tables to build original source xhtml
    # files *without* destroying any file position information needed for
    # later href processing and create final list of file separation start:
    # stop points and etc in partinfo
    self.parts = []
    self.partinfo = []
    divptr = 0
    baseptr = 0
    for skelnum, skelname, divcnt, skelpos, skellen in self.files:
        baseptr = skelpos + skellen
        skeleton = text[skelpos:baseptr]
        inspos_warned = False
        for i in range(divcnt):
            insertpos, idtext, filenum, seqnum, startpos, length = \
                                self.elems[divptr]
            if i == 0:
                aidtext = idtext[12:-2]
                filename = 'part%04d.html' % filenum
            part = text[baseptr:baseptr + length]
            insertpos = insertpos - skelpos
            head = skeleton[:insertpos]
            tail = skeleton[insertpos:]
            if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
                    head.rfind(b'<')):
                # There is an incomplete tag in either the head or tail.
                # This can happen for some badly formed KF8 files, see for
                # example, https://bugs.launchpad.net/bugs/1082669
                if not inspos_warned:
                    self.log.warn(
                        'The div table for %s has incorrect insert '
                        'positions. Calculating manually.'%skelname)
                    inspos_warned = True
                bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
                    isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
                if bp != ep:
                    insertpos = ep + 1 + startpos

            skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
            baseptr = baseptr + length
            divptr += 1
        self.parts.append(skeleton)
        if divcnt < 1:
            # Empty file
            aidtext = unicode_type(uuid4())
            filename = aidtext + '.html'
        self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
            baseptr, aidtext))

    # The primary css style sheet is typically stored next followed by any
    # snippets of code that were previously inlined in the
    # original xhtml but have been stripped out and placed here.
    # This can include local CDATA snippets and svg sections.

    # The problem is that for most browsers and ereaders, you can not
    # use <img src="imageXXXX.svg" /> to import any svg image that itself
    # properly uses an <image/> tag to import some raster image - it
    # should work according to the spec but does not for almost all browsers
    # and ereaders and causes epub validation issues because those raster
    # images are in manifest but not in xhtml text - since they are only
    # referenced from an svg image

    # So we need to check the remaining flow pieces to see if they are css
    # or svg images. if svg images, we must check if they have an <image/>
    # and if so inline them into the xhtml text pieces.

    # there may be other sorts of pieces stored here but until we see one
    # in the wild to reverse engineer we won't be able to tell

    self.flowinfo.append(FlowInfo(None, None, None, None))
    svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
    image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
    for j, flowpart in enumerate(self.flows[1:], 1):
        nstr = '%04d' % j
        m = svg_tag_pattern.search(flowpart)
        if m is not None:
            # svg
            typ = 'svg'
            if image_tag_pattern.search(flowpart) is not None:
                fmt, dirname, fname = 'inline', None, None
                # strip off anything before <svg if inlining. The
                # replacement must be bytes, like the pattern and the
                # subject, otherwise re.sub() fails on Python 3
                flowpart = re.sub(br'(</?)svg:', br'\1',
                        flowpart[m.start():])
            else:
                fmt, dirname = 'file', "images"
                fname = 'svgimg' + nstr + '.svg'
        else:
            typ = 'css'
            # search for CDATA and if exists inline it
            if flowpart.find(b'[CDATA[') >= 0:
                flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
                fmt, dirname, fname = 'inline', None, None
            else:
                # css - assume as standalone css file
                fmt, dirname = 'file', "styles"
                fname = nstr + '.css'

        self.flows[j] = flowpart
        self.flowinfo.append(FlowInfo(typ, fmt, dirname, fname))
|
||||
|
||||
def get_file_info(self, pos):
    '''
    Return the entry of ``self.partinfo`` whose half-open range
    ``[start, end)`` contains ``pos``. If no part contains ``pos``, a
    ``Part`` with every field set to None is returned.
    '''
    matches = (p for p in self.partinfo if p.start <= pos < p.end)
    for found in matches:
        return found
    # Nothing matched: hand back an all-None placeholder
    return Part(*repeat(None, len(Part._fields)))
|
||||
|
||||
def get_id_tag_by_pos_fid(self, posfid, offset):
    '''
    Resolve a kindle:pos:fid reference to a link target.

    :param posfid: Index into ``self.elems``
    :param offset: Offset added to the insert position of that element

    :return: A tuple ``('<type>/<filename>', idtext)`` where the first
             item is built from the file info at the resulting position
             and ``idtext`` is whatever ``get_id_tag()`` returns for it
    '''
    # first convert kindle:pos:fid and offset info to position in file
    pos = self.elems[posfid][0] + offset
    fi = self.get_file_info(pos)
    # an existing "id=" must exist in original xhtml otherwise it would not
    # have worked for linking. Amazon seems to have added its own
    # additional "aid=" inside tags whose contents seem to represent some
    # position information encoded into Base32 name.

    # so find the closest "id=" before position the file by actually
    # searching in that file
    anchor = self.get_id_tag(pos)
    target = '%s/%s'%(fi.type, fi.filename)
    return target, anchor
|
||||
|
||||
def get_id_tag(self, pos):
    '''
    Find the first tag with a named anchor (name or id attribute) before
    ``pos`` and return the anchor.

    Tags are examined in the order produced by ``reverse_tag_iter()``. If
    a tag matches only ``self.aid_re``, its aid is recorded in
    ``self.linked_aids`` and ``<aid>-<aid_anchor_suffix>`` is returned.

    :return: The anchor as a bytestring, empty if no tag matched (meaning
             link to the start of the file)
    :raises ValueError: If no file contains ``pos``
    '''
    info = self.get_file_info(pos)
    if info.num is None and info.start is None:
        raise ValueError('No file contains pos: %d'%pos)
    markup = self.parts[info.num]
    offset = pos - info.start
    next_close = markup.find(b'>', offset)
    next_open = markup.find(b'<', offset)
    # if offset is inside a tag then search all text up to that tag's end
    # marker, otherwise we are not in a tag and search the preceding tags
    if next_open == offset or next_close < next_open:
        offset = next_close + 1
    for tag in reverse_tag_iter(markup[:offset]):
        anchor = self.id_re.match(tag) or self.name_re.match(tag)
        if anchor is not None:
            return anchor.group(1)
        # For some files, kindlegen apparently creates links to tags
        # without HTML anchors, using the AID instead. See
        # https://www.mobileread.com/forums/showthread.php?t=259557
        aid = self.aid_re.match(tag)
        if aid is not None:
            aid_value = aid.group(1)
            self.linked_aids.add(aid_value)
            return aid_value + b'-' + self.aid_anchor_suffix

    # No tag found, link to start of file
    return b''
|
||||
|
||||
def create_guide(self):
    '''
    Build a ``Guide`` from the entries collected in ``self.guide``.

    Entries whose ``pos_fid`` is not a two item sequence are skipped. If
    no entry is titled 'start' or typed 'text' and the EXTH header has a
    usable start offset, a 'start' reference pointing at that offset is
    appended.
    '''
    guide = Guide()
    has_start = False
    for ref_type, ref_title, pos_fid in self.guide:
        try:
            is_pair = len(pos_fid) == 2
        except TypeError:
            is_pair = False  # thumbnailstandard record, ignore it
        if not is_pair:
            continue
        target, anchor = self.get_id_tag_by_pos_fid(*pos_fid)
        if anchor:
            if isinstance(anchor, bytes):
                anchor = anchor.decode(self.header.codec)
            target += '#' + anchor
        ref = Guide.Reference(target, getcwd())
        ref.title, ref.type = ref_title, ref_type
        if ref.title == 'start' or ref.type == 'text':
            has_start = True
        guide.append(ref)

    start_offset = self.header.exth.start_offset
    if start_offset in {None, NULL_INDEX} or has_start:
        return guide

    # Fall back to the start offset from the EXTH header
    fi = self.get_file_info(start_offset)
    if fi.filename is not None:
        anchor = self.get_id_tag(start_offset).decode(self.header.codec)
        target = fi.filename
        if anchor:
            target += '#' + anchor
        ref = Guide.Reference('%s/%s'%(fi.type, target), getcwd())
        ref.title, ref.type = 'start', 'text'
        guide.append(ref)

    return guide
|
||||
|
||||
def create_ncx(self):
    '''
    Read the NCX index and build the table of contents from it.

    Every index entry gets an ``href`` and an ``idtag``. Entries whose
    ``pos_fid`` cannot be resolved are logged and left out.

    :return: The result of ``build_toc()`` for the valid entries
    :raises ValueError: If an entry without ``pos_fid`` has a ``pos`` that
                        is not inside any file
    '''
    index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
            self.header.codec)

    # Add href and anchor info to the index entries, keeping only the
    # valid ones. Collecting them in a single pass avoids repeated
    # equality based list.remove() calls.
    valid_entries = []
    for entry in index_entries:
        pos_fid = entry['pos_fid']
        if pos_fid is None:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos)
            href = '%s/%s'%(fi.type, fi.filename)
        else:
            try:
                href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
            except ValueError:
                self.log.warn('Invalid entry in NCX (title: %s), ignoring'
                              %entry['text'])
                continue

        entry['href'] = href
        entry['idtag'] = as_unicode(idtag, self.header.codec or 'utf-8')
        valid_entries.append(entry)

    # Build the TOC object
    return build_toc(valid_entries)
|
||||
|
||||
def extract_resources(self, sections):
    '''
    Write the font and image records found in ``self.resource_offsets``
    to the ``fonts`` and ``images`` directories (created in the current
    directory).

    :param sections: Sequence of records; the data of a record is its
                     first item

    :return: A list with one item per processed record: the href of the
             file written for it, or None if nothing was written.
             CONTBOUNDARY records add no item at all.
    '''
    from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
    ignored_types = {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
            b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}
    resource_map = []
    container = None
    for dirname in ('fonts', 'images'):
        os.mkdir(dirname)

    def save(href, payload):
        # hrefs use / as separator, convert to a native path for writing
        with open(href.replace('/', os.sep), 'wb') as f:
            f.write(payload)

    for start, end in self.resource_offsets:
        for fname_idx, sec in enumerate(sections[start:end], 1):
            data = sec[0]
            typ = data[:4]
            href = None
            if typ in ignored_types:
                pass  # Ignore these records
            elif typ == b'FONT':
                font = read_font_record(data)
                href = "fonts/%05d.%s" % (fname_idx, font['ext'])
                if font['err']:
                    self.log.warn('Reading font record %d failed: %s'%(
                        fname_idx, font['err']))
                    if font['headers']:
                        self.log.debug('Font record headers: %s'%font['headers'])
                save(href, font['font_data'] or font['raw_data'])
                if font['encrypted']:
                    self.encrypted_fonts.append(href)
            elif typ == b'CONT':
                if data == b'CONTBOUNDARY':
                    container = None
                    continue
                container = Container(data)
            elif typ == b'CRES':
                data, imgtype = container.load_image(data)
                if data is not None:
                    href = 'images/%05d.%s'%(container.resource_index, imgtype)
                    save(href, data)
            elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
                container.resource_index += 1
            elif container is None:
                # Anything else outside a container is treated as an image,
                # except for the placeholder GIF
                if len(data) != len(PLACEHOLDER_GIF) or data != PLACEHOLDER_GIF:
                    imgtype = find_imgtype(data)
                    href = 'images/%05d.%s'%(fname_idx, imgtype)
                    save(href, data)

            resource_map.append(href)

    return resource_map
|
||||
|
||||
def expand_text(self, resource_map):
    ''' Delegate to expand_mobi8_markup(), passing this reader, the
    resource map and the reader's log. Returns its result unchanged. '''
    expanded = expand_mobi8_markup(self, resource_map, self.log)
    return expanded
|
||||
|
||||
def write_opf(self, guide, toc, spine, resource_map):
    '''
    Write ``metadata.opf`` and ``toc.ncx`` into the current directory.

    :param guide: The guide to store in the OPF
    :param toc: The metadata ToC. If it has fewer than two items, the
                inline ToC referenced from the guide (if any) is read and
                used instead
    :param spine: Passed to ``OPFCreator.create_spine()``
    :param resource_map: List of resource hrefs, indexed by resource
                         number; used to locate the cover

    :return: The name of the OPF file, ``'metadata.opf'``
    '''
    mi = self.header.exth.mi
    if (self.cover_offset is not None and self.cover_offset <
            len(resource_map)):
        mi.cover = resource_map[self.cover_offset]

    if len(list(toc)) < 2:
        self.log.warn('KF8 has no metadata Table of Contents')

        for ref in guide:
            if ref.type == 'toc':
                href = ref.href()
                href, frag = urldefrag(href)
                if os.path.exists(href.replace('/', os.sep)):
                    try:
                        toc = self.read_inline_toc(href, frag)
                    except Exception:
                        # Best effort: keep the metadata ToC, but do not
                        # swallow KeyboardInterrupt/SystemExit
                        self.log.exception('Failed to read inline ToC')

    opf = OPFCreator(getcwd(), mi)
    opf.guide = guide

    def exclude(path):
        return os.path.basename(path) == 'debug-raw.html'

    # If there are no images then the azw3 input plugin dumps all
    # binary records as .unknown images, remove them
    if self.for_tweak and os.path.isdir('images'):
        files = os.listdir('images')
        unknown = [x for x in files if x.endswith('.unknown')]
        if len(files) == len(unknown):
            for f in files:
                os.remove(os.path.join('images', f))

    if self.for_tweak:
        try:
            os.remove('debug-raw.html')
        except OSError:
            pass  # The debug file may not exist

    opf.create_manifest_from_files_in([getcwd()], exclude=exclude)
    for entry in opf.manifest:
        if entry.mime_type == 'text/html':
            entry.mime_type = 'application/xhtml+xml'
    opf.create_spine(spine)
    opf.set_toc(toc)
    ppd = getattr(self.header.exth, 'page_progression_direction', None)
    if ppd in {'ltr', 'rtl', 'default'}:
        opf.page_progression_direction = ppd
    pwm = getattr(self.header.exth, 'primary_writing_mode', None)
    if pwm is not None:
        opf.primary_writing_mode = pwm

    with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx')
    return 'metadata.opf'
|
||||
|
||||
def read_inline_toc(self, href, frag):
    '''
    Build a ``TOC`` from the links in the HTML file ``href``.

    Links are collected once the start element has been passed. The start
    element is the one whose id is ``frag`` if such an element exists,
    otherwise the body; if there is no body, collection starts
    immediately. Duplicate (text, href, fragment) links are ignored. The
    ToC is layered based on how deeply each link is nested in the HTML.

    :param href: Path of the HTML file, using / as separator
    :param frag: Fragment identifying the start of the inline ToC, may be
                 empty
    '''
    ans = TOC()
    base_href = href.rpartition('/')[0]
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)
    body = XPath('//h:body')(root)
    if body:
        start, reached = body[0], False
    else:
        start, reached = None, True
    if frag:
        candidates = XPath('//*[@id="%s"]'%frag)(root)
        if candidates:
            start = candidates[0]

    def node_depth(elem):
        # Number of ancestors of elem
        depth = 0
        ancestor = elem.getparent()
        while ancestor is not None:
            depth += 1
            ancestor = ancestor.getparent()
        return depth

    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == XHTML('a') and elem.get('href',
                False):
            link_href, link_frag = urldefrag(elem.get('href'))
            link_href = base_href + '/' + link_href
            text = xml2text(elem).strip()
            key = (text, link_href, link_frag)
            if key in seen:
                continue
            seen.add(key)
            links.append(key + (node_depth(elem),))
        elif elem is start:
            reached = True

    # Layer the ToC based on nesting order in the source HTML. Raw depths
    # are first mapped to consecutive levels 0, 1, 2, ...
    depth_map = {d: i for i, d in enumerate(sorted({x[-1] for x in links}))}
    current_depth = None
    parent = ans
    for text, link_href, link_frag, raw_depth in links:
        depth = depth_map[raw_depth]
        if current_depth is None:
            current_depth = 0
        elif current_depth < depth:
            # Descend a single level, into the most recently added item
            parent = parent[-1] if len(parent) > 0 else parent
            current_depth += 1
        elif current_depth > depth:
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            current_depth = depth
        parent.add_item(link_href, link_frag, text)
    return ans
|
||||
100
ebook_converter/ebooks/mobi/reader/ncx.py
Normal file
100
ebook_converter/ebooks/mobi/reader/ncx.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre import replace_entities
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||
from calibre.ebooks.mobi.reader.index import read_index
|
||||
from polyglot.builtins import iteritems, getcwd
|
||||
|
||||
# Maps an index tag number to [name of the entry field, index of the value
# to use from the values of that tag]
tag_fieldname_map = {
    tag: [fieldname, 0] for tag, fieldname in (
        (1, 'pos'),
        (2, 'len'),
        (3, 'noffs'),
        (4, 'hlvl'),
        (5, 'koffs'),
        (6, 'pos_fid'),
        (21, 'parent'),
        (22, 'child1'),
        (23, 'childn'),
        (69, 'image_index'),
        (70, 'desc_offset'),  # Description offset in cncx
        (71, 'author_offset'),  # Author offset in cncx
        (72, 'image_caption_offset'),  # Image caption offset in cncx
        (73, 'image_attr_offset'),  # Image attribution offset in cncx
    )
}

# Field values used for an index entry when the corresponding tag is absent
default_entry = {
    'pos': -1,
    'len': 0,
    'noffs': -1,
    'text': "Unknown Text",
    'hlvl': -1,
    'kind': "Unknown Class",
    'pos_fid': None,
    'parent': -1,
    'child1': -1,
    'childn': -1,
    'description': None,
    'author': None,
    'image_caption': None,
    'image_attribution': None,
}
|
||||
|
||||
|
||||
def read_ncx(sections, index, codec):
    '''
    Read the NCX index and return its entries as a list of dicts.

    Every entry starts as a copy of ``default_entry`` plus ``name`` (the
    index key) and ``num`` (the position of the entry in the index). Tags
    listed in ``tag_fieldname_map`` are copied into the corresponding
    field; tags that are offsets into the CNCX additionally set the field
    holding the looked up string.

    :param index: Number of the index record, or NULL_INDEX for no index

    :return: List of entries, empty if ``index`` is NULL_INDEX
    '''
    index_entries = []
    if index == NULL_INDEX:
        return index_entries

    # Tags whose value is a key into the cncx -> field receiving the string
    cncx_fields = {3:'text', 5:'kind', 70:'description',
                   71:'author', 72:'image_caption',
                   73:'image_attribution'}

    table, cncx = read_index(sections, index, codec)

    for num, (text, tag_map) in enumerate(iteritems(table)):
        entry = default_entry.copy()
        entry['name'] = text
        entry['num'] = num

        for tag in tag_fieldname_map:
            if tag not in tag_map:
                continue
            fieldname, i = tag_fieldname_map[tag]
            fieldvalue = tag_map[tag][i]
            if tag == 6:
                # Appears to be an idx into the KF8 elems table with an
                # offset
                fieldvalue = tuple(tag_map[tag])
            entry[fieldname] = fieldvalue
            name = cncx_fields.get(tag)
            if name is not None:
                entry[name] = cncx.get(fieldvalue, default_entry[name])
        index_entries.append(entry)

    return index_entries
|
||||
|
||||
|
||||
def build_toc(index_entries):
    '''
    Build a ``TOC`` from index entries carrying ``hlvl``, ``parent``,
    ``num``, ``href``, ``idtag`` and ``text``.

    Entries are added level by level in increasing ``hlvl`` (keeping their
    relative order inside a level), each under the item created for the
    entry numbered ``parent``; a parent of -1 means the ToC root.
    '''
    ans = TOC(base_path=getcwd())
    num_map = {-1: ans}
    # sorted() is stable, so entries of the same level keep their order
    for item in sorted(index_entries, key=lambda entry: entry['hlvl']):
        title = replace_entities(item['text'], encoding=None)
        owner = num_map[item['parent']]
        num_map[item['num']] = owner.add_item(
            item['href'], item['idtag'], title)

    # Set play orders in depth first order
    for play_order, node in enumerate(ans.flat()):
        node.play_order = play_order

    return ans
|
||||
109
ebook_converter/ebooks/mobi/tweak.py
Normal file
109
ebook_converter/ebooks/mobi/tweak.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, glob
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from calibre.ebooks.mobi.reader.headers import MetadataHeader
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.ebooks import DRMError
|
||||
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
|
||||
from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
|
||||
from calibre.customize.ui import (plugin_for_input_format,
|
||||
plugin_for_output_format)
|
||||
from calibre.utils.ipc.simple_worker import fork_job
|
||||
|
||||
|
||||
class BadFormat(ValueError):
    ''' Raised by explode() for files that cannot be tweaked because of
    their format. '''
|
||||
|
||||
|
||||
def do_explode(path, dest):
    '''
    Explode the KF8 book in the MOBI file at ``path`` into the directory
    ``dest``.

    :return: Absolute path to the OPF file returned by the Mobi8Reader
    '''
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())
            try:
                os.remove('debug-raw.html')
            except OSError:
                pass  # The debug file is optional, it may not exist

    return opf
|
||||
|
||||
|
||||
def explode(path, dest, question=lambda x:True):
    '''
    Check that the file at ``path`` can be tweaked and explode it into
    ``dest`` by running do_explode() through fork_job().

    :param question: Callable invoked with a message when the file holds
                     both KF8 and Mobi6 data; a false return value aborts

    :return: The result of the do_explode() job, or None if ``question``
             declined
    :raises BadFormat: For Topaz files, non MOBI files and MOBI files
                       without KF8 data
    :raises DRMError: If the file is encrypted
    '''
    with open(path, 'rb') as stream:
        magic = stream.read(3)
        stream.seek(0)
        if magic == b'TPZ':
            raise BadFormat(_('This is not a MOBI file. It is a Topaz file.'))

        try:
            header = MetadataHeader(stream, default_log)
        except MobiError:
            raise BadFormat(_('This is not a MOBI file.'))

        if header.encryption_type != 0:
            raise DRMError(_('This file is locked with DRM. It cannot be tweaked.'))

        kf8_type = header.kf8_type

        if kf8_type is None:
            raise BadFormat(_('This MOBI file does not contain a KF8 format '
                    'book. KF8 is the new format from Amazon. calibre can '
                    'only tweak MOBI files that contain KF8 books. Older '
                    'MOBI files without KF8 are not tweakable.'))

        # Joint files lose their Mobi6 half, so ask before going on
        if kf8_type == 'joint' and not question(_('This MOBI file contains both KF8 and '
                'older Mobi6 data. Tweaking it will remove the Mobi6 data, which '
                'means the file will not be usable on older Kindles. Are you '
                'sure?')):
            return None

    job = fork_job('calibre.ebooks.mobi.tweak', 'do_explode', args=(path,
            dest), no_output=True)
    return job['result']
|
||||
|
||||
|
||||
def set_cover(oeb):
    '''
    Set the cover metadata of ``oeb`` from its guide.

    Nothing happens if the book already has cover metadata, has no
    'cover' guide reference, or the reference's href is not in the
    manifest.
    '''
    if 'cover' in oeb.guide and not oeb.metadata['cover']:
        href = oeb.guide['cover'].href
        if href in oeb.manifest.hrefs:
            cover_item = oeb.manifest.hrefs[href]
            oeb.metadata.clear('cover')
            oeb.metadata.add('cover', cover_item.id)
|
||||
|
||||
|
||||
def do_rebuild(opf, dest_path):
    '''
    Convert the exploded book described by ``opf`` to ``dest_path`` using
    the azw3 output plugin, with the ``mobi_passthrough`` option enabled.
    '''
    log = default_log
    plumber = Plumber(opf, dest_path, log)
    plumber.setup_options()
    input_plugin = plugin_for_input_format('azw3')
    output_plugin = plugin_for_output_format('azw3')

    opts = plumber.opts
    opts.mobi_passthrough = True
    book = create_oebbook(log, opf, opts)
    set_cover(book)
    output_plugin.convert(book, dest_path, input_plugin, opts, log)
|
||||
|
||||
|
||||
def rebuild(src_dir, dest_path):
    '''
    Rebuild a book from the exploded files in ``src_dir`` into
    ``dest_path`` by running do_rebuild() through fork_job(). The first
    ``*.opf`` file found in ``src_dir`` is used.

    :raises ValueError: If ``src_dir`` contains no OPF file
    '''
    candidates = glob.glob(os.path.join(src_dir, '*.opf'))
    if not candidates:
        raise ValueError('No OPF file found in %s'%src_dir)
    # For debugging, uncomment the following two lines
    # def fork_job(a, b, args=None, no_output=True):
    #     do_rebuild(*args)
    job_args = (candidates[0], dest_path)
    fork_job('calibre.ebooks.mobi.tweak', 'do_rebuild', args=job_args,
            no_output=True)
|
||||
|
||||
646
ebook_converter/ebooks/mobi/utils.py
Normal file
646
ebook_converter/ebooks/mobi/utils.py
Normal file
@@ -0,0 +1,646 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, string, zlib, os
|
||||
from collections import OrderedDict
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
|
||||
from calibre.utils.imghdr import what
|
||||
from calibre.ebooks import normalize
|
||||
from polyglot.builtins import unicode_type, range, as_bytes, map
|
||||
from tinycss.color3 import parse_color_string
|
||||
|
||||
# Default upper limit, in bytes, for rescaled images (10 MiB)
IMAGE_MAX_SIZE = 10 * 1024 ** 2
# Text record size (uncompressed)
RECORD_SIZE = 4096  # 0x1000
|
||||
|
||||
|
||||
class PolyglotDict(dict):
    '''
    A dict in which unicode keys are UTF-8 encoded before being used for
    item assignment, item lookup and ``in`` tests. Other dict methods are
    not overridden.
    '''

    @staticmethod
    def _encoded(key):
        # Unicode keys are stored as UTF-8 bytes, other keys are untouched
        return key.encode('utf-8') if isinstance(key, unicode_type) else key

    def __setitem__(self, key, val):
        dict.__setitem__(self, self._encoded(key), val)

    def __getitem__(self, key):
        return dict.__getitem__(self, self._encoded(key))

    def __contains__(self, key):
        return dict.__contains__(self, self._encoded(key))
|
||||
|
||||
|
||||
def decode_string(raw, codec='utf-8', ordt_map=None):
    '''
    Decode a length prefixed string: the first byte of ``raw`` is the
    number of bytes that follow.

    :param raw: Raw binary data as a bytestring
    :param codec: Used to decode the bytes when no ``ordt_map`` is given
    :param ordt_map: If truthy, every byte value is looked up in this
                     mapping and the results are joined

    :return: The string and the number of bytes it occupies according to
             the length byte (length + 1)
    '''
    (length,) = struct.unpack(b'>B', raw[0:1])
    consumed = length + 1
    payload = raw[1:consumed]
    if ordt_map:
        text = ''.join(ordt_map[x] for x in bytearray(payload))
    else:
        text = payload.decode(codec)
    return text, consumed
|
||||
|
||||
|
||||
def decode_hex_number(raw, codec='utf-8'):
    '''
    Decode a variable length number stored as a length prefixed string of
    hexadecimal digits: the first byte gives the number of bytes that
    follow, and those bytes are the hexadecimal representation of the
    number.

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the digits

    :return: The number and the number of bytes from raw that the number
             occupies.
    '''
    hexdigits, consumed = decode_string(raw, codec=codec)
    number = int(hexdigits, 16)
    return number, consumed
|
||||
|
||||
|
||||
def encode_string(raw):
    '''
    Return ``raw`` (converted with as_bytes()) prefixed by a single byte
    holding its length in bytes.
    '''
    payload = bytearray(as_bytes(raw))
    # bytearray() rejects lengths that do not fit into a single byte
    prefix = bytearray([len(payload)])
    return bytes(prefix + payload)
|
||||
|
||||
|
||||
def encode_number_as_hex(num):
    '''
    Encode num as a variable length encoded hexadecimal number. Returns the
    bytestring containing the encoded number. These
    numbers have the first byte which tells the number of bytes that follow.
    The bytes that follow are simply the hexadecimal representation of the
    number, in upper case and padded to an even number of digits.

    :raises ValueError: If num is negative
    '''
    if num < 0:
        raise ValueError('Cannot encode negative numbers as hex')
    # %X formatting, unlike slicing the output of hex(), never includes the
    # "L" suffix Python 2 appends to long integers
    digits = ('%X' % num).encode('ascii')
    if len(digits) % 2 != 0:
        digits = b'0' + digits
    return encode_string(digits)
|
||||
|
||||
|
||||
def encint(value, forward=True):
    '''
    Some parts of the Mobipocket format encode data as variable-width integers.
    These integers are represented big-endian with 7 bits per byte in bits 1-7.
    They may be either forward-encoded, in which case only the last byte has
    bit 8 set, or backward-encoded, in which case only the first byte has
    bit 8 set.
    For example, the number 0x11111 = 0b10001000100010001 would be represented
    forward-encoded as:

        0x04 0x22 0x91 = 0b100 0b100010 0b10010001

    And backward-encoded as:

        0x84 0x22 0x11 = 0b10000100 0b100010 0b10001

    This function encodes the integer ``value`` as a variable width integer and
    returns the bytestring corresponding to it.

    :param forward: Select forward (True) or backward (False) encoding

    :raises ValueError: If ``value`` is negative
    '''
    if value < 0:
        raise ValueError('Cannot encode negative numbers as vwi')
    # Collect the 7 bit groups, most significant group first
    septets = bytearray()
    remaining = value
    while True:
        septets.insert(0, remaining & 0x7F)
        remaining >>= 7
        if remaining == 0:
            break
    # Flag the byte that terminates the number in the reading direction
    marker = -1 if forward else 0
    septets[marker] |= 0x80
    return bytes(septets)
|
||||
|
||||
|
||||
def decint(raw, forward=True):
    '''
    Read a variable width integer from the bytestring or bytearray raw and return the
    integer and the number of bytes read. If forward is True bytes are read
    from the start of raw, otherwise from the end of raw.

    Reading stops after the first byte that has its high bit set, or when
    raw is exhausted.

    This function is the inverse of encint above, see its docs for more
    details.
    '''
    stream = bytearray(raw)
    ordered = stream if forward else reversed(stream)
    septets = []
    for octet in ordered:
        septets.append(octet & 0x7F)
        if octet & 0x80:
            break
    if not forward:
        # Bytes were collected back to front, restore big-endian order
        septets.reverse()

    val = 0
    for septet in septets:
        val = (val << 7) | septet
    return val, len(septets)
|
||||
|
||||
|
||||
def test_decint(num):
    '''
    Check that ``num`` survives an encint()/decint() round trip in both
    directions.

    :raises ValueError: If the decoded value or the number of bytes
                        consumed differ from what was encoded
    '''
    for forward in (True, False):
        encoded = encint(num, forward=forward)
        expected = (num, len(encoded))
        if expected != decint(encoded, forward=forward):
            raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
                num, forward, expected, decint(encoded, forward=forward)))
|
||||
|
||||
|
||||
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Convert image setting all transparent pixels to white and changing format
    to JPEG. Try to ensure the resultant image has a byte size of at most
    maxsizeb, first by lowering the compression quality and then by
    repeatedly shrinking the image.

    If dimen is not None, generate a thumbnail of
    width=dimen, height=dimen or width, height = dimen (depending on the type
    of dimen)

    Returns the image as a bytestring
    '''
    if dimen is None:
        # Replace transparent pixels with white pixels and convert to JPEG
        data = save_cover_data_to(data)
    else:
        width, height = dimen if hasattr(dimen, '__len__') else (dimen, dimen)
        data = scale_image(data, width=width, height=height, compression_quality=90)[-1]
    if len(data) <= maxsizeb:
        return data

    # First try re-compressing the same image at ever lower quality
    source = data
    quality = 90
    while len(data) > maxsizeb and quality >= 5:
        data = image_to_data(image_from_data(source), compression_quality=quality)
        quality -= 5
    if len(data) <= maxsizeb:
        return data

    # Still too big: shrink the previous result by an ever smaller factor
    scale = 0.9
    while len(data) > maxsizeb and scale >= 0.05:
        img = image_from_data(data)
        new_width = int(scale*img.width())
        new_height = int(scale*img.height())
        img = resize_image(img, new_width, new_height)
        data = image_to_data(img, compression_quality=quality)
        scale -= 0.05
    return data
|
||||
|
||||
|
||||
def get_trailing_data(record, extra_data_flags):
    '''
    Given a text record as a bytestring and the extra data flags from the MOBI
    header, return the trailing data as a dictionary, mapping bit number to
    data as bytestring. Also returns the record - all trailing data.

    Bit 0 of the flags denotes trailing multibyte characters, stored under
    key 0. Every higher set bit denotes one entry that ends in a backward
    encoded size.

    :return: Trailing data, record - trailing data
    '''
    trailing = OrderedDict()
    bit = 0
    pending = extra_data_flags >> 1
    while pending:
        bit += 1
        if pending & 0b1:
            size, size_len = decint(record, forward=False)
            if size > size_len:
                trailing[bit] = record[-size:-size_len]
            # NOTE(review): a decoded size of 0 makes this slice empty,
            # dropping the whole record - confirm that cannot happen
            record = record[:-size]
        pending >>= 1

    # Read multibyte chars if any
    if extra_data_flags & 0b1:
        # Only the first two bits are used for the size since there can
        # never be more than 3 trailing multibyte chars
        size = (ord(record[-1:]) & 0b11) + 1
        if size > 1:
            trailing[0] = record[-size:-1]
        record = record[:-size]
    return trailing, record
|
||||
|
||||
|
||||
def encode_trailing_data(raw):
    '''
    Given some data in the bytestring raw, return a bytestring of the form

        <data><size>

    where size is a backwards encoded vwi whose value is the length of the
    entire returned bytestring. data is the bytestring passed in as raw.

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size counts its own bytes, so grow the assumed width of the size
    # field until the encoded size really has that width
    size_len = 1
    size_field = encint(len(raw) + size_len, forward=False)
    while len(size_field) != size_len:
        size_len += 1
        size_field = encint(len(raw) + size_len, forward=False)
    return raw + size_field
|
||||
|
||||
|
||||
def encode_fvwi(val, flags, flag_size=4):
    '''
    Pack val together with the lowest flag_size bits of flags into a single
    number (val in the high bits, the flag bits below it) and return that
    number encoded with encint(). This encoding is used in the trailing byte
    sequences for indexing.
    '''
    flag_mask = (1 << flag_size) - 1
    return encint((val << flag_size) | (flags & flag_mask))
|
||||
|
||||
|
||||
def decode_fvwi(byts, flag_size=4):
    '''
    Inverse of encode_fvwi(). Decodes one number from the start of byts and
    splits it into the value (high bits) and the flags (lowest flag_size
    bits).

    :return: number, flags, consumed
    '''
    packed, consumed = decint(bytes(byts))
    flag_mask = (1 << flag_size) - 1
    return packed >> flag_size, packed & flag_mask, consumed
|
||||
|
||||
|
||||
def decode_tbs(byts, flag_size=4):
    '''
    Decode one entry of a trailing byte sequence for indexing.

    An entry starts with an fvwi number (see encode_fvwi) whose flags say
    which additional fields follow it. The fields are read in this fixed
    order: 0b0010 (a number), 0b0100 (a single byte), 0b0001 (a number).
    Flag 0b1000 carries no data and is only honoured when flag_size > 3.

    :return: The fvwi number, a dictionary mapping flag bits to the
        associated data and the total number of bytes consumed.
    '''
    rest = bytes(byts)
    val, flags, consumed = decode_fvwi(rest, flag_size=flag_size)
    rest = rest[consumed:]
    extra = {}

    if flag_size > 3 and flags & 0b1000:
        extra[0b1000] = True

    if flags & 0b0010:
        number, used = decint(rest)
        extra[0b0010] = number
        rest = rest[used:]
        consumed += used

    if flags & 0b0100:
        extra[0b0100] = ord(rest[0:1])
        rest = rest[1:]
        consumed += 1

    if flags & 0b0001:
        number, used = decint(rest)
        extra[0b0001] = number
        consumed += used

    return val, extra, consumed
|
||||
|
||||
|
||||
def encode_tbs(val, extra, flag_size=4):
    '''
    Inverse of decode_tbs(). The keys of the extra dict are OR-ed together to
    form the flags of the leading fvwi. The values stored under 0b0010 (a
    number), 0b0100 (a single byte value) and 0b0001 (a number) are then
    appended in that order.
    '''
    combined = 0
    for bit in extra:
        combined |= bit

    pieces = [encode_fvwi(val, combined, flag_size=flag_size)]
    if 0b0010 in extra:
        pieces.append(encint(extra[0b0010]))
    if 0b0100 in extra:
        pieces.append(bytes(bytearray([extra[0b0100]])))
    if 0b0001 in extra:
        pieces.append(encint(extra[0b0001]))
    return b''.join(pieces)
|
||||
|
||||
|
||||
def utf8_text(text):
    '''
    Return text as stripped, normalized utf-8 bytes. When text is empty, None
    or only whitespace the utf-8 encoded translation of 'Unknown' is returned
    instead. Bytestring input is decoded as utf-8 (with replacement of
    undecodable bytes) before being normalized.
    '''
    stripped = text.strip() if text else text
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, unicode_type):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
|
||||
|
||||
|
||||
def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw followed by as many copies of pad as are needed to bring
    len(raw) up to the next multiple of `multiple`. raw is returned unchanged
    when its length already is such a multiple.
    '''
    shortfall = -len(raw) % multiple
    return raw + pad * shortfall if shortfall else raw
|
||||
|
||||
|
||||
def detect_periodical(toc, log=None):
    '''
    Return True if the TOC object toc has the structure kindlegen requires
    to generate a periodical: the first entry has class "periodical" and
    every descendant has the class demanded by its depth (1: article,
    2: section, 3: periodical), with no descendant deeper than 3.

    When log is given, the reason for rejecting a TOC is written to
    log.debug().
    '''
    if toc.count() < 1 or toc[0].klass != 'periodical':
        return False

    # depth -> (required class, message logged when it does not match)
    rules = {
        1: ('article',
            'Not a periodical: Deepest node does not have class="article"'),
        2: ('section',
            'Not a periodical: Second deepest node does not have'
            ' class="section"'),
        3: ('periodical',
            'Not a periodical: Third deepest node does not have'
            ' class="periodical"'),
    }

    def reject(reason):
        if log is not None:
            log.debug(reason)
        return False

    for node in toc.iterdescendants():
        depth = node.depth()
        if depth > 3:
            return reject('Not a periodical: Has nodes of depth > 3')
        rule = rules.get(depth)
        if rule is not None and node.klass != rule[0]:
            return reject(rule[1])
    return True
|
||||
|
||||
|
||||
def count_set_bits(num):
    '''
    Return the number of 1 bits in the binary representation of the absolute
    value of the integer num.
    '''
    return bin(abs(num)).count('1')
|
||||
|
||||
|
||||
def to_base(num, base=32, min_num_digits=None):
    '''
    Return the integer num written in the given base, using the digits 0-9
    followed by A-Z. Negative numbers get a leading '-'. When min_num_digits
    is given, the digits are left padded with '0' up to that many digits (the
    sign is not counted).
    '''
    if num == 0:
        return '0' if min_num_digits is None else '0' * min_num_digits

    alphabet = string.digits + string.ascii_uppercase
    negative = num < 0
    remaining = -num if negative else num

    # Collected least significant digit first, reversed at the end
    out = []
    while remaining:
        remaining, digit = divmod(remaining, base)
        out.append(alphabet[digit])
    if min_num_digits is not None and len(out) < min_num_digits:
        out.extend('0' * (min_num_digits - len(out)))
    if negative:
        out.append('-')
    return ''.join(reversed(out))
|
||||
|
||||
|
||||
def mobify_image(data):
    '''
    Return data converted to GIF if it is a PNG image (the Kindle cannot
    display some PNG images). Any other image data is returned unchanged.
    '''
    if what(None, data) != 'png':
        return data
    return png_data_to_gif_data(data)
|
||||
|
||||
# Font records {{{
|
||||
|
||||
|
||||
def read_font_record(data, extent=1040):
    '''
    Return the font encoded in the MOBI FONT record represented by data.
    The return value in a dict with fields raw_data, font_data, err, ext,
    headers, encrypted.

    :param data: The complete FONT record as a bytestring.
    :param extent: The number of obfuscated bytes. So far I have only
        encountered files with 1040 obfuscated bytes. If you encounter an
        obfuscated record for which this function fails, try different extent
        values (easily automated). None means the whole font data is
        obfuscated.

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, otf for OpenType, dat for unknown
    and failed if an error occurred)
    headers is the dict of decoded header fields from the font record or None
    if decoding the header failed
    encrypted is True if the record was XOR obfuscated

    Malformed records never raise, they are reported via err.
    '''
    # Format:
    # bytes  0 -  3:  'FONT'
    # bytes  4 -  7:  Uncompressed size
    # bytes  8 - 11:  flags
    #                   bit 1 - zlib compression
    #                   bit 2 - XOR obfuscated
    # bytes 12 - 15:  offset to start of compressed data
    # bytes 16 - 19:  length of XOR string
    # bytes 20 - 23:  offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except (struct.error, TypeError):
        # Record too short for the header, or not a bytes-like object
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        key = bytearray(data[xor_start:xor_start+xor_len])
        if not key or len(key) != xor_len:
            # An empty key would divide by zero below and a truncated one
            # would be indexed past its end
            ans['err'] = 'Invalid XOR key in obfuscated font record'
            return ans
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))

        for n in range(extent):
            buf[n] ^= key[n%xor_len]  # XOR of buf and key

        font_data = bytes(buf)
        ans['encrypted'] = True

    if flags & 0b1:
        # ZLIB compressed data
        try:
            font_data = zlib.decompress(font_data)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

    if len(font_data) != usize:
        ans['err'] = 'Uncompressed font size mismatch'
        return ans

    ans['font_data'] = font_data
    sig = font_data[:4]
    ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
            else 'otf' if sig == b'OTTO' else 'dat')

    return ans
|
||||
|
||||
|
||||
def write_font_record(data, obfuscate=True, compress=True):
    '''
    Build a FONT record from the ttf/otf font in data. The layout of the
    record is described in read_font_record().

    :param obfuscate: XOR the first 1040 bytes of the payload with a random
        20 byte key. Silently skipped when the (possibly compressed) payload
        is shorter than 1040 bytes.
    :param compress: zlib compress the font before storing it.
    '''
    obfuscated_extent = 1040
    key_len = 20

    usize = len(data)
    flags = 0
    xor_key = b''

    if compress:
        data = zlib.compress(data, 9)
        flags |= 0b1

    if obfuscate and len(data) >= obfuscated_extent:
        xor_key = os.urandom(key_len)
        key = bytearray(xor_key)
        scrambled = bytearray(data)
        for pos in range(obfuscated_extent):
            scrambled[pos] ^= key[pos % key_len]
        data = bytes(scrambled)
        flags |= 0b10

    # The key follows the 4 byte 'FONT' magic and the five header fields, the
    # font payload follows the key
    key_start = 4 + struct.calcsize(b'>5L')
    data_start = key_start + len(xor_key)
    fields = struct.pack(b'>5L', usize, flags, data_start, len(xor_key),
            key_start)

    return b'FONT' + fields + xor_key + data
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
def create_text_record(text):
    '''
    Read the next Palmdoc record of (at most) RECORD_SIZE bytes from the
    text file object, starting at its current position.

    If the record boundary falls inside a multibyte utf-8 character, the
    bytes from the following record that complete this character are
    returned as well.

    Returns data, overlap: where both are byte strings. overlap is the
    extra bytes needed to complete the truncated multibyte character. On
    return the file object is positioned at the start of the next record.
    '''
    start = text.tell()
    text.seek(0, 2)
    # npos is the position of the next record
    npos = min((start + RECORD_SIZE, text.tell()))

    # Grow a window backwards from npos until it holds at least one
    # decodable utf-8 character
    tail = b''
    while not tail.decode('utf-8', 'ignore'):
        size = len(tail) + 1
        text.seek(npos - size)
        tail = text.read(size)

    # tail now has one valid utf-8 char and possibly some bytes that belong
    # to a truncated char

    # Number of bytes from the next record needed to complete the last
    # character in this record
    extra = 0
    try:
        tail.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        # There are some truncated bytes in tail, extend the window forwards
        # one byte at a time until it decodes cleanly
        prev = len(tail)
        complete = False
        while not complete:
            text.seek(npos - prev)
            tail = text.read(len(tail) + 1)
            try:
                tail.decode('utf-8')
            except UnicodeDecodeError:
                continue
            complete = True
        extra = len(tail) - prev

    text.seek(start)
    data = text.read(RECORD_SIZE)
    overlap = text.read(extra)
    text.seek(npos)

    return data, overlap
|
||||
|
||||
|
||||
class CNCX(object): # {{{

    '''
    Builds the CNCX records, which hold all the strings of an index. Every
    string is stored as <vwi string size><utf-8 encoded string>.

    After construction, records is the list of (aligned) record bytestrings
    and indexing the object with one of the strings gives its offset.
    '''

    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        self.strings = OrderedDict((s, 0) for s in strings)
        self.records = []

        # kindlegen appears to use 1024, PDB limit is 0x10000
        record_limit = 0x10000 - 1024
        pending = BytesIO()
        offset = 0
        for key in self.strings:
            encoded = utf8_text(key[:self.MAX_STRING_LENGTH])
            entry = encint(len(encoded)) + encoded
            if pending.tell() + len(entry) > record_limit:
                # Current record is full, start a new one. Offsets into
                # record number n start at n * 0x10000
                self.records.append(align_block(pending.getvalue()))
                pending = BytesIO()
                offset = len(self.records) * 0x10000
            pending.write(entry)
            self.strings[key] = offset
            offset += len(entry)

        leftover = pending.getvalue()
        if leftover:
            self.records.append(align_block(leftover))

    def __getitem__(self, string):
        return self.strings[string]

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def __len__(self):
        return len(self.records)
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
def is_guide_ref_start(ref):
    '''
    Return True if the guide reference ref marks the start of the book, that
    is, if its title is 'start' or its type is one of 'start', 'other.start'
    or 'text' (both compared case insensitively).

    A missing (None or empty) title or type never matches and does not raise.
    '''
    title = (ref.title or '').lower()
    ref_type = (ref.type or '').lower()
    return title == 'start' or ref_type in {'start', 'other.start', 'text'}
|
||||
|
||||
|
||||
def convert_color_for_font_tag(val):
    '''
    Convert the color specification val into the #rrggbb form.

    val is returned unchanged when parse_color_string() cannot parse it
    (returns None) or reports it as 'currentColor'. Otherwise the first three
    components of the parsed color are clamped to the range [0, 1] and scaled
    to two hex digits each.
    '''
    rgba = parse_color_string(unicode_type(val or ''))
    if rgba is None or rgba == 'currentColor':
        return val

    def clamp(x):
        # Negative components must become 0, otherwise '%02x' would emit a
        # minus sign and produce an invalid color
        return min(max(0, x), 1)

    return '#' + ''.join('%02x' % int(clamp(x) * 255) for x in rgba[:3])
|
||||
14
ebook_converter/ebooks/mobi/writer2/__init__.py
Normal file
14
ebook_converter/ebooks/mobi/writer2/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
# Identifiers for the text compression schemes
# NOTE(review): presumably these are the values written to the compression
# field of the PalmDOC header - confirm against the writer code
UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480  # == 0x4448, the ASCII codes of 'D' and 'H'
# 63 KiB. NOTE(review): presumably the largest image, in bytes, that is
# stored in a single record - confirm with the callers
PALM_MAX_IMAGE_SIZE = 63 * 1024
|
||||
|
||||
158
ebook_converter/ebooks/mobi/writer2/resources.py
Normal file
158
ebook_converter/ebooks/mobi/writer2/resources.py
Normal file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
|
||||
from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
|
||||
write_font_record)
|
||||
from calibre.ebooks import generate_masthead
|
||||
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
# A tiny GIF89a image (1x1 according to its header bytes) carrying the comment
# ' calibre-placeholder-gif-for-azw3'. Resources.serialize() stores it in place
# of every image record that is not actually used.
PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\xf0\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00!\xfe calibre-placeholder-gif-for-azw3\x00,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;' # noqa
|
||||
|
||||
|
||||
class Resources(object):

    '''
    Collects the resource records (masthead, images, cover thumbnail and
    optionally fonts) of an OEB book.

    records is the list of record bytestrings, item_map maps the href of a
    manifest item to the 1-based number of its record, mime_map maps image
    hrefs to their mime type. cover_offset and thumbnail_offset are the
    0-based indices of the cover and thumbnail records, or None.
    '''

    def __init__(self, oeb, opts, is_periodical, add_fonts=False,
            process_images=True):
        self.oeb, self.log, self.opts = oeb, oeb.log, opts
        self.is_periodical = is_periodical
        self.process_images = process_images

        self.item_map = {}
        self.records = []
        self.mime_map = {}
        self.masthead_offset = 0
        # 0-based indices of image records that must never be replaced by the
        # placeholder (masthead, cover, thumbnail)
        self.used_image_indices = set()
        # 0-based indices of all image records
        self.image_indices = set()
        self.cover_offset = self.thumbnail_offset = None
        self.has_fonts = False

        self.add_resources(add_fonts)

    def process_image(self, data):
        '''
        Return the image data converted for inclusion in the book. Data is
        returned untouched when image processing is disabled.

        If the conversion fails for a PNG image, the image is run through
        optimize_png() and the conversion is retried once. Failures for
        other formats, and a failure of the retry, are raised.
        '''
        if not self.process_images:
            return data
        func = mobify_image if self.opts.mobi_keep_original_images else rescale_image
        try:
            return func(data)
        except Exception:
            if 'png' != what(None, data):
                raise
            with PersistentTemporaryFile(suffix='.png') as pt:
                pt.write(data)
            try:
                from calibre.utils.img import optimize_png
                optimize_png(pt.name)
                # Close the file before it is removed below
                with lopen(pt.name, 'rb') as f:
                    data = f.read()
            finally:
                os.remove(pt.name)
            return func(data)

    def add_resources(self, add_fonts):
        '''
        Create the records for the masthead, all raster images in the
        manifest, the cover thumbnail and, if add_fonts is True, all ttf/otf
        fonts. Images that cannot be processed are skipped with a warning.
        '''
        oeb = self.oeb
        oeb.logger.info('Serializing resources...')
        # 1-based number of the next record to be added
        index = 1

        mh_href = None
        if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
            mh_href = oeb.guide['masthead'].href
            # Reserve record 0, it is filled in when the masthead item is
            # reached in the loop over the manifest below.
            # NOTE(review): it stays None if that item is missing, is not a
            # raster image or cannot be processed - confirm the writers cope
            self.records.append(None)
            index += 1
            self.used_image_indices.add(0)
            self.image_indices.add(0)
        elif self.is_periodical:
            # Generate a default masthead
            data = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
            self.records.append(data)
            self.used_image_indices.add(0)
            self.image_indices.add(0)
            index += 1

        cover_href = self.cover_offset = self.thumbnail_offset = None
        if (oeb.metadata.cover and
                unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode_type(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            cover_href = item.href

        for item in self.oeb.manifest.values():
            if item.media_type not in OEB_RASTER_IMAGES:
                continue
            try:
                data = self.process_image(item.data)
            except Exception:
                self.log.warn('Bad image file %r' % item.href)
                continue
            else:
                if mh_href and item.href == mh_href:
                    self.records[0] = data
                    continue

                self.image_indices.add(len(self.records))
                self.records.append(data)
                self.item_map[item.href] = index
                self.mime_map[item.href] = 'image/%s'%what(None, data)
                index += 1

                if cover_href and item.href == cover_href:
                    self.cover_offset = self.item_map[item.href] - 1
                    self.used_image_indices.add(self.cover_offset)
                    # The thumbnail is made from the original cover data and
                    # stored in the record right after the cover
                    try:
                        data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
                            maxsizeb=MAX_THUMB_SIZE)
                    except Exception:
                        self.log.warn('Failed to generate thumbnail')
                    else:
                        self.image_indices.add(len(self.records))
                        self.records.append(data)
                        self.thumbnail_offset = index - 1
                        self.used_image_indices.add(self.thumbnail_offset)
                        index += 1
            finally:
                item.unload_data_from_memory()

        if add_fonts:
            for item in self.oeb.manifest.values():
                if item.href and item.href.rpartition('.')[-1].lower() in {
                        'ttf', 'otf'} and isinstance(item.data, bytes):
                    self.records.append(write_font_record(item.data))
                    self.item_map[item.href] = len(self.records)
                    self.has_fonts = True

    def add_extra_images(self):
        '''
        Add any images that were created after the call to add_resources()
        '''
        for item in self.oeb.manifest.values():
            if (item.media_type not in OEB_RASTER_IMAGES or item.href in self.item_map):
                continue
            try:
                data = self.process_image(item.data)
            except Exception:
                self.log.warn('Bad image file %r' % item.href)
            else:
                self.records.append(data)
                self.item_map[item.href] = len(self.records)
            finally:
                item.unload_data_from_memory()

    def serialize(self, records, used_images):
        '''
        Append all resource records to the list records. Image records that
        are neither in used_images (a container of hrefs) nor marked as
        always used are replaced by PLACEHOLDER_GIF first.
        '''
        used_image_indices = self.used_image_indices | {
            v-1 for k, v in iteritems(self.item_map) if k in used_images}
        for i in self.image_indices-used_image_indices:
            self.records[i] = PLACEHOLDER_GIF
        records.extend(self.records)

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__
|
||||
Reference in New Issue
Block a user