1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-27 13:53:32 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from struct import unpack_from, error
from calibre.utils.imghdr import what
def find_imgtype(data):
return what(None, data) or 'unknown'
class Container(object):
def __init__(self, data):
self.is_image_container = False
self.resource_index = 0
if len(data) > 60 and data[48:52] == b'EXTH':
length, num_items = unpack_from(b'>LL', data, 52)
pos = 60
while pos < 60 + length - 8:
try:
idx, size = unpack_from(b'>LL', data, pos)
except error:
break
pos += 8
size -= 8
if size < 0:
break
if idx == 539:
self.is_image_container = data[pos:pos+size] == b'application/image'
break
pos += size
def load_image(self, data):
self.resource_index += 1
if self.is_image_container:
data = data[12:]
imgtype = find_imgtype(data)
if imgtype != 'unknown':
return data, imgtype
return None, None

View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from calibre import replace_entities
from calibre.utils.date import parse_date
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.localization import canonicalize_lang
from calibre.utils.config_base import tweaks
from polyglot.builtins import unicode_type
NULL_INDEX = 0xffffffff
def uniq(vals):
''' Remove all duplicates from vals, while preserving order. '''
vals = vals or ()
seen = set()
seen_add = seen.add
return list(x for x in vals if x not in seen and not seen_add(x))
class EXTHHeader(object): # {{{
def __init__(self, raw, codec, title):
self.doctype = raw[:4]
self.length, self.num_items = struct.unpack('>LL', raw[4:12])
raw = raw[12:]
pos = 0
self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
self.has_fake_cover = True
self.start_offset = None
left = self.num_items
self.kf8_header = None
self.uuid = self.cdetype = None
self.page_progression_direction = None
self.primary_writing_mode = None
self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))
while left > 0:
left -= 1
idx, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if idx >= 100 and idx < 200:
self.process_metadata(idx, content, codec)
elif idx == 203:
self.has_fake_cover = bool(struct.unpack('>L', content)[0])
elif idx == 201:
co, = struct.unpack('>L', content)
if co < NULL_INDEX:
self.cover_offset = co
elif idx == 202:
self.thumbnail_offset, = struct.unpack('>L', content)
elif idx == 501:
try:
self.cdetype = content.decode('ascii')
except UnicodeDecodeError:
self.cdetype = None
# cdetype
if content == b'EBSP':
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.append(_('Sample Book'))
elif idx == 502:
# last update time
pass
elif idx == 503: # Long title
# Amazon seems to regard this as the definitive book title
# rather than the title from the PDB header. In fact when
# sending MOBI files through Amazon's email service if the
# title contains non ASCII chars or non filename safe chars
# they are messed up in the PDB header
try:
title = self.decode(content)
except Exception:
pass
elif idx == 524: # Lang code
try:
lang = content.decode(codec)
lang = canonicalize_lang(lang)
if lang:
self.mi.language = lang
except Exception:
pass
elif idx == 525:
try:
pwm = content.decode(codec)
if pwm:
self.primary_writing_mode = pwm
except Exception:
pass
elif idx == 527:
try:
ppd = content.decode(codec)
if ppd:
self.page_progression_direction = ppd
except Exception:
pass
# else:
# print 'unknown record', idx, repr(content)
if title:
self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
def process_metadata(self, idx, content, codec):
if idx == 100:
if self.mi.is_null('authors'):
self.mi.authors = []
au = clean_xml_chars(self.decode(content).strip())
# Author names in Amazon MOBI files are usually in LN, FN format,
# try to detect and auto-correct that.
m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
if m is not None:
if tweaks['author_sort_copy_method'] != 'copy':
self.mi.authors.append(m.group(2) + ' ' + m.group(1))
else:
self.mi.authors.append(m.group())
if self.mi.is_null('author_sort'):
self.mi.author_sort = m.group()
else:
self.mi.authors.append(au)
elif idx == 101:
self.mi.publisher = clean_xml_chars(self.decode(content).strip())
if self.mi.publisher in {'Unknown', _('Unknown')}:
self.mi.publisher = None
elif idx == 103:
self.mi.comments = clean_xml_chars(self.decode(content).strip())
elif idx == 104:
raw = check_isbn(self.decode(content).strip().replace('-', ''))
if raw:
self.mi.isbn = raw
elif idx == 105:
if not self.mi.tags:
self.mi.tags = []
self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
self.mi.tags = uniq(self.mi.tags)
elif idx == 106:
try:
self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
except Exception:
pass
elif idx == 108:
self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
elif idx == 109:
self.mi.rights = clean_xml_chars(self.decode(content).strip())
elif idx == 112: # dc:source set in some EBSP amazon samples
try:
content = content.decode(codec).strip()
isig = 'urn:isbn:'
if content.lower().startswith(isig):
raw = check_isbn(content[len(isig):])
if raw and not self.mi.isbn:
self.mi.isbn = raw
elif content.startswith('calibre:'):
# calibre book uuid is stored here by recent calibre
# releases
cid = content[len('calibre:'):]
if cid:
self.mi.application_id = self.mi.uuid = cid
except:
pass
elif idx == 113: # ASIN or other id
try:
self.uuid = content.decode('ascii')
self.mi.set_identifier('mobi-asin', self.uuid)
except Exception:
self.uuid = None
elif idx == 116:
self.start_offset, = struct.unpack(b'>L', content)
elif idx == 121:
self.kf8_header, = struct.unpack(b'>L', content)
if self.kf8_header == NULL_INDEX:
self.kf8_header = None
# else:
# print 'unhandled metadata record', idx, repr(content)
# }}}
class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == b'TEXTREAD':
self.codepage = 1252
if len(raw) <= 16:
self.codec = 'cp1252'
self.extra_flags = 0
self.title = _('Unknown')
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
self.first_image_index = -1
self.mobi_version = 1
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, \
self.version = struct.unpack('>LLLLL', raw[20:40])
try:
self.codec = {
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# Some KF8 files have header length == 264 (generated by kindlegen
# 2.9?). See https://bugs.launchpad.net/bugs/1179144
max_header_length = 500 # We choose 500 for future versions of kindlegen
if (ident == b'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or
(try_extra_data_fix and self.length == 0xE4)):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
if self.compression_type == b'DH':
self.huff_offset, self.huff_number = struct.unpack('>LL',
raw[0x70:0x78])
toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
tend = toff + tlen
self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode_type):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
try:
self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
self.title)
self.exth.mi.uid = self.unique_id
if self.exth.mi.is_null('language'):
try:
self.exth.mi.language = mobi2iana(langid, sublangid)
except:
self.log.exception('Unknown language code')
except:
self.log.exception('Invalid EXTH header')
self.exth_flag = 0
self.ncxidx = NULL_INDEX
if len(raw) >= 0xF8:
self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
# Ancient PRC files from Baen can have random values for
# mobi_version, so be conservative
if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
self.dividx, self.skelidx, self.datpidx, self.othidx = \
struct.unpack_from(b'>4L', raw, 0xF8)
# need to use the FDST record to find out how to properly
# unpack the raw_ml into pieces it is simply a table of start
# and end locations for each flow piece
self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
# if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1:
self.fdstidx = NULL_INDEX
else: # Null values
self.skelidx = self.dividx = self.othidx = self.fdstidx = \
NULL_INDEX
class MetadataHeader(BookHeader):
def __init__(self, stream, log):
self.stream = stream
self.ident = self.identity()
self.num_sections = self.section_count()
if self.num_sections >= 2:
header = self.header()
BookHeader.__init__(self, header, self.ident, None, log)
else:
self.exth = None
@property
def kf8_type(self):
if (self.mobi_version == 8 and getattr(self, 'skelidx', NULL_INDEX) !=
NULL_INDEX):
return 'standalone'
kf8_header_index = getattr(self.exth, 'kf8_header', None)
if kf8_header_index is None:
return None
try:
if self.section_data(kf8_header_index-1) == b'BOUNDARY':
return 'joint'
except Exception:
pass
return None
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8).upper()
if ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise MobiError('Unknown book type: %s' % ident)
return ident
def section_count(self):
self.stream.seek(76)
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
section_headers = []
# First section with the metadata
section_headers.append(self.section_offset(0))
# Second section used to get the length of the first
section_headers.append(self.section_offset(1))
end_off = section_headers[1]
off = section_headers[0]
self.stream.seek(off)
return self.stream.read(end_off - off)
def section_data(self, number):
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
try:
return self.stream.read(end - start)
except OverflowError:
self.stream.seek(start)
return self.stream.read()

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict, namedtuple
from calibre.ebooks.mobi.utils import (decint, count_set_bits,
decode_string)
from polyglot.builtins import iteritems, range, zip
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
INDEX_HEADER_FIELDS = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
) + tuple('unknown%d'%i for i in range(27)) + ('ocnt', 'oentries',
'ordt1', 'ordt2', 'tagx')
class InvalidFile(ValueError):
pass
def check_signature(data, signature):
if data[:len(signature)] != signature:
raise InvalidFile('Not a valid %r section'%signature)
class NotAnINDXRecord(InvalidFile):
pass
class NotATAGXSection(InvalidFile):
pass
def format_bytes(byts):
byts = bytearray(byts)
byts = [hex(b)[2:] for b in byts]
return ' '.join(byts)
def parse_indx_header(data):
check_signature(data, b'INDX')
words = INDEX_HEADER_FIELDS
num = len(words)
values = struct.unpack('>%dL' % num, data[4:4*(num+1)])
ans = dict(zip(words, values))
ordt1, ordt2 = ans['ordt1'], ans['ordt2']
ans['ordt1_raw'], ans['ordt2_raw'] = [], []
ans['ordt_map'] = ''
if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
# I dont know what this is, but using it seems to be unnecessary, so
# just leave it as the raw bytestring
ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
if ans['code'] == 65002:
# This appears to be EBCDIC-UTF (65002) encoded. I can't be
# bothered to write a decoder for this (see
# http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
# Instead, we use a weird hack that seems to do the trick for all
# the books with this type of ORDT record that I have come across.
# Some EBSP book samples in KF8 format from Amazon have this type
# of encoding.
# Basically we try to interpret every second byte as a printable
# ascii character. If we cannot, we map to the ? char.
parsed = bytearray(ans['oentries'])
for i in range(0, 2*ans['oentries'], 2):
parsed[i//2] = raw[i+1] if 0x20 < raw[i+1] < 0x7f else ord(b'?')
ans['ordt_map'] = bytes(parsed).decode('ascii')
else:
ans['ordt_map'] = '?'*ans['oentries']
return ans
class CNCX(object): # {{{
'''
Parses the records that contain the compiled NCX (all strings from the
NCX). Presents a simple offset : string mapping interface to access the
data.
'''
def __init__(self, records, codec):
self.records = OrderedDict()
record_offset = 0
for raw in records:
pos = 0
while pos < len(raw):
length, consumed = decint(raw[pos:])
if length > 0:
try:
self.records[pos+record_offset] = raw[
pos+consumed:pos+consumed+length].decode(codec)
except:
byts = raw[pos:]
r = format_bytes(byts)
print('CNCX entry at offset %d has unknown format %s'%(
pos+record_offset, r))
self.records[pos+record_offset] = r
pos = len(raw)
pos += consumed+length
record_offset += 0x10000
def __getitem__(self, offset):
return self.records.get(offset)
def get(self, offset, default=None):
return self.records.get(offset, default)
def __bool__(self):
return bool(self.records)
__nonzero__ = __bool__
def iteritems(self):
return iteritems(self.records)
def items(self):
return iteritems(self.records)
# }}}
def parse_tagx_section(data):
check_signature(data, b'TAGX')
tags = []
first_entry_offset, = struct.unpack_from(b'>L', data, 4)
control_byte_count, = struct.unpack_from(b'>L', data, 8)
for i in range(12, first_entry_offset, 4):
vals = list(bytearray(data[i:i+4]))
tags.append(TagX(*vals))
return control_byte_count, tags
def get_tag_map(control_byte_count, tagx, data, strict=False):
ptags = []
ans = {}
control_bytes = list(bytearray(data[:control_byte_count]))
data = data[control_byte_count:]
for x in tagx:
if x.eof == 0x01:
control_bytes = control_bytes[1:]
continue
value = control_bytes[0] & x.bitmask
if value != 0:
value_count = value_bytes = None
if value == x.bitmask:
if count_set_bits(x.bitmask) > 1:
# If all bits of masked value are set and the mask has more
# than one bit, a variable width value will follow after
# the control bytes which defines the length of bytes (NOT
# the value count!) which will contain the corresponding
# variable width values.
value_bytes, consumed = decint(data)
data = data[consumed:]
else:
value_count = 1
else:
# Shift bits to get the masked value.
mask = x.bitmask
while mask & 0b1 == 0:
mask >>= 1
value >>= 1
value_count = value
ptags.append(PTagX(x.tag, value_count, value_bytes,
x.num_of_values))
for x in ptags:
values = []
if x.value_count is not None:
# Read value_count * values_per_entry variable width values.
for _ in range(x.value_count * x.num_of_values):
byts, consumed = decint(data)
data = data[consumed:]
values.append(byts)
else: # value_bytes is not None
# Convert value_bytes to variable width values.
total_consumed = 0
while total_consumed < x.value_bytes:
# Does this work for values_per_entry != 1?
byts, consumed = decint(data)
data = data[consumed:]
total_consumed += consumed
values.append(byts)
if total_consumed != x.value_bytes:
err = ("Error: Should consume %s bytes, but consumed %s" %
(x.value_bytes, total_consumed))
if strict:
raise ValueError(err)
else:
print(err)
ans[x.tag] = values
# Test that all bytes have been processed
if data.replace(b'\0', b''):
err = ("Warning: There are unprocessed index bytes left: %s" %
format_bytes(data))
if strict:
raise ValueError(err)
else:
print(err)
return ans
def parse_index_record(table, data, control_byte_count, tags, codec,
ordt_map, strict=False):
header = parse_indx_header(data)
idxt_pos = header['start']
if data[idxt_pos:idxt_pos+4] != b'IDXT':
print('WARNING: Invalid INDX record')
entry_count = header['count']
# loop through to build up the IDXT position starts
idx_positions= []
for j in range(entry_count):
pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
idx_positions.append(pos)
# The last entry ends before the IDXT tag (but there might be zero fill
# bytes we need to ignore!)
idx_positions.append(idxt_pos)
# For each entry in the IDXT build up the tag map and any associated
# text
for j in range(entry_count):
start, end = idx_positions[j:j+2]
rec = data[start:end]
# Sometimes (in the guide table if the type attribute has non ascii
# values) the ident is UTF-16 encoded. Try to handle that.
try:
ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
except UnicodeDecodeError:
ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map)
if u'\x00' in ident:
try:
ident, consumed = decode_string(rec, codec='utf-16',
ordt_map=ordt_map)
except UnicodeDecodeError:
ident = ident.replace('u\x00', u'')
rec = rec[consumed:]
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
table[ident] = tag_map
return header
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
data = sections[idx][0]
indx_header = parse_indx_header(data)
indx_count = indx_header['count']
if indx_header['ncncx'] > 0:
off = idx + indx_count + 1
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['tagx']
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in range(idx + 1, idx + 1 + indx_count):
# Index record
data = sections[i][0]
parse_index_record(table, data, control_byte_count, tags, codec,
indx_header['ordt_map'])
return table, cncx

View File

@@ -0,0 +1,373 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os
from calibre.ebooks.chardet import strip_encoding_declarations
from polyglot.builtins import unicode_type, range
def update_internal_links(mobi8_reader, log):
# need to update all links that are internal which
# are based on positions within the xhtml files **BEFORE**
# cutting and pasting any pieces into the xhtml text files
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
# XXXX is the offset in records into divtbl
# YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
mr = mobi8_reader
# pos:fid pattern
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
parts = []
for part in mr.parts:
srcpieces = posfid_pattern.split(part)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith(b'<'):
for m in posfid_index_pattern.finditer(tag):
posfid = m.group(1)
offset = m.group(2)
try:
filename, idtag = mr.get_id_tag_by_pos_fid(
int(posfid, 32), int(offset, 32))
except ValueError:
log.warn('Invalid link, points to nowhere, ignoring')
replacement = b'#'
else:
suffix = (b'#' + idtag) if idtag else b''
replacement = filename.split('/')[-1].encode(
mr.header.codec) + suffix
replacement = replacement.replace(b'"', b'&quot;')
tag = posfid_index_pattern.sub(b'"' + replacement + b'"', tag, 1)
srcpieces[j] = tag
raw = b''.join(srcpieces)
try:
parts.append(raw.decode(mr.header.codec))
except UnicodeDecodeError:
log.warn('Failed to decode text in KF8 part, replacing bad bytes')
parts.append(raw.decode(mr.header.codec, 'replace'))
# All parts are now unicode and have no internal links
return parts
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
# we can safely remove all of the Kindlegen generated aid attributes and
# calibre generated cid attributes
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\s[ac]id\s*=[^>]*>)''',
re.IGNORECASE)
within_tag_aid_position_pattern = re.compile(r'''\s[ac]id\s*=['"]([^'"]*)['"]''')
for i in range(len(parts)):
part = parts[i]
srcpieces = find_tag_with_aid_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith('<'):
for m in within_tag_aid_position_pattern.finditer(tag):
try:
aid = m.group(1)
except IndexError:
aid = None
replacement = ''
if aid in linked_aids:
replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
parts[i] = part
# we can safely remove all of the Kindlegen generated data-AmznPageBreak
# attributes
find_tag_with_AmznPageBreak_pattern = re.compile(
r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
within_tag_AmznPageBreak_position_pattern = re.compile(
r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
for i in range(len(parts)):
part = parts[i]
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith('<'):
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
lambda m:' style="page-break-after:%s"'%m.group(1), tag)
part = "".join(srcpieces)
parts[i] = part
def update_flow_links(mobi8_reader, resource_map, log):
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
# kindle:embed:XXXX (used for fonts)
mr = mobi8_reader
flows = []
img_pattern = re.compile(r'''(<[img\s|image\s|svg:image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''', re.IGNORECASE)
tag_pattern = re.compile(r'''(<[^>]*>)''')
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
url_img_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
font_index_pattern = re.compile(r'''kindle:embed:([0-9|A-V]+)''', re.IGNORECASE)
url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
for flow in mr.flows:
if flow is None: # 0th flow is None
flows.append(flow)
continue
if not isinstance(flow, unicode_type):
try:
flow = flow.decode(mr.header.codec)
except UnicodeDecodeError:
log.error('Flow part has invalid %s encoded bytes'%mr.header.codec)
flow = flow.decode(mr.header.codec, 'replace')
# links to raster image files from image tags
# image_pattern
srcpieces = img_pattern.split(flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im') or tag.startswith('<svg:image'):
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../'+ href)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized '
'as a valid image in %s' % (num, tag))
srcpieces[j] = tag
flow = "".join(srcpieces)
# replacements inside css url():
srcpieces = url_pattern.split(flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
# process links to raster image files
for m in url_img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../'+ href)
tag = url_img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as a '
'valid image in %s' % (num, tag))
# process links to fonts
for m in font_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href is None:
log.warn('Referenced font %s was not recognized as a '
'valid font in %s' % (num, tag))
else:
replacement = '"%s"'%('../'+ href)
if href.endswith('.failed'):
replacement = '"%s"'%('failed-'+href)
tag = font_index_pattern.sub(replacement, tag, 1)
# process links to other css pieces
for m in url_css_index_pattern.finditer(tag):
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = url_css_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
flow = "".join(srcpieces)
# flow pattern not inside url()
srcpieces = re.split(tag_pattern, flow)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<'):
for m in re.finditer(flow_pattern, tag):
try:
num = int(m.group(1), 32)
fi = mr.flowinfo[num]
except IndexError:
log.warn('Ignoring invalid flow reference in tag', tag)
tag = ''
else:
if fi.format == 'inline':
flowtext = mr.flows[num]
tag = flowtext
else:
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = flow_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
flow = "".join(srcpieces)
flows.append(flow)
# All flows are now unicode and have links resolved
return flows
def insert_flows_into_markup(parts, flows, mobi8_reader, log):
mr = mobi8_reader
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
tag_pattern = re.compile(r'''(<[^>]*>)''')
flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
for i in range(len(parts)):
part = parts[i]
# flow pattern
srcpieces = tag_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith('<'):
for m in flow_pattern.finditer(tag):
num = int(m.group(1), 32)
try:
fi = mr.flowinfo[num]
except IndexError:
log.warn('Ignoring invalid flow reference: %s'%m.group())
tag = ''
else:
if fi.format == 'inline':
tag = flows[num]
else:
replacement = '"../' + fi.dir + '/' + fi.fname + '"'
tag = flow_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def insert_images_into_markup(parts, resource_map, log):
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')
style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
re.IGNORECASE)
for i in range(len(parts)):
part = parts[i]
srcpieces = img_pattern.split(part)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im'):
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
if href:
replacement = '"%s"'%('../' + href)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as '
'a valid image in %s' % (num, tag))
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
# Replace urls used in style attributes
for i in range(len(parts)):
part = parts[i]
srcpieces = style_pattern.split(part)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if 'kindle:embed' in tag:
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
osep = m.group()[0]
csep = m.group()[-1]
if href:
replacement = '%s%s%s'%(osep, '../' + href, csep)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as '
'a valid image in %s' % (num, tag))
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def upshift_markup(parts):
tag_pattern = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)
for i in range(len(parts)):
part = parts[i]
# tag pattern
srcpieces = re.split(tag_pattern, part)
for j in range(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag[:4].lower() == '<svg':
tag = tag.replace('preserveaspectratio','preserveAspectRatio')
tag = tag.replace('viewbox','viewBox')
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
def expand_mobi8_markup(mobi8_reader, resource_map, log):
# First update all internal links that are based on offsets
parts = update_internal_links(mobi8_reader, log)
# Remove pointless markup inserted by kindlegen
remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)
# Handle substitutions for the flows pieces first as they may
# be inlined into the xhtml text
flows = update_flow_links(mobi8_reader, resource_map, log)
# Insert inline flows into the markup
insert_flows_into_markup(parts, flows, mobi8_reader, log)
# Insert raster images into markup
insert_images_into_markup(parts, resource_map, log)
# Perform general markup cleanups
upshift_markup(parts)
# Update the parts and flows stored in the reader
mobi8_reader.parts = parts
mobi8_reader.flows = flows
# write out the parts and file flows
os.mkdir('text') # directory containing all parts
spine = []
for i, part in enumerate(parts):
pi = mobi8_reader.partinfo[i]
with open(os.path.join(pi.type, pi.filename), 'wb') as f:
part = strip_encoding_declarations(part)
part = part.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
f.write(part.encode('utf-8'))
spine.append(f.name)
for i, flow in enumerate(flows):
fi = mobi8_reader.flowinfo[i]
if fi.format == 'file':
if not os.path.exists(fi.dir):
os.mkdir(fi.dir)
with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
f.write(flow.encode('utf-8'))
return spine

View File

@@ -0,0 +1,935 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil, os, re, struct, textwrap, io
from lxml import html, etree
from calibre import xml_entity_to_unicode, entity_to_unicode, guess_type
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import BookHeader
from calibre.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, unicode_type, range, map
class TopazError(ValueError):
pass
class KFXError(ValueError):
def __init__(self):
ValueError.__init__(self, _(
'This is an Amazon KFX book. It cannot be processed.'
' See {} for information on how to handle KFX books.'
).format('https://www.mobileread.com/forums/showthread.php?t=283371'))
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
try_extra_data_fix=False):
self.log = log
self.debug = debug
self.embedded_mi = None
self.warned_about_trailing_entry_corruption = False
self.base_css_rules = textwrap.dedent('''
body { text-align: justify }
blockquote { margin: 0em 0em 0em 2em; }
p { margin: 0em; text-indent: 1.5em }
.bold { font-weight: bold }
.italic { font-style: italic }
.underline { text-decoration: underline }
.mbp_pagebreak {
page-break-after: always; margin: 0; display: block
}
''')
self.tag_css_rules = {}
self.left_margins = {}
self.text_indents = {}
if hasattr(filename_or_stream, 'read'):
stream = filename_or_stream
stream.seek(0)
else:
stream = open(filename_or_stream, 'rb')
raw = stream.read()
if raw.startswith(b'TPZ'):
raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
if raw.startswith(b'\xeaDRMION\xee'):
raise KFXError()
self.header = raw[0:72]
self.name = self.header[:32].replace(b'\x00', b'')
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C + 8].upper()
if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
raise MobiError('Unknown book type: %s' % repr(self.ident))
self.sections = []
self.section_headers = []
for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val))
def section(section_number):
if section_number == self.num_sections - 1:
end_off = len(raw)
else:
end_off = self.section_headers[section_number + 1][0]
off = self.section_headers[section_number][0]
return raw[off:end_off]
for i in range(self.num_sections):
self.sections.append((section(i), self.section_headers[i]))
self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None
k8i = getattr(self.book_header.exth, 'kf8_header', None)
# Ancient PRC files from Baen can have random values for
# mobi_version, so be conservative
if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
'skelidx')):
self.kf8_type = 'standalone'
elif k8i is not None: # Check for joint mobi 6 and kf 8 file
try:
raw = self.sections[k8i-1][0]
except:
raw = None
if raw == b'BOUNDARY':
try:
self.book_header = BookHeader(self.sections[k8i][0],
self.ident, user_encoding, self.log)
self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
self.book_header.mobi6_records = bh.records
# Need the first_image_index from the mobi 6 header as well
for x in ('first_image_index',):
setattr(self.book_header, x, getattr(bh, x))
# We need to do this because the MOBI 6 text extract code
# does not know anything about the kf8 offset
if hasattr(self.book_header, 'huff_offset'):
self.book_header.huff_offset += k8i
self.kf8_type = 'joint'
self.kf8_boundary = k8i-1
except:
self.book_header = bh
def check_for_drm(self):
if self.book_header.encryption_type != 0:
try:
name = self.book_header.exth.mi.title
except:
name = self.name
if not name:
name = self.name
raise DRMError(name)
def extract_content(self, output_dir, parse_cache):
output_dir = os.path.abspath(output_dir)
self.check_for_drm()
processed_records = self.extract_text()
if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec,
'ignore')
self.processed_html = self.processed_html.replace('</</', '</')
self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
self.processed_html)
self.processed_html = self.processed_html.replace('\ufeff', '')
# Remove tags of the form <xyz: ...> as they can cause issues further
# along the pipeline
self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
self.processed_html)
self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
self.processed_html)
image_name_map = self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
self.log.debug('Parsing HTML...')
self.processed_html = clean_xml_chars(self.processed_html)
try:
root = html.fromstring(self.processed_html)
if len(root.xpath('//html')) > 5:
root = html.fromstring(self.processed_html.replace('\x0c',
'').replace('\x14', ''))
except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'):
from html5_parser import parse
self.log.warning('Malformed markup, parsing using html5-parser')
self.processed_html = strip_encoding_declarations(self.processed_html)
# These trip up the html5 parser causing all content to be placed
# under the <guide> tag
self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
try:
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
if len(root.xpath('body/descendant::*')) < 1:
# There are probably stray </html>s in the markup
self.processed_html = self.processed_html.replace('</html>',
'')
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
if root.tag != 'html':
self.log.warn('File does not have opening <html> tag')
nroot = html.fromstring('<html><head></head><body></body></html>')
bod = nroot.find('body')
for child in list(root):
child.getparent().remove(child)
bod.append(child)
root = nroot
htmls = list(root.xpath('//html'))
if len(htmls) > 1:
self.log.warn('Markup contains multiple <html> tags, merging.')
# Merge all <head> and <body> sections
for h in htmls:
p = h.getparent()
if hasattr(p, 'remove'):
p.remove(h)
bodies, heads = root.xpath('//body'), root.xpath('//head')
for x in root:
root.remove(x)
head, body = map(root.makeelement, ('head', 'body'))
for h in heads:
for x in h:
h.remove(x)
head.append(x)
for b in bodies:
for x in b:
b.remove(x)
body.append(x)
root.append(head), root.append(body)
for x in root.xpath('//script'):
x.getparent().remove(x)
head = root.xpath('//head')
if head:
head = head[0]
else:
head = root.makeelement('head', {})
root.insert(0, head)
head.text = '\n\t'
link = head.makeelement('link', {'type':'text/css',
'href':'styles.css', 'rel':'stylesheet'})
head.insert(0, link)
link.tail = '\n\t'
title = head.xpath('descendant::title')
m = head.makeelement('meta', {'http-equiv':'Content-Type',
'content':'text/html; charset=utf-8'})
head.insert(0, m)
if not title:
title = head.makeelement('title', {})
try:
title.text = self.book_header.title
except ValueError:
title.text = clean_ascii_chars(self.book_header.title)
title.tail = '\n\t'
head.insert(0, title)
head.text = '\n\t'
self.upshift_markup(root, image_name_map)
guides = root.xpath('//guide')
guide = guides[0] if guides else None
metadata_elems = root.xpath('//metadata')
if metadata_elems and self.book_header.exth is None:
self.read_embedded_metadata(root, metadata_elems[0], guide)
for elem in guides + metadata_elems:
elem.getparent().remove(elem)
htmlfile = os.path.join(output_dir, 'index.html')
try:
for ref in guide.xpath('descendant::reference'):
if 'href' in ref.attrib:
ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
except AttributeError:
pass
def write_as_utf8(path, data):
if isinstance(data, unicode_type):
data = data.encode('utf-8')
with lopen(path, 'wb') as f:
f.write(data)
parse_cache[htmlfile] = root
self.htmlfile = htmlfile
ncx = io.BytesIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(lopen(self.created_opf_path, 'wb'), ncx,
ncx_manifest_entry=ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
write_as_utf8(ncx_path, ncx)
css = [self.base_css_rules, '\n\n']
for cls, rule in self.tag_css_rules.items():
css.append('.%s { %s }\n\n' % (cls, rule))
write_as_utf8('styles.css', ''.join(css))
if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...')
ncx = io.BytesIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
def read_embedded_metadata(self, root, elem, guide):
raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
html.tostring(elem, encoding='utf-8') + b'</package>'
stream = io.BytesIO(raw)
opf = OPF(stream)
self.embedded_mi = opf.to_book_metadata()
if guide is not None:
for ref in guide.xpath('descendant::reference'):
if 'cover' in ref.get('type', '').lower():
href = ref.get('href', '')
if href.startswith('#'):
href = href[1:]
anchors = root.xpath('//*[@id="%s"]' % href)
if anchors:
cpos = anchors[0]
reached = False
for elem in root.iter():
if elem is cpos:
reached = True
if reached and elem.tag == 'img':
cover = elem.get('src', None)
self.embedded_mi.cover = cover
elem.getparent().remove(elem)
break
break
def cleanup_html(self):
self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
# Swap inline and block level elements, and order block level elements according to priority
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
self.processed_html = re.sub(
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
self.processed_html = re.sub(
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
self.processed_html = re.sub(
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
self.processed_html = re.sub(
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
bods = htmls = 0
for x in re.finditer('</body>|</html>', self.processed_html):
if x == '</body>':
bods +=1
else:
htmls += 1
if bods > 1 and htmls > 1:
break
if bods > 1:
self.processed_html = self.processed_html.replace('</body>', '')
if htmls > 1:
self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07',
'', html)
def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None:
raw += unit
return raw
def upshift_markup(self, root, image_name_map=None):
self.log.debug('Converting style information to CSS...')
image_name_map = image_name_map or {}
size_map = {
'xx-small': '0.5',
'x-small': '1',
'small': '2',
'medium': '3',
'large': '4',
'x-large': '5',
'xx-large': '6',
}
def barename(x):
return x.rpartition(':')[-1]
mobi_version = self.book_header.mobi_version
for x in root.xpath('//ncx'):
x.getparent().remove(x)
svg_tags = []
forwardable_anchors = []
pagebreak_anchors = []
BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p'}
for i, tag in enumerate(root.iter(etree.Element)):
tag.attrib.pop('xmlns', '')
for x in tag.attrib:
if ':' in x:
del tag.attrib[x]
if tag.tag and barename(tag.tag) == 'svg':
svg_tags.append(tag)
if tag.tag and barename(tag.tag.lower()) in \
('country-region', 'place', 'placetype', 'placename',
'state', 'city', 'street', 'address', 'content', 'form'):
tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
for key in tag.attrib.keys():
tag.attrib.pop(key)
continue
styles, attrib = [], tag.attrib
if 'style' in attrib:
style = attrib.pop('style').strip()
if style:
styles.append(style)
if 'height' in attrib:
height = attrib.pop('height').strip()
if (
height and '<' not in height and '>' not in height and
re.search(r'\d+', height)):
if tag.tag in ('table', 'td', 'tr'):
pass
elif tag.tag == 'img':
tag.set('height', height)
else:
if tag.tag == 'div' and not tag.text and \
(not tag.tail or not tag.tail.strip()) and \
not len(list(tag.iterdescendants())):
# Paragraph spacer
# Insert nbsp so that the element is never
# discarded by a renderer
tag.text = '\u00a0' # nbsp
styles.append('height: %s' %
self.ensure_unit(height))
else:
styles.append('margin-top: %s' % self.ensure_unit(height))
if 'width' in attrib:
width = attrib.pop('width').strip()
if width and re.search(r'\d+', width):
if tag.tag in ('table', 'td', 'tr'):
pass
elif tag.tag == 'img':
tag.set('width', width)
else:
ewidth = self.ensure_unit(width)
styles.append('text-indent: %s' % ewidth)
try:
ewidth_val = unit_convert(ewidth, 12, 500, 166)
self.text_indents[tag] = ewidth_val
except:
pass
if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
try:
ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
self.left_margins[tag] = ewidth_val
except:
pass
if 'align' in attrib:
align = attrib.pop('align').strip()
if align:
align = align.lower()
if align == 'baseline':
styles.append('vertical-align: '+align)
else:
styles.append('text-align: %s' % align)
if tag.tag == 'hr':
if mobi_version == 1:
tag.tag = 'div'
styles.append('page-break-before: always')
styles.append('display: block')
styles.append('margin: 0')
elif tag.tag == 'i':
tag.tag = 'span'
tag.attrib['class'] = 'italic'
elif tag.tag == 'u':
tag.tag = 'span'
tag.attrib['class'] = 'underline'
elif tag.tag == 'b':
tag.tag = 'span'
tag.attrib['class'] = 'bold'
elif tag.tag == 'font':
sz = tag.get('size', '').lower()
try:
float(sz)
except ValueError:
if sz in list(size_map.keys()):
attrib['size'] = size_map[sz]
elif tag.tag == 'img':
recindex = None
for attr in self.IMAGE_ATTRS:
recindex = attrib.pop(attr, None) or recindex
if recindex is not None:
try:
recindex = int(recindex)
except Exception:
pass
else:
attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
for attr in ('width', 'height'):
if attr in attrib:
val = attrib[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
attrib[attr] = "%dpx"%int(nval)
except:
del attrib[attr]
elif val.lower().endswith('%'):
del attrib[attr]
elif tag.tag == 'pre':
if not tag.text:
tag.tag = 'div'
if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
'div' and 'filepos-id' in attrib):
pagebreak_anchors.append(tag)
if 'color' in attrib:
styles.append('color: ' + attrib.pop('color'))
if 'bgcolor' in attrib:
styles.append('background-color: ' + attrib.pop('bgcolor'))
if 'filepos-id' in attrib:
attrib['id'] = attrib.pop('filepos-id')
if 'name' in attrib and attrib['name'] != attrib['id']:
attrib['name'] = attrib['id']
if 'filepos' in attrib:
filepos = attrib.pop('filepos')
try:
attrib['href'] = "#filepos%d" % int(filepos)
except ValueError:
pass
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
not tag.text and len(tag) == 0 and (tag.tail is None or not
tag.tail.strip()) and getattr(tag.getnext(), 'tag',
None) in BLOCK_TAGS):
# This is an empty anchor immediately before a block tag, move
# the id onto the block tag instead
forwardable_anchors.append(tag)
if styles:
ncls = None
rule = '; '.join(styles)
for sel, srule in self.tag_css_rules.items():
if srule == rule:
ncls = sel
break
if ncls is None:
ncls = 'calibre_%d' % i
self.tag_css_rules[ncls] = rule
cls = attrib.get('class', '')
cls = cls + (' ' if cls else '') + ncls
attrib['class'] = cls
for tag in svg_tags:
images = tag.xpath('descendant::img[@src]')
parent = tag.getparent()
if images and hasattr(parent, 'find'):
index = parent.index(tag)
for img in images:
img.getparent().remove(img)
img.tail = img.text = None
parent.insert(index, img)
if hasattr(parent, 'remove'):
parent.remove(tag)
for tag in pagebreak_anchors:
anchor = tag.attrib['id']
del tag.attrib['id']
if 'name' in tag.attrib:
del tag.attrib['name']
p = tag.getparent()
a = p.makeelement('a')
a.attrib['id'] = anchor
p.insert(p.index(tag)+1, a)
if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
forwardable_anchors.append(a)
for tag in forwardable_anchors:
block = tag.getnext()
tag.getparent().remove(tag)
if 'id' in block.attrib:
tag.tail = block.text
block.text = None
block.insert(0, tag)
else:
block.attrib['id'] = tag.attrib['id']
# WebKit fails to navigate to anchors located on <br> tags
for br in root.xpath('/body/br[@id]'):
br.tag = 'div'
def get_left_whitespace(self, tag):
def whitespace(tag):
lm = ti = 0.0
if tag.tag == 'p':
ti = unit_convert('1.5em', 12, 500, 166)
if tag.tag == 'blockquote':
lm = unit_convert('2em', 12, 500, 166)
lm = self.left_margins.get(tag, lm)
ti = self.text_indents.get(tag, ti)
try:
lm = float(lm)
except:
lm = 0.0
try:
ti = float(ti)
except:
ti = 0.0
return lm + ti
parent = tag
ans = 0.0
while parent is not None:
ans += whitespace(parent)
parent = parent.getparent()
return ans
def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
if mi is None:
mi = MetaInformation(self.book_header.title, [_('Unknown')])
opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
elif mi.cover is not None:
opf.cover = mi.cover
else:
opf.cover = 'images/%05d.jpg' % 1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
* opf.cover.split('/'))):
opf.cover = None
cover = opf.cover
cover_copied = None
if cover is not None:
cover = cover.replace('/', os.sep)
if os.path.exists(cover):
ncover = 'images'+os.sep+'calibre_cover.jpg'
if os.path.exists(ncover):
os.remove(ncover)
shutil.copyfile(cover, ncover)
cover_copied = os.path.abspath(ncover)
opf.cover = ncover.replace(os.sep, '/')
manifest = [(htmlfile, 'application/xhtml+xml'),
(os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile)
added = set()
for i in getattr(self, 'image_names', []):
path = os.path.join(bp, 'images', i)
added.add(path)
manifest.append((path, guess_type(path)[0] or 'image/jpeg'))
if cover_copied is not None:
manifest.append((cover_copied, 'image/jpeg'))
opf.create_manifest(manifest)
opf.create_spine([os.path.basename(htmlfile)])
toc = None
if guide is not None:
opf.create_guide(guide)
for ref in opf.guide:
if ref.type.lower() == 'toc':
toc = ref.href()
ncx_manifest_entry = None
if toc:
ncx_manifest_entry = 'toc.ncx'
elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if elems:
tocobj = TOC()
found = False
reached = False
for x in root.iter():
if x == elems[-1]:
reached = True
continue
if reached and x.tag == 'a':
href = x.get('href', '')
if href and re.match(r'\w+://', href) is None:
try:
text = ' '.join([t.strip() for t in
x.xpath('descendant::text()')])
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
item = tocobj.add_item(toc.partition('#')[0], href[1:],
text)
item.left_space = int(self.get_left_whitespace(x))
found = True
if reached and found and x.get('class', None) == 'mbp_pagebreak':
break
if tocobj is not None:
tocobj = self.structure_toc(tocobj)
opf.set_toc(tocobj)
return opf, ncx_manifest_entry
def structure_toc(self, toc):
indent_vals = set()
for item in toc:
indent_vals.add(item.left_space)
if len(indent_vals) > 6 or len(indent_vals) < 2:
# Too many or too few levels, give up
return toc
indent_vals = sorted(indent_vals)
last_found = [None for i in indent_vals]
newtoc = TOC()
def find_parent(level):
candidates = last_found[:level]
for x in reversed(candidates):
if x is not None:
return x
return newtoc
for item in toc:
level = indent_vals.index(item.left_space)
parent = find_parent(level)
last_found[level] = parent.add_item(item.href, item.fragment,
item.text)
return newtoc
def sizeof_trailing_entries(self, data):
def sizeof_trailing_entry(ptr, psize):
bitpos, result = 0, 0
while True:
v = ord(ptr[psize-1:psize])
result |= (v & 0x7F) << bitpos
bitpos += 7
psize -= 1
if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
return result
num = 0
size = len(data)
flags = self.book_header.extra_flags >> 1
while flags:
if flags & 1:
try:
num += sizeof_trailing_entry(data, size - num)
except IndexError:
self.warn_about_trailing_entry_corruption()
return 0
flags >>= 1
if self.book_header.extra_flags & 1:
off = size - num - 1
num += (ord(data[off:off+1]) & 0x3) + 1
return num
def warn_about_trailing_entry_corruption(self):
if not self.warned_about_trailing_entry_corruption:
self.warned_about_trailing_entry_corruption = True
self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output')
def text_section(self, index):
data = self.sections[index][0]
trail_size = self.sizeof_trailing_entries(data)
return data[:len(data)-trail_size]
def extract_text(self, offset=1):
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(offset,
min(self.book_header.records + offset, len(self.sections)))]
processed_records = list(range(offset-1, self.book_header.records +
offset))
self.mobi_html = b''
if self.book_header.compression_type == b'DH':
huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number))
huff = HuffReader(huffs)
unpack = huff.unpack
elif self.book_header.compression_type == b'\x00\x02':
unpack = decompress_doc
elif self.book_header.compression_type == b'\x00\x01':
unpack = lambda x: x
else:
raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)
self.mobi_html = b''.join(map(unpack, text_sections))
if self.mobi_html.endswith(b'#'):
self.mobi_html = self.mobi_html[:-1]
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
self.mobi_html = self.mobi_html.replace(b'\0', b'')
if self.book_header.codec == 'cp1252':
self.mobi_html = self.mobi_html.replace(b'\x1e', b'') # record separator
self.mobi_html = self.mobi_html.replace(b'\x02', b'') # start of text
return processed_records
def replace_page_breaks(self):
self.processed_html = self.PAGE_BREAK_PAT.sub(
r'<div \1 class="mbp_pagebreak" />',
self.processed_html)
def add_anchors(self):
self.log.debug('Adding anchors...')
positions = set()
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1)))
pos = 0
processed_html = []
end_tag_re = re.compile(br'<\s*/')
for end in sorted(positions):
if end == 0:
continue
oend = end
l = self.mobi_html.find(b'<', end)
r = self.mobi_html.find(b'>', end)
anchor = b'<a id="filepos%d"></a>'
if r > -1 and (r < l or l == end or l == -1):
p = self.mobi_html.rfind(b'<', 0, end + 1)
if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
not self.mobi_html[p:r + 1].endswith(b'/>')):
anchor = b' filepos-id="filepos%d"'
end = r
else:
end = r + 1
processed_html.append(self.mobi_html[pos:end] + (anchor % oend))
pos = end
processed_html.append(self.mobi_html[pos:])
processed_html = b''.join(processed_html)
# Remove anchors placed inside entities
self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
br'&\1\3;\2', processed_html)
def extract_images(self, processed_records, output_dir):
self.log.debug('Extracting images...')
output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
image_index = 0
self.image_names = []
image_name_map = {}
start = getattr(self.book_header, 'first_image_index', -1)
if start > self.num_sections or start < 0:
# BAEN PRC files have bad headers
start = 0
for i in range(start, self.num_sections):
if i in processed_records:
continue
processed_records.append(i)
data = self.sections[i][0]
image_index += 1
if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
# This record is a known non image type, no need to try to
# load the image
continue
try:
imgfmt = what(None, data)
except Exception:
continue
if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}:
continue
if imgfmt == 'jpeg':
imgfmt = 'jpg'
if imgfmt == 'gif':
try:
data = gif_data_to_png_data(data)
imgfmt = 'png'
except AnimatedGIF:
pass
path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
image_name_map[image_index] = os.path.basename(path)
if imgfmt == 'png':
with open(path, 'wb') as f:
f.write(data)
else:
try:
save_cover_data_to(data, path, minify_to=(10000, 10000))
except Exception:
continue
self.image_names.append(os.path.basename(path))
return image_name_map
def test_mbp_regex():
for raw, m in iteritems({
'<mbp:pagebreak></mbp:pagebreak>':'',
'<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
'<mbp:pagebreak> </mbp:pagebreak>':'',
'<mbp:pagebreak>xxx':'xxx',
'<mbp:pagebreak/>xxx':'xxx',
'<mbp:pagebreak sdf/ >xxx':' sdfxxx',
'<mbp:pagebreak / >':' ',
'</mbp:pagebreak>':'',
'</mbp:pagebreak sdf>':' sdf',
'</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':'xxx',
}):
ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
if ans != m:
raise Exception('%r != %r for %r'%(ans, m, raw))

View File

@@ -0,0 +1,590 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os
from collections import namedtuple
from itertools import repeat
from uuid import uuid4
from lxml import etree
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.mobi.reader.containers import Container, find_imgtype
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.oeb.parse_utils import parse_html
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
from polyglot.builtins import range, zip, unicode_type, getcwd, as_unicode
from polyglot.urllib import urldefrag
Part = namedtuple('Part',
'num type filename start end aid')
Elem = namedtuple('Elem',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
FlowInfo = namedtuple('FlowInfo',
'type format dir fname')
# locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid):
pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid
aid_pattern = re.compile(pattern, re.IGNORECASE)
for m in re.finditer(aid_pattern, ml):
plt = m.start()
pgt = ml.find(b'>', plt+1)
return plt, pgt
return 0, 0
def reverse_tag_iter(block):
''' Iterate over all tags in block in reverse order, i.e. last tag
to first tag. '''
end = len(block)
while True:
pgt = block.rfind(b'>', 0, end)
if pgt == -1:
break
plt = block.rfind(b'<', 0, pgt)
if plt == -1:
break
yield block[plt:pgt+1]
end = plt
def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
first_resource_index = first_image_index
if first_resource_index in {-1, NULL_INDEX}:
first_resource_index = num_of_text_records + first_text_record_number
return first_resource_index
class Mobi8Reader(object):
def __init__(self, mobi6_reader, log, for_tweak=False):
self.for_tweak = for_tweak
self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header
self.encrypted_fonts = []
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
def __call__(self):
self.mobi6_reader.check_for_drm()
self.aid_anchor_suffix = uuid4().hex.encode('utf-8')
bh = self.mobi6_reader.book_header
if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2
self.resource_offsets = [
(get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
(get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
]
else:
offset = 1
self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]
self.processed_records = self.mobi6_reader.extract_text(offset=offset)
self.raw_ml = self.mobi6_reader.mobi_html
with open('debug-raw.html', 'wb') as f:
f.write(self.raw_ml)
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
self.linked_aids = set()
self.read_indices()
self.build_parts()
guide = self.create_guide()
ncx = self.create_ncx()
resource_map = self.extract_resources(self.mobi6_reader.sections)
spine = self.expand_text(resource_map)
return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self):
self.flow_table = ()
if self.header.fdstidx != NULL_INDEX:
header = self.kf8_sections[self.header.fdstidx][0]
if header[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record')
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
header, sec_start)
self.flow_table = tuple(zip(secs[::2], secs[1::2]))
self.files = []
if self.header.skelidx != NULL_INDEX:
table = read_index(self.kf8_sections, self.header.skelidx,
self.header.codec)[0]
File = namedtuple('File',
'file_number name divtbl_count start_position length')
for i, text in enumerate(table):
tag_map = table[text]
self.files.append(File(i, text, tag_map[1][0],
tag_map[6][0], tag_map[6][1]))
self.elems = []
if self.header.dividx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.dividx,
self.header.codec)
for i, text in enumerate(table):
tag_map = table[text]
toc_text = cncx[tag_map[2][0]]
self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
tag_map[4][0], tag_map[6][0], tag_map[6][1]))
self.guide = []
if self.header.othidx != NULL_INDEX:
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = namedtuple('Item',
'type title pos_fid')
for i, ref_type in enumerate(table):
tag_map = table[ref_type]
# ref_type, ref_title, div/frag number
title = cncx[tag_map[1][0]]
fileno = None
if 3 in list(tag_map.keys()):
fileno = tag_map[3][0]
if 6 in list(tag_map.keys()):
fileno = tag_map[6]
if isinstance(ref_type, bytes):
ref_type = ref_type.decode(self.header.codec)
self.guide.append(Item(ref_type, title, fileno))
def build_parts(self):
raw_ml = self.mobi6_reader.mobi_html
self.flows = []
self.flowinfo = []
ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]
# now split the raw_ml into its flow pieces
for start, end in ft:
self.flows.append(raw_ml[start:end])
# the first piece represents the xhtml text
text = self.flows[0]
self.flows[0] = b''
# walk the <skeleton> and <div> tables to build original source xhtml
# files *without* destroying any file position information needed for
# later href processing and create final list of file separation start:
# stop points and etc in partinfo
self.parts = []
self.partinfo = []
divptr = 0
baseptr = 0
for skelnum, skelname, divcnt, skelpos, skellen in self.files:
baseptr = skelpos + skellen
skeleton = text[skelpos:baseptr]
inspos_warned = False
for i in range(divcnt):
insertpos, idtext, filenum, seqnum, startpos, length = \
self.elems[divptr]
if i == 0:
aidtext = idtext[12:-2]
filename = 'part%04d.html' % filenum
part = text[baseptr:baseptr + length]
insertpos = insertpos - skelpos
head = skeleton[:insertpos]
tail = skeleton[insertpos:]
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
head.rfind(b'<')):
# There is an incomplete tag in either the head or tail.
# This can happen for some badly formed KF8 files, see for
# example, https://bugs.launchpad.net/bugs/1082669
if not inspos_warned:
self.log.warn(
'The div table for %s has incorrect insert '
'positions. Calculating manually.'%skelname)
inspos_warned = True
bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
if bp != ep:
insertpos = ep + 1 + startpos
skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
baseptr = baseptr + length
divptr += 1
self.parts.append(skeleton)
if divcnt < 1:
# Empty file
aidtext = unicode_type(uuid4())
filename = aidtext + '.html'
self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
baseptr, aidtext))
# The primary css style sheet is typically stored next followed by any
# snippets of code that were previously inlined in the
# original xhtml but have been stripped out and placed here.
# This can include local CDATA snippets and svg sections.
# The problem is that for most browsers and ereaders, you can not
# use <img src="imageXXXX.svg" /> to import any svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all browsers
# and ereaders and causes epub validation issues because those raster
# images are in manifest but not in xhtml text - since they only
# referenced from an svg image
# So we need to check the remaining flow pieces to see if they are css
# or svg images. if svg images, we must check if they have an <image/>
# and if so inline them into the xhtml text pieces.
# there may be other sorts of pieces stored here but until we see one
# in the wild to reverse engineer we won't be able to tell
self.flowinfo.append(FlowInfo(None, None, None, None))
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
for j in range(1, len(self.flows)):
flowpart = self.flows[j]
nstr = '%04d' % j
m = svg_tag_pattern.search(flowpart)
if m is not None:
# svg
typ = 'svg'
start = m.start()
m2 = image_tag_pattern.search(flowpart)
if m2 is not None:
format = 'inline'
dir = None
fname = None
# strip off anything before <svg if inlining
flowpart = re.sub(br'(</?)svg:', r'\1', flowpart[start:])
else:
format = 'file'
dir = "images"
fname = 'svgimg' + nstr + '.svg'
else:
# search for CDATA and if exists inline it
if flowpart.find(b'[CDATA[') >= 0:
typ = 'css'
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
format = 'inline'
dir = None
fname = None
else:
# css - assume as standalone css file
typ = 'css'
format = 'file'
dir = "styles"
fname = nstr + '.css'
self.flows[j] = flowpart
self.flowinfo.append(FlowInfo(typ, format, dir, fname))
def get_file_info(self, pos):
''' Get information about the part (file) that exists at pos in
the raw markup '''
for part in self.partinfo:
if pos >= part.start and pos < part.end:
return part
return Part(*repeat(None, len(Part._fields)))
def get_id_tag_by_pos_fid(self, posfid, offset):
# first convert kindle:pos:fid and offset info to position in file
insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
pos = insertpos + offset
fi = self.get_file_info(pos)
# an existing "id=" must exist in original xhtml otherwise it would not
# have worked for linking. Amazon seems to have added its own
# additional "aid=" inside tags whose contents seem to represent some
# position information encoded into Base32 name.
# so find the closest "id=" before position the file by actually
# searching in that file
idtext = self.get_id_tag(pos)
return '%s/%s'%(fi.type, fi.filename), idtext
def get_id_tag(self, pos):
# Find the first tag with a named anchor (name or id attribute) before
# pos
fi = self.get_file_info(pos)
if fi.num is None and fi.start is None:
raise ValueError('No file contains pos: %d'%pos)
textblock = self.parts[fi.num]
npos = pos - fi.start
pgt = textblock.find(b'>', npos)
plt = textblock.find(b'<', npos)
# if npos inside a tag then search all text before the its end of tag marker
# else not in a tag need to search the preceding tag
if plt == npos or pgt < plt:
npos = pgt + 1
textblock = textblock[0:npos]
for tag in reverse_tag_iter(textblock):
m = self.id_re.match(tag) or self.name_re.match(tag)
if m is not None:
return m.group(1)
# For some files, kindlegen apparently creates links to tags
# without HTML anchors, using the AID instead. See
# See https://www.mobileread.com/forums/showthread.php?t=259557
m = self.aid_re.match(tag)
if m is not None:
self.linked_aids.add(m.group(1))
return m.group(1) + b'-' + self.aid_anchor_suffix
# No tag found, link to start of file
return b''
def create_guide(self):
guide = Guide()
has_start = False
for ref_type, ref_title, pos_fid in self.guide:
try:
if len(pos_fid) != 2:
continue
except TypeError:
continue # thumbnailstandard record, ignore it
linktgt, idtext = self.get_id_tag_by_pos_fid(*pos_fid)
if idtext:
if isinstance(idtext, bytes):
idtext = idtext.decode(self.header.codec)
linktgt += '#' + idtext
g = Guide.Reference(linktgt, getcwd())
g.title, g.type = ref_title, ref_type
if g.title == 'start' or g.type == 'text':
has_start = True
guide.append(g)
so = self.header.exth.start_offset
if so not in {None, NULL_INDEX} and not has_start:
fi = self.get_file_info(so)
if fi.filename is not None:
idtext = self.get_id_tag(so).decode(self.header.codec)
linktgt = fi.filename
if idtext:
linktgt += '#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), getcwd())
g.title, g.type = 'start', 'text'
guide.append(g)
return guide
def create_ncx(self):
index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
self.header.codec)
remove = []
# Add href and anchor info to the index entries
for entry in index_entries:
pos_fid = entry['pos_fid']
if pos_fid is None:
pos = entry['pos']
fi = self.get_file_info(pos)
if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos)
idtag = self.get_id_tag(pos)
href = '%s/%s'%(fi.type, fi.filename)
else:
try:
href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
except ValueError:
self.log.warn('Invalid entry in NCX (title: %s), ignoring'
%entry['text'])
remove.append(entry)
continue
entry['href'] = href
entry['idtag'] = as_unicode(idtag, self.header.codec or 'utf-8')
for e in remove:
index_entries.remove(e)
# Build the TOC object
return build_toc(index_entries)
def extract_resources(self, sections):
from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
resource_map = []
container = None
for x in ('fonts', 'images'):
os.mkdir(x)
for start, end in self.resource_offsets:
for i, sec in enumerate(sections[start:end]):
fname_idx = i+1
data = sec[0]
typ = data[:4]
href = None
if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
pass # Ignore these records
elif typ == b'FONT':
font = read_font_record(data)
href = "fonts/%05d.%s" % (fname_idx, font['ext'])
if font['err']:
self.log.warn('Reading font record %d failed: %s'%(
fname_idx, font['err']))
if font['headers']:
self.log.debug('Font record headers: %s'%font['headers'])
with open(href.replace('/', os.sep), 'wb') as f:
f.write(font['font_data'] if font['font_data'] else
font['raw_data'])
if font['encrypted']:
self.encrypted_fonts.append(href)
elif typ == b'CONT':
if data == b'CONTBOUNDARY':
container = None
continue
container = Container(data)
elif typ == b'CRES':
data, imgtype = container.load_image(data)
if data is not None:
href = 'images/%05d.%s'%(container.resource_index, imgtype)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(data)
elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
container.resource_index += 1
elif container is None:
if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
imgtype = find_imgtype(data)
href = 'images/%05d.%s'%(fname_idx, imgtype)
with open(href.replace('/', os.sep), 'wb') as f:
f.write(data)
resource_map.append(href)
return resource_map
def expand_text(self, resource_map):
return expand_mobi8_markup(self, resource_map, self.log)
def write_opf(self, guide, toc, spine, resource_map):
mi = self.header.exth.mi
if (self.cover_offset is not None and self.cover_offset <
len(resource_map)):
mi.cover = resource_map[self.cover_offset]
if len(list(toc)) < 2:
self.log.warn('KF8 has no metadata Table of Contents')
for ref in guide:
if ref.type == 'toc':
href = ref.href()
href, frag = urldefrag(href)
if os.path.exists(href.replace('/', os.sep)):
try:
toc = self.read_inline_toc(href, frag)
except:
self.log.exception('Failed to read inline ToC')
opf = OPFCreator(getcwd(), mi)
opf.guide = guide
def exclude(path):
return os.path.basename(path) == 'debug-raw.html'
# If there are no images then the azw3 input plugin dumps all
# binary records as .unknown images, remove them
if self.for_tweak and os.path.exists('images') and os.path.isdir('images'):
files = os.listdir('images')
unknown = [x for x in files if x.endswith('.unknown')]
if len(files) == len(unknown):
[os.remove('images/'+f) for f in files]
if self.for_tweak:
try:
os.remove('debug-raw.html')
except:
pass
opf.create_manifest_from_files_in([getcwd()], exclude=exclude)
for entry in opf.manifest:
if entry.mime_type == 'text/html':
entry.mime_type = 'application/xhtml+xml'
opf.create_spine(spine)
opf.set_toc(toc)
ppd = getattr(self.header.exth, 'page_progression_direction', None)
if ppd in {'ltr', 'rtl', 'default'}:
opf.page_progression_direction = ppd
pwm = getattr(self.header.exth, 'primary_writing_mode', None)
if pwm is not None:
opf.primary_writing_mode = pwm
with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
def read_inline_toc(self, href, frag):
ans = TOC()
base_href = '/'.join(href.split('/')[:-1])
with open(href.replace('/', os.sep), 'rb') as f:
raw = f.read().decode(self.header.codec)
root = parse_html(raw, log=self.log)
body = XPath('//h:body')(root)
reached = False
if body:
start = body[0]
else:
start = None
reached = True
if frag:
elems = XPath('//*[@id="%s"]'%frag)(root)
if elems:
start = elems[0]
def node_depth(elem):
ans = 0
parent = elem.getparent()
while parent is not None:
parent = parent.getparent()
ans += 1
return ans
# Layer the ToC based on nesting order in the source HTML
current_depth = None
parent = ans
seen = set()
links = []
for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == XHTML('a') and elem.get('href',
False):
href = elem.get('href')
href, frag = urldefrag(href)
href = base_href + '/' + href
text = xml2text(elem).strip()
if (text, href, frag) in seen:
continue
seen.add((text, href, frag))
links.append((text, href, frag, node_depth(elem)))
elif elem is start:
reached = True
depths = sorted(set(x[-1] for x in links))
depth_map = {x:i for i, x in enumerate(depths)}
for text, href, frag, depth in links:
depth = depth_map[depth]
if current_depth is None:
current_depth = 0
parent.add_item(href, frag, text)
elif current_depth == depth:
parent.add_item(href, frag, text)
elif current_depth < depth:
parent = parent[-1] if len(parent) > 0 else parent
parent.add_item(href, frag, text)
current_depth += 1
else:
delta = current_depth - depth
while delta > 0 and parent.parent is not None:
parent = parent.parent
delta -= 1
parent.add_item(href, frag, text)
current_depth = depth
return ans

View File

@@ -0,0 +1,100 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre import replace_entities
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from polyglot.builtins import iteritems, getcwd
tag_fieldname_map = {
1: ['pos',0],
2: ['len',0],
3: ['noffs',0],
4: ['hlvl',0],
5: ['koffs',0],
6: ['pos_fid',0],
21: ['parent',0],
22: ['child1',0],
23: ['childn',0],
69: ['image_index',0],
70 : ['desc_offset', 0], # 'Description offset in cncx'
71 : ['author_offset', 0], # 'Author offset in cncx'
72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',
}
default_entry = {
'pos': -1,
'len': 0,
'noffs': -1,
'text' : "Unknown Text",
'hlvl' : -1,
'kind' : "Unknown Class",
'pos_fid' : None,
'parent' : -1,
'child1' : -1,
'childn' : -1,
'description': None,
'author': None,
'image_caption': None,
'image_attribution': None,
}
def read_ncx(sections, index, codec):
index_entries = []
if index != NULL_INDEX:
table, cncx = read_index(sections, index, codec)
for num, x in enumerate(iteritems(table)):
text, tag_map = x
entry = default_entry.copy()
entry['name'] = text
entry['num'] = num
for tag in tag_fieldname_map:
fieldname, i = tag_fieldname_map[tag]
if tag in tag_map:
fieldvalue = tag_map[tag][i]
if tag == 6:
# Appears to be an idx into the KF8 elems table with an
# offset
fieldvalue = tuple(tag_map[tag])
entry[fieldname] = fieldvalue
for which, name in iteritems({3:'text', 5:'kind', 70:'description',
71:'author', 72:'image_caption',
73:'image_attribution'}):
if tag == which:
entry[name] = cncx.get(fieldvalue,
default_entry[name])
index_entries.append(entry)
return index_entries
def build_toc(index_entries):
ans = TOC(base_path=getcwd())
levels = {x['hlvl'] for x in index_entries}
num_map = {-1: ans}
level_map = {l:[x for x in index_entries if x['hlvl'] == l] for l in
levels}
for lvl in sorted(levels):
for item in level_map[lvl]:
parent = num_map[item['parent']]
child = parent.add_item(item['href'], item['idtag'],
replace_entities(item['text'], encoding=None))
num_map[item['num']] = child
# Set play orders in depth first order
for i, item in enumerate(ans.flat()):
item.play_order = i
return ans