mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-30 02:34:05 +02:00
Added mobi writer files
This commit is contained in:
@@ -0,0 +1,891 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import numbers
|
||||
from struct import pack
|
||||
import io
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
|
||||
encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
|
||||
from polyglot.builtins import filter, iteritems, itervalues, map, range
|
||||
|
||||
|
||||
class CNCX(CNCX_):  # {{{

    '''
    Collect, in breadth-first TOC order, every string that must live in the
    CNCX (compiled NCX) records: each node's title and, for periodicals,
    its class plus any author/description text.
    '''

    def __init__(self, toc, is_periodical):
        strings = []
        add = strings.append
        for node in toc.iterdescendants(breadth_first=True):
            add(node.title)
            if is_periodical:
                add(node.klass)
                # Author first, then description — the offsets handed out by
                # the base class depend on this insertion order.
                for extra in (node.author, node.description):
                    if extra:
                        add(extra)
        CNCX_.__init__(self, strings)
# }}}
|
||||
|
||||
|
||||
class TAGX(object):  # {{{

    '''
    Builder for the TAGX section of a MOBI INDX header record.  Each tag is
    serialized as four bytes: tag number, number of values, bitmask, and an
    end-of-control-bytes flag.
    '''

    # Tag number -> bit in the entry's control byte(s).
    BITMASKS = {11:0b1}
    BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])})
    BITMASKS.update({x:(1 << i) for i, x in enumerate([69, 70, 71, 72, 73])})

    # Tag number -> how many values it carries (1 for most, 3 for tag 11,
    # 0 for the end-of-control-bytes marker).
    NUM_VALUES = defaultdict(lambda :1)
    NUM_VALUES[11] = 3
    NUM_VALUES[0] = 0

    def __init__(self):
        self.byts = bytearray()

    def add_tag(self, tag):
        """Append the four-byte descriptor for *tag*; tag 0 is the terminator."""
        is_terminator = not tag
        self.byts.extend((
            tag,
            self.NUM_VALUES[tag],
            0 if is_terminator else self.BITMASKS[tag],  # bitmask
            1 if is_terminator else 0,                   # eof flag
        ))

    def header(self, control_byte_count):
        """Return the 12-byte TAGX header (ident, table length, control bytes)."""
        return b'TAGX' + pack(b'>II', 12 + len(self.byts), control_byte_count)

    @property
    def periodical(self):
        '''
        TAGX block for the Primary index header of a periodical
        '''
        for tag in (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, 73, 0):
            self.add_tag(tag)
        return self.header(2) + bytes(self.byts)

    @property
    def secondary(self):
        '''
        TAGX block for the secondary index header of a periodical
        '''
        for tag in (11, 0):
            self.add_tag(tag)
        return self.header(1) + bytes(self.byts)

    @property
    def flat_book(self):
        '''
        TAGX block for the primary index header of a flat book
        '''
        for tag in (1, 2, 3, 4, 0):
            self.add_tag(tag)
        return self.header(1) + bytes(self.byts)

# }}}
|
||||
|
||||
# Index Entries {{{
|
||||
|
||||
class IndexEntry(object):

    '''
    One entry in the primary MOBI index (NCX).  Holds the entry's byte
    offset/length in the text, offsets into the CNCX for its strings, and
    its position in the TOC hierarchy.  ``bytestring`` serializes the entry
    in the on-disk INDX entry format.
    '''

    # Attribute name -> MOBI tag number used in the TAGX/entry encoding.
    TAG_VALUES = {
            'offset': 1,
            'size': 2,
            'label_offset': 3,
            'depth': 4,
            'class_offset': 5,
            'secondary': 11,
            'parent_index': 21,
            'first_child_index': 22,
            'last_child_index': 23,
            'image_index': 69,
            'desc_offset': 70,
            'author_offset': 71,
    }
    # Reverse map: tag number -> attribute name.
    RTAG_MAP = {v:k for k, v in iteritems(TAG_VALUES)}  # noqa

    def __init__(self, offset, label_offset):
        # offset/length of this entry's text; label_offset indexes the CNCX.
        self.offset, self.label_offset = offset, label_offset
        self.depth, self.class_offset = 0, None
        self.control_byte_count = 1

        self.length = 0
        self.index = 0

        # TOC linkage (indices of related entries); None means "not present"
        # and suppresses the corresponding tag in the serialized entry.
        self.parent_index = None
        self.first_child_index = None
        self.last_child_index = None

        # Periodical-only extras (image record index, CNCX offsets).
        self.image_index = None
        self.author_offset = None
        self.desc_offset = None

    def __repr__(self):
        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
                ' parent_index=%r)')%(self.offset, self.depth, self.length,
                        self.index, self.parent_index)

    @property
    def size(self):
        # ``size`` is an alias for ``length`` so tag 2 can be read generically.
        return self.length

    @size.setter
    def size(self, val):
        self.length = val

    @property
    def next_offset(self):
        # First byte after this entry's text.
        return self.offset + self.length

    @property
    def tag_nums(self):
        '''Yield the tag numbers present in this entry, in encoding order.'''
        # Tags 1-4 (offset, size, label_offset, depth) are always written.
        for i in range(1, 5):
            yield i
        # Optional tags are emitted only when the attribute is set.
        for attr in ('class_offset', 'parent_index', 'first_child_index',
                'last_child_index'):
            if getattr(self, attr) is not None:
                yield self.TAG_VALUES[attr]

    @property
    def entry_type(self):
        '''First control byte: OR of the bitmasks of all present tags.'''
        ans = 0
        for tag in self.tag_nums:
            ans |= TAGX.BITMASKS[tag]
        return ans

    def attr_for_tag(self, tag):
        return self.RTAG_MAP[tag]

    @property
    def bytestring(self):
        '''Serialize this entry in the on-disk INDX entry format.'''
        buf = io.BytesIO()
        # Entry name: hex-encoded index for ints, else a length-prefixed
        # ASCII string (used by secondary index entries).
        if isinstance(self.index, numbers.Integral):
            buf.write(encode_number_as_hex(self.index))
        else:
            raw = bytearray(self.index.encode('ascii'))
            raw.insert(0, len(raw))
            buf.write(bytes(raw))
        et = self.entry_type
        buf.write(bytes(bytearray([et])))

        # Second control byte (periodical entries only): flags for the
        # optional image/description/author tags.
        if self.control_byte_count == 2:
            flags = 0
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    tag = self.TAG_VALUES[attr]
                    bm = TAGX.BITMASKS[tag]
                    flags |= bm
            buf.write(bytes(bytearray([flags])))

        # Tag values, as variable-width integers, in tag_nums order.
        for tag in self.tag_nums:
            attr = self.attr_for_tag(tag)
            val = getattr(self, attr)
            if isinstance(val, numbers.Integral):
                val = [val]
            for x in val:
                buf.write(encint(x))

        # Values for the second-control-byte tags come last.
        if self.control_byte_count == 2:
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    buf.write(encint(val))

        ans = buf.getvalue()
        return ans
|
||||
|
||||
|
||||
class PeriodicalIndexEntry(IndexEntry):

    '''
    Index entry for periodicals: carries an explicit depth (0=periodical,
    1=section, 2=article), a class-name offset into the CNCX, and uses two
    control bytes so the image/description/author tags can be encoded.
    '''

    def __init__(self, offset, label_offset, class_offset, depth):
        IndexEntry.__init__(self, offset, label_offset)
        self.control_byte_count = 2
        self.class_offset = class_offset
        self.depth = depth
|
||||
|
||||
|
||||
class SecondaryIndexEntry(IndexEntry):

    '''
    Entry in the secondary index of a periodical.  The entry name is one of
    the fixed strings in INDEX_MAP rather than a numeric index.
    '''

    INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70,
            'mastheadImage':69}

    def __init__(self, index):
        IndexEntry.__init__(self, 0, 0)
        self.index = index

        tag = self.INDEX_MAP[index]

        # The values for this index entry
        # I dont know what the 5 means, it is not the number of entries
        first = 5 if tag == min(self.INDEX_MAP.values()) else 0
        self.secondary = [first, 0, tag]

    @property
    def tag_nums(self):
        # Secondary entries carry only tag 11.
        yield 11

    @property
    def entry_type(self):
        return 1

    @classmethod
    def entries(cls):
        '''Yield one entry per INDEX_MAP string, in descending tag order.'''
        rmap = {v:k for k, v in cls.INDEX_MAP.items()}
        for tag in sorted(rmap, reverse=True):
            yield cls(rmap[tag])
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
class TBS(object):  # {{{

    '''
    Take the list of index nodes starting/ending on a record and calculate the
    trailing byte sequence for the record.
    '''

    def __init__(self, data, is_periodical, first=False, section_map={},
            after_first=False):
        # NOTE(review): section_map is a shared mutable default; callers here
        # always pass their own map and never mutate it, so it is safe as-is.
        self.section_map = section_map

        if is_periodical:
            # The starting bytes.
            # The value is zero which I think indicates the periodical
            # index entry. The values for the various flags seem to be
            # unused. If the 0b100 is present, it means that the record
            # deals with section 1 (or is the final record with section
            # transitions).
            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
                    flag_size=3)
            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
                    flag_size=3)
            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
                0}, flag_size=3)

            if not data:
                byts = b''
                if after_first:
                    # This can happen if a record contains only text between
                    # the periodical start and the first section
                    byts = self.type_011
                self.bytestring = byts
            else:
                # Group the nodes touching this record by TOC depth
                # (0=periodical, 1=section, 2=article), sorted by offset.
                depth_map = defaultdict(list)
                for x in ('starts', 'ends', 'completes'):
                    for idx in data[x]:
                        depth_map[idx.depth].append(idx)
                for l in itervalues(depth_map):
                    l.sort(key=lambda x:x.offset)
                self.periodical_tbs(data, first, depth_map)
        else:
            if not data:
                self.bytestring = b''
            else:
                self.book_tbs(data, first)

    def periodical_tbs(self, data, first, depth_map):
        '''Compute self.bytestring for one record of a periodical.'''
        buf = io.BytesIO()

        has_section_start = (depth_map[1] and
                set(depth_map[1]).intersection(set(data['starts'])))
        spanner = data['spans']
        parent_section_index = -1

        if depth_map[0]:
            # We have a terminal record

            # Find the first non periodical node
            first_node = None
            for nodes in (depth_map[1], depth_map[2]):
                for node in nodes:
                    if (first_node is None or (node.offset, node.depth) <
                            (first_node.offset, first_node.depth)):
                        first_node = node

            typ = (self.type_110 if has_section_start else self.type_010)

            # parent_section_index is needed for the last record
            if first_node is not None and first_node.depth > 0:
                parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index)
            else:
                # No section/article nodes: fall back to the last section.
                parent_section_index = max(iter(self.section_map))

        else:
            # Non terminal record

            if spanner is not None:
                # record is spanned by a single article
                parent_section_index = spanner.parent_index
                typ = (self.type_110 if parent_section_index == 1 else
                        self.type_010)
            elif not depth_map[1]:
                # has only article nodes, i.e. spanned by a section
                parent_section_index = depth_map[2][0].parent_index
                typ = (self.type_111 if parent_section_index == 1 else
                        self.type_010)
            else:
                # has section transitions
                if depth_map[2]:
                    parent_section_index = depth_map[2][0].parent_index
                else:
                    parent_section_index = depth_map[1][0].index
                typ = self.type_011

        buf.write(typ)

        if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
            extra = {}
            # Write starting section information
            if spanner is None:
                num_articles = len([a for a in depth_map[1] if a.parent_index == parent_section_index])
                if not depth_map[1]:
                    extra = {0b0001: 0}
                if num_articles > 1:
                    extra = {0b0100: num_articles}
            buf.write(encode_tbs(parent_section_index, extra))

        if spanner is None:
            # One group per section present in this record, each encoding its
            # first article (relative to the section index) and, on section
            # transitions, the last article relative to the next section.
            articles = depth_map[2]
            sections = {self.section_map[a.parent_index] for a in
                    articles}
            sections = sorted(sections, key=lambda x:x.offset)
            section_map = {s:[a for a in articles if a.parent_index ==
                s.index] for s in sections}
            for i, section in enumerate(sections):
                # All the articles in this record that belong to section
                articles = section_map[section]
                first_article = articles[0]
                last_article = articles[-1]
                num = len(articles)
                last_article_ends = (last_article in data['ends'] or
                        last_article in data['completes'])

                try:
                    next_sec = sections[i+1]
                except:
                    next_sec = None

                extra = {}
                if num > 1:
                    extra[0b0100] = num
                if False and i == 0 and next_sec is not None:
                    # Write offset to next section from start of record
                    # I can't figure out exactly when Kindlegen decides to
                    # write this so I have disabled it for now.
                    extra[0b0001] = next_sec.offset - data['offset']

                buf.write(encode_tbs(first_article.index-section.index, extra))

                if next_sec is not None:
                    buf.write(encode_tbs(last_article.index-next_sec.index,
                        {0b1000: 0}))

                # If a section TOC starts and extends into the next record add
                # a trailing vwi. We detect this by TBS type==3, processing last
                # section present in the record, and the last article in that
                # section either ends or completes and doesn't finish
                # on the last byte of the record.
                elif (typ == self.type_011 and last_article_ends and
                        ((last_article.offset+last_article.size) % RECORD_SIZE > 0)
                        ):
                    buf.write(encode_tbs(last_article.index-section.index-1,
                        {0b1000: 0}))

        else:
            buf.write(encode_tbs(spanner.index - parent_section_index,
                {0b0001: 0}))

        self.bytestring = buf.getvalue()

    def book_tbs(self, data, first):
        '''Compute self.bytestring for one record of a flat book.'''
        spanner = data['spans']
        if spanner is not None:
            self.bytestring = encode_tbs(spanner.index, {0b010: 0, 0b001: 0},
                    flag_size=3)
        else:
            starts, completes, ends = (data['starts'], data['completes'],
                    data['ends'])
            if (not completes and (
                (len(starts) == 1 and not ends) or (len(ends) == 1 and not
                    starts))):
                # Exactly one chapter boundary in this record.
                node = starts[0] if starts else ends[0]
                self.bytestring = encode_tbs(node.index, {0b010: 0}, flag_size=3)
            else:
                # Multiple boundaries: encode the first node and a count.
                nodes = []
                for x in (starts, completes, ends):
                    nodes.extend(x)
                nodes.sort(key=lambda x:x.index)
                self.bytestring = encode_tbs(nodes[0].index, {0b010:0,
                    0b100: len(nodes)}, flag_size=3)

# }}}
|
||||
|
||||
|
||||
class Indexer(object):  # {{{

    '''
    Build the MOBI (KF7) index: the INDX header record, the index entry
    record, the CNCX string records and, for periodicals, the secondary
    index.  Also computes the per-text-record trailing byte sequences (TBS).
    After construction, ``self.records`` holds the finished index records.
    '''

    def __init__(self, serializer, number_of_text_records,
            size_of_last_text_record, masthead_offset, is_periodical,
            opts, oeb):
        self.serializer = serializer
        self.number_of_text_records = number_of_text_records
        # Total text length: all records are RECORD_SIZE except the last.
        self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
                size_of_last_text_record)
        self.masthead_offset = masthead_offset
        self.secondary_record_offset = None

        self.oeb = oeb
        self.log = oeb.log
        self.opts = opts

        self.is_periodical = is_periodical
        if self.is_periodical and self.masthead_offset is None:
            raise ValueError('Periodicals must have a masthead')

        self.log('Generating MOBI index for a %s'%('periodical' if
            self.is_periodical else 'book'))
        self.is_flat_periodical = False
        if self.is_periodical:
            # A flat periodical has exactly one section under the root node.
            periodical_node = next(iter(oeb.toc))
            sections = tuple(periodical_node)
            self.is_flat_periodical = len(sections) == 1

        self.records = []

        if self.is_periodical:
            # Ensure all articles have an author and description before
            # creating the CNCX
            for node in oeb.toc.iterdescendants():
                if node.klass == 'article':
                    aut, desc = node.author, node.description
                    if not aut:
                        aut = _('Unknown')
                    if not desc:
                        desc = _('No details available')
                    node.author, node.description = aut, desc

        self.cncx = CNCX(oeb.toc, self.is_periodical)

        if self.is_periodical:
            self.indices = self.create_periodical_index()
        else:
            self.indices = self.create_book_index()

        if not self.indices:
            raise ValueError('No valid entries in TOC, cannot generate index')

        # Record layout: header, entries, CNCX, then (periodicals only)
        # the secondary header and entries.
        self.records.append(self.create_index_record())
        self.records.insert(0, self.create_header())
        self.records.extend(self.cncx.records)

        if is_periodical:
            self.secondary_record_offset = len(self.records)
            self.records.append(self.create_header(secondary=True))
            self.records.append(self.create_index_record(secondary=True))

        self.calculate_trailing_byte_sequences()

    def create_index_record(self, secondary=False):  # {{{
        '''Serialize the index entries plus their IDXT offset table into one
        INDX record and return it as bytes.'''
        header_length = 192
        buf = io.BytesIO()
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices

        # Write index entries
        offsets = []
        for i in indices:
            offsets.append(buf.tell())
            buf.write(i.bytestring)

        index_block = align_block(buf.getvalue())

        # Write offsets to index entries as an IDXT block
        idxt_block = b'IDXT'
        buf.seek(0), buf.truncate(0)
        for offset in offsets:
            buf.write(pack(b'>H', header_length+offset))
        idxt_block = align_block(idxt_block + buf.getvalue())
        body = index_block + idxt_block

        header = b'INDX'
        buf.seek(0), buf.truncate(0)
        buf.write(pack(b'>I', header_length))
        buf.write(b'\0'*4)  # Unknown
        buf.write(pack(b'>I', 1))  # Header type? Or index record number?
        buf.write(b'\0'*4)  # Unknown
        # IDXT block offset
        buf.write(pack(b'>I', header_length + len(index_block)))
        # Number of index entries
        buf.write(pack(b'>I', len(offsets)))
        # Unknown
        buf.write(b'\xff'*8)
        # Unknown
        buf.write(b'\0'*156)

        header += buf.getvalue()

        ans = header + body
        # IDXT offsets are 16-bit, so the record must fit in 64KB.
        if len(ans) > 0x10000:
            raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
        return ans
    # }}}

    def create_header(self, secondary=False):  # {{{
        '''Build the 192-byte INDX header record (plus TAGX block, last-entry
        name and trailing IDXT) for the primary or secondary index.'''
        buf = io.BytesIO()
        if secondary:
            tagx_block = TAGX().secondary
        else:
            tagx_block = (TAGX().periodical if self.is_periodical else
                    TAGX().flat_book)
        header_length = 192

        # Ident 0 - 4
        buf.write(b'INDX')

        # Header length 4 - 8
        buf.write(pack(b'>I', header_length))

        # Unknown 8-16
        buf.write(b'\0'*8)

        # Index type: 0 - normal, 2 - inflection 16 - 20
        buf.write(pack(b'>I', 2))

        # IDXT offset 20-24
        buf.write(pack(b'>I', 0))  # Filled in later

        # Number of index records 24-28
        buf.write(pack(b'>I', 1 if secondary else len(self.records)))

        # Index Encoding 28-32
        buf.write(pack(b'>I', 65001))  # utf-8

        # Unknown 32-36
        buf.write(b'\xff'*4)

        # Number of index entries 36-40
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices
        buf.write(pack(b'>I', len(indices)))

        # ORDT offset 40-44
        buf.write(pack(b'>I', 0))

        # LIGT offset 44-48
        buf.write(pack(b'>I', 0))

        # Number of LIGT entries 48-52
        buf.write(pack(b'>I', 0))

        # Number of CNCX records 52-56
        buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records)))

        # Unknown 56-180
        buf.write(b'\0'*124)

        # TAGX offset 180-184
        buf.write(pack(b'>I', header_length))

        # Unknown 184-192
        buf.write(b'\0'*8)

        # TAGX block
        buf.write(tagx_block)

        num = len(indices)

        # The index of the last entry in the NCX
        idx = indices[-1].index
        if isinstance(idx, numbers.Integral):
            idx = encode_number_as_hex(idx)
        else:
            # String index (secondary entries): length-prefixed ASCII.
            idx = idx.encode('ascii')
            idx = (bytes(bytearray([len(idx)]))) + idx
        buf.write(idx)

        # The number of entries in the NCX
        buf.write(pack(b'>H', num))

        # Padding
        pad = (4 - (buf.tell()%4))%4
        if pad:
            buf.write(b'\0'*pad)

        idxt_offset = buf.tell()

        buf.write(b'IDXT')
        buf.write(pack(b'>H', header_length + len(tagx_block)))
        buf.write(b'\0')
        # Back-patch the IDXT offset at bytes 20-24.
        buf.seek(20)
        buf.write(pack(b'>I', idxt_offset))

        return align_block(buf.getvalue())
    # }}}

    def create_book_index(self):  # {{{
        '''Return the flat list of IndexEntry objects for a book, sorted by
        offset, with lengths and index numbers filled in.'''
        indices = []
        seen = set()
        id_offsets = self.serializer.id_offsets

        # Flatten toc so that chapter to chapter jumps work with all sub
        # chapter levels as well
        for node in self.oeb.toc.iterdescendants():
            try:
                offset = id_offsets[node.href]
                label = self.cncx[node.title]
            except:
                self.log.warn('TOC item %s [%s] not found in document'%(
                    node.title, node.href))
                continue

            if offset in seen:
                continue
            seen.add(offset)

            indices.append(IndexEntry(offset, label))

        indices.sort(key=lambda x:x.offset)

        # Set lengths: each entry runs to the next entry (or end of body).
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Remove empty indices
        indices = [x for x in indices if x.length > 0]

        # Reset lengths in case any were removed
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Set index values
        for index, x in enumerate(indices):
            x.index = index

        return indices

    # }}}

    def create_periodical_index(self):  # {{{
        '''Return [periodical, sections..., articles...] index entries with
        offsets, lengths, parent/child links and indices filled in.'''
        periodical_node = next(iter(self.oeb.toc))
        periodical_node_offset = self.serializer.body_start_offset
        periodical_node_size = (self.serializer.body_end_offset -
                periodical_node_offset)

        normalized_sections = []

        id_offsets = self.serializer.id_offsets

        # The root (depth 0) entry spans the whole body.
        periodical = PeriodicalIndexEntry(periodical_node_offset,
                self.cncx[periodical_node.title],
                self.cncx[periodical_node.klass], 0)
        periodical.length = periodical_node_size
        periodical.first_child_index = 1
        periodical.image_index = self.masthead_offset

        seen_sec_offsets = set()
        seen_art_offsets = set()

        # Collect (section, articles) pairs, skipping TOC nodes whose href
        # was not found in the document and duplicate offsets.
        for sec in periodical_node:
            normalized_articles = []
            try:
                offset = id_offsets[sec.href]
                label = self.cncx[sec.title]
                klass = self.cncx[sec.klass]
            except:
                continue
            if offset in seen_sec_offsets:
                continue

            seen_sec_offsets.add(offset)
            section = PeriodicalIndexEntry(offset, label, klass, 1)
            section.parent_index = 0

            for art in sec:
                try:
                    offset = id_offsets[art.href]
                    label = self.cncx[art.title]
                    klass = self.cncx[art.klass]
                except:
                    continue
                if offset in seen_art_offsets:
                    continue
                seen_art_offsets.add(offset)
                article = PeriodicalIndexEntry(offset, label, klass, 2)
                normalized_articles.append(article)
                article.author_offset = self.cncx[art.author]
                article.desc_offset = self.cncx[art.description]
                if getattr(art, 'toc_thumbnail', None) is not None:
                    try:
                        ii = self.serializer.images[art.toc_thumbnail] - 1
                        if ii > -1:
                            article.image_index = ii
                    except KeyError:
                        pass  # Image not found in serializer

            if normalized_articles:
                normalized_articles.sort(key=lambda x:x.offset)
                normalized_sections.append((section, normalized_articles))

        normalized_sections.sort(key=lambda x:x[0].offset)

        # Set lengths
        for s, x in enumerate(normalized_sections):
            sec, normalized_articles = x
            try:
                sec.length = normalized_sections[s+1][0].offset - sec.offset
            except:
                sec.length = self.serializer.body_end_offset - sec.offset
            for i, art in enumerate(normalized_articles):
                try:
                    art.length = normalized_articles[i+1].offset - art.offset
                except:
                    art.length = sec.offset + sec.length - art.offset

        # Filter out zero-length articles, then zero-length/empty sections.
        for i, x in list(enumerate(normalized_sections)):
            sec, normalized_articles = x
            normalized_articles = list(filter(lambda x: x.length > 0,
                normalized_articles))
            normalized_sections[i] = (sec, normalized_articles)

        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
            normalized_sections))

        # Set indices: sections are numbered first, then all articles.
        i = 0
        for sec, articles in normalized_sections:
            i += 1
            sec.index = i
            sec.parent_index = 0

        for sec, articles in normalized_sections:
            for art in articles:
                i += 1
                art.index = i

                art.parent_index = sec.index

        for sec, normalized_articles in normalized_sections:
            sec.first_child_index = normalized_articles[0].index
            sec.last_child_index = normalized_articles[-1].index

        # Set lengths again to close up any gaps left by filtering
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_offset = normalized_sections[s+1][0].offset
            except:
                next_offset = self.serializer.body_end_offset
            sec.length = next_offset - sec.offset

            for a, art in enumerate(articles):
                try:
                    next_offset = articles[a+1].offset
                except:
                    next_offset = sec.next_offset
                art.length = next_offset - art.offset

        # Sanity check: sections and articles must tile their parents with
        # no gaps or overlaps.
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_sec = normalized_sections[s+1][0]
            except:
                if (sec.length == 0 or sec.next_offset !=
                        self.serializer.body_end_offset):
                    raise ValueError('Invalid section layout')
            else:
                if next_sec.offset != sec.next_offset or sec.length == 0:
                    raise ValueError('Invalid section layout')
            for a, art in enumerate(articles):
                try:
                    next_art = articles[a+1]
                except:
                    if (art.length == 0 or art.next_offset !=
                            sec.next_offset):
                        raise ValueError('Invalid article layout')
                else:
                    if art.length == 0 or art.next_offset != next_art.offset:
                        raise ValueError('Invalid article layout')

        # Flatten
        indices = [periodical]
        for sec, articles in normalized_sections:
            indices.append(sec)
            periodical.last_child_index = sec.index

        for sec, articles in normalized_sections:
            for a in articles:
                indices.append(a)

        return indices
    # }}}

    # TBS {{{
    def calculate_trailing_byte_sequences(self):
        '''For every text record, classify which index nodes start/end/span
        it and build its TBS; results go into self.tbs_map (1-based).'''
        self.tbs_map = {}
        found_node = False
        sections = [i for i in self.indices if i.depth == 1]
        section_map = OrderedDict((i.index, i) for i in
                sorted(sections, key=lambda x:x.offset))

        deepest = max(i.depth for i in self.indices)

        for i in range(self.number_of_text_records):
            offset = i * RECORD_SIZE
            next_offset = offset + RECORD_SIZE
            data = {'ends':[], 'completes':[], 'starts':[],
                    'spans':None, 'offset':offset, 'record_number':i+1}

            for index in self.indices:

                if index.offset >= next_offset:
                    # Node starts after current record
                    if index.depth == deepest:
                        # self.indices is offset-sorted per depth, so no
                        # deeper node can touch this record either.
                        break
                    else:
                        continue
                if index.next_offset <= offset:
                    # Node ends before current record
                    continue
                if index.offset >= offset:
                    # Node starts in current record
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['completes'].append(index)
                    else:
                        data['starts'].append(index)
                else:
                    # Node starts before current records
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['ends'].append(index)
                    elif index.depth == deepest:
                        data['spans'] = index

            if (data['ends'] or data['completes'] or data['starts'] or
                    data['spans'] is not None):
                self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
                        found_node, section_map=section_map)
                found_node = True
            else:
                # Record with no index activity: only needs the "after the
                # first node" marker for periodicals.
                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
                        after_first=found_node, section_map=section_map)

    def get_trailing_byte_sequence(self, num):
        '''Return the TBS bytes for 1-based text record number *num*.'''
        return self.tbs_map[num].bytestring
    # }}}

# }}}
|
||||
@@ -0,0 +1,480 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import io, random, time
|
||||
from struct import pack
|
||||
|
||||
from calibre.ebooks import normalize
|
||||
from calibre.ebooks.mobi.writer2.serializer import Serializer
|
||||
from calibre.ebooks.compression.palmdoc import compress_doc
|
||||
from calibre.ebooks.mobi.langcodes import iana2mobi
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
|
||||
align_block, detect_periodical, RECORD_SIZE, create_text_record)
|
||||
from calibre.ebooks.mobi.writer2.indexer import Indexer
|
||||
from polyglot.builtins import iteritems, unicode_type, range
|
||||
|
||||
# Uncrossable-break trailing entries are disabled: no known reader needs them.
WRITE_UNCROSSABLE_BREAKS = False
# Sentinel stored in header fields that point at no record.
NULL_INDEX = 0xffffffff

# Canonical FLIS record; its contents are fixed for every MOBI file.
FLIS = (b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
        b'\xff'*4)
|
||||
|
||||
|
||||
def fcis(text_length):
|
||||
fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||
fcis += pack(b'>I', text_length)
|
||||
fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||
return fcis
|
||||
|
||||
|
||||
class MobiWriter(object):
|
||||
|
||||
    def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
        '''
        :param opts: conversion options (reads dont_compress and
            prefer_author_sort)
        :param resources: pre-built image/resource records
        :param kf8: KF8 writer for joint MOBI6+KF8 output, or None for a
            plain MOBI6 file
        '''
        self.opts = opts
        self.resources = resources
        self.kf8 = kf8
        # Joint output (MOBI6 + KF8 in one container) iff a kf8 writer given.
        self.for_joint = kf8 is not None
        self.write_page_breaks_after_item = write_page_breaks_after_item
        self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
        self.prefer_author_sort = opts.prefer_author_sort
        # Index of the last text record; updated while generating text.
        self.last_text_record_idx = 1
|
||||
|
||||
    def __call__(self, oeb, path_or_stream):
        '''
        Write *oeb* as a MOBI file to *path_or_stream* (a filesystem path or
        any object with a ``write`` method).
        '''
        self.log = oeb.log
        pt = None
        if oeb.metadata.publication_type:
            # Metadata value looks like 'scheme:type[:...]'; keep the type.
            x = unicode_type(oeb.metadata.publication_type[0]).split(':')
            if len(x) > 1:
                pt = x[1].lower()
        self.publication_type = pt

        if hasattr(path_or_stream, 'write'):
            return self.dump_stream(oeb, path_or_stream)
        with open(path_or_stream, 'w+b') as stream:
            return self.dump_stream(oeb, stream)
|
||||
|
||||
def write(self, *args):
|
||||
for datum in args:
|
||||
self.stream.write(datum)
|
||||
|
||||
    def tell(self):
        # Current write position in the output stream.
        return self.stream.tell()
|
||||
|
||||
def dump_stream(self, oeb, stream):
|
||||
self.oeb = oeb
|
||||
self.stream = stream
|
||||
self.records = [None]
|
||||
self.generate_content()
|
||||
self.generate_joint_record0() if self.for_joint else self.generate_record0()
|
||||
self.write_header()
|
||||
self.write_content()
|
||||
|
||||
    def generate_content(self):
        '''Build all content records in the required order: images, text,
        uncrossable-break trailing data, then the index records.'''
        self.is_periodical = detect_periodical(self.oeb.toc, self.oeb.log)
        # Image records are stored in their own list, they are merged into the
        # main record list at the end
        self.generate_images()
        self.generate_text()
        # The uncrossable breaks trailing entries come before the indexing
        # trailing entries
        self.write_uncrossable_breaks()
        # Index records come after text records
        self.generate_index()
|
||||
|
||||
# Indexing {{{
|
||||
def generate_index(self):
|
||||
self.primary_index_record_idx = None
|
||||
if self.oeb.toc.count() < 1:
|
||||
self.log.warn('No TOC, MOBI index not generated')
|
||||
return
|
||||
try:
|
||||
self.indexer = Indexer(self.serializer, self.last_text_record_idx,
|
||||
len(self.records[self.last_text_record_idx]),
|
||||
self.masthead_offset, self.is_periodical,
|
||||
self.opts, self.oeb)
|
||||
except:
|
||||
self.log.exception('Failed to generate MOBI index:')
|
||||
else:
|
||||
self.primary_index_record_idx = len(self.records)
|
||||
for i in range(self.last_text_record_idx + 1):
|
||||
if i == 0:
|
||||
continue
|
||||
tbs = self.indexer.get_trailing_byte_sequence(i)
|
||||
self.records[i] += encode_trailing_data(tbs)
|
||||
self.records.extend(self.indexer.records)
|
||||
|
||||
# }}}
|
||||
|
||||
def write_uncrossable_breaks(self): # {{{
|
||||
'''
|
||||
Write information about uncrossable breaks (non linear items in
|
||||
the spine.
|
||||
'''
|
||||
if not WRITE_UNCROSSABLE_BREAKS:
|
||||
return
|
||||
|
||||
breaks = self.serializer.breaks
|
||||
|
||||
for i in range(1, self.last_text_record_idx+1):
|
||||
offset = i * RECORD_SIZE
|
||||
pbreak = 0
|
||||
running = offset
|
||||
|
||||
buf = io.BytesIO()
|
||||
|
||||
while breaks and (breaks[0] - offset) < RECORD_SIZE:
|
||||
pbreak = (breaks.pop(0) - running) >> 3
|
||||
encoded = encint(pbreak)
|
||||
buf.write(encoded)
|
||||
running += pbreak << 3
|
||||
encoded = encode_trailing_data(buf.getvalue())
|
||||
self.records[i] += encoded
|
||||
# }}}
|
||||
|
||||
# Images {{{
|
||||
|
||||
def generate_images(self):
|
||||
resources = self.resources
|
||||
image_records = resources.records
|
||||
self.image_map = resources.item_map
|
||||
self.masthead_offset = resources.masthead_offset
|
||||
self.cover_offset = resources.cover_offset
|
||||
self.thumbnail_offset = resources.thumbnail_offset
|
||||
|
||||
if image_records and image_records[0] is None:
|
||||
raise ValueError('Failed to find masthead image in manifest')
|
||||
|
||||
# }}}
|
||||
|
||||
def generate_text(self): # {{{
|
||||
self.oeb.logger.info('Serializing markup content...')
|
||||
self.serializer = Serializer(self.oeb, self.image_map,
|
||||
self.is_periodical,
|
||||
write_page_breaks_after_item=self.write_page_breaks_after_item)
|
||||
text = self.serializer()
|
||||
self.text_length = len(text)
|
||||
text = io.BytesIO(text)
|
||||
nrecords = 0
|
||||
records_size = 0
|
||||
|
||||
if self.compression != UNCOMPRESSED:
|
||||
self.oeb.logger.info(' Compressing markup content...')
|
||||
|
||||
while text.tell() < self.text_length:
|
||||
data, overlap = create_text_record(text)
|
||||
if self.compression == PALMDOC:
|
||||
data = compress_doc(data)
|
||||
|
||||
data += overlap
|
||||
data += pack(b'>B', len(overlap))
|
||||
|
||||
self.records.append(data)
|
||||
records_size += len(data)
|
||||
nrecords += 1
|
||||
|
||||
self.last_text_record_idx = nrecords
|
||||
self.first_non_text_record_idx = nrecords + 1
|
||||
# Pad so that the next records starts at a 4 byte boundary
|
||||
if records_size % 4 != 0:
|
||||
self.records.append(b'\x00'*(records_size % 4))
|
||||
self.first_non_text_record_idx += 1
|
||||
# }}}
|
||||
|
||||
def generate_record0(self): # MOBI header {{{
|
||||
metadata = self.oeb.metadata
|
||||
bt = 0x002
|
||||
if self.primary_index_record_idx is not None:
|
||||
if False and self.indexer.is_flat_periodical:
|
||||
# Disabled as setting this to 0x102 causes the Kindle to not
|
||||
# auto archive the issues
|
||||
bt = 0x102
|
||||
elif self.indexer.is_periodical:
|
||||
# If you change this, remember to change the cdetype in the EXTH
|
||||
# header as well
|
||||
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
|
||||
|
||||
from calibre.ebooks.mobi.writer8.exth import build_exth
|
||||
exth = build_exth(metadata,
|
||||
prefer_author_sort=self.opts.prefer_author_sort,
|
||||
is_periodical=self.is_periodical,
|
||||
share_not_sync=self.opts.share_not_sync,
|
||||
cover_offset=self.cover_offset,
|
||||
thumbnail_offset=self.thumbnail_offset,
|
||||
start_offset=self.serializer.start_offset, mobi_doctype=bt
|
||||
)
|
||||
first_image_record = None
|
||||
if self.resources:
|
||||
used_images = self.serializer.used_images
|
||||
first_image_record = len(self.records)
|
||||
self.resources.serialize(self.records, used_images)
|
||||
last_content_record = len(self.records) - 1
|
||||
|
||||
# FCIS/FLIS (Seems to serve no purpose)
|
||||
flis_number = len(self.records)
|
||||
self.records.append(FLIS)
|
||||
fcis_number = len(self.records)
|
||||
self.records.append(fcis(self.text_length))
|
||||
|
||||
# EOF record
|
||||
self.records.append(b'\xE9\x8E\x0D\x0A')
|
||||
|
||||
record0 = io.BytesIO()
|
||||
# The MOBI Header
|
||||
record0.write(pack(b'>HHIHHHH',
|
||||
self.compression, # compression type # compression type
|
||||
0, # Unused
|
||||
self.text_length, # Text length
|
||||
self.last_text_record_idx, # Number of text records or last tr idx
|
||||
RECORD_SIZE, # Text record size
|
||||
0, # Unused
|
||||
0 # Unused
|
||||
)) # 0 - 15 (0x0 - 0xf)
|
||||
uid = random.randint(0, 0xffffffff)
|
||||
title = normalize(unicode_type(metadata.title[0])).encode('utf-8')
|
||||
|
||||
# 0x0 - 0x3
|
||||
record0.write(b'MOBI')
|
||||
|
||||
# 0x4 - 0x7 : Length of header
|
||||
# 0x8 - 0x11 : MOBI type
|
||||
# type meaning
|
||||
# 0x002 MOBI book (chapter - chapter navigation)
|
||||
# 0x101 News - Hierarchical navigation with sections and articles
|
||||
# 0x102 News feed - Flat navigation
|
||||
# 0x103 News magazine - same as 0x101
|
||||
# 0xC - 0xF : Text encoding (65001 is utf-8)
|
||||
# 0x10 - 0x13 : UID
|
||||
# 0x14 - 0x17 : Generator version
|
||||
|
||||
record0.write(pack(b'>IIIII',
|
||||
0xe8, bt, 65001, uid, 6))
|
||||
|
||||
# 0x18 - 0x1f : Unknown
|
||||
record0.write(b'\xff' * 8)
|
||||
|
||||
# 0x20 - 0x23 : Secondary index record
|
||||
sir = 0xffffffff
|
||||
if (self.primary_index_record_idx is not None and
|
||||
self.indexer.secondary_record_offset is not None):
|
||||
sir = (self.primary_index_record_idx +
|
||||
self.indexer.secondary_record_offset)
|
||||
record0.write(pack(b'>I', sir))
|
||||
|
||||
# 0x24 - 0x3f : Unknown
|
||||
record0.write(b'\xff' * 28)
|
||||
|
||||
# 0x40 - 0x43 : Offset of first non-text record
|
||||
record0.write(pack(b'>I',
|
||||
self.first_non_text_record_idx))
|
||||
|
||||
# 0x44 - 0x4b : title offset, title length
|
||||
record0.write(pack(b'>II',
|
||||
0xe8 + 16 + len(exth), len(title)))
|
||||
|
||||
# 0x4c - 0x4f : Language specifier
|
||||
record0.write(iana2mobi(
|
||||
unicode_type(metadata.language[0])))
|
||||
|
||||
# 0x50 - 0x57 : Input language and Output language
|
||||
record0.write(b'\0' * 8)
|
||||
|
||||
# 0x58 - 0x5b : Format version
|
||||
# 0x5c - 0x5f : First image record number
|
||||
record0.write(pack(b'>II',
|
||||
6, first_image_record if first_image_record else len(self.records)))
|
||||
|
||||
# 0x60 - 0x63 : First HUFF/CDIC record number
|
||||
# 0x64 - 0x67 : Number of HUFF/CDIC records
|
||||
# 0x68 - 0x6b : First DATP record number
|
||||
# 0x6c - 0x6f : Number of DATP records
|
||||
record0.write(b'\0' * 16)
|
||||
|
||||
# 0x70 - 0x73 : EXTH flags
|
||||
# Bit 6 (0b1000000) being set indicates the presence of an EXTH header
|
||||
# Bit 12 being set indicates the presence of embedded fonts
|
||||
# The purpose of the other bits is unknown
|
||||
exth_flags = 0b1010000
|
||||
if self.is_periodical:
|
||||
exth_flags |= 0b1000
|
||||
if self.resources.has_fonts:
|
||||
exth_flags |= 0b1000000000000
|
||||
record0.write(pack(b'>I', exth_flags))
|
||||
|
||||
# 0x74 - 0x93 : Unknown
|
||||
record0.write(b'\0' * 32)
|
||||
|
||||
# 0x94 - 0x97 : DRM offset
|
||||
# 0x98 - 0x9b : DRM count
|
||||
# 0x9c - 0x9f : DRM size
|
||||
# 0xa0 - 0xa3 : DRM flags
|
||||
record0.write(pack(b'>IIII',
|
||||
0xffffffff, 0xffffffff, 0, 0))
|
||||
|
||||
# 0xa4 - 0xaf : Unknown
|
||||
record0.write(b'\0'*12)
|
||||
|
||||
# 0xb0 - 0xb1 : First content record number
|
||||
# 0xb2 - 0xb3 : last content record number
|
||||
# (Includes Image, DATP, HUFF, DRM)
|
||||
record0.write(pack(b'>HH', 1, last_content_record))
|
||||
|
||||
# 0xb4 - 0xb7 : Unknown
|
||||
record0.write(b'\0\0\0\x01')
|
||||
|
||||
# 0xb8 - 0xbb : FCIS record number
|
||||
record0.write(pack(b'>I', fcis_number))
|
||||
|
||||
# 0xbc - 0xbf : Unknown (FCIS record count?)
|
||||
record0.write(pack(b'>I', 1))
|
||||
|
||||
# 0xc0 - 0xc3 : FLIS record number
|
||||
record0.write(pack(b'>I', flis_number))
|
||||
|
||||
# 0xc4 - 0xc7 : Unknown (FLIS record count?)
|
||||
record0.write(pack(b'>I', 1))
|
||||
|
||||
# 0xc8 - 0xcf : Unknown
|
||||
record0.write(b'\0'*8)
|
||||
|
||||
# 0xd0 - 0xdf : Unknown
|
||||
record0.write(pack(b'>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff))
|
||||
|
||||
# 0xe0 - 0xe3 : Extra record data
|
||||
# Extra record data flags:
|
||||
# - 0b1 : <extra multibyte bytes><size>
|
||||
# - 0b10 : <TBS indexing description of this HTML record><size>
|
||||
# - 0b100: <uncrossable breaks><size>
|
||||
# Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
|
||||
extra_data_flags = 0b1 # Has multibyte overlap bytes
|
||||
if self.primary_index_record_idx is not None:
|
||||
extra_data_flags |= 0b10
|
||||
if WRITE_UNCROSSABLE_BREAKS:
|
||||
extra_data_flags |= 0b100
|
||||
record0.write(pack(b'>I', extra_data_flags))
|
||||
|
||||
# 0xe4 - 0xe7 : Primary index record
|
||||
record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
|
||||
is None else self.primary_index_record_idx))
|
||||
|
||||
record0.write(exth)
|
||||
record0.write(title)
|
||||
record0 = record0.getvalue()
|
||||
# Add some buffer so that Amazon can add encryption information if this
|
||||
# MOBI is submitted for publication
|
||||
record0 += (b'\0' * (1024*8))
|
||||
self.records[0] = align_block(record0)
|
||||
# }}}
|
||||
|
||||
def generate_joint_record0(self): # {{{
|
||||
from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
|
||||
HEADER_FIELDS)
|
||||
from calibre.ebooks.mobi.writer8.exth import build_exth
|
||||
|
||||
# Insert resource records
|
||||
first_image_record = None
|
||||
old = len(self.records)
|
||||
if self.resources:
|
||||
used_images = self.serializer.used_images | self.kf8.used_images
|
||||
first_image_record = len(self.records)
|
||||
self.resources.serialize(self.records, used_images)
|
||||
resource_record_count = len(self.records) - old
|
||||
last_content_record = len(self.records) - 1
|
||||
|
||||
# FCIS/FLIS (Seems to serve no purpose)
|
||||
flis_number = len(self.records)
|
||||
self.records.append(FLIS)
|
||||
fcis_number = len(self.records)
|
||||
self.records.append(fcis(self.text_length))
|
||||
|
||||
# Insert KF8 records
|
||||
self.records.append(b'BOUNDARY')
|
||||
kf8_header_index = len(self.records)
|
||||
self.kf8.start_offset = (self.serializer.start_offset,
|
||||
self.kf8.start_offset)
|
||||
self.records.append(self.kf8.record0)
|
||||
self.records.extend(self.kf8.records[1:])
|
||||
|
||||
first_image_record = (first_image_record if first_image_record else
|
||||
len(self.records))
|
||||
|
||||
header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS}
|
||||
|
||||
# Now change the header fields that need to be different in the MOBI 6
|
||||
# header
|
||||
header_fields['first_resource_record'] = first_image_record
|
||||
ef = 0b100001010000 # Kinglegen uses this
|
||||
if self.resources.has_fonts:
|
||||
ef |= 0b1000000000000
|
||||
header_fields['exth_flags'] = ef
|
||||
header_fields['fdst_record'] = pack(b'>HH', 1, last_content_record)
|
||||
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
|
||||
header_fields['flis_record'] = flis_number
|
||||
header_fields['fcis_record'] = fcis_number
|
||||
header_fields['text_length'] = self.text_length
|
||||
extra_data_flags = 0b1 # Has multibyte overlap bytes
|
||||
if self.primary_index_record_idx is not None:
|
||||
extra_data_flags |= 0b10
|
||||
header_fields['extra_data_flags'] = extra_data_flags
|
||||
|
||||
for k, v in iteritems({'last_text_record':'last_text_record_idx',
|
||||
'first_non_text_record':'first_non_text_record_idx',
|
||||
'ncx_index':'primary_index_record_idx',
|
||||
}):
|
||||
header_fields[k] = getattr(self, v)
|
||||
if header_fields['ncx_index'] is None:
|
||||
header_fields['ncx_index'] = NULL_INDEX
|
||||
|
||||
for x in ('skel', 'chunk', 'guide'):
|
||||
header_fields[x+'_index'] = NULL_INDEX
|
||||
|
||||
# Create the MOBI 6 EXTH
|
||||
opts = self.opts
|
||||
kuc = 0 if resource_record_count > 0 else None
|
||||
|
||||
header_fields['exth'] = build_exth(self.oeb.metadata,
|
||||
prefer_author_sort=opts.prefer_author_sort,
|
||||
is_periodical=opts.mobi_periodical,
|
||||
share_not_sync=opts.share_not_sync,
|
||||
cover_offset=self.cover_offset,
|
||||
thumbnail_offset=self.thumbnail_offset,
|
||||
num_of_resources=resource_record_count,
|
||||
kf8_unknown_count=kuc, be_kindlegen2=True,
|
||||
kf8_header_index=kf8_header_index,
|
||||
start_offset=self.serializer.start_offset,
|
||||
mobi_doctype=2)
|
||||
self.records[0] = MOBIHeader(file_version=6)(**header_fields)
|
||||
|
||||
# }}}
|
||||
|
||||
def write_header(self): # PalmDB header {{{
|
||||
'''
|
||||
Write the PalmDB header
|
||||
'''
|
||||
title = ascii_filename(unicode_type(self.oeb.metadata.title[0])).replace(
|
||||
' ', '_')
|
||||
if not isinstance(title, bytes):
|
||||
title = title.encode('ascii')
|
||||
title = title[:31]
|
||||
title = title + (b'\0' * (32 - len(title)))
|
||||
now = int(time.time())
|
||||
nrecords = len(self.records)
|
||||
self.write(title, pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
|
||||
b'BOOK', b'MOBI', pack(b'>IIH', (2*nrecords)-1, 0, nrecords))
|
||||
offset = self.tell() + (8 * nrecords) + 2
|
||||
for i, record in enumerate(self.records):
|
||||
self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
|
||||
offset += len(record)
|
||||
self.write(b'\0\0')
|
||||
# }}}
|
||||
|
||||
def write_content(self):
|
||||
for record in self.records:
|
||||
self.write(record)
|
||||
@@ -0,0 +1,396 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from collections import defaultdict
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.ebooks.mobi.mobiml import MBP_NS
|
||||
from calibre.ebooks.mobi.utils import is_guide_ref_start
|
||||
from calibre.ebooks.oeb.base import (
|
||||
OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize
|
||||
)
|
||||
from polyglot.builtins import unicode_type, string_or_bytes
|
||||
from polyglot.urllib import urldefrag
|
||||
|
||||
|
||||
class Buf(BytesIO):
    '''A BytesIO that transparently UTF-8 encodes any text given to write().'''

    def write(self, x):
        # Text is encoded to UTF-8 bytes; everything else passes through
        data = x.encode('utf-8') if isinstance(x, unicode_type) else x
        BytesIO.write(self, data)
|
||||
|
||||
|
||||
class Serializer(object):
    # Namespace -> prefix map used when serializing element/attribute names
    NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}

    def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True):
        '''
        Write all the HTML markup in oeb into a single in memory buffer
        containing a single html document with links replaced by offsets into
        the buffer.

        :param oeb: OEBBook object that encapsulates the document to be
        processed.

        :param images: Mapping of image hrefs (urlnormalized) to image record
        indices.

        :param is_periodical: If True, periodical-specific markup (in-band
        TOC levels, section/article anchors) is emitted.

        :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
        is written after every element of the spine in ``oeb``.
        '''
        self.oeb = oeb
        # Map of image hrefs to image index in the MOBI file
        self.images = images
        self.used_images = set()
        self.logger = oeb.logger
        self.is_periodical = is_periodical
        self.write_page_breaks_after_item = write_page_breaks_after_item

        # If not None, this is a number pointing to the location at which to
        # open the MOBI file on the Kindle
        self.start_offset = None

        # Mapping of hrefs (urlnormalized) to the offset in the buffer where
        # the resource pointed to by the href lives. Used at the end to fill in
        # the correct values into all filepos="..." links.
        self.id_offsets = {}

        # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
        # where filepos="..." elements are written corresponding to links that
        # point to the href. This is used at the end to fill in the correct values.
        self.href_offsets = defaultdict(list)

        # List of offsets in the buffer of non linear items in the spine. These
        # become uncrossable breaks in the MOBI
        self.breaks = []

        self.find_blocks()

    def find_blocks(self):
        '''
        Mark every item in the spine if it is the start/end of a
        section/article, so that it can be wrapped in divs appropriately.
        '''
        # Reset all markers before recomputing them from the TOC
        for item in self.oeb.spine:
            item.is_section_start = item.is_section_end = False
            item.is_article_start = item.is_article_end = False

        def spine_item(tocitem):
            # Resolve a TOC entry to its spine item (fragment stripped);
            # returns None when the href is not found in the spine
            href = urldefrag(tocitem.href)[0]
            for item in self.oeb.spine:
                if item.href == href:
                    return item

        # Mark section/article starts from the TOC structure
        for item in self.oeb.toc.iterdescendants():
            if item.klass == 'section':
                articles = list(item)
                if not articles:
                    continue
                # NOTE(review): spine_item() can return None here, which
                # would raise AttributeError -- confirm that section TOC
                # entries always resolve to a spine item
                spine_item(item).is_section_start = True
                for i, article in enumerate(articles):
                    si = spine_item(article)
                    if si is not None:
                        si.is_article_start = True

        # Derive the end markers: a new start implies the previous spine
        # item ended the corresponding section/article
        items = list(self.oeb.spine)
        in_sec = in_art = False
        for i, item in enumerate(items):
            try:
                # NOTE(review): for i == 0 this yields items[-1] (negative
                # indexing), not an IndexError; harmless only because
                # in_art/in_sec are still False on the first iteration
                prev_item = items[i-1]
            except:
                prev_item = None
            if in_art and item.is_article_start is True:
                prev_item.is_article_end = True
                in_art = False
            if in_sec and item.is_section_start is True:
                prev_item.is_section_end = True
                in_sec = False
            if item.is_section_start:
                in_sec = True
            if item.is_article_start:
                in_art = True

        # The last spine item ('item' left over from the loop above) closes
        # any still-open section and article
        item.is_section_end = item.is_article_end = True

    def __call__(self):
        '''
        Return the document serialized as a single UTF-8 encoded bytestring.
        '''
        buf = self.buf = Buf()
        buf.write(b'<html>')
        self.serialize_head()
        self.serialize_body()
        buf.write(b'</html>')
        self.end_offset = buf.tell()
        self.fixup_links()
        if self.start_offset is None and not self.is_periodical:
            # If we don't set a start offset, the stupid Kindle will
            # open the book at the location of the first IndexEntry, which
            # could be anywhere. So ensure the book is always opened at the
            # beginning, instead.
            self.start_offset = self.body_start_offset
        return buf.getvalue()

    def serialize_head(self):
        # <head> contains only the guide (when present)
        buf = self.buf
        buf.write(b'<head>')
        if len(self.oeb.guide) > 0:
            self.serialize_guide()
        buf.write(b'</head>')

    def serialize_guide(self):
        '''
        The Kindle decides where to open a book based on the presence of
        an item in the guide that looks like
        <reference type="text" title="Start" href="chapter-one.xhtml"/>

        Similarly an item with type="toc" controls where the Goto Table of
        Contents operation on the kindle goes.
        '''

        buf = self.buf
        hrefs = self.oeb.manifest.hrefs
        buf.write(b'<guide>')
        for ref in self.oeb.guide.values():
            path = urldefrag(ref.href)[0]
            # Only references that point to actual HTML documents are kept
            if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
                continue

            buf.write(b'<reference type="')
            # Strip the 'other.' prefix from non-standard reference types
            if ref.type.startswith('other.') :
                self.serialize_text(ref.type.replace('other.',''), quot=True)
            else:
                self.serialize_text(ref.type, quot=True)
            buf.write(b'" ')
            if ref.title is not None:
                buf.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                buf.write(b'" ')
            if is_guide_ref_start(ref):
                # Remember the "start" reference so fixup_links() can set
                # self.start_offset from its resolved target
                self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
            buf.write(b' />')

        buf.write(b'</guide>')

    def serialize_href(self, href, base=None):
        '''
        Serialize the href attribute of an <a> or <reference> tag. It is
        serialized as filepos="000000000" and a pointer to its location is
        stored in self.href_offsets so that the correct value can be filled in
        at the end.

        :return: True if a filepos placeholder was written, False if the
            href could not be resolved to linkable content.
        '''
        hrefs = self.oeb.manifest.hrefs
        try:
            path, frag = urldefrag(urlnormalize(href))
        except ValueError:
            # Unparseable URL
            return False
        if path and base:
            path = base.abshref(path)
        if path and path not in hrefs:
            return False
        buf = self.buf
        item = hrefs[path] if path else None
        if item and item.spine_position is None:
            # Target exists but is not in the spine, so it has no offset
            return False
        path = item.href if item else base.href
        href = '#'.join((path, frag)) if frag else path
        # Write the placeholder; fixup_links() overwrites it with the real
        # ten digit offset later
        buf.write(b'filepos=')
        self.href_offsets[href].append(buf.tell())
        buf.write(b'0000000000')
        return True

    def serialize_body(self):
        '''
        Serialize all items in the spine of the document. Non linear items are
        moved to the end.
        '''
        buf = self.buf

        def serialize_toc_level(tocref, href=None):
            # add the provided toc level to the output stream
            # if href is provided add a link ref to the toc level output (e.g. feed_0/index.html)
            if href is not None:
                # resolve the section url in id_offsets
                buf.write(b'<mbp:pagebreak />')
                self.id_offsets[urlnormalize(href)] = buf.tell()

            if tocref.klass == "periodical":
                buf.write(b'<div> <div height="1em"></div>')
            else:
                t = tocref.title
                if isinstance(t, unicode_type):
                    t = t.encode('utf-8')
                buf.write(b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' + t +
                        b'</b></font></h2> <div height="1em"></div>')

            buf.write(b'<ul>')

            for tocitem in tocref.nodes:
                buf.write(b'<li><a filepos=')
                itemhref = tocitem.href
                if tocref.klass == 'periodical':
                    # This is a section node.
                    # For periodical tocs, the section urls are like r'feed_\d+/index.html'
                    # We dont want to point to the start of the first article
                    # so we change the href.
                    itemhref = re.sub(r'article_\d+/', '', itemhref)
                self.href_offsets[itemhref].append(buf.tell())
                buf.write(b'0000000000')
                buf.write(b' ><font size="+1"><b><u>')
                t = tocitem.title
                if isinstance(t, unicode_type):
                    t = t.encode('utf-8')
                buf.write(t)
                buf.write(b'</u></b></font></a></li>')

            buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

        self.anchor_offset = buf.tell()
        buf.write(b'<body>')
        self.body_start_offset = buf.tell()

        if self.is_periodical:
            # Periodicals get an in-band top level TOC at the start of body
            top_toc = self.oeb.toc.nodes[0]
            serialize_toc_level(top_toc)

        # Linear items first; non linear items are moved to the end
        spine = [item for item in self.oeb.spine if item.linear]
        spine.extend([item for item in self.oeb.spine if not item.linear])

        for item in spine:

            if self.is_periodical and item.is_section_start:
                # Emit the section-level TOC just before its first item
                for section_toc in top_toc.nodes:
                    if urlnormalize(item.href) == section_toc.href:
                        # create section url of the form r'feed_\d+/index.html'
                        section_url = re.sub(r'article_\d+/', '', section_toc.href)
                        serialize_toc_level(section_toc, section_url)
                        section_toc.href = section_url
                        break

            self.serialize_item(item)

        self.body_end_offset = buf.tell()
        buf.write(b'</body>')

    def serialize_item(self, item):
        '''
        Serialize an individual item from the spine of the input document.
        A reference to this item is stored in self.href_offsets
        '''
        buf = self.buf
        if not item.linear:
            # Non linear items become uncrossable breaks in the MOBI
            self.breaks.append(buf.tell() - 1)
        self.id_offsets[urlnormalize(item.href)] = buf.tell()
        # Empty anchor pairs are the Kindle periodical section/article markers
        if item.is_section_start:
            buf.write(b'<a ></a> ')
        if item.is_article_start:
            buf.write(b'<a ></a> <a ></a>')
        for elem in item.data.find(XHTML('body')):
            self.serialize_elem(elem, item)
        if self.write_page_breaks_after_item:
            buf.write(b'<mbp:pagebreak/>')
        if item.is_article_end:
            # Kindle periodical article end marker
            buf.write(b'<a ></a> <a ></a>')
        if item.is_section_end:
            buf.write(b' <a ></a>')
        self.anchor_offset = None

    def serialize_elem(self, elem, item, nsrmap=NSRMAP):
        '''
        Recursively serialize one element: tag, attributes (with href/src
        rewritten to filepos/recindex), text, children and tail.
        '''
        buf = self.buf
        # Skip comments/PIs and elements from unknown namespaces
        if not isinstance(elem.tag, string_or_bytes) \
            or namespace(elem.tag) not in nsrmap:
                return
        tag = prefixname(elem.tag, nsrmap)
        # Previous layers take care of @name
        id_ = elem.attrib.pop('id', None)
        if id_:
            href = '#'.join((item.href, id_))
            offset = self.anchor_offset or buf.tell()
            key = urlnormalize(href)
            # Only set this id_offset if it wasn't previously seen
            self.id_offsets[key] = self.id_offsets.get(key, offset)
        if self.anchor_offset is not None and \
            tag == 'a' and not elem.attrib and \
            not len(elem) and not elem.text:
                # Completely empty anchor at the current anchor offset adds
                # nothing; drop it
                return
        self.anchor_offset = buf.tell()
        buf.write(b'<')
        buf.write(tag.encode('utf-8'))
        if elem.attrib:
            for attr, val in elem.attrib.items():
                if namespace(attr) not in nsrmap:
                    continue
                attr = prefixname(attr, nsrmap)
                buf.write(b' ')
                if attr == 'href':
                    # Rewritten to a filepos placeholder when resolvable
                    if self.serialize_href(val, item):
                        continue
                elif attr == 'src':
                    href = urlnormalize(item.abshref(val))
                    if href in self.images:
                        # Images are referenced by record index, not URL
                        index = self.images[href]
                        self.used_images.add(href)
                        buf.write(b'recindex="%05d"' % index)
                        continue
                buf.write(attr.encode('utf-8'))
                buf.write(b'="')
                self.serialize_text(val, quot=True)
                buf.write(b'"')
        buf.write(b'>')
        if elem.text or len(elem) > 0:
            if elem.text:
                self.anchor_offset = None
                self.serialize_text(elem.text)
            for child in elem:
                self.serialize_elem(child, item)
                if child.tail:
                    self.anchor_offset = None
                    self.serialize_text(child.tail)
        buf.write(('</%s>' % tag).encode('utf-8'))

    def serialize_text(self, text, quot=False):
        '''
        XML-escape ``text`` (and the double quote when ``quot`` is True),
        strip soft hyphens, NFC-normalize and write it as UTF-8.
        '''
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        text = text.replace(u'\u00AD', '') # Soft-hyphen
        if quot:
            text = text.replace('"', '&quot;')
        if isinstance(text, unicode_type):
            text = unicodedata.normalize('NFC', text)
        self.buf.write(text.encode('utf-8'))

    def fixup_links(self):
        '''
        Fill in the correct values for all filepos="..." links with the offsets
        of the linked to content (as stored in id_offsets).
        '''
        buf = self.buf
        id_offsets = self.id_offsets
        start_href = getattr(self, '_start_href', None)
        for href, hoffs in self.href_offsets.items():
            is_start = (href and href == start_href)
            # Iterate over all filepos items
            if href not in id_offsets:
                self.logger.warn('Hyperlink target %r not found' % href)
                # Link to the top of the document, better than just ignoring
                href, _ = urldefrag(href)
            if href in self.id_offsets:
                ioff = self.id_offsets[href]
                if is_start:
                    # This is the <guide> start reference; remember where it
                    # points so the book opens there
                    self.start_offset = ioff
                for hoff in hoffs:
                    # Overwrite each ten digit placeholder in place
                    buf.seek(hoff)
                    buf.write(('%010d' % ioff).encode('utf-8'))
|
||||
Reference in New Issue
Block a user