import collections import io import re import unicodedata import urllib.parse from ebook_converter import constants as const from ebook_converter.ebooks.mobi.mobiml import MBP_NS from ebook_converter.ebooks.mobi.utils import is_guide_ref_start from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import parse_utils class Buf(io.BytesIO): def write(self, x): if isinstance(x, str): x = x.encode('utf-8') io.BytesIO.write(self, x) class Serializer(object): NSRMAP = {'': None, const.XML_NS: 'xml', const.XHTML_NS: '', MBP_NS: 'mbp'} # TODO(gryf): check why this is different than # MBP_NS from const. def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True): ''' Write all the HTML markup in oeb into a single in memory buffer containing a single html document with links replaced by offsets into the buffer. :param oeb: OEBBook object that encapsulates the document to be processed. :param images: Mapping of image hrefs (urlnormalized) to image record indices. :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag is written after every element of the spine in ``oeb``. ''' self.oeb = oeb # Map of image hrefs to image index in the MOBI file self.images = images self.used_images = set() self.logger = oeb.logger self.is_periodical = is_periodical self.write_page_breaks_after_item = write_page_breaks_after_item # If not None, this is a number pointing to the location at which to # open the MOBI file on the Kindle self.start_offset = None # Mapping of hrefs (urlnormalized) to the offset in the buffer where # the resource pointed to by the href lives. Used at the end to fill in # the correct values into all filepos="..." links. self.id_offsets = {} # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer # where filepos="..." elements are written corresponding to links that # point to the href. This is used at the end to fill in the correct values. self.href_offsets = collections.defaultdict(list) # List of offsets in the buffer of non linear items in the spine. These # become uncrossable breaks in the MOBI self.breaks = [] self.find_blocks() def find_blocks(self): ''' Mark every item in the spine if it is the start/end of a section/article, so that it can be wrapped in divs appropriately. ''' for item in self.oeb.spine: item.is_section_start = item.is_section_end = False item.is_article_start = item.is_article_end = False def spine_item(tocitem): href = urllib.parse.urldefrag(tocitem.href)[0] for item in self.oeb.spine: if item.href == href: return item for item in self.oeb.toc.iterdescendants(): if item.klass == 'section': articles = list(item) if not articles: continue spine_item(item).is_section_start = True for i, article in enumerate(articles): si = spine_item(article) if si is not None: si.is_article_start = True items = list(self.oeb.spine) in_sec = in_art = False for i, item in enumerate(items): try: prev_item = items[i-1] except: prev_item = None if in_art and item.is_article_start is True: prev_item.is_article_end = True in_art = False if in_sec and item.is_section_start is True: prev_item.is_section_end = True in_sec = False if item.is_section_start: in_sec = True if item.is_article_start: in_art = True item.is_section_end = item.is_article_end = True def __call__(self): ''' Return the document serialized as a single UTF-8 encoded bytestring. ''' buf = self.buf = Buf() buf.write(b'') self.serialize_head() self.serialize_body() buf.write(b'') self.end_offset = buf.tell() self.fixup_links() if self.start_offset is None and not self.is_periodical: # If we don't set a start offset, the stupid Kindle will # open the book at the location of the first IndexEntry, which # could be anywhere. So ensure the book is always opened at the # beginning, instead. self.start_offset = self.body_start_offset return buf.getvalue() def serialize_head(self): buf = self.buf buf.write(b'
') if len(self.oeb.guide) > 0: self.serialize_guide() buf.write(b'') def serialize_guide(self): ''' The Kindle decides where to open a book based on the presence of an item in the guide that looks like