1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-18 15:35:48 +01:00
Files
ebook-converter/ebook_converter/ebooks/mobi/writer2/serializer.py
gryf ce89f5c9d1 Use the real constants module.
This is progressing refactor of the calibre code to make it more
readable, and transform it to something more coherent.

In this patch, there are changes regarding imports for some modules,
instead of polluting namespace of each module with some other modules
symbols, which often were imported from other modules. Yuck.
2020-05-29 17:04:53 +02:00

393 lines
15 KiB
Python

import collections
import io
import re
import unicodedata
import urllib.parse
from ebook_converter import constants as const
from ebook_converter.ebooks.mobi.mobiml import MBP_NS
from ebook_converter.ebooks.mobi.utils import is_guide_ref_start
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
class Buf(io.BytesIO):
    """A BytesIO that transparently UTF-8 encodes any str given to write()."""

    def write(self, x):
        data = x.encode('utf-8') if isinstance(x, str) else x
        super().write(data)
class Serializer(object):
    """Serialize an OEB book into a single in-memory HTML bytestream with
    links replaced by filepos offsets, as required by the MOBI writer."""

    # Namespace URI -> prefix mapping used when serializing tags/attributes
    # (see serialize_elem, which passes this to base.prefixname and skips
    # anything whose namespace is not a key here).
    NSRMAP = {'': None,
              const.XML_NS: 'xml',
              const.XHTML_NS: '',
              MBP_NS: 'mbp'}  # TODO(gryf): check why this is different than
                              # MBP_NS from const.
def __init__(self, oeb, images, is_periodical,
             write_page_breaks_after_item=True):
    '''
    Write all the HTML markup in oeb into a single in memory buffer
    containing a single html document with links replaced by offsets into
    the buffer.

    :param oeb: OEBBook object that encapsulates the document to be
        processed.
    :param images: Mapping of image hrefs (urlnormalized) to image record
        indices.
    :param is_periodical: True when the book is a periodical; controls TOC
        serialization in serialize_body and the default start offset.
    :param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
        is written after every element of the spine in ``oeb``.
    '''
    self.oeb = oeb
    # Map of image hrefs to image index in the MOBI file
    self.images = images
    # hrefs of images actually referenced by the serialized markup
    self.used_images = set()
    self.logger = oeb.logger
    self.is_periodical = is_periodical
    self.write_page_breaks_after_item = write_page_breaks_after_item
    # If not None, this is a number pointing to the location at which to
    # open the MOBI file on the Kindle
    self.start_offset = None
    # Mapping of hrefs (urlnormalized) to the offset in the buffer where
    # the resource pointed to by the href lives. Used at the end to fill in
    # the correct values into all filepos="..." links.
    self.id_offsets = {}
    # Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
    # where filepos="..." elements are written corresponding to links that
    # point to the href. This is used at the end to fill in the correct
    # values.
    self.href_offsets = collections.defaultdict(list)
    # List of offsets in the buffer of non linear items in the spine. These
    # become uncrossable breaks in the MOBI
    self.breaks = []
    self.find_blocks()
def find_blocks(self):
    '''
    Mark every item in the spine if it is the start/end of a
    section/article, so that it can be wrapped in divs appropriately.

    Sets four boolean attributes (is_section_start/end,
    is_article_start/end) on every spine item.
    '''
    for item in self.oeb.spine:
        item.is_section_start = item.is_section_end = False
        item.is_article_start = item.is_article_end = False

    def spine_item(tocitem):
        # Resolve a TOC entry to the spine item holding its content, or
        # None when the (defragmented) href matches no spine item.
        href = urllib.parse.urldefrag(tocitem.href)[0]
        for item in self.oeb.spine:
            if item.href == href:
                return item
        return None

    # Pass 1: mark the starts of sections and their articles.
    for item in self.oeb.toc.iterdescendants():
        if item.klass == 'section':
            articles = list(item)
            if not articles:
                continue
            sec = spine_item(item)
            # Guard against unresolvable section hrefs; previously this
            # dereferenced a possible None and raised AttributeError.
            if sec is not None:
                sec.is_section_start = True
            for article in articles:
                si = spine_item(article)
                if si is not None:
                    si.is_article_start = True

    # Pass 2: derive the ends from the starts. A new section/article start
    # implies the previous item ended the preceding one.
    items = list(self.oeb.spine)
    in_sec = in_art = False
    for i, item in enumerate(items):
        # NOTE: the old `items[i-1]` inside a bare try/except silently
        # yielded the LAST item for i == 0 (negative indexing), never
        # raising; use an explicit guard instead.
        prev_item = items[i - 1] if i > 0 else None
        if in_art and item.is_article_start is True:
            prev_item.is_article_end = True
            in_art = False
        if in_sec and item.is_section_start is True:
            prev_item.is_section_end = True
            in_sec = False
        if item.is_section_start:
            in_sec = True
        if item.is_article_start:
            in_art = True
    # The final spine item always closes the last section and article.
    # Guarded so an empty spine no longer raises NameError.
    if items:
        items[-1].is_section_end = items[-1].is_article_end = True
def __call__(self):
    '''
    Return the document serialized as a single UTF-8 encoded bytestring.
    '''
    out = Buf()
    self.buf = out
    out.write(b'<html>')
    self.serialize_head()
    self.serialize_body()
    out.write(b'</html>')
    self.end_offset = out.tell()
    self.fixup_links()
    if self.start_offset is None and not self.is_periodical:
        # Without an explicit start offset the Kindle opens the book at
        # the first IndexEntry, which could be anywhere; force it to the
        # beginning of the body instead.
        self.start_offset = self.body_start_offset
    return out.getvalue()
def serialize_head(self):
    """Write the <head> element, including the guide when one exists."""
    out = self.buf
    out.write(b'<head>')
    if len(self.oeb.guide):
        self.serialize_guide()
    out.write(b'</head>')
def serialize_guide(self):
    '''
    The Kindle decides where to open a book based on the presence of
    an item in the guide that looks like
    <reference type="text" title="Start" href="chapter-one.xhtml"/>

    Similarly an item with type="toc" controls where the Goto Table of
    Contents operation on the kindle goes.
    '''
    out = self.buf
    manifest_hrefs = self.oeb.manifest.hrefs
    out.write(b'<guide>')
    for ref in self.oeb.guide.values():
        path = urllib.parse.urldefrag(ref.href)[0]
        # Only emit references that resolve to actual OEB documents.
        if path not in manifest_hrefs:
            continue
        if manifest_hrefs[path].media_type not in base.OEB_DOCS:
            continue
        out.write(b'<reference type="')
        ref_type = ref.type
        if ref_type.startswith('other.'):
            ref_type = ref_type.replace('other.', '')
        self.serialize_text(ref_type, quot=True)
        out.write(b'" ')
        if ref.title is not None:
            out.write(b'title="')
            self.serialize_text(ref.title, quot=True)
            out.write(b'" ')
        if is_guide_ref_start(ref):
            # Remembered so fixup_links can record the Kindle start offset.
            self._start_href = ref.href
        self.serialize_href(ref.href)
        # Space required or won't work, I kid you not
        out.write(b' />')
    out.write(b'</guide>')
def serialize_href(self, href, _base=None):
    """
    Serialize the href attribute of an <a> or <reference> tag. It is
    serialized as filepos="000000000" and a pointer to its location is
    stored in self.href_offsets so that the correct value can be filled in
    at the end.

    Returns True when a filepos placeholder was written, False when the
    href could not be resolved (caller then falls back to writing the raw
    attribute).
    """
    manifest_hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urllib.parse.urldefrag(base.urlnormalize(href))
    except ValueError:
        # Unparseable URL
        return False
    if path and _base:
        path = _base.abshref(path)
    if path and path not in manifest_hrefs:
        return False
    item = manifest_hrefs[path] if path else None
    if item and item.spine_position is None:
        # Target exists but is not part of the spine output.
        return False
    path = item.href if item else _base.href
    href = '#'.join((path, frag)) if frag else path
    out = self.buf
    out.write(b'filepos=')
    # Record where the placeholder starts; fixup_links overwrites it.
    self.href_offsets[href].append(out.tell())
    out.write(b'0000000000')
    return True
def serialize_body(self):
    '''
    Serialize all items in the spine of the document. Non linear items are
    moved to the end.

    Also records body_start_offset/body_end_offset and, for periodicals,
    emits TOC level markup before the content.
    '''
    buf = self.buf

    def serialize_toc_level(tocref, href=None):
        # add the provided toc level to the output stream
        # if href is provided add a link ref to the toc level output
        # (e.g. feed_0/index.html)
        if href is not None:
            # resolve the section url in id_offsets
            buf.write(b'<mbp:pagebreak />')
            self.id_offsets[base.urlnormalize(href)] = buf.tell()

        if tocref.klass == "periodical":
            buf.write(b'<div> <div height="1em"></div>')
        else:
            # Section level: emit the section title as a heading.
            t = tocref.title
            if isinstance(t, str):
                t = t.encode('utf-8')
            buf.write(b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' + t +
                      b'</b></font></h2> <div height="1em"></div>')

        buf.write(b'<ul>')
        for tocitem in tocref.nodes:
            buf.write(b'<li><a filepos=')
            itemhref = tocitem.href
            if tocref.klass == 'periodical':
                # This is a section node.
                # For periodical tocs, the section urls are like
                # r'feed_\d+/index.html'. We don't want to point to the
                # start of the first article so we change the href.
                itemhref = re.sub(r'article_\d+/', '', itemhref)
            # Placeholder filepos; fixup_links patches it later.
            self.href_offsets[itemhref].append(buf.tell())
            buf.write(b'0000000000')
            buf.write(b' ><font size="+1"><b><u>')
            t = tocitem.title
            if isinstance(t, str):
                t = t.encode('utf-8')
            buf.write(t)
            buf.write(b'</u></b></font></a></li>')
        buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

    self.anchor_offset = buf.tell()
    buf.write(b'<body>')
    self.body_start_offset = buf.tell()
    if self.is_periodical:
        # Emit the top-level (periodical) TOC before any content.
        top_toc = self.oeb.toc.nodes[0]
        serialize_toc_level(top_toc)

    # Linear items first, non-linear items moved to the end.
    spine = [item for item in self.oeb.spine if item.linear]
    spine.extend([item for item in self.oeb.spine if not item.linear])
    for item in spine:
        if self.is_periodical and item.is_section_start:
            for section_toc in top_toc.nodes:
                if base.urlnormalize(item.href) == section_toc.href:
                    # create section url of the form r'feed_\d+/index.html'
                    section_url = re.sub(r'article_\d+/', '', section_toc.href)
                    serialize_toc_level(section_toc, section_url)
                    section_toc.href = section_url
                    break
        self.serialize_item(item)
    self.body_end_offset = buf.tell()
    buf.write(b'</body>')
def serialize_item(self, item):
    '''
    Serialize an individual item from the spine of the input document.

    A reference to this item is stored in self.href_offsets
    '''
    buf = self.buf
    if not item.linear:
        # Non-linear items become uncrossable breaks in the MOBI.
        self.breaks.append(buf.tell() - 1)
    self.id_offsets[base.urlnormalize(item.href)] = buf.tell()
    # Empty anchors act as Kindle periodical section/article markers.
    if item.is_section_start:
        buf.write(b'<a ></a> ')
    if item.is_article_start:
        buf.write(b'<a ></a> <a ></a>')
    # item.data.find(...) returns the <body> element; iterating it walks
    # its direct children.
    for elem in item.data.find(base.tag('xhtml', 'body')):
        self.serialize_elem(elem, item)
    if self.write_page_breaks_after_item:
        buf.write(b'<mbp:pagebreak/>')
    if item.is_article_end:
        # Kindle periodical article end marker
        buf.write(b'<a ></a> <a ></a>')
    if item.is_section_end:
        buf.write(b' <a ></a>')
    self.anchor_offset = None
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
    """Recursively serialize a single element (and its subtree).

    Elements/attributes whose namespace is not in nsrmap are skipped.
    href attributes become filepos placeholders; src attributes pointing
    at known images become recindex attributes.
    """
    buf = self.buf
    # Skip comments/processing instructions (non-string tags) and
    # elements from namespaces we do not serialize.
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in nsrmap:
        return
    tag = base.prefixname(elem.tag, nsrmap)
    # Previous layers take care of @name
    id_ = elem.attrib.pop('id', None)
    if id_:
        # Record the offset of this anchor target. Use anchor_offset
        # (position before any adjacent empty anchors) when available.
        href = '#'.join((item.href, id_))
        offset = self.anchor_offset or buf.tell()
        key = base.urlnormalize(href)
        # Only set this id_offset if it wasn't previously seen
        self.id_offsets[key] = self.id_offsets.get(key, offset)
    # Drop empty, attribute-less <a> tags whose position is already
    # covered by anchor_offset.
    if self.anchor_offset is not None and \
            tag == 'a' and not elem.attrib and \
            not len(elem) and not elem.text:
        return
    self.anchor_offset = buf.tell()
    buf.write(b'<')
    buf.write(tag.encode('utf-8'))
    if elem.attrib:
        for attr, val in elem.attrib.items():
            if parse_utils.namespace(attr) not in nsrmap:
                continue
            attr = base.prefixname(attr, nsrmap)
            buf.write(b' ')
            if attr == 'href':
                # Written as a filepos placeholder when resolvable.
                if self.serialize_href(val, item):
                    continue
            elif attr == 'src':
                href = base.urlnormalize(item.abshref(val))
                if href in self.images:
                    index = self.images[href]
                    self.used_images.add(href)
                    buf.write(b'recindex="%05d"' % index)
                    continue
            # Fallback: write the attribute verbatim (escaped).
            buf.write(attr.encode('utf-8'))
            buf.write(b'="')
            self.serialize_text(val, quot=True)
            buf.write(b'"')
    buf.write(b'>')
    if elem.text or len(elem) > 0:
        if elem.text:
            # Real text invalidates the pending anchor position.
            self.anchor_offset = None
            self.serialize_text(elem.text)
        for child in elem:
            self.serialize_elem(child, item)
            if child.tail:
                self.anchor_offset = None
                self.serialize_text(child.tail)
    buf.write(('</%s>' % tag).encode('utf-8'))
def serialize_text(self, text, quot=False):
    """
    Escape *text* for XML/HTML and write it UTF-8 encoded to self.buf.

    :param text: str to escape and write.
    :param quot: When True also escape double quotes, for use inside
        attribute values.
    """
    # '&' must be replaced first so later entities are not double-escaped.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    text = text.replace('\u00AD', '')  # Soft-hyphen
    if quot:
        text = text.replace('"', '&quot;')
    # text is necessarily a str here (the str.replace calls above would
    # have raised for bytes), so the old `isinstance(text, str)` guard was
    # dead code; normalize unconditionally.
    text = unicodedata.normalize('NFC', text)
    self.buf.write(text.encode('utf-8'))
def fixup_links(self):
    '''
    Fill in the correct values for all filepos="..." links with the offsets
    of the linked to content (as stored in id_offsets).

    Also sets self.start_offset when the guide's start reference resolves.
    '''
    buf = self.buf
    id_offsets = self.id_offsets
    # href of the guide's start reference, recorded by serialize_guide
    # (may be absent, hence getattr).
    start_href = getattr(self, '_start_href', None)
    for href, hoffs in self.href_offsets.items():
        is_start = (href and href == start_href)
        # Iterate over all filepos items
        if href not in id_offsets:
            self.logger.warn('Hyperlink target %r not found' % href)
            # Link to the top of the document, better than just ignoring
            href, _ = urllib.parse.urldefrag(href)
        if href in self.id_offsets:
            ioff = self.id_offsets[href]
            if is_start:
                self.start_offset = ioff
            for hoff in hoffs:
                # Overwrite the 10-digit zero placeholder in place.
                buf.seek(hoff)
                buf.write(('%010d' % ioff).encode('utf-8'))