mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-31 02:25:45 +01:00
This is a progressing refactor of the calibre code, intended to make it more readable and to transform it into something more coherent. This patch changes the imports of some modules: instead of polluting each module's namespace with symbols from other modules (which were themselves often imported from yet other modules — yuck), imports are now made explicit.
924 lines
33 KiB
Python
924 lines
33 KiB
Python
import collections
|
|
import functools
|
|
import operator
|
|
import pkg_resources
|
|
import re
|
|
import urllib.parse
|
|
|
|
from lxml import etree
|
|
from lxml.builder import ElementMaker
|
|
|
|
from ebook_converter import __version__
|
|
from ebook_converter import constants as const
|
|
from ebook_converter.ebooks.oeb import base
|
|
from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup
|
|
from ebook_converter.ebooks.oeb.polish.utils import guess_type, extract
|
|
from ebook_converter.ebooks.oeb.polish.opf import set_guide_item, get_book_language
|
|
from ebook_converter.ebooks.oeb.polish.pretty import pretty_html_tree
|
|
from ebook_converter.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
|
|
|
|
|
|
# Register a custom XPath extension function under the 'calibre' prefix so
# XPath expressions in this module can match tag names case-insensitively,
# e.g. calibre:lower-case(local-name()).
ns = etree.FunctionNamespace('calibre_xpath_extensions')
ns.prefix = 'calibre'
# Lower-case the argument when it is a string; pass other values through.
ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x
|
|
|
|
|
|
class TOC(object):
    """A node in a Table of Contents tree.

    Every node carries a display ``title``, a destination file name
    (``dest``) and an optional anchor (``frag``) inside that file. The
    root of a ToC is usually created with no arguments and holds
    tree-wide data such as ``page_list`` (and ``lang``/``uid``, set by
    the parsers).
    """

    # Heading text for the whole ToC; set on the root node only (read
    # from the EPUB 3 nav document by parse_nav()).
    toc_title = None

    def __init__(self, title=None, dest=None, frag=None):
        self.title, self.dest, self.frag = title, dest, frag
        # Populated later by verify_toc_destinations().
        self.dest_exists = self.dest_error = None
        if self.title:
            self.title = self.title.strip()
        self.parent = None
        self.children = []
        # NCX <pageList> entries; only meaningful on the root node.
        self.page_list = []

    def add(self, title, dest, frag=None):
        """Create a child node and return it."""
        child = TOC(title, dest, frag)
        self.children.append(child)
        child.parent = self
        return child

    def remove(self, child):
        """Detach *child* (a direct child) from this node."""
        self.children.remove(child)
        child.parent = None

    def remove_from_parent(self):
        """Unlink this node, splicing its children into its place."""
        if self.parent is None:
            return
        idx = self.parent.children.index(self)
        for child in reversed(self.children):
            child.parent = self.parent
            self.parent.children.insert(idx, child)
        self.parent.children.remove(self)
        self.parent = None

    def __iter__(self):
        return iter(self.children)

    def __len__(self):
        return len(self.children)

    def iterdescendants(self, level=None):
        """Iterate depth-first over all descendants.

        When *level* is given, yield ``(depth, node)`` pairs with direct
        children reported at depth *level*; otherwise yield bare nodes.
        """
        gc_level = None if level is None else level + 1
        for child in self:
            if level is None:
                yield child
            else:
                yield level, child
            yield from child.iterdescendants(level=gc_level)

    def remove_duplicates(self, only_text=True):
        """Remove duplicated sibling entries, recursively.

        With *only_text* (the default) two siblings are duplicates when
        their titles match; otherwise title, dest and frag must all
        match.
        """
        seen = set()
        remove = []
        for child in self:
            key = child.title if only_text else (child.title, child.dest,
                                                 (child.frag or None))
            if key in seen:
                remove.append(child)
            else:
                seen.add(key)
                # Bug fix: propagate only_text so deeper levels use the
                # same duplicate criterion; previously the recursive call
                # always defaulted to title-only comparison.
                child.remove_duplicates(only_text)
        for child in remove:
            self.remove(child)

    @property
    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        try:
            return max(node.depth for node in self) + 1
        except ValueError:
            # max() on an empty sequence: this node is a leaf.
            return 1

    @property
    def last_child(self):
        # Most recently added child, or None for a leaf.
        return self.children[-1] if self.children else None

    def get_lines(self, lvl=0):
        """Return an indented text representation, one line per node."""
        frag = ('#'+self.frag) if self.frag else ''
        ans = [('\t'*lvl) + 'TOC: %s --> %s%s' % (self.title, self.dest, frag)]
        for child in self:
            ans.extend(child.get_lines(lvl+1))
        return ans

    def __str__(self):
        return '\n'.join(self.get_lines())

    def to_dict(self, node_counter=None):
        """Serialize this subtree to plain dicts.

        When *node_counter* (an iterator of ints) is given, every node
        also gets a unique 'id' entry drawn from it.
        """
        ans = {'title': self.title, 'dest': self.dest, 'frag': self.frag,
               'children': [c.to_dict(node_counter) for c in self.children]}
        if self.dest_exists is not None:
            ans['dest_exists'] = self.dest_exists
        if self.dest_error is not None:
            ans['dest_error'] = self.dest_error
        if node_counter is not None:
            ans['id'] = next(node_counter)
        return ans

    @property
    def as_dict(self):
        return self.to_dict()
|
|
|
|
|
|
def child_xpath(tag, name):
    """Return the direct children of *tag* whose tag name equals *name*
    when compared case-insensitively."""
    expression = './*[calibre:lower-case(local-name()) = "%s"]' % name
    return tag.xpath(expression)
|
|
|
|
|
|
def add_from_navpoint(container, navpoint, parent, ncx_name):
    # Convert a single NCX <navPoint> element into a child of *parent*
    # and return the newly created TOC node (any of title/dest/frag may
    # end up None when the navPoint is incomplete).
    dest = frag = text = None
    nl = child_xpath(navpoint, 'navlabel')
    if nl:
        nl = nl[0]
        text = ''
        # A navLabel may contain several <text> elements; concatenate
        # their text content.
        for txt in child_xpath(nl, 'text'):
            text += etree.tostring(txt, method='text',
                                   encoding='unicode', with_tail=False)
    content = child_xpath(navpoint, 'content')
    if content:
        content = content[0]
        href = content.get('src', None)
        if href:
            # Resolve the href relative to the NCX file's own location.
            dest = container.href_to_name(href, base=ncx_name)
            frag = urllib.parse.urlparse(href).fragment or None
    return parent.add(text or None, dest or None, frag or None)
|
|
|
|
|
|
def process_ncx_node(container, node, toc_parent, ncx_name):
    """Recursively convert the NCX <navPoint> children of *node* into
    TOC nodes under *toc_parent*."""
    query = './*[calibre:lower-case(local-name()) = "navpoint"]'
    for navpoint in node.xpath(query):
        entry = add_from_navpoint(container, navpoint, toc_parent, ncx_name)
        if entry is not None:
            process_ncx_node(container, navpoint, entry, ncx_name)
|
|
|
|
|
|
def parse_ncx(container, ncx_name):
    """Parse the NCX file *ncx_name* into a TOC tree.

    Also sets toc_root.lang and toc_root.uid from the NCX metadata and
    fills toc_root.page_list from an optional <pageList> section.
    """
    root = container.parsed(ncx_name)
    toc_root = TOC()
    # Tag names are matched case-insensitively via the calibre: XPath
    # extension registered at module level.
    navmaps = root.xpath('//*[calibre:lower-case(local-name()) = "navmap"]')
    if navmaps:
        process_ncx_node(container, navmaps[0], toc_root, ncx_name)
    toc_root.lang = toc_root.uid = None
    for attr, val in root.attrib.items():
        # Accept both lang and xml:lang style attributes.
        if attr.endswith('lang'):
            toc_root.lang = str(val)
            break
    for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and '
                          '@name="dtb:uid"]/@content'):
        if uid:
            toc_root.uid = str(uid)
            break
    # Collect the optional page list (print page number anchors).
    for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
        for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = '
                           '"pagetarget"]'):
            pagenum = pt.get('value')
            if pagenum:
                href = pt.xpath('descendant::*[calibre:lower-case(local-name()'
                                ') = "content"]/@src')
                if href:
                    dest = container.href_to_name(href[0], base=ncx_name)
                    frag = urllib.parse.urlparse(href[0]).fragment or None
                    toc_root.page_list.append({'dest': dest,
                                               'pagenum': pagenum,
                                               'frag': frag})
    return toc_root
|
|
|
|
|
|
def add_from_li(container, li, parent, nav_name):
    # Convert a single nav-document <li> into a child of *parent*. The
    # entry text and destination come from the first <a> or <span> child.
    dest = frag = text = None
    for x in li.iterchildren(base.tag('xhtml', 'a'),
                             base.tag('xhtml', 'span')):
        # Prefer the element's text content; fall back to joining the
        # title attributes of the element and its descendants.
        text = (etree.tostring(x, method='text', encoding='unicode',
                               with_tail=False).strip() or
                ' '.join(x.xpath('descendant-or-self::*/@title')).strip())
        href = x.get('href')
        if href:
            # A bare fragment points into the nav document itself.
            dest = (nav_name if href.startswith('#') else
                    container.href_to_name(href, base=nav_name))
            frag = urllib.parse.urlparse(href).fragment or None
        break
    return parent.add(text or None, dest or None, frag or None)
|
|
|
|
|
|
def first_child(parent, tagname):
    """Return the first direct child of *parent* with tag *tagname*, or
    None when there is no such child."""
    for child in parent.iterchildren(tagname):
        return child
    return None
|
|
|
|
|
|
def process_nav_node(container, node, toc_parent, nav_name):
    # Convert the <li> children of an EPUB 3 nav <ol> into TOC entries
    # under *toc_parent*, recursing into nested <ol> lists.
    for li in node.iterchildren(base.tag('xhtml', 'li')):
        child = add_from_li(container, li, toc_parent, nav_name)
        ol = first_child(li, base.tag('xhtml', 'ol'))
        if child is not None and ol is not None:
            process_nav_node(container, ol, child, nav_name)
|
|
|
|
|
|
def parse_nav(container, nav_name):
    """Parse the EPUB 3 nav document *nav_name* into a TOC tree.

    Only the first <nav epub:type="toc"> element is used; the text of
    its first h1-h6 heading (or its title attribute), if any, becomes
    toc_root.toc_title.
    """
    root = container.parsed(nav_name)
    toc_root = TOC()
    toc_root.lang = toc_root.uid = None
    xhtml = functools.partial(base.tag, 'xhtml')
    for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
        if nav.get(base.tag('epub', 'type')) == 'toc':
            ol = first_child(nav, base.tag('xhtml', 'ol'))
            if ol is not None:
                process_nav_node(container, ol, toc_root, nav_name)
                for h in nav.iterchildren(*map(xhtml,
                                               'h1 h2 h3 h4 h5 h6'.split())):
                    text = etree.tostring(h, method='text', encoding='unicode',
                                          with_tail=False) or h.get('title')
                    if text:
                        toc_root.toc_title = text
                        break
            # Only the first toc-typed nav element is considered.
            break
    return toc_root
|
|
|
|
|
|
def verify_toc_destinations(container, toc):
    """Check every ToC entry's destination and record the result.

    Sets dest_exists/dest_error on each node: the destination must name
    an existing HTML file and, when a fragment is present, that file
    must contain a matching id or <a name> anchor.
    """
    anchor_map = {}
    anchor_xpath = base.XPath('//*/@id|//h:a/@name')
    for item in toc.iterdescendants():
        name = item.dest
        if not name:
            item.dest_exists = False
            item.dest_error = 'No file named %s exists' % name
            continue
        try:
            root = container.parsed(name)
        except KeyError:
            # The container has no file by that name.
            item.dest_exists = False
            item.dest_error = 'No file named %s exists' % name
            continue
        if not hasattr(root, 'xpath'):
            # Parsed but not an XML tree (e.g. an image or stylesheet).
            item.dest_exists = False
            item.dest_error = 'No HTML file named %s exists' % name
            continue
        if not item.frag:
            item.dest_exists = True
            continue
        # Cache the anchors per file: many entries often share a file.
        if name not in anchor_map:
            anchor_map[name] = frozenset(anchor_xpath(root))
        item.dest_exists = item.frag in anchor_map[name]
        if not item.dest_exists:
            item.dest_error = ('The anchor %(a)s does not exist in file '
                               '%(f)s' % dict(a=item.frag, f=name))
|
|
|
|
|
|
def find_existing_ncx_toc(container):
    """Return the manifest name of the book's NCX ToC file, or None.

    The spine's toc attribute is consulted first; when absent or stale,
    the first manifest item with the NCX media type is used.
    """
    spine_toc = container.opf_xpath('//opf:spine/@toc')
    name = (container.manifest_id_map.get(spine_toc[0], None)
            if spine_toc else None)
    if not name:
        # Fall back to media-type lookup.
        ncx_mt = guess_type('a.ncx')
        name = container.manifest_type_map.get(ncx_mt, [None])[0]
    return name or None
|
|
|
|
|
|
def find_existing_nav_toc(container):
    """Return the name of the first manifest item carrying the 'nav'
    property, or None when the book has no nav document."""
    return next(iter(container.manifest_items_with_property('nav')), None)
|
|
|
|
|
|
def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
    """Locate and parse a ToC using the given find/parse strategy pair.

    Returns a (possibly empty) TOC tree. The attribute toc_file_name
    records the file the ToC was read from, or None when no usable ToC
    file exists.
    """
    def empty_toc():
        # A root node with the same lang/uid attributes the parsers set.
        ans = TOC()
        ans.lang = ans.uid = None
        return ans
    toc = find_toc(container)
    ans = (empty_toc() if toc is None or not container.has_name(toc) else
           parse_toc(container, toc))
    ans.toc_file_name = toc if toc and container.has_name(toc) else None
    if verify_destinations:
        verify_toc_destinations(container, ans)
    return ans
|
|
|
|
|
|
def get_toc(container, verify_destinations=True):
    """Return the book's ToC: NCX for EPUB 2, nav document (with NCX
    fallback) for EPUB 3 and later."""
    ver = container.opf_version_parsed
    if ver.major < 3:
        return get_x_toc(container, find_existing_ncx_toc, parse_ncx,
                         verify_destinations=verify_destinations)
    else:
        ans = get_x_toc(container, find_existing_nav_toc, parse_nav,
                        verify_destinations=verify_destinations)
        if len(ans) == 0:
            # EPUB 3 book without a usable nav ToC: fall back to NCX.
            ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx,
                            verify_destinations=verify_destinations)
        return ans
|
|
|
|
|
|
def get_guide_landmarks(container):
    """Yield landmark entries from the EPUB 2 <guide> section.

    Each entry is a dict with 'dest', 'frag', 'title' and 'type' keys;
    references to files missing from the container are skipped.
    """
    for ref in container.opf_xpath('./opf:guide/opf:reference'):
        href, title, rtype = ref.get('href'), ref.get('title'), ref.get('type')
        # Split "file#frag" into its two halves.
        href, frag = href.partition('#')[::2]
        name = container.href_to_name(href, container.opf_name)
        if container.has_name(name):
            yield {'dest': name,
                   'frag': frag,
                   'title': title or '',
                   'type': rtype or ''}
|
|
|
|
|
|
def get_nav_landmarks(container):
    """Yield landmark entries from the EPUB 3 nav document.

    Each entry is a dict with 'dest', 'frag', 'title' and 'type' keys.
    Yields nothing when there is no nav document or no landmarks nav.
    """
    nav = find_existing_nav_toc(container)
    if nav and container.has_name(nav):
        root = container.parsed(nav)
        # Fix: `base` is a module and is not callable; the epub:type
        # attribute name must be built with base.tag(), as done for every
        # other namespaced tag/attribute lookup in this module.
        et = base.tag('epub', 'type')
        for elem in root.iterdescendants(base.tag('xhtml', 'nav')):
            if elem.get(et) == 'landmarks':
                for li in elem.iterdescendants(base.tag('xhtml', 'li')):
                    for a in li.iterdescendants(base.tag('xhtml', 'a')):
                        href, rtype = a.get('href'), a.get(et)
                        if href:
                            title = etree.tostring(a, method='text',
                                                   encoding='unicode',
                                                   with_tail=False).strip()
                            href, frag = href.partition('#')[::2]
                            name = container.href_to_name(href, nav)
                            if container.has_name(name):
                                yield {'dest': name,
                                       'frag': frag,
                                       'title': title or '',
                                       'type': rtype or ''}
                # Only the first landmarks nav is considered.
                break
|
|
|
|
|
|
def get_landmarks(container):
    """Return the book's landmarks as a list of dicts, preferring the
    EPUB 3 nav document and falling back to the EPUB 2 guide."""
    if container.opf_version_parsed.major < 3:
        return list(get_guide_landmarks(container))
    landmarks = list(get_nav_landmarks(container))
    if not landmarks:
        # EPUB 3 book without nav landmarks: use the guide instead.
        landmarks = list(get_guide_landmarks(container))
    return landmarks
|
|
|
|
|
|
def ensure_id(elem, all_ids):
    """Ensure *elem* has an id attribute, generating one when needed.

    :param all_ids: set of ids already present in the document; any
        generated id is added to it.
    :return: ``(changed, id)`` where *changed* reports whether the
        element was modified and so its file must be re-committed.
    """
    elem_id = elem.get('id')
    if elem_id:
        return False, elem_id
    if elem.tag == base.tag('xhtml', 'a'):
        anchor = elem.get('name', None)
        if anchor:
            # Promote the legacy <a name=...> anchor to an id. This
            # mutates the tree, so report the element as changed —
            # previously False was returned here, which meant callers
            # such as from_xpaths() never committed the modified file
            # and the new id was silently lost.
            elem.set('id', anchor)
            return True, anchor
    c = 0
    while True:
        c += 1
        q = 'toc_{}'.format(c)
        if q not in all_ids:
            elem.set('id', q)
            all_ids.add(q)
            break
    return True, elem.get('id')
|
|
|
|
|
|
def elem_to_toc_text(elem):
    """Derive display text for a ToC entry from *elem*.

    Uses the element's text content, falling back to its title then alt
    attributes; whitespace is squashed, the result is truncated to 1000
    characters and is never empty.
    """
    text = (base.xml2text(elem).strip() or
            elem.get('title', '') or
            elem.get('alt', ''))
    text = re.sub(r'\s+', ' ', text.strip())[:1000].strip()
    return text or '(Untitled)'
|
|
|
|
|
|
def item_at_top(elem):
    # Return True when *elem* is the first thing with visible content in
    # its document, i.e. no text or image precedes it in the body. In
    # that case a ToC entry can point at the file itself and no anchor
    # id needs to be inserted into the markup.
    try:
        body = base.XPath('//h:body')(elem.getroottree().getroot())[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        # No body / detached element: play it safe and require an anchor.
        return False
    tree = body.getroottree()
    path = tree.getpath(elem)
    # Walk the body in document order up to elem, checking that no
    # earlier element contributes renderable content.
    for el in body.iterdescendants(etree.Element):
        epath = tree.getpath(el)
        if epath == path:
            break
        try:
            # Images always render; non-empty text means content.
            if el.tag.endswith('}img') or (el.text and el.text.strip()):
                return False
        except Exception:
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            if el.tail and el.tail.strip():
                return False
    return True
|
|
|
|
|
|
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generated ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three level
    Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
    '''
    tocroot = TOC()
    xpaths = [base.XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = collections.OrderedDict()
    empty_levels = {i+1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        # Map level number -> elements matching that level's expression.
        level_item_map = maps[name] = {i + 1: frozenset(xp(root))
                                       for i, xp in enumerate(xpaths)}
        for lvl, elems in level_item_map.items():
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps
    if empty_levels:
        for name, lmap in tuple(maps.items()):
            lmap = {lvl: items for lvl, items in lmap.items()
                    if lvl not in empty_levels}
            lmap = sorted(lmap.items(), key=operator.itemgetter(0))
            # Renumber the surviving levels consecutively from 1.
            lmap = {i + 1: items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    # Level at which each created ToC node sits; used to pick parents.
    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        # Walk down the right edge (last children) of the tree to find
        # the deepest node whose level is smaller than child_level.
        limit = child_level - 1

        def process_node(node):
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            return (node if lvl > limit else
                    child if lvl == limit else process_node(child))

        return process_node(tocroot)

    for name, level_item_map in maps.items():
        root = container.parsed(name)
        # Invert the map: element -> its ToC level.
        item_level_map = {e: i for i, elems in level_item_map.items()
                          for e in elems}
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                # First visible content in the file: link to the file
                # itself, no anchor id needed.
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            # Anchor ids were inserted; write the modified file back.
            container.commit_item(name, keep_parsed=True)

    return tocroot
|
|
|
|
|
|
def from_links(container):
    '''
    Generate a Table of Contents from links in the book.
    '''
    toc = TOC()
    link_path = base.XPath('//h:a[@href]')
    # Used to skip repeated destinations and repeated link texts.
    seen_titles, seen_dests = set(), set()
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        for a in link_path(root):
            href = a.get('href')
            if not href or not href.strip():
                continue
            frag = None
            if href.startswith('#'):
                # Link within the same file.
                dest = name
                frag = href[1:]
            else:
                href, _, frag = href.partition('#')
                dest = container.href_to_name(href, base=name)
            frag = frag or None
            if (dest, frag) in seen_dests:
                continue
            seen_dests.add((dest, frag))
            text = elem_to_toc_text(a)
            if text in seen_titles:
                continue
            seen_titles.add(text)
            toc.add(text, dest, frag=frag)
    verify_toc_destinations(container, toc)
    # Bug fix: iterate over a snapshot. TOC.remove() mutates
    # toc.children, and removing while iterating the live list skips the
    # element following each removed one, leaving broken entries behind.
    for child in tuple(toc):
        if not child.dest_exists:
            toc.remove(child)
    return toc
|
|
|
|
|
|
def find_text(node):
    """Return a short text snippet from the first child of *node* that
    contains any text.

    Children whose text exceeds 200 characters are searched recursively
    for a shorter snippet, falling back to a truncated version. Returns
    None when no child has text.
    """
    LIMIT = 200
    collapse = re.compile(r'\s+')
    for child in node:
        if not isinstance(child, etree._Element):
            continue
        text = collapse.sub(' ', base.xml2text(child).strip())
        if not text:
            continue
        if len(text) > LIMIT:
            # Look for less text in a child of this node, recursively
            return find_text(child) or (text[:LIMIT] + '...')
        return text
|
|
|
|
|
|
def from_files(container):
    '''
    Generate a Table of Contents from files in the book.
    '''
    toc = TOC()
    for i, spinepath in enumerate(container.spine_items):
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        body = base.XPath('//h:body')(root)
        if not body:
            continue
        # Use the first bit of body text as the entry title; fall back
        # to the file name itself.
        text = find_text(body[0])
        if not text:
            text = name.rpartition('/')[-1]
            # A first file named titlepage/cover is labelled 'Cover'.
            if i == 0 and text.rpartition('.')[0].lower() in {'titlepage',
                                                              'cover'}:
                text = 'Cover'
        toc.add(text, name)
    return toc
|
|
|
|
|
|
def node_from_loc(root, locs, totals=None):
    """Walk *locs* (a list of child indices) down from the <body> of
    *root* and return the element reached.

    When *totals* is given, raise MalformedMarkup if the child count at
    any level differs from the recorded total (parser disagreement).
    """
    node = root.xpath('//*[local-name()="body"]')[0]
    for depth, index in enumerate(locs):
        children = tuple(node.iterchildren(etree.Element))
        if totals is not None and totals[depth] != len(children):
            raise MalformedMarkup()
        node = children[index]
    return node
|
|
|
|
|
|
def add_id(container, name, loc, totals=None):
    """Ensure the element at position *loc* in file *name* has an id.

    *loc* is a list of child indices leading from <body> to the element.
    Returns the id; the file is committed when it had to be modified.
    """
    root = container.parsed(name)
    try:
        node = node_from_loc(root, loc, totals=totals)
    except MalformedMarkup:
        # The webkit HTML parser and the container parser have yielded
        # different node counts, this can happen if the file is valid XML
        # but contains constructs like nested <p> tags. So force parse it
        # with the HTML 5 parser and try again.
        raw = container.raw_data(name)
        root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
        try:
            node = node_from_loc(root, loc, totals=totals)
        except MalformedMarkup:
            raise MalformedMarkup('The file %s has malformed markup. Try '
                                  'running the Fix HTML tool before '
                                  'editing.' % name)
        # Keep the re-parsed tree as the container's canonical copy.
        container.replace(name, root)

    if not node.get('id'):
        ensure_id(node, set(root.xpath('//*/@id')))
        container.commit_item(name, keep_parsed=True)
    return node.get('id')
|
|
|
|
|
|
def create_ncx(toc, to_href, btitle, lang, uid):
    """Serialize *toc* into a new NCX document (an lxml element tree).

    :param to_href: callable mapping a file name to an href relative to
        the NCX file's location.
    :param btitle: book title, used for <docTitle>.
    :param lang: language code; underscores are normalized to hyphens.
    :param uid: value for the dtb:uid metadata entry.
    """
    lang = lang.replace('_', '-')
    ncx = etree.Element(base.tag('ncx', 'ncx'),
                        attrib={'version': '2005-1',
                                base.tag('xml', 'lang'): lang},
                        nsmap={None: const.NCX_NS})
    head = etree.SubElement(ncx, base.tag('ncx', 'head'))
    etree.SubElement(head, base.tag('ncx', 'meta'),
                     name='dtb:uid', content=str(uid))
    etree.SubElement(head, base.tag('ncx', 'meta'),
                     name='dtb:depth', content=str(toc.depth))
    generator = ''.join(['calibre (', __version__, ')'])
    etree.SubElement(head, base.tag('ncx', 'meta'),
                     name='dtb:generator', content=generator)
    etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:totalPageCount',
                     content='0')
    etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:maxPageNumber',
                     content='0')
    title = etree.SubElement(ncx, base.tag('ncx', 'docTitle'))
    text = etree.SubElement(title, base.tag('ncx', 'text'))
    text.text = btitle
    navmap = etree.SubElement(ncx, base.tag('ncx', 'navMap'))
    spat = re.compile(r'\s+')

    # playOrder counter shared across the recursive serialization below.
    play_order = collections.Counter()

    def process_node(xml_parent, toc_parent):
        # Emit one <navPoint> per ToC child, recursing into grandchildren.
        for child in toc_parent:
            play_order['c'] += 1
            point = etree.SubElement(xml_parent, base.tag('ncx', 'navPoint'),
                                     id='num_%d' % play_order['c'],
                                     playOrder=str(play_order['c']))
            label = etree.SubElement(point, base.tag('ncx', 'navLabel'))
            title = child.title
            if title:
                # Collapse runs of whitespace in the label text.
                title = spat.sub(' ', title)
            etree.SubElement(label, base.tag('ncx', 'text')).text = title
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                etree.SubElement(point, base.tag('ncx', 'content'), src=href)
            process_node(point, child)

    process_node(navmap, toc)
    return ncx
|
|
|
|
|
|
def commit_ncx_toc(container, toc, lang=None, uid=None):
    """Write *toc* to the book's NCX file, creating one if necessary.

    Language, uid and the docTitle default to values found in the OPF
    metadata when not supplied.
    """
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        # No NCX yet: create one and point every spine's toc attribute
        # at the new manifest item.
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
        ncx_id = item.get('id')
        [s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')]
    if not lang:
        lang = get_lang()
        # Prefer the first usable dc:language entry from the OPF.
        for _l in container.opf_xpath('//dc:language'):
            _l = canonicalize_lang(base.xml2text(_l).strip())
            if _l:
                lang = _l
                lang = lang_as_iso639_1(_l) or _l
                break
    lang = lang_as_iso639_1(lang) or lang
    if not uid:
        uid = base.uuid_id()
        # Reuse the package's unique identifier when one is declared.
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]' % eid)
            if m:
                uid = base.xml2text(m[0])

    title = 'Table of Contents'
    m = container.opf_xpath('//dc:title')
    if m:
        x = base.xml2text(m[0]).strip()
        title = x or title

    to_href = functools.partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
|
|
|
|
|
|
def ensure_single_nav_of_type(root, ntype='toc'):
    """Return the single, emptied <nav epub:type="ntype"> element.

    Extra navs of that type are removed from the document, the survivor
    has its children cleared (attributes and tail preserved), and a new
    nav is appended to <body> when none exists.
    """
    # Fix: `base` is a module and is not callable; the epub:type
    # attribute name must be built with base.tag(), as done for every
    # other namespaced tag/attribute lookup in this module.
    et = base.tag('epub', 'type')
    navs = [n for n in root.iterdescendants(base.tag('xhtml', 'nav'))
            if n.get(et) == ntype]
    for x in navs[1:]:
        extract(x)
    if navs:
        nav = navs[0]
        tail = nav.tail
        attrib = dict(nav.attrib)
        # clear() wipes attributes and tail too; restore them afterwards.
        nav.clear()
        nav.attrib.update(attrib)
        nav.tail = tail
    else:
        nav = root.makeelement(base.tag('xhtml', 'nav'))
        first_child(root, base.tag('xhtml', 'body')).append(nav)
    nav.set(et, ntype)
    return nav
|
|
|
|
|
|
def commit_nav_toc(container, toc, lang=None, landmarks=None,
                   previous_nav=None):
    """Write *toc* (plus optional landmarks and page list) to the EPUB 3
    nav document, creating or reusing one as needed.

    :param landmarks: optional iterable of landmark dicts ('dest',
        'frag', 'title', 'type') to serialize into a landmarks nav.
    :param previous_nav: optional (href, parsed-root) pair of a nav file
        to reuse instead of the book's existing one.
    """
    from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree
    tocname = find_existing_nav_toc(container)
    if previous_nav is not None:
        nav_name = container.href_to_name(previous_nav[0])
        if nav_name and container.exists(nav_name):
            tocname = nav_name
            container.apply_unique_properties(tocname, 'nav')
    if tocname is None:
        # No nav document yet: create one from the bundled template (or
        # from the supplied previous nav tree).
        item = container.generate_item('nav.xhtml', id_prefix='nav')
        item.set('properties', 'nav')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
        if previous_nav is not None:
            root = previous_nav[1]
        else:
            with open(pkg_resources.
                      resource_filename('ebook_converter',
                                        'data/new_nav.html')) as fobj:
                root = container.parse_xhtml(fobj.read())
        container.replace(tocname, root)
    else:
        root = container.parsed(tocname)
    if lang:
        lang = lang_as_iso639_1(lang) or lang
        root.set('lang', lang)
        root.set(base.tag('xml', 'lang'), lang)
    nav = ensure_single_nav_of_type(root, 'toc')
    if toc.toc_title:
        nav.append(nav.makeelement(base.tag('xhtml', 'h1')))
        nav[-1].text = toc.toc_title

    rnode = nav.makeelement(base.tag('xhtml', 'ol'))
    nav.append(rnode)
    to_href = functools.partial(container.name_to_href, base=tocname)
    spat = re.compile(r'\s+')

    def process_node(xml_parent, toc_parent):
        # Serialize one ToC level into <li> elements, recursing into a
        # nested <ol> for children.
        for child in toc_parent:
            li = xml_parent.makeelement(base.tag('xhtml', 'li'))
            xml_parent.append(li)
            title = child.title or ''
            title = spat.sub(' ', title).strip()
            # Entries without a destination become <span> instead of <a>.
            a = li.makeelement(base.tag('xhtml', 'a'
                                        if child.dest else 'span'))
            a.text = title
            li.append(a)
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                a.set('href', href)
            if len(child):
                ol = li.makeelement(base.tag('xhtml', 'ol'))
                li.append(ol)
                process_node(ol, child)
    process_node(rnode, toc)
    pretty_xml_tree(nav)

    def collapse_li(parent):
        # Drop the whitespace pretty_xml_tree put inside single-child
        # <li> elements so they serialize compactly.
        for li in parent.iterdescendants(base.tag('xhtml', 'li')):
            if len(li) == 1:
                li.text = None
                li[0].tail = None
    collapse_li(nav)
    nav.tail = '\n'

    def create_li(ol, entry):
        # Append <li><a href=...></li> for a landmarks/page-list entry
        # and return the <a> element for further decoration.
        li = ol.makeelement(base.tag('xhtml', 'li'))
        ol.append(li)
        a = li.makeelement(base.tag('xhtml', 'a'))
        li.append(a)
        href = container.name_to_href(entry['dest'], tocname)
        if entry['frag']:
            href += '#' + entry['frag']
        a.set('href', href)
        return a

    if landmarks is not None:
        nav = ensure_single_nav_of_type(root, 'landmarks')
        # Hidden from normal rendering, still usable by reading systems.
        nav.set('hidden', '')
        ol = nav.makeelement(base.tag('xhtml', 'ol'))
        nav.append(ol)
        for entry in landmarks:
            # Only typed entries pointing at existing HTML documents.
            if (entry['type'] and container.has_name(entry['dest']) and
                    container.mime_map[entry['dest']] in base.OEB_DOCS):
                a = create_li(ol, entry)
                a.set(base.tag('epub', 'type'), entry['type'])
                a.text = entry['title'] or None
        pretty_xml_tree(nav)
        collapse_li(nav)

    if toc.page_list:
        nav = ensure_single_nav_of_type(root, 'page-list')
        nav.set('hidden', '')
        ol = nav.makeelement(base.tag('xhtml', 'ol'))
        nav.append(ol)
        for entry in toc.page_list:
            if (container.has_name(entry['dest']) and
                    container.mime_map[entry['dest']] in base.OEB_DOCS):
                a = create_li(ol, entry)
                a.text = str(entry['pagenum'])
        pretty_xml_tree(nav)
        collapse_li(nav)
    container.replace(tocname, root)
|
|
|
|
|
|
def commit_toc(container, toc, lang=None, uid=None):
    """Write *toc* back to the book: the NCX is always written, and the
    nav document too for EPUB 3 and later."""
    commit_ncx_toc(container, toc, lang=lang, uid=uid)
    is_epub3_or_later = container.opf_version_parsed.major > 2
    if is_epub3_or_later:
        commit_nav_toc(container, toc, lang=lang)
|
|
|
|
|
|
def remove_names_from_toc(container, names):
    """Remove ToC entries pointing at any file in *names* from both the
    NCX and nav ToCs.

    :return: list of the ToC file names that were examined/changed (one
        per ToC flavour present in the book).
    """
    changed = []
    names = frozenset(names)
    # The loop variable is deliberately named 'commit' rather than
    # 'commit_toc' so it does not shadow the module-level commit_toc().
    for find_toc, parse_toc, commit in ((find_existing_ncx_toc,
                                         parse_ncx, commit_ncx_toc),
                                        (find_existing_nav_toc,
                                         parse_nav, commit_nav_toc)):
        toc = get_x_toc(container, find_toc, parse_toc,
                        verify_destinations=False)
        if len(toc) > 0:
            remove = [node for node in toc.iterdescendants()
                      if node.dest in names]
            if remove:
                # Remove deepest-first so children are detached before
                # their ancestors.
                for node in reversed(remove):
                    node.remove_from_parent()
                commit(container, toc)
                changed.append(find_toc(container))
    return changed
|
|
|
|
|
|
def find_inline_toc(container):
    """Return the spine name of a previously generated inline ToC page,
    or None when the book has none."""
    query = ('//*[local-name()="body" and '
             '@id="calibre_generated_inline_toc"]')
    for name, _linear in container.spine_names:
        if container.parsed(name).xpath(query):
            return name
|
|
|
|
|
|
def toc_to_html(toc, container, toc_name, title, lang=None):
    """Render *toc* as an inline XHTML page (an lxml tree).

    The generated body carries id="calibre_generated_inline_toc" so the
    page can be located again later by find_inline_toc().
    """

    def process_node(html_parent, toc, level=1, indent=' ', style_level=2):
        # Emit one ToC node as an <li>, recursing into a nested <ul> for
        # its children; the text/tail arithmetic produces indented,
        # human-readable markup.
        li = html_parent.makeelement(base.tag('xhtml', 'li'))
        li.tail = '\n' + (indent * level)
        html_parent.append(li)
        name, frag = toc.dest, toc.frag
        href = '#'
        if name:
            href = container.name_to_href(name, toc_name)
            if frag:
                href += '#' + frag
        a = li.makeelement(base.tag('xhtml', 'a'), href=href)
        a.text = toc.title
        li.append(a)
        if len(toc) > 0:
            parent = li.makeelement(base.tag('xhtml', 'ul'))
            parent.set('class', 'level%d' % (style_level))
            li.append(parent)
            a.tail = '\n\n' + (indent*(level+2))
            parent.text = '\n'+(indent*(level+3))
            parent.tail = '\n\n' + (indent*(level+1))
            for child in toc:
                process_node(parent, child, level+3,
                             style_level=style_level + 1)
            parent[-1].tail = '\n' + (indent*(level+2))

    E = ElementMaker(namespace=const.XHTML_NS, nsmap={None: const.XHTML_NS})
    # TODO(gryf): revisit lack of css.
    css_f = pkg_resources.resource_filename('ebook_converter',
                                            'data/inline_toc_styles.css')
    # NOTE(review): this embeds the css file *path* as the <style> text,
    # not the stylesheet contents — see the TODO above.
    html = E.html(E.head(E.title(title),
                         E.style(css_f, type='text/css')),
                  E.body(E.h2(title), E.ul(),
                         id="calibre_generated_inline_toc"))

    # html[1] is <body>, html[1][1] the root <ul>.
    ul = html[1][1]
    ul.set('class', 'level1')
    for child in toc:
        process_node(ul, child)
    if lang:
        html.set('lang', lang)
    pretty_html_tree(container, html)
    return html
|
|
|
|
|
|
def create_inline_toc(container, title=None):
    """
    Create an inline (HTML) Table of Contents from an existing NCX Table of
    Contents.

    :param title: The title for this table of contents.
    """
    lang = get_book_language(container)
    default_title = 'Table of Contents'
    title = title or default_title
    toc = get_toc(container)
    if len(toc) == 0:
        # Nothing to render without a ToC.
        return None
    toc_name = find_inline_toc(container)

    name = toc_name
    html = toc_to_html(toc, container, name, title, lang)
    raw = base.serialize(html, 'text/html')
    if name is None:
        # First run: pick an unused file name and insert the page at the
        # front of the spine.
        name, c = 'toc.xhtml', 0
        while container.has_name(name):
            c += 1
            name = 'toc%d.xhtml' % c
        container.add_file(name, raw, spine_index=0)
    else:
        # Overwrite the previously generated inline ToC in place.
        with container.open(name, 'wb') as f:
            f.write(raw)
    set_guide_item(container, 'toc', title, name,
                   frag='calibre_generated_inline_toc')
    return name
|