mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-29 02:04:05 +02:00
Initial import
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.ebooks import DRMError as _DRMError
|
||||
|
||||
|
||||
class InvalidBook(ValueError):
    """Raised when a book container is not in a recognized/valid format."""
|
||||
|
||||
|
||||
class DRMError(_DRMError):
    """DRM error with a fixed, localized message, raised when an attempt is
    made to edit a DRM-locked file. Subclasses the shared calibre DRMError
    so existing ``except DRMError`` handlers elsewhere still catch it."""

    def __init__(self):
        # No arguments: the message is always the same translated string.
        super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
|
||||
|
||||
|
||||
class MalformedMarkup(ValueError):
    """Raised when markup is too broken to be processed safely."""
|
||||
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
|
||||
def get_book_language(container):
    """Return the canonicalized language code declared in the book's OPF,
    or None if no usable <dc:language> entry is found."""
    for elem in container.opf_xpath('//dc:language'):
        text = elem.text
        if not text:
            continue
        # Some books store a comma separated list of languages; only the
        # first entry is significant.
        code = canonicalize_lang(text.split(',')[0].strip())
        if code:
            return code
|
||||
|
||||
|
||||
def set_guide_item(container, item_type, title, name, frag=None):
    """Set (or remove) the <guide> reference of type ``item_type`` in the OPF.

    If ``name`` is truthy the reference is created/updated to point at that
    file (plus optional fragment ``frag``); if ``name`` is falsy, any existing
    references of that type are removed. Marks the OPF dirty afterwards.
    """
    ref_tag = '{%s}reference' % OPF_NAMESPACES['opf']
    href = None
    if name:
        href = container.name_to_href(name, container.opf_name)
        if frag:
            href += '#' + frag

    guides = container.opf_xpath('//opf:guide')
    # Create a <guide> element only if we actually have something to add
    if not guides and href:
        g = container.opf.makeelement('{%s}guide' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']})
        container.insert_into_xml(container.opf, g)
        guides = [g]

    for guide in guides:
        # Find existing <reference> children with a matching (case-insensitive) type
        matches = []
        for child in guide.iterchildren(etree.Element):
            if child.tag == ref_tag and child.get('type', '').lower() == item_type.lower():
                matches.append(child)
        if not matches and href:
            # No existing reference of this type: create one
            r = guide.makeelement(ref_tag, type=item_type, nsmap={'opf':OPF_NAMESPACES['opf']})
            container.insert_into_xml(guide, r)
            matches.append(r)
        for m in matches:
            if href:
                # Update all matching references in place
                m.set('title', title), m.set('href', href), m.set('type', item_type)
            else:
                # No destination given: remove the reference entirely
                container.remove_from_xml(m)
    container.dirty(container.opf_name)
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
|
||||
from lxml.etree import Element as LxmlElement
|
||||
import html5_parser
|
||||
|
||||
from calibre import xml_replace_entities
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||
|
||||
|
||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse ``raw`` with the HTML5 parsing algorithm, returning the lxml root.

    ``raw`` may be bytes (decoded via ``decoder`` or charset autodetection) or
    text. Raises ValueError if the resulting root is not a plain <html>
    element (in the XHTML namespace unless ``discard_namespaces``).
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Strip characters that are not legal in XML documents
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
    # Sanity check: the parser must produce a namespaced (or plain, when
    # namespaces are discarded) <html> root with no prefix
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
|
||||
|
||||
|
||||
def handle_private_entities(data):
    """Expand user-defined (private) DTD entities declared before <html>.

    Scans the preamble (everything before the first ``<html``/``<HTML``) for
    ``<!ENTITY name "value">`` declarations inside a DOCTYPE, substitutes
    ``&name;`` references throughout the document with their values, and
    strips the preamble (replacing it with an equal number of newlines so
    line numbers are preserved). Returns ``data`` unchanged when there is no
    html tag or no private entities.
    """
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx > -1:
        pre = data[:idx]
        num_of_nl_in_pre = pre.count('\n')
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                # Replace the preamble with newlines so line numbers of the
                # remaining document do not shift
                data = ('\n' * num_of_nl_in_pre) + data[idx:]
                # re.escape the entity names: they come from untrusted input
                # and could otherwise contain regex metacharacters that
                # corrupt the pattern or match unintended text
                pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)
    return data
|
||||
|
||||
|
||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    """Parse ``raw`` as XHTML, falling back to the HTML5 (tag soup) parser.

    First tries a strict XML parse (fast path for well-formed XHTML); any
    failure, or a root that is not ``<html>`` in the XHTML namespace, falls
    back to :func:`parse_html5`. Returns the lxml root element.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        # Strict XML parse; recover=False so malformed markup raises and we
        # fall through to the HTML5 parser below
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            # Record source line numbers as attributes, mirroring what the
            # HTML5 parser does via line_number_attr
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, unicode_type(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from lxml import etree
|
||||
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> \n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
|
||||
print(etree.tostring(root, encoding='utf-8'))
|
||||
print()
|
||||
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import textwrap
|
||||
from polyglot.builtins import iteritems, map
|
||||
|
||||
# from lxml.etree import Element
|
||||
|
||||
from calibre import force_unicode
|
||||
from calibre.ebooks.oeb.base import (
|
||||
serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG)
|
||||
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES
|
||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||
from calibre.utils.icu import sort_key
|
||||
|
||||
|
||||
def isspace(x):
    """True if ``x`` consists only of XML whitespace (tab, LF, FF, CR, space).

    Note this is stricter than ``str.isspace``: characters such as NBSP or
    vertical tab are NOT considered whitespace here.
    """
    return x.strip('\u0009\u000a\u000c\u000d\u0020') == ''
|
||||
|
||||
|
||||
def pretty_xml_tree(elem, level=0, indent='  '):
    ''' XML beautifier, assumes that elements that have children do not have
    textual content. Also assumes that there is no text immediately after
    closing tags. These are true for opf/ncx and container.xml files. If either
    of the assumptions are violated, there should be no data loss, but pretty
    printing wont produce optimal results.'''
    # Only rewrite text/tail that is missing or pure whitespace, so real
    # content is never destroyed even if the assumptions above are violated
    if (not elem.text and len(elem) > 0) or (elem.text and isspace(elem.text)):
        elem.text = '\n' + (indent * (level+1))
    for i, child in enumerate(elem):
        pretty_xml_tree(child, level=level+1, indent=indent)
        if not child.tail or isspace(child.tail):
            l = level + 1
            if i == len(elem) - 1:
                # The last child's tail positions the parent's closing tag,
                # so it gets one level less of indent
                l -= 1
            child.tail = '\n' + (indent * l)
|
||||
|
||||
|
||||
def pretty_opf(root):
    """Normalize the ordering of metadata and manifest entries in an OPF tree.

    Dublin Core metadata is moved to the front (title, then creator, then the
    rest in original order) and manifest items are grouped by media type, with
    spine documents first in spine order. Mutates ``root`` in place.
    """
    # Put all dc: tags first starting with title and author. Preserve order for
    # the rest.
    def dckey(x):
        return {'title':0, 'creator':1}.get(barename(x.tag), 2)
    for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES):
        dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc'])
        # sort is stable, so equal keys keep their original relative order
        dc_tags.sort(key=dckey)
        for x in reversed(dc_tags):
            metadata.insert(0, x)

    # Group items in the manifest
    spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES)
    spine_ids = {x:i for i, x in enumerate(spine_ids)}

    def manifest_key(x):
        # (category, within-category key); lower categories sort first
        mt = x.get('media-type', '')
        href = x.get('href', '')
        ext = href.rpartition('.')[-1].lower()
        cat = 1000
        if mt in OEB_DOCS:
            cat = 0
        elif mt == guess_type('a.ncx'):
            cat = 1
        elif mt in OEB_STYLES:
            cat = 2
        elif mt.startswith('image/'):
            cat = 3
        elif ext in {'otf', 'ttf', 'woff'}:
            cat = 4
        elif mt.startswith('audio/'):
            cat = 5
        elif mt.startswith('video/'):
            cat = 6

        if cat == 0:
            # Spine documents sort in spine order; non-spine docs go last
            i = spine_ids.get(x.get('id', None), 1000000000)
        else:
            i = sort_key(href)
        return (cat, i)

    for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES):
        try:
            children = sorted(manifest, key=manifest_key)
        except AttributeError:
            continue  # There are comments so dont sort since that would mess up the comments
        for x in reversed(children):
            manifest.insert(0, x)
|
||||
|
||||
|
||||
SVG_TAG = SVG('svg')
|
||||
BLOCK_TAGS = frozenset(map(XHTML, (
|
||||
'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas', 'col', 'colgroup', 'dd',
|
||||
'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
|
||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li',
|
||||
'noscript', 'ol', 'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody', 'td',
|
||||
'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img'))) | {SVG_TAG}
|
||||
|
||||
|
||||
def isblock(x):
    """Return True if ``x`` should be laid out on its own line.

    Comments/processing instructions (callable ``tag``) and tag-less nodes
    are treated as blocks, as is anything in BLOCK_TAGS.
    """
    return callable(x.tag) or not x.tag or x.tag in BLOCK_TAGS
|
||||
|
||||
|
||||
def has_only_blocks(x):
    """True if ``x`` contains only block-level children and no significant
    inline text (neither its own text nor any child's tail)."""
    if hasattr(x.tag, 'split') and len(x) == 0:
        # A real tag with no children cannot "contain only blocks"
        return False
    if x.text and not isspace(x.text):
        return False
    return all(
        isblock(child) and not (child.tail and not isspace(child.tail))
        for child in x)
|
||||
|
||||
|
||||
def indent_for_tag(x):
    """Return the run of whitespace that precedes ``x`` on its line, or ''."""
    prev = x.getprevious()
    # Text before x is either the parent's text (first child) or the
    # previous sibling's tail
    before = x.getparent().text if prev is None else prev.tail
    if not before:
        return ''
    last_line = before.rpartition('\n')[-1]
    return last_line if isspace(last_line) else ''
|
||||
|
||||
|
||||
def set_indent(elem, attr, indent):
    """Make the text stored in ``elem.<attr>`` end with a final line equal to
    ``indent`` (replacing a trailing whitespace line, or appending one)."""
    val = getattr(elem, attr)
    if not val:
        new_val = indent
    else:
        lines = val.splitlines()
        if isspace(lines[-1]):
            lines = lines[:-1] + [indent]
        else:
            lines = lines + [indent]
        new_val = '\n'.join(lines)
    setattr(elem, attr, new_val)
|
||||
|
||||
|
||||
def pretty_block(parent, level=1, indent='  '):
    ''' Surround block tags with blank lines and recurse into child block tags
    that contain only other block tags '''
    if not parent.text or isspace(parent.text):
        parent.text = ''
    # Table cells/rows get single newlines; everything else gets a blank line
    nn = '\n' if hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
    parent.text = parent.text + nn + (indent * level)
    for i, child in enumerate(parent):
        if isblock(child) and has_only_blocks(child):
            pretty_block(child, level=level+1, indent=indent)
        elif child.tag == SVG_TAG:
            # SVG is structured XML, not flowed text: use the XML beautifier
            pretty_xml_tree(child, level=level, indent=indent)
        l = level
        if i == len(parent) - 1:
            # Last child's tail indents the parent's closing tag
            l -= 1
        if not child.tail or isspace(child.tail):
            child.tail = ''
        child.tail = child.tail + nn + (indent * l)
|
||||
|
||||
|
||||
def pretty_script_or_style(container, child):
    """Re-indent the textual content of a <script> or <style> element so it
    lines up with the tag's own indentation. <style> content is additionally
    pretty-printed as CSS."""
    if child.text:
        indent = indent_for_tag(child)
        if child.tag.endswith('style'):
            child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
        # Strip common leading whitespace, then re-indent every non-empty
        # line to match the tag
        child.text = textwrap.dedent(child.text)
        child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
        set_indent(child, 'text', indent)
|
||||
|
||||
|
||||
def pretty_html_tree(container, root):
    """Pretty print an XHTML tree in place: XML-style indenting for <head>,
    block-oriented formatting for <body>, and re-indented <script>/<style>
    content (skipped when ``container`` is None)."""
    root.text = '\n\n'
    for child in root:
        child.tail = '\n\n'
        if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'):
            pretty_xml_tree(child)
    for body in root.findall('h:body', namespaces=XPNSMAP):
        pretty_block(body)
        # Special case the handling of a body that contains a single block tag
        # with all content. In this case we prettify the containing block tag
        # even if it has non block children.
        if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
            body[0]) and barename(body[0].tag) not in (
                'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
            pretty_block(body[0], level=2)

    if container is not None:
        # Handle <script> and <style> tags
        for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
            pretty_script_or_style(container, child)
|
||||
|
||||
|
||||
def fix_html(container, raw):
    '''Fix any parsing errors in the HTML represented as a string in raw.
    Fixing is done using the HTML5 parsing algorithm.'''
    # Round-tripping through the container's XHTML parser repairs the markup
    fixed_tree = container.parse_xhtml(raw)
    return serialize(fixed_tree, 'text/html')
return serialize(root, 'text/html')
|
||||
|
||||
|
||||
def pretty_html(container, name, raw):
    '''Pretty print the HTML represented as a string in raw.'''
    tree = container.parse_xhtml(raw)
    pretty_html_tree(container, tree)
    return serialize(tree, 'text/html')
|
||||
|
||||
|
||||
def pretty_css(container, name, raw):
    '''Pretty print the CSS represented as a string in raw.'''
    stylesheet = container.parse_css(raw)
    return serialize(stylesheet, 'text/css')
|
||||
|
||||
|
||||
def pretty_xml(container, name, raw):
    '''Pretty print the XML represented as a string in raw. If ``name`` is the
    name of the OPF, extra OPF-specific prettying is performed.'''
    tree = container.parse_xml(raw)
    if name == container.opf_name:
        # The OPF additionally gets its metadata/manifest reordered
        pretty_opf(tree)
    pretty_xml_tree(tree)
    return serialize(tree, 'text/xml')
|
||||
|
||||
|
||||
def fix_all_html(container):
    '''Fix any parsing errors in all HTML files in the container. Fixing is
    done using the HTML5 parsing algorithm.'''
    for name, media_type in iteritems(container.mime_map):
        if media_type not in OEB_DOCS:
            continue
        # parsed() re-parses (and thereby repairs) the document; dirty()
        # schedules the repaired tree for serialization
        container.parsed(name)
        container.dirty(name)
|
||||
|
||||
|
||||
def pretty_all(container):
    ' Pretty print all HTML/CSS/XML files in the container '
    xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
    for name, media_type in iteritems(container.mime_map):
        if media_type in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
        elif media_type in OEB_STYLES:
            # Parsing + re-serializing is enough to pretty print CSS
            container.parsed(name)
        elif name == container.opf_name:
            opf_root = container.parsed(name)
            pretty_opf(opf_root)
            pretty_xml_tree(opf_root)
        elif media_type in xml_types:
            pretty_xml_tree(container.parsed(name))
        else:
            # Unknown type: leave untouched and do not mark dirty
            continue
        container.dirty(name)
|
||||
@@ -0,0 +1,891 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from collections import Counter, OrderedDict
|
||||
from functools import partial
|
||||
from operator import itemgetter
|
||||
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from calibre import __version__
|
||||
from calibre.ebooks.oeb.base import (
|
||||
XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize, EPUB_NS, XML_NS, OEB_DOCS)
|
||||
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||
from calibre.ebooks.oeb.polish.utils import guess_type, extract
|
||||
from calibre.ebooks.oeb.polish.opf import set_guide_item, get_book_language
|
||||
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
|
||||
from calibre.translations.dynamic import translate
|
||||
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
|
||||
from polyglot.builtins import iteritems, map, unicode_type
|
||||
from polyglot.urllib import urlparse
|
||||
|
||||
ns = etree.FunctionNamespace('calibre_xpath_extensions')
|
||||
ns.prefix = 'calibre'
|
||||
ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x
|
||||
|
||||
|
||||
class TOC(object):
    """A node in a Table of Contents tree.

    Each node has a ``title``, a destination file ``dest``, an optional
    fragment ``frag``, a ``parent`` pointer and a list of ``children``.
    ``dest_exists``/``dest_error`` are filled in by destination verification.
    """

    # Heading text of the containing nav (EPUB3), if any
    toc_title = None

    def __init__(self, title=None, dest=None, frag=None):
        self.title, self.dest, self.frag = title, dest, frag
        self.dest_exists = self.dest_error = None
        if self.title:
            self.title = self.title.strip()
        self.parent = None
        self.children = []
        self.page_list = []

    def add(self, title, dest, frag=None):
        """Append a new child node and return it."""
        c = TOC(title, dest, frag)
        self.children.append(c)
        c.parent = self
        return c

    def remove(self, child):
        """Remove ``child`` from this node, detaching its parent pointer."""
        self.children.remove(child)
        child.parent = None

    def remove_from_parent(self):
        """Remove this node, promoting its children into its place."""
        if self.parent is None:
            return
        idx = self.parent.children.index(self)
        # reversed + insert at idx preserves the children's relative order
        for child in reversed(self.children):
            child.parent = self.parent
            self.parent.children.insert(idx, child)
        self.parent.children.remove(self)
        self.parent = None

    def __iter__(self):
        for c in self.children:
            yield c

    def __len__(self):
        return len(self.children)

    def iterdescendants(self, level=None):
        """Depth-first iteration over all descendants. If ``level`` is given,
        yields ``(level, node)`` pairs instead of bare nodes."""
        gc_level = None if level is None else level + 1
        for child in self:
            if level is None:
                yield child
            else:
                yield level, child
            for gc in child.iterdescendants(level=gc_level):
                yield gc

    def remove_duplicates(self, only_text=True):
        """Remove sibling entries that duplicate an earlier sibling, at every
        level of the tree.

        When ``only_text`` is True two entries are duplicates if their titles
        match; otherwise title, dest and frag must all match.
        """
        seen = set()
        remove = []
        for child in self:
            key = child.title if only_text else (child.title, child.dest, (child.frag or None))
            if key in seen:
                remove.append(child)
            else:
                seen.add(key)
                # Fix: propagate only_text so nested levels use the same
                # duplicate criterion as the top level (previously the
                # recursion always fell back to title-only comparison)
                child.remove_duplicates(only_text=only_text)
        for child in remove:
            self.remove(child)

    @property
    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        try:
            return max(node.depth for node in self) + 1
        except ValueError:
            # max() on an empty sequence: leaf node
            return 1

    @property
    def last_child(self):
        return self.children[-1] if self.children else None

    def get_lines(self, lvl=0):
        """Return a list of tab-indented debug lines for this subtree."""
        frag = ('#'+self.frag) if self.frag else ''
        ans = [('\t'*lvl) + 'TOC: %s --> %s%s'%(self.title, self.dest, frag)]
        for child in self:
            ans.extend(child.get_lines(lvl+1))
        return ans

    def __str__(self):
        return '\n'.join(self.get_lines())

    def to_dict(self, node_counter=None):
        """Serialize this subtree to plain dicts. If ``node_counter`` (an
        iterator of ints) is given, each node gets a unique 'id'."""
        ans = {
            'title':self.title, 'dest':self.dest, 'frag':self.frag,
            'children':[c.to_dict(node_counter) for c in self.children]
        }
        if self.dest_exists is not None:
            ans['dest_exists'] = self.dest_exists
        if self.dest_error is not None:
            ans['dest_error'] = self.dest_error
        if node_counter is not None:
            ans['id'] = next(node_counter)
        return ans

    @property
    def as_dict(self):
        return self.to_dict()
|
||||
|
||||
|
||||
def child_xpath(tag, name):
    """Return direct children of ``tag`` whose lower-cased local name equals
    ``name`` (uses the registered calibre:lower-case XPath extension)."""
    query = './*[calibre:lower-case(local-name()) = "%s"]' % name
    return tag.xpath(query)
|
||||
|
||||
|
||||
def add_from_navpoint(container, navpoint, parent, ncx_name):
    """Create a TOC entry under ``parent`` from an NCX <navPoint>.

    Extracts the label text from <navLabel>/<text> and the destination from
    <content src="...">, resolving it relative to the NCX file. Returns the
    newly added child TOC node.
    """
    dest = frag = text = None
    nl = child_xpath(navpoint, 'navlabel')
    if nl:
        nl = nl[0]
        text = ''
        # Concatenate all <text> children (case-insensitive tag match)
        for txt in child_xpath(nl, 'text'):
            text += etree.tostring(txt, method='text',
                                   encoding='unicode', with_tail=False)
    content = child_xpath(navpoint, 'content')
    if content:
        content = content[0]
        href = content.get('src', None)
        if href:
            dest = container.href_to_name(href, base=ncx_name)
            frag = urlparse(href).fragment or None
    return parent.add(text or None, dest or None, frag or None)
|
||||
|
||||
|
||||
def process_ncx_node(container, node, toc_parent, ncx_name):
    """Recursively convert the <navPoint> children of ``node`` into TOC
    entries under ``toc_parent``."""
    navpoints = node.xpath('./*[calibre:lower-case(local-name()) = "navpoint"]')
    for navpoint in navpoints:
        entry = add_from_navpoint(container, navpoint, toc_parent, ncx_name)
        if entry is not None:
            process_ncx_node(container, navpoint, entry, ncx_name)
|
||||
|
||||
|
||||
def parse_ncx(container, ncx_name):
    """Parse an NCX file into a TOC tree.

    Also extracts the declared language (``toc_root.lang``), the dtb:uid
    (``toc_root.uid``) and any <pageList> entries (``toc_root.page_list``).
    All tag matching is case-insensitive.
    """
    root = container.parsed(ncx_name)
    toc_root = TOC()
    navmaps = root.xpath('//*[calibre:lower-case(local-name()) = "navmap"]')
    if navmaps:
        # Only the first navMap is processed
        process_ncx_node(container, navmaps[0], toc_root, ncx_name)
    toc_root.lang = toc_root.uid = None
    for attr, val in iteritems(root.attrib):
        # Matches both 'lang' and '{...XML ns...}lang'
        if attr.endswith('lang'):
            toc_root.lang = unicode_type(val)
            break
    for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'):
        if uid:
            toc_root.uid = unicode_type(uid)
            break
    for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
        for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'):
            pagenum = pt.get('value')
            if pagenum:
                href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src')
                if href:
                    dest = container.href_to_name(href[0], base=ncx_name)
                    frag = urlparse(href[0]).fragment or None
                    toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag})
    return toc_root
|
||||
|
||||
|
||||
def add_from_li(container, li, parent, nav_name):
    """Create a TOC entry under ``parent`` from an EPUB3 nav <li>.

    Uses the first <a> or <span> child: its text (falling back to any
    ``title`` attributes of its descendants) becomes the entry title, and an
    <a href> becomes the destination, resolved relative to the nav document.
    Returns the newly added child TOC node.
    """
    dest = frag = text = None
    for x in li.iterchildren(XHTML('a'), XHTML('span')):
        text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
        href = x.get('href')
        if href:
            # A bare fragment points back into the nav document itself
            dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name)
            frag = urlparse(href).fragment or None
        # Only the first <a>/<span> child is considered
        break
    return parent.add(text or None, dest or None, frag or None)
|
||||
|
||||
|
||||
def first_child(parent, tagname):
    """Return the first direct child of ``parent`` matching ``tagname``,
    or None if there is no such child."""
    for child in parent.iterchildren(tagname):
        return child
    return None
|
||||
|
||||
|
||||
def process_nav_node(container, node, toc_parent, nav_name):
    """Recursively convert the <li> children of an EPUB3 nav <ol> into TOC
    entries under ``toc_parent``."""
    for li in node.iterchildren(XHTML('li')):
        entry = add_from_li(container, li, toc_parent, nav_name)
        nested = first_child(li, XHTML('ol'))
        if entry is not None and nested is not None:
            process_nav_node(container, nested, entry, nav_name)
|
||||
|
||||
|
||||
def parse_nav(container, nav_name):
    """Parse an EPUB3 navigation document into a TOC tree.

    Only the first <nav epub:type="toc"> is used. The nav's first heading
    (h1-h6 text or title attribute) is stored as ``toc_root.toc_title``.
    ``lang``/``uid`` are set to None for interface parity with parse_ncx.
    """
    root = container.parsed(nav_name)
    toc_root = TOC()
    toc_root.lang = toc_root.uid = None
    et = '{%s}type' % EPUB_NS
    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(container, ol, toc_root, nav_name)
                for h in nav.iterchildren(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
                    text = etree.tostring(h, method='text', encoding='unicode', with_tail=False) or h.get('title')
                    if text:
                        toc_root.toc_title = text
                        break
            break
    return toc_root
|
||||
|
||||
|
||||
def verify_toc_destinations(container, toc):
    """Check every TOC entry's destination, setting ``dest_exists`` and
    ``dest_error`` on each node in place.

    A destination is valid if the named file exists in the container, parses
    as HTML, and (when a fragment is present) contains a matching id or
    <a name> anchor. Anchor sets are cached per file.
    """
    anchor_map = {}
    anchor_xpath = XPath('//*/@id|//h:a/@name')
    for item in toc.iterdescendants():
        name = item.dest
        if not name:
            item.dest_exists = False
            item.dest_error = _('No file named %s exists')%name
            continue
        try:
            root = container.parsed(name)
        except KeyError:
            # Name not present in the container at all
            item.dest_exists = False
            item.dest_error = _('No file named %s exists')%name
            continue
        if not hasattr(root, 'xpath'):
            # Parsed to something that is not an XML tree (e.g. raw data)
            item.dest_exists = False
            item.dest_error = _('No HTML file named %s exists')%name
            continue
        if not item.frag:
            item.dest_exists = True
            continue
        if name not in anchor_map:
            anchor_map[name] = frozenset(anchor_xpath(root))
        item.dest_exists = item.frag in anchor_map[name]
        if not item.dest_exists:
            item.dest_error = _(
                'The anchor %(a)s does not exist in file %(f)s')%dict(
                a=item.frag, f=name)
|
||||
|
||||
|
||||
def find_existing_ncx_toc(container):
    """Return the manifest name of the NCX ToC file, or None.

    Tries the spine's ``toc`` attribute first, then falls back to the first
    manifest item with the NCX media type.
    """
    toc = container.opf_xpath('//opf:spine/@toc')
    toc = container.manifest_id_map.get(toc[0], None) if toc else None
    if not toc:
        ncx_mt = guess_type('a.ncx')
        toc = container.manifest_type_map.get(ncx_mt, [None])[0]
    return toc or None
|
||||
|
||||
|
||||
def find_existing_nav_toc(container):
    """Return the name of the first manifest item carrying the EPUB3 'nav'
    property, or None if there is none."""
    for name in container.manifest_items_with_property('nav'):
        return name
    return None
|
||||
|
||||
|
||||
def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
    """Locate and parse a ToC using the supplied strategy functions.

    ``find_toc(container)`` returns the ToC file name (or None) and
    ``parse_toc(container, name)`` parses it. Falls back to an empty TOC when
    the file is missing. The returned TOC gets a ``toc_file_name`` attribute
    and, optionally, verified destinations.
    """
    def empty_toc():
        ans = TOC()
        # Mirror the attributes parse_ncx/parse_nav set
        ans.lang = ans.uid = None
        return ans
    toc = find_toc(container)
    ans = empty_toc() if toc is None or not container.has_name(toc) else parse_toc(container, toc)
    ans.toc_file_name = toc if toc and container.has_name(toc) else None
    if verify_destinations:
        verify_toc_destinations(container, ans)
    return ans
|
||||
|
||||
|
||||
def get_toc(container, verify_destinations=True):
    """Return the book's ToC: NCX for EPUB2, nav document for EPUB3 (falling
    back to NCX when the nav yields no entries)."""
    if container.opf_version_parsed.major < 3:
        return get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
    toc = get_x_toc(container, find_existing_nav_toc, parse_nav, verify_destinations=verify_destinations)
    if len(toc) == 0:
        toc = get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
    return toc
|
||||
|
||||
|
||||
def get_guide_landmarks(container):
    """Yield landmark dicts from the EPUB2 <guide> section of the OPF,
    skipping references whose target file does not exist."""
    for ref in container.opf_xpath('./opf:guide/opf:reference'):
        href = ref.get('href')
        title = ref.get('title')
        rtype = ref.get('type')
        href, frag = href.partition('#')[::2]
        name = container.href_to_name(href, container.opf_name)
        if container.has_name(name):
            yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
|
||||
|
||||
|
||||
def get_nav_landmarks(container):
    """Yield landmark dicts from the EPUB3 nav document's
    <nav epub:type="landmarks"> section, if present."""
    nav = find_existing_nav_toc(container)
    if nav and container.has_name(nav):
        root = container.parsed(nav)
        et = '{%s}type' % EPUB_NS
        for elem in root.iterdescendants(XHTML('nav')):
            if elem.get(et) == 'landmarks':
                for li in elem.iterdescendants(XHTML('li')):
                    for a in li.iterdescendants(XHTML('a')):
                        href, rtype = a.get('href'), a.get(et)
                        if href:
                            title = etree.tostring(a, method='text', encoding='unicode', with_tail=False).strip()
                            href, frag = href.partition('#')[::2]
                            name = container.href_to_name(href, nav)
                            if container.has_name(name):
                                yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
                        # Only the first <a> in each <li> is considered
                        break
|
||||
|
||||
|
||||
def get_landmarks(container):
    """Return the book's landmarks: guide entries for EPUB2, nav landmarks
    (falling back to the guide when empty) for EPUB3."""
    if container.opf_version_parsed.major < 3:
        return list(get_guide_landmarks(container))
    landmarks = list(get_nav_landmarks(container))
    if not landmarks:
        landmarks = list(get_guide_landmarks(container))
    return landmarks
|
||||
|
||||
|
||||
def ensure_id(elem, all_ids):
    """Ensure ``elem`` has an id attribute, returning ``(changed, id)``.

    Reuses an existing id, or an <a name="..."> anchor (copied into the id
    attribute), before generating a fresh ``toc_N`` id that does not collide
    with ``all_ids``. Generated ids are added to ``all_ids``; ``changed`` is
    True only when a new id was generated.
    """
    elem_id = elem.get('id')
    if elem_id:
        return False, elem_id
    if elem.tag == XHTML('a'):
        # Legacy anchors: promote name= to id=
        anchor = elem.get('name', None)
        if anchor:
            elem.set('id', anchor)
            return False, anchor
    c = 0
    while True:
        c += 1
        q = 'toc_{}'.format(c)
        if q not in all_ids:
            elem.set('id', q)
            all_ids.add(q)
            break
    return True, elem.get('id')
|
||||
|
||||
|
||||
def elem_to_toc_text(elem):
    """Return display text for a ToC entry generated from ``elem``.

    Falls back from the element's text content to its title, then alt,
    attribute; whitespace is collapsed and the result is capped at 1000
    characters, with '(Untitled)' as the last resort.
    """
    text = xml2text(elem).strip()
    if not text:
        text = elem.get('title', '') or elem.get('alt', '')
    text = re.sub(r'\s+', ' ', text.strip())
    text = text[:1000].strip()
    return text or _('(Untitled)')
|
||||
|
||||
|
||||
def item_at_top(elem):
    """Return True if ``elem`` is at the visual top of its document, i.e. no
    element before it (in document order) contributes visible content.

    Walks the body's descendants up to ``elem``; any image, non-empty text,
    or (for non-ancestors) non-empty tail before it means it is not at the
    top. Returns False on any lookup failure.
    """
    try:
        body = XPath('//h:body')(elem.getroottree().getroot())[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        return False
    tree = body.getroottree()
    path = tree.getpath(elem)
    for el in body.iterdescendants(etree.Element):
        epath = tree.getpath(el)
        if epath == path:
            break
        try:
            if el.tag.endswith('}img') or (el.text and el.text.strip()):
                return False
        except Exception:
            # el.tag may be callable (comment/PI) and lack endswith; was a
            # bare except:, narrowed so KeyboardInterrupt/SystemExit propagate
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            if el.tail and el.tail.strip():
                return False
    return True
|
||||
|
||||
|
||||
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generate ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three level
    Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
    '''
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
    empty_levels = {i+1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        # Map level number -> set of matching elements in this file
        level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        for lvl, elems in iteritems(level_item_map):
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps
    if empty_levels:
        for name, lmap in tuple(iteritems(maps)):
            lmap = {lvl:items for lvl, items in iteritems(lmap) if lvl not in empty_levels}
            # Renumber the remaining levels so they are contiguous from 1
            lmap = sorted(iteritems(lmap), key=itemgetter(0))
            lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    # Maps each created ToC node to its level, so parents can be found
    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        # Find the deepest most-recently-added node with level < child_level
        limit = child_level - 1

        def process_node(node):
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            return node if lvl > limit else child if lvl == limit else process_node(child)

        return process_node(tocroot)

    for name, level_item_map in iteritems(maps):
        root = container.parsed(name)
        # Invert to element -> level for quick lookup while iterating
        item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems}
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                # A link to the file itself suffices, no fragment needed
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            container.commit_item(name, keep_parsed=True)

    return tocroot
|
||||
|
||||
|
||||
def from_links(container):
    '''
    Generate a Table of Contents from links in the book.
    '''
    toc = TOC()
    find_links = XPath('//h:a[@href]')
    seen_titles, seen_dests = set(), set()
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        for anchor in find_links(root):
            href = anchor.get('href')
            if not href or not href.strip():
                continue
            if href.startswith('#'):
                # Link within the same file
                dest, frag = name, href[1:]
            else:
                base_href, sep, frag = href.partition('#')
                dest = container.href_to_name(base_href, base=name)
                frag = frag or None
            key = (dest, frag)
            if key in seen_dests:
                continue
            seen_dests.add(key)
            text = elem_to_toc_text(anchor)
            if text in seen_titles:
                continue
            seen_titles.add(text)
            toc.add(text, dest, frag=frag)
    # Prune entries whose destinations do not exist in the book
    verify_toc_destinations(container, toc)
    for child in toc:
        if not child.dest_exists:
            toc.remove(child)
    return toc
|
||||
|
||||
|
||||
def find_text(node):
    """Return a short text snippet from the first child of node that has
    any text, recursing into descendants when the text is too long.
    Returns None when no text is found."""
    LIMIT = 200
    collapse = re.compile(r'\s+')
    for child in node:
        if not isinstance(child, etree._Element):
            continue
        text = collapse.sub(' ', xml2text(child).strip())
        if not text:
            continue
        if len(text) <= LIMIT:
            return text
        # Too long: prefer a shorter snippet from a descendant, if any
        return find_text(child) or (text[:LIMIT] + '...')
|
||||
|
||||
|
||||
def from_files(container):
    '''
    Generate a Table of Contents from files in the book.
    '''
    toc = TOC()
    for idx, spinepath in enumerate(container.spine_items):
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        bodies = XPath('//h:body')(root)
        if not bodies:
            continue
        title = find_text(bodies[0])
        if not title:
            # No usable text in the file, fall back to its file name
            title = name.rpartition('/')[-1]
            if idx == 0 and title.rpartition('.')[0].lower() in {'titlepage', 'cover'}:
                title = _('Cover')
        toc.add(title, name)
    return toc
|
||||
|
||||
|
||||
def node_from_loc(root, locs, totals=None):
    """Walk down from the <body> element following the child-index path
    locs and return the element reached.

    When totals is supplied, each entry must match the child count seen at
    that depth; a mismatch raises MalformedMarkup, indicating the caller's
    view of the tree disagrees with this one.
    """
    node = root.xpath('//*[local-name()="body"]')[0]
    for depth, child_index in enumerate(locs):
        children = tuple(node.iterchildren(etree.Element))
        if totals is not None and totals[depth] != len(children):
            raise MalformedMarkup()
        node = children[child_index]
    return node
|
||||
|
||||
|
||||
def add_id(container, name, loc, totals=None):
    # Ensure the element at location loc in the file name has an id,
    # generating one and committing the file if necessary. Returns the id.
    root = container.parsed(name)
    try:
        node = node_from_loc(root, loc, totals=totals)
    except MalformedMarkup:
        # The webkit HTML parser and the container parser have yielded
        # different node counts, this can happen if the file is valid XML
        # but contains constructs like nested <p> tags. So force parse it
        # with the HTML 5 parser and try again.
        raw = container.raw_data(name)
        root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
        try:
            node = node_from_loc(root, loc, totals=totals)
        except MalformedMarkup:
            raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
                                    ' before editing.') % name)
        # Persist the re-parsed tree so the location stays valid
        container.replace(name, root)

    if not node.get('id'):
        ensure_id(node, set(root.xpath('//*/@id')))
        container.commit_item(name, keep_parsed=True)
    return node.get('id')
|
||||
|
||||
|
||||
def create_ncx(toc, to_href, btitle, lang, uid):
    # Serialize toc into a new NCX (EPUB 2 Table of Contents) document.
    # to_href maps a book name to an href relative to the NCX file.
    lang = lang.replace('_', '-')
    ncx = etree.Element(NCX('ncx'),
                        attrib={'version': '2005-1', XML('lang'): lang},
                        nsmap={None: NCX_NS})
    head = etree.SubElement(ncx, NCX('head'))
    etree.SubElement(head, NCX('meta'),
                     name='dtb:uid', content=unicode_type(uid))
    etree.SubElement(head, NCX('meta'),
                     name='dtb:depth', content=unicode_type(toc.depth))
    generator = ''.join(['calibre (', __version__, ')'])
    etree.SubElement(head, NCX('meta'),
                     name='dtb:generator', content=generator)
    etree.SubElement(head, NCX('meta'), name='dtb:totalPageCount', content='0')
    etree.SubElement(head, NCX('meta'), name='dtb:maxPageNumber', content='0')
    title = etree.SubElement(ncx, NCX('docTitle'))
    text = etree.SubElement(title, NCX('text'))
    text.text = btitle
    navmap = etree.SubElement(ncx, NCX('navMap'))
    spat = re.compile(r'\s+')

    # Counter used to assign a document-wide sequential playOrder
    play_order = Counter()

    def process_node(xml_parent, toc_parent):
        # Depth-first traversal mirroring the ToC tree as nested navPoints
        for child in toc_parent:
            play_order['c'] += 1
            point = etree.SubElement(xml_parent, NCX('navPoint'), id='num_%d' % play_order['c'],
                                     playOrder=unicode_type(play_order['c']))
            label = etree.SubElement(point, NCX('navLabel'))
            title = child.title
            if title:
                title = spat.sub(' ', title)
            etree.SubElement(label, NCX('text')).text = title
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                etree.SubElement(point, NCX('content'), src=href)
            process_node(point, child)

    process_node(navmap, toc)
    return ncx
|
||||
|
||||
|
||||
def commit_ncx_toc(container, toc, lang=None, uid=None):
    # Write toc to the book as an NCX document, creating the NCX file (and
    # pointing the spine's toc attribute at it) if one does not exist yet.
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        ncx_id = item.get('id')
        [s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')]
    if not lang:
        # Fall back to the interface language, then the book's dc:language
        lang = get_lang()
        for l in container.opf_xpath('//dc:language'):
            l = canonicalize_lang(xml2text(l).strip())
            if l:
                lang = l
                lang = lang_as_iso639_1(l) or l
                break
    lang = lang_as_iso639_1(lang) or lang
    if not uid:
        uid = uuid_id()
        # Prefer the book's declared unique identifier when present
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]'%eid)
            if m:
                uid = xml2text(m[0])

    title = _('Table of Contents')
    m = container.opf_xpath('//dc:title')
    if m:
        x = xml2text(m[0]).strip()
        title = x or title

    to_href = partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
|
||||
|
||||
|
||||
def ensure_single_nav_of_type(root, ntype='toc'):
    """Return an emptied <nav epub:type=ntype> element in root, removing
    duplicates, clearing an existing one or creating a fresh one in <body>."""
    type_attr = '{%s}type' % EPUB_NS
    navs = [n for n in root.iterdescendants(XHTML('nav')) if n.get(type_attr) == ntype]
    # Keep only the first matching nav, extracting any duplicates
    for duplicate in navs[1:]:
        extract(duplicate)
    if navs:
        nav = navs[0]
        # Empty the nav while preserving its attributes and tail text
        tail, attrib = nav.tail, dict(nav.attrib)
        nav.clear()
        nav.attrib.update(attrib)
        nav.tail = tail
    else:
        nav = root.makeelement(XHTML('nav'))
        first_child(root, XHTML('body')).append(nav)
    nav.set('{%s}type' % EPUB_NS, ntype)
    return nav
|
||||
|
||||
|
||||
def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
    # Write toc (and optionally landmarks and the page list) to the book's
    # EPUB 3 nav document, creating the nav file if one does not exist.
    # previous_nav, when given, is a (href, parsed_root) pair to reuse.
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree
    tocname = find_existing_nav_toc(container)
    if previous_nav is not None:
        nav_name = container.href_to_name(previous_nav[0])
        if nav_name and container.exists(nav_name):
            tocname = nav_name
            container.apply_unique_properties(tocname, 'nav')
    if tocname is None:
        # No nav document, create one from the template (or previous_nav)
        item = container.generate_item('nav.xhtml', id_prefix='nav')
        item.set('properties', 'nav')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        if previous_nav is not None:
            root = previous_nav[1]
        else:
            root = container.parse_xhtml(P('templates/new_nav.html', data=True).decode('utf-8'))
        container.replace(tocname, root)
    else:
        root = container.parsed(tocname)
    if lang:
        lang = lang_as_iso639_1(lang) or lang
        root.set('lang', lang)
        root.set('{%s}lang' % XML_NS, lang)
    nav = ensure_single_nav_of_type(root, 'toc')
    if toc.toc_title:
        nav.append(nav.makeelement(XHTML('h1')))
        nav[-1].text = toc.toc_title

    rnode = nav.makeelement(XHTML('ol'))
    nav.append(rnode)
    to_href = partial(container.name_to_href, base=tocname)
    spat = re.compile(r'\s+')

    def process_node(xml_parent, toc_parent):
        # Mirror the ToC tree as nested <ol>/<li> elements
        for child in toc_parent:
            li = xml_parent.makeelement(XHTML('li'))
            xml_parent.append(li)
            title = child.title or ''
            title = spat.sub(' ', title).strip()
            # Entries without a destination become <span> instead of <a>
            a = li.makeelement(XHTML('a' if child.dest else 'span'))
            a.text = title
            li.append(a)
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                a.set('href', href)
            if len(child):
                ol = li.makeelement(XHTML('ol'))
                li.append(ol)
                process_node(ol, child)
    process_node(rnode, toc)
    pretty_xml_tree(nav)

    def collapse_li(parent):
        # Remove whitespace inside <li> elements that have a single child
        for li in parent.iterdescendants(XHTML('li')):
            if len(li) == 1:
                li.text = None
                li[0].tail = None
    collapse_li(nav)
    nav.tail = '\n'

    def create_li(ol, entry):
        # Append an <li><a href=...> for entry to ol and return the <a>
        li = ol.makeelement(XHTML('li'))
        ol.append(li)
        a = li.makeelement(XHTML('a'))
        li.append(a)
        href = container.name_to_href(entry['dest'], tocname)
        if entry['frag']:
            href += '#' + entry['frag']
        a.set('href', href)
        return a

    if landmarks is not None:
        nav = ensure_single_nav_of_type(root, 'landmarks')
        nav.set('hidden', '')
        ol = nav.makeelement(XHTML('ol'))
        nav.append(ol)
        for entry in landmarks:
            # Only keep landmarks pointing at existing HTML files
            if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
                a = create_li(ol, entry)
                a.set('{%s}type' % EPUB_NS, entry['type'])
                a.text = entry['title'] or None
        pretty_xml_tree(nav)
        collapse_li(nav)

    if toc.page_list:
        nav = ensure_single_nav_of_type(root, 'page-list')
        nav.set('hidden', '')
        ol = nav.makeelement(XHTML('ol'))
        nav.append(ol)
        for entry in toc.page_list:
            if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
                a = create_li(ol, entry)
                a.text = unicode_type(entry['pagenum'])
        pretty_xml_tree(nav)
        collapse_li(nav)
    container.replace(tocname, root)
|
||||
|
||||
|
||||
def commit_toc(container, toc, lang=None, uid=None):
    """Write toc to the book: always as NCX, and also as an EPUB 3 nav
    document when the book's OPF version warrants one."""
    commit_ncx_toc(container, toc, lang=lang, uid=uid)
    is_epub3 = container.opf_version_parsed.major > 2
    if is_epub3:
        commit_nav_toc(container, toc, lang=lang)
|
||||
|
||||
|
||||
def remove_names_from_toc(container, names):
    """Remove every ToC entry (both NCX and nav based) whose destination is
    one of names. Returns the list of modified ToC file names."""
    changed = []
    names = frozenset(names)
    handlers = (
        (find_existing_ncx_toc, parse_ncx, commit_ncx_toc),
        (find_existing_nav_toc, parse_nav, commit_nav_toc),
    )
    for find_toc, parse_toc, commit in handlers:
        toc = get_x_toc(container, find_toc, parse_toc, verify_destinations=False)
        if len(toc) < 1:
            continue
        doomed = [node for node in toc.iterdescendants() if node.dest in names]
        if not doomed:
            continue
        # Remove deepest nodes first so children go before their parents
        for node in reversed(doomed):
            node.remove_from_parent()
        commit(container, toc)
        changed.append(find_toc(container))
    return changed
|
||||
|
||||
|
||||
def find_inline_toc(container):
    """Return the name of the calibre-generated inline ToC file, if any."""
    query = '//*[local-name()="body" and @id="calibre_generated_inline_toc"]'
    for name, is_linear in container.spine_names:
        if container.parsed(name).xpath(query):
            return name
    return None
|
||||
|
||||
|
||||
def toc_to_html(toc, container, toc_name, title, lang=None):
    # Render toc as a standalone XHTML document (the inline ToC page).
    # The whitespace strings below hand-tune the serialized indentation.

    def process_node(html_parent, toc, level=1, indent=' ', style_level=2):
        li = html_parent.makeelement(XHTML('li'))
        li.tail = '\n'+ (indent*level)
        html_parent.append(li)
        name, frag = toc.dest, toc.frag
        href = '#'
        if name:
            href = container.name_to_href(name, toc_name)
            if frag:
                href += '#' + frag
        a = li.makeelement(XHTML('a'), href=href)
        a.text = toc.title
        li.append(a)
        if len(toc) > 0:
            # Nested entries get their own <ul> with a per-depth CSS class
            parent = li.makeelement(XHTML('ul'))
            parent.set('class', 'level%d' % (style_level))
            li.append(parent)
            a.tail = '\n\n' + (indent*(level+2))
            parent.text = '\n'+(indent*(level+3))
            parent.tail = '\n\n' + (indent*(level+1))
            for child in toc:
                process_node(parent, child, level+3, style_level=style_level + 1)
            parent[-1].tail = '\n' + (indent*(level+2))

    E = ElementMaker(namespace=XHTML_NS, nsmap={None:XHTML_NS})
    html = E.html(
        E.head(
            E.title(title),
            E.style(P('templates/inline_toc_styles.css', data=True), type='text/css'),
        ),
        E.body(
            E.h2(title),
            E.ul(),
            id="calibre_generated_inline_toc",
        )
    )

    # html[1][1] is the <ul> created above inside <body>
    ul = html[1][1]
    ul.set('class', 'level1')
    for child in toc:
        process_node(ul, child)
    if lang:
        html.set('lang', lang)
    pretty_html_tree(container, html)
    return html
|
||||
|
||||
|
||||
def create_inline_toc(container, title=None):
    '''
    Create an inline (HTML) Table of Contents from an existing NCX Table of Contents.

    :param title: The title for this table of contents.
    '''
    lang = get_book_language(container)
    default_title = 'Table of Contents'
    if lang:
        lang = lang_as_iso639_1(lang) or lang
        default_title = translate(lang, default_title)
    title = title or default_title
    toc = get_toc(container)
    if len(toc) == 0:
        return None
    name = find_inline_toc(container)
    html = toc_to_html(toc, container, name, title, lang)
    raw = serialize(html, 'text/html')
    if name is None:
        # No inline ToC yet: pick an unused file name and add it to the spine
        name, counter = 'toc.xhtml', 0
        while container.has_name(name):
            counter += 1
            name = 'toc%d.xhtml' % counter
        container.add_file(name, raw, spine_index=0)
    else:
        # Overwrite the existing inline ToC file in place
        with container.open(name, 'wb') as f:
            f.write(raw)
    set_guide_item(container, 'toc', title, name, frag='calibre_generated_inline_toc')
    return name
|
||||
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re, os
|
||||
from bisect import bisect
|
||||
|
||||
from calibre import guess_type as _guess_type, replace_entities
|
||||
from polyglot.builtins import filter
|
||||
|
||||
|
||||
def guess_type(x):
    """Return the guessed MIME type for x, defaulting to
    application/octet-stream when no guess is possible."""
    mime_type = _guess_type(x)[0]
    if not mime_type:
        mime_type = 'application/octet-stream'
    return mime_type
|
||||
|
||||
|
||||
def setup_css_parser_serialization(tab_width=2):
    """Configure the global css_parser serializer preferences used when
    writing stylesheets back out."""
    import css_parser
    serializer_prefs = css_parser.ser.prefs
    serializer_prefs.indent = ' ' * tab_width
    serializer_prefs.indentClosingBrace = False
    serializer_prefs.omitLastSemicolon = False
|
||||
|
||||
|
||||
def actual_case_for_name(container, name):
    """Return name with each path component in the case it actually has on
    the filesystem (useful on case-insensitive filesystems).

    :raises ValueError: if name does not exist in the container.
    :raises RuntimeError: if the existing file cannot be matched to a
        directory entry (should not happen).
    """
    from calibre.utils.filenames import samefile
    if not container.exists(name):
        raise ValueError('Cannot get actual case for %s as it does not exist' % name)
    parts = name.split('/')
    ans = []
    for x in parts:
        base = '/'.join(ans + [x])
        path = container.name_to_abspath(base)
        pdir = os.path.dirname(path)
        candidates = {os.path.join(pdir, q) for q in os.listdir(pdir)}
        if path in candidates:
            # Fast path: an exact-case directory entry exists. (The previous
            # test compared the bare component x against the absolute paths
            # in candidates, which could never match.)
            correctx = x
        else:
            # Case-insensitive filesystem: find the directory entry that is
            # the same file and use its on-disk case
            for q in candidates:
                if samefile(q, path):
                    correctx = os.path.basename(q)
                    break
            else:
                raise RuntimeError('Something bad happened')
        ans.append(correctx)
    return '/'.join(ans)
|
||||
|
||||
|
||||
def corrected_case_for_name(container, name):
    """Return name with the case of each component corrected to match an
    existing file, or None when no case-insensitive match exists."""
    corrected = []
    for part in name.split('/'):
        base = '/'.join(corrected + [part])
        if container.exists(base):
            # This component already has the correct case
            fixed = part
        else:
            parent_dir = os.path.dirname(container.name_to_abspath(base))
            try:
                entries = set(os.listdir(parent_dir))
            except EnvironmentError:
                return None  # one of the non-terminal components of name is a file instead of a directory
            for entry in entries:
                if entry.lower() == part.lower():
                    fixed = entry
                    break
            else:
                return None
        corrected.append(fixed)
    return '/'.join(corrected)
|
||||
|
||||
|
||||
class PositionFinder(object):

    """Map an absolute character (or byte) offset in a blob of text to a
    1-based (line, column) pair. Works on both bytes and unicode input."""

    def __init__(self, raw):
        # Offsets of the first character of every line after the first
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        """Return the (line, column) pair for the offset pos."""
        lnum = bisect(self.new_lines, pos)
        if lnum == 0:
            # pos is on the first line; the previous implementation indexed
            # new_lines[lnum - 1] here, wrapping around to the *last*
            # newline and producing bogus columns for first-line offsets
            offset = pos
        else:
            offset = pos - self.new_lines[lnum - 1]
        return (lnum + 1, offset)
|
||||
|
||||
|
||||
class CommentFinder(object):

    """Answer whether a given offset into raw falls inside a comment. By
    default comments are C/CSS style block comments."""

    def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
        spans = [m.span() for m in re.finditer(pat, raw)]
        self.starts = [s for s, e in spans]
        self.ends = [e for s, e in spans]

    def __call__(self, offset):
        """Return True when offset lies within a matched comment span."""
        if not self.starts:
            return False
        idx = bisect(self.starts, offset) - 1
        if idx < 0:
            return False
        return self.starts[idx] <= offset <= self.ends[idx]
|
||||
|
||||
|
||||
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    # Ensure every file in names links to every stylesheet in sheets.
    # When remove is True, all existing stylesheet links are first removed.
    # Returns the set of file names that were modified.
    from calibre.ebooks.oeb.base import XPath, XHTML
    changed_names = set()
    snames = set(sheets)
    lp = XPath('//h:link[@href]')
    hp = XPath('//h:head')
    for name in names:
        root = container.parsed(name)
        if remove:
            for link in lp(root):
                # A missing type attribute is treated as mtype
                if (link.get('type', mtype) or mtype) == mtype:
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)
        # Stylesheets already linked from this file
        existing = {container.href_to_name(l.get('href'), name) for l in lp(root) if (l.get('type', mtype) or mtype) == mtype}
        extra = snames - existing
        if extra:
            changed_names.add(name)
            try:
                parent = hp(root)[0]
            except (TypeError, IndexError):
                # No <head>: create one at the top of the document
                parent = root.makeelement(XHTML('head'))
                container.insert_into_xml(root, parent, index=0)
            # Iterate sheets (not extra) to preserve the requested order
            for sheet in sheets:
                if sheet in extra:
                    container.insert_into_xml(
                        parent, parent.makeelement(XHTML('link'), rel='stylesheet', type=mtype,
                                                   href=container.name_to_href(sheet, name)))
            container.dirty(name)

    return changed_names
|
||||
|
||||
|
||||
def lead_text(top_elem, num_words=10):
    '''Return up to num_words words of leading text contained in top_elem
    (including its descendants), in document order. More efficient than
    etree.tostring(method='text') since it stops as soon as enough words
    have been collected instead of serializing the whole sub-tree.'''
    splitter = re.compile(r'\s+', flags=re.UNICODE)
    words = []

    def collect(text):
        if text:
            words.extend(w for w in splitter.split(text) if w)

    # Explicit depth-first walk over (element, attribute) pairs
    pending = [(top_elem, 'text')]
    while pending and len(words) < num_words:
        elem, attr = pending.pop()
        collect(getattr(elem, attr))
        if attr == 'text':
            # An element's tail comes after all of its children; the root's
            # own tail lies outside the subtree and is excluded
            if elem is not top_elem:
                pending.append((elem, 'tail'))
            pending.extend(reversed([(c, 'text') for c in elem.iterchildren('*')]))
    return ' '.join(words[:num_words])
|
||||
|
||||
|
||||
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    """Parse CSS text (or bytes) into a css_parser stylesheet, or a style
    declaration when is_declaration is True. Validation is disabled and
    @import rules are not fetched."""
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from css_parser import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False
    css = data or ''
    if isinstance(css, bytes):
        # Decode with the supplied decoder, defaulting to UTF-8
        css = css.decode('utf-8') if decode is None else decode(css)
    if css_preprocessor is not None:
        css = css_preprocessor(css)
    parser = CSSParser(loglevel=log_level,
                       # We dont care about @import rules
                       fetcher=lambda x: (None, None), log=_css_logger)
    if is_declaration:
        return parser.parseStyle(css, validate=False)
    return parser.parseString(css, href=fname, validate=False)
|
||||
|
||||
|
||||
def handle_entities(text, func):
    # Expand HTML entities in text before applying func to it
    return func(replace_entities(text))
|
||||
|
||||
|
||||
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
    '''Apply the specified function to individual groups in the match object (the result of re.search() or
    the whole match if no groups were defined. Returns the replaced string.'''
    found_groups = False
    i = 0
    parts, pos = [], match.start()
    f = lambda text:handle_entities(text, func)
    while True:
        i += 1
        try:
            start, end = match.span(i)
        except IndexError:
            # No more groups in the pattern
            break
        found_groups = True
        # Non-participating groups have span (-1, -1) and are skipped
        if start > -1:
            # Keep the text between the previous group and this one as-is,
            # transform only the group's own text
            parts.append(match.string[pos:start])
            parts.append(f(match.string[start:end]))
            pos = end
    if not found_groups:
        # Pattern had no groups: transform the entire match
        return f(match.group())
    # Append the remainder of the match after the last group
    parts.append(match.string[pos:match.end()])
    return ''.join(parts)
|
||||
|
||||
|
||||
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
    ''' Apply the specified function only to text between HTML tag definitions. '''
    def transform(text):
        return handle_entities(text, func)
    # Capturing split keeps the tag markup in the result list
    pieces = re.split(r'(<[^>]+>)', match.group())
    return ''.join(p if p.startswith('<') else transform(p) for p in pieces)
|
||||
|
||||
|
||||
def extract(elem):
    ''' Remove an element from the tree, keeping elem.tail '''
    parent = elem.getparent()
    if parent is None:
        return
    idx = parent.index(elem)
    parent.remove(elem)
    tail = elem.tail
    if not tail:
        return
    # Re-attach the tail text so no rendered text is lost
    if idx > 0:
        prev = parent[idx - 1]
        prev.tail = (prev.tail or '') + tail
    else:
        parent.text = (parent.text or '') + tail
|
||||
Reference in New Issue
Block a user