1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-24 14:05:46 +01:00
Files
ebook-converter/ebook_converter/ebooks/metadata/toc.py
gryf ce89f5c9d1 Use the real constants module.
This is an ongoing refactor of the calibre code to make it more
readable and to transform it into something more coherent.

In this patch, there are changes regarding imports for some modules, so
that each module's namespace is no longer polluted with other modules'
symbols, which were often themselves imported from yet other modules. Yuck.
2020-05-29 17:04:53 +02:00

302 lines
11 KiB
Python

import collections
import functools
import glob
import os
import re
import urllib.parse
from lxml import etree
from lxml.builder import ElementMaker
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.polyglot.urllib import unquote
# XML namespace URI used for the NCX elements built in this module.
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
# XML namespace URI used for the calibre-specific <meta> elements.
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
# Prefix map handed to lxml: NCX is the default (unprefixed) namespace and
# the calibre namespace is bound to the 'calibre' prefix.
NSMAP = {None: NCX_NS, 'calibre': CALIBRE_NS}
# Element factories: E builds elements in the NCX namespace, C builds
# elements in the calibre namespace. Both share the prefix map above.
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
def parse_html_toc(data):
    """Yield one ``(href, fragment, text)`` tuple per link in an HTML TOC.

    ``data`` is the HTML document, either as ``bytes`` (decoded with
    ``xml_to_unicode`` first) or as text. Every element whose local name is
    ``a`` and which carries an ``href`` attribute produces one tuple:

    * ``href`` -- the unquoted URL path, stripped of surrounding whitespace;
    * ``fragment`` -- the stripped URL fragment, or ``None`` when the URL
      has no fragment;
    * ``text`` -- the element serialised with lxml's ``text`` method.

    This is a generator, so nothing (including the imports below) runs
    until it is first iterated.
    """
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    from lxml import etree

    if isinstance(data, bytes):
        decoded = xml_to_unicode(data, strip_encoding_pats=True,
                                 resolve_entities=True)
        data = decoded[0]

    tree = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)

    for anchor in tree.xpath('//*[@href and local-name()="a"]'):
        parts = urllib.parse.urlparse(unquote(anchor.get('href')))
        raw_fragment = parts[5]
        # An empty fragment is reported as None, not as ''.
        fragment = raw_fragment.strip() if raw_fragment else None
        path = parts[2].strip()
        label = etree.tostring(anchor, method='text', encoding='unicode')
        yield path, fragment, label
class TOC(list):
    """A node of a table-of-contents tree.

    A ``TOC`` is a ``list`` whose items are its child ``TOC`` nodes; the
    node's own data (target, label, ordering, ...) lives in attributes.
    The root node usually has no ``href``/``text`` and only holds children.

    Because this subclasses ``list``, ``==`` between two nodes compares
    their *children* only, so any two leaf nodes compare equal. Use ``is``
    when the identity of a node matters.
    """

    def __init__(self, href=None, fragment=None, text=None, parent=None,
                 play_order=0, base_path=None, type='unknown',
                 author=None, description=None, toc_thumbnail=None):
        """Create a TOC node.

        :param href: path of the file this entry points to (``/`` separated).
        :param fragment: anchor inside ``href``; any falsy value is stored
            as ``None``.
        :param text: label of the entry.
        :param parent: the ``TOC`` node that contains this one, if any.
        :param play_order: value written as ``playOrder`` by :meth:`render`.
        :param base_path: directory against which a relative ``href`` is
            resolved by :attr:`abspath`. ``None`` (the default) means the
            current working directory *at the time of the call*.
        :param type: free-form tag used by :meth:`count` and :meth:`purge`.
        :param author: optional author, rendered as a calibre ``meta``.
        :param description: optional description, rendered as a calibre
            ``meta``.
        :param toc_thumbnail: optional thumbnail reference, rendered as a
            calibre ``meta``.
        """
        self.href = href
        self.fragment = fragment or None
        self.text = text
        self.parent = parent
        # Resolved here rather than in the signature: a default of
        # ``os.getcwd()`` would be evaluated only once, at import time.
        self.base_path = os.getcwd() if base_path is None else base_path
        self.play_order = play_order
        self.type = type
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def __str__(self):
        """Return an indented, one-line-per-node dump of the subtree."""
        lines = ['TOC: %s#%s %s' % (self.href, self.fragment, self.text)]
        for child in self:
            lines.extend('\t' + line for line in str(child).splitlines())
        return '\n'.join(lines)

    def count(self, type):
        """Return how many nodes of the subtree (self included) have ``type``.

        Note that this replaces ``list.count`` with different semantics.
        """
        return sum(1 for entry in self.flat() if entry.type == type)

    def purge(self, types, max=0):
        """Detach nodes whose ``type`` is in ``types`` from their parents.

        The first ``max`` matching nodes (in depth-first order) are kept.
        Matching nodes without a parent are left in place but are still
        part of the returned list.

        :return: the list of nodes selected for removal.
        """
        # Collect first: the tree must not be modified while flat() runs.
        remove = [entry for entry in self.flat() if entry.type in types]
        remove = remove[max:]
        for entry in remove:
            if entry.parent is None:
                continue
            entry.parent.remove(entry)
        return remove

    def remove(self, entry):
        """Remove the child ``entry`` (matched by identity) from this node.

        ``list.remove`` cannot be used here: it matches by ``==``, and since
        nodes are lists every leaf equals every other leaf, so it would
        delete the first leaf sibling instead of ``entry``.

        :raises ValueError: if ``entry`` is not a direct child of this node.
        """
        for index, child in enumerate(self):
            if child is entry:
                del self[index]
                break
        else:
            raise ValueError('TOC.remove(entry): entry is not a child')
        entry.parent = None

    def add_item(self, href, fragment, text, play_order=None, type='unknown',
                 author=None, description=None, toc_thumbnail=None):
        """Append a new child node and return it.

        When ``play_order`` is ``None`` it becomes one more than the play
        order of the last child, or of this node if it has no children yet.
        The child inherits this node's ``base_path``.
        """
        if play_order is None:
            previous = self[-1].play_order if len(self) else self.play_order
            play_order = previous + 1
        item = TOC(href=href, fragment=fragment, text=text, parent=self,
                   base_path=self.base_path, play_order=play_order,
                   type=type, author=author, description=description,
                   toc_thumbnail=toc_thumbnail)
        self.append(item)
        return item

    def top_level_items(self):
        """Yield the direct children whose ``text`` is not ``None``."""
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        """Return the height of the subtree; a node without children is 1."""
        return 1 + max((child.depth() for child in self), default=0)

    def flat(self):
        'Depth first iteration over the tree rooted at self'
        yield self
        for child in self:
            yield from child.flat()

    @property
    def abspath(self):
        """
        Return the file this toc entry points to as a absolute path to a file
        on the system.

        Returns ``None`` when the entry has no ``href``. A relative ``href``
        is joined onto ``base_path``.
        """
        if self.href is None:
            return None
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        """Locate the TOC referenced by an OPF and load it into this node.

        The TOC reference is looked up, in order, in the ``toc`` attribute
        of ``<spine>``, in the ``<guide>`` reference of type ``toc`` and
        finally in the first manifest item whose href contains ``toc``.
        A reference of ``ncx``/``ncxtoc`` is treated as a manifest id of an
        NCX file; anything else is treated as the path of an HTML TOC.
        Failures to read the TOC are reported with a printed warning only.

        :param opfreader: OPF reader object; this method uses its ``soup``
            and ``manifest`` attributes.
        """
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = (opfreader.soup.find('guide')
                       .find('reference', attrs={'type': 'toc'})['href'])
            except Exception:
                # No usable <guide> entry: fall back to guessing by name.
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is None:
            return

        if toc.lower() not in ('ncx', 'ncxtoc'):
            toc = urllib.parse.urlparse(unquote(toc))[2]
            toc = toc.replace('/', os.sep)
            if not os.path.isabs(toc):
                toc = os.path.join(self.base_path, toc)
            try:
                if not os.path.exists(toc):
                    bn = os.path.basename(toc)
                    # Bug in BAEN OPF files
                    bn = bn.replace('_top.htm', '_toc.htm')
                    toc = os.path.join(os.path.dirname(toc), bn)
                self.read_html_toc(toc)
            except Exception:
                # Best effort: a broken TOC must not abort the conversion.
                print('WARNING: Could not read Table of Contents. '
                      'Continuing anyway.')
        else:
            path = opfreader.manifest.item(toc.lower())
            path = getattr(path, 'path', path)
            if path and os.access(path, os.R_OK):
                try:
                    self.read_ncx_toc(path)
                except Exception as err:
                    print('WARNING: Invalid NCX file:', err)
                return
            # The manifest entry is unusable: take the first *.ncx file
            # found next to the OPF instead.
            cwd = os.path.abspath(self.base_path)
            m = glob.glob(os.path.join(cwd, '*.ncx'))
            if m:
                toc = m[0]
                self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc, root=None):
        """Populate this node from an NCX document.

        Element and attribute names are matched case-insensitively and by
        suffix, so namespace-less or oddly-cased NCX files are accepted.

        :param toc: path of the NCX file; its directory becomes
            ``base_path``.
        :param root: already parsed root element. When ``None`` the file
            at ``toc`` is read and parsed.
        :raises ValueError: if the document has no ``navMap`` element.
        """
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(), assume_utf8=True,
                                     strip_encoding_pats=True)[0]
            root = etree.fromstring(raw)
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            # First non-empty attribute whose lower-cased name ends in attr.
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            # Convert one navPoint (and, recursively, its children) into
            # TOC nodes below ``dest``.
            try:
                play_order = int(get_attr(np, 1))
            except Exception:
                play_order = 1
            href = fragment = text = None
            # Children of a navPoint that yields no entry of its own are
            # attached to ``dest`` directly.
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''.join(
                    etree.tostring(txt, method='text', encoding='unicode',
                                   with_tail=False)
                    for txt in txt_path(nl))
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # A <content> without src is treated as an empty URL
                    # instead of handing None to urlparse().
                    purl = urllib.parse.urlparse(content.get('src') or '')
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order
            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]
        for child in np_path(nm):
            process_navpoint(child, self)

    def read_html_toc(self, toc):
        """Populate this node from the links of an HTML file.

        Each link returned by ``parse_html_toc`` becomes a direct child,
        unless a node with the same ``href`` and ``fragment`` already exists
        anywhere in the tree.

        :param toc: path of the HTML file; its directory becomes
            ``base_path``.
        """
        self.base_path = os.path.dirname(toc)
        with open(toc, 'rb') as f:
            parsed_toc = parse_html_toc(f.read())
            for href, fragment, txt in parsed_toc:
                known = any(i.href == href and i.fragment == fragment
                            for i in self.flat())
                if not known:
                    self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        """Serialise the tree as an NCX document and write it to ``stream``.

        :param stream: object with a ``write`` method accepting ``bytes``;
            it receives pretty-printed, UTF-8 encoded XML.
        :param uid: identifier written to the ``dtb:uid`` meta element.
        """
        root = E.ncx(E.head(E.meta(name='dtb:uid', content=str(uid)),
                            E.meta(name='dtb:depth',
                                   content=str(self.depth())),
                            E.meta(name='dtb:generator', content='%s (%s)' %
                                   (__appname__, __version__)),
                            E.meta(name='dtb:totalPageCount', content='0'),
                            E.meta(name='dtb:maxPageNumber', content='0')),
                     E.docTitle(E.text('Table of Contents')))
        navmap = E.navMap()
        root.append(navmap)
        root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        # Mutable counter shared with the nested function; c[1] is the
        # number of navPoints emitted so far.
        c = collections.Counter()

        def navpoint(parent, np):
            # Append the navPoint for ``np`` (and its subtree) to ``parent``.
            text = np.text
            if not text:
                text = ''
            c[1] += 1
            item_id = 'num_%d' % c[1]
            text = clean_xml_chars(text)
            elem = E.navPoint(
                E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
                E.content(src=str(np.href)+(('#' + str(np.fragment))
                                            if np.fragment else '')),
                id=item_id,
                playOrder=str(np.play_order)
            )
            au = getattr(np, 'author', None)
            if au:
                au = re.sub(r'\s+', ' ', au)
                elem.append(C.meta(au, name='author'))
            desc = getattr(np, 'description', None)
            if desc:
                desc = re.sub(r'\s+', ' ', desc)
                try:
                    elem.append(C.meta(desc, name='description'))
                except ValueError:
                    # Retry after cleaning the text with clean_xml_chars().
                    elem.append(C.meta(clean_xml_chars(desc),
                                       name='description'))
            idx = getattr(np, 'toc_thumbnail', None)
            if idx:
                elem.append(C.meta(idx, name='toc_thumbnail'))
            parent.append(elem)
            for np2 in np:
                navpoint(elem, np2)

        for np in self:
            navpoint(navmap, np)
        raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                             pretty_print=True)
        stream.write(raw)