mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-08 04:34:12 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
297 lines
11 KiB
Python
297 lines
11 KiB
Python
#!/usr/bin/env python2
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import os, glob, re, functools
|
|
from collections import Counter
|
|
|
|
from lxml import etree
|
|
from lxml.builder import ElementMaker
|
|
|
|
from ebook_converter.constants import __appname__, __version__
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
|
from ebook_converter.utils.cleantext import clean_xml_chars
|
|
from ebook_converter.polyglot.builtins import unicode_type, getcwd
|
|
from ebook_converter.polyglot.urllib import unquote, urlparse
|
|
|
|
# XML namespace URIs used when building NCX documents.
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"

# Prefix -> namespace map handed to both element factories below; the NCX
# namespace is registered as the default (un-prefixed) one.
NSMAP = {
    None: NCX_NS,
    'calibre': CALIBRE_NS,
}

# Element factories: E creates elements in the NCX namespace, C creates
# elements in the calibre metadata namespace.
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
|
|
|
|
|
|
def parse_html_toc(data):
    """Yield ``(href, fragment, text)`` for every link in an HTML ToC.

    :param data: The HTML document as bytes or text. Bytes are first decoded
        with ``xml_to_unicode``.
    :return: A generator of 3-tuples: the link path with surrounding
        whitespace stripped, the link fragment (stripped, or ``None`` when
        the link has no fragment) and the text content of the ``<a>``
        element as serialized by ``etree.tostring(..., method='text')``.
    """
    # Imported inside the function, so html5_parser is only needed when an
    # HTML table of contents is actually parsed.
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    from lxml import etree

    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True,
                              resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)

    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        # Split the URL *before* percent-decoding it. Decoding first turns an
        # encoded '%23' or '%3F' that belongs to the file name into a literal
        # '#' or '?', which urlparse would then treat as the start of the
        # fragment/query. TOC.read_ncx_toc decodes in the same order.
        purl = urlparse(anchor.get('href'))
        href = unquote(purl[2]).strip()
        fragment = unquote(purl[5])
        fragment = fragment.strip() if fragment else None

        txt = etree.tostring(anchor, method='text', encoding='unicode')
        yield href, fragment, txt
|
|
|
|
|
|
class TOC(list):
    """A table-of-contents node that is also the list of its child nodes.

    Each node stores the target of the entry (``href`` and ``fragment``),
    its label (``text``), a reference to its ``parent`` node and some
    optional metadata. Because the class derives from ``list``, the children
    of a node are simply the items of the node itself.
    """

    def __init__(self, href=None, fragment=None, text=None, parent=None,
                 play_order=0, base_path=getcwd(), type='unknown', author=None,
                 description=None, toc_thumbnail=None):
        """Store the given values as attributes of the node.

        A falsy ``fragment`` (for example ``''``) is normalized to ``None``.

        NOTE: the ``base_path`` default is evaluated once, when the class
        body is executed, not on every call.
        """
        self.href = href
        self.fragment = fragment if fragment else None
        self.text = text
        self.parent = parent
        self.base_path = base_path
        self.play_order = play_order
        self.type = type
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def __str__(self):
        """Return a tab-indented, one-line-per-node dump of the subtree."""
        lines = ['TOC: %s#%s %s' % (self.href, self.fragment, self.text)]
        for child in self:
            for line in unicode_type(child).splitlines():
                lines.append('\t' + line)
        return '\n'.join(lines)

    def count(self, type):
        """Return how many nodes of the subtree (self included) have ``type``."""
        return sum(1 for node in self.flat() if node.type == type)

    def purge(self, types, max=0):
        """Detach nodes whose type is in ``types``, keeping the first ``max``.

        Nodes are visited in the order produced by :meth:`flat`. Matching
        nodes that have no parent (such as the root) are left in place but
        are still part of the returned list.

        :return: The list of matching nodes after the first ``max``.
        """
        matches = [entry for entry in self.flat() if entry.type in types]
        doomed = matches[max:]
        for entry in doomed:
            if entry.parent is None:
                continue
            entry.parent.remove(entry)
        return doomed

    def remove(self, entry):
        """Remove the child ``entry`` from this node and clear its parent.

        The child is located by identity. ``list.remove`` compares with
        ``==``, and since nodes are lists, two different nodes with equal
        children (for example any two leaves) compare equal, so it could
        drop the wrong sibling. If no identical child exists, the call falls
        back to ``list.remove`` and therefore still raises ``ValueError``
        when nothing matches.
        """
        for index, child in enumerate(self):
            if child is entry:
                del self[index]
                break
        else:
            list.remove(self, entry)
        entry.parent = None

    def add_item(self, href, fragment, text, play_order=None, type='unknown',
                 author=None, description=None, toc_thumbnail=None):
        """Append a new child node and return it.

        When ``play_order`` is ``None`` it becomes one more than the play
        order of the last child, or of this node if it has no children yet.
        The child inherits this node's ``base_path``.
        """
        if play_order is None:
            previous = self[-1].play_order if len(self) else self.play_order
            play_order = previous + 1
        child = TOC(href=href, fragment=fragment, text=text, parent=self,
                    base_path=self.base_path, play_order=play_order,
                    type=type, author=author, description=description,
                    toc_thumbnail=toc_thumbnail)
        self.append(child)
        return child

    def top_level_items(self):
        """Yield the direct children whose ``text`` is not ``None``."""
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        """Return the height of the subtree; a node without children is 1."""
        depth = 1
        for child in self:
            child_depth = child.depth()
            if child_depth + 1 > depth:
                depth = child_depth + 1
        return depth

    def flat(self):
        'Depth first iteration over the tree rooted at self'
        yield self
        for child in self:
            for node in child.flat():
                yield node

    @property
    def abspath(self):
        'Return the file this toc entry points to as a absolute path to a file on the system.'
        if self.href is None:
            return None
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        """Locate the table of contents referenced by an OPF and load it.

        The reference is looked up, in order, in the ``toc`` attribute of
        the ``<spine>`` element, in a guide ``<reference type="toc">`` and
        finally in the first manifest item whose href contains ``toc``.
        A reference equal to ``ncx``/``ncxtoc`` (case-insensitive) is treated
        as a manifest id of an NCX file; anything else as the path of an HTML
        table of contents.

        :param opfreader: Object exposing ``soup`` (with ``find``) and
            ``manifest`` (iterable of items with ``href()``, plus ``item()``).
        """
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find(
                    'reference', attrs={'type': 'toc'})['href']
            except Exception:
                # No usable guide reference: fall back to guessing from the
                # manifest hrefs.
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is None:
            return

        if toc.lower() not in ('ncx', 'ncxtoc'):
            toc = urlparse(unquote(toc))[2]
            toc = toc.replace('/', os.sep)
            if not os.path.isabs(toc):
                toc = os.path.join(self.base_path, toc)
            try:
                if not os.path.exists(toc):
                    bn = os.path.basename(toc)
                    bn = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
                    toc = os.path.join(os.path.dirname(toc), bn)

                self.read_html_toc(toc)
            except Exception:
                # Reading the HTML ToC is best effort.
                print('WARNING: Could not read Table of Contents. Continuing anyway.')
        else:
            path = opfreader.manifest.item(toc.lower())
            path = getattr(path, 'path', path)
            if path and os.access(path, os.R_OK):
                try:
                    self.read_ncx_toc(path)
                except Exception as err:
                    print('WARNING: Invalid NCX file:', err)
                return
            # The manifest gave no readable file: use the first *.ncx file
            # found in the base directory, if any.
            cwd = os.path.abspath(self.base_path)
            m = glob.glob(os.path.join(cwd, '*.ncx'))
            if m:
                toc = m[0]
                self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc, root=None):
        """Populate this node from an NCX document.

        :param toc: Path of the NCX file; its directory becomes
            ``self.base_path``.
        :param root: Optional already parsed root element. When ``None`` the
            file at ``toc`` is read and parsed.
        :raises ValueError: If the document has no ``navmap`` element.
        """
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(), assume_utf8=True,
                                     strip_encoding_pats=True)[0]
            root = safe_xml_fromstring(raw)
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            # Return the first non-empty attribute whose lower-cased name
            # ends with ``attr``.
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        # Element names are matched case-insensitively and by suffix.
        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            # Convert one navPoint (and, recursively, its children) into
            # entries below ``dest``.
            try:
                play_order = int(get_attr(np, 1))
            except (TypeError, ValueError):
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''.join(
                    etree.tostring(txt, method='text', encoding='unicode',
                                   with_tail=False)
                    for txt in txt_path(nl))
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # A missing src attribute is treated as an empty URL.
                    purl = urlparse(content.get('src') or '')
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            # Children of a navPoint that produced no entry are attached to
            # ``dest`` itself.
            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)

    def read_html_toc(self, toc):
        """Populate this node from the links of an HTML table of contents.

        A link is skipped when a node with the same ``href`` and ``fragment``
        already exists anywhere in the subtree.

        :param toc: Path of the HTML file; its directory becomes
            ``self.base_path``.
        """
        self.base_path = os.path.dirname(toc)
        # Plain ``open`` is used, as in read_ncx_toc; ``lopen`` is neither
        # imported nor defined in this module.
        with open(toc, 'rb') as f:
            parsed_toc = parse_html_toc(f.read())
        for href, fragment, txt in parsed_toc:
            already_present = any(
                node.href == href and node.fragment == fragment
                for node in self.flat())
            if not already_present:
                self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        """Serialize the tree as an NCX document and write it to ``stream``.

        :param stream: Object with a ``write`` method accepting the UTF-8
            encoded bytes of the document.
        :param uid: Value written as the ``dtb:uid`` meta entry.
        """
        root = E.ncx(
            E.head(
                E.meta(name='dtb:uid', content=unicode_type(uid)),
                E.meta(name='dtb:depth', content=unicode_type(self.depth())),
                E.meta(name='dtb:generator',
                       content='%s (%s)' % (__appname__, __version__)),
                E.meta(name='dtb:totalPageCount', content='0'),
                E.meta(name='dtb:maxPageNumber', content='0'),
            ),
            E.docTitle(E.text('Table of Contents')),
        )
        navmap = E.navMap()
        root.append(navmap)
        root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        # Running counter used to build unique navPoint ids.
        c = Counter()

        def navpoint(parent, np):
            # Append the navPoint element for ``np`` (and its descendants)
            # to ``parent``.
            text = np.text
            if not text:
                text = ''
            c[1] += 1
            item_id = 'num_%d' % c[1]
            text = clean_xml_chars(text)
            src = unicode_type(np.href)
            if np.fragment:
                src += '#' + unicode_type(np.fragment)
            elem = E.navPoint(
                E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
                E.content(src=src),
                id=item_id,
                playOrder=unicode_type(np.play_order)
            )
            au = getattr(np, 'author', None)
            if au:
                au = re.sub(r'\s+', ' ', au)
                elem.append(C.meta(au, name='author'))
            desc = getattr(np, 'description', None)
            if desc:
                desc = re.sub(r'\s+', ' ', desc)
                try:
                    elem.append(C.meta(desc, name='description'))
                except ValueError:
                    # Retry with the characters clean_xml_chars removes
                    # taken out.
                    elem.append(C.meta(clean_xml_chars(desc),
                                       name='description'))
            idx = getattr(np, 'toc_thumbnail', None)
            if idx:
                elem.append(C.meta(idx, name='toc_thumbnail'))
            parent.append(elem)
            for np2 in np:
                navpoint(elem, np2)

        for np in self:
            navpoint(navmap, np)
        raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                             pretty_print=True)
        stream.write(raw)
|