mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-07 04:04:12 +01:00
323 lines
13 KiB
Python
323 lines
13 KiB
Python
import re
|
|
import uuid
|
|
import urllib.parse
|
|
|
|
from lxml import etree
|
|
from collections import OrderedDict, Counter
|
|
|
|
from ebook_converter.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
|
|
from ebook_converter.ebooks import ConversionError
|
|
from ebook_converter.polyglot.builtins import itervalues
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
|
|
def XPath(x):
    """Compile the XPath expression *x* with the ``XPNSMAP`` namespace map.

    Returns the compiled ``etree.XPath`` object.  When lxml rejects the
    expression with ``XPathSyntaxError`` a ``ConversionError`` carrying the
    repr of the offending expression is raised instead.
    """
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        message = 'The syntax of the XPath expression %s is invalid.' % repr(x)
        raise ConversionError(message)
    return compiled
|
|
|
|
|
|
def isspace(x):
    """Return True if *x* is empty/None or holds only blank characters.

    Non-breaking spaces (U+00A0) are treated as blank.  The check is
    "nothing is left after removing them and stripping whitespace" rather
    than ``str.isspace()`` on the remainder: ``''.isspace()`` is False, so
    the latter wrongly reported a string made up solely of non-breaking
    spaces as having content.
    """
    return not x or not x.replace('\xa0', '').strip()
|
|
|
|
|
|
def at_start(elem):
    """Return True if there is no content before elem.

    The enclosing ``h:body`` (looked up with ``ancestor-or-self``) is walked
    in document order.  The walk succeeds when it reaches *elem* having
    seen only blank nodes; it fails on the first ``img``/``svg`` tag or on
    the first node carrying non-blank text.  When no body is found the
    answer is True.
    """
    bodies = XPath('ancestor-or-self::h:body')(elem)
    if not bodies:
        return True
    # Tail text of an ancestor comes after elem, so it is not checked.
    containers = frozenset(XPath('ancestor::*')(elem))
    for node in bodies[0].iter():
        if node is elem:
            return True
        # Only string tags are inspected; nodes whose tag has no
        # rpartition() skip the img/svg test.
        tag = getattr(node, 'tag', None)
        if hasattr(tag, 'rpartition') and \
                tag.rpartition('}')[-1] in {'img', 'svg'}:
            return False
        if not isspace(getattr(node, 'text', None)):
            return False
        if node not in containers and \
                not isspace(getattr(node, 'tail', None)):
            return False
    # elem was never reached while walking the body.
    return False
|
|
|
|
|
|
class DetectStructure(object):
    """Callable that detects document structure in an OEB book.

    Calling an instance with ``(oeb, opts)`` detects chapters, optionally
    rebuilds the table of contents, filters TOC entries, adds page breaks
    and records the "start reading at" guide entry, all driven by the
    attributes of ``opts`` read in the individual methods.
    """

    # Matches a trailing "/@attribute" step of an XPath expression.
    _TITLE_ATTRIBUTE_RE = re.compile(r'/@([-\w]+)$')

    def __call__(self, oeb, opts):
        """Run structure detection on *oeb* using the options in *opts*.

        Stores ``oeb.log``, *oeb* and *opts* on the instance, then runs the
        steps below in order.  The spine documents and ``oeb.toc`` /
        ``oeb.guide`` are modified in place; nothing is returned.
        """
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            orig_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                # The level based expressions produced nothing: fall back
                # to detected chapters, then to links.
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
                # The generated TOC is smaller than the one we started
                # with, so keep the original.
                self.oeb.toc = orig_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                         self.oeb.toc.count())

        if opts.toc_filter is not None:
            regexp = re.compile(opts.toc_filter)
            # Iterate over a copy because nodes are removed while looping.
            for node in list(self.oeb.toc.iter()):
                if not node.title or regexp.search(node.title) is not None:
                    self.log('Filtering', node.title or 'empty node',
                             'from TOC')
                    self.oeb.toc.remove(node)

        if opts.page_breaks_before is not None:
            pb_xpath = XPath(opts.page_breaks_before)
            headings = {'h1', 'h2'}
            for item in oeb.spine:
                for elem in pb_xpath(item.data):
                    prev = next(elem.itersiblings(tag=etree.Element,
                                                  preceding=True), None)
                    if (prev is not None and
                            barename(elem.tag) in headings and
                            barename(prev.tag) in headings and
                            (not prev.tail or not prev.tail.split())):
                        # We have two adjacent headings, do not put a page
                        # break on the second one
                        continue

                    style = elem.get('style', '')
                    if style:
                        style += '; '
                    elem.set('style', style+'page-break-before:always')

        for node in self.oeb.toc.iter():
            if not node.title or not node.title.strip():
                # NOTE(review): `_` is not defined in the visible part of
                # this file; presumably a translation function installed
                # as a builtin -- confirm.
                node.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()

    def _find_matches(self, expr, doc, what):
        """Return the matches of XPath *expr* in *doc*, or [] on failure.

        Any ``Exception`` raised while compiling or evaluating the
        expression, or while taking ``len()`` of the result, is logged as a
        warning naming *what* ('chapter' or 'ToC') and swallowed.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not
        caught.
        """
        try:
            ans = XPath(expr)(doc)
            # Presumably here to reject results that are not sequences.
            len(ans)
            return ans
        except Exception:
            self.log.warn('Invalid %s expression, ignoring: %s' %
                          (what, expr))
            return []

    def detect_start_reading(self):
        """Point the guide's 'text' entry at the start-reading position.

        The first element matched by ``opts.start_reading_at`` in spine
        order is used.  It is given a generated id when it has none, and any
        existing 'text' guide entry is replaced.  An invalid expression or
        no match results in a logged warning only.
        """
        expr = self.opts.start_reading_at
        try:
            expr = XPath(expr)
        except Exception:
            self.log.warn(
                'Invalid start reading at XPath expression, ignoring: %s'
                % expr)
            return
        for item in self.oeb.spine:
            if not hasattr(item.data, 'xpath'):
                continue
            matches = expr(item.data)
            if not matches:
                continue
            elem = matches[0]
            eid = elem.get('id', None)
            if not eid:
                eid = 'start_reading_at_' + uuid.uuid4().hex
                elem.set('id', eid)
            if 'text' in self.oeb.guide:
                self.oeb.guide.remove('text')
            self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
            self.log('Setting start reading at position to %s in %s' % (
                self.opts.start_reading_at, item.href))
            return
        self.log.warn("Failed to find start reading at position: %s" %
                      self.opts.start_reading_at)

    def get_toc_parts_for_xpath(self, expr):
        """Split *expr* into ``(element_path, title_attribute)``.

        If an attribute is selected by the xpath expr then truncate it
        from the path and instead return it as where to find the title
        text; otherwise the attribute part is None.
        """
        match = self._TITLE_ATTRIBUTE_RE.search(expr)
        if match is not None:
            return expr[0:match.start()], match.group(1)

        return expr, None

    def detect_chapters(self):
        """Find chapter elements and mark them as ``opts.chapter_mark`` asks.

        Fills ``self.detected_chapters`` with ``(spine item, element)``
        pairs matched by ``opts.chapter`` and sets
        ``self.chapter_title_attribute``.  Unless the mark is 'none', a
        marker element is inserted before each chapter element.
        """
        self.detected_chapters = []
        self.chapter_title_attribute = None

        if not self.opts.chapter:
            return

        chapter_path, title_attribute = self.get_toc_parts_for_xpath(
            self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in self._find_matches(chapter_path, item.data, 'chapter'):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        seen_per_item = Counter()
        for item, elem in self.detected_chapters:
            seen_per_item[item] += 1
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                if seen_per_item[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if they
                    # are at the start of the file, in which case inserting a
                    # page break is unnecessary and can lead to extra blank
                    # pages in the PDF Output plugin. We need to use two as
                    # feedbooks epubs match both a heading tag and its
                    # containing div with the default chapter expression.
                    continue
                mark = elem.makeelement(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = elem.makeelement(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

    def create_level_based_toc(self):
        """Build the TOC from the level XPaths if ``level1_toc`` is set."""
        if self.opts.level1_toc is not None:
            self.add_leveled_toc_items()

    def create_toc_from_chapters(self):
        """Add one TOC entry per detected chapter.

        Play orders start at ``toc.next_play_order()`` and increase by one
        per chapter; the same number is used for generated element ids.
        """
        counter = self.oeb.toc.next_play_order()
        for item, elem in self.detected_chapters:
            text, href = self.elem_to_link(
                item, elem, self.chapter_title_attribute, counter)
            self.oeb.toc.add(text, href, play_order=counter)
            counter += 1

    def create_toc_from_links(self):
        """Add TOC entries for the ``<a href>`` links found in the spine.

        Only links with no URL scheme or the 'file' scheme are considered.
        Targets already in the TOC are skipped, as are repeated link texts
        unless ``opts.duplicate_links_in_toc`` is set.  Stops once
        ``opts.max_toc_links`` entries were added (when that is > 0).
        """
        num = 0
        find_links = XPath('//h:a[@href]')
        for item in self.oeb.spine:
            for a in find_links(item.data):
                href = a.get('href')
                try:
                    purl = urllib.parse.urlparse(href)
                except ValueError:
                    self.log.warning('Ignoring malformed URL:', href)
                    continue
                if purl[0] and purl[0] != 'file':
                    continue
                href, frag = purl.path, purl.fragment
                href = item.abshref(href)
                if frag:
                    href = '#'.join((href, frag))
                if self.oeb.toc.has_href(href):
                    continue
                text = xml2text(a)
                text = text[:100].strip()
                if (not self.opts.duplicate_links_in_toc and
                        self.oeb.toc.has_text(text)):
                    continue
                try:
                    self.oeb.toc.add(
                        text, href,
                        play_order=self.oeb.toc.next_play_order())
                except ValueError:
                    # Most likely an incorrectly URL encoded link
                    self.oeb.log.exception(
                        'Failed to process link: %r' % href)
                    continue
                num += 1
                if self.opts.max_toc_links > 0 and \
                        num >= self.opts.max_toc_links:
                    self.log('Maximum TOC links reached, stopping.')
                    return

    def elem_to_link(self, item, elem, title_attribute, counter):
        """Return ``(text, href)`` for a TOC entry pointing at *elem*.

        The text comes from the first non-empty of: the *title_attribute*
        attribute (when given), the element's text content, its 'title'
        attribute, its 'alt' attribute.  Whitespace runs are collapsed and
        the result is limited to 1000 characters.  The href is
        ``item.href`` plus the element id; an element whose id is missing
        or empty is given the id ``calibre_toc_<counter>`` so the fragment
        is never empty.
        """
        text = ''
        if title_attribute is not None:
            text = elem.get(title_attribute, '')
        if not text:
            text = xml2text(elem).strip()
        if not text:
            text = elem.get('title', '')
        if not text:
            text = elem.get('alt', '')
        text = re.sub(r'\s+', ' ', text.strip())
        text = text[:1000].strip()
        elem_id = elem.get('id')
        if not elem_id:
            elem_id = 'calibre_toc_%d' % counter
            elem.set('id', elem_id)
        href = '#'.join((item.href, elem_id))
        return text, href

    def _add_nested_toc_items(self, document, expr, parents, fallback_parent,
                              counter, added_children=None):
        """Add entries matched by *expr* in *document* below parent nodes.

        *parents* maps already handled elements to their TOC nodes.  Each
        match is attached to the node of the last such element that comes
        before it in document order, or to *fallback_parent* when there is
        none; with no parent at all the match is dropped.  New nodes are
        recorded in *added_children* when it is given.  Returns the updated
        *counter*.
        """
        path, title_attribute = self.get_toc_parts_for_xpath(expr)
        for elem in self._find_matches(path, document.data, 'ToC'):
            parent = None
            for item in document.data.iterdescendants():
                if item in parents:
                    parent = parents[item]
                elif item == elem:
                    if parent is None:
                        parent = fallback_parent
                    if parent is None:
                        break
                    text, href = self.elem_to_link(
                        document, elem, title_attribute, counter)
                    counter += 1
                    if text:
                        node = parent.add(
                            text, href,
                            play_order=self.oeb.toc.next_play_order())
                        if added_children is not None:
                            added_children[elem] = node
                    break
        return counter

    def add_leveled_toc_items(self):
        """Build an up to three level TOC from the ``level*_toc`` XPaths.

        Level 1 matches are added to the TOC root.  Level 2 and level 3
        matches are nested below the closest preceding entry of the level
        above, falling back to the last such entry created for an earlier
        spine document.
        """
        added = OrderedDict()
        added2 = OrderedDict()
        counter = 1

        for document in self.oeb.spine:
            # Last nodes created for earlier documents, used as parents for
            # entries that come before any new parent in this document.
            previous_level1 = added[next(reversed(added))] if added else None
            previous_level2 = (added2[next(reversed(added2))]
                               if added2 else None)

            level1_toc, level1_title = self.get_toc_parts_for_xpath(
                self.opts.level1_toc)
            for elem in self._find_matches(level1_toc, document.data, 'ToC'):
                text, href = self.elem_to_link(
                    document, elem, level1_title, counter)
                counter += 1
                if text:
                    added[elem] = self.oeb.toc.add(
                        text, href,
                        play_order=self.oeb.toc.next_play_order())

            if self.opts.level2_toc is not None and added:
                counter = self._add_nested_toc_items(
                    document, self.opts.level2_toc, added, previous_level1,
                    counter, added2)

                if self.opts.level3_toc is not None and added2:
                    counter = self._add_nested_toc_items(
                        document, self.opts.level3_toc, added2,
                        previous_level2, counter)
|