1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-17 07:13:42 +01:00

Fixing leftovers from first concept of constants

This commit is contained in:
2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions

View File

@@ -7,7 +7,7 @@ from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import TOC, xml2text
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks import ConversionError
@@ -15,8 +15,8 @@ def XPath(x):
try:
return etree.XPath(x, namespaces=const.XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
raise ConversionError('The syntax of the XPath expression %s is '
'invalid.' % repr(x))
def isspace(x):
@@ -33,9 +33,13 @@ def at_start(elem):
for x in body.iter():
if x is elem:
return True
if hasattr(getattr(x, 'tag', None), 'rpartition') and x.tag.rpartition('}')[-1] in {'img', 'svg'}:
if hasattr(getattr(x, 'tag', None),
'rpartition') and x.tag.rpartition('}')[-1] in {'img',
'svg'}:
return False
if isspace(getattr(x, 'text', None)) and (x in ancestors or isspace(getattr(x, 'tail', None))):
if isspace(getattr(x, 'text', None)) and (x in ancestors or
isspace(getattr(x, 'tail',
None))):
continue
return False
return False
@@ -52,7 +56,7 @@ class DetectStructure(object):
self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc
self.oeb.toc = TOC()
self.oeb.toc = base.TOC()
self.create_level_based_toc()
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
@@ -64,14 +68,14 @@ class DetectStructure(object):
else:
self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count())
self.oeb.toc.count())
if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter)
for node in list(self.oeb.toc.iter()):
if not node.title or regexp.search(node.title) is not None:
self.log('Filtering', node.title if node.title else
'empty node', 'from TOC')
'empty node', 'from TOC')
self.oeb.toc.remove(node)
if opts.page_breaks_before is not None:
@@ -80,10 +84,11 @@ class DetectStructure(object):
for elem in pb_xpath(item.data):
try:
prev = next(elem.itersiblings(tag=etree.Element,
preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())):
preceding=True))
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and
parse_utils.barename(prev.tag) in {'h1',
'h2'} and
(not prev.tail or not prev.tail.split())):
# We have two adjacent headings, do not put a page
# break on the second one
continue
@@ -106,9 +111,9 @@ class DetectStructure(object):
expr = self.opts.start_reading_at
try:
expr = XPath(expr)
except:
self.log.warn(
'Invalid start reading at XPath expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid start reading at XPath expression, '
'ignoring: %s' % expr)
return
for item in self.oeb.spine:
if not hasattr(item.data, 'xpath'):
@@ -118,16 +123,17 @@ class DetectStructure(object):
elem = matches[0]
eid = elem.get('id', None)
if not eid:
eid = 'start_reading_at_'+str(uuid.uuid4()).replace('-', '')
eid = 'start_reading_at_' + str(uuid.uuid4()).replace('-',
'')
elem.set('id', eid)
if 'text' in self.oeb.guide:
self.oeb.guide.remove('text')
self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
self.log('Setting start reading at position to %s in %s'%(
self.opts.start_reading_at, item.href))
self.log('Setting start reading at position to %s in %s' %
(self.opts.start_reading_at, item.href))
return
self.log.warn("Failed to find start reading at position: %s"%
self.opts.start_reading_at)
self.log.warn("Failed to find start reading at position: %s" %
self.opts.start_reading_at)
def get_toc_parts_for_xpath(self, expr):
# if an attribute is selected by the xpath expr then truncate it
@@ -148,12 +154,14 @@ class DetectStructure(object):
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid chapter expression, ignoring: %s' %
expr)
return []
if self.opts.chapter:
chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
chapter_path, title_attribute = (
self.get_toc_parts_for_xpath(self.opts.chapter))
self.chapter_title_attribute = title_attribute
for item in self.oeb.spine:
for x in find_matches(chapter_path, item.data):
@@ -165,25 +173,28 @@ class DetectStructure(object):
c = collections.Counter()
for item, elem in self.detected_chapters:
c[item] += 1
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
if chapter_mark == 'rule':
mark = elem.makeelement(const.XHTML_HR)
mark = elem.makeelement(base.tag('xhtml', 'hr'))
elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they
# are at the start of the file, in which case inserting a
# page break is unnecessary and can lead to extra blank
# pages in the PDF Output plugin. We need to use two as
# feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression.
# For the first two elements in this item, check if
# they are at the start of the file, in which case
# inserting a page break is unnecessary and can lead
# to extra blank pages in the PDF Output plugin. We
# need to use two as feedbooks epubs match both a
# heading tag and its containing div with the default
# chapter expression.
continue
mark = elem.makeelement(const.XHTML_DIV, style=page_break_after)
mark = elem.makeelement(base.tag('xhtml', 'div'),
style=page_break_after)
else: # chapter_mark == 'both':
mark = elem.makeelement(const.XHTML_HR, style=page_break_before)
mark = elem.makeelement(base.tag('xhtml', 'hr'),
style=page_break_before)
try:
elem.addprevious(mark)
except TypeError:
@@ -196,7 +207,9 @@ class DetectStructure(object):
def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, self.chapter_title_attribute, counter)
text, href = self.elem_to_link(item, elem,
self.chapter_title_attribute,
counter)
self.oeb.toc.add(text, href, play_order=counter)
counter += 1
@@ -216,18 +229,21 @@ class DetectStructure(object):
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = xml2text(a)
text = base.xml2text(a)
text = text[:100].strip()
if (not self.opts.duplicate_links_in_toc and
self.oeb.toc.has_text(text)):
continue
try:
self.oeb.toc.add(text, href,
self.oeb.toc.add(
text, href,
play_order=self.oeb.toc.next_play_order())
num += 1
except ValueError:
self.oeb.log.exception('Failed to process link: %r' % href)
continue # Most likely an incorrectly URL encoded link
self.oeb.log.exception('Failed to process link: '
'%r' % href)
# Most likely an incorrectly URL encoded link
continue
if self.opts.max_toc_links > 0 and \
num >= self.opts.max_toc_links:
self.log('Maximum TOC links reached, stopping.')
@@ -238,14 +254,14 @@ class DetectStructure(object):
if title_attribute is not None:
text = elem.get(title_attribute, '')
if not text:
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
id = elem.get('id', 'calibre_toc_%d' % counter)
elem.set('id', id)
href = '#'.join((item.href, id))
return text, href
@@ -260,26 +276,29 @@ class DetectStructure(object):
ans = XPath(expr)(doc)
len(ans)
return ans
except:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
except Exception:
self.log.warn('Invalid ToC expression, ignoring: %s' % expr)
return []
for document in self.oeb.spine:
previous_level1 = list(added.values())[-1] if added else None
previous_level2 = list(added2.values())[-1] if added2 else None
level1_toc, level1_title = self.get_toc_parts_for_xpath(self.opts.level1_toc)
(level1_toc,
level1_title) = self.get_toc_parts_for_xpath(self.opts.level1_toc)
for elem in find_matches(level1_toc, document.data):
text, _href = self.elem_to_link(document, elem, level1_title, counter)
text, _href = self.elem_to_link(document, elem, level1_title,
counter)
counter += 1
if text:
node = self.oeb.toc.add(text, _href,
play_order=self.oeb.toc.next_play_order())
node = self.oeb.toc.add(
text, _href, play_order=self.oeb.toc.next_play_order())
added[elem] = node
# node.add('Top', _href)
if self.opts.level2_toc is not None and added:
level2_toc, level2_title = self.get_toc_parts_for_xpath(self.opts.level2_toc)
level2_toc, level2_title = self.get_toc_parts_for_xpath(
self.opts.level2_toc)
for elem in find_matches(level2_toc, document.data):
level1 = None
for item in document.data.iterdescendants():
@@ -290,15 +309,19 @@ class DetectStructure(object):
if previous_level1 is None:
break
level1 = previous_level1
text, _href = self.elem_to_link(document, elem, level2_title, counter)
text, _href = self.elem_to_link(document, elem,
level2_title,
counter)
counter += 1
if text:
added2[elem] = level1.add(text, _href,
added2[elem] = level1.add(
text, _href,
play_order=self.oeb.toc.next_play_order())
break
if self.opts.level3_toc is not None and added2:
level3_toc, level3_title = self.get_toc_parts_for_xpath(self.opts.level3_toc)
level3_toc, level3_title = self.get_toc_parts_for_xpath(
self.opts.level3_toc)
for elem in find_matches(level3_toc, document.data):
level2 = None
for item in document.data.iterdescendants():
@@ -309,10 +332,13 @@ class DetectStructure(object):
if previous_level2 is None:
break
level2 = previous_level2
text, _href = \
self.elem_to_link(document, elem, level3_title, counter)
text, _href = self.elem_to_link(document,
elem,
level3_title,
counter)
counter += 1
if text:
level2.add(text, _href,
play_order=self.oeb.toc.next_play_order())
play_order=self.oeb
.toc.next_play_order())
break