import re
import uuid
import urllib.parse
from lxml import etree
from collections import OrderedDict, Counter

from ebook_converter.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
from ebook_converter.ebooks import ConversionError
from ebook_converter.polyglot.builtins import itervalues


__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'

# Matches a trailing "/@attr" step of an XPath expression; group 1 is the
# attribute name. Compiled once instead of on every lookup.
_TITLE_ATTRIBUTE_RE = re.compile(r'/@([-\w]+)$')


def XPath(x):
    """Compile the XPath expression ``x`` using the ``XPNSMAP`` namespaces.

    :param x: XPath expression text.
    :return: a compiled ``etree.XPath`` callable.
    :raises ConversionError: if lxml reports a syntax error in ``x``; the
        original lxml error is attached as the cause.
    """
    try:
        return etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError as err:
        raise ConversionError(
            'The syntax of the XPath expression %s is invalid.' % repr(x)
        ) from err


def isspace(x):
    """Return True if ``x`` is None, empty, or nothing but whitespace.

    Non-breaking spaces (``\\xa0``) are treated as whitespace. The check is
    done by stripping rather than with ``str.isspace()`` because the latter
    is False for an empty string, which made a string consisting only of
    non-breaking spaces count as content.
    """
    return not x or not x.replace('\xa0', '').strip()


def at_start(elem):
    """Return True if there is no content before ``elem`` in its body.

    The enclosing ``h:body`` is walked with ``iter()``. Reaching ``elem``
    means nothing visible came first. An ``img`` or ``svg`` element, or any
    node carrying non-blank text, counts as content. If ``elem`` has no
    ``h:body`` ancestor the function returns True.
    """
    bodies = XPath('ancestor-or-self::h:body')(elem)
    if not bodies:
        return True
    body = bodies[0]
    ancestors = frozenset(XPath('ancestor::*')(elem))
    for node in body.iter():
        if node is elem:
            return True
        tag = getattr(node, 'tag', None)
        # Only string tags are inspected; anything else (no rpartition)
        # falls through to the text checks below.
        if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] in {'img', 'svg'}:
            return False
        blank_text = isspace(getattr(node, 'text', None))
        # The tail of an ancestor comes after elem, so it is ignored; for
        # every other node the tail precedes elem and must be blank too.
        if blank_text and (node in ancestors
                           or isspace(getattr(node, 'tail', None))):
            continue
        return False
    return False


class DetectStructure(object):
    """Detect chapters, build a TOC and insert page breaks in an OEB book.

    Instances are callable: ``DetectStructure()(oeb, opts)``.
    """

    def __call__(self, oeb, opts):
        """Run structure detection on ``oeb`` as configured by ``opts``.

        Steps, in order: detect and mark chapters, optionally regenerate the
        TOC, filter TOC entries, add page breaks, give untitled TOC nodes a
        placeholder title, and set the "start reading at" guide entry.
        """
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            self._regenerate_toc()
        if opts.toc_filter is not None:
            self._filter_toc(opts.toc_filter)
        if opts.page_breaks_before is not None:
            self._insert_page_breaks(opts.page_breaks_before)

        for node in self.oeb.toc.iter():
            if not node.title or not node.title.strip():
                # NOTE(review): `_` is not imported in this module; it is
                # presumably installed as a builtin (gettext) by the
                # application -- confirm.
                node.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()

    def _regenerate_toc(self):
        """Replace the TOC with an auto-generated one when that is better."""
        opts = self.opts
        orig_toc = self.oeb.toc
        self.oeb.toc = TOC()
        self.create_level_based_toc()
        if self.oeb.toc.count() < 1:
            if not opts.no_chapters_in_toc and self.detected_chapters:
                self.create_toc_from_chapters()
            if self.oeb.toc.count() < opts.toc_threshold:
                self.create_toc_from_links()
        if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
            # The generated TOC is nearly empty while the original was not:
            # keep the original.
            self.oeb.toc = orig_toc
        else:
            self.oeb.auto_generated_toc = True
            self.log('Auto generated TOC with %d entries.'
                     % self.oeb.toc.count())

    def _filter_toc(self, pattern):
        """Remove TOC nodes with no title or a title matching ``pattern``."""
        regexp = re.compile(pattern)
        # Iterate over a snapshot because nodes are removed while looping.
        for node in list(self.oeb.toc.iter()):
            if not node.title or regexp.search(node.title) is not None:
                self.log('Filtering', node.title or 'empty node', 'from TOC')
                self.oeb.toc.remove(node)

    def _insert_page_breaks(self, expr):
        """Add ``page-break-before:always`` to elements matching ``expr``."""
        pb_xpath = XPath(expr)
        headings = {'h1', 'h2'}
        for item in self.oeb.spine:
            for elem in pb_xpath(item.data):
                prev = next(
                    elem.itersiblings(tag=etree.Element, preceding=True),
                    None)
                if (prev is not None
                        and barename(elem.tag) in headings
                        and barename(prev.tag) in headings
                        and (not prev.tail or not prev.tail.split())):
                    # We have two adjacent headings, do not put a page
                    # break on the second one
                    continue
                style = elem.get('style', '')
                if style:
                    style += '; '
                elem.set('style', style + 'page-break-before:always')

    def _find_matches(self, expr, doc, kind):
        """Evaluate XPath ``expr`` on ``doc``; return ``[]`` on any failure.

        ``kind`` names the expression type in the warning that is logged
        when compiling or evaluating the expression fails.
        """
        try:
            ans = XPath(expr)(doc)
            # Raises TypeError for results that have no length, so those
            # are rejected here rather than when the caller iterates.
            len(ans)
        except Exception:
            self.log.warn('Invalid %s expression, ignoring: %s' % (kind, expr))
            return []
        return ans

    def detect_start_reading(self):
        """Point the guide's ``text`` entry at the first XPath match.

        The first spine item with a match for ``opts.start_reading_at``
        wins. The matched element gets a generated ``id`` if it has none.
        A warning is logged if the expression is invalid or matches nothing.
        """
        expr = self.opts.start_reading_at
        try:
            expr = XPath(expr)
        except Exception:
            self.log.warn(
                'Invalid start reading at XPath expression, ignoring: %s'
                % expr)
            return
        for item in self.oeb.spine:
            if not hasattr(item.data, 'xpath'):
                continue
            matches = expr(item.data)
            if not matches:
                continue
            elem = matches[0]
            eid = elem.get('id', None)
            if not eid:
                eid = 'start_reading_at_' + uuid.uuid4().hex
                elem.set('id', eid)
            if 'text' in self.oeb.guide:
                self.oeb.guide.remove('text')
            self.oeb.guide.add('text', 'Start', item.href + '#' + eid)
            self.log('Setting start reading at position to %s in %s' % (
                self.opts.start_reading_at, item.href))
            return
        self.log.warn("Failed to find start reading at position: %s"
                      % self.opts.start_reading_at)

    def get_toc_parts_for_xpath(self, expr):
        """Split a trailing ``/@attribute`` step off ``expr``.

        :return: ``(path, attribute_name)`` when ``expr`` ends by selecting
            an attribute, so the title can be read from that attribute of
            the element matched by ``path``; otherwise ``(expr, None)``.
        """
        match = _TITLE_ATTRIBUTE_RE.search(expr)
        if match is not None:
            return expr[:match.start()], match.group(1)
        return expr, None

    def detect_chapters(self):
        """Collect chapter elements and insert the configured chapter marks.

        Fills ``self.detected_chapters`` with ``(spine_item, element)``
        pairs and sets ``self.chapter_title_attribute``. Depending on
        ``opts.chapter_mark`` an ``hr`` or ``div`` element is inserted
        before each detected chapter element.
        """
        self.detected_chapters = []
        self.chapter_title_attribute = None

        if not self.opts.chapter:
            return

        chapter_path, title_attribute = self.get_toc_parts_for_xpath(
            self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for match in self._find_matches(chapter_path, item.data,
                                            'chapter'):
                self.detected_chapters.append((item, match))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        seen_per_item = Counter()
        for item, elem in self.detected_chapters:
            seen_per_item[item] += 1
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                if seen_per_item[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if they
                    # are at the start of the file, in which case inserting
                    # a page break is unnecessary and can lead to extra
                    # blank pages in the PDF Output plugin. We need to use
                    # two as feedbooks epubs match both a heading tag and
                    # its containing div with the default chapter
                    # expression.
                    continue
                mark = elem.makeelement(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = elem.makeelement(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

    def create_level_based_toc(self):
        """Build the TOC from the level XPath options, if level 1 is set."""
        if self.opts.level1_toc is not None:
            self.add_leveled_toc_items()

    def create_toc_from_chapters(self):
        """Add one flat TOC entry for every detected chapter."""
        first = self.oeb.toc.next_play_order()
        for counter, (item, elem) in enumerate(self.detected_chapters, first):
            text, href = self.elem_to_link(
                item, elem, self.chapter_title_attribute, counter)
            self.oeb.toc.add(text, href, play_order=counter)

    def create_toc_from_links(self):
        """Add TOC entries for ``<a href>`` links found in the spine.

        Only links with no URL scheme or the ``file`` scheme are used.
        Targets already in the TOC are skipped, as are duplicate link texts
        unless ``opts.duplicate_links_in_toc`` is set. Stops once
        ``opts.max_toc_links`` entries were added (when that is > 0).
        """
        num = 0
        toc = self.oeb.toc
        for item in self.oeb.spine:
            for a in XPath('//h:a[@href]')(item.data):
                href = a.get('href')
                try:
                    purl = urllib.parse.urlparse(href)
                except ValueError:
                    self.log.warn('Ignoring malformed URL:', href)
                    continue
                if purl.scheme and purl.scheme != 'file':
                    continue
                href = item.abshref(purl.path)
                if purl.fragment:
                    href = '#'.join((href, purl.fragment))
                if toc.has_href(href):
                    continue
                text = xml2text(a)[:100].strip()
                if (not self.opts.duplicate_links_in_toc
                        and toc.has_text(text)):
                    continue
                try:
                    toc.add(text, href, play_order=toc.next_play_order())
                except ValueError:
                    # Most likely an incorrectly URL encoded link
                    self.log.exception('Failed to process link: %r' % href)
                    continue
                num += 1
                if 0 < self.opts.max_toc_links <= num:
                    self.log('Maximum TOC links reached, stopping.')
                    return

    def elem_to_link(self, item, elem, title_attribute, counter):
        """Return ``(title, href)`` for a TOC entry pointing at ``elem``.

        The title is the first non-empty value among: the
        ``title_attribute`` attribute (if given), the element's text, its
        ``title`` attribute and its ``alt`` attribute. Whitespace runs are
        collapsed and the result is truncated to 1000 characters.

        Side effect: ``elem`` is given the id ``calibre_toc_<counter>``
        when it has no id or an empty one, so the href always has an
        anchor.
        """
        text = ''
        if title_attribute is not None:
            text = elem.get(title_attribute, '')
        if not text:
            text = xml2text(elem).strip()
        if not text:
            text = elem.get('title', '')
        if not text:
            text = elem.get('alt', '')
        text = re.sub(r'\s+', ' ', text.strip())
        text = text[:1000].strip()
        # `or` rather than a get() default so that id="" is replaced too.
        anchor = elem.get('id') or 'calibre_toc_%d' % counter
        elem.set('id', anchor)
        href = '#'.join((item.href, anchor))
        return text, href

    def _add_child_entries(self, document, expr, title_attribute, parents,
                           fallback_parent, counter, record=None):
        """Attach matches of ``expr`` in ``document`` beneath parent nodes.

        For each match, the descendants of ``document.data`` are scanned and
        the parent is the TOC node of the last ``parents`` element seen
        before the match. ``fallback_parent`` is used when there is none,
        and the match is skipped when that is None too. New nodes are
        stored in ``record`` (keyed by element) when it is given.

        :return: the updated id counter.
        """
        for elem in self._find_matches(expr, document.data, 'ToC'):
            parent = None
            for item in document.data.iterdescendants():
                if item in parents:
                    parent = parents[item]
                elif item == elem:
                    if parent is None:
                        if fallback_parent is None:
                            break
                        parent = fallback_parent
                    text, _href = self.elem_to_link(
                        document, elem, title_attribute, counter)
                    counter += 1
                    if text:
                        node = parent.add(
                            text, _href,
                            play_order=self.oeb.toc.next_play_order())
                        if record is not None:
                            record[elem] = node
                    break
        return counter

    def add_leveled_toc_items(self):
        """Build an up to three level TOC from the level XPath options.

        Matches of ``opts.level1_toc`` become top-level entries. Matches of
        ``opts.level2_toc`` / ``opts.level3_toc`` are nested under the
        closest preceding entry of the level above in the same document,
        or under the last such entry from earlier documents.
        """
        added = OrderedDict()
        added2 = OrderedDict()
        counter = 1

        for document in self.oeb.spine:
            # Last entries created by earlier documents; they adopt
            # lower-level matches that appear before any parent here.
            previous_level1 = list(itervalues(added))[-1] if added else None
            previous_level2 = list(itervalues(added2))[-1] if added2 else None

            level1_toc, level1_title = self.get_toc_parts_for_xpath(
                self.opts.level1_toc)
            for elem in self._find_matches(level1_toc, document.data, 'ToC'):
                text, _href = self.elem_to_link(
                    document, elem, level1_title, counter)
                counter += 1
                if text:
                    added[elem] = self.oeb.toc.add(
                        text, _href,
                        play_order=self.oeb.toc.next_play_order())

            if self.opts.level2_toc is not None and added:
                level2_toc, level2_title = self.get_toc_parts_for_xpath(
                    self.opts.level2_toc)
                counter = self._add_child_entries(
                    document, level2_toc, level2_title, added,
                    previous_level1, counter, record=added2)

                if self.opts.level3_toc is not None and added2:
                    level3_toc, level3_title = self.get_toc_parts_for_xpath(
                        self.opts.level3_toc)
                    counter = self._add_child_entries(
                        document, level3_toc, level3_title, added2,
                        previous_level2, counter)