"""Bookmark, hyperlink and table-of-contents helpers for the DOCX writer.

The classes here emit WordprocessingML elements (``w:p``, ``w:hyperlink``,
``w:fldChar`` ...) through a ``makeelement`` factory supplied by the caller.
"""
import posixpath
import re
import urllib.parse
import uuid

from ebook_converter.utils.filenames import ascii_text


__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal '


def start_text(tag, prefix_len=0, top_level=True):
    """Return the leading text found in ``tag`` and its descendants.

    Text is gathered in document order (``tag.text``, then for every child
    element its own leading text followed by its tail) until roughly
    ``50 - prefix_len`` characters have been collected.

    :param tag: element exposing ``text`` and ``iterchildren('*')``
        (presumably an lxml element -- confirm against callers).
    :param prefix_len: number of characters already collected by the caller;
        reduces the budget for this call.
    :param top_level: ``True`` only for the outermost call. Only that call
        truncates the result to the limit and appends ``'...'``.
    :return: the collected text (may be the empty string).
    """
    ans = tag.text or ''
    limit = 50 - prefix_len
    if len(ans) < limit:
        for child in tag.iterchildren('*'):
            ans += (start_text(child, len(ans), top_level=False) +
                    (child.tail or ''))
            if len(ans) >= limit:
                break
    # Nested calls may overshoot the budget; only the outermost call trims.
    if top_level and len(ans) > limit:
        ans = ans[:limit] + '...'
    return ans


class TOCItem:
    """A single entry of the generated table of contents.

    ``title`` is the visible text, ``bmark`` the name of the bookmark the
    entry links to and ``level`` the nesting depth (0 for top-level entries).
    ``is_first`` / ``is_last`` are set by the owner of the list so that the
    first entry opens the TOC field and the last entry closes it.
    """

    def __init__(self, title, bmark, level):
        self.title, self.bmark, self.level = title, bmark, level
        self.is_first = self.is_last = False

    def serialize(self, body, makeelement):
        """Build this entry's paragraph and insert it at the top of ``body``.

        :param body: container element; the new ``w:p`` is inserted at
            index 0.
        :param makeelement: element factory, called as
            ``makeelement(parent, tag, **attributes)``.
        """
        # append=False: the paragraph is positioned explicitly by the
        # body.insert(0, ...) call at the end of this method.
        p = makeelement(body, 'w:p', append=False)
        ppr = makeelement(p, 'w:pPr')
        makeelement(ppr, 'w:pStyle', w_val="Normal")
        # Indent grows by 200 (w:leftChars units) per nesting level.
        makeelement(ppr, 'w:ind', w_left='0', w_firstLineChars='0',
                    w_firstLine='0', w_leftChars=str(200 * self.level))
        if self.is_first:
            # The first entry also carries the opening half of the TOC
            # field: begin marker, instruction text and separator.
            makeelement(ppr, 'w:pageBreakBefore', w_val='off')
            r = makeelement(p, 'w:r')
            makeelement(r, 'w:fldChar', w_fldCharType='begin')
            r = makeelement(p, 'w:r')
            makeelement(r, 'w:instrText').text = r' TOC \h '
            # xml:space="preserve" keeps the instruction's surrounding
            # blanks.
            r[0].set('{http://www.w3.org/XML/1998/namespace}space',
                     'preserve')
            r = makeelement(p, 'w:r')
            makeelement(r, 'w:fldChar', w_fldCharType='separate')
        hl = makeelement(p, 'w:hyperlink', w_anchor=self.bmark)
        r = makeelement(hl, 'w:r')
        rpr = makeelement(r, 'w:rPr')
        makeelement(rpr, 'w:color', w_val='0000FF', w_themeColor='hyperlink')
        makeelement(rpr, 'w:u', w_val='single')
        makeelement(r, 'w:t').text = self.title
        if self.is_last:
            # The last entry closes the field opened by the first one.
            r = makeelement(p, 'w:r')
            makeelement(r, 'w:fldChar', w_fldCharType='end')
        body.insert(0, p)


def sanitize_bookmark_name(base):
    """Turn ``base`` into a bookmark name of at most 32 ASCII characters.

    The text is passed through ``ascii_text``, every character other than
    ``0-9a-zA-Z`` is replaced by ``_`` and trailing underscores are removed.
    The result can be empty when nothing but underscores remains.
    """
    # Max length allowed by Word appears to be 40, we use 32 to leave some
    # space for making the name unique
    return re.sub(r'[^0-9a-zA-Z]', '_', ascii_text(base))[:32].rstrip('_')


class LinksManager:
    """Track bookmarks, hyperlinks and TOC entries while writing a document.

    * ``anchor_map`` maps ``(href, anchor)`` to the bookmark name chosen for
      it.
    * ``document_hrefs`` holds the hrefs whose "top of file" bookmark has
      been created.
    * ``external_links`` maps an external URL to the relationship value
      returned by ``document_relationships.add_relationship``.
    * ``toc`` is the flat list of :class:`TOCItem` built by
      :meth:`process_toc_links`.
    """

    def __init__(self, namespace, document_relationships, log):
        self.namespace = namespace
        self.log = log
        self.document_relationships = document_relationships
        # Random sentinel used as the anchor meaning "top of this file".
        self.top_anchor = uuid.uuid4().hex
        self.anchor_map = {}
        self.used_bookmark_names = set()
        self.bmark_id = 0
        self.document_hrefs = set()
        self.external_links = {}
        self.toc = []

    def bookmark_for_anchor(self, anchor, current_item, html_tag):
        """Return the bookmark name for ``anchor`` inside ``current_item``.

        The name is created on first request and remembered in
        ``anchor_map``, so repeated calls with the same item/anchor return
        the same name.

        :param anchor: anchor id, or ``self.top_anchor`` for the top of the
            file.
        :param current_item: object with an ``href`` attribute.
        :param html_tag: element whose leading text seeds the bookmark name
            (not used for the top anchor).
        """
        key = (current_item.href, anchor)
        if key in self.anchor_map:
            return self.anchor_map[key]
        if anchor == self.top_anchor:
            name = ('Top of %s' % posixpath.basename(current_item.href))
            # Registering the top bookmark is what makes this href a valid
            # internal link target.
            self.document_hrefs.add(current_item.href)
        else:
            name = start_text(html_tag).strip() or anchor
        name = sanitize_bookmark_name(name)
        # Append _1, _2, ... until the name is unique in the document.
        i, bname = 0, name
        while name in self.used_bookmark_names:
            i += 1
            name = bname + ('_%d' % i)
        self.anchor_map[key] = name
        self.used_bookmark_names.add(name)
        return name

    @property
    def bookmark_id(self):
        """Return a fresh id; every access increments the counter."""
        self.bmark_id += 1
        return self.bmark_id

    def _bookmark_for_href(self, href, fragment):
        """Return the bookmark for ``href``/``fragment``.

        Falls back to the top-of-file bookmark of ``href`` when the fragment
        is empty or has no bookmark of its own.

        :raises KeyError: if ``href`` has no top-of-file bookmark either.
        """
        key = (href, fragment or self.top_anchor)
        if key in self.anchor_map:
            return self.anchor_map[key]
        return self.anchor_map[(href, self.top_anchor)]

    def serialize_hyperlink(self, parent, link):
        """Create the ``w:hyperlink`` element for ``link`` under ``parent``.

        :param parent: element the hyperlink is created in.
        :param link: ``(item, url, tooltip)`` tuple; ``item.abshref`` is used
            to resolve scheme-less URLs.
        :return: the new hyperlink element, or ``parent`` itself when the
            link is dropped (unknown internal target or a scheme other than
            http/https/ftp).
        """
        item, url, tooltip = link
        purl = urllib.parse.urlparse(url)
        href = purl.path

        def make_link(anchor=None, rel_id=None):
            # An anchor (internal link) takes precedence over a
            # relationship id (external link).
            kw = {}
            if anchor is not None:
                kw['w_anchor'] = anchor
            elif rel_id is not None:
                kw['r_id'] = rel_id
            if tooltip:
                kw['w_tooltip'] = tooltip
            return self.namespace.makeelement(parent, 'w:hyperlink', **kw)

        if not purl.scheme:
            href = item.abshref(href)
            if href in self.document_hrefs:
                return make_link(
                    anchor=self._bookmark_for_href(href, purl.fragment))
            self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
        if purl.scheme in {'http', 'https', 'ftp'}:
            if url not in self.external_links:
                self.external_links[url] = (
                    self.document_relationships.add_relationship(
                        url, self.namespace.names['LINKS'],
                        target_mode='External'))
            return make_link(rel_id=self.external_links[url])
        return parent

    def process_toc_node(self, toc, level=0):
        """Append a :class:`TOCItem` for ``toc`` and, recursively, its children.

        Nodes without an href, or whose href is not a known document, add no
        entry themselves; their children are still visited.
        """
        href = toc.href
        if href:
            purl = urllib.parse.urlparse(href)
            href = purl.path
            if href in self.document_hrefs:
                bmark = self._bookmark_for_href(href, purl.fragment)
                self.toc.append(TOCItem(toc.title, bmark, level))
        for child in toc:
            self.process_toc_node(child, level + 1)

    def process_toc_links(self, oeb):
        """Rebuild ``self.toc`` from ``oeb.toc``.

        Nothing is generated unless ``oeb.toc`` is truthy and
        ``oeb.toc.count()`` is greater than one.
        """
        self.toc = []
        if not (oeb.toc and oeb.toc.count() > 1):
            return
        for child in oeb.toc:
            self.process_toc_node(child)
        if self.toc:
            # Mark the ends so the field is opened and closed exactly once.
            self.toc[0].is_first = True
            self.toc[-1].is_last = True

    def serialize_toc(self, body, primary_heading_style):
        """Insert the table of contents and its heading at the top of ``body``.

        :param body: container element receiving the paragraphs.
        :param primary_heading_style: style object whose ``id`` is applied to
            the heading paragraph, or ``None`` for no explicit style.
        :raises IndexError: if no ``pageBreakBefore`` element is found.
        """
        # NOTE: the leading '//' makes this search start at the document
        # root, so this is the first pageBreakBefore in the tree. It is
        # switched on so the existing content starts on a new page after
        # the TOC.
        pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
        pbb.set('{%s}val' % self.namespace.namespaces['w'], 'on')
        # Each item inserts itself at index 0, hence the reversed iteration
        # to end up in reading order.
        for block in reversed(self.toc):
            block.serialize(body, self.namespace.makeelement)
        title = 'Table of Contents'
        makeelement = self.namespace.makeelement
        p = makeelement(body, 'w:p', append=False)
        ppr = makeelement(p, 'w:pPr')
        if primary_heading_style is not None:
            makeelement(ppr, 'w:pStyle', w_val=primary_heading_style.id)
        makeelement(ppr, 'w:pageBreakBefore', w_val='off')
        makeelement(makeelement(p, 'w:r'), 'w:t').text = title
        body.insert(0, p)