Mirror of https://github.com/gryf/ebook-converter.git, synced 2026-02-22 18:15:49 +01:00.
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports in some modules: namespaces are no longer polluted with symbols from other modules, which themselves were often imported from yet other modules. Yuck.
487 lines | 18 KiB | Python
"""
|
|
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
|
|
forced at "likely" locations to conform to size limitations. This transform
|
|
assumes a prior call to the flatcss transform.
|
|
"""
|
|
import os, functools, collections, re, copy
|
|
from collections import OrderedDict
|
|
import urllib.parse
|
|
|
|
from lxml.etree import XPath as _XPath
|
|
from lxml import etree
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter import as_unicode, force_unicode
|
|
from ebook_converter.ebooks.epub import rules
|
|
from ebook_converter.ebooks.oeb.base import \
|
|
OEB_STYLES, rewrite_links, urlnormalize
|
|
from ebook_converter.ebooks.oeb.polish.split import do_split
|
|
from ebook_converter.polyglot.urllib import unquote
|
|
from ebook_converter.css_selectors import Select, SelectorError
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
|
__docformat__ = 'restructuredtext en'
|
|
XPath = functools.partial(_XPath, namespaces=const.XPNSMAP)
|
|
|
|
SPLIT_POINT_ATTR = 'csp'
|
|
|
|
|
|
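# A minimal usage sketch, assuming an `oeb` book object and an `opts`
# namespace from the conversion pipeline are already in hand (the 260 KB
# limit below is an illustrative value, not a default):
#
#     split = Split(split_on_page_breaks=True, max_flow_size=260 * 1024)
#     split(oeb, opts)  # splits flows in place, then fixes inter-file links
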
def tostring(root):
    return etree.tostring(root, encoding='utf-8')


class SplitError(ValueError):

    def __init__(self, path, root):
        size = len(tostring(root)) / 1024.
        ValueError.__init__(self, 'Could not find reasonable point at which '
                            'to split: %(path)s Sub-tree size: %(size)d KB' %
                            {'path': path, 'size': size})

class Split(object):

    def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
                 max_flow_size=0, remove_css_pagebreaks=True):
        self.split_on_page_breaks = split_on_page_breaks
        self.page_breaks_xpath = page_breaks_xpath
        self.max_flow_size = max_flow_size
        self.page_break_selectors = None
        self.remove_css_pagebreaks = remove_css_pagebreaks
        if self.page_breaks_xpath is not None:
            self.page_break_selectors = [(XPath(self.page_breaks_xpath),
                                          False)]
    def __call__(self, oeb, opts):
        self.oeb = oeb
        self.log = oeb.log
        self.log('Splitting markup on page breaks and flow limits, if any...')
        self.opts = opts
        self.map = {}
        for item in list(self.oeb.manifest.items):
            if item.spine_position is not None and etree.iselement(item.data):
                self.split_item(item)

        self.fix_links()
    def split_item(self, item):
        page_breaks, page_break_ids = [], []
        if self.split_on_page_breaks:
            page_breaks, page_break_ids = self.find_page_breaks(item)

        splitter = FlowSplitter(item, page_breaks, page_break_ids,
                                self.max_flow_size, self.oeb, self.opts)
        if splitter.was_split:
            am = splitter.anchor_map
            self.map[item.href] = collections.defaultdict(
                am.default_factory, am)
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
            self.page_break_selectors = set()
            stylesheets = [x.data for x in self.oeb.manifest
                           if x.media_type in OEB_STYLES]
            for rule in rules(stylesheets):
                before = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower())
                after = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-after'), 'cssText', '').strip().lower())
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add(
                            (rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except Exception:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add(
                            (rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except Exception:
                    pass

        if not self.page_break_selectors:
            return [], []
        page_breaks = set()
        select = Select(item.data)
        body = item.data.xpath('//h:body', namespaces=const.XPNSMAP)
        if not body:
            return [], []
        descendants = frozenset(body[0].iterdescendants('*'))

        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    if (elem in descendants and
                            elem.tag.rpartition('}')[2].lower() not in
                            {'html', 'body', 'head', 'style', 'script',
                             'meta', 'link'}):
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn('Ignoring page breaks specified with invalid '
                              'CSS selector: %r (%s)' %
                              (selector, as_unicode(err)))
        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Can't set attributes on comment nodes etc.
                continue
        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x: int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d' % i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]' % id)
            except Exception:
                try:
                    xp = XPath("//*[@id='%s']" % id)
                except Exception:
                    # The id contains both a quote and an apostrophe (or some
                    # other character XPath chokes on). Just replace it, since
                    # I doubt it's going to work anywhere else either.
                    id = 'calibre_pb_%d' % i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]' % id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        for elem in item.data.iter(etree.Element):
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids
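    # A sketch of what find_page_breaks records (values are illustrative):
    # a stylesheet rule such as
    #
    #     h1 { page-break-before: always }
    #
    # adds ('h1', True) to page_break_selectors; each matching element in
    # the body gets a stable id, and the return value pairs an XPath like
    # //*[@id="calibre_pb_0"] with the page-break-before flag.
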
    def fix_links(self):
        '''
        Fix references to the split files in other content files.
        '''
        for item in self.oeb.manifest:
            if etree.iselement(item.data):
                self.current_item = item
                rewrite_links(item.data, self.rewrite_links)
    def rewrite_links(self, url):
        href, frag = urllib.parse.urldefrag(url)
        try:
            href = self.current_item.abshref(href)
        except ValueError:
            # Unparseable URL
            return url
        try:
            href = urlnormalize(href)
        except ValueError:
            # href has non utf-8 quoting
            return url
        if href in self.map:
            anchor_map = self.map[href]
            nhref = anchor_map[frag if frag else None]
            nhref = self.current_item.relhref(nhref)
            if frag:
                nhref = '#'.join((unquote(nhref), frag))

            return nhref
        return url

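# A self-contained toy version of the remapping above (the file names and
# the map structure are illustrative; the real method also resolves
# relative hrefs and percent-unquotes the result):
def _rewrite_link_demo(url, split_map):
    import urllib.parse
    href, frag = urllib.parse.urldefrag(url)
    if href in split_map:
        nhref = split_map[href][frag if frag else None]
        return '#'.join((nhref, frag)) if frag else nhref
    return url

# _rewrite_link_demo('content.html#chap2',
#                    {'content.html': {None: 'content_split_000.html',
#                                      'chap2': 'content_split_001.html'}})
# -> 'content_split_001.html#chap2'
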
class FlowSplitter(object):
    'The actual splitting logic'

    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
                 opts):
        self.item = item
        self.oeb = oeb
        self.opts = opts
        self.log = oeb.log
        self.page_breaks = page_breaks
        self.page_break_ids = page_break_ids
        self.max_flow_size = max_flow_size
        self.base = item.href
        self.csp_counter = 0

        base, ext = os.path.splitext(self.base)
        self.base = base.replace('%', '%%') + '_split_%.3d' + ext

        self.trees = [self.item.data.getroottree()]
        self.splitting_on_page_breaks = True
        if self.page_breaks:
            self.split_on_page_breaks(self.trees[0])
        self.splitting_on_page_breaks = False
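        # Note on the template built above: 'index.html' becomes
        # 'index_split_%.3d.html', so part i is named via self.base % i,
        # e.g. 'index_split_000.html', 'index_split_001.html', ... Any
        # literal '%' in the original name is escaped first so that the
        # %-formatting stays safe.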
        if self.max_flow_size > 0:
            lt_found = False
            self.log('\tLooking for large trees in %s...' % item.href)
            trees = list(self.trees)
            self.tree_map = {}
            for i, tree in enumerate(trees):
                size = len(tostring(tree.getroot()))
                if size > self.max_flow_size:
                    self.log('\tFound large tree #%d' % i)
                    lt_found = True
                    self.split_trees = []
                    self.split_to_size(tree)
                    self.tree_map[tree] = self.split_trees
            if not lt_found:
                self.log('\tNo large trees found')
            self.trees = []
            for x in trees:
                self.trees.extend(self.tree_map.get(x, [x]))

        self.was_split = len(self.trees) > 1
        if self.was_split:
            self.log('\tSplit into %d parts' % len(self.trees))
        self.commit()
    def split_on_page_breaks(self, orig_tree):
        ordered_ids = OrderedDict()
        all_page_break_ids = frozenset(self.page_break_ids)
        for elem_id in orig_tree.xpath('//*/@id'):
            if elem_id in all_page_break_ids:
                ordered_ids[elem_id] = self.page_breaks[
                    self.page_break_ids.index(elem_id)]

        self.trees = [orig_tree]
        while ordered_ids:
            pb_id, (pattern, before) = next(iter(ordered_ids.items()))
            del ordered_ids[pb_id]
            for i in range(len(self.trees) - 1, -1, -1):
                tree = self.trees[i]
                elem = pattern(tree)
                if elem:
                    self.log.debug('\t\tSplitting on page-break at id=%s' %
                                   elem[0].get('id'))
                    before_tree, after_tree = self.do_split(
                        tree, elem[0], before)
                    self.trees[i:i + 1] = [before_tree, after_tree]
                    break

        trees, ids = [], set()
        for tree in self.trees:
            root = tree.getroot()
            if self.is_page_empty(root):
                discarded_ids = root.xpath('//*[@id]')
                for x in discarded_ids:
                    x = x.get('id')
                    if not x.startswith('calibre_'):
                        ids.add(x)
            else:
                if ids:
                    body = self.get_body(root)
                    if body is not None:
                        existing_ids = frozenset(body.xpath('//*/@id'))
                        for x in ids - existing_ids:
                            body.insert(0, body.makeelement(
                                const.XHTML_div, id=x, style='height:0pt'))
                ids = set()
            trees.append(tree)
        self.trees = trees
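    # Example of the id preservation above: if splitting produces an empty
    # page that carried id="part2", a zero-height <div id="part2"> is
    # planted at the top of the next non-empty part, so links targeting
    # #part2 still resolve after the empty page is discarded.
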
    def get_body(self, root):
        body = root.xpath('//h:body', namespaces=const.XPNSMAP)
        if not body:
            return None
        return body[0]
    def do_split(self, tree, split_point, before):
        '''
        Split ``tree`` into a *before* and *after* tree at ``split_point``.

        :param before: If True, the tree is split before ``split_point``,
                       otherwise after it.
        :return: before_tree, after_tree
        '''
        return do_split(split_point, self.log, before=before)
    def is_page_empty(self, root):
        body = self.get_body(root)
        if body is None:
            return False
        txt = re.sub(r'\s+|\xa0', '',
                     etree.tostring(body, method='text', encoding='unicode'))
        if len(txt) > 1:
            return False
        for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
            if img.get('style', '') != 'display:none':
                return False
        if root.xpath('//*[local-name() = "svg"]'):
            return False
        return True
    def split_text(self, text, root, size):
        self.log.debug('\t\t\tSplitting text of length: %d' % len(text))
        rest = text.replace('\r', '')
        parts = re.split('\n\n', rest)
        self.log.debug('\t\t\t\tFound %d parts' % len(parts))
        if max(map(len, parts)) > size:
            raise SplitError('Cannot split as file contains a <pre> tag '
                             'with a very large paragraph', root)
        ans = []
        buf = ''
        for part in parts:
            if len(buf) + len(part) < size:
                buf += '\n\n' + part
            else:
                ans.append(buf)
                buf = part
        ans.append(buf)  # don't drop the final buffer
        return ans
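    # Worked example of the packing above: with size=90 and three
    # 40-character paragraphs separated by blank lines, the first buffer
    # absorbs paragraphs one and two (40 + 40 + separators < 90), paragraph
    # three overflows it, and the method returns two chunks.
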
    def split_to_size(self, tree):
        self.log.debug('\t\tSplitting...')
        root = tree.getroot()
        # Split large <pre> tags if they contain only text
        for pre in XPath('//h:pre')(root):
            if len(tuple(pre.iterchildren(etree.Element))) > 0:
                continue
            if pre.text and len(pre.text) > self.max_flow_size * 0.5:
                self.log.debug('\t\tSplitting large <pre> tag')
                frags = self.split_text(pre.text, root,
                                        int(0.2 * self.max_flow_size))
                new_pres = []
                for frag in frags:
                    pre2 = copy.copy(pre)
                    pre2.text = frag
                    pre2.tail = ''
                    new_pres.append(pre2)
                new_pres[-1].tail = pre.tail
                p = pre.getparent()
                i = p.index(pre)
                p[i:i + 1] = new_pres

        split_point, before = self.find_split_point(root)
        if split_point is None:
            raise SplitError(self.item.href, root)
        self.log.debug('\t\t\tSplit point:', split_point.tag,
                       tree.getpath(split_point))

        trees = self.do_split(tree, split_point, before)
        sizes = [len(tostring(t.getroot())) for t in trees]
        if min(sizes) < 5 * 1024:
            self.log.debug('\t\t\tSplit tree too small')
            self.split_to_size(tree)
            return

        for t, size in zip(trees, sizes):
            r = t.getroot()
            if self.is_page_empty(r):
                continue
            elif size <= self.max_flow_size:
                self.split_trees.append(t)
                self.log.debug(
                    '\t\t\tCommitted sub-tree #%d (%d KB)' % (
                        len(self.split_trees), size / 1024.))
            else:
                self.log.debug(
                    '\t\t\tSplit tree still too large: %d KB' %
                    (size / 1024.))
                self.split_to_size(t)
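    # Recursion sketch for split_to_size: a 300 KB flow with a 256 KB
    # max_flow_size is cut at the median candidate tag; each resulting
    # ~150 KB half is under the limit and gets committed. If a cut leaves
    # a piece under 5 KB, the same tree is retried at the next untried
    # split point (the failed point stays marked with SPLIT_POINT_ATTR).
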
    def find_split_point(self, root):
        '''
        Find the tag at which to split the tree rooted at `root`.
        Search order is:

        * Heading tags
        * <div> tags
        * <pre> tags
        * <hr> tags
        * <p> tags
        * <br> tags
        * <li> tags

        We try to split in the "middle" of the file (as defined by tag
        counts).
        '''
        def pick_elem(elems):
            if elems:
                elems = [i for i in elems
                         if i.get(SPLIT_POINT_ATTR, '0') != '1']
                if elems:
                    i = len(elems) // 2
                    elems[i].set(SPLIT_POINT_ATTR, '1')
                    return elems[i]

        for path in (
                '//*[re:match(name(), "h[1-6]", "i")]',
                '/h:html/h:body/h:div',
                '//h:pre',
                '//h:hr',
                '//h:p',
                '//h:div',
                '//h:br',
                '//h:li',
        ):
            elems = root.xpath(path, namespaces=const.XPNSMAP)
            elem = pick_elem(elems)
            if elem is not None:
                try:
                    XPath(elem.getroottree().getpath(elem))
                except Exception:
                    continue
                return elem, True

        return None, True
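    # Worked example of pick_elem above: with five candidate <p> tags, the
    # median (index 5 // 2 == 2, the third one) is chosen and marked with
    # SPLIT_POINT_ATTR, so a retry picks the median of the four remaining
    # untried candidates instead of looping on the same element.
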
    def commit(self):
        '''
        Commit all changes caused by the split. Calculates an *anchor_map*
        for all anchors in the original tree. Internal links are
        re-directed. The original file is deleted and the split files are
        saved.
        '''
        if not self.was_split:
            return
        self.anchor_map = collections.defaultdict(lambda: self.base % 0)
        self.files = []

        for i, tree in enumerate(self.trees):
            root = tree.getroot()
            self.files.append(self.base % i)
            for elem in root.xpath('//*[@id or @name]'):
                for anchor in elem.get('id', ''), elem.get('name', ''):
                    if anchor != '' and anchor not in self.anchor_map:
                        self.anchor_map[anchor] = self.files[-1]
            for elem in root.xpath('//*[@%s]' % SPLIT_POINT_ATTR):
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

        spine_pos = self.item.spine_position

        for current, tree in zip(*map(reversed, (self.files, self.trees))):
            for a in tree.getroot().xpath('//h:a[@href]',
                                          namespaces=const.XPNSMAP):
                href = a.get('href').strip()
                if href.startswith('#'):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
                    file = self.item.relhref(file)
                    if file != current:
                        a.set('href', file + href)

            new_id = self.oeb.manifest.generate(id=self.item.id)[0]
            new_item = self.oeb.manifest.add(new_id, current,
                                             self.item.media_type,
                                             data=tree.getroot())
            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)

        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                href, frag = urllib.parse.urldefrag(ref.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
                        nhref = '#'.join((nhref, frag))
                    ref.href = nhref

        def fix_toc_entry(toc):
            if toc.href:
                href, frag = urllib.parse.urldefrag(toc.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
                        nhref = '#'.join((nhref, frag))
                    toc.href = nhref
            for x in toc:
                fix_toc_entry(x)

        if self.oeb.toc:
            fix_toc_entry(self.oeb.toc)

        if self.oeb.pages:
            for page in self.oeb.pages:
                href, frag = urllib.parse.urldefrag(page.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
                        nhref = '#'.join((nhref, frag))
                    page.href = nhref

        self.oeb.manifest.remove(self.item)
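
# A quick self-contained sketch of the anchor_map behaviour used by
# commit() above (the file names are illustrative): anchors found in a
# split part map to that part's file, and any unknown anchor falls back
# to the first part.
def _anchor_map_demo():
    import collections
    base = 'index_split_%.3d.html'
    anchor_map = collections.defaultdict(lambda: base % 0)
    anchor_map['chapter2'] = base % 1
    assert anchor_map['chapter2'] == 'index_split_001.html'
    assert anchor_map['never-seen'] == 'index_split_000.html'  # fallback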