mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-31 02:25:45 +01:00
This is progressing refactor of the calibre code to make it more readable, and transform it to something more coherent. In this patch, there are changes regarding imports for some modules, instead of polluting namespace of each module with some other modules symbols, which often were imported from other modules. Yuck.
518 lines
19 KiB
Python
518 lines
19 KiB
Python
import copy
|
|
import os
|
|
import re
|
|
import urllib.parse
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter.ebooks.oeb import base
|
|
from ebook_converter.ebooks.oeb import parse_utils
|
|
from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup
|
|
from ebook_converter.ebooks.oeb.polish.toc import node_from_loc
|
|
from ebook_converter.ebooks.oeb.polish.replace import LinkRebaser
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
|
|
class AbortError(ValueError):
|
|
pass
|
|
|
|
|
|
def in_table(node):
|
|
while node is not None:
|
|
if node.tag.endswith('}table'):
|
|
return True
|
|
node = node.getparent()
|
|
return False
|
|
|
|
|
|
def adjust_split_point(split_point, log):
|
|
'''
|
|
Move the split point up its ancestor chain if it has no content
|
|
before it. This handles the common case:
|
|
<div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
|
|
h2.
|
|
'''
|
|
sp = split_point
|
|
while True:
|
|
parent = sp.getparent()
|
|
if (
|
|
parent is None or
|
|
parse_utils.barename(parent.tag) in {'body', 'html'} or
|
|
(parent.text and parent.text.strip()) or
|
|
parent.index(sp) > 0
|
|
):
|
|
break
|
|
sp = parent
|
|
|
|
if sp is not split_point:
|
|
log.debug('Adjusted split point to ancestor')
|
|
|
|
return sp
|
|
|
|
|
|
def get_body(root):
|
|
return root.find('h:body', namespaces=const.XPNSMAP)
|
|
|
|
|
|
def do_split(split_point, log, before=True):
|
|
'''
|
|
Split tree into a *before* and an *after* tree at ``split_point``.
|
|
|
|
:param split_point: The Element at which to split
|
|
:param before: If True tree is split before split_point, otherwise after split_point
|
|
:return: before_tree, after_tree
|
|
'''
|
|
if before:
|
|
# We cannot adjust for after since moving an after split point to a
|
|
# parent will cause breakage if the parent contains any content
|
|
# after the original split point
|
|
split_point = adjust_split_point(split_point, log)
|
|
tree = split_point.getroottree()
|
|
path = tree.getpath(split_point)
|
|
|
|
tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
|
|
root, root2 = tree.getroot(), tree2.getroot()
|
|
body, body2 = map(get_body, (root, root2))
|
|
split_point = root.xpath(path)[0]
|
|
split_point2 = root2.xpath(path)[0]
|
|
|
|
def nix_element(elem, top=True):
|
|
# Remove elem unless top is False in which case replace elem by its
|
|
# children
|
|
parent = elem.getparent()
|
|
if top:
|
|
parent.remove(elem)
|
|
else:
|
|
index = parent.index(elem)
|
|
parent[index:index+1] = list(elem.iterchildren())
|
|
|
|
# Tree 1
|
|
hit_split_point = False
|
|
keep_descendants = False
|
|
split_point_descendants = frozenset(split_point.iterdescendants())
|
|
for elem in tuple(body.iterdescendants()):
|
|
if elem is split_point:
|
|
hit_split_point = True
|
|
if before:
|
|
nix_element(elem)
|
|
else:
|
|
# We want to keep the descendants of the split point in
|
|
# Tree 1
|
|
keep_descendants = True
|
|
# We want the split point element, but not its tail
|
|
elem.tail = '\n'
|
|
|
|
continue
|
|
if hit_split_point:
|
|
if keep_descendants:
|
|
if elem in split_point_descendants:
|
|
# elem is a descendant keep it
|
|
continue
|
|
else:
|
|
# We are out of split_point, so prevent further set
|
|
# lookups of split_point_descendants
|
|
keep_descendants = False
|
|
nix_element(elem)
|
|
|
|
# Tree 2
|
|
ancestors = frozenset(base.XPath('ancestor::*')(split_point2))
|
|
for elem in tuple(body2.iterdescendants()):
|
|
if elem is split_point2:
|
|
if not before:
|
|
# Keep the split point element's tail, if it contains non-whitespace
|
|
# text
|
|
tail = elem.tail
|
|
if tail and not tail.isspace():
|
|
parent = elem.getparent()
|
|
idx = parent.index(elem)
|
|
if idx == 0:
|
|
parent.text = (parent.text or '') + tail
|
|
else:
|
|
sib = parent[idx-1]
|
|
sib.tail = (sib.tail or '') + tail
|
|
# Remove the element itself
|
|
nix_element(elem)
|
|
break
|
|
if elem in ancestors:
|
|
# We have to preserve the ancestors as they could have CSS
|
|
# styles that are inherited/applicable, like font or
|
|
# width. So we only remove the text, if any.
|
|
elem.text = '\n'
|
|
else:
|
|
nix_element(elem, top=False)
|
|
|
|
body2.text = '\n'
|
|
|
|
return tree, tree2
|
|
|
|
|
|
class SplitLinkReplacer(object):
|
|
|
|
def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
|
|
self.bottom_anchors, self.bottom_name = bottom_anchors, bottom_name
|
|
self.container, self.top_name = container, top_name
|
|
self.base = base
|
|
self.replaced = False
|
|
|
|
def __call__(self, url):
|
|
if url and url.startswith('#'):
|
|
return url
|
|
name = self.container.href_to_name(url, self.base)
|
|
if name != self.top_name:
|
|
return url
|
|
purl = urllib.parse.urlparse(url)
|
|
if purl.fragment and purl.fragment in self.bottom_anchors:
|
|
url = self.container.name_to_href(self.bottom_name, self.base) + '#' + purl.fragment
|
|
self.replaced = True
|
|
return url
|
|
|
|
|
|
def split(container, name, loc_or_xpath, before=True, totals=None):
|
|
'''
|
|
Split the file specified by name at the position specified by loc_or_xpath.
|
|
Splitting automatically migrates all links and references to the affected
|
|
files.
|
|
|
|
:param loc_or_xpath: Should be an XPath expression such as
|
|
//h:div[@id="split_here"]. Can also be a *loc* which is used internally to
|
|
implement splitting in the preview panel.
|
|
:param before: If True the split occurs before the identified element otherwise after it.
|
|
:param totals: Used internally
|
|
'''
|
|
|
|
root = container.parsed(name)
|
|
if isinstance(loc_or_xpath, str):
|
|
split_point = root.xpath(loc_or_xpath)[0]
|
|
else:
|
|
try:
|
|
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
|
|
except MalformedMarkup:
|
|
# The webkit HTML parser and the container parser have yielded
|
|
# different node counts, this can happen if the file is valid XML
|
|
# but contains constructs like nested <p> tags. So force parse it
|
|
# with the HTML 5 parser and try again.
|
|
raw = container.raw_data(name)
|
|
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
|
|
try:
|
|
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
|
|
except MalformedMarkup:
|
|
raise MalformedMarkup('The file %s has malformed markup. Try '
|
|
'running the Fix HTML tool before '
|
|
'splitting' % name)
|
|
container.replace(name, root)
|
|
if in_table(split_point):
|
|
raise AbortError('Cannot split inside tables')
|
|
if split_point.tag.endswith('}body'):
|
|
raise AbortError('Cannot split on the <body> tag')
|
|
tree1, tree2 = do_split(split_point, container.log, before=before)
|
|
root1, root2 = tree1.getroot(), tree2.getroot()
|
|
anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
|
|
anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))
|
|
base, ext = name.rpartition('.')[0::2]
|
|
base = re.sub(r'_split\d+$', '', base)
|
|
nname, s = None, 0
|
|
while not nname or container.exists(nname):
|
|
s += 1
|
|
nname = '%s_split%d.%s' % (base, s, ext)
|
|
manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
|
|
bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)
|
|
|
|
# Fix links in the split trees
|
|
for r in (root1, root2):
|
|
for a in r.xpath('//*[@href]'):
|
|
url = a.get('href')
|
|
if url.startswith('#'):
|
|
fname = name
|
|
else:
|
|
fname = container.href_to_name(url, name)
|
|
if fname == name:
|
|
purl = urllib.parse.urlparse(url)
|
|
if purl.fragment in anchors_in_top:
|
|
if r is root2:
|
|
a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
|
|
else:
|
|
a.set('href', '#' + purl.fragment)
|
|
elif purl.fragment in anchors_in_bottom:
|
|
if r is root1:
|
|
a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
|
|
else:
|
|
a.set('href', '#' + purl.fragment)
|
|
|
|
# Fix all links in the container that point to anchors in the bottom tree
|
|
for fname, media_type in container.mime_map.items():
|
|
if fname not in {name, bottom_name}:
|
|
repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
|
|
container.replace_links(fname, repl)
|
|
|
|
container.replace(name, root1)
|
|
container.replace(bottom_name, root2)
|
|
|
|
spine = container.opf_xpath('//opf:spine')[0]
|
|
for spine_item, spine_name, linear in container.spine_iter:
|
|
if spine_name == name:
|
|
break
|
|
index = spine.index(spine_item) + 1
|
|
|
|
si = spine.makeelement(base.tag('opf', 'itemref'), idref=manifest_item.get('id'))
|
|
if not linear:
|
|
si.set('linear', 'no')
|
|
container.insert_into_xml(spine, si, index=index)
|
|
container.dirty(container.opf_name)
|
|
return bottom_name
|
|
|
|
|
|
def multisplit(container, name, xpath, before=True):
|
|
'''
|
|
Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`.
|
|
Splitting automatically migrates all links and references to the affected
|
|
files.
|
|
|
|
:param before: If True the splits occur before the identified element otherwise after it.
|
|
'''
|
|
root = container.parsed(name)
|
|
nodes = root.xpath(xpath, namespaces=const.XPNSMAP)
|
|
if not nodes:
|
|
raise AbortError('The expression %s did not match any nodes' % xpath)
|
|
for split_point in nodes:
|
|
if in_table(split_point):
|
|
raise AbortError('Cannot split inside tables')
|
|
if split_point.tag.endswith('}body'):
|
|
raise AbortError('Cannot split on the <body> tag')
|
|
|
|
for i, tag in enumerate(nodes):
|
|
tag.set('calibre-split-point', str(i))
|
|
|
|
current = name
|
|
all_names = [name]
|
|
for i in range(len(nodes)):
|
|
current = split(container, current, '//*[@calibre-split-point="%d"]' % i, before=before)
|
|
all_names.append(current)
|
|
|
|
for x in all_names:
|
|
for tag in container.parsed(x).xpath('//*[@calibre-split-point]'):
|
|
tag.attrib.pop('calibre-split-point')
|
|
container.dirty(x)
|
|
|
|
return all_names[1:]
|
|
|
|
|
|
class MergeLinkReplacer(object):
|
|
|
|
def __init__(self, base, anchor_map, master, container):
|
|
self.container, self.anchor_map = container, anchor_map
|
|
self.master = master
|
|
self.base = base
|
|
self.replaced = False
|
|
|
|
def __call__(self, url):
|
|
if url and url.startswith('#'):
|
|
return url
|
|
name = self.container.href_to_name(url, self.base)
|
|
amap = self.anchor_map.get(name, None)
|
|
if amap is None:
|
|
return url
|
|
purl = urllib.parse.urlparse(url)
|
|
frag = purl.fragment or ''
|
|
frag = amap.get(frag, frag)
|
|
url = self.container.name_to_href(self.master, self.base) + '#' + frag
|
|
self.replaced = True
|
|
return url
|
|
|
|
|
|
def add_text(body, text):
|
|
if len(body) > 0:
|
|
body[-1].tail = (body[-1].tail or '') + text
|
|
else:
|
|
body.text = (body.text or '') + text
|
|
|
|
|
|
def all_anchors(root):
|
|
return set(root.xpath('//*/@id')) | set(root.xpath('//*/@name'))
|
|
|
|
|
|
def all_stylesheets(container, name):
|
|
for link in base.XPath('//h:head/h:link[@href]')(container.parsed(name)):
|
|
name = container.href_to_name(link.get('href'), name)
|
|
typ = link.get('type', 'text/css')
|
|
if typ == 'text/css':
|
|
yield name
|
|
|
|
|
|
def unique_anchor(seen_anchors, current):
|
|
c = 0
|
|
ans = current
|
|
while ans in seen_anchors:
|
|
c += 1
|
|
ans = '%s_%d' % (current, c)
|
|
return ans
|
|
|
|
|
|
def remove_name_attributes(root):
|
|
# Remove all name attributes, replacing them with id attributes
|
|
for elem in root.xpath('//*[@id and @name]'):
|
|
del elem.attrib['name']
|
|
for elem in root.xpath('//*[@name]'):
|
|
elem.set('id', elem.attrib.pop('name'))
|
|
|
|
|
|
def merge_html(container, names, master, insert_page_breaks=False):
|
|
p = container.parsed
|
|
root = p(master)
|
|
|
|
# Ensure master has a <head>
|
|
head = root.find('h:head', namespaces=const.XPNSMAP)
|
|
if head is None:
|
|
head = root.makeelement(base.tag('xhtml', 'head'))
|
|
container.insert_into_xml(root, head, 0)
|
|
|
|
seen_anchors = all_anchors(root)
|
|
seen_stylesheets = set(all_stylesheets(container, master))
|
|
master_body = p(master).findall('h:body', namespaces=const.XPNSMAP)[-1]
|
|
master_base = os.path.dirname(master)
|
|
anchor_map = {n:{} for n in names if n != master}
|
|
first_anchor_map = {}
|
|
|
|
for name in names:
|
|
if name == master:
|
|
continue
|
|
# Insert new stylesheets into master
|
|
for sheet in all_stylesheets(container, name):
|
|
if sheet not in seen_stylesheets:
|
|
seen_stylesheets.add(sheet)
|
|
link = head.makeelement(base.tag('xhtml', 'link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
|
|
container.insert_into_xml(head, link)
|
|
|
|
# Rebase links if master is in a different directory
|
|
if os.path.dirname(name) != master_base:
|
|
container.replace_links(name, LinkRebaser(container, name, master))
|
|
|
|
root = p(name)
|
|
children = []
|
|
for body in p(name).findall('h:body', namespaces=const.XPNSMAP):
|
|
children.append(body.text if body.text and body.text.strip() else '\n\n')
|
|
children.extend(body)
|
|
|
|
first_child = ''
|
|
for first_child in children:
|
|
if not isinstance(first_child, (str, bytes)):
|
|
break
|
|
if isinstance(first_child, (str, bytes)):
|
|
# body contained only text, no tags
|
|
first_child = body.makeelement(base.tag('xhtml', 'p'))
|
|
first_child.text, children[0] = children[0], first_child
|
|
|
|
amap = anchor_map[name]
|
|
remove_name_attributes(root)
|
|
|
|
for elem in root.xpath('//*[@id]'):
|
|
val = elem.get('id')
|
|
if not val:
|
|
continue
|
|
if val in seen_anchors:
|
|
nval = unique_anchor(seen_anchors, val)
|
|
elem.set('id', nval)
|
|
amap[val] = nval
|
|
else:
|
|
seen_anchors.add(val)
|
|
|
|
if 'id' not in first_child.attrib:
|
|
first_child.set('id', unique_anchor(seen_anchors, 'top'))
|
|
seen_anchors.add(first_child.get('id'))
|
|
first_anchor_map[name] = first_child.get('id')
|
|
|
|
if insert_page_breaks:
|
|
first_child.set('style', first_child.get('style', '') + '; page-break-before: always')
|
|
|
|
amap[''] = first_child.get('id')
|
|
|
|
# Fix links that point to local changed anchors
|
|
for a in base.XPath('//h:a[starts-with(@href, "#")]')(root):
|
|
q = a.get('href')[1:]
|
|
if q in amap:
|
|
a.set('href', '#' + amap[q])
|
|
|
|
for child in children:
|
|
if isinstance(child, (str, bytes)):
|
|
add_text(master_body, child)
|
|
else:
|
|
master_body.append(copy.deepcopy(child))
|
|
|
|
container.remove_item(name, remove_from_guide=False)
|
|
|
|
# Fix all links in the container that point to merged files
|
|
for fname, media_type in container.mime_map.items():
|
|
repl = MergeLinkReplacer(fname, anchor_map, master, container)
|
|
container.replace_links(fname, repl)
|
|
|
|
return first_anchor_map
|
|
|
|
|
|
def merge_css(container, names, master):
|
|
p = container.parsed
|
|
msheet = p(master)
|
|
master_base = os.path.dirname(master)
|
|
merged = set()
|
|
|
|
for name in names:
|
|
if name == master:
|
|
continue
|
|
# Rebase links if master is in a different directory
|
|
if os.path.dirname(name) != master_base:
|
|
container.replace_links(name, LinkRebaser(container, name, master))
|
|
|
|
sheet = p(name)
|
|
|
|
# Remove charset rules
|
|
cr = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
|
|
[sheet.deleteRule(sheet.cssRules.index(r)) for r in cr]
|
|
for rule in sheet.cssRules:
|
|
msheet.add(rule)
|
|
|
|
container.remove_item(name)
|
|
merged.add(name)
|
|
|
|
# Remove links to merged stylesheets in the html files, replacing with a
|
|
# link to the master sheet
|
|
for name, mt in container.mime_map.items():
|
|
if mt in base.OEB_DOCS:
|
|
removed = False
|
|
root = p(name)
|
|
for link in base.XPath('//h:link[@href]')(root):
|
|
q = container.href_to_name(link.get('href'), name)
|
|
if q in merged:
|
|
container.remove_from_xml(link)
|
|
removed = True
|
|
if removed:
|
|
container.dirty(name)
|
|
if removed and master not in set(all_stylesheets(container, name)):
|
|
head = root.find('h:head', namespaces=const.XPNSMAP)
|
|
if head is not None:
|
|
link = head.makeelement(base.tag('xhtml', 'link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
|
|
container.insert_into_xml(head, link)
|
|
|
|
|
|
def merge(container, category, names, master):
|
|
'''
|
|
Merge the specified files into a single file, automatically migrating all
|
|
links and references to the affected files. The file must all either be HTML or CSS files.
|
|
|
|
:param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
|
|
:param names: The list of files to be merged
|
|
:param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
|
|
'''
|
|
if category not in {'text', 'styles'}:
|
|
raise AbortError('Cannot merge files of type: %s' % category)
|
|
if len(names) < 2:
|
|
raise AbortError('Must specify at least two files to be merged')
|
|
if master not in names:
|
|
raise AbortError('The master file (%s) must be one of the files being merged' % master)
|
|
|
|
if category == 'text':
|
|
merge_html(container, names, master)
|
|
elif category == 'styles':
|
|
merge_css(container, names, master)
|
|
|
|
container.dirty(master)
|