mirror of https://github.com/gryf/ebook-converter.git

Added epub write support

2020-04-13 12:46:37 +02:00
parent 9f18513787
commit 79cad46732
9 changed files with 3049 additions and 0 deletions
@@ -0,0 +1,389 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from functools import partial
from css_parser.css import CSSRule, CSSStyleDeclaration
from css_selectors import parse, SelectorSyntaxError
from calibre import force_unicode
from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text
from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from calibre.utils.icu import numeric_sort_key
from css_selectors import Select, SelectorError
from polyglot.builtins import iteritems, itervalues, unicode_type, filter
def filter_used_rules(rules, log, select):
for rule in rules:
used = False
for selector in rule.selectorList:
try:
if select.has_matches(selector.selectorText):
used = True
break
except SelectorError:
# Cannot parse/execute this selector, be safe and assume it
# matches something
used = True
break
if not used:
yield rule
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
ans = set()
sheet = sheet or sheets[name]
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
if rule.href:
iname = container.href_to_name(rule.href, name)
if iname in sheets:
ans.add(iname)
if recursion_level > 0:
for imported_sheet in tuple(ans):
ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
ans.discard(name)
return ans
def merge_declarations(first, second):
for prop in second.getProperties():
first.setProperty(prop)
def merge_identical_selectors(sheet):
' Merge rules that have identical selectors '
selector_map = defaultdict(list)
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
selector_map[rule.selectorText].append(rule)
remove = []
for rule_group in itervalues(selector_map):
if len(rule_group) > 1:
for i in range(1, len(rule_group)):
merge_declarations(rule_group[0].style, rule_group[i].style)
remove.append(rule_group[i])
for rule in remove:
sheet.cssRules.remove(rule)
return len(remove)
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
'''
Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
:param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
:param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
:param merge_rules: If True, rules with identical selectors are merged.
'''
report = report or (lambda x:x)
def safe_parse(name):
try:
return container.parsed(name)
except TypeError:
pass
sheets = {name:safe_parse(name) for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES}
sheets = {k:v for k, v in iteritems(sheets) if v is not None}
num_merged = 0
if merge_rules:
for name, sheet in iteritems(sheets):
num = merge_identical_selectors(sheet)
if num:
container.dirty(name)
num_merged += num
import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
if remove_unused_classes:
class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in iteritems(sheets)}
style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in iteritems(sheets)}
num_of_removed_rules = num_of_removed_classes = 0
for name, mt in iteritems(container.mime_map):
if mt not in OEB_DOCS:
continue
root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True)
used_classes = set()
for style in root.xpath('//*[local-name()="style"]'):
if style.get('type', 'text/css') == 'text/css' and style.text:
sheet = container.parse_css(style.text)
if merge_rules:
num = merge_identical_selectors(sheet)
if num:
num_merged += num
container.dirty(name)
if remove_unused_classes:
used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
if remove_unused_classes:
used_classes |= class_map[imported_sheet]
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(rules, container.log, select))
if unused_rules:
num_of_removed_rules += len(unused_rules)
                    for r in unused_rules:
                        sheet.cssRules.remove(r)
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
container.dirty(name)
for link in root.xpath('//*[local-name()="link" and @href]'):
sname = container.href_to_name(link.get('href'), name)
if sname not in sheets:
continue
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
if remove_unused_classes:
used_classes |= class_map[sname]
for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
if remove_unused_classes:
used_classes |= class_map[iname]
if remove_unused_classes:
for elem in root.xpath('//*[@class]'):
original_classes, classes = elem.get('class', '').split(), []
for x in original_classes:
if icu_lower(x) in used_classes:
classes.append(x)
if len(classes) != len(original_classes):
if classes:
elem.set('class', ' '.join(classes))
else:
del elem.attrib['class']
num_of_removed_classes += len(original_classes) - len(classes)
container.dirty(name)
for name, sheet in iteritems(sheets):
unused_rules = style_rules[name]
if unused_rules:
num_of_removed_rules += len(unused_rules)
            for r in unused_rules:
                sheet.cssRules.remove(r)
container.dirty(name)
num_changes = num_of_removed_rules + num_merged + num_of_removed_classes
if num_changes > 0:
if num_of_removed_rules > 0:
report(ngettext('Removed one unused CSS style rule', 'Removed {} unused CSS style rules',
num_of_removed_rules).format(num_of_removed_rules))
if num_of_removed_classes > 0:
report(ngettext('Removed one unused class from the HTML', 'Removed {} unused classes from the HTML',
num_of_removed_classes).format(num_of_removed_classes))
if num_merged > 0:
report(ngettext('Merged one CSS style rule', 'Merged {} CSS style rules',
num_merged).format(num_merged))
if num_of_removed_rules == 0:
report(_('No unused CSS style rules found'))
if remove_unused_classes and num_of_removed_classes == 0:
report(_('No unused class attributes found'))
if merge_rules and num_merged == 0:
report(_('No style rules that could be merged found'))
return num_changes > 0
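# Hedged usage sketch (editor's addition, not part of this commit): drive the
# cleanup above on an unpacked book. Assumes the upstream calibre container
# API (get_container/commit); adjust the import for this repository's layout.
def _example_remove_unused_css(path_to_book):
    from calibre.ebooks.oeb.polish.container import get_container
    container = get_container(path_to_book, tweak_mode=True)
    if remove_unused_css(container, report=print,
                         remove_unused_classes=True, merge_rules=True):
        container.commit()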
def filter_declaration(style, properties=()):
changed = False
for prop in properties:
if style.removeProperty(prop) != '':
changed = True
all_props = set(style.keys())
for prop in style.getProperties():
n = normalizers.get(prop.name, None)
if n is not None:
normalized = n(prop.name, prop.propertyValue)
removed = properties.intersection(set(normalized))
if removed:
changed = True
style.removeProperty(prop.name)
for prop in set(normalized) - removed - all_props:
style.setProperty(prop, normalized[prop])
return changed
def filter_sheet(sheet, properties=()):
from css_parser.css import CSSRule
changed = False
remove = []
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
if filter_declaration(rule.style, properties):
changed = True
if rule.style.length == 0:
remove.append(rule)
for rule in remove:
sheet.cssRules.remove(rule)
return changed
def transform_inline_styles(container, name, transform_sheet, transform_style):
root = container.parsed(name)
changed = False
for style in root.xpath('//*[local-name()="style"]'):
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
sheet = container.parse_css(style.text)
if transform_sheet(sheet):
changed = True
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'):
text = elem.get('style', None)
if text:
style = container.parse_css(text, is_declaration=True)
if transform_style(style):
changed = True
if style.length == 0:
del elem.attrib['style']
else:
elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
return changed
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
if not names:
types = OEB_STYLES | OEB_DOCS
names = []
for name, mt in iteritems(container.mime_map):
if mt in types:
names.append(name)
doc_changed = False
for name in names:
mt = container.mime_map[name]
if mt in OEB_STYLES:
sheet = container.parsed(name)
if transform_sheet(sheet):
container.dirty(name)
doc_changed = True
elif mt in OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet, transform_style):
container.dirty(name)
doc_changed = True
return doc_changed
def filter_css(container, properties, names=()):
'''
Remove the specified CSS properties from all CSS rules in the book.
:param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
:param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
'''
properties = normalize_filter_css(properties)
return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties),
transform_style=partial(filter_declaration, properties=properties), names=names)
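# Hedged usage sketch (editor's addition): strip hard-coded typography from
# every stylesheet, <style> tag and style attribute in one pass.
def _example_strip_fonts(container):
    return filter_css(container, {'font-family', 'color', 'background-color'})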
def _classes_in_selector(selector, classes):
for attr in ('selector', 'subselector', 'parsed_tree'):
s = getattr(selector, attr, None)
if s is not None:
_classes_in_selector(s, classes)
cn = getattr(selector, 'class_name', None)
if cn is not None:
classes.add(cn)
def classes_in_selector(text):
classes = set()
try:
for selector in parse(text):
_classes_in_selector(selector, classes)
except SelectorSyntaxError:
pass
return classes
def classes_in_rule_list(css_rules):
classes = set()
for rule in css_rules:
if rule.type == rule.STYLE_RULE:
classes |= classes_in_selector(rule.selectorText)
elif hasattr(rule, 'cssRules'):
classes |= classes_in_rule_list(rule.cssRules)
return classes
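# Illustration (editor's addition): the selector walk above collects bare
# class names, so, assuming the css_selectors parser behaves as upstream,
# classes_in_selector('p.intro, div.note span.small') should yield
# {'intro', 'note', 'small'}.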
def iter_declarations(sheet_or_rule):
if hasattr(sheet_or_rule, 'cssRules'):
for rule in sheet_or_rule.cssRules:
for x in iter_declarations(rule):
yield x
elif hasattr(sheet_or_rule, 'style'):
yield sheet_or_rule.style
elif isinstance(sheet_or_rule, CSSStyleDeclaration):
yield sheet_or_rule
def remove_property_value(prop, predicate):
''' Remove the Values that match the predicate from this property. If all
values of the property would be removed, the property is removed from its
parent instead. Note that this means the property must have a parent (a
CSSStyleDeclaration). '''
removed_vals = list(filter(predicate, prop.propertyValue))
if len(removed_vals) == len(prop.propertyValue):
prop.parent.removeProperty(prop.name)
else:
x = css_text(prop.propertyValue)
for v in removed_vals:
x = x.replace(css_text(v), '').strip()
prop.propertyValue.cssText = x
return bool(removed_vals)
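# Hedged usage sketch (editor's addition): drop url(...) components from every
# declaration in a parsed sheet, mirroring how remove_links_in_declaration()
# elsewhere in this commit drives remove_property_value().
def _example_strip_urls(sheet):
    for style in iter_declarations(sheet):
        for prop in tuple(style.getProperties(all=True)):
            remove_property_value(prop, lambda v: v.type == v.URI)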
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
def sort_sheet(container, sheet_or_text):
''' Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be safe.
'''
sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, unicode_type) else sheet_or_text
def text_sort_key(x):
return numeric_sort_key(unicode_type(x or ''))
def selector_sort_key(x):
return (x.specificity, text_sort_key(x.selectorText))
def rule_sort_key(rule):
primary = RULE_PRIORITIES.get(rule.type, len(RULE_PRIORITIES))
secondary = text_sort_key(getattr(rule, 'atkeyword', '') or '')
tertiary = None
if rule.type == CSSRule.STYLE_RULE:
primary += 1
selectors = sorted(rule.selectorList, key=selector_sort_key)
tertiary = selector_sort_key(selectors[0])
rule.selectorText = ', '.join(s.selectorText for s in selectors)
elif rule.type == CSSRule.FONT_FACE_RULE:
try:
tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
except Exception:
pass
return primary, secondary, tertiary
sheet.cssRules.sort(key=rule_sort_key)
return sheet
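# Note (editor's addition): sort_sheet() accepts either a parsed stylesheet or
# raw CSS text. Per RULE_PRIORITIES above, comments, @charset, @import and
# @namespace rules sort first, then @font-face rules (by font-family), then
# style rules (by the specificity of their first selector).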
def add_stylesheet_links(container, name, text):
root = container.parse_xhtml(text, name)
head = root.xpath('//*[local-name() = "head"]')
if not head:
return
head = head[0]
sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
if not sheets:
return
for sname in sheets:
link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
head.append(link)
pretty_xml_tree(head)
return serialize(root, 'text/html')
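# Hedged usage sketch (editor's addition): add_stylesheet_links() returns the
# serialized markup, or None when the file has no <head> or the book has no
# stylesheets, so the result must be checked before writing it back.
def _example_link_all_sheets(container, name):
    linked = add_stylesheet_links(container, name, container.raw_data(name))
    if linked is not None:
        with container.open(name, 'wb') as f:
            f.write(linked)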
@@ -0,0 +1,404 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs, shutil, os, posixpath
from polyglot.builtins import iteritems, itervalues, map
from functools import partial
from collections import Counter, defaultdict
from calibre import sanitize_file_name
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.oeb.base import css_text
from calibre.ebooks.oeb.polish.css import iter_declarations, remove_property_value
from calibre.ebooks.oeb.polish.utils import extract
from polyglot.urllib import urlparse, urlunparse
class LinkReplacer(object):
def __init__(self, base, container, link_map, frag_map):
self.base = base
self.frag_map = frag_map
self.link_map = link_map
self.container = container
self.replaced = False
def __call__(self, url):
if url and url.startswith('#'):
repl = self.frag_map(self.base, url[1:])
if not repl or repl == url[1:]:
return url
self.replaced = True
return '#' + repl
name = self.container.href_to_name(url, self.base)
if not name:
return url
nname = self.link_map.get(name, None)
if not nname:
return url
purl = urlparse(url)
href = self.container.name_to_href(nname, self.base)
if purl.fragment:
nfrag = self.frag_map(name, purl.fragment)
if nfrag:
href += '#%s'%nfrag
if href != url:
self.replaced = True
return href
class IdReplacer(object):
def __init__(self, base, container, id_map):
self.base, self.container, self.replaced = base, container, False
self.id_map = id_map
def __call__(self, url):
if url and url.startswith('#'):
repl = self.id_map.get(self.base, {}).get(url[1:])
if repl is None or repl == url[1:]:
return url
self.replaced = True
return '#' + repl
name = self.container.href_to_name(url, self.base)
if not name:
return url
id_map = self.id_map.get(name)
if id_map is None:
return url
purl = urlparse(url)
nfrag = id_map.get(purl.fragment)
if nfrag is None:
return url
purl = purl._replace(fragment=nfrag)
href = urlunparse(purl)
if href != url:
self.replaced = True
return href
class LinkRebaser(object):
def __init__(self, container, old_name, new_name):
self.old_name, self.new_name = old_name, new_name
self.container = container
self.replaced = False
def __call__(self, url):
if url and url.startswith('#'):
return url
purl = urlparse(url)
frag = purl.fragment
name = self.container.href_to_name(url, self.old_name)
if not name:
return url
if name == self.old_name:
name = self.new_name
href = self.container.name_to_href(name, self.new_name)
if frag:
href += '#' + frag
if href != url:
self.replaced = True
return href
def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
'''
Replace links to files in the container. Will iterate over all files in the container and change the specified links in them.
:param link_map: A mapping of old canonical name to new canonical name. For example: :code:`{'images/old.png': 'images/new.png'}`
:param frag_map: A callable that takes two arguments ``(name, anchor)`` and
returns a new anchor. This is useful if you need to change the anchors in
HTML files. By default, it does nothing.
:param replace_in_opf: If False, links are not replaced in the OPF file.
'''
for name, media_type in iteritems(container.mime_map):
if name == container.opf_name and not replace_in_opf:
continue
repl = LinkReplacer(name, container, link_map, frag_map)
container.replace_links(name, repl)
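# Hedged usage sketch (editor's addition): retarget every reference to a
# renamed image (file names are illustrative) without touching the file
# itself.
def _example_retarget_image(container):
    replace_links(container, {'images/old.png': 'images/new.png'},
                  replace_in_opf=True)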
def replace_ids(container, id_map):
'''
Replace all links in the container that pointed to the changed ids.
:param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
:return: True iff at least one link was changed
'''
changed = False
for name, media_type in iteritems(container.mime_map):
repl = IdReplacer(name, container, id_map)
container.replace_links(name, repl)
if name == container.opf_name:
imap = id_map.get(name, {})
for item in container.opf_xpath('//*[@idref]'):
old_id = item.get('idref')
if old_id is not None:
new_id = imap.get(old_id)
if new_id is not None:
item.set('idref', new_id)
if repl.replaced:
changed = True
return changed
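# Illustration (editor's addition): id_map is keyed by file, then by old id;
# e.g. replace_ids(container, {'text/ch1.html': {'sec1': 'chapter-1'}})
# rewrites hrefs such as ch1.html#sec1 to point at #chapter-1 instead
# (names are illustrative).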
def smarten_punctuation(container, report):
from calibre.ebooks.conversion.preprocess import smarten_punctuation
smartened = False
for path in container.spine_items:
name = container.abspath_to_name(path)
changed = False
with container.open(name, 'r+b') as f:
html = container.decode(f.read())
newhtml = smarten_punctuation(html, container.log)
if newhtml != html:
changed = True
report(_('Smartened punctuation in: %s')%name)
newhtml = strip_encoding_declarations(newhtml)
f.seek(0)
f.truncate()
f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))
if changed:
            # Remove any leftover encoding declarations; a fresh declaration
            # is added automatically when the file is serialized.
root = container.parsed(name)
for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
m.getparent().remove(m)
container.dirty(name)
smartened = True
if not smartened:
report(_('No punctuation that could be smartened found'))
return smartened
def rename_files(container, file_map):
'''
Rename files in the container, automatically updating all links to them.
:param file_map: A mapping of old canonical name to new canonical name, for
example: :code:`{'text/chapter1.html': 'chapter1.html'}`.
'''
overlap = set(file_map).intersection(set(itervalues(file_map)))
if overlap:
raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
for name, dest in iteritems(file_map):
if container.exists(dest):
if name != dest and name.lower() == dest.lower():
# A case change on an OS with a case insensitive file-system.
continue
raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
if len(tuple(itervalues(file_map))) != len(set(itervalues(file_map))):
raise ValueError('Cannot rename, the set of destination files contains duplicates')
link_map = {}
for current_name, new_name in iteritems(file_map):
container.rename(current_name, new_name)
if new_name != container.opf_name: # OPF is handled by the container
link_map[current_name] = new_name
replace_links(container, link_map, replace_in_opf=True)
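# Hedged usage sketch (editor's addition): move a file up one level; hrefs in
# all other files are rewritten and the container itself fixes up the OPF.
def _example_flatten_text(container):
    rename_files(container, {'text/chapter1.html': 'chapter1.html'})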
def replace_file(container, name, path, basename, force_mt=None):
dirname, base = name.rpartition('/')[0::2]
nname = sanitize_file_name(basename)
if dirname:
nname = dirname + '/' + nname
with open(path, 'rb') as src:
if name != nname:
count = 0
b, e = nname.rpartition('.')[0::2]
while container.exists(nname):
count += 1
nname = b + ('_%d.%s' % (count, e))
rename_files(container, {name:nname})
mt = force_mt or container.guess_type(nname)
container.mime_map[nname] = mt
for itemid, q in iteritems(container.manifest_id_map):
if q == nname:
for item in container.opf_xpath('//opf:manifest/opf:item[@href and @id="%s"]' % itemid):
item.set('media-type', mt)
container.dirty(container.opf_name)
with container.open(nname, 'wb') as dest:
shutil.copyfileobj(src, dest)
def mt_to_category(container, mt):
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ebooks.oeb.polish.container import OEB_FONTS
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
if mt in OEB_DOCS:
category = 'text'
elif mt in OEB_STYLES:
category = 'style'
elif mt in OEB_FONTS:
category = 'font'
elif mt == guess_type('a.opf'):
category = 'opf'
elif mt == guess_type('a.ncx'):
category = 'toc'
else:
category = mt.partition('/')[0]
return category
def get_recommended_folders(container, names):
''' Return the folders that are recommended for the given filenames. The
recommendation is based on where the majority of files of the same type are
located in the container. If no files of a particular type are present, the
recommended folder is assumed to be the folder containing the OPF file. '''
from calibre.ebooks.oeb.polish.utils import guess_type
counts = defaultdict(Counter)
for name, mt in iteritems(container.mime_map):
folder = name.rpartition('/')[0] if '/' in name else ''
counts[mt_to_category(container, mt)][folder] += 1
try:
opf_folder = counts['opf'].most_common(1)[0][0]
except KeyError:
opf_folder = ''
recommendations = {category:counter.most_common(1)[0][0] for category, counter in iteritems(counts)}
return {n:recommendations.get(mt_to_category(container, guess_type(os.path.basename(n))), opf_folder) for n in names}
def normalize_case(container, val):
def safe_listdir(x):
try:
return os.listdir(x)
except EnvironmentError:
return ()
parts = val.split('/')
ans = []
for i in range(len(parts)):
q = '/'.join(parts[:i+1])
x = container.name_to_abspath(q)
xl = parts[i].lower()
candidates = [c for c in safe_listdir(os.path.dirname(x)) if c != parts[i] and c.lower() == xl]
ans.append(candidates[0] if candidates else parts[i])
return '/'.join(ans)
def rationalize_folders(container, folder_type_map):
all_names = set(container.mime_map)
new_names = set()
name_map = {}
for key in tuple(folder_type_map):
val = folder_type_map[key]
folder_type_map[key] = normalize_case(container, val)
for name in all_names:
if name.startswith('META-INF/'):
continue
category = mt_to_category(container, container.mime_map[name])
folder = folder_type_map.get(category, None)
if folder is not None:
bn = posixpath.basename(name)
new_name = posixpath.join(folder, bn)
if new_name != name:
c = 0
while new_name in all_names or new_name in new_names:
c += 1
n, ext = bn.rpartition('.')[0::2]
new_name = posixpath.join(folder, '%s_%d.%s' % (n, c, ext))
name_map[name] = new_name
new_names.add(new_name)
return name_map
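# Hedged usage sketch (editor's addition): compute a rename plan that moves
# every file into a per-category folder, then apply it with rename_files().
# The category keys mirror those produced by mt_to_category() above.
def _example_rationalize(container):
    plan = rationalize_folders(container, {'text': 'text', 'style': 'styles',
                                           'image': 'images', 'font': 'fonts'})
    if plan:
        rename_files(container, plan)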
def remove_links_in_sheet(href_to_name, sheet, predicate):
import_rules_to_remove = []
changed = False
for i, r in enumerate(sheet):
if r.type == r.IMPORT_RULE:
name = href_to_name(r.href)
if predicate(name, r.href, None):
import_rules_to_remove.append(i)
for i in sorted(import_rules_to_remove, reverse=True):
sheet.deleteRule(i)
changed = True
for dec in iter_declarations(sheet):
changed = remove_links_in_declaration(href_to_name, dec, predicate) or changed
return changed
def remove_links_in_declaration(href_to_name, style, predicate):
def check_pval(v):
if v.type == v.URI:
name = href_to_name(v.uri)
return predicate(name, v.uri, None)
return False
changed = False
for p in tuple(style.getProperties(all=True)):
changed = remove_property_value(p, check_pval) or changed
return changed
def remove_links_to(container, predicate):
''' predicate must be a function that takes the arguments (name, href,
fragment=None) and returns True iff the link should be removed '''
from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
stylepath = XPath('//h:style')
styleattrpath = XPath('//*[@style]')
changed = set()
for name, mt in iteritems(container.mime_map):
removed = False
if mt in OEB_DOCS:
root = container.parsed(name)
for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
hname = container.href_to_name(href, name)
frag = href.partition('#')[-1]
if predicate(hname, href, frag):
if attr is None:
el.text = None
else:
if el.tag == XHTML('link') or el.tag == XHTML('img'):
extract(el)
else:
del el.attrib[attr]
removed = True
for tag in stylepath(root):
if tag.text and (tag.get('type') or 'text/css').lower() == 'text/css':
sheet = container.parse_css(tag.text)
if remove_links_in_sheet(partial(container.href_to_name, base=name), sheet, predicate):
tag.text = css_text(sheet)
removed = True
for tag in styleattrpath(root):
style = tag.get('style')
if style:
style = container.parse_css(style, is_declaration=True)
if remove_links_in_declaration(partial(container.href_to_name, base=name), style, predicate):
removed = True
tag.set('style', css_text(style))
elif mt in OEB_STYLES:
removed = remove_links_in_sheet(partial(container.href_to_name, base=name), container.parsed(name), predicate)
if removed:
changed.add(name)
tuple(map(container.dirty, changed))
return changed
def get_spine_order_for_all_files(container):
linear_names, non_linear_names = [], []
for name, is_linear in container.spine_names:
(linear_names if is_linear else non_linear_names).append(name)
all_names = linear_names + non_linear_names
spine_names = frozenset(all_names)
ans = {}
for spine_pos, name in enumerate(all_names):
ans.setdefault(name, (spine_pos, -1))
for i, href in enumerate(container.iterlinks(name, get_line_numbers=False)):
lname = container.href_to_name(href, name)
if lname not in spine_names:
ans.setdefault(lname, (spine_pos, i))
return ans
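# Illustration (editor's addition): the map returned above has the shape
# {name: (spine_position, link_order)}, where link_order is -1 for files that
# are themselves in the spine and, for linked-to files, the index of the first
# link that reaches them from a spine item.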
@@ -0,0 +1,517 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy, os, re
from polyglot.builtins import map, string_or_bytes, range
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser
from polyglot.builtins import iteritems, unicode_type
from polyglot.urllib import urlparse
class AbortError(ValueError):
pass
def in_table(node):
while node is not None:
if node.tag.endswith('}table'):
return True
node = node.getparent()
return False
def adjust_split_point(split_point, log):
'''
Move the split point up its ancestor chain if it has no content
before it. This handles the common case:
<div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
h2.
'''
sp = split_point
while True:
parent = sp.getparent()
if (
parent is None or
barename(parent.tag) in {'body', 'html'} or
(parent.text and parent.text.strip()) or
parent.index(sp) > 0
):
break
sp = parent
if sp is not split_point:
log.debug('Adjusted split point to ancestor')
return sp
def get_body(root):
return root.find('h:body', namespaces=XPNSMAP)
def do_split(split_point, log, before=True):
'''
Split tree into a *before* and an *after* tree at ``split_point``.
:param split_point: The Element at which to split
    :param before: If True, the tree is split before split_point; otherwise it is split after split_point
:return: before_tree, after_tree
'''
if before:
# We cannot adjust for after since moving an after split point to a
# parent will cause breakage if the parent contains any content
# after the original split point
split_point = adjust_split_point(split_point, log)
tree = split_point.getroottree()
path = tree.getpath(split_point)
tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
root, root2 = tree.getroot(), tree2.getroot()
body, body2 = map(get_body, (root, root2))
split_point = root.xpath(path)[0]
split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True):
# Remove elem unless top is False in which case replace elem by its
# children
parent = elem.getparent()
if top:
parent.remove(elem)
else:
index = parent.index(elem)
parent[index:index+1] = list(elem.iterchildren())
# Tree 1
hit_split_point = False
keep_descendants = False
split_point_descendants = frozenset(split_point.iterdescendants())
for elem in tuple(body.iterdescendants()):
if elem is split_point:
hit_split_point = True
if before:
nix_element(elem)
else:
# We want to keep the descendants of the split point in
# Tree 1
keep_descendants = True
# We want the split point element, but not its tail
elem.tail = '\n'
continue
if hit_split_point:
if keep_descendants:
if elem in split_point_descendants:
# elem is a descendant keep it
continue
else:
# We are out of split_point, so prevent further set
# lookups of split_point_descendants
keep_descendants = False
nix_element(elem)
# Tree 2
ancestors = frozenset(XPath('ancestor::*')(split_point2))
for elem in tuple(body2.iterdescendants()):
if elem is split_point2:
if not before:
# Keep the split point element's tail, if it contains non-whitespace
# text
tail = elem.tail
if tail and not tail.isspace():
parent = elem.getparent()
idx = parent.index(elem)
if idx == 0:
parent.text = (parent.text or '') + tail
else:
sib = parent[idx-1]
sib.tail = (sib.tail or '') + tail
# Remove the element itself
nix_element(elem)
break
if elem in ancestors:
# We have to preserve the ancestors as they could have CSS
# styles that are inherited/applicable, like font or
# width. So we only remove the text, if any.
elem.text = '\n'
else:
nix_element(elem, top=False)
body2.text = '\n'
return tree, tree2
class SplitLinkReplacer(object):
def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
self.bottom_anchors, self.bottom_name = bottom_anchors, bottom_name
self.container, self.top_name = container, top_name
self.base = base
self.replaced = False
def __call__(self, url):
if url and url.startswith('#'):
return url
name = self.container.href_to_name(url, self.base)
if name != self.top_name:
return url
purl = urlparse(url)
if purl.fragment and purl.fragment in self.bottom_anchors:
url = self.container.name_to_href(self.bottom_name, self.base) + '#' + purl.fragment
self.replaced = True
return url
def split(container, name, loc_or_xpath, before=True, totals=None):
'''
Split the file specified by name at the position specified by loc_or_xpath.
Splitting automatically migrates all links and references to the affected
files.
:param loc_or_xpath: Should be an XPath expression such as
//h:div[@id="split_here"]. Can also be a *loc* which is used internally to
implement splitting in the preview panel.
    :param before: If True, the split occurs before the identified element; otherwise it occurs after it.
:param totals: Used internally
'''
root = container.parsed(name)
if isinstance(loc_or_xpath, unicode_type):
split_point = root.xpath(loc_or_xpath)[0]
else:
try:
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
except MalformedMarkup:
            # The webkit HTML parser and the container parser have yielded
            # different node counts; this can happen if the file is valid XML
            # but contains constructs like nested <p> tags. So force parse it
            # with the HTML 5 parser and try again.
raw = container.raw_data(name)
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
try:
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
except MalformedMarkup:
raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
' before splitting') % name)
container.replace(name, root)
if in_table(split_point):
raise AbortError('Cannot split inside tables')
if split_point.tag.endswith('}body'):
raise AbortError('Cannot split on the <body> tag')
tree1, tree2 = do_split(split_point, container.log, before=before)
root1, root2 = tree1.getroot(), tree2.getroot()
anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))
base, ext = name.rpartition('.')[0::2]
base = re.sub(r'_split\d+$', '', base)
nname, s = None, 0
while not nname or container.exists(nname):
s += 1
nname = '%s_split%d.%s' % (base, s, ext)
manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)
# Fix links in the split trees
for r in (root1, root2):
for a in r.xpath('//*[@href]'):
url = a.get('href')
if url.startswith('#'):
fname = name
else:
fname = container.href_to_name(url, name)
if fname == name:
purl = urlparse(url)
if purl.fragment in anchors_in_top:
if r is root2:
a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
else:
a.set('href', '#' + purl.fragment)
elif purl.fragment in anchors_in_bottom:
if r is root1:
a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
else:
a.set('href', '#' + purl.fragment)
# Fix all links in the container that point to anchors in the bottom tree
for fname, media_type in iteritems(container.mime_map):
if fname not in {name, bottom_name}:
repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
container.replace_links(fname, repl)
container.replace(name, root1)
container.replace(bottom_name, root2)
spine = container.opf_xpath('//opf:spine')[0]
for spine_item, spine_name, linear in container.spine_iter:
if spine_name == name:
break
index = spine.index(spine_item) + 1
si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
if not linear:
si.set('linear', 'no')
container.insert_into_xml(spine, si, index=index)
container.dirty(container.opf_name)
return bottom_name
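# Hedged usage sketch (editor's addition): split a file before an element
# with a (hypothetical) id="split_here"; the name of the newly created bottom
# half is returned.
def _example_split(container, name):
    return split(container, name, '//*[@id="split_here"]', before=True)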
def multisplit(container, name, xpath, before=True):
'''
Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`.
Splitting automatically migrates all links and references to the affected
files.
    :param before: If True, each split occurs before the matched element; otherwise it occurs after it.
'''
root = container.parsed(name)
nodes = root.xpath(xpath, namespaces=XPNSMAP)
if not nodes:
raise AbortError(_('The expression %s did not match any nodes') % xpath)
for split_point in nodes:
if in_table(split_point):
raise AbortError('Cannot split inside tables')
if split_point.tag.endswith('}body'):
raise AbortError('Cannot split on the <body> tag')
for i, tag in enumerate(nodes):
tag.set('calibre-split-point', unicode_type(i))
current = name
all_names = [name]
for i in range(len(nodes)):
current = split(container, current, '//*[@calibre-split-point="%d"]' % i, before=before)
all_names.append(current)
for x in all_names:
for tag in container.parsed(x).xpath('//*[@calibre-split-point]'):
tag.attrib.pop('calibre-split-point')
container.dirty(x)
return all_names[1:]
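# Hedged usage sketch (editor's addition): break a single file into one file
# per <h1> heading, returning the names of the newly created pieces.
def _example_split_on_headings(container, name):
    return multisplit(container, name, '//h:h1', before=True)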
class MergeLinkReplacer(object):
def __init__(self, base, anchor_map, master, container):
self.container, self.anchor_map = container, anchor_map
self.master = master
self.base = base
self.replaced = False
def __call__(self, url):
if url and url.startswith('#'):
return url
name = self.container.href_to_name(url, self.base)
amap = self.anchor_map.get(name, None)
if amap is None:
return url
purl = urlparse(url)
frag = purl.fragment or ''
frag = amap.get(frag, frag)
url = self.container.name_to_href(self.master, self.base) + '#' + frag
self.replaced = True
return url
def add_text(body, text):
if len(body) > 0:
body[-1].tail = (body[-1].tail or '') + text
else:
body.text = (body.text or '') + text
def all_anchors(root):
return set(root.xpath('//*/@id')) | set(root.xpath('//*/@name'))
def all_stylesheets(container, name):
for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
name = container.href_to_name(link.get('href'), name)
typ = link.get('type', 'text/css')
if typ == 'text/css':
yield name
def unique_anchor(seen_anchors, current):
c = 0
ans = current
while ans in seen_anchors:
c += 1
ans = '%s_%d' % (current, c)
return ans
def remove_name_attributes(root):
# Remove all name attributes, replacing them with id attributes
for elem in root.xpath('//*[@id and @name]'):
del elem.attrib['name']
for elem in root.xpath('//*[@name]'):
elem.set('id', elem.attrib.pop('name'))
def merge_html(container, names, master, insert_page_breaks=False):
p = container.parsed
root = p(master)
# Ensure master has a <head>
head = root.find('h:head', namespaces=XPNSMAP)
if head is None:
head = root.makeelement(XHTML('head'))
container.insert_into_xml(root, head, 0)
seen_anchors = all_anchors(root)
seen_stylesheets = set(all_stylesheets(container, master))
master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
master_base = os.path.dirname(master)
anchor_map = {n:{} for n in names if n != master}
first_anchor_map = {}
for name in names:
if name == master:
continue
# Insert new stylesheets into master
for sheet in all_stylesheets(container, name):
if sheet not in seen_stylesheets:
seen_stylesheets.add(sheet)
link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
container.insert_into_xml(head, link)
# Rebase links if master is in a different directory
if os.path.dirname(name) != master_base:
container.replace_links(name, LinkRebaser(container, name, master))
root = p(name)
children = []
for body in p(name).findall('h:body', namespaces=XPNSMAP):
children.append(body.text if body.text and body.text.strip() else '\n\n')
children.extend(body)
first_child = ''
for first_child in children:
if not isinstance(first_child, string_or_bytes):
break
if isinstance(first_child, string_or_bytes):
# body contained only text, no tags
first_child = body.makeelement(XHTML('p'))
first_child.text, children[0] = children[0], first_child
amap = anchor_map[name]
remove_name_attributes(root)
for elem in root.xpath('//*[@id]'):
val = elem.get('id')
if not val:
continue
if val in seen_anchors:
nval = unique_anchor(seen_anchors, val)
elem.set('id', nval)
amap[val] = nval
else:
seen_anchors.add(val)
if 'id' not in first_child.attrib:
first_child.set('id', unique_anchor(seen_anchors, 'top'))
seen_anchors.add(first_child.get('id'))
first_anchor_map[name] = first_child.get('id')
if insert_page_breaks:
first_child.set('style', first_child.get('style', '') + '; page-break-before: always')
amap[''] = first_child.get('id')
# Fix links that point to local changed anchors
for a in XPath('//h:a[starts-with(@href, "#")]')(root):
q = a.get('href')[1:]
if q in amap:
a.set('href', '#' + amap[q])
for child in children:
if isinstance(child, string_or_bytes):
add_text(master_body, child)
else:
master_body.append(copy.deepcopy(child))
container.remove_item(name, remove_from_guide=False)
# Fix all links in the container that point to merged files
for fname, media_type in iteritems(container.mime_map):
repl = MergeLinkReplacer(fname, anchor_map, master, container)
container.replace_links(fname, repl)
return first_anchor_map
def merge_css(container, names, master):
p = container.parsed
msheet = p(master)
master_base = os.path.dirname(master)
merged = set()
for name in names:
if name == master:
continue
# Rebase links if master is in a different directory
if os.path.dirname(name) != master_base:
container.replace_links(name, LinkRebaser(container, name, master))
sheet = p(name)
# Remove charset rules
cr = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
        for r in cr:
            sheet.deleteRule(sheet.cssRules.index(r))
for rule in sheet.cssRules:
msheet.add(rule)
container.remove_item(name)
merged.add(name)
    # Remove links to merged stylesheets in the html files, replacing them
    # with a link to the master sheet
for name, mt in iteritems(container.mime_map):
if mt in OEB_DOCS:
removed = False
root = p(name)
for link in XPath('//h:link[@href]')(root):
q = container.href_to_name(link.get('href'), name)
if q in merged:
container.remove_from_xml(link)
removed = True
if removed:
container.dirty(name)
if removed and master not in set(all_stylesheets(container, name)):
head = root.find('h:head', namespaces=XPNSMAP)
if head is not None:
link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
container.insert_into_xml(head, link)
def merge(container, category, names, master):
'''
Merge the specified files into a single file, automatically migrating all
    links and references to the affected files. The files must all be either HTML or CSS files.
:param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
:param names: The list of files to be merged
:param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
'''
if category not in {'text', 'styles'}:
raise AbortError('Cannot merge files of type: %s' % category)
if len(names) < 2:
raise AbortError('Must specify at least two files to be merged')
if master not in names:
raise AbortError('The master file (%s) must be one of the files being merged' % master)
if category == 'text':
merge_html(container, names, master)
elif category == 'styles':
merge_css(container, names, master)
container.dirty(master)
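# Hedged usage sketch (editor's addition): merge two chapters into the first
# one (file names are illustrative); the master file is the one that survives.
def _example_merge_chapters(container):
    merge(container, 'text', ['text/ch1.html', 'text/ch2.html'],
          master='text/ch1.html')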