mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-05-02 03:40:53 +02:00
Added epub write support
This commit is contained in:
@@ -0,0 +1,389 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from collections import defaultdict
|
||||
from functools import partial
|
||||
|
||||
from css_parser.css import CSSRule, CSSStyleDeclaration
|
||||
from css_selectors import parse, SelectorSyntaxError
|
||||
|
||||
from calibre import force_unicode
|
||||
from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text
|
||||
from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
|
||||
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
|
||||
from calibre.utils.icu import numeric_sort_key
|
||||
from css_selectors import Select, SelectorError
|
||||
from polyglot.builtins import iteritems, itervalues, unicode_type, filter
|
||||
|
||||
|
||||
def filter_used_rules(rules, log, select):
    # NOTE: despite the name, this yields the rules that are NOT used, i.e.
    # rules none of whose selectors match anything in the document wrapped
    # by ``select``. ``log`` is accepted for interface compatibility but not
    # read here.
    for rule in rules:
        used = False
        for selector in rule.selectorList:
            try:
                if select.has_matches(selector.selectorText):
                    used = True
                    break
            except SelectorError:
                # Cannot parse/execute this selector, be safe and assume it
                # matches something
                used = True
                break
        if not used:
            yield rule
|
||||
|
||||
|
||||
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
    # Return the set of stylesheet names (transitively) imported via @import
    # from the sheet named ``name`` (or the explicitly supplied ``sheet``).
    # Only names present in ``sheets`` are considered; ``recursion_level``
    # bounds the depth to guard against @import cycles. ``name`` itself is
    # never part of the result.
    ans = set()
    sheet = sheet or sheets[name]
    for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
        if rule.href:
            iname = container.href_to_name(rule.href, name)
            if iname in sheets:
                ans.add(iname)
    if recursion_level > 0:
        for imported_sheet in tuple(ans):
            ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
    ans.discard(name)
    return ans
|
||||
|
||||
|
||||
def merge_declarations(first, second):
    '''Copy every property of the declaration block *second* into *first*,
    overwriting any properties of the same name already present.'''
    set_property = first.setProperty
    for declared_prop in second.getProperties():
        set_property(declared_prop)
|
||||
|
||||
|
||||
def merge_identical_selectors(sheet):
    ' Merge rules that have identical selectors '
    # Group style rules by their full selector text; within each group the
    # later rules' declarations are folded into the first rule (later
    # declarations win, preserving cascade order) and the duplicates removed.
    # Returns the number of rules that were removed.
    selector_map = defaultdict(list)
    for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        selector_map[rule.selectorText].append(rule)
    remove = []
    for rule_group in itervalues(selector_map):
        if len(rule_group) > 1:
            for i in range(1, len(rule_group)):
                merge_declarations(rule_group[0].style, rule_group[i].style)
                remove.append(rule_group[i])
    for rule in remove:
        sheet.cssRules.remove(rule)
    return len(remove)
|
||||
|
||||
|
||||
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
    '''
    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.

    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
    :param merge_rules: If True, rules with identical selectors are merged.
    :return: True iff at least one change (rule removal, class removal or merge) was made.
    '''
    report = report or (lambda x:x)

    def safe_parse(name):
        # Returns None for files that cannot be parsed as CSS
        try:
            return container.parsed(name)
        except TypeError:
            pass
    # All parseable stylesheets in the book, keyed by canonical name
    sheets = {name:safe_parse(name) for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES}
    sheets = {k:v for k, v in iteritems(sheets) if v is not None}
    num_merged = 0
    if merge_rules:
        for name, sheet in iteritems(sheets):
            num = merge_identical_selectors(sheet)
            if num:
                container.dirty(name)
                num_merged += num
    # name -> set of sheets it @imports (transitively)
    import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
    if remove_unused_classes:
        # name -> lowercased class names referenced by that sheet's selectors
        class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in iteritems(sheets)}
    # Start with every style rule considered unused; each HTML file below
    # filters its linked/imported sheets' rule lists down to the still-unused
    style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in iteritems(sheets)}

    num_of_removed_rules = num_of_removed_classes = 0

    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        # Inline <style> tags: process (and possibly rewrite) them directly
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
                sheet = container.parse_css(style.text)
                if merge_rules:
                    num = merge_identical_selectors(sheet)
                    if num:
                        num_merged += num
                        container.dirty(name)
                if remove_unused_classes:
                    used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
                imports = get_imported_sheets(name, container, sheets, sheet=sheet)
                for imported_sheet in imports:
                    style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
                unused_rules = tuple(filter_used_rules(rules, container.log, select))
                if unused_rules:
                    num_of_removed_rules += len(unused_rules)
                    [sheet.cssRules.remove(r) for r in unused_rules]
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)

        # Linked stylesheets: narrow their unused-rule candidate lists
        for link in root.xpath('//*[local-name()="link" and @href]'):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
            style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
            if remove_unused_classes:
                used_classes |= class_map[sname]

            for iname in import_map[sname]:
                style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
                if remove_unused_classes:
                    used_classes |= class_map[iname]

        if remove_unused_classes:
            # Drop class tokens never referenced by any applicable sheet
            for elem in root.xpath('//*[@class]'):
                original_classes, classes = elem.get('class', '').split(), []
                for x in original_classes:
                    if icu_lower(x) in used_classes:
                        classes.append(x)
                if len(classes) != len(original_classes):
                    if classes:
                        elem.set('class', ' '.join(classes))
                    else:
                        del elem.attrib['class']
                    num_of_removed_classes += len(original_classes) - len(classes)
                    container.dirty(name)

    # Whatever survived every document's filtering is truly unused: remove it
    for name, sheet in iteritems(sheets):
        unused_rules = style_rules[name]
        if unused_rules:
            num_of_removed_rules += len(unused_rules)
            [sheet.cssRules.remove(r) for r in unused_rules]
            container.dirty(name)

    num_changes = num_of_removed_rules + num_merged + num_of_removed_classes
    if num_changes > 0:
        if num_of_removed_rules > 0:
            report(ngettext('Removed one unused CSS style rule', 'Removed {} unused CSS style rules',
                            num_of_removed_rules).format(num_of_removed_rules))
        if num_of_removed_classes > 0:
            report(ngettext('Removed one unused class from the HTML', 'Removed {} unused classes from the HTML',
                            num_of_removed_classes).format(num_of_removed_classes))
        if num_merged > 0:
            report(ngettext('Merged one CSS style rule', 'Merged {} CSS style rules',
                            num_merged).format(num_merged))
    if num_of_removed_rules == 0:
        report(_('No unused CSS style rules found'))
    if remove_unused_classes and num_of_removed_classes == 0:
        report(_('No unused class attributes found'))
    if merge_rules and num_merged == 0:
        report(_('No style rules that could be merged found'))
    return num_changes > 0
|
||||
|
||||
|
||||
def filter_declaration(style, properties=()):
    '''
    Remove the specified CSS properties from a declaration block.

    Shorthand properties are handled via the ``normalizers`` table: a
    shorthand whose expansion contains one of the removed properties is
    itself removed, and the non-removed parts of its expansion (that were
    not already set explicitly) are written back as longhand properties.

    :param style: A CSSStyleDeclaration
    :param properties: Iterable of property names to remove
    :return: True iff the declaration was changed
    '''
    # BUG FIX: the original code called properties.intersection(), which
    # raises AttributeError when a plain tuple/list (including the default
    # value ()) is passed. Normalize to a frozenset once, up front.
    properties = frozenset(properties)
    changed = False
    for prop in properties:
        if style.removeProperty(prop) != '':
            changed = True
    all_props = set(style.keys())
    for prop in style.getProperties():
        n = normalizers.get(prop.name, None)
        if n is None:
            continue
        normalized = n(prop.name, prop.propertyValue)
        removed = properties.intersection(set(normalized))
        if removed:
            changed = True
            style.removeProperty(prop.name)
            # Re-add surviving parts of the shorthand that are not already
            # set explicitly (use a fresh loop variable: the original
            # shadowed ``prop`` here)
            for pname in set(normalized) - removed - all_props:
                style.setProperty(pname, normalized[pname])
    return changed
|
||||
|
||||
|
||||
def filter_sheet(sheet, properties=()):
    # Remove the given properties from every style rule in ``sheet``;
    # rules left with an empty declaration block are deleted entirely.
    # Returns True iff anything changed.
    from css_parser.css import CSSRule
    changed = False
    remove = []
    for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        if filter_declaration(rule.style, properties):
            changed = True
            if rule.style.length == 0:
                remove.append(rule)
    for rule in remove:
        sheet.cssRules.remove(rule)
    return changed
|
||||
|
||||
|
||||
def transform_inline_styles(container, name, transform_sheet, transform_style):
    # Apply ``transform_sheet`` to every <style> tag and ``transform_style``
    # to every style="" attribute in the HTML file ``name``. The transforms
    # mutate their argument and return True iff they changed it. Returns
    # True iff the document was modified (caller is responsible for marking
    # it dirty).
    root = container.parsed(name)
    changed = False
    for style in root.xpath('//*[local-name()="style"]'):
        if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
            sheet = container.parse_css(style.text)
            if transform_sheet(sheet):
                changed = True
                style.text = force_unicode(sheet.cssText, 'utf-8')
                pretty_script_or_style(container, style)
    for elem in root.xpath('//*[@style]'):
        text = elem.get('style', None)
        if text:
            style = container.parse_css(text, is_declaration=True)
            if transform_style(style):
                changed = True
                if style.length == 0:
                    # All properties removed: drop the attribute entirely
                    del elem.attrib['style']
                else:
                    elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
    return changed
|
||||
|
||||
|
||||
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
    # Apply the given transforms to the stylesheets and HTML files named in
    # ``names`` (all CSS/HTML files in the book when ``names`` is empty).
    # ``transform_sheet`` receives parsed stylesheets, ``transform_style``
    # receives parsed style="" declarations; both must return True iff they
    # changed their argument. Returns True iff anything was changed.
    if not names:
        types = OEB_STYLES | OEB_DOCS
        names = []
        for name, mt in iteritems(container.mime_map):
            if mt in types:
                names.append(name)

    doc_changed = False

    for name in names:
        mt = container.mime_map[name]
        if mt in OEB_STYLES:
            sheet = container.parsed(name)
            if transform_sheet(sheet):
                container.dirty(name)
                doc_changed = True
        elif mt in OEB_DOCS:
            if transform_inline_styles(container, name, transform_sheet, transform_style):
                container.dirty(name)
                doc_changed = True

    return doc_changed
|
||||
|
||||
|
||||
def filter_css(container, properties, names=()):
    '''
    Remove the specified CSS properties from all CSS rules in the book.

    :param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
    :param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
    :return: True iff any file was changed.
    '''
    # Expand shorthands/aliases so e.g. removing 'font' also removes
    # 'font-family' etc.
    properties = normalize_filter_css(properties)
    return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties),
                         transform_style=partial(filter_declaration, properties=properties), names=names)
|
||||
|
||||
|
||||
def _classes_in_selector(selector, classes):
|
||||
for attr in ('selector', 'subselector', 'parsed_tree'):
|
||||
s = getattr(selector, attr, None)
|
||||
if s is not None:
|
||||
_classes_in_selector(s, classes)
|
||||
cn = getattr(selector, 'class_name', None)
|
||||
if cn is not None:
|
||||
classes.add(cn)
|
||||
|
||||
|
||||
def classes_in_selector(text):
    # Return the set of CSS class names referenced by the selector ``text``.
    # Unparseable selectors contribute nothing (errors are swallowed).
    classes = set()
    try:
        for selector in parse(text):
            _classes_in_selector(selector, classes)
    except SelectorSyntaxError:
        pass
    return classes
|
||||
|
||||
|
||||
def classes_in_rule_list(css_rules):
    # Return all CSS class names referenced by the selectors of the style
    # rules in ``css_rules``, recursing into grouping rules (e.g. @media)
    # that expose a nested cssRules list.
    classes = set()
    for rule in css_rules:
        if rule.type == rule.STYLE_RULE:
            classes |= classes_in_selector(rule.selectorText)
        elif hasattr(rule, 'cssRules'):
            classes |= classes_in_rule_list(rule.cssRules)
    return classes
|
||||
|
||||
|
||||
def iter_declarations(sheet_or_rule):
    '''Recursively yield every style declaration block reachable from
    *sheet_or_rule*, which may be a stylesheet, a rule, or a bare
    declaration.'''
    if hasattr(sheet_or_rule, 'cssRules'):
        # A stylesheet or grouping rule (e.g. @media): recurse into children
        for child in sheet_or_rule.cssRules:
            for decl in iter_declarations(child):
                yield decl
    elif hasattr(sheet_or_rule, 'style'):
        # A rule carrying a single declaration block (style rule, @font-face, ...)
        yield sheet_or_rule.style
    elif isinstance(sheet_or_rule, CSSStyleDeclaration):
        # Already a bare declaration block
        yield sheet_or_rule
|
||||
|
||||
|
||||
def remove_property_value(prop, predicate):
    ''' Remove the Values that match the predicate from this property. If all
    values of the property would be removed, the property is removed from its
    parent instead. Note that this means the property must have a parent (a
    CSSStyleDeclaration). '''
    removed_vals = list(filter(predicate, prop.propertyValue))
    if len(removed_vals) == len(prop.propertyValue):
        # Every value matched: drop the whole property
        prop.parent.removeProperty(prop.name)
    else:
        # Rewrite the value text with the matching values excised.
        # NOTE(review): plain substring replace — could also strip an
        # identical-looking value that should be kept; confirm acceptable.
        x = css_text(prop.propertyValue)
        for v in removed_vals:
            x = x.replace(css_text(v), '').strip()
        prop.propertyValue.cssText = x
    return bool(removed_vals)
|
||||
|
||||
|
||||
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
|
||||
|
||||
|
||||
def sort_sheet(container, sheet_or_text):
    ''' Sort the rules in a stylesheet. Note that in the general case this can
    change the effective styles, but for most common sheets, it should be safe.
    '''
    sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, unicode_type) else sheet_or_text

    def text_sort_key(x):
        # Natural (numeric-aware) ordering, tolerant of None
        return numeric_sort_key(unicode_type(x or ''))

    def selector_sort_key(x):
        return (x.specificity, text_sort_key(x.selectorText))

    def rule_sort_key(rule):
        # Primary: fixed priorities for comment/@charset/@import/@namespace,
        # then style rules, then everything else (which gets the same
        # primary as style rules unless it is a style rule, see below)
        primary = RULE_PRIORITIES.get(rule.type, len(RULE_PRIORITIES))
        secondary = text_sort_key(getattr(rule, 'atkeyword', '') or '')
        tertiary = None
        if rule.type == CSSRule.STYLE_RULE:
            primary += 1
            # Also normalizes the rule's own selector order as a side effect
            selectors = sorted(rule.selectorList, key=selector_sort_key)
            tertiary = selector_sort_key(selectors[0])
            rule.selectorText = ', '.join(s.selectorText for s in selectors)
        elif rule.type == CSSRule.FONT_FACE_RULE:
            try:
                tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
            except Exception:
                pass

        return primary, secondary, tertiary
    sheet.cssRules.sort(key=rule_sort_key)
    return sheet
|
||||
|
||||
|
||||
def add_stylesheet_links(container, name, text):
    # Add a <link rel="stylesheet"> to the <head> of the HTML ``text`` for
    # every stylesheet in the book's manifest. Returns the serialized HTML,
    # or None if the document has no <head> or the book has no stylesheets.
    root = container.parse_xhtml(text, name)
    head = root.xpath('//*[local-name() = "head"]')
    if not head:
        return
    head = head[0]
    sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
    if not sheets:
        return
    for sname in sheets:
        link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
        head.append(link)
    pretty_xml_tree(head)
    return serialize(root, 'text/html')
|
||||
@@ -0,0 +1,404 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import codecs, shutil, os, posixpath
|
||||
from polyglot.builtins import iteritems, itervalues, map
|
||||
from functools import partial
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
from calibre import sanitize_file_name
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
from calibre.ebooks.oeb.base import css_text
|
||||
from calibre.ebooks.oeb.polish.css import iter_declarations, remove_property_value
|
||||
from calibre.ebooks.oeb.polish.utils import extract
|
||||
from polyglot.urllib import urlparse, urlunparse
|
||||
|
||||
|
||||
class LinkReplacer(object):
    # Callable URL rewriter used with container.replace_links(). Rewrites
    # links according to ``link_map`` (old name -> new name) and rewrites
    # anchors via ``frag_map(name, fragment)``. Sets ``self.replaced`` to
    # True once any URL has actually been changed.

    def __init__(self, base, container, link_map, frag_map):
        self.base = base                  # name of the file whose links are rewritten
        self.frag_map = frag_map          # callable (name, fragment) -> new fragment
        self.link_map = link_map          # {old canonical name: new canonical name}
        self.container = container
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            # Fragment-only link: only the anchor may change
            repl = self.frag_map(self.base, url[1:])
            if not repl or repl == url[1:]:
                return url
            self.replaced = True
            return '#' + repl
        name = self.container.href_to_name(url, self.base)
        if not name:
            # External or unresolvable URL: leave untouched
            return url
        nname = self.link_map.get(name, None)
        if not nname:
            return url
        purl = urlparse(url)
        href = self.container.name_to_href(nname, self.base)
        if purl.fragment:
            nfrag = self.frag_map(name, purl.fragment)
            if nfrag:
                href += '#%s'%nfrag
        if href != url:
            self.replaced = True
        return href
|
||||
|
||||
|
||||
class IdReplacer(object):
    '''Callable URL rewriter that accounts for changed anchor ids.

    ``id_map`` maps canonical file name -> {old_id: new_id}. An instance
    records in ``self.replaced`` whether any URL was actually modified.
    '''

    def __init__(self, base, container, id_map):
        self.base = base
        self.container = container
        self.id_map = id_map
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            # Fragment-only link: consult this file's own id map
            new_frag = self.id_map.get(self.base, {}).get(url[1:])
            if new_frag is None or new_frag == url[1:]:
                return url
            self.replaced = True
            return '#' + new_frag
        target = self.container.href_to_name(url, self.base)
        if not target:
            return url
        frag_map = self.id_map.get(target)
        if frag_map is None:
            return url
        parts = urlparse(url)
        new_frag = frag_map.get(parts.fragment)
        if new_frag is None:
            return url
        href = urlunparse(parts._replace(fragment=new_frag))
        if href != url:
            self.replaced = True
        return href
|
||||
|
||||
|
||||
class LinkRebaser(object):
    '''Callable URL rewriter for when a file moves from ``old_name`` to
    ``new_name``: relative links are recomputed so they resolve from the
    new location. Sets ``self.replaced`` when any URL is changed.'''

    def __init__(self, container, old_name, new_name):
        self.container = container
        self.old_name = old_name
        self.new_name = new_name
        self.replaced = False

    def __call__(self, url):
        # Fragment-only links stay within the same file; nothing to rebase
        if url and url.startswith('#'):
            return url
        frag = urlparse(url).fragment
        target = self.container.href_to_name(url, self.old_name)
        if not target:
            return url
        if target == self.old_name:
            # Self-link: must now point at the renamed file
            target = self.new_name
        href = self.container.name_to_href(target, self.new_name)
        if frag:
            href += '#' + frag
        if href != url:
            self.replaced = True
        return href
|
||||
|
||||
|
||||
def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
    '''
    Replace links to files in the container. Will iterate over all files in the container and change the specified links in them.

    :param link_map: A mapping of old canonical name to new canonical name. For example: :code:`{'images/old.png': 'images/new.png'}`
    :param frag_map: A callable that takes two arguments ``(name, anchor)`` and
        returns a new anchor. This is useful if you need to change the anchors in
        HTML files. By default, it does nothing.
    :param replace_in_opf: If False, links are not replaced in the OPF file.

    '''
    for name, media_type in iteritems(container.mime_map):
        if name == container.opf_name and not replace_in_opf:
            continue
        repl = LinkReplacer(name, container, link_map, frag_map)
        container.replace_links(name, repl)
|
||||
|
||||
|
||||
def replace_ids(container, id_map):
    '''
    Replace all links in the container that pointed to the changed ids.

    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
    :return: True iff at least one link was changed

    '''
    changed = False
    for name, media_type in iteritems(container.mime_map):
        repl = IdReplacer(name, container, id_map)
        container.replace_links(name, repl)
        if name == container.opf_name:
            # idref attributes in the OPF spine are ids, not URLs, so
            # container.replace_links() does not touch them; fix them here
            imap = id_map.get(name, {})
            for item in container.opf_xpath('//*[@idref]'):
                old_id = item.get('idref')
                if old_id is not None:
                    new_id = imap.get(old_id)
                    if new_id is not None:
                        item.set('idref', new_id)
        if repl.replaced:
            changed = True
    return changed
|
||||
|
||||
|
||||
def smarten_punctuation(container, report):
    # Convert plain quotes/dashes/ellipses in all spine documents to their
    # typographic ("smart") equivalents, reporting per-file progress via
    # ``report``. Returns True iff any file was changed.
    from calibre.ebooks.conversion.preprocess import smarten_punctuation
    smartened = False
    for path in container.spine_items:
        name = container.abspath_to_name(path)
        changed = False
        with container.open(name, 'r+b') as f:
            html = container.decode(f.read())
            newhtml = smarten_punctuation(html, container.log)
            if newhtml != html:
                changed = True
                report(_('Smartened punctuation in: %s')%name)
                newhtml = strip_encoding_declarations(newhtml)
                f.seek(0)
                f.truncate()
                # Write a BOM so the encoding is detectable without a declaration
                f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))
        if changed:
            # Add an encoding declaration (it will be added automatically when
            # serialized)
            root = container.parsed(name)
            for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
                m.getparent().remove(m)
            container.dirty(name)
            smartened = True
    if not smartened:
        report(_('No punctuation that could be smartened found'))
    return smartened
|
||||
|
||||
|
||||
def rename_files(container, file_map):
    '''
    Rename files in the container, automatically updating all links to them.

    :param file_map: A mapping of old canonical name to new canonical name, for
        example: :code:`{'text/chapter1.html': 'chapter1.html'}`.
    :raises ValueError: if a rename target already exists, if a name is both a
        source and a destination, or if two sources map to the same destination.
    '''
    overlap = set(file_map).intersection(set(itervalues(file_map)))
    if overlap:
        raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
    for name, dest in iteritems(file_map):
        if container.exists(dest):
            if name != dest and name.lower() == dest.lower():
                # A case change on an OS with a case insensitive file-system.
                continue
            raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
    if len(tuple(itervalues(file_map))) != len(set(itervalues(file_map))):
        raise ValueError('Cannot rename, the set of destination files contains duplicates')
    link_map = {}
    for current_name, new_name in iteritems(file_map):
        container.rename(current_name, new_name)
        if new_name != container.opf_name:  # OPF is handled by the container
            link_map[current_name] = new_name
    replace_links(container, link_map, replace_in_opf=True)
|
||||
|
||||
|
||||
def replace_file(container, name, path, basename, force_mt=None):
    # Replace the contents of ``name`` in the container with the file at the
    # filesystem ``path``. If ``basename`` differs from the current base
    # name, the container file is first renamed (uniquified with _1, _2, ...
    # suffixes if needed) and its manifest media-type updated
    # (``force_mt`` overrides the guessed type).
    dirname, base = name.rpartition('/')[0::2]
    nname = sanitize_file_name(basename)
    if dirname:
        nname = dirname + '/' + nname
    with open(path, 'rb') as src:
        if name != nname:
            count = 0
            b, e = nname.rpartition('.')[0::2]
            while container.exists(nname):
                count += 1
                nname = b + ('_%d.%s' % (count, e))
            rename_files(container, {name:nname})
            mt = force_mt or container.guess_type(nname)
            container.mime_map[nname] = mt
            for itemid, q in iteritems(container.manifest_id_map):
                if q == nname:
                    for item in container.opf_xpath('//opf:manifest/opf:item[@href and @id="%s"]' % itemid):
                        item.set('media-type', mt)
            container.dirty(container.opf_name)
        with container.open(nname, 'wb') as dest:
            shutil.copyfileobj(src, dest)
|
||||
|
||||
|
||||
def mt_to_category(container, mt):
    '''Classify a media type into a coarse category name: 'text', 'style',
    'font', 'opf', 'toc', or the major part of the media type (e.g.
    'image' for 'image/png').'''
    from calibre.ebooks.oeb.polish.utils import guess_type
    from calibre.ebooks.oeb.polish.container import OEB_FONTS
    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
    if mt in OEB_DOCS:
        return 'text'
    if mt in OEB_STYLES:
        return 'style'
    if mt in OEB_FONTS:
        return 'font'
    if mt == guess_type('a.opf'):
        return 'opf'
    if mt == guess_type('a.ncx'):
        return 'toc'
    return mt.partition('/')[0]
|
||||
|
||||
|
||||
def get_recommended_folders(container, names):
    ''' Return the folders that are recommended for the given filenames. The
    recommendation is based on where the majority of files of the same type are
    located in the container. If no files of a particular type are present, the
    recommended folder is assumed to be the folder containing the OPF file. '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    # category -> Counter of folder -> number of files of that category there
    counts = defaultdict(Counter)
    for name, mt in iteritems(container.mime_map):
        folder = name.rpartition('/')[0] if '/' in name else ''
        counts[mt_to_category(container, mt)][folder] += 1

    try:
        opf_folder = counts['opf'].most_common(1)[0][0]
    except KeyError:
        opf_folder = ''

    # Majority folder per category; fall back to the OPF's folder
    recommendations = {category:counter.most_common(1)[0][0] for category, counter in iteritems(counts)}
    return {n:recommendations.get(mt_to_category(container, guess_type(os.path.basename(n))), opf_folder) for n in names}
|
||||
|
||||
|
||||
def normalize_case(container, val):
    '''Return *val* with each path component replaced by the casing of an
    existing on-disk entry of the same (case-insensitive) name, when one
    exists; components with no differently-cased match are kept as-is.'''

    def entries(path):
        # Directory listing that tolerates missing/unreadable directories
        try:
            return os.listdir(path)
        except EnvironmentError:
            return ()

    components = val.split('/')
    normalized = []
    for idx, part in enumerate(components):
        abspath = container.name_to_abspath('/'.join(components[:idx + 1]))
        lowered = part.lower()
        matches = [e for e in entries(os.path.dirname(abspath))
                   if e != part and e.lower() == lowered]
        normalized.append(matches[0] if matches else part)
    return '/'.join(normalized)
|
||||
|
||||
|
||||
def rationalize_folders(container, folder_type_map):
    # Compute a rename plan moving each file into the folder configured for
    # its category in ``folder_type_map`` (category -> folder). META-INF/
    # entries are never moved. Name collisions are resolved with _1, _2, ...
    # suffixes. Returns {old name: new name} for the files that must move;
    # nothing is renamed here.
    all_names = set(container.mime_map)
    new_names = set()
    name_map = {}
    for key in tuple(folder_type_map):
        val = folder_type_map[key]
        # Reuse existing on-disk folder casing where possible
        folder_type_map[key] = normalize_case(container, val)
    for name in all_names:
        if name.startswith('META-INF/'):
            continue
        category = mt_to_category(container, container.mime_map[name])
        folder = folder_type_map.get(category, None)
        if folder is not None:
            bn = posixpath.basename(name)
            new_name = posixpath.join(folder, bn)
            if new_name != name:
                c = 0
                while new_name in all_names or new_name in new_names:
                    c += 1
                    n, ext = bn.rpartition('.')[0::2]
                    new_name = posixpath.join(folder, '%s_%d.%s' % (n, c, ext))
                name_map[name] = new_name
                new_names.add(new_name)
    return name_map
|
||||
|
||||
|
||||
def remove_links_in_sheet(href_to_name, sheet, predicate):
    # Remove from ``sheet`` every @import rule and every url() value for
    # which ``predicate(name, href, None)`` is True. ``href_to_name``
    # resolves an href to a canonical container name. Returns True iff the
    # sheet was changed.
    import_rules_to_remove = []
    changed = False
    for i, r in enumerate(sheet):
        if r.type == r.IMPORT_RULE:
            name = href_to_name(r.href)
            if predicate(name, r.href, None):
                import_rules_to_remove.append(i)
    # Delete by index in reverse so earlier indices stay valid
    for i in sorted(import_rules_to_remove, reverse=True):
        sheet.deleteRule(i)
        changed = True

    for dec in iter_declarations(sheet):
        changed = remove_links_in_declaration(href_to_name, dec, predicate) or changed
    return changed
|
||||
|
||||
|
||||
def remove_links_in_declaration(href_to_name, style, predicate):
    # Remove from the declaration block ``style`` every url() value for
    # which ``predicate(name, uri, None)`` is True. Returns True iff any
    # value (or whole property) was removed.
    def check_pval(v):
        # Only URI-typed values can be links
        if v.type == v.URI:
            name = href_to_name(v.uri)
            return predicate(name, v.uri, None)
        return False

    changed = False

    for p in tuple(style.getProperties(all=True)):
        changed = remove_property_value(p, check_pval) or changed
    return changed
|
||||
|
||||
|
||||
def remove_links_to(container, predicate):
    ''' predicate must be a function that takes the arguments (name, href,
    fragment=None) and returns True iff the link should be removed '''
    # Scans every HTML and CSS file: matching <link>/<img> elements are
    # removed entirely, other matching link attributes are deleted, and
    # matching url()/@import references are stripped from both <style> tags,
    # style="" attributes and standalone stylesheets. Returns the set of
    # names that were changed (all marked dirty).
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    stylepath = XPath('//h:style')
    styleattrpath = XPath('//*[@style]')
    changed = set()
    for name, mt in iteritems(container.mime_map):
        removed = False
        if mt in OEB_DOCS:
            root = container.parsed(name)
            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
                hname = container.href_to_name(href, name)
                frag = href.partition('#')[-1]
                if predicate(hname, href, frag):
                    if attr is None:
                        # Link was in element text content
                        el.text = None
                    else:
                        if el.tag == XHTML('link') or el.tag == XHTML('img'):
                            extract(el)
                        else:
                            del el.attrib[attr]
                    removed = True
            for tag in stylepath(root):
                if tag.text and (tag.get('type') or 'text/css').lower() == 'text/css':
                    sheet = container.parse_css(tag.text)
                    if remove_links_in_sheet(partial(container.href_to_name, base=name), sheet, predicate):
                        tag.text = css_text(sheet)
                        removed = True
            for tag in styleattrpath(root):
                style = tag.get('style')
                if style:
                    style = container.parse_css(style, is_declaration=True)
                    if remove_links_in_declaration(partial(container.href_to_name, base=name), style, predicate):
                        removed = True
                        tag.set('style', css_text(style))
        elif mt in OEB_STYLES:
            removed = remove_links_in_sheet(partial(container.href_to_name, base=name), container.parsed(name), predicate)
        if removed:
            changed.add(name)
    tuple(map(container.dirty, changed))
    return changed
|
||||
|
||||
|
||||
def get_spine_order_for_all_files(container):
    '''Map every reachable name in the book to a position key
    ``(spine_position, link_index)``. Spine items (linear first, then
    non-linear) get ``link_index == -1``; a file outside the spine gets the
    position of the first spine item that links to it plus the index of
    that link.'''
    linear, non_linear = [], []
    for name, is_linear in container.spine_names:
        (linear if is_linear else non_linear).append(name)
    ordered = linear + non_linear
    in_spine = frozenset(ordered)
    positions = {}
    for spine_pos, name in enumerate(ordered):
        # setdefault keeps the first (earliest) position for duplicates
        positions.setdefault(name, (spine_pos, -1))
        for link_idx, href in enumerate(container.iterlinks(name, get_line_numbers=False)):
            target = container.href_to_name(href, name)
            if target not in in_spine:
                positions.setdefault(target, (spine_pos, link_idx))
    return positions
|
||||
@@ -0,0 +1,517 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import copy, os, re
|
||||
from polyglot.builtins import map, string_or_bytes, range
|
||||
|
||||
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
|
||||
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||
from calibre.ebooks.oeb.polish.toc import node_from_loc
|
||||
from calibre.ebooks.oeb.polish.replace import LinkRebaser
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
from polyglot.urllib import urlparse
|
||||
|
||||
|
||||
class AbortError(ValueError):
    # Error type used to abort the current operation.
    # NOTE(review): exact semantics inferred from the name; the raising and
    # handling sites are outside this view — confirm against callers.
    pass
|
||||
|
||||
|
||||
def in_table(node):
    '''Return True iff *node* is a <table> element (in any namespace) or has
    one among its ancestors.'''
    ancestor = node
    while ancestor is not None:
        if ancestor.tag.endswith('}table'):
            return True
        ancestor = ancestor.getparent()
    return False
|
||||
|
||||
|
||||
def adjust_split_point(split_point, log):
    '''
    Walk the split point up its ancestor chain as long as there is no content
    before it. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2 -- the split should happen before the whole div.
    '''
    candidate = split_point
    while True:
        parent = candidate.getparent()
        if parent is None:
            break
        if barename(parent.tag) in {'body', 'html'}:
            break
        # Stop if there is text, or a preceding sibling, before the candidate
        if parent.text and parent.text.strip():
            break
        if parent.index(candidate) > 0:
            break
        candidate = parent

    if candidate is not split_point:
        log.debug('Adjusted split point to ancestor')

    return candidate
|
||||
|
||||
|
||||
def get_body(root):
    '''Return the first <body> child of *root*, or None if there is none.'''
    body = root.find('h:body', namespaces=XPNSMAP)
    return body
|
||||
|
||||
|
||||
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    Works on two deep copies of the tree: in the first copy everything from
    the split point onwards is removed, in the second everything before it.

    :param split_point: The Element at which to split
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    # Remember the split point as an XPath so it can be located again in the
    # two copies of the tree made below
    path = tree.getpath(split_point)

    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index+1] = list(elem.iterchildren())

    # Tree 1: remove the split point (if splitting before) and everything
    # after it
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'

            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2: remove everything up to and including the split point, keeping
    # only the (emptied) ancestors of the split point
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx-1]
                        sib.tail = (sib.tail or '') + tail
            # Remove the element itself
            nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)

    body2.text = '\n'

    return tree, tree2
|
||||
|
||||
|
||||
class SplitLinkReplacer(object):
    '''
    Callable used with ``container.replace_links`` after a split: rewrites
    URLs that point at anchors which moved from the top file into the newly
    created bottom file. Sets :attr:`replaced` if any URL was changed.
    '''

    def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
        self.base = base
        self.bottom_anchors = bottom_anchors
        self.top_name = top_name
        self.bottom_name = bottom_name
        self.container = container
        self.replaced = False

    def __call__(self, url):
        # Fragment-only links stay within the current file
        if url and url.startswith('#'):
            return url
        name = self.container.href_to_name(url, self.base)
        if name != self.top_name:
            return url
        fragment = urlparse(url).fragment
        if fragment and fragment in self.bottom_anchors:
            new_base = self.container.name_to_href(self.bottom_name, self.base)
            url = new_base + '#' + fragment
            self.replaced = True
        return url
|
||||
|
||||
|
||||
def split(container, name, loc_or_xpath, before=True, totals=None):
    '''
    Split the file specified by name at the position specified by loc_or_xpath.
    Splitting automatically migrates all links and references to the affected
    files.

    :param loc_or_xpath: Should be an XPath expression such as
    //h:div[@id="split_here"]. Can also be a *loc* which is used internally to
    implement splitting in the preview panel.
    :param before: If True the split occurs before the identified element otherwise after it.
    :param totals: Used internally
    :return: The container name of the newly created bottom file
    '''

    root = container.parsed(name)
    # Locate the split point, either via XPath or via a preview-panel loc
    if isinstance(loc_or_xpath, unicode_type):
        split_point = root.xpath(loc_or_xpath)[0]
    else:
        try:
            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
        except MalformedMarkup:
            # The webkit HTML parser and the container parser have yielded
            # different node counts, this can happen if the file is valid XML
            # but contains constructs like nested <p> tags. So force parse it
            # with the HTML 5 parser and try again.
            raw = container.raw_data(name)
            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
            try:
                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
            except MalformedMarkup:
                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
                                        ' before splitting') % name)
            container.replace(name, root)
    if in_table(split_point):
        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
        raise AbortError('Cannot split on the <body> tag')
    tree1, tree2 = do_split(split_point, container.log, before=before)
    root1, root2 = tree1.getroot(), tree2.getroot()
    # The empty string is treated as an anchor in the top file so that bare
    # fragment-less links to this file keep pointing at the top half
    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
    anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))
    # Generate a unique name of the form base_splitN.ext for the bottom file
    base, ext = name.rpartition('.')[0::2]
    base = re.sub(r'_split\d+$', '', base)
    nname, s = None, 0
    while not nname or container.exists(nname):
        s += 1
        nname = '%s_split%d.%s' % (base, s, ext)
    manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
    bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)

    # Fix links in the split trees
    for r in (root1, root2):
        for a in r.xpath('//*[@href]'):
            url = a.get('href')
            if url.startswith('#'):
                fname = name
            else:
                fname = container.href_to_name(url, name)
            if fname == name:
                purl = urlparse(url)
                if purl.fragment in anchors_in_top:
                    if r is root2:
                        a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)
                elif purl.fragment in anchors_in_bottom:
                    if r is root1:
                        a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)

    # Fix all links in the container that point to anchors in the bottom tree
    for fname, media_type in iteritems(container.mime_map):
        if fname not in {name, bottom_name}:
            repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
            container.replace_links(fname, repl)

    container.replace(name, root1)
    container.replace(bottom_name, root2)

    # Insert the bottom file into the spine, right after the original file,
    # with the same linearity. Note that spine_item/linear are the loop
    # variables left over from the break below.
    spine = container.opf_xpath('//opf:spine')[0]
    for spine_item, spine_name, linear in container.spine_iter:
        if spine_name == name:
            break
    index = spine.index(spine_item) + 1

    si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
    if not linear:
        si.set('linear', 'no')
    container.insert_into_xml(spine, si, index=index)
    container.dirty(container.opf_name)
    return bottom_name
|
||||
|
||||
|
||||
def multisplit(container, name, xpath, before=True):
    '''
    Split the specified file at multiple locations (all tags that match the
    specified XPath expression). See also: :func:`split`. Splitting
    automatically migrates all links and references to the affected files.

    :param before: If True the splits occur before the identified element otherwise after it.
    :return: The list of names of the newly created files
    '''
    root = container.parsed(name)
    split_points = root.xpath(xpath, namespaces=XPNSMAP)
    if not split_points:
        raise AbortError(_('The expression %s did not match any nodes') % xpath)
    # Validate every split point up front, before mutating anything
    for node in split_points:
        if in_table(node):
            raise AbortError('Cannot split inside tables')
        if node.tag.endswith('}body'):
            raise AbortError('Cannot split on the <body> tag')

    # Mark each split point with a serial number so it can be found again
    # after the intermediate files are re-parsed
    for serial, node in enumerate(split_points):
        node.set('calibre-split-point', unicode_type(serial))

    all_names = [name]
    for serial in range(len(split_points)):
        newest = split(container, all_names[-1], '//*[@calibre-split-point="%d"]' % serial, before=before)
        all_names.append(newest)

    # Strip the marker attributes from every resulting file
    for fname in all_names:
        for node in container.parsed(fname).xpath('//*[@calibre-split-point]'):
            node.attrib.pop('calibre-split-point')
        container.dirty(fname)

    return all_names[1:]
|
||||
|
||||
|
||||
class MergeLinkReplacer(object):
    '''
    Callable used with ``container.replace_links`` after a merge: rewrites
    URLs that point at merged files so they target the corresponding
    (possibly renamed) anchors in the master file. Sets :attr:`replaced`
    if any URL was changed.
    '''

    def __init__(self, base, anchor_map, master, container):
        self.base = base
        self.anchor_map = anchor_map
        self.master = master
        self.container = container
        self.replaced = False

    def __call__(self, url):
        # Fragment-only links stay within the current file
        if url and url.startswith('#'):
            return url
        name = self.container.href_to_name(url, self.base)
        amap = self.anchor_map.get(name)
        if amap is None:
            return url
        frag = urlparse(url).fragment or ''
        # Map the old anchor to its renamed version; '' maps to the anchor
        # marking the start of the merged content
        frag = amap.get(frag, frag)
        self.replaced = True
        return self.container.name_to_href(self.master, self.base) + '#' + frag
|
||||
|
||||
|
||||
def add_text(body, text):
    '''
    Append *text* to the rendered content of *body*: as the tail of the last
    child if there is one, otherwise as the element's own text.
    '''
    if len(body) == 0:
        body.text = (body.text or '') + text
    else:
        last = body[-1]
        last.tail = (last.tail or '') + text
|
||||
|
||||
|
||||
def all_anchors(root):
    '''Return the set of all id and name attribute values in the tree.'''
    ids = root.xpath('//*/@id')
    names = root.xpath('//*/@name')
    return set(ids) | set(names)
|
||||
|
||||
|
||||
def all_stylesheets(container, name):
    '''
    Yield the container names of all CSS stylesheets linked from the <head>
    of the HTML file *name*.

    Bug fix: the original rebound the *name* parameter inside the loop, so
    from the second <link> onwards relative hrefs were resolved against the
    previously seen stylesheet's location instead of the HTML file's. All
    hrefs are now resolved relative to *name*.
    '''
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        sheet_name = container.href_to_name(link.get('href'), name)
        # Per the HTML spec an absent type attribute defaults to text/css
        if link.get('type', 'text/css') == 'text/css':
            yield sheet_name
|
||||
|
||||
|
||||
def unique_anchor(seen_anchors, current):
    '''
    Return *current* if it is not in *seen_anchors*, otherwise the first
    suffixed variant "current_N" (N = 1, 2, ...) that is unused.
    '''
    candidate, counter = current, 0
    while candidate in seen_anchors:
        counter += 1
        candidate = '%s_%d' % (current, counter)
    return candidate
|
||||
|
||||
|
||||
def remove_name_attributes(root):
    '''
    Convert legacy name attributes into id attributes. Elements that already
    have an id simply lose the name attribute; otherwise the name value
    becomes the id value.
    '''
    # Drop name wherever an id is already present
    for elem in root.xpath('//*[@id and @name]'):
        elem.attrib.pop('name')
    # Promote remaining name attributes to ids
    for elem in root.xpath('//*[@name]'):
        elem.set('id', elem.attrib.pop('name'))
|
||||
|
||||
|
||||
def merge_html(container, names, master, insert_page_breaks=False):
    '''
    Merge the HTML files in *names* into *master*, appending their body
    content, de-duplicating anchors and stylesheet links, and rewriting
    links throughout the container.

    :param insert_page_breaks: If True, a page-break-before style is added at
        the start of each merged file's content
    :return: A map of merged file name to the anchor marking where its
        content begins in the master file
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Per-file map of old anchor -> renamed anchor in the master
    anchor_map = {n:{} for n in names if n != master}
    first_anchor_map = {}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Collect body content as a mix of text strings and child elements
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first element child, to hold the file's start anchor
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, string_or_bytes):
                break
        if isinstance(first_child, string_or_bytes):
            # body contained only text, no tags
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        # Rename any anchors that clash with ones already in the master
        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        # Guarantee an anchor marking the start of this file's content
        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        first_anchor_map[name] = first_child.get('id')

        if insert_page_breaks:
            first_child.set('style', first_child.get('style', '') + '; page-break-before: always')

        # The empty fragment maps to the start-of-content anchor
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        # Append the collected content to the master body
        for child in children:
            if isinstance(child, string_or_bytes):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in iteritems(container.mime_map):
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)

    return first_anchor_map
|
||||
|
||||
|
||||
def merge_css(container, names, master):
    '''
    Merge the CSS files in *names* into the stylesheet *master*, removing the
    merged files from the container and replacing <link> tags that pointed at
    them with a link to the master sheet.
    '''
    p = container.parsed
    msheet = p(master)
    master_base = os.path.dirname(master)
    merged = set()

    for name in names:
        if name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        sheet = p(name)

        # Remove charset rules (use an explicit loop instead of a side-effect
        # list comprehension; @charset is only meaningful at the top of a
        # stand-alone sheet)
        for r in [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]:
            sheet.deleteRule(sheet.cssRules.index(r))
        for rule in sheet.cssRules:
            msheet.add(rule)

        container.remove_item(name)
        merged.add(name)

    # Remove links to merged stylesheets in the html files, replacing with a
    # link to the master sheet
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            removed = False
            root = p(name)
            for link in XPath('//h:link[@href]')(root):
                q = container.href_to_name(link.get('href'), name)
                if q in merged:
                    container.remove_from_xml(link)
                    removed = True
            if removed:
                container.dirty(name)
            if removed and master not in set(all_stylesheets(container, name)):
                # The master sheet was not already linked, add a link to it
                head = root.find('h:head', namespaces=XPNSMAP)
                if head is not None:
                    link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
                    container.insert_into_xml(head, link)
|
||||
|
||||
|
||||
def merge(container, category, names, master):
    '''
    Merge the specified files into a single file, automatically migrating all
    links and references to the affected files. The files must all be either
    HTML files or CSS files.

    :param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
    :param names: The list of files to be merged
    :param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
    '''
    # Validate the request before touching the container
    if category not in {'text', 'styles'}:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file (%s) must be one of the files being merged' % master)

    merger = merge_html if category == 'text' else merge_css
    merger(container, names, master)

    container.dirty(master)
|
||||
Reference in New Issue
Block a user