Added epub write support

2020-04-13 12:46:37 +02:00
parent 9f18513787
commit 79cad46732
9 changed files with 3049 additions and 0 deletions
@@ -0,0 +1,389 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import defaultdict
+from functools import partial
+
+from css_parser.css import CSSRule, CSSStyleDeclaration
+from css_selectors import parse, SelectorSyntaxError
+
+from calibre import force_unicode
+from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text
+from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
+from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
+from calibre.utils.icu import numeric_sort_key
+from css_selectors import Select, SelectorError
+from polyglot.builtins import iteritems, itervalues, unicode_type, filter
+
+
+def filter_used_rules(rules, log, select):
+    ''' Yield the rules from ``rules`` that are *not* used, i.e. those for
+    which no selector matches anything according to ``select``.
+
+    A selector that raises SelectorError is conservatively treated as
+    matching. ``log`` is accepted but not used here. '''
+
+    def is_used(rule):
+        for sel in rule.selectorList:
+            try:
+                matched = select.has_matches(sel.selectorText)
+            except SelectorError:
+                # Cannot parse/execute this selector, be safe and assume it
+                # matches something
+                return True
+            if matched:
+                return True
+        return False
+
+    for candidate in rules:
+        if not is_used(candidate):
+            yield candidate
+
+
+def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
+    ''' Return the set of names in ``sheets`` that are reachable via @import
+    rules from the sheet called ``name`` (or from ``sheet`` when given).
+
+    Imports are followed recursively up to ``recursion_level`` levels deep.
+    ``name`` itself is never part of the result. '''
+    root_sheet = sheet if sheet else sheets[name]
+    found = set()
+    for import_rule in root_sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
+        if not import_rule.href:
+            continue
+        imported = container.href_to_name(import_rule.href, name)
+        if imported in sheets:
+            found.add(imported)
+    if recursion_level > 0:
+        # Iterate over a snapshot, as found grows while we recurse
+        for child in tuple(found):
+            found.update(get_imported_sheets(child, container, sheets, recursion_level=recursion_level-1))
+    found.discard(name)
+    return found
+
+
+def merge_declarations(first, second):
+    ' Set every property of ``second`` on ``first`` (``first`` is modified in place) '
+    incoming = second.getProperties()
+    for declaration in incoming:
+        first.setProperty(declaration)
+
+
+def merge_identical_selectors(sheet):
+    ''' Merge rules that have identical selectors. The declarations of later
+    rules are merged into the first rule with that selector text and the later
+    rules are removed from the sheet. Returns the number of removed rules. '''
+    by_selector = defaultdict(list)
+    for style_rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+        by_selector[style_rule.selectorText].append(style_rule)
+    duplicates = []
+    for group in itervalues(by_selector):
+        keeper, extras = group[0], group[1:]
+        for extra in extras:
+            merge_declarations(keeper.style, extra.style)
+            duplicates.append(extra)
+    for extra in duplicates:
+        sheet.cssRules.remove(extra)
+    return len(duplicates)
+
+
+def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
+    '''
+    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
+
+    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
+    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
+    :param merge_rules: If True, rules with identical selectors are merged.
+    :return: True if any rule was removed or merged, or any class was removed.
+    '''
+    report = report or (lambda x:x)
+
+    def safe_parse(name):
+        # Sheets whose parsing raises TypeError are skipped (mapped to None
+        # and filtered out below)
+        try:
+            return container.parsed(name)
+        except TypeError:
+            pass
+    sheets = {name:safe_parse(name) for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES}
+    sheets = {k:v for k, v in iteritems(sheets) if v is not None}
+    num_merged = 0
+    if merge_rules:
+        for name, sheet in iteritems(sheets):
+            num = merge_identical_selectors(sheet)
+            if num:
+                container.dirty(name)
+                num_merged += num
+    import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
+    if remove_unused_classes:
+        class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in iteritems(sheets)}
+    # For every external sheet, the style rules that have not (yet) been seen
+    # to match anything. Each HTML file that uses the sheet narrows this down.
+    style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in iteritems(sheets)}
+
+    num_of_removed_rules = num_of_removed_classes = 0
+
+    for name, mt in iteritems(container.mime_map):
+        if mt not in OEB_DOCS:
+            continue
+        root = container.parsed(name)
+        select = Select(root, ignore_inappropriate_pseudo_classes=True)
+        used_classes = set()
+        for style in root.xpath('//*[local-name()="style"]'):
+            if style.get('type', 'text/css') == 'text/css' and style.text:
+                sheet = container.parse_css(style.text)
+                # The inline sheet is parsed afresh from the tag text, so any
+                # modification must be serialized back into the tag
+                sheet_changed = False
+                if merge_rules:
+                    num = merge_identical_selectors(sheet)
+                    if num:
+                        num_merged += num
+                        sheet_changed = True
+                if remove_unused_classes:
+                    used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
+                imports = get_imported_sheets(name, container, sheets, sheet=sheet)
+                for imported_sheet in imports:
+                    style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
+                    if remove_unused_classes:
+                        used_classes |= class_map[imported_sheet]
+                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
+                unused_rules = tuple(filter_used_rules(rules, container.log, select))
+                if unused_rules:
+                    num_of_removed_rules += len(unused_rules)
+                    for r in unused_rules:
+                        sheet.cssRules.remove(r)
+                    sheet_changed = True
+                if sheet_changed:
+                    style.text = force_unicode(sheet.cssText, 'utf-8')
+                    pretty_script_or_style(container, style)
+                    container.dirty(name)
+
+        for link in root.xpath('//*[local-name()="link" and @href]'):
+            sname = container.href_to_name(link.get('href'), name)
+            if sname not in sheets:
+                continue
+            style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
+            if remove_unused_classes:
+                used_classes |= class_map[sname]
+
+            for iname in import_map[sname]:
+                style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
+                if remove_unused_classes:
+                    used_classes |= class_map[iname]
+
+        if remove_unused_classes:
+            for elem in root.xpath('//*[@class]'):
+                original_classes, classes = elem.get('class', '').split(), []
+                for x in original_classes:
+                    if icu_lower(x) in used_classes:
+                        classes.append(x)
+                if len(classes) != len(original_classes):
+                    if classes:
+                        elem.set('class', ' '.join(classes))
+                    else:
+                        del elem.attrib['class']
+                    num_of_removed_classes += len(original_classes) - len(classes)
+                    container.dirty(name)
+
+    # Whatever is still left in style_rules was not matched by any HTML file
+    for name, sheet in iteritems(sheets):
+        unused_rules = style_rules[name]
+        if unused_rules:
+            num_of_removed_rules += len(unused_rules)
+            for r in unused_rules:
+                sheet.cssRules.remove(r)
+            container.dirty(name)
+
+    num_changes = num_of_removed_rules + num_merged + num_of_removed_classes
+    if num_changes > 0:
+        if num_of_removed_rules > 0:
+            report(ngettext('Removed one unused CSS style rule', 'Removed {} unused CSS style rules',
+                            num_of_removed_rules).format(num_of_removed_rules))
+        if num_of_removed_classes > 0:
+            report(ngettext('Removed one unused class from the HTML', 'Removed {} unused classes from the HTML',
+                   num_of_removed_classes).format(num_of_removed_classes))
+        if num_merged > 0:
+            report(ngettext('Merged one CSS style rule', 'Merged {} CSS style rules',
+                            num_merged).format(num_merged))
+    if num_of_removed_rules == 0:
+        report(_('No unused CSS style rules found'))
+    if remove_unused_classes and num_of_removed_classes == 0:
+        report(_('No unused class attributes found'))
+    if merge_rules and num_merged == 0:
+        report(_('No style rules that could be merged found'))
+    return num_changes > 0
+
+
+def filter_declaration(style, properties=()):
+    ''' Remove the specified properties from the declaration ``style``.
+
+    Properties for which a normalizer exists are expanded via the normalizer;
+    if the expansion contains any of the properties to be removed, the
+    original property is removed and the remaining expanded properties are
+    set individually, unless the declaration already had them.
+
+    :param properties: Iterable of property names to remove.
+    :return: True if ``style`` was changed.
+    '''
+    # The default (and possibly a caller supplied value) is not a set, but
+    # set operations are needed below
+    if not isinstance(properties, (set, frozenset)):
+        properties = frozenset(properties)
+    changed = False
+    for name in properties:
+        if style.removeProperty(name) != '':
+            changed = True
+    all_props = set(style.keys())
+    for prop in style.getProperties():
+        n = normalizers.get(prop.name, None)
+        if n is not None:
+            normalized = n(prop.name, prop.propertyValue)
+            removed = properties.intersection(set(normalized))
+            if removed:
+                changed = True
+                style.removeProperty(prop.name)
+                for name in set(normalized) - removed - all_props:
+                    style.setProperty(name, normalized[name])
+    return changed
+
+
+def filter_sheet(sheet, properties=()):
+    ''' Apply filter_declaration() to every style rule in ``sheet`` and drop
+    the rules that end up with no declarations. Returns True if any rule was
+    changed. '''
+    from css_parser.css import CSSRule
+    emptied = []
+    modified = False
+    for style_rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+        if not filter_declaration(style_rule.style, properties):
+            continue
+        modified = True
+        if style_rule.style.length == 0:
+            emptied.append(style_rule)
+    for style_rule in emptied:
+        sheet.cssRules.remove(style_rule)
+    return modified
+
+
+def transform_inline_styles(container, name, transform_sheet, transform_style):
+    ''' Run ``transform_sheet`` over the contents of every text/css <style>
+    tag and ``transform_style`` over every style attribute in the file
+    ``name``, writing back whatever the callables report as changed.
+
+    :return: True if anything in the file was changed. '''
+    root = container.parsed(name)
+    modified = False
+    for tag in root.xpath('//*[local-name()="style"]'):
+        if not tag.text:
+            continue
+        if (tag.get('type') or 'text/css').lower() != 'text/css':
+            continue
+        sheet = container.parse_css(tag.text)
+        if transform_sheet(sheet):
+            modified = True
+            tag.text = force_unicode(sheet.cssText, 'utf-8')
+            pretty_script_or_style(container, tag)
+    for elem in root.xpath('//*[@style]'):
+        css = elem.get('style', None)
+        if not css:
+            continue
+        declaration = container.parse_css(css, is_declaration=True)
+        if not transform_style(declaration):
+            continue
+        modified = True
+        if declaration.length == 0:
+            # Nothing left, drop the attribute altogether
+            del elem.attrib['style']
+        else:
+            elem.set('style', force_unicode(declaration.getCssText(separator=' '), 'utf-8'))
+    return modified
+
+
+def transform_css(container, transform_sheet=None, transform_style=None, names=()):
+    ''' Apply the specified transformations to the CSS in the container.
+
+    :param transform_sheet: Callable that is passed a parsed stylesheet and
+        must return True if it changed the sheet. If None, stylesheets are
+        left unchanged.
+    :param transform_style: Callable that is passed the parsed declaration of
+        a style attribute and must return True if it changed the declaration.
+        If None, style attributes are left unchanged.
+    :param names: The files to process. Defaults to all HTML and CSS files in the book.
+    :return: True if at least one file was changed.
+    '''
+    def leave_unchanged(css):
+        return False
+
+    # Both callables default to None, calling None would raise a TypeError
+    if transform_sheet is None:
+        transform_sheet = leave_unchanged
+    if transform_style is None:
+        transform_style = leave_unchanged
+
+    if not names:
+        types = OEB_STYLES | OEB_DOCS
+        names = [name for name, mt in iteritems(container.mime_map) if mt in types]
+
+    doc_changed = False
+
+    for name in names:
+        mt = container.mime_map[name]
+        if mt in OEB_STYLES:
+            sheet = container.parsed(name)
+            if transform_sheet(sheet):
+                container.dirty(name)
+                doc_changed = True
+        elif mt in OEB_DOCS:
+            if transform_inline_styles(container, name, transform_sheet, transform_style):
+                container.dirty(name)
+                doc_changed = True
+
+    return doc_changed
+
+
+def filter_css(container, properties, names=()):
+    '''
+    Remove the specified CSS properties from all CSS rules in the book.
+
+    :param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
+    :param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
+    '''
+    normalized = normalize_filter_css(properties)
+    sheet_filter = partial(filter_sheet, properties=normalized)
+    declaration_filter = partial(filter_declaration, properties=normalized)
+    return transform_css(container, transform_sheet=sheet_filter, transform_style=declaration_filter, names=names)
+
+
+def _classes_in_selector(selector, classes):
+    ''' Walk the parsed selector object recursively (via its selector,
+    subselector and parsed_tree attributes) adding every class_name found to
+    the set ``classes``. '''
+    children = (getattr(selector, attr, None) for attr in ('selector', 'subselector', 'parsed_tree'))
+    for child in children:
+        if child is not None:
+            _classes_in_selector(child, classes)
+    class_name = getattr(selector, 'class_name', None)
+    if class_name is not None:
+        classes.add(class_name)
+
+
+def classes_in_selector(text):
+    ''' Return the set of class names used in the selector ``text``. If the
+    selector has a syntax error, whatever was collected before the error is
+    returned. '''
+    found = set()
+    try:
+        for parsed in parse(text):
+            _classes_in_selector(parsed, found)
+    except SelectorSyntaxError:
+        # Best effort: unparseable selectors simply contribute nothing more
+        pass
+    return found
+
+
+def classes_in_rule_list(css_rules):
+    ''' Return the set of class names used by the selectors of all style
+    rules in ``css_rules``, descending into rules that themselves have a
+    cssRules attribute. '''
+    found = set()
+    for entry in css_rules:
+        if entry.type == entry.STYLE_RULE:
+            found.update(classes_in_selector(entry.selectorText))
+            continue
+        if hasattr(entry, 'cssRules'):
+            found.update(classes_in_rule_list(entry.cssRules))
+    return found
+
+
+def iter_declarations(sheet_or_rule):
+    ''' Yield the declarations found in the argument: recursively for
+    anything that has cssRules, the ``style`` attribute for anything that has
+    one, or the argument itself if it is a CSSStyleDeclaration. '''
+    if hasattr(sheet_or_rule, 'cssRules'):
+        for child in sheet_or_rule.cssRules:
+            for declaration in iter_declarations(child):
+                yield declaration
+        return
+    if hasattr(sheet_or_rule, 'style'):
+        yield sheet_or_rule.style
+        return
+    if isinstance(sheet_or_rule, CSSStyleDeclaration):
+        yield sheet_or_rule
+
+
+def remove_property_value(prop, predicate):
+    ''' Remove the Values that match the predicate from this property. If all
+    values of the property would be removed, the property is removed from its
+    parent instead. Note that this means the property must have a parent (a
+    CSSStyleDeclaration). Returns True if at least one value matched. '''
+    matching = list(filter(predicate, prop.propertyValue))
+    if len(matching) != len(prop.propertyValue):
+        # Only some values go: rebuild the value text without them
+        remaining = css_text(prop.propertyValue)
+        for val in matching:
+            remaining = remaining.replace(css_text(val), '').strip()
+        prop.propertyValue.cssText = remaining
+    else:
+        prop.parent.removeProperty(prop.name)
+    return len(matching) > 0
+
+
+# Maps the rule types that sort_sheet() places first to their sort rank, the
+# rank being the position of the type in this tuple.
+RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
+
+
+def sort_sheet(container, sheet_or_text):
+    ''' Sort the rules in a stylesheet. Note that in the general case this can
+    change the effective styles, but for most common sheets, it should be safe.
+
+    :param sheet_or_text: Either a parsed stylesheet or the text of one, text
+        is parsed using ``container.parse_css()``.
+    :return: The sorted stylesheet object. As a side effect, the selectors
+        inside every style rule are sorted as well.
+    '''
+    sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, unicode_type) else sheet_or_text
+
+    def text_sort_key(x):
+        # None/empty values sort as the empty string
+        return numeric_sort_key(unicode_type(x or ''))
+
+    def selector_sort_key(x):
+        return (x.specificity, text_sort_key(x.selectorText))
+
+    def rule_sort_key(rule):
+        # Rule types listed in RULE_PRIORITIES come first, in that order, all
+        # other types share the next rank
+        primary = RULE_PRIORITIES.get(rule.type, len(RULE_PRIORITIES))
+        secondary = text_sort_key(getattr(rule, 'atkeyword', '') or '')
+        tertiary = None
+        if rule.type == CSSRule.STYLE_RULE:
+            # Style rules go after all other rule types
+            primary += 1
+            selectors = sorted(rule.selectorList, key=selector_sort_key)
+            tertiary = selector_sort_key(selectors[0])
+            # Side effect: the selectors of the rule are rewritten in sorted order
+            rule.selectorText = ', '.join(s.selectorText for s in selectors)
+        elif rule.type == CSSRule.FONT_FACE_RULE:
+            try:
+                tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
+            except Exception:
+                # Best effort, tertiary stays None if the family cannot be read
+                pass
+
+        return primary, secondary, tertiary
+    sheet.cssRules.sort(key=rule_sort_key)
+    return sheet
+
+
+def add_stylesheet_links(container, name, text):
+    ''' Parse ``text`` as the XHTML of the file ``name`` and append a
+    stylesheet <link> to its <head> for every stylesheet in the manifest.
+
+    :return: The serialized markup, or None if there is no <head> or there
+        are no stylesheets. '''
+    root = container.parse_xhtml(text, name)
+    heads = root.xpath('//*[local-name() = "head"]')
+    # The manifest is only consulted when there is a head to add links to
+    sheet_names = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES)) if heads else ()
+    if not sheet_names:
+        return
+    target = heads[0]
+    for sheet_name in sheet_names:
+        href = container.name_to_href(sheet_name, name)
+        target.append(target.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=href))
+    pretty_xml_tree(target)
+    return serialize(root, 'text/html')
@@ -0,0 +1,404 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import codecs, shutil, os, posixpath
+from polyglot.builtins import iteritems, itervalues, map
+from functools import partial
+from collections import Counter, defaultdict
+
+from calibre import sanitize_file_name
+from calibre.ebooks.chardet import strip_encoding_declarations
+from calibre.ebooks.oeb.base import css_text
+from calibre.ebooks.oeb.polish.css import iter_declarations, remove_property_value
+from calibre.ebooks.oeb.polish.utils import extract
+from polyglot.urllib import urlparse, urlunparse
+
+
+class LinkReplacer(object):
+    ''' Callable that maps a URL found in the file ``base`` to its replacement
+    according to ``link_map`` (old name to new name) and ``frag_map``
+    (callable ``(name, anchor)`` returning the new anchor). ``replaced`` is
+    set to True once any URL was actually changed. '''
+
+    def __init__(self, base, container, link_map, frag_map):
+        self.base, self.container = base, container
+        self.link_map, self.frag_map = link_map, frag_map
+        self.replaced = False
+
+    def __call__(self, url):
+        if url and url.startswith('#'):
+            # Link to an anchor inside base itself
+            anchor = url[1:]
+            new_anchor = self.frag_map(self.base, anchor)
+            if not new_anchor or new_anchor == anchor:
+                return url
+            self.replaced = True
+            return '#' + new_anchor
+        target = self.container.href_to_name(url, self.base)
+        new_target = self.link_map.get(target, None) if target else None
+        if not new_target:
+            return url
+        parsed = urlparse(url)
+        new_href = self.container.name_to_href(new_target, self.base)
+        if parsed.fragment:
+            new_frag = self.frag_map(target, parsed.fragment)
+            if new_frag:
+                new_href += '#%s'%new_frag
+        if new_href != url:
+            self.replaced = True
+        return new_href
+
+
+class IdReplacer(object):
+    ''' Callable that rewrites the fragment of a URL found in the file
+    ``base`` according to ``id_map``, a mapping of {name:{old_id:new_id}}.
+    ``replaced`` is set to True once any URL was actually changed. '''
+
+    def __init__(self, base, container, id_map):
+        self.base = base
+        self.container = container
+        self.id_map = id_map
+        self.replaced = False
+
+    def __call__(self, url):
+        if url and url.startswith('#'):
+            # Link to an id inside base itself
+            old_id = url[1:]
+            new_id = self.id_map.get(self.base, {}).get(old_id)
+            if new_id is None or new_id == old_id:
+                return url
+            self.replaced = True
+            return '#' + new_id
+        target = self.container.href_to_name(url, self.base)
+        if not target:
+            return url
+        ids_for_target = self.id_map.get(target)
+        if ids_for_target is None:
+            return url
+        parts = urlparse(url)
+        new_frag = ids_for_target.get(parts.fragment)
+        if new_frag is None:
+            return url
+        new_href = urlunparse(parts._replace(fragment=new_frag))
+        if new_href != url:
+            self.replaced = True
+        return new_href
+
+
+class LinkRebaser(object):
+    ''' Callable that recomputes a URL found in ``old_name`` so that it is
+    relative to ``new_name`` instead. Links to ``old_name`` itself become
+    links to ``new_name``. ``replaced`` is set to True once any URL was
+    actually changed. '''
+
+    def __init__(self, container, old_name, new_name):
+        self.container = container
+        self.old_name = old_name
+        self.new_name = new_name
+        self.replaced = False
+
+    def __call__(self, url):
+        if url and url.startswith('#'):
+            # Pure fragment links are independent of the file location
+            return url
+        fragment = urlparse(url).fragment
+        target = self.container.href_to_name(url, self.old_name)
+        if not target:
+            return url
+        if target == self.old_name:
+            target = self.new_name
+        rebased = self.container.name_to_href(target, self.new_name)
+        if fragment:
+            rebased += '#' + fragment
+        if rebased != url:
+            self.replaced = True
+        return rebased
+
+
+def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
+    '''
+    Replace links to files in the container. Will iterate over all files in the container and change the specified links in them.
+
+    :param link_map: A mapping of old canonical name to new canonical name. For example: :code:`{'images/old.png': 'images/new.png'}`
+    :param frag_map: A callable that takes two arguments ``(name, anchor)`` and
+        returns a new anchor. This is useful if you need to change the anchors in
+        HTML files. By default, it does nothing.
+    :param replace_in_opf: If False, links are not replaced in the OPF file.
+
+    '''
+    for name, media_type in iteritems(container.mime_map):
+        is_opf = name == container.opf_name
+        if is_opf and not replace_in_opf:
+            continue
+        container.replace_links(name, LinkReplacer(name, container, link_map, frag_map))
+
+
+def replace_ids(container, id_map):
+    '''
+    Replace all links in the container that pointed to the changed ids.
+    ``idref`` attributes in the OPF file are updated as well.
+
+    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
+    :return: True iff at least one link (or idref) was changed
+
+    '''
+    changed = False
+    for name, media_type in iteritems(container.mime_map):
+        repl = IdReplacer(name, container, id_map)
+        container.replace_links(name, repl)
+        if name == container.opf_name:
+            imap = id_map.get(name, {})
+            idrefs_changed = False
+            for item in container.opf_xpath('//*[@idref]'):
+                old_id = item.get('idref')
+                if old_id is not None:
+                    new_id = imap.get(old_id)
+                    if new_id is not None and new_id != old_id:
+                        item.set('idref', new_id)
+                        idrefs_changed = True
+            if idrefs_changed:
+                # idrefs are not links, so the link replacement above knows
+                # nothing about them: record the modification explicitly
+                container.dirty(name)
+                changed = True
+        if repl.replaced:
+            changed = True
+    return changed
+
+
+def smarten_punctuation(container, report):
+    ''' Smarten the punctuation in all spine items of the container.
+
+    :param report: A callable that takes a single argument. It is called with
+        a message for every file that was changed, or once with a message
+        saying that nothing was found.
+    :return: True if at least one file was changed.
+    '''
+    # Inside this function the name refers to the conversion routine, not to
+    # this function itself
+    from calibre.ebooks.conversion.preprocess import smarten_punctuation
+    smartened = False
+    for path in container.spine_items:
+        name = container.abspath_to_name(path)
+        changed = False
+        with container.open(name, 'r+b') as f:
+            html = container.decode(f.read())
+            newhtml = smarten_punctuation(html, container.log)
+            if newhtml != html:
+                changed = True
+                report(_('Smartened punctuation in: %s')%name)
+                newhtml = strip_encoding_declarations(newhtml)
+                # Overwrite the file in place as UTF-8 with a BOM
+                f.seek(0)
+                f.truncate()
+                f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))
+        if changed:
+            # Add an encoding declaration (it will be added automatically when
+            # serialized)
+            root = container.parsed(name)
+            for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
+                m.getparent().remove(m)
+            container.dirty(name)
+            smartened = True
+    if not smartened:
+        report(_('No punctuation that could be smartened found'))
+    return smartened
+
+
+def rename_files(container, file_map):
+    '''
+    Rename files in the container, automatically updating all links to them.
+
+    :param file_map: A mapping of old canonical name to new canonical name, for
+        example: :code:`{'text/chapter1.html': 'chapter1.html'}`.
+    '''
+    destinations = tuple(itervalues(file_map))
+    overlap = set(file_map).intersection(set(destinations))
+    if overlap:
+        raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
+    for name, dest in iteritems(file_map):
+        if not container.exists(dest):
+            continue
+        # A case change on an OS with a case insensitive file-system.
+        is_case_change = name != dest and name.lower() == dest.lower()
+        if not is_case_change:
+            raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
+    if len(destinations) != len(set(destinations)):
+        raise ValueError('Cannot rename, the set of destination files contains duplicates')
+    link_map = {}
+    for current_name, new_name in iteritems(file_map):
+        container.rename(current_name, new_name)
+        if new_name == container.opf_name:
+            continue  # OPF is handled by the container
+        link_map[current_name] = new_name
+    replace_links(container, link_map, replace_in_opf=True)
+
+
+def replace_file(container, name, path, basename, force_mt=None):
+    ''' Replace the contents of the file ``name`` in the container with the
+    contents of the file at ``path``, renaming it to ``basename`` (sanitized,
+    in the same folder as ``name``).
+
+    :param force_mt: If set, used as the media type of the renamed file
+        instead of the type guessed from the new name.
+    '''
+    dirname, base = name.rpartition('/')[0::2]
+    nname = sanitize_file_name(basename)
+    if dirname:
+        nname = dirname + '/' + nname
+    with open(path, 'rb') as src:
+        if name != nname:
+            # Find a name that is not in use yet by appending _1, _2, ...
+            count = 0
+            b, e = nname.rpartition('.')[0::2]
+            while container.exists(nname):
+                count += 1
+                nname = b + ('_%d.%s' % (count, e))
+            rename_files(container, {name:nname})
+            mt = force_mt or container.guess_type(nname)
+            container.mime_map[nname] = mt
+            # Update the media type of the matching manifest item(s)
+            for itemid, q in iteritems(container.manifest_id_map):
+                if q == nname:
+                    for item in container.opf_xpath('//opf:manifest/opf:item[@href and @id="%s"]' % itemid):
+                        item.set('media-type', mt)
+        # NOTE(review): the OPF is marked dirty even when no rename happened,
+        # possibly this was meant to be inside the if block above - confirm
+        container.dirty(container.opf_name)
+        with container.open(nname, 'wb') as dest:
+            shutil.copyfileobj(src, dest)
+
+
+def mt_to_category(container, mt):
+    ''' Map the media type ``mt`` to a category name: text, style, font, opf,
+    toc or, failing those, the part of the media type before the slash. '''
+    from calibre.ebooks.oeb.polish.utils import guess_type
+    from calibre.ebooks.oeb.polish.container import OEB_FONTS
+    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
+    if mt in OEB_DOCS:
+        return 'text'
+    if mt in OEB_STYLES:
+        return 'style'
+    if mt in OEB_FONTS:
+        return 'font'
+    if mt == guess_type('a.opf'):
+        return 'opf'
+    if mt == guess_type('a.ncx'):
+        return 'toc'
+    return mt.partition('/')[0]
+
+
+def get_recommended_folders(container, names):
+    ''' Return the folders that are recommended for the given filenames. The
+    recommendation is based on where the majority of files of the same type are
+    located in the container. If no files of a particular type are present, the
+    recommended folder is assumed to be the folder containing the OPF file.
+
+    :param names: Iterable of file names
+    :return: A mapping of every name in ``names`` to its recommended folder
+    '''
+    from calibre.ebooks.oeb.polish.utils import guess_type
+    counts = defaultdict(Counter)
+    for name, mt in iteritems(container.mime_map):
+        folder = name.rpartition('/')[0]
+        counts[mt_to_category(container, mt)][folder] += 1
+
+    # Counters are only ever created by the increment above, so each of them
+    # has at least one entry and most_common(1) cannot be empty
+    recommendations = {category:counter.most_common(1)[0][0] for category, counter in iteritems(counts)}
+    # Use .get() rather than indexing counts: indexing a defaultdict never
+    # raises KeyError, it would insert an empty Counter for a missing OPF
+    opf_folder = recommendations.get('opf', '')
+    return {n:recommendations.get(mt_to_category(container, guess_type(os.path.basename(n))), opf_folder) for n in names}
+
+
+def normalize_case(container, val):
+    ''' For every component of the /-separated path ``val``, if the folder
+    that would contain it holds an entry that differs from the component only
+    by case, use the name of that entry instead. '''
+
+    def safe_listdir(x):
+        try:
+            return os.listdir(x)
+        except EnvironmentError:
+            return ()
+
+    parts = val.split('/')
+    fixed = []
+    for idx, part in enumerate(parts):
+        abspath = container.name_to_abspath('/'.join(parts[:idx+1]))
+        lowered = part.lower()
+        matches = [c for c in safe_listdir(os.path.dirname(abspath)) if c != part and c.lower() == lowered]
+        fixed.append(matches[0] if matches else part)
+    return '/'.join(fixed)
+
+
+def rationalize_folders(container, folder_type_map):
+    ''' Compute the renames needed to move every file into the folder that
+    ``folder_type_map`` (category to folder) assigns to its category. Files in
+    META-INF/ are left alone and name clashes are avoided by appending a
+    counter. Note that the folders in ``folder_type_map`` are case normalized
+    in place.
+
+    :return: A mapping of current name to new name '''
+    all_names = set(container.mime_map)
+    new_names = set()
+    name_map = {}
+    for key in tuple(folder_type_map):
+        folder_type_map[key] = normalize_case(container, folder_type_map[key])
+    for name in all_names:
+        if name.startswith('META-INF/'):
+            continue
+        category = mt_to_category(container, container.mime_map[name])
+        folder = folder_type_map.get(category, None)
+        if folder is None:
+            continue
+        bn = posixpath.basename(name)
+        new_name = posixpath.join(folder, bn)
+        if new_name == name:
+            continue
+        stem, ext = bn.rpartition('.')[0::2]
+        counter = 0
+        while new_name in all_names or new_name in new_names:
+            counter += 1
+            new_name = posixpath.join(folder, '%s_%d.%s' % (stem, counter, ext))
+        name_map[name] = new_name
+        new_names.add(new_name)
+    return name_map
+
+
+def remove_links_in_sheet(href_to_name, sheet, predicate):
+    ''' Delete the @import rules and the URI values in ``sheet`` for which
+    ``predicate(name, href, None)`` is true. Returns True if the sheet was
+    changed. '''
+    doomed = [
+        idx for idx, rule in enumerate(sheet)
+        if rule.type == rule.IMPORT_RULE and predicate(href_to_name(rule.href), rule.href, None)
+    ]
+    # Delete from the end so that the remaining indices stay valid
+    for idx in reversed(doomed):
+        sheet.deleteRule(idx)
+    changed = bool(doomed)
+
+    for declaration in iter_declarations(sheet):
+        if remove_links_in_declaration(href_to_name, declaration, predicate):
+            changed = True
+    return changed
+
+
+def remove_links_in_declaration(href_to_name, style, predicate):
+    ''' Remove from every property in the declaration ``style`` the URI
+    values for which ``predicate(name, uri, None)`` is true. Returns True if
+    anything was removed. '''
+
+    def is_removable_uri(value):
+        if value.type != value.URI:
+            return False
+        return predicate(href_to_name(value.uri), value.uri, None)
+
+    # Build the full list first: every property must be processed
+    results = [remove_property_value(p, is_removable_uri) for p in tuple(style.getProperties(all=True))]
+    return any(results)
+
+
+def remove_links_to(container, predicate):
+    ''' predicate must be a function that takes the arguments (name, href,
+    fragment=None) and returns True iff the link should be removed
+
+    Links are looked for in the markup of HTML files, in their <style> tags
+    and style attributes and in stylesheets. All changed files are marked as
+    dirty.
+
+    :return: The set of names of the files that were changed '''
+    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
+    stylepath = XPath('//h:style')
+    styleattrpath = XPath('//*[@style]')
+    changed = set()
+    for name, mt in iteritems(container.mime_map):
+        removed = False
+        if mt in OEB_DOCS:
+            root = container.parsed(name)
+            # Links in CSS are handled separately below
+            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
+                hname = container.href_to_name(href, name)
+                frag = href.partition('#')[-1]
+                if predicate(hname, href, frag):
+                    if attr is None:
+                        # The link lives in the text of the element
+                        el.text = None
+                    else:
+                        # <link> and <img> are extracted entirely, other
+                        # elements only lose the linking attribute
+                        if el.tag == XHTML('link') or el.tag == XHTML('img'):
+                            extract(el)
+                        else:
+                            del el.attrib[attr]
+                    removed = True
+            for tag in stylepath(root):
+                if tag.text and (tag.get('type') or 'text/css').lower() == 'text/css':
+                    sheet = container.parse_css(tag.text)
+                    if remove_links_in_sheet(partial(container.href_to_name, base=name), sheet, predicate):
+                        tag.text = css_text(sheet)
+                        removed = True
+            for tag in styleattrpath(root):
+                style = tag.get('style')
+                if style:
+                    style = container.parse_css(style, is_declaration=True)
+                    if remove_links_in_declaration(partial(container.href_to_name, base=name), style, predicate):
+                        removed = True
+                        tag.set('style', css_text(style))
+        elif mt in OEB_STYLES:
+            removed = remove_links_in_sheet(partial(container.href_to_name, base=name), container.parsed(name), predicate)
+        if removed:
+            changed.add(name)
+    # tuple() forces the map so that dirty() is actually called for every name
+    tuple(map(container.dirty, changed))
+    return changed
+
+
+def get_spine_order_for_all_files(container):
+    ''' Return a mapping of name to ``(spine_position, link_index)``. Spine
+    items (linear ones first) get a link_index of -1, every other file gets
+    the position of the first spine item linking to it and the index of that
+    link in the spine item. '''
+    linear, non_linear = [], []
+    for name, is_linear in container.spine_names:
+        bucket = linear if is_linear else non_linear
+        bucket.append(name)
+    ordered = linear + non_linear
+    in_spine = frozenset(ordered)
+    order = {}
+    for pos, name in enumerate(ordered):
+        if name not in order:
+            order[name] = (pos, -1)
+        for idx, href in enumerate(container.iterlinks(name, get_line_numbers=False)):
+            linked = container.href_to_name(href, name)
+            # Only the first occurrence of a file counts
+            if linked not in in_spine and linked not in order:
+                order[linked] = (pos, idx)
+    return order
@@ -0,0 +1,517 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import copy, os, re
+from polyglot.builtins import map, string_or_bytes, range
+
+from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
+from calibre.ebooks.oeb.polish.errors import MalformedMarkup
+from calibre.ebooks.oeb.polish.toc import node_from_loc
+from calibre.ebooks.oeb.polish.replace import LinkRebaser
+from polyglot.builtins import iteritems, unicode_type
+from polyglot.urllib import urlparse
+
+
+class AbortError(ValueError):
+    '''
+    Raised by the split and merge functions in this module when the
+    requested operation cannot be performed, for example when asked to split
+    inside a table or on the <body> tag, or to merge fewer than two files.
+    '''
+    pass
+
+
+def in_table(node):
+    while node is not None:
+        if node.tag.endswith('}table'):
+            return True
+        node = node.getparent()
+    return False
+
+
+def adjust_split_point(split_point, log):
+    '''
+    Move the split point up its ancestor chain if it has no content
+    before it. This handles the common case:
+    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
+    h2.
+    '''
+    sp = split_point
+    while True:
+        parent = sp.getparent()
+        if (
+            parent is None or
+            barename(parent.tag) in {'body', 'html'} or
+            (parent.text and parent.text.strip()) or
+            parent.index(sp) > 0
+        ):
+            break
+        sp = parent
+
+    if sp is not split_point:
+        log.debug('Adjusted split point to ancestor')
+
+    return sp
+
+
+def get_body(root):
+    return root.find('h:body', namespaces=XPNSMAP)
+
+
+def do_split(split_point, log, before=True):
+    '''
+    Split tree into a *before* and an *after* tree at ``split_point``.
+
+    The tree containing ``split_point`` is deep-copied twice; the original
+    tree is not modified. In the first copy everything from the split point
+    onwards is removed, in the second copy everything before it is removed
+    (ancestors of the split point are kept, minus their leading text).
+
+    :param split_point: The Element at which to split
+    :param log: Logger, passed on to :func:`adjust_split_point`
+    :param before: If True tree is split before split_point, otherwise after split_point
+    :return: before_tree, after_tree
+    '''
+    if before:
+        # We cannot adjust for after since moving an after split point to a
+        # parent will cause breakage if the parent contains any content
+        # after the original split point
+        split_point = adjust_split_point(split_point, log)
+    tree         = split_point.getroottree()
+    # Remember where the split point is so it can be found again in the copies
+    path         = tree.getpath(split_point)
+
+    tree, tree2  = copy.deepcopy(tree), copy.deepcopy(tree)
+    root, root2  = tree.getroot(), tree2.getroot()
+    body, body2  = map(get_body, (root, root2))
+    split_point  = root.xpath(path)[0]
+    split_point2 = root2.xpath(path)[0]
+
+    def nix_element(elem, top=True):
+        # Remove elem unless top is False in which case replace elem by its
+        # children
+        parent = elem.getparent()
+        if top:
+            parent.remove(elem)
+        else:
+            index = parent.index(elem)
+            parent[index:index+1] = list(elem.iterchildren())
+
+    # Tree 1
+    # Walk a snapshot of the body in document order, removing every element
+    # encountered after the split point (and the split point itself when
+    # splitting before it)
+    hit_split_point = False
+    keep_descendants = False
+    split_point_descendants = frozenset(split_point.iterdescendants())
+    for elem in tuple(body.iterdescendants()):
+        if elem is split_point:
+            hit_split_point = True
+            if before:
+                nix_element(elem)
+            else:
+                # We want to keep the descendants of the split point in
+                # Tree 1
+                keep_descendants = True
+                # We want the split point element, but not its tail
+                elem.tail = '\n'
+
+            continue
+        if hit_split_point:
+            if keep_descendants:
+                if elem in split_point_descendants:
+                    # elem is a descendant keep it
+                    continue
+                else:
+                    # We are out of split_point, so prevent further set
+                    # lookups of split_point_descendants
+                    keep_descendants = False
+            nix_element(elem)
+
+    # Tree 2
+    # Walk a snapshot of the body in document order up to the split point,
+    # stripping everything that precedes it
+    ancestors = frozenset(XPath('ancestor::*')(split_point2))
+    for elem in tuple(body2.iterdescendants()):
+        if elem is split_point2:
+            if not before:
+                # Keep the split point element's tail, if it contains non-whitespace
+                # text
+                tail = elem.tail
+                if tail and not tail.isspace():
+                    parent = elem.getparent()
+                    idx = parent.index(elem)
+                    # Re-attach the tail to whatever precedes the split
+                    # point: the parent's text or the previous sibling's tail
+                    if idx == 0:
+                        parent.text = (parent.text or '') + tail
+                    else:
+                        sib = parent[idx-1]
+                        sib.tail = (sib.tail or '') + tail
+                # Remove the element itself
+                nix_element(elem)
+            break
+        if elem in ancestors:
+            # We have to preserve the ancestors as they could have CSS
+            # styles that are inherited/applicable, like font or
+            # width. So we only remove the text, if any.
+            elem.text = '\n'
+        else:
+            nix_element(elem, top=False)
+
+    # Drop any text at the very start of the second body
+    body2.text = '\n'
+
+    return tree, tree2
+
+
+class SplitLinkReplacer(object):
+
+    def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
+        self.bottom_anchors, self.bottom_name = bottom_anchors, bottom_name
+        self.container, self.top_name = container, top_name
+        self.base = base
+        self.replaced = False
+
+    def __call__(self, url):
+        if url and url.startswith('#'):
+            return url
+        name = self.container.href_to_name(url, self.base)
+        if name != self.top_name:
+            return url
+        purl = urlparse(url)
+        if purl.fragment and purl.fragment in self.bottom_anchors:
+            url = self.container.name_to_href(self.bottom_name, self.base) + '#' + purl.fragment
+            self.replaced = True
+        return url
+
+
+def split(container, name, loc_or_xpath, before=True, totals=None):
+    '''
+    Split the file specified by name at the position specified by loc_or_xpath.
+    Splitting automatically migrates all links and references to the affected
+    files.
+
+    The top part stays in ``name``. The bottom part is stored in a newly
+    generated file named ``<base>_split<N>.<ext>``, which is added to the
+    spine directly after ``name``.
+
+    :param loc_or_xpath: Should be an XPath expression such as
+        //h:div[@id="split_here"]. Can also be a *loc* which is used internally to
+        implement splitting in the preview panel.
+    :param before: If True the split occurs before the identified element otherwise after it.
+    :param totals: Used internally
+    :return: The name of the newly created file that holds the bottom part
+    :raises AbortError: If the split point is inside a table or is the <body> tag
+    :raises MalformedMarkup: If a *loc* cannot be resolved even after re-parsing
+        the file with the HTML 5 parser
+    '''
+
+    root = container.parsed(name)
+    if isinstance(loc_or_xpath, unicode_type):
+        split_point = root.xpath(loc_or_xpath)[0]
+    else:
+        try:
+            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
+        except MalformedMarkup:
+            # The webkit HTML parser and the container parser have yielded
+            # different node counts, this can happen if the file is valid XML
+            # but contains constructs like nested <p> tags. So force parse it
+            # with the HTML 5 parser and try again.
+            raw = container.raw_data(name)
+            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
+            try:
+                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
+            except MalformedMarkup:
+                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
+                                        ' before splitting') % name)
+            # The re-parsed tree is the one that was split on, so store it
+            container.replace(name, root)
+    if in_table(split_point):
+        raise AbortError('Cannot split inside tables')
+    if split_point.tag.endswith('}body'):
+        raise AbortError('Cannot split on the <body> tag')
+    tree1, tree2 = do_split(split_point, container.log, before=before)
+    root1, root2 = tree1.getroot(), tree2.getroot()
+    # The empty string stands for a link with no fragment, which is treated
+    # as pointing into the top file
+    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
+    anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))
+    base, ext = name.rpartition('.')[0::2]
+    # Strip an existing _splitN suffix so repeated splits do not stack suffixes
+    base = re.sub(r'_split\d+$', '', base)
+    nname, s = None, 0
+    # Find the first _splitN name that is not yet used in the container
+    while not nname or container.exists(nname):
+        s += 1
+        nname = '%s_split%d.%s' % (base, s, ext)
+    manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
+    bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)
+
+    # Fix links in the split trees
+    for r in (root1, root2):
+        for a in r.xpath('//*[@href]'):
+            url = a.get('href')
+            if url.startswith('#'):
+                fname = name
+            else:
+                fname = container.href_to_name(url, name)
+            if fname == name:
+                # Link into the file being split: point it at whichever half
+                # now contains the target anchor
+                purl = urlparse(url)
+                if purl.fragment in anchors_in_top:
+                    if r is root2:
+                        a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
+                    else:
+                        a.set('href', '#' + purl.fragment)
+                elif purl.fragment in anchors_in_bottom:
+                    if r is root1:
+                        a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
+                    else:
+                        a.set('href', '#' + purl.fragment)
+
+    # Fix all links in the container that point to anchors in the bottom tree
+    for fname, media_type in iteritems(container.mime_map):
+        if fname not in {name, bottom_name}:
+            repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
+            container.replace_links(fname, repl)
+
+    container.replace(name, root1)
+    container.replace(bottom_name, root2)
+
+    # Insert the new file into the spine right after the file that was split,
+    # carrying over its linear flag
+    spine = container.opf_xpath('//opf:spine')[0]
+    for spine_item, spine_name, linear in container.spine_iter:
+        if spine_name == name:
+            break
+    index = spine.index(spine_item) + 1
+
+    si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
+    if not linear:
+        si.set('linear', 'no')
+    container.insert_into_xml(spine, si, index=index)
+    container.dirty(container.opf_name)
+    return bottom_name
+
+
+def multisplit(container, name, xpath, before=True):
+    '''
+    Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`.
+    Splitting automatically migrates all links and references to the affected
+    files.
+
+    :param before: If True the splits occur before the identified element otherwise after it.
+    '''
+    root = container.parsed(name)
+    nodes = root.xpath(xpath, namespaces=XPNSMAP)
+    if not nodes:
+        raise AbortError(_('The expression %s did not match any nodes') % xpath)
+    for split_point in nodes:
+        if in_table(split_point):
+            raise AbortError('Cannot split inside tables')
+        if split_point.tag.endswith('}body'):
+            raise AbortError('Cannot split on the <body> tag')
+
+    for i, tag in enumerate(nodes):
+        tag.set('calibre-split-point', unicode_type(i))
+
+    current = name
+    all_names = [name]
+    for i in range(len(nodes)):
+        current = split(container, current, '//*[@calibre-split-point="%d"]' % i, before=before)
+        all_names.append(current)
+
+    for x in all_names:
+        for tag in container.parsed(x).xpath('//*[@calibre-split-point]'):
+            tag.attrib.pop('calibre-split-point')
+        container.dirty(x)
+
+    return all_names[1:]
+
+
+class MergeLinkReplacer(object):
+
+    def __init__(self, base, anchor_map, master, container):
+        self.container, self.anchor_map = container, anchor_map
+        self.master = master
+        self.base = base
+        self.replaced = False
+
+    def __call__(self, url):
+        if url and url.startswith('#'):
+            return url
+        name = self.container.href_to_name(url, self.base)
+        amap = self.anchor_map.get(name, None)
+        if amap is None:
+            return url
+        purl = urlparse(url)
+        frag = purl.fragment or ''
+        frag = amap.get(frag, frag)
+        url = self.container.name_to_href(self.master, self.base) + '#' + frag
+        self.replaced = True
+        return url
+
+
+def add_text(body, text):
+    if len(body) > 0:
+        body[-1].tail = (body[-1].tail or '') + text
+    else:
+        body.text = (body.text or '') + text
+
+
+def all_anchors(root):
+    return set(root.xpath('//*/@id')) | set(root.xpath('//*/@name'))
+
+
+def all_stylesheets(container, name):
+    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
+        name = container.href_to_name(link.get('href'), name)
+        typ = link.get('type', 'text/css')
+        if typ == 'text/css':
+            yield name
+
+
+def unique_anchor(seen_anchors, current):
+    c = 0
+    ans = current
+    while ans in seen_anchors:
+        c += 1
+        ans = '%s_%d' % (current, c)
+    return ans
+
+
+def remove_name_attributes(root):
+    # Remove all name attributes, replacing them with id attributes
+    for elem in root.xpath('//*[@id and @name]'):
+        del elem.attrib['name']
+    for elem in root.xpath('//*[@name]'):
+        elem.set('id', elem.attrib.pop('name'))
+
+
+def merge_html(container, names, master, insert_page_breaks=False):
+    '''
+    Merge the HTML files in names into master. The children of the <body>
+    tags of every other file are appended (as deep copies) to the last <body>
+    of master, stylesheet links missing from master are added to its <head>,
+    clashing anchors are renamed and the merged files are removed from the
+    container. Finally all links in the container that pointed to the merged
+    files are redirected to master.
+
+    :param names: The files to merge; master itself is skipped
+    :param master: The file that receives the merged content
+    :param insert_page_breaks: If True, ``page-break-before: always`` is
+        appended to the style attribute of the first element of every merged file
+    :return: A map of merged file name to the id of the first element that
+        came from that file
+    '''
+    p = container.parsed
+    root = p(master)
+
+    # Ensure master has a <head>
+    head = root.find('h:head', namespaces=XPNSMAP)
+    if head is None:
+        head = root.makeelement(XHTML('head'))
+        container.insert_into_xml(root, head, 0)
+
+    seen_anchors = all_anchors(root)
+    seen_stylesheets = set(all_stylesheets(container, master))
+    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
+    master_base = os.path.dirname(master)
+    # Per merged file: old anchor -> new anchor. The '' key is filled in
+    # below and maps links without a fragment to the file's first element.
+    anchor_map = {n:{} for n in names if n != master}
+    first_anchor_map = {}
+
+    for name in names:
+        if name == master:
+            continue
+        # Insert new stylesheets into master
+        for sheet in all_stylesheets(container, name):
+            if sheet not in seen_stylesheets:
+                seen_stylesheets.add(sheet)
+                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
+                container.insert_into_xml(head, link)
+
+        # Rebase links if master is in a different directory
+        if os.path.dirname(name) != master_base:
+            container.replace_links(name, LinkRebaser(container, name, master))
+
+        root = p(name)
+        # Collect the leading text and the child elements of every <body>
+        children = []
+        for body in p(name).findall('h:body', namespaces=XPNSMAP):
+            children.append(body.text if body.text and body.text.strip() else '\n\n')
+            children.extend(body)
+
+        # Find the first child that is an element rather than a string
+        first_child = ''
+        for first_child in children:
+            if not isinstance(first_child, string_or_bytes):
+                break
+        if isinstance(first_child, string_or_bytes):
+            # body contained only text, no tags
+            # NOTE(review): if the file has no <body> at all, children is
+            # empty and body is not bound by the loop above -- confirm that
+            # this cannot happen for files reaching this function
+            first_child = body.makeelement(XHTML('p'))
+            first_child.text, children[0] = children[0], first_child
+
+        amap = anchor_map[name]
+        remove_name_attributes(root)
+
+        # Rename ids that clash with ids already present in the merged result
+        for elem in root.xpath('//*[@id]'):
+            val = elem.get('id')
+            if not val:
+                continue
+            if val in seen_anchors:
+                nval = unique_anchor(seen_anchors, val)
+                elem.set('id', nval)
+                amap[val] = nval
+            else:
+                seen_anchors.add(val)
+
+        # The first element needs an id so that links to the merged file as
+        # a whole have something to point to
+        if 'id' not in first_child.attrib:
+            first_child.set('id', unique_anchor(seen_anchors, 'top'))
+            seen_anchors.add(first_child.get('id'))
+        first_anchor_map[name] = first_child.get('id')
+
+        if insert_page_breaks:
+            first_child.set('style', first_child.get('style', '') + '; page-break-before: always')
+
+        amap[''] = first_child.get('id')
+
+        # Fix links that point to local changed anchors
+        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
+            q = a.get('href')[1:]
+            if q in amap:
+                a.set('href', '#' + amap[q])
+
+        for child in children:
+            if isinstance(child, string_or_bytes):
+                add_text(master_body, child)
+            else:
+                master_body.append(copy.deepcopy(child))
+
+        container.remove_item(name, remove_from_guide=False)
+
+    # Fix all links in the container that point to merged files
+    for fname, media_type in iteritems(container.mime_map):
+        repl = MergeLinkReplacer(fname, anchor_map, master, container)
+        container.replace_links(fname, repl)
+
+    return first_anchor_map
+
+
+def merge_css(container, names, master):
+    p = container.parsed
+    msheet = p(master)
+    master_base = os.path.dirname(master)
+    merged = set()
+
+    for name in names:
+        if name == master:
+            continue
+        # Rebase links if master is in a different directory
+        if os.path.dirname(name) != master_base:
+            container.replace_links(name, LinkRebaser(container, name, master))
+
+        sheet = p(name)
+
+        # Remove charset rules
+        cr = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
+        [sheet.deleteRule(sheet.cssRules.index(r)) for r in cr]
+        for rule in sheet.cssRules:
+            msheet.add(rule)
+
+        container.remove_item(name)
+        merged.add(name)
+
+    # Remove links to merged stylesheets in the html files, replacing with a
+    # link to the master sheet
+    for name, mt in iteritems(container.mime_map):
+        if mt in OEB_DOCS:
+            removed = False
+            root = p(name)
+            for link in XPath('//h:link[@href]')(root):
+                q = container.href_to_name(link.get('href'), name)
+                if q in merged:
+                    container.remove_from_xml(link)
+                    removed = True
+            if removed:
+                container.dirty(name)
+            if removed and master not in set(all_stylesheets(container, name)):
+                head = root.find('h:head', namespaces=XPNSMAP)
+                if head is not None:
+                    link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
+                    container.insert_into_xml(head, link)
+
+
+def merge(container, category, names, master):
+    '''
+    Merge the specified files into a single file, automatically migrating all
+    links and references to the affected files. The file must all either be HTML or CSS files.
+
+    :param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
+    :param names: The list of files to be merged
+    :param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
+    '''
+    if category not in {'text', 'styles'}:
+        raise AbortError('Cannot merge files of type: %s' % category)
+    if len(names) < 2:
+        raise AbortError('Must specify at least two files to be merged')
+    if master not in names:
+        raise AbortError('The master file (%s) must be one of the files being merged' % master)
+
+    if category == 'text':
+        merge_html(container, names, master)
+    elif category == 'styles':
+        merge_css(container, names, master)
+
+    container.dirty(master)
@@ -0,0 +1,172 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import textwrap
+
+from calibre import guess_type
+from calibre.utils.imghdr import identify
+from calibre.utils.xml_parse import safe_xml_fromstring
+from polyglot.builtins import unicode_type
+from polyglot.urllib import unquote
+
+
+class CoverManager(object):
+
+    SVG_TEMPLATE = textwrap.dedent('''\
+        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+            <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+                <meta name="calibre:cover" content="true" />
+                <title>Cover</title>
+                <style type="text/css" title="override_css">
+                    @page {padding: 0pt; margin:0pt}
+                    body { text-align: center; padding:0pt; margin: 0pt; }
+                </style>
+            </head>
+            <body>
+                <div>
+                    <svg version="1.1" xmlns="http://www.w3.org/2000/svg"
+                        xmlns:xlink="http://www.w3.org/1999/xlink"
+                        width="100%%" height="100%%" viewBox="__viewbox__"
+                        preserveAspectRatio="__ar__">
+                        <image width="__width__" height="__height__" xlink:href="%s"/>
+                    </svg>
+                </div>
+            </body>
+        </html>
+        ''')
+
+    NONSVG_TEMPLATE = textwrap.dedent('''\
+        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+            <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+                <meta name="calibre:cover" content="true" />
+                <title>Cover</title>
+                <style type="text/css" title="override_css">
+                    @page {padding: 0pt; margin:0pt}
+                    body { text-align: center; padding:0pt; margin: 0pt }
+                    div { padding:0pt; margin: 0pt }
+                    img { padding:0pt; margin: 0pt }
+                </style>
+            </head>
+            <body>
+                <div>
+                    <img src="%s" alt="cover" __style__ />
+                </div>
+            </body>
+        </html>
+    ''')
+
+    def __init__(self, no_default_cover=False, no_svg_cover=False,
+            preserve_aspect_ratio=False, fixed_size=None):
+        self.no_default_cover = no_default_cover
+        self.no_svg_cover = no_svg_cover
+        self.preserve_aspect_ratio = preserve_aspect_ratio
+
+        ar = 'xMidYMid meet' if preserve_aspect_ratio else 'none'
+        self.svg_template = self.SVG_TEMPLATE.replace('__ar__', ar)
+
+        if fixed_size is None:
+            style = 'style="height: 100%%"'
+        else:
+            width, height = fixed_size
+            style = 'style="height: %s; width: %s"'%(height, width)
+        self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__',
+                style)
+
+    def __call__(self, oeb, opts, log):
+        self.oeb = oeb
+        self.log = log
+        self.insert_cover()
+
+    def default_cover(self):
+        '''
+        Create a generic cover for books that dont have a cover
+        '''
+        if self.no_default_cover:
+            return None
+        self.log('Generating default cover')
+        m = self.oeb.metadata
+        title = unicode_type(m.title[0])
+        authors = [unicode_type(x) for x in m.creator if x.role == 'aut']
+        try:
+            from calibre.ebooks.covers import create_cover
+            series = series_index = None
+            if m.series:
+                try:
+                    series, series_index = unicode_type(m.series[0]), m.series_index[0]
+                except IndexError:
+                    pass
+            img_data = create_cover(title, authors, series, series_index)
+            id, href = self.oeb.manifest.generate('cover',
+                    'cover_image.jpg')
+            item = self.oeb.manifest.add(id, href, guess_type('t.jpg')[0],
+                        data=img_data)
+            m.clear('cover')
+            m.add('cover', item.id)
+
+            return item.href
+        except:
+            self.log.exception('Failed to generate default cover')
+        return None
+
+    def inspect_cover(self, href):
+        from calibre.ebooks.oeb.base import urlnormalize
+        for x in self.oeb.manifest:
+            if x.href == urlnormalize(href):
+                try:
+                    raw = x.data
+                    return identify(raw)[1:]
+                except Exception:
+                    self.log.exception('Failed to read cover image dimensions')
+        return -1, -1
+
+    def insert_cover(self):
+        from calibre.ebooks.oeb.base import urldefrag
+        g, m = self.oeb.guide, self.oeb.manifest
+        item = None
+        if 'titlepage' not in g:
+            if 'cover' in g:
+                href = g['cover'].href
+            else:
+                href = self.default_cover()
+            if href is None:
+                return
+            width, height = self.inspect_cover(href)
+            if width == -1 or height == -1:
+                self.log.warning('Failed to read cover dimensions')
+                width, height = 600, 800
+            # if self.preserve_aspect_ratio:
+            #    width, height = 600, 800
+            self.svg_template = self.svg_template.replace('__viewbox__',
+                    '0 0 %d %d'%(width, height))
+            self.svg_template = self.svg_template.replace('__width__',
+                    unicode_type(width))
+            self.svg_template = self.svg_template.replace('__height__',
+                    unicode_type(height))
+
+            if href is not None:
+                templ = self.non_svg_template if self.no_svg_cover \
+                        else self.svg_template
+                tp = templ%unquote(href)
+                id, href = m.generate('titlepage', 'titlepage.xhtml')
+                item = m.add(id, href, guess_type('t.xhtml')[0],
+                        data=safe_xml_fromstring(tp))
+        else:
+            item = self.oeb.manifest.hrefs[
+                    urldefrag(self.oeb.guide['titlepage'].href)[0]]
+        if item is not None:
+            self.oeb.spine.insert(0, item, True)
+            if 'cover' not in self.oeb.guide.refs:
+                self.oeb.guide.add('cover', 'Title Page', 'a')
+            self.oeb.guide.refs['cover'].href = item.href
+            if 'titlepage' in self.oeb.guide.refs:
+                self.oeb.guide.refs['titlepage'].href = item.href
+            titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
+            if titem is not None:
+                titem.href = item.href
@@ -0,0 +1,187 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import posixpath
+
+from lxml import etree
+
+from calibre.ebooks.oeb.base import rewrite_links, urlnormalize
+from polyglot.urllib import urldefrag, urlparse
+
+
+class RenameFiles(object):  # {{{
+
+    '''
+    Rename files and adjust all links pointing to them. Note that the spine
+    and manifest are not touched by this transform.
+    '''
+
+    def __init__(self, rename_map, renamed_items_map=None):
+        self.rename_map = rename_map
+        self.renamed_items_map = renamed_items_map
+
+    def __call__(self, oeb, opts):
+        import css_parser
+        self.log = oeb.logger
+        self.opts = opts
+        self.oeb = oeb
+
+        for item in oeb.manifest.items:
+            self.current_item = item
+            if etree.iselement(item.data):
+                rewrite_links(self.current_item.data, self.url_replacer)
+            elif hasattr(item.data, 'cssText'):
+                css_parser.replaceUrls(item.data, self.url_replacer)
+
+        if self.oeb.guide:
+            for ref in self.oeb.guide.values():
+                href = urlnormalize(ref.href)
+                href, frag = urldefrag(href)
+                replacement = self.rename_map.get(href, None)
+                if replacement is not None:
+                    nhref = replacement
+                    if frag:
+                        nhref += '#' + frag
+                    ref.href = nhref
+
+        if self.oeb.toc:
+            self.fix_toc_entry(self.oeb.toc)
+
+    def fix_toc_entry(self, toc):
+        if toc.href:
+            href = urlnormalize(toc.href)
+            href, frag = urldefrag(href)
+            replacement = self.rename_map.get(href, None)
+
+            if replacement is not None:
+                nhref = replacement
+                if frag:
+                    nhref = '#'.join((nhref, frag))
+                toc.href = nhref
+
+        for x in toc:
+            self.fix_toc_entry(x)
+
+    def url_replacer(self, orig_url):
+        url = urlnormalize(orig_url)
+        parts = urlparse(url)
+        if parts.scheme:
+            # Only rewrite local URLs
+            return orig_url
+        path, frag = urldefrag(url)
+        if self.renamed_items_map:
+            orig_item = self.renamed_items_map.get(self.current_item.href, self.current_item)
+        else:
+            orig_item = self.current_item
+
+        href = orig_item.abshref(path)
+        replacement = self.current_item.relhref(self.rename_map.get(href, href))
+        if frag:
+            replacement += '#' + frag
+        return replacement
+
+# }}}
+
+
+class UniqueFilenames(object):  # {{{
+
+    'Ensure that every item in the manifest has a unique filename'
+
+    def __call__(self, oeb, opts):
+        self.log = oeb.logger
+        self.opts = opts
+        self.oeb = oeb
+
+        self.seen_filenames = set()
+        self.rename_map = {}
+
+        for item in list(oeb.manifest.items):
+            fname = posixpath.basename(item.href)
+            if fname in self.seen_filenames:
+                suffix = self.unique_suffix(fname)
+                data = item.data
+                base, ext = posixpath.splitext(item.href)
+                nhref = base + suffix + ext
+                nhref = oeb.manifest.generate(href=nhref)[1]
+                spine_pos = item.spine_position
+                oeb.manifest.remove(item)
+                nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data,
+                        fallback=item.fallback)
+                self.seen_filenames.add(posixpath.basename(nhref))
+                self.rename_map[item.href] = nhref
+                if spine_pos is not None:
+                    oeb.spine.insert(spine_pos, nitem, item.linear)
+            else:
+                self.seen_filenames.add(fname)
+
+        if self.rename_map:
+            self.log('Found non-unique filenames, renaming to support broken'
+                    ' EPUB readers like FBReader, Aldiko and Stanza...')
+            from pprint import pformat
+            self.log.debug(pformat(self.rename_map))
+
+            renamer = RenameFiles(self.rename_map)
+            renamer(oeb, opts)
+
+    def unique_suffix(self, fname):
+        base, ext = posixpath.splitext(fname)
+        c = 0
+        while True:
+            c += 1
+            suffix = '_u%d'%c
+            candidate = base + suffix + ext
+            if candidate not in self.seen_filenames:
+                return suffix
+# }}}
+
+
+class FlatFilenames(object):  # {{{
+
+    'Ensure that every item in the manifest has a unique filename without subdirectories.'
+
+    def __call__(self, oeb, opts):
+        self.log = oeb.logger
+        self.opts = opts
+        self.oeb = oeb
+
+        self.rename_map = {}
+        self.renamed_items_map = {}
+
+        for item in list(oeb.manifest.items):
+            # Flatten URL by removing directories.
+            # Example: a/b/c/index.html -> a_b_c_index.html
+            nhref = item.href.replace("/", "_")
+
+            if item.href == nhref:
+                # URL hasn't changed, skip item.
+                continue
+
+            data = item.data
+            isp = item.spine_position
+            nhref = oeb.manifest.generate(href=nhref)[1]
+            if isp is not None:
+                oeb.spine.remove(item)
+            oeb.manifest.remove(item)
+
+            nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data,
+                                     fallback=item.fallback)
+            self.rename_map[item.href] = nhref
+            self.renamed_items_map[nhref] = item
+            if isp is not None:
+                oeb.spine.insert(isp, nitem, item.linear)
+
+        if self.rename_map:
+            self.log('Found non-flat filenames, renaming to support broken'
+                    ' EPUB readers like FBReader...')
+            from pprint import pformat
+            self.log.debug(pformat(self.rename_map))
+            self.log.debug(pformat(self.renamed_items_map))
+
+            renamer = RenameFiles(self.rename_map, self.renamed_items_map)
+            renamer(oeb, opts)
+# }}}
@@ -0,0 +1,81 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre import fit_image
+
+
+class RescaleImages(object):
+
+    'Rescale all images to fit inside given screen size'
+
+    def __init__(self, check_colorspaces=False):
+        # When True, images in the CMYK colorspace are converted to RGB
+        self.check_colorspaces = check_colorspaces
+
+    def __call__(self, oeb, opts):
+        self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        self.rescale()
+
+    def rescale(self):
+        '''
+        Shrink every raster image in the manifest so that it fits on the
+        output page and, if ``check_colorspaces`` is set, convert CMYK images
+        to RGB. An image is re-encoded and written back to its manifest item
+        only if it was actually resized or converted.
+        '''
+        from PIL import Image
+        from io import BytesIO
+
+        is_image_collection = getattr(self.opts, 'is_image_collection', False)
+
+        if is_image_collection:
+            page_width, page_height = self.opts.dest.comic_screen_size
+        else:
+            page_width, page_height = self.opts.dest.width, self.opts.dest.height
+            # The margins are scaled by dpi/72 before being subtracted
+            # (presumably they are in pts and the page size in pixels)
+            page_width -= (self.opts.margin_left + self.opts.margin_right) * self.opts.dest.dpi/72
+            page_height -= (self.opts.margin_top + self.opts.margin_bottom) * self.opts.dest.dpi/72
+
+        for item in self.oeb.manifest:
+            if not item.media_type.startswith('image'):
+                continue
+            # Output format for PIL, anything that is not PNG/JPEG/GIF is
+            # saved as JPEG
+            ext = item.media_type.split('/')[-1].upper()
+            if ext == 'JPG':
+                ext = 'JPEG'
+            if ext not in ('PNG', 'JPEG', 'GIF'):
+                ext = 'JPEG'
+
+            raw = item.data
+            if hasattr(raw, 'xpath') or not raw:
+                # Probably an svg image
+                continue
+            try:
+                img = Image.open(BytesIO(raw))
+            except Exception:
+                continue
+            width, height = img.size
+
+            converted = False
+            try:
+                if self.check_colorspaces and img.mode == 'CMYK':
+                    self.log.warn(
+                        'The image %s is in the CMYK colorspace, converting it '
+                        'to RGB as Adobe Digital Editions cannot display CMYK' % item.href)
+                    img = img.convert('RGB')
+                    converted = True
+            except Exception:
+                self.log.exception('Failed to convert image %s from CMYK to RGB' % item.href)
+
+            scaled, new_width, new_height = fit_image(width, height, page_width, page_height)
+            if scaled:
+                new_width = max(1, new_width)
+                new_height = max(1, new_height)
+                self.log('Rescaling image from %dx%d to %dx%d'%(
+                    width, height, new_width, new_height), item.href)
+                try:
+                    img = img.resize((new_width, new_height))
+                except Exception:
+                    self.log.exception('Failed to rescale image: %s' % item.href)
+                    # img is unchanged, a colorspace conversion (if any) can
+                    # still be saved below
+                    scaled = False
+
+            if not (scaled or converted):
+                continue
+
+            # Save the image even if it was only converted, otherwise the
+            # conversion to RGB announced above would be silently discarded
+            buf = BytesIO()
+            try:
+                img.save(buf, ext)
+            except Exception:
+                if scaled:
+                    self.log.exception('Failed to rescale image: %s' % item.href)
+                else:
+                    self.log.exception('Failed to save converted image: %s' % item.href)
+            else:
+                item.data = buf.getvalue()
+                item.unload_data_from_memory()
@@ -0,0 +1,488 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
+forced at "likely" locations to conform to size limitations. This transform
+assumes a prior call to the flatcss transform.
+'''
+
+import os, functools, collections, re, copy
+from collections import OrderedDict
+
+from lxml.etree import XPath as _XPath
+from lxml import etree
+
+from calibre import as_unicode, force_unicode
+from calibre.ebooks.epub import rules
+from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
+        urldefrag, rewrite_links, XHTML, urlnormalize)
+from calibre.ebooks.oeb.polish.split import do_split
+from polyglot.builtins import iteritems, range, map, unicode_type
+from polyglot.urllib import unquote
+from css_selectors import Select, SelectorError
+
+# XPath constructor with the OEB namespace prefix map pre-bound, so that
+# expressions can use prefixes such as h: without passing namespaces each time
+XPath = functools.partial(_XPath, namespaces=NAMESPACES)
+
+# Name of the temporary attribute that FlowSplitter.find_split_point() sets to
+# '1' on elements it has already picked as split points. It is removed from
+# all elements again in FlowSplitter.commit()
+SPLIT_POINT_ATTR = 'csp'
+
+
+def tostring(root):
+    'Serialize ``root`` with lxml, using the utf-8 encoding'
+    serialized = etree.tostring(root, encoding='utf-8')
+    return serialized
+
+
+class SplitError(ValueError):
+
+    'Raised when no acceptable point at which to split a tree can be found'
+
+    def __init__(self, path, root):
+        # Size of the serialized tree in KB, it is formatted with %d and so
+        # shows up truncated to an integer in the message
+        size_in_kb = len(tostring(root))/1024.
+        template = _('Could not find reasonable point at which to split: '
+                '%(path)s Sub-tree size: %(size)d KB')
+        ValueError.__init__(self, template % {'path': path, 'size': size_in_kb})
+
+
+class Split(object):
+
+    '''
+    Transform that splits the XHTML files in the spine into smaller files, at
+    page breaks and/or wherever needed to respect ``max_flow_size``, and then
+    fixes the links that point into the files that were split.
+    '''
+
+    def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
+            max_flow_size=0, remove_css_pagebreaks=True):
+        '''
+        :param split_on_page_breaks: If True, split at the page breaks found by
+                                     :meth:`find_page_breaks`
+        :param page_breaks_xpath: Optional XPath expression, a page break is
+                                  placed after every element it matches. When
+                                  None, the page-break-before/after rules of
+                                  the stylesheets in the book are used instead
+        :param max_flow_size: Passed to FlowSplitter, which only splits by
+                              size when this is greater than zero
+        :param remove_css_pagebreaks: If True, the page-break properties that
+                                      were used to find page breaks are removed
+                                      from the stylesheet rules
+        '''
+        self.split_on_page_breaks = split_on_page_breaks
+        self.page_breaks_xpath = page_breaks_xpath
+        self.max_flow_size = max_flow_size
+        # Collection of (selector, before) tuples, None means "not yet read
+        # from the stylesheets"
+        self.page_break_selectors = None
+        self.remove_css_pagebreaks = remove_css_pagebreaks
+        if self.page_breaks_xpath is not None:
+            self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]
+
+    def __call__(self, oeb, opts):
+        self.oeb = oeb
+        self.log = oeb.log
+        self.log('Splitting markup on page breaks and flow limits, if any...')
+        self.opts = opts
+        # href of a split item -> anchor map of that item (anchor -> new href)
+        self.map = {}
+        # Iterate over a snapshot, as splitting adds and removes manifest items
+        for item in list(self.oeb.manifest.items):
+            if item.spine_position is not None and etree.iselement(item.data):
+                self.split_item(item)
+
+        self.fix_links()
+
+    def split_item(self, item):
+        '''
+        Split a single spine item and, if it was actually split, record its
+        anchor map in ``self.map`` for use by :meth:`fix_links`.
+        '''
+        page_breaks, page_break_ids = [], []
+        if self.split_on_page_breaks:
+            page_breaks, page_break_ids = self.find_page_breaks(item)
+
+        splitter = FlowSplitter(item, page_breaks, page_break_ids,
+                self.max_flow_size, self.oeb, self.opts)
+        if splitter.was_split:
+            am = splitter.anchor_map
+            # Copy the anchor map, keeping its default factory
+            self.map[item.href] = collections.defaultdict(
+                    am.default_factory, am)
+
+    @staticmethod
+    def _xpath_for_id(pb_id):
+        '''
+        Return a compiled XPath matching the element with the id ``pb_id``, or
+        None if the id cannot be safely embedded in an XPath string literal.
+        '''
+        # XPath 1.0 string literals have no escape mechanism, so use whichever
+        # quote character does not occur in the id
+        if '"' not in pb_id:
+            expr = '//*[@id="%s"]' % pb_id
+        elif "'" not in pb_id:
+            expr = "//*[@id='%s']" % pb_id
+        else:
+            return None
+        try:
+            return XPath(expr)
+        except Exception:
+            # The id contains something else that lxml refuses to compile
+            return None
+
+    def find_page_breaks(self, item):
+        '''
+        Find the elements in ``item`` at which a page break should occur.
+
+        Every page break element is given an id, if it does not already have
+        a usable one.
+
+        :return: ``(page_breaks, page_break_ids)``. ``page_breaks`` is a list
+                 of ``(xpath, before)`` tuples in document order, where
+                 ``xpath`` matches the page break element by id and ``before``
+                 is True if the break is before the element.
+                 ``page_break_ids`` is the list of the corresponding ids.
+        '''
+        if self.page_break_selectors is None:
+            # Read the selectors from the stylesheets, this is done only once
+            self.page_break_selectors = set()
+            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
+                    OEB_STYLES]
+            for rule in rules(stylesheets):
+                for prop, is_before in (
+                        ('page-break-before', True), ('page-break-after', False)):
+                    val = force_unicode(getattr(rule.style.getPropertyCSSValue(
+                        prop), 'cssText', '').strip().lower())
+                    try:
+                        if val and val not in {'avoid', 'auto', 'inherit'}:
+                            self.page_break_selectors.add((rule.selectorText, is_before))
+                            if self.remove_css_pagebreaks:
+                                rule.style.removeProperty(prop)
+                    except Exception:
+                        # Best effort, a rule that cannot be processed is
+                        # simply not used for splitting
+                        pass
+        if not self.page_break_selectors:
+            return [], []
+        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
+        if not body:
+            return [], []
+        # Only elements inside <body> can be page breaks
+        descendants = frozenset(body[0].iterdescendants('*'))
+        select = Select(item.data)
+        page_breaks = set()
+
+        for selector, before in self.page_break_selectors:
+            try:
+                if callable(selector):
+                    # A compiled XPath (from page_breaks_xpath), evaluate it
+                    # directly as it is not a CSS selector
+                    matches = selector(item.data)
+                    if not isinstance(matches, list):
+                        matches = ()
+                else:
+                    matches = select(selector)
+                for elem in matches:
+                    if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
+                        elem.set('pb_before', '1' if before else '0')
+                        page_breaks.add(elem)
+            except SelectorError as err:
+                self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))
+
+        # Temporarily number all elements, to sort the page breaks into
+        # document order
+        for i, elem in enumerate(item.data.iter('*')):
+            try:
+                elem.set('pb_order', unicode_type(i))
+            except TypeError:  # Cant set attributes on comment nodes etc.
+                continue
+
+        page_breaks = list(page_breaks)
+        page_breaks.sort(key=lambda x:int(x.get('pb_order')))
+        page_break_ids, page_breaks_ = [], []
+        for i, x in enumerate(page_breaks):
+            x.set('id', x.get('id', 'calibre_pb_%d'%i))
+            pb_id = x.get('id')
+            xp = self._xpath_for_id(pb_id)
+            if xp is None:
+                # The id has both a quote and an apostrophe or some other
+                # problem. Just replace it since I doubt its going to work
+                # anywhere else either
+                pb_id = 'calibre_pb_%d'%i
+                x.set('id', pb_id)
+                xp = XPath('//*[@id="%s"]'%pb_id)
+            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
+            page_break_ids.append(pb_id)
+
+        # Remove the temporary attributes again
+        for elem in item.data.iter(etree.Element):
+            elem.attrib.pop('pb_order', False)
+            elem.attrib.pop('pb_before', False)
+
+        return page_breaks_, page_break_ids
+
+    def fix_links(self):
+        '''
+        Fix references to the split files in other content files.
+        '''
+        for item in self.oeb.manifest:
+            if etree.iselement(item.data):
+                # rewrite_links() resolves URLs relative to this item
+                self.current_item = item
+                rewrite_links(item.data, self.rewrite_links)
+
+    def rewrite_links(self, url):
+        '''
+        Link rewriting callback used by :meth:`fix_links`. Returns the
+        replacement for ``url``, which is ``url`` itself unless it points into
+        a file that was split.
+        '''
+        href, frag = urldefrag(url)
+        try:
+            href = self.current_item.abshref(href)
+        except ValueError:
+            # Unparseable URL
+            return url
+        try:
+            href = urlnormalize(href)
+        except ValueError:
+            # href has non utf-8 quoting
+            return url
+        if href in self.map:
+            anchor_map = self.map[href]
+            # The None key yields the default, which is the first split file
+            nhref = anchor_map[frag if frag else None]
+            nhref = self.current_item.relhref(nhref)
+            if frag:
+                nhref = '#'.join((unquote(nhref), frag))
+
+            return nhref
+        return url
+
+
+class FlowSplitter(object):
+    'The actual splitting logic'
+
+    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
+            opts):
+        '''
+        Split ``item`` and commit the result to ``oeb``. All the work is done
+        in the constructor, afterwards ``was_split`` tells if the item was
+        split and, if so, ``anchor_map`` maps anchors to the new files.
+
+        :param item: The manifest item to split, its data is a parsed tree
+        :param page_breaks: List of ``(xpath, before)`` tuples, one per page
+                            break
+        :param page_break_ids: The ids of the page break elements, in the same
+                               order as ``page_breaks``
+        :param max_flow_size: Maximum size in bytes of a serialized tree, no
+                              splitting by size is done unless it is > 0
+        :param oeb: The book, its log, manifest, spine, guide, toc and pages
+                    are used
+        :param opts: Only stored on the instance
+        '''
+        self.item           = item
+        self.oeb            = oeb
+        self.opts           = opts
+        self.log            = oeb.log
+        self.page_breaks    = page_breaks
+        self.page_break_ids = page_break_ids
+        self.max_flow_size  = max_flow_size
+        self.base           = item.href
+        self.csp_counter    = 0
+
+        base, ext = os.path.splitext(self.base)
+        # self.base is a format string for the names of the split files, so
+        # literal percent signs in the href have to be doubled
+        self.base = base.replace('%', '%%')+'_split_%.3d'+ext.replace('%', '%%')
+
+        self.trees = [self.item.data.getroottree()]
+        self.splitting_on_page_breaks = True
+        if self.page_breaks:
+            self.split_on_page_breaks(self.trees[0])
+        self.splitting_on_page_breaks = False
+
+        if self.max_flow_size > 0:
+            lt_found = False
+            self.log('\tLooking for large trees in %s...'%item.href)
+            trees = list(self.trees)
+            # Maps a tree that was too large to the trees it was split into
+            self.tree_map = {}
+            for i, tree in enumerate(trees):
+                size = len(tostring(tree.getroot()))
+                if size > self.max_flow_size:
+                    self.log('\tFound large tree #%d'%i)
+                    lt_found = True
+                    self.split_trees = []
+                    self.split_to_size(tree)
+                    self.tree_map[tree] = self.split_trees
+            if not lt_found:
+                self.log('\tNo large trees found')
+            self.trees = []
+            for x in trees:
+                self.trees.extend(self.tree_map.get(x, [x]))
+
+        self.was_split = len(self.trees) > 1
+        if self.was_split:
+            self.log('\tSplit into %d parts'%len(self.trees))
+        self.commit()
+
+    def split_on_page_breaks(self, orig_tree):
+        '''
+        Split ``orig_tree`` at every page break and store the resulting trees
+        in ``self.trees``. Trees that are empty pages are dropped, the ids
+        they contained are moved into the next non-empty tree so that links
+        to them keep working.
+        '''
+        # Position of the first occurrence of every page break id
+        first_index = {}
+        for idx, pb_id in enumerate(self.page_break_ids):
+            first_index.setdefault(pb_id, idx)
+
+        # The page breaks actually present in the tree, in document order
+        ordered_ids = OrderedDict()
+        for elem_id in orig_tree.xpath('//*/@id'):
+            if elem_id in first_index:
+                ordered_ids[elem_id] = self.page_breaks[first_index[elem_id]]
+
+        self.trees = [orig_tree]
+        while ordered_ids:
+            pattern, before = ordered_ids.popitem(last=False)[1]
+            # Search the trees from last to first for the page break element
+            for i in range(len(self.trees)-1, -1, -1):
+                tree = self.trees[i]
+                elem = pattern(tree)
+                if elem:
+                    self.log.debug('\t\tSplitting on page-break at id=%s'%
+                                elem[0].get('id'))
+                    before_tree, after_tree = self.do_split(tree, elem[0], before)
+                    self.trees[i:i+1] = [before_tree, after_tree]
+                    break
+
+        trees, ids = [], set()
+        for tree in self.trees:
+            root = tree.getroot()
+            if self.is_page_empty(root):
+                # Remember the ids in the discarded page
+                discarded_ids = root.xpath('//*[@id]')
+                for x in discarded_ids:
+                    x = x.get('id')
+                    if not x.startswith('calibre_'):
+                        ids.add(x)
+            else:
+                if ids:
+                    body = self.get_body(root)
+                    if body is not None:
+                        # Re-create the discarded ids on zero height divs
+                        existing_ids = frozenset(body.xpath('//*/@id'))
+                        for x in ids - existing_ids:
+                            body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt'))
+                ids = set()
+                trees.append(tree)
+        self.trees = trees
+
+    def get_body(self, root):
+        'Return the first <body> element in the document, or None'
+        body = root.xpath('//h:body', namespaces=NAMESPACES)
+        if not body:
+            return None
+        return body[0]
+
+    def do_split(self, tree, split_point, before):
+        '''
+        Split ``tree`` into a *before* and *after* tree at ``split_point``.
+
+        :param before: If True tree is split before split_point, otherwise after split_point
+        :return: before_tree, after_tree
+        '''
+        return do_split(split_point, self.log, before=before)
+
+    def is_page_empty(self, root):
+        '''
+        Return True if the page has at most one character of text other than
+        whitespace and non-breaking spaces, no <img> tags (apart from ones
+        whose style is exactly ``display:none``) and no <svg> tags. A tree
+        without a <body> is never considered empty.
+        '''
+        body = self.get_body(root)
+        if body is None:
+            return False
+        txt = re.sub(r'\s+|\xa0', '',
+                etree.tostring(body, method='text', encoding='unicode'))
+        if len(txt) > 1:
+            return False
+        for img in root.xpath('//h:img', namespaces=NAMESPACES):
+            if img.get('style', '') != 'display:none':
+                return False
+        if root.xpath('//*[local-name() = "svg"]'):
+            return False
+        return True
+
+    def split_text(self, text, root, size):
+        '''
+        Split ``text`` at blank lines into fragments, packing consecutive
+        paragraphs into a fragment for as long as their combined length stays
+        below ``size``.
+
+        :param root: Only used for the error message
+        :return: The list of fragments, it contains all paragraphs of ``text``
+        :raises SplitError: If a single paragraph is longer than ``size``
+        '''
+        self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
+        rest = text.replace('\r', '')
+        parts = re.split('\n\n', rest)
+        self.log.debug('\t\t\t\tFound %d parts'%len(parts))
+        if max(map(len, parts)) > size:
+            raise SplitError('Cannot split as file contains a <pre> tag '
+                'with a very large paragraph', root)
+        ans = []
+        buf = None  # None means that no fragment has been started yet
+        for part in parts:
+            if buf is None:
+                buf = part
+            elif len(buf) + len(part) < size:
+                buf += '\n\n'+part
+            else:
+                ans.append(buf)
+                buf = part
+        if buf is not None:
+            # The last fragment, without this the end of the text is lost
+            ans.append(buf)
+        return ans
+
+    def split_to_size(self, tree):
+        '''
+        Recursively split ``tree`` until all the pieces are no larger than
+        ``self.max_flow_size``, appending the pieces to ``self.split_trees``.
+
+        :raises SplitError: If no split point can be found
+        '''
+        self.log.debug('\t\tSplitting...')
+        root = tree.getroot()
+        # Split large <pre> tags if they contain only text
+        for pre in XPath('//h:pre')(root):
+            if len(tuple(pre.iterchildren(etree.Element))) > 0:
+                continue
+            if pre.text and len(pre.text) > self.max_flow_size*0.5:
+                self.log.debug('\t\tSplitting large <pre> tag')
+                frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
+                new_pres = []
+                for frag in frags:
+                    pre2 = copy.copy(pre)
+                    pre2.text = frag
+                    pre2.tail = ''
+                    new_pres.append(pre2)
+                new_pres[-1].tail = pre.tail
+                p = pre.getparent()
+                i = p.index(pre)
+                p[i:i+1] = new_pres
+
+        split_point, before = self.find_split_point(root)
+        if split_point is None:
+            raise SplitError(self.item.href, root)
+        self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
+
+        trees = self.do_split(tree, split_point, before)
+        sizes = [len(tostring(t.getroot())) for t in trees]
+        if min(sizes) < 5*1024:
+            # Try again, find_split_point() does not pick the same element
+            # twice as it marks the elements it returns
+            self.log.debug('\t\t\tSplit tree too small')
+            self.split_to_size(tree)
+            return
+
+        for t, size in zip(trees, sizes):
+            r = t.getroot()
+            if self.is_page_empty(r):
+                continue
+            elif size <= self.max_flow_size:
+                self.split_trees.append(t)
+                self.log.debug(
+                    '\t\t\tCommitted sub-tree #%d (%d KB)'%(
+                               len(self.split_trees), size/1024.))
+            else:
+                self.log.debug(
+                        '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
+                self.split_to_size(t)
+
+    def find_split_point(self, root):
+        '''
+        Find the tag at which to split the tree rooted at `root`.
+        Search order is:
+            * Heading tags
+            * <div> tags that are children of <body>
+            * <pre> tags
+            * <hr> tags
+            * <p> tags
+            * all other <div> tags
+            * <br> tags
+            * <li> tags
+
+        We try to split in the "middle" of the file (as defined by tag counts).
+
+        :return: ``(element, before)``, element is None if no split point was
+                 found and before is always True
+        '''
+        def pick_elem(elems):
+            # Pick the middle one of the elements that have not already been
+            # used as a split point, and mark it as used
+            if elems:
+                elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
+                        '1']
+                if elems:
+                    i = int(len(elems)//2)
+                    elems[i].set(SPLIT_POINT_ATTR, '1')
+                    return elems[i]
+
+        for path in (
+                     '//*[re:match(name(), "h[1-6]", "i")]',
+                     '/h:html/h:body/h:div',
+                     '//h:pre',
+                     '//h:hr',
+                     '//h:p',
+                     '//h:div',
+                     '//h:br',
+                     '//h:li',
+                     ):
+            elems = root.xpath(path, namespaces=NAMESPACES)
+            elem = pick_elem(elems)
+            if elem is not None:
+                try:
+                    # Skip elements whose path cannot be compiled as an XPath
+                    XPath(elem.getroottree().getpath(elem))
+                except Exception:
+                    continue
+                return elem, True
+
+        return None, True
+
+    def _redirected_href(self, href):
+        '''
+        Return the replacement for ``href`` if it points into the item that
+        was split, otherwise None.
+        '''
+        base, frag = urldefrag(href)
+        if base != self.item.href:
+            return None
+        # The None key yields the default, which is the first split file
+        nhref = self.anchor_map[frag if frag else None]
+        if frag:
+            nhref = '#'.join((nhref, frag))
+        return nhref
+
+    def commit(self):
+        '''
+        Commit all changes caused by the split. Calculates an *anchor_map* for
+        all anchors in the original tree. Internal links are re-directed. The
+        original file is deleted and the split files are saved.
+        '''
+        if not self.was_split:
+            return
+        # Anchors that are not found map to the first split file
+        self.anchor_map = collections.defaultdict(lambda :self.base%0)
+        self.files = []
+
+        for i, tree in enumerate(self.trees):
+            root = tree.getroot()
+            self.files.append(self.base%i)
+            # The first file that contains an anchor wins
+            for elem in root.xpath('//*[@id or @name]'):
+                for anchor in elem.get('id', ''), elem.get('name', ''):
+                    if anchor != '' and anchor not in self.anchor_map:
+                        self.anchor_map[anchor] = self.files[-1]
+            for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
+                elem.attrib.pop(SPLIT_POINT_ATTR, '0')
+
+        spine_pos = self.item.spine_position
+
+        # The files are inserted at the same spine position in reverse order,
+        # so that they end up in the correct order
+        for current, tree in zip(*map(reversed, (self.files, self.trees))):
+            for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
+                href = a.get('href').strip()
+                if href.startswith('#'):
+                    anchor = href[1:]
+                    file = self.anchor_map[anchor]
+                    file = self.item.relhref(file)
+                    if file != current:
+                        a.set('href', file+href)
+
+            new_id = self.oeb.manifest.generate(id=self.item.id)[0]
+            new_item = self.oeb.manifest.add(new_id, current,
+                    self.item.media_type, data=tree.getroot())
+            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
+
+        if self.oeb.guide:
+            for ref in self.oeb.guide.values():
+                nhref = self._redirected_href(ref.href)
+                if nhref is not None:
+                    ref.href = nhref
+
+        def fix_toc_entry(toc):
+            if toc.href:
+                nhref = self._redirected_href(toc.href)
+                if nhref is not None:
+                    toc.href = nhref
+            for x in toc:
+                fix_toc_entry(x)
+
+        if self.oeb.toc:
+            fix_toc_entry(self.oeb.toc)
+
+        if self.oeb.pages:
+            for page in self.oeb.pages:
+                nhref = self._redirected_href(page.href)
+                if nhref is not None:
+                    page.href = nhref
+
+        self.oeb.manifest.remove(self.item)