Initial import

2026-05-02 11:50:52 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,30 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from polyglot.builtins import native_string_type
+
+
+class ConversionUserFeedBack(Exception):
+    ''' Exception carrying a simple message that should be shown to the user.
+
+    The exception message (``args[0]``) is a JSON object with the keys
+    ``msg``, ``level``, ``det_msg`` and ``title``. The same values are also
+    available as attributes of the instance.
+    '''
+
+    def __init__(self, title, msg, level='info', det_msg=''):
+        '''
+        :param title: The title (very short description)
+        :param msg: The message to show the user
+        :param level: Must be one of 'info', 'warn' or 'error'
+        :param det_msg: Optional detailed message to show the user
+        '''
+        import json
+        payload = {'msg':msg, 'level':level, 'det_msg':det_msg, 'title':title}
+        Exception.__init__(self, json.dumps(payload))
+        self.title = title
+        self.msg = msg
+        self.det_msg = det_msg
+        self.level = level
+
+
+# Overwrite the class __name__ with the fully qualified dotted path of the
+# exception: the GUI detects this exception by that name.
+# NOTE(review): native_string_type presumably converts to the interpreter's
+# native str type -- confirm in polyglot.builtins.
+ConversionUserFeedBack.__name__ = native_string_type('calibre.ebooks.conversion.ConversionUserFeedBack')
@@ -0,0 +1,428 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Command line interface to conversion sub-system
+'''
+
+import sys, os, numbers
+from optparse import OptionGroup, Option
+from collections import OrderedDict
+
+from calibre.utils.config import OptionParser
+from calibre.utils.logging import Log
+from calibre.customize.conversion import OptionRecommendation
+from calibre import patheq
+from calibre.ebooks.conversion import ConversionUserFeedBack
+from calibre.utils.localization import localize_user_manual_link
+from polyglot.builtins import iteritems
+
+# Usage text handed to the option parser (see option_parser()). The body is
+# passed through _() for translation and the localized link to the conversion
+# chapter of the user manual is appended to it.
+USAGE = '%prog ' + _('''\
+input_file output_file [options]
+
+Convert an e-book from one format to another.
+
+input_file is the input and output_file is the output. Both must be \
+specified as the first two arguments to the command.
+
+The output e-book format is guessed from the file extension of \
+output_file. output_file can also be of the special format .EXT where \
+EXT is the output file extension. In this case, the name of the output \
+file is derived from the name of the input file. Note that the filenames must \
+not start with a hyphen. Finally, if output_file has no extension, then \
+it is treated as a directory and an "open e-book" (OEB) consisting of HTML \
+files is written to that directory. These files are the files that would \
+normally have been passed to the output plugin.
+
+After specifying the input \
+and output file you can customize the conversion by specifying various \
+options. The available options depend on the input and output file types. \
+To get help on them specify the input and output file and then use the -h \
+option.
+
+For full documentation of the conversion system see
+''') + localize_user_manual_link('https://manual.calibre-ebook.com/conversion.html')
+
+# Names of the individual heuristic processing options. They are listed,
+# after 'enable_heuristics', in the HEURISTIC PROCESSING option group built by
+# add_pipeline_options().
+HEURISTIC_OPTIONS = ['markup_chapter_headings',
+                      'italicize_common_cases', 'fix_indents',
+                      'html_unwrap_factor', 'unwrap_lines',
+                      'delete_blank_paragraphs', 'format_scene_breaks',
+                      'dehyphenate', 'renumber_headings',
+                      'replace_scene_breaks']
+
+# Options from this list whose recommended value is True are exposed on the
+# command line only as --disable-<long switch> (see
+# option_recommendation_to_cli_option()).
+DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
+
+
+def print_help(parser, log):
+    ''' Print the help text of ``parser``. ``log`` is accepted but not used. '''
+    parser.print_help()
+
+
+def check_command_line_options(parser, args, log):
+    ''' Validate the positional arguments and resolve the input/output paths.
+
+    :param parser: Option parser, used only to print the help on error
+    :param args: Full argument list; ``args[1]`` is the input and ``args[2]``
+                 the output
+    :param log: Log object used to report errors
+    :return: The tuple ``(input, output)``
+    :raises SystemExit: If either file is missing from ``args`` or the input
+                        cannot be read
+    '''
+    files_missing = (len(args) < 3 or args[1].startswith('-') or
+                     args[2].startswith('-'))
+    if files_missing:
+        print_help(parser, log)
+        log.error('\n\nYou must specify the input AND output files')
+        raise SystemExit(1)
+
+    src = os.path.abspath(args[1])
+    if not os.access(src, os.R_OK):
+        if src.endswith('.recipe'):
+            # Not a readable file: hand the argument on exactly as it was typed
+            src = args[1]
+        elif not ('-h' in args or '--help' in args):
+            log.error('Cannot read from', src)
+            raise SystemExit(1)
+
+    dest = args[2]
+    # The special form .EXT: a leading dot, no path separators and not '.'/'..'
+    extension_only = (dest.startswith('.') and dest[:2] not in {'..', '.'} and
+                      '/' not in dest and '\\' not in dest)
+    if extension_only:
+        dest = os.path.splitext(os.path.basename(src))[0]+dest
+
+    return src, os.path.abspath(dest)
+
+
+def option_recommendation_to_cli_option(add_option, rec):
+    ''' Create the command line option for ``rec`` and pass it to ``add_option``.
+
+    :param add_option: Callable that receives the created ``Option``
+    :param rec: Recommendation object; its ``option`` attribute supplies the
+                name, switches, help and choices and its ``recommended_value``
+                the default
+    '''
+    opt = rec.option
+    default = rec.recommended_value
+
+    switches = []
+    if opt.short_switch:
+        switches.append('-'+opt.short_switch)
+    switches.append('--'+opt.long_switch)
+
+    attrs = {'dest': opt.name, 'help': opt.help,
+             'choices': opt.choices, 'default': default}
+    if isinstance(default, bool):
+        # Boolean options are flags that flip the recommended value
+        attrs['action'] = 'store_false' if default else 'store_true'
+    elif isinstance(default, numbers.Real):
+        # NOTE(review): every numbers.Integral is also a numbers.Real, so
+        # integer defaults are parsed as 'float' as well.
+        attrs['type'] = 'float'
+
+    if opt.long_switch == 'verbose':
+        attrs['action'] = 'count'
+        attrs.pop('type', '')
+    if opt.name == 'read_metadata_from_opf':
+        switches.append('--from-opf')
+    if opt.name == 'transform_css_rules':
+        attrs['help'] = _(
+            'Path to a file containing rules to transform the CSS styles'
+            ' in this book. The easiest way to create such a file is to'
+            ' use the wizard for creating rules in the calibre GUI. Access'
+            ' it in the "Look & feel->Transform styles" section of the conversion'
+            ' dialog. Once you create the rules, you can use the "Export" button'
+            ' to save them to a file.'
+        )
+    if opt.name in DEFAULT_TRUE_OPTIONS and default is True:
+        # Enabled by default, so only a switch to turn it off is offered
+        switches = ['--disable-'+opt.long_switch]
+    add_option(Option(*switches, **attrs))
+
+
+def group_titles():
+    ''' Return the translated titles of the input and output option groups. '''
+    input_title = _('INPUT OPTIONS')
+    output_title = _('OUTPUT OPTIONS')
+    return input_title, output_title
+
+
+def recipe_test(option, opt_str, value, parser):
+    ''' optparse callback for ``--test``.
+
+    Consumes up to two integers that follow the switch on the command line
+    and stores them as a tuple on ``parser.values`` under ``option.dest``.
+    Missing numbers default to 2.
+    '''
+    assert value is None
+
+    def is_number(text):
+        try:
+            float(text)
+        except ValueError:
+            return False
+        return True
+
+    numbers_found = []
+    for token in parser.rargs:
+        # stop on --foo like options
+        if token[:2] == "--":
+            break
+        # stop on -a, but not on -3 or -3.0
+        looks_like_switch = token[:1] == "-" and len(token) > 1
+        if looks_like_switch and not is_number(token):
+            break
+        try:
+            numbers_found.append(int(token))
+        except (TypeError, ValueError, AttributeError):
+            break
+        if len(numbers_found) == 2:
+            break
+    # Remove the consumed arguments so that optparse does not see them again
+    del parser.rargs[:len(numbers_found)]
+
+    numbers_found.extend([2] * (2 - len(numbers_found)))
+
+    setattr(parser.values, option.dest, tuple(numbers_found))
+
+
+def add_input_output_options(parser, plumber):
+    ''' Add the option groups for the input and output formats to ``parser``.
+
+    A group is only created when ``plumber`` has options of that kind.
+    '''
+    input_recs = plumber.input_options
+    output_recs = plumber.output_options
+
+    def populate(add_option, recs):
+        is_recipe = plumber.input_fmt == 'recipe'
+        for rec in recs:
+            if is_recipe and rec.option.long_switch == 'test':
+                # --test takes optional numbers, handled by the recipe_test callback
+                add_option(Option('--test', dest='test', action='callback', callback=recipe_test))
+            else:
+                option_recommendation_to_cli_option(add_option, rec)
+
+    if input_recs:
+        heading = group_titles()[0]
+        description = _('Options to control the processing'
+                          ' of the input %s file')%plumber.input_fmt
+        input_group = OptionGroup(parser, heading, description)
+        populate(input_group.add_option, input_recs)
+        parser.add_option_group(input_group)
+
+    if output_recs:
+        heading = group_titles()[1]
+        description = _('Options to control the processing'
+                          ' of the output %s')%plumber.output_fmt
+        output_group = OptionGroup(parser, heading, description)
+        populate(output_group.add_option, output_recs)
+        parser.add_option_group(output_group)
+
+
+def add_pipeline_options(parser, plumber):
+    ''' Add the format independent conversion options to ``parser``.
+
+    The options are arranged in titled groups. The options of the group with
+    the empty title are added directly to ``parser``. Only options whose
+    recommendation level is below ``HIGH`` are added.
+    '''
+    groups = OrderedDict()
+
+    groups[''] = ('', ['input_profile', 'output_profile'])
+
+    groups[_('LOOK AND FEEL')] = (
+        _('Options to control the look and feel of the output'),
+        [
+            'base_font_size', 'disable_font_rescaling',
+            'font_size_mapping', 'embed_font_family',
+            'subset_embedded_fonts', 'embed_all_fonts',
+            'line_height', 'minimum_line_height',
+            'linearize_tables',
+            'extra_css', 'filter_css', 'transform_css_rules', 'expand_css',
+            'smarten_punctuation', 'unsmarten_punctuation',
+            'margin_top', 'margin_left', 'margin_right',
+            'margin_bottom', 'change_justification',
+            'insert_blank_line', 'insert_blank_line_size',
+            'remove_paragraph_spacing',
+            'remove_paragraph_spacing_indent_size',
+            'asciiize', 'keep_ligatures',
+        ])
+
+    groups[_('HEURISTIC PROCESSING')] = (
+        _('Modify the document text and structure using common'
+           ' patterns. Disabled by default. Use %(en)s to enable. '
+           ' Individual actions can be disabled with the %(dis)s options.')
+        % dict(en='--enable-heuristics', dis='--disable-*'),
+        ['enable_heuristics'] + HEURISTIC_OPTIONS)
+
+    groups[_('SEARCH AND REPLACE')] = (
+        _('Modify the document text and structure using user defined patterns.'),
+        [
+            'sr1_search', 'sr1_replace',
+            'sr2_search', 'sr2_replace',
+            'sr3_search', 'sr3_replace',
+            'search_replace',
+        ])
+
+    groups[_('STRUCTURE DETECTION')] = (
+        _('Control auto-detection of document structure.'),
+        [
+            'chapter', 'chapter_mark',
+            'prefer_metadata_cover', 'remove_first_image',
+            'insert_metadata', 'page_breaks_before',
+            'remove_fake_margins', 'start_reading_at',
+        ])
+
+    groups[_('TABLE OF CONTENTS')] = (
+        _('Control the automatic generation of a Table of Contents. By '
+        'default, if the source file has a Table of Contents, it will '
+        'be used in preference to the automatically generated one.'),
+        [
+            'level1_toc', 'level2_toc', 'level3_toc',
+            'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
+            'use_auto_toc', 'toc_filter', 'duplicate_links_in_toc',
+        ])
+
+    groups[_('METADATA')] = (
+        _('Options to set metadata in the output'),
+        plumber.metadata_option_names + ['read_metadata_from_opf'])
+
+    groups[_('DEBUG')] = (
+        _('Options to help with debugging the conversion'),
+        ['verbose', 'debug_pipeline'])
+
+    for title, (description, names) in iteritems(groups):
+        if title:
+            container = OptionGroup(parser, title, description)
+            parser.add_option_group(container)
+            add_option = container.add_option
+        else:
+            # The untitled group goes straight onto the parser
+            add_option = parser.add_option
+
+        for name in names:
+            rec = plumber.get_option_by_name(name)
+            if rec.level < rec.HIGH:
+                option_recommendation_to_cli_option(add_option, rec)
+
+
+def option_parser():
+    ''' Create the base option parser, with only ``--list-recipes`` defined. '''
+    parser = OptionParser(usage=USAGE)
+    recipes_help = _('List builtin recipe names. You can create an e-book from '
+                'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
+                'output.epub')
+    parser.add_option('--list-recipes', default=False, action='store_true',
+            help=recipes_help)
+    return parser
+
+
+class ProgressBar(object):
+    ''' Progress reporter that writes ``<percent>% <message>`` to a log callable.
+
+    Calls without a message are ignored.
+    '''
+
+    def __init__(self, log):
+        self.log = log
+
+    def __call__(self, frac, msg=''):
+        if not msg:
+            return
+        self.log('%d%% %s'%(int(frac*100), msg))
+
+
+def create_option_parser(args, log):
+    ''' Build the option parser and the plumber for the given command line.
+
+    ``--version`` and ``--list-recipes`` are handled here directly: the
+    requested information is written to ``log`` and the program exits.
+
+    :param args: Full argument list, ``args[0]`` being the program name
+    :param log: Log object used for all output
+    :return: The tuple ``(parser, plumber)``
+    :raises SystemExit: After ``--version``/``--list-recipes``, when fewer
+                        than three arguments are given, or when the input and
+                        output arguments are not valid
+    :raises ValueError: If input and output are the same file
+    '''
+    if '--version' in args:
+        from calibre.constants import __appname__, __version__, __author__
+        log(os.path.basename(args[0]), '('+__appname__, __version__+')')
+        log('Created by:', __author__)
+        raise SystemExit(0)
+    if '--list-recipes' in args:
+        from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles
+        log('Available recipes:')
+        titles = sorted(get_builtin_recipe_titles())
+        for title in titles:
+            try:
+                log('\t'+title)
+            except Exception:
+                # Could not output the title as is, fall back to its repr().
+                # Only ordinary errors are handled here so that
+                # KeyboardInterrupt and SystemExit are not swallowed.
+                log('\t'+repr(title))
+        log('%d recipes available'%len(titles))
+        raise SystemExit(0)
+
+    parser = option_parser()
+    if len(args) < 3:
+        print_help(parser, log)
+        # Asking for help is not an error, missing arguments are
+        if any(x in args for x in ('-h', '--help')):
+            raise SystemExit(0)
+        else:
+            raise SystemExit(1)
+
+    input_path, output_path = check_command_line_options(parser, args, log)
+
+    from calibre.ebooks.conversion.plumber import Plumber
+
+    reporter = ProgressBar(log)
+    if patheq(input_path, output_path):
+        raise ValueError('Input file is the same as the output file')
+
+    plumber = Plumber(input_path, output_path, log, reporter)
+    # The available options depend on the input and output formats
+    add_input_output_options(parser, plumber)
+    add_pipeline_options(parser, plumber)
+
+    return parser, plumber
+
+
+def abspath(x):
+    ''' Return the absolute path for ``x``, expanding a leading ``~``.
+
+    Values starting with ``http:`` or ``https:`` are returned unchanged.
+    '''
+    if x.startswith(('http:', 'https:')):
+        return x
+    expanded = os.path.expanduser(x)
+    return os.path.abspath(expanded)
+
+
+def escape_sr_pattern(exp):
+    ''' Return ``exp`` with every newline replaced by the character U+E123. '''
+    newline, placeholder = '\n', '\ue123'
+    return exp.replace(newline, placeholder)
+
+
+def read_sr_patterns(path, log=None):
+    ''' Read search and replace expressions from the UTF-8 encoded file ``path``.
+
+    The file contains alternating lines: a regular expression followed by its
+    replacement (which may be an empty line). Blank lines before a regular
+    expression are skipped. The character U+E123 in a regular expression
+    stands for a newline. If the file ends directly after a regular
+    expression, that expression is used with an empty replacement.
+
+    :param path: Path to the file to read
+    :param log: Optional log object. If given, an invalid regular expression
+                is reported via ``log.error()`` and ``SystemExit(1)`` is raised
+    :return: JSON encoded list of ``[search, replace]`` pairs
+    :raises ValueError: For an invalid regular expression when ``log`` is None
+    '''
+    import json, re
+    pats = []
+    with open(path, 'rb') as f:
+        lines = f.read().decode('utf-8').splitlines()
+    pat = None
+    for line in lines:
+        if pat is None:
+            if not line.strip():
+                continue
+            line = line.replace('\ue123', '\n')
+            try:
+                re.compile(line)
+            except Exception:
+                msg = 'Invalid regular expression: %r from file: %r'%(
+                        line, path)
+                if log is not None:
+                    log.error(msg)
+                    raise SystemExit(1)
+                else:
+                    raise ValueError(msg)
+            pat = line
+        else:
+            pats.append((pat, line))
+            pat = None
+    if pat is not None:
+        # The file ended right after a search expression (for example because
+        # a trailing empty line was stripped): do not silently drop it
+        pats.append((pat, ''))
+    return json.dumps(pats)
+
+
+def main(args=sys.argv):
+    ''' Entry point of the command line converter.
+
+    Parses ``args``, passes the resulting option values to the plumber and
+    runs the conversion.
+
+    :param args: Full argument list, ``args[0]`` being the program name
+    :return: 0 on success, 1 if there are extra arguments or the CSS transform
+             rules are invalid
+    :raises SystemExit: With code 1 if the conversion reports a
+                        ``ConversionUserFeedBack``
+    '''
+    log = Log()
+    parser, plumber = create_option_parser(args, log)
+    opts, leftover_args = parser.parse_args(args)
+    if len(leftover_args) > 3:
+        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
+        return 1
+    for x in ('read_metadata_from_opf', 'cover'):
+        if getattr(opts, x, None) is not None:
+            setattr(opts, x, abspath(getattr(opts, x)))
+    if opts.search_replace:
+        # The option holds a path, replace it with the expressions in that file
+        opts.search_replace = read_sr_patterns(opts.search_replace, log)
+    if opts.transform_css_rules:
+        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
+        with open(opts.transform_css_rules, 'rb') as tcr:
+            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
+            for rule in rules:
+                title, msg = validate_rule(rule)
+                if title and msg:
+                    log.error('Failed to parse CSS transform rules')
+                    log.error(title)
+                    log.error(msg)
+                    return 1
+
+    # Everything specified on the command line is passed on with level HIGH
+    recommendations = [(n.dest, getattr(opts, n.dest),
+                        OptionRecommendation.HIGH)
+                                        for n in parser.options_iter()
+                                        if n.dest]
+    plumber.merge_ui_recommendations(recommendations)
+
+    try:
+        plumber.run()
+    except ConversionUserFeedBack as e:
+        ll = {'info': log.info, 'warn': log.warn,
+                'error':log.error}.get(e.level, log.info)
+        ll(e.title)
+        if e.det_msg:
+            # The attribute is det_msg; e.detmsg does not exist and raised an
+            # AttributeError that hid the actual feedback
+            log.debug(e.det_msg)
+        ll(e.msg)
+        raise SystemExit(1)
+
+    log(_('Output saved to'), ' ', plumber.output)
+
+    return 0
+
+
+def manual_index_strings():
+    ''' Return the translated introduction text, containing one ``%s`` placeholder. '''
+    text = _('''\
+The options and default values for the options change depending on both the
+input and output formats, so you should always check with::
+
+    %s
+
+Below are the options that are common to all conversion, followed by the
+options specific to every input and output format.''')
+    return text
+
+
+# When run as a script, use the return value of main() as the exit code
+if __name__ == '__main__':
+    sys.exit(main())
@@ -0,0 +1,10 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class AZW4Input(InputFormatPlugin):
+    # Input plugin for files with the azw4 extension
+
+    name        = 'AZW4 Input'
+    author      = 'John Schember'
+    description = 'Convert AZW4 to HTML'
+    file_types  = {'azw4'}
+    commit_name = 'azw4_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        ''' Extract the content of ``stream`` into the current working
+        directory and return the result of the reader's extract_content(). '''
+        from calibre.ebooks.pdb.header import PdbHeaderReader
+        from calibre.ebooks.azw4.reader import Reader
+
+        pdb_header = PdbHeaderReader(stream)
+        return Reader(pdb_header, stream, log, options).extract_content(getcwd())
@@ -0,0 +1,202 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+''' CHM File decoding support '''
+__license__ = 'GPL v3'
+__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
+                 ' and Alex Bramley <a.bramley at gmail.com>.'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from calibre.constants import filesystem_encoding
+from polyglot.builtins import unicode_type, as_bytes
+
+
+class CHMInput(InputFormatPlugin):
+    # Input plugin for CHM files: the CHM is extracted into a temporary
+    # directory, an HTML root file is generated from its .hhc contents file
+    # and the book is then built with the HTML input plugin.
+
+    name        = 'CHM Input'
+    author      = 'Kovid Goyal and Alex Bramley'
+    description = 'Convert CHM files to OEB'
+    file_types  = {'chm'}
+    commit_name = 'chm_input'
+
+    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
+        ''' Extract the CHM file at ``chm_path`` into ``output_dir``.
+
+        The reader is kept in ``self._chm_reader`` for later use by
+        convert(). ``no_images`` is accepted but not used here.
+
+        :return: The ``hhc_path`` attribute of the reader
+        '''
+        from calibre.ebooks.chm.reader import CHMReader
+        log.debug('Opening CHM file')
+        rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
+        log.debug('Extracting CHM to %s' % output_dir)
+        rdr.extract_content(output_dir, debug_dump=debug_dump)
+        self._chm_reader = rdr
+        return rdr.hhc_path
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        ''' Convert the CHM file behind ``stream`` and return the resulting book.
+
+        ``stream`` is closed by this method and ``options`` is modified in
+        place: the options of the HTML input plugin are set to their
+        recommended values and ``input_encoding`` is set to 'utf-8'.
+        '''
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
+        from calibre.customize.ui import plugin_for_input_format
+        self.opts = options
+
+        log.debug('Processing CHM...')
+        with TemporaryDirectory('_chm2oeb') as tdir:
+            if not isinstance(tdir, unicode_type):
+                tdir = tdir.decode(filesystem_encoding)
+            # The HTML input code used below reads its own options from
+            # `options`, so set them to their recommended values
+            html_input = plugin_for_input_format('html')
+            for opt in html_input.options:
+                setattr(options, opt.option.name, opt.recommended_value)
+            no_images = False  # options.no_images
+            chm_name = stream.name
+            # chm_data = stream.read()
+
+            # closing stream so CHM can be opened by external library
+            stream.close()
+            log.debug('tdir=%s' % tdir)
+            log.debug('stream.name=%s' % stream.name)
+            debug_dump = False
+            odi = options.debug_pipeline
+            if odi:
+                debug_dump = os.path.join(odi, 'input')
+            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
+                    debug_dump=debug_dump)
+            mainpath = os.path.join(tdir, mainname)
+
+            try:
+                metadata = get_metadata_from_reader(self._chm_reader)
+            except Exception:
+                log.exception('Failed to read metadata, using filename')
+                from calibre.ebooks.metadata.book.base import Metadata
+                metadata = Metadata(os.path.basename(chm_name))
+            # Encoding preference: the CHM itself, then the user supplied
+            # input encoding, then cp1252
+            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
+            self._chm_reader.CloseCHM()
+            # print((tdir, mainpath))
+            # from calibre import ipython
+            # ipython()
+
+            # debug_pipeline is cleared while the HTML input code runs and
+            # restored from `odi` afterwards
+            # NOTE(review): presumably to stop the HTML step from writing its
+            # own debug output -- confirm
+            options.debug_pipeline = None
+            options.input_encoding = 'utf-8'
+            uenc = encoding
+            # Files listed in re_encoded_files are decoded as UTF-8
+            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
+                uenc = 'utf-8'
+            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
+            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
+            options.debug_pipeline = odi
+            if toc.count() > 1:
+                # In this case the first spine item is the generated HTML
+                # root (see _create_html_root): turn it into the Table of
+                # Contents and drop it from the book
+                oeb.toc = self.parse_html_toc(oeb.spine[0])
+                oeb.manifest.remove(oeb.spine[0])
+                oeb.auto_generated_toc = False
+        return oeb
+
+    def parse_html_toc(self, item):
+        ''' Build a TOC from the nested <div> elements in ``item.data``.
+
+        Every <div> becomes an entry whose text and href are taken from its
+        first <a> child; child <div> elements become sub-entries.
+        '''
+        from calibre.ebooks.oeb.base import TOC, XPath
+        dx = XPath('./h:div')
+        ax = XPath('./h:a[1]')
+
+        def do_node(parent, div):
+            for child in dx(div):
+                a = ax(child)[0]
+                c = parent.add(a.text, a.attrib['href'])
+                do_node(c, child)
+
+        toc = TOC()
+        root = XPath('//h:div[1]')(item.data)[0]
+        do_node(toc, root)
+        return toc
+
+    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
+        ''' Create the book from ``htmlpath`` with the HTML input plugin.
+
+        Sets ``opts.breadth_first`` to True as a side effect.
+        '''
+        # use HTMLInput plugin to generate book
+        from calibre.customize.builtins import HTMLInput
+        opts.breadth_first = True
+        htmlinput = HTMLInput(None)
+        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
+        return oeb
+
+    def _create_html_root(self, hhcpath, log, encoding):
+        ''' Write an HTML root file next to the contents file ``hhcpath``.
+
+        The contents file is decoded with ``encoding`` and parsed into a TOC.
+        If the TOC has more than one node, the root file consists of nested
+        <div> elements, each holding a link to one TOC entry. Otherwise the
+        decoded contents data is written unchanged.
+
+        :return: The tuple ``(htmlpath, toc)``
+        '''
+        from lxml import html
+        from polyglot.urllib import unquote as _unquote
+        from calibre.ebooks.oeb.base import urlquote
+        from calibre.ebooks.chardet import xml_to_unicode
+        hhcdata = self._read_file(hhcpath)
+        hhcdata = hhcdata.decode(encoding)
+        hhcdata = xml_to_unicode(hhcdata, verbose=True,
+                            strip_encoding_pats=True, resolve_entities=True)[0]
+        hhcroot = html.fromstring(hhcdata)
+        toc = self._process_nodes(hhcroot)
+        # print("=============================")
+        # print("Printing hhcroot")
+        # print(etree.tostring(hhcroot, pretty_print=True))
+        # print("=============================")
+        log.debug('Found %d section nodes' % toc.count())
+        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
+        base = os.path.dirname(os.path.abspath(htmlpath))
+
+        def unquote(x):
+            # Unquote via UTF-8 encoded bytes, returning text
+            if isinstance(x, unicode_type):
+                x = x.encode('utf-8')
+            return _unquote(x).decode('utf-8')
+
+        def unquote_path(x):
+            # Use the unquoted form only if the quoted one does not exist on
+            # disk while the unquoted one does
+            y = unquote(x)
+            if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
+                x = y
+            return x
+
+        def donode(item, parent, base, subpath):
+            # Recursively append one <div><a/></div> per titled TOC entry.
+            # DIV and A are imported further down, before the first call.
+            for child in item:
+                title = child.title
+                if not title:
+                    continue
+                raw = unquote_path(child.href or '')
+                rsrcname = os.path.basename(raw)
+                rsrcpath = os.path.join(subpath, rsrcname)
+                # Fall back to the href as given if the file is not in subpath
+                if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
+                    rsrcpath = raw
+
+                if '%' not in rsrcpath:
+                    rsrcpath = urlquote(rsrcpath)
+                if not raw:
+                    rsrcpath = ''
+                c = DIV(A(title, href=rsrcpath))
+                donode(child, c, base, subpath)
+                parent.append(c)
+
+        with open(htmlpath, 'wb') as f:
+            if toc.count() > 1:
+                from lxml.html.builder import HTML, BODY, DIV, A
+                # The directory of the first entry is used as the location of
+                # all entries (see donode)
+                path0 = toc[0].href
+                path0 = unquote_path(path0)
+                subpath = os.path.dirname(path0)
+                base = os.path.dirname(f.name)
+                root = DIV()
+                donode(toc, root, base, subpath)
+                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
+                                   pretty_print=True)
+                f.write(raw)
+            else:
+                f.write(as_bytes(hhcdata))
+        return htmlpath, toc
+
+    def _read_file(self, name):
+        ''' Return the contents of the file ``name`` as bytes. '''
+        # NOTE(review): lopen is not imported in this file, presumably it is
+        # installed as a builtin elsewhere in calibre -- confirm
+        with lopen(name, 'rb') as f:
+            data = f.read()
+        return data
+
+    def add_node(self, node, toc, ancestor_map):
+        ''' Add a TOC entry for ``node`` if its type matches 'text/sitemap'.
+
+        The entry is added below the entry recorded in ``ancestor_map`` for
+        the <object> of the closest enclosing list item, or to ``toc`` if
+        there is none. Title and href come from the <param> children matching
+        'name' and 'local'. ``ancestor_map`` is updated with the new entry.
+        '''
+        from calibre.ebooks.chm.reader import match_string
+        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
+            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
+            parent = p[0] if p else None
+            toc = ancestor_map.get(parent, toc)
+            title = href = ''
+            for param in node.xpath('./param'):
+                if match_string(param.attrib['name'], 'name'):
+                    title = param.attrib['value']
+                elif match_string(param.attrib['name'], 'local'):
+                    href = param.attrib['value']
+            child = toc.add(title or _('Unknown'), href)
+            ancestor_map[node] = child
+
+    def _process_nodes(self, root):
+        ''' Return a TOC built from all <object> elements below ``root``. '''
+        from calibre.ebooks.oeb.base import TOC
+        toc = TOC()
+        ancestor_map = {}
+        for node in root.xpath('//object'):
+            self.add_node(node, toc, ancestor_map)
+        return toc
@@ -0,0 +1,310 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Based on ideas from comiclrf created by FangornUK.
+'''
+
+import shutil, textwrap, codecs, os
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre import CurrentDir
+from calibre.ptempfile import PersistentTemporaryDirectory
+from polyglot.builtins import getcwd, map
+
+
+class ComicInput(InputFormatPlugin):
+
+    name        = 'Comic Input'
+    author      = 'Kovid Goyal'
+    description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
+    file_types  = {'cbz', 'cbr', 'cbc'}
+    is_image_collection = True
+    commit_name = 'comic_input'
+    core_usage = -1
+
+    options = {
+        OptionRecommendation(name='colors', recommended_value=0,
+            help=_('Reduce the number of colors used in the image. This works only'
+                   ' if you choose the PNG output format. It is useful to reduce file sizes.'
+                   ' Set to zero to turn off. Maximum value is 256. It is off by default.')),
+        OptionRecommendation(name='dont_normalize', recommended_value=False,
+            help=_('Disable normalize (improve contrast) color range '
+            'for pictures. Default: False')),
+        OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
+            help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
+        OptionRecommendation(name='dont_sharpen', recommended_value=False,
+            help=_('Disable sharpening.')),
+        OptionRecommendation(name='disable_trim', recommended_value=False,
+            help=_('Disable trimming of comic pages. For some comics, '
+                     'trimming might remove content as well as borders.')),
+        OptionRecommendation(name='landscape', recommended_value=False,
+            help=_("Don't split landscape images into two portrait images")),
+        OptionRecommendation(name='wide', recommended_value=False,
+            help=_("Keep aspect ratio and scale image using screen height as "
+            "image width for viewing in landscape mode.")),
+        OptionRecommendation(name='right2left', recommended_value=False,
+              help=_('Used for right-to-left publications like manga. '
+              'Causes landscape pages to be split into portrait pages '
+              'from right to left.')),
+        OptionRecommendation(name='despeckle', recommended_value=False,
+              help=_('Enable Despeckle. Reduces speckle noise. '
+              'May greatly increase processing time.')),
+        OptionRecommendation(name='no_sort', recommended_value=False,
+              help=_("Don't sort the files found in the comic "
+              "alphabetically by name. Instead use the order they were "
+              "added to the comic.")),
+        OptionRecommendation(name='output_format', choices=['png', 'jpg'],
+            recommended_value='png', help=_('The format that images in the created e-book '
+                'are converted to. You can experiment to see which format gives '
+                'you optimal size and look on your device.')),
+        OptionRecommendation(name='no_process', recommended_value=False,
+              help=_("Apply no processing to the image")),
+        OptionRecommendation(name='dont_grayscale', recommended_value=False,
+            help=_('Do not convert the image to grayscale (black and white)')),
+        OptionRecommendation(name='comic_image_size', recommended_value=None,
+            help=_('Specify the image size as widthxheight pixels. Normally,'
+                ' an image size is automatically calculated from the output '
+                'profile, this option overrides it.')),
+        OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
+            help=_('When converting a CBC do not add links to each page to'
+                ' the TOC. Note this only applies if the TOC has more than one'
+                ' section')),
+        }
+
+    recommendations = {
+        ('margin_left', 0, OptionRecommendation.HIGH),
+        ('margin_top',  0, OptionRecommendation.HIGH),
+        ('margin_right', 0, OptionRecommendation.HIGH),
+        ('margin_bottom', 0, OptionRecommendation.HIGH),
+        ('insert_blank_line', False, OptionRecommendation.HIGH),
+        ('remove_paragraph_spacing',  False, OptionRecommendation.HIGH),
+        ('change_justification', 'left', OptionRecommendation.HIGH),
+        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
+        ('chapter', None, OptionRecommendation.HIGH),
+        ('page_breaks_brefore', None, OptionRecommendation.HIGH),
+        ('use_auto_toc', False, OptionRecommendation.HIGH),
+        ('page_breaks_before', None, OptionRecommendation.HIGH),
+        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
+        ('linearize_tables', False, OptionRecommendation.HIGH),
+        }
+
+    def get_comics_from_collection(self, stream):
+        from calibre.libunzip import extract as zipextract
+        tdir = PersistentTemporaryDirectory('_comic_collection')
+        zipextract(stream, tdir)
+        comics = []
+        with CurrentDir(tdir):
+            if not os.path.exists('comics.txt'):
+                raise ValueError((
+                    '%s is not a valid comic collection'
+                    ' no comics.txt was found in the file')
+                        %stream.name)
+            with open('comics.txt', 'rb') as f:
+                raw = f.read()
+            if raw.startswith(codecs.BOM_UTF16_BE):
+                raw = raw.decode('utf-16-be')[1:]
+            elif raw.startswith(codecs.BOM_UTF16_LE):
+                raw = raw.decode('utf-16-le')[1:]
+            elif raw.startswith(codecs.BOM_UTF8):
+                raw = raw.decode('utf-8')[1:]
+            else:
+                raw = raw.decode('utf-8')
+            for line in raw.splitlines():
+                line = line.strip()
+                if not line:
+                    continue
+                fname, title = line.partition(':')[0], line.partition(':')[-1]
+                fname = fname.replace('#', '_')
+                fname = os.path.join(tdir, *fname.split('/'))
+                if not title:
+                    title = os.path.basename(fname).rpartition('.')[0]
+                if os.access(fname, os.R_OK):
+                    comics.append([title, fname])
+        if not comics:
+            raise ValueError('%s has no comics'%stream.name)
+        return comics
+
+    def get_pages(self, comic, tdir2):
+        from calibre.ebooks.comic.input import (extract_comic,  process_pages,
+                find_pages)
+        tdir  = extract_comic(comic)
+        new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
+                verbose=self.opts.verbose)
+        thumbnail = None
+        if not new_pages:
+            raise ValueError('Could not find any pages in the comic: %s'
+                    %comic)
+        if self.opts.no_process:
+            n2 = []
+            for i, page in enumerate(new_pages):
+                n2.append(os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page))))
+                shutil.copyfile(page, n2[-1])
+            new_pages = n2
+        else:
+            new_pages, failures = process_pages(new_pages, self.opts,
+                    self.report_progress, tdir2)
+            if failures:
+                self.log.warning('Could not process the following pages '
+                '(run with --verbose to see why):')
+                for f in failures:
+                    self.log.warning('\t', f)
+            if not new_pages:
+                raise ValueError('Could not find any valid pages in comic: %s'
+                        % comic)
+            thumbnail = os.path.join(tdir2,
+                    'thumbnail.'+self.opts.output_format.lower())
+            if not os.access(thumbnail, os.R_OK):
+                thumbnail = None
+        return new_pages
+
+    def get_images(self):
+        return self._images
+
+    def convert(self, stream, opts, file_ext, log, accelerators):
+        from calibre.ebooks.metadata import MetaInformation
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.metadata.toc import TOC
+
+        self.opts, self.log= opts, log
+        if file_ext == 'cbc':
+            comics_ = self.get_comics_from_collection(stream)
+        else:
+            comics_ = [['Comic', os.path.abspath(stream.name)]]
+        stream.close()
+        comics = []
+        for i, x in enumerate(comics_):
+            title, fname = x
+            cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
+            cdir = os.path.abspath(cdir)
+            if not os.path.exists(cdir):
+                os.makedirs(cdir)
+            pages = self.get_pages(fname, cdir)
+            if not pages:
+                continue
+            if self.for_viewer:
+                comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
+            else:
+                wrappers = self.create_wrappers(pages)
+                comics.append((title, pages, wrappers))
+
+        if not comics:
+            raise ValueError('No comic pages found in %s'%stream.name)
+
+        mi  = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
+            [_('Unknown')])
+        opf = OPFCreator(getcwd(), mi)
+        entries = []
+
+        def href(x):
+            if len(comics) == 1:
+                return os.path.basename(x)
+            return '/'.join(x.split(os.sep)[-2:])
+
+        cover_href = None
+        for comic in comics:
+            pages, wrappers = comic[1:]
+            page_entries = [(x, None) for x in map(href, pages)]
+            entries += [(w, None) for w in map(href, wrappers)] + page_entries
+            if cover_href is None and page_entries:
+                cover_href = page_entries[0][0]
+        opf.create_manifest(entries)
+        spine = []
+        for comic in comics:
+            spine.extend(map(href, comic[2]))
+        self._images = []
+        for comic in comics:
+            self._images.extend(comic[1])
+        opf.create_spine(spine)
+        if self.for_viewer and cover_href:
+            opf.guide.set_cover(cover_href)
+        toc = TOC()
+        if len(comics) == 1:
+            wrappers = comics[0][2]
+            for i, x in enumerate(wrappers):
+                toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
+                        play_order=i)
+        else:
+            po = 0
+            for comic in comics:
+                po += 1
+                wrappers = comic[2]
+                stoc = toc.add_item(href(wrappers[0]),
+                        None, comic[0], play_order=po)
+                if not opts.dont_add_comic_pages_to_toc:
+                    for i, x in enumerate(wrappers):
+                        stoc.add_item(href(x), None,
+                                _('Page')+' %d'%(i+1), play_order=po)
+                        po += 1
+        opf.set_toc(toc)
+        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
+            opf.render(m, n, 'toc.ncx')
+        return os.path.abspath('metadata.opf')
+
+    def create_wrappers(self, pages):
+        from calibre.ebooks.oeb.base import XHTML_NS
+        wrappers = []
+        WRAPPER = textwrap.dedent('''\
+        <html xmlns="%s">
+            <head>
+                <meta charset="utf-8"/>
+                <title>Page #%d</title>
+                <style type="text/css">
+                    @page { margin:0pt; padding: 0pt}
+                    body { margin: 0pt; padding: 0pt}
+                    div { text-align: center }
+                </style>
+            </head>
+            <body>
+                <div>
+                    <img src="%s" alt="comic page #%d" />
+                </div>
+            </body>
+        </html>
+        ''')
+        dir = os.path.dirname(pages[0])
+        for i, page in enumerate(pages):
+            wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
+            page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
+            with open(page, 'wb') as f:
+                f.write(wrapper.encode('utf-8'))
+            wrappers.append(page)
+        return wrappers
+
+    def create_viewer_wrapper(self, pages):
+        from calibre.ebooks.oeb.base import XHTML_NS
+
+        def page(src):
+            return '<img src="{}"></img>'.format(os.path.basename(src))
+
+        pages = '\n'.join(map(page, pages))
+        base = os.path.dirname(pages[0])
+        wrapper = '''
+        <html xmlns="%s">
+            <head>
+                <meta charset="utf-8"/>
+                <style type="text/css">
+                html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
+                img {
+                    width: 100%%; height: 100%%;
+                    object-fit: contain;
+                    margin-left: auto; margin-right: auto;
+                    max-width: 100vw; max-height: 100vh;
+                    top: 50vh; transform: translateY(-50%%);
+                    position: relative;
+                    page-break-after: always;
+                }
+                </style>
+            </head>
+            <body>
+            %s
+            </body>
+        </html>
+        ''' % (XHTML_NS, pages)
+        path = os.path.join(base, 'wrapper.xhtml')
+        with open(path, 'wb') as f:
+            f.write(wrapper.encode('utf-8'))
+        return path
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
+__docformat__ = 'restructuredtext en'
+
+import os
+from io import BytesIO
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class DJVUInput(InputFormatPlugin):
+
+    name        = 'DJVU Input'
+    author      = 'Anthon van der Neut'
+    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
+    file_types  = {'djvu', 'djv'}
+    commit_name = 'djvu_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.txt.processor import convert_basic
+
+        stdout = BytesIO()
+        from calibre.ebooks.djvu.djvu import DJVUFile
+        x = DJVUFile(stream)
+        x.get_text(stdout)
+        raw_text = stdout.getvalue()
+        if not raw_text:
+            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
+                    ' calibre only supports conversion of DJVU files with actual text in them.')
+
+        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
+            b'\037', b'\n\n'))
+        # Run the HTMLized text through the html processing plugin.
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(options, opt.option.name, opt.recommended_value)
+        options.input_encoding = 'utf-8'
+        base = getcwd()
+        htmlfile = os.path.join(base, 'index.html')
+        c = 0
+        while os.path.exists(htmlfile):
+            c += 1
+            htmlfile = os.path.join(base, 'index%d.html'%c)
+        with open(htmlfile, 'wb') as f:
+            f.write(html.encode('utf-8'))
+        odi = options.debug_pipeline
+        options.debug_pipeline = None
+        # Generate oeb from html conversion.
+        with open(htmlfile, 'rb') as f:
+            oeb = html_input.convert(f, options, 'html', log,
+                {})
+        options.debug_pipeline = odi
+        os.remove(htmlfile)
+
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+
+        return oeb
@@ -0,0 +1,34 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+
+
+class DOCXInput(InputFormatPlugin):
+    name        = 'DOCX Input'
+    author      = 'Kovid Goyal'
+    description = _('Convert DOCX files (.docx and .docm) to HTML')
+    file_types  = {'docx', 'docm'}
+    commit_name = 'docx_input'
+
+    options = {
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
+                   'it will be removed from the document and used as the cover for created e-book. This option '
+                   'turns off that behavior.')),
+        OptionRecommendation(name='docx_no_pagebreaks_between_notes', recommended_value=False,
+            help=_('Do not insert a page break after every endnote.')),
+        OptionRecommendation(name='docx_inline_subsup', recommended_value=False,
+            help=_('Render superscripts and subscripts so that they do not affect the line height.')),
+    }
+
+    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.docx.to_html import Convert
+        return Convert(stream, detect_cover=not options.docx_no_cover, log=log, notes_nopb=options.docx_no_pagebreaks_between_notes,
+                       nosupsub=options.docx_inline_subsup)()
@@ -0,0 +1,93 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+
+# Names of the page sizes offered as choices for the docx_page_size option and
+# exposed through DOCXOutput.ui_data
+PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
+              'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
+
+
+class DOCXOutput(OutputFormatPlugin):
+
+    name = 'DOCX Output'
+    author = 'Kovid Goyal'
+    file_type = 'docx'
+    commit_name = 'docx_output'
+    ui_data = {'page_sizes': PAGE_SIZES}
+
+    options = {
+        OptionRecommendation(name='docx_page_size', recommended_value='letter',
+            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
+            help=_('The size of the page. Default is letter. Choices '
+            'are %s') % PAGE_SIZES),
+
+        OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
+            help=_('Custom size of the document. Use the form widthxheight '
+            'EG. `123x321` to specify the width and height (in pts). '
+            'This overrides any specified page-size.')),
+
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Do not insert the book cover as an image at the start of the document.'
+                   ' If you use this option, the book cover will be discarded.')),
+
+        OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
+            help=_('Preserve the aspect ratio of the cover image instead of stretching'
+                   ' it out to cover the entire page.')),
+
+        OptionRecommendation(name='docx_no_toc', recommended_value=False,
+            help=_('Do not insert the table of contents as a page at the start of the document.')),
+
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'DOCX'),
+
+        OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the left page margin, in pts. Default is 72pt.'
+                   ' Overrides the common left page margin setting.')
+        ),
+
+        OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the top page margin, in pts. Default is 72pt.'
+                   ' Overrides the common top page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the right page margin, in pts. Default is 72pt.'
+                   ' Overrides the common right page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
+                   ' Overrides the common bottom page margin setting, unless set to zero.')
+        ),
+
+    }
+
+    def convert_metadata(self, oeb):
+        from lxml import etree
+        from calibre.ebooks.oeb.base import OPF, OPF2_NS
+        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
+        from io import BytesIO
+        package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
+        oeb.metadata.to_opf2(package)
+        self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata()
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        from calibre.ebooks.docx.writer.container import DOCX
+        from calibre.ebooks.docx.writer.from_html import Convert
+        docx = DOCX(opts, log)
+        self.convert_metadata(oeb)
+        Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)()
+        docx.write(output_path, self.mi)
+        if opts.extract_to:
+            from calibre.ebooks.docx.dump import do_dump
+            do_dump(output_path, opts.extract_to)
@@ -0,0 +1,438 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re, posixpath
+from itertools import cycle
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import getcwd
+
+ADOBE_OBFUSCATION =  'http://ns.adobe.com/pdf/enc#RC'
+IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
+
+
+def decrypt_font_data(key, data, algorithm):
+    is_adobe = algorithm == ADOBE_OBFUSCATION
+    crypt_len = 1024 if is_adobe else 1040
+    crypt = bytearray(data[:crypt_len])
+    key = cycle(iter(bytearray(key)))
+    decrypt = bytes(bytearray(x^next(key) for x in crypt))
+    return decrypt + data[crypt_len:]
+
+
+def decrypt_font(key, path, algorithm):
+    with lopen(path, 'r+b') as f:
+        data = decrypt_font_data(key, f.read(), algorithm)
+        f.seek(0), f.truncate(), f.write(data)
+
+
+class EPUBInput(InputFormatPlugin):
+
+    # Plugin metadata
+    name        = 'EPUB Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert EPUB files (.epub) to HTML'
+    file_types  = {'epub'}
+    output_encoding = None
+    commit_name = 'epub_input'
+
+    # Recommend '/' as the value of page_breaks_before, at MED level
+    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
+
+    def process_encryption(self, encfile, opf, log):
+        from lxml import etree
+        import uuid, hashlib
+        idpf_key = opf.raw_unique_identifier
+        if idpf_key:
+            idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
+            idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
+        key = None
+        for item in opf.identifier_iter():
+            scheme = None
+            for xkey in item.attrib.keys():
+                if xkey.endswith('scheme'):
+                    scheme = item.get(xkey)
+            if (scheme and scheme.lower() == 'uuid') or \
+                    (item.text and item.text.startswith('urn:uuid:')):
+                try:
+                    key = item.text.rpartition(':')[-1]
+                    key = uuid.UUID(key).bytes
+                except:
+                    import traceback
+                    traceback.print_exc()
+                    key = None
+
+        try:
+            root = etree.parse(encfile)
+            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
+                algorithm = em.get('Algorithm', '')
+                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
+                    return False
+                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
+                uri = cr.get('URI')
+                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
+                tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
+                if (tkey and os.path.exists(path)):
+                    self._encrypted_font_uris.append(uri)
+                    decrypt_font(tkey, path, algorithm)
+            return True
+        except:
+            import traceback
+            traceback.print_exc()
+        return False
+
+    def set_guide_type(self, opf, gtype, href=None, title=''):
+        # Set the specified guide entry
+        for elem in list(opf.iterguide()):
+            if elem.get('type', '').lower() == gtype:
+                elem.getparent().remove(elem)
+
+        if href is not None:
+            t = opf.create_guide_item(gtype, title, href)
+            for guide in opf.root.xpath('./*[local-name()="guide"]'):
+                guide.append(t)
+                return
+            guide = opf.create_guide_element()
+            opf.root.append(guide)
+            guide.append(t)
+            return t
+
+    def rationalize_cover3(self, opf, log):
+        ''' If there is a reference to the cover/titlepage via manifest properties, convert to
+        entries in the <guide> so that the rest of the pipeline picks it up. '''
+        from calibre.ebooks.metadata.opf3 import items_with_property
+        removed = guide_titlepage_href = guide_titlepage_id = None
+
+        # Look for titlepages incorrectly marked in the <guide> as covers
+        guide_cover, guide_elem = None, None
+        for guide_elem in opf.iterguide():
+            if guide_elem.get('type', '').lower() == 'cover':
+                guide_cover = guide_elem.get('href', '').partition('#')[0]
+                break
+        if guide_cover:
+            spine = list(opf.iterspine())
+            if spine:
+                idref = spine[0].get('idref', '')
+                for x in opf.itermanifest():
+                    if x.get('id') == idref and x.get('href') == guide_cover:
+                        guide_titlepage_href = guide_cover
+                        guide_titlepage_id = idref
+                        break
+
+        raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
+        if raster_cover_href:
+            self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
+        titlepage_id = titlepage_href = None
+        for item in items_with_property(opf.root, 'calibre:title-page'):
+            tid, href = item.get('id'), item.get('href')
+            if href and tid:
+                titlepage_id, titlepage_href = tid, href.partition('#')[0]
+                break
+        if titlepage_href is None:
+            titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
+        if titlepage_href is not None:
+            self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
+            spine = list(opf.iterspine())
+            if len(spine) > 1:
+                for item in spine:
+                    if item.get('idref') == titlepage_id:
+                        log('Found HTML cover', titlepage_href)
+                        if self.for_viewer:
+                            item.attrib.pop('linear', None)
+                        else:
+                            item.getparent().remove(item)
+                            removed = titlepage_href
+                        return removed
+
+    def rationalize_cover2(self, opf, log):
+        ''' Ensure that the cover information in the guide is correct. That
+        means, at most one entry with type="cover" that points to a raster
+        cover and at most one entry with type="titlepage" that points to an
+        HTML titlepage.
+
+        Returns the href of the cover page if it was removed from the spine,
+        None otherwise. '''
+        from calibre.ebooks.oeb.base import OPF
+        removed = None
+        from lxml import etree
+        # Find the first guide reference of type cover, without its fragment
+        guide_cover, guide_elem = None, None
+        for guide_elem in opf.iterguide():
+            if guide_elem.get('type', '').lower() == 'cover':
+                guide_cover = guide_elem.get('href', '').partition('#')[0]
+                break
+        if not guide_cover:
+            # No usable cover reference in the guide. Add one for the raster
+            # cover, if the OPF has one, and stop.
+            raster_cover = opf.raster_cover
+            if raster_cover:
+                if guide_elem is None:
+                    # The loop above never ran, so there is no existing guide
+                    # reference whose parent could be reused
+                    g = opf.root.makeelement(OPF('guide'))
+                    opf.root.append(g)
+                else:
+                    g = guide_elem.getparent()
+                guide_cover = raster_cover
+                guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
+                g.append(guide_elem)
+            return
+        spine = list(opf.iterspine())
+        if not spine:
+            return
+        # Check if the cover specified in the guide is also
+        # the first element in spine
+        idref = spine[0].get('idref', '')
+        manifest = list(opf.itermanifest())
+        if not manifest:
+            return
+        elem = [x for x in manifest if x.get('id', '') == idref]
+        if not elem or elem[0].get('href', None) != guide_cover:
+            return
+        log('Found HTML cover', guide_cover)
+
+        # Remove from spine as covers must be treated
+        # specially
+        if not self.for_viewer:
+            if len(spine) == 1:
+                # Removing the only spine item would leave the spine empty,
+                # so drop the cover marking from the guide instead
+                log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
+                for guide_elem in tuple(opf.iterguide()):
+                    if guide_elem.get('type', '').lower() == 'cover':
+                        guide_elem.getparent().remove(guide_elem)
+                return
+            else:
+                spine[0].getparent().remove(spine[0])
+                removed = guide_cover
+        else:
+            # Ensure the cover is displayed as the first item in the book, some
+            # epub files have it set with linear='no' which causes the cover to
+            # display in the end
+            spine[0].attrib.pop('linear', None)
+            opf.spine[0].is_linear = True
+        # Ensure that the guide has a cover entry pointing to a raster cover
+        # and a titlepage entry pointing to the html titlepage. The titlepage
+        # entry will be used by the epub output plugin, the raster cover entry
+        # by other output plugins.
+
+        # Search for a raster cover identified in the OPF
+        raster_cover = opf.raster_cover
+
+        # Set the cover guide entry
+        if raster_cover is not None:
+            guide_elem.set('href', raster_cover)
+        else:
+            # Render the titlepage to create a raster cover
+            from calibre.ebooks import render_html_svg_workaround
+            guide_elem.set('href', 'calibre_raster_cover.jpg')
+            # Add a manifest item for the rendered cover next to the item of
+            # the HTML cover
+            t = etree.SubElement(
+                elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
+            t.set('media-type', 'image/jpeg')
+            if os.path.exists(guide_cover):
+                renderer = render_html_svg_workaround(guide_cover, log)
+                if renderer is not None:
+                    # What the renderer returns is written out as the image file
+                    with lopen('calibre_raster_cover.jpg', 'wb') as f:
+                        f.write(renderer)
+
+        # Set the titlepage guide entry
+        self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
+        return removed
+
+    def find_opf(self):
+        from calibre.utils.xml_parse import safe_xml_fromstring
+
+        def attr(n, attr):
+            for k, v in n.attrib.items():
+                if k.endswith(attr):
+                    return v
+        try:
+            with lopen('META-INF/container.xml', 'rb') as f:
+                root = safe_xml_fromstring(f.read())
+                for r in root.xpath('//*[local-name()="rootfile"]'):
+                    if attr(r, 'media-type') != "application/oebps-package+xml":
+                        continue
+                    path = attr(r, 'full-path')
+                    if not path:
+                        continue
+                    path = os.path.join(getcwd(), *path.split('/'))
+                    if os.path.exists(path):
+                        return path
+        except Exception:
+            import traceback
+            traceback.print_exc()
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.utils.zipfile import ZipFile
+        from calibre import walk
+        from calibre.ebooks import DRMError
+        from calibre.ebooks.metadata.opf2 import OPF
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(getcwd())
+        except:
+            log.exception('EPUB appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream)
+        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
+        opf = self.find_opf()
+        if opf is None:
+            for f in walk('.'):
+                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
+                        not os.path.basename(f).startswith('.'):
+                    opf = os.path.abspath(f)
+                    break
+        path = getattr(stream, 'name', 'stream')
+
+        if opf is None:
+            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)
+
+        opf = os.path.relpath(opf, getcwd())
+        parts = os.path.split(opf)
+        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
+
+        self._encrypted_font_uris = []
+        if os.path.exists(encfile):
+            if not self.process_encryption(encfile, opf, log):
+                raise DRMError(os.path.basename(path))
+        self.encrypted_fonts = self._encrypted_font_uris
+
+        if len(parts) > 1 and parts[0]:
+            delta = '/'.join(parts[:-1])+'/'
+
+            def normpath(x):
+                return posixpath.normpath(delta + elem.get('href'))
+
+            for elem in opf.itermanifest():
+                elem.set('href', normpath(elem.get('href')))
+            for elem in opf.iterguide():
+                elem.set('href', normpath(elem.get('href')))
+
+        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
+        self.removed_cover = f(opf, log)
+        if self.removed_cover:
+            self.removed_items_to_ignore = (self.removed_cover,)
+        epub3_nav = opf.epub3_nav
+        if epub3_nav is not None:
+            self.convert_epub3_nav(epub3_nav, opf, log, options)
+
+        for x in opf.itermanifest():
+            if x.get('media-type', '') == 'application/x-dtbook+xml':
+                raise ValueError(
+                    'EPUB files with DTBook markup are not supported')
+
+        not_for_spine = set()
+        for y in opf.itermanifest():
+            id_ = y.get('id', None)
+            if id_:
+                mt = y.get('media-type', None)
+                if mt in {
+                        'application/vnd.adobe-page-template+xml',
+                        'application/vnd.adobe.page-template+xml',
+                        'application/adobe-page-template+xml',
+                        'application/adobe.page-template+xml',
+                        'application/text'
+                }:
+                    not_for_spine.add(id_)
+                ext = y.get('href', '').rpartition('.')[-1].lower()
+                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
+                    # some epub authoring software sets font mime types to
+                    # text/plain
+                    not_for_spine.add(id_)
+                    y.set('media-type', 'application/font')
+
+        seen = set()
+        for x in list(opf.iterspine()):
+            ref = x.get('idref', None)
+            if not ref or ref in not_for_spine or ref in seen:
+                x.getparent().remove(x)
+                continue
+            seen.add(ref)
+
+        if len(list(opf.iterspine())) == 0:
+            raise ValueError('No valid entries in the spine of this EPUB')
+
+        with lopen('content.opf', 'wb') as nopf:
+            nopf.write(opf.render())
+
+        return os.path.abspath('content.opf')
+
+    def convert_epub3_nav(self, nav_path, opf, log, opts):
+        '''
+        Build an NCX table of contents from an EPUB 3 navigation document.
+
+        The ``toc`` nav of the document at ``nav_path`` is turned into NCX
+        navPoints, the NCX is written to a new file next to the navigation
+        document, added to the manifest of ``opf`` and set as the ``toc`` of
+        every spine element. If no ``toc`` nav containing an ``<ol>`` is found,
+        the method returns without changing anything.
+
+        :param nav_path: Path to the navigation document
+        :param opf: The OPF object to register the generated NCX in
+        :param log: Logger, passed to the HTML parser
+        :param opts: Options object, ``epub3_nav_href`` and
+                     ``epub3_nav_parsed`` are set on it
+        '''
+        from lxml import etree
+        from calibre.ebooks.chardet import xml_to_unicode
+        from calibre.ebooks.oeb.polish.parsing import parse
+        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
+        from calibre.ebooks.oeb.polish.toc import first_child
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from tempfile import NamedTemporaryFile
+        with lopen(nav_path, 'rb') as f:
+            raw = f.read()
+        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
+        root = parse(raw, log=log)
+        # Skeleton NCX, the navPoints are appended to its <navMap>
+        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
+        navmap = ncx[0]
+        et = '{%s}type' % EPUB_NS
+        bn = os.path.basename(nav_path)
+
+        def add_from_li(li, parent):
+            # Create a navPoint under parent from the first <a> or <span> child
+            # of li (the loop below always stops after one iteration)
+            href = text = None
+            for x in li.iterchildren(XHTML('a'), XHTML('span')):
+                # Use the text content, falling back to any title attributes
+                text = etree.tostring(
+                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
+                            x.xpath('descendant-or-self::*/@title')).strip()
+                href = x.get('href')
+                if href:
+                    if href.startswith('#'):
+                        # A bare fragment refers to the navigation document
+                        # itself
+                        href = bn + href
+                break
+            np = parent.makeelement(NCX('navPoint'))
+            parent.append(np)
+            np.append(np.makeelement(NCX('navLabel')))
+            np[0].append(np.makeelement(NCX('text')))
+            np[0][0].text = text
+            if href:
+                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
+            return np
+
+        def process_nav_node(node, toc_parent):
+            # Recursively convert the <li> children of node, descending into
+            # the first nested <ol> of each
+            for li in node.iterchildren(XHTML('li')):
+                child = add_from_li(li, toc_parent)
+                ol = first_child(li, XHTML('ol'))
+                if child is not None and ol is not None:
+                    process_nav_node(ol, child)
+
+        for nav in root.iterdescendants(XHTML('nav')):
+            if nav.get(et) == 'toc':
+                ol = first_child(nav, XHTML('ol'))
+                if ol is not None:
+                    process_nav_node(ol, navmap)
+                    break
+        else:
+            # The loop ended without converting any toc nav
+            return
+
+        # delete=False as the file has to remain on disk after it is closed,
+        # its path is registered in the manifest below
+        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
+            f.write(etree.tostring(ncx, encoding='utf-8'))
+        ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
+        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
+        for spine in opf.root.xpath('//*[local-name()="spine"]'):
+            spine.set('toc', ncx_id)
+        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
+        opts.epub3_nav_parsed = root
+        if getattr(self, 'removed_cover', None):
+            # Mark links in the navigation document that point to the removed
+            # cover and save the document again if any were found
+            changed = False
+            base_path = os.path.dirname(nav_path)
+            for elem in root.xpath('//*[@href]'):
+                href, frag = elem.get('href').partition('#')[::2]
+                # NOTE(review): the link is made relative to base_path, not to
+                # the current directory -- confirm this matches the form in
+                # which removed_cover is stored
+                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
+                abs_href = urlnormalize(link_path)
+                if abs_href == self.removed_cover:
+                    changed = True
+                    elem.set('data-calibre-removed-titlepage', '1')
+            if changed:
+                with lopen(nav_path, 'wb') as f:
+                    f.write(serialize(root, 'application/xhtml+xml'))
+
+    def postprocess_book(self, oeb, opts, log):
+        rc = getattr(self, 'removed_cover', None)
+        if rc:
+            cover_toc_item = None
+            for item in oeb.toc.iterdescendants():
+                if item.href and item.href.partition('#')[0] == rc:
+                    cover_toc_item = item
+                    break
+            spine = {x.href for x in oeb.spine}
+            if (cover_toc_item is not None and cover_toc_item not in spine):
+                oeb.toc.item_that_refers_to_cover = cover_toc_item
@@ -0,0 +1,548 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, shutil, re
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from calibre.ptempfile import TemporaryDirectory
+from calibre import CurrentDir
+from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
+
+block_level_tags = (
+      'address',
+      'body',
+      'blockquote',
+      'center',
+      'dir',
+      'div',
+      'dl',
+      'fieldset',
+      'form',
+      'h1',
+      'h2',
+      'h3',
+      'h4',
+      'h5',
+      'h6',
+      'hr',
+      'isindex',
+      'menu',
+      'noframes',
+      'noscript',
+      'ol',
+      'p',
+      'pre',
+      'table',
+      'ul',
+)
+
+
+class EPUBOutput(OutputFormatPlugin):
+
+    # Output plugin that writes a converted book as an EPUB file, see
+    # convert() for the steps that are performed
+
+    name = 'EPUB Output'
+    author = 'Kovid Goyal'
+    file_type = 'epub'
+    commit_name = 'epub_output'
+    # The EPUB versions that can be generated, also used as the choices of
+    # the epub_version option below
+    ui_data = {'versions': ('2', '3')}
+
+    # Options specific to this plugin
+    options = {
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'EPUB'),
+
+        OptionRecommendation(name='dont_split_on_page_breaks',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Turn off splitting at page breaks. Normally, input '
+                    'files are automatically split at every page break into '
+                    'two files. This gives an output e-book that can be '
+                    'parsed faster and with less resources. However, '
+                    'splitting is slow and if your source file contains a '
+                    'very large number of page breaks, you should turn off '
+                    'splitting on page breaks.'
+                )
+        ),
+
+        # The value is in KB, convert() multiplies it by 1024
+        OptionRecommendation(name='flow_size', recommended_value=260,
+            help=_('Split all HTML files larger than this size (in KB). '
+                'This is necessary as most EPUB readers cannot handle large '
+                'file sizes. The default of %defaultKB is the size required '
+                'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
+        ),
+
+        OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
+            help=_('Normally, if the input file has no cover and you don\'t'
+            ' specify one, a default cover is generated with the title, '
+            'authors, etc. This option disables the generation of this cover.')
+        ),
+
+        OptionRecommendation(name='no_svg_cover', recommended_value=False,
+            help=_('Do not use SVG for the book cover. Use this option if '
+                'your EPUB is going to be used on a device that does not '
+                'support SVG, like the iPhone or the JetBook Lite. '
+                'Without this option, such devices will display the cover '
+                'as a blank page.')
+        ),
+
+        OptionRecommendation(name='preserve_cover_aspect_ratio',
+            recommended_value=False, help=_(
+            'When using an SVG cover, this option will cause the cover to scale '
+            'to cover the available screen area, but still preserve its aspect ratio '
+            '(ratio of width to height). That means there may be white borders '
+            'at the sides or top and bottom of the image, but the image will '
+            'never be distorted. Without this option the image may be slightly '
+            'distorted, but there will be no borders.'
+            )
+        ),
+
+        OptionRecommendation(name='epub_flatten', recommended_value=False,
+            help=_('This option is needed only if you intend to use the EPUB'
+                ' with FBReaderJ. It will flatten the file system inside the'
+                ' EPUB, putting all files into the top level.')
+        ),
+
+        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
+            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
+        ),
+
+        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
+            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
+        ),
+
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+
+        OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
+            help=_('The version of the EPUB file to generate. EPUB 2 is the'
+                ' most widely compatible, only use EPUB 3 if you know you'
+                ' actually need it.')
+        ),
+
+        }
+
+    # Note that condense_ncx() only does anything when pretty_print is off
+    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
+
+    def workaround_webkit_quirks(self):  # {{{
+        from calibre.ebooks.oeb.base import XPath
+        for x in self.oeb.spine:
+            root = x.data
+            body = XPath('//h:body')(root)
+            if body:
+                body = body[0]
+
+            if not hasattr(body, 'xpath'):
+                continue
+
+            for pre in XPath('//h:pre')(body):
+                if not pre.text and len(pre) == 0:
+                    pre.tag = 'div'
+    # }}}
+
+    def upshift_markup(self):  # {{{
+        'Upgrade markup to comply with XHTML 1.1 where possible'
+        from calibre.ebooks.oeb.base import XPath, XML
+        for x in self.oeb.spine:
+            root = x.data
+            if (not root.get(XML('lang'))) and (root.get('lang')):
+                root.set(XML('lang'), root.get('lang'))
+            body = XPath('//h:body')(root)
+            if body:
+                body = body[0]
+
+            if not hasattr(body, 'xpath'):
+                continue
+            for u in XPath('//h:u')(root):
+                u.tag = 'span'
+
+            seen_ids, seen_names = set(), set()
+            for x in XPath('//*[@id or @name]')(root):
+                eid, name = x.get('id', None), x.get('name', None)
+                if eid:
+                    if eid in seen_ids:
+                        del x.attrib['id']
+                    else:
+                        seen_ids.add(eid)
+                if name:
+                    if name in seen_names:
+                        del x.attrib['name']
+                    else:
+                        seen_names.add(name)
+
+    # }}}
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        self.log, self.opts, self.oeb = log, opts, oeb
+
+        if self.opts.epub_inline_toc:
+            from calibre.ebooks.mobi.writer8.toc import TOCAdder
+            opts.mobi_toc_at_start = not opts.epub_toc_at_end
+            opts.mobi_passthrough = False
+            opts.no_inline_toc = False
+            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)
+
+        if self.opts.epub_flatten:
+            from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
+            FlatFilenames()(oeb, opts)
+        else:
+            from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
+            UniqueFilenames()(oeb, opts)
+
+        self.workaround_ade_quirks()
+        self.workaround_webkit_quirks()
+        self.upshift_markup()
+        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
+        RescaleImages(check_colorspaces=True)(oeb, opts)
+
+        from calibre.ebooks.oeb.transforms.split import Split
+        split = Split(not self.opts.dont_split_on_page_breaks,
+                max_flow_size=self.opts.flow_size*1024
+                )
+        split(self.oeb, self.opts)
+
+        from calibre.ebooks.oeb.transforms.cover import CoverManager
+        cm = CoverManager(
+                no_default_cover=self.opts.no_default_epub_cover,
+                no_svg_cover=self.opts.no_svg_cover,
+                preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
+        cm(self.oeb, self.opts, self.log)
+
+        self.workaround_sony_quirks()
+
+        if self.oeb.toc.count() == 0:
+            self.log.warn('This EPUB file has no Table of Contents. '
+                    'Creating a default TOC')
+            first = next(iter(self.oeb.spine))
+            self.oeb.toc.add(_('Start'), first.href)
+
+        from calibre.ebooks.oeb.base import OPF
+        identifiers = oeb.metadata['identifier']
+        uuid = None
+        for x in identifiers:
+            if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
+                uuid = unicode_type(x).split(':')[-1]
+                break
+        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
+
+        if uuid is None:
+            self.log.warn('No UUID identifier found')
+            from uuid import uuid4
+            uuid = unicode_type(uuid4())
+            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
+
+        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
+            # Apparently ADE requires this value to start with urn:uuid:
+            # for some absurd reason, or it will throw a hissy fit and refuse
+            # to use the obfuscated fonts.
+            for x in identifiers:
+                if unicode_type(x) == uuid:
+                    x.content = 'urn:uuid:'+uuid
+
+        with TemporaryDirectory('_epub_output') as tdir:
+            from calibre.customize.ui import plugin_for_output_format
+            metadata_xml = None
+            extra_entries = []
+            if self.is_periodical:
+                if self.opts.output_profile.epub_periodical_format == 'sony':
+                    from calibre.ebooks.epub.periodical import sony_metadata
+                    metadata_xml, atom_xml = sony_metadata(oeb)
+                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
+            oeb_output = plugin_for_output_format('oeb')
+            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
+            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
+            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
+                    if x.endswith('.ncx')][0])
+            if self.opts.epub_version == '3':
+                self.upgrade_to_epub3(tdir, opf)
+            encryption = None
+            if encrypted_fonts:
+                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
+
+            from calibre.ebooks.epub import initialize_container
+            with initialize_container(output_path, os.path.basename(opf),
+                    extra_entries=extra_entries) as epub:
+                epub.add_dir(tdir)
+                if encryption is not None:
+                    epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
+                if metadata_xml is not None:
+                    epub.writestr('META-INF/metadata.xml',
+                            metadata_xml.encode('utf-8'))
+            if opts.extract_to is not None:
+                from calibre.utils.zipfile import ZipFile
+                if os.path.exists(opts.extract_to):
+                    if os.path.isdir(opts.extract_to):
+                        shutil.rmtree(opts.extract_to)
+                    else:
+                        os.remove(opts.extract_to)
+                os.mkdir(opts.extract_to)
+                with ZipFile(output_path) as zf:
+                    zf.extractall(path=opts.extract_to)
+                self.log.info('EPUB extracted to', opts.extract_to)
+
+    def upgrade_to_epub3(self, tdir, opf):
+        self.log.info('Upgrading to EPUB 3...')
+        from calibre.ebooks.epub import simple_container_xml
+        from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
+        try:
+            os.mkdir(os.path.join(tdir, 'META-INF'))
+        except EnvironmentError:
+            pass
+        with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
+            f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
+        from calibre.ebooks.oeb.polish.container import EpubContainer
+        container = EpubContainer(tdir, self.log)
+        from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
+        existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
+        nav_href = getattr(self.opts, 'epub3_nav_href', None)
+        previous_nav = (nav_href, existing_nav) if existing_nav and nav_href else None
+        epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
+        fix_conversion_titlepage_links_in_nav(container)
+        container.commit()
+        os.remove(f.name)
+        try:
+            os.rmdir(os.path.join(tdir, 'META-INF'))
+        except EnvironmentError:
+            pass
+
+    def encrypt_fonts(self, uris, tdir, uuid):  # {{{
+        from polyglot.binary import from_hex_bytes
+
+        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
+        if len(key) < 16:
+            raise ValueError('UUID identifier %r is invalid'%uuid)
+        key = bytearray(from_hex_bytes((key + key)[:32]))
+        paths = []
+        with CurrentDir(tdir):
+            paths = [os.path.join(*x.split('/')) for x in uris]
+            uris = dict(zip(uris, paths))
+            fonts = []
+            for uri in list(uris.keys()):
+                path = uris[uri]
+                if not os.path.exists(path):
+                    uris.pop(uri)
+                    continue
+                self.log.debug('Encrypting font:', uri)
+                with lopen(path, 'r+b') as f:
+                    data = f.read(1024)
+                    if len(data) >= 1024:
+                        data = bytearray(data)
+                        f.seek(0)
+                        f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
+                    else:
+                        self.log.warn('Font', path, 'is invalid, ignoring')
+                if not isinstance(uri, unicode_type):
+                    uri = uri.decode('utf-8')
+                fonts.append('''
+                <enc:EncryptedData>
+                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
+                    <enc:CipherData>
+                    <enc:CipherReference URI="%s"/>
+                    </enc:CipherData>
+                </enc:EncryptedData>
+                '''%(uri.replace('"', '\\"')))
+            if fonts:
+                ans = '''<encryption
+                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
+                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
+                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
+                    '''
+                ans += '\n'.join(fonts)
+                ans += '\n</encryption>'
+                return ans
+    # }}}
+
+    def condense_ncx(self, ncx_path):  # {{{
+        from lxml import etree
+        if not self.opts.pretty_print:
+            tree = etree.parse(ncx_path)
+            for tag in tree.getroot().iter(tag=etree.Element):
+                if tag.text:
+                    tag.text = tag.text.strip()
+                if tag.tail:
+                    tag.tail = tag.tail.strip()
+            compressed = etree.tostring(tree.getroot(), encoding='utf-8')
+            with open(ncx_path, 'wb') as f:
+                f.write(compressed)
+    # }}}
+
+    def workaround_ade_quirks(self):  # {{{
+        '''
+        Perform various markup transforms to get the output to render correctly
+        in the quirky ADE.
+
+        The hrefs of the ToC, the parsed documents of all spine items and, if
+        there is one, the main stylesheet of the book are modified in place.
+        '''
+        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
+
+        stylesheet = self.oeb.manifest.main_stylesheet
+
+        # ADE cries big wet tears when it encounters an invalid fragment
+        # identifier in the NCX toc.
+        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
+        for node in self.oeb.toc.iter():
+            href = getattr(node, 'href', None)
+            if hasattr(href, 'partition'):
+                # Note that _ is used as a throwaway local variable here, so
+                # the translation function _() is not available in this method
+                base, _, frag = href.partition('#')
+                frag = urlunquote(frag)
+                if frag and frag_pat.match(frag) is None:
+                    self.log.warn(
+                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
+                    node.href = base
+
+        for x in self.oeb.spine:
+            root = x.data
+            body = XPath('//h:body')(root)
+            if body:
+                body = body[0]
+
+            # body is an element only if the document has a <body>
+            if hasattr(body, 'xpath'):
+                # remove <img> tags whose src is empty, is just '#' or starts
+                # with http:
+                bad = []
+                for x in XPath('//h:img')(body):
+                    src = x.get('src', '').strip()
+                    if src in ('', '#') or src.startswith('http:'):
+                        bad.append(x)
+                for img in bad:
+                    img.getparent().remove(img)
+
+                # Add id attribute to <a> tags that have name
+                for x in XPath('//h:a[@name]')(body):
+                    if not x.get('id', False):
+                        x.set('id', x.get('name'))
+                    # The delightful epubcheck has started complaining about <a> tags that
+                    # have name attributes.
+                    x.attrib.pop('name')
+
+                # Replace <br> that are children of <body> as ADE doesn't handle them
+                for br in XPath('./h:br')(body):
+                    if br.getparent() is None:
+                        continue
+                    try:
+                        prior = next(br.itersiblings(preceding=True))
+                        priortag = barename(prior.tag)
+                        priortext = prior.tail
+                    except:
+                        # No preceding sibling, the <br> is the first child of
+                        # <body>
+                        priortag = 'body'
+                        priortext = body.text
+                    if priortext:
+                        priortext = priortext.strip()
+                    br.tag = XHTML('p')
+                    br.text = '\u00a0'
+                    # Keep the existing style declarations and append to them
+                    style = br.get('style', '').split(';')
+                    style = list(filter(None, map(lambda x: x.strip(), style)))
+                    style.append('margin:0pt; border:0pt')
+                    # If the prior tag is a block (including a <br> we replaced)
+                    # then this <br> replacement should have a 1-line height.
+                    # Otherwise it should have no height.
+                    if not priortext and priortag in block_level_tags:
+                        style.append('height:1em')
+                    else:
+                        style.append('height:0pt')
+                    br.set('style', '; '.join(style))
+
+            # Remove <embed> and all <object> tags other than SVG ones
+            for tag in XPath('//h:embed')(root):
+                tag.getparent().remove(tag)
+            for tag in XPath('//h:object')(root):
+                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
+                    continue
+                tag.getparent().remove(tag)
+
+            # Remove <title> and <style> tags that have no text
+            for tag in XPath('//h:title|//h:style')(root):
+                if not tag.text:
+                    tag.getparent().remove(tag)
+            # Remove <script> tags that have neither text nor a src, except
+            # for MathJax configuration, and all <script> tags inside <body>
+            for tag in XPath('//h:script')(root):
+                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
+                    tag.getparent().remove(tag)
+            for tag in XPath('//h:body/descendant::h:script')(root):
+                tag.getparent().remove(tag)
+
+            # Forms that directly contain form controls are removed, all
+            # others are turned into <div>
+            formchildren = XPath('./h:input|./h:button|./h:textarea|'
+                    './h:label|./h:fieldset|./h:legend')
+            for tag in XPath('//h:form')(root):
+                if formchildren(tag):
+                    tag.getparent().remove(tag)
+                else:
+                    # Not a real form
+                    tag.tag = XHTML('div')
+
+            # Replace <center> with a centered <div>. Note that this
+            # overwrites any existing style attribute.
+            for tag in XPath('//h:center')(root):
+                tag.tag = XHTML('div')
+                tag.set('style', 'text-align:center')
+            # ADE can't handle &amp; in an img url
+            for tag in XPath('//h:img[@src]')(root):
+                tag.set('src', tag.get('src', '').replace('&', ''))
+
+            # ADE whimpers in fright when it encounters a <td> outside a
+            # <table>
+            in_table = XPath('ancestor::h:table')
+            for tag in XPath('//h:td|//h:tr|//h:th')(root):
+                if not in_table(tag):
+                    tag.tag = XHTML('div')
+
+            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
+            # Zero width spaces and soft hyphens are removed, non breaking
+            # hyphens are replaced by ordinary hyphens
+            special_chars = re.compile('[\u200b\u00ad]')
+            for elem in root.iterdescendants('*'):
+                if elem.text:
+                    elem.text = special_chars.sub('', elem.text)
+                    elem.text = elem.text.replace('\u2011', '-')
+                if elem.tail:
+                    elem.tail = special_chars.sub('', elem.tail)
+                    elem.tail = elem.tail.replace('\u2011', '-')
+
+            if stylesheet is not None:
+                # ADE doesn't render lists correctly if they have left margins
+                from css_parser.css import CSSRule
+                # Only rules whose complete selector is the class of the list
+                # are changed
+                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
+                    sel = '.'+lb.get('class')
+                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+                        if sel == rule.selectorList.selectorText:
+                            rule.style.removeProperty('margin-left')
+                            # padding-left breaks rendering in webkit and gecko
+                            rule.style.removeProperty('padding-left')
+                # Change whitespace:pre to pre-wrap to accommodate readers that
+                # cannot scroll horizontally
+                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+                    style = rule.style
+                    ws = style.getPropertyValue('white-space')
+                    if ws == 'pre':
+                        style.setProperty('white-space', 'pre-wrap')
+
+    # }}}
+
+    def workaround_sony_quirks(self):  # {{{
+        '''
+        Perform toc link transforms to alleviate slow loading.
+        '''
+        from calibre.ebooks.oeb.base import urldefrag, XPath
+        from calibre.ebooks.oeb.polish.toc import item_at_top
+
+        def frag_is_at_top(root, frag):
+            elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
+            if elem:
+                elem = elem[0]
+            else:
+                return False
+            return item_at_top(elem)
+
+        def simplify_toc_entry(toc):
+            if toc.href:
+                href, frag = urldefrag(toc.href)
+                if frag:
+                    for x in self.oeb.spine:
+                        if x.href == href:
+                            if frag_is_at_top(x.data, frag):
+                                self.log.debug('Removing anchor from TOC href:',
+                                        href+'#'+frag)
+                                toc.href = href
+                            break
+            for x in toc:
+                simplify_toc_entry(x)
+
+        if self.oeb.toc:
+            simplify_toc_entry(self.oeb.toc)
+
+    # }}}
@@ -0,0 +1,179 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
+"""
+Convert FB2 and FBZ files to HTML
+"""
+import os, re
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre import guess_type
+from polyglot.builtins import iteritems, getcwd
+
+FB2NS  = 'http://www.gribuser.ru/xml/fictionbook/2.0'
+FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
+
+
+class FB2Input(InputFormatPlugin):
+
+    name        = 'FB2 Input'
+    author      = 'Anatoly Shipitsin'
+    description = 'Convert FB2 and FBZ files to HTML'
+    file_types  = {'fb2', 'fbz'}
+    commit_name = 'fb2_input'
+
+    recommendations = {
+        ('level1_toc', '//h:h1', OptionRecommendation.MED),
+        ('level2_toc', '//h:h2', OptionRecommendation.MED),
+        ('level3_toc', '//h:h3', OptionRecommendation.MED),
+        }
+
+    options = {
+    OptionRecommendation(name='no_inline_fb2_toc',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Do not insert a Table of Contents at the beginning of the book.'
+                )
+        )}
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from lxml import etree
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.metadata.meta import get_metadata
+        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
+        from calibre.ebooks.chardet import xml_to_unicode
+        self.log = log
+        log.debug('Parsing XML...')
+        raw = get_fb2_data(stream)[0]
+        raw = raw.replace(b'\0', b'')
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            assume_utf8=True, resolve_entities=True)[0]
+        try:
+            doc = safe_xml_fromstring(raw)
+        except etree.XMLSyntaxError:
+            doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
+        if doc is None:
+            raise ValueError('The FB2 file is not valid XML')
+        doc = ensure_namespace(doc)
+        try:
+            fb_ns = doc.nsmap[doc.prefix]
+        except Exception:
+            fb_ns = FB2NS
+
+        NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
+        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
+        css = ''
+        for s in stylesheets:
+            css += etree.tostring(s, encoding='unicode', method='text',
+                    with_tail=False) + '\n\n'
+        if css:
+            import css_parser, logging
+            parser = css_parser.CSSParser(fetcher=None,
+                    log=logging.getLogger('calibre.css'))
+
+            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
+            text = XHTML_CSS_NAMESPACE + css
+            log.debug('Parsing stylesheet...')
+            stylesheet = parser.parseString(text)
+            stylesheet.namespaces['h'] = XHTML_NS
+            css = stylesheet.cssText
+            if isinstance(css, bytes):
+                css = css.decode('utf-8', 'replace')
+            css = css.replace('h|style', 'h|span')
+            css = re.sub(r'name\s*=\s*', 'class=', css)
+        self.extract_embedded_content(doc)
+        log.debug('Converting XML to HTML...')
+        with open(P('templates/fb2.xsl'), 'rb') as f:
+            ss = f.read().decode('utf-8')
+        ss = ss.replace("__FB_NS__", fb_ns)
+        if options.no_inline_fb2_toc:
+            log('Disabling generation of inline FB2 TOC')
+            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
+                    re.DOTALL).sub('', ss)
+
+        styledoc = safe_xml_fromstring(ss)
+
+        transform = etree.XSLT(styledoc)
+        result = transform(doc)
+
+        # Handle links of type note and cite
+        notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
+        cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
+        all_ids = {x for x in result.xpath('//*/@id')}
+        for cite, a in iteritems(cites):
+            note = notes.get(cite, None)
+            if note:
+                c = 1
+                while 'cite%d' % c in all_ids:
+                    c += 1
+                if not note.get('id', None):
+                    note.set('id', 'cite%d' % c)
+                    all_ids.add(note.get('id'))
+                a.set('href', '#%s' % note.get('id'))
+        for x in result.xpath('//*[@link_note or @link_cite]'):
+            x.attrib.pop('link_note', None)
+            x.attrib.pop('link_cite', None)
+
+        for img in result.xpath('//img[@src]'):
+            src = img.get('src')
+            img.set('src', self.binary_map.get(src, src))
+        index = transform.tostring(result)
+        with open('index.xhtml', 'wb') as f:
+            f.write(index.encode('utf-8'))
+        with open('inline-styles.css', 'wb') as f:
+            f.write(css.encode('utf-8'))
+        stream.seek(0)
+        mi = get_metadata(stream, 'fb2')
+        if not mi.title:
+            mi.title = _('Unknown')
+        if not mi.authors:
+            mi.authors = [_('Unknown')]
+        cpath = None
+        if mi.cover_data and mi.cover_data[1]:
+            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
+                f.write(mi.cover_data[1])
+            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
+        else:
+            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
+                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
+                if href is not None:
+                    if href.startswith('#'):
+                        href = href[1:]
+                    cpath = os.path.abspath(href)
+                    break
+
+        opf = OPFCreator(getcwd(), mi)
+        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
+        opf.create_manifest(entries)
+        opf.create_spine(['index.xhtml'])
+        if cpath:
+            opf.guide.set_cover(cpath)
+        with open('metadata.opf', 'wb') as f:
+            opf.render(f)
+        return os.path.join(getcwd(), 'metadata.opf')
+
+    def extract_embedded_content(self, doc):
+        from calibre.ebooks.fb2 import base64_decode
+        self.binary_map = {}
+        for elem in doc.xpath('./*'):
+            if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
+                ct = elem.get('content-type', '')
+                fname = elem.attrib['id']
+                ext = ct.rpartition('/')[-1].lower()
+                if ext in ('png', 'jpeg', 'jpg'):
+                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
+                            'png'}:
+                        fname += '.' + ext
+                    self.binary_map[elem.get('id')] = fname
+                raw = elem.text.strip()
+                try:
+                    data = base64_decode(raw)
+                except TypeError:
+                    self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
+                        elem.get('id')))
+                else:
+                    with open(fname, 'wb') as f:
+                        f.write(data)
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+
+
+class FB2Output(OutputFormatPlugin):
+
+    name = 'FB2 Output'
+    author = 'John Schember'
+    file_type = 'fb2'
+    commit_name = 'fb2_output'
+
+    FB2_GENRES = [
+        # Science Fiction & Fantasy
+        'sf_history',  # Alternative history
+        'sf_action',  # Action
+        'sf_epic',  # Epic
+        'sf_heroic',  # Heroic
+        'sf_detective',  # Detective
+        'sf_cyberpunk',  # Cyberpunk
+        'sf_space',  # Space
+        'sf_social',  # Social#philosophical
+        'sf_horror',  # Horror & mystic
+        'sf_humor',  # Humor
+        'sf_fantasy',  # Fantasy
+        'sf',  # Science Fiction
+        # Detectives & Thrillers
+        'det_classic',  # Classical detectives
+        'det_police',  # Police Stories
+        'det_action',  # Action
+        'det_irony',  # Ironical detectives
+        'det_history',  # Historical detectives
+        'det_espionage',  # Espionage detectives
+        'det_crime',  # Crime detectives
+        'det_political',  # Political detectives
+        'det_maniac',  # Maniacs
+        'det_hard',  # Hard#boiled
+        'thriller',  # Thrillers
+        'detective',  # Detectives
+        # Prose
+        'prose_classic',  # Classics prose
+        'prose_history',  # Historical prose
+        'prose_contemporary',  # Contemporary prose
+        'prose_counter',  # Counterculture
+        'prose_rus_classic',  # Russial classics prose
+        'prose_su_classics',  # Soviet classics prose
+        # Romance
+        'love_contemporary',  # Contemporary Romance
+        'love_history',  # Historical Romance
+        'love_detective',  # Detective Romance
+        'love_short',  # Short Romance
+        'love_erotica',  # Erotica
+        # Adventure
+        'adv_western',  # Western
+        'adv_history',  # History
+        'adv_indian',  # Indians
+        'adv_maritime',  # Maritime Fiction
+        'adv_geo',  # Travel & geography
+        'adv_animal',  # Nature & animals
+        'adventure',  # Other
+        # Children's
+        'child_tale',  # Fairy Tales
+        'child_verse',  # Verses
+        'child_prose',  # Prose
+        'child_sf',  # Science Fiction
+        'child_det',  # Detectives & Thrillers
+        'child_adv',  # Adventures
+        'child_education',  # Educational
+        'children',  # Other
+        # Poetry & Dramaturgy
+        'poetry',  # Poetry
+        'dramaturgy',  # Dramaturgy
+        # Antique literature
+        'antique_ant',  # Antique
+        'antique_european',  # European
+        'antique_russian',  # Old russian
+        'antique_east',  # Old east
+        'antique_myths',  # Myths. Legends. Epos
+        'antique',  # Other
+        # Scientific#educational
+        'sci_history',  # History
+        'sci_psychology',  # Psychology
+        'sci_culture',  # Cultural science
+        'sci_religion',  # Religious studies
+        'sci_philosophy',  # Philosophy
+        'sci_politics',  # Politics
+        'sci_business',  # Business literature
+        'sci_juris',  # Jurisprudence
+        'sci_linguistic',  # Linguistics
+        'sci_medicine',  # Medicine
+        'sci_phys',  # Physics
+        'sci_math',  # Mathematics
+        'sci_chem',  # Chemistry
+        'sci_biology',  # Biology
+        'sci_tech',  # Technical
+        'science',  # Other
+        # Computers & Internet
+        'comp_www',  # Internet
+        'comp_programming',  # Programming
+        'comp_hard',  # Hardware
+        'comp_soft',  # Software
+        'comp_db',  # Databases
+        'comp_osnet',  # OS & Networking
+        'computers',  # Other
+        # Reference
+        'ref_encyc',  # Encyclopedias
+        'ref_dict',  # Dictionaries
+        'ref_ref',  # Reference
+        'ref_guide',  # Guidebooks
+        'reference',  # Other
+        # Nonfiction
+        'nonf_biography',  # Biography & Memoirs
+        'nonf_publicism',  # Publicism
+        'nonf_criticism',  # Criticism
+        'design',  # Art & design
+        'nonfiction',  # Other
+        # Religion & Inspiration
+        'religion_rel',  # Religion
+        'religion_esoterics',  # Esoterics
+        'religion_self',  # Self#improvement
+        'religion',  # Other
+        # Humor
+        'humor_anecdote',  # Anecdote (funny stories)
+        'humor_prose',  # Prose
+        'humor_verse',  # Verses
+        'humor',  # Other
+        # Home & Family
+        'home_cooking',  # Cooking
+        'home_pets',  # Pets
+        'home_crafts',  # Hobbies & Crafts
+        'home_entertain',  # Entertaining
+        'home_health',  # Health
+        'home_garden',  # Garden
+        'home_diy',  # Do it yourself
+        'home_sport',  # Sports
+        'home_sex',  # Erotica & sex
+        'home',  # Other
+    ]
+    ui_data = {
+        'sectionize': {
+            'toc': _('Section per entry in the ToC'),
+            'files': _('Section per file'),
+            'nothing': _('A single section')
+        },
+        'genres': FB2_GENRES,
+    }
+
+    options = {
+        OptionRecommendation(name='sectionize',
+            recommended_value='files', level=OptionRecommendation.LOW,
+            choices=list(ui_data['sectionize']),
+            help=_('Specify how sections are created:\n'
+                ' * nothing: {nothing}\n'
+                ' * files: {files}\n'
+                ' * toc: {toc}\n'
+                'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
+                '(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
+        ),
+        OptionRecommendation(name='fb2_genre',
+            recommended_value='antique', level=OptionRecommendation.LOW,
+            choices=FB2_GENRES,
+            help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
+                ) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
+        from calibre.ebooks.fb2.fb2ml import FB2MLizer
+
+        try:
+            rasterizer = SVGRasterizer()
+            rasterizer(oeb_book, opts)
+        except Unavailable:
+            log.warn('SVG rasterizer unavailable, SVG will not be converted')
+
+        linearize_jacket(oeb_book)
+
+        fb2mlizer = FB2MLizer(log)
+        fb2_content = fb2mlizer.extract_content(oeb_book, opts)
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(fb2_content.encode('utf-8', 'replace'))
+
+        if close:
+            out_stream.close()
@@ -0,0 +1,316 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, tempfile, os
+from functools import partial
+
+from calibre.constants import islinux, isbsd
+from calibre.customize.conversion import (InputFormatPlugin,
+        OptionRecommendation)
+from calibre.utils.localization import get_lang
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.imghdr import what
+from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
+
+
+def sanitize_file_name(x):
+    ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
+    ans, ext = ans.rpartition('.')[::2]
+    return (ans.strip() + '.' + ext.strip()).rstrip('.')
+
+
+class HTMLInput(InputFormatPlugin):
+
+    name        = 'HTML Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert HTML and OPF files to an OEB'
+    file_types  = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
+    commit_name = 'html_input'
+
+    options = {
+        OptionRecommendation(name='breadth_first',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Traverse links in HTML files breadth first. Normally, '
+                    'they are traversed depth first.'
+                   )
+        ),
+
+        OptionRecommendation(name='max_levels',
+            recommended_value=5, level=OptionRecommendation.LOW,
+            help=_('Maximum levels of recursion when following links in '
+                   'HTML files. Must be non-negative. 0 implies that no '
+                   'links in the root HTML file are followed. Default is '
+                   '%default.'
+                   )
+        ),
+
+        OptionRecommendation(name='dont_package',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Normally this input plugin re-arranges all the input '
+                'files into a standard folder hierarchy. Only use this option '
+                'if you know what you are doing as it can result in various '
+                'nasty side effects in the rest of the conversion pipeline.'
+                )
+        ),
+
+    }
+
+    def convert(self, stream, opts, file_ext, log,
+                accelerators):
+        self._is_case_sensitive = None
+        basedir = getcwd()
+        self.opts = opts
+
+        fname = None
+        if hasattr(stream, 'name'):
+            basedir = os.path.dirname(stream.name)
+            fname = os.path.basename(stream.name)
+
+        if file_ext != 'opf':
+            if opts.dont_package:
+                raise ValueError('The --dont-package option is not supported for an HTML input file')
+            from calibre.ebooks.metadata.html import get_metadata
+            mi = get_metadata(stream)
+            if fname:
+                from calibre.ebooks.metadata.meta import metadata_from_filename
+                fmi = metadata_from_filename(fname)
+                fmi.smart_update(mi)
+                mi = fmi
+            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
+            return oeb
+
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        return create_oebbook(log, stream.name, opts,
+                encoding=opts.input_encoding)
+
+    def is_case_sensitive(self, path):
+        if getattr(self, '_is_case_sensitive', None) is not None:
+            return self._is_case_sensitive
+        if not path or not os.path.exists(path):
+            return islinux or isbsd
+        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
+        return self._is_case_sensitive
+
+    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
+        import uuid
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        from calibre.ebooks.oeb.base import (DirContainer,
+            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
+            xpath, urlquote)
+        from calibre import guess_type
+        from calibre.ebooks.oeb.transforms.metadata import \
+            meta_info_to_oeb_metadata
+        from calibre.ebooks.html.input import get_filelist
+        from calibre.ebooks.metadata import string_to_authors
+        from calibre.utils.localization import canonicalize_lang
+        import css_parser, logging
+        css_parser.log.setLevel(logging.WARN)
+        self.OEB_STYLES = OEB_STYLES
+        oeb = create_oebbook(log, None, opts, self,
+                encoding=opts.input_encoding, populate=False)
+        self.oeb = oeb
+
+        metadata = oeb.metadata
+        meta_info_to_oeb_metadata(mi, metadata, log)
+        if not metadata.language:
+            l = canonicalize_lang(getattr(opts, 'language', None))
+            if not l:
+                oeb.logger.warn('Language not specified')
+                l = get_lang().replace('_', '-')
+            metadata.add('language', l)
+        if not metadata.creator:
+            a = getattr(opts, 'authors', None)
+            if a:
+                a = string_to_authors(a)
+            if not a:
+                oeb.logger.warn('Creator not specified')
+                a = [self.oeb.translate(__('Unknown'))]
+            for aut in a:
+                metadata.add('creator', aut)
+        if not metadata.title:
+            oeb.logger.warn('Title not specified')
+            metadata.add('title', self.oeb.translate(__('Unknown')))
+        bookid = unicode_type(uuid.uuid4())
+        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
+        for ident in metadata.identifier:
+            if 'id' in ident.attrib:
+                self.oeb.uid = metadata.identifier[0]
+                break
+
+        filelist = get_filelist(htmlpath, basedir, opts, log)
+        filelist = [f for f in filelist if not f.is_binary]
+        htmlfile_map = {}
+        for f in filelist:
+            path = f.path
+            oeb.container = DirContainer(os.path.dirname(path), log,
+                    ignore_opf=True)
+            bname = os.path.basename(path)
+            id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
+            htmlfile_map[path] = href
+            item = oeb.manifest.add(id, href, 'text/html')
+            if path == htmlpath and '%' in path:
+                bname = urlquote(bname)
+            item.html_input_href = bname
+            oeb.spine.add(item, True)
+
+        self.added_resources = {}
+        self.log = log
+        self.log('Normalizing filename cases')
+        for path, href in htmlfile_map.items():
+            if not self.is_case_sensitive(path):
+                path = path.lower()
+            self.added_resources[path] = href
+        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
+        self.urldefrag = urldefrag
+        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
+
+        self.log('Rewriting HTML links')
+        for f in filelist:
+            path = f.path
+            dpath = os.path.dirname(path)
+            oeb.container = DirContainer(dpath, log, ignore_opf=True)
+            href = htmlfile_map[path]
+            try:
+                item = oeb.manifest.hrefs[href]
+            except KeyError:
+                item = oeb.manifest.hrefs[urlnormalize(href)]
+            rewrite_links(item.data, partial(self.resource_adder, base=dpath))
+
+        for item in oeb.manifest.values():
+            if item.media_type in self.OEB_STYLES:
+                dpath = None
+                for path, href in self.added_resources.items():
+                    if href == item.href:
+                        dpath = os.path.dirname(path)
+                        break
+                css_parser.replaceUrls(item.data,
+                        partial(self.resource_adder, base=dpath))
+
+        toc = self.oeb.toc
+        self.oeb.auto_generated_toc = True
+        titles = []
+        headers = []
+        for item in self.oeb.spine:
+            if not item.linear:
+                continue
+            html = item.data
+            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
+            title = re.sub(r'\s+', ' ', title.strip())
+            if title:
+                titles.append(title)
+            headers.append('(unlabled)')
+            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
+                expr = '/h:html/h:body//h:%s[position()=1]/text()'
+                header = ''.join(xpath(html, expr % tag))
+                header = re.sub(r'\s+', ' ', header.strip())
+                if header:
+                    headers[-1] = header
+                    break
+        use = titles
+        if len(titles) > len(set(titles)):
+            use = headers
+        for title, item in zip(use, self.oeb.spine):
+            if not item.linear:
+                continue
+            toc.add(title, item.href)
+
+        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
+        return oeb
+
+    def link_to_local_path(self, link_, base=None):
+        from calibre.ebooks.html.input import Link
+        if not isinstance(link_, unicode_type):
+            try:
+                link_ = link_.decode('utf-8', 'error')
+            except:
+                self.log.warn('Failed to decode link %r. Ignoring'%link_)
+                return None, None
+        try:
+            l = Link(link_, base if base else getcwd())
+        except:
+            self.log.exception('Failed to process link: %r'%link_)
+            return None, None
+        if l.path is None:
+            # Not a local resource
+            return None, None
+        link = l.path.replace('/', os.sep).strip()
+        frag = l.fragment
+        if not link:
+            return None, None
+        return link, frag
+
+    def resource_adder(self, link_, base=None):
+        from polyglot.urllib import quote
+        link, frag = self.link_to_local_path(link_, base=base)
+        if link is None:
+            return link_
+        try:
+            if base and not os.path.isabs(link):
+                link = os.path.join(base, link)
+            link = os.path.abspath(link)
+        except:
+            return link_
+        if not os.access(link, os.R_OK):
+            return link_
+        if os.path.isdir(link):
+            self.log.warn(link_, 'is a link to a directory. Ignoring.')
+            return link_
+        if not self.is_case_sensitive(tempfile.gettempdir()):
+            link = link.lower()
+        if link not in self.added_resources:
+            bhref = os.path.basename(link)
+            id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
+            guessed = self.guess_type(href)[0]
+            media_type = guessed or self.BINARY_MIME
+            if media_type == 'text/plain':
+                self.log.warn('Ignoring link to text file %r'%link_)
+                return None
+            if media_type == self.BINARY_MIME:
+                # Check for the common case, images
+                try:
+                    img = what(link)
+                except EnvironmentError:
+                    pass
+                else:
+                    if img:
+                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME
+
+            self.oeb.log.debug('Added', link)
+            self.oeb.container = self.DirContainer(os.path.dirname(link),
+                    self.oeb.log, ignore_opf=True)
+            # Load into memory
+            item = self.oeb.manifest.add(id, href, media_type)
+            # bhref refers to an already existing file. The read() method of
+            # DirContainer will call unquote on it before trying to read the
+            # file, therefore we quote it here.
+            if isinstance(bhref, unicode_type):
+                bhref = bhref.encode('utf-8')
+            item.html_input_href = as_unicode(quote(bhref))
+            if guessed in self.OEB_STYLES:
+                item.override_css_fetch = partial(
+                        self.css_import_handler, os.path.dirname(link))
+            item.data
+            self.added_resources[link] = href
+
+        nlink = self.added_resources[link]
+        if frag:
+            nlink = '#'.join((nlink, frag))
+        return nlink
+
+    def css_import_handler(self, base, href):
+        link, frag = self.link_to_local_path(href, base=base)
+        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
+            return (None, None)
+        try:
+            with open(link, 'rb') as f:
+                raw = f.read().decode('utf-8', 'replace')
+            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
+        except:
+            self.log.exception('Failed to read CSS file: %r'%link)
+            return (None, None)
+        return (None, raw)
@@ -0,0 +1,226 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
+__docformat__ = 'restructuredtext en'
+
+import os, re, shutil
+from os.path import dirname, abspath, relpath as _relpath, exists, basename
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+from calibre import CurrentDir
+from calibre.ptempfile import PersistentTemporaryDirectory
+from polyglot.builtins import unicode_type
+
+
+def relpath(*args):
+    return _relpath(*args).replace(os.sep, '/')
+
+
+class HTMLOutput(OutputFormatPlugin):
+
+    name = 'HTML Output'
+    author = 'Fabian Grassl'
+    file_type = 'zip'
+    commit_name = 'html_output'
+
+    options = {
+        OptionRecommendation(name='template_css',
+            help=_('CSS file used for the output instead of the default file')),
+
+        OptionRecommendation(name='template_html_index',
+            help=_('Template used for generation of the HTML index file instead of the default file')),
+
+        OptionRecommendation(name='template_html',
+            help=_('Template used for the generation of the HTML contents of the book instead of the default file')),
+
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated ZIP file to the '
+                'specified directory. WARNING: The contents of the directory '
+                'will be deleted.')
+        ),
+    }
+
+    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
+
+    def generate_toc(self, oeb_book, ref_url, output_dir):
+        '''
+        Generate table of contents
+        '''
+        from lxml import etree
+        from polyglot.urllib import unquote
+
+        from calibre.ebooks.oeb.base import element
+        from calibre.utils.cleantext import clean_xml_chars
+        with CurrentDir(output_dir):
+            def build_node(current_node, parent=None):
+                if parent is None:
+                    parent = etree.Element('ul')
+                elif len(current_node.nodes):
+                    parent = element(parent, ('ul'))
+                for node in current_node.nodes:
+                    point = element(parent, 'li')
+                    href = relpath(abspath(unquote(node.href)), dirname(ref_url))
+                    if isinstance(href, bytes):
+                        href = href.decode('utf-8')
+                    link = element(point, 'a', href=clean_xml_chars(href))
+                    title = node.title
+                    if isinstance(title, bytes):
+                        title = title.decode('utf-8')
+                    if title:
+                        title = re.sub(r'\s+', ' ', title)
+                    link.text = clean_xml_chars(title)
+                    build_node(node, point)
+                return parent
+            wrap = etree.Element('div')
+            wrap.append(build_node(oeb_book.toc))
+            return wrap
+
+    def generate_html_toc(self, oeb_book, ref_url, output_dir):
+        from lxml import etree
+
+        root = self.generate_toc(oeb_book, ref_url, output_dir)
+        return etree.tostring(root, pretty_print=True, encoding='unicode',
+                xml_declaration=False)
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from lxml import etree
+        from calibre.utils import zipfile
+        from templite import Templite
+        from polyglot.urllib import unquote
+        from calibre.ebooks.html.meta import EasyMeta
+
+        # read template files
+        if opts.template_html_index is not None:
+            with open(opts.template_html_index, 'rb') as f:
+                template_html_index_data = f.read()
+        else:
+            template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
+
+        if opts.template_html is not None:
+            with open(opts.template_html, 'rb') as f:
+                template_html_data = f.read()
+        else:
+            template_html_data = P('templates/html_export_default.tmpl', data=True)
+
+        if opts.template_css is not None:
+            with open(opts.template_css, 'rb') as f:
+                template_css_data = f.read()
+        else:
+            template_css_data = P('templates/html_export_default.css', data=True)
+
+        template_html_index_data = template_html_index_data.decode('utf-8')
+        template_html_data = template_html_data.decode('utf-8')
+        template_css_data = template_css_data.decode('utf-8')
+
+        self.log  = log
+        self.opts = opts
+        meta = EasyMeta(oeb_book.metadata)
+
+        tempdir = os.path.realpath(PersistentTemporaryDirectory())
+        output_file = os.path.join(tempdir,
+                basename(re.sub(r'\.zip', '', output_path)+'.html'))
+        output_dir = re.sub(r'\.html', '', output_file)+'_files'
+
+        if not exists(output_dir):
+            os.makedirs(output_dir)
+
+        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
+        with open(css_path, 'wb') as f:
+            f.write(template_css_data.encode('utf-8'))
+
+        with open(output_file, 'wb') as f:
+            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
+            templite = Templite(template_html_index_data)
+            nextLink = oeb_book.spine[0].href
+            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
+            cssLink = relpath(abspath(css_path), dirname(output_file))
+            tocUrl = relpath(output_file, dirname(output_file))
+            t = templite.render(has_toc=bool(oeb_book.toc.count()),
+                    toc=html_toc, meta=meta, nextLink=nextLink,
+                    tocUrl=tocUrl, cssLink=cssLink,
+                    firstContentPageLink=nextLink)
+            if isinstance(t, unicode_type):
+                t = t.encode('utf-8')
+            f.write(t)
+
+        with CurrentDir(output_dir):
+            for item in oeb_book.manifest:
+                path = abspath(unquote(item.href))
+                dir = dirname(path)
+                if not exists(dir):
+                    os.makedirs(dir)
+                if item.spine_position is not None:
+                    with open(path, 'wb') as f:
+                        pass
+                else:
+                    with open(path, 'wb') as f:
+                        f.write(item.bytes_representation)
+                    item.unload_data_from_memory(memory=path)
+
+            for item in oeb_book.spine:
+                path = abspath(unquote(item.href))
+                dir = dirname(path)
+                root = item.data.getroottree()
+
+                # get & clean HTML <HEAD>-data
+                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
+                head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
+                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
+                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
+                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
+
+                # get & clean HTML <BODY>-data
+                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
+                ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
+                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
+                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
+
+                # generate link to next page
+                if item.spine_position+1 < len(oeb_book.spine):
+                    nextLink = oeb_book.spine[item.spine_position+1].href
+                    nextLink = relpath(abspath(nextLink), dir)
+                else:
+                    nextLink = None
+
+                # generate link to previous page
+                if item.spine_position > 0:
+                    prevLink = oeb_book.spine[item.spine_position-1].href
+                    prevLink = relpath(abspath(prevLink), dir)
+                else:
+                    prevLink = None
+
+                cssLink = relpath(abspath(css_path), dir)
+                tocUrl = relpath(output_file, dir)
+                firstContentPageLink = oeb_book.spine[0].href
+
+                # render template
+                templite = Templite(template_html_data)
+                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
+                t = templite.render(ebookContent=ebook_content,
+                        prevLink=prevLink, nextLink=nextLink,
+                        has_toc=bool(oeb_book.toc.count()), toc=toc,
+                        tocUrl=tocUrl, head_content=head_content,
+                        meta=meta, cssLink=cssLink,
+                        firstContentPageLink=firstContentPageLink)
+
+                # write html to file
+                with open(path, 'wb') as f:
+                    f.write(t.encode('utf-8'))
+                item.unload_data_from_memory(memory=path)
+
+        zfile = zipfile.ZipFile(output_path, "w")
+        zfile.add_dir(output_dir, basename(output_dir))
+        zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)
+
+        if opts.extract_to:
+            if os.path.exists(opts.extract_to):
+                shutil.rmtree(opts.extract_to)
+            os.makedirs(opts.extract_to)
+            zfile.extractall(opts.extract_to)
+            self.log('Zip file extracted to', opts.extract_to)
+
+        zfile.close()
+
+        # cleanup temp dir
+        shutil.rmtree(tempdir)
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre import guess_type
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class HTMLZInput(InputFormatPlugin):
+
+    name        = 'HTLZ Input'
+    author      = 'John Schember'
+    description = 'Convert HTML files to HTML'
+    file_types  = {'htmlz'}
+    commit_name = 'htmlz_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.chardet import xml_to_unicode
+        from calibre.ebooks.metadata.opf2 import OPF
+        from calibre.utils.zipfile import ZipFile
+
+        self.log = log
+        html = u''
+        top_levels = []
+
+        # Extract content from zip archive.
+        zf = ZipFile(stream)
+        zf.extractall()
+
+        # Find the HTML file in the archive. It needs to be
+        # top level.
+        index = u''
+        multiple_html = False
+        # Get a list of all top level files in the archive.
+        for x in os.listdir(u'.'):
+            if os.path.isfile(x):
+                top_levels.append(x)
+        # Try to find an index. file.
+        for x in top_levels:
+            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
+                index = x
+                break
+        # Look for multiple HTML files in the archive. We look at the
+        # top level files only as only they matter in HTMLZ.
+        for x in top_levels:
+            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
+                # Set index to the first HTML file found if it's not
+                # called index.
+                if not index:
+                    index = x
+                else:
+                    multiple_html = True
+        # Warn the user if there multiple HTML file in the archive. HTMLZ
+        # supports a single HTML file. A conversion with a multiple HTML file
+        # HTMLZ archive probably won't turn out as the user expects. With
+        # Multiple HTML files ZIP input should be used in place of HTMLZ.
+        if multiple_html:
+            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
+
+        if index:
+            with open(index, 'rb') as tf:
+                html = tf.read()
+        else:
+            raise Exception(_('No top level HTML file found.'))
+
+        if not html:
+            raise Exception(_('Top level HTML file %s is empty') % index)
+
+        # Encoding
+        if options.input_encoding:
+            ienc = options.input_encoding
+        else:
+            ienc = xml_to_unicode(html[:4096])[-1]
+        html = html.decode(ienc, 'replace')
+
+        # Run the HTML through the html processing plugin.
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(options, opt.option.name, opt.recommended_value)
+        options.input_encoding = 'utf-8'
+        base = getcwd()
+        htmlfile = os.path.join(base, u'index.html')
+        c = 0
+        while os.path.exists(htmlfile):
+            c += 1
+            htmlfile = u'index%d.html'%c
+        with open(htmlfile, 'wb') as f:
+            f.write(html.encode('utf-8'))
+        odi = options.debug_pipeline
+        options.debug_pipeline = None
+        # Generate oeb from html conversion.
+        with open(htmlfile, 'rb') as f:
+            oeb = html_input.convert(f, options, 'html', log,
+                {})
+        options.debug_pipeline = odi
+        os.remove(htmlfile)
+
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+
+        # Get the cover path from the OPF.
+        cover_path = None
+        opf = None
+        for x in top_levels:
+            if os.path.splitext(x)[1].lower() == u'.opf':
+                opf = x
+                break
+        if opf:
+            opf = OPF(opf, basedir=getcwd())
+            cover_path = opf.raster_cover or opf.cover
+        # Set the cover.
+        if cover_path:
+            cdata = None
+            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
+                cdata = cf.read()
+            cover_name = os.path.basename(cover_path)
+            id, href = oeb.manifest.generate('cover', cover_name)
+            oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
+            oeb.guide.add('cover', 'Cover', href)
+
+        return oeb
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import io
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import unicode_type
+
+
+class HTMLZOutput(OutputFormatPlugin):
+
+    name = 'HTMLZ Output'
+    author = 'John Schember'
+    file_type = 'htmlz'
+    commit_name = 'htmlz_output'
+    ui_data = {
+            'css_choices': {
+                'class': _('Use CSS classes'),
+                'inline': _('Use the style attribute'),
+                'tag': _('Use HTML tags wherever possible')
+            },
+            'sheet_choices': {
+                'external': _('Use an external CSS file'),
+                'inline': _('Use a <style> tag in the HTML file')
+            }
+    }
+
+    options = {
+        OptionRecommendation(name='htmlz_css_type', recommended_value='class',
+            level=OptionRecommendation.LOW,
+            choices=list(ui_data['css_choices']),
+            help=_('Specify the handling of CSS. Default is class.\n'
+                   'class: {class}\n'
+                   'inline: {inline}\n'
+                   'tag: {tag}'
+            ).format(**ui_data['css_choices'])),
+        OptionRecommendation(name='htmlz_class_style', recommended_value='external',
+            level=OptionRecommendation.LOW,
+            choices=list(ui_data['sheet_choices']),
+            help=_('How to handle the CSS when using css-type = \'class\'.\n'
+                   'Default is external.\n'
+                   'external: {external}\n'
+                   'inline: {inline}'
+            ).format(**ui_data['sheet_choices'])),
+        OptionRecommendation(name='htmlz_title_filename',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('If set this option causes the file name of the HTML file'
+                ' inside the HTMLZ archive to be based on the book title.')
+            ),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from lxml import etree
+        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
+        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
+        from calibre.utils.zipfile import ZipFile
+        from calibre.utils.filenames import ascii_filename
+
+        # HTML
+        if opts.htmlz_css_type == 'inline':
+            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
+            OEB2HTMLizer = OEB2HTMLInlineCSSizer
+        elif opts.htmlz_css_type == 'tag':
+            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
+            OEB2HTMLizer = OEB2HTMLNoCSSizer
+        else:
+            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer
+
+        with TemporaryDirectory(u'_htmlz_output') as tdir:
+            htmlizer = OEB2HTMLizer(log)
+            html = htmlizer.oeb2html(oeb_book, opts)
+
+            fname = u'index'
+            if opts.htmlz_title_filename:
+                from calibre.utils.filenames import shorten_components_to
+                fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
+            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
+                if isinstance(html, unicode_type):
+                    html = html.encode('utf-8')
+                tf.write(html)
+
+            # CSS
+            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
+                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
+                    tf.write(htmlizer.get_css(oeb_book))
+
+            # Images
+            images = htmlizer.images
+            if images:
+                if not os.path.exists(os.path.join(tdir, u'images')):
+                    os.makedirs(os.path.join(tdir, u'images'))
+                for item in oeb_book.manifest:
+                    if item.media_type in OEB_IMAGES and item.href in images:
+                        if item.media_type == SVG_MIME:
+                            data = etree.tostring(item.data, encoding='unicode')
+                        else:
+                            data = item.data
+                        fname = os.path.join(tdir, u'images', images[item.href])
+                        with open(fname, 'wb') as img:
+                            img.write(data)
+
+            # Cover
+            cover_path = None
+            try:
+                cover_data = None
+                if oeb_book.metadata.cover:
+                    term = oeb_book.metadata.cover[0].term
+                    cover_data = oeb_book.guide[term].item.data
+                if cover_data:
+                    from calibre.utils.img import save_cover_data_to
+                    cover_path = os.path.join(tdir, u'cover.jpg')
+                    with lopen(cover_path, 'w') as cf:
+                        cf.write('')
+                    save_cover_data_to(cover_data, cover_path)
+            except:
+                import traceback
+                traceback.print_exc()
+
+            # Metadata
+            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
+                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
+                mi = opf.to_book_metadata()
+                if cover_path:
+                    mi.cover = u'cover.jpg'
+                mdataf.write(metadata_to_opf(mi))
+
+            htmlz = ZipFile(output_path, 'w')
+            htmlz.add_dir(tdir)
@@ -0,0 +1,64 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class LITInput(InputFormatPlugin):
+
+    name        = 'LIT Input'
+    author      = 'Marshall T. Vandegrift'
+    description = 'Convert LIT files to HTML'
+    file_types  = {'lit'}
+    commit_name = 'lit_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.lit.reader import LitReader
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        self.log = log
+        return create_oebbook(log, stream, options, reader=LitReader)
+
+    def postprocess_book(self, oeb, opts, log):
+        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
+        for item in oeb.spine:
+            root = item.data
+            if not hasattr(root, 'xpath'):
+                continue
+            for bad in ('metadata', 'guide'):
+                metadata = XPath('//h:'+bad)(root)
+                if metadata:
+                    for x in metadata:
+                        x.getparent().remove(x)
+            body = XPath('//h:body')(root)
+            if body:
+                body = body[0]
+                if len(body) == 1 and body[0].tag == XHTML('pre'):
+                    pre = body[0]
+                    from calibre.ebooks.txt.processor import convert_basic, \
+                        separate_paragraphs_single_line
+                    from calibre.ebooks.chardet import xml_to_unicode
+                    from calibre.utils.xml_parse import safe_xml_fromstring
+                    import copy
+                    self.log('LIT file with all text in singe <pre> tag detected')
+                    html = separate_paragraphs_single_line(pre.text)
+                    html = convert_basic(html).replace('<html>',
+                            '<html xmlns="%s">'%XHTML_NS)
+                    html = xml_to_unicode(html, strip_encoding_pats=True,
+                            resolve_entities=True)[0]
+                    if opts.smarten_punctuation:
+                        # SmartyPants skips text inside <pre> tags
+                        from calibre.ebooks.conversion.preprocess import smarten_punctuation
+                        html = smarten_punctuation(html, self.log)
+                    root = safe_xml_fromstring(html)
+                    body = XPath('//h:body')(root)
+                    pre.tag = XHTML('div')
+                    pre.text = ''
+                    for elem in body:
+                        ne = copy.deepcopy(elem)
+                        pre.append(ne)
@@ -0,0 +1,38 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import OutputFormatPlugin
+
+
+class LITOutput(OutputFormatPlugin):
+
+    name = 'LIT Output'
+    author = 'Marshall T. Vandegrift'
+    file_type = 'lit'
+    commit_name = 'lit_output'
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        self.log, self.opts, self.oeb = log, opts, oeb
+        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
+        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
+        from calibre.ebooks.lit.writer import LitWriter
+        from calibre.ebooks.oeb.transforms.split import Split
+        split = Split(split_on_page_breaks=True, max_flow_size=0,
+                remove_css_pagebreaks=False)
+        split(self.oeb, self.opts)
+
+        tocadder = HTMLTOCAdder()
+        tocadder(oeb, opts)
+        mangler = CaseMangler()
+        mangler(oeb, opts)
+        rasterizer = SVGRasterizer()
+        rasterizer(oeb, opts)
+        lit = LitWriter(self.opts)
+        lit(oeb, output_path)
@@ -0,0 +1,82 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, sys
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class LRFInput(InputFormatPlugin):
+
+    name        = 'LRF Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert LRF files to HTML'
+    file_types  = {'lrf'}
+    commit_name = 'lrf_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
+                Canvas, ImageBlock, RuledLine)
+        self.log = log
+        self.log('Generating XML')
+        from calibre.ebooks.lrf.lrfparser import LRFDocument
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from lxml import etree
+        d = LRFDocument(stream)
+        d.parse()
+        xml = d.to_xml(write_files=True)
+        if options.verbose > 2:
+            open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
+        doc = safe_xml_fromstring(xml)
+
+        char_button_map = {}
+        for x in doc.xpath('//CharButton[@refobj]'):
+            ro = x.get('refobj')
+            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
+            if jump_button:
+                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
+                if jump_to:
+                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
+                            jump_to[0].get('refobj'))
+        plot_map = {}
+        for x in doc.xpath('//Plot[@refobj]'):
+            ro = x.get('refobj')
+            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
+            if image:
+                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
+                    image[0].get('refstream'))
+                if imgstr:
+                    plot_map[ro] = imgstr[0].get('file')
+
+        self.log('Converting XML to HTML...')
+        styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
+        media_type = MediaType()
+        styles = Styles()
+        text_block = TextBlock(styles, char_button_map, plot_map, log)
+        canvas = Canvas(doc, styles, text_block, log)
+        image_block = ImageBlock(canvas)
+        ruled_line = RuledLine()
+        extensions = {
+                ('calibre', 'media-type') : media_type,
+                ('calibre', 'text-block') : text_block,
+                ('calibre', 'ruled-line') : ruled_line,
+                ('calibre', 'styles')     : styles,
+                ('calibre', 'canvas')     : canvas,
+                ('calibre', 'image-block'): image_block,
+                }
+        transform = etree.XSLT(styledoc, extensions=extensions)
+        try:
+            result = transform(doc)
+        except RuntimeError:
+            sys.setrecursionlimit(5000)
+            result = transform(doc)
+
+        with open('content.opf', 'wb') as f:
+            f.write(result)
+        styles.write()
+        return os.path.abspath('content.opf')
@@ -0,0 +1,196 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import sys, os
+
+from calibre.customize.conversion import OutputFormatPlugin
+from calibre.customize.conversion import OptionRecommendation
+from polyglot.builtins import unicode_type
+
+
+class LRFOptions(object):
+
+    def __init__(self, output, opts, oeb):
+        def f2s(f):
+            try:
+                return unicode_type(f[0])
+            except:
+                return ''
+        m = oeb.metadata
+        for x in ('left', 'top', 'right', 'bottom'):
+            attr = 'margin_'+x
+            val = getattr(opts, attr)
+            if val < 0:
+                setattr(opts, attr, 0)
+        self.title = None
+        self.author = self.publisher = _('Unknown')
+        self.title_sort = self.author_sort = ''
+        for x in m.creator:
+            if x.role == 'aut':
+                self.author = unicode_type(x)
+                fa = unicode_type(getattr(x, 'file_as', ''))
+                if fa:
+                    self.author_sort = fa
+        for x in m.title:
+            if unicode_type(x.file_as):
+                self.title_sort = unicode_type(x.file_as)
+        self.freetext = f2s(m.description)
+        self.category = f2s(m.subject)
+        self.cover = None
+        self.use_metadata_cover = True
+        self.output = output
+        self.ignore_tables = opts.linearize_tables
+        if opts.disable_font_rescaling:
+            self.base_font_size = 0
+        else:
+            self.base_font_size = opts.base_font_size
+        self.blank_after_para = opts.insert_blank_line
+        self.use_spine = True
+        self.font_delta = 0
+        self.ignore_colors = False
+        from calibre.ebooks.lrf import PRS500_PROFILE
+        self.profile = PRS500_PROFILE
+        self.link_levels = sys.maxsize
+        self.link_exclude = '@'
+        self.no_links_in_toc = True
+        self.disable_chapter_detection = True
+        self.chapter_regex = 'dsadcdswcdec'
+        self.chapter_attr = '$,,$'
+        self.override_css = self._override_css = ''
+        self.page_break = 'h[12]'
+        self.force_page_break = '$'
+        self.force_page_break_attr = '$'
+        self.add_chapters_to_toc = False
+        self.baen = self.pdftohtml = self.book_designer = False
+        self.verbose = opts.verbose
+        self.encoding = 'utf-8'
+        self.lrs = False
+        self.minimize_memory_usage = False
+        self.autorotation = opts.enable_autorotation
+        self.header_separation = (self.profile.dpi/72.) * opts.header_separation
+        self.headerformat = opts.header_format
+
+        for x in ('top', 'bottom', 'left', 'right'):
+            setattr(self, x+'_margin',
+                (self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))
+
+        for x in ('wordspace', 'header', 'header_format',
+                'minimum_indent', 'serif_family',
+                'render_tables_as_images', 'sans_family', 'mono_family',
+                'text_size_multiplier_for_rendered_tables'):
+            setattr(self, x, getattr(opts, x))
+
+
+class LRFOutput(OutputFormatPlugin):
+
+    name = 'LRF Output'
+    author = 'Kovid Goyal'
+    file_type = 'lrf'
+    commit_name = 'lrf_output'
+
+    options = {
+        OptionRecommendation(name='enable_autorotation', recommended_value=False,
+            help=_('Enable auto-rotation of images that are wider than the screen width.')
+        ),
+        OptionRecommendation(name='wordspace',
+            recommended_value=2.5, level=OptionRecommendation.LOW,
+            help=_('Set the space between words in pts. Default is %default')
+        ),
+        OptionRecommendation(name='header', recommended_value=False,
+            help=_('Add a header to all the pages with title and author.')
+        ),
+        OptionRecommendation(name='header_format', recommended_value="%t by %a",
+            help=_('Set the format of the header. %a is replaced by the author '
+            'and %t by the title. Default is %default')
+        ),
+        OptionRecommendation(name='header_separation', recommended_value=0,
+            help=_('Add extra spacing below the header. Default is %default pt.')
+        ),
+        OptionRecommendation(name='minimum_indent', recommended_value=0,
+            help=_('Minimum paragraph indent (the indent of the first line '
+            'of a paragraph) in pts. Default: %default')
+        ),
+        OptionRecommendation(name='render_tables_as_images',
+            recommended_value=False,
+            help=_('This option has no effect')
+        ),
+        OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
+            recommended_value=1.0,
+            help=_('Multiply the size of text in rendered tables by this '
+            'factor. Default is %default')
+        ),
+        OptionRecommendation(name='serif_family', recommended_value=None,
+            help=_('The serif family of fonts to embed')
+        ),
+        OptionRecommendation(name='sans_family', recommended_value=None,
+            help=_('The sans-serif family of fonts to embed')
+        ),
+        OptionRecommendation(name='mono_family', recommended_value=None,
+            help=_('The monospace family of fonts to embed')
+        ),
+
+    }
+
+    recommendations = {
+        ('change_justification', 'original', OptionRecommendation.HIGH)}
+
+    def convert_images(self, pages, opts, wide):
+        from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
+        from uuid import uuid4
+        from calibre.constants import __appname__, __version__
+
+        width, height = (784, 1012) if wide else (584, 754)
+
+        ps = {}
+        ps['topmargin']      = 0
+        ps['evensidemargin'] = 0
+        ps['oddsidemargin']  = 0
+        ps['textwidth']      = width
+        ps['textheight']     = height
+        book = Book(title=opts.title, author=opts.author,
+                bookid=uuid4().hex,
+                publisher='%s %s'%(__appname__, __version__),
+                category=_('Comic'), pagestyledefault=ps,
+                booksetting=BookSetting(screenwidth=width, screenheight=height))
+        for page in pages:
+            imageStream = ImageStream(page)
+            _page = book.create_page()
+            _page.append(ImageBlock(refstream=imageStream,
+                        blockwidth=width, blockheight=height, xsize=width,
+                        ysize=height, x1=width, y1=height))
+            book.append(_page)
+
+        book.renderLrf(open(opts.output, 'wb'))
+
+    def flatten_toc(self):
+        from calibre.ebooks.oeb.base import TOC
+        nroot = TOC()
+        for x in self.oeb.toc.iterdescendants():
+            nroot.add(x.title, x.href)
+        self.oeb.toc = nroot
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        self.log, self.opts, self.oeb = log, opts, oeb
+
+        lrf_opts = LRFOptions(output_path, opts, oeb)
+
+        if input_plugin.is_image_collection:
+            self.convert_images(input_plugin.get_images(), lrf_opts,
+                    getattr(opts, 'wide', False))
+            return
+
+        self.flatten_toc()
+
+        from calibre.ptempfile import TemporaryDirectory
+        with TemporaryDirectory('_lrf_output') as tdir:
+            from calibre.customize.ui import plugin_for_output_format
+            oeb_output = plugin_for_output_format('oeb')
+            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
+            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
+            from calibre.ebooks.lrf.html.convert_from import process_file
+            process_file(os.path.join(tdir, opf), lrf_opts, self.log)
@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import unicode_type
+
+
+class MOBIInput(InputFormatPlugin):
+
+    name        = 'MOBI Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
+    file_types  = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
+    commit_name = 'mobi_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        self.is_kf8 = False
+        self.mobi_is_joint = False
+
+        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
+        from lxml import html
+        parse_cache = {}
+        try:
+            mr = MobiReader(stream, log, options.input_encoding,
+                        options.debug_pipeline)
+            if mr.kf8_type is None:
+                mr.extract_content('.', parse_cache)
+
+        except:
+            mr = MobiReader(stream, log, options.input_encoding,
+                        options.debug_pipeline, try_extra_data_fix=True)
+            if mr.kf8_type is None:
+                mr.extract_content('.', parse_cache)
+
+        if mr.kf8_type is not None:
+            log('Found KF8 MOBI of type %r'%mr.kf8_type)
+            if mr.kf8_type == 'joint':
+                self.mobi_is_joint = True
+            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
+            mr = Mobi8Reader(mr, log)
+            opf = os.path.abspath(mr())
+            self.encrypted_fonts = mr.encrypted_fonts
+            self.is_kf8 = True
+            return opf
+
+        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
+        if raw:
+            if isinstance(raw, unicode_type):
+                raw = raw.encode('utf-8')
+            with lopen('debug-raw.html', 'wb') as f:
+                f.write(raw)
+        from calibre.ebooks.oeb.base import close_self_closing_tags
+        for f, root in parse_cache.items():
+            raw = html.tostring(root, encoding='utf-8', method='xml',
+                    include_meta_content_type=False)
+            raw = close_self_closing_tags(raw)
+            with lopen(f, 'wb') as q:
+                q.write(raw)
+        accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
+        return mr.created_opf_path
@@ -0,0 +1,337 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from polyglot.builtins import unicode_type
+
+
+def remove_html_cover(oeb, log):
+    from calibre.ebooks.oeb.base import OEB_DOCS
+
+    if not oeb.metadata.cover \
+        or 'cover' not in oeb.guide:
+        return
+    href = oeb.guide['cover'].href
+    del oeb.guide['cover']
+    item = oeb.manifest.hrefs[href]
+    if item.spine_position is not None:
+        log.warn('Found an HTML cover: ', item.href, 'removing it.',
+                'If you find some content missing from the output MOBI, it '
+                'is because you misidentified the HTML cover in the input '
+                'document')
+        oeb.spine.remove(item)
+        if item.media_type in OEB_DOCS:
+            oeb.manifest.remove(item)
+
+
+def extract_mobi(output_path, opts):
+    if opts.extract_to is not None:
+        from calibre.ebooks.mobi.debug.main import inspect_mobi
+        ddir = opts.extract_to
+        inspect_mobi(output_path, ddir=ddir)
+
+
+class MOBIOutput(OutputFormatPlugin):
+
+    name = 'MOBI Output'
+    author = 'Kovid Goyal'
+    file_type = 'mobi'
+    commit_name = 'mobi_output'
+    ui_data = {'file_types': ['old', 'both', 'new']}
+
+    options = {
+        OptionRecommendation(name='prefer_author_sort',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('When present, use author sort field as author.')
+        ),
+        OptionRecommendation(name='no_inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Don\'t add Table of Contents to the book. Useful if '
+                'the book has its own table of contents.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+        OptionRecommendation(name='dont_compress',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Disable compression of the file contents.')
+        ),
+        OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
+            help=_('Tag for MOBI files to be marked as personal documents.'
+                   ' This option has no effect on the conversion. It is used'
+                   ' only when sending MOBI files to a device. If the file'
+                   ' being sent has the specified tag, it will be marked as'
+                   ' a personal document when sent to the Kindle.')
+        ),
+        OptionRecommendation(name='mobi_ignore_margins',
+            recommended_value=False,
+            help=_('Ignore margins in the input document. If False, then '
+                'the MOBI output plugin will try to convert margins specified'
+                ' in the input document, otherwise it will ignore them.')
+        ),
+        OptionRecommendation(name='mobi_toc_at_start',
+            recommended_value=False,
+            help=_('When adding the Table of Contents to the book, add it at the start of the '
+                'book instead of the end. Not recommended.')
+        ),
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'MOBI'
+        ),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.')
+        ),
+        OptionRecommendation(name='mobi_keep_original_images',
+            recommended_value=False,
+            help=_('By default calibre converts all images to JPEG format '
+                'in the output MOBI file. This is for maximum compatibility '
+                'as some older MOBI viewers have problems with other image '
+                'formats. This option tells calibre not to do this. '
+                'Useful if your document contains lots of GIF/PNG images that '
+                'become very large when converted to JPEG.')),
+        OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
+            help=_('By default calibre generates MOBI files that contain the '
+                'old MOBI 6 format. This format is compatible with all '
+                'devices. However, by changing this setting, you can tell '
+                'calibre to generate MOBI files that contain both MOBI 6 and '
+                'the new KF8 format, or only the new KF8 format. KF8 has '
+                'more features than MOBI 6, but only works with newer Kindles. '
+                'Allowed values: {}').format('old, both, new')),
+
+    }
+
+    def check_for_periodical(self):
+        if self.is_periodical:
+            self.periodicalize_toc()
+            self.check_for_masthead()
+            self.opts.mobi_periodical = True
+        else:
+            self.opts.mobi_periodical = False
+
+    def check_for_masthead(self):
+        found = 'masthead' in self.oeb.guide
+        if not found:
+            from calibre.ebooks import generate_masthead
+            self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
+            raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
+            id, href = self.oeb.manifest.generate('masthead', 'masthead')
+            self.oeb.manifest.add(id, href, 'image/gif', data=raw)
+            self.oeb.guide.add('masthead', 'Masthead Image', href)
+        else:
+            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
+
+    def periodicalize_toc(self):
+        from calibre.ebooks.oeb.base import TOC
+        toc = self.oeb.toc
+        if not toc or len(self.oeb.spine) < 3:
+            return
+        if toc and toc[0].klass != 'periodical':
+            one, two = self.oeb.spine[0], self.oeb.spine[1]
+            self.log('Converting TOC for MOBI periodical indexing...')
+
+            articles = {}
+            if toc.depth() < 3:
+                # single section periodical
+                self.oeb.manifest.remove(one)
+                self.oeb.manifest.remove(two)
+                sections = [TOC(klass='section', title=_('All articles'),
+                    href=self.oeb.spine[0].href)]
+                for x in toc:
+                    sections[0].nodes.append(x)
+            else:
+                # multi-section periodical
+                self.oeb.manifest.remove(one)
+                sections = list(toc)
+                for i,x in enumerate(sections):
+                    x.klass = 'section'
+                    articles_ = list(x)
+                    if articles_:
+                        self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
+                        x.href = articles_[0].href
+
+            for sec in sections:
+                articles[id(sec)] = []
+                for a in list(sec):
+                    a.klass = 'article'
+                    articles[id(sec)].append(a)
+                    sec.nodes.remove(a)
+
+            root = TOC(klass='periodical', href=self.oeb.spine[0].href,
+                    title=unicode_type(self.oeb.metadata.title[0]))
+
+            for s in sections:
+                if articles[id(s)]:
+                    for a in articles[id(s)]:
+                        s.nodes.append(a)
+                    root.nodes.append(s)
+
+            for x in list(toc.nodes):
+                toc.nodes.remove(x)
+
+            toc.nodes.append(root)
+
+            # Fix up the periodical href to point to first section href
+            toc.nodes[0].href = toc.nodes[0].nodes[0].href
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        from calibre.ebooks.mobi.writer2.resources import Resources
+        self.log, self.opts, self.oeb = log, opts, oeb
+
+        mobi_type = opts.mobi_file_type
+        if self.is_periodical:
+            mobi_type = 'old'  # Amazon does not support KF8 periodicals
+        create_kf8 = mobi_type in ('new', 'both')
+
+        remove_html_cover(self.oeb, self.log)
+        resources = Resources(oeb, opts, self.is_periodical,
+                add_fonts=create_kf8)
+        self.check_for_periodical()
+
+        if create_kf8:
+            from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
+            remove_duplicate_anchors(self.oeb)
+            # Split on pagebreaks so that the resulting KF8 is faster to load
+            from calibre.ebooks.oeb.transforms.split import Split
+            Split()(self.oeb, self.opts)
+
+        kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
+                ) if create_kf8 else None
+        if mobi_type == 'new':
+            kf8.write(output_path)
+            extract_mobi(output_path, opts)
+            return
+
+        self.log('Creating MOBI 6 output')
+        self.write_mobi(input_plugin, output_path, kf8, resources)
+
+    def create_kf8(self, resources, for_joint=False):
+        from calibre.ebooks.mobi.writer8.main import create_kf8_book
+        return create_kf8_book(self.oeb, self.opts, resources,
+                for_joint=for_joint)
+
+    def write_mobi(self, input_plugin, output_path, kf8, resources):
+        from calibre.ebooks.mobi.mobiml import MobiMLizer
+        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
+        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
+        from calibre.customize.ui import plugin_for_input_format
+
+        opts, oeb = self.opts, self.oeb
+        if not opts.no_inline_toc:
+            tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
+                    opts.mobi_toc_at_start else 'end')
+            tocadder(oeb, opts)
+        mangler = CaseMangler()
+        mangler(oeb, opts)
+        try:
+            rasterizer = SVGRasterizer()
+            rasterizer(oeb, opts)
+        except Unavailable:
+            self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
+        else:
+            # Add rasterized SVG images
+            resources.add_extra_images()
+        if hasattr(self.oeb, 'inserted_metadata_jacket'):
+            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
+        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
+        mobimlizer(oeb, opts)
+        write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
+        from calibre.ebooks.mobi.writer2.main import MobiWriter
+        writer = MobiWriter(opts, resources, kf8,
+                        write_page_breaks_after_item=write_page_breaks_after_item)
+        writer(oeb, output_path)
+        extract_mobi(output_path, opts)
+
+    def specialize_css_for_output(self, log, opts, item, stylizer):
+        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
+        CSSCleanup(log, opts)(item, stylizer)
+
+    def workaround_fire_bugs(self, jacket):
+        # The idiotic Fire crashes when trying to render the table used to
+        # layout the jacket
+        from calibre.ebooks.oeb.base import XHTML
+        for table in jacket.data.xpath('//*[local-name()="table"]'):
+            table.tag = XHTML('div')
+            for tr in table.xpath('descendant::*[local-name()="tr"]'):
+                cols = tr.xpath('descendant::*[local-name()="td"]')
+                tr.tag = XHTML('div')
+                for td in cols:
+                    td.tag = XHTML('span' if cols else 'div')
+
+
+class AZW3Output(OutputFormatPlugin):
+
+    name = 'AZW3 Output'
+    author = 'Kovid Goyal'
+    file_type = 'azw3'
+    commit_name = 'azw3_output'
+
+    options = {
+        OptionRecommendation(name='prefer_author_sort',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('When present, use author sort field as author.')
+        ),
+        OptionRecommendation(name='no_inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Don\'t add Table of Contents to the book. Useful if '
+                'the book has its own table of contents.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+        OptionRecommendation(name='dont_compress',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Disable compression of the file contents.')
+        ),
+        OptionRecommendation(name='mobi_toc_at_start',
+            recommended_value=False,
+            help=_('When adding the Table of Contents to the book, add it at the start of the '
+                'book instead of the end. Not recommended.')
+        ),
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'AZW3'),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.')
+        ),
+    }
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        from calibre.ebooks.mobi.writer2.resources import Resources
+        from calibre.ebooks.mobi.writer8.main import create_kf8_book
+        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
+
+        self.oeb, self.opts, self.log = oeb, opts, log
+        opts.mobi_periodical = self.is_periodical
+        passthrough = getattr(opts, 'mobi_passthrough', False)
+        remove_duplicate_anchors(oeb)
+
+        resources = Resources(self.oeb, self.opts, self.is_periodical,
+                add_fonts=True, process_images=False)
+        if not passthrough:
+            remove_html_cover(self.oeb, self.log)
+
+            # Split on pagebreaks so that the resulting KF8 is faster to load
+            from calibre.ebooks.oeb.transforms.split import Split
+            Split()(self.oeb, self.opts)
+
+        kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
+
+        kf8.write(output_path)
+        extract_mobi(output_path, opts)
+
+    def specialize_css_for_output(self, log, opts, item, stylizer):
+        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
+        CSSCleanup(log, opts)(item, stylizer)
@@ -0,0 +1,25 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Convert an ODT file into an Open Ebook
+'''
+
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class ODTInput(InputFormatPlugin):
+
+    name        = 'ODT Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert ODT (OpenOffice) files to HTML'
+    file_types  = {'odt'}
+    commit_name = 'odt_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.odt.input import Extract
+        return Extract()(stream, '.', log)
@@ -0,0 +1,122 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re
+
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from calibre import CurrentDir
+
+
+class OEBOutput(OutputFormatPlugin):
+
+    name = 'OEB Output'
+    author = 'Kovid Goyal'
+    file_type = 'oeb'
+    commit_name = 'oeb_output'
+
+    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from polyglot.urllib import unquote
+        from lxml import etree
+
+        self.log, self.opts = log, opts
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
+        from calibre.ebooks.oeb.normalize_css import condense_sheet
+        with CurrentDir(output_path):
+            results = oeb_book.to_opf2(page_map=True)
+            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
+                href, root = results.pop(key, [None, None])
+                if root is not None:
+                    if key == OPF_MIME:
+                        try:
+                            self.workaround_nook_cover_bug(root)
+                        except:
+                            self.log.exception('Something went wrong while trying to'
+                                    ' workaround Nook cover bug, ignoring')
+                        try:
+                            self.workaround_pocketbook_cover_bug(root)
+                        except:
+                            self.log.exception('Something went wrong while trying to'
+                                    ' workaround Pocketbook cover bug, ignoring')
+                        self.migrate_lang_code(root)
+                    raw = etree.tostring(root, pretty_print=True,
+                            encoding='utf-8', xml_declaration=True)
+                    if key == OPF_MIME:
+                        # Needed as I can't get lxml to output opf:role and
+                        # not output <opf:metadata> as well
+                        raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
+                    with lopen(href, 'wb') as f:
+                        f.write(raw)
+
+            for item in oeb_book.manifest:
+                if (
+                        not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
+                            item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
+                    condense_sheet(item.data)
+                path = os.path.abspath(unquote(item.href))
+                dir = os.path.dirname(path)
+                if not os.path.exists(dir):
+                    os.makedirs(dir)
+                with lopen(path, 'wb') as f:
+                    f.write(item.bytes_representation)
+                item.unload_data_from_memory(memory=path)
+
+    def workaround_nook_cover_bug(self, root):  # {{{
+        cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
+                ' @content != "cover"]')
+
+        def manifest_items_with_id(id_):
+            return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
+                ' and @id="%s"]'%id_)
+
+        if len(cov) == 1:
+            cov = cov[0]
+            covid = cov.get('content', '')
+
+            if covid:
+                manifest_item = manifest_items_with_id(covid)
+                if len(manifest_item) == 1 and \
+                        manifest_item[0].get('media-type',
+                                '').startswith('image/'):
+                    self.log.warn('The cover image has an id != "cover". Renaming'
+                            ' to work around bug in Nook Color')
+
+                    from calibre.ebooks.oeb.base import uuid_id
+                    newid = uuid_id()
+
+                    for item in manifest_items_with_id('cover'):
+                        item.set('id', newid)
+
+                    for x in root.xpath('//*[@idref="cover"]'):
+                        x.set('idref', newid)
+
+                    manifest_item = manifest_item[0]
+                    manifest_item.set('id', 'cover')
+                    cov.set('content', 'cover')
+    # }}}
+
+    def workaround_pocketbook_cover_bug(self, root):  # {{{
+        m = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
+                ' and @id="cover"]')
+        if len(m) == 1:
+            m = m[0]
+            p = m.getparent()
+            p.remove(m)
+            p.insert(0, m)
+    # }}}
+
+    def migrate_lang_code(self, root):  # {{{
+        from calibre.utils.localization import lang_as_iso639_1
+        for lang in root.xpath('//*[local-name() = "language"]'):
+            clc = lang_as_iso639_1(lang.text)
+            if clc:
+                lang.text = clc
+    # }}}
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class PDBInput(InputFormatPlugin):
+
+    name        = 'PDB Input'
+    author      = 'John Schember'
+    description = 'Convert PDB to HTML'
+    file_types  = {'pdb', 'updb'}
+    commit_name = 'pdb_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.pdb.header import PdbHeaderReader
+        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+
+        header = PdbHeaderReader(stream)
+        Reader = get_reader(header.ident)
+
+        if Reader is None:
+            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
+                           (header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown'))))
+
+        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))
+
+        reader = Reader(header, stream, log, options)
+        opf = reader.extract_content(getcwd())
+
+        return opf
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
+
+
+class PDBOutput(OutputFormatPlugin):
+
+    name = 'PDB Output'
+    author = 'John Schember'
+    file_type = 'pdb'
+    commit_name = 'pdb_output'
+    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}
+
+    options = {
+        OptionRecommendation(name='format', recommended_value='doc',
+            level=OptionRecommendation.LOW,
+            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
+            help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
+        OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is cp1252. Note: This option is not honored by all '
+            'formats.')),
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.')),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        Writer = get_writer(opts.format)
+
+        if Writer is None:
+            raise PDBError('No writer available for format %s.' % format)
+
+        setattr(opts, 'max_line_length', 0)
+        setattr(opts, 'force_max_line_length', False)
+
+        writer = Writer(opts, log)
+
+        out_stream.seek(0)
+        out_stream.truncate()
+
+        writer.write_content(oeb_book, out_stream, oeb_book.metadata)
+
+        if close:
+            out_stream.close()
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import as_bytes, getcwd
+
+
+class PDFInput(InputFormatPlugin):
+
+    name        = 'PDF Input'
+    author      = 'Kovid Goyal and John Schember'
+    description = 'Convert PDF files to HTML'
+    file_types  = {'pdf'}
+    commit_name = 'pdf_input'
+
+    options = {
+        OptionRecommendation(name='no_images', recommended_value=False,
+            help=_('Do not extract images from the document')),
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
+            help=_('Scale used to determine the length at which a line should '
+            'be unwrapped. Valid values are a decimal between 0 and 1. The '
+            'default is 0.45, just below the median line length.')),
+        OptionRecommendation(name='new_pdf_engine', recommended_value=False,
+            help=_('Use the new PDF conversion engine. Currently not operational.'))
+    }
+
+    def convert_new(self, stream, accelerators):
+        from calibre.ebooks.pdf.pdftohtml import pdftohtml
+        from calibre.utils.cleantext import clean_ascii_chars
+        from calibre.ebooks.pdf.reflow import PDFDocument
+
+        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
+        with lopen('index.xml', 'rb') as f:
+            xml = clean_ascii_chars(f.read())
+        PDFDocument(xml, self.opts, self.log)
+        return os.path.join(getcwd(), 'metadata.opf')
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.pdf.pdftohtml import pdftohtml
+
+        log.debug('Converting file to html...')
+        # The main html file will be named index.html
+        self.opts, self.log = options, log
+        if options.new_pdf_engine:
+            return self.convert_new(stream, accelerators)
+        pdftohtml(getcwd(), stream.name, options.no_images)
+
+        from calibre.ebooks.metadata.meta import get_metadata
+        log.debug('Retrieving document metadata...')
+        mi = get_metadata(stream, 'pdf')
+        opf = OPFCreator(getcwd(), mi)
+
+        manifest = [('index.html', None)]
+
+        images = os.listdir(getcwd())
+        images.remove('index.html')
+        for i in images:
+            manifest.append((i, None))
+        log.debug('Generating manifest...')
+        opf.create_manifest(manifest)
+
+        opf.create_spine(['index.html'])
+        log.debug('Rendering manifest...')
+        with lopen('metadata.opf', 'wb') as opffile:
+            opf.render(opffile)
+        if os.path.exists('toc.ncx'):
+            ncxid = opf.manifest.id_for_path('toc.ncx')
+            if ncxid:
+                with lopen('metadata.opf', 'r+b') as f:
+                    raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
+                    f.seek(0)
+                    f.write(raw)
+
+        return os.path.join(getcwd(), 'metadata.opf')
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Convert OEB ebook format to PDF.
+'''
+
+import glob, os
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+    OptionRecommendation)
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import iteritems, unicode_type
+
+UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
+        'cicero', 'devicepixel')
+
+PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
+        'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
+
+
+class PDFOutput(OutputFormatPlugin):
+
+    name = 'PDF Output'
+    author = 'Kovid Goyal'
+    file_type = 'pdf'
+    commit_name = 'pdf_output'
+    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}
+
+    options = {
+        OptionRecommendation(name='use_profile_size', recommended_value=False,
+            help=_('Instead of using the paper size specified in the PDF Output options,'
+                   ' use a paper size corresponding to the current output profile.'
+                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
+        OptionRecommendation(name='unit', recommended_value='inch',
+            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
+            help=_('The unit of measure for page sizes. Default is inch. Choices '
+            'are {} '
+            'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
+        OptionRecommendation(name='paper_size', recommended_value='letter',
+            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
+            help=_('The size of the paper. This size will be overridden when a '
+            'non default output profile is used. Default is letter. Choices '
+            'are {}').format(', '.join(PAPER_SIZES))),
+        OptionRecommendation(name='custom_size', recommended_value=None,
+            help=_('Custom size of the document. Use the form widthxheight '
+            'e.g. `123x321` to specify the width and height. '
+            'This overrides any specified paper-size.')),
+        OptionRecommendation(name='preserve_cover_aspect_ratio',
+            recommended_value=False,
+            help=_('Preserve the aspect ratio of the cover, instead'
+                ' of stretching it to fill the full first page of the'
+                ' generated pdf.')),
+        OptionRecommendation(name='pdf_serif_family',
+            recommended_value='Times', help=_(
+                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
+        OptionRecommendation(name='pdf_sans_family',
+            recommended_value='Helvetica', help=_(
+                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
+        OptionRecommendation(name='pdf_mono_family',
+            recommended_value='Courier', help=_(
+                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
+        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
+            recommended_value='serif', help=_(
+                'The font family used to render monospace fonts')),
+        OptionRecommendation(name='pdf_default_font_size',
+            recommended_value=20, help=_(
+                'The default font size')),
+        OptionRecommendation(name='pdf_mono_font_size',
+            recommended_value=16, help=_(
+                'The default font size for monospaced text')),
+        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
+            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
+        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
+            help=_('Surround all links with a red box, useful for debugging.')),
+        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
+            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
+                   'specify a footer template, it will take precedence '
+                   'over this option.')),
+        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
+            help=_('An HTML template used to generate %s on every page.'
+                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
+        OptionRecommendation(name='pdf_header_template', recommended_value=None,
+            help=_('An HTML template used to generate %s on every page.'
+                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
+        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
+            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
+                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for generated table of contents.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the left page margin, in pts. Default is 72pt.'
+                   ' Overrides the common left page margin setting.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the top page margin, in pts. Default is 72pt.'
+                   ' Overrides the common top page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the right page margin, in pts. Default is 72pt.'
+                   ' Overrides the common right page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
+                   ' Overrides the common bottom page margin setting, unless set to zero.')
+        ),
+        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
+            help=_('Use the page margins specified in the input document via @page CSS rules.'
+            ' This will cause the margins specified in the conversion settings to be ignored.'
+            ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
+        ),
+        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
+            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
+                ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
+        ),
+        OptionRecommendation(name='uncompressed_pdf',
+            recommended_value=False, help=_(
+                'Generate an uncompressed PDF, useful for debugging.')
+        ),
+        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
+            level=OptionRecommendation.LOW,
+            help=_(
+                'Shift the text horizontally by the specified offset (in pts).'
+                ' On odd numbered pages, it is shifted to the right and on even'
+                ' numbered pages to the left. Use negative numbers for the opposite'
+                ' effect. Note that this setting is ignored on pages where the margins'
+                ' are smaller than the specified offset. Shifting is done by setting'
+                ' the PDF CropBox, not all software respects the CropBox.'
+            )
+        ),
+
+    }
+
+    def specialize_options(self, log, opts, input_fmt):
+        '''
+        Prepare Qt WebEngine and adjust options before the conversion runs.
+
+        Registers the ``FAKE_PROTOCOL`` URL scheme (host syntax, flagged as
+        a secure scheme), calls ``must_use_qt()`` and remembers ``input_fmt``
+        on the plugin. When ``opts.pdf_use_document_margins`` is set, the
+        four common margin options are set to -1.
+
+        :param log: Not used by this method.
+        :param opts: Conversion options; the margin options may be modified.
+        :param input_fmt: Stored as ``self.input_fmt``.
+        '''
+        # Ensure Qt is setup to be used with WebEngine
+        # specialize_options is called early enough in the pipeline
+        # that hopefully no Qt application has been constructed as yet
+        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
+        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
+        from calibre.gui2 import must_use_qt
+        from calibre.constants import FAKE_PROTOCOL
+        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
+        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
+        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
+        QWebEngineUrlScheme.registerScheme(scheme)
+        must_use_qt()
+        self.input_fmt = input_fmt
+
+        if opts.pdf_use_document_margins:
+            # Prevent the conversion pipeline from overwriting document margins
+            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
+
+        self.oeb = oeb_book
+        self.input_plugin, self.opts, self.log = input_plugin, opts, log
+        self.output_path = output_path
+        from calibre.ebooks.oeb.base import OPF, OPF2_NS
+        from lxml import etree
+        from io import BytesIO
+        package = etree.Element(OPF('package'),
+            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
+            nsmap={None: OPF2_NS})
+        from calibre.ebooks.metadata.opf2 import OPF
+        self.oeb.metadata.to_opf2(package)
+        self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
+        self.cover_data = None
+
+        if input_plugin.is_image_collection:
+            log.debug('Converting input as an image collection...')
+            self.convert_images(input_plugin.get_images())
+        else:
+            log.debug('Converting input as a text based book...')
+            self.convert_text(oeb_book)
+
+    def convert_images(self, images):
+        from calibre.ebooks.pdf.image_writer import convert
+        convert(images, self.output_path, self.opts, self.metadata, self.report_progress)
+
+    def get_cover_data(self):
+        oeb = self.oeb
+        if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
+            cover_id = unicode_type(oeb.metadata.cover[0])
+            item = oeb.manifest.ids[cover_id]
+            self.cover_data = item.data
+
+    def process_fonts(self):
+        ''' Make sure all fonts are embeddable '''
+        from calibre.ebooks.oeb.base import urlnormalize
+        from calibre.utils.fonts.utils import remove_embed_restriction
+
+        processed = set()
+        for item in list(self.oeb.manifest):
+            if not hasattr(item.data, 'cssRules'):
+                continue
+            for i, rule in enumerate(item.data.cssRules):
+                if rule.type == rule.FONT_FACE_RULE:
+                    try:
+                        s = rule.style
+                        src = s.getProperty('src').propertyValue[0].uri
+                    except:
+                        continue
+                    path = item.abshref(src)
+                    ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
+                    if ff is None:
+                        continue
+
+                    raw = nraw = ff.data
+                    if path not in processed:
+                        processed.add(path)
+                        try:
+                            nraw = remove_embed_restriction(raw)
+                        except:
+                            continue
+                        if nraw != raw:
+                            ff.data = nraw
+                            self.oeb.container.write(path, nraw)
+
+    def convert_text(self, oeb_book):
+        import json
+        from calibre.ebooks.pdf.html_writer import convert
+        self.get_cover_data()
+        self.process_fonts()
+
+        if self.opts.pdf_use_document_margins and self.stored_page_margins:
+            for href, margins in iteritems(self.stored_page_margins):
+                item = oeb_book.manifest.hrefs.get(href)
+                if item is not None:
+                    root = item.data
+                    if hasattr(root, 'xpath') and margins:
+                        root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))
+
+        with TemporaryDirectory('_pdf_out') as oeb_dir:
+            from calibre.customize.ui import plugin_for_output_format
+            oeb_dir = os.path.realpath(oeb_dir)
+            oeb_output = plugin_for_output_format('oeb')
+            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
+            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
+            convert(
+                opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
+                log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
+            )
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import glob
+import os
+import shutil
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import getcwd
+
+
+class PMLInput(InputFormatPlugin):
+
+    name        = 'PML Input'
+    author      = 'John Schember'
+    description = 'Convert PML to OEB'
+    # pmlz is a zip file containing pml files and png images.
+    file_types  = {'pml', 'pmlz'}
+    commit_name = 'pml_input'
+
+    def process_pml(self, pml_path, html_path, close_all=False):
+        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer
+
+        pclose = False
+        hclose = False
+
+        if not hasattr(pml_path, 'read'):
+            pml_stream = lopen(pml_path, 'rb')
+            pclose = True
+        else:
+            pml_stream = pml_path
+            pml_stream.seek(0)
+
+        if not hasattr(html_path, 'write'):
+            html_stream = lopen(html_path, 'wb')
+            hclose = True
+        else:
+            html_stream = html_path
+
+        ienc = getattr(pml_stream, 'encoding', None)
+        if ienc is None:
+            ienc = 'cp1252'
+        if self.options.input_encoding:
+            ienc = self.options.input_encoding
+
+        self.log.debug('Converting PML to HTML...')
+        hizer = PML_HTMLizer()
+        html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
+        html = '<html><head><title></title></head><body>%s</body></html>'%html
+        html_stream.write(html.encode('utf-8', 'replace'))
+
+        if pclose:
+            pml_stream.close()
+        if hclose:
+            html_stream.close()
+
+        return hizer.get_toc()
+
+    def get_images(self, stream, tdir, top_level=False):
+        images = []
+        imgs = []
+
+        if top_level:
+            imgs = glob.glob(os.path.join(tdir, '*.png'))
+        # Images not in top level try bookname_img directory because
+        # that's where Dropbook likes to see them.
+        if not imgs:
+            if hasattr(stream, 'name'):
+                imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png'))
+        # No images in Dropbook location try generic images directory
+        if not imgs:
+            imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png'))
+        if imgs:
+            os.makedirs(os.path.join(getcwd(), 'images'))
+        for img in imgs:
+            pimg_name = os.path.basename(img)
+            pimg_path = os.path.join(getcwd(), 'images', pimg_name)
+
+            images.append('images/' + pimg_name)
+
+            shutil.copy(img, pimg_path)
+
+        return images
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.metadata.toc import TOC
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.utils.zipfile import ZipFile
+
+        self.options = options
+        self.log = log
+        pages, images = [], []
+        toc = TOC()
+
+        if file_ext == 'pmlz':
+            log.debug('De-compressing content to temporary directory...')
+            with TemporaryDirectory('_unpmlz') as tdir:
+                zf = ZipFile(stream)
+                zf.extractall(tdir)
+
+                pmls = glob.glob(os.path.join(tdir, '*.pml'))
+                for pml in pmls:
+                    html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
+                    html_path = os.path.join(getcwd(), html_name)
+
+                    pages.append(html_name)
+                    log.debug('Processing PML item %s...' % pml)
+                    ttoc = self.process_pml(pml, html_path)
+                    toc += ttoc
+                images = self.get_images(stream, tdir, True)
+        else:
+            toc = self.process_pml(stream, 'index.html')
+            pages.append('index.html')
+
+            if hasattr(stream, 'name'):
+                images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))
+
+        # We want pages to be orded alphabetically.
+        pages.sort()
+
+        manifest_items = []
+        for item in pages+images:
+            manifest_items.append((item, None))
+
+        from calibre.ebooks.metadata.meta import get_metadata
+        log.debug('Reading metadata from input file...')
+        mi = get_metadata(stream, 'pml')
+        if 'images/cover.png' in images:
+            mi.cover = 'images/cover.png'
+        opf = OPFCreator(getcwd(), mi)
+        log.debug('Generating manifest...')
+        opf.create_manifest(manifest_items)
+        opf.create_spine(pages)
+        opf.set_toc(toc)
+        with lopen('metadata.opf', 'wb') as opffile:
+            with lopen('toc.ncx', 'wb') as tocfile:
+                opf.render(opffile, tocfile, 'toc.ncx')
+
+        return os.path.join(getcwd(), 'metadata.opf')
+
+    def postprocess_book(self, oeb, opts, log):
+        from calibre.ebooks.oeb.base import XHTML, barename
+        for item in oeb.spine:
+            if hasattr(item.data, 'xpath'):
+                for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
+                    if not len(heading):
+                        continue
+                    span = heading[0]
+                    if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
+                        if not heading.get('id') and span.get('id'):
+                            heading.set('id', span.get('id'))
+                            heading.text = span.tail
+                            heading.remove(span)
+                    if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
+                        div = heading[0]
+                        if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
+                            heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
+                            heading.remove(div)
+                            heading.set('style', 'text-align: center')
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, io
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import unicode_type
+
+
+class PMLOutput(OutputFormatPlugin):
+
+    name = 'PML Output'
+    author = 'John Schember'
+    file_type = 'pmlz'
+    commit_name = 'pml_output'
+
+    options = {
+        OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is cp1252.')),
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.')),
+        OptionRecommendation(name='full_image_depth',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not reduce the size or bit depth of images. Images '
+                   'have their size and depth reduced by default to accommodate '
+                   'applications that can not convert images on their '
+                   'own such as Dropbook.')),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.pml.pmlml import PMLMLizer
+        from calibre.utils.zipfile import ZipFile
+
+        with TemporaryDirectory('_pmlz_output') as tdir:
+            pmlmlizer = PMLMLizer(log)
+            pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
+            with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
+                out.write(pml.encode(opts.pml_output_encoding, 'replace'))
+
+            img_path = os.path.join(tdir, 'index_img')
+            if not os.path.exists(img_path):
+                os.makedirs(img_path)
+            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)
+
+            log.debug('Compressing output...')
+            pmlz = ZipFile(output_path, 'w')
+            pmlz.add_dir(tdir)
+
+    def write_images(self, manifest, image_hrefs, out_dir, opts):
+        from PIL import Image
+
+        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+        for item in manifest:
+            if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys():
+                if opts.full_image_depth:
+                    im = Image.open(io.BytesIO(item.data))
+                else:
+                    im = Image.open(io.BytesIO(item.data)).convert('P')
+                    im.thumbnail((300,300), Image.ANTIALIAS)
+
+                data = io.BytesIO()
+                im.save(data, 'PNG')
+                data = data.getvalue()
+
+                path = os.path.join(out_dir, image_hrefs[item.href])
+
+                with lopen(path, 'wb') as out:
+                    out.write(data)
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class RBInput(InputFormatPlugin):
+
+    name        = 'RB Input'
+    author      = 'John Schember'
+    description = 'Convert RB files to HTML'
+    file_types  = {'rb'}
+    commit_name = 'rb_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.rb.reader import Reader
+
+        reader = Reader(stream, log, options.input_encoding)
+        opf = reader.extract_content(getcwd())
+
+        return opf
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+
+
+class RBOutput(OutputFormatPlugin):
+
+    name = 'RB Output'
+    author = 'John Schember'
+    file_type = 'rb'
+    commit_name = 'rb_output'
+
+    options = {
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.'))}
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.rb.writer import RBWriter
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        writer = RBWriter(opts, log)
+
+        out_stream.seek(0)
+        out_stream.truncate()
+
+        writer.write_content(oeb_book, out_stream, oeb_book.metadata)
+
+        if close:
+            out_stream.close()
@@ -0,0 +1,169 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.constants import numeric_version
+from calibre import walk
+from polyglot.builtins import unicode_type
+
+
+class RecipeDisabled(Exception):
+    pass
+
+
+class RecipeInput(InputFormatPlugin):
+    '''
+    Conversion input plugin for news recipes. It is registered for the
+    ``recipe`` and ``downloaded_recipe`` file types (see ``file_types``).
+    '''
+
+    name        = 'Recipe Input'
+    author      = 'Kovid Goyal'
+    description = _('Download periodical content from the internet')
+    file_types  = {'recipe', 'downloaded_recipe'}
+    commit_name = 'recipe_input'
+
+    # Three-element tuples; the first element is an option name and the last
+    # is always OptionRecommendation.HIGH. Presumably (name, value, level) --
+    # confirm against InputFormatPlugin.
+    recommendations = {
+        ('chapter', None, OptionRecommendation.HIGH),
+        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
+        ('use_auto_toc', False, OptionRecommendation.HIGH),
+        ('input_encoding', None, OptionRecommendation.HIGH),
+        ('input_profile', 'default', OptionRecommendation.HIGH),
+        ('page_breaks_before', None, OptionRecommendation.HIGH),
+        ('insert_metadata', False, OptionRecommendation.HIGH),
+        }
+
+    # Options specific to this plugin.
+    options = {
+        OptionRecommendation(name='test', recommended_value=False,
+            help=_(
+            'Useful for recipe development. Forces'
+            ' max_articles_per_feed to 2 and downloads at most 2 feeds.'
+            ' You can change the number of feeds and articles by supplying optional arguments.'
+            ' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
+        OptionRecommendation(name='username', recommended_value=None,
+            help=_('Username for sites that require a login to access '
+                'content.')),
+        OptionRecommendation(name='password', recommended_value=None,
+            help=_('Password for sites that require a login to access '
+                'content.')),
+        OptionRecommendation(name='dont_download_recipe',
+            recommended_value=False,
+            help=_('Do not download latest version of builtin recipes from the calibre server')),
+        # NOTE(review): unlike the other options, this help text is not
+        # wrapped in _() -- confirm whether that is intentional.
+        OptionRecommendation(name='lrf', recommended_value=False,
+            help='Optimize fetching for subsequent conversion to LRF.'),
+        }
+
+    def convert(self, recipe_or_file, opts, file_ext, log,
+            accelerators):
+        from calibre.web.feeds.recipes import compile_recipe
+        opts.output_profile.flow_size = 0
+        if file_ext == 'downloaded_recipe':
+            from calibre.utils.zipfile import ZipFile
+            zf = ZipFile(recipe_or_file, 'r')
+            zf.extractall()
+            zf.close()
+            with lopen('download.recipe', 'rb') as f:
+                self.recipe_source = f.read()
+            recipe = compile_recipe(self.recipe_source)
+            recipe.needs_subscription = False
+            self.recipe_object = recipe(opts, log, self.report_progress)
+        else:
+            if os.environ.get('CALIBRE_RECIPE_URN'):
+                from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
+                urn = os.environ['CALIBRE_RECIPE_URN']
+                log('Downloading recipe urn: ' + urn)
+                rtype, recipe_id = urn.partition(':')[::2]
+                if not recipe_id:
+                    raise ValueError('Invalid recipe urn: ' + urn)
+                if rtype == 'custom':
+                    self.recipe_source = get_custom_recipe(recipe_id)
+                else:
+                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
+                if not self.recipe_source:
+                    raise ValueError('Could not find recipe with urn: ' + urn)
+                if not isinstance(self.recipe_source, bytes):
+                    self.recipe_source = self.recipe_source.encode('utf-8')
+                recipe = compile_recipe(self.recipe_source)
+            elif os.access(recipe_or_file, os.R_OK):
+                with lopen(recipe_or_file, 'rb') as f:
+                    self.recipe_source = f.read()
+                recipe = compile_recipe(self.recipe_source)
+                log('Using custom recipe')
+            else:
+                from calibre.web.feeds.recipes.collection import (
+                        get_builtin_recipe_by_title, get_builtin_recipe_titles)
+                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
+                title = os.path.basename(title).rpartition('.')[0]
+                titles = frozenset(get_builtin_recipe_titles())
+                if title not in titles:
+                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
+                    title = title.rpartition('.')[0]
+
+                raw = get_builtin_recipe_by_title(title, log=log,
+                        download_recipe=not opts.dont_download_recipe)
+                builtin = False
+                try:
+                    recipe = compile_recipe(raw)
+                    self.recipe_source = raw
+                    if recipe.requires_version > numeric_version:
+                        log.warn(
+                        'Downloaded recipe needs calibre version at least: %s' %
+                        ('.'.join(recipe.requires_version)))
+                        builtin = True
+                except:
+                    log.exception('Failed to compile downloaded recipe. Falling '
+                            'back to builtin one')
+                    builtin = True
+                if builtin:
+                    log('Using bundled builtin recipe')
+                    raw = get_builtin_recipe_by_title(title, log=log,
+                            download_recipe=False)
+                    if raw is None:
+                        raise ValueError('Failed to find builtin recipe: '+title)
+                    recipe = compile_recipe(raw)
+                    self.recipe_source = raw
+                else:
+                    log('Using downloaded builtin recipe')
+
+            if recipe is None:
+                raise ValueError('%r is not a valid recipe file or builtin recipe' %
+                        recipe_or_file)
+
+            disabled = getattr(recipe, 'recipe_disabled', None)
+            if disabled is not None:
+                raise RecipeDisabled(disabled)
+            ro = recipe(opts, log, self.report_progress)
+            ro.download()
+            self.recipe_object = ro
+
+        for key, val in self.recipe_object.conversion_options.items():
+            setattr(opts, key, val)
+
+        for f in os.listdir('.'):
+            if f.endswith('.opf'):
+                return os.path.abspath(f)
+
+        for f in walk('.'):
+            if f.endswith('.opf'):
+                return os.path.abspath(f)
+
+    def postprocess_book(self, oeb, opts, log):
+        if self.recipe_object is not None:
+            self.recipe_object.internal_postprocess_book(oeb, opts, log)
+            self.recipe_object.postprocess_book(oeb, opts, log)
+
+    def specialize(self, oeb, opts, log, output_fmt):
+        if opts.no_inline_navbars:
+            from calibre.ebooks.oeb.base import XPath
+            for item in oeb.spine:
+                for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
+                    div.getparent().remove(div)
+
+    def save_download(self, zf):
+        raw = self.recipe_source
+        if isinstance(raw, unicode_type):
+            raw = raw.encode('utf-8')
+        zf.writestr('download.recipe', raw)
@@ -0,0 +1,323 @@
+from __future__ import with_statement, unicode_literals
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, glob, re, textwrap
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import iteritems, filter, getcwd, as_bytes
+
+# Maps the border style names read from the ``border-cell-*-style``
+# attributes (see RTFInput.convert_borders) to CSS ``border-style`` keywords.
+# Names missing from this table fall back to 'solid' in convert_borders().
+border_style_map = {
+        'single' : 'solid',
+        'double-thickness-border' : 'double',
+        'shadowed-border': 'outset',
+        'double-border': 'double',
+        'dotted-border': 'dotted',
+        'dashed': 'dashed',
+        'hairline': 'solid',
+        'inset': 'inset',
+        'dash-small': 'dashed',
+        'dot-dash': 'dotted',
+        'dot-dot-dash': 'dotted',
+        'outset': 'outset',
+        # Both spellings are accepted; presumably 'tripple' matches a
+        # misspelling in the XML input -- confirm before removing.
+        'tripple': 'double',
+        'triple': 'double',
+        'thick-thin-small': 'solid',
+        'thin-thick-small': 'solid',
+        'thin-thick-thin-small': 'solid',
+        'thick-thin-medium': 'solid',
+        'thin-thick-medium': 'solid',
+        'thin-thick-thin-medium': 'solid',
+        'thick-thin-large': 'solid',
+        'thin-thick-thin-large': 'solid',
+        'wavy': 'ridge',
+        'double-wavy': 'ridge',
+        'striped': 'ridge',
+        'emboss': 'inset',
+        'engrave': 'inset',
+        'frame': 'ridge',
+}
+
+
+class RTFInput(InputFormatPlugin):
+    '''
+    Conversion input plugin for ``.rtf`` files. The RTF is first converted
+    to XML and then transformed to XHTML with an XSLT stylesheet (see
+    ``convert()``).
+    '''
+
+    name        = 'RTF Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert RTF files to HTML'
+    file_types  = {'rtf'}
+    commit_name = 'rtf_input'
+
+    # ignore_wmf is read in replace_wmf(): when set, WMF images that cannot
+    # be rasterized are deleted instead of being replaced by a placeholder.
+    options = {
+        OptionRecommendation(name='ignore_wmf', recommended_value=False,
+            help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
+    }
+
+    def generate_xml(self, stream):
+        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
+        ofile = u'dataxml.xml'
+        run_lev, debug_dir, indent_out = 1, None, 0
+        if getattr(self.opts, 'debug_pipeline', None) is not None:
+            try:
+                os.mkdir(u'rtfdebug')
+                debug_dir = u'rtfdebug'
+                run_lev = 4
+                indent_out = 1
+                self.log('Running RTFParser in debug mode')
+            except:
+                self.log.warn('Impossible to run RTFParser in debug mode')
+        parser = ParseRtf(
+            in_file=stream,
+            out_file=ofile,
+            # Convert symbol fonts to unicode equivalents. Default
+            # is 1
+            convert_symbol=1,
+
+            # Convert Zapf fonts to unicode equivalents. Default
+            # is 1.
+            convert_zapf=1,
+
+            # Convert Wingding fonts to unicode equivalents.
+            # Default is 1.
+            convert_wingdings=1,
+
+            # Convert RTF caps to real caps.
+            # Default is 1.
+            convert_caps=1,
+
+            # Indent resulting XML.
+            # Default is 0 (no indent).
+            indent=indent_out,
+
+            # Form lists from RTF. Default is 1.
+            form_lists=1,
+
+            # Convert headings to sections. Default is 0.
+            headings_to_sections=1,
+
+            # Group paragraphs with the same style name. Default is 1.
+            group_styles=1,
+
+            # Group borders. Default is 1.
+            group_borders=1,
+
+            # Write or do not write paragraphs. Default is 0.
+            empty_paragraphs=1,
+
+            # Debug
+            deb_dir=debug_dir,
+
+            # Default encoding
+            default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
+
+            # Run level
+            run_level=run_lev,
+        )
+        parser.parse_rtf()
+        with open(ofile, 'rb') as f:
+            return f.read()
+
+    def extract_images(self, picts):
+        from calibre.utils.imghdr import what
+        from binascii import unhexlify
+        self.log('Extracting images...')
+
+        with open(picts, 'rb') as f:
+            raw = f.read()
+        picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
+        hex_pat = re.compile(br'[^a-fA-F0-9]')
+        encs = [hex_pat.sub(b'', pict) for pict in picts]
+
+        count = 0
+        imap = {}
+        for enc in encs:
+            if len(enc) % 2 == 1:
+                enc = enc[:-1]
+            data = unhexlify(enc)
+            fmt = what(None, data)
+            if fmt is None:
+                fmt = 'wmf'
+            count += 1
+            name = u'%04d.%s' % (count, fmt)
+            with open(name, 'wb') as f:
+                f.write(data)
+            imap[count] = name
+            # with open(name+'.hex', 'wb') as f:
+            #     f.write(enc)
+        return self.convert_images(imap)
+
+    def convert_images(self, imap):
+        self.default_img = None
+        for count, val in iteritems(imap):
+            try:
+                imap[count] = self.convert_image(val)
+            except:
+                self.log.exception('Failed to convert', val)
+        return imap
+
+    def convert_image(self, name):
+        if not name.endswith('.wmf'):
+            return name
+        try:
+            return self.rasterize_wmf(name)
+        except Exception:
+            self.log.exception('Failed to convert WMF image %r'%name)
+        return self.replace_wmf(name)
+
+    def replace_wmf(self, name):
+        if self.opts.ignore_wmf:
+            os.remove(name)
+            return '__REMOVE_ME__'
+        from calibre.ebooks.covers import message_image
+        if self.default_img is None:
+            self.default_img = message_image('Conversion of WMF images is not supported.'
+            ' Use Microsoft Word or OpenOffice to save this RTF file'
+            ' as HTML and convert that in calibre.')
+        name = name.replace('.wmf', '.jpg')
+        with lopen(name, 'wb') as f:
+            f.write(self.default_img)
+        return name
+
+    def rasterize_wmf(self, name):
+        from calibre.utils.wmf.parse import wmf_unwrap
+        with open(name, 'rb') as f:
+            data = f.read()
+        data = wmf_unwrap(data)
+        name = name.replace('.wmf', '.png')
+        with open(name, 'wb') as f:
+            f.write(data)
+        return name
+
+    def write_inline_css(self, ic, border_styles):
+        font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
+                enumerate(ic.font_sizes)]
+        color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
+                enumerate(ic.colors) if x != 'false']
+        css = textwrap.dedent('''
+        span.none {
+            text-decoration: none; font-weight: normal;
+            font-style: normal; font-variant: normal
+        }
+
+        span.italics { font-style: italic }
+
+        span.bold { font-weight: bold }
+
+        span.small-caps { font-variant: small-caps }
+
+        span.underlined { text-decoration: underline }
+
+        span.strike-through { text-decoration: line-through }
+
+        ''')
+        css += '\n'+'\n'.join(font_size_classes)
+        css += '\n' +'\n'.join(color_classes)
+
+        for cls, val in iteritems(border_styles):
+            css += '\n\n.%s {\n%s\n}'%(cls, val)
+
+        with open(u'styles.css', 'ab') as f:
+            f.write(css.encode('utf-8'))
+
+    def convert_borders(self, doc):
+        border_styles = []
+        style_map = {}
+        for elem in doc.xpath(r'//*[local-name()="cell"]'):
+            style = ['border-style: hidden', 'border-width: 1px',
+                    'border-color: black']
+            for x in ('bottom', 'top', 'left', 'right'):
+                bs = elem.get('border-cell-%s-style'%x, None)
+                if bs:
+                    cbs = border_style_map.get(bs, 'solid')
+                    style.append('border-%s-style: %s'%(x, cbs))
+                bw = elem.get('border-cell-%s-line-width'%x, None)
+                if bw:
+                    style.append('border-%s-width: %spt'%(x, bw))
+                bc = elem.get('border-cell-%s-color'%x, None)
+                if bc:
+                    style.append('border-%s-color: %s'%(x, bc))
+            style = ';\n'.join(style)
+            if style not in border_styles:
+                border_styles.append(style)
+            idx = border_styles.index(style)
+            cls = 'border_style%d'%idx
+            style_map[cls] = style
+            elem.set('class', cls)
+        return style_map
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from lxml import etree
+        from calibre.ebooks.metadata.meta import get_metadata
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
+        from calibre.ebooks.rtf.input import InlineClass
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        self.opts = options
+        self.log = log
+        self.log('Converting RTF to XML...')
+        try:
+            xml = self.generate_xml(stream.name)
+        except RtfInvalidCodeException as e:
+            self.log.exception('Unable to parse RTF')
+            raise ValueError(_('This RTF file has a feature calibre does not '
+            'support. Convert it to HTML first and then try it.\n%s')%e)
+
+        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
+        if d:
+            imap = {}
+            try:
+                imap = self.extract_images(d[0])
+            except:
+                self.log.exception('Failed to extract images...')
+
+        self.log('Parsing XML...')
+        doc = safe_xml_fromstring(xml)
+        border_styles = self.convert_borders(doc)
+        for pict in doc.xpath('//rtf:pict[@num]',
+                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
+            num = int(pict.get('num'))
+            name = imap.get(num, None)
+            if name is not None:
+                pict.set('num', name)
+
+        self.log('Converting XML to HTML...')
+        inline_class = InlineClass(self.log)
+        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
+        extensions = {('calibre', 'inline-class') : inline_class}
+        transform = etree.XSLT(styledoc, extensions=extensions)
+        result = transform(doc)
+        html = u'index.xhtml'
+        with open(html, 'wb') as f:
+            res = as_bytes(transform.tostring(result))
+            # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            # clean multiple \n
+            res = re.sub(b'\n+', b'\n', res)
+            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+            # res = re.sub('\s*<body>', '<body>', res)
+            # res = re.sub('(?<=\n)\n{2}',
+            # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
+            f.write(res)
+        self.write_inline_css(inline_class, border_styles)
+        stream.seek(0)
+        mi = get_metadata(stream, 'rtf')
+        if not mi.title:
+            mi.title = _('Unknown')
+        if not mi.authors:
+            mi.authors = [_('Unknown')]
+        opf = OPFCreator(getcwd(), mi)
+        opf.create_manifest([(u'index.xhtml', None)])
+        opf.create_spine([u'index.xhtml'])
+        opf.render(open(u'metadata.opf', 'wb'))
+        return os.path.abspath(u'metadata.opf')
+
+    def postprocess_book(self, oeb, opts, log):
+        for item in oeb.spine:
+            for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
+                p = img.getparent()
+                idx = p.index(img)
+                p.remove(img)
+                if img.tail:
+                    if idx == 0:
+                        p.text = (p.text or '') + img.tail
+                    else:
+                        p[idx-1].tail = (p[idx-1].tail or '') + img.tail
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin
+
+
+class RTFOutput(OutputFormatPlugin):
+
+    name = 'RTF Output'
+    author = 'John Schember'
+    file_type = 'rtf'
+    commit_name = 'rtf_output'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.rtf.rtfml import RTFMLizer
+
+        rtfmlitzer = RTFMLizer(log)
+        content = rtfmlitzer.extract_content(oeb_book, opts)
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(content.encode('ascii', 'replace'))
+
+        if close:
+            out_stream.close()
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.filenames import ascii_filename
+from polyglot.builtins import unicode_type
+
+# Skeleton of each generated chapter file. The first %s receives the chapter
+# name (used as <title>), the second the body markup; see SNBInput.convert().
+HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
+
+
+def html_encode(s):
+    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;').replace('\n', '<br/>').replace(' ', '&nbsp;')  # noqa
+
+
+class SNBInput(InputFormatPlugin):
+
+    name        = 'SNB Input'
+    author      = 'Li Fanxi'
+    description = 'Convert SNB files to OEB'
+    file_types  = {'snb'}
+    commit_name = 'snb_input'
+
+    options = set()
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        import uuid
+
+        from calibre.ebooks.oeb.base import DirContainer
+        from calibre.ebooks.snb.snbfile import SNBFile
+        from calibre.utils.xml_parse import safe_xml_fromstring
+
+        log.debug("Parsing SNB file...")
+        snbFile = SNBFile()
+        try:
+            snbFile.Parse(stream)
+        except:
+            raise ValueError("Invalid SNB file")
+        if not snbFile.IsValid():
+            log.debug("Invalid SNB file")
+            raise ValueError("Invalid SNB file")
+        log.debug("Handle meta data ...")
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        oeb = create_oebbook(log, None, options,
+                encoding=options.input_encoding, populate=False)
+        meta = snbFile.GetFileStream('snbf/book.snbf')
+        if meta is not None:
+            meta = safe_xml_fromstring(meta)
+            l = {'title'    : './/head/name',
+                  'creator'  : './/head/author',
+                  'language' : './/head/language',
+                  'generator': './/head/generator',
+                  'publisher': './/head/publisher',
+                  'cover'    : './/head/cover', }
+            d = {}
+            for item in l:
+                node = meta.find(l[item])
+                if node is not None:
+                    d[item] = node.text if node.text is not None else ''
+                else:
+                    d[item] = ''
+
+            oeb.metadata.add('title', d['title'])
+            oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
+            oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
+            oeb.metadata.add('generator', d['generator'])
+            oeb.metadata.add('publisher', d['publisher'])
+            if d['cover'] != '':
+                oeb.guide.add('cover', 'Cover', d['cover'])
+
+        bookid = unicode_type(uuid.uuid4())
+        oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
+        for ident in oeb.metadata.identifier:
+            if 'id' in ident.attrib:
+                oeb.uid = oeb.metadata.identifier[0]
+                break
+
+        with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
+            log.debug('Process TOC ...')
+            toc = snbFile.GetFileStream('snbf/toc.snbf')
+            oeb.container = DirContainer(tdir, log)
+            if toc is not None:
+                toc = safe_xml_fromstring(toc)
+                i = 1
+                for ch in toc.find('.//body'):
+                    chapterName = ch.text
+                    chapterSrc = ch.get('src')
+                    fname = 'ch_%d.htm' % i
+                    data = snbFile.GetFileStream('snbc/' + chapterSrc)
+                    if data is None:
+                        continue
+                    snbc = safe_xml_fromstring(data)
+                    lines = []
+                    for line in snbc.find('.//body'):
+                        if line.tag == 'text':
+                            lines.append('<p>%s</p>' % html_encode(line.text))
+                        elif line.tag == 'img':
+                            lines.append('<p><img src="%s" /></p>' % html_encode(line.text))
+                    with open(os.path.join(tdir, fname), 'wb') as f:
+                        f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
+                    oeb.toc.add(ch.text, fname)
+                    id, href = oeb.manifest.generate(id='html',
+                        href=ascii_filename(fname))
+                    item = oeb.manifest.add(id, href, 'text/html')
+                    item.html_input_href = fname
+                    oeb.spine.add(item, True)
+                    i = i + 1
+                imageFiles = snbFile.OutputImageFiles(tdir)
+                for f, m in imageFiles:
+                    id, href = oeb.manifest.generate(id='image',
+                        href=ascii_filename(f))
+                    item = oeb.manifest.add(id, href, m)
+                    item.html_input_href = f
+
+        return oeb
@@ -0,0 +1,269 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+from calibre.ptempfile import TemporaryDirectory
+from calibre.constants import __appname__, __version__
+from polyglot.builtins import unicode_type
+
+
+class SNBOutput(OutputFormatPlugin):
+    # Output plugin that writes an OEB book as an SNB file. The conversion
+    # itself is implemented in convert(); images are written by HandleImage().
+
+    name = 'SNB Output'
+    author = 'Li Fanxi'
+    file_type = 'snb'
+    commit_name = 'snb_output'
+
+    # Options specific to this plugin. All of them are declared with
+    # OptionRecommendation.LOW. Of these, only snb_full_screen is read in this
+    # class (by HandleImage); the whole options object is also handed to
+    # SNBMLizer in convert().
+    options = {
+        OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is utf-8.')),
+        OptionRecommendation(name='snb_max_line_length',
+            recommended_value=0, level=OptionRecommendation.LOW,
+            help=_('The maximum number of characters per line. This splits on '
+            'the first space before the specified value. If no space is found '
+            'the line will be broken at the space after and will exceed the '
+            'specified value. Also, there is a minimum of 25 characters. '
+            'Use 0 to disable line splitting.')),
+        OptionRecommendation(name='snb_insert_empty_line',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Specify whether or not to insert an empty line between '
+            'two paragraphs.')),
+        OptionRecommendation(name='snb_dont_indent_first_line',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Specify whether or not to insert two space characters '
+            'to indent the first line of each paragraph.')),
+        OptionRecommendation(name='snb_hide_chapter_name',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Specify whether or not to hide the chapter title for each '
+            'chapter. Useful for image-only output (eg. comics).')),
+        OptionRecommendation(name='snb_full_screen',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Resize all the images for full screen view. ')),
+     }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from lxml import etree
+        from calibre.ebooks.snb.snbfile import SNBFile
+        from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
+
+        self.opts = opts
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
+        try:
+            rasterizer = SVGRasterizer()
+            rasterizer(oeb_book, opts)
+        except Unavailable:
+            log.warn('SVG rasterizer unavailable, SVG will not be converted')
+
+        # Create temp dir
+        with TemporaryDirectory('_snb_output') as tdir:
+            # Create stub directories
+            snbfDir = os.path.join(tdir, 'snbf')
+            snbcDir = os.path.join(tdir, 'snbc')
+            snbiDir = os.path.join(tdir, 'snbc/images')
+            os.mkdir(snbfDir)
+            os.mkdir(snbcDir)
+            os.mkdir(snbiDir)
+
+            # Process Meta data
+            meta = oeb_book.metadata
+            if meta.title:
+                title = unicode_type(meta.title[0])
+            else:
+                title = ''
+            authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
+            if meta.publisher:
+                publishers = unicode_type(meta.publisher[0])
+            else:
+                publishers = ''
+            if meta.language:
+                lang = unicode_type(meta.language[0]).upper()
+            else:
+                lang = ''
+            if meta.description:
+                abstract = unicode_type(meta.description[0])
+            else:
+                abstract = ''
+
+            # Process Cover
+            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
+            href = None
+            if 'titlepage' not in g:
+                if 'cover' in g:
+                    href = g['cover'].href
+
+            # Output book info file
+            bookInfoTree = etree.Element("book-snbf", version="1.0")
+            headTree = etree.SubElement(bookInfoTree, "head")
+            etree.SubElement(headTree, "name").text = title
+            etree.SubElement(headTree, "author").text = ' '.join(authors)
+            etree.SubElement(headTree, "language").text = lang
+            etree.SubElement(headTree, "rights")
+            etree.SubElement(headTree, "publisher").text = publishers
+            etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
+            etree.SubElement(headTree, "created")
+            etree.SubElement(headTree, "abstract").text = abstract
+            if href is not None:
+                etree.SubElement(headTree, "cover").text = ProcessFileName(href)
+            else:
+                etree.SubElement(headTree, "cover")
+            with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
+                f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
+
+            # Output TOC
+            tocInfoTree = etree.Element("toc-snbf")
+            tocHead = etree.SubElement(tocInfoTree, "head")
+            tocBody = etree.SubElement(tocInfoTree, "body")
+            outputFiles = {}
+            if oeb_book.toc.count() == 0:
+                log.warn('This SNB file has no Table of Contents. '
+                    'Creating a default TOC')
+                first = next(iter(oeb_book.spine))
+                oeb_book.toc.add(_('Start page'), first.href)
+            else:
+                first = next(iter(oeb_book.spine))
+                if oeb_book.toc[0].href != first.href:
+                    # The pages before the fist item in toc will be stored as
+                    # "Cover Pages".
+                    # oeb_book.toc does not support "insert", so we generate
+                    # the tocInfoTree directly instead of modifying the toc
+                    ch = etree.SubElement(tocBody, "chapter")
+                    ch.set("src", ProcessFileName(first.href) + ".snbc")
+                    ch.text = _('Cover pages')
+                    outputFiles[first.href] = []
+                    outputFiles[first.href].append(("", _("Cover pages")))
+
+            for tocitem in oeb_book.toc:
+                if tocitem.href.find('#') != -1:
+                    item = tocitem.href.split('#')
+                    if len(item) != 2:
+                        log.error('Error in TOC item: %s' % tocitem)
+                    else:
+                        if item[0] in outputFiles:
+                            outputFiles[item[0]].append((item[1], tocitem.title))
+                        else:
+                            outputFiles[item[0]] = []
+                            if "" not in outputFiles[item[0]]:
+                                outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
+                                ch = etree.SubElement(tocBody, "chapter")
+                                ch.set("src", ProcessFileName(item[0]) + ".snbc")
+                                ch.text = tocitem.title + _(" (Preface)")
+                            outputFiles[item[0]].append((item[1], tocitem.title))
+                else:
+                    if tocitem.href in outputFiles:
+                        outputFiles[tocitem.href].append(("", tocitem.title))
+                    else:
+                        outputFiles[tocitem.href] = []
+                        outputFiles[tocitem.href].append(("", tocitem.title))
+                ch = etree.SubElement(tocBody, "chapter")
+                ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
+                ch.text = tocitem.title
+
+            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
+
+            with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
+                f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
+
+            # Output Files
+            oldTree = None
+            mergeLast = False
+            lastName = None
+            for item in s:
+                from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
+                if m.hrefs[item.href].media_type in OEB_DOCS:
+                    if item.href not in outputFiles:
+                        log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
+                        mergeLast = True
+                    else:
+                        if oldTree is not None and mergeLast:
+                            log.debug('Output the modified chapter again: %s' % lastName)
+                            with open(os.path.join(snbcDir, lastName), 'wb') as f:
+                                f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
+                            mergeLast = False
+
+                    log.debug('Converting %s to snbc...' % item.href)
+                    snbwriter = SNBMLizer(log)
+                    snbcTrees = None
+                    if not mergeLast:
+                        snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
+                        for subName in snbcTrees:
+                            postfix = ''
+                            if subName != '':
+                                postfix = '_' + subName
+                            lastName = ProcessFileName(item.href + postfix + ".snbc")
+                            oldTree = snbcTrees[subName]
+                            with open(os.path.join(snbcDir, lastName), 'wb') as f:
+                                f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
+                    else:
+                        log.debug('Merge %s with last TOC item...' % item.href)
+                        snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)
+
+            # Output the last one if needed
+            log.debug('Output the last modified chapter again: %s' % lastName)
+            if oldTree is not None and mergeLast:
+                with open(os.path.join(snbcDir, lastName), 'wb') as f:
+                    f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
+                mergeLast = False
+
+            for item in m:
+                if m.hrefs[item.href].media_type in OEB_IMAGES:
+                    log.debug('Converting image: %s ...' % item.href)
+                    content = m.hrefs[item.href].data
+                    # Convert & Resize image
+                    self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))
+
+            # Package as SNB File
+            snbFile = SNBFile()
+            snbFile.FromDir(tdir)
+            snbFile.Output(output_path)
+
+    def HandleImage(self, imageData, imagePath):
+        from calibre.utils.img import image_from_data, resize_image, image_to_data
+        img = image_from_data(imageData)
+        x, y = img.width(), img.height()
+        if self.opts:
+            if self.opts.snb_full_screen:
+                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
+            else:
+                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
+        else:
+            SCREEN_X = 540
+            SCREEN_Y = 700
+        # Handle big image only
+        if x > SCREEN_X or y > SCREEN_Y:
+            xScale = float(x) / SCREEN_X
+            yScale = float(y) / SCREEN_Y
+            scale = max(xScale, yScale)
+            # TODO : intelligent image rotation
+            #     img = img.rotate(90)
+            #     x,y = y,x
+            img = resize_image(img, x // scale, y // scale)
+        with lopen(imagePath, 'wb') as f:
+            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
+
+
+if __name__ == '__main__':
+    from calibre.ebooks.oeb.reader import OEBReader
+    from calibre.ebooks.oeb.base import OEBBook
+    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
+    from calibre.customize.profiles import HanlinV3Output
+
+    class OptionValues(object):
+        pass
+
+    opts = OptionValues()
+    opts.output_profile = HanlinV3Output(None)
+
+    html_preprocessor = HTMLPreProcessor(None, None, opts)
+    from calibre.utils.logging import default_log
+    oeb = OEBBook(default_log, html_preprocessor)
+    reader = OEBReader
+    reader()(oeb, '/tmp/bbb/processed/')
+    SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from io import BytesIO
+
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class TCRInput(InputFormatPlugin):
+
+    name        = 'TCR Input'
+    author      = 'John Schember'
+    description = 'Convert TCR files to HTML'
+    file_types  = {'tcr'}
+    commit_name = 'tcr_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.compression.tcr import decompress
+
+        log.info('Decompressing text...')
+        raw_txt = decompress(stream)
+
+        log.info('Converting text to OEB...')
+        stream = BytesIO(raw_txt)
+
+        from calibre.customize.ui import plugin_for_input_format
+
+        txt_plugin = plugin_for_input_format('txt')
+        for opt in txt_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(options, opt.option.name, opt.recommended_value)
+
+        stream.seek(0)
+        return txt_plugin.convert(stream, options,
+                'txt', log, accelerators)
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+
+
+class TCROutput(OutputFormatPlugin):
+
+    name = 'TCR Output'
+    author = 'John Schember'
+    file_type = 'tcr'
+    commit_name = 'tcr_output'
+
+    options = {
+        OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is utf-8.'))}
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.txt.txtml import TXTMLizer
+        from calibre.ebooks.compression.tcr import compress
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        setattr(opts, 'flush_paras', False)
+        setattr(opts, 'max_line_length', 0)
+        setattr(opts, 'force_max_line_length', False)
+        setattr(opts, 'indent_paras', False)
+
+        writer = TXTMLizer(log)
+        txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
+
+        log.info('Compressing text...')
+        txt = compress(txt)
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(txt)
+
+        if close:
+            out_stream.close()
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre import _ent_pat, walk, xml_entity_to_unicode
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import getcwd
+
+# Maps the name of each supported markdown extension to a translated, human
+# readable description. Used for the plugin's ui_data and to build the help
+# text of the markdown_extensions option.
+MD_EXTENSIONS = {
+    'abbr': _('Abbreviations'),
+    'admonition': _('Support admonitions'),
+    'attr_list': _('Add attribute to HTML tags'),
+    'codehilite': _('Add code highlighting via Pygments'),
+    'def_list': _('Definition lists'),
+    'extra': _('Enables various common extensions'),
+    'fenced_code': _('Alternative code block syntax'),
+    'footnotes': _('Footnotes'),
+    'legacy_attrs': _('Use legacy element attributes'),
+    'legacy_em': _('Use legacy underscore handling for connected words'),
+    'meta': _('Metadata in the document'),
+    'nl2br': _('Treat newlines as hard breaks'),
+    'sane_lists': _('Do not allow mixing list types'),
+    'smarty': _('Use markdown\'s internal smartypants parser'),
+    'tables': _('Support tables'),
+    'toc': _('Generate a table of contents'),
+    'wikilinks': _('Wiki style links'),
+}
+
+
+class TXTInput(InputFormatPlugin):
+    # Input plugin for plain text style formats. The text is turned into
+    # HTML (see convert()) and then run through the HTML input plugin.
+
+    name        = 'TXT Input'
+    author      = 'John Schember'
+    description = 'Convert TXT files to HTML'
+    file_types  = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
+    commit_name = 'txt_input'
+    # Translated descriptions of the available choices. The keys of
+    # 'paragraph_types' and 'formatting_types' are also the valid values of
+    # the paragraph_type and formatting_type options declared below.
+    ui_data = {
+        'md_extensions': MD_EXTENSIONS,
+        'paragraph_types': {
+            'auto': _('Try to auto detect paragraph type'),
+            'block': _('Treat a blank line as a paragraph break'),
+            'single': _('Assume every line is a paragraph'),
+            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
+            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
+            'off': _('Don\'t modify the paragraph structure'),
+        },
+        'formatting_types': {
+            'auto': _('Automatically decide which formatting processor to use'),
+            'plain': _('No formatting'),
+            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
+            'textile': _('Use the TexTile markup language'),
+            'markdown': _('Use the Markdown markup language')
+        },
+    }
+
+    # The help texts are assembled from ui_data so that the descriptions of
+    # the choices are defined in one place only.
+    options = {
+        OptionRecommendation(name='formatting_type', recommended_value='auto',
+            choices=list(ui_data['formatting_types']),
+            help=_('Formatting used within the document.\n'
+                   '* auto: {auto}\n'
+                   '* plain: {plain}\n'
+                   '* heuristic: {heuristic}\n'
+                   '* textile: {textile}\n'
+                   '* markdown: {markdown}\n'
+                   'To learn more about markdown see {url}').format(
+                       url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
+        ),
+        OptionRecommendation(name='paragraph_type', recommended_value='auto',
+            choices=list(ui_data['paragraph_types']),
+            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
+                   'Choices are:\n'
+                   '* auto: {auto}\n'
+                   '* block: {block}\n'
+                   '* single: {single}\n'
+                   '* print:  {print}\n'
+                   '* unformatted: {unformatted}\n'
+                   '* off: {off}').format(**ui_data['paragraph_types'])
+        ),
+        OptionRecommendation(name='preserve_spaces', recommended_value=False,
+            help=_('Normally extra spaces are condensed into a single space. '
+                'With this option all spaces will be displayed.')),
+        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
+            help=_('Normally extra space at the beginning of lines is retained. '
+                   'With this option they will be removed.')),
+        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
+            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
+                   'of the standard markdown format. The extensions enabled by default: %default.\n'
+                   'To learn more about markdown extensions, see {}\n'
+                   'This should be a comma separated list of extensions to enable:\n'
+                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
+    }
+
+    def shift_file(self, fname, data):
+        name, ext = os.path.splitext(fname)
+        candidate = os.path.join(self.output_dir, fname)
+        c = 0
+        while os.path.exists(candidate):
+            c += 1
+            candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
+        ans = candidate
+        with open(ans, 'wb') as f:
+            f.write(data)
+        return f.name
+
+    def fix_resources(self, html, base_dir):
+        from html5_parser import parse
+        root = parse(html)
+        changed = False
+        for img in root.xpath('//img[@src]'):
+            src = img.get('src')
+            prefix = src.split(':', 1)[0].lower()
+            if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
+                src = os.path.join(base_dir, src)
+                if os.access(src, os.R_OK):
+                    with open(src, 'rb') as f:
+                        data = f.read()
+                    f = self.shift_file(os.path.basename(src), data)
+                    changed = True
+                    img.set('src', os.path.basename(f))
+        if changed:
+            from lxml import etree
+            html = etree.tostring(root, encoding='unicode')
+        return html
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
+        from calibre.ebooks.chardet import detect
+        from calibre.utils.zipfile import ZipFile
+        from calibre.ebooks.txt.processor import (convert_basic,
+                convert_markdown_with_metadata, separate_paragraphs_single_line,
+                separate_paragraphs_print_formatted, preserve_spaces,
+                detect_paragraph_type, detect_formatting_type,
+                normalize_line_endings, convert_textile, remove_indents,
+                block_to_single_line, separate_hard_scene_breaks)
+
+        self.log = log
+        txt = b''
+        log.debug('Reading text from file...')
+        length = 0
+        base_dir = self.output_dir = getcwd()
+
+        # Extract content from zip archive.
+        if file_ext == 'txtz':
+            zf = ZipFile(stream)
+            zf.extractall('.')
+
+            for x in walk('.'):
+                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
+                    with open(x, 'rb') as tf:
+                        txt += tf.read() + b'\n\n'
+        else:
+            if getattr(stream, 'name', None):
+                base_dir = os.path.dirname(stream.name)
+            txt = stream.read()
+            if file_ext in {'md', 'textile', 'markdown'}:
+                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
+                log.info('File extension indicates particular formatting. '
+                        'Forcing formatting type to: %s'%options.formatting_type)
+                options.paragraph_type = 'off'
+
+        # Get the encoding of the document.
+        if options.input_encoding:
+            ienc = options.input_encoding
+            log.debug('Using user specified input encoding of %s' % ienc)
+        else:
+            det_encoding = detect(txt[:4096])
+            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
+            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
+                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+                # Microsoft Word exports to HTML with encoding incorrectly set to
+                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+                det_encoding = 'gbk'
+            ienc = det_encoding
+            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
+        if not ienc:
+            ienc = 'utf-8'
+            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
+        # Remove BOM from start of txt as its presence can confuse markdown
+        import codecs
+        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+            if txt.startswith(bom):
+                txt = txt[len(bom):]
+                break
+        txt = txt.decode(ienc, 'replace')
+
+        # Replace entities
+        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+
+        # Normalize line endings
+        txt = normalize_line_endings(txt)
+
+        # Determine the paragraph type of the document.
+        if options.paragraph_type == 'auto':
+            options.paragraph_type = detect_paragraph_type(txt)
+            if options.paragraph_type == 'unknown':
+                log.debug('Could not reliably determine paragraph type using block')
+                options.paragraph_type = 'block'
+            else:
+                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+        # Detect formatting
+        if options.formatting_type == 'auto':
+            options.formatting_type = detect_formatting_type(txt)
+            log.debug('Auto detected formatting as %s' % options.formatting_type)
+
+        if options.formatting_type == 'heuristic':
+            setattr(options, 'enable_heuristics', True)
+            setattr(options, 'unwrap_lines', False)
+            setattr(options, 'smarten_punctuation', True)
+
+        # Reformat paragraphs to block formatting based on the detected type.
+        # We don't check for block because the processor assumes block.
+        # single and print at transformed to block for processing.
+        if options.paragraph_type == 'single':
+            txt = separate_paragraphs_single_line(txt)
+        elif options.paragraph_type == 'print':
+            txt = separate_hard_scene_breaks(txt)
+            txt = separate_paragraphs_print_formatted(txt)
+            txt = block_to_single_line(txt)
+        elif options.paragraph_type == 'unformatted':
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            # unwrap lines based on punctuation
+            docanalysis = DocAnalysis('txt', txt)
+            length = docanalysis.line_length(.5)
+            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
+            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+            txt = separate_paragraphs_single_line(txt)
+        elif options.paragraph_type == 'block':
+            txt = separate_hard_scene_breaks(txt)
+            txt = block_to_single_line(txt)
+
+        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
+            docanalysis = DocAnalysis('txt', txt)
+            if not length:
+                length = docanalysis.line_length(.5)
+            dehyphenator = Dehyphenator(options.verbose, log=self.log)
+            txt = dehyphenator(txt,'txt', length)
+
+        # User requested transformation on the text.
+        if options.txt_in_remove_indents:
+            txt = remove_indents(txt)
+
+        # Preserve spaces will replace multiple spaces to a space
+        # followed by the &nbsp; entity.
+        if options.preserve_spaces:
+            txt = preserve_spaces(txt)
+
+        # Process the text using the appropriate text processor.
+        self.shifted_files = []
+        try:
+            html = ''
+            input_mi = None
+            if options.formatting_type == 'markdown':
+                log.debug('Running text through markdown conversion...')
+                try:
+                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
+                except RuntimeError:
+                    raise ValueError('This txt file has malformed markup, it cannot be'
+                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
+                html = self.fix_resources(html, base_dir)
+            elif options.formatting_type == 'textile':
+                log.debug('Running text through textile conversion...')
+                html = convert_textile(txt)
+                html = self.fix_resources(html, base_dir)
+            else:
+                log.debug('Running text through basic conversion...')
+                flow_size = getattr(options, 'flow_size', 0)
+                html = convert_basic(txt, epub_split_size_kb=flow_size)
+
+            # Run the HTMLized text through the html processing plugin.
+            from calibre.customize.ui import plugin_for_input_format
+            html_input = plugin_for_input_format('html')
+            for opt in html_input.options:
+                setattr(options, opt.option.name, opt.recommended_value)
+            options.input_encoding = 'utf-8'
+            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
+            odi = options.debug_pipeline
+            options.debug_pipeline = None
+            # Generate oeb from html conversion.
+            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
+            options.debug_pipeline = odi
+        finally:
+            for x in self.shifted_files:
+                os.remove(x)
+
+        # Set metadata from file.
+        if input_mi is None:
+            from calibre.customize.ui import get_file_type_metadata
+            input_mi = get_file_type_metadata(stream, file_ext)
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
+        self.html_postprocess_title = input_mi.title
+
+        return oeb
+
+    def postprocess_book(self, oeb, opts, log):
+        for item in oeb.spine:
+            if hasattr(item.data, 'xpath'):
+                for title in item.data.xpath('//*[local-name()="title"]'):
+                    if title.text == _('Unknown'):
+                        title.text = self.html_postprocess_title
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+import shutil
+
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ptempfile import TemporaryDirectory, TemporaryFile
+
+# Values accepted by the 'newline' option of the TXT output plugin (used as
+# its ``choices`` and exposed to the UI through ``ui_data``).
+NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
+
+
+class TXTOutput(OutputFormatPlugin):
+
+    name = 'TXT Output'
+    author = 'John Schember'
+    file_type = 'txt'
+    commit_name = 'txt_output'
+    ui_data = {
+            'newline_types': NEWLINE_TYPES,
+            'formatting_types': {
+                'plain': _('Plain text'),
+                'markdown': _('Markdown formatted text'),
+                'textile': _('TexTile formatted text')
+            },
+    }
+
+    options = {
+        OptionRecommendation(name='newline', recommended_value='system',
+            level=OptionRecommendation.LOW,
+            short_switch='n', choices=NEWLINE_TYPES,
+            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
+                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
+                'For macOS use \'unix\'. \'system\' will default to the newline '
+                'type used by this OS.') % sorted(NEWLINE_TYPES)),
+        OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is utf-8.')),
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.')),
+        OptionRecommendation(name='max_line_length',
+            recommended_value=0, level=OptionRecommendation.LOW,
+            help=_('The maximum number of characters per line. This splits on '
+            'the first space before the specified value. If no space is found '
+            'the line will be broken at the space after and will exceed the '
+            'specified value. Also, there is a minimum of 25 characters. '
+            'Use 0 to disable line splitting.')),
+        OptionRecommendation(name='force_max_line_length',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Force splitting on the max-line-length value when no space '
+            'is present. Also allows max-line-length to be below the minimum')),
+        OptionRecommendation(name='txt_output_formatting',
+             recommended_value='plain',
+             choices=list(ui_data['formatting_types']),
+             help=_('Formatting used within the document.\n'
+                    '* plain: {plain}\n'
+                    '* markdown: {markdown}\n'
+                    '* textile: {textile}').format(**ui_data['formatting_types'])),
+        OptionRecommendation(name='keep_links',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not remove links within the document. This is only '
+            'useful when paired with a txt-output-formatting option that '
+            'is not none because links are always removed with plain text output.')),
+        OptionRecommendation(name='keep_image_references',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not remove image references within the document. This is only '
+            'useful when paired with a txt-output-formatting option that '
+            'is not none because links are always removed with plain text output.')),
+        OptionRecommendation(name='keep_color',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not remove font color from output. This is only useful when '
+                   'txt-output-formatting is set to textile. Textile is the only '
+                   'formatting that supports setting font color. If this option is '
+                   'not specified font color will not be set and default to the '
+                   'color displayed by the reader (generally this is black).')),
+     }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.txt.txtml import TXTMLizer
+        from calibre.utils.cleantext import clean_ascii_chars
+        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines
+
+        if opts.txt_output_formatting.lower() == 'markdown':
+            from calibre.ebooks.txt.markdownml import MarkdownMLizer
+            self.writer = MarkdownMLizer(log)
+        elif opts.txt_output_formatting.lower() == 'textile':
+            from calibre.ebooks.txt.textileml import TextileMLizer
+            self.writer = TextileMLizer(log)
+        else:
+            self.writer = TXTMLizer(log)
+
+        txt = self.writer.extract_content(oeb_book, opts)
+        txt = clean_ascii_chars(txt)
+
+        log.debug('\tReplacing newlines with selected type...')
+        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = open(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
+
+        if close:
+            out_stream.close()
+
+
+class TXTZOutput(TXTOutput):
+
+    name = 'TXTZ Output'
+    author = 'John Schember'
+    file_type = 'txtz'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.oeb.base import OEB_IMAGES
+        from calibre.utils.zipfile import ZipFile
+        from lxml import etree
+
+        with TemporaryDirectory('_txtz_output') as tdir:
+            # TXT
+            txt_name = 'index.txt'
+            if opts.txt_output_formatting.lower() == 'textile':
+                txt_name = 'index.text'
+            with TemporaryFile(txt_name) as tf:
+                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
+                shutil.copy(tf, os.path.join(tdir, txt_name))
+
+            # Images
+            for item in oeb_book.manifest:
+                if item.media_type in OEB_IMAGES:
+                    if hasattr(self.writer, 'images'):
+                        path = os.path.join(tdir, 'images')
+                        if item.href in self.writer.images:
+                            href = self.writer.images[item.href]
+                        else:
+                            continue
+                    else:
+                        path = os.path.join(tdir, os.path.dirname(item.href))
+                        href = os.path.basename(item.href)
+                    if not os.path.exists(path):
+                        os.makedirs(path)
+                    with open(os.path.join(path, href), 'wb') as imgf:
+                        imgf.write(item.data)
+
+            # Metadata
+            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
+                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))
+
+            txtz = ZipFile(output_path, 'w')
+            txtz.add_dir(tdir)
@@ -0,0 +1,646 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import functools, re, json
+from math import ceil
+
+from calibre import entity_to_unicode, as_unicode
+from polyglot.builtins import unicode_type, range
+
+# Matches an XML declaration (<?xml ... ?>) at the start of the text,
+# optionally preceded by whitespace.
+XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
+# Namespace URIs for SVG and XLink.
+SVG_NS       = 'http://www.w3.org/2000/svg'
+XLINK_NS     = 'http://www.w3.org/1999/xlink'
+
+# entity_to_unicode with result_exceptions for the five XML special
+# characters.
+# NOTE(review): presumably this keeps entities that decode to these
+# characters in escaped form - confirm against entity_to_unicode.
+convert_entities = functools.partial(entity_to_unicode,
+        result_exceptions={
+            '<' : '&lt;',
+            '>' : '&gt;',
+            "'" : '&apos;',
+            '"' : '&quot;',
+            '&' : '&amp;',
+        })
+# Matches a whole <span ...>...</span> run, non-greedy, case-insensitive and
+# across line breaks.
+_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+
+# Maps ligature characters to the letter sequences they stand for. The
+# commented out entries are disabled.
+LIGATURES = {
+#        '\u00c6': 'AE',
+#        '\u00e6': 'ae',
+#        '\u0152': 'OE',
+#        '\u0153': 'oe',
+#        '\u0132': 'IJ',
+#        '\u0133': 'ij',
+#        '\u1D6B': 'ue',
+        '\uFB00': 'ff',
+        '\uFB01': 'fi',
+        '\uFB02': 'fl',
+        '\uFB03': 'ffi',
+        '\uFB04': 'ffl',
+        '\uFB05': 'ft',
+        '\uFB06': 'st',
+        }
+
+# Alternation of all the keys of LIGATURES, i.e. matches any single enabled
+# ligature character.
+_ligpat = re.compile('|'.join(LIGATURES))
+
+
+def sanitize_head(match):
+    x = match.group(1)
+    x = _span_pat.sub('', x)
+    return '<head>\n%s\n</head>' % x
+
+
+def chap_head(match):
+    chap = match.group('chap')
+    title = match.group('title')
+    if not title:
+        return '<h1>'+chap+'</h1><br/>\n'
+    else:
+        return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
+
+
+def wrap_lines(match):
+    ital = match.group('ital')
+    if not ital:
+        return ' '
+    else:
+        return ital+' '
+
+
+def smarten_punctuation(html, log=None):
+    from calibre.utils.smartypants import smartyPants
+    from calibre.ebooks.chardet import substitute_entites
+    from calibre.ebooks.conversion.utils import HeuristicProcessor
+    preprocessor = HeuristicProcessor(log=log)
+    from uuid import uuid4
+    start = 'calibre-smartypants-'+unicode_type(uuid4())
+    stop = 'calibre-smartypants-'+unicode_type(uuid4())
+    html = html.replace('<!--', start)
+    html = html.replace('-->', stop)
+    html = preprocessor.fix_nbsp_indents(html)
+    html = smartyPants(html)
+    html = html.replace(start, '<!--')
+    html = html.replace(stop, '-->')
+    return substitute_entites(html)
+
+
+class DocAnalysis(object):
+    '''
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
+    '''
+
+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n')
+        self.lines = linere.findall(raw)
+
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
+        percentage is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to largest and does not include duplicates. 0.5 is the
+        median value.
+        '''
+        lengths = []
+        for line in self.lines:
+            if len(line) > 0:
+                lengths.append(len(line))
+
+        if not lengths:
+            return 0
+
+        lengths = list(set(lengths))
+        total = sum(lengths)
+        avg = total / len(lengths)
+        max_line = ceil(avg * 2)
+
+        lengths = sorted(lengths)
+        for i in range(len(lengths) - 1, -1, -1):
+            if lengths[i] > max_line:
+                del lengths[i]
+
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
+
+        index = int(len(lengths) * percent) - 1
+
+        return lengths[index]
+
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks.  Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength=20  # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900  # Discard larger than this to stay in range
+        buckets=20  # Each line is divided into a bucket based on length
+
+        # print("there are "+unicode_type(len(lines))+" lines")
+        # max = 0
+        # for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
+        # print("max line found is "+unicode_type(max))
+        # Build the line length histogram
+        hRaw = [0 for i in range(0,buckets)]
+        for line in self.lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                l = int(l // 100)
+                # print("adding "+unicode_type(l))
+                hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(self.lines)
+        if totalLines > 0:
+            h = [float(count)/totalLines for count in hRaw]
+        else:
+            h = []
+        # print("\nhRaw histogram lengths are: "+unicode_type(hRaw))
+        # print("              percents are: "+unicode_type(h)+"\n")
+
+        # Find the biggest bucket
+        maxValue = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+
+        if maxValue < percent:
+            # print("Line lengths are too variable. Not unwrapping.")
+            return False
+        else:
+            # print(unicode_type(maxValue)+" of the lines were in one bucket")
+            return True
+
+
+class Dehyphenator(object):
+    '''
+    Analyzes words to determine whether hyphens should be retained/removed.  Uses the document
+    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
+    scientific words. The primary disadvantage is that words appearing only once in the document
+    retain hyphens.
+    '''
+
+    def __init__(self, verbose=0, log=None):
+        self.log = log
+        self.verbose = verbose
+        # Add common suffixes to the regex below to increase the likelihood of a match -
+        # don't add suffixes which are also complete words, such as 'able' or 'sex'
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = (
+            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
+            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
+            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of hyphenation
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
+
+    def dehyphenate(self, match):
+        firsthalf = match.group('firstpart')
+        secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
+        hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
+        dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
+            lookupword = self.removeprefix.sub('', lookupword)
+        if self.verbose > 2:
+            self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
+        try:
+            searchresult = self.html.find(lookupword.lower())
+        except:
+            return hyphenated
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                if self.verbose > 2:
+                    self.log("    Cleanup:returned dehyphenated word: " + dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                if self.verbose > 2:
+                    self.log("        Cleanup:returned hyphenated word: " + hyphenated)
+                return hyphenated
+            else:
+                if self.verbose > 2:
+                    self.log("            Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
+                return firsthalf+'\u2014'+wraptags+secondhalf
+
+        else:
+            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + hyphenated)
+                return hyphenated
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + hyphenated)
+                return hyphenated
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                if self.verbose > 2:
+                    self.log("     returned dehyphenated word: " + dehyphenated)
+                return dehyphenated
+            else:
+                if self.verbose > 2:
+                    self.log("          returned hyphenated word: " + hyphenated)
+                return hyphenated
+
+    def __call__(self, html, format, length=1):
+        self.html = html
+        self.format = format
+        if format == 'html':
+            intextmatch = re.compile((
+                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
+                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
+                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
+        elif format == 'pdf':
+            intextmatch = re.compile((
+                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
+                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
+        elif format == 'txt':
+            intextmatch = re.compile(
+                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
+        elif format == 'individual_words':
+            intextmatch = re.compile(
+                r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(
+                r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
+                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(
+                r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+
+        html = intextmatch.sub(self.dehyphenate, html)
+        return html
+
+
+class CSSPreProcessor(object):
+
+    # Remove some of the broken CSS Microsoft products
+    # create
+    MS_PAT     = re.compile(r'''
+        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
+        (%s).+?                 # The invalid selectors
+        (?P<end>$|;|\})         # The end of the declaration
+        '''%'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+    def ms_sub(self, match):
+        end = match.group('end')
+        try:
+            start = match.group('start')
+        except:
+            start = ''
+        if end == ';':
+            end = ''
+        return start + end
+
+    def __call__(self, data, add_namespace=False):
+        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
+        data = self.MS_PAT.sub(self.ms_sub, data)
+        if not add_namespace:
+            return data
+
+        # Remove comments as the following namespace logic will break if there
+        # are commented lines before the first @import or @charset rule. Since
+        # the conversion will remove all stylesheets anyway, we don't lose
+        # anything
+        data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
+
+        ans, namespaced = [], False
+        for line in data.splitlines():
+            ll = line.lstrip()
+            if not (namespaced or ll.startswith('@import') or not ll or
+                        ll.startswith('@charset')):
+                ans.append(XHTML_CSS_NAMESPACE.strip())
+                namespaced = True
+            ans.append(line)
+
+        return '\n'.join(ans)
+
+
+def accent_regex(accent_maps, letter_before=False):
+    accent_cat = set()
+    letters = set()
+
+    for accent in tuple(accent_maps):
+        accent_cat.add(accent)
+        k, v = accent_maps[accent].split(':', 1)
+        if len(k) != len(v):
+            raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
+        accent_maps[accent] = lmap = dict(zip(k, v))
+        letters |= set(lmap)
+
+    if letter_before:
+        args = ''.join(letters), ''.join(accent_cat)
+        accent_group, letter_group = 2, 1
+    else:
+        args = ''.join(accent_cat), ''.join(letters)
+        accent_group, letter_group = 1, 2
+
+    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
+
+    def sub(m):
+        lmap = accent_maps[m.group(accent_group)]
+        return lmap.get(m.group(letter_group)) or m.group()
+
+    return pat, sub
+
+
+def html_preprocess_rules():
+    ans = getattr(html_preprocess_rules, 'ans', None)
+    if ans is None:
+        ans = html_preprocess_rules.ans = [
+        # Remove huge block of contiguous spaces as they slow down
+        # the following regexes pretty badly
+        (re.compile(r'\s{10000,}'), ''),
+        # Some idiotic HTML generators (Frontpage I'm looking at you)
+        # Put all sorts of crap into <head>. This messes up lxml
+        (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
+        sanitize_head),
+        # Convert all entities, since lxml doesn't handle them well
+        (re.compile(r'&(\S+?);'), convert_entities),
+        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
+        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
+    ]
+    return ans
+
+
+def pdftohtml_rules():
+    '''
+    Return the list of ``(pattern, replacement)`` rules used for HTML that
+    was created by calibre's pdftohtml. The list is built on first use and
+    cached on the function object, so every call returns the same list.
+    The rules are meant to be applied in list order.
+    '''
+    ans = getattr(pdftohtml_rules, 'ans', None)
+    if ans is None:
+        ans = pdftohtml_rules.ans = [
+        # Merge a standalone accent character followed by a letter into the
+        # corresponding accented letter
+        accent_regex({
+            '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
+            '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
+            '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
+            'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
+            '¸': 'cC:çÇ',
+            '˛': 'aAeE:ąĄęĘ',
+            '˙': 'zZ:żŻ',
+            'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
+            '°': 'uU:ůŮ',
+        }),
+
+        # Same for a grave accent that comes after its letter
+        accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
+
+        # If pdf printed from a browser then the header/footer has a reliable pattern
+        (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+        # Center separator lines
+        (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
+
+        # Remove <hr> tags
+        (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
+
+        # Remove gray background
+        (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
+
+        # Convert line breaks to paragraphs
+        (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
+        (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
+        (re.compile(r'\s*</body>'), '</p>\n</body>'),
+
+        # Clean up spaces
+        (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
+        # Add space before and after italics
+        (re.compile(r'(?<!“)<i>'), ' <i>'),
+        (re.compile(r'</i>(?=\w)'), '</i> '),
+    ]
+    return ans
+
+
+def book_designer_rules():
+    ans = getattr(book_designer_rules, 'ans', None)
+    if ans is None:
+        ans = book_designer_rules.ans = [
+        # HR
+        (re.compile('<hr>', re.IGNORECASE),
+        lambda match : '<span style="page-break-after:always"> </span>'),
+        # Create header tags
+        (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
+        lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+        (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
+        lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+        lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
+        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+        lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
+    ]
+    return None
+
+
+class HTMLPreProcessor(object):
+
+    def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
+        self.log = log
+        self.extra_opts = extra_opts
+        self.regex_wizard_callback = regex_wizard_callback
+        self.current_href = None
+
+    def is_baen(self, src):
+        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
+                          re.IGNORECASE).search(src) is not None
+
+    def is_book_designer(self, raw):
+        return re.search('<H2[^><]*id=BookTitle', raw) is not None
+
+    def is_pdftohtml(self, src):
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+
+    def __call__(self, html, remove_special_chars=None,
+            get_preprocess_html=False):
+        if remove_special_chars is not None:
+            html = remove_special_chars.sub('', html)
+        html = html.replace('\0', '')
+        is_pdftohtml = self.is_pdftohtml(html)
+        if self.is_baen(html):
+            rules = []
+        elif self.is_book_designer(html):
+            rules = book_designer_rules()
+        elif is_pdftohtml:
+            rules = pdftohtml_rules()
+        else:
+            rules = []
+
+        start_rules = []
+
+        if not getattr(self.extra_opts, 'keep_ligatures', False):
+            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+
+        user_sr_rules = {}
+        # Function for processing search and replace
+
+        def do_search_replace(search_pattern, replace_txt):
+            from calibre.ebooks.conversion.search_replace import compile_regular_expression
+            try:
+                search_re = compile_regular_expression(search_pattern)
+                if not replace_txt:
+                    replace_txt = ''
+                rules.insert(0, (search_re, replace_txt))
+                user_sr_rules[(search_re, replace_txt)] = search_pattern
+            except Exception as e:
+                self.log.error('Failed to parse %r regexp because %s' %
+                        (search, as_unicode(e)))
+
+        # search / replace using the sr?_search / sr?_replace options
+        for i in range(1, 4):
+            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
+            search_pattern = getattr(self.extra_opts, search, '')
+            replace_txt = getattr(self.extra_opts, replace, '')
+            if search_pattern:
+                do_search_replace(search_pattern, replace_txt)
+
+        # multi-search / replace using the search_replace option
+        search_replace = getattr(self.extra_opts, 'search_replace', None)
+        if search_replace:
+            search_replace = json.loads(search_replace)
+            for search_pattern, replace_txt in reversed(search_replace):
+                do_search_replace(search_pattern, replace_txt)
+
+        end_rules = []
+        # delete soft hyphens - moved here so it's executed after header/footer removal
+        if is_pdftohtml:
+            # unwrap/delete soft hyphens
+            end_rules.append((re.compile(
+                r'[](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens with formatting
+            end_rules.append((re.compile(
+                r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
+
+        length = -1
+        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
+            docanalysis = DocAnalysis('pdf', html)
+            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
+            if length:
+                # print("The pdf line length returned is " + unicode_type(length))
+                # unwrap em/en dashes
+                end_rules.append((re.compile(
+                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
+                end_rules.append(
+                    # Un wrap using punctuation
+                    (re.compile((
+                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
+                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
+                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
+                )
+
+        for rule in html_preprocess_rules() + start_rules:
+            html = rule[0].sub(rule[1], html)
+
+        if self.regex_wizard_callback is not None:
+            self.regex_wizard_callback(self.current_href, html)
+
+        if get_preprocess_html:
+            return html
+
+        def dump(raw, where):
+            import os
+            dp = getattr(self.extra_opts, 'debug_pipeline', None)
+            if dp and os.path.exists(dp):
+                odir = os.path.join(dp, 'input')
+                if os.path.exists(odir):
+                    odir = os.path.join(odir, where)
+                    if not os.path.exists(odir):
+                        os.makedirs(odir)
+                    name, i = None, 0
+                    while not name or os.path.exists(os.path.join(odir, name)):
+                        i += 1
+                        name = '%04d.html'%i
+                    with open(os.path.join(odir, name), 'wb') as f:
+                        f.write(raw.encode('utf-8'))
+
+        # dump(html, 'pre-preprocess')
+
+        for rule in rules + end_rules:
+            try:
+                html = rule[0].sub(rule[1], html)
+            except Exception as e:
+                if rule in user_sr_rules:
+                    self.log.error(
+                        'User supplied search & replace rule: %s -> %s '
+                        'failed with error: %s, ignoring.'%(
+                            user_sr_rules[rule], rule[1], e))
+                else:
+                    raise
+
+        if is_pdftohtml and length > -1:
+            # Dehyphenate
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+            html = dehyphenator(html,'html', length)
+
+        if is_pdftohtml:
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
+            totalwords = 0
+            if pdf_markup.get_word_count(html) > 7000:
+                html = pdf_markup.markup_chapters(html, totalwords, True)
+
+        # dump(html, 'post-preprocess')
+
+        # Handle broken XHTML w/ SVG (ugh)
+        if 'svg:' in html and SVG_NS not in html:
+            html = html.replace(
+                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
+        if 'xlink:' in html and XLINK_NS not in html:
+            html = html.replace(
+                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
+
+        html = XMLDECL_RE.sub('', html)
+
+        if getattr(self.extra_opts, 'asciiize', False):
+            from calibre.utils.localization import get_udc
+            from calibre.utils.mreplace import MReplace
+            unihandecoder = get_udc()
+            mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
+            html = mr.mreplace(html)
+            html = unihandecoder.decode(html)
+
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)
+
+        if is_pdftohtml:
+            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
+
+        if getattr(self.extra_opts, 'smarten_punctuation', False):
+            html = smarten_punctuation(html, self.log)
+
+        try:
+            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        except AttributeError:
+            unsupported_unicode_chars = ''
+        if unsupported_unicode_chars:
+            from calibre.utils.localization import get_udc
+            unihandecoder = get_udc()
+            for char in unsupported_unicode_chars:
+                asciichar = unihandecoder.decode(char)
+                html = html.replace(char, asciichar)
+
+        return html
@@ -0,0 +1,881 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+from math import ceil
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
+from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
+from polyglot.builtins import unicode_type
+
+
+class HeuristicProcessor(object):
+
+    def __init__(self, extra_opts=None, log=None):
+        self.log = default_log if log is None else log
+        self.html_preprocess_sections = 0
+        self.found_indents = 0
+        self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+        self.blanks_deleted = False
+        self.blanks_between_paragraphs = False
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
+        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
+        self.line_open = (
+            r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
+            r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
+        self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
+        self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
+        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
+        self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
+        self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
+
+    def is_pdftohtml(self, src):
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+
+    def is_abbyy(self, src):
+        return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
+
+    def chapter_head(self, match):
+        from calibre.utils.html2text import html2text
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
+                    " chapters. - " + unicode_type(chap))
+            return '<h2>'+chap+'</h2>\n'
+        else:
+            delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
+            delete_quotes = re.compile('\'\"')
+            txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
+            txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
+                    " chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
+            return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
+
+    def chapter_break(self, match):
+        chap = match.group('section')
+        styles = match.group('styles')
+        self.html_preprocess_sections = self.html_preprocess_sections + 1
+        self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
+                " section markers based on punctuation. - " + unicode_type(chap))
+        return '<'+styles+' style="page-break-before:always">'+chap
+
+    def analyze_title_matches(self, match):
+        # chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
+    def insert_indent(self, match):
+        pstyle = match.group('formatting')
+        tag = match.group('tagtype')
+        span = match.group('span')
+        self.found_indents = self.found_indents + 1
+        if pstyle:
+            if pstyle.lower().find('style') != -1:
+                pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle)
+            else:
+                pstyle = pstyle+' style="text-indent:3%"'
+            if not span:
+                return '<'+tag+' '+pstyle+'>'
+            else:
+                return '<'+tag+' '+pstyle+'>'+span
+        else:
+            if not span:
+                return '<'+tag+' style="text-indent:3%">'
+            else:
+                return '<'+tag+' style="text-indent:3%">'+span
+
+    def no_markup(self, raw, percent):
+        '''
+        Detects total marked up line endings in the file. raw is the text to
+        inspect.  Percent is the minimum percent of line endings which should
+        be marked up to return true.
+        '''
+        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
+        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+        htm_end = htm_end_ere.findall(raw)
+        line_end = line_end_ere.findall(raw)
+        tot_htm_ends = len(htm_end)
+        tot_ln_fds = len(line_end)
+        # self.log.debug("There are " + unicode_type(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode_type(tot_htm_ends) + " marked up endings")
+
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
+
+        min_lns = tot_ln_fds * percent
+        # self.log.debug("There must be fewer than " + unicode_type(min_lns) + " unmarked lines to add markup")
+        return min_lns > tot_htm_ends
+
+    def dump(self, raw, where):
+        import os
+        dp = getattr(self.extra_opts, 'debug_pipeline', None)
+        if dp and os.path.exists(dp):
+            odir = os.path.join(dp, 'preprocess')
+            if not os.path.exists(odir):
+                os.makedirs(odir)
+            if os.path.exists(odir):
+                odir = os.path.join(odir, where)
+                if not os.path.exists(odir):
+                    os.makedirs(odir)
+                name, i = None, 0
+                while not name or os.path.exists(os.path.join(odir, name)):
+                    i += 1
+                    name = '%04d.html'%i
+                with open(os.path.join(odir, name), 'wb') as f:
+                    f.write(raw.encode('utf-8'))
+
+    def get_word_count(self, html):
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+        return wordcount.words
+
+    def markup_italicis(self, html):
+        # self.log.debug("\n\n\nitalicize debugging \n\n\n")
+        ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+
+        ITALICIZE_STYLE_PATS = [
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'),
+        ]
+
+        for word in ITALICIZE_WORDS:
+            html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
+
+        search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        search_text = re.sub(r'<[^>]*>', '', search_text)
+        for pat in ITALICIZE_STYLE_PATS:
+            for match in re.finditer(pat, search_text):
+                ital_string = unicode_type(match.group('words'))
+                # self.log.debug("italicising "+unicode_type(match.group(0))+"    with <i>"+ital_string+"</i>")
+                try:
+                    html = re.sub(re.escape(unicode_type(match.group(0))), '<i>%s</i>' % ital_string, html)
+                except OverflowError:
+                    # match.group(0) was too large to be compiled into a regex
+                    continue
+                except re.error:
+                    # the match was not a valid regular expression
+                    continue
+
+        return html
+
+    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives.  Exits after finding a successful pattern
+
+        The patterns are run in two passes.  The first pass only counts the
+        hits of every pattern and collects the usable ones, the second pass
+        applies the collected patterns with chapter_head as replacement.
+
+        :param html: The markup to search
+        :param wordcount: Word count of the document, used to derive the
+                          minimum number of chapters to look for
+        :param blanks_between_paragraphs: When true, up to two empty <p>
+                          elements are tolerated between a chapter line and
+                          the title line that follows it
+        :return: The markup with the detected chapter lines replaced
+        '''
+        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+        # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
+        # For documents of 7000 words or less self.min_chapters keeps the
+        # value it already had
+        if wordcount > 7000:
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
+        self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
+        # Headings already present in the markup count towards the total
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")
+
+        # Build the Regular Expressions in pieces
+        init_lookahead = "(?=<(p|div))"
+        chapter_line_open = self.line_open
+        # Same shape as self.line_open, with group names of its own so that
+        # chapter line and title line can be part of one expression
+        title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
+        r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
+        chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
+        chapter_header_close = ")\\s*"
+        title_header_close = ")"
+        chapter_line_close = self.line_close
+        title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"
+
+        # For pdftohtml output the title line is a bare <p> without any
+        # nested formatting tags
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
+            title_line_close = "\\s*</(?P=outer2)>"
+
+        if blanks_between_paragraphs:
+            blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        opt_title_close = ")?"
+        n_lookahead_open = "(?!\\s*"
+        n_lookahead_close = ")\\s*"
+
+        # default_title: one to five words, simple_title: up to 65 arbitrary
+        # characters.  Neither may start with "Chapter".
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
+
+        # Filled by the analysis pass with the entries to use in the second pass
+        analysis_result = []
+
+        # Every entry is
+        # [pattern, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]
+        #   n_lookahead_req - add a negative lookahead built from the chapter
+        #                     line after the chapter line
+        #   strict_title    - use default_title instead of simple_title
+        #   ignorecase      - prefix the expression with (?i)
+        #   title_req       - the title line is mandatory instead of optional
+        chapter_types = [
+            [(
+                r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
+                r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
+            # Highest frequency headings which include titles
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
+                           True, True, True, False, "Searching for emphasized lines", 'emphasized'],  # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
+                       "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
+                       "Searching for numeric chapters with titles", 'numeric_title'],  # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
+                       "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
+                          "Searching for chapters with Uppercase Characters", 'uppercase']  # Uppercase Chapters
+            ]
+
+        def recurse_patterns(html, analyze):
+            # Runs over the current chapter_types list.  With analyze set the
+            # hits are only counted and usable entries are appended to
+            # analysis_result, otherwise the matches are replaced in html.
+            # Start with most typical chapter headings, get more aggressive until one works
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
+                # Stop as soon as enough headings are present
+                if self.html_preprocess_sections >= self.min_chapters:
+                    break
+                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                if n_lookahead_req:
+                    # Plain text replacement of 'ou', 'in' and 'cha' in the
+                    # whole chapter line expression.  This gives the named
+                    # groups (outer, inner1-3, chap) new names, so the copy
+                    # can be used inside the lookahead of the same pattern.
+                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                if not analyze:
+                    self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
+                    lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    if hits:
+                        # Only used for counting, the result of sub is discarded
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        # When most hits carry a title, the entry stored for
+                        # the second pass requires a title and uses the
+                        # simple title pattern.  The expression compiled
+                        # above is not affected by this change.
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(
+                                unicode_type(type_name)+" had "+unicode_type(hits)+
+                                " hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
+                                unicode_type(self.chapters_with_title)+" chapters with titles, "+
+                                unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        # The 'common' entry is always kept and the search goes on
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        # Any other entry is kept, and ends the analysis, when
+                        # its hits are in the expected range.  The chained
+                        # comparison on the right reads: min_chapters < 3 and
+                        # hits < 3.
+                        elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
+                else:
+                    html = chapdetect.sub(self.chapter_head, html)
+            return html
+
+        # First pass: analysis only, the returned markup is not used
+        recurse_patterns(html, True)
+        # Second pass: apply only the entries selected by the analysis
+        chapter_types = analysis_result
+        html = recurse_patterns(html, False)
+
+        words_per_chptr = wordcount
+        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+            words_per_chptr = wordcount // self.html_preprocess_sections
+        self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
+                       unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
+        return html
+
+    def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation
+        supports a range of html markup and text files
+
+        the lookahead regex below is meant look for any non-full stop characters - punctuation
+        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+        the reason for this is to prevent false positive wrapping.  False positives are more
+        difficult to detect than false negatives during a manual review of the doc
+
+        This function intentionally leaves hyphenated content alone as that is handled by the
+        dehyphenate routine in a separate step
+
+        :param length: Number of characters that have to precede the last
+                       character of a line for the line to be unwrapped
+        :param content: The text or markup to unwrap
+        :param format: 'txt' to treat content as plain text with newline
+                       line endings, any other value to treat it as html
+        :return: The content with the matching line breaks removed
+        '''
+        def style_unwrap(match):
+            # Replacement for html line breaks: a single space, keeping the
+            # closing and opening style tags that were matched around it
+            style_close = match.group('style_close')
+            style_open = match.group('style_open')
+            if style_open and style_close:
+                return style_close+' '+style_open
+            elif style_open and not style_close:
+                return ' '+style_open
+            elif not style_open and style_close:
+                return style_close+' '
+            else:
+                return ' '
+
+        # define the pieces of the regex
+        # (?<!\&\w{4});) is a semicolon not part of an entity
+        # Lookbehind: `length` characters followed by a character that does
+        # not end a sentence
+        lookahead = "(?<=.{"+unicode_type(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
+        # Lookbehind: `length` characters followed by an en dash or em dash
+        em_en_lookahead = "(?<=.{"+unicode_type(length)+"}[\u2013\u2014])"
+        soft_hyphen = "\xad"
+        # End of an html line: optional closing style tag and closing p/div
+        line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
+        # Up to three empty, nested elements between the two lines
+        blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
+        # Start of the next html line: opening p/div and optional style tag
+        line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
+        # One to four newlines, each optionally preceded by spaces or tabs
+        txt_line_wrap = "((\u0020|\u0009)*\n){1,4}"
+
+        if format == 'txt':
+            unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+        else:
+            unwrap_regex = lookahead+line_ending+blanklines+line_opening
+            em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+            shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+
+        unwrap = re.compile("%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile("%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile("%s" % shy_unwrap_regex, re.UNICODE)
+
+        if format == 'txt':
+            # Regular breaks become a space.  After a dash or a soft hyphen
+            # the lines are joined directly (the soft hyphen is part of the
+            # match and is removed with it).
+            content = unwrap.sub(' ', content)
+            content = em_en_unwrap.sub('', content)
+            content = shy_unwrap.sub('', content)
+        else:
+            # In html all three kinds of break go through style_unwrap
+            content = unwrap.sub(style_unwrap, content)
+            content = em_en_unwrap.sub(style_unwrap, content)
+            content = shy_unwrap.sub(style_unwrap, content)
+
+        return content
+
+    def txt_process(self, match):
+        from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content
+
+    def markup_pre(self, html):
+        pre = re.compile(r'<pre>', re.IGNORECASE)
+        if len(pre.findall(html)) >= 1:
+            self.log.debug("Running Text Processing")
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
+            html = outerhtml.sub(self.txt_process, html)
+            from calibre.ebooks.conversion.preprocess import convert_entities
+            html = re.sub(r'&(\S+?);', convert_entities, html)
+        else:
+            # Add markup naively
+            # TODO - find out if there are cases where there are more than one <pre> tag or
+            # other types of unmarked html and handle them in some better fashion
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+        return html
+
+    def arrange_htm_line_endings(self, html):
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\\g<tag>"+"\\g<style>"+">", html)
+        return html
+
+    def fix_nbsp_indents(self, html):
+        txtindent = re.compile(unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
+        html = txtindent.sub(self.insert_indent, html)
+        if self.found_indents > 1:
+            self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
+        return html
+
+    def cleanup_markup(self, html):
+        # remove remaining non-breaking spaces
+        html = re.sub(unicode_type(r'\u00a0'), ' ', html)
+        # Get rid of various common microsoft specific tags which can cause issues later
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(unicode_type(r'\s*<o:p>\s*</o:p>'), ' ', html)
+        # Delete microsoft 'smart' tags
+        html = re.sub('(?i)</?st1:\\w+>', '', html)
+        # Re-open self closing paragraph tags
+        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
+        # Get rid of empty span, bold, font, em, & italics tags
+        fmt_tags = 'font|[ibu]|em|strong'
+        open_fmt_pat, close_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags), '</(?:{})>'.format(fmt_tags)
+        for i in range(2):
+            html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
+            html = re.sub(
+                r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html)
+        # delete surrounding divs from empty paragraphs
+        html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
+        # Empty heading tags
+        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
+        self.deleted_nbsps = True
+        return html
+
+    def analyze_line_endings(self, html):
+        '''
+        determines the type of html line ending used most commonly in a document
+        use before calling docanalysis functions
+        '''
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1:
+            if float(paras) / float(spans) < 0.75:
+                return 'spanned_html'
+            else:
+                return 'html'
+        else:
+            return 'html'
+
+    def analyze_blanks(self, html):
+        blanklines = self.blankreg.findall(html)
+        lines = self.linereg.findall(html)
+        if len(lines) > 1:
+            self.log.debug("There are " + unicode_type(len(blanklines)) + " blank lines. " +
+                    unicode_type(float(len(blanklines)) / float(len(lines))) + " percent blank")
+
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                return True
+            else:
+                return False
+
+    def cleanup_required(self):
+        for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
+            if getattr(self.extra_opts, option, False):
+                return True
+        return False
+
+    def merge_blanks(self, html, blanks_count=None):
+        base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
+        em_per_line = 1.5  # Add another 1.5 em for each additional blank
+
+        def merge_matches(match):
+            to_merge = match.group(0)
+            lines = float(len(self.single_blank.findall(to_merge))) - 1.
+            em = base_em + (em_per_line * lines)
+            if to_merge.find('whitespace'):
+                newline = self.any_multi_blank.sub('\n<p class="whitespace'+unicode_type(int(em * 10))+
+                                                   '" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
+            else:
+                newline = self.any_multi_blank.sub('\n<p class="softbreak'+unicode_type(int(em * 10))+
+                                                   '" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
+            return newline
+
+        html = self.any_multi_blank.sub(merge_matches, html)
+        return html
+
+    def detect_whitespace(self, html):
+        '''
+        Converts the empty paragraphs around headings into margins on the
+        heading, and marks the empty paragraphs around short lines that end
+        in a word character with the 'whitespace' class.
+
+        :param html: The markup to process
+        :return: The processed markup
+        '''
+        # A heading with optional runs of empty p/div elements in front of
+        # it (initparas) and after it (endparas)
+        blanks_around_headings = re.compile(
+            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
+            r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        # The same for paragraphs with the scenebreak class
+        blanks_around_scene_breaks = re.compile(
+            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
+            r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        # A paragraph of at most 100 characters that ends in a word
+        # character, with optional runs of empty paragraphs around it
+        blanks_n_nopunct = re.compile(
+            r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
+            r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+
+        def merge_header_whitespace(match):
+            # The return value replaces the whole match, so the empty
+            # elements matched around the content are dropped
+            initblanks = match.group('initparas')
+            endblanks = match.group('endparas')
+            content = match.group('content')
+            top_margin = ''
+            bottom_margin = ''
+            # One em of margin for every empty element that was matched
+            if initblanks is not None:
+                top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(initblanks)))+'em;'
+            if endblanks is not None:
+                bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(endblanks)))+'em;'
+
+            if initblanks is None and endblanks is None:
+                return content
+            elif content.find('scenebreak') != -1:
+                # Scene breaks are returned as they are, without margins
+                return content
+            else:
+                # The opening heading tag is rewritten with only the margin
+                # style, attributes it had before are not carried over
+                content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
+            return content
+
+        html = blanks_around_headings.sub(merge_header_whitespace, html)
+        html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
+
+        def markup_whitespaces(match):
+            # Replace the empty paragraphs inside the match with paragraphs
+            # of the 'whitespace' class
+            blanks = match.group(0)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
+            return blanks
+
+        html = blanks_n_nopunct.sub(markup_whitespaces, html)
+        # With more headings than the required minimum, everything in front
+        # of the first heading is treated the same way
+        if self.html_preprocess_sections > self.min_chapters:
+            html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
+
+        return html
+
+    def detect_soft_breaks(self, html):
+        line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
+        line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
+                     '\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
+        div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
+        div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
+
+        def convert_div_softbreaks(match):
+            init_is_paragraph = self.check_paragraph(match.group('init_content'))
+            line_two_is_paragraph = self.check_paragraph(match.group('line_two_content'))
+            if init_is_paragraph and line_two_is_paragraph:
+                return (match.group('initline')+
+                        '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>\n'+
+                        match.group('line_two'))
+            else:
+                return match.group(0)
+
+        html = div_break_candidate.sub(convert_div_softbreaks, html)
+
+        if not self.blanks_deleted and self.blanks_between_paragraphs:
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
+        else:
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
+        return html
+
+    def detect_scene_breaks(self, html):
+        scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \
+                                             '<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)\\s*'+self.line_close
+        scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+        html = scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html)
+        return html
+
+    def markup_user_break(self, replacement_break):
+        '''
+        Wrap a user supplied scene break in markup that centers it with
+        appropriate margins.
+
+        <hr> and <img> tags are allowed.  If the user specifies a width in the
+        <hr> tag, that width is removed from the tag and applied as left/right
+        margins on a wrapping div instead, because many ebook devices don't
+        support margin:auto.  The width number is always treated as a percentage,
+        whatever unit follows it.  An <hr> without any width is replaced by the
+        default rule.  All other html is converted to text.
+
+        :param replacement_break: The text or markup supplied by the user
+        :return: The markup to insert in place of each scene break
+        '''
+        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
+        default_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+
+        if re.search('[<>]', replacement_break) is None:
+            # Plain text: turn whitespace into non-breaking spaces so it survives rendering
+            return self.scene_break_open+re.sub('\\s', '&nbsp;', replacement_break)+'</p>'
+
+        if re.match('<img', replacement_break):
+            return self.scene_break_open+replacement_break+'</p>'
+
+        if not re.match('<hr', replacement_break):
+            # Any other markup is flattened to text
+            from calibre.utils.html2text import html2text
+            replacement_break = re.sub('\\s', '&nbsp;', html2text(replacement_break))
+            return self.scene_break_open+replacement_break+'</p>'
+
+        if 'width' not in replacement_break:
+            return default_break
+
+        # Accept whitespace after the colon, exactly as the width stripping
+        # expression below does, so that "width: 50%" is not rejected
+        width_match = re.search('width(?:=|:\\s*)(\\d+)', replacement_break)
+        if width_match is None:
+            self.log.warn('Invalid replacement scene break'
+                    ' expression, using default')
+            return default_break
+
+        width = int(width_match.group(1))
+        replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
+        # Split the remaining space evenly between both sides; a width above
+        # 100 must not produce negative margins
+        divpercent = max(0, (100 - width) // 2)
+        return hr_open.replace('45%', '%d%%' % divpercent)+replacement_break+'</div>'
+
+    def check_paragraph(self, content):
+        '''
+        Report whether a line of text ends the way a finished paragraph does.
+
+        <span> tags (with any whitespace around them) are removed first; the
+        remaining text counts as a paragraph when it ends in a quote mark,
+        full stop, exclamation mark, question mark or colon.
+
+        :param content: The inner markup of a single line
+        :return: True if the line ends with closing punctuation, False otherwise
+        '''
+        without_spans = re.sub('\\s*</?span[^>]*>\\s*', '', content)
+        return re.match('.*[\"\'.!?:]$', without_spans) is not None
+
+    def abbyy_processor(self, html):
+        '''
+        Rewrite every ``<p style="...">...</p>`` line and ``<img>`` tag in the
+        markup, replacing the inline styles with a reduced set (text-indent and
+        text-align only), inserting empty paragraphs for vertical padding and
+        wrapping runs of heavily padded lines in ``<blockquote>``.
+
+        All ``<a>`` tags (opening and closing) are removed before processing.
+        ``self.in_blockquote`` and ``self.previous_was_paragraph`` are reset here
+        and then carried from one matched line to the next.
+
+        :param html: The markup to process
+        :return: The rewritten markup
+        '''
+        abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+        empty_paragraph = '\n<p> </p>\n'
+        self.in_blockquote = False
+        self.previous_was_paragraph = False
+        html = re.sub('</?a[^>]*>', '', html)
+
+        def convert_styles(match):
+            # Build the replacement text for one abbyy_line match, which is either
+            # a styled paragraph or an image
+            # print "raw styles are: "+match.group('styles')
+            content = match.group('content')
+            # print "raw content is: "+match.group('content')
+            image = match.group('image')
+
+            is_paragraph = False
+            text_align = ''
+            text_indent = ''
+            paragraph_before = ''
+            paragraph_after = ''
+            blockquote_open = '\n<blockquote>\n'
+            blockquote_close = '</blockquote>\n'
+            indented_text = 'text-indent:3%;'
+            blockquote_open_loop = ''
+            blockquote_close_loop = ''
+            debugabby = False
+
+            if image:
+                # An image closes any open blockquote and is emitted on its own line
+                debugabby = True
+                if self.in_blockquote:
+                    self.in_blockquote = False
+                    blockquote_close_loop = blockquote_close
+                self.previous_was_paragraph = False
+                return blockquote_close_loop+'\n'+image+'\n'
+            else:
+                styles = match.group('styles').split(';')
+                is_paragraph = self.check_paragraph(content)
+                # print "styles for this line are: "+unicode_type(styles)
+                # Turn 'name:value' declarations into [name, value] lists
+                split_styles = []
+                for style in styles:
+                    # print "style is: "+unicode_type(style)
+                    newstyle = style.split(':')
+                    # print "newstyle is: "+unicode_type(newstyle)
+                    split_styles.append(newstyle)
+                styles = split_styles
+                # NOTE(review): this unpacking raises ValueError for any declaration
+                # that does not split into exactly two parts on ':' (an empty style
+                # attribute, or a value that itself contains ':')
+                for style, setting in styles:
+                    if style == 'text-align' and setting != 'left':
+                        text_align = style+':'+setting+';'
+                    if style == 'text-indent':
+                        # NOTE(review): int() assumes a whole number of points
+                        setting = int(re.sub('\\s*pt\\s*', '', setting))
+                        # Indents of 10pt to 13pt become the standard 3% indent,
+                        # anything else is kept as given
+                        if 9 < setting < 14:
+                            text_indent = indented_text
+                        else:
+                            text_indent = style+':'+unicode_type(setting)+'pt;'
+                    if style == 'padding':
+                        # Four space separated values are expected, read as
+                        # top, right, bottom, left (see the debug output below)
+                        setting = re.sub('pt', '', setting).split(' ')
+                        if int(setting[1]) < 16 and int(setting[3]) < 16:
+                            # Small side padding: an ordinary line, which ends an
+                            # open blockquote if it looks like a finished paragraph
+                            if self.in_blockquote:
+                                debugabby = True
+                                if is_paragraph:
+                                    self.in_blockquote = False
+                                    blockquote_close_loop = blockquote_close
+                            if int(setting[3]) > 8 and text_indent == '':
+                                text_indent = indented_text
+                            # Vertical padding is emulated with empty paragraphs
+                            if int(setting[0]) > 5:
+                                paragraph_before = empty_paragraph
+                            if int(setting[2]) > 5:
+                                paragraph_after = empty_paragraph
+                        elif not self.in_blockquote and self.previous_was_paragraph:
+                            # Large side padding directly after a finished paragraph
+                            # opens a blockquote
+                            debugabby = True
+                            self.in_blockquote = True
+                            blockquote_open_loop = blockquote_open
+                        if debugabby:
+                            self.log.debug('\n\n******\n')
+                            self.log.debug('padding top is: '+unicode_type(setting[0]))
+                            self.log.debug('padding right is:' +unicode_type(setting[1]))
+                            self.log.debug('padding bottom is: ' + unicode_type(setting[2]))
+                            self.log.debug('padding left is: ' +unicode_type(setting[3]))
+
+                # print "text-align is: "+unicode_type(text_align)
+                # print "\n***\nline is:\n     "+unicode_type(match.group(0))+'\n'
+                if debugabby:
+                    # print "this line is a paragraph = "+unicode_type(is_paragraph)+", previous line was "+unicode_type(self.previous_was_paragraph)
+                    self.log.debug("styles for this line were:", styles)
+                    self.log.debug('newline is:')
+                    self.log.debug(blockquote_open_loop+blockquote_close_loop+
+                            paragraph_before+'<p style="'+text_indent+text_align+
+                            '">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
+                # print "is_paragraph is "+unicode_type(is_paragraph)+", previous_was_paragraph is "+unicode_type(self.previous_was_paragraph)
+                # Remember this line's verdict for the blockquote decision on the next line
+                self.previous_was_paragraph = is_paragraph
+                # print "previous_was_paragraph is now set to "+unicode_type(self.previous_was_paragraph)+"\n\n\n"
+                return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
+
+        html = abbyy_line.sub(convert_styles, html)
+        return html
+
+    def __call__(self, html):
+        '''
+        Run the heuristic processing passes over the markup and return the result.
+
+        The passes run in a fixed order, and several depend on state set by
+        earlier ones.  Most optional passes are switched on by an attribute of
+        ``self.extra_opts`` that is read with ``getattr`` and defaults to False.
+        Markup with fewer than 50 words is returned unchanged.
+
+        :param html: The markup to process
+        :return: The processed markup
+        '''
+        self.log.debug("*********  Heuristic processing HTML  *********")
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            # NOTE(review): on failure self.totalwords keeps whatever value it had
+            # before this call; presumably it is initialised elsewhere - confirm
+            self.log.warn("Can't get wordcount")
+
+        if self.totalwords < 50:
+            self.log.warn("flow is too short, not running heuristics")
+            return html
+
+        is_abbyy = self.is_abbyy(html)
+        if is_abbyy:
+            html = self.abbyy_processor(html)
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = self.arrange_htm_line_endings(html)
+        # self.dump(html, 'after_arrange_line_endings')
+        if self.cleanup_required():
+            # ##### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log.debug("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
+
+        # Replace series of non-breaking spaces with text-indent
+        if getattr(self.extra_opts, 'fix_indents', False):
+            html = self.fix_nbsp_indents(html)
+
+        if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
+            html = self.cleanup_markup(html)
+
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            # Replace the line open/close expressions used by the later passes
+            self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
+            self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"
+
+        # ADE doesn't render <br />, change to empty paragraphs
+        # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
+
+        # Determine whether the document uses interleaved blank lines
+        self.blanks_between_paragraphs = self.analyze_blanks(html)
+
+        # detect chapters/sections to match xpath or splitting logic
+
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
+            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
+        # self.dump(html, 'after_chapter_markup')
+
+        if getattr(self.extra_opts, 'italicize_common_cases', False):
+            html = self.markup_italicis(html)
+
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
+        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+            self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
+            # Runs matched by multi_blank become a single softbreak before the
+            # remaining blank paragraphs are removed
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
+            html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        # NOTE: this local name shadows the builtin format() for the rest of the method
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")
+
+        # ##### Unwrap lines ######
+        if getattr(self.extra_opts, 'unwrap_lines', False):
+            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+            if hardbreaks or unwrap_factor < 0.4:
+                self.log.debug("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+                html = dehyphenator(html,'html', length)
+                html = self.punctuation_unwrap(length, html, 'html')
+
+        if getattr(self.extra_opts, 'dehyphenate', False):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log.debug("Fixing hyphenated content")
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
+
+        # If still no sections after unwrapping mark split points on lines with no punctuation
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
+            self.log.debug("Looking for more split points based on punctuation,"
+                    " currently have " + unicode_type(self.html_preprocess_sections))
+            chapdetect3 = re.compile(
+                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
+                r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
+                r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
+                r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_break, html)
+
+        if getattr(self.extra_opts, 'renumber_headings', False):
+            # search for places where a first or second level heading is immediately followed by another
+            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+            # headings and titles, images, etc
+            doubleheading = re.compile(
+                r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+            html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)
+
+        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
+        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
+        # Multiple sequential blank paragraphs are merged with appropriate margins
+        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
+            self.log.debug('Formatting scene breaks')
+            # A div holding only a <br> is treated as an empty paragraph
+            html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
+            html = self.detect_scene_breaks(html)
+            html = self.detect_whitespace(html)
+            html = self.detect_soft_breaks(html)
+            blanks_count = len(self.any_multi_blank.findall(html))
+            if blanks_count >= 1:
+                html = self.merge_blanks(html, blanks_count)
+            detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
+            scene_break_count = len(detected_scene_break.findall(html))
+            # If the user has enabled scene break replacement, then either softbreaks
+            # or 'hard' scene breaks are replaced, depending on which is in use
+            # Otherwise separator lines are centered, use a bit larger margin in this case
+            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
+            if replacement_break:
+                replacement_break = self.markup_user_break(replacement_break)
+                # NOTE(review): both branches below apply the same softbreak
+                # substitution; only the scenebreak substitution is conditional
+                if scene_break_count >= 1:
+                    html = detected_scene_break.sub(replacement_break, html)
+                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
+                else:
+                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
+
+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs so they render correctly
+            html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
+        return html