Initial import

2026-03-26 12:33:32 +01:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/BeautifulSoup.py
+++ b/ebook_converter/ebooks/BeautifulSoup.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import bs4
+from bs4 import (  # noqa
+    CData, Comment, Declaration, NavigableString, ProcessingInstruction,
+    SoupStrainer, Tag, __version__
+)
+
+from polyglot.builtins import unicode_type
+
+
+def parse_html(markup):
+    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
+    from calibre.utils.cleantext import clean_xml_chars
+    if isinstance(markup, unicode_type):
+        markup = strip_encoding_declarations(markup)
+        markup = substitute_entites(markup)
+    else:
+        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
+    markup = clean_xml_chars(markup)
+    from html5_parser.soup import parse
+    return parse(markup, return_root=False)
+
+
+def prettify(soup):
+    ans = soup.prettify()
+    if isinstance(ans, bytes):
+        ans = ans.decode('utf-8')
+    return ans
+
+
+def BeautifulSoup(markup='', *a, **kw):
+    ''' Parse ``markup`` with parse_html(). Any extra positional or keyword
+    arguments are accepted but ignored. '''
+    return parse_html(markup)
+
+
+def BeautifulStoneSoup(markup='', *a, **kw):
+    ''' Build a bs4.BeautifulSoup from ``markup``, passing 'xml' as the second
+    argument. Any extra positional or keyword arguments are ignored. '''
+    return bs4.BeautifulSoup(markup, 'xml')
--- a/ebook_converter/ebooks/__init__.py
+++ b/ebook_converter/ebooks/__init__.py
@@ -0,0 +1,248 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+'''
+Code for the conversion of ebook formats and the reading of metadata
+from various formats.
+'''
+
+import os, re, numbers, sys
+from calibre import prints
+from calibre.ebooks.chardet import xml_to_unicode
+from polyglot.builtins import unicode_type
+
+
+class ConversionError(Exception):
+    ''' Exception that carries a message together with an ``only_msg`` flag. '''
+
+    def __init__(self, msg, only_msg=False):
+        # NOTE(review): only_msg is merely stored here; presumably callers use
+        # it to decide how much of the error to show - confirm against callers
+        Exception.__init__(self, msg)
+        self.only_msg = only_msg
+
+
+class UnknownFormatError(Exception):
+    # Plain Exception subclass; adds no behaviour of its own
+    pass
+
+
+class DRMError(ValueError):
+    # ValueError subclass, so handlers catching ValueError also catch this
+    pass
+
+
+class ParserError(ValueError):
+    # ValueError subclass, so handlers catching ValueError also catch this
+    pass
+
+
+# Lower-case file extensions, without the leading dot.
+# NOTE(review): presumably the set of extensions treated as e-book files;
+# it also contains archive types such as 'rar' and 'zip' - confirm with users
+BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm',
+                   'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
+                   'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
+                   'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
+                   'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'docm', 'md',
+                   'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', 'kpf']
+
+
+def return_raster_image(path):
+    from calibre.utils.imghdr import what
+    if os.access(path, os.R_OK):
+        with open(path, 'rb') as f:
+            raw = f.read()
+        if what(None, raw) not in (None, 'svg'):
+            return raw
+
+
+def extract_cover_from_embedded_svg(html, base, log):
+    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
+    from calibre.utils.xml_parse import safe_xml_fromstring
+    root = safe_xml_fromstring(html)
+
+    svg = XPath('//svg:svg')(root)
+    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
+        image = svg[0][0]
+        href = image.get(XLINK('href'), None)
+        if href:
+            path = os.path.join(base, *href.split('/'))
+            return return_raster_image(path)
+
+
+def extract_calibre_cover(raw, base, log):
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+    soup = BeautifulSoup(raw)
+    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
+        'font', 'br'])
+    images = soup.findAll('img', src=True)
+    if matches is None and len(images) == 1 and \
+            images[0].get('alt', '').lower()=='cover':
+        img = images[0]
+        img = os.path.join(base, *img['src'].split('/'))
+        q = return_raster_image(img)
+        if q is not None:
+            return q
+
+    # Look for a simple cover, i.e. a body with no text and only one <img> tag
+    if matches is None:
+        body = soup.find('body')
+        if body is not None:
+            text = u''.join(map(unicode_type, body.findAll(text=True)))
+            if text.strip():
+                # Body has text, abort
+                return
+            images = body.findAll('img', src=True)
+            if len(images) == 1:
+                img = os.path.join(base, *images[0]['src'].split('/'))
+                return return_raster_image(img)
+
+
+def render_html_svg_workaround(path_to_html, log, width=590, height=750):
+    from calibre.ebooks.oeb.base import SVG_NS
+    with open(path_to_html, 'rb') as f:
+        raw = f.read()
+    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
+    data = None
+    if SVG_NS in raw:
+        try:
+            data = extract_cover_from_embedded_svg(raw,
+                   os.path.dirname(path_to_html), log)
+        except Exception:
+            pass
+    if data is None:
+        try:
+            data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log)
+        except Exception:
+            pass
+
+    if data is None:
+        data = render_html_data(path_to_html, width, height)
+    return data
+
+
+def render_html_data(path_to_html, width, height):
+    from calibre.ptempfile import TemporaryDirectory
+    from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+    result = {}
+
+    def report_error(text=''):
+        prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr)
+        if text:
+            prints(text, file=sys.stderr)
+        if result and result['stdout_stderr']:
+            with open(result['stdout_stderr'], 'rb') as f:
+                prints(f.read(), file=sys.stderr)
+
+    with TemporaryDirectory('-render-html') as tdir:
+        try:
+            result = fork_job('calibre.ebooks.render_html', 'main', args=(path_to_html, tdir, 'jpeg'))
+        except WorkerError as e:
+            report_error(e.orig_tb)
+        else:
+            if result['result']:
+                with open(os.path.join(tdir, 'rendered.jpeg'), 'rb') as f:
+                    return f.read()
+            else:
+                report_error()
+
+
+def check_ebook_format(stream, current_guess):
+    ans = current_guess
+    if current_guess.lower() in ('prc', 'mobi', 'azw', 'azw1', 'azw3'):
+        stream.seek(0)
+        if stream.read(3) == b'TPZ':
+            ans = 'tpz'
+        stream.seek(0)
+    return ans
+
+
+def normalize(x):
+    if isinstance(x, unicode_type):
+        import unicodedata
+        x = unicodedata.normalize('NFC', x)
+    return x
+
+
+def calibre_cover(title, author_string, series_string=None,
+        output_format='jpg', title_size=46, author_size=36, logo_path=None):
+    ''' Generate a cover with calibre_cover2() and return the result of
+    image_to_data() for it in ``output_format``.
+
+    The text arguments are NFC-normalized first; a falsy author or series
+    is passed on as an empty string. ``title_size`` and ``author_size`` are
+    accepted but not used by this function. '''
+    title = normalize(title)
+    author_string = normalize(author_string)
+    series_string = normalize(series_string)
+    from calibre.ebooks.covers import calibre_cover2
+    from calibre.utils.img import image_to_data
+    ans = calibre_cover2(title, author_string or '', series_string or '', logo_path=logo_path, as_qimage=True)
+    return image_to_data(ans, fmt=output_format)
+
+
+# Matches a CSS-style length: group 1 is the number (optional '-' signs, digits
+# and at most one '.'), group 2 the unit. Units are matched case-sensitively.
+# Group 1 may be empty, so callers must check it before converting to float.
+UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc|rem|q)$')
+
+
+def unit_convert(value, base, font, dpi, body_font_size=12):
+    ' Return value in pts'
+    if isinstance(value, numbers.Number):
+        return value
+    try:
+        return float(value) * 72.0 / dpi
+    except:
+        pass
+    result = value
+    m = UNIT_RE.match(value)
+    if m is not None and m.group(1):
+        value = float(m.group(1))
+        unit = m.group(2)
+        if unit == '%':
+            result = (value / 100.0) * base
+        elif unit == 'px':
+            result = value * 72.0 / dpi
+        elif unit == 'in':
+            result = value * 72.0
+        elif unit == 'pt':
+            result = value
+        elif unit == 'em':
+            result = value * font
+        elif unit in ('ex', 'en'):
+            # This is a hack for ex since we have no way to know
+            # the x-height of the font
+            font = font
+            result = value * font * 0.5
+        elif unit == 'pc':
+            result = value * 12.0
+        elif unit == 'mm':
+            result = value * 2.8346456693
+        elif unit == 'cm':
+            result = value * 28.346456693
+        elif unit == 'rem':
+            result = value * body_font_size
+        elif unit == 'q':
+            result = value * 0.708661417325
+    return result
+
+
+def parse_css_length(value):
+    try:
+        m = UNIT_RE.match(value)
+    except TypeError:
+        return None, None
+    if m is not None and m.group(1):
+        value = float(m.group(1))
+        unit = m.group(2)
+        return value, unit.lower()
+    return None, None
+
+
+def generate_masthead(title, output_path=None, width=600, height=60):
+    ''' Generate a masthead image by delegating to
+    calibre.ebooks.covers.generate_masthead(), using the 'masthead_font'
+    entry from the saved 'mobi_output' defaults as the font family (None
+    when that entry is absent). '''
+    from calibre.ebooks.conversion.config import load_defaults
+    recs = load_defaults('mobi_output')
+    masthead_font_family = recs.get('masthead_font', None)
+    # This import shadows this function's own name inside the body, so the
+    # call below goes to the covers implementation, not to this wrapper
+    from calibre.ebooks.covers import generate_masthead
+    return generate_masthead(title, output_path=output_path, width=width, height=height, font_family=masthead_font_family)
+
+
+def escape_xpath_attr(value):
+    if '"' in value:
+        if "'" in value:
+            parts = re.split('("+)', value)
+            ans = []
+            for x in parts:
+                if x:
+                    q = "'" if '"' in x else '"'
+                    ans.append(q + x + q)
+            return 'concat(%s)' % ', '.join(ans)
+        else:
+            return "'%s'" % value
+    return '"%s"' % value
--- a/ebook_converter/ebooks/chardet.py
+++ b/ebook_converter/ebooks/chardet.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, codecs
+from polyglot.builtins import unicode_type
+
+# Regex sources for encoding declarations. In each pattern group 1 captures
+# the declared encoding name. compile_pats() compiles them case-insensitively.
+_encoding_pats = (
+    # XML declaration
+    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
+    # HTML 5 charset
+    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
+    # HTML 4 Pragma directive
+    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
+)
+
+
+def compile_pats(binary):
+    for raw in _encoding_pats:
+        if binary:
+            raw = raw.encode('ascii')
+        yield re.compile(raw, flags=re.IGNORECASE)
+
+
+class LazyEncodingPats(object):
+
+    def __call__(self, binary=False):
+        attr = 'binary_pats' if binary else 'unicode_pats'
+        pats = getattr(self, attr, None)
+        if pats is None:
+            pats = tuple(compile_pats(binary))
+            setattr(self, attr, pats)
+        for pat in pats:
+            yield pat
+
+
+# Shared instance: lazy_encoding_pats(binary) yields the compiled patterns
+lazy_encoding_pats = LazyEncodingPats()
+# Matches '&name;' style references; group 1 is the text between '&' and ';'
+ENTITY_PATTERN = re.compile(r'&(\S+?);')
+
+
+def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
+    prefix = raw[:limit]
+    suffix = raw[limit:]
+    is_binary = isinstance(raw, bytes)
+    if preserve_newlines:
+        if is_binary:
+            sub = lambda m: b'\n' * m.group().count(b'\n')
+        else:
+            sub = lambda m: '\n' * m.group().count('\n')
+    else:
+        sub = b'' if is_binary else u''
+    for pat in lazy_encoding_pats(is_binary):
+        prefix = pat.sub(sub, prefix)
+    raw = prefix + suffix
+    return raw
+
+
+def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
+    prefix = raw[:limit]
+    suffix = raw[limit:]
+    changed = [False]
+    is_binary = isinstance(raw, bytes)
+    if is_binary:
+        if not isinstance(enc, bytes):
+            enc = enc.encode('ascii')
+    else:
+        if isinstance(enc, bytes):
+            enc = enc.decode('ascii')
+
+    def sub(m):
+        ans = m.group()
+        if m.group(1).lower() != enc.lower():
+            changed[0] = True
+            start, end = m.start(1) - m.start(0), m.end(1) - m.end(0)
+            ans = ans[:start] + enc + ans[end:]
+        return ans
+
+    for pat in lazy_encoding_pats(is_binary):
+        prefix = pat.sub(sub, prefix)
+    raw = prefix + suffix
+    return raw, changed[0]
+
+
+def find_declared_encoding(raw, limit=50*1024):
+    prefix = raw[:limit]
+    is_binary = isinstance(raw, bytes)
+    for pat in lazy_encoding_pats(is_binary):
+        m = pat.search(prefix)
+        if m is not None:
+            ans = m.group(1)
+            if is_binary:
+                ans = ans.decode('ascii', 'replace')
+                return ans
+
+
+def substitute_entites(raw):
+    ''' Return ``raw`` with every ENTITY_PATTERN match replaced by the result
+    of calibre's xml_entity_to_unicode() for that match. '''
+    from calibre import xml_entity_to_unicode
+    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
+
+
+# Maps lower-cased detected charset names to the name used in their place
+# (applied in force_encoding())
+_CHARSET_ALIASES = {"macintosh" : "mac-roman",
+                        "x-sjis" : "shift-jis"}
+
+
+def detect(*args, **kwargs):
+    ''' Forward all arguments to chardet.detect(). chardet is imported on
+    first use rather than when this module is loaded. '''
+    from chardet import detect
+    return detect(*args, **kwargs)
+
+
+def force_encoding(raw, verbose, assume_utf8=False):
+    from calibre.constants import preferred_encoding
+
+    try:
+        chardet = detect(raw[:1024*50])
+    except:
+        chardet = {'encoding':preferred_encoding, 'confidence':0}
+    encoding = chardet['encoding']
+    if chardet['confidence'] < 1 and assume_utf8:
+        encoding = 'utf-8'
+    if chardet['confidence'] < 1 and verbose:
+        print('WARNING: Encoding detection confidence for %s is %d%%'%(
+            chardet['encoding'], chardet['confidence']*100))
+    if not encoding:
+        encoding = preferred_encoding
+    encoding = encoding.lower()
+    encoding = _CHARSET_ALIASES.get(encoding, encoding)
+    if encoding == 'ascii':
+        encoding = 'utf-8'
+    return encoding
+
+
+def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
+    if not raw or isinstance(raw, unicode_type):
+        return raw, None
+    for x in ('utf8', 'utf-16-le', 'utf-16-be'):
+        bom = getattr(codecs, 'BOM_'+x.upper().replace('-16', '16').replace(
+            '-', '_'))
+        if raw.startswith(bom):
+            return raw[len(bom):], x
+    encoding = None
+    for pat in lazy_encoding_pats(True):
+        match = pat.search(raw)
+        if match:
+            encoding = match.group(1)
+            encoding = encoding.decode('ascii', 'replace')
+            break
+    if encoding is None:
+        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
+    if encoding.lower().strip() == 'macintosh':
+        encoding = 'mac-roman'
+    if encoding.lower().replace('_', '-').strip() in (
+            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+        # Microsoft Word exports to HTML with encoding incorrectly set to
+        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+        encoding = 'gbk'
+    try:
+        codecs.lookup(encoding)
+    except LookupError:
+        encoding = 'utf-8'
+
+    return raw, encoding
+
+
+def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
+                   resolve_entities=False, assume_utf8=False):
+    '''
+    Force conversion of byte string to unicode. Tries to look for XML/HTML
+    encoding declaration first, if not found uses the chardet library and
+    prints a warning if detection confidence is < 100%
+    @return: (unicode, encoding used)
+    '''
+    if not raw:
+        return '', None
+    raw, encoding = detect_xml_encoding(raw, verbose=verbose,
+            assume_utf8=assume_utf8)
+    if not isinstance(raw, unicode_type):
+        raw = raw.decode(encoding, 'replace')
+
+    if strip_encoding_pats:
+        raw = strip_encoding_declarations(raw)
+    if resolve_entities:
+        raw = substitute_entites(raw)
+
+    return raw, encoding
--- a/ebook_converter/ebooks/compression/__init__.py
+++ b/ebook_converter/ebooks/compression/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
--- a/ebook_converter/ebooks/compression/palmdoc.c
+++ b/ebook_converter/ebooks/compression/palmdoc.c
@@ -0,0 +1,238 @@
+/*
+:mod:`cPalmdoc` -- Palmdoc compression/decompression
+=====================================================
+
+.. module:: cPalmdoc
+    :platform: All
+    :synopsis: Compression decompression of Palmdoc implemented in C for speed
+
+.. moduleauthor:: Kovid Goyal <kovid@kovidgoyal.net> Copyright 2009
+
+*/
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <stdio.h>
+
+#define BUFFER 6000
+
+#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) )
+#define MAX(x, y) ( ((x) > (y)) ? (x) : (y) )
+
+// Input bytes are stored widened to unsigned short; the compress and
+// decompress entry points map each (possibly negative) char to 0-255 first.
+typedef unsigned short int Byte;
+// A run of Byte values together with its length
+typedef struct {
+	Byte	*data;
+	Py_ssize_t len;
+} buffer;
+
+#ifdef	bool
+#undef	bool
+#endif
+#define	bool		int
+
+#ifdef	false
+#undef	false
+#endif
+#define	false		0
+
+#ifdef	true
+#undef	true
+#endif
+#define	true		1
+
+#define CHAR(x) (( (x) > 127 ) ? (x)-256 : (x))
+
+#if PY_MAJOR_VERSION >= 3
+    #define BUFFER_FMT "y#"
+    #define BYTES_FMT "y#"
+#else
+    #define BUFFER_FMT "t#"
+    #define BYTES_FMT "s#"
+#endif
+
+// decompress(bytestring) -> bytestring
+// Decode a Palmdoc compressed byte string. Truncated or corrupt input is
+// decoded as far as possible instead of reading outside the buffers: a
+// literal run is clamped to the bytes that are present, a repeat code whose
+// second byte is missing ends decoding and a repeat code pointing before the
+// start of the output is skipped.
+// Returns NULL with an exception set on argument or memory errors.
+static PyObject *
+cpalmdoc_decompress(PyObject *self, PyObject *args) {
+    const char *_input = NULL; Py_ssize_t input_len = 0;
+    Byte *input = NULL; char *output = NULL; Byte c; PyObject *ans = NULL;
+    Py_ssize_t i = 0, o = 0, j = 0, di, n;
+    if (!PyArg_ParseTuple(args, BUFFER_FMT, &_input, &input_len))
+        return NULL;
+    input = (Byte *) PyMem_Malloc(sizeof(Byte)*input_len);
+    if (input == NULL) return PyErr_NoMemory();
+    // Map chars to bytes
+    for (j = 0; j < input_len; j++)
+        input[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
+    // One input byte expands to at most 5 output bytes (a two byte repeat
+    // code yields up to 10), so 8x the input is always enough
+    output = (char *)PyMem_Malloc(sizeof(char)*(MAX(BUFFER, 8*input_len)));
+    if (output == NULL) {
+        PyMem_Free(input);  // do not leak the widened copy of the input
+        return PyErr_NoMemory();
+    }
+
+    while (i < input_len) {
+        c = input[i++];
+        if (c >= 1 && c <= 8) {  // copy 'c' bytes
+            // Never copy more bytes than the input still holds
+            n = MIN((Py_ssize_t)c, input_len - i);
+            while (n--) output[o++] = (char)input[i++];
+        }
+
+        else if (c <= 0x7F)  // 0, 09-7F = self
+            output[o++] = (char)c;
+
+        else if (c >= 0xC0) { // space + ASCII char
+            output[o++] = ' ';
+            output[o++] = c ^ 0x80;
+        }
+        else { // 80-BF repeat sequences
+            if (i >= input_len) break;  // second byte of the code is missing
+            c = (c << 8) + input[i++];
+            di = (c & 0x3FFF) >> 3;
+            // The distance must point into the output produced so far
+            if (di < 1 || di > o) continue;
+            for ( n = (c & 7) + 3; n--; ++o )
+                output[o] = output[o - di];
+        }
+    }
+    ans = Py_BuildValue(BYTES_FMT, output, o);
+    PyMem_Free(output);
+    PyMem_Free(input);
+    return ans;
+}
+
+// Return true when the first len elements of a and b are equal (also when
+// len is not positive), false otherwise.
+static bool
+cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) {
+    Py_ssize_t k = 0;
+    while (k < len) {
+        if (a[k] != b[k]) return false;
+        k++;
+    }
+    return true;
+}
+
+// Search backwards for an earlier copy of the chunk_length elements that
+// start at data[pos]. Candidates start at pos - chunk_length, so a match never
+// overlaps the chunk itself. Returns the start index of the nearest match, or
+// pos when there is none.
+static Py_ssize_t
+cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) {
+    Py_ssize_t start = pos - chunk_length;
+    while (start >= 0) {
+        if (cpalmdoc_memcmp(data + start, data + pos, chunk_length)) return start;
+        start--;
+    }
+    return pos;
+}
+
+
+// Palmdoc-compress the bytes in b into output and return the number of bytes
+// written. Returns 0 when the scratch buffer cannot be allocated, so a return
+// value of 0 for non-empty input means out of memory.
+// No bounds checking is done on output: the caller must supply a buffer large
+// enough for the worst case (a lone binary byte is written as two bytes).
+static Py_ssize_t
+cpalmdoc_do_compress(buffer *b, char *output) {
+    Py_ssize_t i = 0, j, chunk_len, dist;
+    unsigned int compound;
+    Byte c, n;
+    bool found;
+    char *head;
+    buffer temp;
+    head = output;
+    // Scratch space for one run of up to 8 binary bytes
+    temp.data = (Byte *)PyMem_Malloc(sizeof(Byte)*8); temp.len = 0;
+    if (temp.data == NULL) return 0;
+    while (i < b->len) {
+        c = b->data[i];
+        //do repeats
+        // Only attempted away from both ends of the input. The longest chunk
+        // (10 bytes) is tried first, down to 3 bytes.
+        if ( i > 10 && (b->len - i) > 10) {
+            found = false;
+            for (chunk_len = 10; chunk_len > 2; chunk_len--) {
+                j = cpalmdoc_rfind(b->data, i, chunk_len);
+                dist = i - j;
+                // rfind returns i itself when nothing was found
+                if (j < i && dist <= 2047) {
+                    found = true;
+                    // Two byte code: distance in the upper bits, length-3 in
+                    // the low 3 bits, with 0x80 added to the first byte
+                    compound = (unsigned int)((dist << 3) + chunk_len-3);
+                    *(output++) = CHAR(0x80 + (compound >> 8 ));
+                    *(output++) = CHAR(compound & 0xFF);
+                    i += chunk_len;
+                    break;
+                }
+            }
+            if (found) continue;
+        }
+
+        //write single character
+        i++;
+        // A space followed by a byte in 0x40-0x7F is folded into one byte
+        if (c == 32 && i < b->len) {
+            n = b->data[i];
+            if ( n >= 0x40 && n <= 0x7F) {
+                *(output++) = CHAR(n^0x80); i++; continue;
+            }
+        }
+        // 0 and 0x09-0x7F stand for themselves
+        if (c == 0 || (c > 8 && c < 0x80))
+            *(output++) = CHAR(c);
+        else { // Write binary data
+            // Collect up to 8 consecutive bytes that cannot stand for
+            // themselves and emit them prefixed by their count
+            j = i;
+            temp.data[0] = c; temp.len = 1;
+            while (j < b->len && temp.len < 8) {
+                c = b->data[j];
+                if (c == 0 || (c > 8 && c < 0x80)) break;
+                temp.data[temp.len++] = c; j++;
+            }
+            // i was already advanced past the first byte of the run
+            i += temp.len - 1;
+            *(output++) = (char)temp.len;
+            for (j=0; j < temp.len; j++) *(output++) = (char)temp.data[j];
+        }
+    }
+    PyMem_Free(temp.data);
+    return output - head;
+}
+
+// compress(bytestring) -> bytestring
+// Palmdoc compress a byte string. Empty input yields an empty byte string.
+// Returns NULL with an exception set on argument or memory errors.
+static PyObject *
+cpalmdoc_compress(PyObject *self, PyObject *args) {
+    const char *_input = NULL; Py_ssize_t input_len = 0;
+    char *output = NULL; PyObject *ans = NULL;
+    Py_ssize_t j = 0;
+    buffer b;
+    if (!PyArg_ParseTuple(args, BUFFER_FMT, &_input, &input_len))
+        return NULL;
+    // Nothing to compress. Handled here because the compressor returning 0
+    // bytes is otherwise taken to mean that it ran out of memory.
+    if (input_len == 0) return Py_BuildValue(BYTES_FMT, "", (Py_ssize_t)0);
+    b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len);
+    if (b.data == NULL) return PyErr_NoMemory();
+    // Map chars to bytes
+    for (j = 0; j < input_len; j++)
+        b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
+    b.len = input_len;
+    // Compression can make the data larger: in the worst case a byte that
+    // cannot stand for itself is written as a count byte plus the byte, so
+    // the output needs up to twice the size of the input.
+    output = (char *)PyMem_Malloc(sizeof(char) * 2 * (size_t)b.len);
+    if (output == NULL) {
+        PyMem_Free(b.data);
+        return PyErr_NoMemory();
+    }
+    j = cpalmdoc_do_compress(&b, output);
+    if (j == 0)
+        ans = PyErr_NoMemory();  // sets MemoryError and returns NULL
+    else
+        ans = Py_BuildValue(BYTES_FMT, output, j);
+    PyMem_Free(output);
+    PyMem_Free(b.data);
+    return ans;
+}
+
+static char cPalmdoc_doc[] = "Compress and decompress palmdoc strings.";
+
+// Functions exported by the module; the all-NULL entry terminates the table
+static PyMethodDef cPalmdoc_methods[] = {
+    {"decompress", cpalmdoc_decompress, METH_VARARGS,
+    "decompress(bytestring) -> decompressed bytestring\n\n"
+    		"Decompress a palmdoc compressed byte string. "
+    },
+
+    {"compress", cpalmdoc_compress, METH_VARARGS,
+    "compress(bytestring) -> compressed bytestring\n\n"
+    		"Palmdoc compress a byte string. "
+    },
+    {NULL, NULL, 0, NULL}
+};
+
+// Module initialisation for both Python 2 and Python 3. The two branches
+// define INITMODULE / INITERROR and open the init function with the name and
+// return convention of the respective Python version; the function body
+// below the #endif is shared.
+#if PY_MAJOR_VERSION >= 3
+#define INITERROR return NULL
+#define INITMODULE PyModule_Create(&cPalmdoc_module)
+static struct PyModuleDef cPalmdoc_module = {
+    /* m_base     */ PyModuleDef_HEAD_INIT,
+    /* m_name     */ "cPalmdoc",
+    /* m_doc      */ cPalmdoc_doc,
+    /* m_size     */ -1,
+    /* m_methods  */ cPalmdoc_methods,
+    /* m_slots    */ 0,
+    /* m_traverse */ 0,
+    /* m_clear    */ 0,
+    /* m_free     */ 0,
+};
+CALIBRE_MODINIT_FUNC PyInit_cPalmdoc(void) {
+#else
+#define INITERROR return
+#define INITMODULE Py_InitModule3("cPalmdoc", cPalmdoc_methods, cPalmdoc_doc)
+CALIBRE_MODINIT_FUNC initcPalmdoc(void) {
+#endif
+
+    PyObject *m;
+    m = INITMODULE;
+    if (m == NULL) {
+        INITERROR;
+    }
+
+    // Only the Python 3 init function returns the module object
+#if PY_MAJOR_VERSION >= 3
+    return m;
+#endif
+}
--- a/ebook_converter/ebooks/compression/palmdoc.py
+++ b/ebook_converter/ebooks/compression/palmdoc.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env  python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import io
+from struct import pack
+
+from calibre.constants import plugins
+from polyglot.builtins import range
+# plugins['cPalmdoc'] is used as a pair: item 0 is the loaded module (falsy
+# when loading failed) and item 1 is reported as the reason for the failure.
+# The C extension is required, so importing this module fails without it.
+cPalmdoc = plugins['cPalmdoc'][0]
+if not cPalmdoc:
+    raise RuntimeError(('Failed to load required cPalmdoc module: '
+            '%s')%plugins['cPalmdoc'][1])
+
+
+def decompress_doc(data):
+    ''' Return ``data`` decompressed by the cPalmdoc C extension. '''
+    return cPalmdoc.decompress(data)
+
+
+def compress_doc(data):
+    ''' Return ``data`` compressed by the cPalmdoc C extension. Empty input
+    gives b'' without calling the extension. '''
+    return cPalmdoc.compress(data) if data else b''
+
+
+def py_compress_doc(data):
+    ''' Pure python Palmdoc compression of the byte string ``data``.
+
+    find_tests() checks that the output equals that of compress_doc(), so
+    the order of the steps below has to stay in sync with the C code. '''
+    out = io.BytesIO()
+    i = 0
+    ldata = len(data)
+    while i < ldata:
+        # Repeats are only attempted away from both ends of the data
+        if i > 10 and (ldata - i) > 10:
+            chunk = b''
+            match = -1
+            # Try the longest chunk (10 bytes) first, down to 3 bytes
+            for j in range(10, 2, -1):
+                chunk = data[i:i+j]
+                try:
+                    match = data.rindex(chunk, 0, i)
+                except ValueError:
+                    continue
+                if (i - match) <= 2047:
+                    break
+                match = -1
+            if match >= 0:
+                # Two byte code: 0x8000 + (distance << 3) + (length - 3)
+                n = len(chunk)
+                m = i - match
+                code = 0x8000 + ((m << 3) & 0x3ff8) + (n - 3)
+                out.write(pack('>H', code))
+                i += n
+                continue
+        ch = data[i:i+1]
+        och = ord(ch)
+        i += 1
+        # A space followed by a byte in 0x40-0x7f is folded into one byte.
+        # NOTE(review): the bound here is (i + 1) < ldata while the C code in
+        # palmdoc.c tests i < len, so a space directly before the final byte
+        # looks like it is encoded differently by the two - confirm
+        if ch == b' ' and (i + 1) < ldata:
+            onch = ord(data[i:i+1])
+            if onch >= 0x40 and onch < 0x80:
+                out.write(pack('>B', onch ^ 0x80))
+                i += 1
+                continue
+        # 0 and 0x09-0x7f stand for themselves
+        if och == 0 or (och > 8 and och < 0x80):
+            out.write(ch)
+        else:
+            # Collect up to 8 consecutive bytes that cannot stand for
+            # themselves and write them prefixed by their count
+            j = i
+            binseq = [ch]
+            while j < ldata and len(binseq) < 8:
+                ch = data[j:j+1]
+                och = ord(ch)
+                if och == 0 or (och > 8 and och < 0x80):
+                    break
+                binseq.append(ch)
+                j += 1
+            out.write(pack('>B', len(binseq)))
+            out.write(b''.join(binseq))
+            # i was already advanced past the first byte of the run
+            i += len(binseq) - 1
+    return out.getvalue()
+
+
+def find_tests():
+    ''' Return a unittest suite that checks, for a set of sample inputs, that
+    the C and pure python compressors produce identical output and that
+    decompressing that output gives back the input. '''
+    import unittest
+
+    class Test(unittest.TestCase):
+
+        def test_palmdoc_compression(self):
+            for test in [
+                b'abc\x03\x04\x05\x06ms',  # Test binary writing
+                b'a b c \xfed ',  # Test encoding of spaces
+                b'0123456789axyz2bxyz2cdfgfo9iuyerh',
+                b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
+                (b'ciewacnaq eiu743 r787q 0w%  ; sa fd\xef\ffdxosac wocjp acoiecowei '
+                b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
+            ]:
+                x = compress_doc(test)
+                self.assertEqual(py_compress_doc(test), x)
+                self.assertEqual(decompress_doc(x), test)
+
+    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
--- a/ebook_converter/ebooks/conversion/__init__.py
+++ b/ebook_converter/ebooks/conversion/__init__.py
@@ -0,0 +1,30 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from polyglot.builtins import native_string_type
+
+
+class ConversionUserFeedBack(Exception):
+
+    def __init__(self, title, msg, level='info', det_msg=''):
+        ''' Show a simple message to the user
+
+        :param title: The title (very short description)
+        :param msg: The message to show the user
+        :param level: Must be one of 'info', 'warn' or 'error'
+        :param det_msg: Optional detailed message to show the user
+        '''
+        import json
+        Exception.__init__(self, json.dumps({'msg':msg, 'level':level,
+            'det_msg':det_msg, 'title':title}))
+        self.title, self.msg, self.det_msg = title, msg, det_msg
+        self.level = level
+
+
+# Ensure exception uses fully qualified name as this is used to detect it in
+# the GUI.
+ConversionUserFeedBack.__name__ = native_string_type('calibre.ebooks.conversion.ConversionUserFeedBack')
--- a/ebook_converter/ebooks/conversion/cli.py
+++ b/ebook_converter/ebooks/conversion/cli.py
@@ -0,0 +1,428 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Command line interface to conversion sub-system
+'''
+
+import sys, os, numbers
+from optparse import OptionGroup, Option
+from collections import OrderedDict
+
+from calibre.utils.config import OptionParser
+from calibre.utils.logging import Log
+from calibre.customize.conversion import OptionRecommendation
+from calibre import patheq
+from calibre.ebooks.conversion import ConversionUserFeedBack
+from calibre.utils.localization import localize_user_manual_link
+from polyglot.builtins import iteritems
+
+# Usage text passed to the option parser created by option_parser(): the
+# translated description followed by the localized user manual link.
+USAGE = '%prog ' + _('''\
+input_file output_file [options]
+
+Convert an e-book from one format to another.
+
+input_file is the input and output_file is the output. Both must be \
+specified as the first two arguments to the command.
+
+The output e-book format is guessed from the file extension of \
+output_file. output_file can also be of the special format .EXT where \
+EXT is the output file extension. In this case, the name of the output \
+file is derived from the name of the input file. Note that the filenames must \
+not start with a hyphen. Finally, if output_file has no extension, then \
+it is treated as a directory and an "open e-book" (OEB) consisting of HTML \
+files is written to that directory. These files are the files that would \
+normally have been passed to the output plugin.
+
+After specifying the input \
+and output file you can customize the conversion by specifying various \
+options. The available options depend on the input and output file types. \
+To get help on them specify the input and output file and then use the -h \
+option.
+
+For full documentation of the conversion system see
+''') + localize_user_manual_link('https://manual.calibre-ebook.com/conversion.html')
+
+# Names of the individual heuristic processing options; they are listed in
+# the HEURISTIC PROCESSING group built by add_pipeline_options().
+HEURISTIC_OPTIONS = ['markup_chapter_headings',
+                      'italicize_common_cases', 'fix_indents',
+                      'html_unwrap_factor', 'unwrap_lines',
+                      'delete_blank_paragraphs', 'format_scene_breaks',
+                      'dehyphenate', 'renumber_headings',
+                      'replace_scene_breaks']
+
+# Options that are exposed as a single --disable-<switch> command line switch
+# when their recommended value is True (see
+# option_recommendation_to_cli_option()).
+DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
+
+
+def print_help(parser, log):
+    '''
+    Print the help text of ``parser``.
+
+    :param parser: Option parser whose ``print_help()`` method is called
+    :param log: Not used by this function
+    '''
+    parser.print_help()
+
+
+def check_command_line_options(parser, args, log):
+    '''
+    Validate the two positional arguments and resolve them to paths.
+
+    :param parser: Option parser, only used to print the help on error
+    :param args: Full argument list; ``args[1]`` is the input and ``args[2]``
+                 the output
+    :param log: Log object used to report errors
+    :return: The tuple ``(input, output)``. The output is always an absolute
+             path. The input is an absolute path, except for an unreadable
+             ``.recipe`` argument, which is returned exactly as given.
+    :raises SystemExit: If the input or output is missing or looks like an
+                        option, or if the input cannot be read (and no help
+                        switch is present)
+    '''
+    too_few = len(args) < 3
+    if too_few or args[1].startswith('-') or args[2].startswith('-'):
+        print_help(parser, log)
+        log.error('\n\nYou must specify the input AND output files')
+        raise SystemExit(1)
+
+    source = os.path.abspath(args[1])
+    if not os.access(source, os.R_OK):
+        if source.endswith('.recipe'):
+            # An unreadable .recipe argument is passed on unchanged
+            source = args[1]
+        elif not ('-h' in args or '--help' in args):
+            log.error('Cannot read from', source)
+            raise SystemExit(1)
+
+    target = args[2]
+    # The special form .EXT: a bare extension without any path component
+    extension_only = (
+        target.startswith('.') and target[:2] not in {'..', '.'} and
+        '/' not in target and '\\' not in target)
+    if extension_only:
+        stem = os.path.splitext(os.path.basename(source))[0]
+        target = stem + target
+
+    return source, os.path.abspath(target)
+
+
+def option_recommendation_to_cli_option(add_option, rec):
+    '''
+    Create an optparse ``Option`` from an option recommendation and register it.
+
+    :param add_option: Callable that receives the created ``Option``
+    :param rec: The recommendation; ``rec.option`` supplies the name, the
+                switches, the help and the choices, ``rec.recommended_value``
+                the default
+    '''
+    opt = rec.option
+    switches = ['-'+opt.short_switch] if opt.short_switch else []
+    switches.append('--'+opt.long_switch)
+    attrs = dict(dest=opt.name, help=opt.help,
+                     choices=opt.choices, default=rec.recommended_value)
+    if isinstance(rec.recommended_value, type(True)):
+        # Boolean options become flags that flip the recommended value
+        attrs['action'] = 'store_false' if rec.recommended_value else \
+                          'store_true'
+    else:
+        # NOTE(review): every numbers.Integral value is also a numbers.Real,
+        # so the second check overrides the first and integer defaults end up
+        # with type 'float' as well -- confirm that this is intended.
+        if isinstance(rec.recommended_value, numbers.Integral):
+            attrs['type'] = 'int'
+        if isinstance(rec.recommended_value, numbers.Real):
+            attrs['type'] = 'float'
+
+    if opt.long_switch == 'verbose':
+        # verbose is a counting flag, so it must not carry a value type
+        attrs['action'] = 'count'
+        attrs.pop('type', '')
+    if opt.name == 'read_metadata_from_opf':
+        # additional switch for this option
+        switches.append('--from-opf')
+    if opt.name == 'transform_css_rules':
+        # On the command line this option takes a path to a rules file, so
+        # the help text is replaced
+        attrs['help'] = _(
+            'Path to a file containing rules to transform the CSS styles'
+            ' in this book. The easiest way to create such a file is to'
+            ' use the wizard for creating rules in the calibre GUI. Access'
+            ' it in the "Look & feel->Transform styles" section of the conversion'
+            ' dialog. Once you create the rules, you can use the "Export" button'
+            ' to save them to a file.'
+        )
+    if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True:
+        # Options that are on by default are only offered as --disable-*
+        switches = ['--disable-'+opt.long_switch]
+    add_option(Option(*switches, **attrs))
+
+
+def group_titles():
+    '''
+    Return the translated titles of the input and output option groups.
+
+    :return: The tuple ``(input title, output title)``
+    '''
+    return _('INPUT OPTIONS'), _('OUTPUT OPTIONS')
+
+
+def recipe_test(option, opt_str, value, parser):
+    '''
+    optparse callback that consumes up to two integers following the switch.
+
+    The integers are removed from ``parser.rargs``; missing ones default to
+    2. The resulting two-tuple is stored on ``parser.values`` under
+    ``option.dest``.
+
+    :param option: The option being processed, only ``dest`` is used
+    :param opt_str: The switch as seen on the command line (unused)
+    :param value: Must be None, as the option itself takes no argument
+    :param parser: The running parser, provides ``rargs`` and ``values``
+    '''
+    assert value is None
+
+    def looks_numeric(text):
+        try:
+            float(text)
+        except ValueError:
+            return False
+        return True
+
+    collected = []
+    for token in parser.rargs:
+        # stop on --foo like options
+        if token.startswith('--'):
+            break
+        # stop on -a, but not on -3 or -3.0
+        if len(token) > 1 and token.startswith('-') and not looks_numeric(token):
+            break
+        try:
+            number = int(token)
+        except (TypeError, ValueError, AttributeError):
+            break
+        collected.append(number)
+        if len(collected) == 2:
+            break
+    del parser.rargs[:len(collected)]
+
+    # Fill up the missing values with the default of 2
+    collected.extend([2] * (2 - len(collected)))
+
+    setattr(parser.values, option.dest, tuple(collected))
+
+
+def add_input_output_options(parser, plumber):
+    '''
+    Add the input and output specific options of ``plumber`` to ``parser``.
+
+    Each non-empty set of options gets its own option group, titled with the
+    corresponding entry of group_titles().
+
+    :param parser: The option parser to extend
+    :param plumber: Provides ``input_options``, ``output_options``,
+                    ``input_fmt`` and ``output_fmt``
+    '''
+    input_options, output_options = \
+                                plumber.input_options, plumber.output_options
+
+    def add_options(group, options):
+        # group is the add_option method of the target option group
+        for opt in options:
+            if plumber.input_fmt == 'recipe' and opt.option.long_switch == 'test':
+                # For recipes --test is parsed by the recipe_test() callback
+                group(Option('--test', dest='test', action='callback', callback=recipe_test))
+            else:
+                option_recommendation_to_cli_option(group, opt)
+
+    if input_options:
+        title = group_titles()[0]
+        io = OptionGroup(parser, title, _('Options to control the processing'
+                          ' of the input %s file')%plumber.input_fmt)
+        add_options(io.add_option, input_options)
+        parser.add_option_group(io)
+
+    if output_options:
+        title = group_titles()[1]
+        oo = OptionGroup(parser, title, _('Options to control the processing'
+                          ' of the output %s')%plumber.output_fmt)
+        add_options(oo.add_option, output_options)
+        parser.add_option_group(oo)
+
+
+def add_pipeline_options(parser, plumber):
+    '''
+    Add the format independent conversion options to ``parser``.
+
+    The options are organised in an ordered mapping of group title to
+    ``(description, option names)``. Options of the group with the empty
+    title are added directly to the parser, all others to an option group of
+    their own. Only options whose recommendation level is below ``HIGH`` are
+    added.
+
+    :param parser: The option parser to extend
+    :param plumber: Provides ``metadata_option_names`` and
+                    ``get_option_by_name()``
+    '''
+    groups = OrderedDict((
+              ('' , ('',
+                    [
+                     'input_profile',
+                     'output_profile',
+                     ]
+                    )),
+              (_('LOOK AND FEEL') , (
+                  _('Options to control the look and feel of the output'),
+                  [
+                      'base_font_size', 'disable_font_rescaling',
+                      'font_size_mapping', 'embed_font_family',
+                      'subset_embedded_fonts', 'embed_all_fonts',
+                      'line_height', 'minimum_line_height',
+                      'linearize_tables',
+                      'extra_css', 'filter_css', 'transform_css_rules', 'expand_css',
+                      'smarten_punctuation', 'unsmarten_punctuation',
+                      'margin_top', 'margin_left', 'margin_right',
+                      'margin_bottom', 'change_justification',
+                      'insert_blank_line', 'insert_blank_line_size',
+                      'remove_paragraph_spacing',
+                      'remove_paragraph_spacing_indent_size',
+                      'asciiize', 'keep_ligatures',
+                  ]
+                  )),
+
+              (_('HEURISTIC PROCESSING') , (
+                  _('Modify the document text and structure using common'
+                     ' patterns. Disabled by default. Use %(en)s to enable. '
+                     ' Individual actions can be disabled with the %(dis)s options.')
+                  % dict(en='--enable-heuristics', dis='--disable-*'),
+                  ['enable_heuristics'] + HEURISTIC_OPTIONS
+                  )),
+
+              (_('SEARCH AND REPLACE') , (
+                 _('Modify the document text and structure using user defined patterns.'),
+                 [
+                     'sr1_search', 'sr1_replace',
+                     'sr2_search', 'sr2_replace',
+                     'sr3_search', 'sr3_replace',
+                     'search_replace',
+                 ]
+              )),
+
+              (_('STRUCTURE DETECTION') , (
+                  _('Control auto-detection of document structure.'),
+                  [
+                      'chapter', 'chapter_mark',
+                      'prefer_metadata_cover', 'remove_first_image',
+                      'insert_metadata', 'page_breaks_before',
+                      'remove_fake_margins', 'start_reading_at',
+                  ]
+                  )),
+
+              (_('TABLE OF CONTENTS') , (
+                  _('Control the automatic generation of a Table of Contents. By '
+                  'default, if the source file has a Table of Contents, it will '
+                  'be used in preference to the automatically generated one.'),
+                  [
+                    'level1_toc', 'level2_toc', 'level3_toc',
+                    'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
+                    'use_auto_toc', 'toc_filter', 'duplicate_links_in_toc',
+                  ]
+                  )),
+
+              (_('METADATA') , (_('Options to set metadata in the output'),
+                            plumber.metadata_option_names + ['read_metadata_from_opf'],
+                            )),
+              (_('DEBUG'), (_('Options to help with debugging the conversion'),
+                        [
+                         'verbose',
+                         'debug_pipeline',
+                         ])),
+
+              ))
+
+    for group, (desc, options) in iteritems(groups):
+        if group:
+            # From here on group is the OptionGroup object, not the title
+            group = OptionGroup(parser, group, desc)
+            parser.add_option_group(group)
+        # The untitled group (still the empty string) goes on the parser itself
+        add_option = group.add_option if group != '' else parser.add_option
+
+        for name in options:
+            rec = plumber.get_option_by_name(name)
+            if rec.level < rec.HIGH:
+                option_recommendation_to_cli_option(add_option, rec)
+
+
+def option_parser():
+    '''
+    Create the basic option parser.
+
+    :return: An ``OptionParser`` using ``USAGE`` as usage text, with only the
+             ``--list-recipes`` option added here
+    '''
+    parser = OptionParser(usage=USAGE)
+    parser.add_option('--list-recipes', default=False, action='store_true',
+            help=_('List builtin recipe names. You can create an e-book from '
+                'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
+                'output.epub'))
+    return parser
+
+
+class ProgressBar(object):
+    '''
+    Progress reporter that writes ``<percent>% <message>`` lines to a log.
+    '''
+
+    def __init__(self, log):
+        # log: callable that receives the formatted progress line
+        self.log = log
+
+    def __call__(self, frac, msg=''):
+        '''
+        Report progress.
+
+        :param frac: Fraction done, it is multiplied by 100 and truncated
+        :param msg: Message to show; nothing is logged when it is empty
+        '''
+        if not msg:
+            return
+        self.log('%d%% %s'%(int(frac*100), msg))
+
+
+def create_option_parser(args, log):
+    '''
+    Create the option parser and the conversion plumber for ``args``.
+
+    ``--version`` and ``--list-recipes`` are handled here: the requested
+    information is logged and the program exits.
+
+    :param args: Full argument list, ``args[0]`` being the program name
+    :param log: Log object, also handed to the plumber
+    :return: The tuple ``(parser, plumber)``
+    :raises SystemExit: After handling ``--version`` or ``--list-recipes``,
+                        when fewer than three arguments are given (status 0
+                        if help was requested, 1 otherwise) or when the input
+                        and output arguments are not acceptable
+    :raises ValueError: If input and output are the same file
+    '''
+    if '--version' in args:
+        from calibre.constants import __appname__, __version__, __author__
+        log(os.path.basename(args[0]), '('+__appname__, __version__+')')
+        log('Created by:', __author__)
+        raise SystemExit(0)
+    if '--list-recipes' in args:
+        from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles
+        log('Available recipes:')
+        titles = sorted(get_builtin_recipe_titles())
+        for title in titles:
+            try:
+                log('\t'+title)
+            except Exception:
+                # Fall back to the repr of titles that cannot be logged as
+                # they are. Only Exception is caught, so that Ctrl-C and
+                # SystemExit are not swallowed.
+                log('\t'+repr(title))
+        log('%d recipes available'%len(titles))
+        raise SystemExit(0)
+
+    parser = option_parser()
+    if len(args) < 3:
+        print_help(parser, log)
+        if any(x in args for x in ('-h', '--help')):
+            raise SystemExit(0)
+        else:
+            raise SystemExit(1)
+
+    input, output = check_command_line_options(parser, args, log)
+
+    # Imported here rather than at module level
+    from calibre.ebooks.conversion.plumber import Plumber
+
+    reporter = ProgressBar(log)
+    if patheq(input, output):
+        raise ValueError('Input file is the same as the output file')
+
+    plumber = Plumber(input, output, log, reporter)
+    # The available options depend on the input and output formats
+    add_input_output_options(parser, plumber)
+    add_pipeline_options(parser, plumber)
+
+    return parser, plumber
+
+
+def abspath(x):
+    '''
+    Return the absolute, user-expanded form of the path ``x``.
+
+    Values starting with ``http:`` or ``https:`` are returned unchanged.
+    '''
+    if x.startswith(('http:', 'https:')):
+        return x
+    expanded = os.path.expanduser(x)
+    return os.path.abspath(expanded)
+
+
+def escape_sr_pattern(exp):
+    '''
+    Replace every newline in ``exp`` with the character U+E123.
+
+    read_sr_patterns() performs the reverse replacement on the search
+    patterns it reads.
+    '''
+    return exp.replace('\n', '\ue123')
+
+
+def read_sr_patterns(path, log=None):
+    '''
+    Read search and replace patterns from the UTF-8 encoded file ``path``.
+
+    The file consists of pairs of lines: a regular expression followed by its
+    replacement. Blank lines in front of a regular expression are skipped,
+    and U+E123 in a regular expression stands for a newline (see
+    escape_sr_pattern()). A regular expression on the last line, without a
+    replacement line after it, is ignored.
+
+    :param path: Path of the file to read
+    :param log: Optional log object. If given, an invalid regular expression
+                is reported through ``log.error`` before exiting.
+    :return: JSON encoded list of ``[search, replace]`` pairs
+    :raises SystemExit: On an invalid regular expression, if ``log`` is given
+    :raises ValueError: On an invalid regular expression, if ``log`` is None
+    '''
+    import json, re
+    pats = []
+    with open(path, 'rb') as f:
+        lines = f.read().decode('utf-8').splitlines()
+    pat = None
+    for line in lines:
+        if pat is None:
+            if not line.strip():
+                continue
+            line = line.replace('\ue123', '\n')
+            try:
+                re.compile(line)
+            except Exception:
+                # Only Exception is caught: a KeyboardInterrupt or SystemExit
+                # must not be reported as an invalid regular expression
+                msg = 'Invalid regular expression: %r from file: %r'%(
+                        line, path)
+                if log is not None:
+                    log.error(msg)
+                    raise SystemExit(1)
+                else:
+                    raise ValueError(msg)
+            pat = line
+        else:
+            # The line after a search pattern is its replacement, taken as is
+            pats.append((pat, line))
+            pat = None
+    return json.dumps(pats)
+
+
+def main(args=sys.argv):
+    '''
+    Entry point of the command line conversion interface.
+
+    Parses ``args``, passes the resulting option values to the plumber as
+    recommendations of level ``HIGH`` and runs the conversion.
+
+    :param args: Full argument list, including the program name
+    :return: 0 on success, 1 if there are unknown extra arguments or if a CSS
+             transform rule fails validation
+    :raises SystemExit: With status 1 if the conversion raises
+                        ``ConversionUserFeedBack``; also raised by the
+                        helpers that parse the command line
+    '''
+    log = Log()
+    parser, plumber = create_option_parser(args, log)
+    opts, leftover_args = parser.parse_args(args)
+    if len(leftover_args) > 3:
+        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
+        return 1
+    for x in ('read_metadata_from_opf', 'cover'):
+        if getattr(opts, x, None) is not None:
+            setattr(opts, x, abspath(getattr(opts, x)))
+    if opts.search_replace:
+        # On the command line this option is the path of a patterns file
+        opts.search_replace = read_sr_patterns(opts.search_replace, log)
+    if opts.transform_css_rules:
+        # Likewise a path: replace it with the rules read from that file
+        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
+        with open(opts.transform_css_rules, 'rb') as tcr:
+            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
+            for rule in rules:
+                title, msg = validate_rule(rule)
+                if title and msg:
+                    log.error('Failed to parse CSS transform rules')
+                    log.error(title)
+                    log.error(msg)
+                    return 1
+
+    recommendations = [(n.dest, getattr(opts, n.dest),
+                        OptionRecommendation.HIGH)
+                                        for n in parser.options_iter()
+                                        if n.dest]
+    plumber.merge_ui_recommendations(recommendations)
+
+    try:
+        plumber.run()
+    except ConversionUserFeedBack as e:
+        # Report with the log method matching the level of the feedback
+        ll = {'info': log.info, 'warn': log.warn,
+                'error':log.error}.get(e.level, log.info)
+        ll(e.title)
+        if e.det_msg:
+            # The attribute is det_msg; the previous spelling detmsg raised
+            # an AttributeError whenever a detailed message was present
+            log.debug(e.det_msg)
+        ll(e.msg)
+        raise SystemExit(1)
+
+    log(_('Output saved to'), ' ', plumber.output)
+
+    return 0
+
+
+def manual_index_strings():
+    '''
+    Return the translated introduction to the list of conversion options.
+
+    The returned text contains a single ``%s`` placeholder, which is left
+    for the caller to fill in.
+    '''
+    return _('''\
+The options and default values for the options change depending on both the
+input and output formats, so you should always check with::
+
+    %s
+
+Below are the options that are common to all conversion, followed by the
+options specific to every input and output format.''')
+
+
+if __name__ == '__main__':
+    sys.exit(main())
--- a/ebook_converter/ebooks/conversion/plugins/init.py
+++ b/ebook_converter/ebooks/conversion/plugins/init.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
--- a/ebook_converter/ebooks/conversion/plugins/azw4_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/azw4_input.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+# Input plugin for AZW4 files; the content is extracted into the current
+# working directory.
+class AZW4Input(InputFormatPlugin):
+
+    name        = 'AZW4 Input'
+    author      = 'John Schember'
+    description = 'Convert AZW4 to HTML'
+    file_types  = {'azw4'}
+    commit_name = 'azw4_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Extract the content of the AZW4 file in ``stream``.
+
+        :param stream: Open input file, read through ``PdbHeaderReader``
+        :param options: Conversion options, passed on to the reader
+        :param file_ext: Unused
+        :param log: Log object, passed on to the reader
+        :param accelerators: Unused
+        :return: Whatever ``Reader.extract_content()`` returns for the
+                 current working directory (bound to the name ``opf`` in the
+                 original code)
+        '''
+        from calibre.ebooks.pdb.header import PdbHeaderReader
+        from calibre.ebooks.azw4.reader import Reader
+
+        pdb_header = PdbHeaderReader(stream)
+        azw4_reader = Reader(pdb_header, stream, log, options)
+        return azw4_reader.extract_content(getcwd())
--- a/ebook_converter/ebooks/conversion/plugins/chm_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/chm_input.py
@@ -0,0 +1,202 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+''' CHM File decoding support '''
+__license__ = 'GPL v3'
+__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
+                 ' and Alex Bramley <a.bramley at gmail.com>.'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from calibre.constants import filesystem_encoding
+from polyglot.builtins import unicode_type, as_bytes
+
+
+class CHMInput(InputFormatPlugin):
+
+    name        = 'CHM Input'
+    author      = 'Kovid Goyal and Alex Bramley'
+    description = 'Convert CHM files to OEB'
+    file_types  = {'chm'}
+    commit_name = 'chm_input'
+
+    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
+        '''
+        Extract the CHM file ``chm_path`` into ``output_dir``.
+
+        The reader is kept in ``self._chm_reader`` for later use.
+
+        :param output_dir: Directory the content is extracted to
+        :param chm_path: Path of the CHM file
+        :param no_images: Not used by this method
+        :param log: Log object
+        :param debug_dump: Passed on to the reader's ``extract_content()``
+        :return: The ``hhc_path`` attribute of the reader
+        '''
+        from calibre.ebooks.chm.reader import CHMReader
+        log.debug('Opening CHM file')
+        rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
+        log.debug('Extracting CHM to %s' % output_dir)
+        rdr.extract_content(output_dir, debug_dump=debug_dump)
+        self._chm_reader = rdr
+        return rdr.hhc_path
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''
+        Convert the CHM file behind ``stream`` into an OEB book.
+
+        The CHM is extracted into a temporary directory, an HTML root file is
+        generated from its table of contents file and the result is handed to
+        the HTML input plugin.
+
+        :param stream: Open CHM file; only its ``name`` is used, the stream
+                       itself is closed
+        :param options: Conversion options. They are modified here: the
+                        options of the HTML input plugin are set to their
+                        recommended values and ``input_encoding`` is set to
+                        ``'utf-8'``.
+        :param file_ext: Unused
+        :param log: Log object
+        :param accelerators: Unused
+        :return: The OEB book created by the HTML input plugin
+        '''
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
+        from calibre.customize.ui import plugin_for_input_format
+        self.opts = options
+
+        log.debug('Processing CHM...')
+        with TemporaryDirectory('_chm2oeb') as tdir:
+            if not isinstance(tdir, unicode_type):
+                tdir = tdir.decode(filesystem_encoding)
+            html_input = plugin_for_input_format('html')
+            for opt in html_input.options:
+                setattr(options, opt.option.name, opt.recommended_value)
+            no_images = False  # options.no_images
+            chm_name = stream.name
+            # chm_data = stream.read()
+
+            # closing stream so CHM can be opened by external library
+            stream.close()
+            log.debug('tdir=%s' % tdir)
+            log.debug('stream.name=%s' % stream.name)
+            debug_dump = False
+            odi = options.debug_pipeline
+            if odi:
+                debug_dump = os.path.join(odi, 'input')
+            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
+                    debug_dump=debug_dump)
+            mainpath = os.path.join(tdir, mainname)
+
+            try:
+                metadata = get_metadata_from_reader(self._chm_reader)
+            except Exception:
+                log.exception('Failed to read metadata, using filename')
+                from calibre.ebooks.metadata.book.base import Metadata
+                metadata = Metadata(os.path.basename(chm_name))
+            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
+            self._chm_reader.CloseCHM()
+            # print((tdir, mainpath))
+            # from calibre import ipython
+            # ipython()
+
+            # debug_pipeline is switched off while the HTML input plugin
+            # runs and restored from odi afterwards
+            options.debug_pipeline = None
+            options.input_encoding = 'utf-8'
+            uenc = encoding
+            # Files listed in re_encoded_files are read as UTF-8
+            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
+                uenc = 'utf-8'
+            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
+            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
+            options.debug_pipeline = odi
+            if toc.count() > 1:
+                # In this case the first spine item is the generated HTML
+                # root: turn it into the table of contents and drop it from
+                # the book
+                oeb.toc = self.parse_html_toc(oeb.spine[0])
+                oeb.manifest.remove(oeb.spine[0])
+                oeb.auto_generated_toc = False
+        return oeb
+
+    def parse_html_toc(self, item):
+        '''
+        Build a table of contents from the nested ``div`` structure of ``item``.
+
+        Starting at the first match of ``//h:div[1]`` in ``item.data``, every
+        child ``div`` becomes an entry that takes its text and its ``href``
+        from the first ``a`` element of that ``div``; nested ``div`` elements
+        become child entries.
+
+        :param item: Object whose ``data`` attribute is the parsed document
+        :return: The ``TOC`` that was built
+        '''
+        from calibre.ebooks.oeb.base import TOC, XPath
+        dx = XPath('./h:div')
+        ax = XPath('./h:a[1]')
+
+        def do_node(parent, div):
+            for child in dx(div):
+                a = ax(child)[0]
+                c = parent.add(a.text, a.attrib['href'])
+                do_node(c, child)
+
+        toc = TOC()
+        root = XPath('//h:div[1]')(item.data)[0]
+        do_node(toc, root)
+        return toc
+
+    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
+        '''
+        Create an OEB book from ``htmlpath`` with the HTML input plugin.
+
+        Sets ``opts.breadth_first`` to True before calling the plugin.
+
+        :param htmlpath: Path of the HTML file to start from
+        :param basedir: Passed to ``HTMLInput.create_oebbook()``
+        :param opts: Conversion options
+        :param log: Log object
+        :param mi: Metadata passed to ``HTMLInput.create_oebbook()``
+        :return: The book returned by ``HTMLInput.create_oebbook()``
+        '''
+        # use HTMLInput plugin to generate book
+        from calibre.customize.builtins import HTMLInput
+        opts.breadth_first = True
+        htmlinput = HTMLInput(None)
+        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
+        return oeb
+
+    def _create_html_root(self, hhcpath, log, encoding):
+        '''
+        Write an HTML root file next to the file ``hhcpath``.
+
+        The file at ``hhcpath`` is decoded with ``encoding``, parsed and
+        turned into a table of contents. If that contains more than one
+        entry, the HTML file consists of nested ``div`` elements, each
+        holding a link to the resource of the entry. Otherwise the decoded
+        content of ``hhcpath`` is written.
+
+        :param hhcpath: Path of the file to read
+        :param log: Log object
+        :param encoding: Encoding used to decode the file
+        :return: The tuple ``(path of the HTML file, table of contents)``
+        '''
+        from lxml import html
+        from polyglot.urllib import unquote as _unquote
+        from calibre.ebooks.oeb.base import urlquote
+        from calibre.ebooks.chardet import xml_to_unicode
+        hhcdata = self._read_file(hhcpath)
+        hhcdata = hhcdata.decode(encoding)
+        hhcdata = xml_to_unicode(hhcdata, verbose=True,
+                            strip_encoding_pats=True, resolve_entities=True)[0]
+        hhcroot = html.fromstring(hhcdata)
+        toc = self._process_nodes(hhcroot)
+        # print("=============================")
+        # print("Printing hhcroot")
+        # print(etree.tostring(hhcroot, pretty_print=True))
+        # print("=============================")
+        log.debug('Found %d section nodes' % toc.count())
+        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
+        base = os.path.dirname(os.path.abspath(htmlpath))
+
+        def unquote(x):
+            # URL-unquote x via its UTF-8 encoded form
+            if isinstance(x, unicode_type):
+                x = x.encode('utf-8')
+            return _unquote(x).decode('utf-8')
+
+        def unquote_path(x):
+            # Use the unquoted form only if it exists on disk while the
+            # original form does not
+            y = unquote(x)
+            if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
+                x = y
+            return x
+
+        def donode(item, parent, base, subpath):
+            # Recursively append a DIV with a link for every titled child of
+            # item to parent. DIV and A are imported further below, before
+            # this function is called.
+            for child in item:
+                title = child.title
+                if not title:
+                    continue
+                raw = unquote_path(child.href or '')
+                rsrcname = os.path.basename(raw)
+                rsrcpath = os.path.join(subpath, rsrcname)
+                # Fall back to the href itself if the file is not in subpath
+                if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
+                    rsrcpath = raw
+
+                # A path containing % is left as it is, anything else is
+                # quoted
+                if '%' not in rsrcpath:
+                    rsrcpath = urlquote(rsrcpath)
+                if not raw:
+                    rsrcpath = ''
+                c = DIV(A(title, href=rsrcpath))
+                donode(child, c, base, subpath)
+                parent.append(c)
+
+        with open(htmlpath, 'wb') as f:
+            if toc.count() > 1:
+                from lxml.html.builder import HTML, BODY, DIV, A
+                path0 = toc[0].href
+                path0 = unquote_path(path0)
+                # Resources are looked up in the directory of the first entry
+                subpath = os.path.dirname(path0)
+                base = os.path.dirname(f.name)
+                root = DIV()
+                donode(toc, root, base, subpath)
+                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
+                                   pretty_print=True)
+                f.write(raw)
+            else:
+                f.write(as_bytes(hhcdata))
+        return htmlpath, toc
+
+    def _read_file(self, name):
+        '''
+        Return the content of the file ``name``, read in binary mode.
+        '''
+        # NOTE(review): lopen is not defined or imported in this module; it
+        # is presumably provided as a builtin elsewhere -- confirm.
+        with lopen(name, 'rb') as f:
+            data = f.read()
+        return data
+
+    def add_node(self, node, toc, ancestor_map):
+        '''
+        Add a table of contents entry for the ``object`` element ``node``.
+
+        Only nodes whose ``type`` attribute matches ``text/sitemap`` are
+        processed. The entry is added below the entry recorded in
+        ``ancestor_map`` for the ``object`` of the enclosing list item, or to
+        ``toc`` if there is none. Title and href come from the ``param``
+        children named ``name`` and ``local``; ``param`` elements lacking a
+        ``name`` or ``value`` attribute are treated as having an empty one
+        instead of raising ``KeyError``.
+
+        :param node: The ``object`` element to process
+        :param toc: Table of contents used when no parent entry is known
+        :param ancestor_map: Maps processed nodes to their entries; updated
+                             with the entry created here
+        '''
+        from calibre.ebooks.chm.reader import match_string
+        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
+            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
+            parent = p[0] if p else None
+            toc = ancestor_map.get(parent, toc)
+            title = href = ''
+            for param in node.xpath('./param'):
+                # Read the attributes defensively: the markup is not
+                # guaranteed to carry them
+                pname = param.attrib.get('name', '')
+                if match_string(pname, 'name'):
+                    title = param.attrib.get('value', '')
+                elif match_string(pname, 'local'):
+                    href = param.attrib.get('value', '')
+            child = toc.add(title or _('Unknown'), href)
+            ancestor_map[node] = child
+
+    def _process_nodes(self, root):
+        '''
+        Build a table of contents from all ``object`` elements below ``root``.
+
+        :param root: Parsed document supporting ``xpath()``
+        :return: The ``TOC`` filled by calling ``add_node()`` for every
+                 ``object`` element
+        '''
+        from calibre.ebooks.oeb.base import TOC
+        toc = TOC()
+        # Maps already processed object nodes to their TOC entries
+        ancestor_map = {}
+        for node in root.xpath('//object'):
+            self.add_node(node, toc, ancestor_map)
+        return toc
--- a/ebook_converter/ebooks/conversion/plugins/comic_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/comic_input.py
@@ -0,0 +1,310 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Based on ideas from comiclrf created by FangornUK.
+'''
+
+import shutil, textwrap, codecs, os
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre import CurrentDir
+from calibre.ptempfile import PersistentTemporaryDirectory
+from polyglot.builtins import getcwd, map
+
+
+class ComicInput(InputFormatPlugin):
+
+    name        = 'Comic Input'
+    author      = 'Kovid Goyal'
+    description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
+    file_types  = {'cbz', 'cbr', 'cbc'}
+    is_image_collection = True
+    commit_name = 'comic_input'
+    core_usage = -1
+
+    options = {
+        OptionRecommendation(name='colors', recommended_value=0,
+            help=_('Reduce the number of colors used in the image. This works only'
+                   ' if you choose the PNG output format. It is useful to reduce file sizes.'
+                   ' Set to zero to turn off. Maximum value is 256. It is off by default.')),
+        OptionRecommendation(name='dont_normalize', recommended_value=False,
+            help=_('Disable normalize (improve contrast) color range '
+            'for pictures. Default: False')),
+        OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
+            help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
+        OptionRecommendation(name='dont_sharpen', recommended_value=False,
+            help=_('Disable sharpening.')),
+        OptionRecommendation(name='disable_trim', recommended_value=False,
+            help=_('Disable trimming of comic pages. For some comics, '
+                     'trimming might remove content as well as borders.')),
+        OptionRecommendation(name='landscape', recommended_value=False,
+            help=_("Don't split landscape images into two portrait images")),
+        OptionRecommendation(name='wide', recommended_value=False,
+            help=_("Keep aspect ratio and scale image using screen height as "
+            "image width for viewing in landscape mode.")),
+        OptionRecommendation(name='right2left', recommended_value=False,
+              help=_('Used for right-to-left publications like manga. '
+              'Causes landscape pages to be split into portrait pages '
+              'from right to left.')),
+        OptionRecommendation(name='despeckle', recommended_value=False,
+              help=_('Enable Despeckle. Reduces speckle noise. '
+              'May greatly increase processing time.')),
+        OptionRecommendation(name='no_sort', recommended_value=False,
+              help=_("Don't sort the files found in the comic "
+              "alphabetically by name. Instead use the order they were "
+              "added to the comic.")),
+        OptionRecommendation(name='output_format', choices=['png', 'jpg'],
+            recommended_value='png', help=_('The format that images in the created e-book '
+                'are converted to. You can experiment to see which format gives '
+                'you optimal size and look on your device.')),
+        OptionRecommendation(name='no_process', recommended_value=False,
+              help=_("Apply no processing to the image")),
+        OptionRecommendation(name='dont_grayscale', recommended_value=False,
+            help=_('Do not convert the image to grayscale (black and white)')),
+        OptionRecommendation(name='comic_image_size', recommended_value=None,
+            help=_('Specify the image size as widthxheight pixels. Normally,'
+                ' an image size is automatically calculated from the output '
+                'profile, this option overrides it.')),
+        OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
+            help=_('When converting a CBC do not add links to each page to'
+                ' the TOC. Note this only applies if the TOC has more than one'
+                ' section')),
+        }
+
+    recommendations = {
+        ('margin_left', 0, OptionRecommendation.HIGH),
+        ('margin_top',  0, OptionRecommendation.HIGH),
+        ('margin_right', 0, OptionRecommendation.HIGH),
+        ('margin_bottom', 0, OptionRecommendation.HIGH),
+        ('insert_blank_line', False, OptionRecommendation.HIGH),
+        ('remove_paragraph_spacing',  False, OptionRecommendation.HIGH),
+        ('change_justification', 'left', OptionRecommendation.HIGH),
+        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
+        ('chapter', None, OptionRecommendation.HIGH),
+        ('page_breaks_brefore', None, OptionRecommendation.HIGH),
+        ('use_auto_toc', False, OptionRecommendation.HIGH),
+        ('page_breaks_before', None, OptionRecommendation.HIGH),
+        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
+        ('linearize_tables', False, OptionRecommendation.HIGH),
+        }
+
+    def get_comics_from_collection(self, stream):
+        from calibre.libunzip import extract as zipextract
+        tdir = PersistentTemporaryDirectory('_comic_collection')
+        zipextract(stream, tdir)
+        comics = []
+        with CurrentDir(tdir):
+            if not os.path.exists('comics.txt'):
+                raise ValueError((
+                    '%s is not a valid comic collection'
+                    ' no comics.txt was found in the file')
+                        %stream.name)
+            with open('comics.txt', 'rb') as f:
+                raw = f.read()
+            if raw.startswith(codecs.BOM_UTF16_BE):
+                raw = raw.decode('utf-16-be')[1:]
+            elif raw.startswith(codecs.BOM_UTF16_LE):
+                raw = raw.decode('utf-16-le')[1:]
+            elif raw.startswith(codecs.BOM_UTF8):
+                raw = raw.decode('utf-8')[1:]
+            else:
+                raw = raw.decode('utf-8')
+            for line in raw.splitlines():
+                line = line.strip()
+                if not line:
+                    continue
+                fname, title = line.partition(':')[0], line.partition(':')[-1]
+                fname = fname.replace('#', '_')
+                fname = os.path.join(tdir, *fname.split('/'))
+                if not title:
+                    title = os.path.basename(fname).rpartition('.')[0]
+                if os.access(fname, os.R_OK):
+                    comics.append([title, fname])
+        if not comics:
+            raise ValueError('%s has no comics'%stream.name)
+        return comics
+
+    def get_pages(self, comic, tdir2):
+        from calibre.ebooks.comic.input import (extract_comic,  process_pages,
+                find_pages)
+        tdir  = extract_comic(comic)
+        new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
+                verbose=self.opts.verbose)
+        thumbnail = None
+        if not new_pages:
+            raise ValueError('Could not find any pages in the comic: %s'
+                    %comic)
+        if self.opts.no_process:
+            n2 = []
+            for i, page in enumerate(new_pages):
+                n2.append(os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page))))
+                shutil.copyfile(page, n2[-1])
+            new_pages = n2
+        else:
+            new_pages, failures = process_pages(new_pages, self.opts,
+                    self.report_progress, tdir2)
+            if failures:
+                self.log.warning('Could not process the following pages '
+                '(run with --verbose to see why):')
+                for f in failures:
+                    self.log.warning('\t', f)
+            if not new_pages:
+                raise ValueError('Could not find any valid pages in comic: %s'
+                        % comic)
+            thumbnail = os.path.join(tdir2,
+                    'thumbnail.'+self.opts.output_format.lower())
+            if not os.access(thumbnail, os.R_OK):
+                thumbnail = None
+        return new_pages
+
+    def get_images(self):
+        # Return the list of page image paths stored in self._images. In this
+        # class that attribute is only assigned by convert(), so calling this
+        # before convert() would fail with AttributeError.
+        return self._images
+
+    def convert(self, stream, opts, file_ext, log, accelerators):
+        from calibre.ebooks.metadata import MetaInformation
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.metadata.toc import TOC
+
+        self.opts, self.log= opts, log
+        if file_ext == 'cbc':
+            comics_ = self.get_comics_from_collection(stream)
+        else:
+            comics_ = [['Comic', os.path.abspath(stream.name)]]
+        stream.close()
+        comics = []
+        for i, x in enumerate(comics_):
+            title, fname = x
+            cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
+            cdir = os.path.abspath(cdir)
+            if not os.path.exists(cdir):
+                os.makedirs(cdir)
+            pages = self.get_pages(fname, cdir)
+            if not pages:
+                continue
+            if self.for_viewer:
+                comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
+            else:
+                wrappers = self.create_wrappers(pages)
+                comics.append((title, pages, wrappers))
+
+        if not comics:
+            raise ValueError('No comic pages found in %s'%stream.name)
+
+        mi  = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
+            [_('Unknown')])
+        opf = OPFCreator(getcwd(), mi)
+        entries = []
+
+        def href(x):
+            if len(comics) == 1:
+                return os.path.basename(x)
+            return '/'.join(x.split(os.sep)[-2:])
+
+        cover_href = None
+        for comic in comics:
+            pages, wrappers = comic[1:]
+            page_entries = [(x, None) for x in map(href, pages)]
+            entries += [(w, None) for w in map(href, wrappers)] + page_entries
+            if cover_href is None and page_entries:
+                cover_href = page_entries[0][0]
+        opf.create_manifest(entries)
+        spine = []
+        for comic in comics:
+            spine.extend(map(href, comic[2]))
+        self._images = []
+        for comic in comics:
+            self._images.extend(comic[1])
+        opf.create_spine(spine)
+        if self.for_viewer and cover_href:
+            opf.guide.set_cover(cover_href)
+        toc = TOC()
+        if len(comics) == 1:
+            wrappers = comics[0][2]
+            for i, x in enumerate(wrappers):
+                toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
+                        play_order=i)
+        else:
+            po = 0
+            for comic in comics:
+                po += 1
+                wrappers = comic[2]
+                stoc = toc.add_item(href(wrappers[0]),
+                        None, comic[0], play_order=po)
+                if not opts.dont_add_comic_pages_to_toc:
+                    for i, x in enumerate(wrappers):
+                        stoc.add_item(href(x), None,
+                                _('Page')+' %d'%(i+1), play_order=po)
+                        po += 1
+        opf.set_toc(toc)
+        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
+            opf.render(m, n, 'toc.ncx')
+        return os.path.abspath('metadata.opf')
+
+    def create_wrappers(self, pages):
+        from calibre.ebooks.oeb.base import XHTML_NS
+        wrappers = []
+        WRAPPER = textwrap.dedent('''\
+        <html xmlns="%s">
+            <head>
+                <meta charset="utf-8"/>
+                <title>Page #%d</title>
+                <style type="text/css">
+                    @page { margin:0pt; padding: 0pt}
+                    body { margin: 0pt; padding: 0pt}
+                    div { text-align: center }
+                </style>
+            </head>
+            <body>
+                <div>
+                    <img src="%s" alt="comic page #%d" />
+                </div>
+            </body>
+        </html>
+        ''')
+        dir = os.path.dirname(pages[0])
+        for i, page in enumerate(pages):
+            wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
+            page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
+            with open(page, 'wb') as f:
+                f.write(wrapper.encode('utf-8'))
+            wrappers.append(page)
+        return wrappers
+
+    def create_viewer_wrapper(self, pages):
+        from calibre.ebooks.oeb.base import XHTML_NS
+
+        def page(src):
+            return '<img src="{}"></img>'.format(os.path.basename(src))
+
+        pages = '\n'.join(map(page, pages))
+        base = os.path.dirname(pages[0])
+        wrapper = '''
+        <html xmlns="%s">
+            <head>
+                <meta charset="utf-8"/>
+                <style type="text/css">
+                html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
+                img {
+                    width: 100%%; height: 100%%;
+                    object-fit: contain;
+                    margin-left: auto; margin-right: auto;
+                    max-width: 100vw; max-height: 100vh;
+                    top: 50vh; transform: translateY(-50%%);
+                    position: relative;
+                    page-break-after: always;
+                }
+                </style>
+            </head>
+            <body>
+            %s
+            </body>
+        </html>
+        ''' % (XHTML_NS, pages)
+        path = os.path.join(base, 'wrapper.xhtml')
+        with open(path, 'wb') as f:
+            f.write(wrapper.encode('utf-8'))
+        return path
--- a/ebook_converter/ebooks/conversion/plugins/djvu_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/djvu_input.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
+__docformat__ = 'restructuredtext en'
+
+import os
+from io import BytesIO
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class DJVUInput(InputFormatPlugin):
+
+    name        = 'DJVU Input'
+    author      = 'Anthon van der Neut'
+    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
+    file_types  = {'djvu', 'djv'}
+    commit_name = 'djvu_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.txt.processor import convert_basic
+
+        stdout = BytesIO()
+        from calibre.ebooks.djvu.djvu import DJVUFile
+        x = DJVUFile(stream)
+        x.get_text(stdout)
+        raw_text = stdout.getvalue()
+        if not raw_text:
+            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
+                    ' calibre only supports conversion of DJVU files with actual text in them.')
+
+        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
+            b'\037', b'\n\n'))
+        # Run the HTMLized text through the html processing plugin.
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(options, opt.option.name, opt.recommended_value)
+        options.input_encoding = 'utf-8'
+        base = getcwd()
+        htmlfile = os.path.join(base, 'index.html')
+        c = 0
+        while os.path.exists(htmlfile):
+            c += 1
+            htmlfile = os.path.join(base, 'index%d.html'%c)
+        with open(htmlfile, 'wb') as f:
+            f.write(html.encode('utf-8'))
+        odi = options.debug_pipeline
+        options.debug_pipeline = None
+        # Generate oeb from html conversion.
+        with open(htmlfile, 'rb') as f:
+            oeb = html_input.convert(f, options, 'html', log,
+                {})
+        options.debug_pipeline = odi
+        os.remove(htmlfile)
+
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+
+        return oeb
--- a/ebook_converter/ebooks/conversion/plugins/docx_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/docx_input.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+
+
+class DOCXInput(InputFormatPlugin):
+    name        = 'DOCX Input'
+    author      = 'Kovid Goyal'
+    description = _('Convert DOCX files (.docx and .docm) to HTML')
+    file_types  = {'docx', 'docm'}
+    commit_name = 'docx_input'
+
+    options = {
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
+                   'it will be removed from the document and used as the cover for created e-book. This option '
+                   'turns off that behavior.')),
+        OptionRecommendation(name='docx_no_pagebreaks_between_notes', recommended_value=False,
+            help=_('Do not insert a page break after every endnote.')),
+        OptionRecommendation(name='docx_inline_subsup', recommended_value=False,
+            help=_('Render superscripts and subscripts so that they do not affect the line height.')),
+    }
+
+    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.docx.to_html import Convert
+        return Convert(stream, detect_cover=not options.docx_no_cover, log=log, notes_nopb=options.docx_no_pagebreaks_between_notes,
+                       nosupsub=options.docx_inline_subsup)()
--- a/ebook_converter/ebooks/conversion/plugins/docx_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/docx_output.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+
+# Page size names accepted by the docx_page_size option; also exposed to
+# user interfaces through DOCXOutput.ui_data.
+PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
+              'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
+
+
+class DOCXOutput(OutputFormatPlugin):
+
+    name = 'DOCX Output'
+    author = 'Kovid Goyal'
+    file_type = 'docx'
+    commit_name = 'docx_output'
+    ui_data = {'page_sizes': PAGE_SIZES}
+
+    options = {
+        OptionRecommendation(name='docx_page_size', recommended_value='letter',
+            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
+            help=_('The size of the page. Default is letter. Choices '
+            'are %s') % PAGE_SIZES),
+
+        OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
+            help=_('Custom size of the document. Use the form widthxheight '
+            'EG. `123x321` to specify the width and height (in pts). '
+            'This overrides any specified page-size.')),
+
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Do not insert the book cover as an image at the start of the document.'
+                   ' If you use this option, the book cover will be discarded.')),
+
+        OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
+            help=_('Preserve the aspect ratio of the cover image instead of stretching'
+                   ' it out to cover the entire page.')),
+
+        OptionRecommendation(name='docx_no_toc', recommended_value=False,
+            help=_('Do not insert the table of contents as a page at the start of the document.')),
+
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'DOCX'),
+
+        OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the left page margin, in pts. Default is 72pt.'
+                   ' Overrides the common left page margin setting.')
+        ),
+
+        OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the top page margin, in pts. Default is 72pt.'
+                   ' Overrides the common top page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the right page margin, in pts. Default is 72pt.'
+                   ' Overrides the common right page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
+                   ' Overrides the common bottom page margin setting, unless set to zero.')
+        ),
+
+    }
+
+    def convert_metadata(self, oeb):
+        from lxml import etree
+        from calibre.ebooks.oeb.base import OPF, OPF2_NS
+        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
+        from io import BytesIO
+        package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
+        oeb.metadata.to_opf2(package)
+        self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata()
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        from calibre.ebooks.docx.writer.container import DOCX
+        from calibre.ebooks.docx.writer.from_html import Convert
+        docx = DOCX(opts, log)
+        self.convert_metadata(oeb)
+        Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)()
+        docx.write(output_path, self.mi)
+        if opts.extract_to:
+            from calibre.ebooks.docx.dump import do_dump
+            do_dump(output_path, opts.extract_to)
--- a/ebook_converter/ebooks/conversion/plugins/epub_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/epub_input.py
@@ -0,0 +1,438 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re, posixpath
+from itertools import cycle
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import getcwd
+
+# Values of the EncryptionMethod "Algorithm" attribute that
+# EPUBInput.process_encryption() knows how to undo; decrypt_font_data()
+# uses them to pick the length of the obfuscated prefix.
+ADOBE_OBFUSCATION =  'http://ns.adobe.com/pdf/enc#RC'
+IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
+
+
+def decrypt_font_data(key, data, algorithm):
+    is_adobe = algorithm == ADOBE_OBFUSCATION
+    crypt_len = 1024 if is_adobe else 1040
+    crypt = bytearray(data[:crypt_len])
+    key = cycle(iter(bytearray(key)))
+    decrypt = bytes(bytearray(x^next(key) for x in crypt))
+    return decrypt + data[crypt_len:]
+
+
+def decrypt_font(key, path, algorithm):
+    with lopen(path, 'r+b') as f:
+        data = decrypt_font_data(key, f.read(), algorithm)
+        f.seek(0), f.truncate(), f.write(data)
+
+
+class EPUBInput(InputFormatPlugin):
+
+    # Input plugin for EPUB files; the attributes below describe the plugin
+    # and the file extension it handles.
+    name        = 'EPUB Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert EPUB files (.epub) to HTML'
+    file_types  = {'epub'}
+    output_encoding = None
+    commit_name = 'epub_input'
+
+    # Recommends page_breaks_before='/' at medium priority.
+    # NOTE(review): presumably an XPath that limits page breaks to the
+    # document root - confirm against the option's documentation.
+    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
+
+    def process_encryption(self, encfile, opf, log):
+        from lxml import etree
+        import uuid, hashlib
+        idpf_key = opf.raw_unique_identifier
+        if idpf_key:
+            idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
+            idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
+        key = None
+        for item in opf.identifier_iter():
+            scheme = None
+            for xkey in item.attrib.keys():
+                if xkey.endswith('scheme'):
+                    scheme = item.get(xkey)
+            if (scheme and scheme.lower() == 'uuid') or \
+                    (item.text and item.text.startswith('urn:uuid:')):
+                try:
+                    key = item.text.rpartition(':')[-1]
+                    key = uuid.UUID(key).bytes
+                except:
+                    import traceback
+                    traceback.print_exc()
+                    key = None
+
+        try:
+            root = etree.parse(encfile)
+            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
+                algorithm = em.get('Algorithm', '')
+                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
+                    return False
+                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
+                uri = cr.get('URI')
+                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
+                tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
+                if (tkey and os.path.exists(path)):
+                    self._encrypted_font_uris.append(uri)
+                    decrypt_font(tkey, path, algorithm)
+            return True
+        except:
+            import traceback
+            traceback.print_exc()
+        return False
+
+    def set_guide_type(self, opf, gtype, href=None, title=''):
+        # Set the specified guide entry
+        for elem in list(opf.iterguide()):
+            if elem.get('type', '').lower() == gtype:
+                elem.getparent().remove(elem)
+
+        if href is not None:
+            t = opf.create_guide_item(gtype, title, href)
+            for guide in opf.root.xpath('./*[local-name()="guide"]'):
+                guide.append(t)
+                return
+            guide = opf.create_guide_element()
+            opf.root.append(guide)
+            guide.append(t)
+            return t
+
+    def rationalize_cover3(self, opf, log):
+        ''' If there is a reference to the cover/titlepage via manifest properties, convert to
+        entries in the <guide> so that the rest of the pipeline picks it up. '''
+        from calibre.ebooks.metadata.opf3 import items_with_property
+        removed = guide_titlepage_href = guide_titlepage_id = None
+
+        # Look for titlepages incorrectly marked in the <guide> as covers
+        guide_cover, guide_elem = None, None
+        for guide_elem in opf.iterguide():
+            if guide_elem.get('type', '').lower() == 'cover':
+                guide_cover = guide_elem.get('href', '').partition('#')[0]
+                break
+        if guide_cover:
+            spine = list(opf.iterspine())
+            if spine:
+                idref = spine[0].get('idref', '')
+                for x in opf.itermanifest():
+                    if x.get('id') == idref and x.get('href') == guide_cover:
+                        guide_titlepage_href = guide_cover
+                        guide_titlepage_id = idref
+                        break
+
+        raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
+        if raster_cover_href:
+            self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
+        titlepage_id = titlepage_href = None
+        for item in items_with_property(opf.root, 'calibre:title-page'):
+            tid, href = item.get('id'), item.get('href')
+            if href and tid:
+                titlepage_id, titlepage_href = tid, href.partition('#')[0]
+                break
+        if titlepage_href is None:
+            titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
+        if titlepage_href is not None:
+            self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
+            spine = list(opf.iterspine())
+            if len(spine) > 1:
+                for item in spine:
+                    if item.get('idref') == titlepage_id:
+                        log('Found HTML cover', titlepage_href)
+                        if self.for_viewer:
+                            item.attrib.pop('linear', None)
+                        else:
+                            item.getparent().remove(item)
+                            removed = titlepage_href
+                        return removed
+
+    def rationalize_cover2(self, opf, log):
+        ''' Ensure that the cover information in the guide is correct. That
+        means, at most one entry with type="cover" that points to a raster
+        cover and at most one entry with type="titlepage" that points to an
+        HTML titlepage. '''
+        # Returns the href of the HTML cover when it was removed from the
+        # spine, otherwise None (all early exits return None).
+        from calibre.ebooks.oeb.base import OPF
+        removed = None
+        from lxml import etree
+        guide_cover, guide_elem = None, None
+        # Find the first guide entry of type "cover". Note that guide_elem
+        # keeps its value after the loop: the cover entry when one was
+        # found, otherwise the last guide entry seen (None only when the
+        # guide has no entries at all).
+        for guide_elem in opf.iterguide():
+            if guide_elem.get('type', '').lower() == 'cover':
+                guide_cover = guide_elem.get('href', '').partition('#')[0]
+                break
+        if not guide_cover:
+            # No usable cover entry in the guide: add one that points at the
+            # raster cover declared in the OPF, if there is one, and stop.
+            raster_cover = opf.raster_cover
+            if raster_cover:
+                if guide_elem is None:
+                    g = opf.root.makeelement(OPF('guide'))
+                    opf.root.append(g)
+                else:
+                    # Reuse the parent of the guide entry left over from the
+                    # loop above.
+                    g = guide_elem.getparent()
+                guide_cover = raster_cover
+                guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
+                g.append(guide_elem)
+            return
+        spine = list(opf.iterspine())
+        if not spine:
+            return
+        # Check if the cover specified in the guide is also
+        # the first element in spine
+        idref = spine[0].get('idref', '')
+        manifest = list(opf.itermanifest())
+        if not manifest:
+            return
+        elem = [x for x in manifest if x.get('id', '') == idref]
+        if not elem or elem[0].get('href', None) != guide_cover:
+            return
+        log('Found HTML cover', guide_cover)
+
+        # Remove from spine as covers must be treated
+        # specially
+        if not self.for_viewer:
+            if len(spine) == 1:
+                # Removing the only spine item would leave an empty book, so
+                # the cover marking is dropped from the guide instead.
+                log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
+                for guide_elem in tuple(opf.iterguide()):
+                    if guide_elem.get('type', '').lower() == 'cover':
+                        guide_elem.getparent().remove(guide_elem)
+                return
+            else:
+                spine[0].getparent().remove(spine[0])
+                removed = guide_cover
+        else:
+            # Ensure the cover is displayed as the first item in the book, some
+            # epub files have it set with linear='no' which causes the cover to
+            # display in the end
+            spine[0].attrib.pop('linear', None)
+            opf.spine[0].is_linear = True
+        # Ensure that the guide has a cover entry pointing to a raster cover
+        # and a titlepage entry pointing to the html titlepage. The titlepage
+        # entry will be used by the epub output plugin, the raster cover entry
+        # by other output plugins.
+
+        # Search for a raster cover identified in the OPF
+        raster_cover = opf.raster_cover
+
+        # Set the cover guide entry
+        if raster_cover is not None:
+            guide_elem.set('href', raster_cover)
+        else:
+            # Render the titlepage to create a raster cover
+            from calibre.ebooks import render_html_svg_workaround
+            guide_elem.set('href', 'calibre_raster_cover.jpg')
+            # The manifest item for the rendered cover is added next to the
+            # titlepage's manifest item, whether or not rendering succeeds.
+            t = etree.SubElement(
+                elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
+            t.set('media-type', 'image/jpeg')
+            if os.path.exists(guide_cover):
+                renderer = render_html_svg_workaround(guide_cover, log)
+                if renderer is not None:
+                    # The value returned by the renderer is written directly
+                    # as the contents of the JPEG file.
+                    with lopen('calibre_raster_cover.jpg', 'wb') as f:
+                        f.write(renderer)
+
+        # Set the titlepage guide entry
+        self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
+        return removed
+
+    def find_opf(self):
+        '''
+        Locate the OPF package file advertised by META-INF/container.xml.
+
+        Every ``rootfile`` element (matched by local name) whose media-type is
+        application/oebps-package+xml is considered in document order. The
+        first one whose full-path, joined onto the current working directory,
+        exists on disk is returned. Returns None when nothing matches or when
+        the container file cannot be read or parsed (the traceback is
+        printed in that case).
+        '''
+        from calibre.utils.xml_parse import safe_xml_fromstring
+
+        def attr_value(node, suffix):
+            # Match on the end of the attribute name so that namespace
+            # qualified attribute names are found as well
+            return next((v for k, v in node.attrib.items() if k.endswith(suffix)), None)
+
+        try:
+            with lopen('META-INF/container.xml', 'rb') as f:
+                root = safe_xml_fromstring(f.read())
+                for rootfile in root.xpath('//*[local-name()="rootfile"]'):
+                    if attr_value(rootfile, 'media-type') == "application/oebps-package+xml":
+                        relpath = attr_value(rootfile, 'full-path')
+                        if relpath:
+                            candidate = os.path.join(getcwd(), *relpath.split('/'))
+                            if os.path.exists(candidate):
+                                return candidate
+        except Exception:
+            import traceback
+            traceback.print_exc()
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''
+        Unpack the EPUB in ``stream`` into the current working directory and
+        write a cleaned up copy of its package file as ``content.opf``.
+
+        Steps, in order: extract the ZIP (falling back to a more forgiving
+        extractor on failure), locate the OPF, process
+        META-INF/encryption.xml if present, rewrite manifest and guide hrefs
+        so they are relative to the extraction root, rationalize the cover,
+        convert an EPUB 3 nav document if there is one, and prune spine
+        entries that are empty, duplicated or refer to non-content items.
+
+        :return: Absolute path to the ``content.opf`` that was written
+        :raises ValueError: If no OPF can be found, if the book uses DTBook
+            markup, or if the spine ends up with no entries
+        :raises DRMError: If ``process_encryption`` reports failure
+        '''
+        from calibre.utils.zipfile import ZipFile
+        from calibre import walk
+        from calibre.ebooks import DRMError
+        from calibre.ebooks.metadata.opf2 import OPF
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(getcwd())
+        except Exception:
+            # Only real errors trigger the fallback; KeyboardInterrupt and
+            # SystemExit are allowed to propagate
+            log.exception('EPUB appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream)
+        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
+        opf = self.find_opf()
+        if opf is None:
+            # No usable container.xml entry, fall back to the first .opf
+            # file found that is not a hidden file or inside __MACOSX
+            for f in walk('.'):
+                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
+                        not os.path.basename(f).startswith('.'):
+                    opf = os.path.abspath(f)
+                    break
+        path = getattr(stream, 'name', 'stream')
+
+        if opf is None:
+            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)
+
+        opf = os.path.relpath(opf, getcwd())
+        parts = os.path.split(opf)
+        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
+
+        self._encrypted_font_uris = []
+        if os.path.exists(encfile):
+            if not self.process_encryption(encfile, opf, log):
+                raise DRMError(os.path.basename(path))
+        self.encrypted_fonts = self._encrypted_font_uris
+
+        if len(parts) > 1 and parts[0]:
+            # The OPF is in a sub-directory but content.opf is written to the
+            # extraction root, so prefix every href with that directory
+            delta = '/'.join(parts[:-1])+'/'
+
+            def normpath(x):
+                # Operate on the argument rather than on whatever the
+                # enclosing loop variable currently happens to be
+                return posixpath.normpath(delta + x)
+
+            for elem in opf.itermanifest():
+                elem.set('href', normpath(elem.get('href')))
+            for elem in opf.iterguide():
+                elem.set('href', normpath(elem.get('href')))
+
+        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
+        self.removed_cover = f(opf, log)
+        if self.removed_cover:
+            self.removed_items_to_ignore = (self.removed_cover,)
+        epub3_nav = opf.epub3_nav
+        if epub3_nav is not None:
+            self.convert_epub3_nav(epub3_nav, opf, log, options)
+
+        for x in opf.itermanifest():
+            if x.get('media-type', '') == 'application/x-dtbook+xml':
+                raise ValueError(
+                    'EPUB files with DTBook markup are not supported')
+
+        # Collect the ids of manifest items that must not appear in the spine
+        not_for_spine = set()
+        for y in opf.itermanifest():
+            id_ = y.get('id', None)
+            if id_:
+                mt = y.get('media-type', None)
+                if mt in {
+                        'application/vnd.adobe-page-template+xml',
+                        'application/vnd.adobe.page-template+xml',
+                        'application/adobe-page-template+xml',
+                        'application/adobe.page-template+xml',
+                        'application/text'
+                }:
+                    not_for_spine.add(id_)
+                ext = y.get('href', '').rpartition('.')[-1].lower()
+                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
+                    # some epub authoring software sets font mime types to
+                    # text/plain
+                    not_for_spine.add(id_)
+                    y.set('media-type', 'application/font')
+
+        # Drop spine entries with no idref, with an excluded idref, or that
+        # repeat an idref already seen
+        seen = set()
+        for x in list(opf.iterspine()):
+            ref = x.get('idref', None)
+            if not ref or ref in not_for_spine or ref in seen:
+                x.getparent().remove(x)
+                continue
+            seen.add(ref)
+
+        if len(list(opf.iterspine())) == 0:
+            raise ValueError('No valid entries in the spine of this EPUB')
+
+        with lopen('content.opf', 'wb') as nopf:
+            nopf.write(opf.render())
+
+        return os.path.abspath('content.opf')
+
+    def convert_epub3_nav(self, nav_path, opf, log, opts):
+        '''
+        Build an NCX table of contents from the EPUB 3 navigation document at
+        ``nav_path`` and register it in ``opf``.
+
+        The first ``nav`` element whose epub:type is ``toc`` and that has an
+        ``ol`` child is converted. If there is none, the method returns
+        without changing anything. Otherwise the NCX is written next to the
+        nav document, added to the manifest and set as the ``toc`` of every
+        spine element. ``opts.epub3_nav_href`` and ``opts.epub3_nav_parsed``
+        are set as well. When a cover page was removed from the spine
+        (``self.removed_cover``), links to it in the nav document are tagged
+        with ``data-calibre-removed-titlepage`` and the nav file is rewritten.
+        '''
+        from lxml import etree
+        from calibre.ebooks.chardet import xml_to_unicode
+        from calibre.ebooks.oeb.polish.parsing import parse
+        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
+        from calibre.ebooks.oeb.polish.toc import first_child
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from tempfile import NamedTemporaryFile
+        with lopen(nav_path, 'rb') as f:
+            raw = f.read()
+        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
+        root = parse(raw, log=log)
+        # Skeleton NCX document, navPoints are appended to its navMap
+        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
+        navmap = ncx[0]
+        et = '{%s}type' % EPUB_NS
+        bn = os.path.basename(nav_path)
+
+        def add_from_li(li, parent):
+            # Create a navPoint under parent from the first <a> or <span>
+            # child of li (the loop breaks after one iteration)
+            href = text = None
+            for x in li.iterchildren(XHTML('a'), XHTML('span')):
+                # Use the text content, falling back to title attributes
+                text = etree.tostring(
+                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
+                            x.xpath('descendant-or-self::*/@title')).strip()
+                href = x.get('href')
+                if href:
+                    if href.startswith('#'):
+                        # Fragment only link, prefix it with the file name
+                        # of the nav document
+                        href = bn + href
+                break
+            np = parent.makeelement(NCX('navPoint'))
+            parent.append(np)
+            np.append(np.makeelement(NCX('navLabel')))
+            np[0].append(np.makeelement(NCX('text')))
+            np[0][0].text = text
+            if href:
+                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
+            return np
+
+        def process_nav_node(node, toc_parent):
+            # Recursively mirror the nested <ol>/<li> structure as navPoints
+            for li in node.iterchildren(XHTML('li')):
+                child = add_from_li(li, toc_parent)
+                ol = first_child(li, XHTML('ol'))
+                if child is not None and ol is not None:
+                    process_nav_node(ol, child)
+
+        for nav in root.iterdescendants(XHTML('nav')):
+            if nav.get(et) == 'toc':
+                ol = first_child(nav, XHTML('ol'))
+                if ol is not None:
+                    process_nav_node(ol, navmap)
+                    break
+        else:
+            # No toc nav with an <ol> was found, nothing to convert
+            return
+
+        # delete=False keeps the file on disk after the with block, f.name is
+        # still used below
+        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
+            f.write(etree.tostring(ncx, encoding='utf-8'))
+        ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
+        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
+        for spine in opf.root.xpath('//*[local-name()="spine"]'):
+            spine.set('toc', ncx_id)
+        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
+        opts.epub3_nav_parsed = root
+        if getattr(self, 'removed_cover', None):
+            # Tag links in the nav document that point at the removed cover
+            changed = False
+            base_path = os.path.dirname(nav_path)
+            for elem in root.xpath('//*[@href]'):
+                href, frag = elem.get('href').partition('#')[::2]
+                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
+                abs_href = urlnormalize(link_path)
+                if abs_href == self.removed_cover:
+                    changed = True
+                    elem.set('data-calibre-removed-titlepage', '1')
+            if changed:
+                with lopen(nav_path, 'wb') as f:
+                    f.write(serialize(root, 'application/xhtml+xml'))
+
+    def postprocess_book(self, oeb, opts, log):
+        '''
+        If a cover page was removed from the spine during conversion, record
+        the first TOC entry that points at it (ignoring any fragment) as
+        ``oeb.toc.item_that_refers_to_cover``.
+        '''
+        removed = getattr(self, 'removed_cover', None)
+        if not removed:
+            return
+        cover_toc_item = next(
+            (entry for entry in oeb.toc.iterdescendants()
+                if entry.href and entry.href.partition('#')[0] == removed),
+            None)
+        spine_hrefs = {x.href for x in oeb.spine}
+        # NOTE(review): this tests the TOC item itself, not its href, against
+        # the set of spine hrefs -- confirm that this is intended
+        if cover_toc_item is None or cover_toc_item in spine_hrefs:
+            return
+        oeb.toc.item_that_refers_to_cover = cover_toc_item
--- a/ebook_converter/ebooks/conversion/plugins/epub_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/epub_output.py
@@ -0,0 +1,548 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, shutil, re
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from calibre.ptempfile import TemporaryDirectory
+from calibre import CurrentDir
+from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
+
+# Bare (un-namespaced) names of the tags that EPUBOutput.workaround_ade_quirks
+# treats as block level when deciding how tall the <p> that replaces a <br>
+# should be
+block_level_tags = (
+      'address',
+      'body',
+      'blockquote',
+      'center',
+      'dir',
+      'div',
+      'dl',
+      'fieldset',
+      'form',
+      'h1',
+      'h2',
+      'h3',
+      'h4',
+      'h5',
+      'h6',
+      'hr',
+      'isindex',
+      'menu',
+      'noframes',
+      'noscript',
+      'ol',
+      'p',
+      'pre',
+      'table',
+      'ul',
+)
+
+
+# Output plugin that writes an OEB book out as an EPUB 2 or EPUB 3 file
+class EPUBOutput(OutputFormatPlugin):
+
+    name = 'EPUB Output'
+    author = 'Kovid Goyal'
+    file_type = 'epub'
+    commit_name = 'epub_output'
+    # The EPUB versions that can be generated, these are also the allowed
+    # choices for the epub_version option below
+    ui_data = {'versions': ('2', '3')}
+
+    # Conversion options specific to this plugin
+    options = {
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'EPUB'),
+
+        OptionRecommendation(name='dont_split_on_page_breaks',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Turn off splitting at page breaks. Normally, input '
+                    'files are automatically split at every page break into '
+                    'two files. This gives an output e-book that can be '
+                    'parsed faster and with less resources. However, '
+                    'splitting is slow and if your source file contains a '
+                    'very large number of page breaks, you should turn off '
+                    'splitting on page breaks.'
+                )
+        ),
+
+        # The value is in KB, convert() multiplies it by 1024 before handing
+        # it to the Split transform
+        OptionRecommendation(name='flow_size', recommended_value=260,
+            help=_('Split all HTML files larger than this size (in KB). '
+                'This is necessary as most EPUB readers cannot handle large '
+                'file sizes. The default of %defaultKB is the size required '
+                'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
+        ),
+
+        OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
+            help=_('Normally, if the input file has no cover and you don\'t'
+            ' specify one, a default cover is generated with the title, '
+            'authors, etc. This option disables the generation of this cover.')
+        ),
+
+        OptionRecommendation(name='no_svg_cover', recommended_value=False,
+            help=_('Do not use SVG for the book cover. Use this option if '
+                'your EPUB is going to be used on a device that does not '
+                'support SVG, like the iPhone or the JetBook Lite. '
+                'Without this option, such devices will display the cover '
+                'as a blank page.')
+        ),
+
+        OptionRecommendation(name='preserve_cover_aspect_ratio',
+            recommended_value=False, help=_(
+            'When using an SVG cover, this option will cause the cover to scale '
+            'to cover the available screen area, but still preserve its aspect ratio '
+            '(ratio of width to height). That means there may be white borders '
+            'at the sides or top and bottom of the image, but the image will '
+            'never be distorted. Without this option the image may be slightly '
+            'distorted, but there will be no borders.'
+            )
+        ),
+
+        OptionRecommendation(name='epub_flatten', recommended_value=False,
+            help=_('This option is needed only if you intend to use the EPUB'
+                ' with FBReaderJ. It will flatten the file system inside the'
+                ' EPUB, putting all files into the top level.')
+        ),
+
+        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
+            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
+        ),
+
+        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
+            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
+        ),
+
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+
+        OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
+            help=_('The version of the EPUB file to generate. EPUB 2 is the'
+                ' most widely compatible, only use EPUB 3 if you know you'
+                ' actually need it.')
+        ),
+
+        }
+
+    # Recommend pretty printing at HIGH level. condense_ncx() only strips
+    # whitespace from the NCX when pretty_print is off
+    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
+
+    def workaround_webkit_quirks(self):  # {{{
+        'Turn every <pre> that has neither text nor children into a <div>'
+        from calibre.ebooks.oeb.base import XPath
+        find_body, find_pre = XPath('//h:body'), XPath('//h:pre')
+        for item in self.oeb.spine:
+            bodies = find_body(item.data)
+            body = bodies[0] if bodies else bodies
+
+            # Skip spine items that have no usable <body> element
+            if not hasattr(body, 'xpath'):
+                continue
+
+            for pre in find_pre(body):
+                if pre.text or len(pre) != 0:
+                    continue
+                pre.tag = 'div'
+    # }}}
+
+    def upshift_markup(self):  # {{{
+        '''
+        Upgrade markup to comply with XHTML 1.1 where possible.
+
+        For every spine item: copy a plain lang attribute on the root to
+        xml:lang when that is not already set, and for items that have a
+        <body>, turn <u> into <span> and strip id and name attributes whose
+        value was already used earlier in the same document.
+        '''
+        from calibre.ebooks.oeb.base import XPath, XML
+        for item in self.oeb.spine:
+            root = item.data
+            plain_lang = root.get('lang')
+            if not root.get(XML('lang')) and plain_lang:
+                root.set(XML('lang'), plain_lang)
+            bodies = XPath('//h:body')(root)
+            body = bodies[0] if bodies else bodies
+
+            if not hasattr(body, 'xpath'):
+                continue
+            for u in XPath('//h:u')(root):
+                u.tag = 'span'
+
+            # ids and names are tracked independently of each other
+            seen = {'id': set(), 'name': set()}
+            for elem in XPath('//*[@id or @name]')(root):
+                for key in ('id', 'name'):
+                    value = elem.get(key, None)
+                    if not value:
+                        continue
+                    if value in seen[key]:
+                        del elem.attrib[key]
+                    else:
+                        seen[key].add(value)
+
+    # }}}
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        '''
+        Write the book ``oeb`` to ``output_path`` as an EPUB file.
+
+        The book is first transformed in place (optional inline TOC, file
+        renaming, reader workarounds, image rescaling, splitting, cover
+        handling), a default TOC entry and a UUID identifier are added when
+        missing, and the result is written to a temporary directory by the
+        ``oeb`` output plugin. That directory is optionally upgraded to
+        EPUB 3, fonts that were encrypted in the input are re-encrypted, and
+        everything is packed into the EPUB container. If ``opts.extract_to``
+        is set, its previous contents are deleted and the finished EPUB is
+        extracted there.
+
+        :param input_plugin: Input plugin used for this conversion, its
+            ``encrypted_fonts`` attribute (if any) lists the fonts to encrypt
+        '''
+        self.log, self.opts, self.oeb = log, opts, oeb
+
+        if self.opts.epub_inline_toc:
+            from calibre.ebooks.mobi.writer8.toc import TOCAdder
+            opts.mobi_toc_at_start = not opts.epub_toc_at_end
+            opts.mobi_passthrough = False
+            opts.no_inline_toc = False
+            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)
+
+        if self.opts.epub_flatten:
+            from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
+            FlatFilenames()(oeb, opts)
+        else:
+            from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
+            UniqueFilenames()(oeb, opts)
+
+        self.workaround_ade_quirks()
+        self.workaround_webkit_quirks()
+        self.upshift_markup()
+        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
+        RescaleImages(check_colorspaces=True)(oeb, opts)
+
+        from calibre.ebooks.oeb.transforms.split import Split
+        # flow_size is specified in KB
+        split = Split(not self.opts.dont_split_on_page_breaks,
+                max_flow_size=self.opts.flow_size*1024
+                )
+        split(self.oeb, self.opts)
+
+        from calibre.ebooks.oeb.transforms.cover import CoverManager
+        cm = CoverManager(
+                no_default_cover=self.opts.no_default_epub_cover,
+                no_svg_cover=self.opts.no_svg_cover,
+                preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
+        cm(self.oeb, self.opts, self.log)
+
+        self.workaround_sony_quirks()
+
+        if self.oeb.toc.count() == 0:
+            self.log.warn('This EPUB file has no Table of Contents. '
+                    'Creating a default TOC')
+            first = next(iter(self.oeb.spine))
+            self.oeb.toc.add(_('Start'), first.href)
+
+        from calibre.ebooks.oeb.base import OPF
+        identifiers = oeb.metadata['identifier']
+        uuid = None
+        for x in identifiers:
+            # An identifier may have no scheme attribute at all, treat that
+            # as an empty scheme instead of calling .lower() on None
+            scheme = x.get(OPF('scheme'), None) or ''
+            if scheme.lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
+                uuid = unicode_type(x).split(':')[-1]
+                break
+        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
+
+        if uuid is None:
+            self.log.warn('No UUID identifier found')
+            from uuid import uuid4
+            uuid = unicode_type(uuid4())
+            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
+
+        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
+            # Apparently ADE requires this value to start with urn:uuid:
+            # for some absurd reason, or it will throw a hissy fit and refuse
+            # to use the obfuscated fonts.
+            for x in identifiers:
+                if unicode_type(x) == uuid:
+                    x.content = 'urn:uuid:'+uuid
+
+        with TemporaryDirectory('_epub_output') as tdir:
+            from calibre.customize.ui import plugin_for_output_format
+            metadata_xml = None
+            extra_entries = []
+            if self.is_periodical:
+                if self.opts.output_profile.epub_periodical_format == 'sony':
+                    from calibre.ebooks.epub.periodical import sony_metadata
+                    metadata_xml, atom_xml = sony_metadata(oeb)
+                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
+            # The OEB output plugin writes the book files, OPF and NCX
+            # into tdir
+            oeb_output = plugin_for_output_format('oeb')
+            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
+            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
+            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
+                    if x.endswith('.ncx')][0])
+            if self.opts.epub_version == '3':
+                self.upgrade_to_epub3(tdir, opf)
+            encryption = None
+            if encrypted_fonts:
+                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
+
+            from calibre.ebooks.epub import initialize_container
+            with initialize_container(output_path, os.path.basename(opf),
+                    extra_entries=extra_entries) as epub:
+                epub.add_dir(tdir)
+                if encryption is not None:
+                    epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
+                if metadata_xml is not None:
+                    epub.writestr('META-INF/metadata.xml',
+                            metadata_xml.encode('utf-8'))
+            if opts.extract_to is not None:
+                from calibre.utils.zipfile import ZipFile
+                # Whatever currently exists at extract_to is deleted first
+                if os.path.exists(opts.extract_to):
+                    if os.path.isdir(opts.extract_to):
+                        shutil.rmtree(opts.extract_to)
+                    else:
+                        os.remove(opts.extract_to)
+                os.mkdir(opts.extract_to)
+                with ZipFile(output_path) as zf:
+                    zf.extractall(path=opts.extract_to)
+                self.log.info('EPUB extracted to', opts.extract_to)
+
+    def upgrade_to_epub3(self, tdir, opf):
+        '''
+        Upgrade the EPUB 2 book files in ``tdir`` to EPUB 3 in place.
+
+        A temporary META-INF/container.xml that points at ``opf`` is written
+        so that the directory can be opened as an EpubContainer. After the
+        upgraded container is committed, that file is removed again and
+        removal of the META-INF directory is attempted (failure is ignored).
+
+        :param tdir: Directory that holds the book files
+        :param opf: Name of the OPF file, only its basename is used
+        '''
+        self.log.info('Upgrading to EPUB 3...')
+        from calibre.ebooks.epub import simple_container_xml
+        from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
+        try:
+            os.mkdir(os.path.join(tdir, 'META-INF'))
+        except EnvironmentError:
+            pass
+        with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
+            f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
+        from calibre.ebooks.oeb.polish.container import EpubContainer
+        container = EpubContainer(tdir, self.log)
+        from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
+        existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
+        nav_href = getattr(self.opts, 'epub3_nav_href', None)
+        # existing_nav is a parsed element tree root. The truth value of an
+        # lxml element depends on whether it has children, so test for None
+        # explicitly
+        previous_nav = (nav_href, existing_nav) if existing_nav is not None and nav_href else None
+        epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
+        fix_conversion_titlepage_links_in_nav(container)
+        container.commit()
+        os.remove(f.name)
+        try:
+            os.rmdir(os.path.join(tdir, 'META-INF'))
+        except EnvironmentError:
+            pass
+
+    def encrypt_fonts(self, uris, tdir, uuid):  # {{{
+        '''
+        Obfuscate the font files named by ``uris`` inside ``tdir`` and build
+        the matching encryption.xml document.
+
+        The key is made of the first 16 bytes obtained from the hexadecimal
+        digits of ``uuid`` (repeated if needed). The first 1024 bytes of each
+        font are XORed with that key in place. Fonts that do not exist on
+        disk are skipped. Fonts shorter than 1024 bytes are left unmodified
+        with a warning, but are still listed in the result.
+
+        :param uris: Font paths relative to ``tdir``, using / as separator
+        :return: The encryption.xml text, or None if no font was listed
+        :raises ValueError: If ``uuid`` has fewer than 16 hexadecimal digits
+        '''
+        from xml.sax.saxutils import escape
+        from polyglot.binary import from_hex_bytes
+
+        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
+        if len(key) < 16:
+            raise ValueError('UUID identifier %r is invalid'%uuid)
+        key = bytearray(from_hex_bytes((key + key)[:32]))
+        with CurrentDir(tdir):
+            paths = [os.path.join(*x.split('/')) for x in uris]
+            uris = dict(zip(uris, paths))
+            fonts = []
+            for uri in list(uris.keys()):
+                path = uris[uri]
+                if not os.path.exists(path):
+                    uris.pop(uri)
+                    continue
+                self.log.debug('Encrypting font:', uri)
+                with lopen(path, 'r+b') as f:
+                    data = f.read(1024)
+                    if len(data) >= 1024:
+                        data = bytearray(data)
+                        f.seek(0)
+                        f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
+                    else:
+                        self.log.warn('Font', path, 'is invalid, ignoring')
+                if not isinstance(uri, unicode_type):
+                    uri = uri.decode('utf-8')
+                # The URI goes into a double quoted XML attribute, so it needs
+                # XML escaping, a backslash does not escape anything in XML
+                fonts.append('''
+                <enc:EncryptedData>
+                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
+                    <enc:CipherData>
+                    <enc:CipherReference URI="%s"/>
+                    </enc:CipherData>
+                </enc:EncryptedData>
+                '''%(escape(uri, {'"': '&quot;'})))
+            if fonts:
+                ans = '''<encryption
+                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
+                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
+                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
+                    '''
+                ans += '\n'.join(fonts)
+                ans += '\n</encryption>'
+                return ans
+    # }}}
+
+    def condense_ncx(self, ncx_path):  # {{{
+        '''
+        When pretty printing is turned off, rewrite the NCX file at
+        ``ncx_path`` with the leading and trailing whitespace stripped from
+        the text and tail of every element.
+        '''
+        from lxml import etree
+        if self.opts.pretty_print:
+            return
+        root = etree.parse(ncx_path).getroot()
+        for node in root.iter(tag=etree.Element):
+            for field in ('text', 'tail'):
+                value = getattr(node, field)
+                if value:
+                    setattr(node, field, value.strip())
+        with open(ncx_path, 'wb') as f:
+            f.write(etree.tostring(root, encoding='utf-8'))
+    # }}}
+
+    def workaround_ade_quirks(self):  # {{{
+        '''
+        Perform various markup transforms to get the output to render correctly
+        in the quirky ADE.
+
+        The TOC is changed first (fragment identifiers with unsupported
+        characters are dropped), then every spine item is rewritten in place,
+        and finally, if the book has a main stylesheet, some of its rules are
+        adjusted. The individual transforms are described by the comments in
+        the body.
+        '''
+        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
+
+        stylesheet = self.oeb.manifest.main_stylesheet
+
+        # ADE cries big wet tears when it encounters an invalid fragment
+        # identifier in the NCX toc.
+        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
+        for node in self.oeb.toc.iter():
+            href = getattr(node, 'href', None)
+            if hasattr(href, 'partition'):
+                base, sep, frag = href.partition('#')
+                frag = urlunquote(frag)
+                if frag and frag_pat.match(frag) is None:
+                    self.log.warn(
+                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
+                    node.href = base
+
+        # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
+        # (compiled once here instead of once per spine item)
+        special_chars = re.compile('[\u200b\u00ad]')
+
+        for item in self.oeb.spine:
+            root = item.data
+            body = XPath('//h:body')(root)
+            if body:
+                body = body[0]
+
+            if hasattr(body, 'xpath'):
+                # remove <img> tags with empty src elements
+                bad = []
+                for img in XPath('//h:img')(body):
+                    src = img.get('src', '').strip()
+                    if src in ('', '#') or src.startswith('http:'):
+                        bad.append(img)
+                for img in bad:
+                    img.getparent().remove(img)
+
+                # Add id attribute to <a> tags that have name
+                for a in XPath('//h:a[@name]')(body):
+                    if not a.get('id', False):
+                        a.set('id', a.get('name'))
+                    # The delightful epubcheck has started complaining about <a> tags that
+                    # have name attributes.
+                    a.attrib.pop('name')
+
+                # Replace <br> that are children of <body> as ADE doesn't handle them
+                for br in XPath('./h:br')(body):
+                    if br.getparent() is None:
+                        continue
+                    try:
+                        prior = next(br.itersiblings(preceding=True))
+                        priortag = barename(prior.tag)
+                        priortext = prior.tail
+                    except Exception:
+                        # No usable preceding sibling, fall back to the body
+                        # itself and the text at its start
+                        priortag = 'body'
+                        priortext = body.text
+                    if priortext:
+                        priortext = priortext.strip()
+                    br.tag = XHTML('p')
+                    br.text = '\u00a0'
+                    style = br.get('style', '').split(';')
+                    style = list(filter(None, map(lambda x: x.strip(), style)))
+                    style.append('margin:0pt; border:0pt')
+                    # If the prior tag is a block (including a <br> we replaced)
+                    # then this <br> replacement should have a 1-line height.
+                    # Otherwise it should have no height.
+                    if not priortext and priortag in block_level_tags:
+                        style.append('height:1em')
+                    else:
+                        style.append('height:0pt')
+                    br.set('style', '; '.join(style))
+
+            # Remove <embed> and all <object> tags except SVG objects
+            for tag in XPath('//h:embed')(root):
+                tag.getparent().remove(tag)
+            for tag in XPath('//h:object')(root):
+                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
+                    continue
+                tag.getparent().remove(tag)
+
+            # Remove empty <title> and <style> tags, <script> tags that have
+            # neither content nor src (MathJax configuration is kept), and
+            # every <script> inside <body>
+            for tag in XPath('//h:title|//h:style')(root):
+                if not tag.text:
+                    tag.getparent().remove(tag)
+            for tag in XPath('//h:script')(root):
+                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
+                    tag.getparent().remove(tag)
+            for tag in XPath('//h:body/descendant::h:script')(root):
+                tag.getparent().remove(tag)
+
+            # Forms that directly contain form controls are removed, the
+            # others are converted into plain <div> tags
+            formchildren = XPath('./h:input|./h:button|./h:textarea|'
+                    './h:label|./h:fieldset|./h:legend')
+            for tag in XPath('//h:form')(root):
+                if formchildren(tag):
+                    tag.getparent().remove(tag)
+                else:
+                    # Not a real form
+                    tag.tag = XHTML('div')
+
+            for tag in XPath('//h:center')(root):
+                tag.tag = XHTML('div')
+                tag.set('style', 'text-align:center')
+            # ADE can't handle &amp; in an img url
+            for tag in XPath('//h:img[@src]')(root):
+                tag.set('src', tag.get('src', '').replace('&', ''))
+
+            # ADE whimpers in fright when it encounters a <td> outside a
+            # <table>
+            in_table = XPath('ancestor::h:table')
+            for tag in XPath('//h:td|//h:tr|//h:th')(root):
+                if not in_table(tag):
+                    tag.tag = XHTML('div')
+
+            # Strip zero width spaces and soft hyphens, and replace non
+            # breaking hyphens with plain ones
+            for elem in root.iterdescendants('*'):
+                if elem.text:
+                    elem.text = special_chars.sub('', elem.text)
+                    elem.text = elem.text.replace('\u2011', '-')
+                if elem.tail:
+                    elem.tail = special_chars.sub('', elem.tail)
+                    elem.tail = elem.tail.replace('\u2011', '-')
+
+            if stylesheet is not None:
+                # ADE doesn't render lists correctly if they have left margins
+                from css_parser.css import CSSRule
+                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
+                    sel = '.'+lb.get('class')
+                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+                        if sel == rule.selectorList.selectorText:
+                            rule.style.removeProperty('margin-left')
+                            # padding-left breaks rendering in webkit and gecko
+                            rule.style.removeProperty('padding-left')
+                # Change whitespace:pre to pre-wrap to accommodate readers that
+                # cannot scroll horizontally
+                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+                    style = rule.style
+                    ws = style.getPropertyValue('white-space')
+                    if ws == 'pre':
+                        style.setProperty('white-space', 'pre-wrap')
+
+    # }}}
+
+    def workaround_sony_quirks(self):  # {{{
+        '''
+        Perform toc link transforms to alleviate slow loading.
+
+        A TOC entry whose fragment points at an element that is at the top
+        of its spine document has the fragment dropped from its href.
+        '''
+        from calibre.ebooks.oeb.base import urldefrag, XPath
+        from calibre.ebooks.oeb.polish.toc import item_at_top
+
+        def frag_is_at_top(root, frag):
+            matches = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
+            return item_at_top(matches[0]) if matches else False
+
+        def simplify_toc_entry(toc):
+            if toc.href:
+                href, frag = urldefrag(toc.href)
+                if frag:
+                    # Only the first spine item with a matching href counts
+                    target = next((s for s in self.oeb.spine if s.href == href), None)
+                    if target is not None and frag_is_at_top(target.data, frag):
+                        self.log.debug('Removing anchor from TOC href:',
+                                href+'#'+frag)
+                        toc.href = href
+            for child in toc:
+                simplify_toc_entry(child)
+
+        if self.oeb.toc:
+            simplify_toc_entry(self.oeb.toc)
+
+    # }}}
--- a/ebook_converter/ebooks/conversion/plugins/fb2_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/fb2_input.py
@@ -0,0 +1,179 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
+"""
+Convert .fb2 and .fbz files to HTML
+"""
+import os, re
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre import guess_type
+from polyglot.builtins import iteritems, getcwd
+
+FB2NS  = 'http://www.gribuser.ru/xml/fictionbook/2.0'
+FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
+
+
+class FB2Input(InputFormatPlugin):
+
+    name        = 'FB2 Input'
+    author      = 'Anatoly Shipitsin'
+    description = 'Convert FB2 and FBZ files to HTML'
+    file_types  = {'fb2', 'fbz'}
+    commit_name = 'fb2_input'
+
+    recommendations = {
+        ('level1_toc', '//h:h1', OptionRecommendation.MED),
+        ('level2_toc', '//h:h2', OptionRecommendation.MED),
+        ('level3_toc', '//h:h3', OptionRecommendation.MED),
+        }
+
+    options = {
+    OptionRecommendation(name='no_inline_fb2_toc',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Do not insert a Table of Contents at the beginning of the book.'
+                )
+        )}
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from lxml import etree
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.metadata.meta import get_metadata
+        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
+        from calibre.ebooks.chardet import xml_to_unicode
+        self.log = log
+        log.debug('Parsing XML...')
+        raw = get_fb2_data(stream)[0]
+        raw = raw.replace(b'\0', b'')
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            assume_utf8=True, resolve_entities=True)[0]
+        try:
+            doc = safe_xml_fromstring(raw)
+        except etree.XMLSyntaxError:
+            doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
+        if doc is None:
+            raise ValueError('The FB2 file is not valid XML')
+        doc = ensure_namespace(doc)
+        try:
+            fb_ns = doc.nsmap[doc.prefix]
+        except Exception:
+            fb_ns = FB2NS
+
+        NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
+        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
+        css = ''
+        for s in stylesheets:
+            css += etree.tostring(s, encoding='unicode', method='text',
+                    with_tail=False) + '\n\n'
+        if css:
+            import css_parser, logging
+            parser = css_parser.CSSParser(fetcher=None,
+                    log=logging.getLogger('calibre.css'))
+
+            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
+            text = XHTML_CSS_NAMESPACE + css
+            log.debug('Parsing stylesheet...')
+            stylesheet = parser.parseString(text)
+            stylesheet.namespaces['h'] = XHTML_NS
+            css = stylesheet.cssText
+            if isinstance(css, bytes):
+                css = css.decode('utf-8', 'replace')
+            css = css.replace('h|style', 'h|span')
+            css = re.sub(r'name\s*=\s*', 'class=', css)
+        self.extract_embedded_content(doc)
+        log.debug('Converting XML to HTML...')
+        with open(P('templates/fb2.xsl'), 'rb') as f:
+            ss = f.read().decode('utf-8')
+        ss = ss.replace("__FB_NS__", fb_ns)
+        if options.no_inline_fb2_toc:
+            log('Disabling generation of inline FB2 TOC')
+            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
+                    re.DOTALL).sub('', ss)
+
+        styledoc = safe_xml_fromstring(ss)
+
+        transform = etree.XSLT(styledoc)
+        result = transform(doc)
+
+        # Handle links of type note and cite
+        notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
+        cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
+        all_ids = {x for x in result.xpath('//*/@id')}
+        for cite, a in iteritems(cites):
+            note = notes.get(cite, None)
+            if note:
+                c = 1
+                while 'cite%d' % c in all_ids:
+                    c += 1
+                if not note.get('id', None):
+                    note.set('id', 'cite%d' % c)
+                    all_ids.add(note.get('id'))
+                a.set('href', '#%s' % note.get('id'))
+        for x in result.xpath('//*[@link_note or @link_cite]'):
+            x.attrib.pop('link_note', None)
+            x.attrib.pop('link_cite', None)
+
+        for img in result.xpath('//img[@src]'):
+            src = img.get('src')
+            img.set('src', self.binary_map.get(src, src))
+        index = transform.tostring(result)
+        with open('index.xhtml', 'wb') as f:
+            f.write(index.encode('utf-8'))
+        with open('inline-styles.css', 'wb') as f:
+            f.write(css.encode('utf-8'))
+        stream.seek(0)
+        mi = get_metadata(stream, 'fb2')
+        if not mi.title:
+            mi.title = _('Unknown')
+        if not mi.authors:
+            mi.authors = [_('Unknown')]
+        cpath = None
+        if mi.cover_data and mi.cover_data[1]:
+            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
+                f.write(mi.cover_data[1])
+            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
+        else:
+            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
+                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
+                if href is not None:
+                    if href.startswith('#'):
+                        href = href[1:]
+                    cpath = os.path.abspath(href)
+                    break
+
+        opf = OPFCreator(getcwd(), mi)
+        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
+        opf.create_manifest(entries)
+        opf.create_spine(['index.xhtml'])
+        if cpath:
+            opf.guide.set_cover(cpath)
+        with open('metadata.opf', 'wb') as f:
+            opf.render(f)
+        return os.path.join(getcwd(), 'metadata.opf')
+
+    def extract_embedded_content(self, doc):
+        from calibre.ebooks.fb2 import base64_decode
+        self.binary_map = {}
+        for elem in doc.xpath('./*'):
+            if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
+                ct = elem.get('content-type', '')
+                fname = elem.attrib['id']
+                ext = ct.rpartition('/')[-1].lower()
+                if ext in ('png', 'jpeg', 'jpg'):
+                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
+                            'png'}:
+                        fname += '.' + ext
+                    self.binary_map[elem.get('id')] = fname
+                raw = elem.text.strip()
+                try:
+                    data = base64_decode(raw)
+                except TypeError:
+                    self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
+                        elem.get('id')))
+                else:
+                    with open(fname, 'wb') as f:
+                        f.write(data)
--- a/ebook_converter/ebooks/conversion/plugins/fb2_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/fb2_output.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+
+
+class FB2Output(OutputFormatPlugin):
+
+    name = 'FB2 Output'
+    author = 'John Schember'
+    file_type = 'fb2'
+    commit_name = 'fb2_output'
+
+    FB2_GENRES = [
+        # Science Fiction & Fantasy
+        'sf_history',  # Alternative history
+        'sf_action',  # Action
+        'sf_epic',  # Epic
+        'sf_heroic',  # Heroic
+        'sf_detective',  # Detective
+        'sf_cyberpunk',  # Cyberpunk
+        'sf_space',  # Space
+        'sf_social',  # Social#philosophical
+        'sf_horror',  # Horror & mystic
+        'sf_humor',  # Humor
+        'sf_fantasy',  # Fantasy
+        'sf',  # Science Fiction
+        # Detectives & Thrillers
+        'det_classic',  # Classical detectives
+        'det_police',  # Police Stories
+        'det_action',  # Action
+        'det_irony',  # Ironical detectives
+        'det_history',  # Historical detectives
+        'det_espionage',  # Espionage detectives
+        'det_crime',  # Crime detectives
+        'det_political',  # Political detectives
+        'det_maniac',  # Maniacs
+        'det_hard',  # Hard#boiled
+        'thriller',  # Thrillers
+        'detective',  # Detectives
+        # Prose
+        'prose_classic',  # Classics prose
+        'prose_history',  # Historical prose
+        'prose_contemporary',  # Contemporary prose
+        'prose_counter',  # Counterculture
+        'prose_rus_classic',  # Russial classics prose
+        'prose_su_classics',  # Soviet classics prose
+        # Romance
+        'love_contemporary',  # Contemporary Romance
+        'love_history',  # Historical Romance
+        'love_detective',  # Detective Romance
+        'love_short',  # Short Romance
+        'love_erotica',  # Erotica
+        # Adventure
+        'adv_western',  # Western
+        'adv_history',  # History
+        'adv_indian',  # Indians
+        'adv_maritime',  # Maritime Fiction
+        'adv_geo',  # Travel & geography
+        'adv_animal',  # Nature & animals
+        'adventure',  # Other
+        # Children's
+        'child_tale',  # Fairy Tales
+        'child_verse',  # Verses
+        'child_prose',  # Prose
+        'child_sf',  # Science Fiction
+        'child_det',  # Detectives & Thrillers
+        'child_adv',  # Adventures
+        'child_education',  # Educational
+        'children',  # Other
+        # Poetry & Dramaturgy
+        'poetry',  # Poetry
+        'dramaturgy',  # Dramaturgy
+        # Antique literature
+        'antique_ant',  # Antique
+        'antique_european',  # European
+        'antique_russian',  # Old russian
+        'antique_east',  # Old east
+        'antique_myths',  # Myths. Legends. Epos
+        'antique',  # Other
+        # Scientific#educational
+        'sci_history',  # History
+        'sci_psychology',  # Psychology
+        'sci_culture',  # Cultural science
+        'sci_religion',  # Religious studies
+        'sci_philosophy',  # Philosophy
+        'sci_politics',  # Politics
+        'sci_business',  # Business literature
+        'sci_juris',  # Jurisprudence
+        'sci_linguistic',  # Linguistics
+        'sci_medicine',  # Medicine
+        'sci_phys',  # Physics
+        'sci_math',  # Mathematics
+        'sci_chem',  # Chemistry
+        'sci_biology',  # Biology
+        'sci_tech',  # Technical
+        'science',  # Other
+        # Computers & Internet
+        'comp_www',  # Internet
+        'comp_programming',  # Programming
+        'comp_hard',  # Hardware
+        'comp_soft',  # Software
+        'comp_db',  # Databases
+        'comp_osnet',  # OS & Networking
+        'computers',  # Other
+        # Reference
+        'ref_encyc',  # Encyclopedias
+        'ref_dict',  # Dictionaries
+        'ref_ref',  # Reference
+        'ref_guide',  # Guidebooks
+        'reference',  # Other
+        # Nonfiction
+        'nonf_biography',  # Biography & Memoirs
+        'nonf_publicism',  # Publicism
+        'nonf_criticism',  # Criticism
+        'design',  # Art & design
+        'nonfiction',  # Other
+        # Religion & Inspiration
+        'religion_rel',  # Religion
+        'religion_esoterics',  # Esoterics
+        'religion_self',  # Self#improvement
+        'religion',  # Other
+        # Humor
+        'humor_anecdote',  # Anecdote (funny stories)
+        'humor_prose',  # Prose
+        'humor_verse',  # Verses
+        'humor',  # Other
+        # Home & Family
+        'home_cooking',  # Cooking
+        'home_pets',  # Pets
+        'home_crafts',  # Hobbies & Crafts
+        'home_entertain',  # Entertaining
+        'home_health',  # Health
+        'home_garden',  # Garden
+        'home_diy',  # Do it yourself
+        'home_sport',  # Sports
+        'home_sex',  # Erotica & sex
+        'home',  # Other
+    ]
+    ui_data = {
+        'sectionize': {
+            'toc': _('Section per entry in the ToC'),
+            'files': _('Section per file'),
+            'nothing': _('A single section')
+        },
+        'genres': FB2_GENRES,
+    }
+
+    options = {
+        OptionRecommendation(name='sectionize',
+            recommended_value='files', level=OptionRecommendation.LOW,
+            choices=list(ui_data['sectionize']),
+            help=_('Specify how sections are created:\n'
+                ' * nothing: {nothing}\n'
+                ' * files: {files}\n'
+                ' * toc: {toc}\n'
+                'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
+                '(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
+        ),
+        OptionRecommendation(name='fb2_genre',
+            recommended_value='antique', level=OptionRecommendation.LOW,
+            choices=FB2_GENRES,
+            help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
+                ) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
+        from calibre.ebooks.fb2.fb2ml import FB2MLizer
+
+        try:
+            rasterizer = SVGRasterizer()
+            rasterizer(oeb_book, opts)
+        except Unavailable:
+            log.warn('SVG rasterizer unavailable, SVG will not be converted')
+
+        linearize_jacket(oeb_book)
+
+        fb2mlizer = FB2MLizer(log)
+        fb2_content = fb2mlizer.extract_content(oeb_book, opts)
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(fb2_content.encode('utf-8', 'replace'))
+
+        if close:
+            out_stream.close()
--- a/ebook_converter/ebooks/conversion/plugins/html_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/html_input.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, tempfile, os
+from functools import partial
+
+from calibre.constants import islinux, isbsd
+from calibre.customize.conversion import (InputFormatPlugin,
+        OptionRecommendation)
+from calibre.utils.localization import get_lang
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.imghdr import what
+from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
+
+
+def sanitize_file_name(x):
+    ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
+    ans, ext = ans.rpartition('.')[::2]
+    return (ans.strip() + '.' + ext.strip()).rstrip('.')
+
+
+class HTMLInput(InputFormatPlugin):
+
+    name        = 'HTML Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert HTML and OPF files to an OEB'
+    file_types  = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
+    commit_name = 'html_input'
+
+    options = {
+        OptionRecommendation(name='breadth_first',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Traverse links in HTML files breadth first. Normally, '
+                    'they are traversed depth first.'
+                   )
+        ),
+
+        OptionRecommendation(name='max_levels',
+            recommended_value=5, level=OptionRecommendation.LOW,
+            help=_('Maximum levels of recursion when following links in '
+                   'HTML files. Must be non-negative. 0 implies that no '
+                   'links in the root HTML file are followed. Default is '
+                   '%default.'
+                   )
+        ),
+
+        OptionRecommendation(name='dont_package',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Normally this input plugin re-arranges all the input '
+                'files into a standard folder hierarchy. Only use this option '
+                'if you know what you are doing as it can result in various '
+                'nasty side effects in the rest of the conversion pipeline.'
+                )
+        ),
+
+    }
+
+    def convert(self, stream, opts, file_ext, log,
+                accelerators):
+        self._is_case_sensitive = None
+        basedir = getcwd()
+        self.opts = opts
+
+        fname = None
+        if hasattr(stream, 'name'):
+            basedir = os.path.dirname(stream.name)
+            fname = os.path.basename(stream.name)
+
+        if file_ext != 'opf':
+            if opts.dont_package:
+                raise ValueError('The --dont-package option is not supported for an HTML input file')
+            from calibre.ebooks.metadata.html import get_metadata
+            mi = get_metadata(stream)
+            if fname:
+                from calibre.ebooks.metadata.meta import metadata_from_filename
+                fmi = metadata_from_filename(fname)
+                fmi.smart_update(mi)
+                mi = fmi
+            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
+            return oeb
+
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        return create_oebbook(log, stream.name, opts,
+                encoding=opts.input_encoding)
+
+    def is_case_sensitive(self, path):
+        if getattr(self, '_is_case_sensitive', None) is not None:
+            return self._is_case_sensitive
+        if not path or not os.path.exists(path):
+            return islinux or isbsd
+        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
+        return self._is_case_sensitive
+
+    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
+        import uuid
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        from calibre.ebooks.oeb.base import (DirContainer,
+            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
+            xpath, urlquote)
+        from calibre import guess_type
+        from calibre.ebooks.oeb.transforms.metadata import \
+            meta_info_to_oeb_metadata
+        from calibre.ebooks.html.input import get_filelist
+        from calibre.ebooks.metadata import string_to_authors
+        from calibre.utils.localization import canonicalize_lang
+        import css_parser, logging
+        css_parser.log.setLevel(logging.WARN)
+        self.OEB_STYLES = OEB_STYLES
+        oeb = create_oebbook(log, None, opts, self,
+                encoding=opts.input_encoding, populate=False)
+        self.oeb = oeb
+
+        metadata = oeb.metadata
+        meta_info_to_oeb_metadata(mi, metadata, log)
+        if not metadata.language:
+            l = canonicalize_lang(getattr(opts, 'language', None))
+            if not l:
+                oeb.logger.warn('Language not specified')
+                l = get_lang().replace('_', '-')
+            metadata.add('language', l)
+        if not metadata.creator:
+            a = getattr(opts, 'authors', None)
+            if a:
+                a = string_to_authors(a)
+            if not a:
+                oeb.logger.warn('Creator not specified')
+                a = [self.oeb.translate(__('Unknown'))]
+            for aut in a:
+                metadata.add('creator', aut)
+        if not metadata.title:
+            oeb.logger.warn('Title not specified')
+            metadata.add('title', self.oeb.translate(__('Unknown')))
+        bookid = unicode_type(uuid.uuid4())
+        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
+        for ident in metadata.identifier:
+            if 'id' in ident.attrib:
+                self.oeb.uid = metadata.identifier[0]
+                break
+
+        filelist = get_filelist(htmlpath, basedir, opts, log)
+        filelist = [f for f in filelist if not f.is_binary]
+        htmlfile_map = {}
+        for f in filelist:
+            path = f.path
+            oeb.container = DirContainer(os.path.dirname(path), log,
+                    ignore_opf=True)
+            bname = os.path.basename(path)
+            id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
+            htmlfile_map[path] = href
+            item = oeb.manifest.add(id, href, 'text/html')
+            if path == htmlpath and '%' in path:
+                bname = urlquote(bname)
+            item.html_input_href = bname
+            oeb.spine.add(item, True)
+
+        self.added_resources = {}
+        self.log = log
+        self.log('Normalizing filename cases')
+        for path, href in htmlfile_map.items():
+            if not self.is_case_sensitive(path):
+                path = path.lower()
+            self.added_resources[path] = href
+        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
+        self.urldefrag = urldefrag
+        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
+
+        self.log('Rewriting HTML links')
+        for f in filelist:
+            path = f.path
+            dpath = os.path.dirname(path)
+            oeb.container = DirContainer(dpath, log, ignore_opf=True)
+            href = htmlfile_map[path]
+            try:
+                item = oeb.manifest.hrefs[href]
+            except KeyError:
+                item = oeb.manifest.hrefs[urlnormalize(href)]
+            rewrite_links(item.data, partial(self.resource_adder, base=dpath))
+
+        for item in oeb.manifest.values():
+            if item.media_type in self.OEB_STYLES:
+                dpath = None
+                for path, href in self.added_resources.items():
+                    if href == item.href:
+                        dpath = os.path.dirname(path)
+                        break
+                css_parser.replaceUrls(item.data,
+                        partial(self.resource_adder, base=dpath))
+
+        toc = self.oeb.toc
+        self.oeb.auto_generated_toc = True
+        titles = []
+        headers = []
+        for item in self.oeb.spine:
+            if not item.linear:
+                continue
+            html = item.data
+            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
+            title = re.sub(r'\s+', ' ', title.strip())
+            if title:
+                titles.append(title)
+            headers.append('(unlabled)')
+            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
+                expr = '/h:html/h:body//h:%s[position()=1]/text()'
+                header = ''.join(xpath(html, expr % tag))
+                header = re.sub(r'\s+', ' ', header.strip())
+                if header:
+                    headers[-1] = header
+                    break
+        use = titles
+        if len(titles) > len(set(titles)):
+            use = headers
+        for title, item in zip(use, self.oeb.spine):
+            if not item.linear:
+                continue
+            toc.add(title, item.href)
+
+        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
+        return oeb
+
+    def link_to_local_path(self, link_, base=None):
+        from calibre.ebooks.html.input import Link
+        if not isinstance(link_, unicode_type):
+            try:
+                link_ = link_.decode('utf-8', 'error')
+            except:
+                self.log.warn('Failed to decode link %r. Ignoring'%link_)
+                return None, None
+        try:
+            l = Link(link_, base if base else getcwd())
+        except:
+            self.log.exception('Failed to process link: %r'%link_)
+            return None, None
+        if l.path is None:
+            # Not a local resource
+            return None, None
+        link = l.path.replace('/', os.sep).strip()
+        frag = l.fragment
+        if not link:
+            return None, None
+        return link, frag
+
+    def resource_adder(self, link_, base=None):
+        from polyglot.urllib import quote
+        link, frag = self.link_to_local_path(link_, base=base)
+        if link is None:
+            return link_
+        try:
+            if base and not os.path.isabs(link):
+                link = os.path.join(base, link)
+            link = os.path.abspath(link)
+        except:
+            return link_
+        if not os.access(link, os.R_OK):
+            return link_
+        if os.path.isdir(link):
+            self.log.warn(link_, 'is a link to a directory. Ignoring.')
+            return link_
+        if not self.is_case_sensitive(tempfile.gettempdir()):
+            link = link.lower()
+        if link not in self.added_resources:
+            bhref = os.path.basename(link)
+            id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
+            guessed = self.guess_type(href)[0]
+            media_type = guessed or self.BINARY_MIME
+            if media_type == 'text/plain':
+                self.log.warn('Ignoring link to text file %r'%link_)
+                return None
+            if media_type == self.BINARY_MIME:
+                # Check for the common case, images
+                try:
+                    img = what(link)
+                except EnvironmentError:
+                    pass
+                else:
+                    if img:
+                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME
+
+            self.oeb.log.debug('Added', link)
+            self.oeb.container = self.DirContainer(os.path.dirname(link),
+                    self.oeb.log, ignore_opf=True)
+            # Load into memory
+            item = self.oeb.manifest.add(id, href, media_type)
+            # bhref refers to an already existing file. The read() method of
+            # DirContainer will call unquote on it before trying to read the
+            # file, therefore we quote it here.
+            if isinstance(bhref, unicode_type):
+                bhref = bhref.encode('utf-8')
+            item.html_input_href = as_unicode(quote(bhref))
+            if guessed in self.OEB_STYLES:
+                item.override_css_fetch = partial(
+                        self.css_import_handler, os.path.dirname(link))
+            item.data
+            self.added_resources[link] = href
+
+        nlink = self.added_resources[link]
+        if frag:
+            nlink = '#'.join((nlink, frag))
+        return nlink
+
+    def css_import_handler(self, base, href):
+        link, frag = self.link_to_local_path(href, base=base)
+        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
+            return (None, None)
+        try:
+            with open(link, 'rb') as f:
+                raw = f.read().decode('utf-8', 'replace')
+            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
+        except:
+            self.log.exception('Failed to read CSS file: %r'%link)
+            return (None, None)
+        return (None, raw)
--- a/ebook_converter/ebooks/conversion/plugins/html_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/html_output.py
@@ -0,0 +1,226 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
+__docformat__ = 'restructuredtext en'
+
+import os, re, shutil
+from os.path import dirname, abspath, relpath as _relpath, exists, basename
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+from calibre import CurrentDir
+from calibre.ptempfile import PersistentTemporaryDirectory
+from polyglot.builtins import unicode_type
+
+
+def relpath(*args):
+    return _relpath(*args).replace(os.sep, '/')
+
+
+class HTMLOutput(OutputFormatPlugin):
+
+    name = 'HTML Output'
+    author = 'Fabian Grassl'
+    file_type = 'zip'
+    commit_name = 'html_output'
+
+    options = {
+        OptionRecommendation(name='template_css',
+            help=_('CSS file used for the output instead of the default file')),
+
+        OptionRecommendation(name='template_html_index',
+            help=_('Template used for generation of the HTML index file instead of the default file')),
+
+        OptionRecommendation(name='template_html',
+            help=_('Template used for the generation of the HTML contents of the book instead of the default file')),
+
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated ZIP file to the '
+                'specified directory. WARNING: The contents of the directory '
+                'will be deleted.')
+        ),
+    }
+
+    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
+
+    def generate_toc(self, oeb_book, ref_url, output_dir):
+        '''
+        Generate table of contents
+        '''
+        from lxml import etree
+        from polyglot.urllib import unquote
+
+        from calibre.ebooks.oeb.base import element
+        from calibre.utils.cleantext import clean_xml_chars
+        with CurrentDir(output_dir):
+            def build_node(current_node, parent=None):
+                if parent is None:
+                    parent = etree.Element('ul')
+                elif len(current_node.nodes):
+                    parent = element(parent, ('ul'))
+                for node in current_node.nodes:
+                    point = element(parent, 'li')
+                    href = relpath(abspath(unquote(node.href)), dirname(ref_url))
+                    if isinstance(href, bytes):
+                        href = href.decode('utf-8')
+                    link = element(point, 'a', href=clean_xml_chars(href))
+                    title = node.title
+                    if isinstance(title, bytes):
+                        title = title.decode('utf-8')
+                    if title:
+                        title = re.sub(r'\s+', ' ', title)
+                    link.text = clean_xml_chars(title)
+                    build_node(node, point)
+                return parent
+            wrap = etree.Element('div')
+            wrap.append(build_node(oeb_book.toc))
+            return wrap
+
+    def generate_html_toc(self, oeb_book, ref_url, output_dir):
+        from lxml import etree
+
+        root = self.generate_toc(oeb_book, ref_url, output_dir)
+        return etree.tostring(root, pretty_print=True, encoding='unicode',
+                xml_declaration=False)
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from lxml import etree
+        from calibre.utils import zipfile
+        from templite import Templite
+        from polyglot.urllib import unquote
+        from calibre.ebooks.html.meta import EasyMeta
+
+        # read template files
+        if opts.template_html_index is not None:
+            with open(opts.template_html_index, 'rb') as f:
+                template_html_index_data = f.read()
+        else:
+            template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
+
+        if opts.template_html is not None:
+            with open(opts.template_html, 'rb') as f:
+                template_html_data = f.read()
+        else:
+            template_html_data = P('templates/html_export_default.tmpl', data=True)
+
+        if opts.template_css is not None:
+            with open(opts.template_css, 'rb') as f:
+                template_css_data = f.read()
+        else:
+            template_css_data = P('templates/html_export_default.css', data=True)
+
+        template_html_index_data = template_html_index_data.decode('utf-8')
+        template_html_data = template_html_data.decode('utf-8')
+        template_css_data = template_css_data.decode('utf-8')
+
+        self.log  = log
+        self.opts = opts
+        meta = EasyMeta(oeb_book.metadata)
+
+        tempdir = os.path.realpath(PersistentTemporaryDirectory())
+        output_file = os.path.join(tempdir,
+                basename(re.sub(r'\.zip', '', output_path)+'.html'))
+        output_dir = re.sub(r'\.html', '', output_file)+'_files'
+
+        if not exists(output_dir):
+            os.makedirs(output_dir)
+
+        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
+        with open(css_path, 'wb') as f:
+            f.write(template_css_data.encode('utf-8'))
+
+        with open(output_file, 'wb') as f:
+            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
+            templite = Templite(template_html_index_data)
+            nextLink = oeb_book.spine[0].href
+            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
+            cssLink = relpath(abspath(css_path), dirname(output_file))
+            tocUrl = relpath(output_file, dirname(output_file))
+            t = templite.render(has_toc=bool(oeb_book.toc.count()),
+                    toc=html_toc, meta=meta, nextLink=nextLink,
+                    tocUrl=tocUrl, cssLink=cssLink,
+                    firstContentPageLink=nextLink)
+            if isinstance(t, unicode_type):
+                t = t.encode('utf-8')
+            f.write(t)
+
+        with CurrentDir(output_dir):
+            for item in oeb_book.manifest:
+                path = abspath(unquote(item.href))
+                dir = dirname(path)
+                if not exists(dir):
+                    os.makedirs(dir)
+                if item.spine_position is not None:
+                    with open(path, 'wb') as f:
+                        pass
+                else:
+                    with open(path, 'wb') as f:
+                        f.write(item.bytes_representation)
+                    item.unload_data_from_memory(memory=path)
+
+            for item in oeb_book.spine:
+                path = abspath(unquote(item.href))
+                dir = dirname(path)
+                root = item.data.getroottree()
+
+                # get & clean HTML <HEAD>-data
+                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
+                head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
+                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
+                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
+                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
+
+                # get & clean HTML <BODY>-data
+                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
+                ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
+                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
+                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
+
+                # generate link to next page
+                if item.spine_position+1 < len(oeb_book.spine):
+                    nextLink = oeb_book.spine[item.spine_position+1].href
+                    nextLink = relpath(abspath(nextLink), dir)
+                else:
+                    nextLink = None
+
+                # generate link to previous page
+                if item.spine_position > 0:
+                    prevLink = oeb_book.spine[item.spine_position-1].href
+                    prevLink = relpath(abspath(prevLink), dir)
+                else:
+                    prevLink = None
+
+                cssLink = relpath(abspath(css_path), dir)
+                tocUrl = relpath(output_file, dir)
+                firstContentPageLink = oeb_book.spine[0].href
+
+                # render template
+                templite = Templite(template_html_data)
+                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
+                t = templite.render(ebookContent=ebook_content,
+                        prevLink=prevLink, nextLink=nextLink,
+                        has_toc=bool(oeb_book.toc.count()), toc=toc,
+                        tocUrl=tocUrl, head_content=head_content,
+                        meta=meta, cssLink=cssLink,
+                        firstContentPageLink=firstContentPageLink)
+
+                # write html to file
+                with open(path, 'wb') as f:
+                    f.write(t.encode('utf-8'))
+                item.unload_data_from_memory(memory=path)
+
+        zfile = zipfile.ZipFile(output_path, "w")
+        zfile.add_dir(output_dir, basename(output_dir))
+        zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)
+
+        if opts.extract_to:
+            if os.path.exists(opts.extract_to):
+                shutil.rmtree(opts.extract_to)
+            os.makedirs(opts.extract_to)
+            zfile.extractall(opts.extract_to)
+            self.log('Zip file extracted to', opts.extract_to)
+
+        zfile.close()
+
+        # cleanup temp dir
+        shutil.rmtree(tempdir)
--- a/ebook_converter/ebooks/conversion/plugins/htmlz_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/htmlz_input.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre import guess_type
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class HTMLZInput(InputFormatPlugin):
+
+    name        = 'HTLZ Input'
+    author      = 'John Schember'
+    description = 'Convert HTML files to HTML'
+    file_types  = {'htmlz'}
+    commit_name = 'htmlz_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.chardet import xml_to_unicode
+        from calibre.ebooks.metadata.opf2 import OPF
+        from calibre.utils.zipfile import ZipFile
+
+        self.log = log
+        html = u''
+        top_levels = []
+
+        # Extract content from zip archive.
+        zf = ZipFile(stream)
+        zf.extractall()
+
+        # Find the HTML file in the archive. It needs to be
+        # top level.
+        index = u''
+        multiple_html = False
+        # Get a list of all top level files in the archive.
+        for x in os.listdir(u'.'):
+            if os.path.isfile(x):
+                top_levels.append(x)
+        # Try to find an index. file.
+        for x in top_levels:
+            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
+                index = x
+                break
+        # Look for multiple HTML files in the archive. We look at the
+        # top level files only as only they matter in HTMLZ.
+        for x in top_levels:
+            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
+                # Set index to the first HTML file found if it's not
+                # called index.
+                if not index:
+                    index = x
+                else:
+                    multiple_html = True
+        # Warn the user if there multiple HTML file in the archive. HTMLZ
+        # supports a single HTML file. A conversion with a multiple HTML file
+        # HTMLZ archive probably won't turn out as the user expects. With
+        # Multiple HTML files ZIP input should be used in place of HTMLZ.
+        if multiple_html:
+            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
+
+        if index:
+            with open(index, 'rb') as tf:
+                html = tf.read()
+        else:
+            raise Exception(_('No top level HTML file found.'))
+
+        if not html:
+            raise Exception(_('Top level HTML file %s is empty') % index)
+
+        # Encoding
+        if options.input_encoding:
+            ienc = options.input_encoding
+        else:
+            ienc = xml_to_unicode(html[:4096])[-1]
+        html = html.decode(ienc, 'replace')
+
+        # Run the HTML through the html processing plugin.
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(options, opt.option.name, opt.recommended_value)
+        options.input_encoding = 'utf-8'
+        base = getcwd()
+        htmlfile = os.path.join(base, u'index.html')
+        c = 0
+        while os.path.exists(htmlfile):
+            c += 1
+            htmlfile = u'index%d.html'%c
+        with open(htmlfile, 'wb') as f:
+            f.write(html.encode('utf-8'))
+        odi = options.debug_pipeline
+        options.debug_pipeline = None
+        # Generate oeb from html conversion.
+        with open(htmlfile, 'rb') as f:
+            oeb = html_input.convert(f, options, 'html', log,
+                {})
+        options.debug_pipeline = odi
+        os.remove(htmlfile)
+
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+
+        # Get the cover path from the OPF.
+        cover_path = None
+        opf = None
+        for x in top_levels:
+            if os.path.splitext(x)[1].lower() == u'.opf':
+                opf = x
+                break
+        if opf:
+            opf = OPF(opf, basedir=getcwd())
+            cover_path = opf.raster_cover or opf.cover
+        # Set the cover.
+        if cover_path:
+            cdata = None
+            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
+                cdata = cf.read()
+            cover_name = os.path.basename(cover_path)
+            id, href = oeb.manifest.generate('cover', cover_name)
+            oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
+            oeb.guide.add('cover', 'Cover', href)
+
+        return oeb
--- a/ebook_converter/ebooks/conversion/plugins/htmlz_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/htmlz_output.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import io
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import unicode_type
+
+
+class HTMLZOutput(OutputFormatPlugin):
+
+    name = 'HTMLZ Output'
+    author = 'John Schember'
+    file_type = 'htmlz'
+    commit_name = 'htmlz_output'
+    ui_data = {
+            'css_choices': {
+                'class': _('Use CSS classes'),
+                'inline': _('Use the style attribute'),
+                'tag': _('Use HTML tags wherever possible')
+            },
+            'sheet_choices': {
+                'external': _('Use an external CSS file'),
+                'inline': _('Use a <style> tag in the HTML file')
+            }
+    }
+
+    options = {
+        OptionRecommendation(name='htmlz_css_type', recommended_value='class',
+            level=OptionRecommendation.LOW,
+            choices=list(ui_data['css_choices']),
+            help=_('Specify the handling of CSS. Default is class.\n'
+                   'class: {class}\n'
+                   'inline: {inline}\n'
+                   'tag: {tag}'
+            ).format(**ui_data['css_choices'])),
+        OptionRecommendation(name='htmlz_class_style', recommended_value='external',
+            level=OptionRecommendation.LOW,
+            choices=list(ui_data['sheet_choices']),
+            help=_('How to handle the CSS when using css-type = \'class\'.\n'
+                   'Default is external.\n'
+                   'external: {external}\n'
+                   'inline: {inline}'
+            ).format(**ui_data['sheet_choices'])),
+        OptionRecommendation(name='htmlz_title_filename',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('If set this option causes the file name of the HTML file'
+                ' inside the HTMLZ archive to be based on the book title.')
+            ),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from lxml import etree
+        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
+        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
+        from calibre.utils.zipfile import ZipFile
+        from calibre.utils.filenames import ascii_filename
+
+        # HTML
+        if opts.htmlz_css_type == 'inline':
+            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
+            OEB2HTMLizer = OEB2HTMLInlineCSSizer
+        elif opts.htmlz_css_type == 'tag':
+            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
+            OEB2HTMLizer = OEB2HTMLNoCSSizer
+        else:
+            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer
+
+        with TemporaryDirectory(u'_htmlz_output') as tdir:
+            htmlizer = OEB2HTMLizer(log)
+            html = htmlizer.oeb2html(oeb_book, opts)
+
+            fname = u'index'
+            if opts.htmlz_title_filename:
+                from calibre.utils.filenames import shorten_components_to
+                fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
+            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
+                if isinstance(html, unicode_type):
+                    html = html.encode('utf-8')
+                tf.write(html)
+
+            # CSS
+            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
+                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
+                    tf.write(htmlizer.get_css(oeb_book))
+
+            # Images
+            images = htmlizer.images
+            if images:
+                if not os.path.exists(os.path.join(tdir, u'images')):
+                    os.makedirs(os.path.join(tdir, u'images'))
+                for item in oeb_book.manifest:
+                    if item.media_type in OEB_IMAGES and item.href in images:
+                        if item.media_type == SVG_MIME:
+                            data = etree.tostring(item.data, encoding='unicode')
+                        else:
+                            data = item.data
+                        fname = os.path.join(tdir, u'images', images[item.href])
+                        with open(fname, 'wb') as img:
+                            img.write(data)
+
+            # Cover
+            cover_path = None
+            try:
+                cover_data = None
+                if oeb_book.metadata.cover:
+                    term = oeb_book.metadata.cover[0].term
+                    cover_data = oeb_book.guide[term].item.data
+                if cover_data:
+                    from calibre.utils.img import save_cover_data_to
+                    cover_path = os.path.join(tdir, u'cover.jpg')
+                    with lopen(cover_path, 'w') as cf:
+                        cf.write('')
+                    save_cover_data_to(cover_data, cover_path)
+            except:
+                import traceback
+                traceback.print_exc()
+
+            # Metadata
+            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
+                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
+                mi = opf.to_book_metadata()
+                if cover_path:
+                    mi.cover = u'cover.jpg'
+                mdataf.write(metadata_to_opf(mi))
+
+            htmlz = ZipFile(output_path, 'w')
+            htmlz.add_dir(tdir)
--- a/ebook_converter/ebooks/conversion/plugins/lit_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/lit_input.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class LITInput(InputFormatPlugin):
+
+    name        = 'LIT Input'
+    author      = 'Marshall T. Vandegrift'
+    description = 'Convert LIT files to HTML'
+    file_types  = {'lit'}
+    commit_name = 'lit_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.lit.reader import LitReader
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        self.log = log
+        return create_oebbook(log, stream, options, reader=LitReader)
+
+    def postprocess_book(self, oeb, opts, log):
+        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
+        for item in oeb.spine:
+            root = item.data
+            if not hasattr(root, 'xpath'):
+                continue
+            for bad in ('metadata', 'guide'):
+                metadata = XPath('//h:'+bad)(root)
+                if metadata:
+                    for x in metadata:
+                        x.getparent().remove(x)
+            body = XPath('//h:body')(root)
+            if body:
+                body = body[0]
+                if len(body) == 1 and body[0].tag == XHTML('pre'):
+                    pre = body[0]
+                    from calibre.ebooks.txt.processor import convert_basic, \
+                        separate_paragraphs_single_line
+                    from calibre.ebooks.chardet import xml_to_unicode
+                    from calibre.utils.xml_parse import safe_xml_fromstring
+                    import copy
+                    self.log('LIT file with all text in singe <pre> tag detected')
+                    html = separate_paragraphs_single_line(pre.text)
+                    html = convert_basic(html).replace('<html>',
+                            '<html xmlns="%s">'%XHTML_NS)
+                    html = xml_to_unicode(html, strip_encoding_pats=True,
+                            resolve_entities=True)[0]
+                    if opts.smarten_punctuation:
+                        # SmartyPants skips text inside <pre> tags
+                        from calibre.ebooks.conversion.preprocess import smarten_punctuation
+                        html = smarten_punctuation(html, self.log)
+                    root = safe_xml_fromstring(html)
+                    body = XPath('//h:body')(root)
+                    pre.tag = XHTML('div')
+                    pre.text = ''
+                    for elem in body:
+                        ne = copy.deepcopy(elem)
+                        pre.append(ne)
--- a/ebook_converter/ebooks/conversion/plugins/lit_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/lit_output.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import OutputFormatPlugin
+
+
+class LITOutput(OutputFormatPlugin):
+
+    name = 'LIT Output'
+    author = 'Marshall T. Vandegrift'
+    file_type = 'lit'
+    commit_name = 'lit_output'
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        self.log, self.opts, self.oeb = log, opts, oeb
+        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
+        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
+        from calibre.ebooks.lit.writer import LitWriter
+        from calibre.ebooks.oeb.transforms.split import Split
+        split = Split(split_on_page_breaks=True, max_flow_size=0,
+                remove_css_pagebreaks=False)
+        split(self.oeb, self.opts)
+
+        tocadder = HTMLTOCAdder()
+        tocadder(oeb, opts)
+        mangler = CaseMangler()
+        mangler(oeb, opts)
+        rasterizer = SVGRasterizer()
+        rasterizer(oeb, opts)
+        lit = LitWriter(self.opts)
+        lit(oeb, output_path)
--- a/ebook_converter/ebooks/conversion/plugins/lrf_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/lrf_input.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, sys
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class LRFInput(InputFormatPlugin):
+
+    name        = 'LRF Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert LRF files to HTML'
+    file_types  = {'lrf'}
+    commit_name = 'lrf_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
+                Canvas, ImageBlock, RuledLine)
+        self.log = log
+        self.log('Generating XML')
+        from calibre.ebooks.lrf.lrfparser import LRFDocument
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from lxml import etree
+        d = LRFDocument(stream)
+        d.parse()
+        xml = d.to_xml(write_files=True)
+        if options.verbose > 2:
+            open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
+        doc = safe_xml_fromstring(xml)
+
+        char_button_map = {}
+        for x in doc.xpath('//CharButton[@refobj]'):
+            ro = x.get('refobj')
+            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
+            if jump_button:
+                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
+                if jump_to:
+                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
+                            jump_to[0].get('refobj'))
+        plot_map = {}
+        for x in doc.xpath('//Plot[@refobj]'):
+            ro = x.get('refobj')
+            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
+            if image:
+                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
+                    image[0].get('refstream'))
+                if imgstr:
+                    plot_map[ro] = imgstr[0].get('file')
+
+        self.log('Converting XML to HTML...')
+        styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
+        media_type = MediaType()
+        styles = Styles()
+        text_block = TextBlock(styles, char_button_map, plot_map, log)
+        canvas = Canvas(doc, styles, text_block, log)
+        image_block = ImageBlock(canvas)
+        ruled_line = RuledLine()
+        extensions = {
+                ('calibre', 'media-type') : media_type,
+                ('calibre', 'text-block') : text_block,
+                ('calibre', 'ruled-line') : ruled_line,
+                ('calibre', 'styles')     : styles,
+                ('calibre', 'canvas')     : canvas,
+                ('calibre', 'image-block'): image_block,
+                }
+        transform = etree.XSLT(styledoc, extensions=extensions)
+        try:
+            result = transform(doc)
+        except RuntimeError:
+            sys.setrecursionlimit(5000)
+            result = transform(doc)
+
+        with open('content.opf', 'wb') as f:
+            f.write(result)
+        styles.write()
+        return os.path.abspath('content.opf')
--- a/ebook_converter/ebooks/conversion/plugins/lrf_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/lrf_output.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import sys, os
+
+from calibre.customize.conversion import OutputFormatPlugin
+from calibre.customize.conversion import OptionRecommendation
+from polyglot.builtins import unicode_type
+
+
+class LRFOptions(object):
+    '''Attribute bag of LRF settings built from conversion options and OEB metadata.'''
+
+    def __init__(self, output, opts, oeb):
+        '''Note: negative opts.margin_* values are reset to 0 on opts itself.'''
+        def f2s(f):
+            try:
+                return unicode_type(f[0])
+            except Exception:  # best effort; KeyboardInterrupt/SystemExit still propagate
+                return ''
+        m = oeb.metadata
+        for x in ('left', 'top', 'right', 'bottom'):
+            attr = 'margin_'+x
+            if getattr(opts, attr) < 0:
+                setattr(opts, attr, 0)
+        self.title = None
+        self.author = self.publisher = _('Unknown')
+        self.title_sort = self.author_sort = ''
+        for x in m.creator:
+            if x.role == 'aut':
+                self.author = unicode_type(x)
+                fa = unicode_type(getattr(x, 'file_as', ''))
+                if fa:
+                    self.author_sort = fa
+        for x in m.title:
+            if unicode_type(x.file_as):
+                self.title_sort = unicode_type(x.file_as)
+        self.freetext = f2s(m.description)
+        self.category = f2s(m.subject)
+        self.cover = None
+        self.use_metadata_cover = True
+        self.output = output
+        self.ignore_tables = opts.linearize_tables
+        self.base_font_size = (0 if opts.disable_font_rescaling
+                               else opts.base_font_size)
+        self.blank_after_para = opts.insert_blank_line
+        self.use_spine = True
+        self.font_delta = 0
+        self.ignore_colors = False
+        from calibre.ebooks.lrf import PRS500_PROFILE
+        self.profile = PRS500_PROFILE
+        self.link_levels = sys.maxsize
+        self.link_exclude = '@'
+        self.no_links_in_toc = True
+        self.disable_chapter_detection = True
+        # NOTE(review): presumably never-matching placeholders, detection is off
+        self.chapter_regex = 'dsadcdswcdec'
+        self.chapter_attr = '$,,$'
+        self.override_css = self._override_css = ''
+        self.page_break = 'h[12]'
+        self.force_page_break = '$'
+        self.force_page_break_attr = '$'
+        self.add_chapters_to_toc = False
+        self.baen = self.pdftohtml = self.book_designer = False
+        self.verbose = opts.verbose
+        self.encoding = 'utf-8'
+        self.lrs = False
+        self.minimize_memory_usage = False
+        self.autorotation = opts.enable_autorotation
+        self.header_separation = (self.profile.dpi/72.) * opts.header_separation
+        self.headerformat = opts.header_format
+
+        for x in ('top', 'bottom', 'left', 'right'):
+            setattr(self, x+'_margin',
+                (self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))
+
+        for x in ('wordspace', 'header', 'header_format',
+                'minimum_indent', 'serif_family',
+                'render_tables_as_images', 'sans_family', 'mono_family',
+                'text_size_multiplier_for_rendered_tables'):
+            setattr(self, x, getattr(opts, x))
+
+
+class LRFOutput(OutputFormatPlugin):
+
+    name = 'LRF Output'
+    author = 'Kovid Goyal'
+    file_type = 'lrf'
+    commit_name = 'lrf_output'
+
+    options = {
+        OptionRecommendation(name='enable_autorotation', recommended_value=False,
+            help=_('Enable auto-rotation of images that are wider than the screen width.')
+        ),
+        OptionRecommendation(name='wordspace',
+            recommended_value=2.5, level=OptionRecommendation.LOW,
+            help=_('Set the space between words in pts. Default is %default')
+        ),
+        OptionRecommendation(name='header', recommended_value=False,
+            help=_('Add a header to all the pages with title and author.')
+        ),
+        OptionRecommendation(name='header_format', recommended_value="%t by %a",
+            help=_('Set the format of the header. %a is replaced by the author '
+            'and %t by the title. Default is %default')
+        ),
+        OptionRecommendation(name='header_separation', recommended_value=0,
+            help=_('Add extra spacing below the header. Default is %default pt.')
+        ),
+        OptionRecommendation(name='minimum_indent', recommended_value=0,
+            help=_('Minimum paragraph indent (the indent of the first line '
+            'of a paragraph) in pts. Default: %default')
+        ),
+        OptionRecommendation(name='render_tables_as_images',
+            recommended_value=False,
+            help=_('This option has no effect')
+        ),
+        OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
+            recommended_value=1.0,
+            help=_('Multiply the size of text in rendered tables by this '
+            'factor. Default is %default')
+        ),
+        OptionRecommendation(name='serif_family', recommended_value=None,
+            help=_('The serif family of fonts to embed')
+        ),
+        OptionRecommendation(name='sans_family', recommended_value=None,
+            help=_('The sans-serif family of fonts to embed')
+        ),
+        OptionRecommendation(name='mono_family', recommended_value=None,
+            help=_('The monospace family of fonts to embed')
+        ),
+
+    }
+
+    recommendations = {
+        ('change_justification', 'original', OptionRecommendation.HIGH)}
+
+    def convert_images(self, pages, opts, wide):
+        '''Write one full-page image per entry of pages to the LRF file at
+        opts.output; wide selects a 784x1012 page instead of 584x754.
+        The output file is closed even when rendering fails.'''
+        from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
+        from uuid import uuid4
+        from calibre.constants import __appname__, __version__
+
+        width, height = (784, 1012) if wide else (584, 754)
+
+        ps = {'topmargin': 0, 'evensidemargin': 0, 'oddsidemargin': 0,
+              'textwidth': width, 'textheight': height}
+        book = Book(title=opts.title, author=opts.author,
+                bookid=uuid4().hex,
+                publisher='%s %s'%(__appname__, __version__),
+                category=_('Comic'), pagestyledefault=ps,
+                booksetting=BookSetting(screenwidth=width, screenheight=height))
+        for page in pages:
+            imageStream = ImageStream(page)
+            _page = book.create_page()
+            _page.append(ImageBlock(refstream=imageStream,
+                        blockwidth=width, blockheight=height, xsize=width,
+                        ysize=height, x1=width, y1=height))
+            book.append(_page)
+
+        with open(opts.output, 'wb') as lrf_file:
+            book.renderLrf(lrf_file)
+
+    def flatten_toc(self):  # replace the book TOC with a one-level copy of its entries
+        from calibre.ebooks.oeb.base import TOC
+        flat = TOC()
+        for node in self.oeb.toc.iterdescendants():
+            flat.add(node.title, node.href)
+        self.oeb.toc = flat
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        '''Write the book to output_path as LRF: image collections become image
+        pages, everything else is exported as OEB and fed to process_file.'''
+        self.log, self.opts, self.oeb = log, opts, oeb
+        lrf_opts = LRFOptions(output_path, opts, oeb)
+
+        if input_plugin.is_image_collection:
+            images = input_plugin.get_images()
+            self.convert_images(images, lrf_opts, getattr(opts, 'wide', False))
+            return
+
+        self.flatten_toc()
+
+        from calibre.ptempfile import TemporaryDirectory
+        with TemporaryDirectory('_lrf_output') as tdir:
+            from calibre.customize.ui import plugin_for_output_format
+            plugin_for_output_format('oeb').convert(oeb, tdir, input_plugin, opts, log)
+            opf_name = [name for name in os.listdir(tdir) if name.endswith('.opf')][0]
+            from calibre.ebooks.lrf.html.convert_from import process_file
+            process_file(os.path.join(tdir, opf_name), lrf_opts, self.log)
--- a/ebook_converter/ebooks/conversion/plugins/mobi_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/mobi_input.py
@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import unicode_type
+
+
+class MOBIInput(InputFormatPlugin):
+
+    name        = 'MOBI Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
+    file_types  = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
+    commit_name = 'mobi_input'
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''Read a MOBI or KF8 book from stream and return the path of its OPF file.'''
+        self.is_kf8 = False
+        self.mobi_is_joint = False
+
+        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
+        from lxml import html
+        parse_cache = {}
+        try:
+            mr = MobiReader(stream, log, options.input_encoding,
+                        options.debug_pipeline)
+            if mr.kf8_type is None:
+                mr.extract_content('.', parse_cache)
+        except Exception:  # retry with the extra data fix; Ctrl-C and SystemExit are not trapped
+            mr = MobiReader(stream, log, options.input_encoding,
+                        options.debug_pipeline, try_extra_data_fix=True)
+            if mr.kf8_type is None:
+                mr.extract_content('.', parse_cache)
+
+        if mr.kf8_type is not None:
+            log('Found KF8 MOBI of type %r'%mr.kf8_type)
+            if mr.kf8_type == 'joint':
+                self.mobi_is_joint = True
+            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
+            mr = Mobi8Reader(mr, log)
+            opf = os.path.abspath(mr())
+            self.encrypted_fonts = mr.encrypted_fonts
+            self.is_kf8 = True
+            return opf
+
+        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
+        if raw:
+            if isinstance(raw, unicode_type):
+                raw = raw.encode('utf-8')
+            with lopen('debug-raw.html', 'wb') as f:
+                f.write(raw)
+        from calibre.ebooks.oeb.base import close_self_closing_tags
+        for f, root in parse_cache.items():
+            raw = html.tostring(root, encoding='utf-8', method='xml',
+                    include_meta_content_type=False)
+            raw = close_self_closing_tags(raw)
+            with lopen(f, 'wb') as q:
+                q.write(raw)
+        accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
+        return mr.created_opf_path
--- a/ebook_converter/ebooks/conversion/plugins/mobi_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/mobi_output.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from polyglot.builtins import unicode_type
+
+
+def remove_html_cover(oeb, log):
+    '''Drop the guide cover and unlink its page from the spine if a metadata cover exists.'''
+    from calibre.ebooks.oeb.base import OEB_DOCS
+    if not (oeb.metadata.cover and 'cover' in oeb.guide):
+        return
+    href = oeb.guide['cover'].href
+    del oeb.guide['cover']
+    item = oeb.manifest.hrefs[href]
+    if item.spine_position is None:
+        return
+    log.warn('Found an HTML cover: ', item.href, 'removing it.',
+            'If you find some content missing from the output MOBI, it '
+            'is because you misidentified the HTML cover in the input '
+            'document')
+    oeb.spine.remove(item)
+    if item.media_type in OEB_DOCS:
+        oeb.manifest.remove(item)
+
+
+def extract_mobi(output_path, opts):
+    '''Unpack the written file into opts.extract_to, if that option is set.'''
+    if opts.extract_to is not None:
+        from calibre.ebooks.mobi.debug.main import inspect_mobi
+        inspect_mobi(output_path, ddir=opts.extract_to)
+
+
+class MOBIOutput(OutputFormatPlugin):
+
+    name = 'MOBI Output'
+    author = 'Kovid Goyal'
+    file_type = 'mobi'
+    commit_name = 'mobi_output'
+    ui_data = {'file_types': ['old', 'both', 'new']}
+
+    options = {
+        OptionRecommendation(name='prefer_author_sort',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('When present, use author sort field as author.')
+        ),
+        OptionRecommendation(name='no_inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Don\'t add Table of Contents to the book. Useful if '
+                'the book has its own table of contents.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+        OptionRecommendation(name='dont_compress',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Disable compression of the file contents.')
+        ),
+        OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
+            help=_('Tag for MOBI files to be marked as personal documents.'
+                   ' This option has no effect on the conversion. It is used'
+                   ' only when sending MOBI files to a device. If the file'
+                   ' being sent has the specified tag, it will be marked as'
+                   ' a personal document when sent to the Kindle.')
+        ),
+        OptionRecommendation(name='mobi_ignore_margins',
+            recommended_value=False,
+            help=_('Ignore margins in the input document. If False, then '
+                'the MOBI output plugin will try to convert margins specified'
+                ' in the input document, otherwise it will ignore them.')
+        ),
+        OptionRecommendation(name='mobi_toc_at_start',
+            recommended_value=False,
+            help=_('When adding the Table of Contents to the book, add it at the start of the '
+                'book instead of the end. Not recommended.')
+        ),
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'MOBI'
+        ),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.')
+        ),
+        OptionRecommendation(name='mobi_keep_original_images',
+            recommended_value=False,
+            help=_('By default calibre converts all images to JPEG format '
+                'in the output MOBI file. This is for maximum compatibility '
+                'as some older MOBI viewers have problems with other image '
+                'formats. This option tells calibre not to do this. '
+                'Useful if your document contains lots of GIF/PNG images that '
+                'become very large when converted to JPEG.')),
+        OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
+            help=_('By default calibre generates MOBI files that contain the '
+                'old MOBI 6 format. This format is compatible with all '
+                'devices. However, by changing this setting, you can tell '
+                'calibre to generate MOBI files that contain both MOBI 6 and '
+                'the new KF8 format, or only the new KF8 format. KF8 has '
+                'more features than MOBI 6, but only works with newer Kindles. '
+                'Allowed values: {}').format('old, both, new')),
+
+    }
+
+    def check_for_periodical(self):
+        '''Record on opts whether this is a periodical, preparing TOC and masthead if so.'''
+        periodical = bool(self.is_periodical)
+        if periodical:
+            self.periodicalize_toc()
+            self.check_for_masthead()
+        self.opts.mobi_periodical = periodical
+
+    def check_for_masthead(self):
+        '''Ensure the guide has a masthead entry, generating a default image if needed.'''
+        if 'masthead' in self.oeb.guide:
+            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
+            return
+        from calibre.ebooks import generate_masthead
+        self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
+        raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
+        item_id, href = self.oeb.manifest.generate('masthead', 'masthead')
+        self.oeb.manifest.add(item_id, href, 'image/gif', data=raw)
+        self.oeb.guide.add('masthead', 'Masthead Image', href)
+
+    def periodicalize_toc(self):  # reshape the TOC into periodical > section > article
+        from calibre.ebooks.oeb.base import TOC
+        toc = self.oeb.toc
+        if not toc or len(self.oeb.spine) < 3:  # empty TOC or fewer than 3 spine items: leave as is
+            return
+        if toc and toc[0].klass != 'periodical':  # skip TOCs whose first node is already a periodical
+            one, two = self.oeb.spine[0], self.oeb.spine[1]
+            self.log('Converting TOC for MOBI periodical indexing...')
+
+            articles = {}
+            if toc.depth() < 3:
+                # single section periodical
+                self.oeb.manifest.remove(one)
+                self.oeb.manifest.remove(two)
+                sections = [TOC(klass='section', title=_('All articles'),
+                    href=self.oeb.spine[0].href)]
+                for x in toc:
+                    sections[0].nodes.append(x)
+            else:
+                # multi-section periodical
+                self.oeb.manifest.remove(one)
+                sections = list(toc)
+                for i,x in enumerate(sections):  # NOTE(review): i is unused
+                    x.klass = 'section'
+                    articles_ = list(x)
+                    if articles_:
+                        self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
+                        x.href = articles_[0].href
+
+            for sec in sections:  # detach each section's children, marking them as articles
+                articles[id(sec)] = []
+                for a in list(sec):
+                    a.klass = 'article'
+                    articles[id(sec)].append(a)
+                    sec.nodes.remove(a)
+
+            root = TOC(klass='periodical', href=self.oeb.spine[0].href,
+                    title=unicode_type(self.oeb.metadata.title[0]))
+
+            for s in sections:  # re-attach; sections without articles are left out of root
+                if articles[id(s)]:
+                    for a in articles[id(s)]:
+                        s.nodes.append(a)
+                    root.nodes.append(s)
+
+            for x in list(toc.nodes):  # empty the existing TOC in place
+                toc.nodes.remove(x)
+
+            toc.nodes.append(root)
+
+            # Fix up the periodical href to point to first section href
+            toc.nodes[0].href = toc.nodes[0].nodes[0].href
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        '''Write the book to output_path as MOBI 6, KF8, or both, per opts.mobi_file_type.'''
+        from calibre.ebooks.mobi.writer2.resources import Resources
+        self.log, self.opts, self.oeb = log, opts, oeb
+
+        mobi_type = opts.mobi_file_type
+        if self.is_periodical:
+            mobi_type = 'old'  # Amazon does not support KF8 periodicals
+        create_kf8 = mobi_type in ('new', 'both')
+
+        remove_html_cover(self.oeb, self.log)
+        resources = Resources(oeb, opts, self.is_periodical, add_fonts=create_kf8)
+        self.check_for_periodical()
+
+        kf8 = None
+        if create_kf8:
+            from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
+            remove_duplicate_anchors(self.oeb)
+            # Split on pagebreaks so that the resulting KF8 is faster to load
+            from calibre.ebooks.oeb.transforms.split import Split
+            Split()(self.oeb, self.opts)
+            kf8 = self.create_kf8(resources, for_joint=(mobi_type == 'both'))
+
+        if mobi_type == 'new':
+            kf8.write(output_path)
+            extract_mobi(output_path, opts)
+            return
+
+        self.log('Creating MOBI 6 output')
+        self.write_mobi(input_plugin, output_path, kf8, resources)
+
+    def create_kf8(self, resources, for_joint=False):
+        '''Return create_kf8_book's result for self.oeb, self.opts and resources.'''
+        from calibre.ebooks.mobi.writer8.main import create_kf8_book
+        return create_kf8_book(self.oeb, self.opts, resources, for_joint=for_joint)
+
+    def write_mobi(self, input_plugin, output_path, kf8, resources):
+        '''Apply the MOBI 6 transforms to self.oeb, then write it (together with
+        kf8, which may be None) to output_path.'''
+        from calibre.ebooks.mobi.mobiml import MobiMLizer
+        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
+        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
+        from calibre.customize.ui import plugin_for_input_format
+
+        opts, oeb = self.opts, self.oeb
+        if not opts.no_inline_toc:
+            toc_position = 'start' if opts.mobi_toc_at_start else 'end'
+            HTMLTOCAdder(title=opts.toc_title, position=toc_position)(oeb, opts)
+        CaseMangler()(oeb, opts)
+        try:
+            SVGRasterizer()(oeb, opts)
+        except Unavailable:
+            self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
+        else:
+            # Add rasterized SVG images
+            resources.add_extra_images()
+        if hasattr(self.oeb, 'inserted_metadata_jacket'):
+            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
+        MobiMLizer(ignore_tables=opts.linearize_tables)(oeb, opts)
+
+        # True unless the book came in through the cbz input plugin
+        page_breaks = input_plugin is not plugin_for_input_format('cbz')
+        from calibre.ebooks.mobi.writer2.main import MobiWriter
+        writer = MobiWriter(opts, resources, kf8,
+                        write_page_breaks_after_item=page_breaks)
+        writer(oeb, output_path)
+        extract_mobi(output_path, opts)
+
+    def specialize_css_for_output(self, log, opts, item, stylizer):  # apply CSSCleanup to item
+        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
+        CSSCleanup(log, opts)(item, stylizer)
+
+    def workaround_fire_bugs(self, jacket):
+        # The Kindle Fire crashes when trying to render the table used to lay
+        # out the jacket, so tables and rows become divs and cells become spans
+        from calibre.ebooks.oeb.base import XHTML
+        for table in jacket.data.xpath('//*[local-name()="table"]'):
+            table.tag = XHTML('div')
+            for tr in table.xpath('descendant::*[local-name()="tr"]'):
+                cols = tr.xpath('descendant::*[local-name()="td"]')
+                tr.tag = XHTML('div')
+                for td in cols:  # cols is non-empty inside the loop: cells are always spans
+                    td.tag = XHTML('span')
+
+
+class AZW3Output(OutputFormatPlugin):
+
+    name = 'AZW3 Output'
+    author = 'Kovid Goyal'
+    file_type = 'azw3'
+    commit_name = 'azw3_output'
+
+    options = {
+        OptionRecommendation(name='prefer_author_sort',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('When present, use author sort field as author.')
+        ),
+        OptionRecommendation(name='no_inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Don\'t add Table of Contents to the book. Useful if '
+                'the book has its own table of contents.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')
+        ),
+        OptionRecommendation(name='dont_compress',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Disable compression of the file contents.')
+        ),
+        OptionRecommendation(name='mobi_toc_at_start',
+            recommended_value=False,
+            help=_('When adding the Table of Contents to the book, add it at the start of the '
+                'book instead of the end. Not recommended.')
+        ),
+        OptionRecommendation(name='extract_to',
+            help=_('Extract the contents of the generated %s file to the '
+                'specified directory. The contents of the directory are first '
+                'deleted, so be careful.') % 'AZW3'),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.')
+        ),
+    }
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        '''Write the book to output_path as a KF8 book built by create_kf8_book.'''
+        from calibre.ebooks.mobi.writer2.resources import Resources
+        from calibre.ebooks.mobi.writer8.main import create_kf8_book
+        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
+
+        self.oeb, self.opts, self.log = oeb, opts, log
+        opts.mobi_periodical = self.is_periodical
+        passthrough = getattr(opts, 'mobi_passthrough', False)
+        remove_duplicate_anchors(oeb)
+        resources = Resources(
+            self.oeb, self.opts, self.is_periodical, add_fonts=True, process_images=False)
+
+        if not passthrough:
+            remove_html_cover(self.oeb, self.log)
+            # Split on pagebreaks so that the resulting KF8 is faster to load
+            from calibre.ebooks.oeb.transforms.split import Split
+            splitter = Split()
+            splitter(self.oeb, self.opts)
+
+        book = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
+        book.write(output_path)
+        extract_mobi(output_path, opts)
+
+    def specialize_css_for_output(self, log, opts, item, stylizer):  # apply CSSCleanup to item
+        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
+        CSSCleanup(log, opts)(item, stylizer)
--- a/ebook_converter/ebooks/conversion/plugins/odt_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/odt_input.py
@@ -0,0 +1,25 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Convert an ODT file into an Open Ebook
+'''
+
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class ODTInput(InputFormatPlugin):
+
+    name        = 'ODT Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert ODT (OpenOffice) files to HTML'
+    file_types  = {'odt'}
+    commit_name = 'odt_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''Run the ODT Extract helper on stream with '.' as target; return its result.'''
+        from calibre.ebooks.odt.input import Extract
+        return Extract()(stream, '.', log)
--- a/ebook_converter/ebooks/conversion/plugins/oeb_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/oeb_output.py
@@ -0,0 +1,122 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re
+
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from calibre import CurrentDir
+
+
+class OEBOutput(OutputFormatPlugin):
+
+    name = 'OEB Output'
+    author = 'Kovid Goyal'
+    file_type = 'oeb'
+    commit_name = 'oeb_output'
+
+    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''Write the OPF/NCX/page-map files and every manifest item under output_path.'''
+        from polyglot.urllib import unquote
+        from lxml import etree
+        self.log, self.opts = log, opts
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
+        from calibre.ebooks.oeb.normalize_css import condense_sheet
+        with CurrentDir(output_path):
+            results = oeb_book.to_opf2(page_map=True)
+            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
+                href, root = results.pop(key, [None, None])
+                if root is not None:
+                    if key == OPF_MIME:
+                        try:
+                            self.workaround_nook_cover_bug(root)
+                        except Exception:  # best effort, but do not trap Ctrl-C/SystemExit
+                            self.log.exception('Something went wrong while trying to'
+                                    ' workaround Nook cover bug, ignoring')
+                        try:
+                            self.workaround_pocketbook_cover_bug(root)
+                        except Exception:  # best effort, but do not trap Ctrl-C/SystemExit
+                            self.log.exception('Something went wrong while trying to'
+                                    ' workaround Pocketbook cover bug, ignoring')
+                        self.migrate_lang_code(root)
+                    raw = etree.tostring(root, pretty_print=True,
+                            encoding='utf-8', xml_declaration=True)
+                    if key == OPF_MIME:
+                        # Needed as I can't get lxml to output opf:role and
+                        # not output <opf:metadata> as well
+                        raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
+                    with lopen(href, 'wb') as f:
+                        f.write(raw)
+
+            for item in oeb_book.manifest:
+                if (
+                        not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
+                            item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
+                    condense_sheet(item.data)
+                path = os.path.abspath(unquote(item.href))
+                dirpath = os.path.dirname(path)
+                if not os.path.exists(dirpath):
+                    os.makedirs(dirpath)
+                with lopen(path, 'wb') as f:
+                    f.write(item.bytes_representation)
+                item.unload_data_from_memory(memory=path)
+
+    def workaround_nook_cover_bug(self, root):  # {{{
+        '''Give the cover image manifest item the id "cover", renaming any other user of it.'''
+        cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
+                ' @content != "cover"]')
+
+        def manifest_items_with_id(id_):
+            return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
+                ' and @id="%s"]'%id_)
+
+        if len(cov) != 1:
+            return
+        meta = cov[0]
+        covid = meta.get('content', '')
+        if not covid:
+            return
+        matches = manifest_items_with_id(covid)
+        if len(matches) != 1 or not matches[0].get(
+                'media-type', '').startswith('image/'):
+            return
+
+        self.log.warn('The cover image has an id != "cover". Renaming'
+                ' to work around bug in Nook Color')
+        from calibre.ebooks.oeb.base import uuid_id
+        newid = uuid_id()
+        # Move whatever currently uses the id "cover" out of the way first
+        for item in manifest_items_with_id('cover'):
+            item.set('id', newid)
+        for x in root.xpath('//*[@idref="cover"]'):
+            x.set('idref', newid)
+
+        matches[0].set('id', 'cover')
+        meta.set('content', 'cover')
+    # }}}
+
+    def workaround_pocketbook_cover_bug(self, root):  # {{{
+        '''Move the manifest item with id "cover" to the front of its parent.'''
+        hits = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
+                ' and @id="cover"]')
+        if len(hits) == 1:
+            cover, parent = hits[0], hits[0].getparent()
+            parent.remove(cover)
+            parent.insert(0, cover)
+    # }}}
+
+    def migrate_lang_code(self, root):  # {{{
+        from calibre.utils.localization import lang_as_iso639_1
+        for lang in root.xpath('//*[local-name() = "language"]'):  # every "language" element, any namespace
+            clc = lang_as_iso639_1(lang.text)
+            if clc:  # keep the original text when no replacement code is returned
+                lang.text = clc
+    # }}}
--- a/ebook_converter/ebooks/conversion/plugins/pdb_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/pdb_input.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class PDBInput(InputFormatPlugin):
+    '''Input plugin that hands a PDB container to the reader registered for its identity.'''
+    name        = 'PDB Input'
+    author      = 'John Schember'
+    description = 'Convert PDB to HTML'
+    file_types  = {'pdb', 'updb'}
+    commit_name = 'pdb_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''
+        Read the PDB header from `stream`, pick the reader registered for its
+        identity and return what that reader's extract_content() returns for
+        the current working directory. Raises PDBError if there is no reader.
+        '''
+        from calibre.ebooks.pdb.header import PdbHeaderReader
+        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+
+        header = PdbHeaderReader(stream)
+        ident = header.ident
+        reader_class = get_reader(ident)
+        if reader_class is None:
+            book_type = IDENTITY_TO_NAME.get(ident, _('Unknown'))
+            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' % (ident, book_type))
+
+        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[ident], ident))
+        return reader_class(header, stream, log, options).extract_content(getcwd())
--- a/ebook_converter/ebooks/conversion/plugins/pdb_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/pdb_output.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
+
+
+class PDBOutput(OutputFormatPlugin):
+    '''Output plugin that writes a book into a PDB container using one of ALL_FORMAT_WRITERS.'''
+    name = 'PDB Output'
+    author = 'John Schember'
+    file_type = 'pdb'
+    commit_name = 'pdb_output'
+    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}
+
+    options = {
+        OptionRecommendation(name='format', recommended_value='doc',
+            level=OptionRecommendation.LOW,
+            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
+            help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
+        OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is cp1252. Note: This option is not honored by all '
+            'formats.')),
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.')),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''
+        Write `oeb_book` to `output_path` (a path, or a stream that is left open)
+        with the writer for opts.format. Raises PDBError if there is no such writer.
+        '''
+        # Resolve the writer first: an unknown format must not create or leak the output file
+        Writer = get_writer(opts.format)
+        if Writer is None:
+            raise PDBError('No writer available for format %s.' % opts.format)
+        close = not hasattr(output_path, 'write')
+        if close:
+            dirname = os.path.dirname(output_path)
+            if dirname and not os.path.exists(dirname):
+                os.makedirs(dirname)
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+        try:
+            setattr(opts, 'max_line_length', 0)
+            setattr(opts, 'force_max_line_length', False)
+            writer = Writer(opts, log)
+            out_stream.seek(0)
+            out_stream.truncate()
+            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
+        finally:
+            if close:
+                out_stream.close()
--- a/ebook_converter/ebooks/conversion/plugins/pdf_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/pdf_input.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import as_bytes, getcwd
+
+
+class PDFInput(InputFormatPlugin):
+    '''Input plugin that converts PDF files to HTML by running pdftohtml.'''
+    name        = 'PDF Input'
+    author      = 'Kovid Goyal and John Schember'
+    description = 'Convert PDF files to HTML'
+    file_types  = {'pdf'}
+    commit_name = 'pdf_input'
+
+    options = {
+        OptionRecommendation(name='no_images', recommended_value=False,
+            help=_('Do not extract images from the document')),
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
+            help=_('Scale used to determine the length at which a line should '
+            'be unwrapped. Valid values are a decimal between 0 and 1. The '
+            'default is 0.45, just below the median line length.')),
+        OptionRecommendation(name='new_pdf_engine', recommended_value=False,
+            help=_('Use the new PDF conversion engine. Currently not operational.'))
+    }
+
+    def convert_new(self, stream, accelerators):
+        ''' Run pdftohtml in XML mode, feed index.xml to PDFDocument, return the metadata.opf path. '''
+        from calibre.ebooks.pdf.pdftohtml import pdftohtml
+        from calibre.utils.cleantext import clean_ascii_chars
+        from calibre.ebooks.pdf.reflow import PDFDocument
+        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
+        with lopen('index.xml', 'rb') as xml_file:
+            raw_xml = xml_file.read()
+        PDFDocument(clean_ascii_chars(raw_xml), self.opts, self.log)
+        return os.path.join(getcwd(), 'metadata.opf')
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''
+        Convert the PDF in `stream` to index.html in the current directory and
+        return the path of the metadata.opf written next to it.
+        '''
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.pdf.pdftohtml import pdftohtml
+
+        log.debug('Converting file to html...')
+        self.opts, self.log = options, log
+        if options.new_pdf_engine:
+            return self.convert_new(stream, accelerators)
+        # The main html file will be named index.html
+        pdftohtml(getcwd(), stream.name, options.no_images)
+
+        from calibre.ebooks.metadata.meta import get_metadata
+        log.debug('Retrieving document metadata...')
+        mi = get_metadata(stream, 'pdf')
+        opf = OPFCreator(getcwd(), mi)
+
+        # index.html comes first, followed by every other entry of the directory
+        extras = os.listdir(getcwd())
+        extras.remove('index.html')
+        manifest = [('index.html', None)] + [(name, None) for name in extras]
+        log.debug('Generating manifest...')
+        opf.create_manifest(manifest)
+        opf.create_spine(['index.html'])
+        log.debug('Rendering manifest...')
+        with lopen('metadata.opf', 'wb') as opffile:
+            opf.render(opffile)
+
+        ncxid = opf.manifest.id_for_path('toc.ncx') if os.path.exists('toc.ncx') else None
+        if ncxid:
+            # Patch the rendered OPF in place so that <spine> carries toc="<ncx id>"
+            with lopen('metadata.opf', 'r+b') as f:
+                patched = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
+                f.seek(0)
+                f.write(patched)
+        return os.path.join(getcwd(), 'metadata.opf')
--- a/ebook_converter/ebooks/conversion/plugins/pdf_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/pdf_output.py
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Convert OEB ebook format to PDF.
+'''
+
+import glob, os
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+    OptionRecommendation)
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import iteritems, unicode_type
+
+UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
+        'cicero', 'devicepixel')
+
+PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
+        'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
+
+
+class PDFOutput(OutputFormatPlugin):
+    '''Output plugin that converts an OEB book to PDF; the attributes below declare its options.'''
+    name = 'PDF Output'
+    author = 'Kovid Goyal'
+    file_type = 'pdf'
+    commit_name = 'pdf_output'
+    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}
+
+    options = {
+        OptionRecommendation(name='use_profile_size', recommended_value=False,
+            help=_('Instead of using the paper size specified in the PDF Output options,'
+                   ' use a paper size corresponding to the current output profile.'
+                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
+        OptionRecommendation(name='unit', recommended_value='inch',
+            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
+            help=_('The unit of measure for page sizes. Default is inch. Choices '
+            'are {} '
+            'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
+        OptionRecommendation(name='paper_size', recommended_value='letter',
+            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
+            help=_('The size of the paper. This size will be overridden when a '
+            'non default output profile is used. Default is letter. Choices '
+            'are {}').format(', '.join(PAPER_SIZES))),
+        OptionRecommendation(name='custom_size', recommended_value=None,
+            help=_('Custom size of the document. Use the form widthxheight '
+            'e.g. `123x321` to specify the width and height. '
+            'This overrides any specified paper-size.')),
+        OptionRecommendation(name='preserve_cover_aspect_ratio',
+            recommended_value=False,
+            help=_('Preserve the aspect ratio of the cover, instead'
+                ' of stretching it to fill the full first page of the'
+                ' generated pdf.')),
+        OptionRecommendation(name='pdf_serif_family',
+            recommended_value='Times', help=_(
+                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
+        OptionRecommendation(name='pdf_sans_family',
+            recommended_value='Helvetica', help=_(
+                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
+        OptionRecommendation(name='pdf_mono_family',
+            recommended_value='Courier', help=_(
+                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
+        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
+            recommended_value='serif', help=_(
+                'The type of font to use as the standard font: serif, sans or mono')),
+        OptionRecommendation(name='pdf_default_font_size',
+            recommended_value=20, help=_(
+                'The default font size')),
+        OptionRecommendation(name='pdf_mono_font_size',
+            recommended_value=16, help=_(
+                'The default font size for monospaced text')),
+        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
+            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
+        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
+            help=_('Surround all links with a red box, useful for debugging.')),
+        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
+            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
+                   'specify a footer template, it will take precedence '
+                   'over this option.')),
+        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
+            help=_('An HTML template used to generate %s on every page.'
+                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
+        OptionRecommendation(name='pdf_header_template', recommended_value=None,
+            help=_('An HTML template used to generate %s on every page.'
+                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
+        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
+            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
+                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
+        OptionRecommendation(name='toc_title', recommended_value=None,
+            help=_('Title for generated table of contents.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the left page margin, in pts. Default is 72pt.'
+                   ' Overrides the common left page margin setting.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the top page margin, in pts. Default is 72pt.'
+                   ' Overrides the common top page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the right page margin, in pts. Default is 72pt.'
+                   ' Overrides the common right page margin setting, unless set to zero.')
+        ),
+
+        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
+            level=OptionRecommendation.LOW,
+            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
+                   ' Overrides the common bottom page margin setting, unless set to zero.')
+        ),
+        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
+            help=_('Use the page margins specified in the input document via @page CSS rules.'
+            ' This will cause the margins specified in the conversion settings to be ignored.'
+            ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
+        ),
+        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
+            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
+                ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
+        ),
+        OptionRecommendation(name='uncompressed_pdf',
+            recommended_value=False, help=_(
+                'Generate an uncompressed PDF, useful for debugging.')
+        ),
+        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
+            level=OptionRecommendation.LOW,
+            help=_(
+                'Shift the text horizontally by the specified offset (in pts).'
+                ' On odd numbered pages, it is shifted to the right and on even'
+                ' numbered pages to the left. Use negative numbers for the opposite'
+                ' effect. Note that this setting is ignored on pages where the margins'
+                ' are smaller than the specified offset. Shifting is done by setting'
+                ' the PDF CropBox, not all software respects the CropBox.'
+            )
+        ),
+
+    }
+
+    def specialize_options(self, log, opts, input_fmt):
+        # Ensure Qt is setup to be used with WebEngine
+        # specialize_options is called early enough in the pipeline
+        # that hopefully no Qt application has been constructed as yet
+        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
+        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
+        from calibre.gui2 import must_use_qt
+        from calibre.constants import FAKE_PROTOCOL
+        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
+        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
+        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
+        QWebEngineUrlScheme.registerScheme(scheme)  # the scheme is registered before must_use_qt() runs
+        must_use_qt()
+        self.input_fmt = input_fmt
+        # Besides the Qt setup above, adjust the one option this plugin needs changed
+        if opts.pdf_use_document_margins:
+            # Prevent the conversion pipeline from overwriting document margins
+            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        ''' Record the conversion state on self, then convert the book as images or as text. '''
+        from io import BytesIO
+        from lxml import etree
+        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
+        from calibre.ebooks.metadata.opf2 import OPF as OPFReader
+
+        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
+        self.oeb, self.output_path = oeb_book, output_path
+        self.input_plugin, self.opts, self.log = input_plugin, opts, log
+        # Serialize the book's metadata into a scratch OPF 2.0 <package> and read it back
+        package = etree.Element(
+            opf_tag('package'), nsmap={None: OPF2_NS},
+            attrib={'version': '2.0', 'unique-identifier': 'dummy'})
+        self.oeb.metadata.to_opf2(package)
+        self.metadata = OPFReader(BytesIO(etree.tostring(package))).to_book_metadata()
+        self.cover_data = None
+        if input_plugin.is_image_collection:
+            log.debug('Converting input as an image collection...')
+            self.convert_images(input_plugin.get_images())
+        else:
+            log.debug('Converting input as a text based book...')
+            self.convert_text(oeb_book)
+
+    def convert_images(self, images):  # hands the images to image_writer.convert()
+        from calibre.ebooks.pdf.image_writer import convert as write_images
+        write_images(images, self.output_path, self.opts, self.metadata, self.report_progress)
+
+    def get_cover_data(self):
+        ''' Set self.cover_data from the manifest item named by the first cover metadata entry. '''
+        covers, manifest_ids = self.oeb.metadata.cover, self.oeb.manifest.ids
+        cover_id = unicode_type(covers[0]) if covers else None
+        if cover_id is not None and cover_id in manifest_ids:
+            self.cover_data = manifest_ids[cover_id].data
+
+    def process_fonts(self):
+        ''' Make sure all fonts are embeddable '''
+        from calibre.ebooks.oeb.base import urlnormalize
+        from calibre.utils.fonts.utils import remove_embed_restriction
+
+        # Catch Exception rather than using a bare except below, so that
+        # KeyboardInterrupt/SystemExit still propagate.
+        processed = set()  # font paths already handled; each is rewritten at most once
+        for item in list(self.oeb.manifest):
+            if not hasattr(item.data, 'cssRules'):
+                continue  # no CSS rules to inspect on this item
+            for rule in item.data.cssRules:
+                if rule.type != rule.FONT_FACE_RULE:
+                    continue
+                try:
+                    src = rule.style.getProperty('src').propertyValue[0].uri
+                except Exception:  # @font-face rule without a usable src: skip it
+                    continue
+                path = item.abshref(src)
+                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
+                if ff is None or path in processed:
+                    continue
+                processed.add(path)
+                raw = ff.data
+                try:
+                    nraw = remove_embed_restriction(raw)
+                except Exception:  # best effort: leave such fonts as they are
+                    continue
+                if nraw != raw:
+                    ff.data = nraw
+                    self.oeb.container.write(path, nraw)
+
+    def convert_text(self, oeb_book):
+        ''' Write oeb_book out as OEB in a temporary directory and pass its OPF to html_writer.convert(). '''
+        import json
+        from calibre.ebooks.pdf.html_writer import convert as write_pdf
+        self.get_cover_data()
+        self.process_fonts()
+
+        if self.opts.pdf_use_document_margins and self.stored_page_margins:
+            # Attach each document's stored margins to its root element as JSON
+            for href, margins in iteritems(self.stored_page_margins):
+                item = oeb_book.manifest.hrefs.get(href)
+                if item is None:
+                    continue
+                root = item.data
+                if margins and hasattr(root, 'xpath'):
+                    root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))
+
+        with TemporaryDirectory('_pdf_out') as tdir:
+            from calibre.customize.ui import plugin_for_output_format
+            oeb_dir = os.path.realpath(tdir)
+            plugin_for_output_format('oeb').convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
+            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
+            write_pdf(opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
+                      log=self.log, cover_data=self.cover_data, report_progress=self.report_progress)
--- a/ebook_converter/ebooks/conversion/plugins/pml_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/pml_input.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import glob
+import os
+import shutil
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import getcwd
+
+
+class PMLInput(InputFormatPlugin):
+    '''Input plugin for PML files and PMLZ archives (see file_types below).'''
+    name        = 'PML Input'
+    author      = 'John Schember'
+    description = 'Convert PML to OEB'
+    # pmlz is a zip file containing pml files and png images.
+    file_types  = {'pml', 'pmlz'}
+    commit_name = 'pml_input'
+
+    def process_pml(self, pml_path, html_path, close_all=False):
+        '''
+        Convert one PML document to HTML and return the TOC collected by the parser.
+
+        `pml_path` and `html_path` may each be a path or an open stream; only
+        streams opened here are closed. `close_all` is accepted but not used.
+        '''
+        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer
+
+        pclose = not hasattr(pml_path, 'read')
+        hclose = not hasattr(html_path, 'write')
+        pml_stream = lopen(pml_path, 'rb') if pclose else pml_path
+        html_stream = None
+        try:
+            if not pclose:
+                pml_stream.seek(0)
+            html_stream = lopen(html_path, 'wb') if hclose else html_path
+
+            # An explicit input_encoding option wins, then the stream's own
+            # encoding attribute, then cp1252.
+            ienc = self.options.input_encoding or getattr(pml_stream, 'encoding', None)
+            if not ienc:
+                ienc = 'cp1252'
+
+            self.log.debug('Converting PML to HTML...')
+            hizer = PML_HTMLizer()
+            html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
+            html = '<html><head><title></title></head><body>%s</body></html>'%html
+            html_stream.write(html.encode('utf-8', 'replace'))
+        finally:
+            # Close what was opened here even if reading, decoding or parsing fails
+            if pclose:
+                pml_stream.close()
+            if hclose and html_stream is not None:
+                html_stream.close()
+
+        return hizer.get_toc()
+
+    def get_images(self, stream, tdir, top_level=False):
+        '''
+        Copy the book's PNG images into <cwd>/images and return their hrefs
+        ('images/<name>'). Searches `tdir` itself (only if `top_level`), then
+        <bookname>_img, then images; the first location with PNGs is used.
+        '''
+        imgs = glob.glob(os.path.join(tdir, '*.png')) if top_level else []
+        # Images not in top level try bookname_img directory because
+        # that's where Dropbook likes to see them.
+        if not imgs and hasattr(stream, 'name'):
+            bookname = os.path.splitext(os.path.basename(stream.name))[0]
+            imgs = glob.glob(os.path.join(tdir, bookname + '_img', '*.png'))
+        # No images in Dropbook location try generic images directory
+        if not imgs:
+            imgs = glob.glob(os.path.join(tdir, 'images', '*.png'))
+        dest_dir = os.path.join(getcwd(), 'images')
+        # Tolerate a pre-existing images directory instead of failing in makedirs()
+        if imgs and not os.path.exists(dest_dir):
+            os.makedirs(dest_dir)
+        images = []
+        for img in imgs:
+            pimg_name = os.path.basename(img)
+            shutil.copy(img, os.path.join(dest_dir, pimg_name))
+            images.append('images/' + pimg_name)
+        return images
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Convert a PML file, or a PMLZ archive of PML files and PNG images, to
+        HTML pages in the current directory. Writes metadata.opf and toc.ncx
+        there and returns the path of metadata.opf.
+        '''
+        from calibre.ebooks.metadata.toc import TOC
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.utils.zipfile import ZipFile
+
+        self.options = options
+        self.log = log
+        pages, images = [], []
+        toc = TOC()
+
+        if file_ext == 'pmlz':
+            log.debug('De-compressing content to temporary directory...')
+            with TemporaryDirectory('_unpmlz') as tdir:
+                zf = ZipFile(stream)
+                zf.extractall(tdir)
+
+                # Process the PML files in sorted order so that the combined TOC
+                # follows the same (alphabetical) order as the spine built from
+                # `pages` below; glob() returns names in arbitrary order.
+                for pml in sorted(glob.glob(os.path.join(tdir, '*.pml'))):
+                    html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
+                    pages.append(html_name)
+                    log.debug('Processing PML item %s...' % pml)
+                    toc += self.process_pml(pml, os.path.join(getcwd(), html_name))
+                images = self.get_images(stream, tdir, True)
+        else:
+            toc = self.process_pml(stream, 'index.html')
+            pages.append('index.html')
+
+            if hasattr(stream, 'name'):
+                images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))
+
+        # We want pages to be ordered alphabetically.
+        pages.sort()
+        manifest_items = [(item, None) for item in pages+images]
+        from calibre.ebooks.metadata.meta import get_metadata
+        log.debug('Reading metadata from input file...')
+        mi = get_metadata(stream, 'pml')
+        if 'images/cover.png' in images:
+            mi.cover = 'images/cover.png'
+        opf = OPFCreator(getcwd(), mi)
+        log.debug('Generating manifest...')
+        opf.create_manifest(manifest_items)
+        opf.create_spine(pages)
+        opf.set_toc(toc)
+        with lopen('metadata.opf', 'wb') as opffile:
+            with lopen('toc.ncx', 'wb') as tocfile:
+                opf.render(opffile, tocfile, 'toc.ncx')
+
+        return os.path.join(getcwd(), 'metadata.opf')
+
+    def postprocess_book(self, oeb, opts, log):
+        from calibre.ebooks.oeb.base import XHTML, barename
+        for item in oeb.spine:
+            if hasattr(item.data, 'xpath'):
+                for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
+                    if not len(heading):  # heading has no child elements
+                        continue
+                    span = heading[0]
+                    if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':  # heading starts with an empty <span>
+                        if not heading.get('id') and span.get('id'):  # move the span's id onto the heading, then drop the span
+                            heading.set('id', span.get('id'))
+                            heading.text = span.tail
+                            heading.remove(span)
+                    if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
+                        div = heading[0]
+                        if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):  # fold a lone centering <div> into the heading
+                            heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
+                            heading.remove(div)
+                            heading.set('style', 'text-align: center')
--- a/ebook_converter/ebooks/conversion/plugins/pml_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/pml_output.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, io
+
+from calibre.customize.conversion import (OutputFormatPlugin,
+        OptionRecommendation)
+from calibre.ptempfile import TemporaryDirectory
+from polyglot.builtins import unicode_type
+
+
+class PMLOutput(OutputFormatPlugin):
+    '''Output plugin that writes a book as a PMLZ archive (PML text plus PNG images).'''
+    name = 'PML Output'
+    author = 'John Schember'
+    file_type = 'pmlz'
+    commit_name = 'pml_output'
+
+    options = {
+        OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is cp1252.')),
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.')),
+        OptionRecommendation(name='full_image_depth',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not reduce the size or bit depth of images. Images '
+                   'have their size and depth reduced by default to accommodate '
+                   'applications that can not convert images on their '
+                   'own such as Dropbook.')),
+    }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        ''' Write oeb_book as index.pml plus an index_img directory, zipped into output_path. '''
+        from calibre.ebooks.pml.pmlml import PMLMLizer
+        from calibre.utils.zipfile import ZipFile
+
+        with TemporaryDirectory('_pmlz_output') as tdir:
+            pmlmlizer = PMLMLizer(log)
+            pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
+            with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
+                out.write(pml.encode(opts.pml_output_encoding, 'replace'))
+            img_path = os.path.join(tdir, 'index_img')
+            if not os.path.exists(img_path):
+                os.makedirs(img_path)
+            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)
+            log.debug('Compressing output...')
+            # Close the archive explicitly instead of leaving it to garbage collection
+            with ZipFile(output_path, 'w') as pmlz:
+                pmlz.add_dir(tdir)
+
+    def write_images(self, manifest, image_hrefs, out_dir, opts):
+        '''
+        Save every raster image of `manifest` whose href is in `image_hrefs` as
+        PNG in `out_dir`, under the name `image_hrefs` maps that href to. Unless
+        opts.full_image_depth is set, images are palettized and fit into 300x300.
+        '''
+        from PIL import Image
+        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+
+        for item in manifest:
+            if item.media_type not in OEB_RASTER_IMAGES or item.href not in image_hrefs:
+                continue
+            im = Image.open(io.BytesIO(item.data))
+            if not opts.full_image_depth:
+                im = im.convert('P')
+                im.thumbnail((300,300), Image.ANTIALIAS)
+            data = io.BytesIO()
+            im.save(data, 'PNG')
+            with lopen(os.path.join(out_dir, image_hrefs[item.href]), 'wb') as out:
+                out.write(data.getvalue())
--- a/ebook_converter/ebooks/conversion/plugins/rb_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/rb_input.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import InputFormatPlugin
+from polyglot.builtins import getcwd
+
+
+class RBInput(InputFormatPlugin):
+    '''Input plugin that delegates RB files to calibre.ebooks.rb.reader.Reader.'''
+    name        = 'RB Input'
+    author      = 'John Schember'
+    description = 'Convert RB files to HTML'
+    file_types  = {'rb'}
+    commit_name = 'rb_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        '''
+        Extract the RB book in `stream` into the current working directory and
+        return the result of the reader's extract_content().
+        '''
+        from calibre.ebooks.rb.reader import Reader
+        rb_reader = Reader(stream, log, options.input_encoding)
+        return rb_reader.extract_content(getcwd())
--- a/ebook_converter/ebooks/conversion/plugins/rb_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/rb_output.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+
+
+class RBOutput(OutputFormatPlugin):
+    '''Output plugin that serialises an OEB book with RBWriter.'''
+
+    name = 'RB Output'
+    author = 'John Schember'
+    file_type = 'rb'
+    commit_name = 'rb_output'
+
+    options = {
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.'))}
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''
+        Write oeb_book to output_path in RB format.
+
+        :param output_path: Either a file system path or an object with a
+            ``write`` attribute. A path is opened here (missing parent
+            directories are created first) and closed again before
+            returning; a stream passed in by the caller is left open.
+        '''
+        from calibre.ebooks.rb.writer import RBWriter
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            out_dir = os.path.dirname(output_path)
+            if out_dir and not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        try:
+            writer = RBWriter(opts, log)
+
+            # Discard anything already present in the stream.
+            out_stream.seek(0)
+            out_stream.truncate()
+
+            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
+        finally:
+            # Only close a stream opened here, and do so even if writing fails.
+            if close:
+                out_stream.close()
--- a/ebook_converter/ebooks/conversion/plugins/recipe_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/recipe_input.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.constants import numeric_version
+from calibre import walk
+from polyglot.builtins import unicode_type
+
+
+class RecipeDisabled(Exception):
+    '''Raised by RecipeInput.convert when the recipe has a recipe_disabled value.'''
+
+
+class RecipeInput(InputFormatPlugin):
+
+    name        = 'Recipe Input'
+    author      = 'Kovid Goyal'
+    description = _('Download periodical content from the internet')
+    file_types  = {'recipe', 'downloaded_recipe'}
+    commit_name = 'recipe_input'
+
+    recommendations = {
+        ('chapter', None, OptionRecommendation.HIGH),
+        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
+        ('use_auto_toc', False, OptionRecommendation.HIGH),
+        ('input_encoding', None, OptionRecommendation.HIGH),
+        ('input_profile', 'default', OptionRecommendation.HIGH),
+        ('page_breaks_before', None, OptionRecommendation.HIGH),
+        ('insert_metadata', False, OptionRecommendation.HIGH),
+        }
+
+    options = {
+        OptionRecommendation(name='test', recommended_value=False,
+            help=_(
+            'Useful for recipe development. Forces'
+            ' max_articles_per_feed to 2 and downloads at most 2 feeds.'
+            ' You can change the number of feeds and articles by supplying optional arguments.'
+            ' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
+        OptionRecommendation(name='username', recommended_value=None,
+            help=_('Username for sites that require a login to access '
+                'content.')),
+        OptionRecommendation(name='password', recommended_value=None,
+            help=_('Password for sites that require a login to access '
+                'content.')),
+        OptionRecommendation(name='dont_download_recipe',
+            recommended_value=False,
+            help=_('Do not download latest version of builtin recipes from the calibre server')),
+        OptionRecommendation(name='lrf', recommended_value=False,
+            help='Optimize fetching for subsequent conversion to LRF.'),
+        }
+
+    def convert(self, recipe_or_file, opts, file_ext, log,
+            accelerators):
+        '''
+        Obtain a recipe, run it and return the path of the resulting OPF.
+
+        For ``downloaded_recipe`` input the archive is unpacked into the
+        current directory and the recipe it contains is instantiated, but its
+        download() is not called. Otherwise the recipe source is taken from,
+        in order: the CALIBRE_RECIPE_URN environment variable, a readable
+        file at recipe_or_file, or a builtin recipe looked up by title; the
+        recipe is then instantiated and downloaded.
+
+        The recipe's conversion_options are copied onto opts.
+
+        :return: Absolute path of the first ``.opf`` file found in, or
+            below, the current directory; None when there is none.
+        :raises ValueError: if no recipe can be found for the input.
+        :raises RecipeDisabled: if the recipe has a ``recipe_disabled`` value.
+        '''
+        from calibre.web.feeds.recipes import compile_recipe
+        opts.output_profile.flow_size = 0
+        if file_ext == 'downloaded_recipe':
+            from calibre.utils.zipfile import ZipFile
+            zf = ZipFile(recipe_or_file, 'r')
+            try:
+                zf.extractall()
+            finally:
+                # Release the archive even when extraction fails.
+                zf.close()
+            with lopen('download.recipe', 'rb') as f:
+                self.recipe_source = f.read()
+            recipe = compile_recipe(self.recipe_source)
+            recipe.needs_subscription = False
+            self.recipe_object = recipe(opts, log, self.report_progress)
+        else:
+            if os.environ.get('CALIBRE_RECIPE_URN'):
+                from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
+                urn = os.environ['CALIBRE_RECIPE_URN']
+                log('Downloading recipe urn: ' + urn)
+                # "<type>:<id>" -> (type, id)
+                rtype, recipe_id = urn.partition(':')[::2]
+                if not recipe_id:
+                    raise ValueError('Invalid recipe urn: ' + urn)
+                if rtype == 'custom':
+                    self.recipe_source = get_custom_recipe(recipe_id)
+                else:
+                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
+                if not self.recipe_source:
+                    raise ValueError('Could not find recipe with urn: ' + urn)
+                if not isinstance(self.recipe_source, bytes):
+                    self.recipe_source = self.recipe_source.encode('utf-8')
+                recipe = compile_recipe(self.recipe_source)
+            elif os.access(recipe_or_file, os.R_OK):
+                with lopen(recipe_or_file, 'rb') as f:
+                    self.recipe_source = f.read()
+                recipe = compile_recipe(self.recipe_source)
+                log('Using custom recipe')
+            else:
+                from calibre.web.feeds.recipes.collection import (
+                        get_builtin_recipe_by_title, get_builtin_recipe_titles)
+                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
+                title = os.path.basename(title).rpartition('.')[0]
+                titles = frozenset(get_builtin_recipe_titles())
+                if title not in titles:
+                    # Retry with the full argument, without taking the basename.
+                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
+                    title = title.rpartition('.')[0]
+
+                raw = get_builtin_recipe_by_title(title, log=log,
+                        download_recipe=not opts.dont_download_recipe)
+                builtin = False
+                try:
+                    recipe = compile_recipe(raw)
+                    self.recipe_source = raw
+                    if recipe.requires_version > numeric_version:
+                        # The components are stringified first: joining a
+                        # tuple of numbers directly raises TypeError, which
+                        # would be reported below as a compile failure.
+                        log.warn(
+                        'Downloaded recipe needs calibre version at least: %s' %
+                        ('.'.join(unicode_type(x) for x in recipe.requires_version)))
+                        builtin = True
+                except Exception:
+                    log.exception('Failed to compile downloaded recipe. Falling '
+                            'back to builtin one')
+                    builtin = True
+                if builtin:
+                    log('Using bundled builtin recipe')
+                    raw = get_builtin_recipe_by_title(title, log=log,
+                            download_recipe=False)
+                    if raw is None:
+                        raise ValueError('Failed to find builtin recipe: '+title)
+                    recipe = compile_recipe(raw)
+                    self.recipe_source = raw
+                else:
+                    log('Using downloaded builtin recipe')
+
+            if recipe is None:
+                raise ValueError('%r is not a valid recipe file or builtin recipe' %
+                        recipe_or_file)
+
+            disabled = getattr(recipe, 'recipe_disabled', None)
+            if disabled is not None:
+                raise RecipeDisabled(disabled)
+            ro = recipe(opts, log, self.report_progress)
+            ro.download()
+            self.recipe_object = ro
+
+        for key, val in self.recipe_object.conversion_options.items():
+            setattr(opts, key, val)
+
+        # Prefer an OPF directly in the current directory, then search below it.
+        for f in os.listdir('.'):
+            if f.endswith('.opf'):
+                return os.path.abspath(f)
+
+        for f in walk('.'):
+            if f.endswith('.opf'):
+                return os.path.abspath(f)
+
+    def postprocess_book(self, oeb, opts, log):
+        '''Run the recipe object's internal and public post-processing hooks, if one is set.'''
+        recipe = self.recipe_object
+        if recipe is None:
+            return
+        recipe.internal_postprocess_book(oeb, opts, log)
+        recipe.postprocess_book(oeb, opts, log)
+
+    def specialize(self, oeb, opts, log, output_fmt):
+        '''
+        When opts.no_inline_navbars is set, remove every div whose class
+        contains "calibre_navbar" from the spine items.
+        '''
+        if not opts.no_inline_navbars:
+            return
+        from calibre.ebooks.oeb.base import XPath
+        for item in oeb.spine:
+            navbars = XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data)
+            for div in navbars:
+                div.getparent().remove(div)
+
+    def save_download(self, zf):
+        '''Store the recipe source in zf as download.recipe, UTF-8 encoding it first when it is text.'''
+        source = self.recipe_source
+        payload = source.encode('utf-8') if isinstance(source, unicode_type) else source
+        zf.writestr('download.recipe', payload)
--- a/ebook_converter/ebooks/conversion/plugins/rtf_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/rtf_input.py
@@ -0,0 +1,323 @@
+from __future__ import with_statement, unicode_literals
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, glob, re, textwrap
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import iteritems, filter, getcwd, as_bytes
+
+# Maps the border style names read from the border-cell-*-style attributes
+# (see RTFInput.convert_borders) to CSS border-style keywords. Names missing
+# from this table are rendered as 'solid' by that method.
+border_style_map = {
+        'single' : 'solid',
+        'double-thickness-border' : 'double',
+        'shadowed-border': 'outset',
+        'double-border': 'double',
+        'dotted-border': 'dotted',
+        'dashed': 'dashed',
+        'hairline': 'solid',
+        'inset': 'inset',
+        'dash-small': 'dashed',
+        'dot-dash': 'dotted',
+        'dot-dot-dash': 'dotted',
+        'outset': 'outset',
+        'tripple': 'double',
+        'triple': 'double',
+        'thick-thin-small': 'solid',
+        'thin-thick-small': 'solid',
+        'thin-thick-thin-small': 'solid',
+        'thick-thin-medium': 'solid',
+        'thin-thick-medium': 'solid',
+        'thin-thick-thin-medium': 'solid',
+        'thick-thin-large': 'solid',
+        'thin-thick-thin-large': 'solid',
+        'wavy': 'ridge',
+        'double-wavy': 'ridge',
+        'striped': 'ridge',
+        'emboss': 'inset',
+        'engrave': 'inset',
+        'frame': 'ridge',
+}
+
+
+class RTFInput(InputFormatPlugin):
+
+    name        = 'RTF Input'
+    author      = 'Kovid Goyal'
+    description = 'Convert RTF files to HTML'
+    file_types  = {'rtf'}
+    commit_name = 'rtf_input'
+
+    options = {
+        OptionRecommendation(name='ignore_wmf', recommended_value=False,
+            help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
+    }
+
+    def generate_xml(self, stream):
+        '''
+        Run the rtf2xml parser on an RTF file and return the XML it produces.
+
+        When the ``debug_pipeline`` option is set, the parser is run with a
+        higher run level, indented output and an ``rtfdebug`` debug directory,
+        provided that directory can be created.
+
+        :param stream: Passed to ParseRtf as ``in_file``; convert() supplies
+            the name of the input file (``stream.name``).
+        :return: Raw bytes of the generated ``dataxml.xml``.
+        '''
+        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
+        ofile = u'dataxml.xml'
+        run_lev, debug_dir, indent_out = 1, None, 0
+        if getattr(self.opts, 'debug_pipeline', None) is not None:
+            try:
+                os.mkdir(u'rtfdebug')
+                debug_dir = u'rtfdebug'
+                run_lev = 4
+                indent_out = 1
+                self.log('Running RTFParser in debug mode')
+            except Exception:
+                # Best effort: fall back to a normal run, but let
+                # KeyboardInterrupt/SystemExit propagate.
+                self.log.warn('Impossible to run RTFParser in debug mode')
+        parser = ParseRtf(
+            in_file=stream,
+            out_file=ofile,
+            # Convert symbol fonts to unicode equivalents. Default
+            # is 1
+            convert_symbol=1,
+
+            # Convert Zapf fonts to unicode equivalents. Default
+            # is 1.
+            convert_zapf=1,
+
+            # Convert Wingding fonts to unicode equivalents.
+            # Default is 1.
+            convert_wingdings=1,
+
+            # Convert RTF caps to real caps.
+            # Default is 1.
+            convert_caps=1,
+
+            # Indent resulting XML.
+            # Default is 0 (no indent).
+            indent=indent_out,
+
+            # Form lists from RTF. Default is 1.
+            form_lists=1,
+
+            # Convert headings to sections. Default is 0.
+            headings_to_sections=1,
+
+            # Group paragraphs with the same style name. Default is 1.
+            group_styles=1,
+
+            # Group borders. Default is 1.
+            group_borders=1,
+
+            # Write or do not write paragraphs. Default is 0.
+            empty_paragraphs=1,
+
+            # Debug
+            deb_dir=debug_dir,
+
+            # Default encoding
+            default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
+
+            # Run level
+            run_level=run_lev,
+        )
+        parser.parse_rtf()
+        with open(ofile, 'rb') as f:
+            return f.read()
+
+    def extract_images(self, picts):
+        '''
+        Write every pict group found in the file picts to its own image file.
+
+        The hex payload of each group is decoded and saved in the current
+        directory as NNNN.<fmt>, where NNNN is the 1-based position of the
+        group and <fmt> is the type reported by what(), or 'wmf' when it
+        reports None.
+
+        :param picts: Path of the RTF file holding the pictures.
+        :return: Result of convert_images() on the number -> file name map.
+        '''
+        from calibre.utils.imghdr import what
+        from binascii import unhexlify
+        self.log('Extracting images...')
+
+        with open(picts, 'rb') as f:
+            raw = f.read()
+        non_hex = re.compile(br'[^a-fA-F0-9]')
+        groups = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
+
+        imap = {}
+        for count, group in enumerate(groups, 1):
+            enc = non_hex.sub(b'', group)
+            # unhexlify needs an even number of digits; drop a dangling one.
+            if len(enc) % 2 == 1:
+                enc = enc[:-1]
+            data = unhexlify(enc)
+            detected = what(None, data)
+            fmt = 'wmf' if detected is None else detected
+            name = u'%04d.%s' % (count, fmt)
+            with open(name, 'wb') as f:
+                f.write(data)
+            imap[count] = name
+        return self.convert_images(imap)
+
+    def convert_images(self, imap):
+        '''
+        Run convert_image() on every file name in imap, updating it in place.
+
+        An entry whose conversion raises is logged and keeps its original
+        name. self.default_img is reset to None first.
+
+        :param imap: dict mapping image number to file name.
+        :return: The same dict object.
+        '''
+        self.default_img = None
+        # Iterate over a snapshot, since values are reassigned inside the loop.
+        for count, val in list(imap.items()):
+            try:
+                imap[count] = self.convert_image(val)
+            except Exception:
+                self.log.exception('Failed to convert', val)
+        return imap
+
+    def convert_image(self, name):
+        '''
+        Return the file name to use for the image name.
+
+        Names not ending in .wmf are returned unchanged. WMF files are
+        rasterized; when that raises, the error is logged and the result of
+        replace_wmf() is returned instead.
+        '''
+        if name.endswith('.wmf'):
+            try:
+                return self.rasterize_wmf(name)
+            except Exception:
+                self.log.exception('Failed to convert WMF image %r'%name)
+            return self.replace_wmf(name)
+        return name
+
+    def replace_wmf(self, name):
+        '''
+        Handle a WMF file that could not be rasterized.
+
+        With the ignore_wmf option the file is deleted and the marker
+        '__REMOVE_ME__' is returned. Otherwise a placeholder image (created
+        once and cached in self.default_img) is written under the same name
+        with .wmf replaced by .jpg, and that name is returned.
+        '''
+        if self.opts.ignore_wmf:
+            os.remove(name)
+            return '__REMOVE_ME__'
+
+        from calibre.ebooks.covers import message_image
+        if self.default_img is None:
+            self.default_img = message_image('Conversion of WMF images is not supported.'
+            ' Use Microsoft Word or OpenOffice to save this RTF file'
+            ' as HTML and convert that in calibre.')
+        placeholder = name.replace('.wmf', '.jpg')
+        with lopen(placeholder, 'wb') as dest:
+            dest.write(self.default_img)
+        return placeholder
+
+    def rasterize_wmf(self, name):
+        '''
+        Pass the content of the WMF file name through wmf_unwrap() and save
+        the result under the same name with .wmf replaced by .png.
+
+        :return: The name of the file that was written.
+        '''
+        from calibre.utils.wmf.parse import wmf_unwrap
+        with open(name, 'rb') as src:
+            wrapped = src.read()
+        png_name = name.replace('.wmf', '.png')
+        unwrapped = wmf_unwrap(wrapped)
+        with open(png_name, 'wb') as dest:
+            dest.write(unwrapped)
+        return png_name
+
+    def write_inline_css(self, ic, border_styles):
+        '''
+        Append the rules for inline spans and table borders to styles.css in
+        the current directory.
+
+        :param ic: Object whose ``font_sizes`` and ``colors`` sequences are
+            turned into ``span.fsN`` and ``span.colN`` rules; colors equal to
+            'false' are skipped.
+        :param border_styles: Mapping of class name to CSS declarations.
+        '''
+        font_rules = ['span.fs%d { font-size: %spt }'%(idx, size)
+                for idx, size in enumerate(ic.font_sizes)]
+        color_rules = ['span.col%d { color: %s }'%(idx, color)
+                for idx, color in enumerate(ic.colors) if color != 'false']
+        border_rules = ['\n\n.%s {\n%s\n}'%(cls, val)
+                for cls, val in iteritems(border_styles)]
+        base = textwrap.dedent('''
+        span.none {
+            text-decoration: none; font-weight: normal;
+            font-style: normal; font-variant: normal
+        }
+
+        span.italics { font-style: italic }
+
+        span.bold { font-weight: bold }
+
+        span.small-caps { font-variant: small-caps }
+
+        span.underlined { text-decoration: underline }
+
+        span.strike-through { text-decoration: line-through }
+
+        ''')
+        pieces = [base, '\n', '\n'.join(font_rules), '\n', '\n'.join(color_rules)]
+        css = ''.join(pieces + border_rules)
+
+        # Append mode: content already in styles.css is kept.
+        with open(u'styles.css', 'ab') as f:
+            f.write(css.encode('utf-8'))
+
+    def convert_borders(self, doc):
+        '''
+        Give every ``cell`` element in doc a CSS class describing its borders.
+
+        Each cell's border-cell-<side>-style/-line-width/-color attributes are
+        turned into CSS declarations; cells with identical declarations share
+        one ``border_styleN`` class, which is set as the cell's ``class``.
+
+        :return: dict mapping class name to its CSS declarations.
+        '''
+        known = []
+        style_map = {}
+        for cell in doc.xpath(r'//*[local-name()="cell"]'):
+            decls = ['border-style: hidden', 'border-width: 1px',
+                    'border-color: black']
+            for side in ('bottom', 'top', 'left', 'right'):
+                line_style = cell.get('border-cell-%s-style'%side, None)
+                if line_style:
+                    css_style = border_style_map.get(line_style, 'solid')
+                    decls.append('border-%s-style: %s'%(side, css_style))
+                width = cell.get('border-cell-%s-line-width'%side, None)
+                if width:
+                    decls.append('border-%s-width: %spt'%(side, width))
+                color = cell.get('border-cell-%s-color'%side, None)
+                if color:
+                    decls.append('border-%s-color: %s'%(side, color))
+            css = ';\n'.join(decls)
+            # The position of first occurrence numbers the class.
+            if css not in known:
+                known.append(css)
+            cls = 'border_style%d'%known.index(css)
+            style_map[cls] = css
+            cell.set('class', cls)
+        return style_map
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Convert the RTF file behind stream to index.xhtml, styles.css and
+        metadata.opf in the current directory.
+
+        :param stream: Open input file; its ``name`` is handed to the RTF
+            parser and the stream itself is re-read for the metadata.
+        :param options: Conversion options, stored as self.opts.
+        :param log: Logger, stored as self.log.
+        :return: Absolute path of the generated metadata.opf.
+        :raises ValueError: if the parser raises RtfInvalidCodeException.
+        '''
+        from lxml import etree
+        from calibre.ebooks.metadata.meta import get_metadata
+        from calibre.ebooks.metadata.opf2 import OPFCreator
+        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
+        from calibre.ebooks.rtf.input import InlineClass
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        self.opts = options
+        self.log = log
+        self.log('Converting RTF to XML...')
+        try:
+            xml = self.generate_xml(stream.name)
+        except RtfInvalidCodeException as e:
+            self.log.exception('Unable to parse RTF')
+            raise ValueError(_('This RTF file has a feature calibre does not '
+            'support. Convert it to HTML first and then try it.\n%s')%e)
+
+        # Bound unconditionally: the pict loop below looks images up even
+        # when no picture file was found (or its extraction failed).
+        imap = {}
+        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
+        if d:
+            try:
+                imap = self.extract_images(d[0])
+            except Exception:
+                self.log.exception('Failed to extract images...')
+
+        self.log('Parsing XML...')
+        doc = safe_xml_fromstring(xml)
+        border_styles = self.convert_borders(doc)
+        # Replace each picture number with the name of the extracted file.
+        for pict in doc.xpath('//rtf:pict[@num]',
+                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
+            num = int(pict.get('num'))
+            name = imap.get(num, None)
+            if name is not None:
+                pict.set('num', name)
+
+        self.log('Converting XML to HTML...')
+        inline_class = InlineClass(self.log)
+        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
+        extensions = {('calibre', 'inline-class') : inline_class}
+        transform = etree.XSLT(styledoc, extensions=extensions)
+        result = transform(doc)
+        html = u'index.xhtml'
+        with open(html, 'wb') as f:
+            res = as_bytes(transform.tostring(result))
+            # Collapse runs of newlines into a single one.
+            res = re.sub(b'\n+', b'\n', res)
+            f.write(res)
+        self.write_inline_css(inline_class, border_styles)
+        stream.seek(0)
+        mi = get_metadata(stream, 'rtf')
+        if not mi.title:
+            mi.title = _('Unknown')
+        if not mi.authors:
+            mi.authors = [_('Unknown')]
+        opf = OPFCreator(getcwd(), mi)
+        opf.create_manifest([(u'index.xhtml', None)])
+        opf.create_spine([u'index.xhtml'])
+        # Close the OPF file deterministically instead of leaking the handle.
+        with open(u'metadata.opf', 'wb') as opf_file:
+            opf.render(opf_file)
+        return os.path.abspath(u'metadata.opf')
+
+    def postprocess_book(self, oeb, opts, log):
+        '''
+        Remove every img whose src is the __REMOVE_ME__ marker from the spine
+        documents. The tail text of a removed img is appended to the previous
+        sibling's tail or, for a first child, to the parent's text.
+        '''
+        marked = '//*[local-name()="img" and @src="__REMOVE_ME__"]'
+        for item in oeb.spine:
+            for img in item.data.xpath(marked):
+                parent = img.getparent()
+                pos = parent.index(img)
+                parent.remove(img)
+                tail = img.tail
+                if not tail:
+                    continue
+                if pos == 0:
+                    parent.text = (parent.text or '') + tail
+                else:
+                    previous = parent[pos-1]
+                    previous.tail = (previous.tail or '') + tail
--- a/ebook_converter/ebooks/conversion/plugins/rtf_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/rtf_output.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin
+
+
+class RTFOutput(OutputFormatPlugin):
+    '''Output plugin that renders an OEB book to RTF with RTFMLizer.'''
+
+    name = 'RTF Output'
+    author = 'John Schember'
+    file_type = 'rtf'
+    commit_name = 'rtf_output'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''
+        Write oeb_book to output_path as RTF.
+
+        :param output_path: Either a file system path or an object with a
+            ``write`` attribute. A path is opened here (missing parent
+            directories are created first) and closed again before
+            returning; a stream passed in by the caller is left open.
+        '''
+        from calibre.ebooks.rtf.rtfml import RTFMLizer
+
+        rtfmlitzer = RTFMLizer(log)
+        content = rtfmlitzer.extract_content(oeb_book, opts)
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            out_dir = os.path.dirname(output_path)
+            if out_dir and not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        try:
+            # Discard anything already present in the stream.
+            out_stream.seek(0)
+            out_stream.truncate()
+            # Characters outside ASCII are replaced instead of raising.
+            out_stream.write(content.encode('ascii', 'replace'))
+        finally:
+            # Only close a stream opened here, and do so even if writing fails.
+            if close:
+                out_stream.close()
--- a/ebook_converter/ebooks/conversion/plugins/snb_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/snb_input.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.filenames import ascii_filename
+from polyglot.builtins import unicode_type
+
+# Skeleton of each generated chapter file; the two %s slots take the chapter
+# title and the body markup, in that order.
+HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
+
+
+def html_encode(s):
+    '''
+    Escape s for embedding in HTML: the characters & < > " ' become entities,
+    newlines become <br/> and spaces become &nbsp;.
+    '''
+    # '&' has to come first so the entities added afterwards are not escaped
+    # a second time.
+    replacements = (
+        ('&', '&amp;'),
+        ('<', '&lt;'),
+        ('>', '&gt;'),
+        ('"', '&quot;'),
+        ("'", '&apos;'),
+        ('\n', '<br/>'),
+        (' ', '&nbsp;'),
+    )
+    for char, entity in replacements:
+        s = s.replace(char, entity)
+    return s
+
+
+class SNBInput(InputFormatPlugin):
+
+    name        = 'SNB Input'
+    author      = 'Li Fanxi'
+    description = 'Convert SNB files to OEB'
+    file_types  = {'snb'}
+    commit_name = 'snb_input'
+
+    options = set()
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Build an OEB book from the SNB container in stream.
+
+        Metadata is read from snbf/book.snbf and the chapter list from
+        snbf/toc.snbf. Each chapter listed there is written as ch_N.htm into
+        a temporary directory (which is kept) and added to the manifest,
+        spine and TOC; the image files are added to the manifest.
+
+        :return: The populated OEB book object.
+        :raises ValueError: if the stream cannot be parsed as a valid SNB file.
+        '''
+        import uuid
+
+        from calibre.ebooks.oeb.base import DirContainer
+        from calibre.ebooks.snb.snbfile import SNBFile
+        from calibre.utils.xml_parse import safe_xml_fromstring
+
+        log.debug("Parsing SNB file...")
+        snbFile = SNBFile()
+        try:
+            snbFile.Parse(stream)
+        except Exception:
+            raise ValueError("Invalid SNB file")
+        if not snbFile.IsValid():
+            log.debug("Invalid SNB file")
+            raise ValueError("Invalid SNB file")
+        log.debug("Handle meta data ...")
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        oeb = create_oebbook(log, None, options,
+                encoding=options.input_encoding, populate=False)
+        meta = snbFile.GetFileStream('snbf/book.snbf')
+        if meta is not None:
+            meta = safe_xml_fromstring(meta)
+            xpaths = {'title'    : './/head/name',
+                      'creator'  : './/head/author',
+                      'language' : './/head/language',
+                      'generator': './/head/generator',
+                      'publisher': './/head/publisher',
+                      'cover'    : './/head/cover', }
+            # Missing nodes and nodes without text both yield ''.
+            d = {}
+            for key, path in xpaths.items():
+                node = meta.find(path)
+                if node is not None and node.text is not None:
+                    d[key] = node.text
+                else:
+                    d[key] = ''
+
+            oeb.metadata.add('title', d['title'])
+            oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
+            oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
+            oeb.metadata.add('generator', d['generator'])
+            oeb.metadata.add('publisher', d['publisher'])
+            if d['cover'] != '':
+                oeb.guide.add('cover', 'Cover', d['cover'])
+
+        bookid = unicode_type(uuid.uuid4())
+        oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
+        for ident in oeb.metadata.identifier:
+            if 'id' in ident.attrib:
+                oeb.uid = oeb.metadata.identifier[0]
+                break
+
+        with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
+            log.debug('Process TOC ...')
+            toc = snbFile.GetFileStream('snbf/toc.snbf')
+            oeb.container = DirContainer(tdir, log)
+            if toc is not None:
+                toc = safe_xml_fromstring(toc)
+                i = 1
+                for ch in toc.find('.//body'):
+                    chapterName = ch.text
+                    chapterSrc = ch.get('src')
+                    fname = 'ch_%d.htm' % i
+                    data = snbFile.GetFileStream('snbc/' + chapterSrc)
+                    if data is None:
+                        # Chapter file missing: skip it without using up a number.
+                        continue
+                    snbc = safe_xml_fromstring(data)
+                    lines = []
+                    for line in snbc.find('.//body'):
+                        # An element without content has text None, which
+                        # html_encode() cannot handle; treat it as empty.
+                        text = line.text or ''
+                        if line.tag == 'text':
+                            lines.append('<p>%s</p>' % html_encode(text))
+                        elif line.tag == 'img':
+                            lines.append('<p><img src="%s" /></p>' % html_encode(text))
+                    with open(os.path.join(tdir, fname), 'wb') as f:
+                        f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
+                    oeb.toc.add(ch.text, fname)
+                    item_id, href = oeb.manifest.generate(id='html',
+                        href=ascii_filename(fname))
+                    item = oeb.manifest.add(item_id, href, 'text/html')
+                    item.html_input_href = fname
+                    oeb.spine.add(item, True)
+                    i = i + 1
+                imageFiles = snbFile.OutputImageFiles(tdir)
+                for f, m in imageFiles:
+                    item_id, href = oeb.manifest.generate(id='image',
+                        href=ascii_filename(f))
+                    item = oeb.manifest.add(item_id, href, m)
+                    item.html_input_href = f
+
+        return oeb
--- a/ebook_converter/ebooks/conversion/plugins/snb_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/snb_output.py
@@ -0,0 +1,269 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
+from calibre.ptempfile import TemporaryDirectory
+from calibre.constants import __appname__, __version__
+from polyglot.builtins import unicode_type
+
+
+class SNBOutput(OutputFormatPlugin):
+    '''
+    Output plugin that writes a book as an SNB file (``file_type = 'snb'``).
+
+    Of the ``snb_*`` options declared below, only ``snb_full_screen`` is read
+    directly in this class (by ``HandleImage``); the whole ``opts`` object is
+    also handed to ``SNBMLizer`` during ``convert``.
+    '''
+
+    # Plugin identification.
+    name = 'SNB Output'
+    author = 'Li Fanxi'
+    file_type = 'snb'
+    commit_name = 'snb_output'
+
+    # Options specific to SNB output; every one is recommended at LOW level.
+    options = {
+        OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is utf-8.')),
+        OptionRecommendation(name='snb_max_line_length',
+            recommended_value=0, level=OptionRecommendation.LOW,
+            help=_('The maximum number of characters per line. This splits on '
+            'the first space before the specified value. If no space is found '
+            'the line will be broken at the space after and will exceed the '
+            'specified value. Also, there is a minimum of 25 characters. '
+            'Use 0 to disable line splitting.')),
+        OptionRecommendation(name='snb_insert_empty_line',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Specify whether or not to insert an empty line between '
+            'two paragraphs.')),
+        OptionRecommendation(name='snb_dont_indent_first_line',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Specify whether or not to insert two space characters '
+            'to indent the first line of each paragraph.')),
+        OptionRecommendation(name='snb_hide_chapter_name',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Specify whether or not to hide the chapter title for each '
+            'chapter. Useful for image-only output (eg. comics).')),
+        OptionRecommendation(name='snb_full_screen',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Resize all the images for full screen view. ')),
+     }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from lxml import etree
+        from calibre.ebooks.snb.snbfile import SNBFile
+        from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
+
+        self.opts = opts
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
+        try:
+            rasterizer = SVGRasterizer()
+            rasterizer(oeb_book, opts)
+        except Unavailable:
+            log.warn('SVG rasterizer unavailable, SVG will not be converted')
+
+        # Create temp dir
+        with TemporaryDirectory('_snb_output') as tdir:
+            # Create stub directories
+            snbfDir = os.path.join(tdir, 'snbf')
+            snbcDir = os.path.join(tdir, 'snbc')
+            snbiDir = os.path.join(tdir, 'snbc/images')
+            os.mkdir(snbfDir)
+            os.mkdir(snbcDir)
+            os.mkdir(snbiDir)
+
+            # Process Meta data
+            meta = oeb_book.metadata
+            if meta.title:
+                title = unicode_type(meta.title[0])
+            else:
+                title = ''
+            authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
+            if meta.publisher:
+                publishers = unicode_type(meta.publisher[0])
+            else:
+                publishers = ''
+            if meta.language:
+                lang = unicode_type(meta.language[0]).upper()
+            else:
+                lang = ''
+            if meta.description:
+                abstract = unicode_type(meta.description[0])
+            else:
+                abstract = ''
+
+            # Process Cover
+            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
+            href = None
+            if 'titlepage' not in g:
+                if 'cover' in g:
+                    href = g['cover'].href
+
+            # Output book info file
+            bookInfoTree = etree.Element("book-snbf", version="1.0")
+            headTree = etree.SubElement(bookInfoTree, "head")
+            etree.SubElement(headTree, "name").text = title
+            etree.SubElement(headTree, "author").text = ' '.join(authors)
+            etree.SubElement(headTree, "language").text = lang
+            etree.SubElement(headTree, "rights")
+            etree.SubElement(headTree, "publisher").text = publishers
+            etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
+            etree.SubElement(headTree, "created")
+            etree.SubElement(headTree, "abstract").text = abstract
+            if href is not None:
+                etree.SubElement(headTree, "cover").text = ProcessFileName(href)
+            else:
+                etree.SubElement(headTree, "cover")
+            with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
+                f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
+
+            # Output TOC
+            tocInfoTree = etree.Element("toc-snbf")
+            tocHead = etree.SubElement(tocInfoTree, "head")
+            tocBody = etree.SubElement(tocInfoTree, "body")
+            outputFiles = {}
+            if oeb_book.toc.count() == 0:
+                log.warn('This SNB file has no Table of Contents. '
+                    'Creating a default TOC')
+                first = next(iter(oeb_book.spine))
+                oeb_book.toc.add(_('Start page'), first.href)
+            else:
+                first = next(iter(oeb_book.spine))
+                if oeb_book.toc[0].href != first.href:
+                    # The pages before the fist item in toc will be stored as
+                    # "Cover Pages".
+                    # oeb_book.toc does not support "insert", so we generate
+                    # the tocInfoTree directly instead of modifying the toc
+                    ch = etree.SubElement(tocBody, "chapter")
+                    ch.set("src", ProcessFileName(first.href) + ".snbc")
+                    ch.text = _('Cover pages')
+                    outputFiles[first.href] = []
+                    outputFiles[first.href].append(("", _("Cover pages")))
+
+            for tocitem in oeb_book.toc:
+                if tocitem.href.find('#') != -1:
+                    item = tocitem.href.split('#')
+                    if len(item) != 2:
+                        log.error('Error in TOC item: %s' % tocitem)
+                    else:
+                        if item[0] in outputFiles:
+                            outputFiles[item[0]].append((item[1], tocitem.title))
+                        else:
+                            outputFiles[item[0]] = []
+                            if "" not in outputFiles[item[0]]:
+                                outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
+                                ch = etree.SubElement(tocBody, "chapter")
+                                ch.set("src", ProcessFileName(item[0]) + ".snbc")
+                                ch.text = tocitem.title + _(" (Preface)")
+                            outputFiles[item[0]].append((item[1], tocitem.title))
+                else:
+                    if tocitem.href in outputFiles:
+                        outputFiles[tocitem.href].append(("", tocitem.title))
+                    else:
+                        outputFiles[tocitem.href] = []
+                        outputFiles[tocitem.href].append(("", tocitem.title))
+                ch = etree.SubElement(tocBody, "chapter")
+                ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
+                ch.text = tocitem.title
+
+            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
+
+            with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
+                f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
+
+            # Output Files
+            oldTree = None
+            mergeLast = False
+            lastName = None
+            for item in s:
+                from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
+                if m.hrefs[item.href].media_type in OEB_DOCS:
+                    if item.href not in outputFiles:
+                        log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
+                        mergeLast = True
+                    else:
+                        if oldTree is not None and mergeLast:
+                            log.debug('Output the modified chapter again: %s' % lastName)
+                            with open(os.path.join(snbcDir, lastName), 'wb') as f:
+                                f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
+                            mergeLast = False
+
+                    log.debug('Converting %s to snbc...' % item.href)
+                    snbwriter = SNBMLizer(log)
+                    snbcTrees = None
+                    if not mergeLast:
+                        snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
+                        for subName in snbcTrees:
+                            postfix = ''
+                            if subName != '':
+                                postfix = '_' + subName
+                            lastName = ProcessFileName(item.href + postfix + ".snbc")
+                            oldTree = snbcTrees[subName]
+                            with open(os.path.join(snbcDir, lastName), 'wb') as f:
+                                f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
+                    else:
+                        log.debug('Merge %s with last TOC item...' % item.href)
+                        snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)
+
+            # Output the last one if needed
+            log.debug('Output the last modified chapter again: %s' % lastName)
+            if oldTree is not None and mergeLast:
+                with open(os.path.join(snbcDir, lastName), 'wb') as f:
+                    f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
+                mergeLast = False
+
+            for item in m:
+                if m.hrefs[item.href].media_type in OEB_IMAGES:
+                    log.debug('Converting image: %s ...' % item.href)
+                    content = m.hrefs[item.href].data
+                    # Convert & Resize image
+                    self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))
+
+            # Package as SNB File
+            snbFile = SNBFile()
+            snbFile.FromDir(tdir)
+            snbFile.Output(output_path)
+
+    def HandleImage(self, imageData, imagePath):
+        from calibre.utils.img import image_from_data, resize_image, image_to_data
+        img = image_from_data(imageData)
+        x, y = img.width(), img.height()
+        if self.opts:
+            if self.opts.snb_full_screen:
+                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
+            else:
+                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
+        else:
+            SCREEN_X = 540
+            SCREEN_Y = 700
+        # Handle big image only
+        if x > SCREEN_X or y > SCREEN_Y:
+            xScale = float(x) / SCREEN_X
+            yScale = float(y) / SCREEN_Y
+            scale = max(xScale, yScale)
+            # TODO : intelligent image rotation
+            #     img = img.rotate(90)
+            #     x,y = y,x
+            img = resize_image(img, x // scale, y // scale)
+        with lopen(imagePath, 'wb') as f:
+            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
+
+
+if __name__ == '__main__':
+    from calibre.ebooks.oeb.reader import OEBReader
+    from calibre.ebooks.oeb.base import OEBBook
+    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
+    from calibre.customize.profiles import HanlinV3Output
+
+    class OptionValues(object):
+        pass
+
+    opts = OptionValues()
+    opts.output_profile = HanlinV3Output(None)
+
+    html_preprocessor = HTMLPreProcessor(None, None, opts)
+    from calibre.utils.logging import default_log
+    oeb = OEBBook(default_log, html_preprocessor)
+    reader = OEBReader
+    reader()(oeb, '/tmp/bbb/processed/')
+    SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)
--- a/ebook_converter/ebooks/conversion/plugins/tcr_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/tcr_input.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from io import BytesIO
+
+from calibre.customize.conversion import InputFormatPlugin
+
+
+class TCRInput(InputFormatPlugin):
+
+    name        = 'TCR Input'
+    author      = 'John Schember'
+    description = 'Convert TCR files to HTML'
+    file_types  = {'tcr'}
+    commit_name = 'tcr_input'
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.compression.tcr import decompress
+
+        log.info('Decompressing text...')
+        raw_txt = decompress(stream)
+
+        log.info('Converting text to OEB...')
+        stream = BytesIO(raw_txt)
+
+        from calibre.customize.ui import plugin_for_input_format
+
+        txt_plugin = plugin_for_input_format('txt')
+        for opt in txt_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(options, opt.option.name, opt.recommended_value)
+
+        stream.seek(0)
+        return txt_plugin.convert(stream, options,
+                'txt', log, accelerators)
--- a/ebook_converter/ebooks/conversion/plugins/tcr_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/tcr_output.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+
+
+class TCROutput(OutputFormatPlugin):
+
+    name = 'TCR Output'
+    author = 'John Schember'
+    file_type = 'tcr'
+    commit_name = 'tcr_output'
+
+    options = {
+        OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is utf-8.'))}
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        from calibre.ebooks.txt.txtml import TXTMLizer
+        from calibre.ebooks.compression.tcr import compress
+
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = lopen(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        setattr(opts, 'flush_paras', False)
+        setattr(opts, 'max_line_length', 0)
+        setattr(opts, 'force_max_line_length', False)
+        setattr(opts, 'indent_paras', False)
+
+        writer = TXTMLizer(log)
+        txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
+
+        log.info('Compressing text...')
+        txt = compress(txt)
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(txt)
+
+        if close:
+            out_stream.close()
--- a/ebook_converter/ebooks/conversion/plugins/txt_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/txt_input.py
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre import _ent_pat, walk, xml_entity_to_unicode
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from polyglot.builtins import getcwd
+
+# Markdown extension names mapped to a translated one-line description.
+# Used for TXTInput.ui_data and for the help text of the
+# ``markdown_extensions`` option.
+MD_EXTENSIONS = {
+    'abbr': _('Abbreviations'),
+    'admonition': _('Support admonitions'),
+    'attr_list': _('Add attribute to HTML tags'),
+    'codehilite': _('Add code highlighting via Pygments'),
+    'def_list': _('Definition lists'),
+    'extra': _('Enables various common extensions'),
+    'fenced_code': _('Alternative code block syntax'),
+    'footnotes': _('Footnotes'),
+    'legacy_attrs': _('Use legacy element attributes'),
+    'legacy_em': _('Use legacy underscore handling for connected words'),
+    'meta': _('Metadata in the document'),
+    'nl2br': _('Treat newlines as hard breaks'),
+    'sane_lists': _('Do not allow mixing list types'),
+    'smarty': _('Use markdown\'s internal smartypants parser'),
+    'tables': _('Support tables'),
+    'toc': _('Generate a table of contents'),
+    'wikilinks': _('Wiki style links'),
+}
+
+
+class TXTInput(InputFormatPlugin):
+    '''
+    Input plugin for plain text, Markdown and Textile files, and for TXTZ
+    archives. The text is turned into HTML and then passed to the HTML input
+    plugin (see ``convert``).
+    '''
+
+    # Plugin identification.
+    name        = 'TXT Input'
+    author      = 'John Schember'
+    description = 'Convert TXT files to HTML'
+    file_types  = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
+    commit_name = 'txt_input'
+    # Choice tables with translated descriptions; the keys of
+    # 'formatting_types' and 'paragraph_types' are also the allowed values
+    # of the corresponding options below.
+    ui_data = {
+        'md_extensions': MD_EXTENSIONS,
+        'paragraph_types': {
+            'auto': _('Try to auto detect paragraph type'),
+            'block': _('Treat a blank line as a paragraph break'),
+            'single': _('Assume every line is a paragraph'),
+            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
+            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
+            'off': _('Don\'t modify the paragraph structure'),
+        },
+        'formatting_types': {
+            'auto': _('Automatically decide which formatting processor to use'),
+            'plain': _('No formatting'),
+            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
+            'textile': _('Use the TexTile markup language'),
+            'markdown': _('Use the Markdown markup language')
+        },
+    }
+
+    # Options specific to text input. The help strings embed the
+    # descriptions from ui_data / MD_EXTENSIONS via str.format.
+    options = {
+        OptionRecommendation(name='formatting_type', recommended_value='auto',
+            choices=list(ui_data['formatting_types']),
+            help=_('Formatting used within the document.\n'
+                   '* auto: {auto}\n'
+                   '* plain: {plain}\n'
+                   '* heuristic: {heuristic}\n'
+                   '* textile: {textile}\n'
+                   '* markdown: {markdown}\n'
+                   'To learn more about markdown see {url}').format(
+                       url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
+        ),
+        OptionRecommendation(name='paragraph_type', recommended_value='auto',
+            choices=list(ui_data['paragraph_types']),
+            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
+                   'Choices are:\n'
+                   '* auto: {auto}\n'
+                   '* block: {block}\n'
+                   '* single: {single}\n'
+                   '* print:  {print}\n'
+                   '* unformatted: {unformatted}\n'
+                   '* off: {off}').format(**ui_data['paragraph_types'])
+        ),
+        OptionRecommendation(name='preserve_spaces', recommended_value=False,
+            help=_('Normally extra spaces are condensed into a single space. '
+                'With this option all spaces will be displayed.')),
+        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
+            help=_('Normally extra space at the beginning of lines is retained. '
+                   'With this option they will be removed.')),
+        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
+            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
+                   'of the standard markdown format. The extensions enabled by default: %default.\n'
+                   'To learn more about markdown extensions, see {}\n'
+                   'This should be a comma separated list of extensions to enable:\n'
+                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
+    }
+
+    def shift_file(self, fname, data):
+        name, ext = os.path.splitext(fname)
+        candidate = os.path.join(self.output_dir, fname)
+        c = 0
+        while os.path.exists(candidate):
+            c += 1
+            candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
+        ans = candidate
+        with open(ans, 'wb') as f:
+            f.write(data)
+        return f.name
+
+    def fix_resources(self, html, base_dir):
+        from html5_parser import parse
+        root = parse(html)
+        changed = False
+        for img in root.xpath('//img[@src]'):
+            src = img.get('src')
+            prefix = src.split(':', 1)[0].lower()
+            if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
+                src = os.path.join(base_dir, src)
+                if os.access(src, os.R_OK):
+                    with open(src, 'rb') as f:
+                        data = f.read()
+                    f = self.shift_file(os.path.basename(src), data)
+                    changed = True
+                    img.set('src', os.path.basename(f))
+        if changed:
+            from lxml import etree
+            html = etree.tostring(root, encoding='unicode')
+        return html
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
+        from calibre.ebooks.chardet import detect
+        from calibre.utils.zipfile import ZipFile
+        from calibre.ebooks.txt.processor import (convert_basic,
+                convert_markdown_with_metadata, separate_paragraphs_single_line,
+                separate_paragraphs_print_formatted, preserve_spaces,
+                detect_paragraph_type, detect_formatting_type,
+                normalize_line_endings, convert_textile, remove_indents,
+                block_to_single_line, separate_hard_scene_breaks)
+
+        self.log = log
+        txt = b''
+        log.debug('Reading text from file...')
+        length = 0
+        base_dir = self.output_dir = getcwd()
+
+        # Extract content from zip archive.
+        if file_ext == 'txtz':
+            zf = ZipFile(stream)
+            zf.extractall('.')
+
+            for x in walk('.'):
+                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
+                    with open(x, 'rb') as tf:
+                        txt += tf.read() + b'\n\n'
+        else:
+            if getattr(stream, 'name', None):
+                base_dir = os.path.dirname(stream.name)
+            txt = stream.read()
+            if file_ext in {'md', 'textile', 'markdown'}:
+                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
+                log.info('File extension indicates particular formatting. '
+                        'Forcing formatting type to: %s'%options.formatting_type)
+                options.paragraph_type = 'off'
+
+        # Get the encoding of the document.
+        if options.input_encoding:
+            ienc = options.input_encoding
+            log.debug('Using user specified input encoding of %s' % ienc)
+        else:
+            det_encoding = detect(txt[:4096])
+            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
+            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
+                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+                # Microsoft Word exports to HTML with encoding incorrectly set to
+                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+                det_encoding = 'gbk'
+            ienc = det_encoding
+            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
+        if not ienc:
+            ienc = 'utf-8'
+            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
+        # Remove BOM from start of txt as its presence can confuse markdown
+        import codecs
+        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+            if txt.startswith(bom):
+                txt = txt[len(bom):]
+                break
+        txt = txt.decode(ienc, 'replace')
+
+        # Replace entities
+        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+
+        # Normalize line endings
+        txt = normalize_line_endings(txt)
+
+        # Determine the paragraph type of the document.
+        if options.paragraph_type == 'auto':
+            options.paragraph_type = detect_paragraph_type(txt)
+            if options.paragraph_type == 'unknown':
+                log.debug('Could not reliably determine paragraph type using block')
+                options.paragraph_type = 'block'
+            else:
+                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+        # Detect formatting
+        if options.formatting_type == 'auto':
+            options.formatting_type = detect_formatting_type(txt)
+            log.debug('Auto detected formatting as %s' % options.formatting_type)
+
+        if options.formatting_type == 'heuristic':
+            setattr(options, 'enable_heuristics', True)
+            setattr(options, 'unwrap_lines', False)
+            setattr(options, 'smarten_punctuation', True)
+
+        # Reformat paragraphs to block formatting based on the detected type.
+        # We don't check for block because the processor assumes block.
+        # single and print at transformed to block for processing.
+        if options.paragraph_type == 'single':
+            txt = separate_paragraphs_single_line(txt)
+        elif options.paragraph_type == 'print':
+            txt = separate_hard_scene_breaks(txt)
+            txt = separate_paragraphs_print_formatted(txt)
+            txt = block_to_single_line(txt)
+        elif options.paragraph_type == 'unformatted':
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            # unwrap lines based on punctuation
+            docanalysis = DocAnalysis('txt', txt)
+            length = docanalysis.line_length(.5)
+            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
+            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+            txt = separate_paragraphs_single_line(txt)
+        elif options.paragraph_type == 'block':
+            txt = separate_hard_scene_breaks(txt)
+            txt = block_to_single_line(txt)
+
+        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
+            docanalysis = DocAnalysis('txt', txt)
+            if not length:
+                length = docanalysis.line_length(.5)
+            dehyphenator = Dehyphenator(options.verbose, log=self.log)
+            txt = dehyphenator(txt,'txt', length)
+
+        # User requested transformation on the text.
+        if options.txt_in_remove_indents:
+            txt = remove_indents(txt)
+
+        # Preserve spaces will replace multiple spaces to a space
+        # followed by the &nbsp; entity.
+        if options.preserve_spaces:
+            txt = preserve_spaces(txt)
+
+        # Process the text using the appropriate text processor.
+        self.shifted_files = []
+        try:
+            html = ''
+            input_mi = None
+            if options.formatting_type == 'markdown':
+                log.debug('Running text through markdown conversion...')
+                try:
+                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
+                except RuntimeError:
+                    raise ValueError('This txt file has malformed markup, it cannot be'
+                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
+                html = self.fix_resources(html, base_dir)
+            elif options.formatting_type == 'textile':
+                log.debug('Running text through textile conversion...')
+                html = convert_textile(txt)
+                html = self.fix_resources(html, base_dir)
+            else:
+                log.debug('Running text through basic conversion...')
+                flow_size = getattr(options, 'flow_size', 0)
+                html = convert_basic(txt, epub_split_size_kb=flow_size)
+
+            # Run the HTMLized text through the html processing plugin.
+            from calibre.customize.ui import plugin_for_input_format
+            html_input = plugin_for_input_format('html')
+            for opt in html_input.options:
+                setattr(options, opt.option.name, opt.recommended_value)
+            options.input_encoding = 'utf-8'
+            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
+            odi = options.debug_pipeline
+            options.debug_pipeline = None
+            # Generate oeb from html conversion.
+            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
+            options.debug_pipeline = odi
+        finally:
+            for x in self.shifted_files:
+                os.remove(x)
+
+        # Set metadata from file.
+        if input_mi is None:
+            from calibre.customize.ui import get_file_type_metadata
+            input_mi = get_file_type_metadata(stream, file_ext)
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
+        self.html_postprocess_title = input_mi.title
+
+        return oeb
+
+    def postprocess_book(self, oeb, opts, log):
+        for item in oeb.spine:
+            if hasattr(item.data, 'xpath'):
+                for title in item.data.xpath('//*[local-name()="title"]'):
+                    if title.text == _('Unknown'):
+                        title.text = self.html_postprocess_title
--- a/ebook_converter/ebooks/conversion/plugins/txt_output.py
+++ b/ebook_converter/ebooks/conversion/plugins/txt_output.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+import shutil
+
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ptempfile import TemporaryDirectory, TemporaryFile
+
+# Accepted values for the 'newline' conversion option of the TXT output plugin.
+NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
+
+
+class TXTOutput(OutputFormatPlugin):
+    '''
+    Output plugin that writes the text extracted from an OEB book to a single
+    file, as plain text, Markdown or Textile depending on the
+    ``txt_output_formatting`` option.
+    '''
+
+    name = 'TXT Output'
+    author = 'John Schember'
+    file_type = 'txt'
+    commit_name = 'txt_output'
+    ui_data = {
+            'newline_types': NEWLINE_TYPES,
+            'formatting_types': {
+                'plain': _('Plain text'),
+                'markdown': _('Markdown formatted text'),
+                'textile': _('TexTile formatted text')
+            },
+    }
+
+    options = {
+        OptionRecommendation(name='newline', recommended_value='system',
+            level=OptionRecommendation.LOW,
+            short_switch='n', choices=NEWLINE_TYPES,
+            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
+                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
+                'For macOS use \'unix\'. \'system\' will default to the newline '
+                'type used by this OS.') % sorted(NEWLINE_TYPES)),
+        OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. '
+            'The default is utf-8.')),
+        OptionRecommendation(name='inline_toc',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Add Table of Contents to beginning of the book.')),
+        OptionRecommendation(name='max_line_length',
+            recommended_value=0, level=OptionRecommendation.LOW,
+            help=_('The maximum number of characters per line. This splits on '
+            'the first space before the specified value. If no space is found '
+            'the line will be broken at the space after and will exceed the '
+            'specified value. Also, there is a minimum of 25 characters. '
+            'Use 0 to disable line splitting.')),
+        OptionRecommendation(name='force_max_line_length',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Force splitting on the max-line-length value when no space '
+            'is present. Also allows max-line-length to be below the minimum')),
+        OptionRecommendation(name='txt_output_formatting',
+             recommended_value='plain',
+             choices=list(ui_data['formatting_types']),
+             help=_('Formatting used within the document.\n'
+                    '* plain: {plain}\n'
+                    '* markdown: {markdown}\n'
+                    '* textile: {textile}').format(**ui_data['formatting_types'])),
+        OptionRecommendation(name='keep_links',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not remove links within the document. This is only '
+            'useful when paired with a txt-output-formatting option that '
+            'is not none because links are always removed with plain text output.')),
+        OptionRecommendation(name='keep_image_references',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not remove image references within the document. This is only '
+            'useful when paired with a txt-output-formatting option that '
+            'is not none because links are always removed with plain text output.')),
+        OptionRecommendation(name='keep_color',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Do not remove font color from output. This is only useful when '
+                   'txt-output-formatting is set to textile. Textile is the only '
+                   'formatting that supports setting font color. If this option is '
+                   'not specified font color will not be set and default to the '
+                   'color displayed by the reader (generally this is black).')),
+     }
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''
+        Extract the text of ``oeb_book`` and write it to ``output_path``.
+
+        ``output_path`` is either a filesystem path or an object with a
+        ``write`` method. A path is opened (creating its parent directory if
+        needed) and closed here; a stream passed in by the caller is
+        rewound, truncated and written to, but left open.
+
+        The writer object is kept on ``self.writer`` after the call.
+        ``input_plugin`` is not used by this method.
+        '''
+        from calibre.ebooks.txt.txtml import TXTMLizer
+        from calibre.utils.cleantext import clean_ascii_chars
+        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines
+
+        formatting = opts.txt_output_formatting.lower()
+        if formatting == 'markdown':
+            from calibre.ebooks.txt.markdownml import MarkdownMLizer
+            self.writer = MarkdownMLizer(log)
+        elif formatting == 'textile':
+            from calibre.ebooks.txt.textileml import TextileMLizer
+            self.writer = TextileMLizer(log)
+        else:
+            self.writer = TXTMLizer(log)
+
+        txt = self.writer.extract_content(oeb_book, opts)
+        txt = clean_ascii_chars(txt)
+
+        log.debug('\tReplacing newlines with selected type...')
+        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
+
+        # Encode before touching the destination so that a bad encoding name
+        # cannot leave behind an empty, truncated output file.
+        encoded = txt.encode(opts.txt_output_encoding, 'replace')
+
+        if hasattr(output_path, 'write'):
+            out_stream, close = output_path, False
+        else:
+            out_dir = os.path.dirname(output_path)
+            if out_dir != '' and not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+            out_stream, close = open(output_path, 'wb'), True
+
+        try:
+            out_stream.seek(0)
+            out_stream.truncate()
+            out_stream.write(encoded)
+        finally:
+            # Only close what was opened here, and do so even if a write fails.
+            if close:
+                out_stream.close()
+
+
+class TXTZOutput(TXTOutput):
+    '''
+    Output plugin that packs the text produced by TXTOutput, the book's
+    images and a ``metadata.opf`` file into one zip archive.
+    '''
+
+    name = 'TXTZ Output'
+    author = 'John Schember'
+    file_type = 'txtz'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''
+        Build the archive contents in a temporary directory and zip that
+        directory into ``output_path``.
+
+        The text file is named ``index.text`` for textile output and
+        ``index.txt`` otherwise.
+        '''
+        from calibre.ebooks.oeb.base import OEB_IMAGES
+        from calibre.utils.zipfile import ZipFile
+        from lxml import etree
+
+        with TemporaryDirectory('_txtz_output') as tdir:
+            # TXT
+            txt_name = 'index.txt'
+            if opts.txt_output_formatting.lower() == 'textile':
+                txt_name = 'index.text'
+            with TemporaryFile(txt_name) as tf:
+                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
+                shutil.copy(tf, os.path.join(tdir, txt_name))
+
+            # Images
+            for item in oeb_book.manifest:
+                if item.media_type in OEB_IMAGES:
+                    if hasattr(self.writer, 'images'):
+                        # The writer tracks the images it referenced: store
+                        # only those, under the name it assigned to them.
+                        path = os.path.join(tdir, 'images')
+                        if item.href in self.writer.images:
+                            href = self.writer.images[item.href]
+                        else:
+                            continue
+                    else:
+                        path = os.path.join(tdir, os.path.dirname(item.href))
+                        href = os.path.basename(item.href)
+                    if not os.path.exists(path):
+                        os.makedirs(path)
+                    with open(os.path.join(path, href), 'wb') as imgf:
+                        imgf.write(item.data)
+
+            # Metadata
+            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
+                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))
+
+            txtz = ZipFile(output_path, 'w')
+            try:
+                txtz.add_dir(tdir)
+            finally:
+                # Close explicitly so the archive is finalised before the
+                # temporary directory is removed, instead of relying on
+                # garbage collection.
+                txtz.close()
--- a/ebook_converter/ebooks/conversion/plumber.py
+++ b/ebook_converter/ebooks/conversion/plumber.py
--- a/ebook_converter/ebooks/conversion/preprocess.py
+++ b/ebook_converter/ebooks/conversion/preprocess.py
@@ -0,0 +1,646 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import functools, re, json
+from math import ceil
+
+from calibre import entity_to_unicode, as_unicode
+from polyglot.builtins import unicode_type, range
+
+# Matches an XML declaration (<?xml ... ?>) at the very start of a string,
+# optionally preceded by whitespace.
+XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
+# Namespace URIs inserted into documents that use svg:/xlink: prefixes
+# without declaring them.
+SVG_NS       = 'http://www.w3.org/2000/svg'
+XLINK_NS     = 'http://www.w3.org/1999/xlink'
+
+# entity_to_unicode with a preset result_exceptions mapping for the five
+# characters that are significant in markup.
+# NOTE(review): presumably this keeps those characters as entities instead of
+# replacing them with the literal character - confirm in entity_to_unicode.
+convert_entities = functools.partial(entity_to_unicode,
+        result_exceptions={
+            '<' : '&lt;',
+            '>' : '&gt;',
+            "'" : '&apos;',
+            '"' : '&quot;',
+            '&' : '&amp;',
+        })
+# Shortest run from '<span' to the next '</span>', across lines and in any case.
+_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+
+# Typographic ligature code points and the plain letters that replace them.
+# The commented-out entries are not replaced.
+LIGATURES = {
+#        '\u00c6': 'AE',
+#        '\u00e6': 'ae',
+#        '\u0152': 'OE',
+#        '\u0153': 'oe',
+#        '\u0132': 'IJ',
+#        '\u0133': 'ij',
+#        '\u1D6B': 'ue',
+        '\uFB00': 'ff',
+        '\uFB01': 'fi',
+        '\uFB02': 'fl',
+        '\uFB03': 'ffi',
+        '\uFB04': 'ffl',
+        '\uFB05': 'ft',
+        '\uFB06': 'st',
+        }
+
+# Alternation matching any single key of LIGATURES.
+_ligpat = re.compile('|'.join(LIGATURES))
+
+
+def sanitize_head(match):
+    '''
+    Regex substitution callback: rebuild a <head> element from group 1 of
+    ``match`` with every <span>...</span> run stripped out.
+    '''
+    head_content = _span_pat.sub('', match.group(1))
+    return '<head>\n%s\n</head>' % head_content
+
+
+def chap_head(match):
+    '''
+    Regex substitution callback producing chapter heading markup.
+
+    The ``chap`` group becomes an <h1>. A non-empty ``title`` group is added
+    as an <h3> on the next line; without a title the <h1> is followed by a
+    <br/> instead.
+    '''
+    chapter, heading = match.group('chap'), match.group('title')
+    if heading:
+        return '<h1>'+chapter+'</h1>\n<h3>'+heading+'</h3>\n'
+    return '<h1>'+chapter+'</h1><br/>\n'
+
+
+def wrap_lines(match):
+    '''
+    Regex substitution callback used when joining wrapped lines: returns a
+    single space, preceded by the ``ital`` group when that group matched
+    something.
+    '''
+    closing_tag = match.group('ital')
+    return closing_tag+' ' if closing_tag else ' '
+
+
+def smarten_punctuation(html, log=None):
+    '''
+    Run ``html`` through fix_nbsp_indents and smartyPants, then through
+    substitute_entites, and return the result.
+
+    The HTML comment delimiters are swapped for unique placeholder strings
+    while the text is processed and are put back afterwards.
+    '''
+    from uuid import uuid4
+    from calibre.utils.smartypants import smartyPants
+    from calibre.ebooks.chardet import substitute_entites
+    from calibre.ebooks.conversion.utils import HeuristicProcessor
+
+    heuristics = HeuristicProcessor(log=log)
+    open_marker = 'calibre-smartypants-'+unicode_type(uuid4())
+    close_marker = 'calibre-smartypants-'+unicode_type(uuid4())
+
+    masked = html.replace('<!--', open_marker).replace('-->', close_marker)
+    masked = smartyPants(heuristics.fix_nbsp_indents(masked))
+    restored = masked.replace(open_marker, '<!--').replace(close_marker, '-->')
+    return substitute_entites(restored)
+
+
+class DocAnalysis(object):
+    '''
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document the analysis will be done against; one of
+    'html', 'pdf', 'spanned_html' or 'txt'.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
+    '''
+
+    def __init__(self, format='html', raw=''):
+        '''
+        Split ``raw`` into "lines" using the pattern for ``format`` and store
+        them in ``self.lines``. Raises ValueError for an unknown format.
+        '''
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n')
+        else:
+            # Previously this fell through and failed with an unbound local
+            raise ValueError('Unknown document format: %r' % (format,))
+        self.lines = linere.findall(raw)
+
+    def line_length(self, percent):
+        '''
+        Analyses the document to find a representative line length.
+        percent is a decimal number, 0 - 1 (values outside that range are
+        clamped) which is used to determine how far in the list of line
+        lengths to use. The list of line lengths is ordered smallest to
+        largest, does not include duplicates and excludes lengths greater
+        than twice the average. 0.5 is the median value.
+        Returns 0 when there are no non-empty lines.
+        '''
+        lengths = {len(line) for line in self.lines if len(line) > 0}
+        if not lengths:
+            return 0
+
+        avg = sum(lengths) / len(lengths)
+        max_line = ceil(avg * 2)
+        lengths = sorted(size for size in lengths if size <= max_line)
+
+        percent = min(1, max(0, percent))
+
+        # Clamp at 0: a small percent used to give index -1, which silently
+        # selected the *longest* line instead of the shortest.
+        index = max(0, int(len(lengths) * percent) - 1)
+
+        return lengths[index]
+
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks.  Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength = 20  # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength = 1900  # Discard larger than this to stay in range
+        buckets = 20  # Each line is divided into a bucket based on length
+
+        # Build the line length histogram; each bucket is 100 characters wide
+        hRaw = [0] * buckets
+        for line in self.lines:
+            size = len(line)
+            if minLineLength < size < maxLineLength:
+                hRaw[size // 100] += 1
+
+        # Share of all lines (including the ignored ones) in the biggest bucket
+        totalLines = len(self.lines)
+        if totalLines > 0:
+            maxValue = max(float(count)/totalLines for count in hRaw)
+        else:
+            maxValue = 0
+
+        # False means the line lengths are too variable to unwrap
+        return maxValue >= percent
+
+
+class Dehyphenator(object):
+    '''
+    Analyzes words to determine whether hyphens should be retained/removed.  Uses the document
+    itself as a dictionary. This method handles all languages along with uncommon, made-up, and
+    scientific words. The primary disadvantage is that words appearing only once in the document
+    retain hyphens.
+    '''
+
+    def __init__(self, verbose=0, log=None):
+        self.log = log
+        self.verbose = verbose
+        # Add common suffixes to the regex below to increase the likelihood of a match -
+        # don't add suffixes which are also complete words, such as 'able' or 'sex'
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = (
+            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
+            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
+            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of hyphenation
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
+
+    def dehyphenate(self, match):
+        '''
+        Regex substitution callback deciding how one hyphenated word is written.
+
+        ``match`` must provide the ``firstpart`` and ``secondpart`` groups and
+        may provide ``wraptags``. The joined word, with common suffixes and
+        prefixes stripped, is looked up in ``self.html``: if it is found the
+        word is returned without the hyphen, otherwise with it. In the
+        cleanup formats a word found in neither form is returned with an em
+        dash and the original wrapping markup between the two halves.
+        '''
+        firsthalf = match.group('firstpart')
+        secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except IndexError:
+            # Not every pattern defines a wraptags group
+            wraptags = ''
+        hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
+        dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
+            lookupword = self.removeprefix.sub('', lookupword)
+        if self.verbose > 2:
+            self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
+        try:
+            searchresult = self.html.find(lookupword.lower())
+        except Exception:
+            # Best effort: if the lookup itself fails keep the hyphen. Unlike
+            # a bare except this lets KeyboardInterrupt/SystemExit through.
+            return hyphenated
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                if self.verbose > 2:
+                    self.log("    Cleanup:returned dehyphenated word: " + dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                if self.verbose > 2:
+                    self.log("        Cleanup:returned hyphenated word: " + hyphenated)
+                return hyphenated
+            else:
+                if self.verbose > 2:
+                    self.log("            Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
+                return firsthalf+'\u2014'+wraptags+secondhalf
+
+        else:
+            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + hyphenated)
+                return hyphenated
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + hyphenated)
+                return hyphenated
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                if self.verbose > 2:
+                    self.log("     returned dehyphenated word: " + dehyphenated)
+                return dehyphenated
+            else:
+                if self.verbose > 2:
+                    self.log("          returned hyphenated word: " + hyphenated)
+                return hyphenated
+
+    def __call__(self, html, format, length=1):
+        '''
+        Return ``html`` with every hyphenated line break matched by the
+        pattern for ``format`` rewritten by ``dehyphenate``.
+
+        ``format`` is one of 'html', 'pdf', 'txt', 'individual_words',
+        'html_cleanup' or 'txt_cleanup'; anything else raises ValueError.
+        ``length`` is the number of characters that must precede a match in
+        the 'html', 'pdf' and 'txt' formats. ``html`` and ``format`` are
+        stored on the instance for use by ``dehyphenate``.
+        '''
+        self.html = html
+        self.format = format
+        if format == 'html':
+            intextmatch = re.compile((
+                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
+                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
+                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
+        elif format == 'pdf':
+            intextmatch = re.compile((
+                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
+                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
+        elif format == 'txt':
+            intextmatch = re.compile(
+                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
+        elif format == 'individual_words':
+            intextmatch = re.compile(
+                r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(
+                r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
+                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(
+                r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+        else:
+            # Previously this fell through and failed with an unbound local
+            raise ValueError('Unknown dehyphenation format: %r' % (format,))
+
+        html = intextmatch.sub(self.dehyphenate, html)
+        return html
+
+
+class CSSPreProcessor(object):
+    '''
+    Callable that strips Microsoft-specific declarations from a stylesheet
+    and can optionally insert the XHTML namespace rule into it.
+    '''
+
+    # Remove some of the broken CSS Microsoft products
+    # create
+    MS_PAT     = re.compile(r'''
+        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
+        (%s).+?                 # The invalid selectors
+        (?P<end>$|;|\})         # The end of the declaration
+        '''%'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+    def ms_sub(self, match):
+        '''
+        Replacement for one MS_PAT match: keep the delimiter that preceded
+        the declaration, plus the one that followed it unless that is ';'.
+        '''
+        end = match.group('end')
+        try:
+            start = match.group('start')
+        except IndexError:
+            # Pattern without a 'start' group
+            start = ''
+        if end == ';':
+            end = ''
+        return start + end
+
+    def __call__(self, data, add_namespace=False):
+        '''
+        Return ``data`` with the Microsoft-specific declarations removed.
+
+        With ``add_namespace`` true, comments are also removed and
+        XHTML_CSS_NAMESPACE is inserted before the first line that is not
+        blank and does not start with @import or @charset.
+        '''
+        data = self.MS_PAT.sub(self.ms_sub, data)
+        if not add_namespace:
+            return data
+
+        # Imported only here: the plain clean-up path above does not need it
+        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
+
+        # Remove comments as the following namespace logic will break if there
+        # are commented lines before the first @import or @charset rule. Since
+        # the conversion will remove all stylesheets anyway, we don't lose
+        # anything
+        data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
+
+        ans, namespaced = [], False
+        for line in data.splitlines():
+            ll = line.lstrip()
+            if not (namespaced or ll.startswith('@import') or not ll or
+                        ll.startswith('@charset')):
+                ans.append(XHTML_CSS_NAMESPACE.strip())
+                namespaced = True
+            ans.append(line)
+
+        return '\n'.join(ans)
+
+
+def accent_regex(accent_maps, letter_before=False):
+    '''
+    Build a (pattern, replacement function) pair that merges a stand-alone
+    accent character and a neighbouring base letter into one accented letter.
+
+    accent_maps maps each accent character to a string of the form
+    'letters:replacements', where both halves have the same length and the
+    n-th letter is replaced by the n-th replacement. The mapping is read
+    only; it is not modified.
+
+    By default the pattern expects the accent before the letter; with
+    letter_before=True it expects the letter first. Whitespace and at most
+    one <br> tag are allowed between the two. A letter that has no
+    replacement for the accent it was found with is left unchanged.
+
+    Raises ValueError if the two halves of a mapping differ in length or a
+    mapping has no ':' separator.
+    '''
+    letter_maps = {}
+    letters = set()
+
+    for accent, spec in accent_maps.items():
+        k, v = spec.split(':', 1)
+        if len(k) != len(v):
+            raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
+        # Keep the parsed form in a local dict rather than overwriting the
+        # caller's values, so the same mapping can be passed in again.
+        letter_maps[accent] = lmap = dict(zip(k, v))
+        letters |= set(lmap)
+
+    # Escape every character so that none of them can be taken as character
+    # class syntax (^, ], - or backslash).
+    accent_class = ''.join(re.escape(c) for c in sorted(letter_maps))
+    letter_class = ''.join(re.escape(c) for c in sorted(letters))
+
+    if letter_before:
+        args = letter_class, accent_class
+        accent_group, letter_group = 2, 1
+    else:
+        args = accent_class, letter_class
+        accent_group, letter_group = 1, 2
+
+    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
+
+    def sub(m):
+        lmap = letter_maps[m.group(accent_group)]
+        return lmap.get(m.group(letter_group)) or m.group()
+
+    return pat, sub
+
+
+def html_preprocess_rules():
+    '''
+    Return the list of (compiled pattern, replacement) rules applied to all
+    HTML input. The list is built on the first call and stored on the
+    function object; later calls return that same list.
+    '''
+    cached = getattr(html_preprocess_rules, 'ans', None)
+    if cached is not None:
+        return cached
+
+    # Remove huge block of contiguous spaces as they slow down
+    # the following regexes pretty badly
+    huge_whitespace = (re.compile(r'\s{10000,}'), '')
+    # Some idiotic HTML generators (Frontpage I'm looking at you)
+    # Put all sorts of crap into <head>. This messes up lxml
+    head_cleanup = (
+        re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
+        sanitize_head)
+    # Convert all entities, since lxml doesn't handle them well
+    entities = (re.compile(r'&(\S+?);'), convert_entities)
+    # Remove the <![if/endif tags inserted by everybody's darling, MS Word
+    msword_conditionals = (
+        re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), '')
+
+    rules = [huge_whitespace, head_cleanup, entities, msword_conditionals]
+    html_preprocess_rules.ans = rules
+    return rules
+
+
+def pdftohtml_rules():
+    '''
+    Return the list of (compiled pattern, replacement) rules applied to HTML
+    produced by pdftohtml. The list is built on the first call and stored on
+    the function object; later calls return that same list.
+    '''
+    cached = getattr(pdftohtml_rules, 'ans', None)
+    if cached is not None:
+        return cached
+
+    # Accent followed by the letter it belongs to
+    accent_then_letter = accent_regex({
+        '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
+        '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
+        '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
+        'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
+        '¸': 'cC:çÇ',
+        '˛': 'aAeE:ąĄęĘ',
+        '˙': 'zZ:żŻ',
+        'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
+        '°': 'uU:ůŮ',
+    })
+    # Grave accent following its letter
+    letter_then_grave = accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True)
+
+    rules = [
+        accent_then_letter,
+        letter_then_grave,
+
+        # If pdf printed from a browser then the header/footer has a reliable pattern
+        (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+        # Center separator lines
+        (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
+
+        # Remove <hr> tags
+        (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
+
+        # Remove gray background
+        (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
+
+        # Convert line breaks to paragraphs
+        (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
+        (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
+        (re.compile(r'\s*</body>'), '</p>\n</body>'),
+
+        # Clean up spaces
+        (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
+        # Add space before and after italics
+        (re.compile(r'(?<!“)<i>'), ' <i>'),
+        (re.compile(r'</i>(?=\w)'), '</i> '),
+    ]
+    pdftohtml_rules.ans = rules
+    return rules
+
+
+def book_designer_rules():
+    '''
+    Return the list of (compiled pattern, replacement) rules applied to HTML
+    produced by Book Designer. The list is built on the first call and
+    stored on the function object; later calls return that same list.
+    '''
+    ans = getattr(book_designer_rules, 'ans', None)
+    if ans is None:
+        ans = book_designer_rules.ans = [
+        # HR
+        (re.compile('<hr>', re.IGNORECASE),
+        lambda match : '<span style="page-break-after:always"> </span>'),
+        # Create header tags
+        # Group 3 captures the heading text that the replacement re-emits;
+        # without it match.group(3) raised IndexError (no such group).
+        (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
+        lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+        (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
+        lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+        lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
+        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+        lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
+    ]
+    # Return the rule list itself; callers concatenate and insert into it.
+    return ans
+
+
+class HTMLPreProcessor(object):
+    '''
+    Callable that cleans up raw HTML text with a series of regex rules and
+    optional processing steps selected through ``extra_opts``.
+    '''
+
+    def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
+        self.log = log
+        self.extra_opts = extra_opts
+        self.regex_wizard_callback = regex_wizard_callback
+        # Passed to regex_wizard_callback together with the html
+        self.current_href = None
+
+    def is_baen(self, src):
+        '''True if ``src`` has a Publisher meta tag whose content mentions Baen.'''
+        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
+                          re.IGNORECASE).search(src) is not None
+
+    def is_book_designer(self, raw):
+        '''True if ``raw`` contains an <H2> tag with id=BookTitle.'''
+        return re.search('<H2[^><]*id=BookTitle', raw) is not None
+
+    def is_pdftohtml(self, src):
+        '''True if the pdftohtml marker comment is in the first 1000 characters.'''
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+
+    def __call__(self, html, remove_special_chars=None,
+            get_preprocess_html=False):
+        '''
+        Return the preprocessed form of ``html``.
+
+        ``remove_special_chars``, if given, is a compiled pattern whose
+        matches are deleted first. With ``get_preprocess_html`` true the text
+        is returned right after the generic rules have been applied, before
+        the user search & replace rules and everything that follows them.
+
+        A user search & replace rule that fails to compile or to apply is
+        logged and skipped; a failure in any other rule is re-raised.
+        '''
+        if remove_special_chars is not None:
+            html = remove_special_chars.sub('', html)
+        html = html.replace('\0', '')
+        is_pdftohtml = self.is_pdftohtml(html)
+        # The rule functions return cached lists shared between calls. Work
+        # on a copy, because user search & replace rules are inserted below
+        # and would otherwise pile up in the cache on every call.
+        if self.is_baen(html):
+            rules = []
+        elif self.is_book_designer(html):
+            rules = list(book_designer_rules())
+        elif is_pdftohtml:
+            rules = list(pdftohtml_rules())
+        else:
+            rules = []
+
+        start_rules = []
+
+        if not getattr(self.extra_opts, 'keep_ligatures', False):
+            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+
+        user_sr_rules = {}
+        # Function for processing search and replace
+
+        def do_search_replace(search_pattern, replace_txt):
+            from calibre.ebooks.conversion.search_replace import compile_regular_expression
+            try:
+                search_re = compile_regular_expression(search_pattern)
+                if not replace_txt:
+                    replace_txt = ''
+                rules.insert(0, (search_re, replace_txt))
+                user_sr_rules[(search_re, replace_txt)] = search_pattern
+            except Exception as e:
+                # Report the expression that failed, not the option name
+                self.log.error('Failed to parse %r regexp because %s' %
+                        (search_pattern, as_unicode(e)))
+
+        # search / replace using the sr?_search / sr?_replace options
+        for i in range(1, 4):
+            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
+            search_pattern = getattr(self.extra_opts, search, '')
+            replace_txt = getattr(self.extra_opts, replace, '')
+            if search_pattern:
+                do_search_replace(search_pattern, replace_txt)
+
+        # multi-search / replace using the search_replace option
+        search_replace = getattr(self.extra_opts, 'search_replace', None)
+        if search_replace:
+            search_replace = json.loads(search_replace)
+            for search_pattern, replace_txt in reversed(search_replace):
+                do_search_replace(search_pattern, replace_txt)
+
+        end_rules = []
+        # delete soft hyphens - moved here so it's executed after header/footer removal
+        if is_pdftohtml:
+            # The soft hyphen (U+00AD) is invisible in most editors, so it is
+            # written as an escape in the two patterns below.
+            # unwrap/delete soft hyphens
+            end_rules.append((re.compile(
+                r'[\u00ad](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens with formatting
+            end_rules.append((re.compile(
+                r'[\u00ad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
+
+        length = -1
+        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
+            docanalysis = DocAnalysis('pdf', html)
+            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
+            if length:
+                # unwrap em/en dashes
+                end_rules.append((re.compile(
+                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
+                end_rules.append(
+                    # Un wrap using punctuation
+                    (re.compile((
+                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
+                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
+                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
+                )
+
+        for rule in html_preprocess_rules() + start_rules:
+            html = rule[0].sub(rule[1], html)
+
+        if self.regex_wizard_callback is not None:
+            self.regex_wizard_callback(self.current_href, html)
+
+        if get_preprocess_html:
+            return html
+
+        def dump(raw, where):
+            # Debugging aid, see the commented-out calls below: writes raw to
+            # the next free NNNN.html under <debug_pipeline>/input/<where>
+            import os
+            dp = getattr(self.extra_opts, 'debug_pipeline', None)
+            if dp and os.path.exists(dp):
+                odir = os.path.join(dp, 'input')
+                if os.path.exists(odir):
+                    odir = os.path.join(odir, where)
+                    if not os.path.exists(odir):
+                        os.makedirs(odir)
+                    name, i = None, 0
+                    while not name or os.path.exists(os.path.join(odir, name)):
+                        i += 1
+                        name = '%04d.html'%i
+                    with open(os.path.join(odir, name), 'wb') as f:
+                        f.write(raw.encode('utf-8'))
+
+        # dump(html, 'pre-preprocess')
+
+        for rule in rules + end_rules:
+            try:
+                html = rule[0].sub(rule[1], html)
+            except Exception as e:
+                if rule in user_sr_rules:
+                    self.log.error(
+                        'User supplied search & replace rule: %s -> %s '
+                        'failed with error: %s, ignoring.'%(
+                            user_sr_rules[rule], rule[1], e))
+                else:
+                    raise
+
+        if is_pdftohtml and length > -1:
+            # Dehyphenate
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+            html = dehyphenator(html,'html', length)
+
+        if is_pdftohtml:
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
+            totalwords = 0
+            if pdf_markup.get_word_count(html) > 7000:
+                html = pdf_markup.markup_chapters(html, totalwords, True)
+
+        # dump(html, 'post-preprocess')
+
+        # Handle broken XHTML w/ SVG (ugh)
+        if 'svg:' in html and SVG_NS not in html:
+            html = html.replace(
+                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
+        if 'xlink:' in html and XLINK_NS not in html:
+            html = html.replace(
+                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
+
+        html = XMLDECL_RE.sub('', html)
+
+        if getattr(self.extra_opts, 'asciiize', False):
+            from calibre.utils.localization import get_udc
+            from calibre.utils.mreplace import MReplace
+            unihandecoder = get_udc()
+            mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
+            html = mr.mreplace(html)
+            html = unihandecoder.decode(html)
+
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)
+
+        if is_pdftohtml:
+            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
+
+        if getattr(self.extra_opts, 'smarten_punctuation', False):
+            html = smarten_punctuation(html, self.log)
+
+        try:
+            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        except AttributeError:
+            unsupported_unicode_chars = ''
+        if unsupported_unicode_chars:
+            from calibre.utils.localization import get_udc
+            unihandecoder = get_udc()
+            for char in unsupported_unicode_chars:
+                asciichar = unihandecoder.decode(char)
+                html = html.replace(char, asciichar)
+
+        return html
--- a/ebook_converter/ebooks/conversion/utils.py
+++ b/ebook_converter/ebooks/conversion/utils.py
@@ -0,0 +1,881 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+from math import ceil
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
+from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
+from polyglot.builtins import unicode_type
+
+
+class HeuristicProcessor(object):
+
+    def __init__(self, extra_opts=None, log=None):
+        '''
+        Store the options and logger and initialise the counters, flags and
+        shared regular expression pieces used by the other methods.
+
+        extra_opts is kept as self.extra_opts; log falls back to default_log
+        when it is None.
+        '''
+        self.extra_opts = extra_opts
+        self.log = log if log is not None else default_log
+
+        # Counters
+        self.html_preprocess_sections = 0
+        self.found_indents = 0
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+
+        # State flags
+        self.deleted_nbsps = False
+        self.blanks_deleted = False
+        self.blanks_between_paragraphs = False
+
+        # Compiled patterns for paragraphs and blank paragraphs
+        nocase = re.IGNORECASE
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', nocase|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', nocase)
+        self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', nocase)
+        self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', nocase)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', nocase)
+        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', nocase)
+
+        # Uncompiled regex fragments matching the opening and closing tags
+        # of a line (outer p/div plus up to three nested inline tags)
+        self.line_open = (
+            r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
+            r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
+        self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
+
+        # Scene break markup and the character classes used to detect them
+        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
+        self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
+        self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
+
+    def is_pdftohtml(self, src):
+        '''Return True when the calibre pdftohtml marker comment occurs in the first 1000 characters of src.'''
+        head = src[:1000]
+        return '<!-- created by calibre\'s pdftohtml -->' in head
+
+    def is_abbyy(self, src):
+        '''Return True when an ABBYY FineReader generator meta tag starts within the first 1000 characters of src.'''
+        head = src[:1000]
+        return '<meta name="generator" content="ABBYY FineReader' in head
+
+    def chapter_head(self, match):
+        '''
+        Substitution callback that converts a detected chapter match into
+        heading markup.
+
+        match must provide the named groups 'chap' and 'title'.  When the
+        title group is empty an <h2> containing the chapter text is returned.
+        Otherwise the result is an <h2> carrying a plain text title attribute
+        built from both groups, followed by an <h3 class="sigilNotInTOC">
+        containing the title.  self.html_preprocess_sections is incremented
+        once per call.
+        '''
+        from calibre.utils.html2text import html2text
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
+                    " chapters. - " + unicode_type(chap))
+            return '<h2>'+chap+'</h2>\n'
+        else:
+            delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
+            # Character class: remove every single or double quote.  The
+            # text ends up inside title="...", where a stray double quote
+            # would terminate the attribute early.
+            delete_quotes = re.compile('[\'\"]')
+            txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
+            txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
+                    " chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
+            return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
+
+    def chapter_break(self, match):
+        '''
+        Substitution callback for section markers.
+
+        Reads the named groups 'section' and 'styles' from match, counts the
+        hit in self.html_preprocess_sections, logs it and returns an opening
+        tag built from 'styles' with a page-break-before:always style,
+        followed by the 'section' text.
+        '''
+        section = match.group('section')
+        styles_group = match.group('styles')
+        self.html_preprocess_sections += 1
+        message = ("marked " + unicode_type(self.html_preprocess_sections) +
+                " section markers based on punctuation. - " + unicode_type(section))
+        self.log.debug(message)
+        return '<' + styles_group + ' style="page-break-before:always">' + section
+
+    def analyze_title_matches(self, match):
+        '''
+        Tally one chapter match: increment self.chapters_with_title when the
+        named group 'title' holds text, self.chapters_no_title otherwise.
+        Returns None.
+        '''
+        if match.group('title'):
+            self.chapters_with_title += 1
+        else:
+            self.chapters_no_title += 1
+
+    def insert_indent(self, match):
+        '''
+        Substitution callback that rebuilds an opening tag with a 3% text
+        indent.
+
+        Uses the named groups 'tagtype', 'formatting' (existing attributes)
+        and 'span' (optional text appended after the rebuilt tag) and
+        increments self.found_indents.  When the attributes already mention
+        "style" the indent is appended before a trailing double quote,
+        otherwise a new style attribute is added.
+        '''
+        attrs = match.group('formatting')
+        tagname = match.group('tagtype')
+        trailing = match.group('span')
+        self.found_indents += 1
+
+        if not attrs:
+            opening = '<' + tagname + ' style="text-indent:3%">'
+        else:
+            if 'style' in attrs.lower():
+                attrs = re.sub(r'"$', '; text-indent:3%"', attrs)
+            else:
+                attrs = attrs + ' style="text-indent:3%"'
+            opening = '<' + tagname + ' ' + attrs + '>'
+
+        return (opening + trailing) if trailing else opening
+
+    def no_markup(self, raw, percent):
+        '''
+        Decide whether the line endings in raw are mostly not marked up.
+
+        Counts the closing </p> and </div> tags and the line breaks in raw.
+        percent is the fraction (clamped to the range 0..1) of line breaks
+        that the number of closing tags has to reach; True is returned when
+        there are fewer closing tags than that.
+        '''
+        tot_htm_ends = len(re.findall('</(p|div)>', raw))
+        # '\r\n' has to be tried before the single characters, otherwise a
+        # Windows line ending is counted as two separate line breaks.
+        tot_ln_fds = len(re.findall('\r\n|\n|\r', raw))
+
+        percent = min(max(percent, 0), 1)
+
+        min_lns = tot_ln_fds * percent
+        return min_lns > tot_htm_ends
+
+    def dump(self, raw, where):
+        '''
+        Write raw, UTF-8 encoded, to the next free NNNN.html file below
+        <debug_pipeline>/preprocess/<where>.
+
+        Does nothing when self.extra_opts has no debug_pipeline value or
+        that path does not exist.
+        '''
+        import os
+        debug_dir = getattr(self.extra_opts, 'debug_pipeline', None)
+        if not debug_dir or not os.path.exists(debug_dir):
+            return
+
+        target = os.path.join(debug_dir, 'preprocess')
+        if not os.path.exists(target):
+            os.makedirs(target)
+        if not os.path.exists(target):
+            return
+
+        target = os.path.join(target, where)
+        if not os.path.exists(target):
+            os.makedirs(target)
+
+        # Find the first unused numbered file name, starting at 0001.html
+        counter = 1
+        name = '%04d.html'%counter
+        while os.path.exists(os.path.join(target, name)):
+            counter += 1
+            name = '%04d.html'%counter
+
+        with open(os.path.join(target, name), 'wb') as f:
+            f.write(raw.encode('utf-8'))
+
+    def get_word_count(self, html):
+        '''
+        Return the .words value of get_wordcount_obj() for html after the
+        <head> section and all tags have been removed.
+        '''
+        without_head = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        plain_text = re.sub(r'<[^>]*>', '', without_head)
+        return get_wordcount_obj(plain_text).words
+
+    def markup_italicis(self, html):
+        '''
+        Wrap common abbreviations and plain text emphasis in <i> tags.
+
+        First every word of ITALICIZE_WORDS that stands between whitespace
+        or tag boundaries is wrapped.  Then the text (with <head> and all
+        tags removed) is searched for plain text emphasis markers such as
+        _*/x/*_, ~~x~~, _/x/_, _*x*_, */x/*, /:x:/, |:x:|, *x*, ~x~, /x/ and
+        _x_ that follow whitespace, '>' or a quote character.  Each hit is
+        replaced in html by <i>x</i>.
+
+        Returns the modified html.
+        '''
+        ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+
+        ITALICIZE_STYLE_PATS = [
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/'),
+            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'),
+        ]
+
+        for word in ITALICIZE_WORDS:
+            html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
+
+        # The markers are searched in the tag free text so that markup is
+        # never mistaken for an emphasis marker
+        search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        search_text = re.sub(r'<[^>]*>', '', search_text)
+        for pat in ITALICIZE_STYLE_PATS:
+            for match in re.finditer(pat, search_text):
+                ital_string = unicode_type(match.group('words'))
+                # Literal replacement: with re.sub() a backslash in the
+                # matched words would be read as a replacement template
+                # escape (e.g. \n or \1) and either corrupt the text or
+                # raise, so plain str.replace() is used.
+                html = html.replace(unicode_type(match.group(0)), '<i>%s</i>' % ital_string)
+
+        return html
+
+    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document.
+        Attempts multiple patterns, ordered by likelihood of a match with
+        minimum false positives, and stops once enough headings were found.
+
+        html is the markup to process, wordcount the number of words used to
+        derive the minimum number of chapters and blanks_between_paragraphs
+        tells whether up to two blank paragraphs are allowed between a
+        chapter line and its title line.
+
+        Works in two passes: an analysis pass that only counts the hits of
+        every pattern, followed by a pass that rewrites html with the
+        patterns selected by the analysis.  Returns the modified html and
+        updates self.min_chapters and self.html_preprocess_sections.
+        '''
+        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+        # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
+        if wordcount > 7000:
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
+        self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
+        # Headings h1-h3 that already exist count towards the required minimum
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")
+
+        # Build the Regular Expressions in pieces
+        init_lookahead = "(?=<(p|div))"
+        chapter_line_open = self.line_open
+        title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
+        r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
+        chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
+        chapter_header_close = ")\\s*"
+        title_header_close = ")"
+        chapter_line_close = self.line_close
+        title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"
+
+        # For pdftohtml output the title line is matched as a bare <p> only
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
+            title_line_close = "\\s*</(?P=outer2)>"
+
+        if blanks_between_paragraphs:
+            blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        opt_title_close = ")?"
+        n_lookahead_open = "(?!\\s*"
+        n_lookahead_close = ")\\s*"
+
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
+
+        # Filled by the analysis pass with the pattern entries to apply
+        analysis_result = []
+
+        # Every entry is laid out as:
+        # [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]
+        chapter_types = [
+            [(
+                r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
+                r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
+            # Highest frequency headings which include titles
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
+                           True, True, True, False, "Searching for emphasized lines", 'emphasized'],  # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
+                       "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
+                       "Searching for numeric chapters with titles", 'numeric_title'],  # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
+                       "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
+                          "Searching for chapters with Uppercase Characters", 'uppercase']  # Uppercase Chapters
+            ]
+
+        def recurse_patterns(html, analyze):
+            # Walks the entries of chapter_types (looked up at call time, so
+            # the second call sees the list chosen by the analysis).  With
+            # analyze set the hits are only counted and html is returned
+            # unchanged, otherwise the matches are rewritten by chapter_head.
+            # Start with most typical chapter headings, get more aggressive until one works
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
+                # Enough headings are present already, stop searching
+                if self.html_preprocess_sections >= self.min_chapters:
+                    break
+                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                if n_lookahead_req:
+                    # Plain text substitution of every 'ou', 'in' and 'cha'
+                    # in the chapter line, which gives the named groups of
+                    # this copy different names so that it can be used a
+                    # second time (inside the negative lookahead) within the
+                    # same pattern.
+                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                if not analyze:
+                    self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
+                    lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    if hits:
+                        # The result of this sub() is discarded, it is only
+                        # run to let analyze_title_matches count the titles
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        # The pattern is compiled already, so these changes
+                        # only affect the entry stored in analysis_result
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(
+                                unicode_type(type_name)+" had "+unicode_type(hits)+
+                                " hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
+                                unicode_type(self.chapters_with_title)+" chapters with titles, "+
+                                unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        # The chained comparison on the right reads as
+                        # "self.min_chapters < 3 and 3 > hits"
+                        elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
+                else:
+                    html = chapdetect.sub(self.chapter_head, html)
+            return html
+
+        # First pass: analysis only, the returned html is ignored
+        recurse_patterns(html, True)
+        # Second pass: apply only the patterns picked by the analysis
+        chapter_types = analysis_result
+        html = recurse_patterns(html, False)
+
+        words_per_chptr = wordcount
+        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+            words_per_chptr = wordcount // self.html_preprocess_sections
+        self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
+                       unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
+        return html
+
+    def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation.
+        Supports a range of html markup and text files.
+
+        length is the minimum number of characters that have to precede the
+        line break, content the text to unwrap and format selects the plain
+        text handling when it equals 'txt' (anything else is treated as
+        html).  Returns the unwrapped content.
+
+        The lookahead regex below is meant to look for any non-full stop characters - punctuation
+        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+        The reason for this is to prevent false positive wrapping.  False positives are more
+        difficult to detect than false negatives during a manual review of the doc.
+
+        This function intentionally leaves hyphenated content alone as that is handled by the
+        dehyphenate routine in a separate step.
+        '''
+        def style_unwrap(match):
+            # Replacement for html matches: the line break becomes a single
+            # space, while a closing inline tag before it and an opening
+            # inline tag after it are kept in place.
+            style_close = match.group('style_close')
+            style_open = match.group('style_open')
+            if style_open and style_close:
+                return style_close+' '+style_open
+            elif style_open and not style_close:
+                return ' '+style_open
+            elif not style_open and style_close:
+                return style_close+' '
+            else:
+                return ' '
+
+        # define the pieces of the regex
+        # (?<!\&\w{4});) is a semicolon not part of an entity
+        # Despite its name, lookahead is a lookbehind: at least `length`
+        # characters followed by a character that does not end a sentence
+        lookahead = "(?<=.{"+unicode_type(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
+        # Same for lines that end in an en dash or em dash
+        em_en_lookahead = "(?<=.{"+unicode_type(length)+"}[\u2013\u2014])"
+        soft_hyphen = "\xad"
+        line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
+        blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
+        line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
+        # One to four line breaks, each optionally preceded by spaces or tabs
+        txt_line_wrap = "((\u0020|\u0009)*\n){1,4}"
+
+        if format == 'txt':
+            unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+        else:
+            unwrap_regex = lookahead+line_ending+blanklines+line_opening
+            em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+            shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+
+        unwrap = re.compile("%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile("%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile("%s" % shy_unwrap_regex, re.UNICODE)
+
+        if format == 'txt':
+            # In plain text only the regular unwrap inserts a space, lines
+            # ending in a dash or a soft hyphen are joined directly (the
+            # soft hyphen is part of the match and is removed with it)
+            content = unwrap.sub(' ', content)
+            content = em_en_unwrap.sub('', content)
+            content = shy_unwrap.sub('', content)
+        else:
+            content = unwrap.sub(style_unwrap, content)
+            content = em_en_unwrap.sub(style_unwrap, content)
+            content = shy_unwrap.sub(style_unwrap, content)
+
+        return content
+
+    def txt_process(self, match):
+        '''
+        Substitution callback: passes the named group 'text' of match through
+        separate_paragraphs_single_line() and convert_basic() and returns
+        the result.
+        '''
+        from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
+        paragraphs = separate_paragraphs_single_line(match.group('text'))
+        return convert_basic(paragraphs, epub_split_size_kb=0)
+
+    def markup_pre(self, html):
+        '''
+        Add paragraph markup to text that has none.
+
+        When html contains a <pre> tag, every <pre> section (together with
+        the text in front of it) is replaced by the output of txt_process
+        and entities are converted afterwards.  Without a <pre> tag every
+        line break that does not follow a '>' is replaced by a paragraph
+        boundary.
+        '''
+        if re.search(r'<pre>', html, re.IGNORECASE) is None:
+            # Add markup naively
+            # TODO - find out if there are cases where there are more than one <pre> tag or
+            # other types of unmarked html and handle them in some better fashion
+            return re.sub('(?<!>)(\n)', '</p>\n<p>', html)
+
+        self.log.debug("Running Text Processing")
+        pre_section = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
+        html = pre_section.sub(self.txt_process, html)
+        from calibre.ebooks.conversion.preprocess import convert_entities
+        return re.sub(r'&(\S+?);', convert_entities, html)
+
+    def arrange_htm_line_endings(self, html):
+        '''
+        Normalise the whitespace around p and div tags: closing tags lose
+        the whitespace in front of them and are followed by a newline,
+        opening tags are put at the start of a line of their own with the
+        surrounding whitespace removed.
+        '''
+        closing_tag = r"\s*</(?P<tag>p|div)>"
+        opening_tag = r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*"
+        closed = re.sub(closing_tag, "</\\g<tag>>\n", html)
+        return re.sub(opening_tag, "\n<\\g<tag>\\g<style>>", closed)
+
+    def fix_nbsp_indents(self, html):
+        '''
+        Replace two or more non-breaking spaces at the start of a p or div
+        (optionally behind opening span tags) by a text-indent style, using
+        insert_indent as the substitution callback.  Returns the modified
+        html.
+        '''
+        nbsp_indent = re.compile(unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
+        result = nbsp_indent.sub(self.insert_indent, html)
+        if self.found_indents > 1:
+            self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
+        return result
+
+    def cleanup_markup(self, html):
+        '''
+        Remove markup that gets in the way of later processing: non-breaking
+        spaces, empty <o:p> tags, st1 smart tags, self closing paragraphs,
+        empty span and formatting tags, divs around empty paragraphs and
+        empty headings.  Sets self.deleted_nbsps and returns the cleaned
+        html.
+        '''
+        # remove remaining non-breaking spaces
+        html = re.sub(unicode_type(r'\u00a0'), ' ', html)
+        # Get rid of various common microsoft specific tags which can cause issues later
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(unicode_type(r'\s*<o:p>\s*</o:p>'), ' ', html)
+        # Delete microsoft 'smart' tags
+        html = re.sub('(?i)</?st1:\\w+>', '', html)
+        # Re-open self closing paragraph tags
+        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
+
+        # Get rid of empty span, bold, font, em, & italics tags
+        inline_tags = 'font|[ibu]|em|strong'
+        open_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(inline_tags)
+        close_fmt_pat = '</(?:{})>'.format(inline_tags)
+        empty_spans = r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*"
+        empty_fmt = r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat)
+        # Both substitutions are run twice
+        for _ in range(2):
+            html = re.sub(empty_spans, " ", html)
+            html = re.sub(empty_fmt, " ", html)
+
+        # delete surrounding divs from empty paragraphs
+        html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
+        # Empty heading tags
+        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
+
+        self.deleted_nbsps = True
+        return html
+
+    def analyze_line_endings(self, html):
+        '''
+        Determines the type of html line ending used most commonly in a
+        document; use before calling docanalysis functions.
+
+        Returns 'spanned_html' when there is more than one opening span tag
+        and fewer than 0.75 opening p tags per span, 'html' otherwise.
+        '''
+        paras = len(re.findall('<p[^>]*>', html, re.IGNORECASE))
+        spans = len(re.findall('<span[^>]*>', html, re.IGNORECASE))
+        if spans > 1 and float(paras) / float(spans) < 0.75:
+            return 'spanned_html'
+        return 'html'
+
+    def analyze_blanks(self, html):
+        '''
+        Tell whether blank paragraphs are used between paragraphs.
+
+        Counts the matches of self.blankreg (blank paragraphs) and of
+        self.linereg (paragraphs) in html and returns True when more than
+        0.40 of the paragraphs are blank.  Returns False when html holds
+        fewer than two paragraphs, as no ratio is calculated in that case.
+        '''
+        blank_count = len(self.blankreg.findall(html))
+        line_count = len(self.linereg.findall(html))
+        if line_count <= 1:
+            # Explicit result instead of falling off the end with None
+            return False
+
+        # Note: the value logged as "percent" is a ratio between 0 and 1
+        ratio = float(blank_count) / float(line_count)
+        self.log.debug("There are " + unicode_type(blank_count) + " blank lines. " +
+                unicode_type(ratio) + " percent blank")
+        return ratio > 0.40
+
+    def cleanup_required(self):
+        '''
+        Return True when at least one of the options unwrap_lines,
+        markup_chapter_headings, format_scene_breaks or
+        delete_blank_paragraphs is set on self.extra_opts, False otherwise.
+        '''
+        watched = ('unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs')
+        return any(getattr(self.extra_opts, name, False) for name in watched)
+
+    def merge_blanks(self, html, blanks_count=None):
+        '''
+        Merge every run of two or more blank paragraphs into one spacer
+        paragraph whose top margin grows with the number of merged blanks.
+
+        The spacer gets the class "whitespace<N>" when the merged run
+        contains the text 'whitespace' and "softbreak<N>" otherwise, where
+        N is ten times the margin in em.  blanks_count is not used by this
+        method.  Returns the modified html.
+        '''
+        base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
+        em_per_line = 1.5  # Add another 1.5 em for each additional blank
+
+        def merge_matches(match):
+            to_merge = match.group(0)
+            lines = float(len(self.single_blank.findall(to_merge))) - 1.
+            em = base_em + (em_per_line * lines)
+            # Membership test instead of the truth value of str.find(),
+            # which is -1 (true) when the text is missing and made the
+            # softbreak case unreachable.
+            if 'whitespace' in to_merge:
+                css_class = 'whitespace'
+            else:
+                css_class = 'softbreak'
+            return self.any_multi_blank.sub('\n<p class="'+css_class+unicode_type(int(em * 10))+
+                                            '" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', to_merge)
+
+        html = self.any_multi_blank.sub(merge_matches, html)
+        return html
+
+    def detect_whitespace(self, html):
+        '''
+        Convert blank paragraphs around headings into margins and mark up
+        the blank paragraphs around short paragraphs.
+
+        Blank paragraphs/divs directly before and after a heading are
+        removed and replaced by margin-top/margin-bottom styles on the
+        heading.  Blank paragraphs around a scenebreak paragraph are removed
+        without a replacement.  Blank paragraphs next to paragraphs matched
+        by blanks_n_nopunct are replaced by a "whitespace" paragraph.
+        Returns the modified html.
+        '''
+        blanks_around_headings = re.compile(
+            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
+            r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        blanks_around_scene_breaks = re.compile(
+            r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
+            r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+        # A paragraph of at most 100 characters that ends in a word
+        # character, with optional blank paragraphs before and after it
+        blanks_n_nopunct = re.compile(
+            r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
+            r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+
+        def merge_header_whitespace(match):
+            # Only the (possibly restyled) content group is returned, so the
+            # blank paragraphs matched around it are dropped from the html
+            initblanks = match.group('initparas')
+            endblanks = match.group('endparas')
+            content = match.group('content')
+            top_margin = ''
+            bottom_margin = ''
+            # One em of margin per blank paragraph found
+            if initblanks is not None:
+                top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(initblanks)))+'em;'
+            if endblanks is not None:
+                bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(endblanks)))+'em;'
+
+            if initblanks is None and endblanks is None:
+                return content
+            elif content.find('scenebreak') != -1:
+                return content
+            else:
+                # The attributes of the original heading tag are replaced by
+                # the new style attribute
+                content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
+            return content
+
+        html = blanks_around_headings.sub(merge_header_whitespace, html)
+        html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
+
+        def markup_whitespaces(match):
+            # Replace every blank paragraph inside the matched text by a
+            # paragraph of class "whitespace"
+            blanks = match.group(0)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
+            return blanks
+
+        html = blanks_n_nopunct.sub(markup_whitespaces, html)
+        # With more headings than required, also treat everything in front
+        # of the first heading tag
+        if self.html_preprocess_sections > self.min_chapters:
+            html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
+
+        return html
+
+    def detect_soft_breaks(self, html):
+        '''
+        Replace blank markup between paragraphs by "softbreak" paragraphs.
+
+        An empty div between two lines that both pass check_paragraph is
+        replaced by a softbreak paragraph.  Afterwards blank paragraphs are
+        replaced as well: runs matched by self.multi_blank when blanks have
+        not been deleted and blanks are used between paragraphs, every
+        paragraph matched by self.blankreg otherwise.  Returns the modified
+        html.
+        '''
+        line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
+        # Plain text substitution of every 'ou', 'in' and 'cha' gives the
+        # named groups of the second line different names than those of the
+        # first line, so both fit into one pattern
+        line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
+                     '\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
+        div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
+        div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
+
+        def convert_div_softbreaks(match):
+            # Both lines have to look like paragraphs, otherwise the match
+            # is left untouched
+            init_is_paragraph = self.check_paragraph(match.group('init_content'))
+            line_two_is_paragraph = self.check_paragraph(match.group('line_two_content'))
+            if init_is_paragraph and line_two_is_paragraph:
+                return (match.group('initline')+
+                        '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>\n'+
+                        match.group('line_two'))
+            else:
+                return match.group(0)
+
+        html = div_break_candidate.sub(convert_div_softbreaks, html)
+
+        if not self.blanks_deleted and self.blanks_between_paragraphs:
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
+        else:
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
+        return html
+
+    def detect_scene_breaks(self, html):
+        '''
+        Rewrite lines that consist only of non-word, non-space characters
+        (optionally separated by whitespace) as scene break paragraphs.
+
+        A line is skipped when it starts with a character of
+        self.common_in_text_beginnings or when a character of
+        self.common_in_text_endings is followed by '<' later on the line.
+        Matches are replaced by self.scene_break_open, the break characters
+        and a closing </p>.  Returns the modified html.
+        '''
+        not_regular_text = '(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))'
+        break_run = '(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)'
+        pattern = self.line_open + not_regular_text + break_run + '\\s*' + self.line_close
+        replacement = self.scene_break_open + '\\g<break>' + '</p>'
+        scene_breaks = re.compile(r'%s' % pattern, re.IGNORECASE|re.UNICODE)
+        return scene_breaks.sub(replacement, html)
+
+    def markup_user_break(self, replacement_break):
+        '''
+        Takes the string a user supplies and wraps it in markup that will be
+        centered with appropriate margins.  <hr> and <img> tags are allowed.
+        If the user specifies a width in the <hr> tag then the appropriate
+        margins are applied to the wrapping div.  This is because many ebook
+        devices don't support margin:auto.  All other html is converted to
+        text.
+
+        An <hr> without a width, or with a width that cannot be read as a
+        number, is replaced by the default rule.  Returns the markup of the
+        scene break.
+        '''
+        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
+        if re.findall('(<|>)', replacement_break):
+            if re.match('^<hr', replacement_break):
+                if replacement_break.find('width') != -1:
+                    try:
+                        # Without a match the string is returned unchanged
+                        # by re.sub() and int() raises ValueError
+                        width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
+                    except ValueError:
+                        scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+                        self.log.warn('Invalid replacement scene break'
+                                ' expression, using default')
+                    else:
+                        # The width moves from the <hr> to the margins of
+                        # the wrapping div
+                        replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
+                        divpercent = (100 - width) // 2
+                        hr_open = hr_open.replace('45', unicode_type(divpercent))
+                        scene_break = hr_open+replacement_break+'</div>'
+                else:
+                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+            elif re.match('^<img', replacement_break):
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+            else:
+                from calibre.utils.html2text import html2text
+                replacement_break = html2text(replacement_break)
+                replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+        else:
+            replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
+            scene_break = self.scene_break_open+replacement_break+'</p>'
+
+        return scene_break
+
+    def check_paragraph(self, content):
+        '''
+        Report whether content, once span tags and the whitespace around them
+        are removed, ends with a quote mark or one of . ! ? :
+
+        :param content: inner markup of a single line
+        :return: True if the line ends like a paragraph, False otherwise
+        '''
+        without_spans = re.sub('\\s*</?span[^>]*>\\s*', '', content)
+        return re.match('.*[\"\'.!?:]$', without_spans) is not None
+
+    def abbyy_processor(self, html):
+        '''
+        Rewrite every <p style="..."> line and every <img> tag in html using
+        simplified markup derived from the inline styles.
+
+        All anchor tags are removed first.  While the lines are processed,
+        self.in_blockquote and self.previous_was_paragraph carry state from
+        one line to the next; both are reset to False on entry.
+
+        :param html: the markup to process
+        :return: the rewritten markup
+        '''
+        abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
+        empty_paragraph = '\n<p> </p>\n'
+        self.in_blockquote = False
+        self.previous_was_paragraph = False
+        html = re.sub('</?a[^>]*>', '', html)
+
+        def convert_styles(match):
+            '''
+            Substitution callback for abbyy_line: returns the replacement
+            markup for one matched paragraph line or image.
+            '''
+            # print "raw styles are: "+match.group('styles')
+            content = match.group('content')
+            # print "raw content is: "+match.group('content')
+            image = match.group('image')
+
+            is_paragraph = False
+            text_align = ''
+            text_indent = ''
+            paragraph_before = ''
+            paragraph_after = ''
+            blockquote_open = '\n<blockquote>\n'
+            blockquote_close = '</blockquote>\n'
+            indented_text = 'text-indent:3%;'
+            blockquote_open_loop = ''
+            blockquote_close_loop = ''
+            debugabby = False
+
+            if image:
+                # An image ends any blockquote that is currently open
+                debugabby = True
+                if self.in_blockquote:
+                    self.in_blockquote = False
+                    blockquote_close_loop = blockquote_close
+                self.previous_was_paragraph = False
+                return blockquote_close_loop+'\n'+image+'\n'
+            else:
+                styles = match.group('styles').split(';')
+                is_paragraph = self.check_paragraph(content)
+                # print "styles for this line are: "+unicode_type(styles)
+                split_styles = []
+                for style in styles:
+                    # print "style is: "+unicode_type(style)
+                    newstyle = style.split(':')
+                    # print "newstyle is: "+unicode_type(newstyle)
+                    split_styles.append(newstyle)
+                styles = split_styles
+                # NOTE(review): the unpacking below requires every declaration to
+                # split into exactly two parts on ':' - confirm the input guarantees it
+                for style, setting in styles:
+                    if style == 'text-align' and setting != 'left':
+                        text_align = style+':'+setting+';'
+                    if style == 'text-indent':
+                        setting = int(re.sub('\\s*pt\\s*', '', setting))
+                        # Indents of 10-13pt are replaced by the standard 3% indent
+                        if 9 < setting < 14:
+                            text_indent = indented_text
+                        else:
+                            text_indent = style+':'+unicode_type(setting)+'pt;'
+                    if style == 'padding':
+                        # Four values, logged below as top, right, bottom, left
+                        setting = re.sub('pt', '', setting).split(' ')
+                        if int(setting[1]) < 16 and int(setting[3]) < 16:
+                            # Small side padding: ordinary text, not a blockquote
+                            if self.in_blockquote:
+                                debugabby = True
+                                if is_paragraph:
+                                    self.in_blockquote = False
+                                    blockquote_close_loop = blockquote_close
+                            if int(setting[3]) > 8 and text_indent == '':
+                                text_indent = indented_text
+                            if int(setting[0]) > 5:
+                                paragraph_before = empty_paragraph
+                            if int(setting[2]) > 5:
+                                paragraph_after = empty_paragraph
+                        elif not self.in_blockquote and self.previous_was_paragraph:
+                            # Large side padding right after a paragraph opens a blockquote
+                            debugabby = True
+                            self.in_blockquote = True
+                            blockquote_open_loop = blockquote_open
+                        if debugabby:
+                            self.log.debug('\n\n******\n')
+                            self.log.debug('padding top is: '+unicode_type(setting[0]))
+                            self.log.debug('padding right is:' +unicode_type(setting[1]))
+                            self.log.debug('padding bottom is: ' + unicode_type(setting[2]))
+                            self.log.debug('padding left is: ' +unicode_type(setting[3]))
+
+                # print "text-align is: "+unicode_type(text_align)
+                # print "\n***\nline is:\n     "+unicode_type(match.group(0))+'\n'
+                if debugabby:
+                    # print "this line is a paragraph = "+unicode_type(is_paragraph)+", previous line was "+unicode_type(self.previous_was_paragraph)
+                    self.log.debug("styles for this line were:", styles)
+                    self.log.debug('newline is:')
+                    self.log.debug(blockquote_open_loop+blockquote_close_loop+
+                            paragraph_before+'<p style="'+text_indent+text_align+
+                            '">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
+                # print "is_paragraph is "+unicode_type(is_paragraph)+", previous_was_paragraph is "+unicode_type(self.previous_was_paragraph)
+                self.previous_was_paragraph = is_paragraph
+                # print "previous_was_paragraph is now set to "+unicode_type(self.previous_was_paragraph)+"\n\n\n"
+                return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
+
+        html = abbyy_line.sub(convert_styles, html)
+        return html
+
+    def __call__(self, html):
+        '''
+        Run the heuristic processing pipeline over html and return the result.
+
+        Documents with fewer than 50 words are returned unchanged.  Most of
+        the later steps only run when the matching option is set on
+        self.extra_opts.
+
+        :param html: the markup to process
+        :return: the processed markup
+        '''
+        self.log.debug("*********  Heuristic processing HTML  *********")
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        # NOTE(review): if get_word_count raises, self.totalwords is not assigned
+        # here; presumably it is initialised elsewhere - confirm
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            self.log.warn("Can't get wordcount")
+
+        if self.totalwords < 50:
+            self.log.warn("flow is too short, not running heuristics")
+            return html
+
+        is_abbyy = self.is_abbyy(html)
+        if is_abbyy:
+            html = self.abbyy_processor(html)
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = self.arrange_htm_line_endings(html)
+        # self.dump(html, 'after_arrange_line_endings')
+        if self.cleanup_required():
+            # ##### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log.debug("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
+
+        # Replace series of non-breaking spaces with text-indent
+        if getattr(self.extra_opts, 'fix_indents', False):
+            html = self.fix_nbsp_indents(html)
+
+        if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
+            html = self.cleanup_markup(html)
+
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            # Replaces the line patterns on the instance, so the change outlives this call
+            self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
+            self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"
+
+        # ADE doesn't render <br />, change to empty paragraphs
+        # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
+
+        # Determine whether the document uses interleaved blank lines
+        self.blanks_between_paragraphs = self.analyze_blanks(html)
+
+        # detect chapters/sections to match xpath or splitting logic
+
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
+            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
+        # self.dump(html, 'after_chapter_markup')
+
+        if getattr(self.extra_opts, 'italicize_common_cases', False):
+            html = self.markup_italicis(html)
+
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
+        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+            self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
+            html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")
+
+        # ##### Unwrap lines ######
+        if getattr(self.extra_opts, 'unwrap_lines', False):
+            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+            if hardbreaks or unwrap_factor < 0.4:
+                self.log.debug("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+                html = dehyphenator(html,'html', length)
+                html = self.punctuation_unwrap(length, html, 'html')
+
+        if getattr(self.extra_opts, 'dehyphenate', False):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log.debug("Fixing hyphenated content")
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
+
+        # If still no sections after unwrapping mark split points on lines with no punctuation
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
+            self.log.debug("Looking for more split points based on punctuation,"
+                    " currently have " + unicode_type(self.html_preprocess_sections))
+            chapdetect3 = re.compile(
+                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
+                r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
+                r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
+                r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_break, html)
+
+        if getattr(self.extra_opts, 'renumber_headings', False):
+            # search for places where a first or second level heading is immediately followed by another
+            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+            # headings and titles, images, etc
+            doubleheading = re.compile(
+                r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+            html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)
+
+        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
+        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
+        # Multiple sequential blank paragraphs are merged with appropriate margins
+        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
+            self.log.debug('Formatting scene breaks')
+            html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
+            html = self.detect_scene_breaks(html)
+            html = self.detect_whitespace(html)
+            html = self.detect_soft_breaks(html)
+            blanks_count = len(self.any_multi_blank.findall(html))
+            if blanks_count >= 1:
+                html = self.merge_blanks(html, blanks_count)
+            detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
+            scene_break_count = len(detected_scene_break.findall(html))
+            # If the user has enabled scene break replacement, then either softbreaks
+            # or 'hard' scene breaks are replaced, depending on which is in use
+            # Otherwise separator lines are centered, use a bit larger margin in this case
+            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
+            if replacement_break:
+                replacement_break = self.markup_user_break(replacement_break)
+                # NOTE(review): both branches replace softbreaks; the only difference
+                # is that detected scene breaks are replaced as well when any exist
+                if scene_break_count >= 1:
+                    html = detected_scene_break.sub(replacement_break, html)
+                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
+                else:
+                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
+
+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs so they render correctly
+            html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
+        return html
--- a/ebook_converter/ebooks/docx/init.py
+++ b/ebook_converter/ebooks/docx/init.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class InvalidDOCX(ValueError):
+    '''
+    ValueError subclass that adds no behaviour of its own.  Presumably raised
+    when a DOCX file cannot be processed - confirm at the raise sites.
+    '''
+
--- a/ebook_converter/ebooks/docx/block_styles.py
+++ b/ebook_converter/ebooks/docx/block_styles.py
@@ -0,0 +1,478 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import numbers
+from collections import OrderedDict
+from polyglot.builtins import iteritems
+
+
+class Inherit(object):
+    '''
+    Sentinel type marking a style property that has no value of its own.
+
+    Equality and hashing are identity based.  For ordering, an instance is
+    never less than anything, is greater than everything but itself, is
+    always greater-or-equal, and is less-or-equal only to itself.
+    '''
+
+    def __eq__(self, other):
+        return self is other
+
+    def __hash__(self):
+        return id(self)
+
+    def __lt__(self, other):
+        return False
+
+    def __gt__(self, other):
+        return self is not other
+
+    def __ge__(self, other):
+        return True
+
+    def __le__(self, other):
+        return self is other
+
+
+# Shared sentinel instance; code in this module tests for it with `is inherit`
+inherit = Inherit()
+
+
+def binary_property(parent, name, XPath, get):
+    '''
+    Read the on/off child element w:<name> of parent.
+
+    :return: inherit when the element is absent, otherwise True if the w:val
+             attribute of the first match (default 'on') is one of 'on', '1'
+             or 'true', and False for any other value
+    '''
+    matches = XPath('./w:%s' % name)(parent)
+    if matches:
+        return get(matches[0], 'w:val', 'on') in {'on', '1', 'true'}
+    return inherit
+
+
+def simple_color(col, auto='black'):
+    '''
+    Turn a six character colour value into a CSS '#xxxxxx' colour.
+
+    :param col: the colour value, or 'auto'
+    :param auto: returned when col is empty, 'auto', or not six characters long
+    '''
+    usable = bool(col) and col != 'auto' and len(col) == 6
+    return '#'+col if usable else auto
+
+
+def simple_float(val, mult=1.0):
+    '''
+    Return float(val) * mult, or None when val cannot be converted.
+    '''
+    try:
+        number = float(val)
+    except (ValueError, TypeError, AttributeError, KeyError):
+        return None
+    return number * mult
+
+
+def twips(val, mult=0.05):
+    '''
+    Parse val as either a pure number representing twentieths of a point or
+    a number followed by the suffix pt, representing pts.
+
+    :return: the value scaled by mult, or None when val cannot be parsed.  The
+             pt suffix is only recognised when mult is left at its default.
+    '''
+    try:
+        return float(val) * mult
+    except (ValueError, TypeError, AttributeError, KeyError):
+        pass
+    if not (val and val.endswith('pt') and mult == 0.05):
+        return None
+    # Drop the suffix; the remainder is already in points
+    return twips(val[:-2], mult=1.0)
+
+
+# Maps the w:val of a border element to the border style keyword that is
+# written into the generated CSS.  Lookups fall back to 'solid' for names
+# that are not listed here.
+LINE_STYLES = {  # {{{
+    'basicBlackDashes': 'dashed',
+    'basicBlackDots': 'dotted',
+    'basicBlackSquares': 'dashed',
+    'basicThinLines': 'solid',
+    'dashDotStroked': 'groove',
+    'dashed': 'dashed',
+    'dashSmallGap': 'dashed',
+    'dotDash': 'dashed',
+    'dotDotDash': 'dashed',
+    'dotted': 'dotted',
+    'double': 'double',
+    'inset': 'inset',
+    'nil': 'none',
+    'none': 'none',
+    'outset': 'outset',
+    'single': 'solid',
+    'thick': 'solid',
+    'thickThinLargeGap': 'double',
+    'thickThinMediumGap': 'double',
+    'thickThinSmallGap' : 'double',
+    'thinThickLargeGap': 'double',
+    'thinThickMediumGap': 'double',
+    'thinThickSmallGap': 'double',
+    'thinThickThinLargeGap': 'double',
+    'thinThickThinMediumGap': 'double',
+    'thinThickThinSmallGap': 'double',
+    'threeDEmboss': 'ridge',
+    'threeDEngrave': 'groove',
+    'triple': 'double',
+}  # }}}
+
+# Read from XML {{{
+
+# Attribute name templates for one border edge; '%s' is filled with the edge name
+border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
+# Edge names; 'between' is last so that border_edges[:-1] selects the four sides
+border_edges = ('left', 'top', 'right', 'bottom', 'between')
+
+
+def read_single_border(parent, edge, XPath, get):
+    '''
+    Read the w:<edge> children of parent and collect their border settings.
+
+    :return: a dict keyed by the templates in border_props holding padding,
+             width, style and color; a setting that is missing or cannot be
+             parsed is None.  When several elements match, later ones override
+             earlier ones per setting.
+    '''
+    found = {'padding': None, 'width': None, 'style': None, 'color': None}
+    for elem in XPath('./w:%s' % edge)(parent):
+        raw_color = get(elem, 'w:color')
+        if raw_color is not None:
+            found['color'] = simple_color(raw_color)
+
+        raw_style = get(elem, 'w:val')
+        if raw_style is not None:
+            found['style'] = LINE_STYLES.get(raw_style, 'solid')
+
+        raw_space = get(elem, 'w:space')
+        if raw_space is not None:
+            try:
+                found['padding'] = float(raw_space)
+            except (ValueError, TypeError):
+                pass
+
+        raw_size = get(elem, 'w:sz')
+        if raw_size is not None:
+            # we dont care about art borders (they are only used for page borders)
+            try:
+                # Clamp to the range 2-96 before dividing by 8
+                found['width'] = min(96, max(2, float(raw_size))) / 8
+            except (ValueError, TypeError):
+                pass
+
+    ordered = (found['padding'], found['width'], found['style'], found['color'])
+    return dict(zip(border_props, ordered))
+
+
+def read_border(parent, dest, XPath, get, border_edges=border_edges, name='pBdr'):
+    '''
+    Read the border container w:<name> under parent and store every border
+    property of every edge as an attribute on dest.
+
+    Properties that are not specified are set to inherit.
+    '''
+    vals = {}
+    for edge in border_edges:
+        for template in border_props:
+            vals[template % edge] = inherit
+
+    for container in XPath('./w:' + name)(parent):
+        for edge in border_edges:
+            settings = read_single_border(container, edge, XPath, get)
+            for template, value in iteritems(settings):
+                if value is None:
+                    continue
+                vals[template % edge] = value
+
+    for attr, value in iteritems(vals):
+        setattr(dest, attr, value)
+
+
+def border_to_css(edge, style, css):
+    '''
+    Copy the style, color and width of one border edge from style into the
+    css mapping, skipping values that are inherit or None.
+
+    Numeric widths are written in pt.
+    '''
+    line = getattr(style, 'border_%s_style' % edge)
+    colour = getattr(style, 'border_%s_color' % edge)
+    thickness = getattr(style, 'border_%s_width' % edge)
+    if isinstance(thickness, numbers.Number):
+        # WebKit needs at least 1pt to render borders and 3pt to render double borders
+        minimum = 3 if line == 'double' else 1
+        thickness = '%.3gpt' % max(thickness, minimum)
+    for prop, value in (('style', line), ('color', colour), ('width', thickness)):
+        if value is not inherit and value is not None:
+            css['border-%s-%s' % (edge, prop)] = value
+
+
+def read_indent(parent, dest, XPath, get):
+    '''
+    Read the w:ind children of parent and set margin_left, margin_right and
+    text_indent on dest.
+
+    Every measurement has a character based attribute (scaled by 0.01,
+    written in em) and a plain one (scaled by 0.05, written in pt); the
+    character based one wins when both are present.  A hanging indent takes
+    priority over a first line indent and is negated.  Values that are not
+    specified, or cannot be parsed, leave the attribute as inherit.
+    '''
+    def measure(chars, plain):
+        # Returns (number or None, unit)
+        if chars is not None:
+            return simple_float(chars, 0.01), 'em'
+        if plain is not None:
+            return simple_float(plain, 0.05), 'pt'
+        return None, 'pt'
+
+    left_margin = right_margin = first_line = inherit
+    for ind in XPath('./w:ind')(parent):
+        left, left_chars = get(ind, 'w:left'), get(ind, 'w:leftChars')
+        amount, unit = measure(left_chars, left)
+        if amount is not None:
+            left_margin = '%.3g%s' % (amount, unit)
+
+        right, right_chars = get(ind, 'w:right'), get(ind, 'w:rightChars')
+        amount, unit = measure(right_chars, right)
+        if amount is not None:
+            right_margin = '%.3g%s' % (amount, unit)
+
+        hanging, hanging_chars = get(ind, 'w:hanging'), get(ind, 'w:hangingChars')
+        first, first_chars = get(ind, 'w:firstLine'), get(ind, 'w:firstLineChars')
+        if hanging is not None:
+            hanging = '-'+hanging
+        if hanging_chars is not None:
+            hanging_chars = '-'+hanging_chars
+        if hanging is None and hanging_chars is None:
+            amount, unit = measure(first_chars, first)
+        else:
+            amount, unit = measure(hanging_chars, hanging)
+        if amount is not None:
+            first_line = '%.3g%s' % (amount, unit)
+
+    setattr(dest, 'margin_left', left_margin)
+    setattr(dest, 'margin_right', right_margin)
+    setattr(dest, 'text_indent', first_line)
+
+
+def read_justification(parent, dest, XPath, get):
+    '''
+    Read the w:jc children of parent and set text_align on dest.
+
+    'both', 'distribute' and any value containing 'thai' or 'kashida' become
+    'justify'.  'left', 'center', 'right', 'start' and 'end' are stored
+    unchanged.  Any other value is ignored, and text_align stays inherit when
+    nothing usable is found.  The last usable element wins.
+    '''
+    ans = inherit
+    for jc in XPath('./w:jc[@w:val]')(parent):
+        val = get(jc, 'w:val')
+        if not val:
+            continue
+        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
+            ans = 'justify'
+        elif val in {'left', 'center', 'right', 'start', 'end'}:
+            # 'start' and 'end' are passed through as they are.  A later branch
+            # that mapped them to 'left'/'right' could never be reached and
+            # has been removed.
+            ans = val
+    setattr(dest, 'text_align', ans)
+
+
+def read_spacing(parent, dest, XPath, get):
+    '''
+    Read the w:spacing children of parent and set margin_top, margin_bottom
+    and line_height on dest.
+
+    Space before/after given in lines is scaled by 0.02 and written in ex,
+    otherwise the plain value is scaled by 0.05 and written in pt.  Nothing
+    is set for a side whose autospacing attribute is on.  The line height is
+    written in pt for the 'exact' and 'atLeast' rules and as a unitless
+    multiple (value / 240) otherwise.  Anything not specified stays inherit.
+    '''
+    switched_on = {'on', '1', 'true'}
+    fixed_rules = {'exact', 'atLeast'}
+
+    def gap(plain, lines, auto):
+        # Returns the numeric gap, or None when there is nothing to apply
+        if auto in switched_on:
+            return None
+        if lines is not None:
+            return simple_float(lines, 0.02)
+        if plain is not None:
+            return simple_float(plain, 0.05)
+        return None
+
+    top_margin = bottom_margin = line_height = inherit
+    for spacing in XPath('./w:spacing')(parent):
+        after, after_lines, after_auto = get(spacing, 'w:after'), get(spacing, 'w:afterLines'), get(spacing, 'w:afterAutospacing')
+        amount = gap(after, after_lines, after_auto)
+        if amount is not None:
+            bottom_margin = '%.3g%s' % (amount, 'ex' if after_lines is not None else 'pt')
+
+        before, before_lines, before_auto = get(spacing, 'w:before'), get(spacing, 'w:beforeLines'), get(spacing, 'w:beforeAutospacing')
+        amount = gap(before, before_lines, before_auto)
+        if amount is not None:
+            top_margin = '%.3g%s' % (amount, 'ex' if before_lines is not None else 'pt')
+
+        line, rule = get(spacing, 'w:line'), get(spacing, 'w:lineRule', 'auto')
+        if line is None:
+            continue
+        is_fixed = rule in fixed_rules
+        height = simple_float(line, 0.05 if is_fixed else 1/240.0)
+        if height is not None:
+            line_height = '%.3g%s' % (height, 'pt' if is_fixed else '')
+
+    setattr(dest, 'margin_top', top_margin)
+    setattr(dest, 'margin_bottom', bottom_margin)
+    setattr(dest, 'line_height', line_height)
+
+
+def read_shd(parent, dest, XPath, get):
+    '''
+    Set background_color on dest from the w:fill attribute of the w:shd
+    children of parent; inherit when there is no usable fill.
+    '''
+    background = inherit
+    for shd in XPath('./w:shd[@w:fill]')(parent):
+        fill = get(shd, 'w:fill')
+        if not fill:
+            continue
+        background = simple_color(fill, auto='transparent')
+    setattr(dest, 'background_color', background)
+
+
+def read_numbering(parent, dest, XPath, get):
+    '''
+    Set numbering_id and numbering_level on dest from the w:numPr children of
+    parent.
+
+    The level is converted to int and the id is kept as read.  Both stay
+    inherit when not specified; a level that is not an integer is ignored.
+    '''
+    level = numbering_id = inherit
+    for num_pr in XPath('./w:numPr')(parent):
+        for level_elem in XPath('./w:ilvl[@w:val]')(num_pr):
+            try:
+                level = int(get(level_elem, 'w:val'))
+            except (ValueError, TypeError):
+                pass
+        for id_elem in XPath('./w:numId[@w:val]')(num_pr):
+            numbering_id = get(id_elem, 'w:val')
+    setattr(dest, 'numbering_id', numbering_id)
+    setattr(dest, 'numbering_level', level)
+
+
+class Frame(object):
+    '''
+    Settings read from a w:framePr element.
+
+    The h, w, x, y, h_space and v_space attributes hold the source values
+    divided by 20.  Two frames compare equal when every attribute listed in
+    all_attributes is equal.
+    '''
+
+    all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
+                      'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')
+
+    def __init__(self, fp, XPath, get):
+        def scaled(attr, fallback):
+            # Attribute value divided by 20, or fallback when missing/not an integer
+            try:
+                return int(get(fp, attr))/20
+            except (ValueError, TypeError):
+                return fallback
+
+        self.drop_cap = get(fp, 'w:dropCap', 'none')
+        self.h = scaled('w:h', 0)
+        self.w = scaled('w:w', None)
+        self.x = scaled('w:x', 0)
+        self.y = scaled('w:y', 0)
+
+        self.h_anchor = get(fp, 'w:hAnchor', 'page')
+        self.h_rule = get(fp, 'w:hRule', 'auto')
+        self.v_anchor = get(fp, 'w:vAnchor', 'page')
+        self.wrap = get(fp, 'w:wrap', 'around')
+        self.x_align = get(fp, 'w:xAlign')
+        self.y_align = get(fp, 'w:yAlign')
+
+        self.h_space = scaled('w:hSpace', 0)
+        self.v_space = scaled('w:vSpace', 0)
+        try:
+            self.lines = int(get(fp, 'w:lines'))
+        except (ValueError, TypeError):
+            self.lines = 1
+
+    def css(self, page):
+        '''
+        Return a dict of CSS properties for this frame.
+
+        :param page: object whose width attribute is used to decide the float
+                     side when no explicit x alignment is set
+        '''
+        ans = {'overflow': 'hidden'}
+
+        if self.drop_cap in {'drop', 'margin'}:
+            # Drop caps always float left with a fixed gap to the text
+            ans['float'] = 'left'
+            ans['margin'] = '0'
+            ans['padding-right'] = '0.2em'
+            return ans
+
+        if self.h_rule != 'auto':
+            height_prop = 'min-height' if self.h_rule == 'atLeast' else 'height'
+            ans[height_prop] = '%.3gpt' % self.h
+        if self.w is not None:
+            ans['width'] = '%.3gpt' % self.w
+        ans['padding-top'] = ans['padding-bottom'] = '%.3gpt' % self.v_space
+        if self.wrap not in {None, 'none'}:
+            ans['padding-left'] = ans['padding-right'] = '%.3gpt' % self.h_space
+            if self.x_align is None:
+                # Float towards whichever half of the page the frame starts in
+                side = 'left' if self.x/page.width < 0.5 else 'right'
+            else:
+                side = 'right' if self.x_align == 'right' else 'left'
+            ans['float'] = side
+        return ans
+
+    def __eq__(self, other):
+        return not any(
+            getattr(other, name, inherit) != getattr(self, name)
+            for name in self.all_attributes)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+
+def read_frame(parent, dest, XPath, get):
+    '''
+    Set dest.frame to a Frame built from the last w:framePr child of parent,
+    or to inherit when there is none.
+    '''
+    frame = inherit
+    for frame_pr in XPath('./w:framePr')(parent):
+        frame = Frame(frame_pr, XPath, get)
+    setattr(dest, 'frame', frame)
+
+# }}}
+
+
+class ParagraphStyle(object):
+    '''
+    The paragraph level properties of a style.
+
+    Every name in all_properties is an attribute on the instance.  A property
+    without a value of its own holds the inherit sentinel until it is filled
+    in through update() or resolve_based_on().
+    '''
+
+    all_properties = (
+        'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
+        'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
+        'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
+        'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
+
+        # Border margins padding
+        'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
+        'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
+        'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
+        'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
+        'border_between_width', 'border_between_style', 'border_between_color', 'padding_between',
+        'margin_left', 'margin_top', 'margin_right', 'margin_bottom',
+
+        # Misc.
+        'text_indent', 'text_align', 'line_height', 'background_color',
+        'numbering_id', 'numbering_level', 'font_family', 'font_size', 'color', 'frame',
+        'cs_font_size', 'cs_font_family',
+    )
+
+    def __init__(self, namespace, pPr=None):
+        '''
+        :param namespace: object providing the XPath and get helpers
+        :param pPr: element to read the properties from; when None every
+                    property starts out as inherit
+        '''
+        self.namespace = namespace
+        self.linked_style = None
+        self._css = None
+        self._border_key = None
+        if pPr is None:
+            for name in self.all_properties:
+                setattr(self, name, inherit)
+            return
+
+        xpath, get = namespace.XPath, namespace.get
+        on_off_properties = (
+            'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
+            'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
+            'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
+            'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
+        )
+        for name in on_off_properties:
+            setattr(self, name, binary_property(pPr, name, xpath, get))
+
+        for reader in ('border', 'indent', 'justification', 'spacing', 'shd', 'numbering', 'frame'):
+            read_funcs[reader](pPr, self, xpath, get)
+
+        for style_ref in xpath('./w:pStyle[@w:val]')(pPr):
+            self.linked_style = get(style_ref, 'w:val')
+
+        # These are not read from pPr
+        for name in ('font_family', 'font_size', 'color', 'cs_font_size', 'cs_font_family'):
+            setattr(self, name, inherit)
+
+    def update(self, other):
+        '''Overwrite properties with every value that other actually specifies.'''
+        for name in self.all_properties:
+            value = getattr(other, name)
+            if value is inherit:
+                continue
+            setattr(self, name, value)
+        if other.linked_style is not None:
+            self.linked_style = other.linked_style
+
+    def resolve_based_on(self, parent):
+        '''Fill every property that is still inherit with the value from parent.'''
+        for name in self.all_properties:
+            if getattr(self, name) is inherit:
+                setattr(self, name, getattr(parent, name))
+
+    @property
+    def css(self):
+        '''The CSS for this style as an OrderedDict; built once and cached.'''
+        if self._css is not None:
+            return self._css
+
+        self._css = rules = OrderedDict()
+        if self.keepLines is True:
+            rules['page-break-inside'] = 'avoid'
+        if self.pageBreakBefore is True:
+            rules['page-break-before'] = 'always'
+        if self.keepNext is True:
+            rules['page-break-after'] = 'avoid'
+
+        for edge in ('left', 'top', 'right', 'bottom'):
+            border_to_css(edge, self, rules)
+            padding = getattr(self, 'padding_%s' % edge)
+            if padding is not inherit:
+                rules['padding-%s' % edge] = '%.3gpt' % padding
+            margin = getattr(self, 'margin_%s' % edge)
+            if margin is not inherit:
+                rules['margin-%s' % edge] = margin
+
+        if self.line_height not in {inherit, '1'}:
+            rules['line-height'] = self.line_height
+
+        for name in ('text_indent', 'background_color', 'font_family', 'font_size', 'color'):
+            value = getattr(self, name)
+            if value is inherit:
+                continue
+            if name == 'font_size':
+                value = '%.3gpt' % value
+            rules[name.replace('_', '-')] = value
+
+        align = self.text_align
+        if align is not inherit:
+            if self.bidi is True:
+                # Swap sides for bidi paragraphs
+                align = {'left':'right', 'right':'left'}.get(align, align)
+            rules['text-align'] = align
+
+        return self._css
+
+    @property
+    def border_key(self):
+        '''Tuple of every border property of every edge; built once and cached.'''
+        if self._border_key is None:
+            self._border_key = tuple(
+                getattr(self, template % edge)
+                for edge in border_edges for template in border_props)
+        return self._border_key
+
+    def has_identical_borders(self, other_style):
+        '''True when other_style has the same border_key as this style.'''
+        return self.border_key == getattr(other_style, 'border_key', None)
+
+    def clear_borders(self):
+        '''Reset width, color and style of the four side borders to inherit.'''
+        for edge in border_edges[:-1]:
+            for part in ('width', 'color', 'style'):
+                setattr(self, 'border_%s_%s' % (edge, part), inherit)
+
+    def clone_border_styles(self):
+        '''Return a new ParagraphStyle carrying only this style's four side borders.'''
+        clone = ParagraphStyle(self.namespace)
+        for edge in border_edges[:-1]:
+            for part in ('width', 'color', 'style'):
+                name = 'border_%s_%s' % (edge, part)
+                setattr(clone, name, getattr(self, name))
+        return clone
+
+    def apply_between_border(self):
+        '''Copy the between border onto the bottom border.'''
+        for part in ('width', 'color', 'style'):
+            setattr(self, 'border_bottom_%s' % part, getattr(self, 'border_between_%s' % part))
+
+    def has_visible_border(self):
+        '''True when any side border has a non-zero width and a style other than none.'''
+        for edge in border_edges[:-1]:
+            width = getattr(self, 'border_%s_width' % edge)
+            line = getattr(self, 'border_%s_style' % edge)
+            if width is inherit or not width:
+                continue
+            if line is not inherit and line != 'none':
+                return True
+        return False
+
+
+# Registry of every module level name starting with 'read_', keyed by the part
+# after that prefix (e.g. 'border' -> read_border).  It is built from
+# globals(), so it has to come after all of those definitions.
+read_funcs = {k[5:]:v for k, v in iteritems(globals()) if k.startswith('read_')}
--- a/ebook_converter/ebooks/docx/char_styles.py
+++ b/ebook_converter/ebooks/docx/char_styles.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+from calibre.ebooks.docx.block_styles import (  # noqa
+    inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
+
+# Read from XML {{{
+
+
+def read_text_border(parent, dest, XPath, get):
+    border_color = border_style = border_width = padding = inherit
+    elems = XPath('./w:bdr')(parent)
+    if elems and elems[0].attrib:
+        border_color = simple_color('auto')
+        border_style = 'none'
+        border_width = 1
+    for elem in elems:
+        color = get(elem, 'w:color')
+        if color is not None:
+            border_color = simple_color(color)
+        style = get(elem, 'w:val')
+        if style is not None:
+            border_style = LINE_STYLES.get(style, 'solid')
+        space = get(elem, 'w:space')
+        if space is not None:
+            try:
+                padding = float(space)
+            except (ValueError, TypeError):
+                pass
+        sz = get(elem, 'w:sz')
+        if sz is not None:
+            # we dont care about art borders (they are only used for page borders)
+            try:
+                # A border of less than 1pt is not rendered by WebKit
+                border_width = min(96, max(8, float(sz))) / 8
+            except (ValueError, TypeError):
+                pass
+
+    setattr(dest, 'border_color', border_color)
+    setattr(dest, 'border_style', border_style)
+    setattr(dest, 'border_width', border_width)
+    setattr(dest, 'padding', padding)
+
+
+def read_color(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:color[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if not val:
+            continue
+        ans = simple_color(val)
+    setattr(dest, 'color', ans)
+
+
+def convert_highlight_color(val):
+    return {
+        'darkBlue': '#000080', 'darkCyan': '#008080', 'darkGray': '#808080',
+        'darkGreen': '#008000', 'darkMagenta': '#800080', 'darkRed': '#800000', 'darkYellow': '#808000',
+        'lightGray': '#c0c0c0'}.get(val, val)
+
+
+def read_highlight(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:highlight[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if not val:
+            continue
+        if not val or val == 'none':
+            val = 'transparent'
+        else:
+            val = convert_highlight_color(val)
+        ans = val
+    setattr(dest, 'highlight', ans)
+
+
+def read_lang(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:lang[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if not val:
+            continue
+        try:
+            code = int(val, 16)
+        except (ValueError, TypeError):
+            ans = val
+        else:
+            from calibre.ebooks.docx.lcid import lcid
+            val = lcid.get(code, None)
+            if val:
+                ans = val
+    setattr(dest, 'lang', ans)
+
+
+def read_letter_spacing(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:spacing[@w:val]')(parent):
+        val = simple_float(get(col, 'w:val'), 0.05)
+        if val is not None:
+            ans = val
+    setattr(dest, 'letter_spacing', ans)
+
+
+def read_underline(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:u[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if val:
+            ans = val if val == 'none' else 'underline'
+    setattr(dest, 'text_decoration', ans)
+
+
+def read_vert_align(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:vertAlign[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if val and val in {'baseline', 'subscript', 'superscript'}:
+            ans = val
+    setattr(dest, 'vert_align', ans)
+
+
+def read_position(parent, dest, XPath, get):
+    ans = inherit
+    for col in XPath('./w:position[@w:val]')(parent):
+        val = get(col, 'w:val')
+        try:
+            ans = float(val)/2.0
+        except Exception:
+            pass
+    setattr(dest, 'position', ans)
+
+
+def read_font(parent, dest, XPath, get):
+    ff = inherit
+    for col in XPath('./w:rFonts')(parent):
+        val = get(col, 'w:asciiTheme')
+        if val:
+            val = '|%s|' % val
+        else:
+            val = get(col, 'w:ascii')
+        if val:
+            ff = val
+    setattr(dest, 'font_family', ff)
+    for col in XPath('./w:sz[@w:val]')(parent):
+        val = simple_float(get(col, 'w:val'), 0.5)
+        if val is not None:
+            setattr(dest, 'font_size', val)
+            return
+    setattr(dest, 'font_size', inherit)
+
+
+def read_font_cs(parent, dest, XPath, get):
+    ff = inherit
+    for col in XPath('./w:rFonts')(parent):
+        val = get(col, 'w:csTheme')
+        if val:
+            val = '|%s|' % val
+        else:
+            val = get(col, 'w:cs')
+        if val:
+            ff = val
+    setattr(dest, 'cs_font_family', ff)
+    for col in XPath('./w:szCS[@w:val]')(parent):
+        val = simple_float(get(col, 'w:val'), 0.5)
+        if val is not None:
+            setattr(dest, 'font_size', val)
+            return
+    setattr(dest, 'cs_font_size', inherit)
+
+# }}}
+
+
+class RunStyle(object):
+
+    all_properties = {
+        'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
+        'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', 'webHidden',
+
+        'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
+        'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', 'font_family', 'position',
+        'cs_font_size', 'cs_font_family'
+    }
+
+    toggle_properties = {
+        'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish',
+    }
+
+    def __init__(self, namespace, rPr=None):
+        self.namespace = namespace
+        self.linked_style = None
+        if rPr is None:
+            for p in self.all_properties:
+                setattr(self, p, inherit)
+        else:
+            X, g = namespace.XPath, namespace.get
+            for p in (
+                'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
+                'smallCaps', 'strike', 'vanish', 'webHidden',
+            ):
+                setattr(self, p, binary_property(rPr, p, X, g))
+
+            read_font(rPr, self, X, g)
+            read_font_cs(rPr, self, X, g)
+            read_text_border(rPr, self, X, g)
+            read_color(rPr, self, X, g)
+            read_highlight(rPr, self, X, g)
+            read_shd(rPr, self, X, g)
+            read_letter_spacing(rPr, self, X, g)
+            read_underline(rPr, self, X, g)
+            read_vert_align(rPr, self, X, g)
+            read_position(rPr, self, X, g)
+            read_lang(rPr, self, X, g)
+
+            for s in X('./w:rStyle[@w:val]')(rPr):
+                self.linked_style = g(s, 'w:val')
+
+        self._css = None
+
+    def update(self, other):
+        for prop in self.all_properties:
+            nval = getattr(other, prop)
+            if nval is not inherit:
+                setattr(self, prop, nval)
+        if other.linked_style is not None:
+            self.linked_style = other.linked_style
+
+    def resolve_based_on(self, parent):
+        for p in self.all_properties:
+            val = getattr(self, p)
+            if val is inherit:
+                setattr(self, p, getattr(parent, p))
+
+    def get_border_css(self, ans):
+        for x in ('color', 'style', 'width'):
+            val = getattr(self, 'border_'+x)
+            if x == 'width' and val is not inherit:
+                val = '%.3gpt' % val
+            if val is not inherit:
+                ans['border-%s' % x] = val
+
+    def clear_border_css(self):
+        for x in ('color', 'style', 'width'):
+            setattr(self, 'border_'+x, inherit)
+
+    @property
+    def css(self):
+        if self._css is None:
+            c = self._css = OrderedDict()
+            td = set()
+            if self.text_decoration is not inherit:
+                td.add(self.text_decoration)
+            if self.strike and self.strike is not inherit:
+                td.add('line-through')
+            if self.dstrike and self.dstrike is not inherit:
+                td.add('line-through')
+            if td:
+                c['text-decoration'] = ' '.join(td)
+            if self.caps is True:
+                c['text-transform'] = 'uppercase'
+            if self.i is True:
+                c['font-style'] = 'italic'
+            if self.shadow and self.shadow is not inherit:
+                c['text-shadow'] = '2px 2px'
+            if self.smallCaps is True:
+                c['font-variant'] = 'small-caps'
+            if self.vanish is True or self.webHidden is True:
+                c['display'] = 'none'
+
+            self.get_border_css(c)
+            if self.padding is not inherit:
+                c['padding'] = '%.3gpt' % self.padding
+
+            for x in ('color', 'background_color'):
+                val = getattr(self, x)
+                if val is not inherit:
+                    c[x.replace('_', '-')] = val
+
+            for x in ('letter_spacing', 'font_size'):
+                val = getattr(self, x)
+                if val is not inherit:
+                    c[x.replace('_', '-')] = '%.3gpt' % val
+
+            if self.position is not inherit:
+                c['vertical-align'] = '%.3gpt' % self.position
+
+            if self.highlight is not inherit and self.highlight != 'transparent':
+                c['background-color'] = self.highlight
+
+            if self.b:
+                c['font-weight'] = 'bold'
+
+            if self.font_family is not inherit:
+                c['font-family'] = self.font_family
+
+        return self._css
+
+    def same_border(self, other):
+        return self.get_border_css({}) == other.get_border_css({})
--- a/ebook_converter/ebooks/docx/cleanup.py
+++ b/ebook_converter/ebooks/docx/cleanup.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+from polyglot.builtins import itervalues, range
+
+NBSP = '\xa0'
+
+
+def mergeable(previous, current):
+    if previous.tail or current.tail:
+        return False
+    if previous.get('class', None) != current.get('class', None):
+        return False
+    if current.get('id', False):
+        return False
+    for attr in ('style', 'lang', 'dir'):
+        if previous.get(attr) != current.get(attr):
+            return False
+    try:
+        return next(previous.itersiblings()) is current
+    except StopIteration:
+        return False
+
+
+def append_text(parent, text):
+    if len(parent) > 0:
+        parent[-1].tail = (parent[-1].tail or '') + text
+    else:
+        parent.text = (parent.text or '') + text
+
+
+def merge(parent, span):
+    # Fold span into parent: the text, the children and the tail of span
+    # are appended to the end of parent, in that order, and span is then
+    # removed from its own parent.
+    if span.text:
+        append_text(parent, span.text)
+    # NOTE(review): children are appended to parent while span is being
+    # iterated, this relies on how the element implementation handles
+    # that -- keep the loop as is
+    for child in span:
+        parent.append(child)
+    if span.tail:
+        append_text(parent, span.tail)
+    span.getparent().remove(span)
+
+
+def merge_run(run):
+    parent = run[0]
+    for span in run[1:]:
+        merge(parent, span)
+
+
+def liftable(css):
+    # A <span> is liftable if all its styling would work just as well if it is
+    # specified on the parent element.
+    prefixes = {x.partition('-')[0] for x in css}
+    return not (prefixes - {'text', 'font', 'letter', 'color', 'background'})
+
+
+def add_text(elem, attr, text):
+    old = getattr(elem, attr) or ''
+    setattr(elem, attr, old + text)
+
+
+def lift(span):
+    # Replace an element by its content (text, children and tail)
+    parent = span.getparent()
+    idx = parent.index(span)
+    # Remember the last child now, the children are moved out of span below
+    try:
+        last_child = span[-1]
+    except IndexError:
+        last_child = None
+
+    # The text of span goes right before the position span occupied: into
+    # the text of parent if span was the first child, otherwise into the
+    # tail of the sibling preceding it
+    if span.text:
+        if idx == 0:
+            add_text(parent, 'text', span.text)
+        else:
+            add_text(parent[idx - 1], 'tail', span.text)
+
+    # Inserting in reverse at a fixed index keeps the children in their
+    # original order
+    for child in reversed(span):
+        parent.insert(idx, child)
+    parent.remove(span)
+
+    # The tail of span follows whatever now comes last: the last lifted
+    # child if there was one, otherwise the same place the text went to
+    if span.tail:
+        if last_child is None:
+            if idx == 0:
+                add_text(parent, 'text', span.tail)
+            else:
+                add_text(parent[idx - 1], 'tail', span.tail)
+        else:
+            add_text(last_child, 'tail', span.tail)
+
+
+def before_count(root, tag, limit=10):
+    body = root.xpath('//body[1]')
+    if not body:
+        return limit
+    ans = 0
+    for elem in body[0].iterdescendants():
+        if elem is tag:
+            return ans
+        ans += 1
+        if ans > limit:
+            return limit
+
+
+def wrap_contents(tag_name, elem):
+    # Move the text and the children of elem into a new <tag_name> element
+    # which becomes the only child of elem
+    wrapper = elem.makeelement(tag_name)
+    wrapper.text, elem.text = elem.text, ''
+    # NOTE(review): elem is modified while it is being iterated, this
+    # relies on how the element implementation handles that -- keep the
+    # loop as is
+    for child in elem:
+        elem.remove(child)
+        wrapper.append(child)
+    elem.append(wrapper)
+
+
+def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
+    # Apply vertical-align
+    for span in root.xpath('//span[@data-docx-vert]'):
+        wrap_contents(span.attrib.pop('data-docx-vert'), span)
+
+    # Move <hr>s outside paragraphs, if possible.
+    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
+    for hr in root.xpath('//span/hr'):
+        p = pancestor(hr)
+        if p:
+            p = p[0]
+            descendants = tuple(p.iterdescendants())
+            if descendants[-1] is hr:
+                parent = p.getparent()
+                idx = parent.index(p)
+                parent.insert(idx+1, hr)
+                hr.tail = '\n\t'
+
+    # Merge consecutive spans that have the same styling
+    current_run = []
+    for span in root.xpath('//span'):
+        if not current_run:
+            current_run.append(span)
+        else:
+            last = current_run[-1]
+            if mergeable(last, span):
+                current_run.append(span)
+            else:
+                if len(current_run) > 1:
+                    merge_run(current_run)
+                current_run = [span]
+
+    # Process dir attributes
+    class_map = dict(itervalues(styles.classes))
+    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
+    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
+        # Ensure that children of rtl parents that are not rtl have an
+        # explicit dir set. Also, remove dir from children if it is the same as
+        # that of the parent.
+        if len(parent):
+            parent_dir = parent.get('dir')
+            for child in parent.iterchildren('span'):
+                child_dir = child.get('dir')
+                if parent_dir == 'rtl' and child_dir != 'rtl':
+                    child_dir = 'ltr'
+                    child.set('dir', child_dir)
+                if child_dir and child_dir == parent_dir:
+                    child.attrib.pop('dir')
+
+    # Remove unnecessary span tags that are the only child of a parent block
+    # element
+    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
+        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
+            # We have a block whose contents are entirely enclosed in a <span>
+            span = parent[0]
+            span_class = span.get('class', None)
+            span_css = class_map.get(span_class, {})
+            span_dir = span.get('dir')
+            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
+                pclass = parent.get('class', None)
+                if span_class:
+                    pclass = (pclass + ' ' + span_class) if pclass else span_class
+                    parent.set('class', pclass)
+                parent.text = span.text
+                parent.remove(span)
+                if span.get('lang'):
+                    parent.set('lang', span.get('lang'))
+                if span.get('dir'):
+                    parent.set('dir', span.get('dir'))
+                for child in span:
+                    parent.append(child)
+
+    # Make spans whose only styling is bold or italic into <b> and <i> tags
+    for span in root.xpath('//span[@class and not(@style)]'):
+        css = class_map.get(span.get('class', None), {})
+        if len(css) == 1:
+            if css == {'font-style':'italic'}:
+                span.tag = 'i'
+                del span.attrib['class']
+            elif css == {'font-weight':'bold'}:
+                span.tag = 'b'
+                del span.attrib['class']
+
+    # Get rid of <span>s that have no styling
+    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
+        lift(span)
+
+    # Convert <p><br style="page-break-after:always"> </p> style page breaks
+    # into something the viewer will render as a page break
+    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
+        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
+            p.remove(p[0])
+            prefix = p.get('style', '')
+            if prefix:
+                prefix += '; '
+            p.set('style', prefix + 'page-break-after:always')
+            p.text = NBSP if not p.text else p.text
+
+    if detect_cover:
+        # Check if the first image in the document is possibly a cover
+        img = root.xpath('//img[@src][1]')
+        if img:
+            img = img[0]
+            path = os.path.join(dest_dir, img.get('src'))
+            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
+                from calibre.utils.imghdr import identify
+                try:
+                    with lopen(path, 'rb') as imf:
+                        fmt, width, height = identify(imf)
+                except:
+                    width, height, fmt = 0, 0, None  # noqa
+                del fmt
+                try:
+                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
+                except ZeroDivisionError:
+                    is_cover = False
+                if is_cover:
+                    log.debug('Detected an image that looks like a cover')
+                    img.getparent().remove(img)
+                    return path
--- a/ebook_converter/ebooks/docx/container.py
+++ b/ebook_converter/ebooks/docx/container.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, sys, shutil
+
+from lxml import etree
+
+from calibre import walk, guess_type
+from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.docx import InvalidDOCX
+from calibre.ebooks.docx.names import DOCXNamespace
+from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.localization import canonicalize_lang
+from calibre.utils.logging import default_log
+from calibre.utils.zipfile import ZipFile
+from calibre.utils.xml_parse import safe_xml_fromstring
+
+
+def fromstring(raw, parser=None):
+    # Parse raw with safe_xml_fromstring(). The parser argument is accepted
+    # but not used.
+    return safe_xml_fromstring(raw)
+
+# Read metadata {{{
+
+
+def read_doc_props(raw, mi, XPath):
+    root = fromstring(raw)
+    titles = XPath('//dc:title')(root)
+    if titles:
+        title = titles[0].text
+        if title and title.strip():
+            mi.title = title.strip()
+    tags = []
+    for subject in XPath('//dc:subject')(root):
+        if subject.text and subject.text.strip():
+            tags.append(subject.text.strip().replace(',', '_'))
+    for keywords in XPath('//cp:keywords')(root):
+        if keywords.text and keywords.text.strip():
+            for x in keywords.text.split():
+                tags.extend(y.strip() for y in x.split(',') if y.strip())
+    if tags:
+        mi.tags = tags
+    authors = XPath('//dc:creator')(root)
+    aut = []
+    for author in authors:
+        if author.text and author.text.strip():
+            aut.extend(string_to_authors(author.text))
+    if aut:
+        mi.authors = aut
+        mi.author_sort = authors_to_sort_string(aut)
+
+    desc = XPath('//dc:description')(root)
+    if desc:
+        raw = etree.tostring(desc[0], method='text', encoding='unicode')
+        raw = raw.replace('_x000d_', '')  # Word 2007 mangles newlines in the summary
+        mi.comments = raw.strip()
+
+    langs = []
+    for lang in XPath('//dc:language')(root):
+        if lang.text and lang.text.strip():
+            l = canonicalize_lang(lang.text)
+            if l:
+                langs.append(l)
+    if langs:
+        mi.languages = langs
+
+
+def read_app_props(raw, mi):
+    root = fromstring(raw)
+    company = root.xpath('//*[local-name()="Company"]')
+    if company and company[0].text and company[0].text.strip():
+        mi.publisher = company[0].text.strip()
+
+
+def read_default_style_language(raw, mi, XPath):
+    root = fromstring(raw)
+    for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root):
+        lang = canonicalize_lang(lang)
+        if lang:
+            mi.languages = [lang]
+            break
+# }}}
+
+
+class DOCX(object):
+
+    def __init__(self, path_or_stream, log=None, extract=True):
+        self.docx_is_transitional = True
+        stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
+        self.name = getattr(stream, 'name', None) or '<stream>'
+        self.log = log or default_log
+        if extract:
+            self.extract(stream)
+        else:
+            self.init_zipfile(stream)
+        self.read_content_types()
+        self.read_package_relationships()
+        self.namespace = DOCXNamespace(self.docx_is_transitional)
+
+    def init_zipfile(self, stream):
+        self.zipf = ZipFile(stream)
+        self.names = frozenset(self.zipf.namelist())
+
+    def extract(self, stream):
+        self.tdir = PersistentTemporaryDirectory('docx_container')
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(self.tdir)
+        except:
+            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream, self.tdir)
+
+        self.names = {}
+        for f in walk(self.tdir):
+            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
+            self.names[name] = f
+
+    def exists(self, name):
+        return name in self.names
+
+    def read(self, name):
+        if hasattr(self, 'zipf'):
+            return self.zipf.open(name).read()
+        path = self.names[name]
+        with open(path, 'rb') as f:
+            return f.read()
+
+    def read_content_types(self):
+        try:
+            raw = self.read('[Content_Types].xml')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
+        root = fromstring(raw)
+        self.content_types = {}
+        self.default_content_types = {}
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
+            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
+            name = item.get('PartName').lstrip('/')
+            self.content_types[name] = item.get('ContentType')
+
+    def content_type(self, name):
+        if name in self.content_types:
+            return self.content_types[name]
+        ext = name.rpartition('.')[-1].lower()
+        if ext in self.default_content_types:
+            return self.default_content_types[ext]
+        return guess_type(name)[0]
+
+    def read_package_relationships(self):
+        try:
+            raw = self.read('_rels/.rels')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
+        root = fromstring(raw)
+        self.relationships = {}
+        self.relationships_rmap = {}
+        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+            target = item.get('Target').lstrip('/')
+            typ = item.get('Type')
+            if target == 'word/document.xml':
+                self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
+            self.relationships[typ] = target
+            self.relationships_rmap[target] = typ
+
+    @property
+    def document_name(self):
+        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
+        if name is None:
+            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
+            if not names:
+                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
+            name = names[0]
+        return name
+
+    @property
+    def document(self):
+        return fromstring(self.read(self.document_name))
+
+    @property
+    def document_relationships(self):
+        return self.get_relationships(self.document_name)
+
+    def get_relationships(self, name):
+        base = '/'.join(name.split('/')[:-1])
+        by_id, by_type = {}, {}
+        parts = name.split('/')
+        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
+        try:
+            raw = self.read(name)
+        except KeyError:
+            pass
+        else:
+            root = fromstring(raw)
+            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+                target = item.get('Target')
+                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
+                    target = '/'.join((base, target.lstrip('/')))
+                typ = item.get('Type')
+                Id = item.get('Id')
+                by_id[Id] = by_type[typ] = target
+
+        return by_id, by_type
+
+    def get_document_properties_names(self):
+        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
+            if names:
+                name = names[0]
+        yield name
+        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
+            if names:
+                name = names[0]
+        yield name
+
+    @property
+    def metadata(self):
+        mi = Metadata(_('Unknown'))
+        dp_name, ap_name = self.get_document_properties_names()
+        if dp_name:
+            try:
+                raw = self.read(dp_name)
+            except KeyError:
+                pass
+            else:
+                read_doc_props(raw, mi, self.namespace.XPath)
+        if mi.is_null('language'):
+            try:
+                raw = self.read('word/styles.xml')
+            except KeyError:
+                pass
+            else:
+                read_default_style_language(raw, mi, self.namespace.XPath)
+
+        ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
+        if ap_name:
+            try:
+                raw = self.read(ap_name)
+            except KeyError:
+                pass
+            else:
+                read_app_props(raw, mi)
+
+        return mi
+
+    def close(self):
+        if hasattr(self, 'zipf'):
+            self.zipf.close()
+        else:
+            try:
+                shutil.rmtree(self.tdir)
+            except EnvironmentError:
+                pass
+
+
+if __name__ == '__main__':
+    d = DOCX(sys.argv[-1], extract=False)
+    print(d.metadata)
--- a/ebook_converter/ebooks/docx/fields.py
+++ b/ebook_converter/ebooks/docx/fields.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+
+from calibre.ebooks.docx.index import process_index, polish_index_markup
+from polyglot.builtins import iteritems, native_string_type
+
+
+class Field(object):
+    '''
+    One field of the document together with its instruction text.
+
+    Attributes:
+        start: the element the field was created for
+        end: the element closing the field, None until one is assigned
+        contents: elements collected for the field
+        name: first space delimited word of the instruction text
+        instructions: the rest of the instruction text, set by finalize()
+    '''
+
+    def __init__(self, start):
+        self.start, self.end = start, None
+        self.name = self.instructions = None
+        self.contents, self.buf = [], []
+
+    def add_instr(self, elem):
+        # The instruction text is the text of the element
+        self.add_raw(elem.text)
+
+    def add_raw(self, raw):
+        if raw:
+            if self.name is None:
+                # Only leading whitespace is stripped: there are cases where
+                # partial index entries end with a significant space, along
+                # the lines of <>Summary <>  ...  <>Hearing<>.
+                # No known examples of starting with a space yet.
+                self.name, _sep, raw = raw.lstrip().partition(' ')
+            self.buf.append(raw)
+
+    def finalize(self):
+        # Join the buffered fragments; the buffer is discarded afterwards, so
+        # no more text can be added
+        self.instructions = ''.join(self.buf)
+        del self.buf
+
+
+# Token kinds produced by the scanner below
+WORD, FLAG = 0, 1
+# Tokenizer for field instruction text. Each rule yields a (text, kind) pair,
+# runs of whitespace between tokens produce nothing. Rules are tried in the
+# order listed.
+scanner = re.Scanner([
+    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
+    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
+    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
+    (r'\s+', None),
+], flags=re.DOTALL)
+
+# Placeholder key used by parser() for flags that are missing from its field
+# map, it is removed from the result before it is returned
+null = object()
+
+
+def parser(name, field_map, default_field_name=None):
+    '''
+    Build a function that parses the instruction text of one kind of field.
+
+    :param name: kind of field, only used to name the returned function
+                 (``parse_<name>``)
+    :param field_map: whitespace separated ``letter:option`` pairs, mapping
+                      the character of a flag to the key used in the result
+    :param default_field_name: key under which a word that does not follow a
+                               flag is stored
+    :return: ``parse(raw, log=None)`` which returns a dict of the options
+             found. A flag without a word after it maps to None. Flags that
+             are not in ``field_map`` are dropped, together with the word
+             following them.
+    '''
+    options = dict(pair.split(':') for pair in field_map.split())
+
+    def parse(raw, log=None):
+        found = {}
+        pending = None
+        # Hide escaped backslashes and quotes from the scanner, they are put
+        # back into every token below
+        hidden = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
+        tokens = scanner.scan(hidden)[0]
+        for token, token_type in tokens:
+            token = token.replace('\x01', '\\').replace('\x02', '"')
+            if token_type is FLAG:
+                # Unknown flags are tracked under the null placeholder so that
+                # the word after them is consumed as well
+                pending = options.get(token[1], null)
+                found[pending] = None
+            elif token_type is WORD:
+                if pending is None:
+                    found[default_field_name] = token
+                else:
+                    found[pending] = token
+                    pending = None
+        found.pop(null, None)
+        return found
+
+    parse.__name__ = native_string_type('parse_' + name)
+
+    return parse
+
+
+# Instruction parsers, one per supported kind of field. The second argument
+# maps flag characters to result keys, the optional third one names the key
+# used for a word that does not follow a flag.
+
+# HYPERLINK fields, the bare word is stored as 'url'
+parse_hyperlink = parser('hyperlink',
+    'l:anchor m:image-map n:target o:title t:target', 'url')
+
+# XE (index entry) fields, the bare word is stored as 'text'
+parse_xe = parser('xe',
+    'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')
+
+# INDEX fields, a bare word is stored under the key None
+parse_index = parser('index',
+    'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
+    ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
+    ' l:page-number-separator p:letter-range s:sequence-name r:run-together y:yomi z:langcode')
+
+# REF fields, the bare word is stored under the key None
+parse_ref = parser('ref',
+    'd:separator f:footnote h:hyperlink n:number p:position r:relative-number t:suppress w:number-full-context')
+
+# NOTEREF fields, the bare word is stored under the key None
+parse_noteref = parser('noteref',
+                   'f:footnote h:hyperlink p:position')
+
+
+class Fields(object):
+    '''
+    Collects the fields of a document and sorts the supported kinds into
+    per-kind lists (``hyperlink_fields``, ``xe_fields``, ``index_fields``,
+    ``ref_fields`` and ``noteref_fields``). These lists are created when the
+    instance is called with the document.
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.fields = []
+        self.index_bookmark_prefix = 'index-'
+        self.index_bookmark_counter = 0
+
+    def __call__(self, doc, log):
+        '''
+        Find all fields in doc and hand every field of a supported kind to the
+        matching parse_* method. Unknown kinds are reported once via log.warn.
+        '''
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # Alter the prefix used for synthetic index bookmarks for as long as
+        # the prefix itself is one of the w:id values of the document
+        existing_ids = frozenset(XPath('//*/@w:id')(doc))
+        attempt = 0
+        while self.index_bookmark_prefix in existing_ids:
+            attempt += 1
+            self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % attempt)
+
+        # Fields opened by a begin fldChar and not closed yet, innermost last
+        open_fields = []
+        interesting = XPath(
+            '//*[name()="w:p" or name()="w:r" or'
+            ' name()="w:instrText" or'
+            ' (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end") or'
+            ' name()="w:fldSimple")]')
+        for elem in interesting(doc):
+            tag = elem.tag
+            if tag.endswith('}fldChar'):
+                if get(elem, 'w:fldCharType') == 'begin':
+                    field = Field(elem)
+                    open_fields.append(field)
+                    self.fields.append(field)
+                elif open_fields:
+                    # An end without a matching begin is ignored
+                    open_fields.pop().end = elem
+            elif tag.endswith('}instrText'):
+                if open_fields:
+                    open_fields[-1].add_instr(elem)
+            elif tag.endswith('}fldSimple'):
+                instr = get(elem, 'w:instr')
+                if instr:
+                    field = Field(elem)
+                    field.add_raw(instr)
+                    self.fields.append(field)
+                    field.contents.extend(XPath('descendant::w:r')(elem))
+            elif open_fields:
+                # Paragraphs and runs belong to the innermost open field
+                open_fields[-1].contents.append(elem)
+
+        field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')
+        # Field names are recognized in all upper case and all lower case only
+        handlers, instruction_parsers = {}, {}
+        for ftype in field_types:
+            handler = getattr(self, 'parse_' + ftype)
+            instruction_parser = globals()['parse_%s' % ftype]
+            for spelling in (ftype.upper(), ftype):
+                handlers[spelling] = handler
+                instruction_parsers[spelling] = instruction_parser
+            setattr(self, '%s_fields' % ftype, [])
+        unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'}  # The TOC and PAGEREF fields are handled separately
+
+        for field in self.fields:
+            field.finalize()
+            if not field.instructions:
+                continue
+            handler = handlers.get(field.name, None)
+            if handler is not None:
+                handler(field, instruction_parsers[field.name], log)
+            elif field.name not in unknown_fields:
+                log.warn('Encountered unknown field: %s, ignoring it.' % field.name)
+                unknown_fields.add(field.name)
+
+    def get_runs(self, field):
+        '''
+        Return the runs in the contents of field as a list of lists, with one
+        non-empty list per paragraph.
+        '''
+        # We only handle spans in a single paragraph
+        # being wrapped in <a>
+        groups = [[]]
+        for node in field.contents:
+            tag = node.tag
+            if tag.endswith('}p'):
+                groups.append([])
+            elif tag.endswith('}r'):
+                groups[-1].append(node)
+        return [group for group in groups if group]
+
+    def parse_hyperlink(self, field, parse_func, log):
+        # Parse hyperlink fields
+        hl = parse_func(field.instructions, log)
+        if not hl:
+            return
+        if 'target' in hl and hl['target'] is None:
+            # A target flag without a value
+            hl['target'] = '_blank'
+        self.hyperlink_fields.extend((hl, runs) for runs in self.get_runs(field))
+
+    def parse_ref(self, field, parse_func, log):
+        # Reference fields are only supported when they are hyperlinks to a
+        # named destination
+        ref = parse_func(field.instructions, log)
+        dest = ref.get(None, None)
+        if dest is None or 'hyperlink' not in ref:
+            log.warn('Unsupported reference field (%s), ignoring: %r' % (field.name, ref))
+            return
+        for runs in self.get_runs(field):
+            self.hyperlink_fields.append(({'anchor':dest}, runs))
+
+    parse_noteref = parse_ref
+
+    def parse_xe(self, field, parse_func, log):
+        # Parse XE fields
+        if None in (field.start, field.end):
+            return
+        xe = parse_func(field.instructions, log)
+        if not xe:
+            return
+
+        def qname(local):
+            return self.namespace.expand('w:' + local)
+
+        # We insert a synthetic bookmark around this index item so that we
+        # can link to it later
+        self.index_bookmark_counter += 1
+        anchor = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)
+        xe['anchor'] = anchor
+        parent = field.start.getparent()
+        start_mark = parent.makeelement(qname('bookmarkStart'))
+        start_mark.set(qname('id'), anchor)
+        start_mark.set(qname('name'), anchor)
+        parent.insert(parent.index(field.start), start_mark)
+        # The position of the end is looked up only after the start bookmark
+        # has been inserted
+        parent = field.end.getparent()
+        end_mark = parent.makeelement(qname('bookmarkEnd'))
+        end_mark.set(qname('id'), anchor)
+        parent.insert(parent.index(field.end) + 1, end_mark)
+        xe['start_elem'] = field.start
+        self.xe_fields.append(xe)
+
+    def parse_index(self, field, parse_func, log):
+        if field.contents:
+            idx = parse_func(field.instructions, log)
+            hyperlinks, blocks = process_index(field, idx, self.xe_fields, log, self.namespace.XPath, self.namespace.expand)
+            if blocks:
+                self.hyperlink_fields.extend(({'anchor':anchor}, [run]) for anchor, run in hyperlinks)
+                self.index_fields.append((idx, blocks))
+
+    def polish_markup(self, object_map):
+        # The blocks of every index are translated through the inverse of
+        # object_map before being polished
+        if self.index_fields:
+            reverse_map = {value:key for key, value in iteritems(object_map)}
+            for idx, blocks in self.index_fields:
+                polish_index_markup(idx, [reverse_map[block] for block in blocks])
+
+
+def test_parse_fields(return_tests=False):
+    '''
+    Self tests for the module level field instruction parsers.
+
+    :param return_tests: when true, the unittest suite is returned instead of
+                         being run
+    '''
+    import unittest
+
+    class TestParseFields(unittest.TestCase):
+
+        def check(self, parse, cases):
+            # Every case is a (raw instructions, expected result) pair
+            for raw, expected in cases:
+                self.assertEqual(parse(raw, None), expected)
+
+        def test_hyperlink(self):
+            self.check(parse_hyperlink, (
+                (r'\l anchor1', {'anchor':'anchor1'}),
+                (r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'}),
+                (r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}),
+                (r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'}),
+                (r'xxxx \y yyyy', {'url': 'xxxx'}),
+            ))
+
+        def test_xe(self):
+            self.check(parse_xe, (
+                (r'"some name"', {'text':'some name'}),
+                (r'name \b \i', {'text':'name', 'bold':None, 'italic':None}),
+                (r'xxx \y a', {'text':'xxx', 'yomi':'a'}),
+            ))
+
+        def test_index(self):
+            self.check(parse_index, (
+                (r'', {}),
+                (r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'}),
+            ))
+
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
+    if not return_tests:
+        unittest.TextTestRunner(verbosity=4).run(suite)
+        return
+    return suite
+
+
+if __name__ == '__main__':
+    # Run the parser self tests when this module is executed directly
+    test_parse_fields()
--- a/ebook_converter/ebooks/docx/fonts.py
+++ b/ebook_converter/ebooks/docx/fonts.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, re
+from collections import namedtuple
+
+from calibre.ebooks.docx.block_styles import binary_property, inherit
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.fonts.scanner import font_scanner, NoFonts
+from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
+from calibre.utils.icu import ord_string
+from polyglot.builtins import codepoint_to_chr, iteritems, range
+
+# One embedded font variant: the name its data is read from, the value of its
+# w:fontKey attribute and whether w:subsetted is set
+Embed = namedtuple('Embed', 'name key subsetted')
+
+
+def has_system_fonts(name):
+    '''
+    Return True when the font scanner knows at least one font for the family
+    name, False when it knows none or raises NoFonts.
+    '''
+    try:
+        matches = font_scanner.fonts_for_family(name)
+    except NoFonts:
+        return False
+    return bool(matches)
+
+
+def get_variant(bold=False, italic=False):
+    '''
+    Return the name of the font variant ('Regular', 'Italic', 'Bold' or
+    'BoldItalic') for the given combination of bold and italic.
+    '''
+    variants = {
+        (False, False): 'Regular',
+        (False, True): 'Italic',
+        (True, False): 'Bold',
+        (True, True): 'BoldItalic',
+    }
+    return variants[(bold, italic)]
+
+
+def find_fonts_matching(fonts, style='normal', stretch='normal'):
+    '''
+    Yield, in order, the entries of fonts whose 'font-style' and
+    'font-stretch' values equal style and stretch.
+    '''
+    for candidate in fonts:
+        if candidate['font-style'] == style:
+            # The stretch is only looked at once the style matches
+            if candidate['font-stretch'] == stretch:
+                yield candidate
+
+
+def weight_key(font):
+    '''
+    Sort key for fonts: the distance of the 'font-weight' value from 400.
+    The names 'normal' and 'bold' count as 400 and 700, any other value that
+    is not a number sorts last.
+    '''
+    weight = font['font-weight']
+    try:
+        numeric = int(weight)
+    except Exception:
+        numeric = {'normal': 400, 'bold': 700}.get(weight, 1000000)
+    return abs(numeric - 400)
+
+
+def get_best_font(fonts, style, stretch):
+    '''
+    Return the first font matching style and stretch whose weight is closest
+    to 400, or None when there is no match or anything goes wrong.
+    '''
+    try:
+        return min(find_fonts_matching(fonts, style, stretch), key=weight_key)
+    except Exception:
+        return None
+
+
+class Family(object):
+    '''
+    Description of a font family, read from one font element.
+
+    Attributes:
+        name: value of w:name
+        family_name: name, or the first alternate name with system fonts when
+            name itself has none
+        alt_names: tuple of the w:val values of the w:altName children
+        embedded: variant name -> Embed, for variants with a known r:id
+        generic_family: w:val of the last w:family child, 'auto' by default
+        is_ttf: False only when notTrueType is set to a true value
+        panose1: tuple of ints decoded from w:panose1, or None
+        panose_name: generic family derived from panose1, or None
+        css_generic_family: CSS generic family, 'serif' when nothing is known
+    '''
+
+    def __init__(self, elem, embed_relationships, XPath, get):
+        self.name = self.family_name = get(elem, 'w:name')
+        self.alt_names = tuple(get(alt, 'w:val') for alt in XPath('./w:altName')(elem))
+        # Prefer the first alternate name that is available on this system
+        # when the primary name is not
+        if self.alt_names and not has_system_fonts(self.name):
+            self.family_name = next(
+                (alt for alt in self.alt_names if has_system_fonts(alt)), self.family_name)
+
+        self.embedded = {}
+        for variant in ('Regular', 'Bold', 'Italic', 'BoldItalic'):
+            for node in XPath('./w:embed%s[@r:id]' % variant)(elem):
+                rid = get(node, 'r:id')
+                key = get(node, 'w:fontKey')
+                subsetted = get(node, 'w:subsetted') in {'1', 'true', 'on'}
+                if rid not in embed_relationships:
+                    continue
+                self.embedded[variant] = Embed(embed_relationships[rid], key, subsetted)
+
+        self.generic_family = 'auto'
+        for node in XPath('./w:family[@w:val]')(elem):
+            self.generic_family = get(node, 'w:val', 'auto')
+
+        not_truetype = binary_property(elem, 'notTrueType', XPath, get)
+        self.is_ttf = not_truetype is inherit or not not_truetype
+
+        self.panose1 = self.panose_name = None
+        for node in XPath('./w:panose1[@w:val]')(elem):
+            try:
+                digits = get(node, 'w:val')
+                # Two hexadecimal digits per number
+                panose = tuple(int(digits[i:i+2], 16) for i in range(0, len(digits), 2))
+            except (TypeError, ValueError, IndexError):
+                continue
+            self.panose1 = panose
+            self.panose_name = panose_to_css_generic_family(panose)
+
+        css_families = {'roman':'serif', 'swiss':'sans-serif', 'modern':'monospace',
+                        'decorative':'fantasy', 'script':'cursive'}
+        self.css_generic_family = css_families.get(self.generic_family, None) or self.panose_name or 'serif'
+
+
+SYMBOL_MAPS = {  # {{{
+    'Wingdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖉', '✂', '✁', '👓', '🕭', '🕮', '🕯', '🕿', '✆', '🖂', '🖃', '📪', '📫', '📬', '📭', '🗀', '🗁', '🗎', '🗏', '🗐', '🗄', '⏳', '🖮', '🖰', '🖲', '🖳', '🖴', '🖫', '🖬', '✇', '✍', '🖎', '✌', '🖏', '👍', '👎', '☜', '☞', '☜', '🖗', '🖐', '☺', '😐', '☹', '💣', '🕱', '🏳', '🏱', '✈', '☼', '🌢', '❄', '🕆', '✞', '🕈', '✠', '✡', '☪', '☯', '🕉', '☸', '♈', '♉', '♊', '♋', '♌', '♍', '♎', '♏', '♐', '♑', '♒', '♓', '🙰', '🙵', '⚫', '🔾', '◼', '🞏', '🞐', '❑', '❒', '🞟', '⧫', '◆', '❖', '🞙', '⌧', '⮹', '⌘', '🏵', '🏶', '🙶', '🙷', ' ', '🄋', '➀', '➁', '➂', '➃', '➄', '➅', '➆', '➇', '➈', '➉', '🄌', '➊', '➋', '➌', '➍', '➎', '➏', '➐', '➑', '➒', '➓', '🙢', '🙠', '🙡', '🙣', '🙦', '🙤', '🙥', '🙧', '∙', '•', '⬝', '⭘', '🞆', '🞈', '🞊', '🞋', '🔿', '▪', '🞎', '🟀', '🟁', '★', '🟋', '🟏', '🟓', '🟑', '⯐', '⌖', '⯎', '⯏', '⯑', '✪', '✰', '🕐', '🕑', '🕒', '🕓', '🕔', '🕕', '🕖', '🕗', '🕘', '🕙', '🕚', '🕛', '⮰', '⮱', '⮲', '⮳', '⮴', '⮵', '⮶', '⮷', '🙪', '🙫', '🙕', '🙔', '🙗', '🙖', '🙐', '🙑', '🙒', '🙓', '⌫', '⌦', '⮘', '⮚', '⮙', '⮛', '⮈', '⮊', '⮉', '⮋', '🡨', '🡪', '🡩', '🡫', '🡬', '🡭', '🡯', '🡮', '🡸', '🡺', '🡹', '🡻', '🡼', '🡽', '🡿', '🡾', '⇦', '⇨', '⇧', '⇩', '⬄', '⇳', '⬁', '⬀', '⬃', '⬂', '🢬', '🢭', '🗶', '✓', '🗷', '🗹', ' '),  # noqa
+
+    'Wingdings 2': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖊', '🖋', '🖌', '🖍', '✄', '✀', '🕾', '🕽', '🗅', '🗆', '🗇', '🗈', '🗉', '🗊', '🗋', '🗌', '🗍', '📋', '🗑', '🗔', '🖵', '🖶', '🖷', '🖸', '🖭', '🖯', '🖱', '🖒', '🖓', '🖘', '🖙', '🖚', '🖛', '👈', '👉', '🖜', '🖝', '🖞', '🖟', '🖠', '🖡', '👆', '👇', '🖢', '🖣', '🖑', '🗴', '🗸', '🗵', '☑', '⮽', '☒', '⮾', '⮿', '🛇', '⦸', '🙱', '🙴', '🙲', '🙳', '‽', '🙹', '🙺', '🙻', '🙦', '🙤', '🙥', '🙧', '🙚', '🙘', '🙙', '🙛', '⓪', '①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '⓿', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽', '❾', '❿', ' ', '☉', '🌕', '☽', '☾', '⸿', '✝', '🕇', '🕜', '🕝', '🕞', '🕟', '🕠', '🕡', '🕢', '🕣', '🕤', '🕥', '🕦', '🕧', '🙨', '🙩', '⋅', '🞄', '⦁', '●', '●', '🞅', '🞇', '🞉', '⊙', '⦿', '🞌', '🞍', '◾', '■', '□', '🞑', '🞒', '🞓', '🞔', '▣', '🞕', '🞖', '🞗', '🞘', '⬩', '⬥', '◇', '🞚', '◈', '🞛', '🞜', '🞝', '🞞', '⬪', '⬧', '◊', '🞠', '◖', '◗', '⯊', '⯋', '⯀', '⯁', '⬟', '⯂', '⬣', '⬢', '⯃', '⯄', '🞡', '🞢', '🞣', '🞤', '🞥', '🞦', '🞧', '🞨', '🞩', '🞪', '🞫', '🞬', '🞭', '🞮', '🞯', '🞰', '🞱', '🞲', '🞳', '🞴', '🞵', '🞶', '🞷', '🞸', '🞹', '🞺', '🞻', '🞼', '🞽', '🞾', '🞿', '🟀', '🟂', '🟄', '🟆', '🟉', '🟊', '✶', '🟌', '🟎', '🟐', '🟒', '✹', '🟃', '🟇', '✯', '🟍', '🟔', '⯌', '⯍', '※', '⁂', ' ', ' ', ' ', ' ', ' ', ' ',),  # noqa
+
+    'Wingdings 3': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '⭠', '⭢', '⭡', '⭣', '⭤', '⭥', '⭧', '⭦', '⭰', '⭲', '⭱', '⭳', '⭶', '⭸', '⭻', '⭽', '⭤', '⭥', '⭪', '⭬', '⭫', '⭭', '⭍', '⮠', '⮡', '⮢', '⮣', '⮤', '⮥', '⮦', '⮧', '⮐', '⮑', '⮒', '⮓', '⮀', '⮃', '⭾', '⭿', '⮄', '⮆', '⮅', '⮇', '⮏', '⮍', '⮎', '⮌', '⭮', '⭯', '⎋', '⌤', '⌃', '⌥', '␣', '⍽', '⇪', '⮸', '🢠', '🢡', '🢢', '🢣', '🢤', '🢥', '🢦', '🢧', '🢨', '🢩', '🢪', '🢫', '🡐', '🡒', '🡑', '🡓', '🡔', '🡕', '🡗', '🡖', '🡘', '🡙', '▲', '▼', '△', '▽', '◀', '▶', '◁', '▷', '◣', '◢', '◤', '◥', '🞀', '🞂', '🞁', ' ', '🞃', '⯅', '⯆', '⯇', '⯈', '⮜', '⮞', '⮝', '⮟', '🠐', '🠒', '🠑', '🠓', '🠔', '🠖', '🠕', '🠗', '🠘', '🠚', '🠙', '🠛', '🠜', '🠞', '🠝', '🠟', '🠀', '🠂', '🠁', '🠃', '🠄', '🠆', '🠅', '🠇', '🠈', '🠊', '🠉', '🠋', '🠠', '🠢', '🠤', '🠦', '🠨', '🠪', '🠬', '🢜', '🢝', '🢞', '🢟', '🠮', '🠰', '🠲', '🠴', '🠶', '🠸', '🠺', '🠹', '🠻', '🢘', '🢚', '🢙', '🢛', '🠼', '🠾', '🠽', '🠿', '🡀', '🡂', '🡁', '🡃', '🡄', '🡆', '🡅', '🡇', '⮨', '⮩', '⮪', '⮫', '⮬', '⮭', '⮮', '⮯', '🡠', '🡢', '🡡', '🡣', '🡤', '🡥', '🡧', '🡦', '🡰', '🡲', '🡱', '🡳', '🡴', '🡵', '🡷', '🡶', '🢀', '🢂', '🢁', '🢃', '🢄', '🢅', '🢇', '🢆', '🢐', '🢒', '🢑', '🢓', '🢔', '🢕', '🢗', '🢖', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',),  # noqa
+
+    'Webdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🕷', '🕸', '🕲', '🕶', '🏆', '🎖', '🖇', '🗨', '🗩', '🗰', '🗱', '🌶', '🎗', '🙾', '🙼', '🗕', '🗖', '🗗', '⏴', '⏵', '⏶', '⏷', '⏪', '⏩', '⏮', '⏭', '⏸', '⏹', '⏺', '🗚', '🗳', '🛠', '🏗', '🏘', '🏙', '🏚', '🏜', '🏭', '🏛', '🏠', '🏖', '🏝', '🛣', '🔍', '🏔', '👁', '👂', '🏞', '🏕', '🛤', '🏟', '🛳', '🕬', '🕫', '🕨', '🔈', '🎔', '🎕', '🗬', '🙽', '🗭', '🗪', '🗫', '⮔', '✔', '🚲', '⬜', '🛡', '📦', '🛱', '⬛', '🚑', '🛈', '🛩', '🛰', '🟈', '🕴', '⬤', '🛥', '🚔', '🗘', '🗙', '❓', '🛲', '🚇', '🚍', '⛳', '⦸', '⊖', '🚭', '🗮', '⏐', '🗯', '🗲', ' ', '🚹', '🚺', '🛉', '🛊', '🚼', '👽', '🏋', '⛷', '🏂', '🏌', '🏊', '🏄', '🏍', '🏎', '🚘', '🗠', '🛢', '📠', '🏷', '📣', '👪', '🗡', '🗢', '🗣', '✯', '🖄', '🖅', '🖃', '🖆', '🖹', '🖺', '🖻', '🕵', '🕰', '🖽', '🖾', '📋', '🗒', '🗓', '🕮', '📚', '🗞', '🗟', '🗃', '🗂', '🖼', '🎭', '🎜', '🎘', '🎙', '🎧', '💿', '🎞', '📷', '🎟', '🎬', '📽', '📹', '📾', '📻', '🎚', '🎛', '📺', '💻', '🖥', '🖦', '🖧', '🍹', '🎮', '🎮', '🕻', '🕼', '🖁', '🖀', '🖨', '🖩', '🖿', '🖪', '🗜', '🔒', '🔓', '🗝', '📥', '📤', '🕳', '🌣', '🌤', '🌥', '🌦', '☁', '🌨', '🌧', '🌩', '🌪', '🌬', '🌫', '🌜', '🌡', '🛋', '🛏', '🍽', '🍸', '🛎', '🛍', 'Ⓟ', '♿', '🛆', '🖈', '🎓', '🗤', '🗥', '🗦', '🗧', '🛪', '🐿', '🐦', '🐟', '🐕', '🐈', '🙬', '🙮', '🙭', '🙯', '🗺', '🌍', '🌏', '🌎', '🕊',),  # noqa
+
+    'Symbol': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '!', '∀', '#', '∃', '%', '&', '∍', '(', ')', '*', '+', ',', '−', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '≅', 'Α', 'Β', 'Χ', 'Δ', 'Ε', 'Φ', 'Γ', 'Η', 'Ι', 'ϑ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Θ', 'Ρ', 'Σ', 'Τ', 'Υ', 'ς', 'Ω', 'Ξ', 'Ψ', 'Ζ', '[', '∴', ']', '⊥', '_', '', 'α', 'β', 'χ', 'δ', 'ε', 'φ', 'γ', 'η', 'ι', 'ϕ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'θ', 'ρ', 'σ', 'τ', 'υ', 'ϖ', 'ω', 'ξ', 'ψ', 'ζ', '{', '|', '}', '~', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '€', 'ϒ', '′', '≤', '⁄', '∞', 'ƒ', '♣', '♥', '♦', '♠', '↔', '←', '↑', '→', '↓', '°', '±', '″', '≥', '×', '∝', '∂', '•', '÷', '≠', '≡', '≈', '…', '⏐', '⎯', '↲', 'ℵ', 'ℑ', 'ℜ', '℘', '⊗', '⊕', '∅', '∩', '∪', '⊃', '⊇', '⊄', '⊂', '⊆', '∈', '∉', '∠', '∂', '®', '©', '™', '∏', '√', '⋅', '¬', '∦', '∧', '⇔', '⇐', '⇑', '⇒', '⇓', '◊', '〈', '®', '©', '™', '∑', '⎛', '⎜', '⎝', '⎡', '⎢', '⎣', '⎧', '⎨', '⎩', '⎪', ' ', '〉', '∫', '⌠', '⎮', '⌡', '⎞', '⎟', '⎠', '⎤', '⎥', '⎦', '⎪', '⎫', '⎬', ' ',),  # noqa
+}  # }}}
+
+# Lower cased names of the fonts that have a table in SYMBOL_MAPS
+SYMBOL_FONT_NAMES = frozenset(n.lower() for n in SYMBOL_MAPS)
+
+
+def is_symbol_font(family):
+    '''
+    Return True when family, compared case-insensitively, names one of the
+    known symbol fonts. Values without a lower() method give False.
+    '''
+    try:
+        lowered = family.lower()
+    except AttributeError:
+        return False
+    return lowered in SYMBOL_FONT_NAMES
+
+
+def do_map(m, points):
+    '''
+    Yield one character per code point in points. A code point p with
+    0xf000 < p < 0xf000 + len(m) is replaced by m[p - 0xf000], every other
+    code point is converted to its own character.
+    '''
+    base = 0xf000
+    size = len(m)
+    for p in points:
+        offset = p - base
+        if 0 < offset < size:
+            yield m[offset]
+        else:
+            yield codepoint_to_chr(p)
+
+
+def map_symbol_text(text, font):
+    '''
+    Translate text written in the symbol font named font into the characters
+    listed for that font in SYMBOL_MAPS.
+
+    The font name is matched exactly first and case-insensitively otherwise,
+    so that every name accepted by is_symbol_font() can be mapped.
+
+    :param text: the text to translate, bytes are decoded as UTF-8
+    :param font: name of the symbol font
+    :raises KeyError: when font does not name a known symbol font
+    '''
+    m = SYMBOL_MAPS.get(font)
+    if m is None:
+        try:
+            wanted = font.lower()
+        except AttributeError:
+            raise KeyError(font)
+        for known in SYMBOL_MAPS:
+            if known.lower() == wanted:
+                m = SYMBOL_MAPS[known]
+                break
+        else:
+            raise KeyError(font)
+    if isinstance(text, bytes):
+        text = text.decode('utf-8')
+    return ''.join(do_map(m, ord_string(text)))
+
+
+class Fonts(object):
+    '''
+    The font families declared by a document.
+
+    Attributes:
+        fonts: family name -> Family, filled when the instance is called
+        used: set of (family name, variant) pairs requested via family_for()
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.fonts = {}
+        self.used = set()
+
+    def __call__(self, root, embed_relationships, docx, dest_dir):
+        # Only root and embed_relationships are used here
+        for elem in self.namespace.XPath('//w:font[@w:name]')(root):
+            self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)
+
+    def family_for(self, name, bold=False, italic=False):
+        '''
+        Return the CSS font-family value for the family name and record the
+        requested variant as used. Unknown families give 'serif', symbol fonts
+        are returned by name only.
+        '''
+        f = self.fonts.get(name, None)
+        if f is None:
+            return 'serif'
+        variant = get_variant(bold, italic)
+        self.used.add((name, variant))
+        # The declared name is kept when the variant is embedded, otherwise
+        # the name chosen from the fonts available on this system is used
+        name = f.name if variant in f.embedded else f.family_name
+        if is_symbol_font(name):
+            return name
+        return '"%s", %s' % (name.replace('"', ''), f.css_generic_family)
+
+    def embed_fonts(self, dest_dir, docx):
+        '''
+        Write every used variant that is embedded in the document into the
+        fonts sub-directory of dest_dir and return the matching @font-face
+        rules as a single string.
+        '''
+        defs = []
+        dest_dir = os.path.join(dest_dir, 'fonts')
+        for name, variant in self.used:
+            f = self.fonts[name]
+            if variant in f.embedded:
+                if not os.path.exists(dest_dir):
+                    os.mkdir(dest_dir)
+                fname = self.write(name, dest_dir, docx, variant)
+                if fname is not None:
+                    d = {'font-family':'"%s"' % name.replace('"', ''), 'src': 'url("fonts/%s")' % fname}
+                    if 'Bold' in variant:
+                        d['font-weight'] = 'bold'
+                    if 'Italic' in variant:
+                        d['font-style'] = 'italic'
+                    d = ['%s: %s' % (k, v) for k, v in iteritems(d)]
+                    d = ';\n\t'.join(d)
+                    defs.append('@font-face {\n\t%s\n}\n' % d)
+        return '\n'.join(defs)
+
+    def write(self, name, dest_dir, docx, variant):
+        '''
+        Write the embedded font for (name, variant) into dest_dir.
+
+        :return: the name of the file written, or None when the data is not
+                 recognized by is_truetype_font()
+        '''
+        f = self.fonts[name]
+        ef = f.embedded[variant]
+        raw = docx.read(ef.name)
+        prefix = raw[:32]
+        if ef.key:
+            # The first 32 bytes are XORed with the bytes of the key, taken
+            # in reverse order
+            key = re.sub(r'[^A-Fa-f0-9]', '', ef.key)
+            key = bytearray(reversed(tuple(int(key[i:i+2], 16) for i in range(0, len(key), 2))))
+            # A key without any hexadecimal digit cannot be applied, the data
+            # is then used as is instead of failing on the modulo below
+            if key:
+                prefix = bytearray(prefix)
+                prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
+        if not is_truetype_font(prefix):
+            return None
+        ext = 'otf' if prefix.startswith(b'OTTO') else 'ttf'
+        fname = ascii_filename('%s - %s.%s' % (name, variant, ext))
+        with open(os.path.join(dest_dir, fname), 'wb') as dest:
+            dest.write(prefix)
+            dest.write(raw[32:])
+
+        return fname
--- a/ebook_converter/ebooks/docx/footnotes.py
+++ b/ebook_converter/ebooks/docx/footnotes.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+from polyglot.builtins import iteritems, unicode_type
+
+
+class Note(object):
+    '''
+    A single note. ``type`` is the w:type of the note element ('normal' when
+    the attribute is missing). Iterating a note yields its w:p and w:tbl
+    descendants as returned by namespace.descendants().
+    '''
+
+    def __init__(self, namespace, parent, rels):
+        self.namespace, self.parent, self.rels = namespace, parent, rels
+        self.type = namespace.get(parent, 'w:type', 'normal')
+
+    def __iter__(self):
+        blocks = self.namespace.descendants(self.parent, 'w:p', 'w:tbl')
+        for block in blocks:
+            yield block
+
+
+class Footnotes(object):
+    '''
+    The footnotes and endnotes of a document.
+
+    Attributes:
+        footnotes, endnotes: note id -> Note, filled when the instance is
+            called
+        notes: anchor -> (number as text, Note) for every note that has been
+            referenced via get_ref(), in order of reference
+        counter: number given to the most recently referenced note
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.footnotes, self.endnotes = {}, {}
+        self.notes = OrderedDict()
+        self.counter = 0
+
+    def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        sources = (
+            (footnotes, './w:footnote[@w:id]', footnotes_rels, self.footnotes),
+            (endnotes, './w:endnote[@w:id]', endnotes_rels, self.endnotes),
+        )
+        for root, expr, rels, store in sources:
+            if root is None:
+                continue
+            for elem in XPath(expr)(root):
+                note_id = get(elem, 'w:id')
+                if note_id:
+                    store[note_id] = Note(self.namespace, elem, rels)
+
+    def get_ref(self, ref):
+        '''
+        Register the note that the reference element ref points to.
+
+        :return: (anchor, number as text) for a note of type 'normal',
+                 (None, None) for unknown notes and notes of any other type
+        '''
+        fid = self.namespace.get(ref, 'w:id')
+        if ref.tag.endswith('}footnoteReference'):
+            pool = self.footnotes
+        else:
+            pool = self.endnotes
+        note = pool.get(fid, None)
+        if note is None or note.type != 'normal':
+            return None, None
+        # Every reference gets a number of its own
+        self.counter += 1
+        label = unicode_type(self.counter)
+        anchor = 'note_%d' % self.counter
+        self.notes[anchor] = (label, note)
+        return anchor, label
+
+    def __iter__(self):
+        for anchor, entry in iteritems(self.notes):
+            number, note = entry
+            yield anchor, number, note
+
+    @property
+    def has_notes(self):
+        return len(self.notes) > 0
--- a/ebook_converter/ebooks/docx/images.py
+++ b/ebook_converter/ebooks/docx/images.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+
+from lxml.html.builder import IMG, HR
+
+from calibre.constants import iswindows
+from calibre.ebooks.docx.names import barename
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.img import resize_to_fit, image_to_data
+from calibre.utils.imghdr import what
+from polyglot.builtins import iteritems, itervalues
+
+
+class LinkedImageNotFound(ValueError):
+    '''
+    Raised when the data of an image cannot be found. The name that was
+    looked for is available as ``fname``.
+    '''
+
+    def __init__(self, fname):
+        super(LinkedImageNotFound, self).__init__(fname)
+        self.fname = fname
+
+
+def image_filename(x):
+    '''
+    Return ascii_filename(x) with spaces and # characters replaced by
+    underscores.
+    '''
+    name = ascii_filename(x)
+    for unwanted in (' ', '#'):
+        name = name.replace(unwanted, '_')
+    return name
+
+
+def emu_to_pt(x):
+    '''Convert a length in EMU to points, at 12700 EMU per point'''
+    emu_per_pt = 12700
+    return x / emu_per_pt
+
+
+def pt_to_emu(x):
+    '''Convert a length in points to EMU, truncated to an integer'''
+    emu = x * 12700
+    return int(emu)
+
+
+def get_image_properties(parent, XPath, get):
+    '''
+    Read the size and description of an image from the wp:extent and
+    wp:docPr children of parent.
+
+    :return: (css, alt, title). css has 'width' and 'height' in points when
+             the extent could be read and 'display' set to 'none' for hidden
+             images. alt and title are None when absent or empty.
+    '''
+    sizes = {}
+    for extent in XPath('./wp:extent')(parent):
+        for css_name, attr in (('width', 'cx'), ('height', 'cy')):
+            try:
+                sizes[css_name] = emu_to_pt(int(extent.get(attr)))
+            except (TypeError, ValueError):
+                # Missing or malformed, keep whatever was read before
+                pass
+    ans = {}
+    for css_name in ('width', 'height'):
+        if css_name in sizes:
+            ans[css_name] = '%.3gpt' % sizes[css_name]
+
+    alt = title = None
+    for doc_pr in XPath('./wp:docPr')(parent):
+        alt = doc_pr.get('descr') or alt
+        title = doc_pr.get('title') or title
+        if doc_pr.get('hidden', None) in {'true', 'on', '1'}:
+            ans['display'] = 'none'
+
+    return ans, alt, title
+
+
+def get_image_margins(elem):
+    '''
+    Return the distances between an image and the surrounding text as CSS
+    padding.
+
+    The distL, distT, distR and distB attributes of elem are read as integer
+    EMU values and converted to points. Attributes that are missing or not
+    integers are left out of the result.
+
+    :return: dict with any of the keys padding-left, padding-top,
+             padding-right and padding-bottom
+    '''
+    ans = {}
+    for side, css in (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom')):
+        val = elem.get('dist%s' % side, None)
+        if val is None:
+            continue
+        try:
+            # Attribute values are text and have to be converted before any
+            # arithmetic is done on them
+            emu = int(val)
+        except (TypeError, ValueError):
+            continue
+        ans['padding-%s' % css] = '%.3gpt' % emu_to_pt(emu)
+    return ans
+
+
+def get_hpos(anchor, page_width, XPath, get, width_frac):
+    '''
+    Return the horizontal position of an anchored image as a fraction of the
+    page width.
+
+    The wp:positionH children of anchor are tried first: a position relative
+    to the left or right margin, then an alignment (left, center or right),
+    then an offset in EMU. The x attribute of a wp:simplePos child is used
+    when none of these gives a result. width_frac is added to every result
+    except an alignment relative to the page.
+
+    NOTE(review): the simplePos attribute of the anchor itself is not looked
+    at here, confirm that the caller only relies on x when it is meant to be
+    used.
+
+    :return: the position, 0 when nothing usable is found
+    '''
+    alignments = {'left':0, 'center':0.5, 'right':1}
+    for ph in XPath('./wp:positionH')(anchor):
+        rp = ph.get('relativeFrom', None)
+        if rp == 'leftMargin':
+            return 0 + width_frac
+        if rp == 'rightMargin':
+            return 1 + width_frac
+        for align in XPath('./wp:align')(ph):
+            al = alignments.get(align.text)
+            if al is not None:
+                if rp == 'page':
+                    return al
+                return al + width_frac
+        for po in XPath('./wp:posOffset')(ph):
+            try:
+                offset = int(po.text)
+            except (TypeError, ValueError):
+                continue
+            return emu_to_pt(offset)/page_width + width_frac
+
+    for sp in XPath('./wp:simplePos')(anchor):
+        try:
+            # Attribute values are text, without the conversion the position
+            # could never be computed
+            x = int(sp.get('x', None))
+        except (TypeError, ValueError):
+            continue
+        return emu_to_pt(x)/page_width + width_frac
+
+    return 0
+
+
+class Images(object):
+
+    def __init__(self, namespace, log):
+        '''
+        :param namespace: provides XPath and get for the document
+        :param log: used to report problems with images
+        '''
+        self.namespace, self.log = namespace, log
+        # Relationship id -> name of the image, replaced when the instance is
+        # called
+        self.rid_map = {}
+        # (image name, max width, max height) -> name of the generated file
+        self.used = {}
+        self.resized = {}
+        self.names, self.all_images = set(), set()
+        self.links = []
+
+    def __call__(self, relationships_by_id):
+        # Use the given mapping to look up relationship ids from now on
+        self.rid_map = relationships_by_id
+
+    def read_image_data(self, fname, base=None):
+        '''
+        Read the data of the image fname and choose a file name for it.
+
+        :param fname: a file:// URL, read from the file system, or the name
+                      of an image read via ``self.docx``
+        :param base: preferred file name, derived from fname when empty
+        :return: (raw data, file name). The extension is the detected image
+                 type, falling back to the extension of the preferred name and
+                 then to jpeg. A preferred name without any extension is kept
+                 whole as the stem of the file name.
+        :raises LinkedImageNotFound: when the image does not exist
+        '''
+        if fname.startswith('file://'):
+            src = fname[len('file://'):]
+            if iswindows and src and src[0] == '/':
+                src = src[1:]
+            if not src or not os.path.exists(src):
+                raise LinkedImageNotFound(src)
+            with open(src, 'rb') as rawsrc:
+                raw = rawsrc.read()
+        else:
+            try:
+                raw = self.docx.read(fname)
+            except KeyError:
+                raise LinkedImageNotFound(fname)
+        base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
+        stem, dot, suffix = base.rpartition('.')
+        if not dot:
+            # No extension at all: rpartition() returns the whole name as the
+            # last part, which must not be mistaken for an extension
+            stem, suffix = base, ''
+        ext = what(None, raw) or suffix or 'jpeg'
+        if ext == 'emf':
+            # For an example, see: https://bugs.launchpad.net/bugs/1224849
+            self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
+            from calibre.utils.wmf.emf import emf_unwrap
+            try:
+                raw = emf_unwrap(raw)
+            except Exception:
+                self.log.exception('Failed to extract embedded raster image from EMF')
+            else:
+                ext = 'png'
+        return raw, (stem or 'image') + '.' + ext
+
+    def unique_name(self, base):
+        '''
+        Return base, or base with -1, -2, ... inserted before its extension,
+        whichever is the first name not already among the generated names.
+        '''
+        taken = frozenset(itervalues(self.used))
+        if base not in taken:
+            return base
+        stem, ext = base.rpartition('.')[0::2]
+        counter = 1
+        while True:
+            candidate = '%s-%d.%s' % (stem, counter, ext)
+            if candidate not in taken:
+                return candidate
+            counter += 1
+
+    def resize_image(self, raw, base, max_width, max_height):
+        '''
+        Fit the image into max_width x max_height via resize_to_fit().
+
+        :return: (data, file name, resized). When the image was resized the
+                 data is re-encoded in the format named by the extension of
+                 base and the size is added to the file name, otherwise raw
+                 and base are returned unchanged.
+        '''
+        resized, img = resize_to_fit(raw, max_width, max_height)
+        if not resized:
+            return raw, base, resized
+        stem, ext = os.path.splitext(base)
+        renamed = stem + '-%dx%d%s' % (max_width, max_height, ext)
+        return image_to_data(img, fmt=ext[1:]), renamed, resized
+
+    def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
+        rid_map = self.rid_map if rid_map is None else rid_map
+        fname = rid_map[rid]
+        key = (fname, max_width, max_height)
+        ans = self.used.get(key)
+        if ans is not None:
+            return ans
+        raw, base = self.read_image_data(fname, base=base)
+        resized = False
+        if max_width is not None and max_height is not None:
+            raw, base, resized = self.resize_image(raw, base, max_width, max_height)
+        name = self.unique_name(base)
+        self.used[key] = name
+        if max_width is not None and max_height is not None and not resized:
+            okey = (fname, None, None)
+            if okey in self.used:
+                return self.used[okey]
+            self.used[okey] = name
+        with open(os.path.join(self.dest_dir, name), 'wb') as f:
+            f.write(raw)
+        self.all_images.add('images/' + name)
+        return name
+
+    def pic_to_img(self, pic, alt, parent, title):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        name = None
+        link = None
+        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
+            link = {'id':get(hl, 'r:id')}
+            tgt = hl.get('tgtFrame', None)
+            if tgt:
+                link['target'] = tgt
+            title = hl.get('tooltip', None)
+            if title:
+                link['title'] = title
+
+        for pr in XPath('descendant::pic:cNvPr')(pic):
+            name = pr.get('name', None)
+            if name:
+                name = image_filename(name)
+            alt = pr.get('descr') or alt
+            for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
+                rid = get(a, 'r:embed')
+                if not rid:
+                    rid = get(a, 'r:link')
+                if rid and rid in self.rid_map:
+                    try:
+                        src = self.generate_filename(rid, name)
+                    except LinkedImageNotFound as err:
+                        self.log.warn('Linked image: %s not found, ignoring' % err.fname)
+                        continue
+                    img = IMG(src='images/%s' % src)
+                    img.set('alt', alt or 'Image')
+                    if title:
+                        img.set('title', title)
+                    if link is not None:
+                        self.links.append((img, link, self.rid_map))
+                    return img
+
+    def drawing_to_html(self, drawing, page):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # First process the inline pictures
+        for inline in XPath('./wp:inline')(drawing):
+            style, alt, title = get_image_properties(inline, XPath, get)
+            for pic in XPath('descendant::pic:pic')(inline):
+                ans = self.pic_to_img(pic, alt, inline, title)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
+                    yield ans
+
+        # Now process the floats
+        for anchor in XPath('./wp:anchor')(drawing):
+            style, alt, title = get_image_properties(anchor, XPath, get)
+            self.get_float_properties(anchor, style, page)
+            for pic in XPath('descendant::pic:pic')(anchor):
+                ans = self.pic_to_img(pic, alt, anchor, title)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
+                    yield ans
+
+    def pict_to_html(self, pict, page):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # First see if we have an <hr>
+        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
+        if is_hr:
+            style = {}
+            hr = HR()
+            try:
+                pct = float(get(pict[0], 'o:hrpct'))
+            except (ValueError, TypeError, AttributeError):
+                pass
+            else:
+                if pct > 0:
+                    style['width'] = '%.3g%%' % pct
+            align = get(pict[0], 'o:hralign', 'center')
+            if align in {'left', 'right'}:
+                style['margin-left'] = '0' if align == 'left' else 'auto'
+                style['margin-right'] = 'auto' if align == 'left' else '0'
+            if style:
+                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in iteritems(style))))
+            yield hr
+
+        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
+            rid = get(imagedata, 'r:id')
+            if rid in self.rid_map:
+                try:
+                    src = self.generate_filename(rid)
+                except LinkedImageNotFound as err:
+                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
+                    continue
+                img = IMG(src='images/%s' % src, style="display:block")
+                alt = get(imagedata, 'o:title')
+                img.set('alt', alt or 'Image')
+                yield img
+
+    def get_float_properties(self, anchor, style, page):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        if 'display' not in style:
+            style['display'] = 'block'
+        padding = get_image_margins(anchor)
+        width = float(style.get('width', '100pt')[:-2])
+
+        page_width = page.width - page.margin_left - page.margin_right
+        if page_width <= 0:
+            # Ignore margins
+            page_width = page.width
+
+        hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))
+
+        wrap_elem = None
+        dofloat = False
+
+        for child in reversed(anchor):
+            bt = barename(child.tag)
+            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
+                wrap_elem = child
+                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
+                break
+
+        if wrap_elem is not None:
+            padding.update(get_image_margins(wrap_elem))
+            wt = wrap_elem.get('wrapText', None)
+            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
+            if dofloat:
+                style['float'] = 'left' if hpos < 0.65 else 'right'
+            else:
+                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
+                if ml is not None:
+                    style['margin-left'] = ml
+                if mr is not None:
+                    style['margin-right'] = mr
+
+        style.update(padding)
+
+    def to_html(self, elem, page, docx, dest_dir):
+        dest = os.path.join(dest_dir, 'images')
+        if not os.path.exists(dest):
+            os.mkdir(dest)
+        self.dest_dir, self.docx = dest, docx
+        if elem.tag.endswith('}drawing'):
+            for tag in self.drawing_to_html(elem, page):
+                yield tag
+        else:
+            for tag in self.pict_to_html(elem, page):
+                yield tag
--- a/ebook_converter/ebooks/docx/index.py
+++ b/ebook_converter/ebooks/docx/index.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from operator import itemgetter
+
+from lxml import etree
+
+from calibre.utils.icu import partition_by_first_letter, sort_key
+from polyglot.builtins import iteritems, filter
+
+
+def get_applicable_xe_fields(index, xe_fields, XPath, expand):
+    iet = index.get('entry-type', None)
+    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
+
+    lr = index.get('letter-range', None)
+    if lr is not None:
+        sl, el = lr.parition('-')[0::2]
+        sl, el = sl.strip(), el.strip()
+        if sl and el:
+            def inrange(text):
+                return sl <= text[0] <= el
+            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]
+
+    bmark = index.get('bookmark', None)
+    if bmark is None:
+        return xe_fields
+    attr = expand('w:name')
+    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
+    ancestors = XPath('ancestor::w:bookmarkStart')
+
+    def contained(xe):
+        # Check if the xe field is contained inside a bookmark with the
+        # specified name
+        return bool(set(ancestors(xe['start_elem'])) & bookmarks)
+
+    return [xe for xe in xe_fields if contained(xe)]
+
+
+def make_block(expand, style, parent, pos):
+    p = parent.makeelement(expand('w:p'))
+    parent.insert(pos, p)
+    if style is not None:
+        ppr = p.makeelement(expand('w:pPr'))
+        p.append(ppr)
+        ps = ppr.makeelement(expand('w:pStyle'))
+        ppr.append(ps)
+        ps.set(expand('w:val'), style)
+    r = p.makeelement(expand('w:r'))
+    p.append(r)
+    t = r.makeelement(expand('w:t'))
+    t.set(expand('xml:space'), 'preserve')
+    r.append(t)
+    return p, t
+
+
+def add_xe(xe, t, expand):
+    run = t.getparent()
+    idx = run.index(t)
+    t.text = xe.get('text') or ' '
+    pt = xe.get('page-number-text', None)
+
+    if pt:
+        p = t.getparent().getparent()
+        r = p.makeelement(expand('w:r'))
+        p.append(r)
+        t2 = r.makeelement(expand('w:t'))
+        t2.set(expand('xml:space'), 'preserve')
+        t2.text = ' [%s]' % pt
+        r.append(t2)
+    # put separate entries on separate lines
+    run.insert(idx + 1, run.makeelement(expand('w:br')))
+    return xe['anchor'], run
+
+
+def process_index(field, index, xe_fields, log, XPath, expand):
+    '''
+    We remove all the word generated index markup and replace it with our own
+    that is more suitable for an ebook.
+    '''
+    styles = []
+    heading_text = index.get('heading', None)
+    heading_style = 'IndexHeading'
+    start_pos = None
+    for elem in field.contents:
+        if elem.tag.endswith('}p'):
+            s = XPath('descendant::pStyle/@w:val')(elem)
+            if s:
+                styles.append(s[0])
+            p = elem.getparent()
+            if start_pos is None:
+                start_pos = (p, p.index(elem))
+            p.remove(elem)
+
+    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
+    if not xe_fields:
+        return [], []
+    if heading_text is not None:
+        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
+        items = []
+        for key, fields in iteritems(groups):
+            items.append(key), items.extend(fields)
+        if styles:
+            heading_style = styles[0]
+    else:
+        items = sorted(xe_fields, key=lambda x:sort_key(x['text']))
+
+    hyperlinks = []
+    blocks = []
+    for item in reversed(items):
+        is_heading = not isinstance(item, dict)
+        style = heading_style if is_heading else None
+        p, t = make_block(expand, style, *start_pos)
+        if is_heading:
+            text = heading_text
+            if text.lower().startswith('a'):
+                text = item + text[1:]
+            t.text = text
+        else:
+            hyperlinks.append(add_xe(item, t, expand))
+            blocks.append(p)
+
+    return hyperlinks, blocks
+
+
+def split_up_block(block, a, text, parts, ldict):
+    prefix = parts[:-1]
+    a.text = parts[-1]
+    parent = a.getparent()
+    style = 'display:block; margin-left: %.3gem'
+    for i, prefix in enumerate(prefix):
+        m = 1.5 * i
+        span = parent.makeelement('span', style=style % m)
+        ldict[span]    = i
+        parent.append(span)
+        span.text = prefix
+    span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
+    parent.append(span)
+    span.append(a)
+    ldict[span]    = len(prefix)
+
+
+"""
+The merge algorithm is a little tricky.
+We start with a list of elementary blocks. Each is an HtmlElement, a p node
+with a list of child nodes. The last child may be a link, and the earlier ones are
+just text.
+The list is in reverse order from what we want in the index.
+There is a dictionary ldict which records the level of each child node.
+
+Now we want to do a reduce-like operation, combining all blocks with the same
+top level index entry into a single block representing the structure of all
+references, subentries, etc. under that top entry.
+Here's the algorithm.
+
+Given a block p and the next block n, and the top level entries p1 and n1 in each
+block, which we assume have the same text:
+
+Start with (p, p1) and (n, n1).
+
+Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
+
+If there are no more levels in n, and we have a link in nk,
+then add the link from nk to the links for pk.
+This might be the first link for pk, or we might get a list of references.
+
+Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
+the same text, it must follow pk, it must come before we find any other p entries at
+the same level as pk, and it must have the same level as nk+1.
+
+If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
+
+If there is no matching entry, then because of the original reversed order we want
+to insert nk+1 and all following entries from n into p immediately following pk.
+"""
+
+
+def find_match(prev_block, pind, nextent, ldict):
+    curlevel = ldict.get(prev_block[pind], -1)
+    if curlevel < 0:
+        return -1
+    for p in range(pind+1, len(prev_block)):
+        trylev = ldict.get(prev_block[p], -1)
+        if trylev <= curlevel:
+            return -1
+        if trylev > (curlevel+1):
+            continue
+        if prev_block[p].text_content() == nextent.text_content():
+            return p
+    return -1
+
+
+def add_link(pent, nent, ldict):
+    na = nent.xpath('descendant::a[1]')
+    # If there is no link, leave it as text
+    if not na or len(na) == 0:
+        return
+    na = na[0]
+    pa = pent.xpath('descendant::a')
+    if pa and len(pa) > 0:
+        # Put on same line with a comma
+        pa = pa[-1]
+        pa.tail = ', '
+        p = pa.getparent()
+        p.insert(p.index(pa) + 1, na)
+    else:
+        # substitute link na for plain text in pent
+        pent.text = ""
+        pent.append(na)
+
+
+def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
+    '''
+    Merge the entries of ``next_block`` into ``prev_block``.
+
+    ``prev_block[pind]`` and ``next_block[nind]`` are entries already known
+    to match. ``next_path`` is the list of levels of the entry in
+    ``next_block`` and ``ldict`` maps entries to their level, as used by
+    find_match().
+    '''
+    # First elements match. Any more in next?
+    if len(next_path) == (nind + 1):
+        # No deeper level left in next: just hand its link over to prev
+        nextent = next_block[nind]
+        add_link(prev_block[pind], nextent, ldict)
+        return
+
+    nind = nind + 1
+    nextent = next_block[nind]
+    prevent = find_match(prev_block, pind, nextent, ldict)
+    # find_match() returns -1 when nothing matches; a real match is always at
+    # an index greater than pind, so greater than zero
+    if prevent > 0:
+        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
+        return
+
+    # Want to insert elements into previous block
+    # NOTE(review): nind is never incremented here, so termination relies on
+    # insert() moving the child out of next_block (see the comment below),
+    # which shrinks len(next_block) on every iteration.
+    while nind < len(next_block):
+        # insert takes it out of old
+        pind = pind + 1
+        prev_block.insert(pind, next_block[nind])
+
+    next_block.getparent().remove(next_block)
+
+
+def polish_index_markup(index, blocks):
+    # Blocks are in reverse order at this point
+    path_map = {}
+    ldict = {}
+    for block in blocks:
+        cls = block.get('class', '') or ''
+        block.set('class', (cls + ' index-entry').lstrip())
+        a = block.xpath('descendant::a[1]')
+        text = ''
+        if a:
+            text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
+        if ':' in text:
+            path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
+            if len(parts) > 1:
+                split_up_block(block, a[0], text, parts, ldict)
+        else:
+            # try using a span all the time
+            path_map[block] = [text]
+            parent = a[0].getparent()
+            span = parent.makeelement('span', style='display:block; margin-left: 0em')
+            parent.append(span)
+            span.append(a[0])
+            ldict[span] = 0
+
+        for br in block.xpath('descendant::br'):
+            br.tail = None
+
+    # We want a single block for each main entry
+    prev_block = blocks[0]
+    for block in blocks[1:]:
+        pp, pn = path_map[prev_block], path_map[block]
+        if pp[0] == pn[0]:
+            merge_blocks(prev_block, block, 0, 0, pn, ldict)
+        else:
+            prev_block = block
--- a/ebook_converter/ebooks/docx/names.py
+++ b/ebook_converter/ebooks/docx/names.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+
+from lxml.etree import XPath as X
+
+from calibre.utils.filenames import ascii_text
+from polyglot.builtins import iteritems
+
+# Names {{{
+# Relationship type URIs, keyed by a short symbolic name, as used by
+# "transitional" documents. DOCXNamespace exposes a copy as its names
+# attribute.
+TRANSITIONAL_NAMES = {
+    'DOCUMENT'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
+    'DOCPROPS'  : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
+    'APPPROPS'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
+    'STYLES'    : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
+    'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
+    'FONTS'     : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
+    'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
+    'IMAGES'    : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
+    'LINKS'     : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
+    'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
+    'ENDNOTES'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
+    'THEMES'    : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
+    'SETTINGS'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
+    'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
+}
+
+# The same names for "strict" documents: the officeDocument/2006 URI prefix
+# is swapped for its purl.oclc.org equivalent. URIs that do not contain that
+# prefix (such as DOCPROPS) are left unchanged by the replace().
+STRICT_NAMES = {
+    k:v.replace('http://schemas.openxmlformats.org/officeDocument/2006',  'http://purl.oclc.org/ooxml/officeDocument')
+    for k, v in iteritems(TRANSITIONAL_NAMES)
+}
+
+# Prefix -> XML namespace URI for "transitional" documents. These are the
+# prefixes usable in XPath expressions and in prefixed names passed to
+# DOCXNamespace.expand(), for example w:p.
+TRANSITIONAL_NAMESPACES = {
+    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
+    'o': 'urn:schemas-microsoft-com:office:office',
+    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
+    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
+    # Text Content
+    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
+    'w10': 'urn:schemas-microsoft-com:office:word',
+    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+    'xml': 'http://www.w3.org/XML/1998/namespace',
+    # Drawing
+    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
+    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
+    'mv': 'urn:schemas-microsoft-com:mac:vml',
+    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
+    'v': 'urn:schemas-microsoft-com:vml',
+    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+    # Properties (core and extended)
+    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
+    'dc': 'http://purl.org/dc/elements/1.1/',
+    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
+    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+    # Content Types
+    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
+    # Package Relationships
+    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
+    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
+    # Dublin Core document properties
+    'dcmitype': 'http://purl.org/dc/dcmitype/',
+    'dcterms': 'http://purl.org/dc/terms/'
+}
+
+# The same prefixes for "strict" documents: the officeDocument,
+# wordprocessingml and drawingml URI prefixes are swapped for their
+# purl.oclc.org equivalents, every other namespace is kept as is.
+STRICT_NAMESPACES = {
+    k:v.replace(
+        'http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument').replace(
+        'http://schemas.openxmlformats.org/wordprocessingml/2006', 'http://purl.oclc.org/ooxml/wordprocessingml').replace(
+        'http://schemas.openxmlformats.org/drawingml/2006', 'http://purl.oclc.org/ooxml/drawingml')
+    for k, v in iteritems(TRANSITIONAL_NAMESPACES)
+}
+# }}}
+
+
+def barename(x):
+    return x.rpartition('}')[-1]
+
+
+def XML(x):
+    return '{%s}%s' % (TRANSITIONAL_NAMESPACES['xml'], x)
+
+
+def generate_anchor(name, existing):
+    x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
+    c = 1
+    while y in existing:
+        y = '%s_%d' % (x, c)
+        c += 1
+    return y
+
+
+class DOCXNamespace(object):
+
+    def __init__(self, transitional=True):
+        self.xpath_cache = {}
+        if transitional:
+            self.namespaces = TRANSITIONAL_NAMESPACES.copy()
+            self.names = TRANSITIONAL_NAMES.copy()
+        else:
+            self.namespaces = STRICT_NAMESPACES.copy()
+            self.names = STRICT_NAMES.copy()
+
+    def XPath(self, expr):
+        ans = self.xpath_cache.get(expr, None)
+        if ans is None:
+            self.xpath_cache[expr] = ans = X(expr, namespaces=self.namespaces)
+        return ans
+
+    def is_tag(self, x, q):
+        tag = getattr(x, 'tag', x)
+        ns, name = q.partition(':')[0::2]
+        return '{%s}%s' % (self.namespaces.get(ns, None), name) == tag
+
+    def expand(self, name, sep=':'):
+        ns, tag = name.partition(sep)[::2]
+        if ns and tag:
+            tag = '{%s}%s' % (self.namespaces[ns], tag)
+        return tag or ns
+
+    def get(self, x, attr, default=None):
+        return x.attrib.get(self.expand(attr), default)
+
+    def ancestor(self, elem, name):
+        try:
+            return self.XPath('ancestor::%s[1]' % name)(elem)[0]
+        except IndexError:
+            return None
+
+    def children(self, elem, *args):
+        return self.XPath('|'.join('child::%s' % a for a in args))(elem)
+
+    def descendants(self, elem, *args):
+        return self.XPath('|'.join('descendant::%s' % a for a in args))(elem)
+
+    def makeelement(self, root, tag, append=True, **attrs):
+        ans = root.makeelement(self.expand(tag), **{self.expand(k, sep='_'):v for k, v in iteritems(attrs)})
+        if append:
+            root.append(ans)
+        return ans
--- a/ebook_converter/ebooks/docx/numbering.py
+++ b/ebook_converter/ebooks/docx/numbering.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re, string
+from collections import Counter, defaultdict
+from functools import partial
+
+from lxml.html.builder import OL, UL, SPAN
+
+from calibre.ebooks.docx.block_styles import ParagraphStyle
+from calibre.ebooks.docx.char_styles import RunStyle, inherit
+from calibre.ebooks.metadata import roman
+from polyglot.builtins import iteritems, unicode_type
+
+# Maps the value of a w:numFmt element to the value used for the CSS
+# list-style-type property. Level.read_from_xml() falls back to 'decimal'
+# for formats that are not listed here.
+STYLE_MAP = {
+    'aiueo': 'hiragana',
+    'aiueoFullWidth': 'hiragana',
+    'hebrew1': 'hebrew',
+    'iroha': 'katakana-iroha',
+    'irohaFullWidth': 'katakana-iroha',
+    'lowerLetter': 'lower-alpha',
+    'lowerRoman': 'lower-roman',
+    'none': 'none',
+    'upperLetter': 'upper-alpha',
+    'upperRoman': 'upper-roman',
+    'chineseCounting': 'cjk-ideographic',
+    'decimalZero': 'decimal-leading-zero',
+}
+
+
+def alphabet(val, lower=True):
+    x = string.ascii_lowercase if lower else string.ascii_uppercase
+    return x[(abs(val - 1)) % len(x)]
+
+
+alphabet_map = {
+    'lower-alpha':alphabet, 'upper-alpha':partial(alphabet, lower=False),
+    'lower-roman':lambda x:roman(x).lower(), 'upper-roman':roman,
+    'decimal-leading-zero': lambda x: '0%d' % x
+}
+
+
+class Level(object):
+
+    def __init__(self, namespace, lvl=None):
+        self.namespace = namespace
+        self.restart = None
+        self.start = 0
+        self.fmt = 'decimal'
+        self.para_link = None
+        self.paragraph_style = self.character_style = None
+        self.is_numbered = False
+        self.num_template = None
+        self.bullet_template = None
+        self.pic_id = None
+
+        if lvl is not None:
+            self.read_from_xml(lvl)
+
+    def copy(self):
+        ans = Level(self.namespace)
+        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
+            setattr(ans, x, getattr(self, x))
+        return ans
+
+    def format_template(self, counter, ilvl, template):
+        def sub(m):
+            x = int(m.group(1)) - 1
+            if x > ilvl or x not in counter:
+                return ''
+            val = counter[x] - (0 if x == ilvl else 1)
+            formatter = alphabet_map.get(self.fmt, lambda x: '%d' % x)
+            return formatter(val)
+        return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
+
+    def read_from_xml(self, lvl, override=False):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
+            try:
+                self.restart = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        for lr in XPath('./w:start[@w:val]')(lvl):
+            try:
+                self.start = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        for rPr in XPath('./w:rPr')(lvl):
+            ps = RunStyle(self.namespace, rPr)
+            if self.character_style is None:
+                self.character_style = ps
+            else:
+                self.character_style.update(ps)
+
+        lt = None
+        for lr in XPath('./w:lvlText[@w:val]')(lvl):
+            lt = get(lr, 'w:val')
+
+        for lr in XPath('./w:numFmt[@w:val]')(lvl):
+            val = get(lr, 'w:val')
+            if val == 'bullet':
+                self.is_numbered = False
+                cs = self.character_style
+                if lt in {'\uf0a7', 'o'} or (
+                    cs is not None and cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
+                    self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+                else:
+                    self.bullet_template = lt
+                for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
+                    self.pic_id = get(lpid, 'w:val')
+            else:
+                self.is_numbered = True
+                self.fmt = STYLE_MAP.get(val, 'decimal')
+                if lt and re.match(r'%\d+\.$', lt) is None:
+                    self.num_template = lt
+
+        for lr in XPath('./w:pStyle[@w:val]')(lvl):
+            self.para_link = get(lr, 'w:val')
+
+        for pPr in XPath('./w:pPr')(lvl):
+            ps = ParagraphStyle(self.namespace, pPr)
+            if self.paragraph_style is None:
+                self.paragraph_style = ps
+            else:
+                self.paragraph_style.update(ps)
+
+    def css(self, images, pic_map, rid_map):
+        ans = {'list-style-type': self.fmt}
+        if self.pic_id:
+            rid = pic_map.get(self.pic_id, None)
+            if rid:
+                try:
+                    fname = images.generate_filename(rid, rid_map=rid_map, max_width=20, max_height=20)
+                except Exception:
+                    fname = None
+                else:
+                    ans['list-style-image'] = 'url("images/%s")' % fname
+        return ans
+
+    def char_css(self):
+        try:
+            css = self.character_style.css
+        except AttributeError:
+            css = {}
+        css.pop('font-family', None)
+        return css
+
+
+class NumberingDefinition(object):
+
+    def __init__(self, namespace, parent=None, an_id=None):
+        self.namespace = namespace
+        XPath, get = self.namespace.XPath, self.namespace.get
+        self.levels = {}
+        self.abstract_numbering_definition_id = an_id
+        if parent is not None:
+            for lvl in XPath('./w:lvl')(parent):
+                try:
+                    ilvl = int(get(lvl, 'w:ilvl', 0))
+                except (TypeError, ValueError):
+                    ilvl = 0
+                self.levels[ilvl] = Level(namespace, lvl)
+
+    def copy(self):
+        ans = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
+        for l, lvl in iteritems(self.levels):
+            ans.levels[l] = lvl.copy()
+        return ans
+
+
+class Numbering(object):
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        # abstractNumId -> NumberingDefinition
+        self.definitions = {}
+        # numId -> NumberingDefinition (a copy of a definition with the
+        # overrides of that w:num applied)
+        self.instances = {}
+        # One Counter per abstract numbering definition id, created on demand
+        self.counters = defaultdict(Counter)
+        # numId -> {level number: start value}
+        self.starts = {}
+        # numPicBulletId -> relationship id of the bullet image
+        self.pic_map = {}
+
+    def __call__(self, root, styles, rid_map):
+        ' Read all numbering style definitions '
+        XPath, get = self.namespace.XPath, self.namespace.get
+        self.rid_map = rid_map
+        for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
+            npbid = get(npb, 'w:numPicBulletId')
+            for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
+                rid = get(idata, 'r:id')
+                self.pic_map[npbid] = rid
+        lazy_load = {}
+        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
+            an_id = get(an, 'w:abstractNumId')
+            nsl = XPath('./w:numStyleLink[@w:val]')(an)
+            if nsl:
+                lazy_load[an_id] = get(nsl[0], 'w:val')
+            else:
+                nd = NumberingDefinition(self.namespace, an, an_id=an_id)
+                self.definitions[an_id] = nd
+
+        def create_instance(n, definition):
+            nd = definition.copy()
+            start_overrides = {}
+            for lo in XPath('./w:lvlOverride')(n):
+                try:
+                    ilvl = int(get(lo, 'w:ilvl'))
+                except (ValueError, TypeError):
+                    ilvl = None
+                for so in XPath('./w:startOverride[@w:val]')(lo):
+                    try:
+                        start_override = int(get(so, 'w:val'))
+                    except (TypeError, ValueError):
+                        pass
+                    else:
+                        start_overrides[ilvl] = start_override
+                for lvl in XPath('./w:lvl')(lo)[:1]:
+                    nilvl = get(lvl, 'w:ilvl')
+                    ilvl = nilvl if ilvl is None else ilvl
+                    alvl = nd.levels.get(ilvl, None)
+                    if alvl is None:
+                        alvl = Level(self.namespace)
+                    alvl.read_from_xml(lvl, override=True)
+            for ilvl, so in iteritems(start_overrides):
+                try:
+                    nd.levels[ilvl].start = start_override
+                except KeyError:
+                    pass
+            return nd
+
+        next_pass = {}
+        for n in XPath('./w:num[@w:numId]')(root):
+            an_id = None
+            num_id = get(n, 'w:numId')
+            for an in XPath('./w:abstractNumId[@w:val]')(n):
+                an_id = get(an, 'w:val')
+            d = self.definitions.get(an_id, None)
+            if d is None:
+                next_pass[num_id] = (an_id, n)
+                continue
+            self.instances[num_id] = create_instance(n, d)
+
+        numbering_links = styles.numbering_style_links
+        for an_id, style_link in iteritems(lazy_load):
+            num_id = numbering_links[style_link]
+            self.definitions[an_id] = self.instances[num_id].copy()
+
+        for num_id, (an_id, n) in iteritems(next_pass):
+            d = self.definitions.get(an_id, None)
+            if d is not None:
+                self.instances[num_id] = create_instance(n, d)
+
+        for num_id, d in iteritems(self.instances):
+            self.starts[num_id] = {lvl:d.levels[lvl].start for lvl in d.levels}
+
+    def get_pstyle(self, num_id, style_id):
+        '''
+        Return the key of the first level of the numbering instance
+        ``num_id`` whose ``para_link`` equals ``style_id``. Returns None when
+        the instance is unknown or no level matches.
+        '''
+        instance = self.instances.get(num_id, None)
+        if instance is None:
+            return None
+        for level_key, level in iteritems(instance.levels):
+            if level.para_link == style_id:
+                return level_key
+        return None
+
+    def get_para_style(self, num_id, lvl):
+        '''
+        Return the ``paragraph_style`` attribute of level ``lvl`` of the
+        numbering instance ``num_id``. Returns None when the instance or the
+        level is unknown, or when the level has no such attribute.
+        '''
+        instance = self.instances.get(num_id, None)
+        if instance is None:
+            return None
+        level = instance.levels.get(lvl, None)
+        return getattr(level, 'paragraph_style', None)
+
+    def update_counter(self, counter, levelnum, levels):
+        '''
+        Increment ``counter[levelnum]`` and reset dependent levels.
+
+        A level is reset to its ``start`` value when its ``restart``
+        attribute equals ``levelnum + 1``, or when ``restart`` is None and
+        the key of the level is ``levelnum + 1``.
+        '''
+        counter[levelnum] += 1
+        child_level = levelnum + 1
+        for ilvl, lvl in iteritems(levels):
+            restart = lvl.restart
+            restarts_here = restart == child_level or (
+                restart is None and ilvl == child_level)
+            if restarts_here:
+                counter[ilvl] = lvl.start
+
+    def apply_markup(self, items, body, styles, object_map, images):
+        '''
+        Convert numbered paragraphs into HTML list markup.
+
+        Works in three passes: the paragraphs in ``items`` are turned into
+        <li> elements annotated with their list identity, runs of sibling
+        <li> elements are wrapped in <ol>/<ul> elements, and finally lists
+        whose items carry a text template are converted into a div based
+        table layout.
+
+        :param items: iterable of ``(p, num_id, ilvl)`` tuples, ``p`` being
+            the element to convert
+        :param body: element below which the generated <li> elements are
+            looked up and grouped
+        :param styles: object providing ``register()`` and ``para_cache``
+        :param object_map: maps the generated elements to the keys used in
+            ``styles.para_cache``
+        :param images: passed through unchanged to ``lvl.css()``
+        '''
+        seen_instances = set()
+        # First pass: tag every paragraph that belongs to a known numbering
+        # level as an <li> and record counter value and list identity on it
+        for p, num_id, ilvl in items:
+            d = self.instances.get(num_id, None)
+            if d is not None:
+                lvl = d.levels.get(ilvl, None)
+                if lvl is not None:
+                    an_id = d.abstract_numbering_definition_id
+                    counter = self.counters[an_id]
+                    # (Re)initialise the counter the first time this level
+                    # or this numbering instance is encountered
+                    if ilvl not in counter or num_id not in seen_instances:
+                        counter[ilvl] = self.starts[num_id][ilvl]
+                    seen_instances.add(num_id)
+                    p.tag = 'li'
+                    p.set('value', '%s' % counter[ilvl])
+                    p.set('list-lvl', unicode_type(ilvl))
+                    p.set('list-id', num_id)
+                    if lvl.num_template is not None:
+                        val = lvl.format_template(counter, ilvl, lvl.num_template)
+                        p.set('list-template', val)
+                    elif lvl.bullet_template is not None:
+                        val = lvl.format_template(counter, ilvl, lvl.bullet_template)
+                        p.set('list-template', val)
+                    self.update_counter(counter, ilvl, d.levels)
+
+        # Longest template text seen so far, keyed by lvlid
+        templates = {}
+
+        def commit(current_run):
+            '''
+            Wrap the <li> elements in ``current_run`` in a single list
+            element inserted at the position of the first item, then empty
+            ``current_run`` in place.
+            '''
+            if not current_run:
+                return
+            start = current_run[0]
+            parent = start.getparent()
+            idx = parent.index(start)
+
+            d = self.instances[start.get('list-id')]
+            ilvl = int(start.get('list-lvl'))
+            lvl = d.levels[ilvl]
+            lvlid = start.get('list-id') + start.get('list-lvl')
+            has_template = 'list-template' in start.attrib
+            # Items with a template always get an <ol> wrapper, it is marked
+            # with lvlid so that the last pass below can find it
+            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
+            if has_template:
+                wrap.set('lvlid', lvlid)
+            else:
+                wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
+            ccss = lvl.char_css()
+            if ccss:
+                ccss = styles.register(ccss, 'bullet')
+            parent.insert(idx, wrap)
+            last_val = None
+            for child in current_run:
+                wrap.append(child)
+                child.tail = '\n\t'
+                if has_template:
+                    # Move the existing content of the item into one span and
+                    # insert a second span holding the template text before it
+                    span = SPAN()
+                    span.text = child.text
+                    child.text = None
+                    for gc in child:
+                        span.append(gc)
+                    child.append(span)
+                    span = SPAN(child.get('list-template'))
+                    if ccss:
+                        span.set('class', ccss)
+                    last = templates.get(lvlid, '')
+                    if span.text and len(span.text) > len(last):
+                        templates[lvlid] = span.text
+                    child.insert(0, span)
+                # The bookkeeping attributes are not wanted in the output
+                for attr in ('list-lvl', 'list-id', 'list-template'):
+                    child.attrib.pop(attr, None)
+                val = int(child.get('value'))
+                # The explicit value is dropped for <ul> wrappers, for items
+                # continuing the previous value by one and for a first item
+                # whose value is 1
+                if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
+                    child.attrib.pop('value')
+                last_val = val
+            current_run[-1].tail = '\n'
+            del current_run[:]
+
+        # Second pass: collect every element that is the parent of an <li>
+        parents = set()
+        for child in body.iterdescendants('li'):
+            parents.add(child.getparent())
+
+        # Consecutive <li> siblings sharing list-id and list-lvl form one run,
+        # any other element or a change of list identity ends the run
+        for parent in parents:
+            current_run = []
+            for child in parent:
+                if child.tag == 'li':
+                    if current_run:
+                        last = current_run[-1]
+                        if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
+                            commit(current_run)
+                    current_run.append(child)
+                else:
+                    commit(current_run)
+            commit(current_run)
+
+        # Convert the list items that use custom text for bullets into tables
+        # so that they display correctly
+        for wrap in body.xpath('//ol[@lvlid]'):
+            wrap.attrib.pop('lvlid')
+            wrap.tag = 'div'
+            wrap.set('style', 'display:table')
+            for i, li in enumerate(wrap.iterchildren('li')):
+                li.tag = 'div'
+                li.attrib.pop('value', None)
+                li.set('style', 'display:table-row')
+                obj = object_map[li]
+                bs = styles.para_cache[obj]
+                # The left margin of the first item becomes the padding of
+                # the whole table, the margin is removed from every item
+                if i == 0:
+                    wrap.set('style', 'display:table; padding-left:%s' %
+                             bs.css.get('margin-left', '0'))
+                bs.css.pop('margin-left', None)
+                for child in li:
+                    child.set('style', 'display:table-cell')
--- a/ebook_converter/ebooks/docx/settings.py
+++ b/ebook_converter/ebooks/docx/settings.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class Settings(object):
+
+    '''
+    Document wide settings. Only the default tab stop is tracked, it is
+    stored as the w:val attribute of w:defaultTabStop divided by 20.
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.default_tab_stop = 720 / 20
+
+    def __call__(self, root):
+        ''' Read the settings from the element tree ``root`` '''
+        ns = self.namespace
+        find_tab_stops = ns.XPath('//w:defaultTabStop[@w:val]')
+        for elem in find_tab_stops(root):
+            try:
+                raw = ns.get(elem, 'w:val')
+                self.default_tab_stop = int(raw) / 20
+            except (ValueError, TypeError, AttributeError):
+                # Unusable value, keep whatever was set before
+                pass
+
--- a/ebook_converter/ebooks/docx/styles.py
+++ b/ebook_converter/ebooks/docx/styles.py
@@ -0,0 +1,504 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import textwrap
+from collections import OrderedDict, Counter
+
+from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
+from calibre.ebooks.docx.char_styles import RunStyle
+from calibre.ebooks.docx.tables import TableStyle
+from polyglot.builtins import iteritems, itervalues
+
+
+class PageProperties(object):
+
+    '''
+    Page size and left/right margins as read from sectPr elements. The
+    defaults are an A4 page with 72pt side margins.
+    '''
+
+    def __init__(self, namespace, elems=()):
+        self.margin_left = self.margin_right = 72  # pts
+        self.width, self.height = 595.28, 841.89  # pts, A4
+
+        def assign(attr, raw):
+            # Only values that twips() can convert replace the defaults
+            converted = twips(raw)
+            if converted is not None:
+                setattr(self, attr, converted)
+
+        for sect in elems:
+            for size in namespace.XPath('./w:pgSz')(sect):
+                page_w = namespace.get(size, 'w:w')
+                page_h = namespace.get(size, 'w:h')
+                assign('width', page_w)
+                assign('height', page_h)
+            for margins in namespace.XPath('./w:pgMar')(sect):
+                left = namespace.get(margins, 'w:left')
+                right = namespace.get(margins, 'w:right')
+                assign('margin_left', left)
+                assign('margin_right', right)
+
+
+class Style(object):
+    '''
+    A single <w:style> element. Depending on its w:type it carries paragraph,
+    character and/or table formatting.
+    '''
+
+    def __init__(self, namespace, elem):
+        get, XPath = namespace.get, namespace.XPath
+        self.namespace = namespace
+        self.name_path = XPath('./w:name[@w:val]')
+        self.based_on_path = XPath('./w:basedOn[@w:val]')
+        self.resolved = False
+        self.style_id = get(elem, 'w:styleId')
+        self.style_type = style_type = get(elem, 'w:type')
+
+        name_elems = self.name_path(elem)
+        # When there are several name elements the last one is used
+        self.name = get(name_elems[-1], 'w:val') if name_elems else None
+
+        parents = self.based_on_path(elem)
+        self.based_on = get(parents[0], 'w:val') if parents else None
+        if style_type == 'numbering':
+            # basedOn is ignored for numbering styles
+            self.based_on = None
+        self.is_default = get(elem, 'w:default') in {'1', 'on', 'true'}
+
+        self.paragraph_style = self.character_style = self.table_style = None
+
+        def merged(current, new):
+            # The first definition is used as is, later ones update it
+            if current is None:
+                return new
+            current.update(new)
+            return current
+
+        if style_type == 'table':
+            for tblPr in XPath('./w:tblPr')(elem):
+                self.table_style = merged(self.table_style, TableStyle(namespace, tblPr))
+
+        if style_type in {'paragraph', 'table'}:
+            for pPr in XPath('./w:pPr')(elem):
+                self.paragraph_style = merged(self.paragraph_style, ParagraphStyle(namespace, pPr))
+
+        if style_type in {'paragraph', 'character', 'table'}:
+            for rPr in XPath('./w:rPr')(elem):
+                self.character_style = merged(self.character_style, RunStyle(namespace, rPr))
+
+        if style_type in {'numbering', 'paragraph'}:
+            # This attribute only exists for numbering and paragraph styles
+            self.numbering_style_link = None
+            for num_id in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
+                self.numbering_style_link = get(num_id, 'w:val')
+
+    def resolve_based_on(self, parent):
+        '''
+        Resolve the table, paragraph and character formatting of this style
+        against the corresponding formatting of ``parent``. Formatting that
+        the parent has but this style lacks is created empty first.
+        '''
+        kinds = (
+            ('table_style', TableStyle),
+            ('paragraph_style', ParagraphStyle),
+            ('character_style', RunStyle),
+        )
+        for attr, factory in kinds:
+            inherited = getattr(parent, attr)
+            if inherited is None:
+                continue
+            own = getattr(self, attr)
+            if own is None:
+                own = factory(self.namespace)
+                setattr(self, attr, own)
+            own.resolve_based_on(inherited)
+
+
+class Styles(object):
+
+    '''
+    Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
+    '''
+
+    def __init__(self, namespace, tables):
+        ''' Create an empty collection, it is filled by calling the instance '''
+        self.namespace, self.tables = namespace, tables
+        # style id -> Style, insertion order is preserved
+        self.id_map = OrderedDict()
+        # style type -> Style flagged as the default for that type
+        self.default_styles = {}
+        self.numbering_style_links = {}
+        self.default_paragraph_style = None
+        self.default_character_style = None
+        # Caches of resolved styles, keyed by paragraph/run element
+        self.para_cache, self.para_char_cache, self.run_cache = {}, {}, {}
+        # Registered CSS classes and the per prefix counters used to name them
+        self.classes = {}
+        self.counter = Counter()
+
+    def __iter__(self):
+        ''' Iterate over all Style objects stored in id_map '''
+        for style in itervalues(self.id_map):
+            yield style
+
+    def __getitem__(self, key):
+        ''' Return the style with id ``key``, raises KeyError if unknown '''
+        known = self.id_map
+        return known[key]
+
+    def __len__(self):
+        ''' Number of styles stored in id_map '''
+        known = self.id_map
+        return len(known)
+
+    def get(self, key, default=None):
+        ''' Return the style with id ``key`` or ``default`` if unknown '''
+        known = self.id_map
+        return known.get(key, default)
+
+    def __call__(self, root, fonts, theme):
+        '''
+        Read the style definitions from ``root`` and resolve inheritance.
+
+        :param root: root element containing the w:style and w:docDefaults
+            elements, or None, in which case nothing is read and only the
+            styles already present in id_map are resolved
+        :param fonts: stored as ``self.fonts``
+        :param theme: stored as ``self.theme``
+
+        Every style ends up with ``resolved`` set to True. Cycles in the
+        basedOn links (including a style based on itself) are broken instead
+        of recursing without bound.
+        '''
+        self.fonts, self.theme = fonts, theme
+        self.default_paragraph_style = self.default_character_style = None
+        if root is not None:
+            XPath = self.namespace.XPath
+            for elem in XPath('//w:style')(root):
+                s = Style(self.namespace, elem)
+                if s.style_id:
+                    self.id_map[s.style_id] = s
+                if s.is_default:
+                    self.default_styles[s.style_type] = s
+                # Not every Style has a numbering_style_link attribute
+                if getattr(s, 'numbering_style_link', None) is not None:
+                    self.numbering_style_links[s.style_id] = s.numbering_style_link
+
+            for dd in XPath('./w:docDefaults')(root):
+                for pd in XPath('./w:pPrDefault')(dd):
+                    for pPr in XPath('./w:pPr')(pd):
+                        ps = ParagraphStyle(self.namespace, pPr)
+                        if self.default_paragraph_style is None:
+                            self.default_paragraph_style = ps
+                        else:
+                            self.default_paragraph_style.update(ps)
+                for rd in XPath('./w:rPrDefault')(dd):
+                    for rPr in XPath('./w:rPr')(rd):
+                        rs = RunStyle(self.namespace, rPr)
+                        if self.default_character_style is None:
+                            self.default_character_style = rs
+                        else:
+                            self.default_character_style.update(rs)
+
+        # Styles whose resolution has started but not yet finished. A parent
+        # found in this set means the basedOn links form a cycle.
+        in_progress = set()
+
+        def resolve(s, p):
+            in_progress.add(s)
+            if p is not None and p not in in_progress:
+                if not p.resolved:
+                    resolve(p, self.get(p.based_on))
+                s.resolve_based_on(p)
+            s.resolved = True
+            in_progress.discard(s)
+
+        for s in self:
+            if not s.resolved:
+                resolve(s, self.get(s.based_on))
+
+    def para_val(self, parent_styles, direct_formatting, attr):
+        '''
+        Return the effective value of the paragraph property ``attr``: the
+        value from ``direct_formatting`` unless it is ``inherit``, otherwise
+        the first non ``inherit`` value found when searching
+        ``parent_styles`` from last to first. ``inherit`` is returned if no
+        style defines the property.
+        '''
+        direct = getattr(direct_formatting, attr)
+        if direct is not inherit:
+            return direct
+        for style in reversed(parent_styles):
+            candidate = getattr(style, attr)
+            if candidate is not inherit:
+                return candidate
+        return direct
+
+    def run_val(self, parent_styles, direct_formatting, attr):
+        '''
+        Return the effective value of the run property ``attr``.
+
+        A value set in ``direct_formatting`` always wins. Toggle properties
+        are otherwise combined over all parent styles, other properties take
+        the first non ``inherit`` value found when searching
+        ``parent_styles`` from last to first.
+        '''
+        direct = getattr(direct_formatting, attr)
+        if direct is not inherit:
+            return direct
+
+        if attr not in direct_formatting.toggle_properties:
+            for style in reversed(parent_styles):
+                candidate = getattr(style, attr)
+                if candidate is not inherit:
+                    return candidate
+            return direct
+
+        # The spec (section 17.7.3) does not make sense, so we follow the behavior
+        # of Word, which seems to only consider the document default if the
+        # property has not been defined in any styles.
+        toggles = []
+        for style in parent_styles:
+            if style is self.default_character_style:
+                continue
+            if getattr(style, attr) is inherit:
+                continue
+            toggles.append(int(getattr(style, attr)))
+        if toggles:
+            # An odd number of styles turning the property on leaves it on
+            return sum(toggles) % 2 == 1
+        if self.default_character_style is not None:
+            return getattr(self.default_character_style, attr) is True
+        return False
+
+    def resolve_paragraph(self, p):
+        '''
+        Return the ParagraphStyle that applies to the paragraph element
+        ``p``. The style is computed on first use and cached in para_cache.
+
+        As side effects the character style belonging to the paragraph style
+        is stored in para_char_cache and numbered paragraphs get a
+        ``calibre_num_id`` attribute of the form ``level:num_id``.
+        '''
+        ans = self.para_cache.get(p, None)
+        if ans is None:
+            linked_style = None
+            # The still empty result is cached before it is filled in
+            ans = self.para_cache[p] = ParagraphStyle(self.namespace)
+            ans.style_name = None
+            direct_formatting = None
+            is_section_break = False
+            # Merge all pPr children into the direct formatting
+            for pPr in self.namespace.XPath('./w:pPr')(p):
+                ps = ParagraphStyle(self.namespace, pPr)
+                if direct_formatting is None:
+                    direct_formatting = ps
+                else:
+                    direct_formatting.update(ps)
+                if self.namespace.XPath('./w:sectPr')(pPr):
+                    is_section_break = True
+
+            if direct_formatting is None:
+                direct_formatting = ParagraphStyle(self.namespace)
+            # parent_styles is ordered from lowest to highest priority,
+            # para_val() searches it from the end
+            parent_styles = []
+            if self.default_paragraph_style is not None:
+                parent_styles.append(self.default_paragraph_style)
+            ts = self.tables.para_style(p)
+            if ts is not None:
+                parent_styles.append(ts)
+
+            default_para = self.default_styles.get('paragraph', None)
+            if direct_formatting.linked_style is not None:
+                ls = linked_style = self.get(direct_formatting.linked_style)
+                if ls is not None:
+                    ans.style_name = ls.name
+                    ps = ls.paragraph_style
+                    if ps is not None:
+                        parent_styles.append(ps)
+                    # Remembered for resolve_run(), which reads para_char_cache
+                    if ls.character_style is not None:
+                        self.para_char_cache[p] = ls.character_style
+            elif default_para is not None:
+                # No explicit style, fall back to the default paragraph style
+                if default_para.paragraph_style is not None:
+                    parent_styles.append(default_para.paragraph_style)
+                if default_para.character_style is not None:
+                    self.para_char_cache[p] = default_para.character_style
+
+            def has_numbering(block_style):
+                ''' True if block_style has both a numbering id and a numbering level '''
+                num_id, lvl = getattr(block_style, 'numbering_id', inherit), getattr(block_style, 'numbering_level', inherit)
+                return num_id is not None and num_id is not inherit and lvl is not None and lvl is not inherit
+
+            is_numbering = has_numbering(direct_formatting)
+            # Only a paragraph without any runs counts as a section break
+            is_section_break = is_section_break and not self.namespace.XPath('./w:r')(p)
+
+            # Numbering set directly on the paragraph. self.numbering is
+            # assigned by resolve_numbering().
+            if is_numbering and not is_section_break:
+                num_id, lvl = direct_formatting.numbering_id, direct_formatting.numbering_level
+                p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
+                ps = self.numbering.get_para_style(num_id, lvl)
+                if ps is not None:
+                    parent_styles.append(ps)
+            # Otherwise use the numbering of the linked paragraph style, if any
+            if (
+                not is_numbering and not is_section_break and linked_style is not None and has_numbering(linked_style.paragraph_style)
+            ):
+                num_id, lvl = linked_style.paragraph_style.numbering_id, linked_style.paragraph_style.numbering_level
+                p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
+                is_numbering = True
+                ps = self.numbering.get_para_style(num_id, lvl)
+                if ps is not None:
+                    parent_styles.append(ps)
+
+            for attr in ans.all_properties:
+                if not (is_numbering and attr == 'text_indent'):  # skip text-indent for lists
+                    setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
+            ans.linked_style = direct_formatting.linked_style
+        return ans
+
+    def resolve_run(self, r):
+        '''
+        Return the RunStyle that applies to the run element ``r``. The style
+        is computed on first use and cached in run_cache.
+        '''
+        cached = self.run_cache.get(r, None)
+        if cached is not None:
+            return cached
+
+        namespace = self.namespace
+        paragraphs = namespace.XPath('ancestor::w:p[1]')(r)
+        para = paragraphs[0] if paragraphs else None
+        # The still empty result is cached before it is filled in
+        style = self.run_cache[r] = RunStyle(namespace)
+
+        # Merge all rPr children into the direct formatting
+        direct = None
+        for rPr in namespace.XPath('./w:rPr')(r):
+            override = RunStyle(namespace, rPr)
+            if direct is None:
+                direct = override
+            else:
+                direct.update(override)
+        if direct is None:
+            direct = RunStyle(namespace)
+
+        default_char = self.default_styles.get('character', None)
+        para_char = self.para_char_cache.get(para, None)
+        # As best as I can understand the spec, table overrides should be
+        # applied before paragraph overrides, but word does it
+        # this way, see the December 2007 table header in the demo
+        # document.
+        table_run = self.tables.run_style(para)
+        chain = [
+            candidate for candidate in (self.default_character_style, para_char, table_run)
+            if candidate is not None
+        ]
+
+        link = direct.linked_style
+        if link is not None:
+            linked_char = getattr(self.get(link), 'character_style', None)
+            if linked_char is not None:
+                chain.append(linked_char)
+        elif default_char is not None and default_char.character_style is not None:
+            chain.append(default_char.character_style)
+
+        for attr in style.all_properties:
+            setattr(style, attr, self.run_val(chain, direct, attr))
+
+        if style.font_family is not inherit:
+            family = self.theme.resolve_font_family(style.font_family)
+            style.font_family = self.fonts.family_for(family, style.b, style.i)
+
+        return style
+
+    def resolve(self, obj):
+        '''
+        Return the resolved style for ``obj``, dispatching on the end of its
+        tag: ``}p`` to resolve_paragraph() and ``}r`` to resolve_run().
+        Returns None for any other tag.
+        '''
+        dispatch = (('}p', self.resolve_paragraph), ('}r', self.resolve_run))
+        for suffix, resolver in dispatch:
+            if obj.tag.endswith(suffix):
+                return resolver(obj)
+        return None
+
+    def cascade(self, layers):
+        '''
+        Move formatting up the hierarchy so that less CSS is needed.
+
+        Properties that have the same value in every run of a paragraph are
+        moved to the paragraph style, afterwards the most common font
+        family, font size and color of all paragraphs become the body
+        defaults (body_font_family, body_font_size and body_color).
+
+        :param layers: mapping of paragraph element to its run elements
+        '''
+        self.body_font_family = 'serif'
+        self.body_font_size = '10pt'
+        self.body_color = 'black'
+
+        def promote_property(char_styles, block_style, prop):
+            ''' Move prop to block_style if all char_styles agree on its value '''
+            vals = {getattr(s, prop) for s in char_styles}
+            if len(vals) == 1:
+                # All the character styles have the same value
+                for s in char_styles:
+                    setattr(s, prop, inherit)
+                setattr(block_style, prop, next(iter(vals)))
+
+        for p, runs in iteritems(layers):
+            has_links = '1' in {r.get('is-link', None) for r in runs}
+            char_styles = [self.resolve_run(r) for r in runs]
+            block_style = self.resolve_paragraph(p)
+            for prop in ('font_family', 'font_size', 'cs_font_family', 'cs_font_size', 'color'):
+                if has_links and prop == 'color':
+                    # We cannot promote color as browser rendering engines will
+                    # override the link color setting it to blue, unless the
+                    # color is specified on the link element itself
+                    continue
+                promote_property(char_styles, block_style, prop)
+            for s in char_styles:
+                if s.text_decoration == 'none':
+                    # The default text decoration is 'none'
+                    s.text_decoration = inherit
+
+        def promote_most_common(block_styles, prop, default):
+            '''
+            Return the most common non inherit value of prop among
+            block_styles, or None if no style defines it. Styles having that
+            value are reset to inherit, styles that inherit are set to
+            default when default differs from the most common value.
+            '''
+            c = Counter()
+            for s in block_styles:
+                val = getattr(s, prop)
+                if val is not inherit:
+                    c[val] += 1
+            val = None
+            if c:
+                val = c.most_common(1)[0][0]
+                for s in block_styles:
+                    oval = getattr(s, prop)
+                    if oval is inherit:
+                        if default != val:
+                            setattr(s, prop, default)
+                    elif oval == val:
+                        setattr(s, prop, inherit)
+            return val
+
+        block_styles = tuple(self.resolve_paragraph(p) for p in layers)
+
+        ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
+        if ff is not None:
+            self.body_font_family = ff
+
+        # The default passed here is the number parsed from the first two
+        # characters of body_font_size, which is '10pt' at this point
+        fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
+        if fs is not None:
+            self.body_font_size = '%.3gpt' % fs
+
+        color = promote_most_common(block_styles, 'color', self.body_color)
+        if color is not None:
+            self.body_color = color
+
+    def resolve_numbering(self, numbering):
+        '''
+        Store ``numbering`` as ``self.numbering`` and fix up the numbering
+        level of every paragraph style that has a numbering id.
+        '''
+        # When a numPr element appears inside a paragraph style, the lvl info
+        # must be discarded and pStyle used instead.
+        self.numbering = numbering
+        for style in self:
+            para_style = style.paragraph_style
+            if para_style is None or para_style.numbering_id is inherit:
+                continue
+            level = numbering.get_pstyle(para_style.numbering_id, style.style_id)
+            if level is None:
+                # No level links back to this style, drop the numbering
+                para_style.numbering_id = para_style.numbering_level = inherit
+            else:
+                para_style.numbering_level = level
+
+    def apply_contextual_spacing(self, paras):
+        '''
+        Remove the space between adjacent paragraphs in ``paras`` that share
+        the same (not None) linked style: the bottom margin of the first and
+        the top margin of the second are set to 0, each only if the
+        paragraph itself has contextualSpacing set to True.
+        '''
+        previous = None
+        for current in paras:
+            above_para, previous = previous, current
+            if above_para is None:
+                continue
+            above = self.resolve_paragraph(above_para)
+            below = self.resolve_paragraph(current)
+            if above.linked_style is not None and above.linked_style == below.linked_style:
+                if above.contextualSpacing is True:
+                    above.margin_bottom = 0
+                if below.contextualSpacing is True:
+                    below.margin_top = 0
+
+    def apply_section_page_breaks(self, paras):
+        ''' Set pageBreakBefore on the resolved style of every paragraph in ``paras`` '''
+        for para in paras:
+            self.resolve_paragraph(para).pageBreakBefore = True
+
+    def register(self, css, prefix):
+        '''
+        Return the class name for the ``css`` mapping. A new name of the
+        form ``prefix_N`` is created the first time a given set of
+        declarations is seen, N being a counter kept per prefix.
+        '''
+        key = hash(frozenset(iteritems(css)))
+        name = self.classes.get(key, (None, None))[0]
+        if name is not None:
+            return name
+        self.counter[prefix] += 1
+        name = '%s_%d' % (prefix, self.counter[prefix])
+        self.classes[key] = (name, css)
+        return name
+
+    def generate_classes(self):
+        '''
+        Register a class for the non empty css of every cached style:
+        paragraph styles with the prefix ``block``, then run styles with the
+        prefix ``text``.
+        '''
+        for cache, prefix in ((self.para_cache, 'block'), (self.run_cache, 'text')):
+            for style in itervalues(cache):
+                declarations = style.css
+                if declarations:
+                    self.register(declarations, prefix)
+
+    def class_name(self, css):
+        ''' Return the class name registered for ``css`` or None if there is none '''
+        key = hash(frozenset(iteritems(css)))
+        entry = self.classes.get(key, (None, None))
+        return entry[0]
+
+    def generate_css(self, dest_dir, docx, notes_nopb, nosupsub):
+        '''
+        Return the complete stylesheet as a string: the output of
+        ``fonts.embed_fonts()`` (if any), the fixed base rules and one rule
+        per registered class, sorted by class name.
+
+        :param dest_dir: passed to ``fonts.embed_fonts()``
+        :param docx: passed to ``fonts.embed_fonts()``
+        :param notes_nopb: if false, page break rules for footnotes are added
+        :param nosupsub: if true, vertical-align rules for sup/sub are added
+        '''
+        embedded = self.fonts.embed_fonts(dest_dir, docx)
+
+        fragments = ['''\
+            body { font-family: %s; font-size: %s; color: %s }
+
+            /* In word all paragraphs have zero margins unless explicitly specified in a style */
+            p, h1, h2, h3, h4, h5, h6, div { margin: 0; padding: 0 }
+            /* In word headings only have bold font if explicitly specified,
+                similarly the font size is the body font size, unless explicitly set. */
+            h1, h2, h3, h4, h5, h6 { font-weight: normal; font-size: 1rem }
+            /* Setting padding-left to zero breaks rendering of lists, so we only set the other values to zero and leave padding-left for the user-agent */
+            ul, ol { margin: 0; padding-top: 0; padding-bottom: 0; padding-right: 0 }
+
+            /* The word hyperlink styling will set text-decoration to underline if needed */
+            a { text-decoration: none }
+
+            sup.noteref a { text-decoration: none }
+
+            h1.notes-header { page-break-before: always }
+
+            dl.footnote dt { font-size: large }
+
+            dl.footnote dt a { text-decoration: none }
+
+            ''']
+
+        if not notes_nopb:
+            fragments.append('''\
+            dl.footnote { page-break-after: always }
+            dl.footnote:last-of-type { page-break-after: avoid }
+            ''')
+
+        fragments.append('''\
+            span.tab { white-space: pre }
+
+            p.index-entry { text-indent: 0pt; }
+            p.index-entry a:visited { color: blue }
+            p.index-entry a:hover { color: red }
+            ''')
+
+        if nosupsub:
+            fragments.append('''\
+               sup { vertical-align: top }
+               sub { vertical-align: bottom }
+               ''')
+
+        template = ''.join(fragments)
+        body_values = (self.body_font_family, self.body_font_size, self.body_color)
+        prefix = textwrap.dedent(template) % body_values
+        if embedded:
+            prefix = embedded + '\n' + prefix
+
+        rules = []
+        for cls, css in sorted(itervalues(self.classes), key=lambda entry: entry[0]):
+            declarations = '\n'.join('\t%s: %s;' % (name, value) for name, value in iteritems(css))
+            rules.append('.%s {\n%s\n}\n' % (cls, declarations.rstrip(';')))
+        return prefix + '\n' + '\n'.join(rules)
--- a/ebook_converter/ebooks/docx/tables.py
+++ b/ebook_converter/ebooks/docx/tables.py
@@ -0,0 +1,700 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from lxml.html.builder import TABLE, TR, TD
+
+from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle, border_to_css
+from calibre.ebooks.docx.char_styles import RunStyle
+from polyglot.builtins import filter, iteritems, itervalues, range, unicode_type
+
+# Read from XML {{{
+# Alias the shading reader imported from block_styles so that it is
+# reachable under the name 'read_shd' by the globals()['read_%s' % name]
+# lookups performed in the style classes below.
+read_shd = rs
+# The four sides of a table or cell box, in the order they are processed.
+edges = ('left', 'top', 'right', 'bottom')
+
+
+def _read_width(elem, get):
+    ans = inherit
+    try:
+        w = int(get(elem, 'w:w'))
+    except (TypeError, ValueError):
+        w = 0
+    typ = get(elem, 'w:type', 'auto')
+    if typ == 'nil':
+        ans = '0'
+    elif typ == 'auto':
+        ans = 'auto'
+    elif typ == 'dxa':
+        ans = '%.3gpt' % (w/20)
+    elif typ == 'pct':
+        ans = '%.3g%%' % (w/50)
+    return ans
+
+
+def read_width(parent, dest, XPath, get):
+    ans = inherit
+    for tblW in XPath('./w:tblW')(parent):
+        ans = _read_width(tblW, get)
+    setattr(dest, 'width', ans)
+
+
+def read_cell_width(parent, dest, XPath, get):
+    ans = inherit
+    for tblW in XPath('./w:tcW')(parent):
+        ans = _read_width(tblW, get)
+    setattr(dest, 'width', ans)
+
+
+def read_padding(parent, dest, XPath, get):
+    name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
+    ans = {x:inherit for x in edges}
+    for mar in XPath('./w:%s' % name)(parent):
+        for x in edges:
+            for edge in XPath('./w:%s' % x)(mar):
+                ans[x] = _read_width(edge, get)
+    for x in edges:
+        setattr(dest, 'cell_padding_%s' % x, ans[x])
+
+
+def read_justification(parent, dest, XPath, get):
+    left = right = inherit
+    for jc in XPath('./w:jc[@w:val]')(parent):
+        val = get(jc, 'w:val')
+        if not val:
+            continue
+        if val == 'left':
+            right = 'auto'
+        elif val == 'right':
+            left = 'auto'
+        elif val == 'center':
+            left = right = 'auto'
+    setattr(dest, 'margin_left', left)
+    setattr(dest, 'margin_right', right)
+
+
+def read_spacing(parent, dest, XPath, get):
+    ans = inherit
+    for cs in XPath('./w:tblCellSpacing')(parent):
+        ans = _read_width(cs, get)
+    setattr(dest, 'spacing', ans)
+
+
+def read_float(parent, dest, XPath, get):
+    ans = inherit
+    for x in XPath('./w:tblpPr')(parent):
+        ans = {k.rpartition('}')[-1]: v for k, v in iteritems(x.attrib)}
+    setattr(dest, 'float', ans)
+
+
+def read_indent(parent, dest, XPath, get):
+    ans = inherit
+    for cs in XPath('./w:tblInd')(parent):
+        ans = _read_width(cs, get)
+    setattr(dest, 'indent', ans)
+
+
+# Border positions handled for tables and cells: the four outer edges plus
+# the inner horizontal (insideH) and inner vertical (insideV) borders.
+border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
+
+
+def read_borders(parent, dest, XPath, get):
+    name = 'tblBorders' if parent.tag.endswith('}tblPr') else 'tcBorders'
+    read_border(parent, dest, XPath, get, border_edges, name)
+
+
+def read_height(parent, dest, XPath, get):
+    ans = inherit
+    for rh in XPath('./w:trHeight')(parent):
+        rule = get(rh, 'w:hRule', 'auto')
+        if rule in {'auto', 'atLeast', 'exact'}:
+            val = get(rh, 'w:val')
+            ans = (rule, val)
+    setattr(dest, 'height', ans)
+
+
+def read_vertical_align(parent, dest, XPath, get):
+    ans = inherit
+    for va in XPath('./w:vAlign')(parent):
+        val = get(va, 'w:val')
+        ans = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}.get(val, 'middle')
+    setattr(dest, 'vertical_align', ans)
+
+
+def read_col_span(parent, dest, XPath, get):
+    ans = inherit
+    for gs in XPath('./w:gridSpan')(parent):
+        try:
+            ans = int(get(gs, 'w:val'))
+        except (TypeError, ValueError):
+            continue
+    setattr(dest, 'col_span', ans)
+
+
+def read_merge(parent, dest, XPath, get):
+    for x in ('hMerge', 'vMerge'):
+        ans = inherit
+        for m in XPath('./w:%s' % x)(parent):
+            ans = get(m, 'w:val', 'continue')
+        setattr(dest, x, ans)
+
+
+def read_band_size(parent, dest, XPath, get):
+    for x in ('Col', 'Row'):
+        ans = 1
+        for y in XPath('./w:tblStyle%sBandSize' % x)(parent):
+            try:
+                ans = int(get(y, 'w:val'))
+            except (TypeError, ValueError):
+                continue
+        setattr(dest, '%s_band_size' % x.lower(), ans)
+
+
+def read_look(parent, dest, XPath, get):
+    ans = 0
+    for x in XPath('./w:tblLook')(parent):
+        try:
+            ans = int(get(x, 'w:val'), 16)
+        except (ValueError, TypeError):
+            continue
+    setattr(dest, 'look', ans)
+
+# }}}
+
+
+def clone(style):
+    if style is None:
+        return None
+    try:
+        ans = type(style)(style.namespace)
+    except TypeError:
+        return None
+    ans.update(style)
+    return ans
+
+
+class Style(object):
+
+    is_bidi = False
+
+    def update(self, other):
+        for prop in self.all_properties:
+            nval = getattr(other, prop)
+            if nval is not inherit:
+                setattr(self, prop, nval)
+
+    def apply_bidi(self):
+        self.is_bidi = True
+
+    def convert_spacing(self):
+        ans = {}
+        if self.spacing is not inherit:
+            if self.spacing in {'auto', '0'}:
+                ans['border-collapse'] = 'collapse'
+            else:
+                ans['border-collapse'] = 'separate'
+                ans['border-spacing'] = self.spacing
+        return ans
+
+    def convert_border(self):
+        c = {}
+        for x in edges:
+            border_to_css(x, self, c)
+            val = getattr(self, 'padding_%s' % x)
+            if val is not inherit:
+                c['padding-%s' % x] = '%.3gpt' % val
+        if self.is_bidi:
+            for a in ('padding-%s', 'border-%s-style', 'border-%s-color', 'border-%s-width'):
+                l, r = c.get(a % 'left'), c.get(a % 'right')
+                if l is not None:
+                    c[a % 'right'] = l
+                if r is not None:
+                    c[a % 'left'] = r
+        return c
+
+
+class RowStyle(Style):
+
+    all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)
+
+    def __init__(self, namespace, trPr=None):
+        self.namespace = namespace
+        if trPr is None:
+            for p in self.all_properties:
+                setattr(self, p, inherit)
+        else:
+            for p in ('hidden', 'cantSplit'):
+                setattr(self, p, binary_property(trPr, p, namespace.XPath, namespace.get))
+            for p in ('spacing', 'height'):
+                f = globals()['read_%s' % p]
+                f(trPr, self, namespace.XPath, namespace.get)
+        self._css = None
+
+    @property
+    def css(self):
+        if self._css is None:
+            c = self._css = {}
+            if self.hidden is True:
+                c['display'] = 'none'
+            if self.cantSplit is True:
+                c['page-break-inside'] = 'avoid'
+            if self.height is not inherit:
+                rule, val = self.height
+                if rule != 'auto':
+                    try:
+                        c['min-height' if rule == 'atLeast' else 'height'] = '%.3gpt' % (int(val)/20)
+                    except (ValueError, TypeError):
+                        pass
+            c.update(self.convert_spacing())
+        return self._css
+
+
+class CellStyle(Style):
+
+    all_properties = ('background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
+        'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
+    ) + tuple(k % edge for edge in border_edges for k in border_props)
+
+    def __init__(self, namespace, tcPr=None):
+        self.namespace = namespace
+        if tcPr is None:
+            for p in self.all_properties:
+                setattr(self, p, inherit)
+        else:
+            for x in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'):
+                f = globals()['read_%s' % x]
+                f(tcPr, self, namespace.XPath, namespace.get)
+            self.row_span = inherit
+        self._css = None
+
+    @property
+    def css(self):
+        if self._css is None:
+            self._css = c = {}
+            if self.background_color is not inherit:
+                c['background-color'] = self.background_color
+            if self.width not in (inherit, 'auto'):
+                c['width'] = self.width
+            c['vertical-align'] = 'top' if self.vertical_align is inherit else self.vertical_align
+            for x in edges:
+                val = getattr(self, 'cell_padding_%s' % x)
+                if val not in (inherit, 'auto'):
+                    c['padding-%s' % x] =  val
+                elif val is inherit and x in {'left', 'right'}:
+                    c['padding-%s' % x] = '%.3gpt' % (115/20)
+            # In Word, tables are apparently rendered with some default top and
+            # bottom padding irrespective of the cellMargin values. Simulate
+            # that here.
+            for x in ('top', 'bottom'):
+                if c.get('padding-%s' % x, '0pt') == '0pt':
+                    c['padding-%s' % x] = '0.5ex'
+            c.update(self.convert_border())
+
+        return self._css
+
+
+class TableStyle(Style):
+    '''
+    Style of a whole table, optionally read from a ``tblPr`` element.
+
+    Without ``tblPr`` every property starts out as ``inherit``. When the
+    ``tblPr`` element is a child of a ``w:style`` element, the conditional
+    formatting found in the ``w:tblStylePr`` children of that style is
+    collected in ``overrides``.
+    '''
+
+    all_properties = (
+        'width', 'float', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
+        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
+        'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look', 'bidi',
+    ) + tuple(k % edge for edge in border_edges for k in border_props)
+
+    def __init__(self, namespace, tblPr=None):
+        self.namespace = namespace
+        if tblPr is None:
+            for p in self.all_properties:
+                setattr(self, p, inherit)
+        else:
+            self.overrides = inherit
+            self.bidi = binary_property(tblPr, 'bidiVisual', namespace.XPath, namespace.get)
+            # Each name selects the module level read_<name> function
+            for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
+                f = globals()['read_%s' % x]
+                f(tblPr, self, self.namespace.XPath, self.namespace.get)
+            parent = tblPr.getparent()
+            if self.namespace.is_tag(parent, 'w:style'):
+                # overrides maps the w:type of each tblStylePr to a dict
+                # with the optional keys 'table', 'row', 'cell', 'para' and
+                # 'run', holding the corresponding style objects
+                self.overrides = {}
+                for tblStylePr in self.namespace.XPath('./w:tblStylePr[@w:type]')(parent):
+                    otype = self.namespace.get(tblStylePr, 'w:type')
+                    orides = self.overrides[otype] = {}
+                    # NOTE: this loop rebinds the tblPr parameter, which is
+                    # not used again after this point
+                    for tblPr in self.namespace.XPath('./w:tblPr')(tblStylePr):
+                        orides['table'] = TableStyle(self.namespace, tblPr)
+                    for trPr in self.namespace.XPath('./w:trPr')(tblStylePr):
+                        orides['row'] = RowStyle(self.namespace, trPr)
+                    for tcPr in self.namespace.XPath('./w:tcPr')(tblStylePr):
+                        orides['cell'] = CellStyle(self.namespace, tcPr)
+                    for pPr in self.namespace.XPath('./w:pPr')(tblStylePr):
+                        orides['para'] = ParagraphStyle(self.namespace, pPr)
+                    for rPr in self.namespace.XPath('./w:rPr')(tblStylePr):
+                        orides['run'] = RunStyle(self.namespace, rPr)
+        self._css = None
+
+    def resolve_based_on(self, parent):
+        '''Fill every property that is still ``inherit`` from ``parent``.'''
+        for p in self.all_properties:
+            val = getattr(self, p)
+            if val is inherit:
+                setattr(self, p, getattr(parent, p))
+
+    @property
+    def css(self):
+        '''
+        The CSS properties of this table as a dict, computed once.
+
+        For floating tables this reads ``self.page``, which is not set by
+        this class; Table.apply_markup() assigns it before using ``css``.
+        '''
+        if self._css is None:
+            c = self._css = {}
+            if self.width not in (inherit, 'auto'):
+                c['width'] = self.width
+            for x in ('background_color', 'margin_left', 'margin_right'):
+                val = getattr(self, x)
+                if val is not inherit:
+                    c[x.replace('_', '-')] = val
+            # The indent replaces margin-left, unless the justification
+            # made the left margin auto
+            if self.indent not in (inherit, 'auto') and self.margin_left != 'auto':
+                c['margin-left'] = self.indent
+            if self.float is not inherit:
+                # Floating tables: the distances from the surrounding text
+                # replace all four margins set above (value/20 as pt,
+                # '0' when missing or unparseable)
+                for x in ('left', 'top', 'right', 'bottom'):
+                    val = self.float.get('%sFromText' % x, 0)
+                    try:
+                        val = '%.3gpt' % (int(val) / 20)
+                    except (ValueError, TypeError):
+                        val = '0'
+                    c['margin-%s' % x] = val
+                if 'tblpXSpec' in self.float:
+                    c['float'] = 'right' if self.float['tblpXSpec'] in {'right', 'outside'} else 'left'
+                else:
+                    # No relative position given: choose the float side
+                    # from the absolute x position (tblpX) relative to the
+                    # page width without its left and right margins
+                    page = self.page
+                    page_width = page.width - page.margin_left - page.margin_right
+                    try:
+                        x = int(self.float['tblpX']) / 20
+                    except (KeyError, ValueError, TypeError):
+                        x = 0
+                    c['float'] = 'left' if (x/page_width) < 0.65 else 'right'
+            c.update(self.convert_spacing())
+            if 'border-collapse' not in c:
+                c['border-collapse'] = 'collapse'
+            c.update(self.convert_border())
+
+        return self._css
+
+
+class Table(object):
+
+    def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
+        self.namespace = namespace
+        self.tbl = tbl
+        self.styles = styles
+        self.is_sub_table = is_sub_table
+
+        # Read Table Style
+        style = {'table':TableStyle(self.namespace)}
+        for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
+            for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
+                style_id = self.namespace.get(ts, 'w:val')
+                s = styles.get(style_id)
+                if s is not None:
+                    if s.table_style is not None:
+                        style['table'].update(s.table_style)
+                    if s.paragraph_style is not None:
+                        if 'paragraph' in style:
+                            style['paragraph'].update(s.paragraph_style)
+                        else:
+                            style['paragraph'] = s.paragraph_style
+                    if s.character_style is not None:
+                        if 'run' in style:
+                            style['run'].update(s.character_style)
+                        else:
+                            style['run'] = s.character_style
+            style['table'].update(TableStyle(self.namespace, tblPr))
+        self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
+        self.run_style = style.get('run', None)
+        self.overrides = self.table_style.overrides
+        if self.overrides is inherit:
+            self.overrides = {}
+        if 'wholeTable' in self.overrides and 'table' in self.overrides['wholeTable']:
+            self.table_style.update(self.overrides['wholeTable']['table'])
+
+        self.style_map = {}
+        self.paragraphs = []
+        self.cell_map = []
+
+        rows = self.namespace.XPath('./w:tr')(tbl)
+        for r, tr in enumerate(rows):
+            overrides = self.get_overrides(r, None, len(rows), None)
+            self.resolve_row_style(tr, overrides)
+            cells = self.namespace.XPath('./w:tc')(tr)
+            self.cell_map.append([])
+            for c, tc in enumerate(cells):
+                overrides = self.get_overrides(r, c, len(rows), len(cells))
+                self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
+                self.cell_map[-1].append(tc)
+                for p in self.namespace.XPath('./w:p')(tc):
+                    para_map[p] = self
+                    self.paragraphs.append(p)
+                    self.resolve_para_style(p, overrides)
+
+        self.handle_merged_cells()
+        self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}
+
+    @property
+    def bidi(self):
+        return self.table_style.bidi is True
+
+    def override_allowed(self, name):
+        'Check if the named override is allowed by the tblLook element'
+        if name.endswith('Cell') or name == 'wholeTable':
+            return True
+        look = self.table_style.look
+        if (look & 0x0020 and name == 'firstRow') or (look & 0x0040 and name == 'lastRow') or \
+           (look & 0x0080 and name == 'firstCol') or (look & 0x0100 and name == 'lastCol'):
+            return True
+        if name.startswith('band'):
+            if name.endswith('Horz'):
+                return not bool(look & 0x0200)
+            if name.endswith('Vert'):
+                return not bool(look & 0x0400)
+        return False
+
+    def get_overrides(self, r, c, num_of_rows, num_of_cols_in_row):
+        'List of possible overrides for the given para'
+        overrides = ['wholeTable']
+
+        def divisor(m, n):
+            return (m - (m % n)) // n
+        if c is not None:
+            odd_column_band = (divisor(c, self.table_style.col_band_size) % 2) == 1
+            overrides.append('band%dVert' % (1 if odd_column_band else 2))
+        odd_row_band = (divisor(r, self.table_style.row_band_size) % 2) == 1
+        overrides.append('band%dHorz' % (1 if odd_row_band else 2))
+
+        # According to the OOXML spec columns should have higher override
+        # priority than rows, but Word seems to do it the other way around.
+        if c is not None:
+            if c == 0:
+                overrides.append('firstCol')
+            if c >= num_of_cols_in_row - 1:
+                overrides.append('lastCol')
+        if r == 0:
+            overrides.append('firstRow')
+        if r >= num_of_rows - 1:
+            overrides.append('lastRow')
+        if c is not None:
+            if r == 0:
+                if c == 0:
+                    overrides.append('nwCell')
+                if c == num_of_cols_in_row - 1:
+                    overrides.append('neCell')
+            if r == num_of_rows - 1:
+                if c == 0:
+                    overrides.append('swCell')
+                if c == num_of_cols_in_row - 1:
+                    overrides.append('seCell')
+        return tuple(filter(self.override_allowed, overrides))
+
+    def resolve_row_style(self, tr, overrides):
+        rs = RowStyle(self.namespace)
+        for o in overrides:
+            if o in self.overrides:
+                ovr = self.overrides[o]
+                ors = ovr.get('row', None)
+                if ors is not None:
+                    rs.update(ors)
+
+        for trPr in self.namespace.XPath('./w:trPr')(tr):
+            rs.update(RowStyle(self.namespace, trPr))
+        if self.bidi:
+            rs.apply_bidi()
+        self.style_map[tr] = rs
+
+    def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
+        cs = CellStyle(self.namespace)
+        for o in overrides:
+            if o in self.overrides:
+                ovr = self.overrides[o]
+                ors = ovr.get('cell', None)
+                if ors is not None:
+                    cs.update(ors)
+
+        for tcPr in self.namespace.XPath('./w:tcPr')(tc):
+            cs.update(CellStyle(self.namespace, tcPr))
+
+        for x in edges:
+            p = 'cell_padding_%s' % x
+            val = getattr(cs, p)
+            if val is inherit:
+                setattr(cs, p, getattr(self.table_style, p))
+
+            is_inside_edge = (
+                (x == 'left' and col > 0) or
+                (x == 'top' and row > 0) or
+                (x == 'right' and col < cols_in_row - 1) or
+                (x == 'bottom' and row < rows -1)
+            )
+            inside_edge = ('insideH' if x in {'top', 'bottom'} else 'insideV') if is_inside_edge else None
+            for prop in border_props:
+                if not prop.startswith('border'):
+                    continue
+                eprop = prop % x
+                iprop = (prop % inside_edge) if inside_edge else None
+                val = getattr(cs, eprop)
+                if val is inherit and iprop is not None:
+                    # Use the insideX borders if the main cell borders are not
+                    # specified
+                    val = getattr(cs, iprop)
+                    if val is inherit:
+                        val = getattr(self.table_style, iprop)
+                if not is_inside_edge and val == 'none':
+                    # Cell borders must override table borders even when the
+                    # table border is not null and the cell border is null.
+                    val = 'hidden'
+                setattr(cs, eprop, val)
+
+        if self.bidi:
+            cs.apply_bidi()
+        self.style_map[tc] = cs
+
+    def resolve_para_style(self, p, overrides):
+        text_styles = [clone(self.paragraph_style), clone(self.run_style)]
+
+        for o in overrides:
+            if o in self.overrides:
+                ovr = self.overrides[o]
+                for i, name in enumerate(('para', 'run')):
+                    ops = ovr.get(name, None)
+                    if ops is not None:
+                        if text_styles[i] is None:
+                            text_styles[i] = ops
+                        else:
+                            text_styles[i].update(ops)
+        self.style_map[p] = text_styles
+
+    def handle_merged_cells(self):
+        if not self.cell_map:
+            return
+        # Handle vMerge
+        max_col_num = max(len(r) for r in self.cell_map)
+        for c in range(max_col_num):
+            cells = [row[c] if c < len(row) else None for row in self.cell_map]
+            runs = [[]]
+            for cell in cells:
+                try:
+                    s = self.style_map[cell]
+                except KeyError:  # cell is None
+                    s = CellStyle(self.namespace)
+                if s.vMerge == 'restart':
+                    runs.append([cell])
+                elif s.vMerge == 'continue':
+                    runs[-1].append(cell)
+                else:
+                    runs.append([])
+            for run in runs:
+                if len(run) > 1:
+                    self.style_map[run[0]].row_span = len(run)
+                    for tc in run[1:]:
+                        tc.getparent().remove(tc)
+
+        # Handle hMerge
+        for cells in self.cell_map:
+            runs = [[]]
+            for cell in cells:
+                try:
+                    s = self.style_map[cell]
+                except KeyError:  # cell is None
+                    s = CellStyle(self.namespace)
+                if s.col_span is not inherit:
+                    runs.append([])
+                    continue
+                if s.hMerge == 'restart':
+                    runs.append([cell])
+                elif s.hMerge == 'continue':
+                    runs[-1].append(cell)
+                else:
+                    runs.append([])
+
+            for run in runs:
+                if len(run) > 1:
+                    self.style_map[run[0]].col_span = len(run)
+                    for tc in run[1:]:
+                        tc.getparent().remove(tc)
+
+    def __iter__(self):
+        for p in self.paragraphs:
+            yield p
+        for t in itervalues(self.sub_tables):
+            for p in t:
+                yield p
+
+    def apply_markup(self, rmap, page, parent=None):
+        table = TABLE('\n\t\t')
+        if self.bidi:
+            table.set('dir', 'rtl')
+        self.table_style.page = page
+        style_map = {}
+        if parent is None:
+            try:
+                first_para = rmap[next(iter(self))]
+            except StopIteration:
+                return
+            parent = first_para.getparent()
+            idx = parent.index(first_para)
+            parent.insert(idx, table)
+        else:
+            parent.append(table)
+        for row in self.namespace.XPath('./w:tr')(self.tbl):
+            tr = TR('\n\t\t\t')
+            style_map[tr] = self.style_map[row]
+            tr.tail = '\n\t\t'
+            table.append(tr)
+            for tc in self.namespace.XPath('./w:tc')(row):
+                td = TD()
+                style_map[td] = s = self.style_map[tc]
+                if s.col_span is not inherit:
+                    td.set('colspan', unicode_type(s.col_span))
+                if s.row_span is not inherit:
+                    td.set('rowspan', unicode_type(s.row_span))
+                td.tail = '\n\t\t\t'
+                tr.append(td)
+                for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
+                    if x.tag.endswith('}p'):
+                        td.append(rmap[x])
+                    else:
+                        self.sub_tables[x].apply_markup(rmap, page, parent=td)
+            if len(tr):
+                tr[-1].tail = '\n\t\t'
+        if len(table):
+            table[-1].tail = '\n\t'
+
+        table_style = self.table_style.css
+        if table_style:
+            table.set('class', self.styles.register(table_style, 'table'))
+        for elem, style in iteritems(style_map):
+            css = style.css
+            if css:
+                elem.set('class', self.styles.register(css, elem.tag))
+
+
+class Tables(object):
+
+    def __init__(self, namespace):
+        self.tables = []
+        self.para_map = {}
+        self.sub_tables = set()
+        self.namespace = namespace
+
+    def register(self, tbl, styles):
+        if tbl in self.sub_tables:
+            return
+        self.tables.append(Table(self.namespace, tbl, styles, self.para_map))
+        self.sub_tables |= set(self.tables[-1].sub_tables)
+
+    def apply_markup(self, object_map, page_map):
+        rmap = {v:k for k, v in iteritems(object_map)}
+        for table in self.tables:
+            table.apply_markup(rmap, page_map[table.tbl])
+
+    def para_style(self, p):
+        table = self.para_map.get(p, None)
+        if table is not None:
+            return table.style_map.get(p, (None, None))[0]
+
+    def run_style(self, p):
+        table = self.para_map.get(p, None)
+        if table is not None:
+            return table.style_map.get(p, (None, None))[1]
--- a/ebook_converter/ebooks/docx/theme.py
+++ b/ebook_converter/ebooks/docx/theme.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class Theme(object):
+
+    def __init__(self, namespace):
+        self.major_latin_font = 'Cambria'
+        self.minor_latin_font = 'Calibri'
+        self.namespace = namespace
+
+    def __call__(self, root):
+        for fs in self.namespace.XPath('//a:fontScheme')(root):
+            for mj in self.namespace.XPath('./a:majorFont')(fs):
+                for l in self.namespace.XPath('./a:latin[@typeface]')(mj):
+                    self.major_latin_font = l.get('typeface')
+            for mj in self.namespace.XPath('./a:minorFont')(fs):
+                for l in self.namespace.XPath('./a:latin[@typeface]')(mj):
+                    self.minor_latin_font = l.get('typeface')
+
+    def resolve_font_family(self, ff):
+        if ff.startswith('|'):
+            ff = ff[1:-1]
+            ff = self.major_latin_font if ff.startswith('major') else self.minor_latin_font
+        return ff
--- a/ebook_converter/ebooks/docx/to_html.py
+++ b/ebook_converter/ebooks/docx/to_html.py
@@ -0,0 +1,839 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys, os, re, math, errno, uuid, numbers
+from collections import OrderedDict, defaultdict
+
+from lxml import html
+from lxml.html.builder import (
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
+
+from calibre import guess_type
+from calibre.ebooks.docx.container import DOCX, fromstring
+from calibre.ebooks.docx.names import XML, generate_anchor
+from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
+from calibre.ebooks.docx.numbering import Numbering
+from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
+from calibre.ebooks.docx.images import Images
+from calibre.ebooks.docx.tables import Tables
+from calibre.ebooks.docx.footnotes import Footnotes
+from calibre.ebooks.docx.cleanup import cleanup_markup
+from calibre.ebooks.docx.theme import Theme
+from calibre.ebooks.docx.toc import create_toc
+from calibre.ebooks.docx.fields import Fields
+from calibre.ebooks.docx.settings import Settings
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+from polyglot.builtins import iteritems, itervalues, filter, getcwd, map, unicode_type
+
+
+NBSP = '\xa0'
+
+
+class Text:
+
+    def __init__(self, elem, attr, buf):
+        self.elem, self.attr, self.buf = elem, attr, buf
+        self.elems = [self.elem]
+
+    def add_elem(self, elem):
+        self.elems.append(elem)
+        setattr(self.elem, self.attr, ''.join(self.buf))
+        self.elem, self.attr, self.buf = elem, 'tail', []
+
+    def __iter__(self):
+        return iter(self.elems)
+
+
+def html_lang(docx_lang):
+    lang = canonicalize_lang(docx_lang)
+    if lang and lang != 'und':
+        lang = lang_as_iso639_1(lang)
+        if lang:
+            return lang
+
+
+class Convert(object):
+
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
+        self.docx = DOCX(path_or_stream, log=log)
+        self.namespace = self.docx.namespace
+        self.ms_pat = re.compile(r'\s{2,}')
+        self.ws_pat = re.compile(r'[\n\r\t]')
+        self.log = self.docx.log
+        self.detect_cover = detect_cover
+        self.notes_text = notes_text or _('Notes')
+        self.notes_nopb = notes_nopb
+        self.nosupsub = nosupsub
+        self.dest_dir = dest_dir or getcwd()
+        self.mi = self.docx.metadata
+        self.body = BODY()
+        self.theme = Theme(self.namespace)
+        self.settings = Settings(self.namespace)
+        self.tables = Tables(self.namespace)
+        self.fields = Fields(self.namespace)
+        self.styles = Styles(self.namespace, self.tables)
+        self.images = Images(self.namespace, self.log)
+        self.object_map = OrderedDict()
+        self.html = HTML(
+            HEAD(
+                META(charset='utf-8'),
+                TITLE(self.mi.title or _('Unknown')),
+                LINK(rel='stylesheet', type='text/css', href='docx.css'),
+            ),
+            self.body
+        )
+        self.html.text='\n\t'
+        self.html[0].text='\n\t\t'
+        self.html[0].tail='\n'
+        for child in self.html[0]:
+            child.tail = '\n\t\t'
+        self.html[0][-1].tail = '\n\t'
+        self.html[1].text = self.html[1].tail = '\n'
+        lang = html_lang(self.mi.language)
+        if lang:
+            self.html.set('lang', lang)
+            self.doc_lang = lang
+        else:
+            self.doc_lang = None
+
+    def __call__(self):
+        '''
+        Run the conversion: build the HTML body from the document's
+        paragraphs, append the notes, resolve links, tables, numbering and
+        frames, turn styles into CSS classes and write the result via
+        write(). Returns whatever write() returns.
+        '''
+        doc = self.docx.document
+        relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.resolve_alternate_content(doc)
+        self.fields(doc, self.log)
+        self.read_styles(relationships_by_type)
+        self.images(relationships_by_id)
+        # Per conversion state, filled in by convert_p(), add_frame(),
+        # mark_block_runs() and friends
+        self.layers = OrderedDict()
+        self.framed = [[]]
+        self.frame_map = {}
+        self.framed_map = {}
+        self.anchor_map = {}
+        self.link_map = defaultdict(list)
+        self.link_source_map = {}
+        self.toc_anchor = None
+        self.block_runs = []
+        paras = []
+
+        self.log.debug('Converting Word markup to HTML')
+
+        self.read_page_properties(doc)
+        self.current_rels = relationships_by_id
+        # page_map contains both paragraphs and tables, only the paragraphs
+        # are converted here
+        for wp, page_properties in iteritems(self.page_map):
+            self.current_page = page_properties
+            if wp.tag.endswith('}p'):
+                p = self.convert_p(wp)
+                self.body.append(p)
+                paras.append(wp)
+
+        self.read_block_anchors(doc)
+        self.styles.apply_contextual_spacing(paras)
+        self.mark_block_runs(paras)
+        # Apply page breaks at the start of every section, except the first
+        # section (since that will be the start of the file)
+        self.styles.apply_section_page_breaks(self.section_starts[1:])
+
+        notes_header = None
+        # The images rid_map is switched to each note's relationships below
+        # and restored once the notes are done
+        orig_rid_map = self.images.rid_map
+        if self.footnotes.has_notes:
+            self.body.append(H1(self.notes_text))
+            notes_header = self.body[-1]
+            notes_header.set('class', 'notes-header')
+            for anchor, text, note in self.footnotes:
+                # Each note is a <dl> with a back-link in the <dt> and the
+                # note's paragraphs in the <dd>
+                dl = DL(id=anchor)
+                dl.set('class', 'footnote')
+                self.body.append(dl)
+                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
+                dl[-1][0].tail = ']'
+                dl.append(DD())
+                paras = []
+                self.images.rid_map = self.current_rels = note.rels[0]
+                for wp in note:
+                    if wp.tag.endswith('}tbl'):
+                        self.tables.register(wp, self.styles)
+                        self.page_map[wp] = self.current_page
+                    else:
+                        p = self.convert_p(wp)
+                        dl[-1].append(p)
+                        paras.append(wp)
+                self.styles.apply_contextual_spacing(paras)
+                self.mark_block_runs(paras)
+
+        for p, wp in iteritems(self.object_map):
+            # Only elements whose first grandchild is a tab span (see
+            # convert_run()) with no text in front of it are considered
+            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
+                # Paragraph uses tabs for indentation, convert to text-indent
+                parent = p[0]
+                tabs = []
+                # Collect the leading tab spans, stopping at the first one
+                # that is followed by text
+                for child in parent:
+                    if child.get('class', None) == 'tab':
+                        tabs.append(child)
+                        if child.tail:
+                            break
+                    else:
+                        break
+                indent = len(tabs) * self.settings.default_tab_stop
+                style = self.styles.resolve(wp)
+                # Only handled when the existing text-indent is unset or
+                # given in pt
+                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
+                    if style.text_indent is not inherit:
+                        indent = float(style.text_indent[:-2]) + indent
+                    style.text_indent = '%.3gpt' % indent
+                    parent.text = tabs[-1].tail or ''
+                    list(map(parent.remove, tabs))
+
+        self.images.rid_map = orig_rid_map
+
+        self.resolve_links()
+
+        self.styles.cascade(self.layers)
+
+        self.tables.apply_markup(self.object_map, self.page_map)
+
+        numbered = []
+        for html_obj, obj in iteritems(self.object_map):
+            # calibre_num_id has the form level:num_id, a level that is not
+            # an integer is treated as 0
+            raw = obj.get('calibre_num_id', None)
+            if raw is not None:
+                lvl, num_id = raw.partition(':')[0::2]
+                try:
+                    lvl = int(lvl)
+                except (TypeError, ValueError):
+                    lvl = 0
+                numbered.append((html_obj, num_id, lvl))
+        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
+        self.apply_frames()
+
+        # White space that only serves to pretty print the body
+        if len(self.body) > 0:
+            self.body.text = '\n\t'
+            for child in self.body:
+                child.tail = '\n\t'
+            self.body[-1].tail = '\n'
+
+        self.log.debug('Converting styles to CSS')
+        self.styles.generate_classes()
+        for html_obj, obj in iteritems(self.object_map):
+            style = self.styles.resolve(obj)
+            if style is not None:
+                css = style.css
+                if css:
+                    cls = self.styles.class_name(css)
+                    if cls:
+                        html_obj.set('class', cls)
+        for html_obj, css in iteritems(self.framed_map):
+            cls = self.styles.class_name(css)
+            if cls:
+                html_obj.set('class', cls)
+
+        if notes_header is not None:
+            # Make the notes heading look like the first h1/h2/h3 child of
+            # the body by copying its tag and class
+            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
+                notes_header.tag = h.tag
+                cls = h.get('class', None)
+                if cls and cls != 'notes-header':
+                    notes_header.set('class', '%s notes-header' % cls)
+                break
+
+        self.fields.polish_markup(self.object_map)
+
+        self.log.debug('Cleaning up redundant markup generated by Word')
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
+
+        return self.write(doc)
+
+    def read_page_properties(self, doc):
+        current = []
+        self.page_map = OrderedDict()
+        self.section_starts = []
+
+        for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
+            if p.tag.endswith('}tbl'):
+                self.tables.register(p, self.styles)
+                current.append(p)
+                continue
+            sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
+            if sect:
+                pr = PageProperties(self.namespace, sect)
+                paras = current + [p]
+                for x in paras:
+                    self.page_map[x] = pr
+                self.section_starts.append(paras[0])
+                current = []
+            else:
+                current.append(p)
+
+        if current:
+            self.section_starts.append(current[0])
+            last = self.namespace.XPath('./w:body/w:sectPr')(doc)
+            pr = PageProperties(self.namespace, last)
+            for x in current:
+                self.page_map[x] = pr
+
+    def resolve_alternate_content(self, doc):
+        # For proprietary extensions in Word documents use the fallback, spec
+        # compliant form
+        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
+        for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
+            choices = self.namespace.XPath('./mc:Choice')(ac)
+            fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
+            if fallbacks:
+                for choice in choices:
+                    ac.remove(choice)
+
+    def read_styles(self, relationships_by_type):
+
+        def get_name(rtype, defname):
+            name = relationships_by_type.get(rtype, None)
+            if name is None:
+                cname = self.docx.document_name.split('/')
+                cname[-1] = defname
+                if self.docx.exists('/'.join(cname)):
+                    name = name
+            if name and name.startswith('word/word') and not self.docx.exists(name):
+                name = name.partition('/')[2]
+            return name
+
+        nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
+        sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
+        sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
+        fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
+        tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
+        foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
+        enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
+        numbering = self.numbering = Numbering(self.namespace)
+        footnotes = self.footnotes = Footnotes(self.namespace)
+        fonts = self.fonts = Fonts(self.namespace)
+
+        foraw = enraw = None
+        forel, enrel = ({}, {}), ({}, {})
+        if sename is not None:
+            try:
+                seraw = self.docx.read(sename)
+            except KeyError:
+                self.log.warn('Settings %s do not exist' % sename)
+            except EnvironmentError as e:
+                if e.errno != errno.ENOENT:
+                    raise
+                self.log.warn('Settings %s file missing' % sename)
+            else:
+                self.settings(fromstring(seraw))
+
+        if foname is not None:
+            try:
+                foraw = self.docx.read(foname)
+            except KeyError:
+                self.log.warn('Footnotes %s do not exist' % foname)
+            else:
+                forel = self.docx.get_relationships(foname)
+        if enname is not None:
+            try:
+                enraw = self.docx.read(enname)
+            except KeyError:
+                self.log.warn('Endnotes %s do not exist' % enname)
+            else:
+                enrel = self.docx.get_relationships(enname)
+        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)
+
+        if fname is not None:
+            embed_relationships = self.docx.get_relationships(fname)[0]
+            try:
+                raw = self.docx.read(fname)
+            except KeyError:
+                self.log.warn('Fonts table %s does not exist' % fname)
+            else:
+                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)
+
+        if tname is not None:
+            try:
+                raw = self.docx.read(tname)
+            except KeyError:
+                self.log.warn('Styles %s do not exist' % sname)
+            else:
+                self.theme(fromstring(raw))
+
+        styles_loaded = False
+        if sname is not None:
+            try:
+                raw = self.docx.read(sname)
+            except KeyError:
+                self.log.warn('Styles %s do not exist' % sname)
+            else:
+                self.styles(fromstring(raw), fonts, self.theme)
+                styles_loaded = True
+        if not styles_loaded:
+            self.styles(None, fonts, self.theme)
+
+        if nname is not None:
+            try:
+                raw = self.docx.read(nname)
+            except KeyError:
+                self.log.warn('Numbering styles %s do not exist' % nname)
+            else:
+                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
+
+        self.styles.resolve_numbering(numbering)
+
+    def write(self, doc):
+        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
+        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
+        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
+            f.write(raw)
+        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
+        if css:
+            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
+                f.write(css.encode('utf-8'))
+
+        opf = OPFCreator(self.dest_dir, self.mi)
+        opf.toc = toc
+        opf.create_manifest_from_files_in([self.dest_dir])
+        for item in opf.manifest:
+            if item.media_type == 'text/html':
+                item.media_type = guess_type('a.xhtml')[0]
+        opf.create_spine(['index.html'])
+        if self.cover_image is not None:
+            opf.guide.set_cover(self.cover_image)
+
+        def process_guide(E, guide):
+            if self.toc_anchor is not None:
+                guide.append(E.reference(
+                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
+        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
+        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
+            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
+        if os.path.getsize(toc_file) == 0:
+            os.remove(toc_file)
+        return os.path.join(self.dest_dir, 'metadata.opf')
+
+    def read_block_anchors(self, doc):
+        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+        if doc_anchors:
+            current_bm = set()
+            rmap = {v:k for k, v in iteritems(self.object_map)}
+            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+                if p.tag.endswith('}p'):
+                    if current_bm and p in rmap:
+                        para = rmap[p]
+                        if 'id' not in para.attrib:
+                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
+                        for name in current_bm:
+                            self.anchor_map[name] = para.get('id')
+                        current_bm = set()
+                elif p in doc_anchors:
+                    anchor = self.namespace.get(p, 'w:name')
+                    if anchor:
+                        current_bm.add(anchor)
+
+    def convert_p(self, p):
+        '''
+        Convert the paragraph ``p`` into a new HTML <p> element (retagged to
+        h1-h6 for styles named "heading N") and return it. Runs become child
+        spans via convert_run(); bookmarks, hyperlinks and TOC fields are
+        recorded in ``self.anchor_map``, ``self.link_map``,
+        ``self.link_source_map`` and ``self.toc_anchor``.
+        '''
+        dest = P()
+        self.object_map[dest] = p
+        style = self.styles.resolve_paragraph(p)
+        self.layers[p] = []
+        self.frame_map[p] = style.frame
+        self.add_frame(dest, style.frame)
+
+        # An anchor that has been generated but not yet set as the id of
+        # any element
+        current_anchor = None
+        current_hyperlink = None
+        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')
+
+        def p_parent(x):
+            # Ensure that nested <w:p> tags are handled. These can occur if a
+            # textbox is present inside a paragraph.
+            while True:
+                x = x.getparent()
+                try:
+                    if x.tag.endswith('}p'):
+                        return x
+                except AttributeError:
+                    break
+
+        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
+            # Skip anything that belongs to a nested paragraph
+            if p_parent(x) is not p:
+                continue
+            if x.tag.endswith('}r'):
+                span = self.convert_run(x)
+                if current_anchor is not None:
+                    # The anchor goes on the paragraph itself if this is its
+                    # first run, otherwise on the run
+                    (dest if len(dest) == 0 else span).set('id', current_anchor)
+                    current_anchor = None
+                if current_hyperlink is not None:
+                    try:
+                        hl = hl_xpath(x)[0]
+                        self.link_map[hl].append(span)
+                        self.link_source_map[hl] = self.current_rels
+                        x.set('is-link', '1')
+                    except IndexError:
+                        # This run is not inside a hyperlink any more
+                        current_hyperlink = None
+                dest.append(span)
+                self.layers[p].append(x)
+            elif x.tag.endswith('}bookmarkStart'):
+                anchor = self.namespace.get(x, 'w:name')
+                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
+                    # _GoBack is a special bookmark inserted by Word 2010 for
+                    # the return to previous edit feature, we ignore it
+                    old_anchor = current_anchor
+                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
+                    if old_anchor is not None:
+                        # The previous anchor was not applied to any element
+                        for a, t in tuple(iteritems(self.anchor_map)):
+                            if t == old_anchor:
+                                self.anchor_map[a] = current_anchor
+            elif x.tag.endswith('}hyperlink'):
+                current_hyperlink = x
+            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
+                # A TOC field, register an anchor for it under a random name
+                old_anchor = current_anchor
+                anchor = unicode_type(uuid.uuid4())
+                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
+                self.toc_anchor = current_anchor
+                if old_anchor is not None:
+                    # The previous anchor was not applied to any element
+                    for a, t in tuple(iteritems(self.anchor_map)):
+                        if t == old_anchor:
+                            self.anchor_map[a] = current_anchor
+        if current_anchor is not None:
+            # This paragraph had no <w:r> descendants
+            dest.set('id', current_anchor)
+            current_anchor = None
+
+        # Paragraphs with a style named "heading N" become <hN>, with N
+        # clamped to the range 1-6
+        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
+        if m is not None:
+            n = min(6, max(1, int(m.group(1))))
+            dest.tag = 'h%d' % n
+            dest.set('data-heading-level', unicode_type(n))
+
+        if style.bidi is True:
+            dest.set('dir', 'rtl')
+
+        # Group consecutive runs that have the same border so that the
+        # border can be moved onto a single wrapper element.
+        # NOTE(review): a run whose border differs from the current group
+        # closes the group but is not itself placed into a new one, and a
+        # group still open when the loop ends is never added to
+        # common_borders - confirm whether this is intended.
+        # Note that style is rebound to run styles inside this loop.
+        border_runs = []
+        common_borders = []
+        for span in dest:
+            run = self.object_map[span]
+            style = self.styles.resolve_run(run)
+            if not border_runs or border_runs[-1][1].same_border(style):
+                border_runs.append((span, style))
+            elif border_runs:
+                if len(border_runs) > 1:
+                    common_borders.append(border_runs)
+                border_runs = []
+
+        for border_run in common_borders:
+            spans = []
+            bs = {}
+            for span, style in border_run:
+                style.get_border_css(bs)
+                style.clear_border_css()
+                spans.append(span)
+            if bs:
+                cls = self.styles.register(bs, 'text_border')
+                wrapper = self.wrap_elems(spans, SPAN())
+                wrapper.set('class', cls)
+
+        if not dest.text and len(dest) == 0 and not style.has_visible_border():
+            # Empty paragraph add a non-breaking space so that it is rendered
+            # by WebKit
+            dest.text = NBSP
+
+        # If the last element in a block is a <br> the <br> is not rendered in
+        # HTML, unless it is followed by a trailing space. Word, on the other
+        # hand inserts a blank line for trailing <br>s.
+        if len(dest) > 0 and not dest[-1].tail:
+            if dest[-1].tag == 'br':
+                dest[-1].tail = NBSP
+            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
+                dest[-1][-1].tail = NBSP
+
+        return dest
+
+    def wrap_elems(self, elems, wrapper):
+        p = elems[0].getparent()
+        idx = p.index(elems[0])
+        p.insert(idx, wrapper)
+        wrapper.tail = elems[-1].tail
+        elems[-1].tail = None
+        for elem in elems:
+            try:
+                p.remove(elem)
+            except ValueError:
+                # Probably a hyperlink that spans multiple
+                # paragraphs,theoretically we should break this up into
+                # multiple hyperlinks, but I can't be bothered.
+                elem.getparent().remove(elem)
+            wrapper.append(elem)
+        return wrapper
+
+    def resolve_links(self):
+        '''
+        Turn the spans recorded for hyperlinks into <a> elements and wrap
+        linked images in <a> elements. Three sources are handled in turn:
+        w:hyperlink elements (``self.link_map``), hyperlink fields
+        (``self.fields.hyperlink_fields``) and image links
+        (``self.images.links``). Also builds ``self.resolved_link_map``
+        (w:hyperlink element -> <a> element).
+        '''
+        self.resolved_link_map = {}
+        for hyperlink, spans in iteritems(self.link_map):
+            relationships_by_id = self.link_source_map[hyperlink]
+            span = spans[0]
+            # A link made of several runs gets a single wrapper element
+            if len(spans) > 1:
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            self.resolved_link_map[hyperlink] = span
+            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
+            if tgt:
+                span.set('target', tgt)
+            tt = self.namespace.get(hyperlink, 'w:tooltip')
+            if tt:
+                span.set('title', tt)
+            # A relationship id takes precedence over an anchor
+            rid = self.namespace.get(hyperlink, 'r:id')
+            if rid and rid in relationships_by_id:
+                span.set('href', relationships_by_id[rid])
+                continue
+            anchor = self.namespace.get(hyperlink, 'w:anchor')
+            if anchor and anchor in self.anchor_map:
+                span.set('href', '#' + self.anchor_map[anchor])
+                continue
+            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
+                          (rid, anchor))
+            # hrefs that point nowhere give epubcheck a hernia. The element
+            # should be styled explicitly by Word anyway.
+            # span.set('href', '#')
+        rmap = {v:k for k, v in iteritems(self.object_map)}
+        for hyperlink, runs in self.fields.hyperlink_fields:
+            # Only runs that were actually converted have an HTML element
+            spans = [rmap[r] for r in runs if r in rmap]
+            if not spans:
+                continue
+            span = spans[0]
+            if len(spans) > 1:
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            tgt = hyperlink.get('target', None)
+            if tgt:
+                span.set('target', tgt)
+            tt = hyperlink.get('title', None)
+            if tt:
+                span.set('title', tt)
+            url = hyperlink.get('url', None)
+            if url is None:
+                anchor = hyperlink.get('anchor', None)
+                if anchor in self.anchor_map:
+                    span.set('href', '#' + self.anchor_map[anchor])
+                    continue
+                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
+            else:
+                # A url that is the name of a known anchor becomes an
+                # internal link
+                if url in self.anchor_map:
+                    span.set('href', '#' + self.anchor_map[url])
+                    continue
+                span.set('href', url)
+
+        for img, link, relationships_by_id in self.images.links:
+            parent = img.getparent()
+            idx = parent.index(img)
+            a = A(img)
+            # The text that followed the image must now follow the link
+            a.tail, img.tail = img.tail, None
+            parent.insert(idx, a)
+            tgt = link.get('target', None)
+            if tgt:
+                a.set('target', tgt)
+            tt = link.get('title', None)
+            if tt:
+                a.set('title', tt)
+            rid = link['id']
+            if rid in relationships_by_id:
+                dest = relationships_by_id[rid]
+                if dest.startswith('#'):
+                    # Internal link, no href is set if the anchor is unknown
+                    if dest[1:] in self.anchor_map:
+                        a.set('href', '#' + self.anchor_map[dest[1:]])
+                else:
+                    a.set('href', dest)
+
+    def convert_run(self, run):
+        ans = SPAN()
+        self.object_map[ans] = run
+        text = Text(ans, 'text', [])
+
+        for child in run:
+            if self.namespace.is_tag(child, 'w:t'):
+                if not child.text:
+                    continue
+                space = child.get(XML('space'), None)
+                preserve = False
+                ctext = child.text
+                if space != 'preserve':
+                    # Remove leading and trailing whitespace. Word ignores
+                    # leading and trailing whitespace without preserve
+                    ctext = ctext.strip(' \n\r\t')
+                # Only use a <span> with white-space:pre-wrap if this element
+                # actually needs it, i.e. if it has more than one
+                # consecutive space or it has newlines or tabs.
+                multi_spaces = self.ms_pat.search(ctext) is not None
+                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
+                if preserve:
+                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
+                    ans.append(text.elem)
+                else:
+                    text.buf.append(ctext)
+            elif self.namespace.is_tag(child, 'w:cr'):
+                text.add_elem(BR())
+                ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:br'):
+                typ = self.namespace.get(child, 'w:type')
+                if typ in {'column', 'page'}:
+                    br = BR(style='page-break-after:always')
+                else:
+                    clear = child.get('clear', None)
+                    if clear in {'all', 'left', 'right'}:
+                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
+                    else:
+                        br = BR()
+                text.add_elem(br)
+                ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
+                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
+                    text.add_elem(img)
+                    ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
+                anchor, name = self.footnotes.get_ref(child)
+                if anchor and name:
+                    l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
+                    l.set('class', 'noteref')
+                    text.add_elem(l)
+                    ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:tab'):
+                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
+                text.add_elem(SPAN(NBSP * spaces))
+                ans.append(text.elem)
+                ans[-1].set('class', 'tab')
+            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
+                text.buf.append('\u2011')
+            elif self.namespace.is_tag(child, 'w:softHyphen'):
+                text.buf.append('\u00ad')
+        if text.buf:
+            setattr(text.elem, text.attr, ''.join(text.buf))
+
+        style = self.styles.resolve_run(run)
+        if style.vert_align in {'superscript', 'subscript'}:
+            if ans.text or len(ans):
+                ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
+        if style.lang is not inherit:
+            lang = html_lang(style.lang)
+            if lang is not None and lang != self.doc_lang:
+                ans.set('lang', lang)
+        if style.rtl is True:
+            ans.set('dir', 'rtl')
+        if is_symbol_font(style.font_family):
+            for elem in text:
+                if elem.text:
+                    elem.text = map_symbol_text(elem.text, style.font_family)
+                if elem.tail:
+                    elem.tail = map_symbol_text(elem.tail, style.font_family)
+            style.font_family = 'sans-serif'
+        return ans
+
+    def add_frame(self, html_obj, style):
+        last_run = self.framed[-1]
+        if style is inherit:
+            if last_run:
+                self.framed.append([])
+            return
+
+        if last_run:
+            if last_run[-1][1] == style:
+                last_run.append((html_obj, style))
+            else:
+                self.framed[-1].append((html_obj, style))
+        else:
+            last_run.append((html_obj, style))
+
+    def apply_frames(self):
+        '''
+        Wrap every run of framed paragraphs collected by add_frame() and
+        every block run collected by mark_block_runs() in a <div>. The CSS
+        for each <div> is stored in ``self.framed_map`` and registered with
+        ``self.styles``.
+        '''
+        for run in filter(None, self.framed):
+            # The style of the first paragraph is used for the whole run
+            style = run[0][1]
+            paras = tuple(x[0] for x in run)
+            parent = paras[0].getparent()
+            idx = parent.index(paras[0])
+            frame = DIV(*paras)
+            parent.insert(idx, frame)
+            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
+            self.styles.register(css, 'frame')
+
+        if not self.block_runs:
+            return
+        rmap = {v:k for k, v in iteritems(self.object_map)}
+        for border_style, blocks in self.block_runs:
+            paras = tuple(rmap[p] for p in blocks)
+            # for/else: has_li is True if any paragraph has become an <li>
+            for p in paras:
+                if p.tag == 'li':
+                    has_li = True
+                    break
+            else:
+                has_li = False
+            parent = paras[0].getparent()
+            if parent.tag in ('ul', 'ol'):
+                # The first paragraph is a list item, wrap the whole list
+                ul = parent
+                parent = ul.getparent()
+                idx = parent.index(ul)
+                frame = DIV(ul)
+            elif has_li:
+                def top_level_tag(x):
+                    # The ancestor of x (or x itself) that is a child of
+                    # parent, or the root of x's tree if there is none
+                    while True:
+                        q = x.getparent()
+                        if q is parent or q is None:
+                            break
+                        x = q
+                    return x
+                paras = tuple(map(top_level_tag, paras))
+                idx = parent.index(paras[0])
+                frame = DIV(*paras)
+            else:
+                idx = parent.index(paras[0])
+                frame = DIV(*paras)
+            parent.insert(idx, frame)
+            self.framed_map[frame] = css = border_style.css
+            self.styles.register(css, 'frame')
+
+    def mark_block_runs(self, paras):
+        '''
+        Find runs of two or more consecutive paragraphs in ``paras`` that
+        have the same frame and identical borders, and adjust their styles
+        so that they can share a border. Runs with a visible border are
+        added to ``self.block_runs`` for apply_frames().
+        '''
+
+        def process_run(run):
+            max_left = max_right = 0
+            # Decided by the first paragraph of the run
+            has_visible_border = None
+            for p in run:
+                style = self.styles.resolve_paragraph(p)
+                if has_visible_border is None:
+                    has_visible_border = style.has_visible_border()
+                if isinstance(style.margin_left, numbers.Number):
+                    max_left = max(style.margin_left, max_left)
+                if isinstance(style.margin_right, numbers.Number):
+                    max_right = max(style.margin_right, max_right)
+                if has_visible_border:
+                    style.margin_left = style.margin_right = inherit
+                if p is not run[0]:
+                    style.padding_top = 0
+                else:
+                    # The border of the run is taken from its first
+                    # paragraph, along with that paragraph's top margin
+                    border_style = style.clone_border_styles()
+                    if has_visible_border:
+                        border_style.margin_top, style.margin_top = style.margin_top, inherit
+                if p is not run[-1]:
+                    style.padding_bottom = 0
+                else:
+                    if has_visible_border:
+                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
+                style.clear_borders()
+                if p is not run[-1]:
+                    style.apply_between_border()
+            if has_visible_border:
+                # The horizontal margins move to the border, using the
+                # largest numeric margin found in the run
+                border_style.margin_left, border_style.margin_right = max_left,max_right
+                self.block_runs.append((border_style, run))
+
+        run = []
+        for p in paras:
+            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
+                style = self.styles.resolve_paragraph(p)
+                last_style = self.styles.resolve_paragraph(run[-1])
+                if style.has_identical_borders(last_style):
+                    run.append(p)
+                    continue
+            # p does not continue the current run
+            if len(run) > 1:
+                process_run(run)
+            run = [p]
+        if len(run) > 1:
+            process_run(run)
+
+
+if __name__ == '__main__':
+    import shutil
+    from calibre.utils.logging import default_log
+    default_log.filter_level = default_log.DEBUG
+    dest_dir = os.path.join(getcwd(), 'docx_input')
+    if os.path.exists(dest_dir):
+        shutil.rmtree(dest_dir)
+    os.mkdir(dest_dir)
+    Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()
--- a/ebook_converter/ebooks/docx/toc.py
+++ b/ebook_converter/ebooks/docx/toc.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import namedtuple
+from itertools import count
+
+from lxml.etree import tostring
+
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
+from polyglot.builtins import iteritems, range
+
+
+def from_headings(body, log, namespace, num_levels=3):
+    ' Create a TOC from headings in the document '
+    tocroot = TOC()
+    all_heading_nodes = body.xpath('//*[@data-heading-level]')
+    level_prev = {i+1:None for i in range(num_levels)}
+    level_prev[0] = tocroot
+    level_item_map = {i:frozenset(
+        x for x in all_heading_nodes if int(x.get('data-heading-level')) == i)
+        for i in range(1, num_levels+1)}
+    item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems}
+
+    idcount = count()
+
+    def ensure_id(elem):
+        ans = elem.get('id', None)
+        if not ans:
+            ans = 'toc_id_%d' % (next(idcount) + 1)
+            elem.set('id', ans)
+        return ans
+
+    for item in all_heading_nodes:
+        lvl = plvl = item_level_map.get(item, None)
+        if lvl is None:
+            continue
+        parent = None
+        while parent is None:
+            plvl -= 1
+            parent = level_prev[plvl]
+        lvl = plvl + 1
+        elem_id = ensure_id(item)
+        text = elem_to_toc_text(item)
+        toc = parent.add_item('index.html', elem_id, text)
+        level_prev[lvl] = toc
+        for i in range(lvl+1, num_levels+1):
+            level_prev[i] = None
+
+    if len(tuple(tocroot.flat())) > 1:
+        log('Generating Table of Contents from headings')
+        return tocroot
+
+
+def structure_toc(entries):
+    indent_vals = sorted({x.indent for x in entries})
+    last_found = [None for i in indent_vals]
+    newtoc = TOC()
+
+    if len(indent_vals) > 6:
+        for x in entries:
+            newtoc.add_item('index.html', x.anchor, x.text)
+        return newtoc
+
+    def find_parent(level):
+        candidates = last_found[:level]
+        for x in reversed(candidates):
+            if x is not None:
+                return x
+        return newtoc
+
+    for item in entries:
+        level = indent_vals.index(item.indent)
+        parent = find_parent(level)
+        last_found[level] = parent.add_item('index.html', item.anchor,
+                    item.text)
+        for i in range(level+1, len(last_found)):
+            last_found[i] = None
+
+    return newtoc
+
+
+def link_to_txt(a, styles, object_map):
+    if len(a) > 1:
+        for child in a:
+            run = object_map.get(child, None)
+            if run is not None:
+                rs = styles.resolve(run)
+                if rs.css.get('display', None) == 'none':
+                    a.remove(child)
+
+    return tostring(a, method='text', with_tail=False, encoding='unicode').strip()
+
+
+def from_toc(docx, link_map, styles, object_map, log, namespace):
+    XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
+    toc_level = None
+    level = 0
+    TI = namedtuple('TI', 'text anchor indent')
+    toc = []
+    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
+        n = tag.tag.rpartition('}')[-1]
+        if n == 'fldChar':
+            t = get(tag, 'w:fldCharType')
+            if t == 'begin':
+                level += 1
+            elif t == 'end':
+                level -= 1
+                if toc_level is not None and level < toc_level:
+                    break
+        elif n == 'instrText':
+            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
+                toc_level = level
+        elif n == 'hyperlink':
+            if toc_level is not None and level >= toc_level and tag in link_map:
+                a = link_map[tag]
+                href = a.get('href', None)
+                txt = link_to_txt(a, styles, object_map)
+                p = ancestor(tag, 'w:p')
+                if txt and href and p is not None:
+                    ps = styles.resolve_paragraph(p)
+                    try:
+                        ml = int(ps.margin_left[:-2])
+                    except (TypeError, ValueError, AttributeError):
+                        ml = 0
+                    if ps.text_align in {'center', 'right'}:
+                        ml = 0
+                    toc.append(TI(txt, href[1:], ml))
+    if toc:
+        log('Found Word Table of Contents, using it to generate the Table of Contents')
+        return structure_toc(toc)
+
+
+def create_toc(docx, body, link_map, styles, object_map, log, namespace):
+    ans = from_toc(docx, link_map, styles, object_map, log, namespace) or from_headings(body, log, namespace)
+    # Remove heading level attributes
+    for h in body.xpath('//*[@data-heading-level]'):
+        del h.attrib['data-heading-level']
+    return ans
--- a/ebook_converter/ebooks/html/init.py
+++ b/ebook_converter/ebooks/html/init.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
--- a/ebook_converter/ebooks/html/input.py
+++ b/ebook_converter/ebooks/html/input.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+'''
+Input plugin for HTML or OPF ebooks.
+'''
+
+import os, re, sys,  errno as gerrno
+
+from calibre.ebooks.oeb.base import urlunquote
+from calibre.ebooks.chardet import detect_xml_encoding
+from calibre.constants import iswindows
+from calibre import unicode_path, as_unicode, replace_entities
+from polyglot.builtins import is_py3, unicode_type
+from polyglot.urllib import urlparse, urlunparse
+
+
+class Link(object):
+
+    '''
+    Represents a link in a HTML file.
+    '''
+
+    @classmethod
+    def url_to_local_path(cls, url, base):
+        path = url.path
+        isabs = False
+        if iswindows and path.startswith('/'):
+            path = path[1:]
+            isabs = True
+        path = urlunparse(('', '', path, url.params, url.query, ''))
+        path = urlunquote(path)
+        if isabs or os.path.isabs(path):
+            return path
+        return os.path.abspath(os.path.join(base, path))
+
+    def __init__(self, url, base):
+        '''
+        :param url:  The url this link points to. Must be an unquoted unicode string.
+        :param base: The base directory that relative URLs are with respect to.
+                     Must be a unicode string.
+        '''
+        assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
+        self.url         = url
+        self.parsed_url  = urlparse(self.url)
+        self.is_local    = self.parsed_url.scheme in ('', 'file')
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
+        self.path        = None
+        self.fragment    = urlunquote(self.parsed_url.fragment)
+        if self.is_local and not self.is_internal:
+            self.path = self.url_to_local_path(self.parsed_url, base)
+
+    def __hash__(self):
+        if self.path is None:
+            return hash(self.url)
+        return hash(self.path)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __str__(self):
+        return 'Link: %s --> %s'%(self.url, self.path)
+
+    if not is_py3:
+        __unicode__ = __str__
+
+
+class IgnoreFile(Exception):
+
+    def __init__(self, msg, errno):
+        Exception.__init__(self, msg)
+        self.doesnt_exist = errno == gerrno.ENOENT
+        self.errno = errno
+
+
+class HTMLFile(object):
+
+    '''
+    Contains basic information about an HTML file. This
+    includes a list of links to other files as well as
+    the encoding of each file. Also tries to detect if the file is not a HTML
+    file in which case :member:`is_binary` is set to True.
+
+    The encoding of the file is available as :member:`encoding`.
+    '''
+
+    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
+    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
+    LINK_PAT  = re.compile(
+    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
+    re.DOTALL|re.IGNORECASE)
+
+    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
+        '''
+        :param level: The level of this file. Should be 0 for the root file.
+        :param encoding: Use `encoding` to decode HTML.
+        :param referrer: The :class:`HTMLFile` that first refers to this file.
+        '''
+        self.path     = unicode_path(path_to_html_file, abs=True)
+        self.title    = os.path.splitext(os.path.basename(self.path))[0]
+        self.base     = os.path.dirname(self.path)
+        self.level    = level
+        self.referrer = referrer
+        self.links    = []
+
+        try:
+            with open(self.path, 'rb') as f:
+                src = header = f.read(4096)
+                encoding = detect_xml_encoding(src)[1]
+                if encoding:
+                    try:
+                        header = header.decode(encoding)
+                    except ValueError:
+                        pass
+                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
+                if not self.is_binary:
+                    src += f.read()
+        except IOError as err:
+            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
+            if level == 0:
+                raise IOError(msg)
+            raise IgnoreFile(msg, err.errno)
+
+        if not src:
+            if level == 0:
+                raise ValueError('The file %s is empty'%self.path)
+            self.is_binary = True
+
+        if not self.is_binary:
+            if not encoding:
+                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
+                self.encoding = encoding
+            else:
+                self.encoding = encoding
+
+            src = src.decode(encoding, 'replace')
+            match = self.TITLE_PAT.search(src)
+            self.title = match.group(1) if match is not None else self.title
+            self.find_links(src)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __hash__(self):
+        return hash(self.path)
+
+    def __str__(self):
+        return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
+
+    def __repr__(self):
+        return unicode_type(self)
+
+    def find_links(self, src):
+        for match in self.LINK_PAT.finditer(src):
+            url = None
+            for i in ('url1', 'url2', 'url3'):
+                url = match.group(i)
+                if url:
+                    break
+            url = replace_entities(url)
+            try:
+                link = self.resolve(url)
+            except ValueError:
+                # Unparseable URL, ignore
+                continue
+            if link not in self.links:
+                self.links.append(link)
+
+    def resolve(self, url):
+        return Link(url, self.base)
+
+
+def depth_first(root, flat, visited=None):
+    yield root
+    if visited is None:
+        visited = set()
+    visited.add(root)
+    for link in root.links:
+        if link.path is not None and link not in visited:
+            try:
+                index = flat.index(link)
+            except ValueError:  # Can happen if max_levels is used
+                continue
+            hf = flat[index]
+            if hf not in visited:
+                yield hf
+                visited.add(hf)
+                for hf in depth_first(hf, flat, visited):
+                    if hf not in visited:
+                        yield hf
+                        visited.add(hf)
+
+
+def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
+    '''
+    Recursively traverse all links in the HTML file.
+
+    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
+                       implies that no links in the root HTML file are followed.
+    :param encoding:   Specify character encoding of HTML files. If `None` it is
+                       auto-detected.
+    :return:           A pair of lists (breadth_first, depth_first). Each list contains
+                       :class:`HTMLFile` objects.
+    '''
+    assert max_levels >= 0
+    level = 0
+    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    next_level = list(flat)
+    while level < max_levels and len(next_level) > 0:
+        level += 1
+        nl = []
+        for hf in next_level:
+            rejects = []
+            for link in hf.links:
+                if link.path is None or link.path in flat:
+                    continue
+                try:
+                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
+                    if nf.is_binary:
+                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
+                    nl.append(nf)
+                    flat.append(nf)
+                except IgnoreFile as err:
+                    rejects.append(link)
+                    if not err.doesnt_exist or verbose > 1:
+                        print(repr(err))
+            for link in rejects:
+                hf.links.remove(link)
+
+        next_level = list(nl)
+    orec = sys.getrecursionlimit()
+    sys.setrecursionlimit(500000)
+    try:
+        return flat, list(depth_first(flat[0], flat))
+    finally:
+        sys.setrecursionlimit(orec)
+
+
+def get_filelist(htmlfile, dir, opts, log):
+    '''
+    Build list of files referenced by html file or try to detect and use an
+    OPF file instead.
+    '''
+    log.info('Building file list...')
+    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                        verbose=opts.verbose,
+                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
+    if opts.verbose:
+        log.debug('\tFound files...')
+        for f in filelist:
+            log.debug('\t\t', f)
+    return filelist
--- a/ebook_converter/ebooks/html/to_zip.py
+++ b/ebook_converter/ebooks/html/to_zip.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import textwrap, os, glob
+
+from calibre.customize import FileTypePlugin
+from calibre.constants import numeric_version
+from polyglot.builtins import unicode_type
+
+
+class HTML2ZIP(FileTypePlugin):
+    name = 'HTML to ZIP'
+    author = 'Kovid Goyal'
+    description = textwrap.dedent(_('''\
+Follow all local links in an HTML file and create a ZIP \
+file containing all linked files. This plugin is run \
+every time you add an HTML file to the library.\
+'''))
+    version = numeric_version
+    file_types = {'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
+    supported_platforms = ['windows', 'osx', 'linux']
+    on_import = True
+
+    def run(self, htmlfile):
+        import codecs
+        from calibre import prints
+        from calibre.ptempfile import TemporaryDirectory
+        from calibre.gui2.convert.gui_conversion import gui_convert
+        from calibre.customize.conversion import OptionRecommendation
+        from calibre.ebooks.epub import initialize_container
+
+        with TemporaryDirectory('_plugin_html2zip') as tdir:
+            recs =[('debug_pipeline', tdir, OptionRecommendation.HIGH)]
+            recs.append(['keep_ligatures', True, OptionRecommendation.HIGH])
+            if self.site_customization and self.site_customization.strip():
+                sc = self.site_customization.strip()
+                enc, _, bf = sc.partition('|')
+                if enc:
+                    try:
+                        codecs.lookup(enc)
+                    except Exception:
+                        prints('Ignoring invalid input encoding for HTML:', enc)
+                    else:
+                        recs.append(['input_encoding', enc, OptionRecommendation.HIGH])
+                if bf == 'bf':
+                    recs.append(['breadth_first', True,
+                        OptionRecommendation.HIGH])
+            gui_convert(htmlfile, tdir, recs, abort_after_input_dump=True)
+            of = self.temporary_file('_plugin_html2zip.zip')
+            tdir = os.path.join(tdir, 'input')
+            opf = glob.glob(os.path.join(tdir, '*.opf'))[0]
+            ncx = glob.glob(os.path.join(tdir, '*.ncx'))
+            if ncx:
+                os.remove(ncx[0])
+            epub = initialize_container(of.name, os.path.basename(opf))
+            epub.add_dir(tdir)
+            epub.close()
+
+        return of.name
+
+    def customization_help(self, gui=False):
+        return _('Character encoding for the input HTML files. Common choices '
+        'include: cp1252, cp1251, latin1 and utf-8.')
+
+    def do_user_config(self, parent=None):
+        '''
+        This method shows a configuration dialog for this plugin. It returns
+        True if the user clicks OK, False otherwise. The changes are
+        automatically applied.
+        '''
+        from PyQt5.Qt import (QDialog, QDialogButtonBox, QVBoxLayout,
+                QLabel, Qt, QLineEdit, QCheckBox)
+
+        config_dialog = QDialog(parent)
+        button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
+        v = QVBoxLayout(config_dialog)
+
+        def size_dialog():
+            config_dialog.resize(config_dialog.sizeHint())
+
+        button_box.accepted.connect(config_dialog.accept)
+        button_box.rejected.connect(config_dialog.reject)
+        config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
+        from calibre.customize.ui import (plugin_customization,
+                customize_plugin)
+        help_text = self.customization_help(gui=True)
+        help_text = QLabel(help_text, config_dialog)
+        help_text.setWordWrap(True)
+        help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse | Qt.LinksAccessibleByKeyboard)
+        help_text.setOpenExternalLinks(True)
+        v.addWidget(help_text)
+        bf = QCheckBox(_('Add linked files in breadth first order'))
+        bf.setToolTip(_('Normally, when following links in HTML files'
+            ' calibre does it depth first, i.e. if file A links to B and '
+            ' C, but B links to D, the files are added in the order A, B, D, C. '
+            ' With this option, they will instead be added as A, B, C, D'))
+        sc = plugin_customization(self)
+        if not sc:
+            sc = ''
+        sc = sc.strip()
+        enc = sc.partition('|')[0]
+        bfs = sc.partition('|')[-1]
+        bf.setChecked(bfs == 'bf')
+        sc = QLineEdit(enc, config_dialog)
+        v.addWidget(sc)
+        v.addWidget(bf)
+        v.addWidget(button_box)
+        size_dialog()
+        config_dialog.exec_()
+
+        if config_dialog.result() == QDialog.Accepted:
+            sc = unicode_type(sc.text()).strip()
+            if bf.isChecked():
+                sc += '|bf'
+            customize_plugin(self, sc)
+
+        return config_dialog.result()
--- a/ebook_converter/ebooks/html_entities.py
+++ b/ebook_converter/ebooks/html_entities.py
--- a/ebook_converter/ebooks/lrf/init.py
+++ b/ebook_converter/ebooks/lrf/init.py
@@ -0,0 +1,115 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+"""
+This package contains logic to read and write LRF files.
+The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
+"""
+
+from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book
+from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \
+                                             TextStyle, BlockStyle
+from calibre.ebooks.lrf.fonts import FONT_FILE_MAP
+from calibre.ebooks import ConversionError
+
+__docformat__ = "epytext"
+
+
+class LRFParseError(Exception):
+    pass
+
+
+class PRS500_PROFILE(object):
+    # Device parameters for the 'prs500' profile. This class is the default
+    # ``profile`` of Book(), which derives the page and text style defaults
+    # from the screen, font and header values below.
+    screen_width  = 600
+    screen_height = 775
+    dpi           = 166
+    # Number of pixels to subtract from screen_height when calculating height of text area
+    fudge         = 0
+    font_size     = 10  #: Default (in pt)
+    parindent     = 10  #: Default (in pt)
+    line_space    = 1.2  #: Default (in pt)
+    header_font_size = 6  #: In pt
+    header_height    = 30  #: In px
+    # Font names used for a family when no custom font is available for it
+    default_fonts    = {'sans': "Swis721 BT Roman", 'mono': "Courier10 BT Roman",
+                         'serif': "Dutch801 Rm BT Roman"}
+
+    name = 'prs500'
+
+
+def find_custom_fonts(options, logger):
+    from calibre.utils.fonts.scanner import font_scanner
+    fonts = {'serif' : None, 'sans' : None, 'mono' : None}
+
+    def family(cmd):
+        return cmd.split(',')[-1].strip()
+    if options.serif_family:
+        f = family(options.serif_family)
+        fonts['serif'] = font_scanner.legacy_fonts_for_family(f)
+        if not fonts['serif']:
+            logger.warn('Unable to find serif family %s'%f)
+    if options.sans_family:
+        f = family(options.sans_family)
+        fonts['sans'] = font_scanner.legacy_fonts_for_family(f)
+        if not fonts['sans']:
+            logger.warn('Unable to find sans family %s'%f)
+    if options.mono_family:
+        f = family(options.mono_family)
+        fonts['mono'] = font_scanner.legacy_fonts_for_family(f)
+        if not fonts['mono']:
+            logger.warn('Unable to find mono family %s'%f)
+    return fonts
+
+
+def Book(options, logger, font_delta=0, header=None,
+         profile=PRS500_PROFILE, **settings):
+    from uuid import uuid4
+    ps = {}
+    ps['topmargin']      = options.top_margin
+    ps['evensidemargin'] = options.left_margin
+    ps['oddsidemargin']  = options.left_margin
+    ps['textwidth']      = profile.screen_width - (options.left_margin + options.right_margin)
+    ps['textheight']     = profile.screen_height - (options.top_margin + options.bottom_margin) \
+                                                 - profile.fudge
+    if header:
+        hdr = Header()
+        hb = TextBlock(textStyle=TextStyle(align='foot',
+                                           fontsize=int(profile.header_font_size*10)),
+                       blockStyle=BlockStyle(blockwidth=ps['textwidth']))
+        hb.append(header)
+        hdr.PutObj(hb)
+        ps['headheight'] = profile.header_height
+        ps['headsep']    = options.header_separation
+        ps['header']     = hdr
+        ps['topmargin']  = 0
+        ps['textheight'] = profile.screen_height - (options.bottom_margin + ps['topmargin']) \
+                                                 - ps['headheight'] - ps['headsep'] - profile.fudge
+
+    fontsize = int(10*profile.font_size+font_delta*20)
+    baselineskip = fontsize + 20
+    fonts = find_custom_fonts(options, logger)
+    tsd = dict(fontsize=fontsize,
+               parindent=int(10*profile.parindent),
+               linespace=int(10*profile.line_space),
+               baselineskip=baselineskip,
+               wordspace=10*options.wordspace)
+    if fonts['serif'] and 'normal' in fonts['serif']:
+        tsd['fontfacename'] = fonts['serif']['normal'][1]
+
+    book = _Book(textstyledefault=tsd,
+                pagestyledefault=ps,
+                blockstyledefault=dict(blockwidth=ps['textwidth']),
+                bookid=uuid4().hex,
+                **settings)
+    for family in fonts.keys():
+        if fonts[family]:
+            for font in fonts[family].values():
+                book.embed_font(*font)
+                FONT_FILE_MAP[font[1]] = font[0]
+
+    for family in ['serif', 'sans', 'mono']:
+        if not fonts[family]:
+            fonts[family] = {'normal' : (None, profile.default_fonts[family])}
+        elif 'normal' not in fonts[family]:
+            raise ConversionError('Could not find the normal version of the ' + family + ' font')
+    return book, fonts
--- a/ebook_converter/ebooks/lrf/fonts.py
+++ b/ebook_converter/ebooks/lrf/fonts.py
@@ -0,0 +1,33 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from PIL import ImageFont
+
+'''
+Default fonts used in the PRS500
+'''
+
+
+# Maps the names of the default PRS500 fonts to the base name of the
+# Liberation font file that get_font() loads for them
+LIBERATION_FONT_MAP = {
+            'Swis721 BT Roman'     : 'LiberationSans-Regular',
+            'Dutch801 Rm BT Roman' : 'LiberationSerif-Regular',
+            'Courier10 BT Roman'   : 'LiberationMono-Regular',
+            }
+
+# Maps a font name to the path of its font file. Starts out empty and is
+# consulted by get_font() for names that are not in LIBERATION_FONT_MAP.
+FONT_FILE_MAP = {}
+
+
+def get_font(name, size, encoding='unic'):
+    '''
+    Get an ImageFont object by name.
+    @param size: Font height in pixels. To convert from pts:
+                 sz in pixels = (dpi/72) * size in pts
+    @param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'
+    @param manager: A dict that will store the PersistentTemporary
+    '''
+    if name in LIBERATION_FONT_MAP:
+        return ImageFont.truetype(P('fonts/liberation/%s.ttf' % LIBERATION_FONT_MAP[name]), size, encoding=encoding)
+    elif name in FONT_FILE_MAP:
+        return ImageFont.truetype(FONT_FILE_MAP[name], size, encoding=encoding)
--- a/ebook_converter/ebooks/lrf/html/init.py
+++ b/ebook_converter/ebooks/lrf/html/init.py
@@ -0,0 +1,10 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+"""
+This package contains code to convert HTML ebooks to LRF ebooks.
+"""
+
+__docformat__ = "epytext"
+__author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
--- a/ebook_converter/ebooks/lrf/html/color_map.py
+++ b/ebook_converter/ebooks/lrf/html/color_map.py
@@ -0,0 +1,115 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+
+NAME_MAP = {
+             'aliceblue': '#F0F8FF',
+             'antiquewhite': '#FAEBD7',
+             'aqua': '#00FFFF',
+             'aquamarine': '#7FFFD4',
+             'azure': '#F0FFFF',
+             'beige': '#F5F5DC',
+             'bisque': '#FFE4C4',
+             'black': '#000000',
+             'blanchedalmond': '#FFEBCD',
+             'blue': '#0000FF',
+             'brown': '#A52A2A',
+             'burlywood': '#DEB887',
+             'cadetblue': '#5F9EA0',
+             'chartreuse': '#7FFF00',
+             'chocolate': '#D2691E',
+             'coral': '#FF7F50',
+             'crimson': '#DC143C',
+             'cyan': '#00FFFF',
+             'darkblue': '#00008B',
+             'darkgoldenrod': '#B8860B',
+             'darkgreen': '#006400',
+             'darkkhaki': '#BDB76B',
+             'darkmagenta': '#8B008B',
+             'darkolivegreen': '#556B2F',
+             'darkorange': '#FF8C00',
+             'darkorchid': '#9932CC',
+             'darkred': '#8B0000',
+             'darksalmon': '#E9967A',
+             'darkslateblue': '#483D8B',
+             'darkslategrey': '#2F4F4F',
+             'darkviolet': '#9400D3',
+             'deeppink': '#FF1493',
+             'dodgerblue': '#1E90FF',
+             'firebrick': '#B22222',
+             'floralwhite': '#FFFAF0',
+             'forestgreen': '#228B22',
+             'fuchsia': '#FF00FF',
+             'gainsboro': '#DCDCDC',
+             'ghostwhite': '#F8F8FF',
+             'gold': '#FFD700',
+             'goldenrod': '#DAA520',
+             'indianred ': '#CD5C5C',
+             'indigo  ': '#4B0082',
+             'khaki': '#F0E68C',
+             'lavenderblush': '#FFF0F5',
+             'lawngreen': '#7CFC00',
+             'lightblue': '#ADD8E6',
+             'lightcoral': '#F08080',
+             'lightgoldenrodyellow': '#FAFAD2',
+             'lightgray': '#D3D3D3',
+             'lightgrey': '#D3D3D3',
+             'lightskyblue': '#87CEFA',
+             'lightslategrey': '#778899',
+             'lightsteelblue': '#B0C4DE',
+             'lime': '#87CEFA',
+             'linen': '#FAF0E6',
+             'magenta': '#FF00FF',
+             'maroon': '#800000',
+             'mediumaquamarine': '#66CDAA',
+             'mediumblue': '#0000CD',
+             'mediumorchid': '#BA55D3',
+             'mediumpurple': '#9370D8',
+             'mediumseagreen': '#3CB371',
+             'mediumslateblue': '#7B68EE',
+             'midnightblue': '#191970',
+             'moccasin': '#FFE4B5',
+             'navajowhite': '#FFDEAD',
+             'navy': '#000080',
+             'oldlace': '#FDF5E6',
+             'olive': '#808000',
+             'orange': '#FFA500',
+             'orangered': '#FF4500',
+             'orchid': '#DA70D6',
+             'paleturquoise': '#AFEEEE',
+             'papayawhip': '#FFEFD5',
+             'peachpuff': '#FFDAB9',
+             'powderblue': '#B0E0E6',
+             'rosybrown': '#BC8F8F',
+             'royalblue': '#4169E1',
+             'saddlebrown': '#8B4513',
+             'sandybrown': '#8B4513',
+             'seashell': '#FFF5EE',
+             'sienna': '#A0522D',
+             'silver': '#C0C0C0',
+             'skyblue': '#87CEEB',
+             'slategrey': '#708090',
+             'snow': '#FFFAFA',
+             'springgreen': '#00FF7F',
+             'violet': '#EE82EE',
+             'yellowgreen': '#9ACD32'
+            }
+
+hex_pat = re.compile(r'#(\d{2})(\d{2})(\d{2})')
+rgb_pat = re.compile(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', re.IGNORECASE)
+
+
+def lrs_color(html_color):
+    hcol = html_color.lower()
+    match = hex_pat.search(hcol)
+    if match:
+        return '0x00'+match.group(1)+match.group(2)+match.group(3)
+    match = rgb_pat.search(hcol)
+    if match:
+        return '0x00'+hex(int(match.group(1)))[2:]+hex(int(match.group(2)))[2:]+hex(int(match.group(3)))[2:]
+    if hcol in NAME_MAP:
+        return NAME_MAP[hcol].replace('#', '0x00')
+    return '0x00000000'
--- a/ebook_converter/ebooks/lrf/html/convert_from.py
+++ b/ebook_converter/ebooks/lrf/html/convert_from.py
--- a/ebook_converter/ebooks/lrf/html/table.py
+++ b/ebook_converter/ebooks/lrf/html/table.py
@@ -0,0 +1,386 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+import math, sys, re, numbers
+
+from calibre.ebooks.lrf.fonts import get_font
+from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Text, CR, Span, \
+                                             CharButton, Plot, Paragraph, \
+                                             LrsTextTag
+from polyglot.builtins import string_or_bytes, range, native_string_type
+
+
+def ceil(num):
+    return int(math.ceil(num))
+
+
+def print_xml(elem):
+    from calibre.ebooks.lrf.pylrs.pylrs import ElementWriter
+    elem = elem.toElement(native_string_type('utf8'))
+    ew = ElementWriter(elem, sourceEncoding=native_string_type('utf8'))
+    ew.write(sys.stdout)
+    print()
+
+
+def cattrs(base, extra):
+    new = base.copy()
+    new.update(extra)
+    return new
+
+
+def tokens(tb):
+    '''
+    Generate ``(token, attrs)`` pairs for the contents of the text block ``tb``.
+
+    A token is one of:
+
+    1. The integer ``1``: a ``CR`` found directly in ``tb.contents``.
+    2. The integer ``2``: a ``CR`` found inside a paragraph.
+    3. A string: a run of text. ``attrs`` is then a dict with the style
+       attributes in effect for that text.
+    4. A ``Plot`` object.
+
+    ``attrs`` is ``None`` for the integer and ``Plot`` tokens. Items of
+    ``tb.contents`` that are neither ``CR`` nor ``Paragraph`` produce nothing.
+    '''
+    def process_element(x, attrs):
+        # Recursively yield the tokens of a single paragraph element ``x``;
+        # ``attrs`` are the style attributes inherited from its container.
+        if isinstance(x, CR):
+            yield 2, None
+        elif isinstance(x, Text):
+            yield x.text, cattrs(attrs, {})
+        elif isinstance(x, string_or_bytes):
+            yield x, cattrs(attrs, {})
+        elif isinstance(x, (CharButton, LrsTextTag)):
+            # Only the first child of these containers is looked at
+            if x.contents:
+                if hasattr(x.contents[0], 'text'):
+                    yield x.contents[0].text, cattrs(attrs, {})
+                elif hasattr(x.contents[0], 'attrs'):
+                    # The child's own attrs replace (not extend) the inherited ones
+                    for z in process_element(x.contents[0], x.contents[0].attrs):
+                        yield z
+        elif isinstance(x, Plot):
+            yield x, None
+        elif isinstance(x, Span):
+            # A span's attributes override the inherited ones for all its children
+            attrs = cattrs(attrs, x.attrs)
+            for y in x.contents:
+                for z in process_element(y, attrs):
+                    yield z
+
+    for i in tb.contents:
+        if isinstance(i, CR):
+            yield 1, None
+        elif isinstance(i, Paragraph):
+            for j in i.contents:
+                # Start from the element's own attrs when it has any
+                attrs = {}
+                if hasattr(j, 'attrs'):
+                    attrs = j.attrs
+                for k in process_element(j, attrs):
+                    yield k
+
+
+class Cell(object):
+
+    def __init__(self, conv, tag, css):
+        self.conv = conv
+        self.tag = tag
+        self.css  = css
+        self.text_blocks = []
+        self.pwidth = -1.
+        if tag.has_attr('width') and '%' in tag['width']:
+            try:
+                self.pwidth = float(tag['width'].replace('%', ''))
+            except ValueError:
+                pass
+        if 'width' in css and '%' in css['width']:
+            try:
+                self.pwidth = float(css['width'].replace('%', ''))
+            except ValueError:
+                pass
+        if self.pwidth > 100:
+            self.pwidth = -1
+        self.rowspan = self.colspan = 1
+        try:
+            self.colspan = int(tag['colspan']) if tag.has_attr('colspan') else 1
+            self.rowspan = int(tag['rowspan']) if tag.has_attr('rowspan') else 1
+        except:
+            pass
+
+        pp = conv.current_page
+        conv.book.allow_new_page = False
+        conv.current_page = conv.book.create_page()
+        conv.parse_tag(tag, css)
+        conv.end_current_block()
+        for item in conv.current_page.contents:
+            if isinstance(item, TextBlock):
+                self.text_blocks.append(item)
+        conv.current_page = pp
+        conv.book.allow_new_page = True
+        if not self.text_blocks:
+            tb = conv.book.create_text_block()
+            tb.Paragraph(' ')
+            self.text_blocks.append(tb)
+        for tb in self.text_blocks:
+            tb.parent = None
+            tb.objId  = 0
+            # Needed as we have to eventually change this BlockStyle's width and
+            # height attributes. This blockstyle may be shared with other
+            # elements, so doing that causes havoc.
+            tb.blockStyle = conv.book.create_block_style()
+            ts = conv.book.create_text_style(**tb.textStyle.attrs)
+            ts.attrs['parindent'] = 0
+            tb.textStyle = ts
+            if ts.attrs['align'] == 'foot':
+                if isinstance(tb.contents[-1], Paragraph):
+                    tb.contents[-1].append(' ')
+
+    def pts_to_pixels(self, pts):
+        pts = int(pts)
+        return ceil((float(self.conv.profile.dpi)/72)*(pts/10))
+
+    def minimum_width(self):
+        return max([self.minimum_tb_width(tb) for tb in self.text_blocks])
+
+    def minimum_tb_width(self, tb):
+        ts = tb.textStyle.attrs
+        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
+        parindent = self.pts_to_pixels(ts['parindent'])
+        mwidth = 0
+        for token, attrs in tokens(tb):
+            font = default_font
+            if isinstance(token, numbers.Integral):  # Handle para and line breaks
+                continue
+            if isinstance(token, Plot):
+                return self.pts_to_pixels(token.xsize)
+            ff = attrs.get('fontfacename', ts['fontfacename'])
+            fs = attrs.get('fontsize', ts['fontsize'])
+            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
+                font = get_font(ff, self.pts_to_pixels(fs))
+            if not token.strip():
+                continue
+            word = token.split()
+            word = word[0] if word else ""
+            width = font.getsize(word)[0]
+            if width > mwidth:
+                mwidth = width
+        return parindent + mwidth + 2
+
+    def text_block_size(self, tb, maxwidth=sys.maxsize, debug=False):
+        ts = tb.textStyle.attrs
+        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
+        parindent = self.pts_to_pixels(ts['parindent'])
+        top, bottom, left, right = 0, 0, parindent, parindent
+
+        def add_word(width, height, left, right, top, bottom, ls, ws):
+            if left + width > maxwidth:
+                left = width + ws
+                top += ls
+                bottom = top+ls if top+ls > bottom else bottom
+            else:
+                left += (width + ws)
+                right = left if left > right else right
+                bottom = top+ls if top+ls > bottom else bottom
+            return left, right, top, bottom
+
+        for token, attrs in tokens(tb):
+            if attrs is None:
+                attrs = {}
+            font = default_font
+            ls = self.pts_to_pixels(attrs.get('baselineskip', ts['baselineskip']))+\
+                 self.pts_to_pixels(attrs.get('linespace', ts['linespace']))
+            ws = self.pts_to_pixels(attrs.get('wordspace', ts['wordspace']))
+            if isinstance(token, numbers.Integral):  # Handle para and line breaks
+                if top != bottom:  # Previous element not a line break
+                    top = bottom
+                else:
+                    top += ls
+                    bottom += ls
+                left = parindent if int == 1 else 0
+                continue
+            if isinstance(token, Plot):
+                width, height = self.pts_to_pixels(token.xsize), self.pts_to_pixels(token.ysize)
+                left, right, top, bottom = add_word(width, height, left, right, top, bottom, height, ws)
+                continue
+            ff = attrs.get('fontfacename', ts['fontfacename'])
+            fs = attrs.get('fontsize', ts['fontsize'])
+            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
+                font = get_font(ff, self.pts_to_pixels(fs))
+            for word in token.split():
+                width, height = font.getsize(word)
+                left, right, top, bottom = add_word(width, height, left, right, top, bottom, ls, ws)
+        return right+3+max(parindent, 10), bottom
+
+    def text_block_preferred_width(self, tb, debug=False):
+        return self.text_block_size(tb, sys.maxsize, debug=debug)[0]
+
+    def preferred_width(self, debug=False):
+        return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks]))
+
+    def height(self, width):
+        return sum([self.text_block_size(i, width)[1] for i in self.text_blocks])
+
+
+class Row(object):
+
+    def __init__(self, conv, row, css, colpad):
+        self.cells = []
+        self.colpad = colpad
+        cells = row.findAll(re.compile('td|th', re.IGNORECASE))
+        self.targets = []
+        for cell in cells:
+            ccss = conv.tag_css(cell, css)[0]
+            self.cells.append(Cell(conv, cell, ccss))
+        for a in row.findAll(id=True) + row.findAll(name=True):
+            name = a['name'] if a.has_attr('name') else a['id'] if a.has_attr('id') else None
+            if name is not None:
+                self.targets.append(name.replace('#', ''))
+
+    def number_of_cells(self):
+        '''Number of cells in this row. Respects colspan'''
+        ans = 0
+        for cell in self.cells:
+            ans += cell.colspan
+        return ans
+
+    def height(self, widths):
+        i, heights = 0, []
+        for cell in self.cells:
+            width = sum(widths[i:i+cell.colspan])
+            heights.append(cell.height(width))
+            i += cell.colspan
+        if not heights:
+            return 0
+        return max(heights)
+
+    def cell_from_index(self, col):
+        i = -1
+        cell = None
+        for cell in self.cells:
+            for k in range(0, cell.colspan):
+                if i == col:
+                    break
+                i += 1
+            if i == col:
+                break
+        return cell
+
+    def minimum_width(self, col):
+        cell = self.cell_from_index(col)
+        if not cell:
+            return 0
+        return cell.minimum_width()
+
+    def preferred_width(self, col):
+        cell = self.cell_from_index(col)
+        if not cell:
+            return 0
+        return 0 if cell.colspan > 1 else cell.preferred_width()
+
+    def width_percent(self, col):
+        cell = self.cell_from_index(col)
+        if not cell:
+            return -1
+        return -1 if cell.colspan > 1 else cell.pwidth
+
+    def cell_iterator(self):
+        for c in self.cells:
+            yield c
+
+
+class Table(object):
+    '''
+    Layout model of an HTML table: a list of ``Row`` objects plus the logic to
+    compute column widths, row heights and the position of every text block.
+    '''
+
+    def __init__(self, conv, table, css, rowpad=10, colpad=10):
+        '''
+        :param conv:   The converter; used to compute row CSS and passed on to
+                       every ``Row``.
+        :param table:  The table tag; its ``tr`` descendants become the rows.
+        :param css:    CSS properties in effect for the table.
+        :param rowpad: Padding added between rows.
+        :param colpad: Padding added to every column width.
+        '''
+        self.rows = []
+        self.conv = conv
+        self.rowpad = rowpad
+        self.colpad = colpad
+        rows = table.findAll('tr')
+        # conv.in_table is True only while the rows are being built
+        conv.in_table = True
+        for row in rows:
+            rcss = conv.tag_css(row, css)[0]
+            self.rows.append(Row(conv, row, rcss, colpad))
+        conv.in_table = False
+
+    def number_of_columns(self):
+        '''Return the largest cell count (colspans included) of any row; 0 without rows.'''
+        max = 0
+        for row in self.rows:
+            max = row.number_of_cells() if row.number_of_cells() > max else max
+        return max
+
+    def number_or_rows(self):
+        '''Return the number of rows.'''
+        return len(self.rows)
+
+    def height(self, maxwidth):
+        '''
+        Return the sum of the row heights for a table laid out in ``maxwidth``,
+        with ``self.rowpad`` added between consecutive rows.
+        '''
+        widths = self.get_widths(maxwidth)
+        return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad
+
+    def minimum_width(self, col):
+        '''Return the largest minimum width any row reports for column ``col``.'''
+        return max([row.minimum_width(col) for row in self.rows])
+
+    def width_percent(self, col):
+        '''Return the largest percentage width any row reports for column ``col``.'''
+        return max([row.width_percent(col) for row in self.rows])
+
+    def get_widths(self, maxwidth):
+        '''
+        Return a list with the width of every column, each including
+        ``self.colpad``, for a table that should fit into ``maxwidth``.
+        '''
+        rows, cols = self.number_or_rows(), self.number_of_columns()
+        # Start with the largest preferred width found in each column
+        widths = list(range(cols))
+        for c in range(cols):
+            cellwidths = [0 for i in range(rows)]
+            for r in range(rows):
+                try:
+                    cellwidths[r] = self.rows[r].preferred_width(c)
+                except IndexError:
+                    continue
+            widths[c] = max(cellwidths)
+
+        # Columns with a percentage width get that share of the available
+        # width, but never less than their minimum width
+        min_widths = [self.minimum_width(i)+10 for i in range(cols)]
+        for i in range(len(widths)):
+            wp = self.width_percent(i)
+            if wp >= 0:
+                widths[i] = max(min_widths[i], ceil((wp/100) * (maxwidth - (cols-1)*self.colpad)))
+
+        itercount = 0
+
+        # While the table is too wide, shrink every column to 95% of its width
+        # unless that would take it below its minimum. Give up after 100 rounds.
+        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
+            for i in range(cols):
+                widths[i] = ceil((95/100)*widths[i]) if \
+                    ceil((95/100)*widths[i]) >= min_widths[i] else widths[i]
+            itercount += 1
+
+        return [i+self.colpad for i in widths]
+
+    def blocks(self, maxwidth, maxheight):
+        '''
+        Generate the layout of the table for the width ``maxwidth``.
+
+        For every row ``(None, 0, row_height, 0, targets)`` is yielded first,
+        followed by ``(text_block, xpos, ypos, delta, None)`` for every text
+        block of every cell in the row. ``delta`` is the horizontal space left
+        over once all columns are placed (never negative). Each yielded text
+        block gets a new block style with its final width and height.
+
+        ``maxheight`` is not used.
+        '''
+        rows, cols = self.number_or_rows(), self.number_of_columns()
+        # cellmatrix[r][c] holds the cell whose left edge is at column c of row r
+        cellmatrix = [[None for c in range(cols)] for r in range(rows)]
+        # rowpos[r] is the column at which the next cell of row r is placed
+        rowpos = [0 for i in range(rows)]
+        for r in range(rows):
+            nc = self.rows[r].cell_iterator()
+            try:
+                while True:
+                    cell = next(nc)
+                    cellmatrix[r][rowpos[r]] = cell
+                    rowpos[r] += cell.colspan
+                    # A cell spanning several rows shifts the start position
+                    # of the rows below it
+                    for k in range(1, cell.rowspan):
+                        try:
+                            rowpos[r+k] += 1
+                        except IndexError:
+                            break
+            except StopIteration:  # No more cells in this row
+                continue
+
+        widths = self.get_widths(maxwidth)
+        heights = [row.height(widths) for row in self.rows]
+
+        xpos = [sum(widths[:i]) for i in range(cols)]
+        delta = maxwidth - sum(widths)
+        if delta < 0:
+            delta = 0
+        for r in range(len(cellmatrix)):
+            yield None, 0, heights[r], 0, self.rows[r].targets
+            for c in range(len(cellmatrix[r])):
+                cell = cellmatrix[r][c]
+                if not cell:
+                    continue
+                # Usable width: the spanned columns without their padding
+                width = sum(widths[c:c+cell.colspan])-self.colpad*cell.colspan
+                sypos = 0
+                for tb in cell.text_blocks:
+                    tb.blockStyle = self.conv.book.create_block_style(
+                                    blockwidth=width,
+                                    blockheight=cell.text_block_size(tb, width)[1],
+                                    blockrule='horz-fixed')
+
+                    yield tb, xpos[c], sypos, delta, None
+                    # Stack the cell's text blocks vertically
+                    sypos += tb.blockStyle.attrs['blockheight']
--- a/ebook_converter/ebooks/lrf/pylrs/init.py
+++ b/ebook_converter/ebooks/lrf/pylrs/init.py
@@ -0,0 +1,7 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+"""
+This package contains code to generate ebooks in the SONY LRS/F format. It was
+originally developed by Mike Higgins and has been extended and modified by Kovid
+Goyal.
+"""
--- a/ebook_converter/ebooks/lrf/pylrs/elements.py
+++ b/ebook_converter/ebooks/lrf/pylrs/elements.py
@@ -0,0 +1,78 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+""" elements.py -- replacements and helpers for ElementTree """
+
+from polyglot.builtins import unicode_type, string_or_bytes
+
+
+class ElementWriter(object):
+
+    def __init__(self, e, header=False, sourceEncoding="ascii",
+                 spaceBeforeClose=True, outputEncodingName="UTF-16"):
+        self.header = header
+        self.e = e
+        self.sourceEncoding=sourceEncoding
+        self.spaceBeforeClose = spaceBeforeClose
+        self.outputEncodingName = outputEncodingName
+
+    def _encodeCdata(self, rawText):
+        if isinstance(rawText, bytes):
+            rawText = rawText.decode(self.sourceEncoding)
+
+        text = rawText.replace("&", "&amp;")
+        text = text.replace("<", "&lt;")
+        text = text.replace(">", "&gt;")
+        return text
+
+    def _writeAttribute(self, f, name, value):
+        f.write(' %s="' % unicode_type(name))
+        if not isinstance(value, string_or_bytes):
+            value = unicode_type(value)
+        value = self._encodeCdata(value)
+        value = value.replace('"', '&quot;')
+        f.write(value)
+        f.write('"')
+
+    def _writeText(self, f, rawText):
+        text = self._encodeCdata(rawText)
+        f.write(text)
+
+    def _write(self, f, e):
+        f.write('<' + unicode_type(e.tag))
+
+        attributes = e.items()
+        attributes.sort()
+        for name, value in attributes:
+            self._writeAttribute(f, name, value)
+
+        if e.text is not None or len(e) > 0:
+            f.write('>')
+
+            if e.text:
+                self._writeText(f, e.text)
+
+            for e2 in e:
+                self._write(f, e2)
+
+            f.write('</%s>' % e.tag)
+        else:
+            if self.spaceBeforeClose:
+                f.write(' ')
+            f.write('/>')
+
+        if e.tail is not None:
+            self._writeText(f, e.tail)
+
+    def toString(self):
+        class x:
+            pass
+        buffer = []
+        x.write = buffer.append
+        self.write(x)
+        return ''.join(buffer)
+
+    def write(self, f):
+        if self.header:
+            f.write('<?xml version="1.0" encoding="%s"?>\n' % self.outputEncodingName)
+
+        self._write(f, self.e)
--- a/ebook_converter/ebooks/lrf/pylrs/pylrf.py
+++ b/ebook_converter/ebooks/lrf/pylrs/pylrf.py
@@ -0,0 +1,773 @@
+#!/usr/bin/env python2
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+"""
+    pylrf.py -- very low level interface to create lrf files.  See pylrs for
+    higher level interface that can use this module to render books to lrf.
+"""
+import struct
+import zlib
+import io
+import codecs
+import os
+
+from .pylrfopt import tagListOptimizer
+from polyglot.builtins import iteritems, string_or_bytes, unicode_type
+
+PYLRF_VERSION = "1.0"
+
+#
+# Acknowledgement:
+#   This software would not have been possible without the pioneering
+#   efforts of the author of lrf2lrs.py, Igor Skochinsky.
+#
+# Copyright (c) 2007 Mike Higgins (Falstaff)
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+#
+# Change History:
+#
+# V1.0 06 Feb 2007
+# Initial Release.
+
+#
+# Current limitations and bugs:
+#   Never "scrambles" any streams (even if asked to).  This does not seem
+#   to hurt anything.
+#
+#   Not based on any official documentation, so many assumptions had to be made.
+#
+#   Can be used to create lrf files that can lock up an eBook reader.
+#   This is your only warning.
+#
+#   Unsupported objects: Canvas, Window, PopUpWindow, Sound, Import,
+#                        SoundStream, ObjectInfo
+#
+#   The only button type supported is JumpButton.
+#
+#   Unsupported tags: SoundStop, Wait, pos on BlockSpace (and those used by
+#                     unsupported objects).
+#
+#   Tags supporting Japanese text and Asian layout have not been tested.
+#
+#   Tested on Python 2.4 and 2.5, Windows XP and Sony PRS-500.
+#
+#   Commented even less than pylrs, but not very useful when called directly,
+#   anyway.
+#
+
+
+class LrfError(Exception):
+    pass
+
+
+def writeByte(f, byte):
+    f.write(struct.pack("<B", byte))
+
+
+def writeWord(f, word):
+    if int(word) > 65535:
+        raise LrfError('Cannot encode a number greater than 65535 in a word.')
+    if int(word) < 0:
+        raise LrfError('Cannot encode a number < 0 in a word: '+unicode_type(word))
+    f.write(struct.pack("<H", int(word)))
+
+
+def writeSignedWord(f, sword):
+    f.write(struct.pack("<h", int(float(sword))))
+
+
+def writeWords(f, *words):
+    f.write(struct.pack("<%dH" % len(words), *words))
+
+
+def writeDWord(f, dword):
+    f.write(struct.pack("<I", int(dword)))
+
+
+def writeDWords(f, *dwords):
+    f.write(struct.pack("<%dI" % len(dwords), *dwords))
+
+
+def writeQWord(f, qword):
+    f.write(struct.pack("<Q", qword))
+
+
+def writeZeros(f, nZeros):
+    f.write(b"\0" * nZeros)
+
+
+def writeString(f, s):
+    f.write(s)
+
+
+def writeIdList(f, idList):
+    writeWord(f, len(idList))
+    writeDWords(f, *idList)
+
+
+def writeColor(f, color):
+    # TODO: allow color names, web format
+    f.write(struct.pack(">I", int(color, 0)))
+
+
+def writeLineWidth(f, width):
+    writeWord(f, int(width))
+
+
+def writeUnicode(f, string, encoding):
+    if isinstance(string, bytes):
+        string = string.decode(encoding)
+    string = string.encode("utf-16-le")
+    length = len(string)
+    if length > 65535:
+        raise LrfError('Cannot write strings longer than 65535 characters.')
+    writeWord(f, length)
+    writeString(f, string)
+
+
+def writeRaw(f, string, encoding):
+    if isinstance(string, bytes):
+        string = string.decode(encoding)
+
+    string = string.encode("utf-16-le")
+    writeString(f, string)
+
+
+def writeRubyAA(f, rubyAA):
+    ralign, radjust = rubyAA
+    radjust = {"line-edge":0x10, "none":0}[radjust]
+    ralign = {"start":1, "center":2}[ralign]
+    writeWord(f, ralign | radjust)
+
+
+def writeBgImage(f, bgInfo):
+    imode, iid = bgInfo
+    imode = {"pfix": 0, "fix":1, "tile":2, "centering":3}[imode]
+    writeWord(f, imode)
+    writeDWord(f, iid)
+
+
+def writeEmpDots(f, dotsInfo, encoding):
+    refDotsFont, dotsFontName, dotsCode = dotsInfo
+    writeDWord(f, refDotsFont)
+    LrfTag("fontfacename", dotsFontName).write(f, encoding)
+    writeWord(f, int(dotsCode, 0))
+
+
+def writeRuledLine(f, lineInfo):
+    lineLength, lineType, lineWidth, lineColor = lineInfo
+    writeWord(f, lineLength)
+    writeWord(f, LINE_TYPE_ENCODING[lineType])
+    writeWord(f, lineWidth)
+    writeColor(f, lineColor)
+
+
+# The characters "LRF" followed by a NUL, each stored as a two byte
+# little-endian unit.
+# NOTE(review): presumably the magic bytes at the start of an LRF file;
+# confirm against the code that writes the file header.
+LRF_SIGNATURE = b"L\x00R\x00F\x00\x00\x00"
+
+# XOR_KEY = 48
+XOR_KEY = 65024  # that's what lrf2lrs says -- not used, anyway...
+
+LRF_VERSION = 1000  # is 999 for librie? lrf2lrs uses 1000
+
+# Numeric codes for the supported image formats (JPEG and JPG share a code)
+IMAGE_TYPE_ENCODING = dict(GIF=0x14, PNG=0x12, BMP=0x13, JPEG=0x11, JPG=0x11)
+
+# Maps object names to their numeric object type. Several names are aliases
+# for the same type, e.g. PageAtr/PageStyle. LrfObject rejects names that are
+# not listed here.
+OBJECT_TYPE_ENCODING = dict(
+        PageTree=0x01,
+        Page=0x02,
+        Header=0x03,
+        Footer=0x04,
+        PageAtr=0x05, PageStyle=0x05,
+        Block=0x06,
+        BlockAtr=0x07, BlockStyle=0x07,
+        MiniPage=0x08,
+        TextBlock=0x0A, Text=0x0A,
+        TextAtr=0x0B, TextStyle=0x0B,
+        ImageBlock=0x0C, Image=0x0C,
+        Canvas=0x0D,
+        ESound=0x0E,
+        ImageStream=0x11,
+        Import=0x12,
+        Button=0x13,
+        Window=0x14,
+        PopUpWindow=0x15,
+        Sound=0x16,
+        SoundStream=0x17,
+        Font=0x19,
+        ObjectInfo=0x1A,
+        BookAtr=0x1C, BookStyle=0x1C,
+        SimpleTextBlock=0x1D,
+        TOC=0x1E
+)
+
+# Numeric codes for line styles; used by writeRuledLine and by the
+# emplinetype and Box tags
+LINE_TYPE_ENCODING =  {
+        'none':0, 'solid':0x10, 'dashed':0x20, 'double':0x30, 'dotted':0x40
+}
+
+# Numeric codes for the binding direction names
+BINDING_DIRECTION_ENCODING = dict(Lr=1, Rl=16)
+
+
+# Describes every tag LrfTag knows about. Each value is a tuple whose first
+# item is the numeric tag id (0 means that no id is written) and whose
+# remaining items describe, in order, how the tag's parameter is written:
+#   * a dict translates the parameter into its encoded value,
+#   * a string is a struct format used to pack the parameter,
+#   * a callable is a writer function that is passed the output file and the
+#     parameter.
+# A tuple holding only the id describes a tag that takes no parameter.
+TAG_INFO = dict(
+        rawtext=(0, writeRaw),
+        ObjectStart=(0xF500, "<IH"),
+        ObjectEnd=(0xF501,),
+        # InfoLink (0xF502)
+        Link=(0xF503, "<I"),
+        StreamSize=(0xF504, writeDWord),
+        StreamData=(0xF505, writeString),
+        StreamEnd=(0xF506,),
+        oddheaderid=(0xF507, writeDWord),
+        evenheaderid=(0xF508, writeDWord),
+        oddfooterid=(0xF509, writeDWord),
+        evenfooterid=(0xF50A, writeDWord),
+        ObjectList=(0xF50B, writeIdList),
+        fontsize=(0xF511, writeSignedWord),
+        fontwidth=(0xF512, writeSignedWord),
+        fontescapement=(0xF513, writeSignedWord),
+        fontorientation=(0xF514, writeSignedWord),
+        fontweight=(0xF515, writeWord),
+        fontfacename=(0xF516, writeUnicode),
+        textcolor=(0xF517, writeColor),
+        textbgcolor=(0xF518, writeColor),
+        wordspace=(0xF519, writeSignedWord),
+        letterspace=(0xF51A, writeSignedWord),
+        baselineskip=(0xF51B, writeSignedWord),
+        linespace=(0xF51C, writeSignedWord),
+        parindent=(0xF51D, writeSignedWord),
+        parskip=(0xF51E, writeSignedWord),
+        # F51F, F520
+        topmargin=(0xF521, writeWord),
+        headheight=(0xF522, writeWord),
+        headsep=(0xF523, writeWord),
+        oddsidemargin=(0xF524, writeWord),
+        textheight=(0xF525, writeWord),
+        textwidth=(0xF526, writeWord),
+        canvaswidth=(0xF551, writeWord),
+        canvasheight=(0xF552, writeWord),
+        footspace=(0xF527, writeWord),
+        footheight=(0xF528, writeWord),
+        bgimage=(0xF529, writeBgImage),
+        setemptyview=(0xF52A, {'show':1, 'empty':0}, writeWord),
+        pageposition=(0xF52B, {'any':0,'upper':1, 'lower':2}, writeWord),
+        evensidemargin=(0xF52C, writeWord),
+        framemode=(0xF52E,
+                           {'None':0, 'curve':2, 'square':1}, writeWord),
+        blockwidth=(0xF531, writeWord),
+        blockheight=(0xF532, writeWord),
+        blockrule=(0xF533, {"horz-fixed":0x14, "horz-adjustable":0x12,
+                                 "vert-fixed":0x41, "vert-adjustable":0x21,
+                                 "block-fixed":0x44, "block-adjustable":0x22},
+                                 writeWord),
+        bgcolor=(0xF534, writeColor),
+        layout=(0xF535, {'TbRl':0x41, 'LrTb':0x34}, writeWord),
+        framewidth=(0xF536, writeWord),
+        framecolor=(0xF537, writeColor),
+        topskip=(0xF538, writeWord),
+        sidemargin=(0xF539, writeWord),
+        footskip=(0xF53A, writeWord),
+        align=(0xF53C, {'head':1, 'center':4, 'foot':8}, writeWord),
+        column=(0xF53D, writeWord),
+        columnsep=(0xF53E, writeSignedWord),
+        minipagewidth=(0xF541, writeWord),
+        minipageheight=(0xF542, writeWord),
+        yspace=(0xF546, writeWord),
+        xspace=(0xF547, writeWord),
+        PutObj=(0xF549, "<HHI"),
+        ImageRect=(0xF54A, "<HHHH"),
+        ImageSize=(0xF54B, "<HH"),
+        RefObjId=(0xF54C, "<I"),
+        PageDiv=(0xF54E, "<HIHI"),
+        StreamFlags=(0xF554, writeWord),
+        Comment=(0xF555, writeUnicode),
+        FontFilename=(0xF559, writeUnicode),
+        PageList=(0xF55C, writeIdList),
+        FontFacename=(0xF55D, writeUnicode),
+        buttonflags=(0xF561, writeWord),
+        PushButtonStart=(0xF566,),
+        PushButtonEnd=(0xF567,),
+        buttonactions=(0xF56A,),
+        endbuttonactions=(0xF56B,),
+        jumpto=(0xF56C, "<II"),
+        RuledLine=(0xF573, writeRuledLine),
+        rubyaa=(0xF575, writeRubyAA),
+        rubyoverhang=(0xF576, {'none':0, 'auto':1}, writeWord),
+        empdotsposition=(0xF577, {'before':1, 'after':2}, writeWord),
+        empdots=(0xF578, writeEmpDots),
+        emplineposition=(0xF579, {'before':1, 'after':2}, writeWord),
+        emplinetype=(0xF57A, LINE_TYPE_ENCODING, writeWord),
+        ChildPageTree=(0xF57B, "<I"),
+        ParentPageTree=(0xF57C, "<I"),
+        Italic=(0xF581,),
+        ItalicEnd=(0xF582,),
+        pstart=(0xF5A1, writeDWord),  # what goes in the dword? refesound
+        pend=(0xF5A2,),
+        CharButton=(0xF5A7, writeDWord),
+        CharButtonEnd=(0xF5A8,),
+        Rubi=(0xF5A9,),
+        RubiEnd=(0xF5AA,),
+        Oyamoji=(0xF5AB,),
+        OyamojiEnd=(0xF5AC,),
+        Rubimoji=(0xF5AD,),
+        RubimojiEnd=(0xF5AE,),
+        Yoko=(0xF5B1,),
+        YokoEnd=(0xF5B2,),
+        Tate=(0xF5B3,),
+        TateEnd=(0xF5B4,),
+        Nekase=(0xF5B5,),
+        NekaseEnd=(0xF5B6,),
+        Sup=(0xF5B7,),
+        SupEnd=(0xF5B8,),
+        Sub=(0xF5B9,),
+        SubEnd=(0xF5BA,),
+        NoBR=(0xF5BB,),
+        NoBREnd=(0xF5BC,),
+        EmpDots=(0xF5BD,),
+        EmpDotsEnd=(0xF5BE,),
+        EmpLine=(0xF5C1,),
+        EmpLineEnd=(0xF5C2,),
+        DrawChar=(0xF5C3, '<H'),
+        DrawCharEnd=(0xF5C4,),
+        Box=(0xF5C6, LINE_TYPE_ENCODING, writeWord),
+        BoxEnd=(0xF5C7,),
+        Space=(0xF5CA, writeSignedWord),
+        textstring=(0xF5CC, writeUnicode),
+        Plot=(0xF5D1, "<HHII"),
+        CR=(0xF5D2,),
+        RegisterFont=(0xF5D8, writeDWord),
+        setwaitprop=(0xF5DA, {'replay':1, 'noreplay':2}, writeWord),
+        charspace=(0xF5DD, writeSignedWord),
+        textlinewidth=(0xF5F1, writeLineWidth),
+        linecolor=(0xF5F2, writeColor)
+    )
+
+
+class ObjectTableEntry(object):
+
+    def __init__(self, objId, offset, size):
+        self.objId = objId
+        self.offset = offset
+        self.size = size
+
+    def write(self, f):
+        writeDWords(f, self.objId, self.offset, self.size, 0)
+
+
+class LrfTag(object):
+
+    def __init__(self, name, *parameters):
+        try:
+            tagInfo = TAG_INFO[name]
+        except KeyError:
+            raise LrfError("tag name %s not recognized" % name)
+
+        self.name = name
+        self.type = tagInfo[0]
+        self.format = tagInfo[1:]
+
+        if len(parameters) > 1:
+            raise LrfError("only one parameter allowed on tag %s" % name)
+
+        if len(parameters) == 0:
+            self.parameter = None
+        else:
+            self.parameter = parameters[0]
+
+    def write(self, lrf, encoding=None):
+        if self.type != 0:
+            writeWord(lrf, self.type)
+
+        p = self.parameter
+        if p is None:
+            return
+
+        # print "   Writing tag", self.name
+        for f in self.format:
+            if isinstance(f, dict):
+                p = f[p]
+            elif isinstance(f, string_or_bytes):
+                if isinstance(p, tuple):
+                    writeString(lrf, struct.pack(f, *p))
+                else:
+                    writeString(lrf, struct.pack(f, p))
+            else:
+                if f in [writeUnicode, writeRaw, writeEmpDots]:
+                    if encoding is None:
+                        raise LrfError("Tag requires encoding")
+                    f(lrf, p, encoding)
+                else:
+                    f(lrf, p)
+
+
+# Stream flag bits, see LrfStreamBase.getStreamTags
+STREAM_SCRAMBLED = 0x200  # stream is scrambled (scrambling is not implemented)
+STREAM_COMPRESSED = 0x100  # stream data is zlib compressed
+# Like STREAM_COMPRESSED, but the stream stays compressed even when
+# compression does not make it smaller
+STREAM_FORCE_COMPRESSED = 0x8100
+STREAM_TOC = 0x0051  # NOTE(review): presumably the flags of a TOC stream; not used in this part of the file
+
+
+class LrfStreamBase(object):
+    """A chunk of stream data together with the flags describing it."""
+
+    def __init__(self, streamFlags, streamData=None):
+        self.streamFlags = streamFlags
+        self.streamData = streamData
+
+    def setStreamData(self, streamData):
+        """Replace the payload of this stream."""
+        self.streamData = streamData
+
+    def getStreamTags(self, optimize=False):
+        """Return the list of tags that represents this stream.
+
+        The list holds, in order, the StreamFlags, StreamSize, StreamData
+        and StreamEnd tags.
+
+        When the STREAM_COMPRESSED bit is set the payload is zlib-compressed
+        and prefixed with its uncompressed length (little-endian 32 bit).
+        With ``optimize`` true, compression is dropped (and the flag
+        cleared) if it would not make the stream smaller, unless the flags
+        contain STREAM_FORCE_COMPRESSED.
+
+        Scrambling (0x200) is not implemented.
+        """
+        flags = self.streamFlags
+        payload = self.streamData
+
+        forced = (flags & STREAM_FORCE_COMPRESSED) == STREAM_FORCE_COMPRESSED
+        if forced:
+            optimize = False
+
+        if (flags & STREAM_COMPRESSED) == STREAM_COMPRESSED:
+            rawSize = len(payload)
+            packed = zlib.compress(payload)
+            # the 4 accounts for the length prefix added to compressed data
+            pointless = optimize and rawSize <= len(packed) + 4
+            if pointless:
+                flags &= ~STREAM_COMPRESSED
+            else:
+                payload = struct.pack("<I", rawSize) + packed
+
+        flagsTag = LrfTag("StreamFlags", flags & 0x01FF)
+        sizeTag = LrfTag("StreamSize", len(payload))
+        dataTag = LrfTag("StreamData", payload)
+        endTag = LrfTag("StreamEnd")
+        return [flagsTag, sizeTag, dataTag, endTag]
+
+
+class LrfTagStream(LrfStreamBase):
+    """Stream whose payload is the serialised form of a list of tags."""
+
+    def __init__(self, streamFlags, streamTags=None):
+        LrfStreamBase.__init__(self, streamFlags)
+        # work on a copy so that appending here never touches the caller's list
+        self.tags = [] if streamTags is None else streamTags[:]
+
+    def appendLrfTag(self, tag):
+        """Add ``tag`` to the end of the stream."""
+        self.tags.append(tag)
+
+    def getStreamTags(self, encoding,
+            optimizeTags=False, optimizeCompression=False):
+        """Serialise the tags and return the stream tags for the result.
+
+        ``encoding`` is passed to every tag's ``write``.  ``optimizeTags``
+        runs ``tagListOptimizer`` over the tag list first (modifying it in
+        place); ``optimizeCompression`` is forwarded to
+        ``LrfStreamBase.getStreamTags`` as ``optimize``.
+        """
+        if optimizeTags:
+            tagListOptimizer(self.tags)
+
+        buf = io.BytesIO()
+        for tag in self.tags:
+            tag.write(buf, encoding)
+        self.streamData = buf.getvalue()
+        buf.close()
+
+        return LrfStreamBase.getStreamTags(self, optimize=optimizeCompression)
+
+
+class LrfFileStream(LrfStreamBase):
+    """Stream whose payload is the raw content of a file."""
+
+    def __init__(self, streamFlags, filename):
+        LrfStreamBase.__init__(self, streamFlags)
+        with open(filename, "rb") as source:
+            content = source.read()
+        self.streamData = content
+
+
+class LrfObject(object):
+    """An LRF object: a name, an object id, a type code and a list of tags.
+
+    Raises ``LrfError`` when ``objId`` is not positive or when ``name`` is
+    not a key of ``OBJECT_TYPE_ENCODING``.
+    """
+
+    def __init__(self, name, objId):
+        if objId <= 0:
+            raise LrfError("invalid objId for " + name)
+
+        self.name = name
+        self.objId = objId
+        self.tags = []
+        try:
+            self.type = OBJECT_TYPE_ENCODING[name]
+        except KeyError:
+            raise LrfError("object name %s not recognized" % name)
+
+    def __str__(self):
+        return ", ".join(('LRFObject: ' + self.name, unicode_type(self.objId)))
+
+    def appendLrfTag(self, tag):
+        """Add a single tag to the object."""
+        self.tags.append(tag)
+
+    def appendLrfTags(self, tagList):
+        """Add every tag of ``tagList`` to the object."""
+        self.tags.extend(tagList)
+
+    # deprecated old name
+    append = appendLrfTag
+
+    def appendTagDict(self, tagDict, genClass=None):
+        """Append one tag per entry of ``tagDict``.
+
+        Most entries become ``LrfTag(name, value)`` directly.  The ruby,
+        background-image and emphasis-dots settings are collected first and
+        emitted afterwards as the combined "rubyaa", "bgimage" and "empdots"
+        tags, with defaults filled in for the parts that are missing.  The
+        'rubyAlignAndAdjust' key is ignored.
+
+        ``genClass`` only matters for the background image: for "PageStyle"
+        the mode "fix" is written as "pfix".
+        """
+        compositeNames = {
+                "bgimagemode", "bgimageid", "rubyalign", "rubyadjust",
+                "empdotscode", "empdotsfontname", "refempdotsfont"}
+        composites = {}
+        for name, value in iteritems(tagDict):
+            if name == 'rubyAlignAndAdjust':
+                continue
+            if name in compositeNames:
+                composites[name] = value
+                continue
+            self.append(LrfTag(name, value))
+
+        def given(*names):
+            # true if at least one of the parts was supplied
+            return any(part in composites for part in names)
+
+        if given("rubyalign", "rubyadjust"):
+            rubyPair = (composites.get("rubyalign", "none"),
+                        composites.get("rubyadjust", "start"))
+            self.append(LrfTag("rubyaa", rubyPair))
+
+        if given("bgimagemode", "bgimageid"):
+            imode = composites.get("bgimagemode", "fix")
+            iid = composites.get("bgimageid", 0)
+
+            # for some reason, page style uses 0 for "fix"
+            # we call this pfix to differentiate it
+            if imode == "fix" and genClass == "PageStyle":
+                imode = "pfix"
+
+            self.append(LrfTag("bgimage", (imode, iid)))
+
+        if given("empdotscode", "empdotsfontname", "refempdotsfont"):
+            dotscode = composites.get("empdotscode", "0x002E")
+            dotsfontname = composites.get("empdotsfontname",
+                    "Dutch801 Rm BT Roman")
+            refdotsfont = composites.get("refempdotsfont", 0)
+            self.append(LrfTag("empdots",
+                (refdotsfont, dotsfontname, dotscode)))
+
+    def write(self, lrf, encoding=None):
+        """Write the object: ObjectStart, every tag in order, ObjectEnd."""
+        opening = LrfTag("ObjectStart", (self.objId, self.type))
+        opening.write(lrf)
+
+        for tag in self.tags:
+            tag.write(lrf, encoding)
+
+        closing = LrfTag("ObjectEnd")
+        closing.write(lrf)
+
+
+class LrfToc(LrfObject):
+    """
+        Table of contents object.
+
+        ``toc`` is a sequence of entries of the form
+        (pageid, objid, string); ``se`` is the source encoding handed to
+        writeUnicode for the labels.
+    """
+
+    def __init__(self, objId, toc, se):
+        LrfObject.__init__(self, "TOC", objId)
+        self._makeStreamTags(self._makeTocStream(toc, se))
+
+    def _makeStreamTags(self, streamData):
+        """Wrap ``streamData`` in a STREAM_TOC stream and add its tags."""
+        tocStream = LrfStreamBase(STREAM_TOC, streamData)
+        self.tags.extend(tocStream.getStreamTags())
+
+    def _makeTocStream(self, toc, se):
+        """Build and return the binary TOC stream.
+
+        Layout: the entry count, one offset per entry (the first is always
+        0, and a single 0 is written even for an empty toc), then for every
+        entry its page id, object id and label.  Raises ``LrfError`` for an
+        entry whose page id or object id is not positive.
+        """
+        out = io.BytesIO()
+        writeDWord(out, len(toc))
+
+        # offset table: each offset is the running total of entry sizes
+        offset = 0
+        writeDWord(out, offset)
+        for pageId, objId, label in toc[:-1]:
+            # two dwords, then 2 bytes plus 2 bytes per label character
+            # (presumably writeUnicode's length prefix and UTF-16 data)
+            offset += 4 + 4 + 2 + len(label)*2
+            writeDWord(out, offset)
+
+        for pageId, objId, label in toc:
+            if pageId <= 0:
+                raise LrfError("page id invalid in toc: " + label)
+            if objId <= 0:
+                raise LrfError("textblock id invalid in toc: " + label)
+
+            writeDWord(out, pageId)
+            writeDWord(out, objId)
+            writeUnicode(out, label, se)
+
+        data = out.getvalue()
+        out.close()
+        return data
+
+
+class LrfWriter(object):
+    """Collects LRF objects and header settings and writes an LRF file.
+
+    Typical use: create the writer, ``append`` the objects, call
+    ``setRootObject`` (and optionally ``setTocObject`` /
+    ``setThumbnailFile``), then ``writeFile`` with a seekable binary file.
+    """
+
+    def __init__(self, sourceEncoding):
+        self.sourceEncoding = sourceEncoding
+
+        # The following flags are just to have a place to remember these
+        # values.  The flags must still be passed to the appropriate classes
+        # in order to have them work.
+
+        self.saveStreamTags = False  # used only in testing -- hogs memory
+
+        # highly experimental -- set to True at your own risk
+        self.optimizeTags = False
+        self.optimizeCompression = False
+
+        # End of placeholders
+
+        self.rootObjId = 0
+        self.rootObj = None
+        # 0 means "not set", like rootObjId and tocObjId; without this
+        # default getPageTreeId() raised AttributeError before the setter ran
+        self.pageTreeId = 0
+        self.binding = 1  # 1=front to back, 16=back to front
+        self.dpi = 1600
+        self.width = 600
+        self.height = 800
+        self.colorDepth = 24
+        self.tocObjId = 0
+        self.docInfoXml = ""
+        self.thumbnailEncoding = "JPEG"
+        self.thumbnailData = b""
+        self.objects = []
+        self.objectTable = []
+
+    def getSourceEncoding(self):
+        return self.sourceEncoding
+
+    def toUnicode(self, string):
+        """Decode ``string`` with the source encoding if it is bytes."""
+        if isinstance(string, bytes):
+            string = string.decode(self.sourceEncoding)
+
+        return string
+
+    def getDocInfoXml(self):
+        return self.docInfoXml
+
+    def setPageTreeId(self, objId):
+        self.pageTreeId = objId
+
+    def getPageTreeId(self):
+        """Return the page tree id, or 0 if none has been set."""
+        return self.pageTreeId
+
+    def setRootObject(self, obj):
+        """Remember ``obj`` as the root object.
+
+        Raises ``LrfError`` if a root object has already been set.
+        """
+        if self.rootObjId != 0:
+            raise LrfError("root object already set")
+
+        self.rootObjId = obj.objId
+        self.rootObj = obj
+
+    def registerFontId(self, id):
+        """Append a RegisterFont tag for ``id`` to the root object.
+
+        Raises ``LrfError`` if no root object has been set.
+        """
+        if self.rootObj is None:
+            raise LrfError("can't register font -- no root object")
+
+        self.rootObj.append(LrfTag("RegisterFont", id))
+
+    def setTocObject(self, obj):
+        """Remember the id of the TOC object.
+
+        Raises ``LrfError`` if a TOC object has already been set.
+        """
+        if self.tocObjId != 0:
+            raise LrfError("toc object already set")
+
+        self.tocObjId = obj.objId
+
+    def setThumbnailFile(self, filename, encoding=None):
+        """Load the thumbnail image from ``filename``.
+
+        ``encoding`` names the image type; when None it is taken from the
+        file extension.  It is upper-cased and must be a key of
+        ``IMAGE_TYPE_ENCODING``, otherwise ``LrfError`` is raised and the
+        previously stored thumbnail is left untouched.
+        """
+        with open(filename, "rb") as f:
+            data = f.read()
+
+        if encoding is None:
+            encoding = os.path.splitext(filename)[1][1:]
+
+        encoding = encoding.upper()
+        if encoding not in IMAGE_TYPE_ENCODING:
+            raise LrfError("unknown image type: " + encoding)
+
+        # commit data and encoding together so they can never disagree
+        self.thumbnailData = data
+        self.thumbnailEncoding = encoding
+
+    def append(self, obj):
+        """Add an object to the list written by ``writeFile``."""
+        self.objects.append(obj)
+
+    def addLrfObject(self, objId):
+        # intentionally a no-op
+        pass
+
+    def writeFile(self, lrf):
+        """Write the complete file to the seekable binary stream ``lrf``.
+
+        Raises ``LrfError`` if no root object has been set.
+        """
+        if self.rootObjId == 0:
+            raise LrfError("no root object has been set")
+
+        self.writeHeader(lrf)
+        self.writeObjects(lrf)
+        self.updateObjectTableOffset(lrf)
+        self.updateTocObjectOffset(lrf)
+        self.writeObjectTable(lrf)
+
+    def writeHeader(self, lrf):
+        """Write the file header, the compressed doc info and the thumbnail.
+
+        The object table offset (0x18) and the TOC object offset (0x48) are
+        written as 0 here and patched later by ``updateObjectTableOffset``
+        and ``updateTocObjectOffset``.
+        """
+        writeString(lrf, LRF_SIGNATURE)
+        writeWord(lrf, LRF_VERSION)
+        writeWord(lrf, XOR_KEY)
+        writeDWord(lrf, self.rootObjId)
+        writeQWord(lrf, len(self.objects))
+        writeQWord(lrf, 0)  # 0x18 objectTableOffset -- will be updated
+        writeZeros(lrf, 4)  # 0x20 unknown
+        writeWord(lrf, self.binding)
+        writeDWord(lrf, self.dpi)
+        writeWords(lrf, self.width, self.height, self.colorDepth)
+        writeZeros(lrf, 20)  # 0x30 unknown
+        writeDWord(lrf, self.tocObjId)
+        writeDWord(lrf, 0)  # 0x48 tocObjectOffset -- will be updated
+        docInfoXml = codecs.BOM_UTF8 + self.docInfoXml.encode("utf-8")
+        compDocInfo = zlib.compress(docInfoXml)
+        # the stored size includes the 4-byte uncompressed-length field below
+        writeWord(lrf, len(compDocInfo) + 4)
+        writeWord(lrf, IMAGE_TYPE_ENCODING[self.thumbnailEncoding])
+        writeDWord(lrf, len(self.thumbnailData))
+        writeDWord(lrf, len(docInfoXml))
+        writeString(lrf, compDocInfo)
+        writeString(lrf, self.thumbnailData)
+
+    def writeObjects(self, lrf):
+        """Write every object and rebuild the object table alongside."""
+        # also appends object entries to the object table
+        self.objectTable = []
+        for obj in self.objects:
+            objStart = lrf.tell()
+            obj.write(lrf, self.sourceEncoding)
+            objEnd = lrf.tell()
+            self.objectTable.append(
+                    ObjectTableEntry(obj.objId, objStart, objEnd-objStart))
+
+    def updateObjectTableOffset(self, lrf):
+        """Patch the header with the current position as table offset."""
+        # update the offset of the object table
+        tableOffset = lrf.tell()
+        lrf.seek(0x18, 0)
+        writeQWord(lrf, tableOffset)
+        lrf.seek(0, 2)
+
+    def updateTocObjectOffset(self, lrf):
+        """Patch the header with the file offset of the TOC object.
+
+        Does nothing when no TOC object was set.  Raises ``LrfError`` if the
+        TOC object id is not present in the object table.
+        """
+        if self.tocObjId == 0:
+            return
+
+        for entry in self.objectTable:
+            if entry.objId == self.tocObjId:
+                lrf.seek(0x48, 0)
+                writeDWord(lrf, entry.offset)
+                lrf.seek(0, 2)
+                break
+        else:
+            raise LrfError("toc object not in object table")
+
+    def writeObjectTable(self, lrf):
+        """Write every object table entry at the current position."""
+        for tableEntry in self.objectTable:
+            tableEntry.write(lrf)
--- a/ebook_converter/ebooks/lrf/pylrs/pylrfopt.py
+++ b/ebook_converter/ebooks/lrf/pylrs/pylrfopt.py
@@ -0,0 +1,44 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+def _optimize(tagList, tagName, conversion):
+    """Strip redundant ``tagName`` settings from ``tagList`` in place.
+
+    Three passes are made:
+
+    1. of several ``tagName`` tags with no "rawtext" tag between them only
+       the last one is kept;
+    2. a ``tagName`` tag is dropped when the previous ``tagName`` tag
+       already set the same value (values compared after ``conversion``);
+    3. ``tagName`` tags at the very end of the list are dropped.
+    """
+    # pass 1: look only at the setting and the text tags
+    relevant = [t for t in tagList
+                if t.name == tagName or t.name == "rawtext"]
+    for current, following in zip(relevant, relevant[1:]):
+        if current.name == tagName and following.name == tagName:
+            tagList.remove(current)
+
+    # pass 2: compare each remaining setting with the one before it
+    settings = [t for t in tagList if t.name == tagName]
+    for current, following in zip(settings, settings[1:]):
+        value = conversion(current.parameter)
+        nextValue = conversion(following.parameter)
+        if value == nextValue:
+            tagList.remove(following)
+
+    # pass 3: settings with nothing after them have no effect
+    while tagList and tagList[-1].name == tagName:
+        tagList.pop()
+
+
+def tagListOptimizer(tagList):
+    """Remove redundant fontsize/fontweight tags from ``tagList`` in place.
+
+    Settings that are changed again before any text is output, or that
+    repeat the value already in effect, are dropped.  For example
+
+        fontsize=100, fontsize=200, text, fontsize=100, fontsize=200
+
+    becomes
+
+        fontsize=200, text
+
+    Returns the number of tags that were removed.
+    """
+    before = len(tagList)
+    for settingName in ("fontsize", "fontweight"):
+        _optimize(tagList, settingName, int)
+    return before - len(tagList)
--- a/ebook_converter/ebooks/lrf/pylrs/pylrs.py
+++ b/ebook_converter/ebooks/lrf/pylrs/pylrs.py
--- a/ebook_converter/ebooks/metadata/init.py
+++ b/ebook_converter/ebooks/metadata/init.py
@@ -0,0 +1,440 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+"""
+Provides an abstraction for reading and writing metadata in a variety of ebook formats.
+"""
+import os, sys, re
+
+from calibre import relpath, guess_type, prints, force_unicode
+from calibre.utils.config_base import tweaks
+from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, itervalues, as_unicode
+from polyglot.urllib import quote, unquote, urlparse
+
+
+# Pattern that string_to_authors() uses to find the separators between author
+# names.  It is compiled from the 'authors_split_regex' tweak; if that fails a
+# message is printed and a built-in default is used instead (" and " or
+# " with ", optionally preceded by a comma, case-insensitive).
+try:
+    _author_pat = re.compile(tweaks['authors_split_regex'])
+except Exception:
+    prints('Author split regexp:', tweaks['authors_split_regex'],
+            'is invalid, using default')
+    _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
+
+
+def string_to_authors(raw):
+    """Split an author string into a list of author names.
+
+    Names are separated by '&' or by whatever ``_author_pat`` matches.  A
+    doubled '&&' stands for a literal '&' inside a name.  Surrounding
+    whitespace is stripped and empty names are dropped; a false ``raw``
+    gives an empty list.
+    """
+    if not raw:
+        return []
+    placeholder = '\uffff'
+    # hide escaped ampersands while the string is being split
+    protected = raw.replace('&&', placeholder)
+    normalised = _author_pat.sub('&', protected)
+    authors = []
+    for piece in normalised.split('&'):
+        name = piece.strip().replace(placeholder, '&')
+        if name:
+            authors.append(name)
+    return authors
+
+
+def authors_to_string(authors):
+    """Join author names with ' & ', the inverse of string_to_authors().
+
+    A literal '&' inside a name is escaped as '&&'; empty names are
+    skipped.  ``None`` gives an empty string.
+    """
+    if authors is None:
+        return ''
+    escaped = (name.replace('&', '&&') for name in authors if name)
+    return ' & '.join(escaped)
+
+
+def remove_bracketed_text(src, brackets=None):
+    """Return ``src`` without the brackets and the text inside them.
+
+    ``brackets`` maps opening to closing characters and defaults to round,
+    square and curly brackets.  Nesting is tracked per bracket type, and a
+    closing bracket without a matching opening one is simply dropped.
+    """
+    from collections import Counter
+    if brackets is None:
+        brackets = {'(': ')', '[': ']', '{': '}'}
+    open_counts = Counter()
+    kept = []
+    src = force_unicode(src)
+    openers = {close: opener for opener, close in iteritems(brackets)}
+    for char in src:
+        if char in brackets:
+            open_counts[char] += 1
+            continue
+        if char in openers:
+            opener = openers[char]
+            if open_counts[opener] > 0:
+                open_counts[opener] -= 1
+            continue
+        # ordinary character: keep it only outside of every bracket
+        if sum(itervalues(open_counts)) < 1:
+            kept.append(char)
+    return ''.join(kept)
+
+
+def author_to_author_sort(author, method=None):
+    """Return the sort form of an author name, e.g. "Doe, John".
+
+    ``method`` defaults to the 'author_sort_copy_method' tweak:
+
+    * 'copy' returns ``author`` unchanged (also forced when the name
+      contains one of the 'author_name_copywords');
+    * 'comma' returns ``author`` unchanged if the name already contains a
+      comma;
+    * 'nocomma' moves the last name to the front without adding a comma;
+    * anything else moves the last name to the front followed by a comma.
+
+    Bracketed text and the prefixes from 'author_name_prefixes' are left
+    out of the result; suffixes from 'author_name_suffixes' are moved to
+    the end.  ``author`` is returned unchanged when fewer than two words
+    remain to work with; a false ``author`` gives ''.
+    """
+    if not author:
+        return ''
+    tokens = remove_bracketed_text(author).strip().split()
+    if len(tokens) < 2:
+        return author
+    if method is None:
+        method = tweaks['author_sort_copy_method']
+
+    lowered = frozenset(t.lower() for t in tokens)
+    copy_words = frozenset(w.lower() for w in tweaks['author_name_copywords'])
+    if lowered.intersection(copy_words) or method == 'copy':
+        return author
+
+    def with_dotted(words):
+        # every word is accepted with and without a trailing period
+        plain = {force_unicode(w).lower() for w in words}
+        return plain | {w+'.' for w in plain}
+
+    prefixes = with_dotted(tweaks['author_name_prefixes'])
+    while tokens and tokens[0].lower() in prefixes:
+        tokens = tokens[1:]
+    if not tokens:
+        return author
+
+    suffixes = with_dotted(tweaks['author_name_suffixes'])
+    suffix = ''
+    while tokens and tokens[-1].lower() in suffixes:
+        suffix = tokens[-1] + ' ' + suffix
+        tokens = tokens[:-1]
+    if not tokens:
+        return author
+    suffix = suffix.strip()
+
+    if method == 'comma' and ',' in ''.join(tokens):
+        return author
+
+    surname, given = tokens[-1], tokens[:-1]
+    if given and method != 'nocomma':
+        surname += ','
+    parts = [surname] + given
+    if suffix:
+        parts.append(suffix)
+    return ' '.join(parts)
+
+
+def authors_to_sort_string(authors):
+    """Return the sort forms of ``authors`` joined with ' & '."""
+    return ' & '.join(author_to_author_sort(name) for name in authors)
+
+
+# Cache of compiled patterns, keyed by the lang argument that was passed to
+# get_title_sort_pat() (None included).
+_title_pats = {}
+
+
+def get_title_sort_pat(lang=None):
+    """Return the compiled pattern matching a leading article for ``lang``.
+
+    With ``lang`` None the 'default_language_for_title_sort' tweak is used,
+    falling back to ``get_lang()``.  The articles come from the
+    'per_language_title_sort_articles' tweak; the 'eng' entry, and finally
+    a built-in English list, are used when nothing usable is found.  The
+    result is case-insensitive, anchored at the start of the string and
+    cached in ``_title_pats`` under the ``lang`` that was passed in.
+    """
+    ans = _title_pats.get(lang, None)
+    if ans is not None:
+        return ans
+    q = lang
+    from calibre.utils.localization import canonicalize_lang, get_lang
+    if lang is None:
+        q = tweaks['default_language_for_title_sort']
+        if q is None:
+            q = get_lang()
+    q = canonicalize_lang(q) if q else q
+    data = tweaks['per_language_title_sort_articles']
+    try:
+        ans = data.get(q, None)
+    except AttributeError:
+        ans = None  # invalid tweak value
+    try:
+        ans = frozenset(ans) if ans else frozenset(data['eng'])
+    except Exception:
+        # unusable tweak data; Exception (not a bare except) so that
+        # KeyboardInterrupt and SystemExit still propagate
+        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
+    ans = '|'.join(ans)
+    ans = '^(%s)'%ans
+    try:
+        ans = re.compile(ans, re.IGNORECASE)
+    except Exception:
+        # the configured articles do not form a valid regular expression
+        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
+    _title_pats[lang] = ans
+    return ans
+
+
+# Characters that title_sort() drops when a title starts with one of them:
+# the ASCII single and double quote, the code points U+2018 to U+201D and
+# the prime marks U+2032 and U+2033.
+_ignore_starts = '\'"'+''.join(codepoint_to_chr(x) for x in
+        list(range(0x2018, 0x201e))+[0x2032, 0x2033])
+
+
+def title_sort(title, order=None, lang=None):
+    """Return the sort form of ``title``, e.g. "Hobbit, The".
+
+    ``order`` defaults to the 'title_series_sorting' tweak; with
+    'strictly_alphabetic' the stripped title is returned as is.  Otherwise
+    one leading quote character is dropped and a leading article (as
+    matched by ``get_title_sort_pat(lang)``) is moved to the end after a
+    comma.
+    """
+    if order is None:
+        order = tweaks['title_series_sorting']
+    title = title.strip()
+    if order == 'strictly_alphabetic':
+        return title
+
+    def without_leading_quote(text):
+        if text and text[0] in _ignore_starts:
+            return text[1:]
+        return text
+
+    title = without_leading_quote(title)
+    match = get_title_sort_pat(lang).search(title)
+    if not match:
+        return title.strip()
+    try:
+        prep = match.group(1)
+    except IndexError:
+        # the pattern has no group to take the article from
+        return title.strip()
+    title = without_leading_quote(title[len(prep):] + ', ' + prep)
+    return title.strip()
+
+
+# (value, numeral) pairs in descending order of value; roman() walks this
+# list and repeatedly subtracts the largest value that still fits.
+coding = list(zip(
+[1000,900,500,400,100,90,50,40,10,9,5,4,1],
+["M","CM","D","CD","C","XC","L","XL","X","IX","V","IV","I"]
+))
+
+
+def roman(num):
+    """Return ``num`` as a roman numeral string.
+
+    Only whole numbers from 1 to 3999 are converted; any other value is
+    returned as its plain text form.
+    """
+    if num <= 0 or num >= 4000 or int(num) != num:
+        return unicode_type(num)
+    remaining = num
+    numerals = []
+    for value, numeral in coding:
+        # greedy: use the largest numeral as often as it fits
+        while remaining >= value:
+            remaining -= value
+            numerals.append(numeral)
+    return ''.join(numerals)
+
+
+def fmt_sidx(i, fmt='%.2f', use_roman=False):
+    """Format a series index for display.
+
+    ``None`` and '' count as 1.  Whole numbers are shown without decimals
+    (as a roman numeral when ``use_roman`` is true); everything else is
+    formatted with ``fmt``.  A value that cannot be converted to a number
+    is returned as text instead of raising.
+    """
+    if i is None or i == '':
+        i = 1
+    try:
+        i = float(i)
+    except (TypeError, ValueError):
+        # ValueError: a string such as 'abc' that is not a number
+        return unicode_type(i)
+    try:
+        whole = int(i) == i
+    except (OverflowError, ValueError):
+        # infinity and NaN have no integer value
+        whole = False
+    if whole:
+        return roman(int(i)) if use_roman else '%d'%int(i)
+    return fmt%i
+
+
+class Resource(object):
+
+    '''
+    Represents a resource (usually a file on the filesystem or a URL pointing
+    to the web). Such resources are commonly referred to in OPF files.
+
+    They have the interface:
+
+    :member:`path`
+    :member:`mime_type`
+    :method:`href`
+
+    '''
+
+    def __init__(self, href_or_path, basedir=None, is_path=True):
+        '''
+        `href_or_path`: A filesystem path if `is_path` is True, otherwise an href.
+        `basedir`: Directory that relative paths are resolved against. If None,
+        the current working directory at the time of the call is used.
+
+        An href with a scheme other than `file` is kept as is and `path` stays
+        None; any other href is converted to an absolute `path` plus `fragment`.
+        '''
+        if basedir is None:
+            # looked up per call: a getcwd() default in the signature would be
+            # evaluated only once, when the module is imported
+            basedir = getcwd()
+        self._href = None
+        self._basedir = basedir
+        self.path = None
+        self.fragment = ''
+        try:
+            self.mime_type = guess_type(href_or_path)[0]
+        except Exception:
+            self.mime_type = None
+        if self.mime_type is None:
+            self.mime_type = 'application/octet-stream'
+        if is_path:
+            path = href_or_path
+            if not os.path.isabs(path):
+                path = os.path.abspath(os.path.join(basedir, path))
+            if isinstance(path, bytes):
+                path = path.decode(sys.getfilesystemencoding())
+            self.path = path
+        else:
+            url = urlparse(href_or_path)
+            if url[0] not in ('', 'file'):
+                self._href = href_or_path
+            else:
+                pc = url[2]
+                if isinstance(pc, unicode_type):
+                    pc = pc.encode('utf-8')
+                pc = unquote(pc).decode('utf-8')
+                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
+                self.fragment = unquote(url[-1])
+
+    def href(self, basedir=None):
+        '''
+        Return a URL pointing to this resource. If it is a file on the filesystem
+        the URL is relative to `basedir`.
+
+        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
+        If this resource has no basedir, then the current working directory is used as the basedir.
+        '''
+        if basedir is None:
+            if self._basedir:
+                basedir = self._basedir
+            else:
+                basedir = getcwd()
+        if self.path is None:
+            return self._href
+        f = self.fragment.encode('utf-8') if isinstance(self.fragment, unicode_type) else self.fragment
+        frag = '#'+as_unicode(quote(f)) if self.fragment else ''
+        if self.path == basedir:
+            return ''+frag
+        try:
+            rpath = relpath(self.path, basedir)
+        except OSError:  # On windows path and basedir could be on different drives
+            rpath = self.path
+        # Normalise the separators before encoding: replacing text separators
+        # in an already encoded (bytes) path raises TypeError on Python 3
+        if isinstance(rpath, unicode_type):
+            rpath = rpath.replace(os.sep, '/').encode('utf-8')
+        else:
+            rpath = rpath.replace(os.sep.encode('ascii'), b'/')
+        return as_unicode(quote(rpath))+frag
+
+    def set_basedir(self, path):
+        self._basedir = path
+
+    def basedir(self):
+        return self._basedir
+
+    def __repr__(self):
+        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))
+
+
+class ResourceCollection(object):
+
+    '''
+    An ordered, list-like collection of :class:`Resource` objects.
+    '''
+
+    def __init__(self):
+        self._resources = []
+
+    def __iter__(self):
+        for r in self._resources:
+            yield r
+
+    def __len__(self):
+        return len(self._resources)
+
+    def __getitem__(self, index):
+        return self._resources[index]
+
+    def __bool__(self):
+        return len(self._resources) > 0
+
+    def __str__(self):
+        resources = map(repr, self)
+        return '[%s]'%', '.join(resources)
+
+    def __repr__(self):
+        return unicode_type(self)
+
+    def append(self, resource):
+        'Add `resource`; raises ValueError if it is not a Resource'
+        if not isinstance(resource, Resource):
+            raise ValueError('Can only append objects of type Resource')
+        self._resources.append(resource)
+
+    def remove(self, resource):
+        self._resources.remove(resource)
+
+    def replace(self, start, end, items):
+        'Same as list[start:end] = items'
+        self._resources[start:end] = items
+
+    @staticmethod
+    def from_directory_contents(top, topdown=True):
+        '''
+        Return a collection with one resource for every file found below the
+        directory `top`. The basedir of each resource is `top`; `topdown` is
+        passed on to :func:`os.walk`.
+        '''
+        collection = ResourceCollection()
+        # os.walk yields (dirpath, dirnames, filenames); the resources are the
+        # files, so each file name is joined with its own directory
+        for dirpath, dirnames, filenames in os.walk(top, topdown=topdown):
+            for fname in filenames:
+                path = os.path.abspath(os.path.join(dirpath, fname))
+                collection.append(Resource(path, basedir=top))
+        return collection
+
+    def set_basedir(self, path):
+        for res in self:
+            res.set_basedir(path)
+
+
+def MetaInformation(title, authors=(_('Unknown'),)):
+    ''' Build a Metadata object; kept for backwards compatibility.
+        @param title: title or ``_('Unknown')`` or a MetaInformation object
+        @param authors: List of strings or []
+
+        If ``title`` is an object that has both ``title`` and ``authors``
+        attributes, the two values are taken from it and the object itself
+        is handed to Metadata as ``other``.
+    '''
+    from calibre.ebooks.metadata.book.base import Metadata
+    is_metadata = hasattr(title, 'title') and hasattr(title, 'authors')
+    if not is_metadata:
+        return Metadata(title, authors, other=None)
+    source = title
+    title = source.title
+    authors = source.authors
+    return Metadata(title, authors, other=source)
+
+
+def check_isbn10(isbn):
+    """Return ``isbn`` if its ISBN-10 check digit is correct, else None.
+
+    ``isbn`` is expected to be ten characters, already stripped of
+    separators.  Any malformed input gives None.
+    """
+    try:
+        # weights 1..9 for the first nine digits
+        weighted = sum(
+            weight * int(char) for weight, char in enumerate(isbn[:9], 1))
+        check = weighted % 11
+        last = isbn[9]
+        if check == 10 and last == 'X':
+            return isbn
+        if check == int(last):
+            return isbn
+    except Exception:
+        pass
+    return None
+
+
+def check_isbn13(isbn):
+    """Return ``isbn`` if its ISBN-13 check digit is correct, else None.
+
+    ``isbn`` is expected to be thirteen characters, already stripped of
+    separators.  Any malformed input gives None.
+    """
+    try:
+        digits = [int(char) for char in isbn[:12]]
+        total = 0
+        for position in range(12):
+            # weights alternate 1, 3, 1, 3, ...
+            total += digits[position] * (3 if position % 2 else 1)
+        check = (10 - total % 10) % 10
+        if unicode_type(check) == isbn[12]:
+            return isbn
+    except Exception:
+        pass
+    return None
+
+
+def check_isbn(isbn):
+    """Return the normalised ISBN if ``isbn`` is valid, else None.
+
+    Everything except digits and 'X' is removed (after upper-casing)
+    before the ISBN-10 or ISBN-13 check is applied.  Numbers consisting of
+    one repeated digit are rejected.
+    """
+    if not isbn:
+        return None
+    isbn = re.sub(r'[^0-9X]', '', isbn.upper())
+    if re.match(r'(\d)\1{9,12}$', isbn) is not None:
+        return None
+    length = len(isbn)
+    if length == 10:
+        return check_isbn10(isbn)
+    return check_isbn13(isbn) if length == 13 else None
+
+
+def check_issn(issn):
+    """Return the normalised ISSN if its check digit is correct, else None.
+
+    Everything except digits and 'X' is removed (after upper-casing).  The
+    first seven digits are weighted 8 down to 2; the check character is 'X'
+    for a check value of 10 and the digit itself otherwise.
+    """
+    if not issn:
+        return None
+    issn = re.sub(r'[^0-9X]', '', issn.upper())
+    try:
+        digits = tuple(map(int, issn[:7]))
+        products = [(8 - i) * d for i, d in enumerate(digits)]
+        # the final % 11 turns 11 into 0: when the weighted sum is a multiple
+        # of 11 the check digit is 0, which was previously never accepted
+        check = (11 - sum(products) % 11) % 11
+        if (check == 10 and issn[7] == 'X') or check == int(issn[7]):
+            return issn
+    except Exception:
+        pass
+    return None
+
+
+def format_isbn(isbn):
+    """Return ``isbn`` split into hyphen-separated groups.
+
+    A value that is not a valid ISBN is returned unchanged.  The group
+    sizes are fixed: 2-4-3-1 for ISBN-10 and 3-2-4-3-1 for ISBN-13.
+    """
+    valid = check_isbn(isbn)
+    if not valid:
+        return isbn
+    if len(valid) == 10:
+        groups = (valid[:2], valid[2:6], valid[6:9], valid[9])
+    else:
+        groups = (valid[:3], valid[3:5], valid[5:9], valid[9:12], valid[12])
+    return '-'.join(groups)
+
+
+def check_doi(doi):
+    '''Check if something that looks like a DOI is present anywhere in the string
+
+    Returns the first match ("10.", a registrant code of four to nine
+    digits, "/", then everything up to the next whitespace) or None.
+    '''
+    if not doi:
+        return None
+    # registrant codes are not limited to four digits (e.g. 10.48550/...)
+    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
+    if doi_check is not None:
+        return doi_check.group()
+    return None
+
+
+def rating_to_stars(value, allow_half_stars=False, star='★', half='½'):
+    """Render a rating from 0 to 10 as a string of stars.
+
+    Two rating points make one ``star``.  An odd rating adds ``half`` when
+    ``allow_half_stars`` is true and is rounded down otherwise.  Values
+    outside 0..10 are clamped and a false ``value`` counts as 0.
+    """
+    rating = int(value or 0)
+    rating = min(max(rating, 0), 10)
+    whole, odd = divmod(rating, 2)
+    stars = star * whole
+    if odd and allow_half_stars:
+        stars += half
+    return stars
--- a/ebook_converter/ebooks/metadata/archive.py
+++ b/ebook_converter/ebooks/metadata/archive.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+from contextlib import closing
+
+from calibre.customize import FileTypePlugin
+from calibre.utils.localization import canonicalize_lang
+from polyglot.builtins import filter, unicode_type
+
+
+def is_comic(list_of_names):
+    """Return True if every file extension in the list is an image type.
+
+    Names without a dot and files called thumbs.db are ignored, so a list
+    with nothing else in it also counts as a comic.
+    """
+    comic_extensions = {'jpg', 'jpeg', 'png'}
+    extensions = set()
+    for name in list_of_names:
+        if '.' not in name:
+            continue
+        if name.lower().rpartition('/')[-1] == 'thumbs.db':
+            continue
+        extensions.add(name.rpartition('.')[-1].lower())
+    return extensions.issubset(comic_extensions)
+
+
+def archive_type(stream):
+    """Return 'zip', 'rar' or None depending on the first bytes of ``stream``.
+
+    Four bytes are read from the current position.  Afterwards the stream
+    is moved back to where it was (or to 0 if the position could not be
+    determined); a stream that cannot seek is left as it is.
+    """
+    from calibre.utils.zipfile import stringFileHeader
+    try:
+        pos = stream.tell()
+    except Exception:
+        # Exception rather than a bare except, so that KeyboardInterrupt
+        # and SystemExit are not swallowed
+        pos = 0
+    id_ = stream.read(4)
+    ans = None
+    if id_ == stringFileHeader:
+        ans = 'zip'
+    elif id_.startswith(b'Rar'):
+        ans = 'rar'
+    try:
+        stream.seek(pos)
+    except Exception:
+        pass
+    return ans
+
+
+class KPFExtract(FileTypePlugin):
+
+    name = 'KPF Extract'
+    author = 'Kovid Goyal'
+    description = _('Extract the source DOCX file from Amazon Kindle Create KPF files.'
+            ' Note this will not contain any edits made in the Kindle Create program itself.')
+    file_types = {'kpf'}
+    supported_platforms = ['windows', 'osx', 'linux']
+    on_import = True
+
+    def run(self, archive):
+        """Return the path of the DOCX file extracted from ``archive``.
+
+        The first member whose name ends in .docx (ignoring case) is
+        copied to a temporary file.  If the archive holds no such member,
+        ``archive`` itself is returned.
+        """
+        from calibre.utils.zipfile import ZipFile
+        with ZipFile(archive, 'r') as zf:
+            docx = next(
+                (n for n in zf.namelist() if n.lower().endswith('.docx')),
+                None)
+            if docx is None:
+                return archive
+            of = self.temporary_file('_kpf_extract.docx')
+            with closing(of):
+                of.write(zf.read(docx))
+        return of.name
+
+
+class ArchiveExtract(FileTypePlugin):
+    name = 'Archive Extract'
+    author = 'Kovid Goyal'
+    description = _('Extract common e-book formats from archive files '
+        '(ZIP/RAR). Also try to autodetect if they are actually '
+        'CBZ/CBR files.')
+    file_types = {'zip', 'rar'}
+    supported_platforms = ['windows', 'osx', 'linux']
+    on_import = True
+
+    def run(self, archive):
+        """Return the path of the file that should be imported for ``archive``.
+
+        * An archive that contains only images is copied to a temporary
+          .cbz/.cbr file.
+        * An archive that contains exactly one file of a known e-book type
+          has that file extracted to a temporary file.
+        * In every other case ``archive`` itself is returned.
+        """
+        from calibre.utils.zipfile import ZipFile
+        is_rar = archive.lower().endswith('.rar')
+        if is_rar:
+            from calibre.utils.unrar import extract_member, names
+
+            def read_member(fname):
+                return extract_member(archive, match=None, name=fname)[1]
+
+            return self._extract(
+                archive, True, list(names(archive)), read_member)
+
+        # the context manager closes the zip file on every exit path
+        with ZipFile(archive, 'r') as zf:
+            return self._extract(archive, False, zf.namelist(), zf.read)
+
+    @staticmethod
+    def _fname_ok(fname):
+        # True for members that count as content (no junk/metadata files)
+        bn = os.path.basename(fname).lower()
+        if bn == 'thumbs.db':
+            return False
+        if '.' not in bn:
+            return False
+        if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
+            return False
+        if '__MACOSX' in fname.split('/'):
+            return False
+        return True
+
+    def _extract(self, archive, is_rar, fnames, read_member):
+        # Decide what to import given the member names of the archive;
+        # read_member(name) must return the content of one member.
+        fnames = list(filter(self._fname_ok, fnames))
+        if is_comic(fnames):
+            import shutil
+            ext = '.cbr' if is_rar else '.cbz'
+            of = self.temporary_file('_archive_extract'+ext)
+            with closing(of):
+                with open(archive, 'rb') as f:
+                    # copied in chunks, the archive is never fully in memory
+                    shutil.copyfileobj(f, of)
+            return of.name
+        if len(fnames) != 1:
+            return archive
+        fname = fnames[0]
+        ext = os.path.splitext(fname)[1][1:]
+        if ext.lower() not in {
+                'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
+                'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'}:
+            return archive
+
+        of = self.temporary_file('_archive_extract.'+ext)
+        with closing(of):
+            of.write(read_member(fname))
+        return of.name
+
+
+def get_comic_book_info(d, mi, series_index='volume'):
+    """Copy the fields of a ComicBookInfo dictionary ``d`` onto ``mi``.
+
+    Handles series (with its index), language, rating, title, publisher,
+    tags, authors (from the writer/artist/cartoonist/creator credits),
+    comments and publication date.  ``series_index`` names the key used
+    for the series index ('volume' or 'issue'); the other one is the
+    fallback.  Missing, empty or null values leave ``mi`` untouched.
+    """
+    # See http://code.google.com/p/comicbookinfo/wiki/Example
+    series = d.get('series') or ''
+    if series.strip():
+        mi.series = series
+        si = d.get(series_index, None)
+        if si is None:
+            si = d.get('issue' if series_index == 'volume' else 'volume', None)
+        if si is not None:
+            try:
+                mi.series_index = float(si)
+            except Exception:
+                mi.series_index = 1
+    # read the same key that is tested; the value used to be read from 'lang'
+    language = d.get('language', None)
+    if language:
+        lang = canonicalize_lang(language)
+        if lang:
+            mi.languages = [lang]
+    rating = d.get('rating', -1)
+    if rating is not None and rating > -1:
+        mi.rating = rating
+    for x in ('title', 'publisher'):
+        y = (d.get(x) or '').strip()
+        if y:
+            setattr(mi, x, y)
+    tags = d.get('tags', [])
+    if tags:
+        mi.tags = tags
+    authors = []
+    for credit in d.get('credits') or []:
+        if credit.get('role', '') in ('Writer', 'Artist', 'Cartoonist',
+                'Creator'):
+            x = credit.get('person', '')
+            if x:
+                # "Last, First" -> "First Last"
+                x = ' '.join((reversed(x.split(', '))))
+                authors.append(x)
+    if authors:
+        mi.authors = authors
+    comments = d.get('comments', '')
+    if comments and comments.strip():
+        mi.comments = comments.strip()
+    pubm, puby = d.get('publicationMonth', None), d.get('publicationYear', None)
+    if puby is not None:
+        from calibre.utils.date import parse_only_date
+        from datetime import date
+        try:
+            # no day is available, so the middle of the month is used
+            dt = date(puby, 6 if pubm is None else pubm, 15)
+            dt = parse_only_date(unicode_type(dt))
+            mi.pubdate = dt
+        except Exception:
+            pass
+
+
+def parse_comic_comment(comment, series_index='volume'):
+    """Parse an archive comment holding ComicBookInfo JSON into metadata.
+
+    The first top-level key that starts with 'ComicBookInfo' is handed to
+    ``get_comic_book_info``.  A fresh, otherwise empty metadata object is
+    returned in any case.
+    """
+    # See http://code.google.com/p/comicbookinfo/wiki/Example
+    from calibre.ebooks.metadata import MetaInformation
+    import json
+    mi = MetaInformation(None, None)
+    parsed = json.loads(comment)
+    if not isinstance(parsed, dict):
+        return mi
+    key = next((k for k in parsed if k.startswith('ComicBookInfo')), None)
+    if key is not None:
+        get_comic_book_info(parsed[key], mi, series_index=series_index)
+    return mi
+
+
+def get_comic_metadata(stream, stream_type, series_index='volume'):
+    """Read the metadata stored in the archive comment of a comic file.
+
+    ``stream_type`` is 'cbz' or 'cbr'; for any other type, and for an
+    archive without a comment, empty metadata is returned.
+    """
+    if stream_type == 'cbz':
+        from calibre.utils.zipfile import ZipFile
+        comment = ZipFile(stream).comment
+    elif stream_type == 'cbr':
+        from calibre.utils.unrar import comment as get_comment
+        comment = get_comment(stream)
+    else:
+        comment = None
+
+    if not comment:
+        # an empty JSON object parses to metadata without any fields set
+        comment = b'{}'
+    return parse_comic_comment(comment, series_index=series_index)
--- a/ebook_converter/ebooks/metadata/book/init.py
+++ b/ebook_converter/ebooks/metadata/book/init.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+All fields must have a NULL value represented as None for simple types,
+an empty list/dictionary for complex types and (None, None) for cover_data
+'''
+
+SOCIAL_METADATA_FIELDS = frozenset({
+    'tags',             # Ordered list
+    'rating',           # A floating point number between 0 and 10
+    'comments',         # A simple HTML enabled string
+    'series',           # A simple string
+    'series_index',     # A floating point number
+    # Of the form { scheme1:value1, scheme2:value2}
+    # For example: {'isbn':'123456789', 'doi':'xxxx', ... }
+    'identifiers',
+})
+
+# Names that are read from / written to the identifiers dict when they are
+# used as top-level fields in get and set.
+TOP_LEVEL_IDENTIFIERS = frozenset({
+    'isbn',
+})
+
+PUBLICATION_METADATA_FIELDS = frozenset({
+    'title',            # title must never be None. Should be _('Unknown')
+    # Pseudo field that can be set, but if not set is auto generated
+    # from title and languages
+    'title_sort',
+    'authors',          # Ordered list. Must never be None, can be [_('Unknown')]
+    'author_sort_map',  # Map of sort strings for each author
+    # Pseudo field that can be set, but if not set is auto generated
+    # from authors and languages
+    'author_sort',
+    'book_producer',
+    'timestamp',        # Dates and times must be timezone aware
+    'pubdate',
+    'last_modified',
+    'rights',
+    # So far only known publication type is periodical:calibre
+    # If None, means book
+    'publication_type',
+    'uuid',             # A UUID usually of type 4
+    'languages',        # ordered list of languages in this publication
+    'publisher',        # Simple string, no special semantics
+    # Absolute path to image file encoded in filesystem_encoding
+    'cover',
+    # Of the form (format, data) where format is, for e.g. 'jpeg', 'png', 'gif'...
+    'cover_data',
+    # Either thumbnail data, or an object with the attribute
+    # image_path which is the path to an image file, encoded
+    # in filesystem_encoding
+    'thumbnail',
+})
+
+BOOK_STRUCTURE_FIELDS = frozenset({
+    # These are used by code, Null values are None.
+    'toc', 'spine', 'guide', 'manifest',
+})
+
+USER_METADATA_FIELDS = frozenset({
+    # A dict of dicts similar to field_metadata. Each field description dict
+    # also contains a value field with the key #value#.
+    'user_metadata',
+})
+
+DEVICE_METADATA_FIELDS = frozenset({
+    'device_collections',   # Ordered list of strings
+    'lpath',                # Unicode, / separated
+    'size',                 # In bytes
+    'mime',                 # Mimetype of the book file being represented
+})
+
+CALIBRE_METADATA_FIELDS = frozenset({
+    'application_id',   # An application id, currently set to the db_id.
+    'db_id',            # the calibre primary key of the item.
+    'formats',          # list of formats (extensions) for this book
+    # a dict of user category names, where the value is a list of item names
+    # from the book that are in that category
+    'user_categories',
+    # a dict of author to an associated hyperlink
+    'author_link_map',
+})
+
+# All fields except custom fields
+STANDARD_METADATA_FIELDS = (
+    SOCIAL_METADATA_FIELDS |
+    PUBLICATION_METADATA_FIELDS |
+    BOOK_STRUCTURE_FIELDS |
+    DEVICE_METADATA_FIELDS |
+    CALIBRE_METADATA_FIELDS
+)
+
+# Every known field: the standard ones plus the custom (user) metadata holder
+ALL_METADATA_FIELDS = STANDARD_METADATA_FIELDS | USER_METADATA_FIELDS
+
+# Metadata fields that smart update must do special processing to copy.
+SC_FIELDS_NOT_COPIED = frozenset({
+    'title', 'title_sort', 'authors', 'author_sort', 'author_sort_map',
+    'cover_data', 'tags', 'languages', 'identifiers',
+})
+
+# Metadata fields that smart update should copy only if the source is not None
+SC_FIELDS_COPY_NOT_NULL = frozenset({
+    'device_collections', 'lpath', 'size', 'comments', 'thumbnail',
+})
+
+# Metadata fields that smart update should copy without special handling:
+# every standard field that is in neither of the two sets above
+SC_COPYABLE_FIELDS = STANDARD_METADATA_FIELDS - (
+    SC_FIELDS_NOT_COPIED | SC_FIELDS_COPY_NOT_NULL)
+
+SERIALIZABLE_FIELDS = (
+    SOCIAL_METADATA_FIELDS |
+    USER_METADATA_FIELDS |
+    PUBLICATION_METADATA_FIELDS |
+    CALIBRE_METADATA_FIELDS |
+    DEVICE_METADATA_FIELDS
+) - frozenset({
+    # these are rebuilt when needed
+    'device_collections', 'formats', 'cover_data',
+})
--- a/ebook_converter/ebooks/metadata/book/base.py
+++ b/ebook_converter/ebooks/metadata/book/base.py
@@ -0,0 +1,841 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import copy, traceback
+
+from calibre import prints
+from calibre.constants import DEBUG, ispy3
+from calibre.ebooks.metadata.book import (SC_COPYABLE_FIELDS,
+        SC_FIELDS_COPY_NOT_NULL, STANDARD_METADATA_FIELDS,
+        TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS)
+from calibre.library.field_metadata import FieldMetadata
+from calibre.utils.icu import sort_key
+from polyglot.builtins import iteritems, unicode_type, filter, map
+
+# Fast-path membership sets consulted first by Metadata.__getattribute__ and
+# Metadata.__setattr__. SIMPLE_GET holds every standard field that is not a
+# top-level identifier; SIMPLE_SET additionally leaves out 'identifiers',
+# which is assigned through set_identifiers().
+SIMPLE_GET = frozenset(
+    STANDARD_METADATA_FIELDS.difference(TOP_LEVEL_IDENTIFIERS))
+SIMPLE_SET = frozenset(SIMPLE_GET.difference(('identifiers',)))
+
+
+def human_readable(size, precision=2):
+    """ Format a size in bytes as megabytes with ``precision`` decimals, e.g. '1.50MB' """
+    template = '%.' + unicode_type(precision) + 'f' + 'MB'
+    megabytes = size / (1024 * 1024)
+    return template % (megabytes,)
+
+
+# The value that counts as "not set" for each field listed here. Fields that
+# are absent from this mapping use None as their null value.
+NULL_VALUES = {
+    'user_metadata': {},
+    'cover_data': (None, None),
+    'tags': [],
+    'identifiers': {},
+    'languages': [],
+    'device_collections': [],
+    'author_sort_map': {},
+    'authors': [_('Unknown')],
+    'author_sort': _('Unknown'),
+    'title': _('Unknown'),
+    'user_categories': {},
+    'author_link_map': {},
+    # Pseudo field: Metadata.__init__ removes this key from its private copy
+    'language': 'und',
+}
+
+# Module-wide description of the standard fields. The Metadata methods in
+# this module look this name up at call time, so they see a reset.
+field_metadata = FieldMetadata()
+
+
+def reset_field_metadata():
+    ''' Replace the module-level ``field_metadata`` with a fresh FieldMetadata instance '''
+    global field_metadata
+    field_metadata = FieldMetadata()
+
+
+def ck(typ):
+    # Clean an identifier type: lower-case it with icu_lower, trim
+    # surrounding whitespace and drop every ':' and ','
+    return icu_lower(typ).strip().replace(':', '').replace(',', '')
+
+
+def cv(val):
+    # Clean an identifier value: trim surrounding whitespace and replace
+    # every ',' with '|'
+    return val.strip().replace(',', '|')
+
+
+class Metadata(object):
+
+    '''
+    A class representing all the metadata for a book. The various standard metadata
+    fields are available as attributes of this object. You can also stick
+    arbitrary attributes onto this object.
+
+    Metadata from custom columns should be accessed via the get() method,
+    passing in the lookup name for the column, for example: "#mytags".
+
+    Use the :meth:`is_null` method to test if a field is null.
+
+    This object also has functions to format fields into strings.
+
+    The list of standard metadata fields grows with time and is in
+    :data:`STANDARD_METADATA_FIELDS`.
+
+    Please keep the method based API of this class to a minimum. Every method
+    becomes a reserved field name.
+    '''
+    __calibre_serializable__ = True
+
+    def __init__(self, title, authors=(_('Unknown'),), other=None, template_cache=None,
+                 formatter=None):
+        '''
+        Create a metadata object from scratch or as a merge of ``other``.
+
+        @param title: title or ``_('Unknown')``
+        @param authors: List of strings or []
+        @param other: None or a metadata object. When given it is merged in
+                      with :meth:`smart_update` and title/authors are ignored
+        @param template_cache: stored as the ``template_cache`` attribute
+        @param formatter: formatter to use, a SafeFormat is created if None
+        '''
+        # Start from a private copy of the null values. 'language' is not
+        # stored: reads of it are answered from 'languages'.
+        initial = copy.deepcopy(NULL_VALUES)
+        del initial['language']
+        # Our own __setattr__ needs _data to exist, so bypass it here
+        object.__setattr__(self, '_data', initial)
+        if other is None:
+            if title:
+                self.title = title
+            if authors:
+                # Two independent lists, 'author' is kept only for
+                # backward compatibility
+                self.author = list(authors)
+                self.authors = list(authors)
+        else:
+            self.smart_update(other)
+        from calibre.ebooks.metadata.book.formatter import SafeFormat
+        if formatter is None:
+            formatter = SafeFormat()
+        self.formatter = formatter
+        self.template_cache = template_cache
+
+    def is_null(self, field):
+        '''
+        Return True if the value of field is null in this object.
+        'null' means it is unknown or evaluates to False. So a title of
+        _('Unknown') is null or a language of 'und' is null.
+
+        Be careful with numeric fields since this will return True for zero as
+        well as None.
+
+        Also returns True if the field does not exist, or if looking the
+        value up or comparing it fails with an error.
+        '''
+        try:
+            null_val = NULL_VALUES.get(field, None)
+            val = getattr(self, field, None)
+            return not val or val == null_val
+        except Exception:
+            # Any failure to read or compare the value counts as null.
+            # KeyboardInterrupt and SystemExit are deliberately not caught.
+            return True
+
+    def set_null(self, field):
+        ''' Reset ``field`` to a shallow copy of its NULL_VALUES entry (None if it has none) '''
+        setattr(self, field, copy.copy(NULL_VALUES.get(field)))
+
+    def __getattribute__(self, field):
+        '''
+        Resolve ``field`` in this order: simple standard fields, top-level
+        identifiers (looked up inside ``identifiers``), the ``language``
+        pseudo field, ordinary Python attributes, custom (user metadata)
+        fields and finally ``#name_index`` lookups of a custom field's extra
+        value. Raises AttributeError when nothing matches.
+        '''
+        _data = object.__getattribute__(self, '_data')
+        # Fast path: standard fields come straight out of the backing dict
+        if field in SIMPLE_GET:
+            return _data.get(field, None)
+        if field in TOP_LEVEL_IDENTIFIERS:
+            return _data.get('identifiers').get(field, None)
+        if field == 'language':
+            # The first entry of 'languages', or the null language when that
+            # lookup fails (for example because the list is empty)
+            try:
+                return _data.get('languages', [])[0]
+            except:
+                return NULL_VALUES['language']
+        # Methods and attributes stored in the instance __dict__
+        try:
+            return object.__getattribute__(self, field)
+        except AttributeError:
+            pass
+        if field in _data['user_metadata']:
+            d = _data['user_metadata'][field]
+            val = d['#value#']
+            if d['datatype'] != 'composite':
+                return val
+            if val is None:
+                # Composite values are computed on first access and cached in
+                # '#value#'. The placeholder is stored before formatting, so a
+                # read of this same field while the template is evaluated
+                # returns the placeholder instead of formatting again.
+                d['#value#'] = 'RECURSIVE_COMPOSITE FIELD (Metadata) ' + field
+                val = d['#value#'] = self.formatter.safe_format(
+                                            d['display']['composite_template'],
+                                            self,
+                                            _('TEMPLATE ERROR'),
+                                            self, column_name=field,
+                                            template_cache=self.template_cache).strip()
+            return val
+        if field.startswith('#') and field.endswith('_index'):
+            # '#name_index' maps to the extra value of custom field '#name'
+            try:
+                return self.get_extra(field[:-6])
+            except:
+                pass
+        raise AttributeError(
+                'Metadata object has no attribute named: '+ repr(field))
+
+    def __setattr__(self, field, val, extra=None):
+        '''
+        Store ``val`` under ``field``. Standard fields and identifiers go into
+        the backing ``_data`` dict, custom fields into their ``#value#`` (with
+        ``extra`` saved as ``#extra#``), anything else into the instance
+        ``__dict__``. ``extra`` can only be supplied by calling this method
+        directly, as :meth:`set` does.
+        '''
+        _data = object.__getattribute__(self, '_data')
+        if field in SIMPLE_SET:
+            # None resets the field to a shallow copy of its null value
+            if val is None:
+                val = copy.copy(NULL_VALUES.get(field, None))
+            _data[field] = val
+        elif field in TOP_LEVEL_IDENTIFIERS:
+            # Stored inside 'identifiers'; a false value deletes the entry
+            field, val = self._clean_identifier(field, val)
+            identifiers = _data['identifiers']
+            identifiers.pop(field, None)
+            if val:
+                identifiers[field] = val
+        elif field == 'identifiers':
+            if not val:
+                val = copy.copy(NULL_VALUES.get('identifiers', None))
+            self.set_identifiers(val)
+        elif field == 'language':
+            # Pseudo field: replaces 'languages' with a list of zero or one
+            # entries, 'und' (in any case) means no language
+            langs = []
+            if val and val.lower() != 'und':
+                langs = [val]
+            _data['languages'] = langs
+        elif field in _data['user_metadata']:
+            _data['user_metadata'][field]['#value#'] = val
+            _data['user_metadata'][field]['#extra#'] = extra
+        else:
+            # You are allowed to stick arbitrary attributes onto this object as
+            # long as they don't conflict with global or user metadata names
+            # Don't abuse this privilege
+            self.__dict__[field] = val
+
+    def __iter__(self):
+        # Iterating a Metadata object yields the keys of its backing dict
+        backing = object.__getattribute__(self, '_data')
+        return iter(backing)
+
+    def has_key(self, key):
+        ''' Return True if ``key`` is a key of the backing ``_data`` dict '''
+        backing = object.__getattribute__(self, '_data')
+        return key in backing
+
+    def deepcopy(self, class_generator=lambda : Metadata(None)):
+        ''' Do not use this method unless you know what you are doing, if you
+        want to create a simple clone of this object, use :meth:`deepcopy_metadata`
+        instead. Class_generator must be a function that returns an instance
+        of Metadata or a subclass of it; None is returned when it produces
+        anything else.'''
+        clone = class_generator()
+        if isinstance(clone, Metadata):
+            # Replace the whole instance dict of the clone with a deep copy
+            # of ours
+            object.__setattr__(clone, '__dict__', copy.deepcopy(self.__dict__))
+            return clone
+        return None
+
+    def deepcopy_metadata(self):
+        '''
+        Return a new Metadata object whose ``_data`` is a deep copy of this
+        object's ``_data``. Attributes kept outside ``_data`` are not copied.
+        '''
+        clone = Metadata(None)
+        source = object.__getattribute__(self, '_data')
+        object.__setattr__(clone, '_data', copy.deepcopy(source))
+        return clone
+
+    def get(self, field, default=None):
+        ''' Return the value of ``field``, or ``default`` if looking it up raises AttributeError '''
+        try:
+            value = self.__getattribute__(field)
+        except AttributeError:
+            value = default
+        return value
+
+    def get_extra(self, field, default=None):
+        '''
+        Return the ``#extra#`` value stored for the custom field ``field``.
+
+        Returns ``default`` when the field exists but its extra value cannot
+        be read (for example because it was never set). Raises AttributeError
+        when ``field`` is not a custom field of this object.
+        '''
+        user_metadata = object.__getattribute__(self, '_data')['user_metadata']
+        if field not in user_metadata:
+            raise AttributeError(
+                    'Metadata object has no attribute named: '+ repr(field))
+        try:
+            return user_metadata[field]['#extra#']
+        except Exception:
+            # Missing or unreadable extra value. KeyboardInterrupt and
+            # SystemExit are deliberately not swallowed here.
+            return default
+
+    def set(self, field, val, extra=None):
+        '''
+        Assign ``val`` to ``field`` through :meth:`__setattr__`, passing
+        ``extra`` along. ``__setattr__`` stores ``extra`` only for custom
+        fields, as their ``#extra#`` entry.
+        '''
+        self.__setattr__(field, val, extra)
+
+    def get_identifiers(self):
+        '''
+        Return a deep copy of the identifiers dictionary (an empty dict if
+        there are none).
+        The dict is small, and handing out a reference where a copy is needed
+        is costly: changes made to the returned dict must never show up in
+        the book.
+        '''
+        stored = object.__getattribute__(self, '_data')['identifiers']
+        return copy.deepcopy(stored or {})
+
+    def _clean_identifier(self, typ, val):
+        # Normalise a (type, value) pair with ck()/cv(). False values are
+        # passed through untouched.
+        clean_typ = ck(typ) if typ else typ
+        clean_val = cv(val) if val else val
+        return clean_typ, clean_val
+
+    def set_identifiers(self, identifiers):
+        '''
+        Replace all identifiers with a cleaned copy of ``identifiers``. Pairs
+        with an empty type or value are dropped. Note that if you previously
+        set ISBN, calling this method will delete it.
+        '''
+        cleaned = {}
+        for typ, val in iteritems(identifiers):
+            if typ and val:
+                cleaned[ck(typ)] = cv(val)
+        backing = object.__getattribute__(self, '_data')
+        backing['identifiers'] = cleaned
+
+    def set_identifier(self, typ, val):
+        '''
+        Set the identifier of type typ to val. If val is empty, deletes
+        identifier of type typ. Does nothing when typ is empty.
+        '''
+        typ, val = self._clean_identifier(typ, val)
+        if typ:
+            stored = object.__getattribute__(self, '_data')['identifiers']
+            # Always remove the old entry first, then re-add it if needed
+            stored.pop(typ, None)
+            if val:
+                stored[typ] = val
+
+    def has_identifier(self, typ):
+        ''' Return True if an identifier is stored under exactly the key ``typ`` '''
+        backing = object.__getattribute__(self, '_data')
+        return typ in backing['identifiers']
+
+    # field-oriented interface. Intended to be the same as in LibraryDatabase
+
+    def standard_field_keys(self):
+        '''
+        return all possible standard keys, even if this book doesn't have
+        them. The result is the module's STANDARD_METADATA_FIELDS object
+        itself, not a list.
+        '''
+        return STANDARD_METADATA_FIELDS
+
+    def custom_field_keys(self):
+        '''
+        return an iterator over the keys of the custom fields in this book.
+        This is a one-shot iterator, not a list: call the method again for
+        every new pass or membership test.
+        '''
+        return iter(object.__getattribute__(self, '_data')['user_metadata'])
+
+    def all_field_keys(self):
+        '''
+        Return a frozenset of every field key known by this instance (all
+        metadata fields plus its custom fields), even if their value is None
+        '''
+        custom_keys = frozenset(
+            object.__getattribute__(self, '_data')['user_metadata'])
+        return frozenset(ALL_METADATA_FIELDS.union(custom_keys))
+
+    def metadata_for_field(self, key):
+        '''
+        return the (uncopied) metadata describing a field: the user metadata
+        when key is a custom field of this book, the standard field metadata
+        otherwise.
+        '''
+        if key in self.custom_field_keys():
+            return self.get_user_metadata(key, make_copy=False)
+        return self.get_standard_metadata(key, make_copy=False)
+
+    def all_non_none_fields(self):
+        '''
+        Return a dictionary containing all non-None metadata fields, including
+        the custom ones. For every non-None custom series field ``#name`` the
+        result also has a ``#name_index`` entry holding its extra value, which
+        is None when no extra value has been stored.
+        '''
+        result = {}
+        _data = object.__getattribute__(self, '_data')
+        for attr in STANDARD_METADATA_FIELDS:
+            v = _data.get(attr, None)
+            if v is not None:
+                result[attr] = v
+        # Top-level identifiers live inside 'identifiers', so they have to be
+        # resolved with self.get(), not _data.get()
+        for attr in TOP_LEVEL_IDENTIFIERS:
+            v = self.get(attr, None)
+            if v is not None:
+                result[attr] = v
+        user_metadata = _data['user_metadata']
+        for attr in user_metadata:
+            # self.get() also evaluates composite fields
+            v = self.get(attr, None)
+            if v is None:
+                continue
+            result[attr] = v
+            meta = user_metadata[attr]
+            if meta['datatype'] == 'series':
+                # '#extra#' only exists once a value has been assigned through
+                # set()/__setattr__; set_user_metadata() does not create it,
+                # so indexing it directly could raise KeyError
+                result[attr+'_index'] = meta.get('#extra#')
+        return result
+
+    # End of field-oriented interface
+
+    # Extended interfaces. These permit one to get copies of metadata dictionaries, and to
+    # get and set custom field metadata
+
+    def get_standard_metadata(self, field, make_copy):
+        '''
+        return the field metadata for a standard field, or None when field
+        is unknown or its kind is not 'field'. field is the key name, not the
+        label. With make_copy a deep copy is returned, so the caller can
+        change values in the dict safely.
+        '''
+        known = field in field_metadata and field_metadata[field]['kind'] == 'field'
+        if not known:
+            return None
+        if make_copy:
+            return copy.deepcopy(field_metadata[field])
+        return field_metadata[field]
+
+    def get_all_standard_metadata(self, make_copy):
+        '''
+        return the standard field metadata associated with the book.
+
+        With make_copy this is a new dict of deep copies, limited to entries
+        whose kind is 'field'. Without it the shared module-level
+        field_metadata object is returned as is, unfiltered.
+        '''
+        if make_copy:
+            return {
+                key: copy.deepcopy(field_metadata[key])
+                for key in field_metadata
+                if field_metadata[key]['kind'] == 'field'
+            }
+        return field_metadata
+
+    def get_all_user_metadata(self, make_copy):
+        '''
+        return a dict containing all the custom field metadata associated with
+        the book: the internal dict itself, or with make_copy a new dict of
+        deep copies.
+        '''
+        stored = object.__getattribute__(self, '_data')['user_metadata']
+        if make_copy:
+            return {key: copy.deepcopy(stored[key]) for key in stored}
+        return stored
+
+    def get_user_metadata(self, field, make_copy):
+        '''
+        return the metadata of the custom field ``field``, or None if this
+        object has no such field. field is the key name, not the label. With
+        make_copy a deep copy is returned, so the caller can change values in
+        the dict safely.
+        '''
+        stored = object.__getattribute__(self, '_data')['user_metadata']
+        if field not in stored:
+            return None
+        meta = stored[field]
+        return copy.deepcopy(meta) if make_copy else meta
+
+    def set_all_user_metadata(self, metadata):
+        '''
+        Replace all custom field metadata of this object with shallow copies
+        of the entries in ``metadata`` (a mapping of key name, not label, to
+        field description). Entries without a ``#value#`` get one: [] for
+        multiple-valued text fields, None otherwise. Passing None prints the
+        current stack and changes nothing.
+        '''
+        if metadata is None:
+            traceback.print_stack()
+            return
+
+        replacement = {}
+        for key, meta in iteritems(metadata):
+            entry = meta.copy()
+            if '#value#' not in entry:
+                multi_text = entry['datatype'] == 'text' and entry['is_multiple']
+                entry['#value#'] = [] if multi_text else None
+            replacement[key] = entry
+        object.__getattribute__(self, '_data')['user_metadata'] = replacement
+
+    def set_user_metadata(self, field, metadata):
+        '''
+        store custom field metadata for one column into the object. Field is
+        the key name not the label and must begin with '#', otherwise
+        AttributeError is raised. Nothing happens when field is None; a None
+        metadata prints the current stack and stores nothing.
+        '''
+        if field is None:
+            return
+        if not field.startswith('#'):
+            raise AttributeError(
+                    'Custom field name %s must begin with \'#\''%repr(field))
+        if metadata is None:
+            traceback.print_stack()
+            return
+        # A shallow copy is enough: the objects referenced by the dict are
+        # not expected to change, although they can be replaced.
+        entry = dict(metadata)
+        if '#value#' not in entry:
+            multi_text = entry['datatype'] == 'text' and entry['is_multiple']
+            entry['#value#'] = [] if multi_text else None
+        object.__getattribute__(self, '_data')['user_metadata'][field] = entry
+
+    def template_to_attribute(self, other, ops):
+        '''
+        Takes a list [(src,dest), (src,dest)], evaluates the template in the
+        context of other, then copies the result to self[dest]. This is on a
+        best-efforts basis. Some assignments can make no sense.
+
+        For 'tags' the result is split on ',' and for 'authors' on '&', empty
+        items are dropped. An op that fails is skipped; the traceback is
+        printed only when DEBUG is set.
+        '''
+        if not ops:
+            return
+        from calibre.ebooks.metadata.book.formatter import SafeFormat
+        formatter = SafeFormat()
+        # Destinations that hold lists and the separator used to split them
+        separators = {'tags': ',', 'authors': '&'}
+        for op in ops:
+            try:
+                src, dest = op[0], op[1]
+                val = formatter.safe_format(src, other, 'PLUGBOARD TEMPLATE ERROR', other)
+                sep = separators.get(dest)
+                if sep is not None:
+                    val = [f.strip() for f in val.split(sep) if f.strip()]
+                self.set(dest, val)
+            except Exception:
+                # Best effort: go on with the next op. KeyboardInterrupt and
+                # SystemExit are deliberately not swallowed.
+                if DEBUG:
+                    traceback.print_exc()
+
+    # Old Metadata API {{{
+    def print_all_attributes(self):
+        ''' Print every standard field with its value, then the metadata of every custom field '''
+        for name in STANDARD_METADATA_FIELDS:
+            prints('%s:'%name, getattr(self, name, 'None'))
+        for name in self.custom_field_keys():
+            meta = self.get_user_metadata(name, make_copy=False)
+            if meta is None:
+                continue
+            prints(name, meta)
+        prints('--------------')
+
+    def smart_update(self, other, replace_metadata=False):
+        '''
+        Merge the information in `other` into self. In case of conflicts, the information
+        in `other` takes precedence, unless the information in `other` is NULL.
+
+        With `replace_metadata` the copyable fields, tags, cover data, custom
+        field metadata and identifiers of `other` replace the ones in self.
+        Without it, tags and multiple-valued text custom fields are merged
+        case-insensitively, the larger cover and the longer comments win and
+        identifiers are added to the existing ones.
+        '''
+        def copy_not_none(dest, src, attr):
+            # Copy attr only when src has a value that is neither None nor
+            # the null value of that field
+            v = getattr(src, attr, None)
+            if v not in (None, NULL_VALUES.get(attr, None)):
+                setattr(dest, attr, copy.deepcopy(v))
+
+        unknown = _('Unknown')
+        if other.title and other.title != unknown:
+            self.title = other.title
+            if hasattr(other, 'title_sort'):
+                self.title_sort = other.title_sort
+
+        # Take the authors of other if they are known, or if our own authors
+        # are missing or are themselves just the unknown placeholder
+        if other.authors and (
+                other.authors[0] != unknown or (
+                    not self.authors or (
+                        len(self.authors) == 1 and self.authors[0] == unknown and
+                        getattr(self, 'author_sort', None) == unknown
+                    )
+                )
+        ):
+            self.authors = list(other.authors)
+            if hasattr(other, 'author_sort_map'):
+                self.author_sort_map = dict(other.author_sort_map)
+            if hasattr(other, 'author_sort'):
+                self.author_sort = other.author_sort
+
+        if replace_metadata:
+            # SPECIAL_FIELDS = frozenset(['lpath', 'size', 'comments', 'thumbnail'])
+            # A field missing on other becomes None, except series_index
+            # which becomes 1.0
+            for attr in SC_COPYABLE_FIELDS:
+                setattr(self, attr, getattr(other, attr, 1.0 if
+                        attr == 'series_index' else None))
+            # Note: the tags list is shared with other, not copied
+            self.tags = other.tags
+            self.cover_data = getattr(other, 'cover_data',
+                                      NULL_VALUES['cover_data'])
+            self.set_all_user_metadata(other.get_all_user_metadata(make_copy=True))
+            for x in SC_FIELDS_COPY_NOT_NULL:
+                copy_not_none(self, other, x)
+            if callable(getattr(other, 'get_identifiers', None)):
+                self.set_identifiers(other.get_identifiers())
+            # language is handled below
+        else:
+            for attr in SC_COPYABLE_FIELDS:
+                copy_not_none(self, other, attr)
+            for x in SC_FIELDS_COPY_NOT_NULL:
+                copy_not_none(self, other, x)
+
+            if other.tags:
+                # Case-insensitive but case preserving merging
+                lotags = [t.lower() for t in other.tags]
+                lstags = [t.lower() for t in self.tags]
+                ot, st = map(frozenset, (lotags, lstags))
+                # Tags present in both take the spelling used by other
+                for t in st.intersection(ot):
+                    sidx = lstags.index(t)
+                    oidx = lotags.index(t)
+                    self.tags[sidx] = other.tags[oidx]
+                # Tags only present in other are appended
+                self.tags += [t for t in other.tags if t.lower() in ot-st]
+
+            if getattr(other, 'cover_data', False):
+                # Keep whichever cover has more data; the data is the last
+                # item of cover_data
+                other_cover = other.cover_data[-1]
+                self_cover = self.cover_data[-1] if self.cover_data else b''
+                if not self_cover:
+                    self_cover = b''
+                if not other_cover:
+                    other_cover = b''
+                if len(other_cover) > len(self_cover):
+                    self.cover_data = other.cover_data
+
+            if callable(getattr(other, 'custom_field_keys', None)):
+                for x in other.custom_field_keys():
+                    meta = other.get_user_metadata(x, make_copy=True)
+                    if meta is not None:
+                        # Remember our own value before the field metadata
+                        # (including its value) is replaced by that of other
+                        self_tags = self.get(x, [])
+                        self.set_user_metadata(x, meta)  # get... did the deepcopy
+                        other_tags = other.get(x, [])
+                        if meta['datatype'] == 'text' and meta['is_multiple']:
+                            # Case-insensitive but case preserving merging
+                            lotags = [t.lower() for t in other_tags]
+                            try:
+                                lstags = [t.lower() for t in self_tags]
+                            except TypeError:
+                                # Happens if x is not a text, is_multiple field
+                                # on self
+                                lstags = []
+                                self_tags = []
+                            ot, st = map(frozenset, (lotags, lstags))
+                            for t in st.intersection(ot):
+                                sidx = lstags.index(t)
+                                oidx = lotags.index(t)
+                                self_tags[sidx] = other_tags[oidx]
+                            self_tags += [t for t in other_tags if t.lower() in ot-st]
+                            setattr(self, x, self_tags)
+
+            # The comments with the longer stripped text win
+            my_comments = getattr(self, 'comments', '')
+            other_comments = getattr(other, 'comments', '')
+            if not my_comments:
+                my_comments = ''
+            if not other_comments:
+                other_comments = ''
+            if len(other_comments.strip()) > len(my_comments.strip()):
+                self.comments = other_comments
+
+            # Copy all the non-none identifiers
+            if callable(getattr(other, 'get_identifiers', None)):
+                d = self.get_identifiers()
+                s = other.get_identifiers()
+                d.update([v for v in iteritems(s) if v[1] is not None])
+                self.set_identifiers(d)
+            else:
+                # other structure not Metadata. Copy the top-level identifiers
+                for attr in TOP_LEVEL_IDENTIFIERS:
+                    copy_not_none(self, other, attr)
+
+        # Done in both modes: take the languages of other unless they are
+        # empty or just 'und', and clear series_index when there is no series
+        other_lang = getattr(other, 'languages', [])
+        if other_lang and other_lang != ['und']:
+            self.languages = list(other_lang)
+        if not getattr(self, 'series', None):
+            self.series_index = None
+
+    def format_series_index(self, val=None):
+        '''
+        Format a series index with fmt_sidx. Uses ``val`` or, when it is
+        None, this object's series_index. A value that cannot be converted to
+        float is formatted as 1.
+        '''
+        from calibre.ebooks.metadata import fmt_sidx
+        raw = val if val is not None else self.series_index
+        try:
+            number = float(raw)
+        except Exception:
+            number = 1
+        return fmt_sidx(number)
+
+    def authors_from_string(self, raw):
+        ''' Set ``authors`` to the result of string_to_authors(raw) '''
+        from calibre.ebooks.metadata import string_to_authors
+        parsed = string_to_authors(raw)
+        self.authors = parsed
+
+    def format_authors(self):
+        ''' Return ``authors`` formatted by authors_to_string '''
+        from calibre.ebooks.metadata import authors_to_string
+        names = self.authors
+        return authors_to_string(names)
+
+    def format_tags(self):
+        ''' Return the tags, ordered by sort_key, joined with ', ' '''
+        ordered = sorted(self.tags, key=sort_key)
+        return ', '.join(unicode_type(tag) for tag in ordered)
+
+    def format_rating(self, v=None, divide_by=1):
+        '''
+        Return ``v`` (or this object's rating when v is None) divided by
+        ``divide_by`` as a string, or the string 'None' if there is no rating.
+        '''
+        if v is None:
+            v = self.rating
+            if v is None:
+                return 'None'
+        return unicode_type(v/divide_by)
+
+    def format_field(self, key, series_with_index=True):
+        '''
+        Returns the tuple (display_name, formatted_value), which are the first
+        two items of what :meth:`format_field_extended` returns
+        '''
+        extended = self.format_field_extended(key, series_with_index)
+        name, val, _orig_val, _field_meta = extended
+        return (name, val)
+
+    def format_field_extended(self, key, series_with_index=True):
+        from calibre.ebooks.metadata import authors_to_string
+        '''
+        returns the tuple (display_name, formatted_value, original_value,
+        field_metadata)
+        '''
+        from calibre.utils.date import format_date
+
+        # Handle custom series index
+        if key.startswith('#') and key.endswith('_index'):
+            tkey = key[:-6]  # strip the _index
+            cmeta = self.get_user_metadata(tkey, make_copy=False)
+            if cmeta and cmeta['datatype'] == 'series':
+                if self.get(tkey):
+                    res = self.get_extra(tkey)
+                    return (unicode_type(cmeta['name']+'_index'),
+                            self.format_series_index(res), res, cmeta)
+                else:
+                    return (unicode_type(cmeta['name']+'_index'), '', '', cmeta)
+
+        if key in self.custom_field_keys():
+            res = self.get(key, None)       # get evaluates all necessary composites
+            cmeta = self.get_user_metadata(key, make_copy=False)
+            name = unicode_type(cmeta['name'])
+            if res is None or res == '':    # can't check "not res" because of numeric fields
+                return (name, res, None, None)
+            orig_res = res
+            datatype = cmeta['datatype']
+            if datatype == 'text' and cmeta['is_multiple']:
+                res = cmeta['is_multiple']['list_to_ui'].join(res)
+            elif datatype == 'series' and series_with_index:
+                if self.get_extra(key) is not None:
+                    res = res + \
+                        ' [%s]'%self.format_series_index(val=self.get_extra(key))
+            elif datatype == 'datetime':
+                res = format_date(res, cmeta['display'].get('date_format','dd MMM yyyy'))
+            elif datatype == 'bool':
+                res = _('Yes') if res else _('No')
+            elif datatype == 'rating':
+                res = '%.2g'%(res/2)
+            elif datatype in ['int', 'float']:
+                try:
+                    fmt = cmeta['display'].get('number_format', None)
+                    res = fmt.format(res)
+                except:
+                    pass
+            return (name, unicode_type(res), orig_res, cmeta)
+
+        # convert top-level ids into their value
+        if key in TOP_LEVEL_IDENTIFIERS:
+            fmeta = field_metadata['identifiers']
+            name = key
+            res = self.get(key, None)
+            return (name, res, res, fmeta)
+
+        # Translate aliases into the standard field name
+        fmkey = field_metadata.search_term_to_field_key(key)
+        if fmkey in field_metadata and field_metadata[fmkey]['kind'] == 'field':
+            res = self.get(key, None)
+            fmeta = field_metadata[fmkey]
+            name = unicode_type(fmeta['name'])
+            if res is None or res == '':
+                return (name, res, None, None)
+            orig_res = res
+            name = unicode_type(fmeta['name'])
+            datatype = fmeta['datatype']
+            if key == 'authors':
+                res = authors_to_string(res)
+            elif key == 'series_index':
+                res = self.format_series_index(res)
+            elif datatype == 'text' and fmeta['is_multiple']:
+                if isinstance(res, dict):
+                    res = [k + ':' + v for k,v in res.items()]
+                res = fmeta['is_multiple']['list_to_ui'].join(sorted(filter(None, res), key=sort_key))
+            elif datatype == 'series' and series_with_index:
+                res = res + ' [%s]'%self.format_series_index()
+            elif datatype == 'datetime':
+                res = format_date(res, fmeta['display'].get('date_format','dd MMM yyyy'))
+            elif datatype == 'rating':
+                res = '%.2g'%(res/2)
+            elif key == 'size':
+                res = human_readable(res)
+            return (name, unicode_type(res), orig_res, fmeta)
+
+        return (None, None, None, None)
+
+    def __unicode__representation__(self):
+        '''
+        A string representation of this object, suitable for printing to
+        console
+        '''
+        from calibre.utils.date import isoformat
+        from calibre.ebooks.metadata import authors_to_string
+        ans = []
+
+        def fmt(x, y):
+            ans.append('%-20s: %s'%(unicode_type(x), unicode_type(y)))
+
+        fmt('Title', self.title)
+        if self.title_sort:
+            fmt('Title sort', self.title_sort)
+        if self.authors:
+            fmt('Author(s)',  authors_to_string(self.authors) +
+               ((' [' + self.author_sort + ']')
+                if self.author_sort and self.author_sort != _('Unknown') else ''))
+        if self.publisher:
+            fmt('Publisher', self.publisher)
+        if getattr(self, 'book_producer', False):
+            fmt('Book Producer', self.book_producer)
+        if self.tags:
+            fmt('Tags', ', '.join([unicode_type(t) for t in self.tags]))
+        if self.series:
+            fmt('Series', self.series + ' #%s'%self.format_series_index())
+        if not self.is_null('languages'):
+            fmt('Languages', ', '.join(self.languages))
+        if self.rating is not None:
+            fmt('Rating', ('%.2g'%(float(self.rating)/2)) if self.rating
+                    else '')
+        if self.timestamp is not None:
+            fmt('Timestamp', isoformat(self.timestamp))
+        if self.pubdate is not None:
+            fmt('Published', isoformat(self.pubdate))
+        if self.rights is not None:
+            fmt('Rights', unicode_type(self.rights))
+        if self.identifiers:
+            fmt('Identifiers', ', '.join(['%s:%s'%(k, v) for k, v in
+                iteritems(self.identifiers)]))
+        if self.comments:
+            fmt('Comments', self.comments)
+
+        for key in self.custom_field_keys():
+            val = self.get(key, None)
+            if val:
+                (name, val) = self.format_field(key)
+                fmt(name, unicode_type(val))
+        return '\n'.join(ans)
+
+    def to_html(self):
+        '''
+        A HTML representation of this object.
+        '''
+        from calibre.ebooks.metadata import authors_to_string
+        from calibre.utils.date import isoformat
+        ans = [(_('Title'), unicode_type(self.title))]
+        ans += [(_('Author(s)'), (authors_to_string(self.authors) if self.authors else _('Unknown')))]
+        ans += [(_('Publisher'), unicode_type(self.publisher))]
+        ans += [(_('Producer'), unicode_type(self.book_producer))]
+        ans += [(_('Comments'), unicode_type(self.comments))]
+        ans += [('ISBN', unicode_type(self.isbn))]
+        ans += [(_('Tags'), ', '.join([unicode_type(t) for t in self.tags]))]
+        if self.series:
+            ans += [(_('Series'), unicode_type(self.series) + ' #%s'%self.format_series_index())]
+        ans += [(_('Languages'), ', '.join(self.languages))]
+        if self.timestamp is not None:
+            ans += [(_('Timestamp'), unicode_type(isoformat(self.timestamp, as_utc=False, sep=' ')))]
+        if self.pubdate is not None:
+            ans += [(_('Published'), unicode_type(isoformat(self.pubdate, as_utc=False, sep=' ')))]
+        if self.rights is not None:
+            ans += [(_('Rights'), unicode_type(self.rights))]
+        for key in self.custom_field_keys():
+            val = self.get(key, None)
+            if val:
+                (name, val) = self.format_field(key)
+                ans += [(name, val)]
+        for i, x in enumerate(ans):
+            ans[i] = '<tr><td><b>%s</b></td><td>%s</td></tr>'%x
+        return '<table>%s</table>'%'\n'.join(ans)
+
+    # Expose __unicode__representation__ through the string protocol of the
+    # running interpreter. When ispy3 is true it is bound directly as __str__.
+    # Otherwise it is bound as __unicode__, and __str__ returns that text
+    # encoded as UTF-8 bytes.
+    if ispy3:
+        __str__ = __unicode__representation__
+    else:
+        __unicode__ = __unicode__representation__
+
+        def __str__(self):
+            return self.__unicode__().encode('utf-8')
+
+    def __nonzero__(self):
+        return bool(self.title or self.author or self.comments or self.tags)
+    __bool__ = __nonzero__
+
+    # }}}
+
+
+def field_from_string(field, raw, field_metadata):
+    ''' Parse the string raw to return an object that is suitable for calling
+    set() on a Metadata object. '''
+    dt = field_metadata['datatype']
+    val = object
+    if dt in {'int', 'float'}:
+        val = int(raw) if dt == 'int' else float(raw)
+    elif dt == 'rating':
+        val = float(raw) * 2
+    elif dt == 'datetime':
+        from calibre.utils.date import parse_only_date
+        val = parse_only_date(raw)
+    elif dt == 'bool':
+        if raw.lower() in {'true', 'yes', 'y'}:
+            val = True
+        elif raw.lower() in {'false', 'no', 'n'}:
+            val = False
+        else:
+            raise ValueError('Unknown value for %s: %s'%(field, raw))
+    elif dt == 'text':
+        ism = field_metadata['is_multiple']
+        if ism:
+            val = [x.strip() for x in raw.split(ism['ui_to_list'])]
+            if field == 'identifiers':
+                val = {x.partition(':')[0]:x.partition(':')[-1] for x in val}
+            elif field == 'languages':
+                from calibre.utils.localization import canonicalize_lang
+                val = [canonicalize_lang(x) for x in val]
+                val = [x for x in val if x]
+    if val is object:
+        val = raw
+    return val
--- a/ebook_converter/ebooks/metadata/book/formatter.py
+++ b/ebook_converter/ebooks/metadata/book/formatter.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.ebooks.metadata.book import TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS
+
+from calibre.utils.formatter import TemplateFormatter
+
+
+class SafeFormat(TemplateFormatter):
+
+    def __init__(self):
+        TemplateFormatter.__init__(self)
+
+    def get_value(self, orig_key, args, kwargs):
+        if not orig_key:
+            return ''
+        key = orig_key = orig_key.lower()
+        if (key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and
+                key not in ALL_METADATA_FIELDS):
+            from calibre.ebooks.metadata.book.base import field_metadata
+            key = field_metadata.search_term_to_field_key(key)
+            if key is None or (self.book and
+                                key not in self.book.all_field_keys()):
+                if hasattr(self.book, orig_key):
+                    key = orig_key
+                else:
+                    raise ValueError(_('Value: unknown field ') + orig_key)
+        try:
+            b = self.book.get_user_metadata(key, False)
+        except:
+            b = None
+        if b and b['datatype'] in {'int', 'float'} and self.book.get(key, None) is None:
+            v = ''
+        else:
+            v = self.book.format_field(key, series_with_index=False)[1]
+        if v is None:
+            return ''
+        if v == '':
+            return ''
+        return v
+
+
--- a/ebook_converter/ebooks/metadata/book/json_codec.py
+++ b/ebook_converter/ebooks/metadata/book/json_codec.py
@@ -0,0 +1,218 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+'''
+Created on 4 Jun 2010
+
+@author: charles
+'''
+
+import json, traceback
+from datetime import datetime, time
+
+from calibre.ebooks.metadata.book import SERIALIZABLE_FIELDS
+from calibre.constants import filesystem_encoding, preferred_encoding
+from calibre.library.field_metadata import FieldMetadata
+from calibre import isbytestring
+from polyglot.builtins import iteritems, itervalues, as_bytes
+from polyglot.binary import as_base64_unicode, from_base64_bytes
+
+# Translate datetimes to and from strings. The string form is the datetime in
+# UTC. The returned date is also UTC
+
+
+def string_to_datetime(src):
+    from calibre.utils.iso8601 import parse_iso8601
+    if src != "None":
+        try:
+            return parse_iso8601(src)
+        except Exception:
+            pass
+    return None
+
+
+def datetime_to_string(dateval):
+    from calibre.utils.date import isoformat, UNDEFINED_DATE, local_tz
+    if dateval is None:
+        return "None"
+    if not isinstance(dateval, datetime):
+        dateval = datetime.combine(dateval, time())
+    if hasattr(dateval, 'tzinfo') and dateval.tzinfo is None:
+        dateval = dateval.replace(tzinfo=local_tz)
+    if dateval <= UNDEFINED_DATE:
+        return "None"
+    return isoformat(dateval)
+
+
+def encode_thumbnail(thumbnail):
+    '''
+    Encode the image part of a thumbnail, then return the 3 part tuple
+    '''
+    from calibre.utils.imghdr import identify
+    if thumbnail is None:
+        return None
+    if not isinstance(thumbnail, (tuple, list)):
+        try:
+            width, height = identify(as_bytes(thumbnail))[1:]
+            if width < 0 or height < 0:
+                return None
+            thumbnail = (width, height, thumbnail)
+        except Exception:
+            return None
+    return (thumbnail[0], thumbnail[1], as_base64_unicode(thumbnail[2]))
+
+
+def decode_thumbnail(tup):
+    '''
+    Decode an encoded thumbnail into its 3 component parts
+    '''
+    if tup is None:
+        return None
+    return (tup[0], tup[1], from_base64_bytes(tup[2]))
+
+
+def object_to_unicode(obj, enc=preferred_encoding):
+
+    def dec(x):
+        return x.decode(enc, 'replace')
+
+    if isbytestring(obj):
+        return dec(obj)
+    if isinstance(obj, (list, tuple)):
+        return [dec(x) if isbytestring(x) else object_to_unicode(x) for x in obj]
+    if isinstance(obj, dict):
+        ans = {}
+        for k, v in obj.items():
+            k = object_to_unicode(k)
+            v = object_to_unicode(v)
+            ans[k] = v
+        return ans
+    return obj
+
+
+def encode_is_multiple(fm):
+    if fm.get('is_multiple', None):
+        # migrate is_multiple back to a character
+        fm['is_multiple2'] = fm.get('is_multiple', {})
+        dt = fm.get('datatype', None)
+        if dt == 'composite':
+            fm['is_multiple'] = ','
+        else:
+            fm['is_multiple'] =  '|'
+    else:
+        fm['is_multiple'] = None
+        fm['is_multiple2'] = {}
+
+
+def decode_is_multiple(fm):
+    im = fm.get('is_multiple2',  None)
+    if im:
+        fm['is_multiple'] = im
+        del fm['is_multiple2']
+    else:
+        # Must migrate the is_multiple from char to dict
+        im = fm.get('is_multiple',  {})
+        if im:
+            dt = fm.get('datatype', None)
+            if dt == 'composite':
+                im = {'cache_to_list': ',', 'ui_to_list': ',',
+                      'list_to_ui': ', '}
+            elif fm.get('display', {}).get('is_names', False):
+                im = {'cache_to_list': '|', 'ui_to_list': '&',
+                      'list_to_ui': ', '}
+            else:
+                im = {'cache_to_list': '|', 'ui_to_list': ',',
+                      'list_to_ui': ', '}
+        elif im is None:
+            im = {}
+        fm['is_multiple'] = im
+
+
+class JsonCodec(object):
+
+    def __init__(self, field_metadata=None):
+        self.field_metadata = field_metadata or FieldMetadata()
+
+    def encode_to_file(self, file_, booklist):
+        data = json.dumps(self.encode_booklist_metadata(booklist), indent=2)
+        if not isinstance(data, bytes):
+            data = data.encode('utf-8')
+        file_.write(data)
+
+    def encode_booklist_metadata(self, booklist):
+        result = []
+        for book in booklist:
+            result.append(self.encode_book_metadata(book))
+        return result
+
+    def encode_book_metadata(self, book):
+        result = {}
+        for key in SERIALIZABLE_FIELDS:
+            result[key] = self.encode_metadata_attr(book, key)
+        return result
+
+    def encode_metadata_attr(self, book, key):
+        if key == 'user_metadata':
+            meta = book.get_all_user_metadata(make_copy=True)
+            for fm in itervalues(meta):
+                if fm['datatype'] == 'datetime':
+                    fm['#value#'] = datetime_to_string(fm['#value#'])
+                encode_is_multiple(fm)
+            return meta
+        if key in self.field_metadata:
+            datatype = self.field_metadata[key]['datatype']
+        else:
+            datatype = None
+        value = book.get(key)
+        if key == 'thumbnail':
+            return encode_thumbnail(value)
+        elif isbytestring(value):  # str includes bytes
+            enc = filesystem_encoding if key == 'lpath' else preferred_encoding
+            return object_to_unicode(value, enc=enc)
+        elif datatype == 'datetime':
+            return datetime_to_string(value)
+        else:
+            return object_to_unicode(value)
+
+    def decode_from_file(self, file_, booklist, book_class, prefix):
+        js = []
+        try:
+            js = json.load(file_, encoding='utf-8')
+            for item in js:
+                entry = self.raw_to_book(item, book_class, prefix)
+                if entry is not None:
+                    booklist.append(entry)
+        except:
+            print('exception during JSON decode_from_file')
+            traceback.print_exc()
+
+    def raw_to_book(self, json_book, book_class, prefix):
+        try:
+            book = book_class(prefix, json_book.get('lpath', None))
+            for key,val in iteritems(json_book):
+                meta = self.decode_metadata(key, val)
+                if key == 'user_metadata':
+                    book.set_all_user_metadata(meta)
+                else:
+                    if key == 'classifiers':
+                        key = 'identifiers'
+                    setattr(book, key, meta)
+            return book
+        except:
+            print('exception during JSON decoding')
+            traceback.print_exc()
+
+    def decode_metadata(self, key, value):
+        if key == 'classifiers':
+            key = 'identifiers'
+        if key == 'user_metadata':
+            for fm in itervalues(value):
+                if fm['datatype'] == 'datetime':
+                    fm['#value#'] = string_to_datetime(fm['#value#'])
+                decode_is_multiple(fm)
+            return value
+        elif key in self.field_metadata:
+            if self.field_metadata[key]['datatype'] == 'datetime':
+                return string_to_datetime(value)
+        if key == 'thumbnail':
+            return decode_thumbnail(value)
+        return value
--- a/ebook_converter/ebooks/metadata/html.py
+++ b/ebook_converter/ebooks/metadata/html.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+'''
+Try to read metadata from an HTML file.
+'''
+
+import re
+import unittest
+
+from collections import defaultdict
+from html5_parser import parse
+from lxml.etree import Comment
+
+from calibre.ebooks.metadata import string_to_authors, authors_to_string
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre import replace_entities, isbytestring
+from calibre.utils.date import parse_date, is_date_undefined
+from polyglot.builtins import iteritems
+
+
+def get_metadata(stream):
+    src = stream.read()
+    return get_metadata_(src)
+
+
+# Metadata field -> name used for that field in HTML comments of the form
+# <!-- NAME="value" -->
+COMMENT_NAMES = {
+    'title': 'TITLE',
+    'authors': 'AUTHOR',
+    'publisher': 'PUBLISHER',
+    'isbn': 'ISBN',
+    'languages': 'LANGUAGE',
+    'pubdate': 'PUBDATE',
+    'timestamp': 'TIMESTAMP',
+    'series': 'SERIES',
+    'series_index': 'SERIESNUMBER',
+    'rating': 'RATING',
+    'comments': 'COMMENTS',
+    'tags': 'TAGS',
+}
+
+# Metadata field -> all (lower case) <meta name="..."> values accepted for
+# that field
+META_NAMES = {
+    'title' : ('dc.title', 'dcterms.title', 'title'),
+    'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
+    'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
+    'isbn': ('isbn',),
+    'languages': ('dc.language', 'dcterms.language'),
+    'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
+    'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
+    'series': ('series',),
+    'series_index': ('seriesnumber', 'series_index', 'series.index'),
+    'rating': ('rating',),
+    'comments': ('comments', 'dc.description'),
+    'tags': ('tags',),
+}
+# Reverse lookups: comment name -> field, and each accepted meta name -> field
+rmap_comment = {v:k for k, v in iteritems(COMMENT_NAMES)}
+rmap_meta = {v:k for k, l in iteritems(META_NAMES) for v in l}
+
+
+# Extract an HTML attribute value, supports both single and double quotes and
+# single quotes inside double quotes and vice versa. The opening quote is
+# captured in the group 'sq' or 'dq', the value in the group 'content'. The
+# value must contain at least one character, so empty values do not match.
+attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
+
+
+def handle_comment(data, comment_tags):
+    if not hasattr(handle_comment, 'pat'):
+        handle_comment.pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)
+    for match in handle_comment.pat.finditer(data):
+        x = match.group('name')
+        field = None
+        try:
+            field = rmap_comment[x]
+        except KeyError:
+            pass
+        if field:
+            comment_tags[field].append(replace_entities(match.group('content')))
+
+
+def parse_metadata(src):
+    root = parse(src)
+    comment_tags = defaultdict(list)
+    meta_tags = defaultdict(list)
+    meta_tag_ids = defaultdict(list)
+    title = ''
+    identifier_pat = re.compile(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE)
+    id_pat2 = re.compile(r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE)
+
+    for comment in root.iterdescendants(tag=Comment):
+        if comment.text:
+            handle_comment(comment.text, comment_tags)
+
+    for q in root.iterdescendants(tag='title'):
+        if q.text:
+            title = q.text
+            break
+
+    for meta in root.iterdescendants(tag='meta'):
+        name, content = meta.get('name'), meta.get('content')
+        if not name or not content:
+            continue
+        if identifier_pat.match(name) is not None:
+            scheme = None
+            if id_pat2.match(name) is not None:
+                scheme = meta.get('scheme')
+            else:
+                elements = re.split(r'[.:]', name)
+                if len(elements) == 3 and not meta.get('scheme'):
+                    scheme = elements[2].strip()
+            if scheme:
+                meta_tag_ids[scheme.lower()].append(content)
+        else:
+            x = name.lower()
+            field = None
+            try:
+                field = rmap_meta[x]
+            except KeyError:
+                try:
+                    field = rmap_meta[x.replace(':', '.')]
+                except KeyError:
+                    pass
+            if field:
+                meta_tags[field].append(content)
+
+    return comment_tags, meta_tags, meta_tag_ids, title
+
+
+def get_metadata_(src, encoding=None):
+    # Meta data definitions as in
+    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
+
+    if isbytestring(src):
+        if not encoding:
+            src = xml_to_unicode(src)[0]
+        else:
+            src = src.decode(encoding, 'replace')
+    src = src[:150000]  # Searching shouldn't take too long
+    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)
+
+    def get_all(field):
+        ans = comment_tags.get(field, meta_tags.get(field, None))
+        if ans:
+            ans = [x.strip() for x in ans if x.strip()]
+        if not ans:
+            ans = None
+        return ans
+
+    def get(field):
+        ans = get_all(field)
+        if ans:
+            ans = ans[0]
+        return ans
+
+    # Title
+    title = get('title') or title_tag.strip() or _('Unknown')
+
+    # Author
+    authors = authors_to_string(get_all('authors')) or _('Unknown')
+
+    # Create MetaInformation with Title and Author
+    mi = Metadata(title, string_to_authors(authors))
+
+    # Single-value text fields
+    for field in ('publisher', 'isbn'):
+        val = get(field)
+        if val:
+            setattr(mi, field, val)
+
+    # Multi-value text fields
+    for field in ('languages',):
+        val = get_all(field)
+        if val:
+            setattr(mi, field, val)
+
+    # HTML fields
+    for field in ('comments',):
+        val = get(field)
+        if val:
+            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))
+
+    # Date fields
+    for field in ('pubdate', 'timestamp'):
+        try:
+            val = parse_date(get(field))
+        except:
+            pass
+        else:
+            if not is_date_undefined(val):
+                setattr(mi, field, val)
+
+    # SERIES
+    series = get('series')
+    if series:
+        pat = re.compile(r'\[([.0-9]+)\]$')
+        match = pat.search(series)
+        series_index = None
+        if match is not None:
+            try:
+                series_index = float(match.group(1))
+            except:
+                pass
+            series = series.replace(match.group(), '').strip()
+        mi.series = series
+        if series_index is None:
+            series_index = get('series_index')
+            try:
+                series_index = float(series_index)
+            except:
+                pass
+        if series_index is not None:
+            mi.series_index = series_index
+
+    # RATING
+    rating = get('rating')
+    if rating:
+        try:
+            mi.rating = float(rating)
+            if mi.rating < 0:
+                mi.rating = 0
+            if mi.rating > 10:
+                mi.rating = 0
+        except:
+            pass
+
+    # TAGS
+    tags = get_all('tags')
+    if tags:
+        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
+        if tags:
+            mi.tags = tags
+
+    # IDENTIFIERS
+    for (k,v) in iteritems(meta_tag_ids):
+        v = [x.strip() for x in v if x.strip()]
+        if v:
+            mi.set_identifier(k, v[0])
+
+    return mi
+
+
+class MetadataHtmlTest(unittest.TestCase):
+    '''
+    Tests for get_metadata(): each test feeds a generated HTML document to
+    the reader and compares the result with a hand-built Metadata object.
+    '''
+
+    def compare_metadata(self, meta_a, meta_b):
+        # Assert that both metadata objects agree on every attribute that
+        # the HTML reader can set.
+        for attr in (
+            'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series',
+            'series_index', 'rating', 'comments', 'tags', 'identifiers'
+        ):
+            self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))
+
+    def get_stream(self, test):
+        # Build the HTML document for the named test as a BytesIO. The
+        # fixtures are cumulative: 'title' adds only the <title> tag,
+        # 'meta_single' adds a first set of <meta> tags, 'meta_multi' a
+        # second set, 'comment_single' a set of comments and 'comment_multi'
+        # a second set of comments.
+        from io import BytesIO
+
+        raw = b'''\
+<html>
+    <head>
+'''
+
+        if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
+            # NOTE(review): this fixture starts with a stray '}' line;
+            # confirm whether that is intentional.
+            raw += b'''\
+        }
+        <title>A Title Tag &amp;amp; Title &#x24B8;</title>
+'''
+
+        if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
+            # First set of meta tags, including empty values and identifier
+            # tags whose content says they are invalid
+            raw += b'''\
+        <meta name="dc:title" content="A Meta Tag &amp;amp; Title &#9400;" />
+        <meta name="dcterms.creator.aut" content="George Washington" />
+        <meta name="dc.publisher" content="Publisher A" />
+        <meta name="isbn" content="1234567890" />
+        <meta name="dc.language" content="English" />
+        <meta name="dc.date.published" content="2019-01-01" />
+        <meta name="dcterms.created" content="2018-01-01" />
+        <meta name="series" content="Meta Series" />
+        <meta name="seriesnumber" content="1" />
+        <meta name="rating" content="" />
+        <meta name="dc.description" content="" />
+        <meta name="tags" content="tag a, tag b" />
+        <meta name="dc.identifier.url" content="" />
+        <meta name="dc.identifier" scheme="" content="invalid" />
+        <meta name="dc.identifier." content="still invalid" />
+        <meta name="dc.identifier.conflicting" scheme="schemes" content="are also invalid" />
+        <meta name="dc.identifier.custom.subid" content="invalid too" />
+'''
+
+        if test in {'meta_multi', 'comment_single', 'comment_multi'}:
+            # Second set of meta tags, using alternative names for the fields
+            raw += b'''\
+        <meta name="title" content="A Different Meta Tag &amp;amp; Title &#9400;" />
+        <meta name="author" content="John Adams with Thomas Jefferson" />
+        <meta name="publisher" content="Publisher B" />
+        <meta name="isbn" content="2345678901" />
+        <meta name="dcterms.language" content="Spanish" />
+        <meta name="date of publication" content="2017-01-01" />
+        <meta name="timestamp" content="2016-01-01" />
+        <meta name="series" content="Another Meta Series" />
+        <meta name="series.index" content="2" />
+        <meta name="rating" content="8" />
+        <meta name="comments" content="meta &quot;comments&quot; &#x2665; HTML &amp;amp;" />
+        <meta name="tags" content="tag c" />
+        <meta name="dc.identifier.url" content="http://google.com/search?q=calibre" />
+'''
+
+        if test in {'comment_single', 'comment_multi'}:
+            # First set of comment metadata
+            raw += b'''\
+        <!-- TITLE="A Comment Tag &amp;amp; Title &#9400;" -->
+        <!-- AUTHOR="James Madison and James Monroe" -->
+        <!-- PUBLISHER="Publisher C" -->
+        <!-- ISBN="3456789012" -->
+        <!-- LANGUAGE="French" -->
+        <!-- PUBDATE="2015-01-01" -->
+        <!-- TIMESTAMP="2014-01-01" -->
+        <!-- SERIES="Comment Series" -->
+        <!-- SERIESNUMBER="3" -->
+        <!-- RATING="20" -->
+        <!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML -- too &amp;amp;" -->
+        <!-- TAGS="tag d" -->
+'''
+
+        if test in {'comment_multi'}:
+            # Second set of comment metadata
+            raw += b'''\
+        <!-- TITLE="Another Comment Tag &amp;amp; Title &#9400;" -->
+        <!-- AUTHOR="John Quincy Adams" -->
+        <!-- PUBLISHER="Publisher D" -->
+        <!-- ISBN="4567890123" -->
+        <!-- LANGUAGE="Japanese" -->
+        <!-- PUBDATE="2013-01-01" -->
+        <!-- TIMESTAMP="2012-01-01" -->
+        <!-- SERIES="Comment Series 2" -->
+        <!-- SERIESNUMBER="4" -->
+        <!-- RATING="1" -->
+        <!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML -- too &amp;amp; for sure" -->
+        <!-- TAGS="tag e, tag f" -->
+'''
+
+        raw += b'''\
+    </head>
+    <body>
+    </body>
+</html>
+'''
+        return BytesIO(raw)
+
+    # Only a <title> tag: the title is read from it, authors are unknown
+    def test_input_title(self):
+        stream_meta = get_metadata(self.get_stream('title'))
+        canon_meta = Metadata('A Title Tag &amp; Title Ⓒ', [_('Unknown')])
+        self.compare_metadata(stream_meta, canon_meta)
+
+    # One set of meta tags: empty rating/description values leave those
+    # fields unset and none of the identifier tags contributes an identifier
+    def test_input_meta_single(self):
+        stream_meta = get_metadata(self.get_stream('meta_single'))
+        canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington'])
+        canon_meta.publisher = 'Publisher A'
+        canon_meta.languages = ['English']
+        canon_meta.pubdate = parse_date('2019-01-01')
+        canon_meta.timestamp = parse_date('2018-01-01')
+        canon_meta.series = 'Meta Series'
+        canon_meta.series_index = float(1)
+        # canon_meta.rating = float(0)
+        # canon_meta.comments = ''
+        canon_meta.tags = ['tag a', 'tag b']
+        canon_meta.set_identifiers({'isbn': '1234567890'})
+        self.compare_metadata(stream_meta, canon_meta)
+
+    # Two sets of meta tags: single-valued fields keep the first value,
+    # multi-valued fields (authors, languages, tags) combine both sets
+    def test_input_meta_multi(self):
+        stream_meta = get_metadata(self.get_stream('meta_multi'))
+        canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
+        canon_meta.publisher = 'Publisher A'
+        canon_meta.languages = ['English', 'Spanish']
+        canon_meta.pubdate = parse_date('2019-01-01')
+        canon_meta.timestamp = parse_date('2018-01-01')
+        canon_meta.series = 'Meta Series'
+        canon_meta.series_index = float(1)
+        canon_meta.rating = float(8)
+        canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
+        canon_meta.tags = ['tag a', 'tag b', 'tag c']
+        canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
+        self.compare_metadata(stream_meta, canon_meta)
+
+    # Comment metadata overrides the meta tags; the out of range rating of
+    # 20 is expected to be read as 0
+    def test_input_comment_single(self):
+        stream_meta = get_metadata(self.get_stream('comment_single'))
+        canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe'])
+        canon_meta.publisher = 'Publisher C'
+        canon_meta.languages = ['French']
+        canon_meta.pubdate = parse_date('2015-01-01')
+        canon_meta.timestamp = parse_date('2014-01-01')
+        canon_meta.series = 'Comment Series'
+        canon_meta.series_index = float(3)
+        canon_meta.rating = float(0)
+        canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
+        canon_meta.tags = ['tag d']
+        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
+        self.compare_metadata(stream_meta, canon_meta)
+
+    # Two sets of comments: single-valued fields keep the first comment
+    # value, multi-valued fields combine both comment sets
+    def test_input_comment_multi(self):
+        stream_meta = get_metadata(self.get_stream('comment_multi'))
+        canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
+        canon_meta.publisher = 'Publisher C'
+        canon_meta.languages = ['French', 'Japanese']
+        canon_meta.pubdate = parse_date('2015-01-01')
+        canon_meta.timestamp = parse_date('2014-01-01')
+        canon_meta.series = 'Comment Series'
+        canon_meta.series_index = float(3)
+        canon_meta.rating = float(0)
+        canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
+        canon_meta.tags = ['tag d', 'tag e', 'tag f']
+        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
+        self.compare_metadata(stream_meta, canon_meta)
+
+
+def find_tests():
+    return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest)
--- a/ebook_converter/ebooks/metadata/meta.py
+++ b/ebook_converter/ebooks/metadata/meta.py
@@ -0,0 +1,243 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, re, collections
+
+from calibre.utils.config import prefs
+from calibre.constants import filesystem_encoding
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre import isbytestring
+from calibre.customize.ui import get_file_type_metadata, set_file_type_metadata
+from calibre.ebooks.metadata import MetaInformation, string_to_authors
+from polyglot.builtins import getcwd, unicode_type
+
+# The priorities for loading metadata from different file types.
+# Metadata read from a format with a higher value is used to update metadata
+# read from a format with a lower value. Extensions listed below get the
+# priorities 1, 2, 3, ... in the order written; any other extension falls back
+# to the defaultdict default of 0.
+METADATA_PRIORITIES = collections.defaultdict(lambda:0)
+for i, ext in enumerate((
+    'html', 'htm', 'xhtml', 'xhtm',
+    'rtf', 'fb2', 'pdf', 'prc', 'odt',
+    'epub', 'lit', 'lrx', 'lrf', 'mobi',
+    'azw', 'azw3', 'azw1', 'rb', 'imp', 'snb'
+)):
+    METADATA_PRIORITIES[ext] = i + 1
+
+
+def path_to_ext(path):
+    """Return the lower-cased extension of ``path`` without its leading dot ('' if none)."""
+    _root, suffix = os.path.splitext(path)
+    return suffix[1:].lower()
+
+
+def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
+    """
+    Return metadata merged from the book files listed in ``formats``.
+
+    The real work is done by _metadata_from_formats(). If that fails with any
+    ordinary error, the metadata is instead guessed from the file name of the
+    first entry of ``formats`` and the authors default to "Unknown".
+
+    @param formats: Paths of the files (one per format) that make up the book.
+    @param force_read_metadata: Passed through to get_metadata().
+    @param pattern: Compiled regular expression used to extract metadata from
+                    file names, or None.
+    """
+    try:
+        return _metadata_from_formats(formats, force_read_metadata, pattern)
+    except Exception:
+        # Deliberately not a bare "except:", so that KeyboardInterrupt and
+        # SystemExit are not turned into a silent file name fallback.
+        mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
+        if not mi.authors:
+            mi.authors = [_('Unknown')]
+        return mi
+
+
+def _metadata_from_formats(formats, force_read_metadata=False, pattern=None):
+    """
+    Merge the metadata of all files in ``formats`` into one object.
+
+    ``formats`` is sorted in place by METADATA_PRIORITIES. An OPF file with a
+    title wins outright. Otherwise every file is read in priority order and
+    merged with smart_update(); files that cannot be read are skipped. Reading
+    stops early once the merged metadata carries an application_id.
+    """
+    merged = MetaInformation(None, None)
+    formats.sort(key=lambda fmt_path: METADATA_PRIORITIES[path_to_ext(fmt_path)])
+    extensions = [path_to_ext(fmt_path) for fmt_path in formats]
+
+    if 'opf' in extensions:
+        from_opf = opf_metadata(formats[extensions.index('opf')])
+        if from_opf is not None and from_opf.title:
+            return from_opf
+
+    for path, ext in zip(formats, extensions):
+        with lopen(path, 'rb') as stream:
+            try:
+                merged.smart_update(get_metadata(
+                    stream, stream_type=ext, use_libprs_metadata=True,
+                    force_read_metadata=force_read_metadata, pattern=pattern))
+            except Exception:
+                # Best effort: an unreadable format must not abort the others
+                continue
+            if getattr(merged, 'application_id', None) is not None:
+                return merged
+
+    if not merged.title:
+        merged.title = _('Unknown')
+    if not merged.authors:
+        merged.authors = [_('Unknown')]
+    return merged
+
+
+def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False,
+                 force_read_metadata=False, pattern=None):
+    """
+    Read metadata from ``stream`` via _get_metadata() and put the stream
+    position back where it was, whether or not reading succeeded.
+
+    Streams without tell() are rewound to offset 0; streams without seek()
+    are left alone.
+    """
+    start = stream.tell() if hasattr(stream, 'tell') else 0
+    try:
+        return _get_metadata(stream, stream_type, use_libprs_metadata,
+                             force_read_metadata, pattern)
+    finally:
+        if hasattr(stream, 'seek'):
+            stream.seek(start)
+
+
+def _get_metadata(stream, stream_type, use_libprs_metadata,
+                  force_read_metadata=False, pattern=None):
+    """
+    Build metadata for ``stream`` from three sources, weakest first: the file
+    name, the file contents and a sibling .opf file.
+
+    @param stream: File like object. If it has a ``name`` attribute, that is
+                   used for the file name guess and to look for an OPF file
+                   with the same base name.
+    @param stream_type: File extension identifying the format (any case).
+    @param use_libprs_metadata: If True and the sibling OPF carries an
+                   application_id, the OPF metadata is returned as is.
+    @param force_read_metadata: Read metadata from the file contents even if
+                   the 'read_file_metadata' preference is off.
+    @param pattern: Compiled regular expression for the file name guess, or
+                   None to use the configured pattern.
+    """
+    if stream_type:
+        stream_type = stream_type.lower()
+    # Collapse extension aliases onto the canonical type name. 'htm' used to
+    # be missing here (the tuple listed 'html' twice), so .htm files were not
+    # treated as HTML.
+    if stream_type in ('html', 'htm', 'xhtml', 'xhtm', 'xml'):
+        stream_type = 'html'
+    if stream_type in ('mobi', 'prc', 'azw'):
+        stream_type = 'mobi'
+    if stream_type in ('odt', 'ods', 'odp', 'odg', 'odf'):
+        stream_type = 'odt'
+
+    # Look for "<basename>.opf" next to the file being read
+    opf = None
+    if hasattr(stream, 'name'):
+        c = os.path.splitext(stream.name)[0]+'.opf'
+        if os.access(c, os.R_OK):
+            opf = opf_metadata(os.path.abspath(c))
+
+    if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
+        return opf
+
+    name = os.path.basename(getattr(stream, 'name', ''))
+    # The fallback pattern matches the default filename format produced by calibre
+    base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile(
+            r'^(?P<title>.+) - (?P<author>[^-]+)$'))
+    if not base.authors:
+        base.authors = [_('Unknown')]
+    if not base.title:
+        base.title = _('Unknown')
+    mi = MetaInformation(None, None)
+    if force_read_metadata or prefs['read_file_metadata']:
+        mi = get_file_type_metadata(stream, stream_type)
+    # File contents update the file name guess, the OPF updates both
+    base.smart_update(mi)
+    if opf is not None:
+        base.smart_update(opf)
+
+    return base
+
+
+def set_metadata(stream, mi, stream_type='lrf', report_error=None):
+    """Write ``mi`` into ``stream`` using the metadata writer for ``stream_type`` (case-insensitive)."""
+    fmt = stream_type.lower() if stream_type else stream_type
+    set_file_type_metadata(stream, mi, fmt, report_error=report_error)
+
+
+def metadata_from_filename(name, pat=None, fallback_pat=None):
+    """
+    Guess metadata from a file name.
+
+    The extension is removed, underscores become spaces and the result is
+    searched with ``pat``. The named groups title, author, series,
+    series_index, isbn, publisher, published and comments are copied into the
+    returned object if the pattern defines them. If nothing sets a title, the
+    cleaned up file name is used as the title.
+
+    @param name: File name (text or bytes in the filesystem encoding).
+    @param pat: Compiled regular expression. Defaults to the
+                'filename_pattern' preference.
+    @param fallback_pat: Compiled regular expression tried when ``pat`` does
+                not match, or None.
+    """
+    if isbytestring(name):
+        name = name.decode(filesystem_encoding, 'replace')
+    # NOTE: for a name without any '.', rpartition() leaves '' here
+    name = name.rpartition('.')[0]
+    mi = MetaInformation(None, None)
+    if pat is None:
+        pat = re.compile(prefs.get('filename_pattern'))
+    name = name.replace('_', ' ')
+    match = pat.search(name)
+    if match is None and fallback_pat is not None:
+        match = fallback_pat.search(name)
+    if match is not None:
+        # match.group() raises IndexError for a group name the pattern does
+        # not define, so every field is extracted in its own try block.
+        try:
+            mi.title = match.group('title')
+        except IndexError:
+            pass
+        try:
+            au = match.group('author')
+            aus = string_to_authors(au)
+            if aus:
+                mi.authors = aus
+                if prefs['swap_author_names'] and mi.authors:
+                    def swap(a):
+                        # Move the last part of the name to the front
+                        if ',' in a:
+                            parts = a.split(',', 1)
+                        else:
+                            parts = a.split(None, 1)
+                        if len(parts) > 1:
+                            t = parts[-1]
+                            parts = parts[:-1]
+                            parts.insert(0, t)
+                        return ' '.join(parts)
+                    mi.authors = [swap(x) for x in mi.authors]
+        except (IndexError, ValueError):
+            pass
+        try:
+            mi.series = match.group('series')
+        except IndexError:
+            pass
+        try:
+            si = match.group('series_index')
+            mi.series_index = float(si)
+        except (IndexError, ValueError, TypeError):
+            pass
+        try:
+            si = match.group('isbn')
+            mi.isbn = si
+        except (IndexError, ValueError):
+            pass
+        try:
+            publisher = match.group('publisher')
+            mi.publisher = publisher
+        except (IndexError, ValueError):
+            pass
+        try:
+            pubdate = match.group('published')
+            if pubdate:
+                from calibre.utils.date import parse_only_date
+                mi.pubdate = parse_only_date(pubdate)
+        except Exception:
+            # An unparseable date is ignored, but KeyboardInterrupt and
+            # SystemExit must still propagate (hence no bare "except:")
+            pass
+        try:
+            comments = match.group('comments')
+            mi.comments = comments
+        except (IndexError, ValueError):
+            pass
+
+    if mi.is_null('title'):
+        mi.title = name
+    return mi
+
+
+def opf_metadata(opfpath):
+    """
+    Read metadata from an OPF file.
+
+    @param opfpath: Path to the OPF file, or a file like object containing it.
+    @return: The metadata object, including the cover image data if the cover
+             named by the OPF is readable. None if the OPF has no
+             application_id or could not be processed (the traceback is
+             printed in that case).
+    """
+    if hasattr(opfpath, 'read'):
+        # Stream supplied by the caller: it stays open, the caller owns it
+        f, opened_here = opfpath, False
+        opfpath = getattr(f, 'name', getcwd())
+    else:
+        f, opened_here = open(opfpath, 'rb'), True
+    try:
+        opf = OPF(f, os.path.dirname(opfpath))
+        if opf.application_id is not None:
+            mi = opf.to_book_metadata()
+            if hasattr(opf, 'cover') and opf.cover:
+                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
+                if os.access(cpath, os.R_OK):
+                    fmt = cpath.rpartition('.')[-1]
+                    # Separate name so that the OPF stream is not shadowed
+                    with open(cpath, 'rb') as cover_file:
+                        data = cover_file.read()
+                    mi.cover_data = (fmt, data)
+            return mi
+    except Exception:
+        import traceback
+        traceback.print_exc()
+    finally:
+        # Only close what this function opened itself
+        if opened_here:
+            f.close()
+
+
+def forked_read_metadata(path, tdir):
+    """
+    Read the metadata of the book at ``path`` and store the results in the
+    directory ``tdir``: the file size in size.txt, the cover (if any) in
+    cover.jpg and the metadata as metadata.opf.
+    """
+    from calibre.ebooks.metadata.opf2 import metadata_to_opf
+    fmt = os.path.splitext(path)[1][1:].lower()
+    with lopen(path, 'rb') as book:
+        # Seeking to the end and asking for the position gives the size
+        book.seek(0, os.SEEK_END)
+        size = book.tell()
+        with lopen(os.path.join(tdir, 'size.txt'), 'wb') as size_file:
+            size_file.write(unicode_type(size).encode('ascii'))
+        book.seek(0)
+        mi = get_metadata(book, fmt)
+
+    if mi.cover_data and mi.cover_data[1]:
+        with lopen(os.path.join(tdir, 'cover.jpg'), 'wb') as cover_file:
+            cover_file.write(mi.cover_data[1])
+        # The OPF refers to the cover file instead of embedding the data
+        mi.cover_data = (None, None)
+        mi.cover = 'cover.jpg'
+
+    raw_opf = metadata_to_opf(mi, default_lang='und')
+    with lopen(os.path.join(tdir, 'metadata.opf'), 'wb') as opf_file:
+        opf_file.write(raw_opf)
--- a/ebook_converter/ebooks/metadata/odt.py
+++ b/ebook_converter/ebooks/metadata/odt.py
@@ -0,0 +1,302 @@
+#!/usr/bin/python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+#
+# Copyright (C) 2006 Søren Roug, European Environment Agency
+#
+# This is free software.  You may redistribute it under the terms
+# of the Apache license and the GNU General Public License Version
+# 2 or at your option any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+#
+# Contributor(s):
+#
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import io
+import json
+import os
+import re
+
+from lxml.etree import fromstring, tostring
+
+from calibre.ebooks.metadata import (
+    MetaInformation, authors_to_string, check_isbn, string_to_authors
+)
+from calibre.utils.date import isoformat, parse_date
+from calibre.utils.imghdr import identify
+from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+from calibre.utils.zipfile import ZipFile, safe_replace
+from odf.draw import Frame as odFrame, Image as odImage
+from odf.namespaces import DCNS, METANS, OFFICENS
+from odf.opendocument import load as odLoad
+from polyglot.builtins import as_unicode
+
+# Maps the short name of a meta.xml field to its (XML namespace, tag name)
+# pair. Used both for building XPath queries and for creating elements.
+fields = {
+    'title':            (DCNS, 'title'),
+    'description':      (DCNS, 'description'),
+    'subject':          (DCNS, 'subject'),
+    'creator':          (DCNS, 'creator'),
+    'date':             (DCNS, 'date'),
+    'language':         (DCNS, 'language'),
+    'generator':        (METANS, 'generator'),
+    'initial-creator':  (METANS, 'initial-creator'),
+    'keyword':          (METANS, 'keyword'),
+    'keywords':         (METANS, 'keywords'),
+    'editing-duration': (METANS, 'editing-duration'),
+    'editing-cycles':   (METANS, 'editing-cycles'),
+    'printed-by':       (METANS, 'printed-by'),
+    'print-date':       (METANS, 'print-date'),
+    'creation-date':    (METANS, 'creation-date'),
+    'user-defined':     (METANS, 'user-defined'),
+    # 'template':         (METANS, 'template'),
+}
+
+
+def get_metadata(stream, extract_cover=True):
+    """
+    Read metadata from the meta.xml member of the zip archive in ``stream``.
+
+    The standard fields (title, creator, description, language, keywords) are
+    read first. If the user-defined entries contain a true 'opf.metadata'
+    entry, the other 'opf.*' entries are then applied on top of them.
+
+    @param stream: File like object containing the document.
+    @param extract_cover: Passed to read_cover(); if True the cover image data
+                          is stored in the result, not only its location.
+    @return: A MetaInformation object.
+    """
+    whitespace = re.compile(r'\s+')
+
+    def normalize(s):
+        # Collapse every run of whitespace into a single space
+        return whitespace.sub(' ', s).strip()
+
+    with ZipFile(stream) as zf:
+        meta = zf.read('meta.xml')
+        root = fromstring(meta)
+
+        def find(field):
+            # Normalized text of the first element for ``field``, None if absent
+            ns, tag = fields[field]
+            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
+            if ans:
+                return normalize(tostring(ans[0], method='text', encoding='unicode', with_tail=False)).strip()
+
+        mi = MetaInformation(None, [])
+        title = find('title')
+        if title:
+            mi.title = title
+        creator = find('initial-creator') or find('creator')
+        if creator:
+            mi.authors = string_to_authors(creator)
+        desc = find('description')
+        if desc:
+            mi.comments = desc
+        lang = find('language')
+        if lang and canonicalize_lang(lang):
+            mi.languages = [canonicalize_lang(lang)]
+        kw = find('keyword') or find('keywords')
+        if kw:
+            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
+        # Collect the user-defined entries, keyed by their lower-cased name
+        data = {}
+        for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
+            name = (tag.get('{%s}name' % METANS) or '').lower()
+            vtype = tag.get('{%s}value-type' % METANS) or 'string'
+            val = tag.text
+            if name and val:
+                if vtype == 'boolean':
+                    val = val == 'true'
+                data[name] = val
+        opfmeta = False  # we need this later for the cover
+        opfnocover = False
+        if data.get('opf.metadata'):
+            # custom metadata contains OPF information
+            opfmeta = True
+            if data.get('opf.titlesort', ''):
+                mi.title_sort = data['opf.titlesort']
+            if data.get('opf.authors', ''):
+                mi.authors = string_to_authors(data['opf.authors'])
+            if data.get('opf.authorsort', ''):
+                mi.author_sort = data['opf.authorsort']
+            if data.get('opf.isbn', ''):
+                isbn = check_isbn(data['opf.isbn'])
+                if isbn is not None:
+                    mi.isbn = isbn
+            if data.get('opf.publisher', ''):
+                mi.publisher = data['opf.publisher']
+            if data.get('opf.pubdate', ''):
+                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
+            if data.get('opf.identifiers'):
+                try:
+                    mi.identifiers = json.loads(data['opf.identifiers'])
+                except Exception:
+                    pass
+            if data.get('opf.rating'):
+                try:
+                    # Clamp the rating to the range 0..10
+                    mi.rating = max(0, min(float(data['opf.rating']), 10))
+                except Exception:
+                    pass
+            if data.get('opf.series', ''):
+                mi.series = data['opf.series']
+                if data.get('opf.seriesindex', ''):
+                    try:
+                        mi.series_index = float(data['opf.seriesindex'])
+                    except Exception:
+                        mi.series_index = 1.0
+            if data.get('opf.language', ''):
+                cl = canonicalize_lang(data['opf.language'])
+                if cl:
+                    mi.languages = [cl]
+            # NOTE(review): a non-boolean entry is kept as text, so any
+            # non-empty value (even 'false') disables the cover here
+            opfnocover = data.get('opf.nocover', False)
+        if not opfnocover:
+            try:
+                read_cover(stream, zf, mi, opfmeta, extract_cover)
+            except Exception:
+                pass  # Do not let an error reading the cover prevent reading other data
+
+    return mi
+
+
+def set_metadata(stream, mi):
+    """
+    Write the metadata in ``mi`` into the meta.xml member of the zip archive
+    in ``stream``, replacing the existing member in place.
+
+    @param stream: File like object containing the document.
+    @param mi: Metadata object with the values to store.
+    """
+    with ZipFile(stream) as zf:
+        # zf.read() (as used by get_metadata) instead of zf.open().read(),
+        # which left the member's file object unclosed
+        raw = _set_metadata(zf.read('meta.xml'), mi)
+
+    # Rewind; the original passed os.SEEK_SET (a whence constant that merely
+    # happens to equal 0) as the offset
+    stream.seek(0)
+    safe_replace(stream, "meta.xml", io.BytesIO(raw))
+
+
+def _set_metadata(raw, mi):
+    """
+    Return a copy of the meta.xml document ``raw`` updated from ``mi``.
+
+    Only the fields of ``mi`` that are not null are written. Title, authors,
+    comments, tags and language go into the standard elements. Values without
+    a standard element are stored as user-defined entries named 'opf.*'.
+
+    @param raw: The serialized meta.xml document.
+    @param mi: Metadata object with the values to store.
+    @return: The updated document, serialized as UTF-8 bytes.
+    """
+    root = fromstring(raw)
+    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
+    nsrmap = {v: k for k, v in namespaces.items()}  # namespace URI -> prefix
+
+    def xpath(expr, parent=root):
+        return parent.xpath(expr, namespaces=namespaces)
+
+    def remove(*tag_names):
+        # Delete all elements for the given ``fields`` keys below <office:meta>
+        for tag_name in tag_names:
+            ns = fields[tag_name][0]
+            tag_name = '{}:{}'.format(nsrmap[ns], tag_name)
+            for x in xpath('descendant::' + tag_name, meta):
+                x.getparent().remove(x)
+
+    def add(tag, val=None):
+        # Append a new element for the ``fields`` key ``tag`` to <office:meta>
+        ans = meta.makeelement('{%s}%s' % fields[tag])
+        ans.text = val
+        meta.append(ans)
+        return ans
+
+    def remove_user_metadata(*names):
+        # Delete the user-defined entries with these (lower-case) names
+        for x in xpath('//meta:user-defined'):
+            q = (x.get('{%s}name' % METANS) or '').lower()
+            if q in names:
+                x.getparent().remove(x)
+
+    def add_um(name, val, vtype='string'):
+        ans = add('user-defined', val)
+        ans.set('{%s}value-type' % METANS, vtype)
+        ans.set('{%s}name' % METANS, name)
+
+    def add_user_metadata(name, val):
+        # The first call also (re)writes the 'opf.metadata' marker entry. The
+        # flag is stored on this function object, which is created anew for
+        # every call of _set_metadata().
+        if not hasattr(add_user_metadata, 'sentinel_added'):
+            add_user_metadata.sentinel_added = True
+            remove_user_metadata('opf.metadata')
+            add_um('opf.metadata', 'true', 'boolean')
+        val_type = 'string'
+        if hasattr(val, 'strftime'):
+            # Date-like values are stored as the date part of their ISO form
+            val = isoformat(val, as_utc=True).split('T')[0]
+            val_type = 'date'
+        add_um(name, val, val_type)
+
+    # The helpers above look up ``meta`` when they are called, so it only
+    # has to be bound before their first use
+    meta = xpath('//office:meta')[0]
+
+    if not mi.is_null('title'):
+        remove('title')
+        add('title', mi.title)
+        if not mi.is_null('title_sort'):
+            remove_user_metadata('opf.titlesort')
+            add_user_metadata('opf.titlesort', mi.title_sort)
+    if not mi.is_null('authors'):
+        remove('initial-creator', 'creator')
+        val = authors_to_string(mi.authors)
+        add('initial-creator', val), add('creator', val)
+        remove_user_metadata('opf.authors')
+        add_user_metadata('opf.authors', val)
+        if not mi.is_null('author_sort'):
+            remove_user_metadata('opf.authorsort')
+            add_user_metadata('opf.authorsort', mi.author_sort)
+    if not mi.is_null('comments'):
+        remove('description')
+        add('description', mi.comments)
+    if not mi.is_null('tags'):
+        remove('keyword')
+        add('keyword', ', '.join(mi.tags))
+    if not mi.is_null('languages'):
+        lang = lang_as_iso639_1(mi.languages[0])
+        if lang:
+            remove('language')
+            add('language', lang)
+    if not mi.is_null('pubdate'):
+        remove_user_metadata('opf.pubdate')
+        add_user_metadata('opf.pubdate', mi.pubdate)
+    if not mi.is_null('publisher'):
+        remove_user_metadata('opf.publisher')
+        add_user_metadata('opf.publisher', mi.publisher)
+    if not mi.is_null('series'):
+        remove_user_metadata('opf.series', 'opf.seriesindex')
+        add_user_metadata('opf.series', mi.series)
+        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
+    if not mi.is_null('identifiers'):
+        remove_user_metadata('opf.identifiers')
+        add_user_metadata('opf.identifiers', as_unicode(json.dumps(mi.identifiers)))
+    if not mi.is_null('rating'):
+        remove_user_metadata('opf.rating')
+        add_user_metadata('opf.rating', '%.2g' % mi.rating)
+
+    return tostring(root, encoding='utf-8', pretty_print=True)
+
+
+def read_cover(stream, zin, mi, opfmeta, extract_cover):
+    """
+    Locate the cover image of the document and record it on ``mi``.
+
+    On success mi.cover is set to the image's href inside the archive and
+    mi.odf_cover_frame to the name of the 'opf.cover' frame (None if the cover
+    was picked by size). With ``extract_cover`` true, mi.cover_data is set to
+    a (format, raw bytes) tuple as well.
+
+    @param stream: File like object containing the document.
+    @param zin: Already opened zip archive of the same document, used to read
+                the image data.
+    @param mi: Metadata object to update.
+    @param opfmeta: True if the document carries the 'opf.metadata' marker.
+    @param extract_cover: Whether to store the image data in mi.cover_data.
+    """
+    # search for a draw:image in a draw:frame with the name 'opf.cover'
+    # if opf.metadata prop is false, just use the first image that
+    # has a proper size (borrowed from docx)
+    otext = odLoad(stream)
+    cover_href = None
+    cover_data = None
+    cover_frame = None
+    imgnum = 0  # number of frames seen so far that hold a readable image
+    for frm in otext.topnode.getElementsByType(odFrame):
+        img = frm.getElementsByType(odImage)
+        if len(img) == 0:
+            continue
+        i_href = img[0].getAttribute('href')
+        try:
+            raw = zin.read(i_href)
+        except KeyError:
+            # No archive member with that name
+            continue
+        try:
+            fmt, width, height = identify(raw)
+        except Exception:
+            continue
+        imgnum += 1
+        if opfmeta and frm.getAttribute('name').lower() == 'opf.cover':
+            # An explicitly marked cover replaces any candidate chosen by size
+            cover_href = i_href
+            cover_data = (fmt, raw)
+            cover_frame = frm.getAttribute('name')  # could have upper case
+            break
+        if cover_href is None and imgnum == 1 and 0.8 <= height/width <= 1.8 and height*width >= 12000:
+            # Pick the first image as the cover if it is of a suitable size
+            cover_href = i_href
+            cover_data = (fmt, raw)
+            if not opfmeta:
+                break
+
+    if cover_href is not None:
+        mi.cover = cover_href
+        mi.odf_cover_frame = cover_frame
+        if extract_cover:
+            # NOTE(review): cover_data is assigned wherever cover_href is, so
+            # this re-read looks unreachable - confirm before removing
+            if not cover_data:
+                raw = zin.read(cover_href)
+                try:
+                    fmt = identify(raw)[0]
+                except Exception:
+                    pass
+                else:
+                    cover_data = (fmt, raw)
+            mi.cover_data = cover_data
--- a/ebook_converter/ebooks/metadata/opf2.py
+++ b/ebook_converter/ebooks/metadata/opf2.py
--- a/ebook_converter/ebooks/metadata/opf3.py
+++ b/ebook_converter/ebooks/metadata/opf3.py
--- a/ebook_converter/ebooks/metadata/rtf.py
+++ b/ebook_converter/ebooks/metadata/rtf.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
+
+"""
+Edit metadata in RTF files.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+import codecs
+import re
+
+from calibre import force_unicode
+from calibre.ebooks.metadata import MetaInformation
+from polyglot.builtins import codepoint_to_chr, string_or_bytes, unicode_type, int_to_byte, filter
+
+# Each pattern captures the raw (still RTF escaped) value of one control word
+# following {\info: everything up to the first "}" that is not preceded by a
+# backslash. get_metadata() maps \subject to comments, \category to tags and
+# \manager to the publisher.
+title_pat    = re.compile(br'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
+author_pat   = re.compile(br'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
+comment_pat  = re.compile(br'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
+tags_pat = re.compile(br'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+publisher_pat = re.compile(br'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
+
+
+def get_document_info(stream):
+    """
+    Extract the \\info block from an RTF file.
+    Return the info block as a byte string and the position in the file at
+    which it starts. Return (None, 0) if there is no \\info block or if the
+    file ends before the block is closed.
+    @param stream: File like object pointing to the RTF file.
+    """
+    block_size = 4096
+    stream.seek(0)
+    found, block = False, b""
+    while not found:
+        # Keep the tail of the previous block, so that a marker split across
+        # two reads is still found
+        prefix = block[-6:]
+        block = prefix + stream.read(block_size)
+        actual_block_size = len(block) - len(prefix)
+        if len(block) == len(prefix):
+            break  # end of file
+        idx = block.find(br'{\info')
+        if idx >= 0:
+            found = True
+            pos = stream.tell() - actual_block_size + idx - len(prefix)
+            stream.seek(pos)
+        else:
+            # Stop searching once a \sect control word has been seen
+            if block.find(br'\sect') > -1:
+                break
+    if not found:
+        return None, 0
+    data, count = [], 0
+    pos = stream.tell()
+    # Copy bytes until the brace that opened the block is closed again
+    while True:
+        ch = stream.read(1)
+        if not ch:
+            # End of file inside the block: the file is truncated. Without
+            # this check the loop would never terminate.
+            return None, 0
+        if ch == b'\\':
+            # Take the escaped byte as is, so that \{ and \} are not counted
+            data.append(ch + stream.read(1))
+            continue
+        if ch == b'{':
+            count += 1
+        elif ch == b'}':
+            count -= 1
+        data.append(ch)
+        if count == 0:
+            break
+    return b''.join(data), pos
+
+
+def detect_codepage(stream):
+    """
+    Return the name of the codec ('cp<number>') for the \\ansicpg control word
+    found in the next 512 bytes of stream, or None if there is no such control
+    word or no codec with that name. Code page 0 is treated as 1252.
+    """
+    header = stream.read(512)
+    found = re.search(br'\\ansicpg(\d+)', header)
+    if found is None:
+        return None
+    digits = found.group(1)
+    if digits == b'0':
+        digits = b'1252'
+    try:
+        codec = (b'cp' + digits).decode('ascii')
+        codecs.lookup(codec)  # raises for an unknown codec name
+    except Exception:
+        return None
+    return codec
+
+
+def encode(unistr):
+    """
+    Return unistr with every character outside of ASCII replaced by the RTF
+    escape \\u<codepoint>? (the "?" being the fallback character).
+    """
+    text = unistr if isinstance(unistr, unicode_type) else force_unicode(unistr)
+    pieces = []
+    for ch in text:
+        code = ord(ch)
+        pieces.append(ch if code < 128 else '\\u{}?'.format(code))
+    return ''.join(pieces)
+
+
+def decode(raw, codec):
+    """
+    Return raw as text with the RTF character escapes resolved: \\'hh using
+    codec (skipped if codec is None) and \\uN together with the fallback
+    character following it. Escapes that cannot be resolved become "?".
+    """
+    # https://en.wikipedia.org/wiki/Rich_Text_Format#Character_encoding
+
+    def from_codepage(m):
+        try:
+            return int_to_byte(int(m.group(1), 16)).decode(codec)
+        except ValueError:
+            return '?'
+
+    def from_codepoint(m):
+        try:
+            return codepoint_to_chr(int(m.group(1)))
+        except Exception:
+            return '?'
+
+    text = raw.decode('ascii', 'replace') if isinstance(raw, bytes) else raw
+    if codec is not None:
+        text = re.sub(r"\\'([a-fA-F0-9]{2})", from_codepage, text)
+    return re.sub(r'\\u([0-9]{3,5}).', from_codepoint, text)
+
+
+def get_metadata(stream):
+    """
+    Return metadata as a L{MetaInfo} object
+    """
+    stream.seek(0)
+    if stream.read(5) != br'{\rtf':
+        return MetaInformation(_('Unknown'))
+    block = get_document_info(stream)[0]
+    if not block:
+        return MetaInformation(_('Unknown'))
+
+    stream.seek(0)
+    cpg = detect_codepage(stream)
+    stream.seek(0)
+
+    def field(pat):
+        # Decoded value of the field matched by pat, None if it is absent
+        found = pat.search(block)
+        if found is None:
+            return None
+        return decode(found.group(1).strip(), cpg)
+
+    title = field(title_pat)
+    if title is None:
+        title = _('Unknown')
+    author = field(author_pat)
+    mi = MetaInformation(title)
+    if author:
+        mi.authors = [x.strip() for x in author.split(',')]
+
+    comment = field(comment_pat)
+    if comment is not None:
+        mi.comments = comment
+    tags = field(tags_pat)
+    if tags is not None:
+        mi.tags = list(filter(None, (x.strip() for x in tags.split(','))))
+    publisher = field(publisher_pat)
+    if publisher is not None:
+        mi.publisher = publisher
+
+    return mi
+
+
+def create_metadata(stream, options):
+    """
+    Insert a new \\info block built from options after the first six bytes of
+    stream. Nothing is written if options has no metadata to store.
+    @param options: Object with the attributes title, authors, publisher, tags
+                    and either comment or comments
+    """
+    entries = []
+    if options.title:
+        entries.append(r'{\title %s}'%(encode(options.title),))
+    if options.authors:
+        au = options.authors
+        if not isinstance(au, string_or_bytes):
+            au = ', '.join(au)
+        entries.append(r'{\author %s}'%(encode(au),))
+    comp = options.comment if hasattr(options, 'comment') else options.comments
+    if comp:
+        entries.append(r'{\subject %s}'%(encode(comp),))
+    if options.publisher:
+        entries.append(r'{\manager %s}'%(encode(options.publisher),))
+    if options.tags:
+        entries.append(r'{\category %s}'%(encode(u', '.join(options.tags)),))
+    if not entries:
+        return
+
+    info = ''.join([r'{\info'] + entries + ['}']).encode('ascii')
+    stream.seek(0)
+    src = stream.read()
+    stream.seek(0)
+    stream.write(src[:6] + info + src[6:])
+
+
+def set_metadata(stream, options):
+    '''
+    Modify/add RTF metadata in stream
+    @param options: Object with metadata attributes title, authors, comments,
+                    tags and publisher. Attributes that are None are left
+                    unchanged in the file.
+    '''
+    def add_metadata_item(src, name, val):
+        # Insert {\name val} in front of the brace closing the \info block
+        index = src.rindex('}')
+        return src[:index] + '{\\' + name + ' ' + val + '}}'
+
+    src, pos = get_document_info(stream)
+    if src is None:
+        # No \info block yet
+        create_metadata(stream, options)
+    else:
+        src = src.decode('ascii')
+        olen = len(src)
+
+        base_pat = r'\{\\name(.*?)(?<!\\)\}'
+
+        def replace_or_create(src, name, val):
+            val = encode(val)
+            pat = re.compile(base_pat.replace('name', name), re.DOTALL)
+            replacement = '{\\' + name + ' ' + val + '}'
+            # The replacement must be passed as a callable: in a replacement
+            # *string* the re module processes backslash escapes, which turns
+            # "\t" of "{\title" into a tab and rejects e.g. "\s" or "\u".
+            src, num = pat.subn(lambda m: replacement, src)
+            if num == 0:
+                src = add_metadata_item(src, name, val)
+            return src
+
+        if options.title is not None:
+            src = replace_or_create(src, 'title', options.title)
+        if options.comments is not None:
+            src = replace_or_create(src, 'subject', options.comments)
+        if options.authors is not None:
+            src = replace_or_create(src, 'author', ', '.join(options.authors))
+        if options.tags is not None:
+            src = replace_or_create(src, 'category', ', '.join(options.tags))
+        if options.publisher is not None:
+            src = replace_or_create(src, 'manager', options.publisher)
+        # Replace the old block: keep everything behind it, cut the file at
+        # the start of the block, then write the new block and the rest
+        stream.seek(pos + olen)
+        after = stream.read()
+        stream.seek(pos)
+        stream.truncate()
+        stream.write(src.encode('ascii'))
+        stream.write(after)
+
+
+def find_tests():
+    """Return a suite with the round trip test for RTF metadata."""
+    import unittest
+    from io import BytesIO
+    from calibre.ebooks.metadata.book.base import Metadata
+
+    class Test(unittest.TestCase):
+
+        def test_rtf_metadata(self):
+            # What is written with set_metadata() must be read back unchanged
+            expected = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
+            expected.tags = 'tag1 見tag2'.split()
+            expected.comments = '<p>some ⊹comments</p>'
+            expected.publisher = 'publiSher'
+
+            stream = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
+            set_metadata(stream, expected)
+            stream.seek(0)
+            actual = get_metadata(stream)
+
+            for attr in ('title', 'authors', 'publisher', 'comments', 'tags'):
+                self.assertEqual(getattr(expected, attr), getattr(actual, attr))
+
+    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
--- a/ebook_converter/ebooks/metadata/toc.py
+++ b/ebook_converter/ebooks/metadata/toc.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env  python2
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, glob, re, functools
+from collections import Counter
+
+from lxml import etree
+from lxml.builder import ElementMaker
+
+from calibre.constants import __appname__, __version__
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.utils.xml_parse import safe_xml_fromstring
+from calibre.utils.cleantext import clean_xml_chars
+from polyglot.builtins import unicode_type, getcwd
+from polyglot.urllib import unquote, urlparse
+
+# XML namespace of NCX documents; used as the default namespace in NSMAP
+NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
+# Namespace of the calibre specific <meta> elements that TOC.render() adds to
+# navPoint elements (author, description, toc_thumbnail)
+CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
+# Prefix map shared by both element factories below
+NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS}
+# Element factories: E builds elements in the NCX namespace, C builds
+# elements in the calibre namespace
+E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
+C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
+
+
+def parse_html_toc(data):
+    '''
+    Yield a (href, fragment, text) tuple for every <a> element with an href
+    attribute found in the HTML document data.
+
+    data may be bytes (decoded with xml_to_unicode) or unicode. href is the
+    path component of the unquoted link, fragment is the stripped fragment or
+    None when the link has none, and text is the text content of the element.
+    '''
+    from html5_parser import parse
+    from calibre.utils.cleantext import clean_xml_chars
+    from lxml import etree
+    if isinstance(data, bytes):
+        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
+    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
+    for anchor in root.xpath('//*[@href and local-name()="a"]'):
+        parts = urlparse(unquote(anchor.get('href')))
+        # Index 2 is the path and index 5 the fragment of the parse result
+        raw_fragment = parts[5]
+        target = parts[2].strip()
+        anchor_id = raw_fragment.strip() if raw_fragment else None
+        label = etree.tostring(anchor, method='text', encoding='unicode')
+        yield target, anchor_id, label
+
+
+class TOC(list):
+    '''
+    A node of a table of contents tree.
+
+    Every node is itself the list of its child nodes. The attributes set in
+    __init__ describe the entry the node represents; the root node normally
+    has no text of its own.
+    '''
+
+    def __init__(self, href=None, fragment=None, text=None, parent=None,
+            play_order=0, base_path=getcwd(), type='unknown', author=None,
+            description=None, toc_thumbnail=None):
+        # NOTE: the base_path default is computed once, when this def
+        # statement is executed, not on every call.
+        self.href = href
+        # Empty fragments are normalised to None
+        self.fragment = fragment or None
+        self.text = text
+        self.parent = parent
+        self.base_path = base_path
+        self.play_order = play_order
+        self.type = type
+        self.author = author
+        self.description = description
+        self.toc_thumbnail = toc_thumbnail
+
+    def __str__(self):
+        'Return an indented, one line per entry, dump of the tree rooted at self'
+        lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
+        for child in self:
+            for line in unicode_type(child).splitlines():
+                lines.append('\t'+line)
+        return '\n'.join(lines)
+
+    def count(self, type):
+        'Return the number of entries (including self) whose type equals type'
+        return sum(1 for i in self.flat() if i.type == type)
+
+    def purge(self, types, max=0):
+        '''
+        Remove entries whose type is in types from the tree, keeping the first
+        max matches (in depth first order).
+
+        Returns the list of entries selected for removal. Entries without a
+        parent (such as the root) are included in the returned list but
+        cannot be detached.
+        '''
+        matches = [entry for entry in self.flat() if entry.type in types]
+        remove = matches[max:]
+        for entry in remove:
+            if entry.parent is None:
+                continue
+            entry.parent.remove(entry)
+        return remove
+
+    def remove(self, entry):
+        '''
+        Detach entry, which must be a direct child of self, and clear its
+        parent.
+
+        Children are matched by identity. list.remove() matches by equality
+        and TOC nodes compare as lists of their children, so all childless
+        nodes are equal to each other and list.remove() would drop the first
+        leaf rather than the requested one.
+
+        Raises ValueError if entry is not a child of self.
+        '''
+        for idx, child in enumerate(self):
+            if child is entry:
+                del self[idx]
+                break
+        else:
+            raise ValueError('TOC.remove(x): x not in TOC')
+        entry.parent = None
+
+    def add_item(self, href, fragment, text, play_order=None, type='unknown',
+            author=None, description=None, toc_thumbnail=None):
+        '''
+        Append a new child entry and return it.
+
+        When play_order is None it is set to one more than the play order of
+        the last child, or of self when there are no children yet.
+        '''
+        if play_order is None:
+            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
+        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
+                        base_path=self.base_path, play_order=play_order,
+                        type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
+        return self[-1]
+
+    def top_level_items(self):
+        'Iterate over the direct children that have text set'
+        for item in self:
+            if item.text is not None:
+                yield item
+
+    def depth(self):
+        'Return the height of the tree rooted at self (1 for a node without children)'
+        depth = 1
+        for obj in self:
+            c = obj.depth()
+            if c > depth - 1:
+                depth = c + 1
+        return depth
+
+    def flat(self):
+        'Depth first iteration over the tree rooted at self'
+        yield self
+        for obj in self:
+            for i in obj.flat():
+                yield i
+
+    @property
+    def abspath(self):
+        'Return the file this toc entry points to as a absolute path to a file on the system.'
+
+        if self.href is None:
+            return None
+        path = self.href.replace('/', os.sep)
+        if not os.path.isabs(path):
+            path = os.path.join(self.base_path, path)
+        return path
+
+    def read_from_opf(self, opfreader):
+        '''
+        Locate the table of contents referenced by an OPF and load it.
+
+        The toc attribute of <spine> is tried first, then the guide reference
+        of type toc, then the first manifest item with "toc" in its href. A
+        value of ncx/ncxtoc is resolved through the manifest as an NCX file,
+        falling back to the first *.ncx file found in base_path; anything
+        else is treated as the path of an HTML ToC. Failures to read the
+        HTML ToC or a manifest NCX are reported with a printed warning
+        instead of being raised.
+        '''
+        toc = opfreader.soup.find('spine', toc=True)
+        if toc is not None:
+            toc = toc['toc']
+        if toc is None:
+            try:
+                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
+            except Exception:
+                # No usable guide reference, look for a likely manifest item
+                for item in opfreader.manifest:
+                    if 'toc' in item.href().lower():
+                        toc = item.href()
+                        break
+
+        if toc is not None:
+            if toc.lower() not in ('ncx', 'ncxtoc'):
+                toc = urlparse(unquote(toc))[2]
+                toc = toc.replace('/', os.sep)
+                if not os.path.isabs(toc):
+                    toc = os.path.join(self.base_path, toc)
+                try:
+                    if not os.path.exists(toc):
+                        bn  = os.path.basename(toc)
+                        bn  = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
+                        toc = os.path.join(os.path.dirname(toc), bn)
+
+                    self.read_html_toc(toc)
+                except Exception:
+                    # Deliberately best effort: a missing ToC is not fatal
+                    print('WARNING: Could not read Table of Contents. Continuing anyway.')
+            else:
+                path = opfreader.manifest.item(toc.lower())
+                path = getattr(path, 'path', path)
+                if path and os.access(path, os.R_OK):
+                    try:
+                        self.read_ncx_toc(path)
+                    except Exception as err:
+                        print('WARNING: Invalid NCX file:', err)
+                    return
+                cwd = os.path.abspath(self.base_path)
+                m = glob.glob(os.path.join(cwd, '*.ncx'))
+                if m:
+                    toc = m[0]
+                    self.read_ncx_toc(toc)
+
+    def read_ncx_toc(self, toc, root=None):
+        '''
+        Populate self from an NCX document.
+
+        toc is the path of the NCX file; base_path is set to its directory.
+        root may be an already parsed tree, in which case the file is not
+        read. Element and attribute names are matched case insensitively and
+        regardless of namespace.
+
+        Raises ValueError if the document has no navMap element.
+        '''
+        self.base_path = os.path.dirname(toc)
+        if root is None:
+            with open(toc, 'rb') as f:
+                raw  = xml_to_unicode(f.read(), assume_utf8=True,
+                        strip_encoding_pats=True)[0]
+            root = safe_xml_fromstring(raw)
+        xpn = {'re': 'http://exslt.org/regular-expressions'}
+        XPath = functools.partial(etree.XPath, namespaces=xpn)
+
+        def get_attr(node, default=None, attr='playorder'):
+            # First non-empty attribute whose lowercased name ends with attr
+            for name, val in node.attrib.items():
+                if name and val and name.lower().endswith(attr):
+                    return val
+            return default
+
+        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
+        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
+        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
+        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')
+
+        def process_navpoint(np, dest):
+            try:
+                play_order = int(get_attr(np, 1))
+            except Exception:
+                # Non numeric playOrder
+                play_order = 1
+            href = fragment = text = None
+            # Children are attached to dest itself when this navPoint does
+            # not yield an entry
+            nd = dest
+            nl = nl_path(np)
+            if nl:
+                nl = nl[0]
+                text = ''
+                for txt in txt_path(nl):
+                    text += etree.tostring(txt, method='text',
+                            encoding='unicode', with_tail=False)
+                content = content_path(np)
+                if content and text:
+                    content = content[0]
+                    # if get_attr(content, attr='src'):
+                    purl = urlparse(content.get('src'))
+                    href, fragment = unquote(purl[2]), unquote(purl[5])
+                    nd = dest.add_item(href, fragment, text)
+                    nd.play_order = play_order
+
+            for c in np_path(np):
+                process_navpoint(c, nd)
+
+        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
+        if not nm:
+            raise ValueError('NCX files must have a <navmap> element.')
+        nm = nm[0]
+
+        for child in np_path(nm):
+            process_navpoint(child, self)
+
+    def read_html_toc(self, toc):
+        '''
+        Add an entry for every link in the HTML file at path toc, skipping
+        links whose (href, fragment) pair is already present in the tree.
+        base_path is set to the directory of toc.
+        '''
+        self.base_path = os.path.dirname(toc)
+        # Plain open(), as in read_ncx_toc()
+        with open(toc, 'rb') as f:
+            parsed_toc = parse_html_toc(f.read())
+        for href, fragment, txt in parsed_toc:
+            add = True
+            for i in self.flat():
+                if i.href == href and i.fragment == fragment:
+                    add = False
+                    break
+            if add:
+                self.add_item(href, fragment, txt)
+
+    def render(self, stream, uid):
+        '''
+        Serialize the tree as an NCX document (UTF-8, with XML declaration)
+        and write it to stream. uid is stored in the dtb:uid meta element.
+        '''
+        root = E.ncx(
+                E.head(
+                    E.meta(name='dtb:uid', content=unicode_type(uid)),
+                    E.meta(name='dtb:depth', content=unicode_type(self.depth())),
+                    E.meta(name='dtb:generator', content='%s (%s)'%(__appname__,
+                        __version__)),
+                    E.meta(name='dtb:totalPageCount', content='0'),
+                    E.meta(name='dtb:maxPageNumber', content='0'),
+                ),
+                E.docTitle(E.text('Table of Contents')),
+        )
+        navmap = E.navMap()
+        root.append(navmap)
+        root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+        # Mutable counter shared with the nested function, used for ids
+        c = Counter()
+
+        def navpoint(parent, np):
+            text = np.text or ''
+            c[1] += 1
+            item_id = 'num_%d'%c[1]
+            text = clean_xml_chars(text)
+            elem = E.navPoint(
+                    E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
+                    E.content(src=unicode_type(np.href)+(('#' + unicode_type(np.fragment))
+                        if np.fragment else '')),
+                    id=item_id,
+                    playOrder=unicode_type(np.play_order)
+            )
+            au = getattr(np, 'author', None)
+            if au:
+                au = re.sub(r'\s+', ' ', au)
+                elem.append(C.meta(au, name='author'))
+            desc = getattr(np, 'description', None)
+            if desc:
+                desc = re.sub(r'\s+', ' ', desc)
+                try:
+                    elem.append(C.meta(desc, name='description'))
+                except ValueError:
+                    # Retry with characters that are invalid in XML removed
+                    elem.append(C.meta(clean_xml_chars(desc), name='description'))
+            idx = getattr(np, 'toc_thumbnail', None)
+            if idx:
+                elem.append(C.meta(idx, name='toc_thumbnail'))
+            parent.append(elem)
+            for np2 in np:
+                navpoint(elem, np2)
+
+        for np in self:
+            navpoint(navmap, np)
+        raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
+                pretty_print=True)
+        stream.write(raw)
--- a/ebook_converter/ebooks/metadata/utils.py
+++ b/ebook_converter/ebooks/metadata/utils.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from collections import namedtuple
+
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.oeb.base import OPF
+from calibre.ebooks.oeb.polish.utils import guess_type
+from calibre.spell import parse_lang_code
+from calibre.utils.cleantext import clean_xml_chars
+from calibre.utils.localization import lang_as_iso639_1
+from calibre.utils.xml_parse import safe_xml_fromstring
+from polyglot.builtins import filter, map
+
+OPFVersion = namedtuple('OPFVersion', 'major minor patch')
+
+
+def parse_opf_version(raw):
+    '''
+    Parse a dotted version string into an OPFVersion.
+
+    Returns 2.0.0 when the major component is missing or not an integer,
+    and major.0.0 when any later component is not an integer. Missing
+    components are filled with zeros and components beyond the third are
+    dropped.
+    '''
+    pieces = (raw or '').split('.')
+    try:
+        major = int(pieces[0])
+    except Exception:
+        return OPFVersion(2, 0, 0)
+    try:
+        numbers = [int(piece) for piece in pieces]
+    except Exception:
+        numbers = [major, 0, 0]
+    # Pad to at least three components, then keep exactly three
+    numbers = (numbers + [0, 0, 0])[:3]
+    return OPFVersion(*numbers)
+
+
+def parse_opf(stream_or_path):
+    '''
+    Parse an OPF document and return its root element.
+
+    stream_or_path is either an object with a read() method or the path of
+    a file. A file opened here from a path is closed again before
+    returning; a stream supplied by the caller is left open.
+
+    Raises ValueError if the input is empty or parsing yields no root.
+    '''
+    stream = stream_or_path
+    opened_here = not hasattr(stream, 'read')
+    if opened_here:
+        stream = open(stream, 'rb')
+    try:
+        raw = stream.read()
+    finally:
+        if opened_here:
+            stream.close()
+    if not raw:
+        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
+    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
+    # Drop anything that precedes the first tag
+    raw = raw[raw.find('<'):]
+    root = safe_xml_fromstring(clean_xml_chars(raw))
+    if root is None:
+        raise ValueError('Not an OPF file')
+    return root
+
+
+def normalize_languages(opf_languages, mi_languages):
+    ' Preserve original country codes and use 2-letter lang codes where possible '
+    def parse(code):
+        # Codes that cannot be parsed are reported as None and skipped
+        try:
+            return parse_lang_code(code)
+        except ValueError:
+            return None
+
+    # Country code recorded in the OPF for each language code
+    cc_map = {}
+    for code in opf_languages:
+        parsed = parse(code)
+        if parsed:
+            cc_map[parsed.langcode] = parsed.countrycode
+
+    normalized = []
+    for code in mi_languages:
+        parsed = parse(code)
+        if not parsed:
+            continue
+        lang = parsed.langcode
+        country = parsed.countrycode or cc_map.get(lang, None)
+        lang = lang_as_iso639_1(lang) or lang
+        if country:
+            lang += '-' + country
+        normalized.append(lang)
+    return normalized
+
+
+def ensure_unique(template, existing):
+    '''
+    Return template, or a variant of it, that is not contained in existing.
+
+    Variants are built by inserting -1, -2, ... before the last extension,
+    or at the end when template has no non-empty stem and extension.
+    '''
+    stem, _, ext = template.rpartition('.')
+    if stem and ext:
+        suffix = '.' + ext
+    else:
+        stem, suffix = template, ''
+    candidate = template
+    counter = 0
+    while candidate in existing:
+        counter += 1
+        candidate = '%s-%d%s' % (stem, counter, suffix)
+    return candidate
+
+
+def create_manifest_item(root, href_template, id_template, media_type=None):
+    '''
+    Append a new <item> to the manifest of the OPF tree root and return it.
+
+    The href and id are derived from the templates so that they do not clash
+    with any href or id attribute already present in the document. When
+    media_type is not given it is guessed from href_template. Returns None
+    if root has no manifest element.
+    '''
+    used_ids = frozenset(root.xpath('//*/@id'))
+    used_hrefs = frozenset(root.xpath('//*/@href'))
+    href = ensure_unique(href_template, used_hrefs)
+    item_id = ensure_unique(id_template, used_ids)
+    manifest = root.find(OPF('manifest'))
+    if manifest is None:
+        return None
+    item = manifest.makeelement(OPF('item'))
+    item.set('href', href)
+    item.set('id', item_id)
+    item.set('media-type', media_type or guess_type(href_template))
+    manifest.append(item)
+    return item
+
+
+def pretty_print_opf(root):
+    ' Run pretty_opf() and then pretty_xml_tree() on root '
+    from calibre.ebooks.oeb.polish.pretty import pretty_opf
+    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree
+    for step in (pretty_opf, pretty_xml_tree):
+        step(root)
--- a/ebook_converter/ebooks/mobi/init.py
+++ b/ebook_converter/ebooks/mobi/init.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env  python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class MobiError(Exception):
+    ' Error raised by the MOBI handling code '
+
+
+# Limits for thumbnail images: maximum size in bytes and maximum dimensions.
+# These values might be a bit small on the Kindle Paperwhite (PW), but
+# Amazon/KindleGen (KG) 2.5 still uses them, even when delivering to a PW
+MAX_THUMB_SIZE = 16 * 1024
+MAX_THUMB_DIMEN = (180, 240)
--- a/ebook_converter/ebooks/mobi/huffcdic.py
+++ b/ebook_converter/ebooks/mobi/huffcdic.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
+and igorsk.
+'''
+
+import struct
+
+from calibre.ebooks.mobi import MobiError
+from polyglot.builtins import map
+
+
+class Reader(object):
+    '''
+    Decoder for Huff/cdic compressed data.
+
+    load_huff() must be called first to load the code tables, followed by
+    load_cdic() for each dictionary record. unpack() then decompresses data.
+    '''
+
+    def __init__(self):
+        # Reads one big-endian unsigned 64 bit integer from a buffer at a
+        # given offset
+        self.q = struct.Struct(b'>Q').unpack_from
+
+    def load_huff(self, huff):
+        '''
+        Load the code tables from the HUFF record huff (bytes) and reset the
+        phrase dictionary.
+
+        Raises MobiError if the record does not start with the HUFF header.
+        '''
+        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
+            raise MobiError('Invalid HUFF header')
+        # Offsets, within the record, of the two tables
+        off1, off2 = struct.unpack_from(b'>LL', huff, 8)
+
+        def dict1_unpack(v):
+            # Low 5 bits: code length, bit 0x80: terminal flag, the bits
+            # above the low byte: the maximum code for that length
+            codelen, term, maxcode = v&0x1f, v&0x80, v>>8
+            assert codelen != 0
+            if codelen <= 8:
+                assert term
+            # Left align maxcode in 32 bits, with all lower bits set
+            maxcode = ((maxcode + 1) << (32 - codelen)) - 1
+            return (codelen, term, maxcode)
+        # 256 entries, indexed in unpack() by the top 8 bits of the code
+        self.dict1 = tuple(map(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)))
+
+        # 64 values: alternating minimum and maximum codes. After the
+        # placeholder 0 is prepended below, index i of self.mincode and
+        # self.maxcode holds the limits for code length i
+        dict2 = struct.unpack_from(b'>64L', huff, off2)
+        self.mincode, self.maxcode = (), ()
+        for codelen, mincode in enumerate((0,) + dict2[0::2]):
+            self.mincode += (mincode << (32 - codelen), )
+        for codelen, maxcode in enumerate((0,) + dict2[1::2]):
+            self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
+
+        # Filled by subsequent load_cdic() calls
+        self.dictionary = []
+
+    def load_cdic(self, cdic):
+        '''
+        Append the phrases stored in the CDIC record cdic (bytes) to the
+        dictionary.
+
+        Raises MobiError if the record does not start with the CDIC header.
+        '''
+        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
+            raise MobiError('Invalid CDIC header')
+        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
+        # Number of entries to read from this record: at most 2**bits, and
+        # never more than the difference between phrases and the number of
+        # entries already loaded
+        n = min(1<<bits, phrases-len(self.dictionary))
+        h = struct.Struct(b'>H').unpack_from
+
+        def getslice(off):
+            # off is relative to byte 16 of the record. It points at a 16 bit
+            # word whose low 15 bits are the length of the phrase that
+            # follows it and whose top bit is a flag, see unpack()
+            blen, = h(cdic, 16+off)
+            slice = cdic[18+off:18+off+(blen&0x7fff)]
+            return (slice, blen&0x8000)
+        # The record holds n 16 bit offsets starting at byte 16
+        self.dictionary += map(getslice, struct.unpack_from(b'>%dH' % n, cdic, 16))
+
+    def unpack(self, data):
+        '''
+        Decompress data (bytes) and return the result as bytes.
+
+        Dictionary entries whose flag is not set are themselves decompressed
+        on first use and the result is stored back into the dictionary.
+        '''
+        q = self.q
+
+        bitsleft = len(data) * 8
+        # Pad so that a 64 bit read starting anywhere inside the original
+        # data stays within the buffer
+        data += b'\x00\x00\x00\x00\x00\x00\x00\x00'
+        pos = 0
+        # x is a 64 bit window onto data at byte pos. n is the right shift
+        # that brings the next 32 undecoded bits of the window into code
+        x, = q(data, pos)
+        n = 32
+
+        s = []
+        while True:
+            if n <= 0:
+                # The upper half of the window is used up, move on 4 bytes
+                pos += 4
+                x, = q(data, pos)
+                n += 32
+            code = (x >> n) & ((1 << 32) - 1)
+
+            codelen, term, maxcode = self.dict1[code >> 24]
+            if not term:
+                # Not a terminal entry: lengthen the code until it is no
+                # longer below the minimum code for that length
+                while code < self.mincode[codelen]:
+                    codelen += 1
+                maxcode = self.maxcode[codelen]
+
+            n -= codelen
+            bitsleft -= codelen
+            if bitsleft < 0:
+                # This code extends into the padding, the input is exhausted
+                break
+
+            # Index of the phrase in the dictionary
+            r = (maxcode - code) >> (32 - codelen)
+            slice_, flag = self.dictionary[r]
+            if not flag:
+                # The phrase is itself compressed. The entry is set to None
+                # while it is expanded, so a phrase referring to itself fails
+                # on the tuple unpacking above instead of recursing
+                # (presumably the intent, confirm)
+                self.dictionary[r] = None
+                slice_ = self.unpack(slice_)
+                self.dictionary[r] = (slice_, 1)
+            s.append(slice_)
+        return b''.join(s)
+
+
+class HuffReader(object):
+    '''
+    Convenience wrapper that sets up a Reader from a sequence of records:
+    the first is loaded as the HUFF record, all following ones as CDIC
+    records.
+    '''
+
+    def __init__(self, huffs):
+        self.reader = decoder = Reader()
+        decoder.load_huff(huffs[0])
+        for record in huffs[1:]:
+            decoder.load_cdic(record)
+
+    def unpack(self, section):
+        'Decompress section using the loaded tables'
+        decoder = self.reader
+        return decoder.unpack(section)
--- a/Show More
+++ b/Show More