Mirror of https://github.com/gryf/ebook-converter.git

Added docx writer related modules

commit 98b2dd8d4f
parent ae80ae5640
2020-04-13 16:33:15 +02:00
29 changed files with 5956 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap, os
from lxml import etree
from lxml.builder import ElementMaker
from calibre import guess_type
from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.pdf.render.common import PAPER_SIZES
from calibre.utils.date import utcnow
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.zipfile import ZipFile
from polyglot.builtins import iteritems, map, unicode_type, native_string_type
def xml2str(root, pretty_print=False, with_tail=False):
if hasattr(etree, 'cleanup_namespaces'):
etree.cleanup_namespaces(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print, with_tail=with_tail)
return ans
def page_size(opts):
width, height = PAPER_SIZES[opts.docx_page_size]
if opts.docx_custom_page_size is not None:
width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2])
return width, height
def page_margin(opts, which):
val = getattr(opts, 'docx_page_margin_' + which)
if val == 0.0:
val = getattr(opts, 'margin_' + which)
return val
def page_effective_area(opts):
width, height = page_size(opts)
width -= page_margin(opts, 'left') + page_margin(opts, 'right')
height -= page_margin(opts, 'top') + page_margin(opts, 'bottom')
return width, height # in pts
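# A rough worked example of the sizing logic above: PAPER_SIZES gives page
# dimensions in points (an A4 page is about 595 x 842 pts), and a custom size
# such as '612x792' is split on the 'x' and parsed as floats. With 72 pt
# margins on every side the effective area would then be roughly 451 x 698 pts.
# Values stay in points here; create_skeleton() below multiplies by 20 to get
# the twentieths of a point (twips) that WordprocessingML expects.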
def create_skeleton(opts, namespaces=None):
namespaces = namespaces or DOCXNamespace().namespaces
def w(x):
return '{%s}%s' % (namespaces['w'], x)
dn = {k:v for k, v in iteritems(namespaces) if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
doc = E.document()
body = E.body()
doc.append(body)
width, height = page_size(opts)
width, height = int(20 * width), int(20 * height)
def margin(which):
val = page_margin(opts, which)
return w(which), unicode_type(int(val * 20))
body.append(E.sectPr(
E.pgSz(**{w('w'):unicode_type(width), w('h'):unicode_type(height)}),
E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
E.cols(**{w('space'):'720'}),
E.docGrid(**{w('linePitch'):"360"}),
))
dn = {k:v for k, v in iteritems(namespaces) if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
styles = E.styles(
E.docDefaults(
E.rPrDefault(
E.rPr(
E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
E.sz(**{w('val'):'22'}),
E.szCs(**{w('val'):'22'}),
E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
)
),
E.pPrDefault(
E.pPr(
E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
)
)
)
)
return doc, styles, body
def update_doc_props(root, mi, namespace):
def setm(name, text=None, ns='dc'):
ans = root.makeelement('{%s}%s' % (namespace.namespaces[ns], name))
for child in tuple(root):
if child.tag == ans.tag:
root.remove(child)
ans.text = text
root.append(ans)
return ans
setm('title', mi.title)
setm('creator', authors_to_string(mi.authors))
if mi.tags:
setm('keywords', ', '.join(mi.tags), ns='cp')
if mi.comments:
setm('description', mi.comments)
if mi.languages:
l = canonicalize_lang(mi.languages[0])
setm('language', lang_as_iso639_1(l) or l)
class DocumentRelationships(object):
def __init__(self, namespace):
self.rmap = {}
self.namespace = namespace
for typ, target in iteritems({
namespace.names['STYLES']: 'styles.xml',
namespace.names['NUMBERING']: 'numbering.xml',
namespace.names['WEB_SETTINGS']: 'webSettings.xml',
namespace.names['FONTS']: 'fontTable.xml',
}):
self.add_relationship(target, typ)
def get_relationship_id(self, target, rtype, target_mode=None):
return self.rmap.get((target, rtype, target_mode))
def add_relationship(self, target, rtype, target_mode=None):
ans = self.get_relationship_id(target, rtype, target_mode)
if ans is None:
ans = 'rId%d' % (len(self.rmap) + 1)
self.rmap[(target, rtype, target_mode)] = ans
return ans
def add_image(self, target):
return self.add_relationship(target, self.namespace.names['IMAGES'])
def serialize(self):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
relationships = E.Relationships()
for (target, rtype, target_mode), rid in iteritems(self.rmap):
r = E.Relationship(Id=rid, Type=rtype, Target=target)
if target_mode is not None:
r.set('TargetMode', target_mode)
relationships.append(r)
return xml2str(relationships)
class DOCX(object):
def __init__(self, opts, log):
self.namespace = DOCXNamespace()
namespaces = self.namespace.namespaces
self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships(self.namespace)
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
self.embedded_fonts = E.Relationships()
self.fonts = {}
self.images = {}
# Boilerplate {{{
@property
def contenttypes(self):
E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
types = E.Types()
for partname, mt in iteritems({
"/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
"/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
"/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
"/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
"/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
"/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
"/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
"/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
"/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
"/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
"/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
}):
types.append(E.Override(PartName=partname, ContentType=mt))
added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
for ext in added:
types.append(E.Default(Extension=ext, ContentType=guess_type('a.'+ext)[0]))
for ext, mt in iteritems({
"rels": "application/vnd.openxmlformats-package.relationships+xml",
"odttf": "application/vnd.openxmlformats-officedocument.obfuscatedFont",
}):
added.add(ext)
types.append(E.Default(Extension=ext, ContentType=mt))
for fname in self.images:
ext = fname.rpartition(os.extsep)[-1]
if ext not in added:
added.add(ext)
mt = guess_type('a.' + ext)[0]
if mt:
types.append(E.Default(Extension=ext, ContentType=mt))
return xml2str(types)
@property
def appproperties(self):
E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
props = E.Properties(
E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]),
E.DocSecurity('0'),
E.HyperlinksChanged('false'),
E.LinksUpToDate('true'),
E.ScaleCrop('false'),
E.SharedDoc('false'),
)
if self.mi.publisher:
props.append(E.Company(self.mi.publisher))
return xml2str(props)
@property
def containerrels(self):
return textwrap.dedent('''\
<?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names)).encode('utf-8')
@property
def websettings(self):
E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
ws = E.webSettings(
E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
return xml2str(ws)
# }}}
def convert_metadata(self, mi):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat(native_string_type('T')).rpartition('.')[0] + 'Z'
for x in 'created modified'.split():
x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
x.text = ts
cp.append(x)
self.mi = mi
update_doc_props(cp, self.mi, self.namespace)
return xml2str(cp)
def create_empty_document(self, mi):
self.document, self.styles = create_skeleton(self.opts)[:2]
def write(self, path_or_stream, mi, create_empty_document=False):
if create_empty_document:
self.create_empty_document(mi)
with ZipFile(path_or_stream, 'w') as zf:
zf.writestr('[Content_Types].xml', self.contenttypes)
zf.writestr('_rels/.rels', self.containerrels)
zf.writestr('docProps/core.xml', self.convert_metadata(mi))
zf.writestr('docProps/app.xml', self.appproperties)
zf.writestr('word/webSettings.xml', self.websettings)
zf.writestr('word/document.xml', xml2str(self.document))
zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/numbering.xml', xml2str(self.numbering))
zf.writestr('word/fontTable.xml', xml2str(self.font_table))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
for fname, data_getter in iteritems(self.images):
zf.writestr(fname, data_getter())
for fname, data in iteritems(self.fonts):
zf.writestr(fname, data)
if __name__ == '__main__':
d = DOCX(None, None)
print(d.websettings)

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from uuid import uuid4
from calibre.ebooks.oeb.base import OEB_STYLES
from calibre.ebooks.oeb.transforms.subset import find_font_face_rules
from polyglot.builtins import range
def obfuscate_font_data(data, key):
prefix = bytearray(data[:32])
key = bytearray(reversed(key.bytes))
prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
return prefix + data[32:]
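# obfuscate_font_data() implements the standard OOXML embedded-font
# obfuscation: the first 32 bytes of the font file are XORed with the 16 bytes
# of the font's GUID taken in reverse order (so the key repeats twice), and the
# rest of the data is left untouched. The same GUID, formatted in braces and
# upper-cased, is written out as w:fontKey in serialize() below so that Word
# can reverse the transformation when it loads the document.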
class FontsManager(object):
def __init__(self, namespace, oeb, opts):
self.namespace = namespace
self.oeb, self.log, self.opts = oeb, oeb.log, opts
def serialize(self, text_styles, fonts, embed_relationships, font_data_map):
makeelement = self.namespace.makeelement
font_families, seen = set(), set()
for ts in text_styles:
if ts.font_family:
lf = ts.font_family.lower()
if lf not in seen:
seen.add(lf)
font_families.add(ts.font_family)
family_map = {}
for family in sorted(font_families):
family_map[family] = makeelement(fonts, 'w:font', w_name=family)
embedded_fonts = []
for item in self.oeb.manifest:
if item.media_type in OEB_STYLES and hasattr(item.data, 'cssRules'):
embedded_fonts.extend(find_font_face_rules(item, self.oeb))
num = 0
face_map = defaultdict(set)
rel_map = {}
for ef in embedded_fonts:
ff = ef['font-family'][0]
if ff not in font_families:
continue
num += 1
bold = ef['weight'] > 400
italic = ef['font-style'] != 'normal'
tag = 'Regular'
if bold or italic:
tag = 'Italic'
if bold and italic:
tag = 'BoldItalic'
elif bold:
tag = 'Bold'
if tag in face_map[ff]:
continue
face_map[ff].add(tag)
font = family_map[ff]
key = uuid4()
item = ef['item']
rid = rel_map.get(item)
if rid is None:
rel_map[item] = rid = 'rId%d' % num
fname = 'fonts/font%d.odttf' % num
makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname)
font_data_map['word/' + fname] = obfuscate_font_data(item.data, key)
makeelement(font, 'w:embed' + tag, r_id=rid,
w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(),
w_subsetted="true" if self.opts.subset_embedded_fonts else "false")

View File

@@ -0,0 +1,617 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from collections import Counter
from calibre.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area
from calibre.ebooks.docx.writer.styles import StylesManager, FloatSpec
from calibre.ebooks.docx.writer.links import LinksManager
from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table
from calibre.ebooks.docx.writer.lists import ListsManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import unicode_type, string_or_bytes
def lang_for_tag(tag):
for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'):
val = lang_as_iso639_1(tag.get(attr))
if val:
return val
class Style(St):
def __init__(self, *args, **kwargs):
St.__init__(self, *args, **kwargs)
self._letterSpacing = None
@property
def letterSpacing(self):
if self._letterSpacing is None:  # lazily compute and cache the converted value
val = self._get('letter-spacing')
if val == 'normal':
self._letterSpacing = val
else:
self._letterSpacing = self._unit_convert(val)
return self._letterSpacing
class Stylizer(Sz):
def style(self, element):
try:
return self._styles[element]
except KeyError:
return Style(element, self)
class TextRun(object):
ws_pat = None
def __init__(self, namespace, style, first_html_parent, lang=None):
self.first_html_parent = first_html_parent
if self.ws_pat is None:
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
self.style = style
self.texts = []
self.link = None
self.lang = lang
self.parent_style = None
self.makeelement = namespace.makeelement
self.descendant_style = None
def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
if not preserve_whitespace:
text = self.ws_pat.sub(' ', text)
if text.strip() != text:
# If preserve_whitespace is False, Word ignores leading and
# trailing whitespace
preserve_whitespace = True
self.texts.append((text, preserve_whitespace, bookmark))
self.link = link
def add_break(self, clear='none', bookmark=None):
self.texts.append((None, clear, bookmark))
def add_image(self, drawing, bookmark=None):
self.texts.append((drawing, None, bookmark))
def serialize(self, p, links_manager):
makeelement = self.makeelement
parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link)
r = makeelement(parent, 'w:r')
rpr = makeelement(r, 'w:rPr', append=False)
if getattr(self.descendant_style, 'id', None) is not None:
makeelement(rpr, 'w:rStyle', w_val=self.descendant_style.id)
if self.lang:
makeelement(rpr, 'w:lang', w_bidi=self.lang, w_val=self.lang, w_eastAsia=self.lang)
if len(rpr) > 0:
r.append(rpr)
for text, preserve_whitespace, bookmark in self.texts:
if bookmark is not None:
bid = links_manager.bookmark_id
makeelement(r, 'w:bookmarkStart', w_id=unicode_type(bid), w_name=bookmark)
if text is None:
makeelement(r, 'w:br', w_clear=preserve_whitespace)
elif hasattr(text, 'xpath'):
r.append(text)
else:
t = makeelement(r, 'w:t')
t.text = text or ''
if preserve_whitespace:
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
if bookmark is not None:
makeelement(r, 'w:bookmarkEnd', w_id=unicode_type(bid))
def __repr__(self):
return repr(self.texts)
def is_empty(self):
if not self.texts:
return True
if len(self.texts) == 1 and self.texts[0][:2] == ('', False):
return True
return False
@property
def style_weight(self):
ans = 0
for text, preserve_whitespace, bookmark in self.texts:
if isinstance(text, unicode_type):
ans += len(text)
return ans
class Block(object):
def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None):
self.force_not_empty = False
self.namespace = namespace
self.bookmarks = set()
self.list_tag = (html_block, style) if is_list_item else None
self.is_first_block = False
self.numbering_id = None
self.parent_items = None
self.html_block = html_block
self.html_tag = barename(html_block.tag)
self.float_spec = float_spec
if float_spec is not None:
float_spec.blocks.append(self)
self.html_style = style
self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
self.styles_manager, self.links_manager = styles_manager, links_manager
self.keep_next = False
self.runs = []
self.skipped = False
self.linked_style = None
self.page_break_before = style['page-break-before'] == 'always'
self.keep_lines = style['page-break-inside'] == 'avoid'
self.page_break_after = False
self.block_lang = None
def resolve_skipped(self, next_block):
if not self.is_empty():
return
if len(self.html_block) > 0 and self.html_block[0] is next_block.html_block:
self.skipped = True
if self.list_tag is not None:
next_block.list_tag = self.list_tag
def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False, bookmark=None, link=None, lang=None):
ws = style['white-space']
preserve_whitespace = ws in {'pre', 'pre-wrap', '-o-pre-wrap'}
ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style)
if self.runs and ts == self.runs[-1].style and link == self.runs[-1].link and lang == self.runs[-1].lang:
run = self.runs[-1]
else:
run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent, lang=lang)
self.runs.append(run)
if ignore_leading_whitespace and not preserve_whitespace:
text = text.lstrip()
if preserve_whitespace or ws == 'pre-line':
for text in text.splitlines():
run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
bookmark = None
run.add_break()
else:
run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
def add_break(self, clear='none', bookmark=None):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_break(clear=clear, bookmark=bookmark)
def add_image(self, drawing, bookmark=None):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_image(drawing, bookmark=bookmark)
def serialize(self, body):
makeelement = self.namespace.makeelement
p = makeelement(body, 'w:p')
end_bookmarks = []
for bmark in self.bookmarks:
end_bookmarks.append(unicode_type(self.links_manager.bookmark_id))
makeelement(p, 'w:bookmarkStart', w_id=end_bookmarks[-1], w_name=bmark)
if self.block_lang:
rpr = makeelement(p, 'w:rPr')
makeelement(rpr, 'w:lang', w_val=self.block_lang, w_bidi=self.block_lang, w_eastAsia=self.block_lang)
ppr = makeelement(p, 'w:pPr')
if self.keep_next:
makeelement(ppr, 'w:keepNext')
if self.float_spec is not None:
self.float_spec.serialize(self, ppr)
if self.numbering_id is not None:
numpr = makeelement(ppr, 'w:numPr')
makeelement(numpr, 'w:ilvl', w_val=unicode_type(self.numbering_id[1]))
makeelement(numpr, 'w:numId', w_val=unicode_type(self.numbering_id[0]))
if self.linked_style is not None:
makeelement(ppr, 'w:pStyle', w_val=self.linked_style.id)
elif self.style.id:
makeelement(ppr, 'w:pStyle', w_val=self.style.id)
if self.is_first_block:
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
elif self.page_break_before:
makeelement(ppr, 'w:pageBreakBefore', w_val='on')
if self.keep_lines:
makeelement(ppr, 'w:keepLines', w_val='on')
for run in self.runs:
run.serialize(p, self.links_manager)
for bmark in end_bookmarks:
makeelement(p, 'w:bookmarkEnd', w_id=bmark)
def __repr__(self):
return 'Block(%r)' % self.runs
__str__ = __repr__
def is_empty(self):
if self.force_not_empty:
return False
for run in self.runs:
if not run.is_empty():
return False
return True
class Blocks(object):
def __init__(self, namespace, styles_manager, links_manager):
self.top_bookmark = None
self.namespace = namespace
self.styles_manager = styles_manager
self.links_manager = links_manager
self.all_blocks = []
self.pos = 0
self.current_block = None
self.items = []
self.tables = []
self.current_table = None
self.open_html_blocks = set()
self.html_tag_start_blocks = {}
def current_or_new_block(self, html_tag, tag_style):
return self.current_block or self.start_new_block(html_tag, tag_style)
def end_current_block(self):
if self.current_block is not None:
self.all_blocks.append(self.current_block)
if self.current_table is not None and self.current_table.current_row is not None:
self.current_table.add_block(self.current_block)
else:
self.block_map[self.current_block] = len(self.items)
self.items.append(self.current_block)
self.current_block.parent_items = self.items
self.current_block = None
def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
parent_bg = None
if html_block is not None:
p = html_block.getparent()
b = self.html_tag_start_blocks.get(p)
if b is not None:
ps = self.styles_manager.styles_for_html_blocks.get(p)
if ps is not None and ps.background_color is not None:
parent_bg = ps.background_color
self.end_current_block()
self.current_block = Block(
self.namespace, self.styles_manager, self.links_manager, html_block, style,
is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item,
parent_bg=parent_bg)
self.html_tag_start_blocks[html_block] = self.current_block
self.open_html_blocks.add(html_block)
return self.current_block
def start_new_table(self, html_tag, tag_style=None):
self.current_table = Table(self.namespace, html_tag, tag_style)
self.tables.append(self.current_table)
def start_new_row(self, html_tag, tag_style):
if self.current_table is None:
self.start_new_table(html_tag)
self.current_table.start_new_row(html_tag, tag_style)
def start_new_cell(self, html_tag, tag_style):
if self.current_table is None:
self.start_new_table(html_tag)
self.current_table.start_new_cell(html_tag, tag_style)
def finish_tag(self, html_tag):
if self.current_block is not None and html_tag in self.open_html_blocks:
start_block = self.html_tag_start_blocks.get(html_tag)
if start_block is not None and start_block.html_style['page-break-after'] == 'always':
self.current_block.page_break_after = True
self.end_current_block()
self.open_html_blocks.discard(html_tag)
if self.current_table is not None:
table_finished = self.current_table.finish_tag(html_tag)
if table_finished:
table = self.tables[-1]
del self.tables[-1]
if self.tables:
self.current_table = self.tables[-1]
self.current_table.add_table(table)
else:
self.current_table = None
self.block_map[table] = len(self.items)
self.items.append(table)
def serialize(self, body):
for item in self.items:
item.serialize(body)
def delete_block_at(self, pos=None):
pos = self.pos if pos is None else pos
block = self.all_blocks[pos]
del self.all_blocks[pos]
bpos = self.block_map.pop(block, None)
if bpos is not None:
del self.items[bpos]
else:
items = self.items if block.parent_items is None else block.parent_items
items.remove(block)
block.parent_items = None
if block.float_spec is not None:
block.float_spec.blocks.remove(block)
try:
next_block = self.all_blocks[pos]
next_block.bookmarks.update(block.bookmarks)
for attr in 'page_break_after page_break_before'.split():
setattr(next_block, attr, getattr(block, attr))
except (IndexError, KeyError):
pass
def __enter__(self):
self.pos = len(self.all_blocks)
self.block_map = {}
def __exit__(self, etype, value, traceback):
if value is not None:
return # Since there was an exception, the data structures are not in a consistent state
if self.current_block is not None:
self.all_blocks.append(self.current_block)
self.current_block = None
if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty():
# Delete the empty block corresponding to the <body> tag when the
# body tag has no inline content before its first sub-block
self.delete_block_at(self.pos)
if self.pos > 0 and self.pos < len(self.all_blocks):
# Insert a page break corresponding to the start of the html file
self.all_blocks[self.pos].page_break_before = True
if self.top_bookmark is not None:
self.all_blocks[self.pos].bookmarks.add(self.top_bookmark)
self.top_bookmark = None
self.block_map = {}
def apply_page_break_after(self):
for i, block in enumerate(self.all_blocks):
if block.page_break_after and i < len(self.all_blocks) - 1:
next_block = self.all_blocks[i + 1]
if next_block.parent_items is block.parent_items and block.parent_items is self.items:
next_block.page_break_before = True
def resolve_language(self):
default_lang = self.styles_manager.document_lang
for block in self.all_blocks:
count = Counter()
for run in block.runs:
count[run.lang] += 1
if count:
block.block_lang = bl = count.most_common(1)[0][0]
for run in block.runs:
if run.lang == bl:
run.lang = None
if bl == default_lang:
block.block_lang = None
def __repr__(self):
return 'Blocks(%r)' % self.all_blocks
class Convert(object):
# Word does not apply default styling to hyperlinks, so we ensure they get
# default styling (the conversion pipeline does not apply any styling to
# them).
base_css = '''
a[href] { text-decoration: underline; color: blue }
'''
def __init__(self, oeb, docx, mi, add_cover, add_toc):
self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
self.log, self.opts = docx.log, docx.opts
self.mi = mi
self.cover_img = None
p = self.opts.output_profile
p.width_pts, p.height_pts = page_effective_area(self.opts)
def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
self.lists_manager = ListsManager(self.docx)
self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
self.current_link = self.current_lang = None
for item in self.oeb.spine:
self.log.debug('Processing', item.href)
self.process_item(item)
if self.add_toc:
self.links_manager.process_toc_links(self.oeb)
if self.add_cover and self.oeb.metadata.cover and unicode_type(self.oeb.metadata.cover[0]) in self.oeb.manifest.ids:
cover_id = unicode_type(self.oeb.metadata.cover[0])
item = self.oeb.manifest.ids[cover_id]
self.cover_img = self.images_manager.read_image(item.href)
all_blocks = self.blocks.all_blocks
remove_blocks = []
for i, block in enumerate(all_blocks):
try:
nb = all_blocks[i+1]
except IndexError:
break
block.resolve_skipped(nb)
if block.skipped:
remove_blocks.append((i, block))
for pos, block in reversed(remove_blocks):
self.blocks.delete_block_at(pos)
self.blocks.all_blocks[0].is_first_block = True
self.blocks.apply_page_break_after()
self.blocks.resolve_language()
if self.cover_img is not None:
self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts))
self.lists_manager.finalize(all_blocks)
self.styles_manager.finalize(all_blocks)
self.write()
def process_item(self, item):
self.current_item = item
stylizer = self.svg_rasterizer.stylizer_cache.get(item)
if stylizer is None:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css)
self.abshref = self.images_manager.abshref = item.abshref
self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang
for i, body in enumerate(XPath('//h:body')(item.data)):
with self.blocks:
self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body)
self.process_tag(body, stylizer, is_first_tag=i == 0)
def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
tagname = barename(html_tag.tag)
tag_style = stylizer.style(html_tag)
ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
display = tag_style._get('display')
is_block = False
if not ignore_tag_contents:
previous_link = self.current_link
if tagname == 'a' and html_tag.get('href'):
self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
previous_lang = self.current_lang
tag_lang = lang_for_tag(html_tag)
if tag_lang:
self.current_lang = tag_lang
is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
if float_spec is None and is_float:
float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)
if display in {'inline', 'inline-block'} or tagname == 'br': # <br> has display:block but we don't want to start a new paragraph
if is_float and float_spec.is_dropcaps:
self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
float_spec = None
else:
self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
elif display == 'list-item':
self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
elif display.startswith('table') or display == 'inline-table':
if display == 'table-cell':
self.blocks.start_new_cell(html_tag, tag_style)
self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
elif display == 'table-row':
self.blocks.start_new_row(html_tag, tag_style)
elif display in {'table', 'inline-table'}:
self.blocks.end_current_block()
self.blocks.start_new_table(html_tag, tag_style)
else:
if tagname == 'img' and is_float:
# Image is floating so don't start a new paragraph for it
self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
else:
if tagname == 'hr':
for edge in 'right bottom left'.split():
tag_style.set('border-%s-style' % edge, 'none')
self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
for child in html_tag.iterchildren():
if isinstance(getattr(child, 'tag', None), string_or_bytes):
self.process_tag(child, stylizer, float_spec=float_spec)
else: # Comment/PI/etc.
tail = getattr(child, 'tail', None)
if tail:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text(tail, tag_style, is_parent_style=False, link=self.current_link, lang=self.current_lang)
is_block = html_tag in self.blocks.open_html_blocks
self.blocks.finish_tag(html_tag)
if is_block and tag_style['page-break-after'] == 'avoid':
self.blocks.all_blocks[-1].keep_next = True
self.current_link = previous_link
self.current_lang = previous_lang
# Now, process the tail if any
if display == 'table-row':
return # We ignore the tail for these tags
ignore_whitespace_tail = is_block or display.startswith('table')
if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
# Ignore trailing space after a block tag, as otherwise it will
# become a new empty paragraph
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link, lang=self.current_lang)
def create_block_from_parent(self, html_tag, stylizer):
parent = html_tag.getparent()
block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
# Do not inherit page-break-before from parent
block.page_break_before = False
return block
def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
block = self.blocks.start_new_block(
html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
anchor = html_tag.get('id') or html_tag.get('name')
if anchor:
block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
if tagname == 'img':
self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
else:
text = html_tag.text
if text:
block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang)
elif tagname == 'li' and len(html_tag) and barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]):
block.force_not_empty = True
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
anchor = html_tag.get('id') or html_tag.get('name') or None
bmark = None
if anchor:
bmark = self.bookmark_for_anchor(anchor, html_tag)
if tagname == 'br':
if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
elif tagname == 'img':
block = self.create_block_from_parent(html_tag, stylizer)
self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
else:
if html_tag.text:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
elif bmark:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text('', tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
def bookmark_for_anchor(self, anchor, html_tag):
return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)
def write(self):
self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
self.blocks.serialize(body)
body.append(body[0]) # Move <sectPr> to the end
if self.links_manager.toc:
self.links_manager.serialize_toc(body, self.styles_manager.primary_heading_style)
if self.cover_img is not None:
self.images_manager.write_cover_block(body, self.cover_img)
self.styles_manager.serialize(self.docx.styles)
self.images_manager.serialize(self.docx.images)
self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
self.lists_manager.serialize(self.docx.numbering)

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import posixpath
from collections import namedtuple
from functools import partial
from polyglot.builtins import iteritems, itervalues, map, unicode_type
from lxml import etree
from calibre import fit_image
from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.docx.images import pt_to_emu
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import identify
Image = namedtuple('Image', 'rid fname width height fmt item')
def as_num(x):
try:
return float(x)
except Exception:
pass
return 0
def get_image_margins(style):
ans = {}
for edge in 'Left Right Top Bottom'.split():
val = as_num(getattr(style, 'padding' + edge)) + as_num(getattr(style, 'margin' + edge))
ans['dist' + edge[0]] = unicode_type(pt_to_emu(val))
return ans
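# pt_to_emu converts points to English Metric Units, the unit DrawingML uses
# for all lengths (1 pt = 12700 EMU, 1 inch = 914400 EMU). A 9 pt margin, for
# example, becomes a distL/distR/distT/distB value of 114300.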
class ImagesManager(object):
def __init__(self, oeb, document_relationships, opts):
self.oeb, self.log = oeb, oeb.log
self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts
self.images = {}
self.seen_filenames = set()
self.document_relationships = document_relationships
self.count = 0
def read_image(self, href):
if href not in self.images:
item = self.oeb.manifest.hrefs.get(href)
if item is None or not isinstance(item.data, bytes):
return
try:
fmt, width, height = identify(item.data)
except Exception:
self.log.warning('Replacing corrupted image with blank: %s' % href)
item.data = I('blank.png', data=True, allow_user_override=False)
fmt, width, height = identify(item.data)
image_fname = 'media/' + self.create_filename(href, fmt)
image_rid = self.document_relationships.add_image(image_fname)
self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
item.unload_data_from_memory()
return self.images[href]
def add_image(self, img, block, stylizer, bookmark=None, as_block=False):
src = img.get('src')
if not src:
return
href = self.abshref(src)
try:
rid = self.read_image(href).rid
except AttributeError:
return
drawing = self.create_image_markup(img, stylizer, href, as_block=as_block)
block.add_image(drawing, bookmark=bookmark)
return rid
def create_image_markup(self, html_img, stylizer, href, as_block=False):
# TODO: img inside a link (clickable image)
style = stylizer.style(html_img)
floating = style['float']
if floating not in {'left', 'right'}:
floating = None
if as_block:
ml, mr = style._get('margin-left'), style._get('margin-right')
if ml == 'auto':
floating = 'center' if mr == 'auto' else 'right'
if mr == 'auto':
floating = 'center' if ml == 'auto' else 'left'
else:
parent = html_img.getparent()
if len(parent) == 1 and not (parent.text or '').strip() and not (html_img.tail or '').strip():
pstyle = stylizer.style(parent)
if 'block' in pstyle['display']:
# We have an inline image alone inside a block
as_block = True
floating = pstyle['float']
if floating not in {'left', 'right'}:
floating = None
if pstyle['text-align'] in ('center', 'right'):
floating = pstyle['text-align']
floating = floating or 'left'
fake_margins = floating is None
self.count += 1
img = self.images[href]
name = urlunquote(posixpath.basename(href))
width, height = style.img_size(img.width, img.height)
scaled, width, height = fit_image(width, height, self.page_width, self.page_height)
width, height = map(pt_to_emu, (width, height))
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
root = etree.Element('root', nsmap=namespaces)
ans = makeelement(root, 'w:drawing', append=False)
if floating is None:
parent = makeelement(ans, 'wp:inline')
else:
parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
# The following attribute settings are boilerplate that Word requires, even
# though the DOCX specs define defaults for all of them
parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
makeelement(parent, 'wp:simplePos', x='0', y='0')
makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
if fake_margins:
# DOCX does not support setting margins for inline images, so we
# fake it by using effect extents to simulate margins
makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))})
else:
makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
if floating is not None:
# The idiotic Word requires this to be after the extent settings
if as_block:
makeelement(parent, 'wp:wrapTopAndBottom')
else:
makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height)
return ans
def create_docx_image_markup(self, parent, name, alt, img_rid, width, height):
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
makeelement(parent, 'wp:docPr', id=unicode_type(self.count), name=name, descr=alt)
makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1")
g = makeelement(parent, 'a:graphic')
gd = makeelement(g, 'a:graphicData', uri=namespaces['pic'])
pic = makeelement(gd, 'pic:pic')
nvPicPr = makeelement(pic, 'pic:nvPicPr')
makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt)
makeelement(nvPicPr, 'pic:cNvPicPr')
bf = makeelement(pic, 'pic:blipFill')
makeelement(bf, 'a:blip', r_embed=img_rid)
makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
spPr = makeelement(pic, 'pic:spPr')
xfrm = makeelement(spPr, 'a:xfrm')
makeelement(xfrm, 'a:off', x='0', y='0'), makeelement(xfrm, 'a:ext', cx=unicode_type(width), cy=unicode_type(height))
makeelement(makeelement(spPr, 'a:prstGeom', prst='rect'), 'a:avLst')
def create_filename(self, href, fmt):
fname = ascii_filename(urlunquote(posixpath.basename(href)))
fname = posixpath.splitext(fname)[0]
fname = fname[:75].rstrip('.') or 'image'
num = 0
base = fname
while fname.lower() in self.seen_filenames:
num += 1
fname = base + unicode_type(num)
self.seen_filenames.add(fname.lower())
fname += os.extsep + fmt.lower()
return fname
def serialize(self, images_map):
for img in itervalues(self.images):
images_map['word/' + img.fname] = partial(self.get_data, img.item)
def get_data(self, item):
try:
return item.data
finally:
item.unload_data_from_memory(False)
def create_cover_markup(self, img, preserve_aspect_ratio, width, height):
self.count += 1
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
if preserve_aspect_ratio:
if img.width >= img.height:
ar = img.height / img.width
height = ar * width
else:
ar = img.width / img.height
width = ar * height
root = etree.Element('root', nsmap=namespaces)
ans = makeelement(root, 'w:drawing', append=False)
parent = makeelement(ans, 'wp:anchor', **{'dist'+edge:'0' for edge in 'LRTB'})
parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
makeelement(parent, 'wp:simplePos', x='0', y='0')
makeelement(makeelement(parent, 'wp:positionH', relativeFrom='page'), 'wp:align').text = 'center'
makeelement(makeelement(parent, 'wp:positionV', relativeFrom='page'), 'wp:align').text = 'center'
width, height = map(pt_to_emu, (width, height))
makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
makeelement(parent, 'wp:wrapTopAndBottom')
self.create_docx_image_markup(parent, 'cover.jpg', _('Cover'), img.rid, width, height)
return ans
def write_cover_block(self, body, cover_image):
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
pbb.set('{%s}val' % namespaces['w'], 'on')
p = makeelement(body, 'w:p', append=False)
body.insert(0, p)
r = makeelement(p, 'w:r')
r.append(cover_image)

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath, re
from uuid import uuid4
from calibre.utils.filenames import ascii_text
from polyglot.builtins import unicode_type
from polyglot.urllib import urlparse
def start_text(tag, prefix_len=0, top_level=True):
ans = tag.text or ''
limit = 50 - prefix_len
if len(ans) < limit:
for child in tag.iterchildren('*'):
ans += start_text(child, len(ans), top_level=False) + (child.tail or '')
if len(ans) >= limit:
break
if top_level and len(ans) > limit:
ans = ans[:limit] + '...'
return ans
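# start_text() collects up to roughly 50 characters of text from the start of
# a tag, descending into its children when the tag's own text is short, and
# appends '...' when the result had to be truncated. It is used below to give
# bookmarks a human-readable name.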
class TOCItem(object):
def __init__(self, title, bmark, level):
self.title, self.bmark, self.level = title, bmark, level
self.is_first = self.is_last = False
def serialize(self, body, makeelement):
p = makeelement(body, 'w:p', append=False)
ppr = makeelement(p, 'w:pPr')
makeelement(ppr, 'w:pStyle', w_val="Normal")
makeelement(ppr, 'w:ind', w_left='0', w_firstLineChars='0', w_firstLine='0', w_leftChars=unicode_type(200 * self.level))
if self.is_first:
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
r = makeelement(p, 'w:r')
makeelement(r, 'w:fldChar', w_fldCharType='begin')
r = makeelement(p, 'w:r')
makeelement(r, 'w:instrText').text = r' TOC \h '
r[0].set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
r = makeelement(p, 'w:r')
makeelement(r, 'w:fldChar', w_fldCharType='separate')
hl = makeelement(p, 'w:hyperlink', w_anchor=self.bmark)
r = makeelement(hl, 'w:r')
rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:color', w_val='0000FF', w_themeColor='hyperlink')
makeelement(rpr, 'w:u', w_val='single')
makeelement(r, 'w:t').text = self.title
if self.is_last:
r = makeelement(p, 'w:r')
makeelement(r, 'w:fldChar', w_fldCharType='end')
body.insert(0, p)
def sanitize_bookmark_name(base):
# Max length allowed by Word appears to be 40, we use 32 to leave some
# space for making the name unique
return re.sub(r'[^0-9a-zA-Z]', '_', ascii_text(base))[:32].rstrip('_')
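# For example, an anchor whose starting text is "Chapter 1: The Beginning"
# would become the bookmark name "Chapter_1__The_Beginning"; if that name is
# already taken, bookmark_for_anchor() below appends _1, _2, ... to keep the
# names unique.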
class LinksManager(object):
def __init__(self, namespace, document_relationships, log):
self.namespace = namespace
self.log = log
self.document_relationships = document_relationships
self.top_anchor = unicode_type(uuid4().hex)
self.anchor_map = {}
self.used_bookmark_names = set()
self.bmark_id = 0
self.document_hrefs = set()
self.external_links = {}
self.toc = []
def bookmark_for_anchor(self, anchor, current_item, html_tag):
key = (current_item.href, anchor)
if key in self.anchor_map:
return self.anchor_map[key]
if anchor == self.top_anchor:
name = ('Top of %s' % posixpath.basename(current_item.href))
self.document_hrefs.add(current_item.href)
else:
name = start_text(html_tag).strip() or anchor
name = sanitize_bookmark_name(name)
i, bname = 0, name
while name in self.used_bookmark_names:
i += 1
name = bname + ('_%d' % i)
self.anchor_map[key] = name
self.used_bookmark_names.add(name)
return name
@property
def bookmark_id(self):
self.bmark_id += 1
return self.bmark_id
def serialize_hyperlink(self, parent, link):
item, url, tooltip = link
purl = urlparse(url)
href = purl.path
def make_link(parent, anchor=None, id=None, tooltip=None):
kw = {}
if anchor is not None:
kw['w_anchor'] = anchor
elif id is not None:
kw['r_id'] = id
if tooltip:
kw['w_tooltip'] = tooltip
return self.namespace.makeelement(parent, 'w:hyperlink', **kw)
if not purl.scheme:
href = item.abshref(href)
if href in self.document_hrefs:
key = (href, purl.fragment or self.top_anchor)
if key in self.anchor_map:
bmark = self.anchor_map[key]
else:
bmark = self.anchor_map[(href, self.top_anchor)]
return make_link(parent, anchor=bmark, tooltip=tooltip)
else:
self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
if purl.scheme in {'http', 'https', 'ftp'}:
if url not in self.external_links:
self.external_links[url] = self.document_relationships.add_relationship(url, self.namespace.names['LINKS'], target_mode='External')
return make_link(parent, id=self.external_links[url], tooltip=tooltip)
return parent
def process_toc_node(self, toc, level=0):
href = toc.href
if href:
purl = urlparse(href)
href = purl.path
if href in self.document_hrefs:
key = (href, purl.fragment or self.top_anchor)
if key in self.anchor_map:
bmark = self.anchor_map[key]
else:
bmark = self.anchor_map[(href, self.top_anchor)]
self.toc.append(TOCItem(toc.title, bmark, level))
for child in toc:
self.process_toc_node(child, level+1)
def process_toc_links(self, oeb):
self.toc = []
has_toc = oeb.toc and oeb.toc.count() > 1
if not has_toc:
return
for child in oeb.toc:
self.process_toc_node(child)
if self.toc:
self.toc[0].is_first = True
self.toc[-1].is_last = True
def serialize_toc(self, body, primary_heading_style):
pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
pbb.set('{%s}val' % self.namespace.namespaces['w'], 'on')
for block in reversed(self.toc):
block.serialize(body, self.namespace.makeelement)
title = __('Table of Contents')
makeelement = self.namespace.makeelement
p = makeelement(body, 'w:p', append=False)
ppr = makeelement(p, 'w:pPr')
if primary_heading_style is not None:
makeelement(ppr, 'w:pStyle', w_val=primary_heading_style.id)
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
makeelement(makeelement(p, 'w:r'), 'w:t').text = title
body.insert(0, p)

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from operator import attrgetter
from polyglot.builtins import iteritems, itervalues, unicode_type
LIST_STYLES = frozenset(
'disc circle square decimal decimal-leading-zero lower-roman upper-roman'
' lower-greek lower-alpha lower-latin upper-alpha upper-latin hiragana hebrew'
' katakana-iroha cjk-ideographic'.split())
STYLE_MAP = {
'disc': 'bullet',
'circle': 'o',
'square': '\uf0a7',
'decimal': 'decimal',
'decimal-leading-zero': 'decimalZero',
'lower-roman': 'lowerRoman',
'upper-roman': 'upperRoman',
'lower-alpha': 'lowerLetter',
'lower-latin': 'lowerLetter',
'upper-alpha': 'upperLetter',
'upper-latin': 'upperLetter',
'hiragana': 'aiueo',
'hebrew': 'hebrew1',
'katakana-iroha': 'iroha',
'cjk-ideographic': 'chineseCounting',
}
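# STYLE_MAP translates CSS list-style-type keywords into the w:numFmt values
# Word uses for numbered lists. Bullet lists are handled separately in Level
# below: disc, circle and square all use numFmt 'bullet', with the glyph
# '\uf0b7' (Symbol), 'o' (Courier New) or '\uf0a7' (Wingdings) respectively.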
def find_list_containers(list_tag, tag_style):
node = list_tag
stylizer = tag_style._stylizer
ans = []
while True:
parent = node.getparent()
if parent is None or parent is node:
break
node = parent
style = stylizer.style(node)
lst = (style._style.get('list-style-type', None) or '').lower()
if lst in LIST_STYLES:
ans.append(node)
return ans
class NumberingDefinition(object):
def __init__(self, top_most, stylizer, namespace):
self.namespace = namespace
self.top_most = top_most
self.stylizer = stylizer
self.level_map = defaultdict(list)
self.num_id = None
def finalize(self):
items_for_level = defaultdict(list)
container_for_level = {}
type_for_level = {}
for ilvl, items in iteritems(self.level_map):
for container, list_tag, block, list_type, tag_style in items:
items_for_level[ilvl].append(list_tag)
container_for_level[ilvl] = container
type_for_level[ilvl] = list_type
self.levels = tuple(
Level(type_for_level[ilvl], container_for_level[ilvl], items_for_level[ilvl], ilvl=ilvl)
for ilvl in sorted(self.level_map)
)
def __hash__(self):
return hash(self.levels)
def link_blocks(self):
for ilvl, items in iteritems(self.level_map):
for container, list_tag, block, list_type, tag_style in items:
block.numbering_id = (self.num_id + 1, ilvl)
def serialize(self, parent):
makeelement = self.namespace.makeelement
an = makeelement(parent, 'w:abstractNum', w_abstractNumId=unicode_type(self.num_id))
makeelement(an, 'w:multiLevelType', w_val='hybridMultilevel')
makeelement(an, 'w:name', w_val='List %d' % (self.num_id + 1))
for level in self.levels:
level.serialize(an, makeelement)
class Level(object):
def __init__(self, list_type, container, items, ilvl=0):
self.ilvl = ilvl
try:
self.start = int(container.get('start'))
except Exception:
self.start = 1
if items:
try:
self.start = int(items[0].get('value'))
except Exception:
pass
if list_type in {'disc', 'circle', 'square'}:
self.num_fmt = 'bullet'
self.lvl_text = '\uf0b7' if list_type == 'disc' else STYLE_MAP[list_type]
else:
self.lvl_text = '%{}.'.format(self.ilvl + 1)
self.num_fmt = STYLE_MAP.get(list_type, 'decimal')
def __hash__(self):
return hash((self.start, self.num_fmt, self.lvl_text))
def serialize(self, parent, makeelement):
lvl = makeelement(parent, 'w:lvl', w_ilvl=unicode_type(self.ilvl))
makeelement(lvl, 'w:start', w_val=unicode_type(self.start))
makeelement(lvl, 'w:numFmt', w_val=self.num_fmt)
makeelement(lvl, 'w:lvlText', w_val=self.lvl_text)
makeelement(lvl, 'w:lvlJc', w_val='left')
makeelement(makeelement(lvl, 'w:pPr'), 'w:ind', w_hanging='360', w_left=unicode_type(1152 + self.ilvl * 360))
if self.num_fmt == 'bullet':
ff = {'\uf0b7':'Symbol', '\uf0a7':'Wingdings'}.get(self.lvl_text, 'Courier New')
makeelement(makeelement(lvl, 'w:rPr'), 'w:rFonts', w_ascii=ff, w_hAnsi=ff, w_hint="default")
class ListsManager(object):
def __init__(self, docx):
self.namespace = docx.namespace
self.lists = {}
def finalize(self, all_blocks):
lists = {}
for block in all_blocks:
if block.list_tag is not None:
list_tag, tag_style = block.list_tag
list_type = (tag_style['list-style-type'] or '').lower()
if list_type not in LIST_STYLES:
continue
container_tags = find_list_containers(list_tag, tag_style)
if not container_tags:
continue
top_most = container_tags[-1]
if top_most not in lists:
lists[top_most] = NumberingDefinition(top_most, tag_style._stylizer, self.namespace)
l = lists[top_most]
ilvl = len(container_tags) - 1
l.level_map[ilvl].append((container_tags[0], list_tag, block, list_type, tag_style))
[nd.finalize() for nd in itervalues(lists)]
definitions = {}
for defn in itervalues(lists):
try:
defn = definitions[defn]
except KeyError:
definitions[defn] = defn
defn.num_id = len(definitions) - 1
defn.link_blocks()
self.definitions = sorted(itervalues(definitions), key=attrgetter('num_id'))
def serialize(self, parent):
for defn in self.definitions:
defn.serialize(parent)
makeelement = self.namespace.makeelement
for defn in self.definitions:
n = makeelement(parent, 'w:num', w_numId=unicode_type(defn.num_id + 1))
makeelement(n, 'w:abstractNumId', w_val=unicode_type(defn.num_id))

View File

@@ -0,0 +1,768 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import numbers
from collections import Counter, defaultdict
from operator import attrgetter
from lxml import etree
from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import iteritems, filter, unicode_type
from tinycss.css21 import CSS21Parser
css_parser = CSS21Parser()
border_edges = ('left', 'top', 'right', 'bottom')
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
ignore = object()
def parse_css_font_family(raw):
decl, errs = css_parser.parse_style_attr('font-family:' + raw)
if decl:
for token in decl[0].value:
if token.type in 'STRING IDENT':
val = token.value
if val == 'inherit':
break
yield val
def css_font_family_to_docx(raw):
generic = {'serif':'Cambria', 'sansserif':'Candara', 'sans-serif':'Candara', 'fantasy':'Comic Sans', 'cursive':'Segoe Script'}
for ff in parse_css_font_family(raw):
return generic.get(ff.lower(), ff)
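# For example, a CSS declaration of "Liberation Serif", serif maps to
# "Liberation Serif" (the first recognised family wins), while a bare generic
# keyword such as sans-serif falls back to the substitute listed above
# (Candara).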
def bmap(x):
return 'on' if x else 'off'
def is_dropcaps(html_tag, tag_style):
return len(html_tag) < 2 and len(etree.tostring(html_tag, method='text', encoding='unicode', with_tail=False)) < 5 and tag_style['float'] == 'left'
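# A tag is treated as a drop cap when it floats left, has at most one child
# element and renders to fewer than five characters of text; FloatSpec below
# then emits a w:framePr with w:dropCap="drop" spanning three lines instead of
# a normal floating frame.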
class CombinedStyle(object):
def __init__(self, bs, rs, blocks, namespace):
self.bs, self.rs, self.blocks = bs, rs, blocks
self.namespace = namespace
self.id = self.name = self.seq = None
self.outline_level = None
def apply(self):
for block in self.blocks:
block.linked_style = self
for run in block.runs:
run.parent_style = self.rs
def serialize(self, styles, normal_style):
makeelement = self.namespace.makeelement
w = lambda x: '{%s}%s' % (self.namespace.namespaces['w'], x)
block = makeelement(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
makeelement(block, 'w:name', w_val=self.name)
makeelement(block, 'w:qFormat')
if self is not normal_style:
makeelement(block, 'w:basedOn', w_val=normal_style.id)
if self.seq == 0:
block.set(w('default'), '1')
pPr = makeelement(block, 'w:pPr')
self.bs.serialize_properties(pPr, normal_style.bs)
if self.outline_level is not None:
makeelement(pPr, 'w:outlineLvl', w_val=unicode_type(self.outline_level + 1))
rPr = makeelement(block, 'w:rPr')
self.rs.serialize_properties(rPr, normal_style.rs)
class FloatSpec(object):
def __init__(self, namespace, html_tag, tag_style):
self.makeelement = namespace.makeelement
self.is_dropcaps = is_dropcaps(html_tag, tag_style)
self.blocks = []
if self.is_dropcaps:
self.dropcaps_lines = 3
else:
self.x_align = tag_style['float']
self.w = self.h = None
if tag_style._get('width') != 'auto':
self.w = int(20 * max(tag_style['min-width'], tag_style['width']))
if tag_style._get('height') == 'auto':
self.h_rule = 'auto'
else:
if tag_style['min-height'] > 0:
self.h_rule, self.h = 'atLeast', tag_style['min-height']
else:
self.h_rule, self.h = 'exact', tag_style['height']
self.h = int(20 * self.h)
self.h_space = int(20 * max(tag_style['margin-right'], tag_style['margin-left']))
self.v_space = int(20 * max(tag_style['margin-top'], tag_style['margin-bottom']))
read_css_block_borders(self, tag_style)
def serialize(self, block, parent):
if self.is_dropcaps:
attrs = dict(w_dropCap='drop', w_lines=unicode_type(self.dropcaps_lines), w_wrap='around', w_vAnchor='text', w_hAnchor='text')
else:
attrs = dict(
w_wrap='around', w_vAnchor='text', w_hAnchor='text', w_xAlign=self.x_align, w_y='1',
w_hSpace=unicode_type(self.h_space), w_vSpace=unicode_type(self.v_space), w_hRule=self.h_rule
)
if self.w is not None:
attrs['w_w'] = unicode_type(self.w)
if self.h is not None:
attrs['w_h'] = unicode_type(self.h)
self.makeelement(parent, 'w:framePr', **attrs)
# Margins are already applied by the frame style, so override them to
# be zero on individual blocks
self.makeelement(parent, 'w:ind', w_left='0', w_leftChars='0', w_right='0', w_rightChars='0')
attrs = {}
if block is self.blocks[0]:
attrs.update(dict(w_before='0', w_beforeLines='0'))
if block is self.blocks[-1]:
attrs.update(dict(w_after='0', w_afterLines='0'))
if attrs:
self.makeelement(parent, 'w:spacing', **attrs)
# Similarly apply the same border and padding properties to all blocks
# in this floatspec
bdr = self.makeelement(parent, 'w:pBdr')
for edge in border_edges:
padding = getattr(self, 'padding_' + edge)
width = getattr(self, 'border_%s_width' % edge)
bstyle = getattr(self, 'border_%s_style' % edge)
self.makeelement(
bdr, 'w:'+edge, w_space=unicode_type(padding), w_val=bstyle, w_sz=unicode_type(width), w_color=getattr(self, 'border_%s_color' % edge))
class DOCXStyle(object):
ALL_PROPS = ()
TYPE = 'paragraph'
def __init__(self, namespace):
self.namespace = namespace
self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
self.id = self.name = None
self.next_style = None
self.calculate_hash()
def calculate_hash(self):
self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
def makeelement(self, parent, name, **attrs):
return parent.makeelement(self.w(name), **{self.w(k):v for k, v in iteritems(attrs)})
def __hash__(self):
return self._hash
def __eq__(self, other):
for x in self.ALL_PROPS:
if getattr(self, x) != getattr(other, x, None):
return False
return True
def __ne__(self, other):
return not self == other
def __repr__(self):
return etree.tostring(self.serialize(etree.Element(self.__class__.__name__, nsmap={'w':self.namespace.namespaces['w']})), pretty_print=True)
__str__ = __repr__
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
style.append(makeelement(style, 'name', val=self.name))
if self is not normal_style:
style.append(makeelement(style, 'basedOn', val=normal_style.id))
styles.append(style)
return style
LINE_STYLES = {
'none' : 'none',
'hidden': 'none',
'dotted': 'dotted',
'dashed': 'dashed',
'solid' : 'single',
'double': 'double',
'groove': 'threeDEngrave',
'ridge' : 'threeDEmboss',
'inset' : 'inset',
'outset': 'outset',
}
class TextStyle(DOCXStyle):
ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
'background_color', 'underline', 'strike', 'dstrike', 'caps',
'shadow', 'small_caps', 'spacing', 'vertical_align', 'padding',
'border_style', 'border_width', 'border_color')
TYPE = 'character'
def __init__(self, namespace, css, is_parent_style=False):
self.font_family = css_font_family_to_docx(css['font-family'])
try:
self.font_size = max(0, int(float(css['font-size']) * 2)) # stylizer normalizes all font sizes into pts
except (ValueError, TypeError, AttributeError):
self.font_size = None
fw = css['font-weight']
self.bold = (fw.lower() if hasattr(fw, 'lower') else fw) in {'bold', 'bolder'} or int_or_zero(fw) >= 700
self.italic = css['font-style'].lower() in {'italic', 'oblique'}
self.color = convert_color(css['color'])
self.background_color = None if is_parent_style else convert_color(css.backgroundColor)
td = set((css.effective_text_decoration or '').split())
self.underline = 'underline' in td
self.dstrike = 'line-through' in td and 'overline' in td
self.strike = not self.dstrike and 'line-through' in td
self.text_transform = css['text-transform'] # TODO: If lowercase or capitalize, transform the actual text
self.caps = self.text_transform == 'uppercase'
self.small_caps = css['font-variant'].lower() in {'small-caps', 'smallcaps'}
self.shadow = css['text-shadow'] not in {'none', None}
try:
self.spacing = int(float(css['letter-spacing']) * 20)
except (ValueError, TypeError, AttributeError):
self.spacing = None
va = css.first_vertical_align
if isinstance(va, numbers.Number):
self.vertical_align = unicode_type(int(va * 2))
else:
val = {
'top':'superscript', 'text-top':'superscript', 'sup':'superscript', 'super':'superscript',
'bottom':'subscript', 'text-bottom':'subscript', 'sub':'subscript'}.get(va)
self.vertical_align = val or 'baseline'
self.padding = self.border_color = self.border_width = self.border_style = None
if not is_parent_style:
# DOCX does not support individual borders/padding for inline content
for edge in border_edges:
# In DOCX padding can only be a positive integer
try:
padding = max(0, int(css['padding-' + edge]))
except ValueError:
padding = 0
if self.padding is None:
self.padding = padding
elif self.padding != padding:
self.padding = ignore
val = css['border-%s-width' % edge]
if not isinstance(val, numbers.Number):
val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
val = min(96, max(2, int(val * 8)))
if self.border_width is None:
self.border_width = val
elif self.border_width != val:
self.border_width = ignore
color = convert_color(css['border-%s-color' % edge])
if self.border_color is None:
self.border_color = color
elif self.border_color != color:
self.border_color = ignore
style = LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none')
if self.border_style is None:
self.border_style = style
elif self.border_style != style:
self.border_style = ignore
if self.padding in (None, ignore):
self.padding = 0
if self.border_width in (None, ignore):
self.border_width = 0
if self.border_style in (None, ignore):
self.border_style = 'none'
if self.border_color in (None, ignore):
self.border_color = 'auto'
if self.border_style == 'none':
self.border_width, self.border_color = 0, 'auto'
DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
w = self.w
is_normal_style = self is normal_style
if is_normal_style or self.padding != normal_style.padding:
bdr.set(w('space'), unicode_type(self.padding))
if is_normal_style or self.border_width != normal_style.border_width:
bdr.set(w('sz'), unicode_type(self.border_width))
if is_normal_style or self.border_style != normal_style.border_style:
bdr.set(w('val'), self.border_style)
if is_normal_style or self.border_color != normal_style.border_color:
bdr.set(w('color'), self.border_color)
return bdr
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'rPr')
self.serialize_properties(style, normal_style)
if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, rPr, normal_style):
makeelement = self.makeelement
is_normal_style = self is normal_style
if is_normal_style or self.font_family != normal_style.font_family:
rPr.append(makeelement(
rPr, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))
for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
val = getattr(self, attr)
if is_normal_style or getattr(normal_style, attr) != val:
for suffix in ('', 'Cs'):
rPr.append(makeelement(rPr, name + suffix, val=vmap(val)))
def check_attr(attr):
val = getattr(self, attr)
return is_normal_style or (val != getattr(normal_style, attr))
if check_attr('color'):
rPr.append(makeelement(rPr, 'color', val=self.color or 'auto'))
if check_attr('background_color'):
rPr.append(makeelement(rPr, 'shd', fill=self.background_color or 'auto'))
if check_attr('underline'):
rPr.append(makeelement(rPr, 'u', val='single' if self.underline else 'none'))
if check_attr('dstrike'):
rPr.append(makeelement(rPr, 'dstrike', val=bmap(self.dstrike)))
if check_attr('strike'):
rPr.append(makeelement(rPr, 'strike', val=bmap(self.strike)))
if check_attr('caps'):
rPr.append(makeelement(rPr, 'caps', val=bmap(self.caps)))
if check_attr('small_caps'):
rPr.append(makeelement(rPr, 'smallCaps', val=bmap(self.small_caps)))
if check_attr('shadow'):
rPr.append(makeelement(rPr, 'shadow', val=bmap(self.shadow)))
if check_attr('spacing'):
rPr.append(makeelement(rPr, 'spacing', val=unicode_type(self.spacing or 0)))
if is_normal_style:
rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align if self.vertical_align in {'superscript', 'subscript'} else 'baseline'))
elif self.vertical_align != normal_style.vertical_align:
if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align))
else:
rPr.append(makeelement(rPr, 'position', val=self.vertical_align))
bdr = self.serialize_borders(makeelement(rPr, 'bdr'), normal_style)
if bdr.attrib:
rPr.append(bdr)
class DescendantTextStyle(object):
def __init__(self, parent_style, child_style):
self.id = self.name = None
self.makeelement = child_style.makeelement
p = []
def add(name, **props):
p.append((name, frozenset(iteritems(props))))
def vals(attr):
return getattr(parent_style, attr), getattr(child_style, attr)
def check(attr):
pval, cval = vals(attr)
return pval != cval
if parent_style.font_family != child_style.font_family:
add('rFonts', **{k:child_style.font_family for k in 'ascii cs eastAsia hAnsi'.split()})
for name, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')):
pval, cval = vals(attr)
if pval != cval:
val = 'on' if attr in {'bold', 'italic'} else unicode_type(cval) # bold, italic are toggle properties
for suffix in ('', 'Cs'):
add(name + suffix, val=val)
if check('color'):
add('color', val=child_style.color or 'auto')
if check('background_color'):
add('shd', fill=child_style.background_color or 'auto')
if check('underline'):
add('u', val='single' if child_style.underline else 'none')
if check('dstrike'):
add('dstrike', val=bmap(child_style.dstrike))
if check('strike'):
add('strike', val='on') # toggle property
if check('caps'):
add('caps', val='on') # toggle property
if check('small_caps'):
add('smallCaps', val='on') # toggle property
if check('shadow'):
add('shadow', val='on') # toggle property
if check('spacing'):
add('spacing', val=unicode_type(child_style.spacing or 0))
if check('vertical_align'):
val = child_style.vertical_align
if val in {'superscript', 'subscript', 'baseline'}:
add('vertAlign', val=val)
else:
add('position', val=val)
bdr = {}
if check('padding'):
bdr['space'] = unicode_type(child_style.padding)
if check('border_width'):
bdr['sz'] = unicode_type(child_style.border_width)
if check('border_style'):
bdr['val'] = child_style.border_style
if check('border_color'):
bdr['color'] = child_style.border_color
if bdr:
add('bdr', **bdr)
self.properties = tuple(p)
self._hash = hash(self.properties)
def __hash__(self):
return self._hash
def __eq__(self, other):
return self.properties == other.properties
def __ne__(self, other):
return self.properties != other.properties
def serialize(self, styles):
makeelement = self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type='character')
style.append(makeelement(style, 'name', val=self.name))
rpr = makeelement(style, 'rPr')
style.append(rpr)
for name, attrs in self.properties:
rpr.append(makeelement(style, name, **dict(attrs)))
styles.append(style)
return style
def read_css_block_borders(self, css, store_css_style=False):
for edge in border_edges:
if css is None:
setattr(self, 'padding_' + edge, 0)
setattr(self, 'margin_' + edge, 0)
setattr(self, 'css_margin_' + edge, '')
setattr(self, 'border_%s_width' % edge, 2)
setattr(self, 'border_%s_color' % edge, None)
setattr(self, 'border_%s_style' % edge, 'none')
if store_css_style:
setattr(self, 'border_%s_css_style' % edge, 'none')
else:
# In DOCX padding can only be a positive integer
try:
setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge])))
except ValueError:
setattr(self, 'padding_' + edge, 0) # invalid value for padding
# In DOCX margin must be a positive integer in twips (twentieth of a point)
try:
setattr(self, 'margin_' + edge, max(0, int(css['margin-' + edge] * 20)))
except ValueError:
setattr(self, 'margin_' + edge, 0) # for e.g.: margin: auto
setattr(self, 'css_margin_' + edge, css._style.get('margin-' + edge, ''))
val = css['border-%s-width' % edge]
if not isinstance(val, numbers.Number):
val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
val = min(96, max(2, int(val * 8)))
setattr(self, 'border_%s_width' % edge, val)
setattr(self, 'border_%s_color' % edge, convert_color(css['border-%s-color' % edge]) or 'auto')
setattr(self, 'border_%s_style' % edge, LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none'))
if store_css_style:
setattr(self, 'border_%s_css_style' % edge, css['border-%s-style' % edge].lower())
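# Worked example of the unit conversions above (assuming the stylizer has
# already normalized lengths to points):
#   margin-top: 12pt        -> margin_top = 240        (twips, 1pt == 20 twips)
#   border-left-width: 1pt  -> border_left_width = 8   (eighths of a point, clamped to 2..96)
#   border-left-width: thin -> 0.2pt                   -> clamped up to 2
#   padding-bottom: 5pt     -> padding_bottom = 5      (whole points)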
class BlockStyle(DOCXStyle):
ALL_PROPS = tuple(
'text_align css_text_indent text_indent line_height background_color'.split(
) + ['margin_' + edge for edge in border_edges
] + ['css_margin_' + edge for edge in border_edges
] + [x%edge for edge in border_edges for x in border_props]
)
def __init__(self, namespace, css, html_block, is_table_cell=False, parent_bg=None):
read_css_block_borders(self, css)
if is_table_cell:
for edge in border_edges:
setattr(self, 'border_%s_style' % edge, 'none')
setattr(self, 'border_%s_width' % edge, 0)
setattr(self, 'padding_' + edge, 0)
setattr(self, 'margin_' + edge, 0)
if css is None:
self.text_indent = 0
self.css_text_indent = None
self.line_height = 280
self.background_color = None
self.text_align = 'left'
else:
try:
self.text_indent = int(css['text-indent'] * 20)
self.css_text_indent = css._get('text-indent')
except (TypeError, ValueError):
self.text_indent = 0
self.css_text_indent = None
try:
self.line_height = max(0, int(css.lineHeight * 20))
except (TypeError, ValueError):
self.line_height = max(0, int(1.2 * css.fontSize * 20))
self.background_color = None if is_table_cell else convert_color(css['background-color'])
if not is_table_cell and self.background_color is None:
self.background_color = parent_bg
try:
ws = css['white-space'].lower()
preserve_whitespace = ws in {'pre', 'pre-wrap'}
except Exception:
preserve_whitespace = False
try:
aval = css['text-align'].lower()
if preserve_whitespace:
aval = 'start'
self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get(
aval, 'left')
except AttributeError:
self.text_align = 'left'
DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
w = self.w
for edge in border_edges:
e = bdr.makeelement(w(edge))
padding = getattr(self, 'padding_' + edge)
if (self is normal_style and padding > 0) or (padding != getattr(normal_style, 'padding_' + edge)):
e.set(w('space'), unicode_type(padding))
width = getattr(self, 'border_%s_width' % edge)
bstyle = getattr(self, 'border_%s_style' % edge)
if (self is normal_style and width > 0 and bstyle != 'none'
) or width != getattr(normal_style, 'border_%s_width' % edge
) or bstyle != getattr(normal_style, 'border_%s_style' % edge):
e.set(w('val'), bstyle)
e.set(w('sz'), unicode_type(width))
e.set(w('color'), getattr(self, 'border_%s_color' % edge))
if e.attrib:
bdr.append(e)
return bdr
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'pPr')
self.serialize_properties(style, normal_style)
if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, pPr, normal_style):
makeelement, w = self.makeelement, self.w
spacing = makeelement(pPr, 'spacing')
for edge, attr in iteritems({'top':'before', 'bottom':'after'}):
getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self))
if css_unit in ('em', 'ex'):
lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
if (self is normal_style and lines > 0) or getter(self) != getter(normal_style):
spacing.set(w(attr + 'Lines'), unicode_type(lines))
else:
getter = attrgetter('margin_' + edge)
val = getter(self)
if (self is normal_style and val > 0) or val != getter(normal_style):
spacing.set(w(attr), unicode_type(val))
if self is normal_style or self.line_height != normal_style.line_height:
spacing.set(w('line'), unicode_type(self.line_height))
spacing.set(w('lineRule'), 'atLeast')
if spacing.attrib:
pPr.append(spacing)
ind = makeelement(pPr, 'ind')
for edge in ('left', 'right'):
getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self))
if css_unit in ('em', 'ex'):
chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
if (self is normal_style and chars > 0) or getter(self) != getter(normal_style):
ind.set(w(edge + 'Chars'), unicode_type(chars))
else:
getter = attrgetter('margin_' + edge)
val = getter(self)
if (self is normal_style and val > 0) or val != getter(normal_style):
ind.set(w(edge), unicode_type(val))
ind.set(w(edge + 'Chars'), '0') # This is needed to override any declaration in the parent style
css_val, css_unit = parse_css_length(self.css_text_indent)
if css_unit in ('em', 'ex'):
chars = int(css_val * (50 if css_unit == 'ex' else 100))
if css_val >= 0:
if (self is normal_style and chars > 0) or self.css_text_indent != normal_style.css_text_indent:
ind.set(w('firstLineChars'), unicode_type(chars))
else:
if (self is normal_style and chars < 0) or self.css_text_indent != normal_style.css_text_indent:
ind.set(w('hangingChars'), unicode_type(abs(chars)))
else:
val = self.text_indent
if val >= 0:
if (self is normal_style and val > 0) or self.text_indent != normal_style.text_indent:
ind.set(w('firstLine'), unicode_type(val))
ind.set(w('firstLineChars'), '0') # This is needed to override any declaration in the parent style
else:
if (self is normal_style and val < 0) or self.text_indent != normal_style.text_indent:
ind.set(w('hanging'), unicode_type(abs(val)))
ind.set(w('hangingChars'), '0')
if ind.attrib:
pPr.append(ind)
if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color:
pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))
pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
if len(pbdr):
pPr.append(pbdr)
if self is normal_style or self.text_align != normal_style.text_align:
pPr.append(makeelement(pPr, 'jc', val=self.text_align))
if self is not normal_style and self.next_style is not None:
pPr.append(makeelement(pPr, 'next', val=self.next_style))
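# Conversion sketch for the spacing/indent logic above (assuming the stylizer
# reports lengths in points, with the raw em/ex values kept in the css_* fields):
#   margin-top: 1.5em -> <w:spacing w:beforeLines="150"/>  (hundredths of a line)
#   margin-left: 2em  -> <w:ind w:leftChars="200"/>        (hundredths of a character)
#   text-indent: 18pt -> <w:ind w:firstLine="360"/>        (twips)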
class StylesManager(object):
def __init__(self, namespace, log, document_lang):
self.namespace = namespace
self.document_lang = lang_as_iso639_1(document_lang) or 'en'
self.log = log
self.block_styles, self.text_styles = {}, {}
self.styles_for_html_blocks = {}
def create_text_style(self, css_style, is_parent_style=False):
ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
existing = self.text_styles.get(ans, None)
if existing is None:
self.text_styles[ans] = ans
else:
ans = existing
return ans
def create_block_style(self, css_style, html_block, is_table_cell=False, parent_bg=None):
ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
existing = self.block_styles.get(ans, None)
if existing is None:
self.block_styles[ans] = ans
else:
ans = existing
self.styles_for_html_blocks[html_block] = ans
return ans
def finalize(self, all_blocks):
block_counts, run_counts = Counter(), Counter()
block_rmap, run_rmap = defaultdict(list), defaultdict(list)
used_pairs = defaultdict(list)
heading_styles = defaultdict(list)
headings = frozenset('h1 h2 h3 h4 h5 h6'.split())
pure_block_styles = set()
for block in all_blocks:
bs = block.style
block_counts[bs] += 1
block_rmap[block.style].append(block)
local_run_counts = Counter()
for run in block.runs:
count = run.style_weight
run_counts[run.style] += count
local_run_counts[run.style] += count
run_rmap[run.style].append(run)
if local_run_counts:
rs = local_run_counts.most_common(1)[0][0]
used_pairs[(bs, rs)].append(block)
if block.html_tag in headings:
heading_styles[block.html_tag].append((bs, rs))
else:
pure_block_styles.add(bs)
self.pure_block_styles = sorted(pure_block_styles, key=block_counts.__getitem__)
bnum = len(unicode_type(max(1, len(pure_block_styles) - 1)))
for i, bs in enumerate(self.pure_block_styles):
bs.id = bs.name = '%0{}d Block'.format(bnum) % i
bs.seq = i
if i == 0:
self.normal_pure_block_style = bs
counts = Counter()
smap = {}
for (bs, rs), blocks in iteritems(used_pairs):
s = CombinedStyle(bs, rs, blocks, self.namespace)
smap[(bs, rs)] = s
counts[s] += sum(1 for b in blocks if not b.is_empty())
for i, heading_tag in enumerate(sorted(heading_styles)):
styles = sorted((smap[k] for k in heading_styles[heading_tag]), key=counts.__getitem__)
styles = list(filter(lambda s:s.outline_level is None, styles))
if styles:
heading_style = styles[-1]
heading_style.outline_level = i
snum = len(unicode_type(max(1, len(counts) - 1)))
heading_styles = []
for i, (style, count) in enumerate(counts.most_common()):
if i == 0:
self.normal_style = style
style.id = style.name = 'Normal'
else:
if style.outline_level is None:
val = 'Para %0{}d'.format(snum) % i
else:
val = 'Heading %d' % (style.outline_level + 1)
heading_styles.append(style)
style.id = style.name = val
style.seq = i
self.combined_styles = sorted(counts, key=attrgetter('seq'))
[ls.apply() for ls in self.combined_styles]
descendant_style_map = {}
ds_counts = Counter()
for block in all_blocks:
for run in block.runs:
if run.parent_style is not run.style and run.parent_style and run.style:
ds = DescendantTextStyle(run.parent_style, run.style)
if ds.properties:
run.descendant_style = descendant_style_map.get(ds)
if run.descendant_style is None:
run.descendant_style = descendant_style_map[ds] = ds
ds_counts[run.descendant_style] += run.style_weight
rnum = len(unicode_type(max(1, len(ds_counts) - 1)))
for i, (text_style, count) in enumerate(ds_counts.most_common()):
text_style.id = 'Text%d' % i
text_style.name = '%0{}d Text'.format(rnum) % i
text_style.seq = i
self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq'))
self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, (
self.descendant_text_styles, self.combined_styles))))
self.primary_heading_style = None
if heading_styles:
heading_styles.sort(key=attrgetter('outline_level'))
self.primary_heading_style = heading_styles[0]
else:
ms = 0
for s in self.combined_styles:
if s.rs.font_size > ms:
self.primary_heading_style = s
ms = s.rs.font_size
def serialize(self, styles):
lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
for k in tuple(lang.attrib):
lang.attrib[k] = self.document_lang
for style in self.combined_styles:
style.serialize(styles, self.normal_style)
for style in self.descendant_text_styles:
style.serialize(styles)
for style in sorted(self.pure_block_styles, key=attrgetter('seq')):
style.serialize(styles, self.normal_pure_block_style)

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple
from calibre.ebooks.docx.writer.utils import convert_color
from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges
from polyglot.builtins import iteritems, range, unicode_type
class Dummy(object):
pass
Border = namedtuple('Border', 'css_style style width color level')
border_style_weight = {
x:100-i for i, x in enumerate(('double', 'solid', 'dashed', 'dotted', 'ridge', 'outset', 'groove', 'inset'))}
class SpannedCell(object):
def __init__(self, spanning_cell, horizontal=True):
self.spanning_cell = spanning_cell
self.horizontal = horizontal
self.row_span = self.col_span = 1
def resolve_borders(self):
pass
def serialize(self, tr, makeelement):
tc = makeelement(tr, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:%sMerge' % ('h' if self.horizontal else 'v'), w_val='continue')
makeelement(tc, 'w:p')
def applicable_borders(self, edge):
return self.spanning_cell.applicable_borders(edge)
def read_css_block_borders(self, css):
obj = Dummy()
rcbb(obj, css, store_css_style=True)
for edge in border_edges:
setattr(self, 'border_' + edge, Border(
getattr(obj, 'border_%s_css_style' % edge),
getattr(obj, 'border_%s_style' % edge),
getattr(obj, 'border_%s_width' % edge),
getattr(obj, 'border_%s_color' % edge),
self.BLEVEL
))
setattr(self, 'padding_' + edge, getattr(obj, 'padding_' + edge))
def as_percent(x):
if x and x.endswith('%'):
try:
return float(x.rstrip('%'))
except Exception:
pass
def convert_width(tag_style):
if tag_style is not None:
w = tag_style._get('width')
wp = as_percent(w)
if w == 'auto':
return ('auto', 0)
elif wp is not None:
return ('pct', int(wp * 50))
else:
try:
return ('dxa', int(float(tag_style['width']) * 20))
except Exception:
pass
return ('auto', 0)
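# convert_width() maps CSS widths onto the DOCX (w:type, w:w) pair, e.g.
# (assuming the stylizer reports absolute widths in points):
#   width: auto  -> ('auto', 0)
#   width: 50%   -> ('pct', 2500)   (fiftieths of a percent)
#   width: 100pt -> ('dxa', 2000)   (twips)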
class Cell(object):
BLEVEL = 2
def __init__(self, row, html_tag, tag_style=None):
self.row = row
self.table = self.row.table
self.html_tag = html_tag
try:
self.row_span = max(0, int(html_tag.get('rowspan', 1)))
except Exception:
self.row_span = 1
try:
self.col_span = max(0, int(html_tag.get('colspan', 1)))
except Exception:
self.col_span = 1
if tag_style is None:
self.valign = 'center'
else:
self.valign = {'top':'top', 'bottom':'bottom', 'middle':'center'}.get(tag_style._get('vertical-align'))
self.items = []
self.width = convert_width(tag_style)
self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
read_css_block_borders(self, tag_style)
def add_block(self, block):
self.items.append(block)
block.parent_items = self.items
def add_table(self, table):
self.items.append(table)
return table
def serialize(self, parent, makeelement):
tc = makeelement(parent, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
# For some reason, Word 2007 refuses to honor <w:shd> at the table or row
# level, despite what the specs say, so we inherit and apply at the
# cell level
bc = self.background_color or self.row.background_color or self.row.table.background_color
if bc:
makeelement(tcPr, 'w:shd', w_val="clear", w_color="auto", w_fill=bc)
b = makeelement(tcPr, 'w:tcBorders', append=False)
for edge, border in iteritems(self.borders):
if border is not None and border.width > 0 and border.style != 'none':
makeelement(b, 'w:' + edge, w_val=border.style, w_sz=unicode_type(border.width), w_color=border.color)
if len(b) > 0:
tcPr.append(b)
m = makeelement(tcPr, 'w:tcMar', append=False)
for edge in border_edges:
padding = getattr(self, 'padding_' + edge)
if edge in {'top', 'bottom'} or (edge == 'left' and self is self.row.first_cell) or (edge == 'right' and self is self.row.last_cell):
padding += getattr(self.row, 'padding_' + edge)
if padding > 0:
makeelement(m, 'w:' + edge, w_type='dxa', w_w=unicode_type(int(padding * 20)))
if len(m) > 0:
tcPr.append(m)
if self.valign is not None:
makeelement(tcPr, 'w:vAlign', w_val=self.valign)
if self.row_span > 1:
makeelement(tcPr, 'w:vMerge', w_val='restart')
if self.col_span > 1:
makeelement(tcPr, 'w:hMerge', w_val='restart')
item = None
for item in self.items:
item.serialize(tc)
if item is None or isinstance(item, Table):
# Word 2007 requires the last element in a table cell to be a paragraph
makeelement(tc, 'w:p')
def applicable_borders(self, edge):
if edge == 'left':
items = {self.table, self.row, self} if self.row.first_cell is self else {self}
elif edge == 'top':
items = ({self.table} if self.table.first_row is self.row else set()) | {self, self.row}
elif edge == 'right':
items = {self.table, self, self.row} if self.row.last_cell is self else {self}
elif edge == 'bottom':
items = ({self.table} if self.table.last_row is self.row else set()) | {self, self.row}
return {getattr(x, 'border_' + edge) for x in items}
def resolve_border(self, edge):
# In Word cell borders override table borders, and Word ignores row
# borders, so we consolidate all borders as cell borders
# In HTML the priority is as described here:
# http://www.w3.org/TR/CSS21/tables.html#border-conflict-resolution
neighbor = self.neighbor(edge)
borders = self.applicable_borders(edge)
if neighbor is not None:
nedge = {'left':'right', 'top':'bottom', 'right':'left', 'bottom':'top'}[edge]
borders |= neighbor.applicable_borders(nedge)
for b in borders:
if b.css_style == 'hidden':
return None
def weight(border):
return (
0 if border.css_style == 'none' else 1,
border.width,
border_style_weight.get(border.css_style, 0),
border.level)
border = sorted(borders, key=weight)[-1]
return border
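# Worked example of the weight() ordering above: with two competing,
# non-hidden borders such as
#   Border(css_style='solid',  style='single', width=24, color='000000', level=2)
#   Border(css_style='double', style='double', width=8,  color='000000', level=0)
# the solid border wins because width is compared before style weight and
# nesting level; a 'hidden' border on either side suppresses the edge entirely.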
def resolve_borders(self):
self.borders = {edge:self.resolve_border(edge) for edge in border_edges}
def neighbor(self, edge):
idx = self.row.cells.index(self)
ans = None
if edge == 'left':
ans = self.row.cells[idx-1] if idx > 0 else None
elif edge == 'right':
ans = self.row.cells[idx+1] if (idx + 1) < len(self.row.cells) else None
elif edge == 'top':
ridx = self.table.rows.index(self.row)
if ridx > 0 and idx < len(self.table.rows[ridx-1].cells):
ans = self.table.rows[ridx-1].cells[idx]
elif edge == 'bottom':
ridx = self.table.rows.index(self.row)
if ridx + 1 < len(self.table.rows) and idx < len(self.table.rows[ridx+1].cells):
ans = self.table.rows[ridx+1].cells[idx]
return getattr(ans, 'spanning_cell', ans)
class Row(object):
BLEVEL = 1
def __init__(self, table, html_tag, tag_style=None):
self.table = table
self.html_tag = html_tag
self.orig_tag_style = tag_style
self.cells = []
self.current_cell = None
self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
read_css_block_borders(self, tag_style)
@property
def first_cell(self):
return self.cells[0] if self.cells else None
@property
def last_cell(self):
return self.cells[-1] if self.cells else None
def start_new_cell(self, html_tag, tag_style):
self.current_cell = Cell(self, html_tag, tag_style)
def finish_tag(self, html_tag):
if self.current_cell is not None:
if html_tag is self.current_cell.html_tag:
self.cells.append(self.current_cell)
self.current_cell = None
def add_block(self, block):
if self.current_cell is None:
self.start_new_cell(self.html_tag, self.orig_tag_style)
self.current_cell.add_block(block)
def add_table(self, table):
if self.current_cell is None:
self.current_cell = Cell(self, self.html_tag, self.orig_tag_style)
return self.current_cell.add_table(table)
def serialize(self, parent, makeelement):
tr = makeelement(parent, 'w:tr')
for cell in self.cells:
cell.serialize(tr, makeelement)
class Table(object):
BLEVEL = 0
def __init__(self, namespace, html_tag, tag_style=None):
self.namespace = namespace
self.html_tag = html_tag
self.orig_tag_style = tag_style
self.rows = []
self.current_row = None
self.width = convert_width(tag_style)
self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
self.jc = None
self.float = None
self.margin_left = self.margin_right = self.margin_top = self.margin_bottom = None
if tag_style is not None:
ml, mr = tag_style._get('margin-left'), tag_style._get('margin-right')
if ml == 'auto':
self.jc = 'center' if mr == 'auto' else 'right'
self.float = tag_style['float']
for edge in border_edges:
setattr(self, 'margin_' + edge, tag_style['margin-' + edge])
read_css_block_borders(self, tag_style)
@property
def first_row(self):
return self.rows[0] if self.rows else None
@property
def last_row(self):
return self.rows[-1] if self.rows else None
def finish_tag(self, html_tag):
if self.current_row is not None:
self.current_row.finish_tag(html_tag)
if self.current_row.html_tag is html_tag:
self.rows.append(self.current_row)
self.current_row = None
table_ended = self.html_tag is html_tag
if table_ended:
self.expand_spanned_cells()
for row in self.rows:
for cell in row.cells:
cell.resolve_borders()
return table_ended
def expand_spanned_cells(self):
# Expand horizontally
for row in self.rows:
for cell in tuple(row.cells):
idx = row.cells.index(cell)
if cell.col_span > 1 and (cell is row.cells[-1] or not isinstance(row.cells[idx+1], SpannedCell)):
row.cells[idx:idx+1] = [cell] + [SpannedCell(cell, horizontal=True) for i in range(1, cell.col_span)]
# Expand vertically
for r, row in enumerate(self.rows):
for idx, cell in enumerate(row.cells):
if cell.row_span > 1:
for nrow in self.rows[r+1:]:
sc = SpannedCell(cell, horizontal=False)
try:
tcell = nrow.cells[idx]
except Exception:
tcell = None
if tcell is None:
nrow.cells.extend([SpannedCell(nrow.cells[-1], horizontal=True) for i in range(idx - len(nrow.cells))])
nrow.cells.append(sc)
else:
if isinstance(tcell, SpannedCell):
# Conflict between rowspan and colspan
break
else:
nrow.cells.insert(idx, sc)
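# Expansion sketch: for <tr><td colspan="2">A</td><td>B</td></tr> the row's
# cell list becomes [A, SpannedCell(A, horizontal=True), B], so every row ends
# up with one entry per grid column; a rowspan likewise inserts a vertical
# SpannedCell at the same column index into each following row it covers.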
def start_new_row(self, html_tag, html_style):
if self.current_row is not None:
self.rows.append(self.current_row)
self.current_row = Row(self, html_tag, html_style)
def start_new_cell(self, html_tag, html_style):
if self.current_row is None:
self.start_new_row(html_tag, None)
self.current_row.start_new_cell(html_tag, html_style)
def add_block(self, block):
self.current_row.add_block(block)
def add_table(self, table):
if self.current_row is None:
self.current_row = Row(self, self.html_tag, self.orig_tag_style)
return self.current_row.add_table(table)
def serialize(self, parent):
makeelement = self.namespace.makeelement
rows = [r for r in self.rows if r.cells]
if not rows:
return
tbl = makeelement(parent, 'w:tbl')
tblPr = makeelement(tbl, 'w:tblPr')
makeelement(tblPr, 'w:tblW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
if self.float in {'left', 'right'}:
kw = {'w_vertAnchor':'text', 'w_horzAnchor':'text', 'w_tblpXSpec':self.float}
for edge in border_edges:
val = getattr(self, 'margin_' + edge) or 0
if {self.float, edge} == {'left', 'right'}:
val = max(val, 2)
kw['w_' + edge + 'FromText'] = unicode_type(max(0, int(val *20)))
makeelement(tblPr, 'w:tblpPr', **kw)
if self.jc is not None:
makeelement(tblPr, 'w:jc', w_val=self.jc)
for row in rows:
row.serialize(tbl, makeelement)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.color3 import parse_color_string
def int_or_zero(raw):
try:
return int(raw)
except (ValueError, TypeError, AttributeError):
return 0
# convert_color() {{{
def convert_color(value):
if not value:
return
if value.lower() == 'currentcolor':
return 'auto'
val = parse_color_string(value)
if val is None:
return
if val.alpha < 0.01:
return
return '%02X%02X%02X' % (int(val.red * 255), int(val.green * 255), int(val.blue * 255))
def test_convert_color(return_tests=False):
import unittest
class TestColors(unittest.TestCase):
def test_color_conversion(self):
ae = self.assertEqual
cc = convert_color
ae(None, cc(None))
ae(None, cc('transparent'))
ae(None, cc('none'))
ae(None, cc('#12j456'))
ae('auto', cc('currentColor'))
ae('F0F8FF', cc('AliceBlue'))
ae('000000', cc('black'))
ae('FF0000', cc('red'))
ae('00FF00', cc('lime'))
ae('000011', cc('#001'))
ae('12345D', cc('#12345d'))
ae('FFFFFF', cc('rgb(255, 255, 255)'))
ae('FF0000', cc('rgba(255, 0, 0, 23)'))
tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestColors)
if return_tests:
return tests
unittest.TextTestRunner(verbosity=4).run(tests)
# }}}

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import defaultdict
from calibre.ebooks.oeb.base import urlnormalize, css_text
from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
from polyglot.builtins import iteritems, itervalues, unicode_type, range
from tinycss.fonts3 import parse_font_family
def get_font_properties(rule, default=None):
'''
Given a CSS rule, extract normalized font properties from
it. Note that the shorthand font property should already have been expanded
by the CSS flattening code.
'''
props = {}
s = rule.style
for q in ('font-family', 'src', 'font-weight', 'font-stretch',
'font-style'):
g = 'uri' if q == 'src' else 'value'
try:
val = s.getProperty(q).propertyValue[0]
val = getattr(val, g)
if q == 'font-family':
val = parse_font_family(css_text(s.getProperty(q).propertyValue))
if val and val[0] == 'inherit':
val = None
except (IndexError, KeyError, AttributeError, TypeError, ValueError):
val = None if q in {'src', 'font-family'} else default
if q in {'font-weight', 'font-stretch', 'font-style'}:
val = unicode_type(val).lower() if (val or val == 0) else val
if val == 'inherit':
val = default
if q == 'font-weight':
val = {'normal':'400', 'bold':'700'}.get(val, val)
if val not in {'100', '200', '300', '400', '500', '600', '700',
'800', '900', 'bolder', 'lighter'}:
val = default
if val == 'normal':
val = '400'
elif q == 'font-style':
if val not in {'normal', 'italic', 'oblique'}:
val = default
elif q == 'font-stretch':
if val not in {'normal', 'ultra-condensed', 'extra-condensed',
'condensed', 'semi-condensed', 'semi-expanded',
'expanded', 'extra-expanded', 'ultra-expanded'}:
val = default
props[q] = val
return props
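# Sketch of the normalized dict returned for a typical @font-face rule
# (family name and src are illustrative):
#   {'font-family': ['Foo Serif'], 'src': 'fonts/foo.ttf',
#    'font-weight': '700', 'font-style': 'normal', 'font-stretch': 'normal'}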
def find_font_face_rules(sheet, oeb):
'''
Find all @font-face rules in the given sheet and extract the relevant info from them.
sheet can be either a ManifestItem or a CSSStyleSheet.
'''
ans = []
try:
rules = sheet.data.cssRules
except AttributeError:
rules = sheet.cssRules
for i, rule in enumerate(rules):
if rule.type != rule.FONT_FACE_RULE:
continue
props = get_font_properties(rule, default='normal')
if not props['font-family'] or not props['src']:
continue
try:
path = sheet.abshref(props['src'])
except AttributeError:
path = props['src']
ff = oeb.manifest.hrefs.get(urlnormalize(path), None)
if not ff:
continue
props['item'] = ff
if props['font-weight'] in {'bolder', 'lighter'}:
props['font-weight'] = '400'
props['weight'] = int(props['font-weight'])
props['rule'] = rule
props['chars'] = set()
ans.append(props)
return ans
def elem_style(style_rules, cls, inherited_style):
'''
Find the effective style for the given element.
'''
classes = cls.split()
style = inherited_style.copy()
for cls in classes:
style.update(style_rules.get(cls, {}))
wt = style.get('font-weight', None)
pwt = inherited_style.get('font-weight', '400')
if wt == 'bolder':
style['font-weight'] = {
'100':'400',
'200':'400',
'300':'400',
'400':'700',
'500':'700',
}.get(pwt, '900')
elif wt == 'lighter':
style['font-weight'] = {
'600':'400', '700':'400',
'800':'700', '900':'700'}.get(pwt, '100')
return style
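# Relative font-weight resolution used above, e.g. with an inherited weight
# of '400': 'bolder' -> '700' and 'lighter' -> '100'; with an inherited
# weight of '800': 'bolder' -> '900' and 'lighter' -> '700'.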
class SubsetFonts(object):
'''
Subset all embedded fonts. Must be run after CSS flattening, as it requires
CSS normalization and flattening to work.
'''
def __call__(self, oeb, log, opts):
self.oeb, self.log, self.opts = oeb, log, opts
self.find_embedded_fonts()
if not self.embedded_fonts:
self.log.debug('No embedded fonts found')
return
self.find_style_rules()
self.find_font_usage()
totals = [0, 0]
def remove(font):
totals[1] += len(font['item'].data)
self.oeb.manifest.remove(font['item'])
font['rule'].parentStyleSheet.deleteRule(font['rule'])
fonts = {}
for font in self.embedded_fonts:
item, chars = font['item'], font['chars']
if item.href in fonts:
fonts[item.href]['chars'] |= chars
else:
fonts[item.href] = font
for font in itervalues(fonts):
if not font['chars']:
self.log('The font %s is unused. Removing it.'%font['src'])
remove(font)
continue
try:
raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
except NoGlyphs:
self.log('The font %s has no used glyphs. Removing it.'%font['src'])
remove(font)
continue
except UnsupportedFont as e:
self.log.warn('The font %s is unsupported for subsetting. %s'%(
font['src'], e))
sz = len(font['item'].data)
totals[0] += sz
totals[1] += sz
else:
font['item'].data = raw
nlen = sum(itervalues(new_stats))
olen = sum(itervalues(old_stats))
self.log('Decreased the font %s to %.1f%% of its original size'%
(font['src'], nlen/olen *100))
totals[0] += nlen
totals[1] += olen
font['item'].unload_data_from_memory()
if totals[0]:
self.log('Reduced total font size to %.1f%% of original'%
(totals[0]/totals[1] * 100))
def find_embedded_fonts(self):
'''
Find all @font-face rules and extract the relevant info from them.
'''
self.embedded_fonts = []
for item in self.oeb.manifest:
if not hasattr(item.data, 'cssRules'):
continue
self.embedded_fonts.extend(find_font_face_rules(item, self.oeb))
def find_style_rules(self):
'''
Extract all font related style information from all stylesheets into a
dict mapping classes to font properties specified by that class. All
the heavy lifting has already been done by the CSS flattening code.
'''
rules = defaultdict(dict)
for item in self.oeb.manifest:
if not hasattr(item.data, 'cssRules'):
continue
for i, rule in enumerate(item.data.cssRules):
if rule.type != rule.STYLE_RULE:
continue
props = {k:v for k,v in
iteritems(get_font_properties(rule)) if v}
if not props:
continue
for sel in rule.selectorList:
sel = sel.selectorText
if sel and sel.startswith('.'):
# We don't care about pseudo-selectors as the worst that
# can happen is some extra characters will remain in
# the font
sel = sel.partition(':')[0]
rules[sel[1:]].update(props)
self.style_rules = dict(rules)
def find_font_usage(self):
for item in self.oeb.manifest:
if not hasattr(item.data, 'xpath'):
continue
for body in item.data.xpath('//*[local-name()="body"]'):
base = {'font-family':['serif'], 'font-weight': '400',
'font-style':'normal', 'font-stretch':'normal'}
self.find_usage_in(body, base)
def used_font(self, style):
'''
Given a style find the embedded font that matches it. Returns None if
no match is found (can happen if no family matches).
'''
ff = style.get('font-family', [])
lnames = {unicode_type(x).lower() for x in ff}
matching_set = []
# Filter on font-family
for ef in self.embedded_fonts:
flnames = {x.lower() for x in ef.get('font-family', [])}
if not lnames.intersection(flnames):
continue
matching_set.append(ef)
if not matching_set:
return None
# Filter on font-stretch
widths = {x:i for i, x in enumerate(('ultra-condensed',
'extra-condensed', 'condensed', 'semi-condensed', 'normal',
'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
))}
width = widths[style.get('font-stretch', 'normal')]
for f in matching_set:
f['width'] = widths[f.get('font-stretch', 'normal')]
min_dist = min(abs(width-f['width']) for f in matching_set)
nearest = [f for f in matching_set if abs(width-f['width']) ==
min_dist]
if width <= 4:
lmatches = [f for f in nearest if f['width'] <= width]
else:
lmatches = [f for f in nearest if f['width'] >= width]
matching_set = (lmatches or nearest)
# Filter on font-style
fs = style.get('font-style', 'normal')
order = {
'oblique':['oblique', 'italic', 'normal'],
'normal':['normal', 'oblique', 'italic']
}.get(fs, ['italic', 'oblique', 'normal'])
for q in order:
matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
if matches:
matching_set = matches
break
# Filter on font weight
fw = int(style.get('font-weight', '400'))
if fw == 400:
q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
elif fw == 500:
q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
elif fw < 400:
q = [fw] + list(range(fw-100, -100, -100)) + list(range(fw+100, 1000, 100))
else:
q = [fw] + list(range(fw+100, 1000, 100)) + list(range(fw-100, -100, -100))
for wt in q:
matches = [f for f in matching_set if f['weight'] == wt]
if matches:
return matches[0]
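# Weight fallback sketch: for a requested font-weight of 300 the preference
# order tried above is
#   [300, 200, 100, 0, 400, 500, 600, 700, 800, 900]
# i.e. lighter candidates first, then progressively heavier ones, mirroring
# the CSS font matching heuristic.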
def find_chars(self, elem):
ans = set()
if elem.text:
ans |= set(elem.text)
for child in elem:
if child.tail:
ans |= set(child.tail)
return ans
def find_usage_in(self, elem, inherited_style):
style = elem_style(self.style_rules, elem.get('class', '') or '', inherited_style)
for child in elem:
self.find_usage_in(child, style)
font = self.used_font(style)
if font:
chars = self.find_chars(elem)
if chars:
font['chars'] |= chars

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs, zlib, numbers
from io import BytesIO
from datetime import datetime
from calibre.constants import plugins, ispy3
from calibre.utils.logging import default_log
from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr
from polyglot.binary import as_hex_bytes
pdf_float = plugins['speedup'][0].pdf_float
EOL = b'\n'
# Sizes {{{
inch = 72.0
cm = inch / 2.54
mm = cm * 0.1
pica = 12.0
didot = 0.375 * mm
cicero = 12 * didot
_W, _H = (21*cm, 29.7*cm)
A6 = (_W*.5, _H*.5)
A5 = (_H*.5, _W)
A4 = (_W, _H)
A3 = (_H, _W*2)
A2 = (_W*2, _H*2)
A1 = (_H*2, _W*4)
A0 = (_W*4, _H*4)
LETTER = (8.5*inch, 11*inch)
LEGAL = (8.5*inch, 14*inch)
ELEVENSEVENTEEN = (11*inch, 17*inch)
_BW, _BH = (25*cm, 35.3*cm)
B6 = (_BW*.5, _BH*.5)
B5 = (_BH*.5, _BW)
B4 = (_BW, _BH)
B3 = (_BH, _BW*2)
B2 = (_BW*2, _BH*2)
B1 = (_BH*2, _BW*4)
B0 = (_BW*4, _BH*4)
PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2'
' b3 b4 b5 b6 letter legal').split()}
# }}}
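# All of the sizes above are in PostScript points, e.g.
#   PAPER_SIZES['letter'] == (612.0, 792.0)
#   PAPER_SIZES['a4']     ~= (595.28, 841.89)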
def fmtnum(o):
if isinstance(o, float):
return pdf_float(o)
return unicode_type(o)
def serialize(o, stream):
if isinstance(o, float):
stream.write_raw(pdf_float(o).encode('ascii'))
elif isinstance(o, bool):
# Must check bool before int as bools are subclasses of int
stream.write_raw(b'true' if o else b'false')
elif isinstance(o, numbers.Integral):
stream.write_raw(unicode_type(o).encode('ascii') if ispy3 else bytes(o))
elif hasattr(o, 'pdf_serialize'):
o.pdf_serialize(stream)
elif o is None:
stream.write_raw(b'null')
elif isinstance(o, datetime):
val = o.strftime("D:%Y%m%d%H%M%%02d%z")%min(59, o.second)
if datetime.tzinfo is not None:
val = "(%s'%s')"%(val[:-2], val[-2:])
stream.write(val.encode('ascii'))
else:
raise ValueError('Unknown object: %r'%o)
class Name(unicode_type):
def pdf_serialize(self, stream):
raw = self.encode('ascii')
if len(raw) > 126:
raise ValueError('Name too long: %r'%self)
raw = bytearray(raw)
sharp = ord(b'#')
buf = (
codepoint_to_chr(x).encode('ascii') if 33 < x < 126 and x != sharp else
'#{:x}'.format(x).encode('ascii') for x in raw)
stream.write(b'/'+b''.join(buf))
def escape_pdf_string(bytestring):
indices = []
bad = []
ba = bytearray(bytestring)
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('t'), 92:ord('\\')}
for i, num in enumerate(ba):
if num == 40: # (
indices.append((i, 40))
elif num == 41: # )
if indices:
indices.pop()
else:
bad.append((i, 41))
elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
bad.append((i, bad_map[num]))
bad = sorted(indices + bad, reverse=True)
if not bad:
return bytestring
for i, repl in bad:
ba[i:i+1] = (92, repl) # 92 = ord('\')
return bytes(ba)
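# Worked example: balanced parentheses are left alone, while unbalanced ones
# and the control characters from Table 3.2 are backslash-escaped:
#   escape_pdf_string(b'a(b)c')  -> b'a(b)c'
#   escape_pdf_string(b'a(b\nc') -> b'a\\(b\\nc'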
class String(unicode_type):
def pdf_serialize(self, stream):
try:
raw = self.encode('latin1')
if raw.startswith(codecs.BOM_UTF16_BE):
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
except UnicodeEncodeError:
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
stream.write(b'('+escape_pdf_string(raw)+b')')
class UTF16String(unicode_type):
def pdf_serialize(self, stream):
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
if False:
# Disabled as the parentheses based strings give easier to debug
# PDF files
stream.write(b'<' + as_hex_bytes(raw) + b'>')
else:
stream.write(b'('+escape_pdf_string(raw)+b')')
class Dictionary(dict):
def pdf_serialize(self, stream):
stream.write(b'<<' + EOL)
sorted_keys = sorted(self,
key=lambda x:({'Type':'1', 'Subtype':'2'}.get(
x, x)+x))
for k in sorted_keys:
serialize(Name(k), stream)
stream.write(b' ')
serialize(self[k], stream)
stream.write(EOL)
stream.write(b'>>' + EOL)
class InlineDictionary(Dictionary):
def pdf_serialize(self, stream):
stream.write(b'<< ')
for k, v in iteritems(self):
serialize(Name(k), stream)
stream.write(b' ')
serialize(v, stream)
stream.write(b' ')
stream.write(b'>>')
class Array(list):
def pdf_serialize(self, stream):
stream.write(b'[')
for i, o in enumerate(self):
if i != 0:
stream.write(b' ')
serialize(o, stream)
stream.write(b']')
class Stream(BytesIO):
def __init__(self, compress=False):
BytesIO.__init__(self)
self.compress = compress
self.filters = Array()
def add_extra_keys(self, d):
pass
def pdf_serialize(self, stream):
raw = self.getvalue()
dl = len(raw)
filters = self.filters
if self.compress:
filters.append(Name('FlateDecode'))
raw = zlib.compress(raw)
d = InlineDictionary({'Length':len(raw), 'DL':dl})
self.add_extra_keys(d)
if filters:
d['Filter'] = filters
serialize(d, stream)
stream.write(EOL+b'stream'+EOL)
stream.write(raw)
stream.write(EOL+b'endstream'+EOL)
def write_line(self, raw=b''):
self.write(raw if isinstance(raw, bytes) else raw.encode('ascii'))
self.write(EOL)
def write(self, raw):
super(Stream, self).write(raw if isinstance(raw, bytes) else
raw.encode('ascii'))
def write_raw(self, raw):
BytesIO.write(self, raw)
class Reference(object):
def __init__(self, num, obj):
self.num, self.obj = num, obj
def pdf_serialize(self, stream):
raw = '%d 0 R'%self.num
stream.write(raw.encode('ascii'))
def __repr__(self):
return '%d 0 R'%self.num
def __str__(self):
return repr(self)
# }}}
def current_log(newlog=None):
if newlog:
current_log.ans = newlog
return current_log.ans or default_log
current_log.ans = None

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from datetime import datetime, timedelta
def align_block(raw, multiple=4, pad=b'\0'):
'''
Return raw with enough pad bytes appended to ensure its length is a
multiple of `multiple` (4 by default).
'''
extra = len(raw) % multiple
if extra == 0:
return raw
return raw + pad*(multiple - extra)
class UnknownTable(object):
def __init__(self, raw):
self.raw = raw
def __call__(self):
return self.raw
def __len__(self):
return len(self.raw)
class DateTimeProperty(object):
def __init__(self, name):
self.name = name
def __get__(self, obj, type=None):
return datetime(1904, 1, 1) + timedelta(seconds=getattr(obj,
self.name))
def __set__(self, obj, val):
td = val - datetime(1904, 1, 1)
setattr(obj, self.name, int(td.total_seconds()))
class FixedProperty(object):
def __init__(self, name):
self.name = name
def __get__(self, obj, type=None):
val = getattr(obj, self.name)
return val / 0x10000
def __set__(self, obj, val):
setattr(obj, self.name, int(round(val*(0x10000))))
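# FixedProperty exposes the 16.16 fixed-point numbers used in sfnt tables as
# floats, e.g. a stored value of 0x18000 reads back as 1.5, and assigning 1.5
# stores int(round(1.5 * 0x10000)) == 0x18000.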
def max_power_of_two(x):
"""
Return the highest exponent of two, so that
(2 ** exponent) <= x
"""
exponent = 0
while x:
x = x >> 1
exponent += 1
return max(exponent - 1, 0)
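# e.g. max_power_of_two(1024) == 10 and max_power_of_two(5) == 2,
# since 2 ** 2 == 4 <= 5 < 8 == 2 ** 3.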
def load_font(stream_or_path):
raw = stream_or_path
if hasattr(raw, 'read'):
raw = raw.read()
from calibre.utils.fonts.sfnt.container import Sfnt
return Sfnt(raw)

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# cff_standard_strings {{{
# The 391 Standard Strings as used in the CFF format.
# from Adobe Technical Note #5176, version 1.0, 18 March 1998
cff_standard_strings = [
'.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', 'dollar', 'percent',
'ampersand', 'quoteright', 'parenleft', 'parenright', 'asterisk', 'plus',
'comma', 'hyphen', 'period', 'slash', 'zero', 'one', 'two', 'three', 'four',
'five', 'six', 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'bracketleft', 'backslash', 'bracketright', 'asciicircum', 'underscore',
'quoteleft', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'braceleft',
'bar', 'braceright', 'asciitilde', 'exclamdown', 'cent', 'sterling',
'fraction', 'yen', 'florin', 'section', 'currency', 'quotesingle',
'quotedblleft', 'guillemotleft', 'guilsinglleft', 'guilsinglright', 'fi', 'fl',
'endash', 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
'quotesinglbase', 'quotedblbase', 'quotedblright', 'guillemotright',
'ellipsis', 'perthousand', 'questiondown', 'grave', 'acute', 'circumflex',
'tilde', 'macron', 'breve', 'dotaccent', 'dieresis', 'ring', 'cedilla',
'hungarumlaut', 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', 'oslash', 'oe',
'germandbls', 'onesuperior', 'logicalnot', 'mu', 'trademark', 'Eth', 'onehalf',
'plusminus', 'Thorn', 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
'threequarters', 'twosuperior', 'registered', 'minus', 'eth', 'multiply',
'threesuperior', 'copyright', 'Aacute', 'Acircumflex', 'Adieresis', 'Agrave',
'Aring', 'Atilde', 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', 'Oacute',
'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', 'Scaron', 'Uacute',
'Ucircumflex', 'Udieresis', 'Ugrave', 'Yacute', 'Ydieresis', 'Zcaron',
'aacute', 'acircumflex', 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla',
'eacute', 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', 'odieresis',
'ograve', 'otilde', 'scaron', 'uacute', 'ucircumflex', 'udieresis', 'ugrave',
'yacute', 'ydieresis', 'zcaron', 'exclamsmall', 'Hungarumlautsmall',
'dollaroldstyle', 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', 'onedotenleader',
'zerooldstyle', 'oneoldstyle', 'twooldstyle', 'threeoldstyle', 'fouroldstyle',
'fiveoldstyle', 'sixoldstyle', 'sevenoldstyle', 'eightoldstyle',
'nineoldstyle', 'commasuperior', 'threequartersemdash', 'periodsuperior',
'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', 'dsuperior',
'esuperior', 'isuperior', 'lsuperior', 'msuperior', 'nsuperior', 'osuperior',
'rsuperior', 'ssuperior', 'tsuperior', 'ff', 'ffi', 'ffl', 'parenleftinferior',
'parenrightinferior', 'Circumflexsmall', 'hyphensuperior', 'Gravesmall',
'Asmall', 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', 'Hsmall',
'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', 'Nsmall', 'Osmall', 'Psmall',
'Qsmall', 'Rsmall', 'Ssmall', 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall',
'Ysmall', 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', 'Zcaronsmall',
'Dieresissmall', 'Brevesmall', 'Caronsmall', 'Dotaccentsmall', 'Macronsmall',
'figuredash', 'hypheninferior', 'Ogoneksmall', 'Ringsmall', 'Cedillasmall',
'questiondownsmall', 'oneeighth', 'threeeighths', 'fiveeighths',
'seveneighths', 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', 'threeinferior',
'fourinferior', 'fiveinferior', 'sixinferior', 'seveninferior',
'eightinferior', 'nineinferior', 'centinferior', 'dollarinferior',
'periodinferior', 'commainferior', 'Agravesmall', 'Aacutesmall',
'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', 'Aringsmall', 'AEsmall',
'Ccedillasmall', 'Egravesmall', 'Eacutesmall', 'Ecircumflexsmall',
'Edieresissmall', 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', 'Oacutesmall',
'Ocircumflexsmall', 'Otildesmall', 'Odieresissmall', 'OEsmall', 'Oslashsmall',
'Ugravesmall', 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', '001.001', '001.002',
'001.003', 'Black', 'Bold', 'Book', 'Light', 'Medium', 'Regular', 'Roman',
'Semibold'
]
# }}}
STANDARD_CHARSETS = [ # {{{
# ISOAdobe
(".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar",
"percent", "ampersand", "quoteright", "parenleft", "parenright",
"asterisk", "plus", "comma", "hyphen", "period", "slash", "zero",
"one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
"colon", "semicolon", "less", "equal", "greater", "question", "at",
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
"O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
"bracketleft", "backslash", "bracketright", "asciicircum",
"underscore", "quoteleft", "a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "braceleft", "bar", "braceright", "asciitilde",
"exclamdown", "cent", "sterling", "fraction", "yen", "florin",
"section", "currency", "quotesingle", "quotedblleft", "guillemotleft",
"guilsinglleft", "guilsinglright", "fi", "fl", "endash", "dagger",
"daggerdbl", "periodcentered", "paragraph", "bullet", "quotesinglbase",
"quotedblbase", "quotedblright", "guillemotright", "ellipsis",
"perthousand", "questiondown", "grave", "acute", "circumflex", "tilde",
"macron", "breve", "dotaccent", "dieresis", "ring", "cedilla",
"hungarumlaut", "ogonek", "caron", "emdash", "AE", "ordfeminine",
"Lslash", "Oslash", "OE", "ordmasculine", "ae", "dotlessi", "lslash",
"oslash", "oe", "germandbls", "onesuperior", "logicalnot", "mu",
"trademark", "Eth", "onehalf", "plusminus", "Thorn", "onequarter",
"divide", "brokenbar", "degree", "thorn", "threequarters",
"twosuperior", "registered", "minus", "eth", "multiply",
"threesuperior", "copyright", "Aacute", "Acircumflex", "Adieresis",
"Agrave", "Aring", "Atilde", "Ccedilla", "Eacute", "Ecircumflex",
"Edieresis", "Egrave", "Iacute", "Icircumflex", "Idieresis", "Igrave",
"Ntilde", "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde",
"Scaron", "Uacute", "Ucircumflex", "Udieresis", "Ugrave", "Yacute",
"Ydieresis", "Zcaron", "aacute", "acircumflex", "adieresis", "agrave",
"aring", "atilde", "ccedilla", "eacute", "ecircumflex", "edieresis",
"egrave", "iacute", "icircumflex", "idieresis", "igrave", "ntilde",
"oacute", "ocircumflex", "odieresis", "ograve", "otilde", "scaron",
"uacute", "ucircumflex", "udieresis", "ugrave", "yacute", "ydieresis",
"zcaron"),
# Expert
("notdef", "space", "exclamsmall", "Hungarumlautsmall", "dollaroldstyle",
"dollarsuperior", "ampersandsmall", "Acutesmall", "parenleftsuperior",
"parenrightsuperior", "twodotenleader", "onedotenleader", "comma",
"hyphen", "period", "fraction", "zerooldstyle", "oneoldstyle",
"twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle",
"sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle",
"colon", "semicolon", "commasuperior", "threequartersemdash",
"periodsuperior", "questionsmall", "asuperior", "bsuperior",
"centsuperior", "dsuperior", "esuperior", "isuperior", "lsuperior",
"msuperior", "nsuperior", "osuperior", "rsuperior", "ssuperior",
"tsuperior", "ff", "fi", "fl", "ffi", "ffl", "parenleftinferior",
"parenrightinferior", "Circumflexsmall", "hyphensuperior",
"Gravesmall", "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall",
"Fsmall", "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall",
"Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall",
"Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall",
"colonmonetary", "onefitted", "rupiah", "Tildesmall",
"exclamdownsmall", "centoldstyle", "Lslashsmall", "Scaronsmall",
"Zcaronsmall", "Dieresissmall", "Brevesmall", "Caronsmall",
"Dotaccentsmall", "Macronsmall", "figuredash", "hypheninferior",
"Ogoneksmall", "Ringsmall", "Cedillasmall", "onequarter", "onehalf",
"threequarters", "questiondownsmall", "oneeighth", "threeeighths",
"fiveeighths", "seveneighths", "onethird", "twothirds", "zerosuperior",
"onesuperior", "twosuperior", "threesuperior", "foursuperior",
"fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior",
"ninesuperior", "zeroinferior", "oneinferior", "twoinferior",
"threeinferior", "fourinferior", "fiveinferior", "sixinferior",
"seveninferior", "eightinferior", "nineinferior", "centinferior",
"dollarinferior", "periodinferior", "commainferior", "Agravesmall",
"Aacutesmall", "Acircumflexsmall", "Atildesmall", "Adieresissmall",
"Aringsmall", "AEsmall", "Ccedillasmall", "Egravesmall", "Eacutesmall",
"Ecircumflexsmall", "Edieresissmall", "Igravesmall", "Iacutesmall",
"Icircumflexsmall", "Idieresissmall", "Ethsmall", "Ntildesmall",
"Ogravesmall", "Oacutesmall", "Ocircumflexsmall", "Otildesmall",
"Odieresissmall", "OEsmall", "Oslashsmall", "Ugravesmall",
"Uacutesmall", "Ucircumflexsmall", "Udieresissmall", "Yacutesmall",
"Thornsmall", "Ydieresissmall"),
# Expert Subset
(".notdef", "space", "dollaroldstyle", "dollarsuperior",
"parenleftsuperior", "parenrightsuperior", "twodotenleader",
"onedotenleader", "comma", "hyphen", "period", "fraction",
"zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle",
"fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle",
"eightoldstyle", "nineoldstyle", "colon", "semicolon",
"commasuperior", "threequartersemdash", "periodsuperior",
"asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior",
"isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior",
"rsuperior", "ssuperior", "tsuperior", "ff", "fi", "fl", "ffi",
"ffl", "parenleftinferior", "parenrightinferior", "hyphensuperior",
"colonmonetary", "onefitted", "rupiah", "centoldstyle",
"figuredash", "hypheninferior", "onequarter", "onehalf",
"threequarters", "oneeighth", "threeeighths", "fiveeighths",
"seveneighths", "onethird", "twothirds", "zerosuperior",
"onesuperior", "twosuperior", "threesuperior", "foursuperior",
"fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior",
"ninesuperior", "zeroinferior", "oneinferior", "twoinferior",
"threeinferior", "fourinferior", "fiveinferior", "sixinferior",
"seveninferior", "eightinferior", "nineinferior", "centinferior",
"dollarinferior", "periodinferior", "commainferior"),
] # }}}
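# Illustrative sketch, not part of the committed file: a string index (SID)
# below 391 resolves directly into cff_standard_strings, and a predefined
# charset id (0, 1 or 2) selects one of the three tuples above.
assert cff_standard_strings[13] == 'comma'     # SID 13 in the CFF spec
assert STANDARD_CHARSETS[0][3] == 'quotedbl'   # glyph 3 of the ISOAdobe charset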

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import pack, unpack_from
from polyglot.builtins import range, unicode_type
t1_operand_encoding = [None] * 256
t1_operand_encoding[0:32] = (32) * ["do_operator"]
t1_operand_encoding[32:247] = (247 - 32) * ["read_byte"]
t1_operand_encoding[247:251] = (251 - 247) * ["read_small_int1"]
t1_operand_encoding[251:255] = (255 - 251) * ["read_small_int2"]
t1_operand_encoding[255] = "read_long_int"
t2_operand_encoding = t1_operand_encoding[:]
t2_operand_encoding[28] = "read_short_int"
t2_operand_encoding[255] = "read_fixed_1616"
cff_dict_operand_encoding = t2_operand_encoding[:]
cff_dict_operand_encoding[29] = "read_long_int"
cff_dict_operand_encoding[30] = "read_real_number"
cff_dict_operand_encoding[255] = "reserved"
real_nibbles = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'.', 'E', 'E-', None, '-']
real_nibbles_map = {x:i for i, x in enumerate(real_nibbles)}
class ByteCode(dict):
def read_byte(self, b0, data, index):
return b0 - 139, index
def read_small_int1(self, b0, data, index):
b1 = ord(data[index:index+1])
return (b0-247)*256 + b1 + 108, index+1
def read_small_int2(self, b0, data, index):
b1 = ord(data[index:index+1])
return -(b0-251)*256 - b1 - 108, index+1
def read_short_int(self, b0, data, index):
value, = unpack_from(b">h", data, index)
return value, index+2
def read_long_int(self, b0, data, index):
value, = unpack_from(b">l", data, index)
return value, index+4
def read_fixed_1616(self, b0, data, index):
value, = unpack_from(b">l", data, index)
return value / 65536.0, index+4
def read_real_number(self, b0, data, index):
number = ''
while True:
b = ord(data[index:index+1])
index = index + 1
nibble0 = (b & 0xf0) >> 4
nibble1 = b & 0x0f
if nibble0 == 0xf:
break
number = number + real_nibbles[nibble0]
if nibble1 == 0xf:
break
number = number + real_nibbles[nibble1]
return float(number), index
def write_float(self, f, encoding='ignored'):
s = unicode_type(f).upper()
if s[:2] == "0.":
s = s[1:]
elif s[:3] == "-0.":
s = "-" + s[2:]
nibbles = []
while s:
c = s[0]
s = s[1:]
if c == "E" and s[:1] == "-":
s = s[1:]
c = "E-"
nibbles.append(real_nibbles_map[c])
nibbles.append(0xf)
if len(nibbles) % 2:
nibbles.append(0xf)
d = bytearray([30])
for i in range(0, len(nibbles), 2):
d.append(nibbles[i] << 4 | nibbles[i+1])
return bytes(d)
def write_int(self, value, encoding="cff"):
four_byte_op = {'cff':29, 't1':255}.get(encoding, None)
if -107 <= value <= 107:
code = bytes(bytearray([value + 139]))
elif 108 <= value <= 1131:
value = value - 108
code = bytes(bytearray([(value >> 8) + 247, (value & 0xFF)]))
elif -1131 <= value <= -108:
value = -value - 108
code = bytes(bytearray([(value >> 8) + 251, (value & 0xFF)]))
elif four_byte_op is None:
# T2 only supports 2 byte ints
code = bytes(bytearray([28])) + pack(b">h", value)
else:
code = bytes(bytearray([four_byte_op])) + pack(b">l", value)
return code
def write_offset(self, value):
return bytes(bytearray([29])) + pack(b">l", value)
def write_number(self, value, encoding="cff"):
f = self.write_float if isinstance(value, float) else self.write_int
return f(value, encoding)
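# Illustrative sketch, not part of the committed file: round-tripping the
# operand encodings defined above.
bc = ByteCode()
code = bc.write_int(1000)                      # two-byte form: bytes([250, 124])
b0 = ord(code[0:1])
assert t1_operand_encoding[b0] == 'read_small_int1'
assert bc.read_small_int1(b0, code, 1) == (1000, 2)
real = bc.write_float(0.039625)                # nibble-encoded real, 0xf-terminated
assert bc.read_real_number(0x1e, real, 1) == (0.039625, len(real))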
class Dict(ByteCode):
operand_encoding = cff_dict_operand_encoding
TABLE = ()
FILTERED = frozenset()
OFFSETS = frozenset()
def __init__(self):
ByteCode.__init__(self)
self.operators = {op:(name, arg) for op, name, arg, default in
self.TABLE}
self.defaults = {name:default for op, name, arg, default in self.TABLE}
def safe_get(self, name):
return self.get(name, self.defaults[name])
def decompile(self, strings, global_subrs, data):
self.strings = strings
self.global_subrs = global_subrs
self.stack = []
index = 0
while index < len(data):
b0 = ord(data[index:index+1])
index += 1
handler = getattr(self, self.operand_encoding[b0])
value, index = handler(b0, data, index)
if value is not None:
self.stack.append(value)
def do_operator(self, b0, data, index):
if b0 == 12:
op = (b0, ord(data[index:index+1]))
index += 1
else:
op = b0
operator, arg_type = self.operators[op]
self.handle_operator(operator, arg_type)
return None, index
def handle_operator(self, operator, arg_type):
if isinstance(arg_type, tuple):
value = ()
for i in range(len(arg_type)-1, -1, -1):
arg = arg_type[i]
arghandler = getattr(self, 'arg_' + arg)
value = (arghandler(operator),) + value
else:
arghandler = getattr(self, 'arg_' + arg_type)
value = arghandler(operator)
self[operator] = value
def arg_number(self, name):
return self.stack.pop()
def arg_SID(self, name):
return self.strings[self.stack.pop()]
def arg_array(self, name):
ans = self.stack[:]
del self.stack[:]
return ans
def arg_delta(self, name):
out = []
current = 0
for v in self.stack:
current = current + v
out.append(current)
del self.stack[:]
return out
def compile(self, strings):
data = []
for op, name, arg, default in self.TABLE:
if name in self.FILTERED:
continue
val = self.safe_get(name)
opcode = bytes(bytearray(op if isinstance(op, tuple) else [op]))
if val != self.defaults[name]:
self.encoding_offset = name in self.OFFSETS
if isinstance(arg, tuple):
if len(val) != len(arg):
raise ValueError('Invalid argument %s for operator: %s'
%(val, op))
for typ, v in zip(arg, val):
if typ == 'SID':
val = strings(val)
data.append(getattr(self, 'encode_'+typ)(v))
else:
if arg == 'SID':
val = strings(val)
data.append(getattr(self, 'encode_'+arg)(val))
data.append(opcode)
self.raw = b''.join(data)
return self.raw
def encode_number(self, val):
if self.encoding_offset:
return self.write_offset(val)
return self.write_number(val)
def encode_SID(self, val):
return self.write_int(val)
def encode_array(self, val):
return b''.join(map(self.encode_number, val))
def encode_delta(self, value):
out = []
last = 0
for v in value:
out.append(v - last)
last = v
return self.encode_array(out)
class TopDict(Dict):
TABLE = (
# opcode name argument type default
((12, 30), 'ROS', ('SID','SID','number'), None,),
((12, 20), 'SyntheticBase', 'number', None,),
(0, 'version', 'SID', None,),
(1, 'Notice', 'SID', None,),
((12, 0), 'Copyright', 'SID', None,),
(2, 'FullName', 'SID', None,),
((12, 38), 'FontName', 'SID', None,),
(3, 'FamilyName', 'SID', None,),
(4, 'Weight', 'SID', None,),
((12, 1), 'isFixedPitch', 'number', 0,),
((12, 2), 'ItalicAngle', 'number', 0,),
((12, 3), 'UnderlinePosition', 'number', None,),
((12, 4), 'UnderlineThickness', 'number', 50,),
((12, 5), 'PaintType', 'number', 0,),
((12, 6), 'CharstringType', 'number', 2,),
((12, 7), 'FontMatrix', 'array', [0.001,0,0,0.001,0,0],),
(13, 'UniqueID', 'number', None,),
(5, 'FontBBox', 'array', [0,0,0,0],),
((12, 8), 'StrokeWidth', 'number', 0,),
(14, 'XUID', 'array', None,),
((12, 21), 'PostScript', 'SID', None,),
((12, 22), 'BaseFontName', 'SID', None,),
((12, 23), 'BaseFontBlend', 'delta', None,),
((12, 31), 'CIDFontVersion', 'number', 0,),
((12, 32), 'CIDFontRevision', 'number', 0,),
((12, 33), 'CIDFontType', 'number', 0,),
((12, 34), 'CIDCount', 'number', 8720,),
(15, 'charset', 'number', 0,),
((12, 35), 'UIDBase', 'number', None,),
(16, 'Encoding', 'number', 0,),
(18, 'Private', ('number','number'), None,),
((12, 37), 'FDSelect', 'number', None,),
((12, 36), 'FDArray', 'number', None,),
(17, 'CharStrings', 'number', None,),
)
# We will not write these operators out
FILTERED = {'ROS', 'SyntheticBase', 'UniqueID', 'XUID',
'CIDFontVersion', 'CIDFontRevision', 'CIDFontType', 'CIDCount',
'UIDBase', 'Encoding', 'FDSelect', 'FDArray'}
OFFSETS = {'charset', 'Encoding', 'CharStrings', 'Private'}
class PrivateDict(Dict):
TABLE = (
# opcode name argument type default
(6, 'BlueValues', 'delta', None,),
(7, 'OtherBlues', 'delta', None,),
(8, 'FamilyBlues', 'delta', None,),
(9, 'FamilyOtherBlues', 'delta', None,),
((12, 9), 'BlueScale', 'number', 0.039625,),
((12, 10), 'BlueShift', 'number', 7,),
((12, 11), 'BlueFuzz', 'number', 1,),
(10, 'StdHW', 'number', None,),
(11, 'StdVW', 'number', None,),
((12, 12), 'StemSnapH', 'delta', None,),
((12, 13), 'StemSnapV', 'delta', None,),
((12, 14), 'ForceBold', 'number', 0,),
((12, 15), 'ForceBoldThreshold', 'number', None,), # deprecated
((12, 16), 'lenIV', 'number', None,), # deprecated
((12, 17), 'LanguageGroup', 'number', 0,),
((12, 18), 'ExpansionFactor', 'number', 0.06,),
((12, 19), 'initialRandomSeed', 'number', 0,),
(20, 'defaultWidthX', 'number', 0,),
(21, 'nominalWidthX', 'number', 0,),
(19, 'Subrs', 'number', None,),
)
OFFSETS = {'Subrs'}
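# Illustrative sketch, not part of the committed file: 'delta' operands such as
# BlueValues are stored as successive differences, and arg_delta() above
# rebuilds the absolute values when decompiling.
pd = PrivateDict()
pd.stack = [-20, 20, 466, 10]                  # operands as read from the DICT
assert pd.arg_delta('BlueValues') == [-20, 0, 466, 476]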

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, unpack, calcsize
from functools import partial
from calibre.utils.fonts.sfnt import UnknownTable
from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
from calibre.utils.fonts.sfnt.cff.dict_data import TopDict, PrivateDict
from calibre.utils.fonts.sfnt.cff.constants import (cff_standard_strings,
STANDARD_CHARSETS)
from polyglot.builtins import iteritems, itervalues, range
# Useful links
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5176.CFF.pdf
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5177.Type2.pdf
class CFF(object):
def __init__(self, raw):
(self.major_version, self.minor_version, self.header_size,
self.offset_size) = unpack_from(b'>4B', raw)
if (self.major_version, self.minor_version) != (1, 0):
raise UnsupportedFont('The CFF table has unknown version: '
'(%d, %d)'%(self.major_version, self.minor_version))
offset = self.header_size
# Read Names Index
self.font_names = Index(raw, offset)
offset = self.font_names.pos
if len(self.font_names) > 1:
raise UnsupportedFont('CFF table has more than one font.')
# Read Top Dict
self.top_index = Index(raw, offset)
self.top_dict = TopDict()
offset = self.top_index.pos
# Read strings
self.strings = Strings(raw, offset)
offset = self.strings.pos
# Read global subroutines
self.global_subrs = Subrs(raw, offset)
offset = self.global_subrs.pos
# Decompile Top Dict
self.top_dict.decompile(self.strings, self.global_subrs, self.top_index[0])
self.is_CID = 'ROS' in self.top_dict
if self.is_CID:
raise UnsupportedFont('Subsetting of CID keyed fonts is not supported')
# Read CharStrings (Glyph definitions)
try:
offset = self.top_dict['CharStrings']
except KeyError:
raise ValueError('This font has no CharStrings')
cs_type = self.top_dict.safe_get('CharstringType')
if cs_type != 2:
raise UnsupportedFont('This font has unsupported CharstringType: '
'%s'%cs_type)
self.char_strings = CharStringsIndex(raw, offset)
self.num_glyphs = len(self.char_strings)
# Read Private Dict
self.private_dict = self.private_subrs = None
pd = self.top_dict.safe_get('Private')
if pd:
size, offset = pd
self.private_dict = PrivateDict()
self.private_dict.decompile(self.strings, self.global_subrs,
raw[offset:offset+size])
if 'Subrs' in self.private_dict:
self.private_subrs = Subrs(raw, offset +
self.private_dict['Subrs'])
# Read charset (Glyph names)
self.charset = Charset(raw, self.top_dict.safe_get('charset'),
self.strings, self.num_glyphs, self.is_CID)
# import pprint
# pprint.pprint(self.top_dict)
# pprint.pprint(self.private_dict)
class Index(list):
def __init__(self, raw, offset, prepend=()):
list.__init__(self)
self.extend(prepend)
count = unpack_from(b'>H', raw, offset)[0]
offset += 2
self.pos = offset
if count > 0:
self.offset_size = unpack_from(b'>B', raw, offset)[0]
offset += 1
if self.offset_size == 3:
offsets = [unpack(b'>L', b'\0' + raw[i:i+3])[0]
for i in range(offset, offset+3*(count+1), 3)]
else:
fmt = {1:'B', 2:'H', 4:'L'}[self.offset_size]
fmt = ('>%d%s'%(count+1, fmt)).encode('ascii')
offsets = unpack_from(fmt, raw, offset)
offset += self.offset_size * (count+1) - 1
for i in range(len(offsets)-1):
off, noff = offsets[i:i+2]
obj = raw[offset+off:offset+noff]
self.append(obj)
try:
self.pos = offset + offsets[-1]
except IndexError:
self.pos = offset
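# Minimal sketch, not part of the committed file, of the CFF INDEX layout this
# class parses: 16-bit count, one-byte offSize, count+1 one-based offsets, then
# the concatenated object data.
raw = b'\x00\x02' + b'\x01' + b'\x01\x04\x08' + b'abcdefg'
idx = Index(raw, 0)
assert list(idx) == [b'abc', b'defg']
assert idx.pos == len(raw)                     # first byte past the INDEX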
class Strings(Index):
def __init__(self, raw, offset):
super(Strings, self).__init__(raw, offset, prepend=[x.encode('ascii')
for x in cff_standard_strings])
class Charset(list):
def __init__(self, raw, offset, strings, num_glyphs, is_CID):
super(Charset, self).__init__()
self.standard_charset = offset if offset in {0, 1, 2} else None
if is_CID and self.standard_charset is not None:
raise ValueError("CID font must not use a standard charset")
if self.standard_charset is None:
self.append(b'.notdef')
fmt = unpack_from(b'>B', raw, offset)[0]
offset += 1
f = {0:self.parse_fmt0, 1:self.parse_fmt1,
2:partial(self.parse_fmt1, is_two_byte=True)}.get(fmt, None)
if f is None:
raise UnsupportedFont('This font uses unsupported charset '
'table format: %d'%fmt)
f(raw, offset, strings, num_glyphs, is_CID)
def parse_fmt0(self, raw, offset, strings, num_glyphs, is_CID):
fmt = ('>%dH'%(num_glyphs-1)).encode('ascii')
ids = unpack_from(fmt, raw, offset)
if is_CID:
ids = ('cid%05d'%x for x in ids)
else:
ids = (strings[x] for x in ids)
self.extend(ids)
def parse_fmt1(self, raw, offset, strings, num_glyphs, is_CID,
is_two_byte=False):
fmt = b'>2H' if is_two_byte else b'>HB'
sz = calcsize(fmt)
count = 1
while count < num_glyphs:
first, nleft = unpack_from(fmt, raw, offset)
offset += sz
count += nleft + 1
self.extend('cid%05d'%x if is_CID else strings[x] for x in
range(first, first + nleft+1))
def lookup(self, glyph_id):
if self.standard_charset is None:
return self[glyph_id]
return STANDARD_CHARSETS[self.standard_charset][glyph_id].encode('ascii')
def safe_lookup(self, glyph_id):
try:
return self.lookup(glyph_id)
except (KeyError, IndexError, ValueError):
return None
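# Sketch, not part of the committed file: a charset offset of 0, 1 or 2 selects
# one of the predefined charsets, so lookup() reads STANDARD_CHARSETS directly
# instead of parsing table data.
cs = Charset(b'', 0, None, 0, False)           # offset 0 == ISOAdobe charset
assert cs.lookup(3) == b'quotedbl'
assert cs.safe_lookup(10 ** 6) is None         # out-of-range ids return None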
class Subrs(Index):
pass
class CharStringsIndex(Index):
pass
class CFFTable(UnknownTable):
def decompile(self):
self.cff = CFF(self.raw)
def subset(self, character_map, extra_glyphs):
from calibre.utils.fonts.sfnt.cff.writer import Subset
# Map codes from the cmap table to glyph names, this will be used to
# reconstruct character_map for the subset font
charset_map = {code:self.cff.charset.safe_lookup(glyph_id) for code,
glyph_id in iteritems(character_map)}
charset = set(itervalues(charset_map))
charset.discard(None)
if not charset and character_map:
raise NoGlyphs('This font has no glyphs for the specified characters')
charset |= {
self.cff.charset.safe_lookup(glyph_id) for glyph_id in extra_glyphs}
charset.discard(None)
s = Subset(self.cff, charset)
# Rebuild character_map with the glyph ids from the subset font
character_map.clear()
for code, charname in iteritems(charset_map):
glyph_id = s.charname_map.get(charname, None)
if glyph_id:
character_map[code] = glyph_id
# Check that raw is parseable
CFF(s.raw)
self.raw = s.raw

View File

@@ -0,0 +1,290 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# Note that the code for creating a BMP table (cmap format 4) is taken with
# thanks from the fonttools project (BSD licensed).
from struct import unpack_from, calcsize, pack
from collections import OrderedDict
from calibre.utils.fonts.utils import read_bmp_prefix
from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import range
def split_range(start_code, end_code, cmap): # {{{
# Try to split a range of character codes into subranges with consecutive
# glyph IDs in such a way that the cmap4 subtable can be stored "most"
# efficiently.
if start_code == end_code:
return [], [end_code]
last_id = cmap[start_code]
last_code = start_code
in_order = None
ordered_begin = None
sub_ranges = []
# Gather subranges in which the glyph IDs are consecutive.
for code in range(start_code + 1, end_code + 1):
glyph_id = cmap[code]
if glyph_id - 1 == last_id:
if in_order is None or not in_order:
in_order = 1
ordered_begin = last_code
else:
if in_order:
in_order = 0
sub_ranges.append((ordered_begin, last_code))
ordered_begin = None
last_id = glyph_id
last_code = code
if in_order:
sub_ranges.append((ordered_begin, last_code))
assert last_code == end_code
# Now filter out those new subranges that would only make the data bigger.
# A new segment cost 8 bytes, not using a new segment costs 2 bytes per
# character.
new_ranges = []
for b, e in sub_ranges:
if b == start_code and e == end_code:
break # the whole range, we're fine
if b == start_code or e == end_code:
threshold = 4 # split costs one more segment
else:
threshold = 8 # split costs two more segments
if (e - b + 1) > threshold:
new_ranges.append((b, e))
sub_ranges = new_ranges
if not sub_ranges:
return [], [end_code]
if sub_ranges[0][0] != start_code:
sub_ranges.insert(0, (start_code, sub_ranges[0][0] - 1))
if sub_ranges[-1][1] != end_code:
sub_ranges.append((sub_ranges[-1][1] + 1, end_code))
# Fill the "holes" in the segments list -- those are the segments in which
# the glyph IDs are _not_ consecutive.
i = 1
while i < len(sub_ranges):
if sub_ranges[i-1][1] + 1 != sub_ranges[i][0]:
sub_ranges.insert(i, (sub_ranges[i-1][1] + 1, sub_ranges[i][0] - 1))
i = i + 1
i = i + 1
# Transform the ranges into start_code/end_code lists.
start = []
end = []
for b, e in sub_ranges:
start.append(b)
end.append(e)
start.pop(0)
assert len(start) + 1 == len(end)
return start, end
# }}}
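# Worked sketch, not part of the committed file: when the glyph ids for an
# entire code range are already consecutive, split_range() keeps it as a single
# segment; only a sufficiently long consecutive run in the middle of a range
# would be split out into its own segment by the threshold logic above.
demo_cmap = {code: code + 100 for code in range(10, 30)}
assert split_range(10, 29, demo_cmap) == ([], [29])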
def set_id_delta(id_delta): # {{{
# The lowest gid in glyphIndexArray, after subtracting id_delta, must be 1.
# id_delta is a short, and must be between -32K and 32K
# startCode can be between 0 and 64K-1, and the first glyph index can be between 1 and 64K-1
# This means that we have a problem because we may need to assign id_delta
# values between -(64K-2) and 64K-1.
# Since the final gid is reconstructed from the glyphIndexArray GID by
#   (unsigned short)finalGID = (gid + id_delta) % 0x10000,
# we can get from a startCode of 0 to a final GID of 64K-1 by subtracting 1 and
# casting the negative number to an unsigned short.
# Similarly, we can get from a startCode of 64K-1 to a final GID of 1 by adding
# 2, because of the modulo arithmetic.
if id_delta > 0x7FFF:
id_delta = id_delta - 0x10000
elif id_delta < -0x7FFF:
id_delta = id_delta + 0x10000
return id_delta
# }}}
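# Worked sketch, not part of the committed file, of the wrap-around described
# above: a start code near 64K mapping to glyph 1 needs a delta outside the
# signed-short range, and the modulo arithmetic makes the folded value work.
start_code, first_gid = 0xFFF0, 1
delta = set_id_delta(first_gid - start_code)   # -65519 folds to 17
assert -0x8000 <= delta <= 0x7FFF
assert (start_code + delta) % 0x10000 == first_gid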
class BMPTable(object):
def __init__(self, raw):
self.raw = raw
(self.start_count, self.end_count, self.range_offset, self.id_delta,
self.glyph_id_len, self.glyph_id_map, self.array_len) = \
read_bmp_prefix(raw, 0)
def get_glyph_ids(self, codes):
for code in codes:
found = False
for i, ec in enumerate(self.end_count):
if ec >= code:
sc = self.start_count[i]
if sc <= code:
found = True
ro = self.range_offset[i]
if ro == 0:
glyph_id = self.id_delta[i] + code
else:
idx = ro//2 + (code - sc) + i - self.array_len
glyph_id = self.glyph_id_map[idx]
if glyph_id != 0:
glyph_id += self.id_delta[i]
yield glyph_id % 0x10000
break
if not found:
yield 0
def get_glyph_map(self, glyph_ids):
ans = {}
for i, ec in enumerate(self.end_count):
sc = self.start_count[i]
for code in range(sc, ec+1):
ro = self.range_offset[i]
if ro == 0:
glyph_id = self.id_delta[i] + code
else:
idx = ro//2 + (code - sc) + i - self.array_len
glyph_id = self.glyph_id_map[idx]
if glyph_id != 0:
glyph_id += self.id_delta[i]
glyph_id %= 0x10000
if glyph_id in glyph_ids and code not in ans:
ans[code] = glyph_id
return ans
class CmapTable(UnknownTable):
def __init__(self, *args, **kwargs):
super(CmapTable, self).__init__(*args, **kwargs)
self.version, self.num_tables = unpack_from(b'>HH', self.raw)
self.tables = {}
offset = 4
sz = calcsize(b'>HHL')
recs = []
for i in range(self.num_tables):
platform, encoding, table_offset = unpack_from(b'>HHL', self.raw,
offset)
offset += sz
recs.append((platform, encoding, table_offset))
self.bmp_table = None
for i in range(len(recs)):
platform, encoding, offset = recs[i]
try:
next_offset = recs[i+1][-1]
except IndexError:
next_offset = len(self.raw)
table = self.raw[offset:next_offset]
if table:
fmt = unpack_from(b'>H', table)[0]
if platform == 3 and encoding == 1 and fmt == 4:
self.bmp_table = BMPTable(table)
def get_character_map(self, chars):
'''
Get a mapping of character codes to glyph ids in the font.
'''
if self.bmp_table is None:
raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
' Most likely a special purpose font.')
chars = sorted(set(chars))
ans = OrderedDict()
for i, glyph_id in enumerate(self.bmp_table.get_glyph_ids(chars)):
if glyph_id > 0:
ans[chars[i]] = glyph_id
return ans
def get_glyph_map(self, glyph_ids):
'''
Get a mapping of character codes to glyph ids for the specified glyph
ids.
'''
if self.bmp_table is None:
raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
' Most likely a special purpose font.')
glyph_ids = frozenset(glyph_ids)
return self.bmp_table.get_glyph_map(glyph_ids)
def set_character_map(self, cmap):
self.version, self.num_tables = 0, 1
fmt = b'>7H'
codes = sorted(cmap)
if not codes:
start_code = [0xffff]
end_code = [0xffff]
else:
last_code = codes[0]
end_code = []
start_code = [last_code]
for code in codes[1:]:
if code == last_code + 1:
last_code = code
continue
start, end = split_range(start_code[-1], last_code, cmap)
start_code.extend(start)
end_code.extend(end)
start_code.append(code)
last_code = code
end_code.append(last_code)
start_code.append(0xffff)
end_code.append(0xffff)
id_delta = []
id_range_offset = []
glyph_index_array = []
for i in range(len(end_code)-1): # skip the closing codes (0xffff)
indices = list(cmap[char_code] for char_code in range(start_code[i], end_code[i] + 1))
if indices == list(range(indices[0], indices[0] + len(indices))):
# indices is a contiguous list
id_delta_temp = set_id_delta(indices[0] - start_code[i])
id_delta.append(id_delta_temp)
id_range_offset.append(0)
else:
id_delta.append(0)
id_range_offset.append(2 * (len(end_code) + len(glyph_index_array) - i))
glyph_index_array.extend(indices)
id_delta.append(1) # 0xffff + 1 == 0. So this end code maps to .notdef
id_range_offset.append(0)
seg_count = len(end_code)
max_exponent = max_power_of_two(seg_count)
search_range = 2 * (2 ** max_exponent)
entry_selector = max_exponent
range_shift = 2 * seg_count - search_range
char_code_array = end_code + [0] + start_code
char_code_array = pack(b'>%dH'%len(char_code_array), *char_code_array)
id_delta_array = pack(b'>%dh'%len(id_delta), *id_delta)
rest_array = id_range_offset + glyph_index_array
rest_array = pack(b'>%dH'%len(rest_array), *rest_array)
data = char_code_array + id_delta_array + rest_array
length = calcsize(fmt) + len(data)
header = pack(fmt, 4, length, 0, 2*seg_count, search_range, entry_selector, range_shift)
self.bmp_table = header + data
fmt = b'>4HL'
offset = calcsize(fmt)
self.raw = pack(fmt, self.version, self.num_tables, 3, 1, offset) + self.bmp_table
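# Usage sketch, not part of the committed file, assuming the UnknownTable base
# class takes the raw table bytes as its only constructor argument (as its use
# in the Sfnt container later in this commit suggests).
demo = CmapTable(b'\x00\x00\x00\x00')          # version 0, zero subtables
demo.set_character_map({0x41: 36, 0x42: 37})   # 'A' -> glyph 36, 'B' -> glyph 37
# demo.raw now holds a single format 4 (platform 3, encoding 1) subtable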

View File

@@ -0,0 +1,252 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize
from collections import OrderedDict, namedtuple
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import range, iteritems
class Unpackable(object):
def __init__(self, raw, offset):
self.raw, self.offset = raw, offset
self.start_pos = offset
def unpack(self, fmt, single_special=True):
fmt = fmt.encode('ascii') if not isinstance(fmt, bytes) else fmt
ans = unpack_from(b'>'+fmt, self.raw, self.offset)
if single_special and len(ans) == 1:
ans = ans[0]
self.offset += calcsize(fmt)
return ans
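# Sketch, not part of the committed file, of the little byte-cursor helper
# above: single values are unwrapped, tuples are returned as-is, and the offset
# advances by the size of each format.
data = Unpackable(b'\x00\x03abcd', 0)
assert data.unpack('H') == 3
assert data.unpack('2s2s', single_special=False) == (b'ab', b'cd')
assert data.offset == 6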
class SimpleListTable(list):
'A table that contains a list of subtables'
child_class = None
def __init__(self, raw, offset):
list.__init__(self)
data = Unpackable(raw, offset)
self.read_extra_header(data)
count = data.unpack('H')
for i in range(count):
offset = data.unpack('H')
self.append(self.child_class(raw, data.start_pos + offset))
self.read_extra_footer(data)
def read_extra_header(self, data):
pass
def read_extra_footer(self, data):
pass
class ListTable(OrderedDict):
'A table that contains an ordered mapping of table tag to subtable'
child_class = None
def __init__(self, raw, offset):
OrderedDict.__init__(self)
data = Unpackable(raw, offset)
self.read_extra_header(data)
count = data.unpack('H')
for i in range(count):
tag, coffset = data.unpack('4sH')
self[tag] = self.child_class(raw, data.start_pos + coffset)
self.read_extra_footer(data)
def read_extra_header(self, data):
pass
def read_extra_footer(self, data):
pass
def dump(self, prefix=''):
print(prefix, self.__class__.__name__, sep='')
prefix += ' '
for tag, child in iteritems(self):
print(prefix, tag, sep='')
child.dump(prefix=prefix+' ')
class IndexTable(list):
def __init__(self, raw, offset):
data = Unpackable(raw, offset)
self.read_extra_header(data)
count = data.unpack('H')
for i in range(count):
self.append(data.unpack('H'))
def read_extra_header(self, data):
pass
def dump(self, prefix=''):
print(prefix, self.__class__.__name__, sep='')
class LanguageSystemTable(IndexTable):
def read_extra_header(self, data):
self.lookup_order, self.required_feature_index = data.unpack('2H')
if self.lookup_order != 0:
raise UnsupportedFont('This LanguageSystemTable has an unknown'
' lookup order: 0x%x'%self.lookup_order)
class ScriptTable(ListTable):
child_class = LanguageSystemTable
def __init__(self, raw, offset):
ListTable.__init__(self, raw, offset)
def read_extra_header(self, data):
start_pos = data.offset
default_offset = data.unpack('H')
self[b'default'] = (LanguageSystemTable(data.raw, start_pos +
default_offset) if default_offset else None)
class ScriptListTable(ListTable):
child_class = ScriptTable
class FeatureTable(IndexTable):
def read_extra_header(self, data):
self.feature_params = data.unpack('H')
if False and self.feature_params != 0:
# Source code pro sets this to non NULL
raise UnsupportedFont(
'This FeatureTable has non NULL FeatureParams: 0x%x'%self.feature_params)
class FeatureListTable(ListTable):
child_class = FeatureTable
class LookupTable(SimpleListTable):
def read_extra_header(self, data):
self.lookup_type, self.lookup_flag = data.unpack('2H')
self.set_child_class()
def set_child_class(self):
raise NotImplementedError()
def read_extra_footer(self, data):
if self.lookup_flag & 0x0010:
self.mark_filtering_set = data.unpack('H')
def ExtensionSubstitution(raw, offset, subtable_map={}):
data = Unpackable(raw, offset)
subst_format, extension_lookup_type, offset = data.unpack('2HL')
if subst_format != 1:
raise UnsupportedFont('ExtensionSubstitution has unknown format: 0x%x'%subst_format)
return subtable_map[extension_lookup_type](raw, offset+data.start_pos)
CoverageRange = namedtuple('CoverageRange', 'start end start_coverage_index')
class Coverage(object):
def __init__(self, raw, offset, parent_table_name):
data = Unpackable(raw, offset)
self.format, count = data.unpack('2H')
if self.format not in {1, 2}:
raise UnsupportedFont('Unknown Coverage format: 0x%x in %s'%(
self.format, parent_table_name))
if self.format == 1:
self.glyph_ids = data.unpack('%dH'%count, single_special=False)
self.glyph_ids_map = {gid:i for i, gid in
enumerate(self.glyph_ids)}
else:
self.ranges = []
ranges = data.unpack('%dH'%(3*count), single_special=False)
for i in range(count):
start, end, start_coverage_index = ranges[i*3:(i+1)*3]
self.ranges.append(CoverageRange(start, end, start_coverage_index))
def coverage_indices(self, glyph_ids):
'''Return map of glyph_id -> coverage index. Map contains only those
glyph_ids that are covered by this table and that are present in
glyph_ids.'''
ans = OrderedDict()
for gid in glyph_ids:
if self.format == 1:
idx = self.glyph_ids_map.get(gid, None)
if idx is not None:
ans[gid] = idx
else:
for start, end, start_coverage_index in self.ranges:
if start <= gid <= end:
ans[gid] = start_coverage_index + (gid-start)
return ans
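# Sketch, not part of the committed file: a format 1 coverage table is just a
# sorted glyph list, and the coverage index is the position within that list.
cov_raw = b'\x00\x01\x00\x03' + b'\x00\x05\x00\x09\x00\x0b'   # glyphs 5, 9, 11
cov = Coverage(cov_raw, 0, 'demo')
assert cov.coverage_indices({9, 11, 99}) == {9: 1, 11: 2}     # 99 is not covered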
class UnknownLookupSubTable(object):
formats = {}
def __init__(self, raw, offset):
data = Unpackable(raw, offset)
self.format = data.unpack('H')
if self.format not in self.formats:
raise UnsupportedFont('Unknown format for Lookup Subtable %s: 0x%x'%(
self.__class__.__name__, self.format))
if self.has_initial_coverage:
coverage_offset = data.unpack('H') + data.start_pos
self.coverage = Coverage(raw, coverage_offset, self.__class__.__name__)
self.initialize(data)
@property
def has_initial_coverage(self):
return True
def all_substitutions(self, glyph_ids):
''' Return a set of all glyph ids that could be substituted for any
subset of the specified glyph ids (which must be a set)'''
raise NotImplementedError()
def read_sets(self, data, read_item=None, set_is_index=False):
count = data.unpack('H')
sets = data.unpack('%dH'%count, single_special=False)
coverage_to_items_map = []
for offset in sets:
# Read items in the set
data.offset = start_pos = offset + data.start_pos
count = data.unpack('H')
item_offsets = data.unpack('%dH'%count, single_special=False)
items = []
for offset in item_offsets:
data.offset = offset + start_pos
if set_is_index:
items.append(offset)
else:
items.append(read_item(data))
coverage_to_items_map.append(items)
return coverage_to_items_map

View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
# License: GPLv3 Copyright: 2012, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import OrderedDict
from io import BytesIO
from struct import calcsize, pack
from calibre.utils.fonts.sfnt import UnknownTable, align_block, max_power_of_two
from calibre.utils.fonts.sfnt.cff.table import CFFTable
from calibre.utils.fonts.sfnt.cmap import CmapTable
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.glyf import GlyfTable
from calibre.utils.fonts.sfnt.gsub import GSUBTable
from calibre.utils.fonts.sfnt.head import (
HeadTable, HorizontalHeader, OS2Table, PostTable, VerticalHeader
)
from calibre.utils.fonts.sfnt.kern import KernTable
from calibre.utils.fonts.sfnt.loca import LocaTable
from calibre.utils.fonts.sfnt.maxp import MaxpTable
from calibre.utils.fonts.utils import checksum_of_block, get_tables, verify_checksums
# OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm
class Sfnt(object):
TABLE_MAP = {
b'head' : HeadTable,
b'hhea' : HorizontalHeader,
b'vhea' : VerticalHeader,
b'maxp' : MaxpTable,
b'loca' : LocaTable,
b'glyf' : GlyfTable,
b'cmap' : CmapTable,
b'CFF ' : CFFTable,
b'kern' : KernTable,
b'GSUB' : GSUBTable,
b'OS/2' : OS2Table,
b'post' : PostTable,
}
def __init__(self, raw_or_get_table):
self.tables = {}
if isinstance(raw_or_get_table, bytes):
raw = raw_or_get_table
self.sfnt_version = raw[:4]
if self.sfnt_version not in {b'\x00\x01\x00\x00', b'OTTO', b'true',
b'type1'}:
raise UnsupportedFont('Font has unknown sfnt version: %r'%self.sfnt_version)
for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw):
self.tables[table_tag] = self.TABLE_MAP.get(
table_tag, UnknownTable)(table)
else:
for table_tag in {
b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name', b'OS/2',
b'post', b'cvt ', b'fpgm', b'glyf', b'loca', b'prep', b'CFF ',
b'VORG', b'EBDT', b'EBLC', b'EBSC', b'BASE', b'GSUB', b'GPOS',
b'GDEF', b'JSTF', b'gasp', b'hdmx', b'kern', b'LTSH', b'PCLT',
b'VDMX', b'vhea', b'vmtx', b'MATH'}:
table = bytes(raw_or_get_table(table_tag))
if table:
self.tables[table_tag] = self.TABLE_MAP.get(
table_tag, UnknownTable)(table)
if not self.tables:
raise UnsupportedFont('This font has no tables')
self.sfnt_version = (b'\0\x01\0\0' if b'glyf' in self.tables
else b'OTTO')
def __getitem__(self, key):
return self.tables[key]
def __contains__(self, key):
return key in self.tables
def __delitem__(self, key):
del self.tables[key]
def __iter__(self):
'''Iterate over the table tags in order.'''
for x in sorted(self.tables):
yield x
# Although the optimal order is not alphabetical, the OTF spec says
# they should be alphabetical, so we stick with that. See
# http://partners.adobe.com/public/developer/opentype/index_recs.html
# for optimal order.
# keys = list(self.tables)
# order = {x:i for i, x in enumerate((b'head', b'hhea', b'maxp', b'OS/2',
# b'hmtx', b'LTSH', b'VDMX', b'hdmx', b'cmap', b'fpgm', b'prep',
# b'cvt ', b'loca', b'glyf', b'CFF ', b'kern', b'name', b'post',
# b'gasp', b'PCLT', b'DSIG'))}
# keys.sort(key=lambda x:order.get(x, 1000))
# for x in keys:
# yield x
def pop(self, key, default=None):
return self.tables.pop(key, default)
def get(self, key, default=None):
return self.tables.get(key, default)
def sizes(self):
ans = OrderedDict()
for tag in self:
ans[tag] = len(self[tag])
return ans
def __call__(self, stream=None):
stream = BytesIO() if stream is None else stream
def spack(*args):
stream.write(pack(*args))
stream.seek(0)
# Write header
num_tables = len(self.tables)
ln2 = max_power_of_two(num_tables)
srange = (2**ln2) * 16
spack(b'>4s4H',
self.sfnt_version, num_tables, srange, ln2, num_tables * 16 - srange)
# Write tables
head_offset = None
table_data = []
offset = stream.tell() + (calcsize(b'>4s3L') * num_tables)
sizes = OrderedDict()
for tag in self:
table = self.tables[tag]
raw = table()
table_len = len(raw)
if tag == b'head':
head_offset = offset
raw = raw[:8] + b'\0\0\0\0' + raw[12:]
raw = align_block(raw)
checksum = checksum_of_block(raw)
spack(b'>4s3L', tag, checksum, offset, table_len)
offset += len(raw)
table_data.append(raw)
sizes[tag] = table_len
for x in table_data:
stream.write(x)
checksum = checksum_of_block(stream.getvalue())
q = (0xB1B0AFBA - checksum) & 0xffffffff
stream.seek(head_offset + 8)
spack(b'>L', q)
return stream.getvalue(), sizes
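# Worked sketch, not part of the committed file, assuming max_power_of_two(n)
# returns floor(log2(n)) as its use above implies: the binary-search fields of
# the table directory for a 12-table font.
demo_tables = 12
demo_ln2 = max_power_of_two(demo_tables)       # 3
assert (2 ** demo_ln2) * 16 == 128             # searchRange
assert demo_tables * 16 - 128 == 64            # rangeShift
# checkSumAdjustment in 'head' is then chosen so the whole-file checksum comes
# out to the sfnt magic constant 0xB1B0AFBA, as computed at the end of __call__.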
def test_roundtrip(ff=None):
if ff is None:
data = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
else:
with open(ff, 'rb') as f:
data = f.read()
rd = Sfnt(data)()[0]
verify_checksums(rd)
if data[:12] != rd[:12]:
raise ValueError('Roundtripping failed, font header not the same')
if len(data) != len(rd):
raise ValueError('Roundtripping failed, size different (%d vs. %d)'%
(len(data), len(rd)))
if __name__ == '__main__':
import sys
test_roundtrip(sys.argv[-1])

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class UnsupportedFont(ValueError):
pass
class NoGlyphs(ValueError):
pass

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from collections import OrderedDict
from calibre.utils.fonts.sfnt import UnknownTable
from polyglot.builtins import iteritems
ARG_1_AND_2_ARE_WORDS = 0x0001 # if set args are words otherwise they are bytes
ARGS_ARE_XY_VALUES = 0x0002 # if set args are xy values, otherwise they are points
ROUND_XY_TO_GRID = 0x0004 # for the xy values if above is true
WE_HAVE_A_SCALE = 0x0008 # Sx = Sy, otherwise scale == 1.0
NON_OVERLAPPING = 0x0010 # set to same value for all components (obsolete!)
MORE_COMPONENTS = 0x0020 # indicates at least one more glyph after this one
WE_HAVE_AN_X_AND_Y_SCALE = 0x0040 # Sx, Sy
WE_HAVE_A_TWO_BY_TWO = 0x0080 # t00, t01, t10, t11
WE_HAVE_INSTRUCTIONS = 0x0100 # instructions follow
USE_MY_METRICS = 0x0200 # apply these metrics to parent glyph
OVERLAP_COMPOUND = 0x0400 # used by Apple in GX fonts
SCALED_COMPONENT_OFFSET = 0x0800 # composite designed to have the component offset scaled (designed for Apple)
UNSCALED_COMPONENT_OFFSET = 0x1000 # composite designed not to have the component offset scaled (designed for MS)
class SimpleGlyph(object):
def __init__(self, num_of_countours, raw):
self.num_of_countours = num_of_countours
self.raw = raw
# The list of glyph indices referred to by this glyph, will always be
# empty for a simple glyph and not empty for a composite glyph
self.glyph_indices = []
self.is_composite = False
def __len__(self):
return len(self.raw)
def __call__(self):
return self.raw
class CompositeGlyph(SimpleGlyph):
def __init__(self, num_of_countours, raw):
super(CompositeGlyph, self).__init__(num_of_countours, raw)
self.is_composite = True
flags = MORE_COMPONENTS
offset = 10
while flags & MORE_COMPONENTS:
flags, glyph_index = unpack_from(b'>HH', raw, offset)
self.glyph_indices.append(glyph_index)
offset += 4
if flags & ARG_1_AND_2_ARE_WORDS:
offset += 4
else:
offset += 2
if flags & WE_HAVE_A_SCALE:
offset += 2
elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
offset += 4
elif flags & WE_HAVE_A_TWO_BY_TWO:
offset += 8
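# Sketch, not part of the committed file: a minimal composite glyph record --
# numberOfContours == -1, an 8-byte bounding box, then a single component with
# ARGS_ARE_XY_VALUES set and MORE_COMPONENTS clear, referencing glyph 7.
demo_glyph = CompositeGlyph(-1, b'\xff\xff' + b'\x00' * 8 + b'\x00\x02\x00\x07\x00\x00')
assert demo_glyph.is_composite and demo_glyph.glyph_indices == [7]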
class GlyfTable(UnknownTable):
def glyph_data(self, offset, length, as_raw=False):
raw = self.raw[offset:offset+length]
if as_raw:
return raw
num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0
if num_of_countours >= 0:
return SimpleGlyph(num_of_countours, raw)
return CompositeGlyph(num_of_countours, raw)
def update(self, sorted_glyph_map):
ans = OrderedDict()
offset = 0
block = []
for glyph_id, glyph in iteritems(sorted_glyph_map):
raw = glyph()
pad = 4 - (len(raw) % 4)
if pad < 4:
raw += b'\0' * pad
ans[glyph_id] = offset, len(raw)
offset += len(raw)
block.append(raw)
self.raw = b''.join(block)
return ans

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from functools import partial
from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.common import (ScriptListTable, FeatureListTable,
SimpleListTable, LookupTable, ExtensionSubstitution,
UnknownLookupSubTable)
from polyglot.builtins import iteritems, itervalues
class SingleSubstitution(UnknownLookupSubTable):
formats = {1, 2}
def initialize(self, data):
if self.format == 1:
self.delta = data.unpack('h')
else:
count = data.unpack('H')
self.substitutes = data.unpack('%dH'%count, single_special=False)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
if self.format == 1:
return {gid + self.delta for gid in gid_index_map}
return {self.substitutes[i] for i in itervalues(gid_index_map)}
class MultipleSubstitution(UnknownLookupSubTable):
formats = {1}
def initialize(self, data):
self.coverage_to_subs_map = self.read_sets(data, set_is_index=True)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
ans = set()
for index in itervalues(gid_index_map):
glyphs = set(self.coverage_to_subs_map[index])
ans |= glyphs
return ans
class AlternateSubstitution(MultipleSubstitution):
pass
class LigatureSubstitution(UnknownLookupSubTable):
formats = {1}
def initialize(self, data):
self.coverage_to_lig_map = self.read_sets(data, self.read_ligature)
def read_ligature(self, data):
lig_glyph, count = data.unpack('HH')
components = data.unpack('%dH'%(count-1), single_special=False)
return (lig_glyph, components)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
ans = set()
for start_glyph_id, index in iteritems(gid_index_map):
for glyph_id, components in self.coverage_to_lig_map[index]:
components = (start_glyph_id,) + components
if set(components).issubset(glyph_ids):
ans.add(glyph_id)
return ans
class ContexttualSubstitution(UnknownLookupSubTable):
formats = {1, 2, 3}
@property
def has_initial_coverage(self):
return self.format != 3
def initialize(self, data):
pass # TODO
def all_substitutions(self, glyph_ids):
# This table only defines substitutions in terms of other tables
return set()
class ChainingContextualSubstitution(UnknownLookupSubTable):
formats = {1, 2, 3}
@property
def has_initial_coverage(self):
return self.format != 3
def initialize(self, data):
pass # TODO
def all_substitutions(self, glyph_ids):
# This table only defines substitutions in terms of other tables
return set()
class ReverseChainSingleSubstitution(UnknownLookupSubTable):
formats = {1}
def initialize(self, data):
backtrack_count = data.unpack('H')
backtrack_offsets = data.unpack('%dH'%backtrack_count,
single_special=False)
lookahead_count = data.unpack('H')
lookahead_offsets = data.unpack('%dH'%lookahead_count,
single_special=False)
backtrack_offsets = [data.start_pos + x for x in backtrack_offsets]
lookahead_offsets = [data.start_pos + x for x in lookahead_offsets]
backtrack_offsets, lookahead_offsets # TODO: Use these
count = data.unpack('H')
self.substitutes = data.unpack('%dH'%count)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
return {self.substitutes[i] for i in itervalues(gid_index_map)}
subtable_map = {
1: SingleSubstitution,
2: MultipleSubstitution,
3: AlternateSubstitution,
4: LigatureSubstitution,
5: ContexttualSubstitution,
6: ChainingContextualSubstitution,
8: ReverseChainSingleSubstitution,
}
class GSUBLookupTable(LookupTable):
def set_child_class(self):
if self.lookup_type == 7:
self.child_class = partial(ExtensionSubstitution,
subtable_map=subtable_map)
else:
self.child_class = subtable_map[self.lookup_type]
class LookupListTable(SimpleListTable):
child_class = GSUBLookupTable
class GSUBTable(UnknownTable):
version = FixedProperty('_version')
def decompile(self):
(self._version, self.scriptlist_offset, self.featurelist_offset,
self.lookuplist_offset) = unpack_from(b'>L3H', self.raw)
if self._version != 0x10000:
raise UnsupportedFont('The GSUB table has unknown version: 0x%x'%
self._version)
self.script_list_table = ScriptListTable(self.raw,
self.scriptlist_offset)
# self.script_list_table.dump()
self.feature_list_table = FeatureListTable(self.raw,
self.featurelist_offset)
# self.feature_list_table.dump()
self.lookup_list_table = LookupListTable(self.raw,
self.lookuplist_offset)
def all_substitutions(self, glyph_ids):
glyph_ids = frozenset(glyph_ids)
ans = set(glyph_ids)
for lookup_table in self.lookup_list_table:
for subtable in lookup_table:
glyphs = subtable.all_substitutions(ans)
if glyphs:
ans |= glyphs
return ans - {glyph_ids}
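# Usage sketch, not part of the committed file: 'sfnt' is assumed to be an Sfnt
# instance (see the container module earlier in this commit) and 'glyph_ids' a
# set of glyph ids taken from its cmap table; both names are hypothetical.
def _gsub_closure_sketch(sfnt, glyph_ids):
    gsub = sfnt.get(b'GSUB')
    if gsub is None:
        return set(glyph_ids)
    gsub.decompile()
    # glyphs reachable through GSUB lookups (ligatures, alternates, ...)
    return set(glyph_ids) | gsub.all_substitutions(glyph_ids)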

View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, pack, calcsize
from calibre.utils.fonts.sfnt import UnknownTable, DateTimeProperty, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.loca import read_array
from polyglot.builtins import zip
class HeadTable(UnknownTable):
created = DateTimeProperty('_created')
modified = DateTimeProperty('_modified')
version_number = FixedProperty('_version_number')
font_revision = FixedProperty('_font_revision')
def __init__(self, *args, **kwargs):
super(HeadTable, self).__init__(*args, **kwargs)
field_types = (
'_version_number' , 'l',
'_font_revision' , 'l',
'checksum_adjustment' , 'L',
'magic_number' , 'L',
'flags' , 'H',
'units_per_em' , 'H',
'_created' , 'q',
'_modified' , 'q',
'x_min' , 'h',
'y_min' , 'h',
'x_max' , 'h',
'y_max' , 'h',
'mac_style' , 'H',
'lowest_rec_ppem' , 'H',
'font_direction_hint' , 'h',
'index_to_loc_format' , 'h',
'glyph_data_format' , 'h'
)
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
def update(self):
vals = [getattr(self, f) for f in self._fields]
self.raw = pack(self._fmt, *vals)
class HorizontalHeader(UnknownTable):
version_number = FixedProperty('_version_number')
def read_data(self, hmtx):
if hasattr(self, 'ascender'):
return
field_types = (
'_version_number' , 'l',
'ascender', 'h',
'descender', 'h',
'line_gap', 'h',
'advance_width_max', 'H',
'min_left_side_bearing', 'h',
'min_right_side_bearing', 'h',
'x_max_extent', 'h',
'caret_slope_rise', 'h',
'caret_slop_run', 'h',
'caret_offset', 'h',
'r1', 'h',
'r2', 'h',
'r3', 'h',
'r4', 'h',
'metric_data_format', 'h',
'number_of_h_metrics', 'H',
)
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
raw = hmtx.raw
num = self.number_of_h_metrics
if len(raw) < 4*num:
raise UnsupportedFont('The hmtx table has insufficient data')
long_hor_metric = raw[:4*num]
a = read_array(long_hor_metric)
self.advance_widths = a[0::2]
a = read_array(long_hor_metric, 'h')
self.left_side_bearings = a[1::2]
class VerticalHeader(UnknownTable):
version_number = FixedProperty('_version_number')
def read_data(self, vmtx):
if hasattr(self, 'ascender'):
return
field_types = (
'_version_number' , 'l',
'ascender', 'h',
'descender', 'h',
'line_gap', 'h',
'advance_height_max', 'H',
'min_top_side_bearing', 'h',
'min_bottom_side_bearing', 'h',
'y_max_extent', 'h',
'caret_slope_rise', 'h',
'caret_slop_run', 'h',
'caret_offset', 'h',
'r1', 'h',
'r2', 'h',
'r3', 'h',
'r4', 'h',
'metric_data_format', 'h',
'number_of_v_metrics', 'H',
)
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
raw = vmtx.raw
num = self.number_of_v_metrics
if len(raw) < 4*num:
raise UnsupportedFont('The vmtx table has insufficient data')
long_hor_metric = raw[:4*num]
a = read_array(long_hor_metric)
self.advance_heights = a[0::2]
a = read_array(long_hor_metric, 'h')
self.top_side_bearings = a[1::2]
class OS2Table(UnknownTable):
def read_data(self):
if hasattr(self, 'char_width'):
return
ver, = unpack_from(b'>H', self.raw)
field_types = [
'version' , 'H',
'average_char_width', 'h',
'weight_class', 'H',
'width_class', 'H',
'fs_type', 'H',
'subscript_x_size', 'h',
'subscript_y_size', 'h',
'subscript_x_offset', 'h',
'subscript_y_offset', 'h',
'superscript_x_size', 'h',
'superscript_y_size', 'h',
'superscript_x_offset', 'h',
'superscript_y_offset', 'h',
'strikeout_size', 'h',
'strikeout_position', 'h',
'family_class', 'h',
'panose', '10s',
'ranges', '16s',
'vendor_id', '4s',
'selection', 'H',
'first_char_index', 'H',
'last_char_index', 'H',
'typo_ascender', 'h',
'typo_descender', 'h',
'typo_line_gap', 'h',
'win_ascent', 'H',
'win_descent', 'H',
]
if ver > 1:
field_types += [
'code_page_range', '8s',
'x_height', 'h',
'cap_height', 'h',
'default_char', 'H',
'break_char', 'H',
'max_context', 'H',
]
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
def zero_fstype(self):
prefix = calcsize(b'>HhHH')
self.raw = self.raw[:prefix] + b'\0\0' + self.raw[prefix+2:]
self.fs_type = 0
class PostTable(UnknownTable):
version_number = FixedProperty('_version')
italic_angle = FixedProperty('_italic_angle')
def read_data(self):
if hasattr(self, 'underline_position'):
return
(self._version, self._italic_angle, self.underline_position,
self.underline_thickness) = unpack_from(b'>llhh', self.raw)

View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize, pack, error as struct_error
from calibre.utils.fonts.sfnt import (UnknownTable, FixedProperty,
max_power_of_two)
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import range
class KernTable(UnknownTable):
version = FixedProperty('_version')
def __init__(self, *args, **kwargs):
super(KernTable, self).__init__(*args, **kwargs)
self._version, self.num_tables = unpack_from(b'>HH', self.raw)
if self._version == 1 and len(self.raw) >= 8:
self._version, self.num_tables = unpack_from(b'>LL', self.raw)
self.headerfmt = b'>HH' if self._version == 0 else b'>LL'
def restrict_to_glyphs(self, glyph_ids):
if self._version not in {0, 0x10000}:
raise UnsupportedFont('kern table has version: %x'%self._version)
offset = 4 if (self._version == 0) else 8
tables = []
for i in range(self.num_tables):
if self._version == 0:
version, length, coverage = unpack_from(b'>3H', self.raw, offset)
table_format = version
else:
length, coverage = unpack_from(b'>LH', self.raw, offset)
table_format = coverage & 0xff
raw = self.raw[offset:offset+length]
if table_format == 0:
raw = self.restrict_format_0(raw, glyph_ids)
if not raw:
continue
tables.append(raw)
offset += length
self.raw = pack(self.headerfmt, self._version, len(tables)) + b''.join(tables)
def restrict_format_0(self, raw, glyph_ids):
if self._version == 0:
version, length, coverage, npairs = unpack_from(b'>4H', raw)
headerfmt = b'>3H'
else:
length, coverage, tuple_index, npairs = unpack_from(b'>L3H', raw)
headerfmt = b'>L2H'
offset = calcsize(headerfmt + b'4H')
entries = []
entrysz = calcsize(b'>2Hh')
for i in range(npairs):
try:
left, right, value = unpack_from(b'>2Hh', raw, offset)
except struct_error:
offset = len(raw)
break # Buggy kern table
if left in glyph_ids and right in glyph_ids:
entries.append(pack(b'>2Hh', left, right, value))
offset += entrysz
if offset != len(raw):
raise UnsupportedFont('This font has extra data at the end of'
' a Format 0 kern subtable')
npairs = len(entries)
if npairs == 0:
return b''
entry_selector = max_power_of_two(npairs)
search_range = (2 ** entry_selector) * 6
range_shift = (npairs - (2 ** entry_selector)) * 6
entries = b''.join(entries)
length = calcsize(headerfmt + b'4H') + len(entries)
if self._version == 0:
header = pack(headerfmt, version, length, coverage)
else:
header = pack(headerfmt, length, coverage, tuple_index)
return header + pack(b'>4H', npairs, search_range, entry_selector,
range_shift) + entries
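The binary-search fields recomputed at the end of restrict_format_0() follow the sfnt convention for 6-byte kern pair records; judging from its use above, max_power_of_two() returns the exponent of the largest power of two not exceeding npairs. A worked example with five surviving pairs:

npairs = 5
entry_selector = 2                                # 2**2 == 4 is the largest power of two <= 5
search_range = (2 ** entry_selector) * 6          # 24
range_shift = (npairs - 2 ** entry_selector) * 6  # 6, i.e. npairs*6 - search_range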

View File

@@ -0,0 +1,91 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import array, sys
from operator import itemgetter
from itertools import repeat
from calibre.utils.fonts.sfnt import UnknownTable
from polyglot.builtins import iteritems, range
def four_byte_type_code():
for c in 'IL':
a = array.array(c)
if a.itemsize == 4:
return c
def read_array(data, fmt='H'):
ans = array.array(fmt, data)
if sys.byteorder != 'big':
ans.byteswap()
return ans
class LocaTable(UnknownTable):
def load_offsets(self, head_table, maxp_table):
fmt = 'H' if head_table.index_to_loc_format == 0 else four_byte_type_code()
locs = read_array(self.raw, fmt)
self.offset_map = locs.tolist()
if fmt == 'H':
self.offset_map = [2*i for i in self.offset_map]
self.fmt = fmt
def glyph_location(self, glyph_id):
offset = self.offset_map[glyph_id]
next_offset = self.offset_map[glyph_id+1]
return offset, next_offset - offset
def update(self, resolved_glyph_map):
        '''
        Update this table to contain pointers only to the glyphs in
        resolved_glyph_map, which must be a map of glyph_ids to (offset, sz).
        Note that the loca table is generated for all glyphs from 0 to the
        largest glyph id that is either in resolved_glyph_map or was present
        originally. The pointers for glyphs that have no data are set to
        zero; this preserves glyph ids.
        '''
current_max_glyph_id = len(self.offset_map) - 2
max_glyph_id = max(resolved_glyph_map or (0,))
max_glyph_id = max(max_glyph_id, current_max_glyph_id)
self.offset_map = list(repeat(0, max_glyph_id + 2))
glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in
iteritems(resolved_glyph_map)]
glyphs.sort(key=itemgetter(1))
for glyph_id, offset, sz in glyphs:
self.offset_map[glyph_id] = offset
self.offset_map[glyph_id+1] = offset + sz
        # Fix all zero entries to be the same as the previous entry: a zero
        # i-th entry means glyph i-1 has no data, so its range becomes empty.
for i in range(1, len(self.offset_map)):
if self.offset_map[i] == 0:
self.offset_map[i] = self.offset_map[i-1]
vals = self.offset_map
max_offset = max(vals) if vals else 0
if max_offset < 0x20000 and all(l % 2 == 0 for l in vals):
self.fmt = 'H'
vals = array.array(self.fmt, (i // 2 for i in vals))
else:
self.fmt = four_byte_type_code()
vals = array.array(self.fmt, vals)
if sys.byteorder != "big":
vals.byteswap()
self.raw = vals.tostring()
subset = update
def dump_glyphs(self, sfnt):
if not hasattr(self, 'offset_map'):
self.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
for i in range(len(self.offset_map)-1):
off, noff = self.offset_map[i], self.offset_map[i+1]
if noff != off:
print('Glyph id:', i, 'size:', noff-off)
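The offset_map used throughout LocaTable has num_glyphs + 1 entries; glyph i occupies bytes [offset_map[i], offset_map[i+1]) of the glyf table, so equal neighbouring entries mean an empty glyph. The serialisation at the end of update() then amounts to the following sketch (made-up offsets; tobytes() shown as the Python 3 counterpart of the tostring() call used above):

import array, sys

offset_map = [0, 0, 36, 36, 100]   # glyphs 0 and 2 are empty; glyph 1 is 36 bytes, glyph 3 is 64
vals = array.array('H', (off // 2 for off in offset_map))  # short format stores offset/2
if sys.byteorder != 'big':
    vals.byteswap()                # loca data is big-endian on disk
raw = vals.tobytes() if hasattr(vals, 'tobytes') else vals.tostring()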

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, pack
from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import zip
class MaxpTable(UnknownTable):
version = FixedProperty('_version')
def __init__(self, *args, **kwargs):
super(MaxpTable, self).__init__(*args, **kwargs)
self._fmt = b'>lH'
self._version, self.num_glyphs = unpack_from(self._fmt, self.raw)
self.fields = ('_version', 'num_glyphs')
if self.version > 1.0:
raise UnsupportedFont('This font has a maxp table with version: %s'
%self.version)
if self.version == 1.0:
self.fields = ('_version', 'num_glyphs', 'max_points',
'max_contours', 'max_composite_points',
'max_composite_contours', 'max_zones',
'max_twilight_points', 'max_storage', 'max_function_defs',
'max_instruction_defs', 'max_stack_elements',
'max_size_of_instructions', 'max_component_elements',
'max_component_depth')
self._fmt = b'>lH' + b'H'*(len(self.fields)-2)
vals = unpack_from(self._fmt, self.raw)
for f, val in zip(self.fields, vals):
setattr(self, f, val)
def update(self):
vals = [getattr(self, f) for f in self.fields]
self.raw = pack(self._fmt, *vals)
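The maxp version is again a 16.16 fixed value (1.0 is stored as 0x00010000), and only a version 1.0 table carries the thirteen extra fields, so the struct format grows accordingly. As a size check:

from struct import calcsize

extra_fields = 13                  # max_points .. max_component_depth
fmt = b'>lH' + b'H' * extra_fields
assert calcsize(fmt) == 32         # the size of a version 1.0 maxp table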

View File

@@ -0,0 +1,380 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import traceback
from collections import OrderedDict
from operator import itemgetter
from functools import partial
from calibre.utils.icu import safe_chr, ord_string
from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
from polyglot.builtins import unicode_type, range, iteritems, itervalues, map
# TrueType outlines {{{
def resolve_glyphs(loca, glyf, character_map, extra_glyphs):
unresolved_glyphs = set(itervalues(character_map)) | extra_glyphs
unresolved_glyphs.add(0) # We always want the .notdef glyph
resolved_glyphs = {}
while unresolved_glyphs:
glyph_id = unresolved_glyphs.pop()
try:
offset, length = loca.glyph_location(glyph_id)
except (IndexError, ValueError, KeyError, TypeError):
continue
glyph = glyf.glyph_data(offset, length)
resolved_glyphs[glyph_id] = glyph
for gid in glyph.glyph_indices:
if gid not in resolved_glyphs:
unresolved_glyphs.add(gid)
return OrderedDict(sorted(iteritems(resolved_glyphs), key=itemgetter(0)))
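resolve_glyphs() is a transitive closure: composite glyphs pull in every glyph they reference, whether or not a character maps to it, and glyph 0 (.notdef) is always kept. The same idea on toy data, with plain dicts standing in for the loca/glyf lookups:

# glyph 5 is a composite built from glyphs 7 and 8; 'A' maps to glyph 5
character_map = {ord('A'): 5}
components = {0: [], 5: [7, 8], 7: [], 8: []}

resolved, pending = {}, {0} | set(character_map.values())
while pending:
    gid = pending.pop()
    resolved[gid] = components[gid]
    pending.update(c for c in components[gid] if c not in resolved)

assert set(resolved) == {0, 5, 7, 8}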
def subset_truetype(sfnt, character_map, extra_glyphs):
loca = sfnt[b'loca']
glyf = sfnt[b'glyf']
try:
head, maxp = sfnt[b'head'], sfnt[b'maxp']
except KeyError:
raise UnsupportedFont('This font does not contain head and/or maxp tables')
loca.load_offsets(head, maxp)
resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs)
if not resolved_glyphs or set(resolved_glyphs) == {0}:
raise NoGlyphs('This font has no glyphs for the specified character '
'set, subsetting it is pointless')
# Keep only character codes that have resolved glyphs
for code, glyph_id in tuple(iteritems(character_map)):
if glyph_id not in resolved_glyphs:
del character_map[code]
# Update the glyf table
glyph_offset_map = glyf.update(resolved_glyphs)
# Update the loca table
loca.subset(glyph_offset_map)
head.index_to_loc_format = 0 if loca.fmt == 'H' else 1
head.update()
maxp.num_glyphs = len(loca.offset_map) - 1
# }}}
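subset_truetype() mutates the Sfnt in place and leaves head, loca and maxp mutually consistent. A hedged usage sketch (the font path is illustrative; get_character_map() is the cmap call used further down in subset()):

from calibre.utils.fonts.sfnt.container import Sfnt

with open('SomeFont.ttf', 'rb') as f:       # hypothetical font file
    sfnt = Sfnt(f.read())
character_map = sfnt[b'cmap'].get_character_map({ord('A'), ord('B')})
subset_truetype(sfnt, character_map, extra_glyphs=set())
raw, sizes = sfnt()                         # serialise the reduced font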
def subset_postscript(sfnt, character_map, extra_glyphs):
cff = sfnt[b'CFF ']
cff.decompile()
cff.subset(character_map, extra_glyphs)
def do_warn(warnings, *args):
for arg in args:
for line in arg.splitlines():
if warnings is None:
print(line)
else:
warnings.append(line)
if warnings is None:
print()
else:
warnings.append('')
def pdf_subset(sfnt, glyphs):
for tag in tuple(sfnt.tables):
if tag not in {b'hhea', b'head', b'hmtx', b'maxp',
b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca',
b'prep', b'CFF ', b'VORG'}:
# Remove non core tables since they are unused in PDF rendering
del sfnt[tag]
if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines
subset_truetype(sfnt, {}, glyphs)
elif b'CFF ' in sfnt:
# PostScript Outlines
subset_postscript(sfnt, {}, glyphs)
else:
raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines')
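pdf_subset() differs from subset() below in that it is driven by glyph ids rather than characters, which is what a PDF renderer has after shaping. Illustratively (raw_font_bytes is a placeholder for the font file's contents):

sfnt = Sfnt(raw_font_bytes)
pdf_subset(sfnt, {0, 36, 37, 38})           # keep .notdef plus three shaped glyph ids
subset_font_bytes, table_sizes = sfnt()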
def safe_ord(x):
return ord_string(unicode_type(x))[0]
def subset(raw, individual_chars, ranges=(), warnings=None):
warn = partial(do_warn, warnings)
chars = set(map(safe_ord, individual_chars))
for r in ranges:
chars |= set(range(safe_ord(r[0]), safe_ord(r[1])+1))
# Always add the space character for ease of use from the command line
if safe_ord(' ') not in chars:
chars.add(safe_ord(' '))
sfnt = Sfnt(raw)
old_sizes = sfnt.sizes()
# Remove the Digital Signature table since it is useless in a subset
# font anyway
sfnt.pop(b'DSIG', None)
# Remove non core tables as they aren't likely to be used by renderers
# anyway
core_tables = {b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name',
b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca', b'prep',
b'CFF ', b'VORG', b'EBDT', b'EBLC', b'EBSC', b'BASE', b'GSUB',
b'GPOS', b'GDEF', b'JSTF', b'gasp', b'hdmx', b'kern', b'LTSH',
b'PCLT', b'VDMX', b'vhea', b'vmtx', b'MATH'}
for tag in list(sfnt):
if tag not in core_tables:
del sfnt[tag]
try:
cmap = sfnt[b'cmap']
except KeyError:
raise UnsupportedFont('This font has no cmap table')
# Get mapping of chars to glyph ids for all specified chars
character_map = cmap.get_character_map(chars)
extra_glyphs = set()
if b'GSUB' in sfnt:
# Parse all substitution rules to ensure that glyphs that can be
# substituted for the specified set of glyphs are not removed
gsub = sfnt[b'GSUB']
try:
gsub.decompile()
extra_glyphs = gsub.all_substitutions(itervalues(character_map))
except UnsupportedFont as e:
            warn('Unsupported GSUB table: %s'%e)
except Exception:
warn('Failed to decompile GSUB table:', traceback.format_exc())
if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines
subset_truetype(sfnt, character_map, extra_glyphs)
elif b'CFF ' in sfnt:
# PostScript Outlines
subset_postscript(sfnt, character_map, extra_glyphs)
else:
raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines')
# Restrict the cmap table to only contain entries for the resolved glyphs
cmap.set_character_map(character_map)
if b'kern' in sfnt:
try:
sfnt[b'kern'].restrict_to_glyphs(frozenset(itervalues(character_map)))
except UnsupportedFont as e:
warn('kern table unsupported, ignoring: %s'%e)
except Exception:
warn('Subsetting of kern table failed, ignoring:',
traceback.format_exc())
raw, new_sizes = sfnt()
return raw, old_sizes, new_sizes
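subset() is the module's main entry point: it takes the raw font bytes, an iterable of individual characters, optional (start, end) character ranges and an optional warnings list, and returns the new bytes plus per-table sizes before and after. For example (font paths are illustrative):

with open('LiberationSerif-Regular.ttf', 'rb') as f:
    raw = f.read()
warnings = []
new_raw, old_sizes, new_sizes = subset(raw, 'Hello', ranges=[('0', '9')],
                                       warnings=warnings)
with open('LiberationSerif-Subset.ttf', 'wb') as f:
    f.write(new_raw)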
# CLI {{{
def option_parser():
import textwrap
from calibre.utils.config import OptionParser
parser = OptionParser(usage=textwrap.dedent('''\
%prog [options] input_font_file output_font_file characters_to_keep
Subset the specified font, keeping only the glyphs for the characters in
characters_to_keep. characters_to_keep is a comma separated list of characters of
the form: a,b,c,A-Z,0-9,xyz
You can specify ranges in the list of characters, as shown above.
'''))
parser.add_option('-c', '--codes', default=False, action='store_true',
help='If specified, the list of characters is interpreted as '
'numeric unicode codes instead of characters. So to specify the '
'characters a,b you would use 97,98 or U+0061,U+0062')
parser.prog = 'subset-font'
return parser
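Because main() below slices args[1:], the parser is effectively handed the full argv, and the command line can also be exercised programmatically; the first list element only plays the role of the program name. Two illustrative calls, assuming calibre's OptionParser follows optparse semantics:

main(['subset-font', 'LiberationSerif-Regular.ttf', 'subset.ttf', 'a-z,A-Z,0-9'])
main(['subset-font', '-c', 'LiberationSerif-Regular.ttf', 'subset.ttf', 'U+0020-U+007E'])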
def print_stats(old_stats, new_stats):
from calibre import prints
prints('========= Table comparison (original vs. subset) =========')
prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size',
' New Percent')
prints('='*80)
old_total = sum(itervalues(old_stats))
new_total = sum(itervalues(new_stats))
tables = sorted(old_stats, key=lambda x:old_stats[x],
reverse=True)
for table in tables:
osz = old_stats[table]
op = osz/old_total * 100
nsz = new_stats.get(table, 0)
np = nsz/new_total * 100
suffix = ' | same size'
if nsz != osz:
suffix = ' | reduced to %.1f %%'%(nsz/osz * 100)
prints('%4s'%table, ' ', '%10s'%osz, ' ', '%5.1f %%'%op, ' ',
'%10s'%nsz, ' ', '%5.1f %%'%np, suffix)
prints('='*80)
def main(args):
import sys, time
from calibre import prints
parser = option_parser()
opts, args = parser.parse_args(args)
    if len(args) != 4:
parser.print_help()
raise SystemExit(1)
iff, off, chars = args[1:]
with open(iff, 'rb') as f:
orig = f.read()
    chars = chars.split(',')
individual, ranges = set(), set()
def not_single(c):
if len(c) > 1:
prints(c, 'is not a single character', file=sys.stderr)
raise SystemExit(1)
def conv_code(c):
if c.upper()[:2] in ('U+', '0X'):
c = int(c[2:], 16)
return safe_chr(int(c))
for c in chars:
if '-' in c:
parts = [x.strip() for x in c.split('-')]
if len(parts) != 2:
prints('Invalid range:', c, file=sys.stderr)
raise SystemExit(1)
if opts.codes:
parts = tuple(map(conv_code, parts))
tuple(map(not_single, parts))
ranges.add(tuple(parts))
else:
if opts.codes:
c = conv_code(c)
not_single(c)
individual.add(c)
st = time.time()
sf, old_stats, new_stats = subset(orig, individual, ranges)
taken = time.time() - st
reduced = (len(sf)/len(orig)) * 100
def sz(x):
return '%gKB'%(len(x)/1024.)
print_stats(old_stats, new_stats)
prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced))
prints('Subsetting took %g seconds'%taken)
with open(off, 'wb') as f:
f.write(sf)
prints('Subset font written to:', off)
if __name__ == '__main__':
try:
import init_calibre
init_calibre
except ImportError:
pass
import sys
main(sys.argv)
# }}}
# Tests {{{
def test_mem():
from calibre.utils.mem import memory
import gc
gc.collect()
start_mem = memory()
raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
calls = 1000
for i in range(calls):
subset(raw, (), (('a', 'z'),))
del raw
for i in range(3):
gc.collect()
print('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB')
def test():
raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
if len(sf) > 0.3 * len(raw):
raise Exception('Subsetting failed')
def all():
from calibre.utils.fonts.scanner import font_scanner
failed = []
unsupported = []
warnings = {}
total = 0
averages = []
for family in font_scanner.find_font_families():
for font in font_scanner.fonts_for_family(family):
raw = font_scanner.get_font_data(font)
print('Subsetting', font['full_name'], end='\t')
total += 1
try:
w = []
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')),
(), w)
if w:
warnings[font['full_name'] + ' (%s)'%font['path']] = w
except NoGlyphs:
print('No glyphs!')
continue
except UnsupportedFont as e:
unsupported.append((font['full_name'], font['path'], unicode_type(e)))
print('Unsupported!')
continue
except Exception as e:
print('Failed!')
failed.append((font['full_name'], font['path'], unicode_type(e)))
else:
averages.append(sum(itervalues(new_stats))/sum(itervalues(old_stats)) * 100)
print('Reduced to:', '%.1f'%averages[-1] , '%')
if unsupported:
print('\n\nUnsupported:')
for name, path, err in unsupported:
print(name, path, err)
print()
if warnings:
print('\n\nWarnings:')
for name, w in iteritems(warnings):
if w:
print(name)
print('', '\n\t'.join(w), sep='\t')
if failed:
print('\n\nFailures:')
for name, path, err in failed:
print(name, path, err)
print()
print('Average reduction to: %.1f%%'%(sum(averages)/len(averages)))
print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
len(failed), 'Warnings:', len(warnings))
# }}}