Added docx writer related modules

2020-04-13 16:33:15 +02:00
parent ae80ae5640
commit 98b2dd8d4f
29 changed files with 5956 additions and 0 deletions
@@ -0,0 +1,9 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+
@@ -0,0 +1,281 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import textwrap, os
+
+from lxml import etree
+from lxml.builder import ElementMaker
+
+from calibre import guess_type
+from calibre.constants import numeric_version, __appname__
+from calibre.ebooks.docx.names import DOCXNamespace
+from calibre.ebooks.metadata import authors_to_string
+from calibre.ebooks.pdf.render.common import PAPER_SIZES
+from calibre.utils.date import utcnow
+from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+from calibre.utils.zipfile import ZipFile
+from polyglot.builtins import iteritems, map, unicode_type, native_string_type
+
+
+def xml2str(root, pretty_print=False, with_tail=False):
+    if hasattr(etree, 'cleanup_namespaces'):
+        etree.cleanup_namespaces(root)
+    ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
+                          pretty_print=pretty_print, with_tail=with_tail)
+    return ans
+
+
+def page_size(opts):
+    width, height = PAPER_SIZES[opts.docx_page_size]
+    if opts.docx_custom_page_size is not None:
+        width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2])
+    return width, height
+
+
+def page_margin(opts, which):
+    val = getattr(opts, 'docx_page_margin_' + which)
+    if val == 0.0:
+        val = getattr(opts, 'margin_' + which)
+    return val
+
+
+def page_effective_area(opts):
+    width, height = page_size(opts)
+    width -= page_margin(opts, 'left') + page_margin(opts, 'right')
+    height -= page_margin(opts, 'top') + page_margin(opts, 'bottom')
+    return width, height  # in pts
+
+
+def create_skeleton(opts, namespaces=None):
+    namespaces = namespaces or DOCXNamespace().namespaces
+
+    def w(x):
+        return '{%s}%s' % (namespaces['w'], x)
+    dn = {k:v for k, v in iteritems(namespaces) if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
+    E = ElementMaker(namespace=dn['w'], nsmap=dn)
+    doc = E.document()
+    body = E.body()
+    doc.append(body)
+    width, height = page_size(opts)
+    width, height = int(20 * width), int(20 * height)
+
+    def margin(which):
+        val = page_margin(opts, which)
+        return w(which), unicode_type(int(val * 20))
+    body.append(E.sectPr(
+        E.pgSz(**{w('w'):unicode_type(width), w('h'):unicode_type(height)}),
+        E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
+        E.cols(**{w('space'):'720'}),
+        E.docGrid(**{w('linePitch'):"360"}),
+    ))
+
+    dn = {k:v for k, v in iteritems(namespaces) if k in tuple('wra') + ('wp',)}
+    E = ElementMaker(namespace=dn['w'], nsmap=dn)
+    styles = E.styles(
+        E.docDefaults(
+            E.rPrDefault(
+                E.rPr(
+                    E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
+                    E.sz(**{w('val'):'22'}),
+                    E.szCs(**{w('val'):'22'}),
+                    E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
+                )
+            ),
+            E.pPrDefault(
+                E.pPr(
+                    E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
+                )
+            )
+        )
+    )
+    return doc, styles, body
+
+
+def update_doc_props(root, mi, namespace):
+    def setm(name, text=None, ns='dc'):
+        ans = root.makeelement('{%s}%s' % (namespace.namespaces[ns], name))
+        for child in tuple(root):
+            if child.tag == ans.tag:
+                root.remove(child)
+        ans.text = text
+        root.append(ans)
+        return ans
+    setm('title', mi.title)
+    setm('creator', authors_to_string(mi.authors))
+    if mi.tags:
+        setm('keywords', ', '.join(mi.tags), ns='cp')
+    if mi.comments:
+        setm('description', mi.comments)
+    if mi.languages:
+        l = canonicalize_lang(mi.languages[0])
+        setm('language', lang_as_iso639_1(l) or l)
+
+
+class DocumentRelationships(object):
+
+    def __init__(self, namespace):
+        self.rmap = {}
+        self.namespace = namespace
+        for typ, target in iteritems({
+                namespace.names['STYLES']: 'styles.xml',
+                namespace.names['NUMBERING']: 'numbering.xml',
+                namespace.names['WEB_SETTINGS']: 'webSettings.xml',
+                namespace.names['FONTS']: 'fontTable.xml',
+        }):
+            self.add_relationship(target, typ)
+
+    def get_relationship_id(self, target, rtype, target_mode=None):
+        return self.rmap.get((target, rtype, target_mode))
+
+    def add_relationship(self, target, rtype, target_mode=None):
+        ans = self.get_relationship_id(target, rtype, target_mode)
+        if ans is None:
+            ans = 'rId%d' % (len(self.rmap) + 1)
+            self.rmap[(target, rtype, target_mode)] = ans
+        return ans
+
+    def add_image(self, target):
+        return self.add_relationship(target, self.namespace.names['IMAGES'])
+
+    def serialize(self):
+        namespaces = self.namespace.namespaces
+        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
+        relationships = E.Relationships()
+        for (target, rtype, target_mode), rid in iteritems(self.rmap):
+            r = E.Relationship(Id=rid, Type=rtype, Target=target)
+            if target_mode is not None:
+                r.set('TargetMode', target_mode)
+            relationships.append(r)
+        return xml2str(relationships)
+
+
+class DOCX(object):
+
+    def __init__(self, opts, log):
+        self.namespace = DOCXNamespace()
+        namespaces = self.namespace.namespaces
+        self.opts, self.log = opts, log
+        self.document_relationships = DocumentRelationships(self.namespace)
+        self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
+        self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
+        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
+        self.embedded_fonts = E.Relationships()
+        self.fonts = {}
+        self.images = {}
+
+    # Boilerplate {{{
+    @property
+    def contenttypes(self):
+        E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
+        types = E.Types()
+        for partname, mt in iteritems({
+            "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
+            "/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
+            "/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
+            "/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
+            "/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
+            "/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
+            "/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
+            "/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
+            "/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
+            "/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
+            "/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
+        }):
+            types.append(E.Override(PartName=partname, ContentType=mt))
+        added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
+        for ext in added:
+            types.append(E.Default(Extension=ext, ContentType=guess_type('a.'+ext)[0]))
+        for ext, mt in iteritems({
+            "rels": "application/vnd.openxmlformats-package.relationships+xml",
+            "odttf": "application/vnd.openxmlformats-officedocument.obfuscatedFont",
+        }):
+            added.add(ext)
+            types.append(E.Default(Extension=ext, ContentType=mt))
+        for fname in self.images:
+            ext = fname.rpartition(os.extsep)[-1]
+            if ext not in added:
+                added.add(ext)
+                mt = guess_type('a.' + ext)[0]
+                if mt:
+                    types.append(E.Default(Extension=ext, ContentType=mt))
+        return xml2str(types)
+
+    @property
+    def appproperties(self):
+        E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
+        props = E.Properties(
+            E.Application(__appname__),
+            E.AppVersion('%02d.%04d' % numeric_version[:2]),
+            E.DocSecurity('0'),
+            E.HyperlinksChanged('false'),
+            E.LinksUpToDate('true'),
+            E.ScaleCrop('false'),
+            E.SharedDoc('false'),
+        )
+        if self.mi.publisher:
+            props.append(E.Company(self.mi.publisher))
+        return xml2str(props)
+
+    @property
+    def containerrels(self):
+        return textwrap.dedent('''\
+        <?xml version='1.0' encoding='utf-8'?>
+        <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+            <Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
+            <Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
+            <Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
+        </Relationships>'''.format(**self.namespace.names)).encode('utf-8')
+
+    @property
+    def websettings(self):
+        E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
+        ws = E.webSettings(
+            E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
+        return xml2str(ws)
+
+    # }}}
+
+    def convert_metadata(self, mi):
+        namespaces = self.namespace.namespaces
+        E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
+        cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
+        ts = utcnow().isoformat(native_string_type('T')).rpartition('.')[0] + 'Z'
+        for x in 'created modified'.split():
+            x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
+            x.text = ts
+            cp.append(x)
+        self.mi = mi
+        update_doc_props(cp, self.mi, self.namespace)
+        return xml2str(cp)
+
+    def create_empty_document(self, mi):
+        self.document, self.styles = create_skeleton(self.opts)[:2]
+
+    def write(self, path_or_stream, mi, create_empty_document=False):
+        if create_empty_document:
+            self.create_empty_document(mi)
+        with ZipFile(path_or_stream, 'w') as zf:
+            zf.writestr('[Content_Types].xml', self.contenttypes)
+            zf.writestr('_rels/.rels', self.containerrels)
+            zf.writestr('docProps/core.xml', self.convert_metadata(mi))
+            zf.writestr('docProps/app.xml', self.appproperties)
+            zf.writestr('word/webSettings.xml', self.websettings)
+            zf.writestr('word/document.xml', xml2str(self.document))
+            zf.writestr('word/styles.xml', xml2str(self.styles))
+            zf.writestr('word/numbering.xml', xml2str(self.numbering))
+            zf.writestr('word/fontTable.xml', xml2str(self.font_table))
+            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
+            zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
+            for fname, data_getter in iteritems(self.images):
+                zf.writestr(fname, data_getter())
+            for fname, data in iteritems(self.fonts):
+                zf.writestr(fname, data)
+
+
+if __name__ == '__main__':
+    # Debugging aid: when run as a script, build a DOCX without options or a
+    # log and print its serialized web settings part
+    d = DOCX(None, None)
+    print(d.websettings)
@@ -0,0 +1,78 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import defaultdict
+from uuid import uuid4
+
+from calibre.ebooks.oeb.base import OEB_STYLES
+from calibre.ebooks.oeb.transforms.subset import find_font_face_rules
+from polyglot.builtins import range
+
+
+def obfuscate_font_data(data, key):
+    prefix = bytearray(data[:32])
+    key = bytearray(reversed(key.bytes))
+    prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
+    return prefix + data[32:]
+
+
+class FontsManager(object):
+
+    def __init__(self, namespace, oeb, opts):
+        self.namespace = namespace
+        self.oeb, self.log, self.opts = oeb, oeb.log, opts
+
+    def serialize(self, text_styles, fonts, embed_relationships, font_data_map):
+        makeelement = self.namespace.makeelement
+        font_families, seen = set(), set()
+        for ts in text_styles:
+            if ts.font_family:
+                lf = ts.font_family.lower()
+                if lf not in seen:
+                    seen.add(lf)
+                    font_families.add(ts.font_family)
+        family_map = {}
+        for family in sorted(font_families):
+            family_map[family] = makeelement(fonts, 'w:font', w_name=family)
+
+        embedded_fonts = []
+        for item in self.oeb.manifest:
+            if item.media_type in OEB_STYLES and hasattr(item.data, 'cssRules'):
+                embedded_fonts.extend(find_font_face_rules(item, self.oeb))
+
+        num = 0
+        face_map = defaultdict(set)
+        rel_map = {}
+        for ef in embedded_fonts:
+            ff = ef['font-family'][0]
+            if ff not in font_families:
+                continue
+            num += 1
+            bold = ef['weight'] > 400
+            italic = ef['font-style'] != 'normal'
+            tag = 'Regular'
+            if bold or italic:
+                tag = 'Italic'
+                if bold and italic:
+                    tag = 'BoldItalic'
+                elif bold:
+                    tag = 'Bold'
+            if tag in face_map[ff]:
+                continue
+            face_map[ff].add(tag)
+            font = family_map[ff]
+            key = uuid4()
+            item = ef['item']
+            rid = rel_map.get(item)
+            if rid is None:
+                rel_map[item] = rid = 'rId%d' % num
+                fname = 'fonts/font%d.odttf' % num
+                makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname)
+                font_data_map['word/' + fname] = obfuscate_font_data(item.data, key)
+            makeelement(font, 'w:embed' + tag, r_id=rid,
+                        w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(),
+                        w_subsetted="true" if self.opts.subset_embedded_fonts else "false")
@@ -0,0 +1,617 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+from collections import Counter
+
+from calibre.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area
+from calibre.ebooks.docx.writer.styles import StylesManager, FloatSpec
+from calibre.ebooks.docx.writer.links import LinksManager
+from calibre.ebooks.docx.writer.images import ImagesManager
+from calibre.ebooks.docx.writer.fonts import FontsManager
+from calibre.ebooks.docx.writer.tables import Table
+from calibre.ebooks.docx.writer.lists import ListsManager
+from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
+from calibre.ebooks.oeb.base import XPath, barename
+from calibre.utils.localization import lang_as_iso639_1
+from polyglot.builtins import unicode_type, string_or_bytes
+
+
+def lang_for_tag(tag):
+    for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'):
+        val = lang_as_iso639_1(tag.get(attr))
+        if val:
+            return val
+
+
+class Style(St):
+
+    def __init__(self, *args, **kwargs):
+        St.__init__(self, *args, **kwargs)
+        self._letterSpacing = None
+
+    @property
+    def letterSpacing(self):
+        if self._letterSpacing is not None:
+            val = self._get('letter-spacing')
+            if val == 'normal':
+                self._letterSpacing = val
+            else:
+                self._letterSpacing = self._unit_convert(val)
+        return self._letterSpacing
+
+
+class Stylizer(Sz):
+
+    def style(self, element):
+        try:
+            return self._styles[element]
+        except KeyError:
+            return Style(element, self)
+
+
+class TextRun(object):
+
+    ws_pat = None
+
+    def __init__(self, namespace, style, first_html_parent, lang=None):
+        self.first_html_parent = first_html_parent
+        if self.ws_pat is None:
+            TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
+        self.style = style
+        self.texts = []
+        self.link = None
+        self.lang = lang
+        self.parent_style = None
+        self.makeelement = namespace.makeelement
+        self.descendant_style = None
+
+    def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
+        if not preserve_whitespace:
+            text = self.ws_pat.sub(' ', text)
+            if text.strip() != text:
+                # If preserve_whitespace is False, Word ignores leading and
+                # trailing whitespace
+                preserve_whitespace = True
+        self.texts.append((text, preserve_whitespace, bookmark))
+        self.link = link
+
+    def add_break(self, clear='none', bookmark=None):
+        self.texts.append((None, clear, bookmark))
+
+    def add_image(self, drawing, bookmark=None):
+        self.texts.append((drawing, None, bookmark))
+
+    def serialize(self, p, links_manager):
+        makeelement = self.makeelement
+        parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link)
+        r = makeelement(parent, 'w:r')
+        rpr = makeelement(r, 'w:rPr', append=False)
+        if getattr(self.descendant_style, 'id', None) is not None:
+            makeelement(rpr, 'w:rStyle', w_val=self.descendant_style.id)
+        if self.lang:
+            makeelement(rpr, 'w:lang', w_bidi=self.lang, w_val=self.lang, w_eastAsia=self.lang)
+        if len(rpr) > 0:
+            r.append(rpr)
+
+        for text, preserve_whitespace, bookmark in self.texts:
+            if bookmark is not None:
+                bid = links_manager.bookmark_id
+                makeelement(r, 'w:bookmarkStart', w_id=unicode_type(bid), w_name=bookmark)
+            if text is None:
+                makeelement(r, 'w:br', w_clear=preserve_whitespace)
+            elif hasattr(text, 'xpath'):
+                r.append(text)
+            else:
+                t = makeelement(r, 'w:t')
+                t.text = text or ''
+                if preserve_whitespace:
+                    t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+            if bookmark is not None:
+                makeelement(r, 'w:bookmarkEnd', w_id=unicode_type(bid))
+
+    def __repr__(self):
+        return repr(self.texts)
+
+    def is_empty(self):
+        if not self.texts:
+            return True
+        if len(self.texts) == 1 and self.texts[0][:2] == ('', False):
+            return True
+        return False
+
+    @property
+    def style_weight(self):
+        ans = 0
+        for text, preserve_whitespace, bookmark in self.texts:
+            if isinstance(text, unicode_type):
+                ans += len(text)
+        return ans
+
+
+class Block(object):
+
+    def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None):
+        self.force_not_empty = False
+        self.namespace = namespace
+        self.bookmarks = set()
+        self.list_tag = (html_block, style) if is_list_item else None
+        self.is_first_block = False
+        self.numbering_id = None
+        self.parent_items = None
+        self.html_block = html_block
+        self.html_tag = barename(html_block.tag)
+        self.float_spec = float_spec
+        if float_spec is not None:
+            float_spec.blocks.append(self)
+        self.html_style = style
+        self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
+        self.styles_manager, self.links_manager = styles_manager, links_manager
+        self.keep_next = False
+        self.runs = []
+        self.skipped = False
+        self.linked_style = None
+        self.page_break_before = style['page-break-before'] == 'always'
+        self.keep_lines = style['page-break-inside'] == 'avoid'
+        self.page_break_after = False
+        self.block_lang = None
+
+    def resolve_skipped(self, next_block):
+        if not self.is_empty():
+            return
+        if len(self.html_block) > 0 and self.html_block[0] is next_block.html_block:
+            self.skipped = True
+            if self.list_tag is not None:
+                next_block.list_tag = self.list_tag
+
+    def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False, bookmark=None, link=None, lang=None):
+        ws = style['white-space']
+        preserve_whitespace = ws in {'pre', 'pre-wrap', '-o-pre-wrap'}
+        ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style)
+        if self.runs and ts == self.runs[-1].style and link == self.runs[-1].link and lang == self.runs[-1].lang:
+            run = self.runs[-1]
+        else:
+            run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent, lang=lang)
+            self.runs.append(run)
+        if ignore_leading_whitespace and not preserve_whitespace:
+            text = text.lstrip()
+        if preserve_whitespace or ws == 'pre-line':
+            for text in text.splitlines():
+                run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
+                bookmark = None
+                run.add_break()
+        else:
+            run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
+
+    def add_break(self, clear='none', bookmark=None):
+        if self.runs:
+            run = self.runs[-1]
+        else:
+            run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
+            self.runs.append(run)
+        run.add_break(clear=clear, bookmark=bookmark)
+
+    def add_image(self, drawing, bookmark=None):
+        if self.runs:
+            run = self.runs[-1]
+        else:
+            run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
+            self.runs.append(run)
+        run.add_image(drawing, bookmark=bookmark)
+
+    def serialize(self, body):
+        makeelement = self.namespace.makeelement
+        p = makeelement(body, 'w:p')
+        end_bookmarks = []
+        for bmark in self.bookmarks:
+            end_bookmarks.append(unicode_type(self.links_manager.bookmark_id))
+            makeelement(p, 'w:bookmarkStart', w_id=end_bookmarks[-1], w_name=bmark)
+        if self.block_lang:
+            rpr = makeelement(p, 'w:rPr')
+            makeelement(rpr, 'w:lang', w_val=self.block_lang, w_bidi=self.block_lang, w_eastAsia=self.block_lang)
+
+        ppr = makeelement(p, 'w:pPr')
+        if self.keep_next:
+            makeelement(ppr, 'w:keepNext')
+        if self.float_spec is not None:
+            self.float_spec.serialize(self, ppr)
+        if self.numbering_id is not None:
+            numpr = makeelement(ppr, 'w:numPr')
+            makeelement(numpr, 'w:ilvl', w_val=unicode_type(self.numbering_id[1]))
+            makeelement(numpr, 'w:numId', w_val=unicode_type(self.numbering_id[0]))
+        if self.linked_style is not None:
+            makeelement(ppr, 'w:pStyle', w_val=self.linked_style.id)
+        elif self.style.id:
+            makeelement(ppr, 'w:pStyle', w_val=self.style.id)
+        if self.is_first_block:
+            makeelement(ppr, 'w:pageBreakBefore', w_val='off')
+        elif self.page_break_before:
+            makeelement(ppr, 'w:pageBreakBefore', w_val='on')
+        if self.keep_lines:
+            makeelement(ppr, 'w:keepLines', w_val='on')
+        for run in self.runs:
+            run.serialize(p, self.links_manager)
+        for bmark in end_bookmarks:
+            makeelement(p, 'w:bookmarkEnd', w_id=bmark)
+
+    def __repr__(self):
+        return 'Block(%r)' % self.runs
+    __str__ = __repr__
+
+    def is_empty(self):
+        if self.force_not_empty:
+            return False
+        for run in self.runs:
+            if not run.is_empty():
+                return False
+        return True
+
+
+class Blocks(object):
+
+    def __init__(self, namespace, styles_manager, links_manager):
+        self.top_bookmark = None
+        self.namespace = namespace
+        self.styles_manager = styles_manager
+        self.links_manager = links_manager
+        self.all_blocks = []
+        self.pos = 0
+        self.current_block = None
+        self.items = []
+        self.tables = []
+        self.current_table = None
+        self.open_html_blocks = set()
+        self.html_tag_start_blocks = {}
+
+    def current_or_new_block(self, html_tag, tag_style):
+        return self.current_block or self.start_new_block(html_tag, tag_style)
+
+    def end_current_block(self):
+        if self.current_block is not None:
+            self.all_blocks.append(self.current_block)
+            if self.current_table is not None and self.current_table.current_row is not None:
+                self.current_table.add_block(self.current_block)
+            else:
+                self.block_map[self.current_block] = len(self.items)
+                self.items.append(self.current_block)
+                self.current_block.parent_items = self.items
+        self.current_block = None
+
+    def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
+        parent_bg = None
+        if html_block is not None:
+            p = html_block.getparent()
+            b = self.html_tag_start_blocks.get(p)
+            if b is not None:
+                ps = self.styles_manager.styles_for_html_blocks.get(p)
+                if ps is not None and ps.background_color is not None:
+                    parent_bg = ps.background_color
+        self.end_current_block()
+        self.current_block = Block(
+            self.namespace, self.styles_manager, self.links_manager, html_block, style,
+            is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item,
+            parent_bg=parent_bg)
+        self.html_tag_start_blocks[html_block] = self.current_block
+        self.open_html_blocks.add(html_block)
+        return self.current_block
+
+    def start_new_table(self, html_tag, tag_style=None):
+        self.current_table = Table(self.namespace, html_tag, tag_style)
+        self.tables.append(self.current_table)
+
+    def start_new_row(self, html_tag, tag_style):
+        if self.current_table is None:
+            self.start_new_table(html_tag)
+        self.current_table.start_new_row(html_tag, tag_style)
+
+    def start_new_cell(self, html_tag, tag_style):
+        if self.current_table is None:
+            self.start_new_table(html_tag)
+        self.current_table.start_new_cell(html_tag, tag_style)
+
+    def finish_tag(self, html_tag):
+        if self.current_block is not None and html_tag in self.open_html_blocks:
+            start_block = self.html_tag_start_blocks.get(html_tag)
+            if start_block is not None and start_block.html_style['page-break-after'] == 'always':
+                self.current_block.page_break_after = True
+            self.end_current_block()
+            self.open_html_blocks.discard(html_tag)
+
+        if self.current_table is not None:
+            table_finished = self.current_table.finish_tag(html_tag)
+            if table_finished:
+                table = self.tables[-1]
+                del self.tables[-1]
+                if self.tables:
+                    self.current_table = self.tables[-1]
+                    self.current_table.add_table(table)
+                else:
+                    self.current_table = None
+                    self.block_map[table] = len(self.items)
+                    self.items.append(table)
+
+    def serialize(self, body):
+        for item in self.items:
+            item.serialize(body)
+
+    def delete_block_at(self, pos=None):
+        pos = self.pos if pos is None else pos
+        block = self.all_blocks[pos]
+        del self.all_blocks[pos]
+        bpos = self.block_map.pop(block, None)
+        if bpos is not None:
+            del self.items[bpos]
+        else:
+            items = self.items if block.parent_items is None else block.parent_items
+            items.remove(block)
+        block.parent_items = None
+        if block.float_spec is not None:
+            block.float_spec.blocks.remove(block)
+        try:
+            next_block = self.all_blocks[pos]
+            next_block.bookmarks.update(block.bookmarks)
+            for attr in 'page_break_after page_break_before'.split():
+                setattr(next_block, attr, getattr(block, attr))
+        except (IndexError, KeyError):
+            pass
+
+    def __enter__(self):
+        self.pos = len(self.all_blocks)
+        self.block_map = {}
+
+    def __exit__(self, etype, value, traceback):
+        if value is not None:
+            return  # Since there was an exception, the data structures are not in a consistent state
+        if self.current_block is not None:
+            self.all_blocks.append(self.current_block)
+        self.current_block = None
+        if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty():
+            # Delete the empty block corresponding to the <body> tag when the
+            # body tag has no inline content before its first sub-block
+            self.delete_block_at(self.pos)
+        if self.pos > 0 and self.pos < len(self.all_blocks):
+            # Insert a page break corresponding to the start of the html file
+            self.all_blocks[self.pos].page_break_before = True
+            if self.top_bookmark is not None:
+                self.all_blocks[self.pos].bookmarks.add(self.top_bookmark)
+        self.top_bookmark = None
+        self.block_map = {}
+
+    def apply_page_break_after(self):
+        for i, block in enumerate(self.all_blocks):
+            if block.page_break_after and i < len(self.all_blocks) - 1:
+                next_block = self.all_blocks[i + 1]
+                if next_block.parent_items is block.parent_items and block.parent_items is self.items:
+                    next_block.page_break_before = True
+
+    def resolve_language(self):
+        default_lang = self.styles_manager.document_lang
+        for block in self.all_blocks:
+            count = Counter()
+            for run in block.runs:
+                count[run.lang] += 1
+            if count:
+                block.block_lang = bl = count.most_common(1)[0][0]
+                for run in block.runs:
+                    if run.lang == bl:
+                        run.lang = None
+                if bl == default_lang:
+                    block.block_lang = None
+
+    def __repr__(self):
+        return 'Block(%r)' % self.runs
+
+
+class Convert(object):
+    '''
+    Callable that drives the conversion of an OEB book into DOCX content.
+
+    Calling an instance processes every spine item of ``oeb`` into blocks,
+    post-processes those blocks and finally serializes the result onto the
+    ``docx`` object (see write()).
+    '''
+
+    # Word does not apply default styling to hyperlinks, so we ensure they get
+    # default styling (the conversion pipeline does not apply any styling to
+    # them).
+    base_css = '''
+    a[href] { text-decoration: underline; color: blue }
+    '''
+
+    def __init__(self, oeb, docx, mi, add_cover, add_toc):
+        '''
+        Store the conversion inputs. The log and the options are taken from
+        ``docx``; the output profile of the options is updated with the size
+        returned by page_effective_area().
+        '''
+        self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
+        self.log, self.opts = docx.log, docx.opts
+        self.mi = mi
+        self.cover_img = None
+        p = self.opts.output_profile
+        p.width_pts, p.height_pts = page_effective_area(self.opts)
+
+    def __call__(self):
+        '''
+        Run the whole conversion: rasterize SVG, create the helper managers,
+        convert every spine item, post-process the blocks and call write().
+        '''
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
+        # The stylizer cache of the rasterizer is reused by process_item()
+        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
+        self.svg_rasterizer(self.oeb, self.opts)
+
+        self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
+        self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log)
+        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
+        self.lists_manager = ListsManager(self.docx)
+        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
+        self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
+        self.current_link = self.current_lang = None
+
+        for item in self.oeb.spine:
+            self.log.debug('Processing', item.href)
+            self.process_item(item)
+        if self.add_toc:
+            self.links_manager.process_toc_links(self.oeb)
+
+        # The cover is only read when it was requested and the id named in the
+        # metadata is present in the manifest
+        if self.add_cover and self.oeb.metadata.cover and unicode_type(self.oeb.metadata.cover[0]) in self.oeb.manifest.ids:
+            cover_id = unicode_type(self.oeb.metadata.cover[0])
+            item = self.oeb.manifest.ids[cover_id]
+            self.cover_img = self.images_manager.read_image(item.href)
+
+        all_blocks = self.blocks.all_blocks
+        remove_blocks = []
+        # Every block except the last one resolves its skipped state against
+        # the block that follows it; the loop stops at the last block.
+        for i, block in enumerate(all_blocks):
+            try:
+                nb = all_blocks[i+1]
+            except IndexError:
+                break
+            block.resolve_skipped(nb)
+            if block.skipped:
+                remove_blocks.append((i, block))
+        # Delete from the end so that the recorded positions of the earlier
+        # blocks stay valid
+        for pos, block in reversed(remove_blocks):
+            self.blocks.delete_block_at(pos)
+        # NOTE(review): this raises IndexError when no blocks were produced at
+        # all -- confirm that an empty book cannot reach this point.
+        self.blocks.all_blocks[0].is_first_block = True
+        self.blocks.apply_page_break_after()
+        self.blocks.resolve_language()
+
+        if self.cover_img is not None:
+            # Replace the image record with the cover markup built for the
+            # page size
+            self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts))
+        self.lists_manager.finalize(all_blocks)
+        self.styles_manager.finalize(all_blocks)
+        self.write()
+
+    def process_item(self, item):
+        '''
+        Convert one spine item: every <body> found in its parsed data is
+        processed inside its own ``with self.blocks`` section.
+        '''
+        self.current_item = item
+        # Reuse the stylizer created by the SVG rasterizer when there is one
+        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
+        if stylizer is None:
+            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css)
+        self.abshref = self.images_manager.abshref = item.abshref
+
+        # Language of the item, falling back to the document language
+        self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang
+        for i, body in enumerate(XPath('//h:body')(item.data)):
+            with self.blocks:
+                self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body)
+                self.process_tag(body, stylizer, is_first_tag=i == 0)
+
+    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
+        '''
+        Convert ``html_tag``, its children and its tail text.
+
+        :param is_first_tag: True for the first <body> of an item (see
+            process_item()); floating and the tail are ignored for that tag.
+        :param float_spec: FloatSpec passed down from a floating ancestor, or
+            None.
+        '''
+        tagname = barename(html_tag.tag)
+        tag_style = stylizer.style(html_tag)
+        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
+        display = tag_style._get('display')
+        is_block = False
+
+        if not ignore_tag_contents:
+            # The current link and language are restored once this tag and its
+            # children have been processed
+            previous_link = self.current_link
+            if tagname == 'a' and html_tag.get('href'):
+                self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
+            previous_lang = self.current_lang
+            tag_lang = lang_for_tag(html_tag)
+            if tag_lang:
+                self.current_lang = tag_lang
+
+            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
+            if float_spec is None and is_float:
+                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)
+
+            if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we don't want to start a new paragraph
+                if is_float and float_spec.is_dropcaps:
+                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
+                    # The float spec is not passed on to the children
+                    float_spec = None
+                else:
+                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
+            elif display == 'list-item':
+                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
+            elif display.startswith('table') or display == 'inline-table':
+                # Table display values other than the three handled below add
+                # nothing for the tag itself; the children are still processed
+                if display == 'table-cell':
+                    self.blocks.start_new_cell(html_tag, tag_style)
+                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
+                elif display == 'table-row':
+                    self.blocks.start_new_row(html_tag, tag_style)
+                elif display in {'table', 'inline-table'}:
+                    self.blocks.end_current_block()
+                    self.blocks.start_new_table(html_tag, tag_style)
+            else:
+                if tagname == 'img' and is_float:
+                    # Image is floating so don't start a new paragraph for it
+                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
+                else:
+                    if tagname == 'hr':
+                        # Remove the border style from every edge except the top
+                        for edge in 'right bottom left'.split():
+                            tag_style.set('border-%s-style' % edge, 'none')
+                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
+
+            for child in html_tag.iterchildren():
+                if isinstance(getattr(child, 'tag', None), string_or_bytes):
+                    self.process_tag(child, stylizer, float_spec=float_spec)
+                else:  # Comment/PI/etc.
+                    # Only the text following such a node is kept
+                    tail = getattr(child, 'tail', None)
+                    if tail:
+                        block = self.create_block_from_parent(html_tag, stylizer)
+                        block.add_text(tail, tag_style, is_parent_style=False, link=self.current_link, lang=self.current_lang)
+
+            # Membership is read before finish_tag() is called for this tag
+            is_block = html_tag in self.blocks.open_html_blocks
+            self.blocks.finish_tag(html_tag)
+            if is_block and tag_style['page-break-after'] == 'avoid':
+                self.blocks.all_blocks[-1].keep_next = True
+
+            self.current_link = previous_link
+            self.current_lang = previous_lang
+
+        # Now, process the tail if any
+
+        if display == 'table-row':
+            return  # We ignore the tail for these tags
+
+        ignore_whitespace_tail = is_block or display.startswith('table')
+        if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
+            # Ignore trailing space after a block tag, as otherwise it will
+            # become a new empty paragraph
+            block = self.create_block_from_parent(html_tag, stylizer)
+            block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link, lang=self.current_lang)
+
+    def create_block_from_parent(self, html_tag, stylizer):
+        '''
+        Return the block obtained from current_or_new_block() for the parent
+        of ``html_tag``, with page_break_before reset to False.
+        '''
+        parent = html_tag.getparent()
+        block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
+        # Do not inherit page-break-before from parent
+        block.page_break_before = False
+        return block
+
+    def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
+        '''
+        Start a new block for ``html_tag``, bookmark it when the tag has an
+        ``id`` or ``name`` attribute, and add its image (for <img>) or its
+        leading text to the block.
+        '''
+        block = self.blocks.start_new_block(
+            html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
+        anchor = html_tag.get('id') or html_tag.get('name')
+        if anchor:
+            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
+        if tagname == 'img':
+            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
+        else:
+            text = html_tag.text
+            if text:
+                block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang)
+            elif tagname == 'li' and len(html_tag) and barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]):
+                # An <li> without text whose first child is a non-empty list
+                block.force_not_empty = True
+
+    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
+        '''
+        Add the content of an inline tag (a line break, an image or text) to
+        the block of its parent, bookmarking it when the tag has an ``id`` or
+        ``name`` attribute.
+        '''
+        anchor = html_tag.get('id') or html_tag.get('name') or None
+        bmark = None
+        if anchor:
+            bmark = self.bookmark_for_anchor(anchor, html_tag)
+        if tagname == 'br':
+            # A <br> without tail text that is the last element child of its
+            # parent adds nothing
+            if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
+                block = self.create_block_from_parent(html_tag, stylizer)
+                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
+        elif tagname == 'img':
+            block = self.create_block_from_parent(html_tag, stylizer)
+            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
+        else:
+            if html_tag.text:
+                block = self.create_block_from_parent(html_tag, stylizer)
+                block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
+            elif bmark:
+                # No text, but the bookmark still has to be attached to something
+                block = self.create_block_from_parent(html_tag, stylizer)
+                block.add_text('', tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
+
+    def bookmark_for_anchor(self, anchor, html_tag):
+        '''Return the bookmark name of ``anchor`` in the item being processed.'''
+        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)
+
+    def write(self):
+        '''
+        Create the document skeleton, serialize the blocks into its body and
+        let every manager serialize its part onto ``self.docx``.
+        '''
+        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
+        self.blocks.serialize(body)
+        body.append(body[0])  # Move <sectPr> to the end
+        if self.links_manager.toc:
+            self.links_manager.serialize_toc(body, self.styles_manager.primary_heading_style)
+        if self.cover_img is not None:
+            self.images_manager.write_cover_block(body, self.cover_img)
+        self.styles_manager.serialize(self.docx.styles)
+        self.images_manager.serialize(self.docx.images)
+        self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
+        self.lists_manager.serialize(self.docx.numbering)
@@ -0,0 +1,219 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+import posixpath
+from collections import namedtuple
+from functools import partial
+from polyglot.builtins import iteritems, itervalues, map, unicode_type
+
+from lxml import etree
+
+from calibre import fit_image
+from calibre.ebooks.oeb.base import urlunquote
+from calibre.ebooks.docx.images import pt_to_emu
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.imghdr import identify
+
+# Record kept for every image that has been read: relationship id, media file
+# name, width and height as reported by identify(), image format and the
+# manifest item that holds the data.
+Image = namedtuple('Image', 'rid fname width height fmt item')
+
+
+def as_num(x):
+    try:
+        return float(x)
+    except Exception:
+        pass
+    return 0
+
+
+def get_image_margins(style):
+    ans = {}
+    for edge in 'Left Right Top Bottom'.split():
+        val = as_num(getattr(style, 'padding' + edge)) + as_num(getattr(style, 'margin' + edge))
+        ans['dist' + edge[0]] = unicode_type(pt_to_emu(val))
+    return ans
+
+
+class ImagesManager(object):
+
+    def __init__(self, oeb, document_relationships, opts):
+        self.oeb, self.log = oeb, oeb.log
+        self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts
+        self.images = {}
+        self.seen_filenames = set()
+        self.document_relationships = document_relationships
+        self.count = 0
+
+    def read_image(self, href):
+        if href not in self.images:
+            item = self.oeb.manifest.hrefs.get(href)
+            if item is None or not isinstance(item.data, bytes):
+                return
+            try:
+                fmt, width, height = identify(item.data)
+            except Exception:
+                self.log.warning('Replacing corrupted image with blank: %s' % href)
+                item.data = I('blank.png', data=True, allow_user_override=False)
+                fmt, width, height = identify(item.data)
+            image_fname = 'media/' + self.create_filename(href, fmt)
+            image_rid = self.document_relationships.add_image(image_fname)
+            self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
+            item.unload_data_from_memory()
+        return self.images[href]
+
+    def add_image(self, img, block, stylizer, bookmark=None, as_block=False):
+        src = img.get('src')
+        if not src:
+            return
+        href = self.abshref(src)
+        try:
+            rid = self.read_image(href).rid
+        except AttributeError:
+            return
+        drawing = self.create_image_markup(img, stylizer, href, as_block=as_block)
+        block.add_image(drawing, bookmark=bookmark)
+        return rid
+
+    def create_image_markup(self, html_img, stylizer, href, as_block=False):
+        # TODO: img inside a link (clickable image)
+        style = stylizer.style(html_img)
+        floating = style['float']
+        if floating not in {'left', 'right'}:
+            floating = None
+        if as_block:
+            ml, mr = style._get('margin-left'), style._get('margin-right')
+            if ml == 'auto':
+                floating = 'center' if mr == 'auto' else 'right'
+            if mr == 'auto':
+                floating = 'center' if ml == 'auto' else 'right'
+        else:
+            parent = html_img.getparent()
+            if len(parent) == 1 and not (parent.text or '').strip() and not (html_img.tail or '').strip():
+                pstyle = stylizer.style(parent)
+                if 'block' in pstyle['display']:
+                    # We have an inline image alone inside a block
+                    as_block = True
+                    floating = pstyle['float']
+                    if floating not in {'left', 'right'}:
+                        floating = None
+                        if pstyle['text-align'] in ('center', 'right'):
+                            floating = pstyle['text-align']
+                    floating = floating or 'left'
+        fake_margins = floating is None
+        self.count += 1
+        img = self.images[href]
+        name = urlunquote(posixpath.basename(href))
+        width, height = style.img_size(img.width, img.height)
+        scaled, width, height = fit_image(width, height, self.page_width, self.page_height)
+        width, height = map(pt_to_emu, (width, height))
+
+        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
+
+        root = etree.Element('root', nsmap=namespaces)
+        ans = makeelement(root, 'w:drawing', append=False)
+        if floating is None:
+            parent = makeelement(ans, 'wp:inline')
+        else:
+            parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
+            # The next three lines are boilerplate that Word requires, even
+            # though the DOCX specs define defaults for all of them
+            parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
+            parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
+            makeelement(parent, 'wp:simplePos', x='0', y='0')
+            makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
+            makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
+        makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
+        if fake_margins:
+            # DOCX does not support setting margins for inline images, so we
+            # fake it by using effect extents to simulate margins
+            makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))})
+        else:
+            makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
+        if floating is not None:
+            # The idiotic Word requires this to be after the extent settings
+            if as_block:
+                makeelement(parent, 'wp:wrapTopAndBottom')
+            else:
+                makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
+        self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height)
+        return ans
+
+    def create_docx_image_markup(self, parent, name, alt, img_rid, width, height):
+        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
+        makeelement(parent, 'wp:docPr', id=unicode_type(self.count), name=name, descr=alt)
+        makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1")
+        g = makeelement(parent, 'a:graphic')
+        gd = makeelement(g, 'a:graphicData', uri=namespaces['pic'])
+        pic = makeelement(gd, 'pic:pic')
+        nvPicPr = makeelement(pic, 'pic:nvPicPr')
+        makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt)
+        makeelement(nvPicPr, 'pic:cNvPicPr')
+        bf = makeelement(pic, 'pic:blipFill')
+        makeelement(bf, 'a:blip', r_embed=img_rid)
+        makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
+        spPr = makeelement(pic, 'pic:spPr')
+        xfrm = makeelement(spPr, 'a:xfrm')
+        makeelement(xfrm, 'a:off', x='0', y='0'), makeelement(xfrm, 'a:ext', cx=unicode_type(width), cy=unicode_type(height))
+        makeelement(makeelement(spPr, 'a:prstGeom', prst='rect'), 'a:avLst')
+
+    def create_filename(self, href, fmt):
+        fname = ascii_filename(urlunquote(posixpath.basename(href)))
+        fname = posixpath.splitext(fname)[0]
+        fname = fname[:75].rstrip('.') or 'image'
+        num = 0
+        base = fname
+        while fname.lower() in self.seen_filenames:
+            num += 1
+            fname = base + unicode_type(num)
+        self.seen_filenames.add(fname.lower())
+        fname += os.extsep + fmt.lower()
+        return fname
+
+    def serialize(self, images_map):
+        for img in itervalues(self.images):
+            images_map['word/' + img.fname] = partial(self.get_data, img.item)
+
+    def get_data(self, item):
+        try:
+            return item.data
+        finally:
+            item.unload_data_from_memory(False)
+
+    def create_cover_markup(self, img, preserve_aspect_ratio, width, height):
+        self.count += 1
+        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
+        if preserve_aspect_ratio:
+            if img.width >= img.height:
+                ar = img.height / img.width
+                height = ar * width
+            else:
+                ar = img.width / img.height
+                width = ar * height
+
+        root = etree.Element('root', nsmap=namespaces)
+        ans = makeelement(root, 'w:drawing', append=False)
+        parent = makeelement(ans, 'wp:anchor', **{'dist'+edge:'0' for edge in 'LRTB'})
+        parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
+        parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
+        makeelement(parent, 'wp:simplePos', x='0', y='0')
+        makeelement(makeelement(parent, 'wp:positionH', relativeFrom='page'), 'wp:align').text = 'center'
+        makeelement(makeelement(parent, 'wp:positionV', relativeFrom='page'), 'wp:align').text = 'center'
+        width, height = map(pt_to_emu, (width, height))
+        makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
+        makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
+        makeelement(parent, 'wp:wrapTopAndBottom')
+        self.create_docx_image_markup(parent, 'cover.jpg', _('Cover'), img.rid, width, height)
+        return ans
+
+    def write_cover_block(self, body, cover_image):
+        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
+        pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
+        pbb.set('{%s}val' % namespaces['w'], 'on')
+        p = makeelement(body, 'w:p', append=False)
+        body.insert(0, p)
+        r = makeelement(p, 'w:r')
+        r.append(cover_image)
@@ -0,0 +1,175 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import posixpath, re
+from uuid import uuid4
+
+from calibre.utils.filenames import ascii_text
+from polyglot.builtins import unicode_type
+from polyglot.urllib import urlparse
+
+
+def start_text(tag, prefix_len=0, top_level=True):
+    ans = tag.text or ''
+    limit = 50 - prefix_len
+    if len(ans) < limit:
+        for child in tag.iterchildren('*'):
+            ans += start_text(child, len(ans), top_level=False) + (child.tail or '')
+            if len(ans) >= limit:
+                break
+    if top_level and len(ans) > limit:
+        ans = ans[:limit] + '...'
+    return ans
+
+
+class TOCItem(object):
+
+    def __init__(self, title, bmark, level):
+        self.title, self.bmark, self.level = title, bmark, level
+        self.is_first = self.is_last = False
+
+    def serialize(self, body, makeelement):
+        p = makeelement(body, 'w:p', append=False)
+        ppr = makeelement(p, 'w:pPr')
+        makeelement(ppr, 'w:pStyle', w_val="Normal")
+        makeelement(ppr, 'w:ind', w_left='0', w_firstLineChars='0', w_firstLine='0', w_leftChars=unicode_type(200 * self.level))
+        if self.is_first:
+            makeelement(ppr, 'w:pageBreakBefore', w_val='off')
+            r = makeelement(p, 'w:r')
+            makeelement(r, 'w:fldChar', w_fldCharType='begin')
+            r = makeelement(p, 'w:r')
+            makeelement(r, 'w:instrText').text = r' TOC \h '
+            r[0].set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+            r = makeelement(p, 'w:r')
+            makeelement(r, 'w:fldChar', w_fldCharType='separate')
+        hl = makeelement(p, 'w:hyperlink', w_anchor=self.bmark)
+        r = makeelement(hl, 'w:r')
+        rpr = makeelement(r, 'w:rPr')
+        makeelement(rpr, 'w:color', w_val='0000FF', w_themeColor='hyperlink')
+        makeelement(rpr, 'w:u', w_val='single')
+        makeelement(r, 'w:t').text = self.title
+        if self.is_last:
+            r = makeelement(p, 'w:r')
+            makeelement(r, 'w:fldChar', w_fldCharType='end')
+        body.insert(0, p)
+
+
+def sanitize_bookmark_name(base):
+    # Max length allowed by Word appears to be 40, we use 32 to leave some
+    # space for making the name unique
+    return re.sub(r'[^0-9a-zA-Z]', '_', ascii_text(base))[:32].rstrip('_')
+
+
+class LinksManager(object):
+
+    def __init__(self, namespace, document_relationships, log):
+        self.namespace = namespace
+        self.log = log
+        self.document_relationships = document_relationships
+        self.top_anchor = unicode_type(uuid4().hex)
+        self.anchor_map = {}
+        self.used_bookmark_names = set()
+        self.bmark_id = 0
+        self.document_hrefs = set()
+        self.external_links = {}
+        self.toc = []
+
+    def bookmark_for_anchor(self, anchor, current_item, html_tag):
+        key = (current_item.href, anchor)
+        if key in self.anchor_map:
+            return self.anchor_map[key]
+        if anchor == self.top_anchor:
+            name = ('Top of %s' % posixpath.basename(current_item.href))
+            self.document_hrefs.add(current_item.href)
+        else:
+            name = start_text(html_tag).strip() or anchor
+        name = sanitize_bookmark_name(name)
+        i, bname = 0, name
+        while name in self.used_bookmark_names:
+            i += 1
+            name  = bname + ('_%d' % i)
+        self.anchor_map[key] = name
+        self.used_bookmark_names.add(name)
+        return name
+
+    @property
+    def bookmark_id(self):
+        self.bmark_id += 1
+        return self.bmark_id
+
+    def serialize_hyperlink(self, parent, link):
+        item, url, tooltip = link
+        purl = urlparse(url)
+        href = purl.path
+
+        def make_link(parent, anchor=None, id=None, tooltip=None):
+            kw = {}
+            if anchor is not None:
+                kw['w_anchor'] = anchor
+            elif id is not None:
+                kw['r_id'] = id
+            if tooltip:
+                kw['w_tooltip'] = tooltip
+            return self.namespace.makeelement(parent, 'w:hyperlink', **kw)
+
+        if not purl.scheme:
+            href = item.abshref(href)
+            if href in self.document_hrefs:
+                key = (href, purl.fragment or self.top_anchor)
+                if key in self.anchor_map:
+                    bmark = self.anchor_map[key]
+                else:
+                    bmark = self.anchor_map[(href, self.top_anchor)]
+                return make_link(parent, anchor=bmark, tooltip=tooltip)
+            else:
+                self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
+        if purl.scheme in {'http', 'https', 'ftp'}:
+            if url not in self.external_links:
+                self.external_links[url] = self.document_relationships.add_relationship(url, self.namespace.names['LINKS'], target_mode='External')
+            return make_link(parent, id=self.external_links[url], tooltip=tooltip)
+        return parent
+
+    def process_toc_node(self, toc, level=0):
+        href = toc.href
+        if href:
+            purl = urlparse(href)
+            href = purl.path
+            if href in self.document_hrefs:
+                key = (href, purl.fragment or self.top_anchor)
+                if key in self.anchor_map:
+                    bmark = self.anchor_map[key]
+                else:
+                    bmark = self.anchor_map[(href, self.top_anchor)]
+                self.toc.append(TOCItem(toc.title, bmark, level))
+        for child in toc:
+            self.process_toc_node(child, level+1)
+
+    def process_toc_links(self, oeb):
+        self.toc = []
+        has_toc = oeb.toc and oeb.toc.count() > 1
+        if not has_toc:
+            return
+        for child in oeb.toc:
+            self.process_toc_node(child)
+        if self.toc:
+            self.toc[0].is_first = True
+            self.toc[-1].is_last = True
+
+    def serialize_toc(self, body, primary_heading_style):
+        pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
+        pbb.set('{%s}val' % self.namespace.namespaces['w'], 'on')
+        for block in reversed(self.toc):
+            block.serialize(body, self.namespace.makeelement)
+        title = __('Table of Contents')
+        makeelement = self.namespace.makeelement
+        p = makeelement(body, 'w:p', append=False)
+        ppr = makeelement(p, 'w:pPr')
+        if primary_heading_style is not None:
+            makeelement(ppr, 'w:pStyle', w_val=primary_heading_style.id)
+        makeelement(ppr, 'w:pageBreakBefore', w_val='off')
+        makeelement(makeelement(p, 'w:r'), 'w:t').text = title
+        body.insert(0, p)
@@ -0,0 +1,169 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import defaultdict
+from operator import attrgetter
+
+from polyglot.builtins import iteritems, itervalues, unicode_type
+
+# CSS list-style-type keywords that this module treats as marking a list.
+# Values outside this set are ignored by find_list_containers() and
+# ListsManager.finalize().
+LIST_STYLES = frozenset(
+    'disc circle square decimal decimal-leading-zero lower-roman upper-roman'
+    ' lower-greek lower-alpha lower-latin upper-alpha upper-latin hiragana hebrew'
+    ' katakana-iroha cjk-ideographic'.split())
+
+# Lookup used by Level: for 'circle' and 'square' the value is the bullet text,
+# for the remaining (non-bullet) types it is the w:numFmt value. Level does not
+# read the 'disc' entry (it hard-codes '\uf0b7' for disc), and list types with
+# no entry here, such as 'lower-greek', fall back to 'decimal' in Level.
+STYLE_MAP = {
+    'disc': 'bullet',
+    'circle': 'o',
+    'square': '\uf0a7',
+    'decimal': 'decimal',
+    'decimal-leading-zero': 'decimalZero',
+    'lower-roman': 'lowerRoman',
+    'upper-roman': 'upperRoman',
+    'lower-alpha': 'lowerLetter',
+    'lower-latin': 'lowerLetter',
+    'upper-alpha': 'upperLetter',
+    'upper-latin': 'upperLetter',
+    'hiragana': 'aiueo',
+    'hebrew': 'hebrew1',
+    'katakana-iroha': 'iroha',
+    'cjk-ideographic': 'chineseCounting',
+}
+
+
+def find_list_containers(list_tag, tag_style):
+    '''
+    Return the ancestors of ``list_tag`` whose own style declares a
+    list-style-type contained in LIST_STYLES, nearest ancestor first.
+
+    Styles are resolved through ``tag_style._stylizer``. The walk stops when
+    getparent() returns None or returns the node itself.
+    '''
+    stylizer = tag_style._stylizer
+    containers = []
+    below, ancestor = list_tag, list_tag.getparent()
+    while ancestor is not None and ancestor is not below:
+        declared = stylizer.style(ancestor)._style.get('list-style-type', None) or ''
+        if declared.lower() in LIST_STYLES:
+            containers.append(ancestor)
+        below, ancestor = ancestor, ancestor.getparent()
+    return containers
+
+
+class NumberingDefinition(object):
+    '''
+    The numbering levels that belong to one top-most list container.
+
+    ``level_map`` maps a nesting level to a list of
+    (container, list_tag, block, list_type, tag_style) tuples. finalize()
+    turns it into ``self.levels``, a tuple of Level objects ordered by level.
+
+    Only __hash__ is defined (from ``self.levels``, so it needs finalize() to
+    have run); equality is the default identity comparison.
+    '''
+
+    def __init__(self, top_most, stylizer, namespace):
+        self.top_most, self.stylizer, self.namespace = top_most, stylizer, namespace
+        self.num_id = None
+        self.level_map = defaultdict(list)
+
+    def finalize(self):
+        '''Build ``self.levels`` from the entries collected in level_map.'''
+        tags_by_level = defaultdict(list)
+        # For every level the values of the last entry seen win
+        last_seen = {}
+        for depth, entries in iteritems(self.level_map):
+            for container, list_tag, _block, list_type, _tag_style in entries:
+                tags_by_level[depth].append(list_tag)
+                last_seen[depth] = (list_type, container)
+        built = []
+        for depth in sorted(self.level_map):
+            list_type, container = last_seen[depth]
+            built.append(Level(list_type, container, tags_by_level[depth], ilvl=depth))
+        self.levels = tuple(built)
+
+    def __hash__(self):
+        return hash(self.levels)
+
+    def link_blocks(self):
+        '''Set ``numbering_id`` to (num_id + 1, level) on every collected block.'''
+        for depth, entries in iteritems(self.level_map):
+            for entry in entries:
+                block = entry[2]
+                block.numbering_id = (self.num_id + 1, depth)
+
+    def serialize(self, parent):
+        '''Append a w:abstractNum element, with one child per level, to ``parent``.'''
+        make = self.namespace.makeelement
+        abstract_num = make(parent, 'w:abstractNum', w_abstractNumId=unicode_type(self.num_id))
+        make(abstract_num, 'w:multiLevelType', w_val='hybridMultilevel')
+        make(abstract_num, 'w:name', w_val='List %d' % (self.num_id + 1))
+        for lvl in self.levels:
+            lvl.serialize(abstract_num, make)
+
+
+class Level(object):
+    '''
+    One level of a numbering definition.
+
+    ``start`` is the container's ``start`` attribute parsed as an int (1 when
+    that fails); when ``items`` is non-empty and the first item has a ``value``
+    attribute that parses as an int, that value is used instead. 'disc',
+    'circle' and 'square' become bullets, everything else is numbered with the
+    text '%<ilvl + 1>.'.
+    '''
+
+    def __init__(self, list_type, container, items, ilvl=0):
+        self.ilvl = ilvl
+        first_number = 1
+        try:
+            first_number = int(container.get('start'))
+        except Exception:
+            pass
+        if items:
+            try:
+                first_number = int(items[0].get('value'))
+            except Exception:
+                pass
+        self.start = first_number
+        if list_type == 'disc':
+            self.num_fmt, self.lvl_text = 'bullet', '\uf0b7'
+        elif list_type in {'circle', 'square'}:
+            self.num_fmt, self.lvl_text = 'bullet', STYLE_MAP[list_type]
+        else:
+            self.lvl_text = '%{}.'.format(self.ilvl + 1)
+            self.num_fmt = STYLE_MAP.get(list_type, 'decimal')
+
+    def __hash__(self):
+        return hash((self.start, self.num_fmt, self.lvl_text))
+
+    def serialize(self, parent, makeelement):
+        '''Append the w:lvl element describing this level to ``parent``.'''
+        lvl = makeelement(parent, 'w:lvl', w_ilvl=unicode_type(self.ilvl))
+        simple_children = (
+            ('w:start', unicode_type(self.start)),
+            ('w:numFmt', self.num_fmt),
+            ('w:lvlText', self.lvl_text),
+            ('w:lvlJc', 'left'),
+        )
+        for tag, value in simple_children:
+            makeelement(lvl, tag, w_val=value)
+        left_indent = unicode_type(1152 + self.ilvl * 360)
+        makeelement(makeelement(lvl, 'w:pPr'), 'w:ind', w_hanging='360', w_left=left_indent)
+        if self.num_fmt != 'bullet':
+            return
+        # The bullet glyph decides which font it is drawn with
+        font = {'\uf0b7':'Symbol', '\uf0a7':'Wingdings'}.get(self.lvl_text, 'Courier New')
+        makeelement(makeelement(lvl, 'w:rPr'), 'w:rFonts', w_ascii=font, w_hAnsi=font, w_hint="default")
+
+
+class ListsManager(object):
+    '''
+    Collects the list items among a document's blocks into numbering
+    definitions and serializes those definitions.
+    '''
+
+    def __init__(self, docx):
+        self.namespace = docx.namespace
+        self.lists = {}
+
+    def finalize(self, all_blocks):
+        '''
+        Group the blocks that carry a ``list_tag`` by their top-most list
+        container, assign every resulting definition a ``num_id`` and store
+        the definitions, ordered by ``num_id``, in ``self.definitions``.
+
+        Blocks whose list-style-type is not in LIST_STYLES, or that have no
+        list container among their ancestors, are left alone.
+        '''
+        by_top_most = {}
+        for block in all_blocks:
+            if block.list_tag is None:
+                continue
+            list_tag, tag_style = block.list_tag
+            list_type = (tag_style['list-style-type'] or '').lower()
+            if list_type not in LIST_STYLES:
+                continue
+            ancestors = find_list_containers(list_tag, tag_style)
+            if not ancestors:
+                continue
+            outermost = ancestors[-1]
+            if outermost not in by_top_most:
+                by_top_most[outermost] = NumberingDefinition(outermost, tag_style._stylizer, self.namespace)
+            nesting = len(ancestors) - 1
+            by_top_most[outermost].level_map[nesting].append((ancestors[0], list_tag, block, list_type, tag_style))
+
+        for numbering in itervalues(by_top_most):
+            numbering.finalize()
+
+        definitions = {}
+        for candidate in itervalues(by_top_most):
+            # Re-use a definition already stored under an equal key,
+            # otherwise register this one and give it the next num_id
+            if candidate in definitions:
+                chosen = definitions[candidate]
+            else:
+                definitions[candidate] = chosen = candidate
+                chosen.num_id = len(definitions) - 1
+            chosen.link_blocks()
+        self.definitions = sorted(itervalues(definitions), key=attrgetter('num_id'))
+
+    def serialize(self, parent):
+        '''
+        Serialize every definition into ``parent``, then append one w:num
+        element per definition that points back at it through
+        w:abstractNumId.
+        '''
+        for definition in self.definitions:
+            definition.serialize(parent)
+        make = self.namespace.makeelement
+        for definition in self.definitions:
+            num = make(parent, 'w:num', w_numId=unicode_type(definition.num_id + 1))
+            make(num, 'w:abstractNumId', w_val=unicode_type(definition.num_id))
@@ -0,0 +1,768 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import numbers
+from collections import Counter, defaultdict
+from operator import attrgetter
+
+from lxml import etree
+
+from calibre.ebooks import parse_css_length
+from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
+from calibre.utils.localization import lang_as_iso639_1
+from polyglot.builtins import iteritems, filter, unicode_type
+from tinycss.css21 import CSS21Parser
+
+# Parser instance shared by parse_css_font_family()
+css_parser = CSS21Parser()
+
+# Edge names used to build the per-edge attribute and CSS property names
+border_edges = ('left', 'top', 'right', 'bottom')
+# Per-edge attribute name templates, each %-formatted with an edge name
+border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
+# Sentinel stored by TextStyle when a border/padding value differs between edges
+ignore = object()
+
+
+def parse_css_font_family(raw):
+    '''
+    Yield the family names found in the CSS font-family value ``raw``.
+
+    ``raw`` is parsed as the value of a ``font-family`` declaration and the
+    value of every STRING or IDENT token is yielded in order. Iteration stops
+    at the first token whose value is ``inherit``. Nothing is yielded when
+    the declaration cannot be parsed.
+    '''
+    decl, _errs = css_parser.parse_style_attr('font-family:' + raw)
+    if not decl:
+        return
+    for token in decl[0].value:
+        # Match the token type names exactly. A substring test against the
+        # string 'STRING IDENT' would also accept any type name that happens
+        # to be a substring of it (for example a one letter 'S' type) and so
+        # yield values that are not family names.
+        if token.type in ('STRING', 'IDENT'):
+            val = token.value
+            if val == 'inherit':
+                break
+            yield val
+
+
+def css_font_family_to_docx(raw):
+    '''
+    Return the font name to use for the CSS font-family value ``raw``.
+
+    Only the first family reported by parse_css_font_family() is considered.
+    Generic CSS families are replaced by a concrete font name, any other
+    name is returned unchanged. Returns None when there is no family at all.
+    '''
+    families = iter(parse_css_font_family(raw))
+    try:
+        first = next(families)
+    except StopIteration:
+        return None
+    generic = {'serif':'Cambria', 'sansserif':'Candara', 'sans-serif':'Candara', 'fantasy':'Comic Sans', 'cursive':'Segoe Script'}
+    return generic.get(first.lower(), first)
+
+
+def bmap(x):
+    ''' Return 'on' when ``x`` is truthy and 'off' otherwise '''
+    if x:
+        return 'on'
+    return 'off'
+
+
+def is_dropcaps(html_tag, tag_style):
+    '''
+    Return True when ``html_tag`` has fewer than two children, its text
+    (tail excluded) is shorter than five characters and it is floated left.
+    '''
+    if len(html_tag) >= 2:
+        return False
+    text = etree.tostring(html_tag, method='text', encoding='unicode', with_tail=False)
+    if len(text) >= 5:
+        return False
+    return tag_style['float'] == 'left'
+
+
+class CombinedStyle(object):
+    '''
+    A paragraph style made of a block style (``bs``) and a run style (``rs``),
+    together with the blocks it applies to.
+
+    ``id``, ``name``, ``seq`` and ``outline_level`` start out as None and are
+    read when the style is serialized.
+    '''
+
+    def __init__(self, bs, rs, blocks, namespace):
+        self.namespace = namespace
+        self.bs = bs
+        self.rs = rs
+        self.blocks = blocks
+        self.id = self.name = self.seq = None
+        self.outline_level = None
+
+    def apply(self):
+        '''
+        Point every block at this style and every run of those blocks at the
+        run style.
+        '''
+        run_style = self.rs
+        for blk in self.blocks:
+            blk.linked_style = self
+            for run in blk.runs:
+                run.parent_style = run_style
+
+    def serialize(self, styles, normal_style):
+        '''
+        Append this style to ``styles`` as a paragraph w:style element.
+
+        Properties are serialized relative to ``normal_style``; every style
+        other than ``normal_style`` itself is declared as based on it. The
+        style whose ``seq`` is 0 is flagged as the default.
+        '''
+        make = self.namespace.makeelement
+        root = make(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
+        make(root, 'w:name', w_val=self.name)
+        make(root, 'w:qFormat')
+        if self is not normal_style:
+            make(root, 'w:basedOn', w_val=normal_style.id)
+        if self.seq == 0:
+            root.set('{%s}%s' % (self.namespace.namespaces['w'], 'default'), '1')
+        para_props = make(root, 'w:pPr')
+        self.bs.serialize_properties(para_props, normal_style.bs)
+        if self.outline_level is not None:
+            make(para_props, 'w:outlineLvl', w_val=unicode_type(self.outline_level + 1))
+        run_props = make(root, 'w:rPr')
+        self.rs.serialize_properties(run_props, normal_style.rs)
+
+
+class FloatSpec(object):
+    '''
+    Frame properties for a floated element, read from its CSS style.
+
+    A drop cap only records the number of lines it spans. Any other float
+    records its alignment, optional width/height, the height rule and the
+    spacing around it. Sizes are the CSS values multiplied by 20. The border
+    and padding attributes are filled in by read_css_block_borders().
+    '''
+
+    def __init__(self, namespace, html_tag, tag_style):
+        self.makeelement = namespace.makeelement
+        self.is_dropcaps = is_dropcaps(html_tag, tag_style)
+        self.blocks = []
+        if self.is_dropcaps:
+            self.dropcaps_lines = 3
+        else:
+            self._read_frame_geometry(tag_style)
+
+        read_css_block_borders(self, tag_style)
+
+    def _read_frame_geometry(self, tag_style):
+        ''' Read alignment, size and spacing of a float that is not a drop cap '''
+        self.x_align = tag_style['float']
+        self.w = self.h = None
+        if tag_style._get('width') != 'auto':
+            widest = max(tag_style['min-width'], tag_style['width'])
+            self.w = int(20 * widest)
+        if tag_style._get('height') == 'auto':
+            self.h_rule = 'auto'
+        else:
+            # A positive min-height takes precedence over height
+            if tag_style['min-height'] > 0:
+                self.h_rule, self.h = 'atLeast', tag_style['min-height']
+            else:
+                self.h_rule, self.h = 'exact', tag_style['height']
+            self.h = int(20 * self.h)
+        horizontal = max(tag_style['margin-right'], tag_style['margin-left'])
+        vertical = max(tag_style['margin-top'], tag_style['margin-bottom'])
+        self.h_space = int(20 * horizontal)
+        self.v_space = int(20 * vertical)
+
+    def serialize(self, block, parent):
+        '''
+        Append the frame, indent, spacing and border elements for ``block``
+        to ``parent``.
+        '''
+        make = self.makeelement
+        if self.is_dropcaps:
+            frame = dict(w_dropCap='drop', w_lines=unicode_type(self.dropcaps_lines), w_wrap='around', w_vAnchor='text', w_hAnchor='text')
+        else:
+            frame = dict(
+                w_wrap='around', w_vAnchor='text', w_hAnchor='text', w_xAlign=self.x_align, w_y='1',
+                w_hSpace=unicode_type(self.h_space), w_vSpace=unicode_type(self.v_space), w_hRule=self.h_rule
+            )
+            for key, size in (('w_w', self.w), ('w_h', self.h)):
+                if size is not None:
+                    frame[key] = unicode_type(size)
+        make(parent, 'w:framePr', **frame)
+        # Margins are already applied by the frame style, so override them to
+        # be zero on individual blocks
+        make(parent, 'w:ind', w_left='0', w_leftChars='0', w_right='0', w_rightChars='0')
+        spacing = {}
+        if block is self.blocks[0]:
+            spacing.update(dict(w_before='0', w_beforeLines='0'))
+        if block is self.blocks[-1]:
+            spacing.update(dict(w_after='0', w_afterLines='0'))
+        if spacing:
+            make(parent, 'w:spacing', **spacing)
+        # Similarly apply the same border and padding properties to all blocks
+        # in this floatspec
+        borders = make(parent, 'w:pBdr')
+        for edge in border_edges:
+            make(
+                borders, 'w:' + edge,
+                w_space=unicode_type(getattr(self, 'padding_' + edge)),
+                w_val=getattr(self, 'border_%s_style' % edge),
+                w_sz=unicode_type(getattr(self, 'border_%s_width' % edge)),
+                w_color=getattr(self, 'border_%s_color' % edge))
+
+
+class DOCXStyle(object):
+    '''
+    Base class for styles that are compared and hashed by value.
+
+    Subclasses list the attributes that make up their identity in ALL_PROPS
+    and must set all of them before calling DOCXStyle.__init__(), because
+    the hash is computed there. ``id`` and ``name`` start out as None.
+    '''
+
+    ALL_PROPS = ()
+    TYPE = 'paragraph'
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        # Turns a local name into a Clark notation name in the w namespace
+        self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
+        self.id = self.name = None
+        self.next_style = None
+        self.calculate_hash()
+
+    def calculate_hash(self):
+        ''' Cache the hash of the current values of all ALL_PROPS attributes '''
+        self._hash = hash(tuple(
+            getattr(self, x) for x in self.ALL_PROPS))
+
+    def makeelement(self, parent, name, **attrs):
+        '''
+        Create (without appending it) an element called ``name`` using the
+        makeelement() of ``parent``. The tag and all attribute names are
+        placed in the w namespace.
+        '''
+        return parent.makeelement(self.w(name), **{self.w(k):v for k, v in iteritems(attrs)})
+
+    def __hash__(self):
+        return self._hash
+
+    def __eq__(self, other):
+        for x in self.ALL_PROPS:
+            if getattr(self, x) != getattr(other, x, None):
+                return False
+        return True
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        '''
+        Return the XML this style serializes to, treating the style as its own
+        normal style so that every property is written out. Falls back to the
+        default object representation while ``id`` or ``name`` is still None.
+        '''
+        if self.id is None or self.name is None:
+            # serialize() writes id and name as attribute values, which it
+            # cannot do before they have been assigned
+            return object.__repr__(self)
+        root = etree.Element(self.__class__.__name__, nsmap={'w':self.namespace.namespaces['w']})
+        # serialize() requires the normal style as its second argument
+        raw = etree.tostring(self.serialize(root, self), pretty_print=True)
+        # tostring() returns bytes, __repr__ has to return the native str type
+        return raw if isinstance(raw, str) else raw.decode('ascii')
+    __str__ = __repr__
+
+    def serialize(self, styles, normal_style):
+        '''
+        Append a ``style`` element carrying this style's id, TYPE and name to
+        ``styles`` and return it. A ``basedOn`` child referring to
+        ``normal_style`` is added unless this style is ``normal_style``.
+        '''
+        makeelement = self.makeelement
+        style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
+        style.append(makeelement(style, 'name', val=self.name))
+        if self is not normal_style:
+            style.append(makeelement(style, 'basedOn', val=normal_style.id))
+        styles.append(style)
+        return style
+
+
+# Maps a CSS border-style keyword to the value stored in the border style
+# attributes of the style objects in this module. Keywords that are not listed
+# are treated as 'none' by the code that reads this table.
+LINE_STYLES = {
+    'none'  : 'none',
+    'hidden': 'none',
+    'dotted': 'dotted',
+    'dashed': 'dashed',
+    'solid' : 'single',
+    'double': 'double',
+    'groove': 'threeDEngrave',
+    'ridge' : 'threeDEmboss',
+    'inset' : 'inset',
+    'outset': 'outset',
+}
+
+
+class TextStyle(DOCXStyle):
+    '''
+    Character level formatting read from a CSS style object.
+
+    All attributes named in ALL_PROPS are set before DOCXStyle.__init__() is
+    called, since that is where the hash is computed. When ``is_parent_style``
+    is true the background color is not read and the padding/border values
+    are left at their defaults.
+    '''
+
+    ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
+                 'background_color', 'underline', 'strike', 'dstrike', 'caps',
+                 'shadow', 'small_caps', 'spacing', 'vertical_align', 'padding',
+                 'border_style', 'border_width', 'border_color')
+    TYPE = 'character'
+
+    def __init__(self, namespace, css, is_parent_style=False):
+        self.font_family = css_font_family_to_docx(css['font-family'])
+        # font_size is stored as twice the CSS value, None if it is not a number
+        try:
+            self.font_size = max(0, int(float(css['font-size']) * 2))  # stylizer normalizes all font sizes into pts
+        except (ValueError, TypeError, AttributeError):
+            self.font_size = None
+
+        fw = css['font-weight']
+        # Bold is either one of the keywords or a numeric weight of at least 700
+        self.bold = (fw.lower() if hasattr(fw, 'lower') else fw) in {'bold', 'bolder'} or int_or_zero(fw) >= 700
+        self.italic = css['font-style'].lower() in {'italic', 'oblique'}
+        self.color = convert_color(css['color'])
+        self.background_color = None if is_parent_style else convert_color(css.backgroundColor)
+        td = set((css.effective_text_decoration or '').split())
+        self.underline = 'underline' in td
+        # line-through combined with overline is rendered as a double strike
+        self.dstrike = 'line-through' in td and 'overline' in td
+        self.strike = not self.dstrike and 'line-through' in td
+        self.text_transform = css['text-transform']  # TODO: If lowercase or capitalize, transform the actual text
+        self.caps = self.text_transform == 'uppercase'
+        self.small_caps = css['font-variant'].lower() in {'small-caps', 'smallcaps'}
+        self.shadow = css['text-shadow'] not in {'none', None}
+        # spacing is the CSS letter-spacing multiplied by 20, None if not a number
+        try:
+            self.spacing = int(float(css['letter-spacing']) * 20)
+        except (ValueError, TypeError, AttributeError):
+            self.spacing = None
+        va = css.first_vertical_align
+        # A numeric vertical-align is kept as a number in string form (twice
+        # the CSS value), keywords become superscript/subscript/baseline
+        if isinstance(va, numbers.Number):
+            self.vertical_align = unicode_type(int(va * 2))
+        else:
+            val = {
+                'top':'superscript', 'text-top':'superscript', 'sup':'superscript', 'super':'superscript',
+                'bottom':'subscript', 'text-bottom':'subscript', 'sub':'subscript'}.get(va)
+            self.vertical_align = val or 'baseline'
+
+        self.padding = self.border_color = self.border_width = self.border_style = None
+        if not is_parent_style:
+            # DOCX does not support individual borders/padding for inline content
+            # Each property keeps the value of the first edge as long as all
+            # edges agree and is replaced by the ignore sentinel once they differ
+            for edge in border_edges:
+                # In DOCX padding can only be a positive integer
+                try:
+                    padding = max(0, int(css['padding-' + edge]))
+                except ValueError:
+                    padding = 0
+                if self.padding is None:
+                    self.padding = padding
+                elif self.padding != padding:
+                    self.padding = ignore
+                val = css['border-%s-width' % edge]
+                if not isinstance(val, numbers.Number):
+                    val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
+                # Width is the CSS value times 8, clamped to the range 2..96
+                val = min(96, max(2, int(val * 8)))
+                if self.border_width is None:
+                    self.border_width = val
+                elif self.border_width != val:
+                    self.border_width = ignore
+                color = convert_color(css['border-%s-color' % edge])
+                if self.border_color is None:
+                    self.border_color = color
+                elif self.border_color != color:
+                    self.border_color = ignore
+                style = LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none')
+                if self.border_style is None:
+                    self.border_style = style
+                elif self.border_style != style:
+                    self.border_style = ignore
+
+        # Values that were never read, or that differ between edges, fall
+        # back to "no padding, no border"
+        if self.padding in (None, ignore):
+            self.padding = 0
+        if self.border_width in (None, ignore):
+            self.border_width = 0
+        if self.border_style in (None, ignore):
+            self.border_style = 'none'
+        if self.border_color in (None, ignore):
+            self.border_color = 'auto'
+        # Without a border style the width and color carry no meaning
+        if self.border_style == 'none':
+            self.border_width, self.border_color = 0, 'auto'
+
+        DOCXStyle.__init__(self, namespace)
+
+    def serialize_borders(self, bdr, normal_style):
+        '''
+        Set on ``bdr`` the padding/border attributes that differ from
+        ``normal_style`` (all of them when this style is ``normal_style``)
+        and return ``bdr``.
+        '''
+        w = self.w
+        is_normal_style = self is normal_style
+        if is_normal_style or self.padding != normal_style.padding:
+            bdr.set(w('space'), unicode_type(self.padding))
+        if is_normal_style or self.border_width != normal_style.border_width:
+            bdr.set(w('sz'), unicode_type(self.border_width))
+        if is_normal_style or self.border_style != normal_style.border_style:
+            bdr.set(w('val'), self.border_style)
+        if is_normal_style or self.border_color != normal_style.border_color:
+            bdr.set(w('color'), self.border_color)
+        return bdr
+
+    def serialize(self, styles, normal_style):
+        '''
+        Append this style to ``styles`` and return the new style element. The
+        rPr child is only attached when it ends up with at least one child.
+        '''
+        makeelement = self.makeelement
+        style_root = DOCXStyle.serialize(self, styles, normal_style)
+        style = makeelement(style_root, 'rPr')
+        self.serialize_properties(style, normal_style)
+        if len(style) > 0:
+            style_root.append(style)
+        return style_root
+
+    def serialize_properties(self, rPr, normal_style):
+        '''
+        Append to ``rPr`` one element per property whose value differs from
+        ``normal_style``. When this style is ``normal_style`` itself every
+        property is written.
+        '''
+        makeelement = self.makeelement
+        is_normal_style = self is normal_style
+        if is_normal_style or self.font_family != normal_style.font_family:
+            rPr.append(makeelement(
+                rPr, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))
+
+        # Each of these is written twice, once plain and once with a Cs suffix
+        for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
+            val = getattr(self, attr)
+            if is_normal_style or getattr(normal_style, attr) != val:
+                for suffix in ('', 'Cs'):
+                    rPr.append(makeelement(rPr, name + suffix, val=vmap(val)))
+
+        def check_attr(attr):
+            # True if the property has to be written out
+            val = getattr(self, attr)
+            return is_normal_style or (val != getattr(normal_style, attr))
+
+        if check_attr('color'):
+            rPr.append(makeelement(rPr, 'color', val=self.color or 'auto'))
+        if check_attr('background_color'):
+            rPr.append(makeelement(rPr, 'shd', fill=self.background_color or 'auto'))
+        if check_attr('underline'):
+            rPr.append(makeelement(rPr, 'u', val='single' if self.underline else 'none'))
+        if check_attr('dstrike'):
+            rPr.append(makeelement(rPr, 'dstrike', val=bmap(self.dstrike)))
+        if check_attr('strike'):
+            rPr.append(makeelement(rPr, 'strike', val=bmap(self.strike)))
+        if check_attr('caps'):
+            rPr.append(makeelement(rPr, 'caps', val=bmap(self.caps)))
+        if check_attr('small_caps'):
+            rPr.append(makeelement(rPr, 'smallCaps', val=bmap(self.small_caps)))
+        if check_attr('shadow'):
+            rPr.append(makeelement(rPr, 'shadow', val=bmap(self.shadow)))
+        if check_attr('spacing'):
+            rPr.append(makeelement(rPr, 'spacing', val=unicode_type(self.spacing or 0)))
+        # The normal style always gets a vertAlign element, a numeric value is
+        # written there as 'baseline'. Other styles write a numeric value as a
+        # position element instead.
+        if is_normal_style:
+            rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align if self.vertical_align in {'superscript', 'subscript'} else 'baseline'))
+        elif self.vertical_align != normal_style.vertical_align:
+            if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
+                rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align))
+            else:
+                rPr.append(makeelement(rPr, 'position', val=self.vertical_align))
+
+        # The bdr element is only attached if at least one attribute was set on it
+        bdr = self.serialize_borders(makeelement(rPr, 'bdr'), normal_style)
+        if bdr.attrib:
+            rPr.append(bdr)
+
+
+class DescendantTextStyle(object):
+    '''
+    The run properties in which ``child_style`` differs from ``parent_style``.
+
+    The differences are stored in ``self.properties`` as a tuple of
+    (element name, frozenset of attribute items) pairs. That tuple is what
+    instances are hashed and compared by.
+    '''
+
+    def __init__(self, parent_style, child_style):
+        self.id = self.name = None
+        self.makeelement = child_style.makeelement
+
+        collected = []
+
+        def add(name, **props):
+            collected.append((name, frozenset(iteritems(props))))
+
+        def differs(attr):
+            return getattr(parent_style, attr) != getattr(child_style, attr)
+
+        if differs('font_family'):
+            family = child_style.font_family
+            add('rFonts', ascii=family, cs=family, eastAsia=family, hAnsi=family)
+
+        for tag, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')):
+            if not differs(attr):
+                continue
+            if attr == 'font_size':
+                text = unicode_type(child_style.font_size)
+            else:
+                text = 'on'  # bold, italic are toggle properties
+            add(tag, val=text)
+            add(tag + 'Cs', val=text)
+
+        if differs('color'):
+            add('color', val=child_style.color or 'auto')
+        if differs('background_color'):
+            add('shd', fill=child_style.background_color or 'auto')
+        if differs('underline'):
+            add('u', val='single' if child_style.underline else 'none')
+        if differs('dstrike'):
+            add('dstrike', val=bmap(child_style.dstrike))
+        # These are toggle properties, a difference is always written as 'on'
+        for tag, attr in (('strike', 'strike'), ('caps', 'caps'), ('smallCaps', 'small_caps'), ('shadow', 'shadow')):
+            if differs(attr):
+                add(tag, val='on')
+        if differs('spacing'):
+            add('spacing', val=unicode_type(child_style.spacing or 0))
+        if differs('vertical_align'):
+            alignment = child_style.vertical_align
+            if alignment in {'superscript', 'subscript', 'baseline'}:
+                add('vertAlign', val=alignment)
+            else:
+                add('position', val=alignment)
+
+        border = {}
+        for key, attr in (('space', 'padding'), ('sz', 'border_width')):
+            if differs(attr):
+                border[key] = unicode_type(getattr(child_style, attr))
+        for key, attr in (('val', 'border_style'), ('color', 'border_color')):
+            if differs(attr):
+                border[key] = getattr(child_style, attr)
+        if border:
+            add('bdr', **border)
+        self.properties = tuple(collected)
+        self._hash = hash(self.properties)
+
+    def __hash__(self):
+        return self._hash
+
+    def __eq__(self, other):
+        return self.properties == other.properties
+
+    def __ne__(self, other):
+        return self.properties != other.properties
+
+    def serialize(self, styles):
+        '''
+        Append this style to ``styles`` as a character style whose rPr holds
+        one element per stored property, and return the new style element.
+        '''
+        make = self.makeelement
+        root = make(styles, 'style', styleId=self.id, type='character')
+        root.append(make(root, 'name', val=self.name))
+        run_props = make(root, 'rPr')
+        root.append(run_props)
+        for tag, attr_items in self.properties:
+            run_props.append(make(root, tag, **dict(attr_items)))
+        styles.append(root)
+        return root
+
+
+def read_css_block_borders(self, css, store_css_style=False):
+    '''
+    Set the per-edge padding, margin and border attributes on ``self``.
+
+    With ``css`` set to None every edge gets fixed defaults. Otherwise the
+    values are read from ``css``: padding is clamped to be non-negative, the
+    margin is multiplied by 20 and clamped to be non-negative, and the border
+    width is multiplied by 8 and clamped to the range 2..96. When
+    ``store_css_style`` is true the lower-cased CSS border style is stored
+    as well, in ``border_<edge>_css_style``.
+    '''
+    if css is None:
+        for edge in border_edges:
+            defaults = (
+                ('padding_' + edge, 0),
+                ('margin_' + edge, 0),
+                ('css_margin_' + edge, ''),
+                ('border_%s_width' % edge, 2),
+                ('border_%s_color' % edge, None),
+                ('border_%s_style' % edge, 'none'),
+            )
+            for attr, value in defaults:
+                setattr(self, attr, value)
+            if store_css_style:
+                setattr(self, 'border_%s_css_style' % edge, 'none')
+        return
+
+    named_widths = {'thin':0.2, 'medium':1, 'thick':2}
+    for edge in border_edges:
+        # In DOCX padding can only be a positive integer
+        try:
+            padding = max(0, int(css['padding-' + edge]))
+        except ValueError:
+            padding = 0  # invalid value for padding
+        setattr(self, 'padding_' + edge, padding)
+        # In DOCX margin must be a positive integer in twips (twentieth of a point)
+        try:
+            margin = max(0, int(css['margin-' + edge] * 20))
+        except ValueError:
+            margin = 0  # for e.g.: margin: auto
+        setattr(self, 'margin_' + edge, margin)
+        setattr(self, 'css_margin_' + edge, css._style.get('margin-' + edge, ''))
+        width = css['border-%s-width' % edge]
+        if not isinstance(width, numbers.Number):
+            width = named_widths.get(width, 0)
+        setattr(self, 'border_%s_width' % edge, min(96, max(2, int(width * 8))))
+        setattr(self, 'border_%s_color' % edge, convert_color(css['border-%s-color' % edge]) or 'auto')
+        css_border_style = css['border-%s-style' % edge].lower()
+        setattr(self, 'border_%s_style' % edge, LINE_STYLES.get(css_border_style, 'none'))
+        if store_css_style:
+            setattr(self, 'border_%s_css_style' % edge, css_border_style)
+
+
+class BlockStyle(DOCXStyle):
+
+    # The fixed paragraph properties, followed by margin_<edge> and
+    # css_margin_<edge> for every edge and then, edge by edge, the four
+    # attributes named by border_props
+    ALL_PROPS = tuple(
+        'text_align css_text_indent text_indent line_height background_color'.split(
+        ) + ['margin_' + edge for edge in border_edges
+        ] + ['css_margin_' + edge for edge in border_edges
+        ] + [x%edge for edge in border_edges for x in border_props]
+    )
+
+    def __init__(self, namespace, css, html_block, is_table_cell=False, parent_bg=None):
+        '''
+        Read the paragraph level properties from ``css``.
+
+        With ``css`` set to None fixed defaults are used. For a table cell
+        the borders, padding and margins are zeroed and no background color
+        is stored. Otherwise a missing background color falls back to
+        ``parent_bg``.
+        '''
+        read_css_block_borders(self, css)
+        if is_table_cell:
+            for edge in border_edges:
+                overrides = (
+                    ('border_%s_style' % edge, 'none'),
+                    ('border_%s_width' % edge, 0),
+                    ('padding_' + edge, 0),
+                    ('margin_' + edge, 0),
+                )
+                for attr, value in overrides:
+                    setattr(self, attr, value)
+        if css is None:
+            self.text_indent, self.css_text_indent = 0, None
+            self.line_height = 280
+            self.background_color = None
+            self.text_align = 'left'
+        else:
+            try:
+                self.text_indent = int(css['text-indent'] * 20)
+                self.css_text_indent = css._get('text-indent')
+            except (TypeError, ValueError):
+                self.text_indent, self.css_text_indent = 0, None
+            try:
+                self.line_height = max(0, int(css.lineHeight * 20))
+            except (TypeError, ValueError):
+                # Fall back to 1.2 times the font size
+                self.line_height = max(0, int(1.2 * css.fontSize * 20))
+            if is_table_cell:
+                self.background_color = None
+            else:
+                own_bg = convert_color(css['background-color'])
+                self.background_color = parent_bg if own_bg is None else own_bg
+            try:
+                keep_whitespace = css['white-space'].lower() in {'pre', 'pre-wrap'}
+            except Exception:
+                keep_whitespace = False
+            alignments = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}
+            try:
+                requested = css['text-align'].lower()
+                # Preformatted text is always aligned to the start
+                if keep_whitespace:
+                    requested = 'start'
+                self.text_align = alignments.get(requested, 'left')
+            except AttributeError:
+                self.text_align = 'left'
+
+        DOCXStyle.__init__(self, namespace)
+
+    def serialize_borders(self, bdr, normal_style):
+        '''
+        Append to ``bdr`` one element per edge whose padding or border differs
+        from ``normal_style`` and return ``bdr``. For ``normal_style`` itself
+        an edge is written when its padding is positive or when it has a
+        positive width together with a style other than 'none'.
+        '''
+        w = self.w
+        is_normal = self is normal_style
+        for edge in border_edges:
+            padding_attr = 'padding_' + edge
+            width_attr = 'border_%s_width' % edge
+            style_attr = 'border_%s_style' % edge
+            node = bdr.makeelement(w(edge))
+            padding = getattr(self, padding_attr)
+            if (is_normal and padding > 0) or padding != getattr(normal_style, padding_attr):
+                node.set(w('space'), unicode_type(padding))
+            width, line = getattr(self, width_attr), getattr(self, style_attr)
+            border_changed = (
+                (is_normal and width > 0 and line != 'none') or
+                width != getattr(normal_style, width_attr) or
+                line != getattr(normal_style, style_attr))
+            if border_changed:
+                node.set(w('val'), line)
+                node.set(w('sz'), unicode_type(width))
+                node.set(w('color'), getattr(self, 'border_%s_color' % edge))
+            # Edges on which nothing was set are left out
+            if node.attrib:
+                bdr.append(node)
+        return bdr
+
+    def serialize(self, styles, normal_style):
+        '''
+        Append this style to ``styles`` and return the new style element. The
+        pPr child is only attached when it ends up with at least one child.
+        '''
+        root = DOCXStyle.serialize(self, styles, normal_style)
+        para_props = self.makeelement(root, 'pPr')
+        self.serialize_properties(para_props, normal_style)
+        if len(para_props):
+            root.append(para_props)
+        return root
+
+    def serialize_properties(self, pPr, normal_style):
+        '''
+        Append the paragraph property elements for this style to ``pPr``.
+
+        In general a property is written only when it differs from the same
+        property on ``normal_style``, or when this style *is* the normal style
+        and the property has a non-default value. The elements are appended
+        in this order: spacing, ind, shd, pBdr, jc, next.
+
+        :param pPr: The element that receives the property elements
+        :param normal_style: The style to compare against (may be ``self``)
+        '''
+        makeelement, w = self.makeelement, self.w
+        # Vertical margins are stored as attributes of a single spacing element
+        spacing = makeelement(pPr, 'spacing')
+        for edge, attr in iteritems({'top':'before', 'bottom':'after'}):
+            getter = attrgetter('css_margin_' + edge)
+            css_val, css_unit = parse_css_length(getter(self))
+            if css_unit in ('em', 'ex'):
+                # Font relative margins go into the beforeLines/afterLines
+                # attributes, scaled by 100 per em (50 per ex) and never negative
+                lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
+                if (self is normal_style and lines > 0) or getter(self) != getter(normal_style):
+                    spacing.set(w(attr + 'Lines'), unicode_type(lines))
+            else:
+                # Any other unit: use the already converted margin_<edge> value
+                getter = attrgetter('margin_' + edge)
+                val = getter(self)
+                if (self is normal_style and val > 0) or val != getter(normal_style):
+                    spacing.set(w(attr), unicode_type(val))
+
+        # The normal style always declares its line height; it is written as a
+        # minimum ('atLeast') line height
+        if self is normal_style or self.line_height != normal_style.line_height:
+            spacing.set(w('line'), unicode_type(self.line_height))
+            spacing.set(w('lineRule'), 'atLeast')
+
+        # Only emit the element if at least one attribute was set on it
+        if spacing.attrib:
+            pPr.append(spacing)
+
+        # Horizontal margins and the first line indent share one ind element
+        ind = makeelement(pPr, 'ind')
+        for edge in ('left', 'right'):
+            getter = attrgetter('css_margin_' + edge)
+            css_val, css_unit = parse_css_length(getter(self))
+            if css_unit in ('em', 'ex'):
+                # Same scaling as for the vertical margins, stored in
+                # leftChars/rightChars
+                chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
+                if (self is normal_style and chars > 0) or getter(self) != getter(normal_style):
+                    ind.set(w(edge + 'Chars'), unicode_type(chars))
+            else:
+                getter = attrgetter('margin_' + edge)
+                val = getter(self)
+                if (self is normal_style and val > 0) or val != getter(normal_style):
+                    ind.set(w(edge), unicode_type(val))
+                    ind.set(w(edge + 'Chars'), '0')  # This is needed to override any declaration in the parent style
+        # text-indent: non-negative values become a first line indent,
+        # negative values a hanging indent (stored as a positive number)
+        css_val, css_unit = parse_css_length(self.css_text_indent)
+        if css_unit in ('em', 'ex'):
+            chars = int(css_val * (50 if css_unit == 'ex' else 100))
+            if css_val >= 0:
+                if (self is normal_style and chars > 0) or self.css_text_indent != normal_style.css_text_indent:
+                    ind.set(w('firstLineChars'), unicode_type(chars))
+            else:
+                if (self is normal_style and chars < 0) or self.css_text_indent != normal_style.css_text_indent:
+                    ind.set(w('hangingChars'), unicode_type(abs(chars)))
+        else:
+            val = self.text_indent
+            if val >= 0:
+                if (self is normal_style and val > 0) or self.text_indent != normal_style.text_indent:
+                    ind.set(w('firstLine'), unicode_type(val))
+                    ind.set(w('firstLineChars'), '0')  # This is needed to override any declaration in the parent style
+            else:
+                if (self is normal_style and val < 0) or self.text_indent != normal_style.text_indent:
+                    ind.set(w('hanging'), unicode_type(abs(val)))
+                    # As above, reset the character based variant
+                    ind.set(w('hangingChars'), '0')
+        if ind.attrib:
+            pPr.append(ind)
+
+        # Background shading; 'auto' is used as the fill when this style has
+        # no background color but the normal style does
+        if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color:
+            pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))
+
+        # Borders are added only if serialize_borders() produced any edges
+        pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
+        if len(pbdr):
+            pPr.append(pbdr)
+
+        # The normal style always declares its text alignment
+        if self is normal_style or self.text_align != normal_style.text_align:
+            pPr.append(makeelement(pPr, 'jc', val=self.text_align))
+
+        # The normal style never gets a next element
+        if self is not normal_style and self.next_style is not None:
+            pPr.append(makeelement(pPr, 'next', val=self.next_style))
+
+
+class StylesManager(object):
+
+    def __init__(self, namespace, log, document_lang):
+        self.namespace = namespace
+        self.document_lang = lang_as_iso639_1(document_lang) or 'en'
+        self.log = log
+        self.block_styles, self.text_styles = {}, {}
+        self.styles_for_html_blocks = {}
+
+    def create_text_style(self, css_style, is_parent_style=False):
+        ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
+        existing = self.text_styles.get(ans, None)
+        if existing is None:
+            self.text_styles[ans] = ans
+        else:
+            ans = existing
+        return ans
+
+    def create_block_style(self, css_style, html_block, is_table_cell=False, parent_bg=None):
+        ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
+        existing = self.block_styles.get(ans, None)
+        if existing is None:
+            self.block_styles[ans] = ans
+        else:
+            ans = existing
+        self.styles_for_html_blocks[html_block] = ans
+        return ans
+
+    def finalize(self, all_blocks):
+        block_counts, run_counts = Counter(), Counter()
+        block_rmap, run_rmap = defaultdict(list), defaultdict(list)
+        used_pairs = defaultdict(list)
+        heading_styles = defaultdict(list)
+        headings = frozenset('h1 h2 h3 h4 h5 h6'.split())
+        pure_block_styles = set()
+
+        for block in all_blocks:
+            bs = block.style
+            block_counts[bs] += 1
+            block_rmap[block.style].append(block)
+            local_run_counts = Counter()
+            for run in block.runs:
+                count = run.style_weight
+                run_counts[run.style] += count
+                local_run_counts[run.style] += count
+                run_rmap[run.style].append(run)
+            if local_run_counts:
+                rs = local_run_counts.most_common(1)[0][0]
+                used_pairs[(bs, rs)].append(block)
+                if block.html_tag in headings:
+                    heading_styles[block.html_tag].append((bs, rs))
+            else:
+                pure_block_styles.add(bs)
+
+        self.pure_block_styles = sorted(pure_block_styles, key=block_counts.__getitem__)
+        bnum = len(unicode_type(max(1, len(pure_block_styles) - 1)))
+        for i, bs in enumerate(self.pure_block_styles):
+            bs.id = bs.name = '%0{}d Block'.format(bnum) % i
+            bs.seq = i
+            if i == 0:
+                self.normal_pure_block_style = bs
+
+        counts = Counter()
+        smap = {}
+        for (bs, rs), blocks in iteritems(used_pairs):
+            s = CombinedStyle(bs, rs, blocks, self.namespace)
+            smap[(bs, rs)] = s
+            counts[s] += sum(1 for b in blocks if not b.is_empty())
+        for i, heading_tag in enumerate(sorted(heading_styles)):
+            styles = sorted((smap[k] for k in heading_styles[heading_tag]), key=counts.__getitem__)
+            styles = list(filter(lambda s:s.outline_level is None, styles))
+            if styles:
+                heading_style = styles[-1]
+                heading_style.outline_level = i
+
+        snum = len(unicode_type(max(1, len(counts) - 1)))
+        heading_styles = []
+        for i, (style, count) in enumerate(counts.most_common()):
+            if i == 0:
+                self.normal_style = style
+                style.id = style.name = 'Normal'
+            else:
+                if style.outline_level is None:
+                    val = 'Para %0{}d'.format(snum) % i
+                else:
+                    val = 'Heading %d' % (style.outline_level + 1)
+                    heading_styles.append(style)
+                style.id = style.name = val
+            style.seq = i
+        self.combined_styles = sorted(counts, key=attrgetter('seq'))
+        [ls.apply() for ls in self.combined_styles]
+
+        descendant_style_map = {}
+        ds_counts = Counter()
+        for block in all_blocks:
+            for run in block.runs:
+                if run.parent_style is not run.style and run.parent_style and run.style:
+                    ds = DescendantTextStyle(run.parent_style, run.style)
+                    if ds.properties:
+                        run.descendant_style = descendant_style_map.get(ds)
+                        if run.descendant_style is None:
+                            run.descendant_style = descendant_style_map[ds] = ds
+                        ds_counts[run.descendant_style] += run.style_weight
+        rnum = len(unicode_type(max(1, len(ds_counts) - 1)))
+        for i, (text_style, count) in enumerate(ds_counts.most_common()):
+            text_style.id = 'Text%d' % i
+            text_style.name = '%0{}d Text'.format(rnum) % i
+            text_style.seq = i
+        self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq'))
+
+        self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, (
+            self.descendant_text_styles, self.combined_styles))))
+
+        self.primary_heading_style = None
+        if heading_styles:
+            heading_styles.sort(key=attrgetter('outline_level'))
+            self.primary_heading_style = heading_styles[0]
+        else:
+            ms = 0
+            for s in self.combined_styles:
+                if s.rs.font_size > ms:
+                    self.primary_heading_style = s
+                    ms = s.rs.font_size
+
+    def serialize(self, styles):
+        lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
+        for k in tuple(lang.attrib):
+            lang.attrib[k] = self.document_lang
+        for style in self.combined_styles:
+            style.serialize(styles, self.normal_style)
+        for style in self.descendant_text_styles:
+            style.serialize(styles)
+        for style in sorted(self.pure_block_styles, key=attrgetter('seq')):
+            style.serialize(styles, self.normal_pure_block_style)
@@ -0,0 +1,371 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import namedtuple
+
+from calibre.ebooks.docx.writer.utils import convert_color
+from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges
+from polyglot.builtins import iteritems, range, unicode_type
+
+
+class Dummy(object):
+    pass
+
+
+Border = namedtuple('Border', 'css_style style width color level')
+border_style_weight = {
+    x:100-i for i, x in enumerate(('double', 'solid', 'dashed', 'dotted', 'ridge', 'outset', 'groove', 'inset'))}
+
+
+class SpannedCell(object):
+
+    def __init__(self, spanning_cell, horizontal=True):
+        self.spanning_cell = spanning_cell
+        self.horizontal = horizontal
+        self.row_span = self.col_span = 1
+
+    def resolve_borders(self):
+        pass
+
+    def serialize(self, tr, makeelement):
+        tc = makeelement(tr, 'w:tc')
+        tcPr = makeelement(tc, 'w:tcPr')
+        makeelement(tcPr, 'w:%sMerge' % ('h' if self.horizontal else 'v'), w_val='continue')
+        makeelement(tc, 'w:p')
+
+    def applicable_borders(self, edge):
+        return self.spanning_cell.applicable_borders(edge)
+
+
+def read_css_block_borders(self, css):
+    obj = Dummy()
+    rcbb(obj, css, store_css_style=True)
+    for edge in border_edges:
+        setattr(self, 'border_' + edge, Border(
+            getattr(obj, 'border_%s_css_style' % edge),
+            getattr(obj, 'border_%s_style' % edge),
+            getattr(obj, 'border_%s_width' % edge),
+            getattr(obj, 'border_%s_color' % edge),
+            self.BLEVEL
+        ))
+        setattr(self, 'padding_' + edge, getattr(obj, 'padding_' + edge))
+
+
+def as_percent(x):
+    if x and x.endswith('%'):
+        try:
+            return float(x.rstrip('%'))
+        except Exception:
+            pass
+
+
+def convert_width(tag_style):
+    if tag_style is not None:
+        w = tag_style._get('width')
+        wp = as_percent(w)
+        if w == 'auto':
+            return ('auto', 0)
+        elif wp is not None:
+            return ('pct', int(wp * 50))
+        else:
+            try:
+                return ('dxa', int(float(tag_style['width']) * 20))
+            except Exception:
+                pass
+    return ('auto', 0)
+
+
+class Cell(object):
+    '''
+    A table cell: holds the blocks and nested tables added to it, together
+    with the width, background, vertical alignment, spans, borders and padding
+    read from the HTML tag and its style.
+    '''
+
+    # Stored as the level of every Border read for a cell (Row uses 1,
+    # Table uses 0); it is the last tie-breaker when resolving borders
+    BLEVEL = 2
+
+    def __init__(self, row, html_tag, tag_style=None):
+        '''
+        :param row: The Row this cell belongs to
+        :param html_tag: The tag this cell was created for; its rowspan and
+                         colspan attributes are read
+        :param tag_style: The style of the tag, or None
+        '''
+        self.row = row
+        self.table = self.row.table
+        self.html_tag = html_tag
+        # Spans that are not integers fall back to 1, negative spans become 0
+        try:
+            self.row_span = max(0, int(html_tag.get('rowspan', 1)))
+        except Exception:
+            self.row_span = 1
+        try:
+            self.col_span = max(0, int(html_tag.get('colspan', 1)))
+        except Exception:
+            self.col_span = 1
+        if tag_style is None:
+            self.valign = 'center'
+        else:
+            # None for any vertical-align value other than top, bottom and middle
+            self.valign = {'top':'top', 'bottom':'bottom', 'middle':'center'}.get(tag_style._get('vertical-align'))
+        self.items = []
+        self.width = convert_width(tag_style)
+        self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
+        read_css_block_borders(self, tag_style)
+
+    def add_block(self, block):
+        ''' Append ``block`` to this cell and point its parent_items at the cell's item list '''
+        self.items.append(block)
+        block.parent_items = self.items
+
+    def add_table(self, table):
+        ''' Append a nested ``table`` to this cell and return it '''
+        self.items.append(table)
+        return table
+
+    def serialize(self, parent, makeelement):
+        '''
+        Serialize this cell as a w:tc child of ``parent``. The cell properties
+        are written in the order: width, shading, borders, margins, vertical
+        alignment, merge markers. resolve_borders() must have been called
+        before, as self.borders is read here.
+        '''
+        tc = makeelement(parent, 'w:tc')
+        tcPr = makeelement(tc, 'w:tcPr')
+        makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
+        # For some reason, Word 2007 refuses to honor <w:shd> at the table or row
+        # level, despite what the specs say, so we inherit and apply at the
+        # cell level
+        bc = self.background_color or self.row.background_color or self.row.table.background_color
+        if bc:
+            makeelement(tcPr, 'w:shd', w_val="clear", w_color="auto", w_fill=bc)
+
+        # The borders element is created detached and only attached if at
+        # least one visible border was written into it
+        b = makeelement(tcPr, 'w:tcBorders', append=False)
+        for edge, border in iteritems(self.borders):
+            if border is not None and border.width > 0 and border.style != 'none':
+                makeelement(b, 'w:' + edge, w_val=border.style, w_sz=unicode_type(border.width), w_color=border.color)
+        if len(b) > 0:
+            tcPr.append(b)
+
+        # Cell margins: the padding of the cell plus the padding of the row
+        # on the edges where the cell touches the edge of the row
+        m = makeelement(tcPr, 'w:tcMar', append=False)
+        for edge in border_edges:
+            padding = getattr(self, 'padding_' + edge)
+            if edge in {'top', 'bottom'} or (edge == 'left' and self is self.row.first_cell) or (edge == 'right' and self is self.row.last_cell):
+                padding += getattr(self.row, 'padding_' + edge)
+            if padding > 0:
+                # NOTE(review): the factor 20 presumably converts points to
+                # twentieths of a point (dxa) - confirm
+                makeelement(m, 'w:' + edge, w_type='dxa', w_w=unicode_type(int(padding * 20)))
+        if len(m) > 0:
+            tcPr.append(m)
+
+        if self.valign is not None:
+            makeelement(tcPr, 'w:vAlign', w_val=self.valign)
+
+        # This cell starts a merge; the continuation cells are SpannedCells
+        if self.row_span > 1:
+            makeelement(tcPr, 'w:vMerge', w_val='restart')
+        if self.col_span > 1:
+            makeelement(tcPr, 'w:hMerge', w_val='restart')
+
+        # After the loop, item is the last item of the cell (None if empty)
+        item = None
+        for item in self.items:
+            item.serialize(tc)
+        if item is None or isinstance(item, Table):
+            # Word 2007 requires the last element in a table cell to be a paragraph
+            makeelement(tc, 'w:p')
+
+    def applicable_borders(self, edge):
+        '''
+        Return the set of Border tuples that apply to ``edge`` of this cell:
+        always those of the cell itself, plus those of the row (always for
+        top and bottom, for left/right only for the first/last cell of the
+        row) and of the table (only where the cell lies on that edge of the
+        table).
+
+        :param edge: One of 'left', 'top', 'right', 'bottom'
+        '''
+        if edge == 'left':
+            items = {self.table, self.row, self} if self.row.first_cell is self else {self}
+        elif edge == 'top':
+            items = ({self.table} if self.table.first_row is self.row else set()) | {self, self.row}
+        elif edge == 'right':
+            items = {self.table, self, self.row} if self.row.last_cell is self else {self}
+        elif edge == 'bottom':
+            items = ({self.table} if self.table.last_row is self.row else set()) | {self, self.row}
+        return {getattr(x, 'border_' + edge) for x in items}
+
+    def resolve_border(self, edge):
+        '''
+        Pick the single border to use on ``edge`` from all the borders that
+        apply to it, including those on the facing edge of the neighboring
+        cell. Returns None if any of them is hidden.
+        '''
+        # In Word cell borders override table borders, and Word ignores row
+        # borders, so we consolidate all borders as cell borders
+        # In HTML the priority is as described here:
+        # http://www.w3.org/TR/CSS21/tables.html#border-conflict-resolution
+        neighbor = self.neighbor(edge)
+        borders = self.applicable_borders(edge)
+        if neighbor is not None:
+            nedge = {'left':'right', 'top':'bottom', 'right':'left', 'bottom':'top'}[edge]
+            borders |= neighbor.applicable_borders(nedge)
+
+        # A hidden border suppresses every other border on the edge
+        for b in borders:
+            if b.css_style == 'hidden':
+                return None
+
+        def weight(border):
+            # Compared in order: any style beats 'none', then the wider
+            # border, then the style weight, then the level
+            return (
+                0 if border.css_style == 'none' else 1,
+                border.width,
+                border_style_weight.get(border.css_style, 0),
+                border.level)
+        border = sorted(borders, key=weight)[-1]
+        return border
+
+    def resolve_borders(self):
+        ''' Resolve the border on every edge and store the results in self.borders '''
+        self.borders = {edge:self.resolve_border(edge) for edge in border_edges}
+
+    def neighbor(self, edge):
+        '''
+        Return the cell adjacent to this one across ``edge``, or None if
+        there is none. Vertical neighbors are found at the same index in the
+        previous/next row. For a SpannedCell, the cell that spans it is
+        returned instead.
+        '''
+        idx = self.row.cells.index(self)
+        ans = None
+        if edge == 'left':
+            ans = self.row.cells[idx-1] if idx > 0 else None
+        elif edge == 'right':
+            ans = self.row.cells[idx+1] if (idx + 1) < len(self.row.cells) else None
+        elif edge == 'top':
+            ridx = self.table.rows.index(self.row)
+            if ridx > 0 and idx < len(self.table.rows[ridx-1].cells):
+                ans = self.table.rows[ridx-1].cells[idx]
+        elif edge == 'bottom':
+            ridx = self.table.rows.index(self.row)
+            if ridx + 1 < len(self.table.rows) and idx < len(self.table.rows[ridx+1].cells):
+                ans = self.table.rows[ridx+1].cells[idx]
+        # Only SpannedCell has a spanning_cell attribute; everything else
+        # (including None) is returned unchanged
+        return getattr(ans, 'spanning_cell', ans)
+
+
+class Row(object):
+
+    BLEVEL = 1
+
+    def __init__(self, table, html_tag, tag_style=None):
+        self.table = table
+        self.html_tag = html_tag
+        self.orig_tag_style = tag_style
+        self.cells = []
+        self.current_cell = None
+        self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
+        read_css_block_borders(self, tag_style)
+
+    @property
+    def first_cell(self):
+        return self.cells[0] if self.cells else None
+
+    @property
+    def last_cell(self):
+        return self.cells[-1] if self.cells else None
+
+    def start_new_cell(self, html_tag, tag_style):
+        self.current_cell = Cell(self, html_tag, tag_style)
+
+    def finish_tag(self, html_tag):
+        if self.current_cell is not None:
+            if html_tag is self.current_cell.html_tag:
+                self.cells.append(self.current_cell)
+                self.current_cell = None
+
+    def add_block(self, block):
+        if self.current_cell is None:
+            self.start_new_cell(self.html_tag, self.orig_tag_style)
+        self.current_cell.add_block(block)
+
+    def add_table(self, table):
+        if self.current_cell is None:
+            self.current_cell = Cell(self, self.html_tag, self.orig_tag_style)
+        return self.current_cell.add_table(table)
+
+    def serialize(self, parent, makeelement):
+        tr = makeelement(parent, 'w:tr')
+        for cell in self.cells:
+            cell.serialize(tr, makeelement)
+
+
+class Table(object):
+
+    BLEVEL = 0
+
+    def __init__(self, namespace, html_tag, tag_style=None):
+        self.namespace = namespace
+        self.html_tag = html_tag
+        self.orig_tag_style = tag_style
+        self.rows = []
+        self.current_row = None
+        self.width = convert_width(tag_style)
+        self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
+        self.jc = None
+        self.float = None
+        self.margin_left = self.margin_right = self.margin_top = self.margin_bottom = None
+        if tag_style is not None:
+            ml, mr = tag_style._get('margin-left'), tag_style.get('margin-right')
+            if ml == 'auto':
+                self.jc = 'center' if mr == 'auto' else 'right'
+            self.float = tag_style['float']
+            for edge in border_edges:
+                setattr(self, 'margin_' + edge, tag_style['margin-' + edge])
+        read_css_block_borders(self, tag_style)
+
+    @property
+    def first_row(self):
+        return self.rows[0] if self.rows else None
+
+    @property
+    def last_row(self):
+        return self.rows[-1] if self.rows else None
+
+    def finish_tag(self, html_tag):
+        if self.current_row is not None:
+            self.current_row.finish_tag(html_tag)
+            if self.current_row.html_tag is html_tag:
+                self.rows.append(self.current_row)
+                self.current_row = None
+        table_ended = self.html_tag is html_tag
+        if table_ended:
+            self.expand_spanned_cells()
+            for row in self.rows:
+                for cell in row.cells:
+                    cell.resolve_borders()
+        return table_ended
+
+    def expand_spanned_cells(self):
+        # Expand horizontally
+        for row in self.rows:
+            for cell in tuple(row.cells):
+                idx = row.cells.index(cell)
+                if cell.col_span > 1 and (cell is row.cells[-1] or not isinstance(row.cells[idx+1], SpannedCell)):
+                    row.cells[idx:idx+1] = [cell] + [SpannedCell(cell, horizontal=True) for i in range(1, cell.col_span)]
+
+        # Expand vertically
+        for r, row in enumerate(self.rows):
+            for idx, cell in enumerate(row.cells):
+                if cell.row_span > 1:
+                    for nrow in self.rows[r+1:]:
+                        sc = SpannedCell(cell, horizontal=False)
+                        try:
+                            tcell = nrow.cells[idx]
+                        except Exception:
+                            tcell = None
+                        if tcell is None:
+                            nrow.cells.extend([SpannedCell(nrow.cells[-1], horizontal=True) for i in range(idx - len(nrow.cells))])
+                            nrow.cells.append(sc)
+                        else:
+                            if isinstance(tcell, SpannedCell):
+                                # Conflict between rowspan and colspan
+                                break
+                            else:
+                                nrow.cells.insert(idx, sc)
+
+    def start_new_row(self, html_tag, html_style):
+        if self.current_row is not None:
+            self.rows.append(self.current_row)
+        self.current_row = Row(self, html_tag, html_style)
+
+    def start_new_cell(self, html_tag, html_style):
+        if self.current_row is None:
+            self.start_new_row(html_tag, None)
+        self.current_row.start_new_cell(html_tag, html_style)
+
+    def add_block(self, block):
+        self.current_row.add_block(block)
+
+    def add_table(self, table):
+        if self.current_row is None:
+            self.current_row = Row(self, self.html_tag, self.orig_tag_style)
+        return self.current_row.add_table(table)
+
+    def serialize(self, parent):
+        makeelement = self.namespace.makeelement
+        rows = [r for r in self.rows if r.cells]
+        if not rows:
+            return
+        tbl = makeelement(parent, 'w:tbl')
+        tblPr = makeelement(tbl, 'w:tblPr')
+        makeelement(tblPr, 'w:tblW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
+        if self.float in {'left', 'right'}:
+            kw = {'w_vertAnchor':'text', 'w_horzAnchor':'text', 'w_tblpXSpec':self.float}
+            for edge in border_edges:
+                val = getattr(self, 'margin_' + edge) or 0
+                if {self.float, edge} == {'left', 'right'}:
+                    val = max(val, 2)
+                kw['w_' + edge + 'FromText'] = unicode_type(max(0, int(val *20)))
+            makeelement(tblPr, 'w:tblpPr', **kw)
+        if self.jc is not None:
+            makeelement(tblPr, 'w:jc', w_val=self.jc)
+        for row in rows:
+            row.serialize(tbl, makeelement)
@@ -0,0 +1,58 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from tinycss.color3 import parse_color_string
+
+
+def int_or_zero(raw):
+    try:
+        return int(raw)
+    except (ValueError, TypeError, AttributeError):
+        return 0
+
+# convert_color() {{{
+
+
+def convert_color(value):
+    if not value:
+        return
+    if value.lower() == 'currentcolor':
+        return 'auto'
+    val = parse_color_string(value)
+    if val is None:
+        return
+    if val.alpha < 0.01:
+        return
+    return '%02X%02X%02X' % (int(val.red * 255), int(val.green * 255), int(val.blue * 255))
+
+
+def test_convert_color(return_tests=False):
+    import unittest
+
+    class TestColors(unittest.TestCase):
+
+        def test_color_conversion(self):
+            ae = self.assertEqual
+            cc = convert_color
+            ae(None, cc(None))
+            ae(None, cc('transparent'))
+            ae(None, cc('none'))
+            ae(None, cc('#12j456'))
+            ae('auto', cc('currentColor'))
+            ae('F0F8FF', cc('AliceBlue'))
+            ae('000000', cc('black'))
+            ae('FF0000', cc('red'))
+            ae('00FF00', cc('lime'))
+            ae(cc('#001'), '000011')
+            ae('12345D', cc('#12345d'))
+            ae('FFFFFF', cc('rgb(255, 255, 255)'))
+            ae('FF0000', cc('rgba(255, 0, 0, 23)'))
+    tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestColors)
+    if return_tests:
+        return tests
+    unittest.TextTestRunner(verbosity=4).run(tests)
+# }}}
@@ -0,0 +1,316 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import defaultdict
+
+from calibre.ebooks.oeb.base import urlnormalize, css_text
+from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
+from polyglot.builtins import iteritems, itervalues, unicode_type, range
+from tinycss.fonts3 import parse_font_family
+
+
+def get_font_properties(rule, default=None):
+    '''
+    Given a CSS rule, extract normalized font properties from
+    it. Note that shorthand font property should already have been expanded
+    by the CSS flattening code.
+    '''
+    props = {}
+    s = rule.style
+    for q in ('font-family', 'src', 'font-weight', 'font-stretch',
+            'font-style'):
+        g = 'uri' if q == 'src' else 'value'
+        try:
+            val = s.getProperty(q).propertyValue[0]
+            val = getattr(val, g)
+            if q == 'font-family':
+                val = parse_font_family(css_text(s.getProperty(q).propertyValue))
+                if val and val[0] == 'inherit':
+                    val = None
+        except (IndexError, KeyError, AttributeError, TypeError, ValueError):
+            val = None if q in {'src', 'font-family'} else default
+        if q in {'font-weight', 'font-stretch', 'font-style'}:
+            val = unicode_type(val).lower() if (val or val == 0) else val
+            if val == 'inherit':
+                val = default
+        if q == 'font-weight':
+            val = {'normal':'400', 'bold':'700'}.get(val, val)
+            if val not in {'100', '200', '300', '400', '500', '600', '700',
+                    '800', '900', 'bolder', 'lighter'}:
+                val = default
+            if val == 'normal':
+                val = '400'
+        elif q == 'font-style':
+            if val not in {'normal', 'italic', 'oblique'}:
+                val = default
+        elif q == 'font-stretch':
+            if val not in {'normal', 'ultra-condensed', 'extra-condensed',
+                    'condensed', 'semi-condensed', 'semi-expanded',
+                    'expanded', 'extra-expanded', 'ultra-expanded'}:
+                val = default
+        props[q] = val
+    return props
+
+
+def find_font_face_rules(sheet, oeb):
+    '''
+    Find all @font-face rules in the given sheet and extract the relevant info from them.
+    sheet can be either a ManifestItem or a CSSStyleSheet.
+    '''
+    ans = []
+    try:
+        rules = sheet.data.cssRules
+    except AttributeError:
+        rules = sheet.cssRules
+
+    for i, rule in enumerate(rules):
+        if rule.type != rule.FONT_FACE_RULE:
+            continue
+        props = get_font_properties(rule, default='normal')
+        if not props['font-family'] or not props['src']:
+            continue
+
+        try:
+            path = sheet.abshref(props['src'])
+        except AttributeError:
+            path = props['src']
+        ff = oeb.manifest.hrefs.get(urlnormalize(path), None)
+        if not ff:
+            continue
+        props['item'] = ff
+        if props['font-weight'] in {'bolder', 'lighter'}:
+            props['font-weight'] = '400'
+        props['weight'] = int(props['font-weight'])
+        props['rule'] = rule
+        props['chars'] = set()
+        ans.append(props)
+
+    return ans
+
+
+def elem_style(style_rules, cls, inherited_style):
+    '''
+    Find the effective style for the given element.
+    '''
+    classes = cls.split()
+    style = inherited_style.copy()
+    for cls in classes:
+        style.update(style_rules.get(cls, {}))
+    wt = style.get('font-weight', None)
+    pwt = inherited_style.get('font-weight', '400')
+    if wt == 'bolder':
+        style['font-weight'] = {
+                '100':'400',
+                '200':'400',
+                '300':'400',
+                '400':'700',
+                '500':'700',
+                }.get(pwt, '900')
+    elif wt == 'lighter':
+        style['font-weight'] = {
+                '600':'400', '700':'400',
+                '800':'700', '900':'700'}.get(pwt, '100')
+
+    return style
+
+
+class SubsetFonts(object):
+
+    '''
+    Subset all embedded fonts. Must be run after CSS flattening, as it requires
+    CSS normalization and flattening to work.
+    '''
+
+    def __call__(self, oeb, log, opts):
+        '''
+        Subset every embedded font in ``oeb`` to the characters that use it.
+
+        Fonts without used characters or glyphs are removed from the manifest
+        together with their @font-face rule, fonts that cannot be subset are
+        left unchanged, and the data of all other fonts is replaced with the
+        subset font.
+
+        :param oeb: The book to process
+        :param log: Logger used to report what was done to each font
+        :param opts: Conversion options, stored as self.opts
+        '''
+        self.oeb, self.log, self.opts = oeb, log, opts
+
+        self.find_embedded_fonts()
+        if not self.embedded_fonts:
+            self.log.debug('No embedded fonts found')
+            return
+        self.find_style_rules()
+        self.find_font_usage()
+
+        # [size after subsetting, original size]; a list so that the nested
+        # function below can update it
+        totals = [0, 0]
+
+        def remove(font):
+            # Count the whole font as removed size, then drop both the font
+            # file and the @font-face rule that refers to it
+            totals[1] += len(font['item'].data)
+            self.oeb.manifest.remove(font['item'])
+            font['rule'].parentStyleSheet.deleteRule(font['rule'])
+
+        # Several rules can refer to the same font file: keep the first entry
+        # per file and merge the used characters of the others into it
+        fonts = {}
+        for font in self.embedded_fonts:
+            item, chars = font['item'], font['chars']
+            if item.href in fonts:
+                fonts[item.href]['chars'] |= chars
+            else:
+                fonts[item.href] = font
+
+        for font in itervalues(fonts):
+            if not font['chars']:
+                self.log('The font %s is unused. Removing it.'%font['src'])
+                remove(font)
+                continue
+            try:
+                raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
+            except NoGlyphs:
+                self.log('The font %s has no used glyphs. Removing it.'%font['src'])
+                remove(font)
+                continue
+            except UnsupportedFont as e:
+                self.log.warn('The font %s is unsupported for subsetting. %s'%(
+                    font['src'], e))
+                # The font is kept as it is, so it counts fully on both sides
+                sz = len(font['item'].data)
+                totals[0] += sz
+                totals[1] += sz
+            else:
+                font['item'].data = raw
+                nlen = sum(itervalues(new_stats))
+                olen = sum(itervalues(old_stats))
+                self.log('Decreased the font %s to %.1f%% of its original size'%
+                        (font['src'], nlen/olen *100))
+                totals[0] += nlen
+                totals[1] += olen
+
+            # Reached only for fonts that were kept (removed fonts continue above)
+            font['item'].unload_data_from_memory()
+
+        # Nothing is reported when no font data remains
+        if totals[0]:
+            self.log('Reduced total font size to %.1f%% of original'%
+                    (totals[0]/totals[1] * 100))
+
+    def find_embedded_fonts(self):
+        '''
+        Collect the @font-face information of every stylesheet in the
+        manifest into ``self.embedded_fonts``.
+
+        Manifest items whose data has no ``cssRules`` attribute are ignored;
+        for the others the entries produced by ``find_font_face_rules`` are
+        appended in manifest order.
+        '''
+        self.embedded_fonts = found = []
+        for item in self.oeb.manifest:
+            if hasattr(item.data, 'cssRules'):
+                found.extend(find_font_face_rules(item, self.oeb))
+
+    def find_style_rules(self):
+        '''
+        Build ``self.style_rules``: a plain dict mapping a class name to the
+        non-empty font properties that style rules using that class specify.
+
+        Only selectors that start with ``.`` are considered; the code relies
+        on the CSS having been flattened beforehand, so each rule is expected
+        to be a simple class selector.
+        '''
+        rules = defaultdict(dict)
+        for item in self.oeb.manifest:
+            if not hasattr(item.data, 'cssRules'):
+                continue
+            for rule in item.data.cssRules:
+                if rule.type != rule.STYLE_RULE:
+                    continue
+                props = {}
+                for name, val in iteritems(get_font_properties(rule)):
+                    if val:
+                        props[name] = val
+                if not props:
+                    continue
+                for selector in rule.selectorList:
+                    text = selector.selectorText
+                    if not text or not text.startswith('.'):
+                        continue
+                    # Pseudo-selectors are simply stripped: the worst that
+                    # can happen is that a few extra characters remain in
+                    # the font
+                    class_name = text.partition(':')[0][1:]
+                    rules[class_name].update(props)
+
+        self.style_rules = dict(rules)
+
+    def find_font_usage(self):
+        '''
+        Walk every ``body`` element of every manifest item whose data
+        supports ``xpath`` and record font usage via ``find_usage_in``,
+        starting each body from a fresh default style (serif, weight 400,
+        normal style and stretch).
+        '''
+        body_query = '//*[local-name()="body"]'
+        for item in self.oeb.manifest:
+            if hasattr(item.data, 'xpath'):
+                for body in item.data.xpath(body_query):
+                    # A new dict per body so no state leaks between documents
+                    defaults = {'font-family':['serif'], 'font-weight': '400',
+                                'font-style':'normal', 'font-stretch':'normal'}
+                    self.find_usage_in(body, defaults)
+
+    def used_font(self, style):
+        '''
+        Given a style find the embedded font that matches it. Returns None if
+        no match is found (can happen if no family matches, or if no candidate
+        has one of the weights that are tried).
+
+        The candidates in ``self.embedded_fonts`` are narrowed down in four
+        steps: font-family, font-stretch, font-style and finally font-weight.
+
+        :param style: dict of font properties. ``font-family`` is an iterable
+            of family names, ``font-weight`` must be convertible with
+            ``int()``; missing keys default to normal/400.
+        :return: one of the dicts from ``self.embedded_fonts`` or None.
+
+        As a side effect every family-matching font dict gets a ``width`` key
+        holding the numeric rank of its own font-stretch value.
+        NOTE(review): each font dict is expected to already carry an integer
+        ``weight`` key - presumably set where the @font-face rules are
+        parsed; confirm against that code.
+        '''
+        ff = style.get('font-family', [])
+        lnames = {unicode_type(x).lower() for x in ff}
+
+        # Filter on font-family
+        matching_set = []
+        for ef in self.embedded_fonts:
+            flnames = {x.lower() for x in ef.get('font-family', [])}
+            if lnames.intersection(flnames):
+                matching_set.append(ef)
+        if not matching_set:
+            return None
+
+        # Filter on font-stretch
+        widths = {x:i for i, x in enumerate(('ultra-condensed',
+                'extra-condensed', 'condensed', 'semi-condensed', 'normal',
+                'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
+                ))}
+        normal = widths['normal']
+
+        # Unknown keywords are treated as normal instead of raising KeyError
+        width = widths.get(style.get('font-stretch', 'normal'), normal)
+        for f in matching_set:
+            # The width of a candidate comes from the font itself, not from
+            # the requested style, otherwise every distance below is zero
+            f['width'] = widths.get(f.get('font-stretch', 'normal'), normal)
+
+        min_dist = min(abs(width-f['width']) for f in matching_set)
+        nearest = [f for f in matching_set if abs(width-f['width']) ==
+            min_dist]
+        if width <= normal:
+            # normal or condensed request: prefer narrower faces
+            lmatches = [f for f in nearest if f['width'] <= width]
+        else:
+            # expanded request: prefer wider faces
+            lmatches = [f for f in nearest if f['width'] >= width]
+        matching_set = (lmatches or nearest)
+
+        # Filter on font-style
+        fs = style.get('font-style', 'normal')
+        order = {
+                'oblique':['oblique', 'italic', 'normal'],
+                'normal':['normal', 'oblique', 'italic']
+            }.get(fs, ['italic', 'oblique', 'normal'])
+        for q in order:
+            matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
+            if matches:
+                matching_set = matches
+                break
+
+        # Filter on font weight
+        fw = int(style.get('font-weight', '400'))
+        lighter = list(range(fw-100, 0, -100))    # fw-100 down to 100
+        heavier = list(range(fw+100, 1000, 100))  # fw+100 up to 900
+        if fw == 400:
+            q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
+        elif fw == 500:
+            q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
+        elif fw < 400:
+            q = [fw] + lighter + heavier
+        else:
+            q = [fw] + heavier + lighter
+        for wt in q:
+            matches = [f for f in matching_set if f['weight'] == wt]
+            if matches:
+                return matches[0]
+        return None
+
+    def find_chars(self, elem):
+        '''
+        Return the set of characters that belong directly to ``elem``: its
+        leading text plus the tail text of each of its children. Text inside
+        the children themselves is not included.
+        '''
+        chars = set(elem.text or ())
+        for child in elem:
+            chars.update(child.tail or ())
+        return chars
+
+    def find_usage_in(self, elem, inherited_style):
+        '''
+        Recursively record which characters each embedded font has to render.
+
+        The style of ``elem`` is computed from its class attribute and
+        ``inherited_style``; children are processed first (inheriting that
+        style), then the characters owned directly by ``elem`` are added to
+        the ``chars`` set of the font matching its style, if there is one.
+        '''
+        style = elem_style(self.style_rules, elem.get('class', '') or '', inherited_style)
+        for child in elem:
+            self.find_usage_in(child, style)
+        font = self.used_font(style)
+        if not font:
+            return
+        chars = self.find_chars(elem)
+        if chars:
+            font['chars'] |= chars
@@ -0,0 +1,10 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
@@ -0,0 +1,247 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import codecs, zlib, numbers
+from io import BytesIO
+from datetime import datetime
+
+from calibre.constants import plugins, ispy3
+from calibre.utils.logging import default_log
+from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr
+from polyglot.binary import as_hex_bytes
+
+# Float formatter taken from the first element of the 'speedup' plugin entry;
+# used below to turn Python floats into their PDF text form
+pdf_float = plugins['speedup'][0].pdf_float
+
+# Line terminator written between serialized PDF tokens in this module
+EOL = b'\n'
+
+# Sizes {{{
+# All lengths are expressed in units in which one inch is 72.0
+inch = 72.0
+cm = inch / 2.54
+mm = cm * 0.1
+pica = 12.0
+didot = 0.375 * mm
+cicero = 12 * didot
+
+# Every paper size is a (width, height) tuple in portrait orientation. Each
+# series is derived from its size 4 sheet: going up one size doubles the
+# shorter side, going down one size halves the longer side.
+_W, _H = (21*cm, 29.7*cm)
+
+A6 = (_W*.5, _H*.5)
+A5 = (_H*.5, _W)
+A4 = (_W, _H)
+A3 = (_H, _W*2)
+A2 = (_W*2, _H*2)
+A1 = (_H*2, _W*4)
+A0 = (_W*4, _H*4)
+
+LETTER = (8.5*inch, 11*inch)
+LEGAL = (8.5*inch, 14*inch)
+ELEVENSEVENTEEN = (11*inch, 17*inch)
+
+_BW, _BH = (25*cm, 35.3*cm)
+B6 = (_BW*.5, _BH*.5)
+B5 = (_BH*.5, _BW)
+B4 = (_BW, _BH)
+B3 = (_BH, _BW*2)
+B2 = (_BW*2, _BH*2)
+B1 = (_BH*2, _BW*4)
+B0 = (_BW*4, _BH*4)
+
+# Lookup table from lower case size name to its (width, height) tuple
+PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2'
+               ' b3 b4 b5 b6 letter legal').split()}
+
+# }}}
+
+
+def fmtnum(o):
+    '''
+    Return the text form of the number ``o``: floats are formatted with
+    ``pdf_float``, anything else is converted with ``unicode_type``.
+    '''
+    return pdf_float(o) if isinstance(o, float) else unicode_type(o)
+
+
+def serialize(o, stream):
+    '''
+    Write the PDF representation of ``o`` to ``stream``.
+
+    Supported values are floats, bools, integers, ``None``, datetimes and any
+    object providing a ``pdf_serialize(stream)`` method.
+
+    :param o: the object to serialize.
+    :param stream: output object providing ``write`` and ``write_raw``.
+    :raises ValueError: if ``o`` is of an unsupported type.
+    '''
+    if isinstance(o, float):
+        stream.write_raw(pdf_float(o).encode('ascii'))
+    elif isinstance(o, bool):
+        # Must check bool before int as bools are subclasses of int
+        stream.write_raw(b'true' if o else b'false')
+    elif isinstance(o, numbers.Integral):
+        stream.write_raw(unicode_type(o).encode('ascii') if ispy3 else bytes(o))
+    elif hasattr(o, 'pdf_serialize'):
+        o.pdf_serialize(stream)
+    elif o is None:
+        stream.write_raw(b'null')
+    elif isinstance(o, datetime):
+        # PDF date string: (D:YYYYMMDDHHmmSSOHH'mm')
+        val = o.strftime("D:%Y%m%d%H%M%%02d")%min(59, o.second)
+        # Ask the instance for its offset: the datetime class attribute
+        # ``tzinfo`` is never None, while naive instances have no offset and
+        # must be written without the timezone suffix
+        offset = o.utcoffset()
+        if offset is None:
+            val = "(%s)"%val
+        else:
+            minutes = int(offset.total_seconds() / 60)
+            sign = '-' if minutes < 0 else '+'
+            val = "(%s%s%02d'%02d')"%((val, sign) + divmod(abs(minutes), 60))
+        stream.write(val.encode('ascii'))
+    else:
+        raise ValueError('Unknown object: %r'%o)
+
+
+class Name(unicode_type):
+    '''
+    A PDF name object, serialized as ``/`` followed by the name.
+    '''
+
+    def pdf_serialize(self, stream):
+        '''
+        Write this name to ``stream``.
+
+        Bytes strictly between 33 and 126 that are neither ``#`` nor a PDF
+        delimiter are written as they are, every other byte is written as
+        ``#`` followed by exactly two hexadecimal digits.
+
+        :raises UnicodeEncodeError: if the name contains non ASCII characters.
+        :raises ValueError: if the name is longer than 126 bytes.
+        '''
+        raw = self.encode('ascii')
+        if len(raw) > 126:
+            raise ValueError('Name too long: %r'%self)
+        # '#' introduces an escape and the delimiters would end the name
+        # token, so none of them may appear unescaped
+        special = frozenset(bytearray(b'#()<>[]{}/%'))
+        buf = bytearray(b'/')
+        for x in bytearray(raw):
+            if 33 < x < 126 and x not in special:
+                buf.append(x)
+            else:
+                # Always two hex digits, as a reader consumes exactly two
+                buf.extend('#{:02x}'.format(x).encode('ascii'))
+        stream.write(bytes(buf))
+
+
+def escape_pdf_string(bytestring):
+    '''
+    Escape ``bytestring`` so that it can be placed between the parentheses of
+    a PDF literal string.
+
+    Backslashes and the control characters LF, CR, FF, BS and TAB are
+    replaced by their two byte escape sequences. Parentheses are escaped only
+    when they are unbalanced, balanced pairs are left alone.
+
+    :param bytestring: the raw bytes of the string.
+    :return: the escaped bytes; the input object itself when nothing needs
+        escaping.
+    '''
+    indices = []  # positions of ( that have not been closed so far
+    bad = []      # (position, replacement byte) for everything to escape
+    ba = bytearray(bytestring)
+    bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('t'), 92:ord('\\')}
+    for i, num in enumerate(ba):
+        if num == 40:  # (
+            indices.append((i, 40))
+        elif num == 41:  # )
+            if indices:
+                indices.pop()
+            else:
+                bad.append((i, 41))
+        elif num in bad_map:  # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
+            bad.append((i, bad_map[num]))
+    # Whatever is left in indices is an unmatched (. Replace from the end so
+    # that the positions still to be processed stay valid.
+    bad = sorted(indices + bad, reverse=True)
+    if not bad:
+        return bytestring
+    for i, repl in bad:
+        ba[i:i+1] = (92, repl)  # 92 = ord('\')
+    return bytes(ba)
+
+
+class String(unicode_type):
+    '''
+    A PDF literal string. It is stored as latin1 when possible and as UTF-16
+    (big endian, with BOM) otherwise.
+    '''
+
+    def pdf_serialize(self, stream):
+        bom = codecs.BOM_UTF16_BE
+        try:
+            raw = self.encode('latin1')
+        except UnicodeEncodeError:
+            use_utf16 = True
+        else:
+            # latin1 bytes that happen to start with the BOM would be read
+            # back as UTF-16, so such strings have to be stored as UTF-16 too
+            use_utf16 = raw.startswith(bom)
+        if use_utf16:
+            raw = bom + self.encode('utf-16-be')
+        stream.write(b'('+escape_pdf_string(raw)+b')')
+
+
+class UTF16String(unicode_type):
+    '''
+    A PDF string that is always written as UTF-16 (big endian, with BOM).
+    '''
+
+    def pdf_serialize(self, stream):
+        # The hex string form is disabled as the parentheses based strings
+        # give easier to debug PDF files
+        use_hex_form = False
+        payload = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
+        if use_hex_form:
+            stream.write(b'<' + as_hex_bytes(payload) + b'>')
+        else:
+            stream.write(b'('+escape_pdf_string(payload)+b')')
+
+
+class Dictionary(dict):
+    '''
+    A PDF dictionary, written with one key/value pair per line.
+    '''
+
+    def pdf_serialize(self, stream):
+        # Type sorts first and Subtype second, all remaining keys follow in
+        # the order of the doubled key text
+        priority = {'Type':'1', 'Subtype':'2'}
+
+        def sort_key(name):
+            return priority.get(name, name) + name
+
+        stream.write(b'<<' + EOL)
+        for name in sorted(self, key=sort_key):
+            serialize(Name(name), stream)
+            stream.write(b' ')
+            serialize(self[name], stream)
+            stream.write(EOL)
+        stream.write(b'>>' + EOL)
+
+
+class InlineDictionary(Dictionary):
+    '''
+    A PDF dictionary written on a single line, with the entries in the
+    iteration order of the dict (no sorting is done).
+    '''
+
+    def pdf_serialize(self, stream):
+        space = b' '
+        stream.write(b'<< ')
+        for key, value in iteritems(self):
+            serialize(Name(key), stream)
+            stream.write(space)
+            serialize(value, stream)
+            stream.write(space)
+        stream.write(b'>>')
+
+
+class Array(list):
+    '''
+    A PDF array: the serialized members separated by single spaces and
+    enclosed in square brackets.
+    '''
+
+    def pdf_serialize(self, stream):
+        stream.write(b'[')
+        need_separator = False
+        for member in self:
+            if need_separator:
+                stream.write(b' ')
+            serialize(member, stream)
+            need_separator = True
+        stream.write(b']')
+
+
+class Stream(BytesIO):
+    '''
+    A PDF stream object. Data is accumulated in memory with the write
+    methods and emitted, together with its stream dictionary, by
+    :meth:`pdf_serialize`.
+
+    :param compress: if True the data is zlib compressed when serialized.
+    '''
+
+    def __init__(self, compress=False):
+        BytesIO.__init__(self)
+        self.compress = compress
+        # Names of the filters describing the data already in the buffer
+        self.filters = Array()
+
+    def add_extra_keys(self, d):
+        ''' Hook to add entries to the stream dictionary ``d``. Does nothing
+        here. '''
+        pass
+
+    def pdf_serialize(self, stream):
+        '''
+        Write the stream dictionary followed by the (optionally compressed)
+        data to ``stream``. Can be called repeatedly, ``self.filters`` is
+        never modified.
+        '''
+        raw = self.getvalue()
+        dl = len(raw)  # length of the data before compression
+        # Work on a copy so that serializing twice does not accumulate
+        # FlateDecode entries in self.filters
+        filters = Array(self.filters)
+        if self.compress:
+            # Filters are listed in decoding order and the compression is
+            # applied last, so it has to be undone first
+            filters.insert(0, Name('FlateDecode'))
+            raw = zlib.compress(raw)
+
+        d = InlineDictionary({'Length':len(raw), 'DL':dl})
+        self.add_extra_keys(d)
+        if filters:
+            d['Filter'] = filters
+        serialize(d, stream)
+        stream.write(EOL+b'stream'+EOL)
+        stream.write(raw)
+        stream.write(EOL+b'endstream'+EOL)
+
+    def write_line(self, raw=b''):
+        ''' Write ``raw`` (bytes or ASCII only text) followed by EOL. '''
+        self.write(raw if isinstance(raw, bytes) else raw.encode('ascii'))
+        self.write(EOL)
+
+    def write(self, raw):
+        ''' Write ``raw``, text is encoded as ASCII first. '''
+        super(Stream, self).write(raw if isinstance(raw, bytes) else
+                                  raw.encode('ascii'))
+
+    def write_raw(self, raw):
+        ''' Write the bytes ``raw`` without any type check or encoding. '''
+        BytesIO.write(self, raw)
+
+
+class Reference(object):
+    '''
+    An indirect reference to the PDF object ``obj`` that has the object
+    number ``num``. It is written as ``<num> 0 R``.
+    '''
+
+    def __init__(self, num, obj):
+        self.num = num
+        self.obj = obj
+
+    def __repr__(self):
+        return '%d 0 R'%self.num
+
+    def __str__(self):
+        return repr(self)
+
+    def pdf_serialize(self, stream):
+        stream.write(('%d 0 R'%self.num).encode('ascii'))
+# }}}
+
+
+def current_log(newlog=None):
+    '''
+    Return the log object currently in use, optionally replacing it first.
+
+    :param newlog: if truthy, it becomes the current log.
+    :return: the stored log, or ``default_log`` while none has been set.
+    '''
+    if newlog:
+        current_log.ans = newlog
+    active = current_log.ans
+    return active if active else default_log
+
+
+# The current log is kept as an attribute of the function itself
+current_log.ans = None