Initial import

2026-04-06 21:13:34 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/docx/init.py
+++ b/ebook_converter/ebooks/docx/init.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class InvalidDOCX(ValueError):
+    ''' Error type for DOCX input that cannot be processed; a ValueError subclass. '''
+
--- a/ebook_converter/ebooks/docx/block_styles.py
+++ b/ebook_converter/ebooks/docx/block_styles.py
@@ -0,0 +1,478 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import numbers
+from collections import OrderedDict
+from polyglot.builtins import iteritems
+
+
+class Inherit(object):
+    ''' Sentinel type meaning "no value set here, take it from the parent style".
+
+    An instance is equal only to itself, hashes by identity and orders as
+    greater than any other object. '''
+
+    def __eq__(self, other):
+        return self is other
+
+    def __hash__(self):
+        return id(self)
+
+    def __lt__(self, other):
+        # Never smaller than anything
+        return False
+
+    def __le__(self, other):
+        return self is other
+
+    def __gt__(self, other):
+        return self is not other
+
+    def __ge__(self, other):
+        # Equal to itself, greater than everything else
+        return True
+
+
+# The shared sentinel instance used throughout the style code
+inherit = Inherit()
+
+
+def binary_property(parent, name, XPath, get):
+    ''' Read the on/off child element w:<name> of parent.
+
+    Returns inherit when the element is absent, otherwise True if its w:val
+    attribute (defaulting to 'on') is one of 'on', '1' or 'true' and False
+    for any other value. Only the first matching element is considered. '''
+    matches = XPath('./w:%s' % name)(parent)
+    if not matches:
+        return inherit
+    return get(matches[0], 'w:val', 'on') in {'on', '1', 'true'}
+
+
+def simple_color(col, auto='black'):
+    ''' Convert a six character color value to '#' + value.
+
+    Empty values, the literal 'auto' and anything that is not exactly six
+    characters long yield the auto fallback instead. '''
+    if col and col != 'auto' and len(col) == 6:
+        return '#' + col
+    return auto
+
+
+def simple_float(val, mult=1.0):
+    ''' Return float(val) * mult, or None when val cannot be converted. '''
+    try:
+        number = float(val)
+    except (ValueError, TypeError, AttributeError, KeyError):
+        return None
+    return number * mult
+
+
+def twips(val, mult=0.05):
+    ''' Parse val as either a pure number representing twentieths of a point
+    or a number followed by the suffix pt, representing pts. Returns the
+    value in pts, or None if val cannot be parsed. '''
+    try:
+        return float(val) * mult
+    except (ValueError, TypeError, AttributeError, KeyError):
+        pass
+    # The pt suffix is only honoured on the outermost call (default
+    # multiplier), so a malformed value cannot recurse more than once.
+    if val and val.endswith('pt') and mult == 0.05:
+        return twips(val[:-2], mult=1.0)
+    return None
+
+
+# Maps the w:val of a border element to the CSS border-style used for it
+LINE_STYLES = {  # {{{
+    'basicBlackDashes': 'dashed',
+    'basicBlackDots': 'dotted',
+    'basicBlackSquares': 'dashed',
+    'basicThinLines': 'solid',
+    'dashDotStroked': 'groove',
+    'dashed': 'dashed',
+    'dashSmallGap': 'dashed',
+    'dotDash': 'dashed',
+    'dotDotDash': 'dashed',
+    'dotted': 'dotted',
+    'double': 'double',
+    'inset': 'inset',
+    'nil': 'none',
+    'none': 'none',
+    'outset': 'outset',
+    'single': 'solid',
+    'thick': 'solid',
+    'thickThinLargeGap': 'double',
+    'thickThinMediumGap': 'double',
+    'thickThinSmallGap': 'double',
+    'thinThickLargeGap': 'double',
+    'thinThickMediumGap': 'double',
+    'thinThickSmallGap': 'double',
+    'thinThickThinLargeGap': 'double',
+    'thinThickThinMediumGap': 'double',
+    'thinThickThinSmallGap': 'double',
+    'threeDEmboss': 'ridge',
+    'threeDEngrave': 'groove',
+    'triple': 'double',
+}  # }}}
+
+# Read from XML {{{
+
+# Attribute name templates for a single border edge; the %s is replaced by an
+# edge name from border_edges. read_single_border() pairs them, in this order,
+# with (padding, width, style, color).
+border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
+# Border edges read for paragraphs. 'between' is kept last: code that handles
+# only the four box edges uses border_edges[:-1].
+border_edges = ('left', 'top', 'right', 'bottom', 'between')
+
+
+def read_single_border(parent, edge, XPath, get):
+    ''' Read the w:<edge> children of a border container element.
+
+    Returns a dict keyed by the (unformatted) templates in border_props with
+    the padding, width, style and color found; a value is None when the
+    corresponding attribute was absent or unparseable. If several matching
+    elements exist, later ones override earlier ones. '''
+    padding = width = style = color = None
+    for elem in XPath('./w:%s' % edge)(parent):
+        raw_color = get(elem, 'w:color')
+        if raw_color is not None:
+            color = simple_color(raw_color)
+
+        raw_style = get(elem, 'w:val')
+        if raw_style is not None:
+            # Unknown line styles are rendered as a plain solid line
+            style = LINE_STYLES.get(raw_style, 'solid')
+
+        raw_space = get(elem, 'w:space')
+        if raw_space is not None:
+            try:
+                padding = float(raw_space)
+            except (ValueError, TypeError):
+                pass
+
+        raw_size = get(elem, 'w:sz')
+        if raw_size is not None:
+            # We don't care about art borders (they are only used for page
+            # borders). The size is clamped to [2, 96] and divided by 8.
+            try:
+                width = min(96, max(2, float(raw_size))) / 8
+            except (ValueError, TypeError):
+                pass
+    return dict(zip(border_props, (padding, width, style, color)))
+
+
+def read_border(parent, dest, XPath, get, border_edges=border_edges, name='pBdr'):
+    ''' Read the borders defined in the w:<name> children of parent and store
+    them as attributes on dest.
+
+    Every edge/property combination is always set on dest; properties that
+    are not specified in the XML are set to inherit. '''
+    vals = {}
+    for edge in border_edges:
+        for template in border_props:
+            vals[template % edge] = inherit
+
+    for border in XPath('./w:' + name)(parent):
+        for edge in border_edges:
+            found = read_single_border(border, edge, XPath, get)
+            for template, val in iteritems(found):
+                if val is None:
+                    continue
+                vals[template % edge] = val
+
+    for attr, val in iteritems(vals):
+        setattr(dest, attr, val)
+
+
+def border_to_css(edge, style, css):
+    ''' Add the border-<edge>-style/color/width declarations for the given
+    edge of style to the css mapping. Values that are inherit or None are
+    left out. Numeric widths are written in pt. '''
+    line = getattr(style, 'border_%s_style' % edge)
+    color = getattr(style, 'border_%s_color' % edge)
+    width = getattr(style, 'border_%s_width' % edge)
+
+    if isinstance(width, numbers.Number):
+        # WebKit needs at least 1pt to render borders and 3pt to render double borders
+        minimum = 3 if line == 'double' else 1
+        width = '%.3gpt' % max(width, minimum)
+
+    for prop, val in (('style', line), ('color', color), ('width', width)):
+        if val is inherit or val is None:
+            continue
+        css['border-%s-%s' % (edge, prop)] = val
+
+
+def read_indent(parent, dest, XPath, get):
+    ''' Read w:ind children of parent and set margin_left, margin_right and
+    text_indent on dest (inherit when not specified).
+
+    The *Chars attributes take precedence over the plain ones and yield em
+    units (hundredths of a character); the plain attributes yield pt
+    (twentieths of a point). A hanging indent takes precedence over a first
+    line indent and is stored as a negative text indent. '''
+
+    def measure(chars, twentieths):
+        # Returns (value, unit); value is None if nothing usable was given
+        if chars is not None:
+            return simple_float(chars, 0.01), 'em'
+        if twentieths is not None:
+            return simple_float(twentieths, 0.05), 'pt'
+        return None, 'pt'
+
+    margin_left = margin_right = text_indent = inherit
+    for indent in XPath('./w:ind')(parent):
+        left, left_chars = get(indent, 'w:left'), get(indent, 'w:leftChars')
+        amount, unit = measure(left_chars, left)
+        if amount is not None:
+            margin_left = '%.3g%s' % (amount, unit)
+
+        right, right_chars = get(indent, 'w:right'), get(indent, 'w:rightChars')
+        amount, unit = measure(right_chars, right)
+        if amount is not None:
+            margin_right = '%.3g%s' % (amount, unit)
+
+        hanging, hanging_chars = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
+        first, first_chars = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
+        # Hanging indents are expressed as negative first line indents
+        if hanging is not None:
+            hanging = '-' + hanging
+        if hanging_chars is not None:
+            hanging_chars = '-' + hanging_chars
+        if hanging_chars is None and hanging is None:
+            amount, unit = measure(first_chars, first)
+        else:
+            # A present but unparseable hanging value does not fall back to
+            # the first line indent
+            amount, unit = measure(hanging_chars, hanging)
+        if amount is not None:
+            text_indent = '%.3g%s' % (amount, unit)
+
+    setattr(dest, 'margin_left', margin_left)
+    setattr(dest, 'margin_right', margin_right)
+    setattr(dest, 'text_indent', text_indent)
+
+
+def read_justification(parent, dest, XPath, get):
+    ''' Read w:jc children of parent and set text_align on dest.
+
+    'both', 'distribute' and any thai/kashida variant become 'justify';
+    'left', 'center', 'right', 'start' and 'end' are passed through
+    unchanged; any other value is ignored. text_align is set to inherit when
+    no usable value is found. If several w:jc elements are present, the last
+    usable one wins. '''
+    ans = inherit
+    for jc in XPath('./w:jc[@w:val]')(parent):
+        val = get(jc, 'w:val')
+        if not val:
+            continue
+        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
+            ans = 'justify'
+        elif val in {'left', 'center', 'right', 'start', 'end'}:
+            # 'start' and 'end' are kept as is. A former extra branch that
+            # mapped them to left/right could never be reached and has been
+            # removed.
+            ans = val
+    setattr(dest, 'text_align', ans)
+
+
+def read_spacing(parent, dest, XPath, get):
+    ''' Read w:spacing children of parent and set margin_top, margin_bottom
+    and line_height on dest (inherit when not specified).
+
+    Before/after spacing is ignored when the matching autospacing flag is on.
+    The *Lines attributes take precedence and yield ex units; the plain
+    attributes yield pt. Line height is in pt for the 'exact' and 'atLeast'
+    rules and a unitless multiple (value/240) otherwise. '''
+    truthy = {'on', '1', 'true'}
+
+    def gap(twentieths, lines, autospacing):
+        # Returns the CSS length for a before/after gap, or None
+        if autospacing in truthy:
+            return None
+        if lines is not None:
+            amount, unit = simple_float(lines, 0.02), 'ex'
+        elif twentieths is not None:
+            amount, unit = simple_float(twentieths, 0.05), 'pt'
+        else:
+            return None
+        if amount is None:
+            return None
+        return '%.3g%s' % (amount, unit)
+
+    margin_top = margin_bottom = line_height = inherit
+    for s in XPath('./w:spacing')(parent):
+        after, after_lines, after_auto = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
+        bottom = gap(after, after_lines, after_auto)
+        if bottom is not None:
+            margin_bottom = bottom
+
+        before, before_lines, before_auto = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
+        top = gap(before, before_lines, before_auto)
+        if top is not None:
+            margin_top = top
+
+        line, rule = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
+        if line is None:
+            continue
+        if rule in {'exact', 'atLeast'}:
+            height, unit = simple_float(line, 0.05), 'pt'
+        else:
+            height, unit = simple_float(line, 1/240.0), ''
+        if height is not None:
+            line_height = '%.3g%s' % (height, unit)
+
+    setattr(dest, 'margin_top', margin_top)
+    setattr(dest, 'margin_bottom', margin_bottom)
+    setattr(dest, 'line_height', line_height)
+
+
+def read_shd(parent, dest, XPath, get):
+    ''' Read the fill color of w:shd children of parent and set
+    background_color on dest. An automatic or invalid fill becomes
+    'transparent'; without any non-empty fill the value is inherit. '''
+    background = inherit
+    for shd in XPath('./w:shd[@w:fill]')(parent):
+        fill = get(shd, 'w:fill')
+        if not fill:
+            continue
+        background = simple_color(fill, auto='transparent')
+    setattr(dest, 'background_color', background)
+
+
+def read_numbering(parent, dest, XPath, get):
+    ''' Read w:numPr children of parent and set numbering_id (the raw w:val
+    of w:numId) and numbering_level (w:ilvl as an int) on dest. Either is
+    inherit when missing; a non-integer level is ignored. '''
+    level = numbering_id = inherit
+    for num_pr in XPath('./w:numPr')(parent):
+        for ilvl in XPath('./w:ilvl[@w:val]')(num_pr):
+            try:
+                level = int(get(ilvl, 'w:val'))
+            except (ValueError, TypeError):
+                pass
+        for num in XPath('./w:numId[@w:val]')(num_pr):
+            numbering_id = get(num, 'w:val')
+    setattr(dest, 'numbering_id', numbering_id)
+    setattr(dest, 'numbering_level', level)
+
+
+class Frame(object):
+    ''' The properties of a w:framePr element, with lengths converted from
+    twentieths of a point to pt. '''
+
+    all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
+                      'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')
+
+    def __init__(self, fp, XPath, get):
+
+        def length(attr, default):
+            # Integer attribute in twentieths of a point -> pt, or default
+            try:
+                return int(get(fp, attr))/20
+            except (ValueError, TypeError):
+                return default
+
+        self.drop_cap = get(fp, 'w:dropCap', 'none')
+        self.h = length('w:h', 0)
+        # Unlike the other lengths, a missing width is recorded as None
+        self.w = length('w:w', None)
+        self.x = length('w:x', 0)
+        self.y = length('w:y', 0)
+
+        self.h_anchor = get(fp, 'w:hAnchor', 'page')
+        self.h_rule = get(fp, 'w:hRule', 'auto')
+        self.v_anchor = get(fp, 'w:vAnchor', 'page')
+        self.wrap = get(fp, 'w:wrap', 'around')
+        self.x_align = get(fp, 'w:xAlign')
+        self.y_align = get(fp, 'w:yAlign')
+
+        self.h_space = length('w:hSpace', 0)
+        self.v_space = length('w:vSpace', 0)
+        try:
+            self.lines = int(get(fp, 'w:lines'))
+        except (ValueError, TypeError):
+            self.lines = 1
+
+    def css(self, page):
+        ''' Return a dict of CSS declarations for this frame. page must have
+        a width attribute; it is only consulted for wrapped frames that have
+        no explicit horizontal alignment. '''
+        ans = {'overflow': 'hidden'}
+
+        if self.drop_cap in {'drop', 'margin'}:
+            # Drop caps are simply floated to the left
+            ans['float'] = 'left'
+            ans['margin'] = '0'
+            ans['padding-right'] = '0.2em'
+            return ans
+
+        if self.h_rule != 'auto':
+            prop = 'min-height' if self.h_rule == 'atLeast' else 'height'
+            ans[prop] = '%.3gpt' % self.h
+        if self.w is not None:
+            ans['width'] = '%.3gpt' % self.w
+        vertical = '%.3gpt' % self.v_space
+        ans['padding-top'] = vertical
+        ans['padding-bottom'] = vertical
+        if self.wrap not in {None, 'none'}:
+            horizontal = '%.3gpt' % self.h_space
+            ans['padding-left'] = horizontal
+            ans['padding-right'] = horizontal
+            if self.x_align is None:
+                # No alignment given: float to the side of the page that the
+                # x offset falls in
+                side = 'left' if self.x/page.width < 0.5 else 'right'
+            else:
+                side = 'right' if self.x_align == 'right' else 'left'
+            ans['float'] = side
+        return ans
+
+    def __eq__(self, other):
+        # An attribute missing on other is treated as inherit, which never
+        # equals a real value
+        return not any(
+            getattr(other, name, inherit) != getattr(self, name)
+            for name in self.all_attributes)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+
+def read_frame(parent, dest, XPath, get):
+    ''' Set dest.frame to a Frame built from the last w:framePr child of
+    parent, or to inherit if there is none. '''
+    frame = inherit
+    for frame_pr in XPath('./w:framePr')(parent):
+        frame = Frame(frame_pr, XPath, get)
+    setattr(dest, 'frame', frame)
+
+# }}}
+
+
+class ParagraphStyle(object):
+    ''' The block level formatting read from a w:pPr element.
+
+    Every name in all_properties is an attribute of the instance; a property
+    that is not specified holds the inherit sentinel. '''
+
+    all_properties = (
+        'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
+        'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
+        'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
+        'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
+
+        # Border margins padding
+        'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
+        'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
+        'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
+        'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
+        'border_between_width', 'border_between_style', 'border_between_color', 'padding_between',
+        'margin_left', 'margin_top', 'margin_right', 'margin_bottom',
+
+        # Misc.
+        'text_indent', 'text_align', 'line_height', 'background_color',
+        'numbering_id', 'numbering_level', 'font_family', 'font_size', 'color', 'frame',
+        'cs_font_size', 'cs_font_family',
+    )
+
+    def __init__(self, namespace, pPr=None):
+        ''' namespace must provide XPath and get; with pPr=None an empty
+        style (everything inherit) is created. '''
+        self.namespace = namespace
+        self.linked_style = None
+        # Lazily computed caches, see the css and border_key properties
+        self._css = None
+        self._border_key = None
+
+        if pPr is None:
+            for name in self.all_properties:
+                setattr(self, name, inherit)
+            return
+
+        XPath, get = namespace.XPath, namespace.get
+        on_off_properties = (
+            'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
+            'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
+            'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
+            'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
+        )
+        for name in on_off_properties:
+            setattr(self, name, binary_property(pPr, name, XPath, get))
+
+        # The readers are looked up by name in the module level read_funcs map
+        for reader in ('border', 'indent', 'justification', 'spacing', 'shd', 'numbering', 'frame'):
+            read_funcs[reader](pPr, self, XPath, get)
+
+        for s in XPath('./w:pStyle[@w:val]')(pPr):
+            self.linked_style = get(s, 'w:val')
+
+        # Font properties are not read from pPr
+        for name in ('font_family', 'font_size', 'color', 'cs_font_size', 'cs_font_family'):
+            setattr(self, name, inherit)
+
+    def update(self, other):
+        ''' Overwrite the properties of this style with every property that
+        is actually set (not inherit) on other. '''
+        for name in self.all_properties:
+            incoming = getattr(other, name)
+            if incoming is inherit:
+                continue
+            setattr(self, name, incoming)
+        if other.linked_style is not None:
+            self.linked_style = other.linked_style
+
+    def resolve_based_on(self, parent):
+        ''' Fill every property that is still inherit from parent. '''
+        for name in self.all_properties:
+            if getattr(self, name) is inherit:
+                setattr(self, name, getattr(parent, name))
+
+    @property
+    def css(self):
+        ''' The CSS declarations for this style as an OrderedDict. Computed
+        on first access and cached afterwards. '''
+        if self._css is not None:
+            return self._css
+
+        self._css = c = OrderedDict()
+        if self.keepLines is True:
+            c['page-break-inside'] = 'avoid'
+        if self.pageBreakBefore is True:
+            c['page-break-before'] = 'always'
+        if self.keepNext is True:
+            c['page-break-after'] = 'avoid'
+
+        for edge in ('left', 'top', 'right', 'bottom'):
+            border_to_css(edge, self, c)
+            padding = getattr(self, 'padding_%s' % edge)
+            if padding is not inherit:
+                c['padding-%s' % edge] = '%.3gpt' % padding
+            margin = getattr(self, 'margin_%s' % edge)
+            if margin is not inherit:
+                c['margin-%s' % edge] = margin
+
+        if self.line_height not in {inherit, '1'}:
+            c['line-height'] = self.line_height
+
+        for name in ('text_indent', 'background_color', 'font_family', 'font_size', 'color'):
+            val = getattr(self, name)
+            if val is inherit:
+                continue
+            if name == 'font_size':
+                val = '%.3gpt' % val
+            c[name.replace('_', '-')] = val
+
+        align = self.text_align
+        if align is not inherit:
+            if self.bidi is True:
+                # left and right are swapped for bidi paragraphs
+                align = {'left':'right', 'right':'left'}.get(align, align)
+            c['text-align'] = align
+
+        return self._css
+
+    @property
+    def border_key(self):
+        ''' A tuple of all border related values (every edge in border_edges,
+        every property in border_props). Computed once and cached. '''
+        if self._border_key is None:
+            self._border_key = tuple(
+                getattr(self, template % edge)
+                for edge in border_edges for template in border_props)
+        return self._border_key
+
+    def has_identical_borders(self, other_style):
+        ''' True if other_style has the same border_key as this style. '''
+        return self.border_key == getattr(other_style, 'border_key', None)
+
+    def clear_borders(self):
+        ''' Reset width, color and style of the four box edges to inherit. '''
+        for edge in border_edges[:-1]:
+            for part in ('width', 'color', 'style'):
+                setattr(self, 'border_%s_%s' % (edge, part), inherit)
+
+    def clone_border_styles(self):
+        ''' Return a new, otherwise empty ParagraphStyle carrying the width,
+        color and style of the four box edges of this style. '''
+        clone = ParagraphStyle(self.namespace)
+        for edge in border_edges[:-1]:
+            for part in ('width', 'color', 'style'):
+                name = 'border_%s_%s' % (edge, part)
+                setattr(clone, name, getattr(self, name))
+        return clone
+
+    def apply_between_border(self):
+        ''' Copy the 'between' border onto the bottom border. '''
+        for part in ('width', 'color', 'style'):
+            between = getattr(self, 'border_between_%s' % part)
+            setattr(self, 'border_bottom_%s' % part, between)
+
+    def has_visible_border(self):
+        ''' True if at least one of the four box edges has a non-zero width
+        and a style other than 'none'. '''
+        for edge in border_edges[:-1]:
+            width = getattr(self, 'border_%s_width' % edge)
+            line = getattr(self, 'border_%s_style' % edge)
+            if width is inherit or not width:
+                continue
+            if line is inherit or line == 'none':
+                continue
+            return True
+        return False
+
+
+# Maps 'xxx' to the module level object named read_xxx, for every global whose
+# name starts with 'read_'. ParagraphStyle.__init__ looks its readers up here,
+# so this has to stay below all the read_* definitions.
+read_funcs = {k[5:]:v for k, v in iteritems(globals()) if k.startswith('read_')}
--- a/ebook_converter/ebooks/docx/char_styles.py
+++ b/ebook_converter/ebooks/docx/char_styles.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+from calibre.ebooks.docx.block_styles import (  # noqa
+    inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
+
+# Read from XML {{{
+
+
+def read_text_border(parent, dest, XPath, get):
+    ''' Read w:bdr children of parent and set border_color, border_style,
+    border_width and padding on dest.
+
+    All four are inherit unless specified, except that when the first w:bdr
+    element has any attributes the color, style and width start from
+    defaults (automatic color, 'none', 1) before the attributes are applied. '''
+    color = style = width = padding = inherit
+    elems = XPath('./w:bdr')(parent)
+    if elems and elems[0].attrib:
+        color, style, width = simple_color('auto'), 'none', 1
+
+    for elem in elems:
+        raw_color = get(elem, 'w:color')
+        if raw_color is not None:
+            color = simple_color(raw_color)
+
+        raw_style = get(elem, 'w:val')
+        if raw_style is not None:
+            style = LINE_STYLES.get(raw_style, 'solid')
+
+        raw_space = get(elem, 'w:space')
+        if raw_space is not None:
+            try:
+                padding = float(raw_space)
+            except (ValueError, TypeError):
+                pass
+
+        raw_size = get(elem, 'w:sz')
+        if raw_size is not None:
+            # We don't care about art borders (they are only used for page borders)
+            try:
+                # A border of less than 1pt is not rendered by WebKit
+                width = min(96, max(8, float(raw_size))) / 8
+            except (ValueError, TypeError):
+                pass
+
+    for name, val in (
+            ('border_color', color), ('border_style', style),
+            ('border_width', width), ('padding', padding)):
+        setattr(dest, name, val)
+
+
+def read_color(parent, dest, XPath, get):
+    ''' Set dest.color from the w:color children of parent; inherit if there
+    is no non-empty value. The last non-empty value wins. '''
+    color = inherit
+    for col in XPath('./w:color[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if val:
+            color = simple_color(val)
+    setattr(dest, 'color', color)
+
+
+def convert_highlight_color(val):
+    ''' Translate the highlight color names listed below to hex colors. Any
+    other value is returned unchanged. '''
+    hex_colors = {
+        'darkBlue': '#000080',
+        'darkCyan': '#008080',
+        'darkGray': '#808080',
+        'darkGreen': '#008000',
+        'darkMagenta': '#800080',
+        'darkRed': '#800000',
+        'darkYellow': '#808000',
+        'lightGray': '#c0c0c0',
+    }
+    return hex_colors.get(val, val)
+
+
+def read_highlight(parent, dest, XPath, get):
+    ''' Set dest.highlight from the w:highlight children of parent. 'none'
+    becomes 'transparent', other names go through convert_highlight_color();
+    inherit if there is no non-empty value. '''
+    highlight = inherit
+    for col in XPath('./w:highlight[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if not val:
+            continue
+        highlight = 'transparent' if val == 'none' else convert_highlight_color(val)
+    setattr(dest, 'highlight', highlight)
+
+
+def read_lang(parent, dest, XPath, get):
+    ''' Set dest.lang from the w:lang children of parent; inherit if no
+    usable value is found.
+
+    A value that parses as a hexadecimal number is looked up in the lcid
+    table. Any other non-empty value is used verbatim as the language tag. '''
+    ans = inherit
+    for col in XPath('./w:lang[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if not val:
+            continue
+        try:
+            code = int(val, 16)
+        except (ValueError, TypeError):
+            ans = val
+            continue
+        from calibre.ebooks.docx.lcid import lcid
+        name = lcid.get(code, None)
+        if name:
+            ans = name
+        elif len(val) in (2, 3) and val.isalpha():
+            # Short language tags such as 'de', 'fa' or 'be' consist only of
+            # hex digits and so end up here. If they are not a known LCID,
+            # use them as the tag they are instead of dropping the language.
+            ans = val
+    setattr(dest, 'lang', ans)
+
+
+def read_letter_spacing(parent, dest, XPath, get):
+    ''' Set dest.letter_spacing (twentieths of a point converted to pt) from
+    the w:spacing children of parent; inherit if no value can be parsed. '''
+    spacing = inherit
+    for col in XPath('./w:spacing[@w:val]')(parent):
+        parsed = simple_float(get(col, 'w:val'), 0.05)
+        if parsed is None:
+            continue
+        spacing = parsed
+    setattr(dest, 'letter_spacing', spacing)
+
+
+def read_underline(parent, dest, XPath, get):
+    ''' Set dest.text_decoration from the w:u children of parent: 'none' is
+    kept, every other non-empty underline type becomes 'underline'; inherit
+    if nothing is specified. '''
+    decoration = inherit
+    for col in XPath('./w:u[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if not val:
+            continue
+        decoration = 'none' if val == 'none' else 'underline'
+    setattr(dest, 'text_decoration', decoration)
+
+
+def read_vert_align(parent, dest, XPath, get):
+    ''' Set dest.vert_align from the w:vertAlign children of parent. Only
+    'baseline', 'subscript' and 'superscript' are accepted; otherwise the
+    value is inherit. '''
+    known = {'baseline', 'subscript', 'superscript'}
+    align = inherit
+    for col in XPath('./w:vertAlign[@w:val]')(parent):
+        val = get(col, 'w:val')
+        if val in known:
+            align = val
+    setattr(dest, 'vert_align', align)
+
+
+def read_position(parent, dest, XPath, get):
+    ''' Set dest.position to half the numeric w:val of the w:position
+    children of parent; inherit if no value can be parsed. The last parseable
+    value wins. '''
+    ans = inherit
+    for col in XPath('./w:position[@w:val]')(parent):
+        val = get(col, 'w:val')
+        try:
+            ans = float(val)/2.0
+        except (ValueError, TypeError):
+            # Only conversion failures are ignored, as in the other readers;
+            # anything else is a real error and must not be hidden.
+            pass
+    setattr(dest, 'position', ans)
+
+
+def read_font(parent, dest, XPath, get):
+    ''' Set dest.font_family and dest.font_size from parent.
+
+    The family comes from w:rFonts: a theme font (w:asciiTheme) is stored as
+    '|name|' and takes precedence over w:ascii. The size is the first
+    parseable w:sz value multiplied by 0.5. Both are inherit if missing. '''
+    family = inherit
+    for col in XPath('./w:rFonts')(parent):
+        theme = get(col, 'w:asciiTheme')
+        val = ('|%s|' % theme) if theme else get(col, 'w:ascii')
+        if val:
+            family = val
+    setattr(dest, 'font_family', family)
+
+    size = inherit
+    for col in XPath('./w:sz[@w:val]')(parent):
+        parsed = simple_float(get(col, 'w:val'), 0.5)
+        if parsed is not None:
+            size = parsed
+            break
+    setattr(dest, 'font_size', size)
+
+
+def read_font_cs(parent, dest, XPath, get):
+    ''' Set dest.cs_font_family and dest.cs_font_size from parent.
+
+    The family comes from w:rFonts: a theme font (w:csTheme) is stored as
+    '|name|' and takes precedence over w:cs. The size is the first parseable
+    size value multiplied by 0.5. Both are inherit if missing. '''
+    ff = inherit
+    for col in XPath('./w:rFonts')(parent):
+        val = get(col, 'w:csTheme')
+        if val:
+            val = '|%s|' % val
+        else:
+            val = get(col, 'w:cs')
+        if val:
+            ff = val
+    setattr(dest, 'cs_font_family', ff)
+
+    size = inherit
+    # NOTE(review): the element name is matched case sensitively; check
+    # whether documents actually spell it w:szCs (compare bCs/iCs) - TODO confirm
+    for col in XPath('./w:szCS[@w:val]')(parent):
+        val = simple_float(get(col, 'w:val'), 0.5)
+        if val is not None:
+            size = val
+            break
+    # This used to store the value as font_size, which both overwrote the
+    # size set by read_font() and left cs_font_size undefined on dest.
+    setattr(dest, 'cs_font_size', size)
+
+# }}}
+
+
+class RunStyle(object):
+    ''' The character level formatting read from a w:rPr element.
+
+    Every name in all_properties is an attribute of the instance; a property
+    that is not specified holds the inherit sentinel. '''
+
+    all_properties = {
+        'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
+        'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', 'webHidden',
+
+        'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
+        'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', 'font_family', 'position',
+        'cs_font_size', 'cs_font_family'
+    }
+
+    toggle_properties = {
+        'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish',
+    }
+
+    def __init__(self, namespace, rPr=None):
+        ''' namespace must provide XPath and get; with rPr=None an empty
+        style (everything inherit) is created. '''
+        self.namespace = namespace
+        self.linked_style = None
+        if rPr is None:
+            for p in self.all_properties:
+                setattr(self, p, inherit)
+        else:
+            X, g = namespace.XPath, namespace.get
+            for p in (
+                'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
+                'smallCaps', 'strike', 'vanish', 'webHidden',
+            ):
+                setattr(self, p, binary_property(rPr, p, X, g))
+
+            read_font(rPr, self, X, g)
+            read_font_cs(rPr, self, X, g)
+            read_text_border(rPr, self, X, g)
+            read_color(rPr, self, X, g)
+            read_highlight(rPr, self, X, g)
+            read_shd(rPr, self, X, g)
+            read_letter_spacing(rPr, self, X, g)
+            read_underline(rPr, self, X, g)
+            read_vert_align(rPr, self, X, g)
+            read_position(rPr, self, X, g)
+            read_lang(rPr, self, X, g)
+
+            for s in X('./w:rStyle[@w:val]')(rPr):
+                self.linked_style = g(s, 'w:val')
+
+        # Lazily computed cache, see the css property
+        self._css = None
+
+    def update(self, other):
+        ''' Overwrite the properties of this style with every property that
+        is actually set (not inherit) on other. '''
+        for prop in self.all_properties:
+            nval = getattr(other, prop)
+            if nval is not inherit:
+                setattr(self, prop, nval)
+        if other.linked_style is not None:
+            self.linked_style = other.linked_style
+
+    def resolve_based_on(self, parent):
+        ''' Fill every property that is still inherit from parent. '''
+        for p in self.all_properties:
+            val = getattr(self, p)
+            if val is inherit:
+                setattr(self, p, getattr(parent, p))
+
+    def get_border_css(self, ans):
+        ''' Add the border-color/style/width declarations of this style to
+        the mapping ans, skipping values that are inherit.
+
+        Returns ans, so that the result can be used directly; previously
+        nothing was returned, which made same_border() compare None with
+        None. '''
+        for x in ('color', 'style', 'width'):
+            val = getattr(self, 'border_'+x)
+            if val is inherit:
+                continue
+            if x == 'width':
+                val = '%.3gpt' % val
+            ans['border-%s' % x] = val
+        return ans
+
+    def clear_border_css(self):
+        ''' Reset border color, style and width to inherit. '''
+        for x in ('color', 'style', 'width'):
+            setattr(self, 'border_'+x, inherit)
+
+    @property
+    def css(self):
+        ''' The CSS declarations for this style as an OrderedDict. Computed
+        on first access and cached afterwards. '''
+        if self._css is None:
+            c = self._css = OrderedDict()
+            td = set()
+            if self.text_decoration is not inherit:
+                td.add(self.text_decoration)
+            if self.strike and self.strike is not inherit:
+                td.add('line-through')
+            if self.dstrike and self.dstrike is not inherit:
+                td.add('line-through')
+            if td:
+                # Sorted, so that the output does not depend on set ordering
+                c['text-decoration'] = ' '.join(sorted(td))
+            if self.caps is True:
+                c['text-transform'] = 'uppercase'
+            if self.i is True:
+                c['font-style'] = 'italic'
+            if self.shadow and self.shadow is not inherit:
+                c['text-shadow'] = '2px 2px'
+            if self.smallCaps is True:
+                c['font-variant'] = 'small-caps'
+            if self.vanish is True or self.webHidden is True:
+                c['display'] = 'none'
+
+            self.get_border_css(c)
+            if self.padding is not inherit:
+                c['padding'] = '%.3gpt' % self.padding
+
+            for x in ('color', 'background_color'):
+                val = getattr(self, x)
+                if val is not inherit:
+                    c[x.replace('_', '-')] = val
+
+            for x in ('letter_spacing', 'font_size'):
+                val = getattr(self, x)
+                if val is not inherit:
+                    c[x.replace('_', '-')] = '%.3gpt' % val
+
+            if self.position is not inherit:
+                c['vertical-align'] = '%.3gpt' % self.position
+
+            # A highlight overrides the shading based background color
+            if self.highlight is not inherit and self.highlight != 'transparent':
+                c['background-color'] = self.highlight
+
+            # The inherit sentinel is truthy, so it has to be excluded
+            # explicitly or an unspecified weight would be rendered bold
+            if self.b and self.b is not inherit:
+                c['font-weight'] = 'bold'
+
+            if self.font_family is not inherit:
+                c['font-family'] = self.font_family
+
+        return self._css
+
+    def same_border(self, other):
+        ''' True if this style and other produce the same border CSS. '''
+        return self.get_border_css({}) == other.get_border_css({})
--- a/ebook_converter/ebooks/docx/cleanup.py
+++ b/ebook_converter/ebooks/docx/cleanup.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+from polyglot.builtins import itervalues, range
+
+# U+00A0 (no-break space); cleanup_markup() uses it as the text of page break
+# paragraphs that have no text of their own
+NBSP = '\xa0'
+
+
+def mergeable(previous, current):
+    # current can be merged into previous when neither has tail text, current
+    # has no non-empty id, the class, style, lang and dir attributes of both
+    # are equal and current is the sibling that directly follows previous.
+    if previous.tail or current.tail:
+        return False
+    if current.get('id', False):
+        return False
+    for attr in ('class', 'style', 'lang', 'dir'):
+        if previous.get(attr) != current.get(attr):
+            return False
+    # None stands in for "previous has no following sibling"
+    following = next(previous.itersiblings(), None)
+    return following is current
+
+
+def append_text(parent, text):
+    # Add text after everything that parent currently contains
+    if len(parent) == 0:
+        # No children: the text becomes part of the text of parent itself
+        parent.text = (parent.text or '') + text
+        return
+    # Otherwise the text follows the last child, i.e. it goes into its tail
+    last = parent[-1]
+    last.tail = (last.tail or '') + text
+
+
+def merge(parent, span):
+    # Move the text, the children and the tail of span (in that order) to the
+    # end of parent and then remove span from its own parent
+    text = span.text
+    if text:
+        append_text(parent, text)
+    for child in span:
+        parent.append(child)
+    tail = span.tail
+    if tail:
+        append_text(parent, tail)
+    container = span.getparent()
+    container.remove(span)
+
+
+def merge_run(run):
+    # Fold every element of run after the first one into the first one
+    target, rest = run[0], run[1:]
+    for extra in rest:
+        merge(target, extra)
+
+
+def liftable(css):
+    # A <span> is liftable if all its styling would work just as well if it is
+    # specified on the parent element. Only the part of each property name
+    # before the first hyphen is looked at.
+    allowed = {'text', 'font', 'letter', 'color', 'background'}
+    return all(prop.partition('-')[0] in allowed for prop in css)
+
+
+def add_text(elem, attr, text):
+    # Append text to the attribute named attr of elem; a falsy current value
+    # (such as None) counts as the empty string
+    current = getattr(elem, attr)
+    if not current:
+        current = ''
+    setattr(elem, attr, current + text)
+
+
+def lift(span):
+    # Replace an element by its content (text, children and tail)
+    parent = span.getparent()
+    idx = parent.index(span)
+    last_child = span[-1] if len(span) else None
+
+    def add_before(text):
+        # Text located at position idx belongs either to the text of the
+        # parent or to the tail of the sibling just before that position
+        if idx == 0:
+            add_text(parent, 'text', text)
+        else:
+            add_text(parent[idx - 1], 'tail', text)
+
+    if span.text:
+        add_before(span.text)
+
+    # Inserting the children back to front at a fixed index keeps their order
+    for child in reversed(span):
+        parent.insert(idx, child)
+    parent.remove(span)
+
+    tail = span.tail
+    if not tail:
+        return
+    if last_child is None:
+        # There were no children, so the tail directly follows the span text
+        add_before(tail)
+    else:
+        add_text(last_child, 'tail', tail)
+
+
+def before_count(root, tag, limit=10):
+    # Count the descendants of the first <body> of root that are iterated
+    # before tag. The result is capped: limit is returned when there is no
+    # body, when more than limit elements come before tag and when tag is not
+    # found at all.
+    body = root.xpath('//body[1]')
+    if not body:
+        return limit
+    for ans, elem in enumerate(body[0].iterdescendants()):
+        if elem is tag:
+            return ans
+        if ans >= limit:
+            return limit
+    # tag is not among the descendants. Return limit instead of falling off
+    # the end with None, which cannot be compared with an int on Python 3.
+    return limit
+
+
+def wrap_contents(tag_name, elem):
+    # Put everything inside elem (its text and all of its children) into a
+    # new tag_name element, which becomes the only child of elem
+    wrapper = elem.makeelement(tag_name)
+    wrapper.text = elem.text
+    elem.text = ''
+    for child in elem:
+        elem.remove(child)
+        wrapper.append(child)
+    elem.append(wrapper)
+
+
+def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
+    # Tidy up the HTML tree below root in place. styles.classes provides the
+    # CSS of every class name. When detect_cover is true and the first image
+    # of the document looks like a cover, its <img> is removed and the path of
+    # the image file inside dest_dir is returned; otherwise None is returned.
+
+    # Apply vertical-align
+    for span in root.xpath('//span[@data-docx-vert]'):
+        wrap_contents(span.attrib.pop('data-docx-vert'), span)
+
+    # Move <hr>s outside paragraphs, if possible.
+    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
+    for hr in root.xpath('//span/hr'):
+        p = pancestor(hr)
+        if p:
+            p = p[0]
+            descendants = tuple(p.iterdescendants())
+            # Only an <hr> that is the very last thing in the block is moved
+            if descendants[-1] is hr:
+                parent = p.getparent()
+                idx = parent.index(p)
+                parent.insert(idx+1, hr)
+                hr.tail = '\n\t'
+
+    # Merge consecutive spans that have the same styling
+    current_run = []
+    for span in root.xpath('//span'):
+        if current_run and not mergeable(current_run[-1], span):
+            if len(current_run) > 1:
+                merge_run(current_run)
+            current_run = []
+        current_run.append(span)
+    # The run that is still open after the last span has to be merged as
+    # well, otherwise mergeable spans at the end of the document are left
+    # untouched
+    if len(current_run) > 1:
+        merge_run(current_run)
+
+    # Process dir attributes
+    class_map = dict(itervalues(styles.classes))
+    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
+    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
+        # Ensure that children of rtl parents that are not rtl have an
+        # explicit dir set. Also, remove dir from children if it is the same as
+        # that of the parent.
+        if len(parent):
+            parent_dir = parent.get('dir')
+            for child in parent.iterchildren('span'):
+                child_dir = child.get('dir')
+                if parent_dir == 'rtl' and child_dir != 'rtl':
+                    child_dir = 'ltr'
+                    child.set('dir', child_dir)
+                if child_dir and child_dir == parent_dir:
+                    child.attrib.pop('dir')
+
+    # Remove unnecessary span tags that are the only child of a parent block
+    # element
+    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
+        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
+            # We have a block whose contents are entirely enclosed in a <span>
+            span = parent[0]
+            span_class = span.get('class', None)
+            span_css = class_map.get(span_class, {})
+            span_dir = span.get('dir')
+            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
+                pclass = parent.get('class', None)
+                if span_class:
+                    # The block takes over the class of the span
+                    pclass = (pclass + ' ' + span_class) if pclass else span_class
+                    parent.set('class', pclass)
+                parent.text = span.text
+                parent.remove(span)
+                if span.get('lang'):
+                    parent.set('lang', span.get('lang'))
+                if span.get('dir'):
+                    parent.set('dir', span.get('dir'))
+                for child in span:
+                    parent.append(child)
+
+    # Make spans whose only styling is bold or italic into <b> and <i> tags
+    for span in root.xpath('//span[@class and not(@style)]'):
+        css = class_map.get(span.get('class', None), {})
+        if len(css) == 1:
+            if css == {'font-style':'italic'}:
+                span.tag = 'i'
+                del span.attrib['class']
+            elif css == {'font-weight':'bold'}:
+                span.tag = 'b'
+                del span.attrib['class']
+
+    # Get rid of <span>s that have no styling
+    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
+        lift(span)
+
+    # Convert <p><br style="page-break-after:always"> </p> style page breaks
+    # into something the viewer will render as a page break
+    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
+        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
+            p.remove(p[0])
+            prefix = p.get('style', '')
+            if prefix:
+                prefix += '; '
+            p.set('style', prefix + 'page-break-after:always')
+            p.text = NBSP if not p.text else p.text
+
+    if detect_cover:
+        # Check if the first image in the document is possibly a cover
+        img = root.xpath('//img[@src][1]')
+        if img:
+            img = img[0]
+            path = os.path.join(dest_dir, img.get('src'))
+            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
+                from calibre.utils.imghdr import identify
+                try:
+                    # NOTE(review): lopen is neither defined nor imported in
+                    # this module, presumably it is available as a builtin -
+                    # confirm
+                    with lopen(path, 'rb') as imf:
+                        fmt, width, height = identify(imf)
+                except Exception:
+                    # An image that cannot be identified is never a cover
+                    width, height, fmt = 0, 0, None  # noqa
+                del fmt
+                try:
+                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
+                except ZeroDivisionError:
+                    is_cover = False
+                if is_cover:
+                    log.debug('Detected an image that looks like a cover')
+                    img.getparent().remove(img)
+                    return path
--- a/ebook_converter/ebooks/docx/container.py
+++ b/ebook_converter/ebooks/docx/container.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, sys, shutil
+
+from lxml import etree
+
+from calibre import walk, guess_type
+from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.docx import InvalidDOCX
+from calibre.ebooks.docx.names import DOCXNamespace
+from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.localization import canonicalize_lang
+from calibre.utils.logging import default_log
+from calibre.utils.zipfile import ZipFile
+from calibre.utils.xml_parse import safe_xml_fromstring
+
+
+def fromstring(raw, parser=None):
+    # Parse raw with safe_xml_fromstring(). The parser argument is accepted
+    # but not used.
+    root = safe_xml_fromstring(raw)
+    return root
+
+# Read metadata {{{
+
+
+def read_doc_props(raw, mi, XPath):
+    # Read title, tags, authors, comments and languages from the XML in raw
+    # and store those that are present on mi
+    root = fromstring(raw)
+
+    def stripped(elem):
+        # Text of elem without surrounding whitespace, '' if it has no text
+        return (elem.text or '').strip()
+
+    titles = XPath('//dc:title')(root)
+    if titles:
+        title = stripped(titles[0])
+        if title:
+            mi.title = title
+
+    # Tags come from the subjects and from the keywords
+    tags = []
+    for subject in XPath('//dc:subject')(root):
+        text = stripped(subject)
+        if text:
+            tags.append(text.replace(',', '_'))
+    for keywords in XPath('//cp:keywords')(root):
+        if not stripped(keywords):
+            continue
+        # Keywords are split on whitespace first and on commas second
+        for chunk in keywords.text.split():
+            for word in chunk.split(','):
+                word = word.strip()
+                if word:
+                    tags.append(word)
+    if tags:
+        mi.tags = tags
+
+    aut = []
+    for author in XPath('//dc:creator')(root):
+        if stripped(author):
+            aut.extend(string_to_authors(author.text))
+    if aut:
+        mi.authors = aut
+        mi.author_sort = authors_to_sort_string(aut)
+
+    desc = XPath('//dc:description')(root)
+    if desc:
+        comments = etree.tostring(desc[0], method='text', encoding='unicode')
+        # Word 2007 mangles newlines in the summary
+        comments = comments.replace('_x000d_', '')
+        mi.comments = comments.strip()
+
+    langs = []
+    for lang in XPath('//dc:language')(root):
+        if not stripped(lang):
+            continue
+        code = canonicalize_lang(lang.text)
+        if code:
+            langs.append(code)
+    if langs:
+        mi.languages = langs
+
+
+def read_app_props(raw, mi):
+    # Use the text of the first Company element in raw, if it is not blank,
+    # as the publisher
+    root = fromstring(raw)
+    for company in root.xpath('//*[local-name()="Company"]')[:1]:
+        publisher = (company.text or '').strip()
+        if publisher:
+            mi.publisher = publisher
+
+
+def read_default_style_language(raw, mi, XPath):
+    # Set mi.languages to the first language of the default run properties
+    # in raw that canonicalize_lang() recognizes
+    root = fromstring(raw)
+    find_langs = XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')
+    # Evaluated lazily, so nothing after the first usable value is looked at
+    canonical = (canonicalize_lang(val) for val in find_langs(root))
+    first = next((lang for lang in canonical if lang), None)
+    if first:
+        mi.languages = [first]
+# }}}
+
+
+class DOCX(object):
+    # Access to the parts of a DOCX file. With extract=True the archive is
+    # unpacked into a temporary directory and parts are read from there,
+    # otherwise parts are read directly from the ZIP file. close() releases
+    # the ZIP file or removes the temporary directory.
+
+    def __init__(self, path_or_stream, log=None, extract=True):
+        # path_or_stream is either an object with a read() method or a path
+        # that is opened here. Raises InvalidDOCX if [Content_Types].xml or
+        # _rels/.rels is missing.
+        self.docx_is_transitional = True
+        # A file opened here is owned by this object and has to be closed by
+        # it; a stream supplied by the caller is never closed
+        self._owned_stream = None
+        if hasattr(path_or_stream, 'read'):
+            stream = path_or_stream
+        else:
+            stream = self._owned_stream = open(path_or_stream, 'rb')
+        self.name = getattr(stream, 'name', None) or '<stream>'
+        self.log = log or default_log
+        try:
+            if extract:
+                self.extract(stream)
+                # All parts now live in self.tdir, the archive itself is no
+                # longer needed
+                self._close_owned_stream()
+            else:
+                # The ZipFile keeps reading from the stream until close()
+                self.init_zipfile(stream)
+            self.read_content_types()
+            self.read_package_relationships()
+        except Exception:
+            # Do not leak the file handle when the file cannot be used
+            self._close_owned_stream()
+            raise
+        self.namespace = DOCXNamespace(self.docx_is_transitional)
+
+    def _close_owned_stream(self):
+        # Close the file opened by __init__, if any. Safe to call repeatedly.
+        stream = getattr(self, '_owned_stream', None)
+        self._owned_stream = None
+        if stream is not None:
+            stream.close()
+
+    def init_zipfile(self, stream):
+        # Read parts directly from the archive instead of extracting it
+        self.zipf = ZipFile(stream)
+        self.names = frozenset(self.zipf.namelist())
+
+    def extract(self, stream):
+        # Unpack the archive into a temporary directory and map the name of
+        # every part (with / as separator) to the path of the extracted file
+        self.tdir = PersistentTemporaryDirectory('docx_container')
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(self.tdir)
+        except Exception:
+            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream, self.tdir)
+
+        self.names = {}
+        for f in walk(self.tdir):
+            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
+            self.names[name] = f
+
+    def exists(self, name):
+        return name in self.names
+
+    def read(self, name):
+        # Return the raw bytes of the part called name
+        if hasattr(self, 'zipf'):
+            return self.zipf.open(name).read()
+        path = self.names[name]
+        with open(path, 'rb') as f:
+            return f.read()
+
+    def read_content_types(self):
+        # Fill self.content_types (part name -> type) and
+        # self.default_content_types (lower case extension -> type)
+        try:
+            raw = self.read('[Content_Types].xml')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
+        root = fromstring(raw)
+        self.content_types = {}
+        self.default_content_types = {}
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
+            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
+            name = item.get('PartName').lstrip('/')
+            self.content_types[name] = item.get('ContentType')
+
+    def content_type(self, name):
+        # An explicit override wins over the default for the extension, which
+        # in turn wins over guessing from the file name
+        if name in self.content_types:
+            return self.content_types[name]
+        ext = name.rpartition('.')[-1].lower()
+        if ext in self.default_content_types:
+            return self.default_content_types[ext]
+        return guess_type(name)[0]
+
+    def read_package_relationships(self):
+        # Fill self.relationships (type -> target) and the reverse mapping
+        # self.relationships_rmap from _rels/.rels
+        try:
+            raw = self.read('_rels/.rels')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
+        root = fromstring(raw)
+        self.relationships = {}
+        self.relationships_rmap = {}
+        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+            target = item.get('Target').lstrip('/')
+            typ = item.get('Type')
+            if target == 'word/document.xml':
+                # The type used for the main document decides which set of
+                # namespaces is used
+                self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
+            self.relationships[typ] = target
+            self.relationships_rmap[target] = typ
+
+    @property
+    def document_name(self):
+        # Name of the main document part; falls back to any part called
+        # document.xml when the package relationships do not name one
+        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
+        if name is None:
+            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
+            if not names:
+                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
+            name = names[0]
+        return name
+
+    @property
+    def document(self):
+        return fromstring(self.read(self.document_name))
+
+    @property
+    def document_relationships(self):
+        return self.get_relationships(self.document_name)
+
+    def get_relationships(self, name):
+        # Return (by_id, by_type) mappings to the targets listed in the .rels
+        # file that belongs to the part name. Both are empty if there is no
+        # such file.
+        base = '/'.join(name.split('/')[:-1])
+        by_id, by_type = {}, {}
+        parts = name.split('/')
+        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
+        try:
+            raw = self.read(name)
+        except KeyError:
+            pass
+        else:
+            root = fromstring(raw)
+            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+                target = item.get('Target')
+                # External targets and fragments are kept as they are, all
+                # other targets are made relative to the directory of the part
+                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
+                    target = '/'.join((base, target.lstrip('/')))
+                typ = item.get('Type')
+                Id = item.get('Id')
+                by_id[Id] = by_type[typ] = target
+
+        return by_id, by_type
+
+    def get_document_properties_names(self):
+        # Yield the name of the core properties part and then the name of the
+        # app properties part (None for a part that cannot be found). Parts
+        # missing from the relationships are looked up by their usual names.
+        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
+            if names:
+                name = names[0]
+        yield name
+        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
+            if names:
+                name = names[0]
+        yield name
+
+    @property
+    def metadata(self):
+        # Metadata object built from the core properties, the default
+        # language in word/styles.xml and the app properties
+        mi = Metadata(_('Unknown'))
+        dp_name, ap_name = self.get_document_properties_names()
+        if dp_name:
+            try:
+                raw = self.read(dp_name)
+            except KeyError:
+                pass
+            else:
+                read_doc_props(raw, mi, self.namespace.XPath)
+        if mi.is_null('language'):
+            try:
+                raw = self.read('word/styles.xml')
+            except KeyError:
+                pass
+            else:
+                read_default_style_language(raw, mi, self.namespace.XPath)
+
+        # ap_name comes from get_document_properties_names() and therefore
+        # includes the docProps/app.xml fallback; looking it up in the
+        # relationships again would throw that fallback away
+        if ap_name:
+            try:
+                raw = self.read(ap_name)
+            except KeyError:
+                pass
+            else:
+                read_app_props(raw, mi)
+
+        return mi
+
+    def close(self):
+        # Release the ZIP file (and a file opened by __init__) or remove the
+        # directory the archive was extracted to
+        if hasattr(self, 'zipf'):
+            self.zipf.close()
+            self._close_owned_stream()
+        else:
+            try:
+                shutil.rmtree(self.tdir)
+            except EnvironmentError:
+                pass
+
+
+if __name__ == '__main__':
+    # Print the metadata of the DOCX file named by the last argument
+    d = DOCX(sys.argv[-1], extract=False)
+    try:
+        print(d.metadata)
+    finally:
+        # Release the ZIP file even if reading the metadata fails
+        d.close()
--- a/ebook_converter/ebooks/docx/fields.py
+++ b/ebook_converter/ebooks/docx/fields.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+
+from calibre.ebooks.docx.index import process_index, polish_index_markup
+from polyglot.builtins import iteritems, native_string_type
+
+
+class Field(object):
+    # A single field: the element that starts it, the element that ends it,
+    # the elements collected in between and the instruction text, of which
+    # the first word is the name of the field.
+
+    def __init__(self, start):
+        self.start, self.end = start, None
+        self.contents, self.buf = [], []
+        self.instructions = self.name = None
+
+    def add_instr(self, elem):
+        # The instruction text is the text of the element
+        self.add_raw(elem.text)
+
+    def add_raw(self, raw):
+        if raw:
+            if self.name is None:
+                # There are cases where partial index entries end with
+                # a significant space, along the lines of
+                # <>Summary <>  ...  <>Hearing<>.
+                # No known examples of starting with a space yet.
+                # Hence only leading whitespace is removed before the name
+                # is split off at the first space.
+                head, _sep, rest = raw.lstrip().partition(' ')
+                self.name, raw = head, rest
+            self.buf.append(raw)
+
+    def finalize(self):
+        # Join the collected pieces; the buffer is not available afterwards
+        instructions = ''.join(self.buf)
+        del self.buf
+        self.instructions = instructions
+
+
+# Token types produced by scanner
+WORD, FLAG = 0, 1
+# Tokenizer for field instructions. Every token is a (text, token type) pair;
+# whitespace between tokens produces no token.
+scanner = re.Scanner([
+    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
+    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
+    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
+    (r'\s+', None),
+], flags=re.DOTALL)
+
+# Sentinel under which the parse functions collect flags that are not in
+# their field map; it is removed from the result before it is returned
+null = object()
+
+
+def parser(name, field_map, default_field_name=None):
+    # Create the function parse_<name>(raw, log=None) that turns field
+    # instructions into a dict. field_map is a whitespace separated list of
+    # flag:option pairs. A word that follows a flag becomes the value of the
+    # option of that flag, any other word is stored as default_field_name.
+    # Flags missing from field_map are dropped together with their word.
+    flag_names = {}
+    for spec in field_map.split():
+        flag, option = spec.split(':')
+        flag_names[flag] = option
+
+    def parse(raw, log=None):
+        ans = {}
+        last_option = None
+        # Hide escaped backslashes and quotes from the scanner
+        escaped = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
+        tokens, _remainder = scanner.scan(escaped)
+        for token, token_type in tokens:
+            token = token.replace('\x01', '\\').replace('\x02', '"')
+            if token_type is FLAG:
+                # token is a backslash followed by the flag character
+                last_option = flag_names.get(token[1], null)
+                if last_option is not None:
+                    ans[last_option] = None
+                continue
+            if token_type is not WORD:
+                continue
+            key = default_field_name if last_option is None else last_option
+            ans[key] = token
+            last_option = None
+        ans.pop(null, None)
+        return ans
+
+    parse.__name__ = native_string_type('parse_' + name)
+
+    return parse
+
+
+# Parse functions for the instructions of the individual field types. The
+# second argument maps single letter flags to option names. The optional
+# third argument is the key for words that do not follow a flag; without it
+# such words are stored under the key None.
+parse_hyperlink = parser('hyperlink',
+    'l:anchor m:image-map n:target o:title t:target', 'url')
+
+parse_xe = parser('xe',
+    'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')
+
+parse_index = parser('index',
+    'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
+    ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
+    ' l:page-number-separator p:letter-range s:sequence-name r:run-together y:yomi z:langcode')
+
+parse_ref = parser('ref',
+    'd:separator f:footnote h:hyperlink n:number p:position r:relative-number t:suppress w:number-full-context')
+
+parse_noteref = parser('noteref',
+                   'f:footnote h:hyperlink p:position')
+
+
+class Fields(object):
+    # Collects the fields of a document and sorts them into one list per
+    # supported field type (hyperlink_fields, xe_fields, index_fields,
+    # ref_fields and noteref_fields), which are created by __call__().
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.fields = []
+        self.index_bookmark_counter = 0
+        self.index_bookmark_prefix = 'index-'
+
+    def __call__(self, doc, log):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        all_ids = frozenset(XPath('//*/@w:id')(doc))
+        c = 0
+        while self.index_bookmark_prefix in all_ids:
+            c += 1
+            self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % c)
+
+        # Fields can be nested, the innermost open field is the last item
+        open_fields = []
+        for elem in XPath(
+            '//*[name()="w:p" or name()="w:r" or'
+            ' name()="w:instrText" or'
+            ' (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end") or'
+            ' name()="w:fldSimple")]')(doc):
+            tag = elem.tag
+            if tag.endswith('}fldChar'):
+                if get(elem, 'w:fldCharType') == 'begin':
+                    field = Field(elem)
+                    open_fields.append(field)
+                    self.fields.append(field)
+                elif open_fields:
+                    # An end without a matching begin is ignored
+                    open_fields.pop().end = elem
+            elif tag.endswith('}instrText'):
+                if open_fields:
+                    open_fields[-1].add_instr(elem)
+            elif tag.endswith('}fldSimple'):
+                # The instructions are an attribute and the contents are
+                # the runs inside the element
+                instr = get(elem, 'w:instr')
+                if instr:
+                    field = Field(elem)
+                    field.add_raw(instr)
+                    self.fields.append(field)
+                    field.contents.extend(XPath('descendant::w:r')(elem))
+            elif open_fields:
+                open_fields[-1].contents.append(elem)
+
+        # Field names are recognized in all upper and in all lower case
+        field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')
+        parsers, field_parsers = {}, {}
+        for ftype in field_types:
+            method = getattr(self, 'parse_' + ftype)
+            parse_func = globals()['parse_%s' % ftype]
+            for key in (ftype.upper(), ftype):
+                parsers[key] = method
+                field_parsers[key] = parse_func
+            setattr(self, '%s_fields' % ftype, [])
+        unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'}  # The TOC and PAGEREF fields are handled separately
+
+        for field in self.fields:
+            field.finalize()
+            if not field.instructions:
+                continue
+            func = parsers.get(field.name)
+            if func is not None:
+                func(field, field_parsers[field.name], log)
+            elif field.name not in unknown_fields:
+                # Every unknown field name is reported only once
+                log.warn('Encountered unknown field: %s, ignoring it.' % field.name)
+                unknown_fields.add(field.name)
+
+    def get_runs(self, field):
+        # Group the runs in the contents of field into one list per
+        # paragraph, as we only handle spans in a single paragraph being
+        # wrapped in <a>
+        all_runs, current_runs = [], []
+        for x in field.contents:
+            tag = x.tag
+            if tag.endswith('}r'):
+                current_runs.append(x)
+            elif tag.endswith('}p'):
+                if current_runs:
+                    all_runs.append(current_runs)
+                current_runs = []
+        if current_runs:
+            all_runs.append(current_runs)
+        return all_runs
+
+    def parse_hyperlink(self, field, parse_func, log):
+        # Parse hyperlink fields
+        hl = parse_func(field.instructions, log)
+        if not hl:
+            return
+        if 'target' in hl and hl['target'] is None:
+            # A target flag without a value
+            hl['target'] = '_blank'
+        self.hyperlink_fields.extend((hl, runs) for runs in self.get_runs(field))
+
+    def parse_ref(self, field, parse_func, log):
+        # Only references that are hyperlinks to a destination are supported;
+        # the destination is the word stored under the key None
+        ref = parse_func(field.instructions, log)
+        dest = ref.get(None, None)
+        if dest is None or 'hyperlink' not in ref:
+            log.warn('Unsupported reference field (%s), ignoring: %r' % (field.name, ref))
+            return
+        for runs in self.get_runs(field):
+            self.hyperlink_fields.append(({'anchor':dest}, runs))
+
+    parse_noteref = parse_ref
+
+    def parse_xe(self, field, parse_func, log):
+        # Parse XE fields
+        if None in (field.start, field.end):
+            return
+        xe = parse_func(field.instructions, log)
+        if not xe:
+            return
+        # We insert a synthetic bookmark around this index item so that we
+        # can link to it later
+        expand = self.namespace.expand
+        self.index_bookmark_counter += 1
+        bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)
+
+        start_parent = field.start.getparent()
+        bm_start = start_parent.makeelement(expand('w:bookmarkStart'))
+        bm_start.set(expand('w:id'), bmark)
+        bm_start.set(expand('w:name'), bmark)
+        start_parent.insert(start_parent.index(field.start), bm_start)
+
+        end_parent = field.end.getparent()
+        bm_end = end_parent.makeelement(expand('w:bookmarkEnd'))
+        bm_end.set(expand('w:id'), bmark)
+        end_parent.insert(end_parent.index(field.end) + 1, bm_end)
+
+        xe['start_elem'] = field.start
+        self.xe_fields.append(xe)
+
+    def parse_index(self, field, parse_func, log):
+        # An index without contents or without blocks is ignored
+        if not field.contents:
+            return
+        idx = parse_func(field.instructions, log)
+        hyperlinks, blocks = process_index(
+            field, idx, self.xe_fields, log, self.namespace.XPath, self.namespace.expand)
+        if not blocks:
+            return
+        for anchor, run in hyperlinks:
+            self.hyperlink_fields.append(({'anchor':anchor}, [run]))
+
+        self.index_fields.append((idx, blocks))
+
+    def polish_markup(self, object_map):
+        if not self.index_fields:
+            return
+        # The blocks are values of object_map, its keys are what gets polished
+        rmap = {v:k for k, v in iteritems(object_map)}
+        for idx, blocks in self.index_fields:
+            polish_index_markup(idx, [rmap[b] for b in blocks])
+
+
+def test_parse_fields(return_tests=False):
+    # Tests for the field instruction parsers. The test suite is returned
+    # when return_tests is true, otherwise it is run with a text runner.
+    import unittest
+
+    class TestParseFields(unittest.TestCase):
+
+        def check(self, parse, cases):
+            # Every case is a pair of raw instructions and expected result
+            for raw, expected in cases:
+                self.assertEqual(parse(raw, None), expected)
+
+        def test_hyperlink(self):
+            self.check(parse_hyperlink, (
+                (r'\l anchor1', {'anchor':'anchor1'}),
+                (r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'}),
+                (r'www.calibre-ebook.com \t target \o tt',
+                 {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}),
+                (r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'}),
+                (r'xxxx \y yyyy', {'url': 'xxxx'}),
+            ))
+
+        def test_xe(self):
+            self.check(parse_xe, (
+                (r'"some name"', {'text':'some name'}),
+                (r'name \b \i', {'text':'name', 'bold':None, 'italic':None}),
+                (r'xxx \y a', {'text':'xxx', 'yomi':'a'}),
+            ))
+
+        def test_index(self):
+            self.check(parse_index, (
+                (r'', {}),
+                (r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'}),
+            ))
+
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
+    if return_tests:
+        return suite
+    unittest.TextTestRunner(verbosity=4).run(suite)
+
+
+# Running this module as a script runs the parser tests
+if __name__ == '__main__':
+    test_parse_fields()
--- a/ebook_converter/ebooks/docx/fonts.py
+++ b/ebook_converter/ebooks/docx/fonts.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, re
+from collections import namedtuple
+
+from calibre.ebooks.docx.block_styles import binary_property, inherit
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.fonts.scanner import font_scanner, NoFonts
+from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
+from calibre.utils.icu import ord_string
+from polyglot.builtins import codepoint_to_chr, iteritems, range
+
+Embed = namedtuple('Embed', 'name key subsetted')
+
+
+def has_system_fonts(name):
+    '''
+    Return True if the font scanner reports at least one font for the
+    family ``name``, False if it reports none or raises ``NoFonts``.
+    '''
+    try:
+        matches = font_scanner.fonts_for_family(name)
+    except NoFonts:
+        return False
+    return bool(matches)
+
+
+def get_variant(bold=False, italic=False):
+    '''
+    Return the variant name ('Regular', 'Italic', 'Bold' or 'BoldItalic')
+    for the given combination of the ``bold`` and ``italic`` flags.
+    '''
+    variants = {
+        (False, False): 'Regular',
+        (False, True): 'Italic',
+        (True, False): 'Bold',
+        (True, True): 'BoldItalic',
+    }
+    return variants[(bold, italic)]
+
+
+def find_fonts_matching(fonts, style='normal', stretch='normal'):
+    '''
+    Lazily yield every font from ``fonts`` whose 'font-style' equals
+    ``style`` and whose 'font-stretch' equals ``stretch``.
+    '''
+    for candidate in fonts:
+        if candidate['font-style'] != style:
+            continue
+        if candidate['font-stretch'] == stretch:
+            yield candidate
+
+
+def weight_key(font):
+    '''
+    Sort key giving the distance of the font's 'font-weight' from 400.
+    Weights that cannot be converted to an integer are looked up by name
+    ('normal' is 400, 'bold' is 700); anything else counts as 1000000 so
+    that it sorts last.
+    '''
+    weight = font['font-weight']
+    try:
+        numeric = int(weight)
+    except Exception:
+        numeric = {'normal': 400, 'bold': 700}.get(weight, 1000000)
+    return abs(numeric - 400)
+
+
+def get_best_font(fonts, style, stretch):
+    '''
+    Return the font matching ``style`` and ``stretch`` whose weight is
+    closest to 400 (the first such font on ties), or None if there is no
+    match or anything goes wrong while searching.
+    '''
+    try:
+        candidates = find_fonts_matching(fonts, style, stretch)
+        # min() returns the first of several equally good candidates
+        return min(candidates, key=weight_key)
+    except Exception:
+        return None
+
+
+class Family(object):
+    '''
+    A font family declared by a ``w:font`` element: its names, embedded
+    variants and the CSS generic family to fall back on.
+    '''
+
+    def __init__(self, elem, embed_relationships, XPath, get):
+        '''
+        :param elem: The ``w:font`` element to read.
+        :param embed_relationships: Container keyed by relationship id whose
+            values identify the embedded font resources.
+        :param XPath: Factory turning an XPath expression into a callable
+            that is applied to an element.
+        :param get: Callable ``get(elem, attr[, default])`` used to read
+            namespaced attributes.
+        '''
+        self.name = self.family_name = get(elem, 'w:name')
+        self.alt_names = tuple(get(alt, 'w:val') for alt in XPath('./w:altName')(elem))
+        if self.alt_names and not has_system_fonts(self.name):
+            # Use the first alternate name for which fonts are installed,
+            # keeping the declared name if none of them is available
+            self.family_name = next(
+                (alt for alt in self.alt_names if has_system_fonts(alt)), self.family_name)
+
+        self.embedded = {}
+        for variant in ('Regular', 'Bold', 'Italic', 'BoldItalic'):
+            for node in XPath('./w:embed%s[@r:id]' % variant)(elem):
+                rid = get(node, 'r:id')
+                font_key = get(node, 'w:fontKey')
+                is_subset = get(node, 'w:subsetted') in {'1', 'true', 'on'}
+                if rid in embed_relationships:
+                    self.embedded[variant] = Embed(embed_relationships[rid], font_key, is_subset)
+
+        self.generic_family = 'auto'
+        for node in XPath('./w:family[@w:val]')(elem):
+            self.generic_family = get(node, 'w:val', 'auto')
+
+        # The family counts as TrueType unless w:notTrueType is explicitly on
+        not_truetype = binary_property(elem, 'notTrueType', XPath, get)
+        self.is_ttf = not_truetype is inherit or not not_truetype
+
+        self.panose1 = None
+        self.panose_name = None
+        for node in XPath('./w:panose1[@w:val]')(elem):
+            try:
+                raw = get(node, 'w:val')
+                # Pairs of hexadecimal digits, one number per pair
+                numbers = tuple(int(raw[i:i+2], 16) for i in range(0, len(raw), 2))
+            except (TypeError, ValueError, IndexError):
+                pass
+            else:
+                self.panose1 = numbers
+                self.panose_name = panose_to_css_generic_family(numbers)
+
+        css_by_generic = {'roman':'serif', 'swiss':'sans-serif', 'modern':'monospace',
+                          'decorative':'fantasy', 'script':'cursive'}
+        mapped = css_by_generic.get(self.generic_family, None)
+        self.css_generic_family = mapped or self.panose_name or 'serif'
+
+
+SYMBOL_MAPS = {  # {{{
+    'Wingdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖉', '✂', '✁', '👓', '🕭', '🕮', '🕯', '🕿', '✆', '🖂', '🖃', '📪', '📫', '📬', '📭', '🗀', '🗁', '🗎', '🗏', '🗐', '🗄', '⏳', '🖮', '🖰', '🖲', '🖳', '🖴', '🖫', '🖬', '✇', '✍', '🖎', '✌', '🖏', '👍', '👎', '☜', '☞', '☜', '🖗', '🖐', '☺', '😐', '☹', '💣', '🕱', '🏳', '🏱', '✈', '☼', '🌢', '❄', '🕆', '✞', '🕈', '✠', '✡', '☪', '☯', '🕉', '☸', '♈', '♉', '♊', '♋', '♌', '♍', '♎', '♏', '♐', '♑', '♒', '♓', '🙰', '🙵', '⚫', '🔾', '◼', '🞏', '🞐', '❑', '❒', '🞟', '⧫', '◆', '❖', '🞙', '⌧', '⮹', '⌘', '🏵', '🏶', '🙶', '🙷', ' ', '🄋', '➀', '➁', '➂', '➃', '➄', '➅', '➆', '➇', '➈', '➉', '🄌', '➊', '➋', '➌', '➍', '➎', '➏', '➐', '➑', '➒', '➓', '🙢', '🙠', '🙡', '🙣', '🙦', '🙤', '🙥', '🙧', '∙', '•', '⬝', '⭘', '🞆', '🞈', '🞊', '🞋', '🔿', '▪', '🞎', '🟀', '🟁', '★', '🟋', '🟏', '🟓', '🟑', '⯐', '⌖', '⯎', '⯏', '⯑', '✪', '✰', '🕐', '🕑', '🕒', '🕓', '🕔', '🕕', '🕖', '🕗', '🕘', '🕙', '🕚', '🕛', '⮰', '⮱', '⮲', '⮳', '⮴', '⮵', '⮶', '⮷', '🙪', '🙫', '🙕', '🙔', '🙗', '🙖', '🙐', '🙑', '🙒', '🙓', '⌫', '⌦', '⮘', '⮚', '⮙', '⮛', '⮈', '⮊', '⮉', '⮋', '🡨', '🡪', '🡩', '🡫', '🡬', '🡭', '🡯', '🡮', '🡸', '🡺', '🡹', '🡻', '🡼', '🡽', '🡿', '🡾', '⇦', '⇨', '⇧', '⇩', '⬄', '⇳', '⬁', '⬀', '⬃', '⬂', '🢬', '🢭', '🗶', '✓', '🗷', '🗹', ' '),  # noqa
+
+    'Wingdings 2': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖊', '🖋', '🖌', '🖍', '✄', '✀', '🕾', '🕽', '🗅', '🗆', '🗇', '🗈', '🗉', '🗊', '🗋', '🗌', '🗍', '📋', '🗑', '🗔', '🖵', '🖶', '🖷', '🖸', '🖭', '🖯', '🖱', '🖒', '🖓', '🖘', '🖙', '🖚', '🖛', '👈', '👉', '🖜', '🖝', '🖞', '🖟', '🖠', '🖡', '👆', '👇', '🖢', '🖣', '🖑', '🗴', '🗸', '🗵', '☑', '⮽', '☒', '⮾', '⮿', '🛇', '⦸', '🙱', '🙴', '🙲', '🙳', '‽', '🙹', '🙺', '🙻', '🙦', '🙤', '🙥', '🙧', '🙚', '🙘', '🙙', '🙛', '⓪', '①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '⓿', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽', '❾', '❿', ' ', '☉', '🌕', '☽', '☾', '⸿', '✝', '🕇', '🕜', '🕝', '🕞', '🕟', '🕠', '🕡', '🕢', '🕣', '🕤', '🕥', '🕦', '🕧', '🙨', '🙩', '⋅', '🞄', '⦁', '●', '●', '🞅', '🞇', '🞉', '⊙', '⦿', '🞌', '🞍', '◾', '■', '□', '🞑', '🞒', '🞓', '🞔', '▣', '🞕', '🞖', '🞗', '🞘', '⬩', '⬥', '◇', '🞚', '◈', '🞛', '🞜', '🞝', '🞞', '⬪', '⬧', '◊', '🞠', '◖', '◗', '⯊', '⯋', '⯀', '⯁', '⬟', '⯂', '⬣', '⬢', '⯃', '⯄', '🞡', '🞢', '🞣', '🞤', '🞥', '🞦', '🞧', '🞨', '🞩', '🞪', '🞫', '🞬', '🞭', '🞮', '🞯', '🞰', '🞱', '🞲', '🞳', '🞴', '🞵', '🞶', '🞷', '🞸', '🞹', '🞺', '🞻', '🞼', '🞽', '🞾', '🞿', '🟀', '🟂', '🟄', '🟆', '🟉', '🟊', '✶', '🟌', '🟎', '🟐', '🟒', '✹', '🟃', '🟇', '✯', '🟍', '🟔', '⯌', '⯍', '※', '⁂', ' ', ' ', ' ', ' ', ' ', ' ',),  # noqa
+
+    'Wingdings 3': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '⭠', '⭢', '⭡', '⭣', '⭤', '⭥', '⭧', '⭦', '⭰', '⭲', '⭱', '⭳', '⭶', '⭸', '⭻', '⭽', '⭤', '⭥', '⭪', '⭬', '⭫', '⭭', '⭍', '⮠', '⮡', '⮢', '⮣', '⮤', '⮥', '⮦', '⮧', '⮐', '⮑', '⮒', '⮓', '⮀', '⮃', '⭾', '⭿', '⮄', '⮆', '⮅', '⮇', '⮏', '⮍', '⮎', '⮌', '⭮', '⭯', '⎋', '⌤', '⌃', '⌥', '␣', '⍽', '⇪', '⮸', '🢠', '🢡', '🢢', '🢣', '🢤', '🢥', '🢦', '🢧', '🢨', '🢩', '🢪', '🢫', '🡐', '🡒', '🡑', '🡓', '🡔', '🡕', '🡗', '🡖', '🡘', '🡙', '▲', '▼', '△', '▽', '◀', '▶', '◁', '▷', '◣', '◢', '◤', '◥', '🞀', '🞂', '🞁', ' ', '🞃', '⯅', '⯆', '⯇', '⯈', '⮜', '⮞', '⮝', '⮟', '🠐', '🠒', '🠑', '🠓', '🠔', '🠖', '🠕', '🠗', '🠘', '🠚', '🠙', '🠛', '🠜', '🠞', '🠝', '🠟', '🠀', '🠂', '🠁', '🠃', '🠄', '🠆', '🠅', '🠇', '🠈', '🠊', '🠉', '🠋', '🠠', '🠢', '🠤', '🠦', '🠨', '🠪', '🠬', '🢜', '🢝', '🢞', '🢟', '🠮', '🠰', '🠲', '🠴', '🠶', '🠸', '🠺', '🠹', '🠻', '🢘', '🢚', '🢙', '🢛', '🠼', '🠾', '🠽', '🠿', '🡀', '🡂', '🡁', '🡃', '🡄', '🡆', '🡅', '🡇', '⮨', '⮩', '⮪', '⮫', '⮬', '⮭', '⮮', '⮯', '🡠', '🡢', '🡡', '🡣', '🡤', '🡥', '🡧', '🡦', '🡰', '🡲', '🡱', '🡳', '🡴', '🡵', '🡷', '🡶', '🢀', '🢂', '🢁', '🢃', '🢄', '🢅', '🢇', '🢆', '🢐', '🢒', '🢑', '🢓', '🢔', '🢕', '🢗', '🢖', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',),  # noqa
+
+    'Webdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🕷', '🕸', '🕲', '🕶', '🏆', '🎖', '🖇', '🗨', '🗩', '🗰', '🗱', '🌶', '🎗', '🙾', '🙼', '🗕', '🗖', '🗗', '⏴', '⏵', '⏶', '⏷', '⏪', '⏩', '⏮', '⏭', '⏸', '⏹', '⏺', '🗚', '🗳', '🛠', '🏗', '🏘', '🏙', '🏚', '🏜', '🏭', '🏛', '🏠', '🏖', '🏝', '🛣', '🔍', '🏔', '👁', '👂', '🏞', '🏕', '🛤', '🏟', '🛳', '🕬', '🕫', '🕨', '🔈', '🎔', '🎕', '🗬', '🙽', '🗭', '🗪', '🗫', '⮔', '✔', '🚲', '⬜', '🛡', '📦', '🛱', '⬛', '🚑', '🛈', '🛩', '🛰', '🟈', '🕴', '⬤', '🛥', '🚔', '🗘', '🗙', '❓', '🛲', '🚇', '🚍', '⛳', '⦸', '⊖', '🚭', '🗮', '⏐', '🗯', '🗲', ' ', '🚹', '🚺', '🛉', '🛊', '🚼', '👽', '🏋', '⛷', '🏂', '🏌', '🏊', '🏄', '🏍', '🏎', '🚘', '🗠', '🛢', '📠', '🏷', '📣', '👪', '🗡', '🗢', '🗣', '✯', '🖄', '🖅', '🖃', '🖆', '🖹', '🖺', '🖻', '🕵', '🕰', '🖽', '🖾', '📋', '🗒', '🗓', '🕮', '📚', '🗞', '🗟', '🗃', '🗂', '🖼', '🎭', '🎜', '🎘', '🎙', '🎧', '💿', '🎞', '📷', '🎟', '🎬', '📽', '📹', '📾', '📻', '🎚', '🎛', '📺', '💻', '🖥', '🖦', '🖧', '🍹', '🎮', '🎮', '🕻', '🕼', '🖁', '🖀', '🖨', '🖩', '🖿', '🖪', '🗜', '🔒', '🔓', '🗝', '📥', '📤', '🕳', '🌣', '🌤', '🌥', '🌦', '☁', '🌨', '🌧', '🌩', '🌪', '🌬', '🌫', '🌜', '🌡', '🛋', '🛏', '🍽', '🍸', '🛎', '🛍', 'Ⓟ', '♿', '🛆', '🖈', '🎓', '🗤', '🗥', '🗦', '🗧', '🛪', '🐿', '🐦', '🐟', '🐕', '🐈', '🙬', '🙮', '🙭', '🙯', '🗺', '🌍', '🌏', '🌎', '🕊',),  # noqa
+
+    'Symbol': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '!', '∀', '#', '∃', '%', '&', '∍', '(', ')', '*', '+', ',', '−', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '≅', 'Α', 'Β', 'Χ', 'Δ', 'Ε', 'Φ', 'Γ', 'Η', 'Ι', 'ϑ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Θ', 'Ρ', 'Σ', 'Τ', 'Υ', 'ς', 'Ω', 'Ξ', 'Ψ', 'Ζ', '[', '∴', ']', '⊥', '_', '', 'α', 'β', 'χ', 'δ', 'ε', 'φ', 'γ', 'η', 'ι', 'ϕ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'θ', 'ρ', 'σ', 'τ', 'υ', 'ϖ', 'ω', 'ξ', 'ψ', 'ζ', '{', '|', '}', '~', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '€', 'ϒ', '′', '≤', '⁄', '∞', 'ƒ', '♣', '♥', '♦', '♠', '↔', '←', '↑', '→', '↓', '°', '±', '″', '≥', '×', '∝', '∂', '•', '÷', '≠', '≡', '≈', '…', '⏐', '⎯', '↲', 'ℵ', 'ℑ', 'ℜ', '℘', '⊗', '⊕', '∅', '∩', '∪', '⊃', '⊇', '⊄', '⊂', '⊆', '∈', '∉', '∠', '∂', '®', '©', '™', '∏', '√', '⋅', '¬', '∦', '∧', '⇔', '⇐', '⇑', '⇒', '⇓', '◊', '〈', '®', '©', '™', '∑', '⎛', '⎜', '⎝', '⎡', '⎢', '⎣', '⎧', '⎨', '⎩', '⎪', ' ', '〉', '∫', '⌠', '⎮', '⌡', '⎞', '⎟', '⎠', '⎤', '⎥', '⎦', '⎪', '⎫', '⎬', ' ',),  # noqa
+}  # }}}
+
+# Lower-cased names of the fonts that have a table in SYMBOL_MAPS, used for
+# case-insensitive membership tests
+SYMBOL_FONT_NAMES = frozenset(n.lower() for n in SYMBOL_MAPS)
+
+
+def is_symbol_font(family):
+    '''
+    Return True if ``family`` names, ignoring case, one of the fonts in
+    SYMBOL_FONT_NAMES. Values without a ``lower()`` method (such as None)
+    yield False.
+    '''
+    try:
+        lowered = family.lower()
+    except AttributeError:
+        return False
+    return lowered in SYMBOL_FONT_NAMES
+
+
+def do_map(m, points):
+    '''
+    Translate code points through the table ``m``.
+
+    Code points strictly greater than 0xf000 and below 0xf000 + len(m) are
+    replaced by the table entry at offset (code point - 0xf000); every other
+    code point is yielded as the character it denotes.
+    '''
+    first = 0xf000
+    end = first + len(m)
+    for point in points:
+        yield m[point - first] if first < point < end else codepoint_to_chr(point)
+
+
+def map_symbol_text(text, font):
+    '''
+    Replace the characters of ``text`` with the glyphs the symbol font
+    ``font`` would display for them.
+
+    :param text: The text to translate; bytes are decoded as UTF-8 first.
+    :param font: Name of a font in SYMBOL_MAPS. The name is matched exactly
+        first and then case-insensitively, because is_symbol_font() accepts
+        names regardless of case.
+    :raises KeyError: If no table exists for ``font``.
+    '''
+    m = SYMBOL_MAPS.get(font)
+    if m is None:
+        # is_symbol_font() ignores case, so a name such as 'wingdings' can
+        # reach this function even though the table keys are capitalised
+        wanted = font.lower() if hasattr(font, 'lower') else font
+        for name in SYMBOL_MAPS:
+            if name.lower() == wanted:
+                m = SYMBOL_MAPS[name]
+                break
+        else:
+            raise KeyError(font)
+    if isinstance(text, bytes):
+        text = text.decode('utf-8')
+    return ''.join(do_map(m, ord_string(text)))
+
+
+class Fonts(object):
+    '''
+    The font families declared in a document's font table, with helpers to
+    produce CSS font-family values and to extract the embedded fonts that
+    were actually used.
+    '''
+
+    def __init__(self, namespace):
+        '''
+        :param namespace: Object providing the ``XPath`` and ``get`` helpers
+            used to query the font table.
+        '''
+        self.namespace = namespace
+        # Family objects keyed by the declared font name
+        self.fonts = {}
+        # (font name, variant) pairs requested through family_for()
+        self.used = set()
+
+    def __call__(self, root, embed_relationships, docx, dest_dir):
+        '''
+        Read every ``w:font`` element that has a name from ``root``.
+        ``docx`` and ``dest_dir`` are accepted but not used here.
+        '''
+        for elem in self.namespace.XPath('//w:font[@w:name]')(root):
+            self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)
+
+    def family_for(self, name, bold=False, italic=False):
+        '''
+        Return the CSS font-family value for the font ``name`` and remember
+        that the requested variant is in use.
+
+        Unknown fonts yield 'serif'. Symbol fonts are returned as the bare
+        family name; all other fonts as the quoted family name followed by
+        the CSS generic family.
+        '''
+        f = self.fonts.get(name, None)
+        if f is None:
+            return 'serif'
+        variant = get_variant(bold, italic)
+        self.used.add((name, variant))
+        # Embedded variants are referenced by the declared name, all others
+        # by the (possibly substituted) family name
+        name = f.name if variant in f.embedded else f.family_name
+        if is_symbol_font(name):
+            return name
+        return '"%s", %s' % (name.replace('"', ''), f.css_generic_family)
+
+    def embed_fonts(self, dest_dir, docx):
+        '''
+        Write every used embedded font into the ``fonts`` sub-directory of
+        ``dest_dir`` and return the CSS ``@font-face`` rules for them as a
+        single string (empty if nothing was written).
+        '''
+        defs = []
+        dest_dir = os.path.join(dest_dir, 'fonts')
+        for name, variant in self.used:
+            f = self.fonts[name]
+            if variant in f.embedded:
+                if not os.path.exists(dest_dir):
+                    os.mkdir(dest_dir)
+                fname = self.write(name, dest_dir, docx, variant)
+                if fname is not None:
+                    d = {'font-family':'"%s"' % name.replace('"', ''), 'src': 'url("fonts/%s")' % fname}
+                    if 'Bold' in variant:
+                        d['font-weight'] = 'bold'
+                    if 'Italic' in variant:
+                        d['font-style'] = 'italic'
+                    d = ['%s: %s' % (k, v) for k, v in iteritems(d)]
+                    d = ';\n\t'.join(d)
+                    defs.append('@font-face {\n\t%s\n}\n' % d)
+        return '\n'.join(defs)
+
+    def write(self, name, dest_dir, docx, variant):
+        '''
+        Extract the embedded font for ``variant`` of the font ``name`` into
+        ``dest_dir``.
+
+        When the embed carries a font key, the first 32 bytes of the data are
+        XORed with the reversed key bytes before being written.
+
+        :return: The file name that was written, or None if the data is not
+            recognised as a TrueType/OpenType font.
+        '''
+        f = self.fonts[name]
+        ef = f.embedded[variant]
+        raw = docx.read(ef.name)
+        prefix = raw[:32]
+        if ef.key:
+            key = re.sub(r'[^A-Fa-f0-9]', '', ef.key)
+            key = bytearray(reversed(tuple(int(key[i:i+2], 16) for i in range(0, len(key), 2))))
+            # A key without any hexadecimal digits would otherwise cause a
+            # division by zero below; treat the data as not obfuscated
+            if key:
+                prefix = bytearray(prefix)
+                prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
+        if not is_truetype_font(prefix):
+            return None
+        ext = 'otf' if prefix.startswith(b'OTTO') else 'ttf'
+        fname = ascii_filename('%s - %s.%s' % (name, variant, ext))
+        with open(os.path.join(dest_dir, fname), 'wb') as dest:
+            dest.write(prefix)
+            dest.write(raw[32:])
+
+        return fname
--- a/ebook_converter/ebooks/docx/footnotes.py
+++ b/ebook_converter/ebooks/docx/footnotes.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+from polyglot.builtins import iteritems, unicode_type
+
+
+class Note(object):
+    '''
+    A single footnote or endnote. Iterating over it yields whatever
+    ``namespace.descendants`` returns for the ``w:p`` and ``w:tbl``
+    elements of the note.
+    '''
+
+    def __init__(self, namespace, parent, rels):
+        '''
+        :param namespace: Object providing the ``get`` and ``descendants``
+            helpers.
+        :param parent: The element of the note itself.
+        :param rels: The relationships that belong to the part the note
+            comes from; stored as is.
+        '''
+        self.namespace = namespace
+        self.parent = parent
+        self.rels = rels
+        # Notes that do not declare a type are 'normal'
+        self.type = namespace.get(parent, 'w:type', 'normal')
+
+    def __iter__(self):
+        blocks = self.namespace.descendants(self.parent, 'w:p', 'w:tbl')
+        for block in blocks:
+            yield block
+
+
+class Footnotes(object):
+    '''
+    The footnotes and endnotes of a document, plus the notes that were
+    actually referenced, in the order in which they were first requested.
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.counter = 0
+        # Notes keyed by their w:id
+        self.footnotes = {}
+        self.endnotes = {}
+        # anchor -> (label, note) for every note handed out by get_ref()
+        self.notes = OrderedDict()
+
+    def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
+        '''
+        Index the notes found in the ``footnotes`` and ``endnotes`` roots,
+        either of which may be None. Notes without a non-empty ``w:id`` are
+        ignored.
+        '''
+        XPath, get = self.namespace.XPath, self.namespace.get
+        sources = (
+            (footnotes, './w:footnote[@w:id]', footnotes_rels, self.footnotes),
+            (endnotes, './w:endnote[@w:id]', endnotes_rels, self.endnotes),
+        )
+        for root, expression, rels, store in sources:
+            if root is None:
+                continue
+            for elem in XPath(expression)(root):
+                note_id = get(elem, 'w:id')
+                if note_id:
+                    store[note_id] = Note(self.namespace, elem, rels)
+
+    def get_ref(self, ref):
+        '''
+        Resolve a note reference element.
+
+        :return: ``(anchor, label)`` for a known note of type 'normal', with
+            the note registered under that anchor; ``(None, None)``
+            otherwise.
+        '''
+        note_id = self.namespace.get(ref, 'w:id')
+        if ref.tag.endswith('}footnoteReference'):
+            pool = self.footnotes
+        else:
+            pool = self.endnotes
+        note = pool.get(note_id, None)
+        if note is None or note.type != 'normal':
+            return None, None
+        self.counter += 1
+        anchor = 'note_%d' % self.counter
+        self.notes[anchor] = (unicode_type(self.counter), note)
+        return anchor, unicode_type(self.counter)
+
+    def __iter__(self):
+        for anchor, entry in iteritems(self.notes):
+            label, note = entry
+            yield anchor, label, note
+
+    @property
+    def has_notes(self):
+        '''True if at least one note has been referenced.'''
+        return True if self.notes else False
--- a/ebook_converter/ebooks/docx/images.py
+++ b/ebook_converter/ebooks/docx/images.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+
+from lxml.html.builder import IMG, HR
+
+from calibre.constants import iswindows
+from calibre.ebooks.docx.names import barename
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.img import resize_to_fit, image_to_data
+from calibre.utils.imghdr import what
+from polyglot.builtins import iteritems, itervalues
+
+
+class LinkedImageNotFound(ValueError):
+    '''
+    Raised when the data for an image cannot be read. The name that was
+    looked up is available as ``fname``.
+    '''
+
+    def __init__(self, fname):
+        super(LinkedImageNotFound, self).__init__(fname)
+        self.fname = fname
+
+
+def image_filename(x):
+    '''
+    Return ``x`` passed through ``ascii_filename`` with every space and '#'
+    replaced by an underscore.
+    '''
+    safe = ascii_filename(x)
+    for unwanted in (' ', '#'):
+        safe = safe.replace(unwanted, '_')
+    return safe
+
+
+def emu_to_pt(x):
+    '''Convert a length in EMU to points (12700 EMU per point).'''
+    emu_per_pt = 12700
+    return x / emu_per_pt
+
+
+def pt_to_emu(x):
+    '''
+    Convert a length in points to EMU (12700 EMU per point), truncated to
+    an integer.
+    '''
+    emu_per_pt = 12700
+    return int(x * emu_per_pt)
+
+
+def get_image_properties(parent, XPath, get):
+    '''
+    Read the size, visibility, alternate text and title of a drawing.
+
+    :param parent: Element containing the ``wp:extent`` and ``wp:docPr``
+        children to read.
+    :param XPath: Factory turning an XPath expression into a callable that
+        is applied to an element.
+    :param get: Not used by this function.
+    :return: ``(style, alt, title)`` where ``style`` is a dict of CSS
+        properties (width and height in pt, ``display: none`` for hidden
+        drawings) and ``alt``/``title`` are strings or None.
+    '''
+    def read_dimension(extent, attr, current):
+        # Missing or malformed values leave the current value untouched
+        try:
+            return emu_to_pt(int(extent.get(attr)))
+        except (TypeError, ValueError):
+            return current
+
+    width = height = None
+    for extent in XPath('./wp:extent')(parent):
+        width = read_dimension(extent, 'cx', width)
+        height = read_dimension(extent, 'cy', height)
+
+    ans = {}
+    for prop, value in (('width', width), ('height', height)):
+        if value is not None:
+            ans[prop] = '%.3gpt' % value
+
+    alt = title = None
+    for doc_pr in XPath('./wp:docPr')(parent):
+        alt = doc_pr.get('descr') or alt
+        title = doc_pr.get('title') or title
+        if doc_pr.get('hidden', None) in {'true', 'on', '1'}:
+            ans['display'] = 'none'
+
+    return ans, alt, title
+
+
+def get_image_margins(elem):
+    '''
+    Translate the ``distL``, ``distT``, ``distR`` and ``distB`` attributes
+    of ``elem`` (distances in EMU) into CSS padding declarations.
+
+    :return: A dict mapping ``padding-left``/``-top``/``-right``/``-bottom``
+        to a length in pt. Attributes that are missing or are not integers
+        are skipped.
+    '''
+    ans = {}
+    for w, css in (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom')):
+        val = elem.get('dist%s' % w, None)
+        if val is not None:
+            try:
+                # Attribute values are strings and must be converted before
+                # doing arithmetic on them
+                val = emu_to_pt(int(val))
+            except (TypeError, ValueError):
+                continue
+            ans['padding-%s' % css] = '%.3gpt' % val
+    return ans
+
+
+def get_hpos(anchor, page_width, XPath, get, width_frac):
+    '''
+    Estimate the horizontal position of an anchored drawing as a number in
+    which 0 stands for the left edge and 1 for the right edge.
+
+    :param anchor: The ``wp:anchor`` element.
+    :param page_width: Width, in pt, that offsets are divided by.
+    :param XPath: Factory turning an XPath expression into a callable that
+        is applied to an element.
+    :param get: Not used by this function.
+    :param width_frac: Amount added to every position except an alignment
+        relative to the page. The caller in this file passes half of the
+        image width as a fraction of the page width.
+    :return: The position taken from ``wp:positionH`` (margin, alignment or
+        offset), else from ``wp:simplePos``, else 0.
+    '''
+    almap = {'left':0, 'center':0.5, 'right':1}
+    for ph in XPath('./wp:positionH')(anchor):
+        rp = ph.get('relativeFrom', None)
+        if rp == 'leftMargin':
+            return 0 + width_frac
+        if rp == 'rightMargin':
+            return 1 + width_frac
+        for align in XPath('./wp:align')(ph):
+            al = almap.get(align.text)
+            if al is not None:
+                if rp == 'page':
+                    return al
+                return al + width_frac
+        for po in XPath('./wp:posOffset')(ph):
+            try:
+                pos = emu_to_pt(int(po.text))
+            except (TypeError, ValueError):
+                continue
+            return pos/page_width + width_frac
+
+    for sp in XPath('./wp:simplePos')(anchor):
+        try:
+            # The attribute is a string (or None when missing); without the
+            # conversion this fallback could never produce a value
+            x = emu_to_pt(int(sp.get('x', None)))
+        except (TypeError, ValueError):
+            continue
+        return x/page_width + width_frac
+
+    return 0
+
+
+class Images(object):
+    '''
+    Extracts the images referenced by a document into an ``images``
+    directory and generates the HTML elements that display them.
+    '''
+
+    def __init__(self, namespace, log):
+        '''
+        :param namespace: Object providing the ``XPath`` and ``get`` helpers.
+        :param log: Logger; it is called directly and through its ``warn``
+            and ``exception`` methods.
+        '''
+        self.namespace = namespace
+        self.rid_map = {}
+        # (source name, max_width, max_height) -> name of the written file
+        self.used = {}
+        self.resized = {}
+        self.names = set()
+        # 'images/<name>' for every file written
+        self.all_images = set()
+        # (img element, link description, rid_map) for linked images
+        self.links = []
+        self.log = log
+
+    def __call__(self, relationships_by_id):
+        '''Set the mapping of relationship ids to image names.'''
+        self.rid_map = relationships_by_id
+
+    def read_image_data(self, fname, base=None):
+        '''
+        Read the raw data of the image ``fname``, either from the local
+        file system (``file://`` names) or from the document.
+
+        :param base: Preferred file name; defaults to the last path
+            component of ``fname``.
+        :return: ``(raw, name)`` where ``name`` carries the detected
+            extension. EMF images are unwrapped to the raster image
+            embedded in them when possible.
+        :raises LinkedImageNotFound: If the image cannot be found.
+        '''
+        if fname.startswith('file://'):
+            src = fname[len('file://'):]
+            if iswindows and src and src[0] == '/':
+                src = src[1:]
+            if not src or not os.path.exists(src):
+                raise LinkedImageNotFound(src)
+            with open(src, 'rb') as rawsrc:
+                raw = rawsrc.read()
+        else:
+            try:
+                raw = self.docx.read(fname)
+            except KeyError:
+                raise LinkedImageNotFound(fname)
+        base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
+        ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
+        if ext == 'emf':
+            # For an example, see: https://bugs.launchpad.net/bugs/1224849
+            self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
+            from calibre.utils.wmf.emf import emf_unwrap
+            try:
+                raw = emf_unwrap(raw)
+            except Exception:
+                self.log.exception('Failed to extract embedded raster image from EMF')
+            else:
+                ext = 'png'
+        base = base.rpartition('.')[0]
+        if not base:
+            base = 'image'
+        base += '.' + ext
+        return raw, base
+
+    def unique_name(self, base):
+        '''
+        Return ``base``, or ``base`` with a numeric suffix inserted before
+        the extension, such that the result is not yet in use.
+        '''
+        exists = frozenset(self.used.values())
+        c = 1
+        name = base
+        while name in exists:
+            n, e = base.rpartition('.')[0::2]
+            name = '%s-%d.%s' % (n, c, e)
+            c += 1
+        return name
+
+    def resize_image(self, raw, base, max_width, max_height):
+        '''
+        Fit the image into ``max_width`` x ``max_height``.
+
+        :return: ``(raw, base, resized)``. When the image was resized the
+            data is re-encoded and the size is appended to the file name,
+            otherwise ``raw`` and ``base`` are returned unchanged.
+        '''
+        resized, img = resize_to_fit(raw, max_width, max_height)
+        if resized:
+            base, ext = os.path.splitext(base)
+            base = base + '-%dx%d%s' % (max_width, max_height, ext)
+            raw = image_to_data(img, fmt=ext[1:])
+        return raw, base, resized
+
+    def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
+        '''
+        Write the image with relationship id ``rid`` into the destination
+        directory (once per requested size) and return its file name.
+
+        :param base: Preferred file name for the image.
+        :param rid_map: Mapping to resolve ``rid`` with; defaults to
+            ``self.rid_map``.
+        :param max_width: With ``max_height``, the box the image is resized
+            to fit. A request that turns out not to need resizing shares the
+            file of the unsized image.
+        :raises LinkedImageNotFound: If the image cannot be found.
+        '''
+        rid_map = self.rid_map if rid_map is None else rid_map
+        fname = rid_map[rid]
+        key = (fname, max_width, max_height)
+        ans = self.used.get(key)
+        if ans is not None:
+            return ans
+        raw, base = self.read_image_data(fname, base=base)
+        resized = False
+        if max_width is not None and max_height is not None:
+            raw, base, resized = self.resize_image(raw, base, max_width, max_height)
+        okey = (fname, None, None)
+        share_unsized = max_width is not None and max_height is not None and not resized
+        if share_unsized and okey in self.used:
+            # The unsized image is already on disk. Record it under the
+            # sized key as well so that later requests return the name of a
+            # file that actually exists.
+            self.used[key] = self.used[okey]
+            return self.used[okey]
+        name = self.unique_name(base)
+        self.used[key] = name
+        if share_unsized:
+            self.used[okey] = name
+        with open(os.path.join(self.dest_dir, name), 'wb') as f:
+            f.write(raw)
+        self.all_images.add('images/' + name)
+        return name
+
+    def pic_to_img(self, pic, alt, parent, title):
+        '''
+        Create the ``<img>`` element for the picture ``pic``.
+
+        :param alt: Fallback alternate text; the picture's own description
+            takes precedence.
+        :param parent: Element searched for a hyperlink on the picture.
+        :param title: Title for the image. A hyperlink tooltip is stored
+            with the link and does not replace it.
+        :return: The element, or None if no usable image was found.
+        '''
+        XPath, get = self.namespace.XPath, self.namespace.get
+        name = None
+        link = None
+        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
+            link = {'id':get(hl, 'r:id')}
+            tgt = hl.get('tgtFrame', None)
+            if tgt:
+                link['target'] = tgt
+            # Kept in its own variable so that the title of the image itself
+            # is not lost
+            tooltip = hl.get('tooltip', None)
+            if tooltip:
+                link['title'] = tooltip
+
+        for pr in XPath('descendant::pic:cNvPr')(pic):
+            name = pr.get('name', None)
+            if name:
+                name = image_filename(name)
+            alt = pr.get('descr') or alt
+            for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
+                rid = get(a, 'r:embed')
+                if not rid:
+                    rid = get(a, 'r:link')
+                if rid and rid in self.rid_map:
+                    try:
+                        src = self.generate_filename(rid, name)
+                    except LinkedImageNotFound as err:
+                        self.log.warn('Linked image: %s not found, ignoring' % err.fname)
+                        continue
+                    img = IMG(src='images/%s' % src)
+                    img.set('alt', alt or 'Image')
+                    if title:
+                        img.set('title', title)
+                    if link is not None:
+                        self.links.append((img, link, self.rid_map))
+                    return img
+
+    def drawing_to_html(self, drawing, page):
+        '''
+        Yield an ``<img>`` element for every picture in ``drawing``, inline
+        pictures first and anchored (floating) ones after them.
+        '''
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # First process the inline pictures
+        for inline in XPath('./wp:inline')(drawing):
+            style, alt, title = get_image_properties(inline, XPath, get)
+            for pic in XPath('descendant::pic:pic')(inline):
+                ans = self.pic_to_img(pic, alt, inline, title)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.items()))
+                    yield ans
+
+        # Now process the floats
+        for anchor in XPath('./wp:anchor')(drawing):
+            style, alt, title = get_image_properties(anchor, XPath, get)
+            self.get_float_properties(anchor, style, page)
+            for pic in XPath('descendant::pic:pic')(anchor):
+                ans = self.pic_to_img(pic, alt, anchor, title)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.items()))
+                    yield ans
+
+    def pict_to_html(self, pict, page):
+        '''
+        Yield the HTML for a ``pict`` element: an ``<hr>`` when it
+        describes a horizontal rule, followed by an ``<img>`` for every
+        image it references.
+        '''
+        XPath, get = self.namespace.XPath, self.namespace.get
+        # First see if we have an <hr>
+        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
+        if is_hr:
+            style = {}
+            hr = HR()
+            try:
+                pct = float(get(pict[0], 'o:hrpct'))
+            except (ValueError, TypeError, AttributeError):
+                pass
+            else:
+                if pct > 0:
+                    style['width'] = '%.3g%%' % pct
+            align = get(pict[0], 'o:hralign', 'center')
+            if align in {'left', 'right'}:
+                style['margin-left'] = '0' if align == 'left' else 'auto'
+                style['margin-right'] = 'auto' if align == 'left' else '0'
+            if style:
+                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in style.items())))
+            yield hr
+
+        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
+            rid = get(imagedata, 'r:id')
+            if rid in self.rid_map:
+                try:
+                    src = self.generate_filename(rid)
+                except LinkedImageNotFound as err:
+                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
+                    continue
+                img = IMG(src='images/%s' % src, style="display:block")
+                alt = get(imagedata, 'o:title')
+                img.set('alt', alt or 'Image')
+                yield img
+
+    def get_float_properties(self, anchor, style, page):
+        '''
+        Add the CSS needed to position an anchored drawing to ``style``,
+        which is modified in place: display, float or automatic margins
+        depending on the text wrapping mode, and padding.
+        '''
+        XPath, get = self.namespace.XPath, self.namespace.get
+        if 'display' not in style:
+            style['display'] = 'block'
+        padding = get_image_margins(anchor)
+        # Widths are stored as '<number>pt'
+        width = float(style.get('width', '100pt')[:-2])
+
+        page_width = page.width - page.margin_left - page.margin_right
+        if page_width <= 0:
+            # Ignore margins
+            page_width = page.width
+
+        hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))
+
+        wrap_elem = None
+        dofloat = False
+
+        for child in reversed(anchor):
+            bt = barename(child.tag)
+            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
+                wrap_elem = child
+                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
+                break
+
+        if wrap_elem is not None:
+            padding.update(get_image_margins(wrap_elem))
+            wt = wrap_elem.get('wrapText', None)
+            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
+            if dofloat:
+                style['float'] = 'left' if hpos < 0.65 else 'right'
+            else:
+                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
+                if ml is not None:
+                    style['margin-left'] = ml
+                if mr is not None:
+                    style['margin-right'] = mr
+
+        style.update(padding)
+
+    def to_html(self, elem, page, docx, dest_dir):
+        '''
+        Yield the HTML elements for the drawing or picture ``elem``,
+        writing the images into the ``images`` sub-directory of
+        ``dest_dir``, which is created if necessary.
+        '''
+        dest = os.path.join(dest_dir, 'images')
+        if not os.path.exists(dest):
+            os.mkdir(dest)
+        self.dest_dir, self.docx = dest, docx
+        if elem.tag.endswith('}drawing'):
+            for tag in self.drawing_to_html(elem, page):
+                yield tag
+        else:
+            for tag in self.pict_to_html(elem, page):
+                yield tag
--- a/ebook_converter/ebooks/docx/index.py
+++ b/ebook_converter/ebooks/docx/index.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from operator import itemgetter
+
+from lxml import etree
+
+from calibre.utils.icu import partition_by_first_letter, sort_key
+from polyglot.builtins import iteritems, filter
+
+
+def get_applicable_xe_fields(index, xe_fields, XPath, expand):
+    iet = index.get('entry-type', None)
+    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
+
+    lr = index.get('letter-range', None)
+    if lr is not None:
+        sl, el = lr.parition('-')[0::2]
+        sl, el = sl.strip(), el.strip()
+        if sl and el:
+            def inrange(text):
+                return sl <= text[0] <= el
+            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]
+
+    bmark = index.get('bookmark', None)
+    if bmark is None:
+        return xe_fields
+    attr = expand('w:name')
+    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
+    ancestors = XPath('ancestor::w:bookmarkStart')
+
+    def contained(xe):
+        # Check if the xe field is contained inside a bookmark with the
+        # specified name
+        return bool(set(ancestors(xe['start_elem'])) & bookmarks)
+
+    return [xe for xe in xe_fields if contained(xe)]
+
+
+def make_block(expand, style, parent, pos):
+    p = parent.makeelement(expand('w:p'))
+    parent.insert(pos, p)
+    if style is not None:
+        ppr = p.makeelement(expand('w:pPr'))
+        p.append(ppr)
+        ps = ppr.makeelement(expand('w:pStyle'))
+        ppr.append(ps)
+        ps.set(expand('w:val'), style)
+    r = p.makeelement(expand('w:r'))
+    p.append(r)
+    t = r.makeelement(expand('w:t'))
+    t.set(expand('xml:space'), 'preserve')
+    r.append(t)
+    return p, t
+
+
+def add_xe(xe, t, expand):
+    run = t.getparent()
+    idx = run.index(t)
+    t.text = xe.get('text') or ' '
+    pt = xe.get('page-number-text', None)
+
+    if pt:
+        p = t.getparent().getparent()
+        r = p.makeelement(expand('w:r'))
+        p.append(r)
+        t2 = r.makeelement(expand('w:t'))
+        t2.set(expand('xml:space'), 'preserve')
+        t2.text = ' [%s]' % pt
+        r.append(t2)
+    # put separate entries on separate lines
+    run.insert(idx + 1, run.makeelement(expand('w:br')))
+    return xe['anchor'], run
+
+
+def process_index(field, index, xe_fields, log, XPath, expand):
+    '''
+    We remove all the word generated index markup and replace it with our own
+    that is more suitable for an ebook.
+    '''
+    styles = []
+    heading_text = index.get('heading', None)
+    heading_style = 'IndexHeading'
+    start_pos = None
+    for elem in field.contents:
+        if elem.tag.endswith('}p'):
+            s = XPath('descendant::pStyle/@w:val')(elem)
+            if s:
+                styles.append(s[0])
+            p = elem.getparent()
+            if start_pos is None:
+                start_pos = (p, p.index(elem))
+            p.remove(elem)
+
+    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
+    if not xe_fields:
+        return [], []
+    if heading_text is not None:
+        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
+        items = []
+        for key, fields in iteritems(groups):
+            items.append(key), items.extend(fields)
+        if styles:
+            heading_style = styles[0]
+    else:
+        items = sorted(xe_fields, key=lambda x:sort_key(x['text']))
+
+    hyperlinks = []
+    blocks = []
+    for item in reversed(items):
+        is_heading = not isinstance(item, dict)
+        style = heading_style if is_heading else None
+        p, t = make_block(expand, style, *start_pos)
+        if is_heading:
+            text = heading_text
+            if text.lower().startswith('a'):
+                text = item + text[1:]
+            t.text = text
+        else:
+            hyperlinks.append(add_xe(item, t, expand))
+            blocks.append(p)
+
+    return hyperlinks, blocks
+
+
+def split_up_block(block, a, text, parts, ldict):
+    prefix = parts[:-1]
+    a.text = parts[-1]
+    parent = a.getparent()
+    style = 'display:block; margin-left: %.3gem'
+    for i, prefix in enumerate(prefix):
+        m = 1.5 * i
+        span = parent.makeelement('span', style=style % m)
+        ldict[span]    = i
+        parent.append(span)
+        span.text = prefix
+    span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
+    parent.append(span)
+    span.append(a)
+    ldict[span]    = len(prefix)
+
+
+"""
+The merge algorithm is a little tricky.
+We start with a list of elementary blocks. Each is an HtmlElement, a p node
+with a list of child nodes. The last child may be a link, and the earlier ones are
+just text.
+The list is in reverse order from what we want in the index.
+There is a dictionary ldict which records the level of each child node.
+
+Now we want to do a reduce-like operation, combining all blocks with the same
+top level index entry into a single block representing the structure of all
+references, subentries, etc. under that top entry.
+Here's the algorithm.
+
+Given a block p and the next block n, and the top level entries p1 and n1 in each
+block, which we assume have the same text:
+
+Start with (p, p1) and (n, n1).
+
+Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
+
+If there are no more levels in n, and we have a link in nk,
+then add the link from nk to the links for pk.
+This might be the first link for pk, or we might get a list of references.
+
+Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
+the same text, it must follow pk, it must come before we find any other p entries at
+the same level as pk, and it must have the same level as nk+1.
+
+If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
+
+If there is no matching entry, then because of the original reversed order we want
+to insert nk+1 and all following entries from n into p immediately following pk.
+"""
+
+
+def find_match(prev_block, pind, nextent, ldict):
+    curlevel = ldict.get(prev_block[pind], -1)
+    if curlevel < 0:
+        return -1
+    for p in range(pind+1, len(prev_block)):
+        trylev = ldict.get(prev_block[p], -1)
+        if trylev <= curlevel:
+            return -1
+        if trylev > (curlevel+1):
+            continue
+        if prev_block[p].text_content() == nextent.text_content():
+            return p
+    return -1
+
+
+def add_link(pent, nent, ldict):
+    na = nent.xpath('descendant::a[1]')
+    # If there is no link, leave it as text
+    if not na or len(na) == 0:
+        return
+    na = na[0]
+    pa = pent.xpath('descendant::a')
+    if pa and len(pa) > 0:
+        # Put on same line with a comma
+        pa = pa[-1]
+        pa.tail = ', '
+        p = pa.getparent()
+        p.insert(p.index(pa) + 1, na)
+    else:
+        # substitute link na for plain text in pent
+        pent.text = ""
+        pent.append(na)
+
+
+def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
+    '''
+    Merge the entry held in ``next_block`` into ``prev_block``.
+
+    ``pind`` and ``nind`` are the indices of the children of the two blocks
+    that are already known to match, ``next_path`` is the list of path
+    components of ``next_block`` and ``ldict`` maps child nodes to their
+    nesting level.
+
+    * When ``nind`` is the last component of ``next_path`` the link of that
+      child (if any) is added to the matching child of ``prev_block`` and the
+      function returns without detaching ``next_block``.
+    * When find_match() locates the next component below ``pind`` the merge
+      continues one level deeper.
+    * Otherwise the remaining children of ``next_block`` are moved into
+      ``prev_block`` directly after ``pind`` and ``next_block`` is removed
+      from its parent.
+    '''
+    # First elements match. Any more in next?
+    if len(next_path) == (nind + 1):
+        nextent = next_block[nind]
+        add_link(prev_block[pind], nextent, ldict)
+        return
+
+    nind = nind + 1
+    nextent = next_block[nind]
+    prevent = find_match(prev_block, pind, nextent, ldict)
+    # find_match() only looks at indices after pind, so a hit is never 0
+    if prevent > 0:
+        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
+        return
+
+    # Want to insert elements into previous block
+    while nind < len(next_block):
+        # insert takes it out of old, so next_block shrinks on every pass.
+        # nind itself never changes, the loop ends only because of that.
+        pind = pind + 1
+        prev_block.insert(pind, next_block[nind])
+
+    next_block.getparent().remove(next_block)
+
+
+def polish_index_markup(index, blocks):
+    # Blocks are in reverse order at this point
+    path_map = {}
+    ldict = {}
+    for block in blocks:
+        cls = block.get('class', '') or ''
+        block.set('class', (cls + ' index-entry').lstrip())
+        a = block.xpath('descendant::a[1]')
+        text = ''
+        if a:
+            text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
+        if ':' in text:
+            path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
+            if len(parts) > 1:
+                split_up_block(block, a[0], text, parts, ldict)
+        else:
+            # try using a span all the time
+            path_map[block] = [text]
+            parent = a[0].getparent()
+            span = parent.makeelement('span', style='display:block; margin-left: 0em')
+            parent.append(span)
+            span.append(a[0])
+            ldict[span] = 0
+
+        for br in block.xpath('descendant::br'):
+            br.tail = None
+
+    # We want a single block for each main entry
+    prev_block = blocks[0]
+    for block in blocks[1:]:
+        pp, pn = path_map[prev_block], path_map[block]
+        if pp[0] == pn[0]:
+            merge_blocks(prev_block, block, 0, 0, pn, ldict)
+        else:
+            prev_block = block
--- a/ebook_converter/ebooks/docx/names.py
+++ b/ebook_converter/ebooks/docx/names.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+
+from lxml.etree import XPath as X
+
+from calibre.utils.filenames import ascii_text
+from polyglot.builtins import iteritems
+
+# Names {{{
+# Relationship type URIs of the parts of a DOCX package, keyed by a short
+# symbolic name. These are the values used by transitional documents; the
+# strict variants are derived from this table below.
+TRANSITIONAL_NAMES = {
+    'DOCUMENT'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
+    'DOCPROPS'  : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
+    'APPPROPS'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
+    'STYLES'    : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
+    'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
+    'FONTS'     : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
+    'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
+    'IMAGES'    : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
+    'LINKS'     : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
+    'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
+    'ENDNOTES'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
+    'THEMES'    : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
+    'SETTINGS'  : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
+    'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
+}
+
+# Same keys as TRANSITIONAL_NAMES with the officeDocument/2006 prefix swapped
+# for its purl.oclc.org counterpart. Values that do not start with that
+# prefix (DOCPROPS uses package/2006) are carried over unchanged.
+STRICT_NAMES = {
+    k:v.replace('http://schemas.openxmlformats.org/officeDocument/2006',  'http://purl.oclc.org/ooxml/officeDocument')
+    for k, v in iteritems(TRANSITIONAL_NAMES)
+}
+
+# XML namespace prefix -> URI table used to compile XPath expressions and to
+# expand prefixed names for transitional documents. Note that the prefixes
+# 've' and 'mc' map to the same URI.
+TRANSITIONAL_NAMESPACES = {
+    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
+    'o': 'urn:schemas-microsoft-com:office:office',
+    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
+    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
+    # Text Content
+    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
+    'w10': 'urn:schemas-microsoft-com:office:word',
+    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+    'xml': 'http://www.w3.org/XML/1998/namespace',
+    # Drawing
+    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
+    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
+    'mv': 'urn:schemas-microsoft-com:mac:vml',
+    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
+    'v': 'urn:schemas-microsoft-com:vml',
+    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+    # Properties (core and extended)
+    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
+    'dc': 'http://purl.org/dc/elements/1.1/',
+    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
+    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+    # Content Types
+    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
+    # Package Relationships
+    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
+    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
+    # Dublin Core document properties
+    'dcmitype': 'http://purl.org/dc/dcmitype/',
+    'dcterms': 'http://purl.org/dc/terms/'
+}
+
+# Same prefixes as TRANSITIONAL_NAMESPACES with the officeDocument,
+# wordprocessingml and drawingml URI prefixes swapped for their
+# purl.oclc.org counterparts. All other URIs are carried over unchanged.
+STRICT_NAMESPACES = {
+    k:v.replace(
+        'http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument').replace(
+        'http://schemas.openxmlformats.org/wordprocessingml/2006', 'http://purl.oclc.org/ooxml/wordprocessingml').replace(
+        'http://schemas.openxmlformats.org/drawingml/2006', 'http://purl.oclc.org/ooxml/drawingml')
+    for k, v in iteritems(TRANSITIONAL_NAMESPACES)
+}
+# }}}
+
+
+def barename(x):
+    return x.rpartition('}')[-1]
+
+
+def XML(x):
+    return '{%s}%s' % (TRANSITIONAL_NAMESPACES['xml'], x)
+
+
+def generate_anchor(name, existing):
+    x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
+    c = 1
+    while y in existing:
+        y = '%s_%d' % (x, c)
+        c += 1
+    return y
+
+
+class DOCXNamespace(object):
+
+    def __init__(self, transitional=True):
+        self.xpath_cache = {}
+        if transitional:
+            self.namespaces = TRANSITIONAL_NAMESPACES.copy()
+            self.names = TRANSITIONAL_NAMES.copy()
+        else:
+            self.namespaces = STRICT_NAMESPACES.copy()
+            self.names = STRICT_NAMES.copy()
+
+    def XPath(self, expr):
+        ans = self.xpath_cache.get(expr, None)
+        if ans is None:
+            self.xpath_cache[expr] = ans = X(expr, namespaces=self.namespaces)
+        return ans
+
+    def is_tag(self, x, q):
+        tag = getattr(x, 'tag', x)
+        ns, name = q.partition(':')[0::2]
+        return '{%s}%s' % (self.namespaces.get(ns, None), name) == tag
+
+    def expand(self, name, sep=':'):
+        ns, tag = name.partition(sep)[::2]
+        if ns and tag:
+            tag = '{%s}%s' % (self.namespaces[ns], tag)
+        return tag or ns
+
+    def get(self, x, attr, default=None):
+        return x.attrib.get(self.expand(attr), default)
+
+    def ancestor(self, elem, name):
+        try:
+            return self.XPath('ancestor::%s[1]' % name)(elem)[0]
+        except IndexError:
+            return None
+
+    def children(self, elem, *args):
+        return self.XPath('|'.join('child::%s' % a for a in args))(elem)
+
+    def descendants(self, elem, *args):
+        return self.XPath('|'.join('descendant::%s' % a for a in args))(elem)
+
+    def makeelement(self, root, tag, append=True, **attrs):
+        ans = root.makeelement(self.expand(tag), **{self.expand(k, sep='_'):v for k, v in iteritems(attrs)})
+        if append:
+            root.append(ans)
+        return ans
--- a/ebook_converter/ebooks/docx/numbering.py
+++ b/ebook_converter/ebooks/docx/numbering.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re, string
+from collections import Counter, defaultdict
+from functools import partial
+
+from lxml.html.builder import OL, UL, SPAN
+
+from calibre.ebooks.docx.block_styles import ParagraphStyle
+from calibre.ebooks.docx.char_styles import RunStyle, inherit
+from calibre.ebooks.metadata import roman
+from polyglot.builtins import iteritems, unicode_type
+
+# Maps the value of w:numFmt to the list-style-type used for it. Values that
+# are missing here fall back to 'decimal' where the map is consulted.
+STYLE_MAP = {
+    'aiueo': 'hiragana',
+    'aiueoFullWidth': 'hiragana',
+    'hebrew1': 'hebrew',
+    'iroha': 'katakana-iroha',
+    'irohaFullWidth': 'katakana-iroha',
+    'lowerLetter': 'lower-alpha',
+    'lowerRoman': 'lower-roman',
+    'none': 'none',
+    'upperLetter': 'upper-alpha',
+    'upperRoman': 'upper-roman',
+    'chineseCounting': 'cjk-ideographic',
+    'decimalZero': 'decimal-leading-zero',
+}
+
+
+def alphabet(val, lower=True):
+    x = string.ascii_lowercase if lower else string.ascii_uppercase
+    return x[(abs(val - 1)) % len(x)]
+
+
+alphabet_map = {
+    'lower-alpha':alphabet, 'upper-alpha':partial(alphabet, lower=False),
+    'lower-roman':lambda x:roman(x).lower(), 'upper-roman':roman,
+    'decimal-leading-zero': lambda x: '0%d' % x
+}
+
+
+class Level(object):
+
+    def __init__(self, namespace, lvl=None):
+        self.namespace = namespace
+        self.restart = None
+        self.start = 0
+        self.fmt = 'decimal'
+        self.para_link = None
+        self.paragraph_style = self.character_style = None
+        self.is_numbered = False
+        self.num_template = None
+        self.bullet_template = None
+        self.pic_id = None
+
+        if lvl is not None:
+            self.read_from_xml(lvl)
+
+    def copy(self):
+        ans = Level(self.namespace)
+        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
+            setattr(ans, x, getattr(self, x))
+        return ans
+
+    def format_template(self, counter, ilvl, template):
+        def sub(m):
+            x = int(m.group(1)) - 1
+            if x > ilvl or x not in counter:
+                return ''
+            val = counter[x] - (0 if x == ilvl else 1)
+            formatter = alphabet_map.get(self.fmt, lambda x: '%d' % x)
+            return formatter(val)
+        return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
+
+    def read_from_xml(self, lvl, override=False):
+        XPath, get = self.namespace.XPath, self.namespace.get
+        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
+            try:
+                self.restart = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        for lr in XPath('./w:start[@w:val]')(lvl):
+            try:
+                self.start = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        for rPr in XPath('./w:rPr')(lvl):
+            ps = RunStyle(self.namespace, rPr)
+            if self.character_style is None:
+                self.character_style = ps
+            else:
+                self.character_style.update(ps)
+
+        lt = None
+        for lr in XPath('./w:lvlText[@w:val]')(lvl):
+            lt = get(lr, 'w:val')
+
+        for lr in XPath('./w:numFmt[@w:val]')(lvl):
+            val = get(lr, 'w:val')
+            if val == 'bullet':
+                self.is_numbered = False
+                cs = self.character_style
+                if lt in {'\uf0a7', 'o'} or (
+                    cs is not None and cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
+                    self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+                else:
+                    self.bullet_template = lt
+                for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
+                    self.pic_id = get(lpid, 'w:val')
+            else:
+                self.is_numbered = True
+                self.fmt = STYLE_MAP.get(val, 'decimal')
+                if lt and re.match(r'%\d+\.$', lt) is None:
+                    self.num_template = lt
+
+        for lr in XPath('./w:pStyle[@w:val]')(lvl):
+            self.para_link = get(lr, 'w:val')
+
+        for pPr in XPath('./w:pPr')(lvl):
+            ps = ParagraphStyle(self.namespace, pPr)
+            if self.paragraph_style is None:
+                self.paragraph_style = ps
+            else:
+                self.paragraph_style.update(ps)
+
+    def css(self, images, pic_map, rid_map):
+        ans = {'list-style-type': self.fmt}
+        if self.pic_id:
+            rid = pic_map.get(self.pic_id, None)
+            if rid:
+                try:
+                    fname = images.generate_filename(rid, rid_map=rid_map, max_width=20, max_height=20)
+                except Exception:
+                    fname = None
+                else:
+                    ans['list-style-image'] = 'url("images/%s")' % fname
+        return ans
+
+    def char_css(self):
+        try:
+            css = self.character_style.css
+        except AttributeError:
+            css = {}
+        css.pop('font-family', None)
+        return css
+
+
+class NumberingDefinition(object):
+
+    def __init__(self, namespace, parent=None, an_id=None):
+        self.namespace = namespace
+        XPath, get = self.namespace.XPath, self.namespace.get
+        self.levels = {}
+        self.abstract_numbering_definition_id = an_id
+        if parent is not None:
+            for lvl in XPath('./w:lvl')(parent):
+                try:
+                    ilvl = int(get(lvl, 'w:ilvl', 0))
+                except (TypeError, ValueError):
+                    ilvl = 0
+                self.levels[ilvl] = Level(namespace, lvl)
+
+    def copy(self):
+        ans = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
+        for l, lvl in iteritems(self.levels):
+            ans.levels[l] = lvl.copy()
+        return ans
+
+
+class Numbering(object):
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.definitions = {}
+        self.instances = {}
+        self.counters = defaultdict(Counter)
+        self.starts = {}
+        self.pic_map = {}
+
+    def __call__(self, root, styles, rid_map):
+        ' Read all numbering style definitions '
+        XPath, get = self.namespace.XPath, self.namespace.get
+        self.rid_map = rid_map
+        for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
+            npbid = get(npb, 'w:numPicBulletId')
+            for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
+                rid = get(idata, 'r:id')
+                self.pic_map[npbid] = rid
+        lazy_load = {}
+        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
+            an_id = get(an, 'w:abstractNumId')
+            nsl = XPath('./w:numStyleLink[@w:val]')(an)
+            if nsl:
+                lazy_load[an_id] = get(nsl[0], 'w:val')
+            else:
+                nd = NumberingDefinition(self.namespace, an, an_id=an_id)
+                self.definitions[an_id] = nd
+
+        def create_instance(n, definition):
+            nd = definition.copy()
+            start_overrides = {}
+            for lo in XPath('./w:lvlOverride')(n):
+                try:
+                    ilvl = int(get(lo, 'w:ilvl'))
+                except (ValueError, TypeError):
+                    ilvl = None
+                for so in XPath('./w:startOverride[@w:val]')(lo):
+                    try:
+                        start_override = int(get(so, 'w:val'))
+                    except (TypeError, ValueError):
+                        pass
+                    else:
+                        start_overrides[ilvl] = start_override
+                for lvl in XPath('./w:lvl')(lo)[:1]:
+                    nilvl = get(lvl, 'w:ilvl')
+                    ilvl = nilvl if ilvl is None else ilvl
+                    alvl = nd.levels.get(ilvl, None)
+                    if alvl is None:
+                        alvl = Level(self.namespace)
+                    alvl.read_from_xml(lvl, override=True)
+            for ilvl, so in iteritems(start_overrides):
+                try:
+                    nd.levels[ilvl].start = start_override
+                except KeyError:
+                    pass
+            return nd
+
+        next_pass = {}
+        for n in XPath('./w:num[@w:numId]')(root):
+            an_id = None
+            num_id = get(n, 'w:numId')
+            for an in XPath('./w:abstractNumId[@w:val]')(n):
+                an_id = get(an, 'w:val')
+            d = self.definitions.get(an_id, None)
+            if d is None:
+                next_pass[num_id] = (an_id, n)
+                continue
+            self.instances[num_id] = create_instance(n, d)
+
+        numbering_links = styles.numbering_style_links
+        for an_id, style_link in iteritems(lazy_load):
+            num_id = numbering_links[style_link]
+            self.definitions[an_id] = self.instances[num_id].copy()
+
+        for num_id, (an_id, n) in iteritems(next_pass):
+            d = self.definitions.get(an_id, None)
+            if d is not None:
+                self.instances[num_id] = create_instance(n, d)
+
+        for num_id, d in iteritems(self.instances):
+            self.starts[num_id] = {lvl:d.levels[lvl].start for lvl in d.levels}
+
+    def get_pstyle(self, num_id, style_id):
+        d = self.instances.get(num_id, None)
+        if d is not None:
+            for ilvl, lvl in iteritems(d.levels):
+                if lvl.para_link == style_id:
+                    return ilvl
+
+    def get_para_style(self, num_id, lvl):
+        d = self.instances.get(num_id, None)
+        if d is not None:
+            lvl = d.levels.get(lvl, None)
+            return getattr(lvl, 'paragraph_style', None)
+
+    def update_counter(self, counter, levelnum, levels):
+        counter[levelnum] += 1
+        for ilvl, lvl in iteritems(levels):
+            restart = lvl.restart
+            if (restart is None and ilvl == levelnum + 1) or restart == levelnum + 1:
+                counter[ilvl] = lvl.start
+
+    def apply_markup(self, items, body, styles, object_map, images):
+        seen_instances = set()
+        for p, num_id, ilvl in items:
+            d = self.instances.get(num_id, None)
+            if d is not None:
+                lvl = d.levels.get(ilvl, None)
+                if lvl is not None:
+                    an_id = d.abstract_numbering_definition_id
+                    counter = self.counters[an_id]
+                    if ilvl not in counter or num_id not in seen_instances:
+                        counter[ilvl] = self.starts[num_id][ilvl]
+                    seen_instances.add(num_id)
+                    p.tag = 'li'
+                    p.set('value', '%s' % counter[ilvl])
+                    p.set('list-lvl', unicode_type(ilvl))
+                    p.set('list-id', num_id)
+                    if lvl.num_template is not None:
+                        val = lvl.format_template(counter, ilvl, lvl.num_template)
+                        p.set('list-template', val)
+                    elif lvl.bullet_template is not None:
+                        val = lvl.format_template(counter, ilvl, lvl.bullet_template)
+                        p.set('list-template', val)
+                    self.update_counter(counter, ilvl, d.levels)
+
+        templates = {}
+
+        def commit(current_run):
+            if not current_run:
+                return
+            start = current_run[0]
+            parent = start.getparent()
+            idx = parent.index(start)
+
+            d = self.instances[start.get('list-id')]
+            ilvl = int(start.get('list-lvl'))
+            lvl = d.levels[ilvl]
+            lvlid = start.get('list-id') + start.get('list-lvl')
+            has_template = 'list-template' in start.attrib
+            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
+            if has_template:
+                wrap.set('lvlid', lvlid)
+            else:
+                wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
+            ccss = lvl.char_css()
+            if ccss:
+                ccss = styles.register(ccss, 'bullet')
+            parent.insert(idx, wrap)
+            last_val = None
+            for child in current_run:
+                wrap.append(child)
+                child.tail = '\n\t'
+                if has_template:
+                    span = SPAN()
+                    span.text = child.text
+                    child.text = None
+                    for gc in child:
+                        span.append(gc)
+                    child.append(span)
+                    span = SPAN(child.get('list-template'))
+                    if ccss:
+                        span.set('class', ccss)
+                    last = templates.get(lvlid, '')
+                    if span.text and len(span.text) > len(last):
+                        templates[lvlid] = span.text
+                    child.insert(0, span)
+                for attr in ('list-lvl', 'list-id', 'list-template'):
+                    child.attrib.pop(attr, None)
+                val = int(child.get('value'))
+                if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
+                    child.attrib.pop('value')
+                last_val = val
+            current_run[-1].tail = '\n'
+            del current_run[:]
+
+        parents = set()
+        for child in body.iterdescendants('li'):
+            parents.add(child.getparent())
+
+        for parent in parents:
+            current_run = []
+            for child in parent:
+                if child.tag == 'li':
+                    if current_run:
+                        last = current_run[-1]
+                        if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
+                            commit(current_run)
+                    current_run.append(child)
+                else:
+                    commit(current_run)
+            commit(current_run)
+
+        # Convert the list items that use custom text for bullets into tables
+        # so that they display correctly
+        for wrap in body.xpath('//ol[@lvlid]'):
+            wrap.attrib.pop('lvlid')
+            wrap.tag = 'div'
+            wrap.set('style', 'display:table')
+            for i, li in enumerate(wrap.iterchildren('li')):
+                li.tag = 'div'
+                li.attrib.pop('value', None)
+                li.set('style', 'display:table-row')
+                obj = object_map[li]
+                bs = styles.para_cache[obj]
+                if i == 0:
+                    wrap.set('style', 'display:table; padding-left:%s' %
+                             bs.css.get('margin-left', '0'))
+                bs.css.pop('margin-left', None)
+                for child in li:
+                    child.set('style', 'display:table-cell')
--- a/ebook_converter/ebooks/docx/settings.py
+++ b/ebook_converter/ebooks/docx/settings.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class Settings(object):
+
+    def __init__(self, namespace):
+        self.default_tab_stop = 720 / 20
+        self.namespace = namespace
+
+    def __call__(self, root):
+        for dts in self.namespace.XPath('//w:defaultTabStop[@w:val]')(root):
+            try:
+                self.default_tab_stop = int(self.namespace.get(dts, 'w:val')) / 20
+            except (ValueError, TypeError, AttributeError):
+                pass
+
--- a/ebook_converter/ebooks/docx/styles.py
+++ b/ebook_converter/ebooks/docx/styles.py
@@ -0,0 +1,504 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import textwrap
+from collections import OrderedDict, Counter
+
+from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
+from calibre.ebooks.docx.char_styles import RunStyle
+from calibre.ebooks.docx.tables import TableStyle
+from polyglot.builtins import iteritems, itervalues
+
+
+class PageProperties(object):
+
+    '''
+    Page level properties (page size and left/right margins) collected from
+    zero or more sectPr elements. Values are run through twips() before being
+    stored; when several elements define the same property the last one wins.
+    '''
+
+    def __init__(self, namespace, elems=()):
+        # Defaults used when no sectPr overrides them
+        self.width = 595.28  # pts, A4
+        self.height = 841.89  # pts, A4
+        self.margin_left = 72  # pts
+        self.margin_right = 72  # pts
+
+        def apply(attr, raw):
+            # Keep the existing value when twips() cannot convert raw
+            converted = twips(raw)
+            if converted is not None:
+                setattr(self, attr, converted)
+
+        for sect in elems:
+            for size in namespace.XPath('./w:pgSz')(sect):
+                page_width = namespace.get(size, 'w:w')
+                page_height = namespace.get(size, 'w:h')
+                apply('width', page_width)
+                apply('height', page_height)
+            for margins in namespace.XPath('./w:pgMar')(sect):
+                left = namespace.get(margins, 'w:left')
+                right = namespace.get(margins, 'w:right')
+                apply('margin_left', left)
+                apply('margin_right', right)
+
+
+class Style(object):
+    '''
+    Class representing a <w:style> element. Can contain block, character, etc. styles.
+
+    After construction these attributes are always present (any of them may
+    be None): style_id, style_type, name, based_on, paragraph_style,
+    character_style, table_style and numbering_style_link. is_default is a
+    bool and resolved is False until the owner marks the style as resolved.
+    '''
+
+    def __init__(self, namespace, elem):
+        self.namespace = namespace
+        self.name_path = namespace.XPath('./w:name[@w:val]')
+        self.based_on_path = namespace.XPath('./w:basedOn[@w:val]')
+        self.resolved = False
+        self.style_id = namespace.get(elem, 'w:styleId')
+        self.style_type = namespace.get(elem, 'w:type')
+        names = self.name_path(elem)
+        self.name = namespace.get(names[-1], 'w:val') if names else None
+        based_on = self.based_on_path(elem)
+        self.based_on = namespace.get(based_on[0], 'w:val') if based_on else None
+        if self.style_type == 'numbering':
+            # basedOn is deliberately ignored for numbering styles
+            self.based_on = None
+        self.is_default = namespace.get(elem, 'w:default') in {'1', 'on', 'true'}
+
+        self.paragraph_style = self.character_style = self.table_style = None
+        # Always defined, so that it can be read on every style type without
+        # risking an AttributeError
+        self.numbering_style_link = None
+
+        if self.style_type in {'paragraph', 'character', 'table'}:
+            # In each group below the first matching element creates the
+            # style object and any further ones are merged into it
+            if self.style_type == 'table':
+                for tblPr in namespace.XPath('./w:tblPr')(elem):
+                    ts = TableStyle(namespace, tblPr)
+                    if self.table_style is None:
+                        self.table_style = ts
+                    else:
+                        self.table_style.update(ts)
+            if self.style_type in {'paragraph', 'table'}:
+                for pPr in namespace.XPath('./w:pPr')(elem):
+                    ps = ParagraphStyle(namespace, pPr)
+                    if self.paragraph_style is None:
+                        self.paragraph_style = ps
+                    else:
+                        self.paragraph_style.update(ps)
+
+            for rPr in namespace.XPath('./w:rPr')(elem):
+                rs = RunStyle(namespace, rPr)
+                if self.character_style is None:
+                    self.character_style = rs
+                else:
+                    self.character_style.update(rs)
+
+        if self.style_type in {'numbering', 'paragraph'}:
+            # The last numId found wins
+            for x in namespace.XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
+                self.numbering_style_link = namespace.get(x, 'w:val')
+
+    def resolve_based_on(self, parent):
+        '''
+        Inherit from parent (the style this one is based on). For every kind
+        of style the parent defines, an empty style of the same kind is
+        created here if needed and then resolved against the parent's.
+        '''
+        if parent.table_style is not None:
+            if self.table_style is None:
+                self.table_style = TableStyle(self.namespace)
+            self.table_style.resolve_based_on(parent.table_style)
+        if parent.paragraph_style is not None:
+            if self.paragraph_style is None:
+                self.paragraph_style = ParagraphStyle(self.namespace)
+            self.paragraph_style.resolve_based_on(parent.paragraph_style)
+        if parent.character_style is not None:
+            if self.character_style is None:
+                self.character_style = RunStyle(self.namespace)
+            self.character_style.resolve_based_on(parent.character_style)
+
+
+class Styles(object):
+
+    '''
+    Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
+
+    Typical use, as far as this class shows: call the instance with the
+    styles root to load the definitions, call resolve_numbering() to attach
+    the numbering object, resolve paragraphs/runs (the results are cached),
+    then generate_classes() and generate_css() to produce the stylesheet.
+    '''
+
+    def __init__(self, namespace, tables):
+        self.namespace = namespace
+        self.id_map = OrderedDict()  # style id -> Style
+        self.para_cache = {}  # paragraph element -> resolved ParagraphStyle
+        self.para_char_cache = {}  # paragraph element -> character style inherited by its runs
+        self.run_cache = {}  # run element -> resolved RunStyle
+        self.classes = {}  # frozenset of css items -> (class name, css)
+        self.counter = Counter()  # class name prefix -> number of classes created
+        self.default_styles = {}  # style type -> Style marked as the default
+        self.tables = tables
+        self.numbering_style_links = {}  # style id -> linked numbering id
+        self.default_paragraph_style = self.default_character_style = None
+
+    def __iter__(self):
+        for s in itervalues(self.id_map):
+            yield s
+
+    def __getitem__(self, key):
+        return self.id_map[key]
+
+    def __len__(self):
+        return len(self.id_map)
+
+    def get(self, key, default=None):
+        ''' Return the Style with id key, or default if there is none '''
+        return self.id_map.get(key, default)
+
+    def __call__(self, root, fonts, theme):
+        '''
+        Load every <w:style> and the document defaults from root (which may
+        be None) and then resolve the basedOn inheritance between the styles.
+        fonts and theme are stored for later use by resolve_run() and
+        generate_css().
+        '''
+        self.fonts, self.theme = fonts, theme
+        self.default_paragraph_style = self.default_character_style = None
+        if root is not None:
+            for s in self.namespace.XPath('//w:style')(root):
+                s = Style(self.namespace, s)
+                if s.style_id:
+                    self.id_map[s.style_id] = s
+                if s.is_default:
+                    self.default_styles[s.style_type] = s
+                if getattr(s, 'numbering_style_link', None) is not None:
+                    self.numbering_style_links[s.style_id] = s.numbering_style_link
+
+            for dd in self.namespace.XPath('./w:docDefaults')(root):
+                for pd in self.namespace.XPath('./w:pPrDefault')(dd):
+                    for pPr in self.namespace.XPath('./w:pPr')(pd):
+                        ps = ParagraphStyle(self.namespace, pPr)
+                        if self.default_paragraph_style is None:
+                            self.default_paragraph_style = ps
+                        else:
+                            self.default_paragraph_style.update(ps)
+                for pd in self.namespace.XPath('./w:rPrDefault')(dd):
+                    for pPr in self.namespace.XPath('./w:rPr')(pd):
+                        ps = RunStyle(self.namespace, pPr)
+                        if self.default_character_style is None:
+                            self.default_character_style = ps
+                        else:
+                            self.default_character_style.update(ps)
+
+        # Styles whose resolution has started but not yet finished. A
+        # malformed document can contain a basedOn chain that loops back on
+        # itself; without this guard such a loop would recurse forever.
+        in_progress = set()
+
+        def resolve(s, p):
+            in_progress.add(s)
+            if p is not None:
+                if not p.resolved and p not in in_progress:
+                    resolve(p, self.get(p.based_on))
+                s.resolve_based_on(p)
+            s.resolved = True
+            in_progress.discard(s)
+
+        for s in self:
+            if not s.resolved:
+                resolve(s, self.get(s.based_on))
+
+    def para_val(self, parent_styles, direct_formatting, attr):
+        '''
+        Value of the paragraph property attr: the direct formatting if it is
+        set, otherwise the value from the last style in parent_styles that
+        sets it, otherwise inherit.
+        '''
+        val = getattr(direct_formatting, attr)
+        if val is inherit:
+            for ps in reversed(parent_styles):
+                pval = getattr(ps, attr)
+                if pval is not inherit:
+                    val = pval
+                    break
+        return val
+
+    def run_val(self, parent_styles, direct_formatting, attr):
+        '''
+        Value of the run property attr. Direct formatting always wins. Toggle
+        properties are combined across the parent styles (see the comment
+        below), all other properties come from the last parent style that
+        sets them.
+        '''
+        val = getattr(direct_formatting, attr)
+        if val is not inherit:
+            return val
+        if attr in direct_formatting.toggle_properties:
+            # The spec (section 17.7.3) does not make sense, so we follow the behavior
+            # of Word, which seems to only consider the document default if the
+            # property has not been defined in any styles.
+            vals = [int(getattr(rs, attr)) for rs in parent_styles if rs is not self.default_character_style and getattr(rs, attr) is not inherit]
+            if vals:
+                # An odd number of styles turning the property on leaves it on
+                return sum(vals) % 2 == 1
+            if self.default_character_style is not None:
+                return getattr(self.default_character_style, attr) is True
+            return False
+        for rs in reversed(parent_styles):
+            rval = getattr(rs, attr)
+            if rval is not inherit:
+                return rval
+        return val
+
+    def resolve_paragraph(self, p):
+        '''
+        Return the effective ParagraphStyle for the paragraph element p,
+        computing and caching it on first use. Also records in
+        para_char_cache the character style that runs inside p inherit and,
+        for numbered paragraphs, sets the calibre_num_id attribute on p.
+        Requires resolve_numbering() to have been called if p is numbered.
+        '''
+        ans = self.para_cache.get(p, None)
+        if ans is None:
+            linked_style = None
+            ans = self.para_cache[p] = ParagraphStyle(self.namespace)
+            ans.style_name = None
+            direct_formatting = None
+            is_section_break = False
+            for pPr in self.namespace.XPath('./w:pPr')(p):
+                ps = ParagraphStyle(self.namespace, pPr)
+                if direct_formatting is None:
+                    direct_formatting = ps
+                else:
+                    direct_formatting.update(ps)
+                if self.namespace.XPath('./w:sectPr')(pPr):
+                    is_section_break = True
+
+            if direct_formatting is None:
+                direct_formatting = ParagraphStyle(self.namespace)
+            # Ordered from lowest to highest priority, para_val() searches
+            # this list in reverse
+            parent_styles = []
+            if self.default_paragraph_style is not None:
+                parent_styles.append(self.default_paragraph_style)
+            ts = self.tables.para_style(p)
+            if ts is not None:
+                parent_styles.append(ts)
+
+            default_para = self.default_styles.get('paragraph', None)
+            if direct_formatting.linked_style is not None:
+                ls = linked_style = self.get(direct_formatting.linked_style)
+                if ls is not None:
+                    ans.style_name = ls.name
+                    ps = ls.paragraph_style
+                    if ps is not None:
+                        parent_styles.append(ps)
+                    if ls.character_style is not None:
+                        self.para_char_cache[p] = ls.character_style
+            elif default_para is not None:
+                if default_para.paragraph_style is not None:
+                    parent_styles.append(default_para.paragraph_style)
+                if default_para.character_style is not None:
+                    self.para_char_cache[p] = default_para.character_style
+
+            def has_numbering(block_style):
+                num_id, lvl = getattr(block_style, 'numbering_id', inherit), getattr(block_style, 'numbering_level', inherit)
+                return num_id is not None and num_id is not inherit and lvl is not None and lvl is not inherit
+
+            is_numbering = has_numbering(direct_formatting)
+            # Only a paragraph without any runs counts as a section break
+            is_section_break = is_section_break and not self.namespace.XPath('./w:r')(p)
+
+            if is_numbering and not is_section_break:
+                num_id, lvl = direct_formatting.numbering_id, direct_formatting.numbering_level
+                p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
+                ps = self.numbering.get_para_style(num_id, lvl)
+                if ps is not None:
+                    parent_styles.append(ps)
+            if (
+                not is_numbering and not is_section_break and linked_style is not None and has_numbering(linked_style.paragraph_style)
+            ):
+                # Numbering that comes from the linked style rather than from
+                # the direct formatting
+                num_id, lvl = linked_style.paragraph_style.numbering_id, linked_style.paragraph_style.numbering_level
+                p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
+                is_numbering = True
+                ps = self.numbering.get_para_style(num_id, lvl)
+                if ps is not None:
+                    parent_styles.append(ps)
+
+            for attr in ans.all_properties:
+                if not (is_numbering and attr == 'text_indent'):  # skip text-indent for lists
+                    setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
+            ans.linked_style = direct_formatting.linked_style
+        return ans
+
+    def resolve_run(self, r):
+        '''
+        Return the effective RunStyle for the run element r, computing and
+        caching it on first use. The character style recorded for the
+        enclosing paragraph is only picked up if that paragraph has already
+        been passed to resolve_paragraph().
+        '''
+        ans = self.run_cache.get(r, None)
+        if ans is None:
+            p = self.namespace.XPath('ancestor::w:p[1]')(r)
+            p = p[0] if p else None
+            ans = self.run_cache[r] = RunStyle(self.namespace)
+            direct_formatting = None
+            for rPr in self.namespace.XPath('./w:rPr')(r):
+                rs = RunStyle(self.namespace, rPr)
+                if direct_formatting is None:
+                    direct_formatting = rs
+                else:
+                    direct_formatting.update(rs)
+
+            if direct_formatting is None:
+                direct_formatting = RunStyle(self.namespace)
+
+            # Ordered from lowest to highest priority
+            parent_styles = []
+            default_char = self.default_styles.get('character', None)
+            if self.default_character_style is not None:
+                parent_styles.append(self.default_character_style)
+            pstyle = self.para_char_cache.get(p, None)
+            if pstyle is not None:
+                parent_styles.append(pstyle)
+            # As best as I can understand the spec, table overrides should be
+            # applied before paragraph overrides, but word does it
+            # this way, see the December 2007 table header in the demo
+            # document.
+            ts = self.tables.run_style(p)
+            if ts is not None:
+                parent_styles.append(ts)
+            if direct_formatting.linked_style is not None:
+                ls = getattr(self.get(direct_formatting.linked_style), 'character_style', None)
+                if ls is not None:
+                    parent_styles.append(ls)
+            elif default_char is not None and default_char.character_style is not None:
+                parent_styles.append(default_char.character_style)
+
+            for attr in ans.all_properties:
+                setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
+
+            if ans.font_family is not inherit:
+                ff = self.theme.resolve_font_family(ans.font_family)
+                ans.font_family = self.fonts.family_for(ff, ans.b, ans.i)
+
+        return ans
+
+    def resolve(self, obj):
+        ''' Resolve the style of a paragraph or run element. Returns None for
+        any other kind of element. '''
+        if obj.tag.endswith('}p'):
+            return self.resolve_paragraph(obj)
+        if obj.tag.endswith('}r'):
+            return self.resolve_run(obj)
+
+    def cascade(self, layers):
+        '''
+        Reduce the amount of CSS needed. layers maps paragraph elements to
+        their runs. Character properties shared by all runs of a paragraph
+        are moved up to the paragraph, then the most common font family, font
+        size and color of the paragraphs become body_font_family,
+        body_font_size and body_color.
+        '''
+        self.body_font_family = 'serif'
+        self.body_font_size = '10pt'
+        self.body_color = 'black'
+
+        def promote_property(char_styles, block_style, prop):
+            vals = {getattr(s, prop) for s in char_styles}
+            if len(vals) == 1:
+                # All the character styles have the same value
+                for s in char_styles:
+                    setattr(s, prop, inherit)
+                setattr(block_style, prop, next(iter(vals)))
+
+        for p, runs in iteritems(layers):
+            has_links = '1' in {r.get('is-link', None) for r in runs}
+            char_styles = [self.resolve_run(r) for r in runs]
+            block_style = self.resolve_paragraph(p)
+            for prop in ('font_family', 'font_size', 'cs_font_family', 'cs_font_size', 'color'):
+                if has_links and prop == 'color':
+                    # We cannot promote color as browser rendering engines will
+                    # override the link color setting it to blue, unless the
+                    # color is specified on the link element itself
+                    continue
+                promote_property(char_styles, block_style, prop)
+            for s in char_styles:
+                if s.text_decoration == 'none':
+                    # The default text decoration is 'none'
+                    s.text_decoration = inherit
+
+        def promote_most_common(block_styles, prop, default):
+            c = Counter()
+            for s in block_styles:
+                val = getattr(s, prop)
+                if val is not inherit:
+                    c[val] += 1
+            val = None
+            if c:
+                val = c.most_common(1)[0][0]
+                for s in block_styles:
+                    oval = getattr(s, prop)
+                    if oval is inherit:
+                        # This block relied on the old body default, which is
+                        # about to be replaced by val, so make it explicit
+                        if default != val:
+                            setattr(s, prop, default)
+                    elif oval == val:
+                        setattr(s, prop, inherit)
+            return val
+
+        block_styles = tuple(self.resolve_paragraph(p) for p in layers)
+
+        ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
+        if ff is not None:
+            self.body_font_family = ff
+
+        # body_font_size is still the '10pt' assigned above at this point
+        fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
+        if fs is not None:
+            self.body_font_size = '%.3gpt' % fs
+
+        color = promote_most_common(block_styles, 'color', self.body_color)
+        if color is not None:
+            self.body_color = color
+
+    def resolve_numbering(self, numbering):
+        ''' Store numbering for later use by resolve_paragraph() and fix up
+        the numbering levels of the paragraph styles. '''
+        # When a numPr element appears inside a paragraph style, the lvl info
+        # must be discarded and pStyle used instead.
+        self.numbering = numbering
+        for style in self:
+            ps = style.paragraph_style
+            if ps is not None and ps.numbering_id is not inherit:
+                lvl = numbering.get_pstyle(ps.numbering_id, style.style_id)
+                if lvl is None:
+                    ps.numbering_id = ps.numbering_level = inherit
+                else:
+                    ps.numbering_level = lvl
+
+    def apply_contextual_spacing(self, paras):
+        ''' For consecutive paragraphs that share the same linked style, zero
+        the facing margins of those that have contextualSpacing turned on. '''
+        last_para = None
+        for p in paras:
+            if last_para is not None:
+                ls = self.resolve_paragraph(last_para)
+                ps = self.resolve_paragraph(p)
+                if ls.linked_style is not None and ls.linked_style == ps.linked_style:
+                    if ls.contextualSpacing is True:
+                        ls.margin_bottom = 0
+                    if ps.contextualSpacing is True:
+                        ps.margin_top = 0
+            last_para = p
+
+    def apply_section_page_breaks(self, paras):
+        ''' Force a page break before every paragraph in paras '''
+        for p in paras:
+            ps = self.resolve_paragraph(p)
+            ps.pageBreakBefore = True
+
+    @staticmethod
+    def _css_key(css):
+        ''' Hashable key identifying the exact set of declarations in css.
+        The frozenset itself is used rather than its hash, so that two
+        different sets of declarations whose hashes happen to collide can
+        never end up sharing a class. '''
+        return frozenset(css.items())
+
+    def register(self, css, prefix):
+        ''' Return the class name for css, creating a new one of the form
+        prefix_N if these exact declarations have not been seen before. '''
+        key = self._css_key(css)
+        ans, _ = self.classes.get(key, (None, None))
+        if ans is None:
+            self.counter[prefix] += 1
+            ans = '%s_%d' % (prefix, self.counter[prefix])
+            self.classes[key] = (ans, css)
+        return ans
+
+    def generate_classes(self):
+        ''' Register a class for every non-empty css of the paragraph and run
+        styles resolved so far. '''
+        for bs in itervalues(self.para_cache):
+            css = bs.css
+            if css:
+                self.register(css, 'block')
+        for bs in itervalues(self.run_cache):
+            css = bs.css
+            if css:
+                self.register(css, 'text')
+
+    def class_name(self, css):
+        ''' Return the class name registered for css or None '''
+        return self.classes.get(self._css_key(css), (None, None))[0]
+
+    def generate_css(self, dest_dir, docx, notes_nopb, nosupsub):
+        '''
+        Return the complete stylesheet as a string: the output of
+        fonts.embed_fonts() (if any), a fixed set of base rules and one rule
+        per registered class, sorted by class name. notes_nopb suppresses the
+        page break rules for footnotes and nosupsub adds vertical-align rules
+        for sup and sub. cascade() must have been called first, as the body
+        rule uses the values it computes.
+        '''
+        ef = self.fonts.embed_fonts(dest_dir, docx)
+
+        s = '''\
+            body { font-family: %s; font-size: %s; color: %s }
+
+            /* In word all paragraphs have zero margins unless explicitly specified in a style */
+            p, h1, h2, h3, h4, h5, h6, div { margin: 0; padding: 0 }
+            /* In word headings only have bold font if explicitly specified,
+                similarly the font size is the body font size, unless explicitly set. */
+            h1, h2, h3, h4, h5, h6 { font-weight: normal; font-size: 1rem }
+            /* Setting padding-left to zero breaks rendering of lists, so we only set the other values to zero and leave padding-left for the user-agent */
+            ul, ol { margin: 0; padding-top: 0; padding-bottom: 0; padding-right: 0 }
+
+            /* The word hyperlink styling will set text-decoration to underline if needed */
+            a { text-decoration: none }
+
+            sup.noteref a { text-decoration: none }
+
+            h1.notes-header { page-break-before: always }
+
+            dl.footnote dt { font-size: large }
+
+            dl.footnote dt a { text-decoration: none }
+
+            '''
+
+        if not notes_nopb:
+            s += '''\
+            dl.footnote { page-break-after: always }
+            dl.footnote:last-of-type { page-break-after: avoid }
+            '''
+
+        s = s + '''\
+            span.tab { white-space: pre }
+
+            p.index-entry { text-indent: 0pt; }
+            p.index-entry a:visited { color: blue }
+            p.index-entry a:hover { color: red }
+            '''
+
+        if nosupsub:
+            s = s + '''\
+               sup { vertical-align: top }
+               sub { vertical-align: bottom }
+               '''
+
+        prefix = textwrap.dedent(s) % (self.body_font_family, self.body_font_size, self.body_color)
+        if ef:
+            prefix = ef + '\n' + prefix
+
+        ans = []
+        for (cls, css) in sorted(itervalues(self.classes), key=lambda x:x[0]):
+            b = ('\t%s: %s;' % (k, v) for k, v in iteritems(css))
+            b = '\n'.join(b)
+            ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
+        return prefix + '\n' + '\n'.join(ans)
--- a/ebook_converter/ebooks/docx/tables.py
+++ b/ebook_converter/ebooks/docx/tables.py
@@ -0,0 +1,700 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from lxml.html.builder import TABLE, TR, TD
+
+from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle, border_to_css
+from calibre.ebooks.docx.char_styles import RunStyle
+from polyglot.builtins import filter, iteritems, itervalues, range, unicode_type
+
+# Read from XML {{{
+# The read_* functions in this section are looked up by name, via
+# globals()['read_%s' % x], by the style classes further down. read_shd is
+# re-exported under that name so that the 'shd' lookup succeeds.
+read_shd = rs
+# The four sides of a table or cell, used for padding and border properties
+edges = ('left', 'top', 'right', 'bottom')
+
+
+def _read_width(elem, get):
+    '''
+    Convert the w:w and w:type attributes of elem into a CSS value: '0' for
+    type nil, 'auto' for type auto (also used when w:type is missing), w:w/20
+    in pt for type dxa, w:w/50 as a percentage for type pct and inherit for
+    any other type. A missing or non-integer w:w is treated as 0.
+    '''
+    try:
+        amount = int(get(elem, 'w:w'))
+    except (TypeError, ValueError):
+        amount = 0
+    kind = get(elem, 'w:type', 'auto')
+    if kind == 'nil':
+        return '0'
+    if kind == 'auto':
+        return 'auto'
+    if kind == 'dxa':
+        return '%.3gpt' % (amount/20)
+    if kind == 'pct':
+        return '%.3g%%' % (amount/50)
+    return inherit
+
+
+def read_width(parent, dest, XPath, get):
+    ''' Set dest.width from the w:tblW child of parent, or to inherit if
+    there is none. If there are several, the last one wins. '''
+    width = inherit
+    for elem in XPath('./w:tblW')(parent):
+        width = _read_width(elem, get)
+    dest.width = width
+
+
+def read_cell_width(parent, dest, XPath, get):
+    ''' Set dest.width from the w:tcW child of parent, or to inherit if
+    there is none. If there are several, the last one wins. '''
+    width = inherit
+    for elem in XPath('./w:tcW')(parent):
+        width = _read_width(elem, get)
+    dest.width = width
+
+
+def read_padding(parent, dest, XPath, get):
+    '''
+    Set dest.cell_padding_left/top/right/bottom from the cell margins
+    element of parent (w:tblCellMar when parent is a tblPr, w:tcMar
+    otherwise). Sides that are not specified are set to inherit.
+    '''
+    if parent.tag.endswith('}tblPr'):
+        container = 'tblCellMar'
+    else:
+        container = 'tcMar'
+    padding = dict.fromkeys(edges, inherit)
+    for margins in XPath('./w:%s' % container)(parent):
+        for side in edges:
+            for elem in XPath('./w:%s' % side)(margins):
+                padding[side] = _read_width(elem, get)
+    for side in edges:
+        setattr(dest, 'cell_padding_%s' % side, padding[side])
+
+
+def read_justification(parent, dest, XPath, get):
+    '''
+    Translate the w:jc children of parent into dest.margin_left and
+    dest.margin_right: 'left' makes the right margin auto, 'right' the left
+    margin and 'center' both. Margins that are not affected stay inherit.
+    '''
+    margin_left = margin_right = inherit
+    for jc in XPath('./w:jc[@w:val]')(parent):
+        align = get(jc, 'w:val')
+        # Empty and unrecognised values match neither test and are ignored
+        if align in ('right', 'center'):
+            margin_left = 'auto'
+        if align in ('left', 'center'):
+            margin_right = 'auto'
+    dest.margin_left = margin_left
+    dest.margin_right = margin_right
+
+
+def read_spacing(parent, dest, XPath, get):
+    ''' Set dest.spacing from the w:tblCellSpacing child of parent, or to
+    inherit if there is none. If there are several, the last one wins. '''
+    spacing = inherit
+    for elem in XPath('./w:tblCellSpacing')(parent):
+        spacing = _read_width(elem, get)
+    dest.spacing = spacing
+
+
+def read_float(parent, dest, XPath, get):
+    ''' Set dest.float to a dict of the attributes of the w:tblpPr child of
+    parent, keyed by attribute name without the namespace, or to inherit if
+    there is no such child. '''
+    position = inherit
+    for tblpPr in XPath('./w:tblpPr')(parent):
+        position = {}
+        for qname, value in iteritems(tblpPr.attrib):
+            # Keep only what follows the closing brace of {namespace}name
+            position[qname.rpartition('}')[-1]] = value
+    dest.float = position
+
+
+def read_indent(parent, dest, XPath, get):
+    ''' Set dest.indent from the w:tblInd child of parent, or to inherit if
+    there is none. If there are several, the last one wins. '''
+    indent = inherit
+    for elem in XPath('./w:tblInd')(parent):
+        indent = _read_width(elem, get)
+    dest.indent = indent
+
+
+# Edges for which border properties exist on tables and cells. insideH and
+# insideV are presumably the borders between rows and between columns.
+border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
+
+
+def read_borders(parent, dest, XPath, get):
+    ''' Read the border properties for all of border_edges into dest, from
+    w:tblBorders when parent is a tblPr and from w:tcBorders otherwise. '''
+    if parent.tag.endswith('}tblPr'):
+        container = 'tblBorders'
+    else:
+        container = 'tcBorders'
+    read_border(parent, dest, XPath, get, border_edges, container)
+
+
+def read_height(parent, dest, XPath, get):
+    ''' Set dest.height to a (rule, value) tuple taken from the w:trHeight
+    child of parent, where rule is one of auto, atLeast or exact and value is
+    the unconverted w:val attribute. Set it to inherit if there is no usable
+    element. '''
+    height = inherit
+    for elem in XPath('./w:trHeight')(parent):
+        rule = get(elem, 'w:hRule', 'auto')
+        if rule not in {'auto', 'atLeast', 'exact'}:
+            continue
+        height = (rule, get(elem, 'w:val'))
+    dest.height = height
+
+
+def read_vertical_align(parent, dest, XPath, get):
+    ''' Set dest.vertical_align to the CSS equivalent of the w:vAlign child
+    of parent (any unrecognised value becomes middle), or to inherit if there
+    is no such child. '''
+    css_values = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}
+    align = inherit
+    for elem in XPath('./w:vAlign')(parent):
+        align = css_values.get(get(elem, 'w:val'), 'middle')
+    dest.vertical_align = align
+
+
+def read_col_span(parent, dest, XPath, get):
+    ''' Set dest.col_span to the integer value of the w:gridSpan child of
+    parent. Elements without a valid integer are ignored; if none is usable
+    the result is inherit. '''
+    span = inherit
+    for elem in XPath('./w:gridSpan')(parent):
+        try:
+            span = int(get(elem, 'w:val'))
+        except (TypeError, ValueError):
+            pass
+    dest.col_span = span
+
+
+def read_merge(parent, dest, XPath, get):
+    ''' Set dest.hMerge and dest.vMerge from the children of parent with the
+    same names. An element without w:val counts as 'continue', a missing
+    element gives inherit. '''
+    for attr in ('hMerge', 'vMerge'):
+        state = inherit
+        for elem in XPath('./w:%s' % attr)(parent):
+            state = get(elem, 'w:val', 'continue')
+        setattr(dest, attr, state)
+
+
+def read_band_size(parent, dest, XPath, get):
+    ''' Set dest.col_band_size and dest.row_band_size from the
+    w:tblStyleColBandSize and w:tblStyleRowBandSize children of parent.
+    Each defaults to 1; values that are not integers are ignored. '''
+    for axis in ('Col', 'Row'):
+        size = 1
+        for elem in XPath('./w:tblStyle%sBandSize' % axis)(parent):
+            try:
+                size = int(get(elem, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+        setattr(dest, '%s_band_size' % axis.lower(), size)
+
+
+def read_look(parent, dest, XPath, get):
+    ''' Set dest.look to the w:val attribute of the w:tblLook child of
+    parent, parsed as a hexadecimal integer. Values that cannot be parsed
+    are ignored; if none is usable the result is 0. '''
+    look = 0
+    for elem in XPath('./w:tblLook')(parent):
+        try:
+            look = int(get(elem, 'w:val'), 16)
+        except (ValueError, TypeError):
+            pass
+    dest.look = look
+
+# }}}
+
+
+def clone(style):
+    ''' Return a new style of the same type as style, with the same
+    namespace and with style's explicitly set properties copied over via
+    update(). Returns None if style is None or if its type cannot be
+    constructed from just a namespace. '''
+    if style is not None:
+        try:
+            duplicate = type(style)(style.namespace)
+        except TypeError:
+            pass
+        else:
+            duplicate.update(style)
+            return duplicate
+    return None
+
+
+class Style(object):
+
+    '''
+    Behaviour shared by the table, row and cell style classes. Subclasses
+    provide all_properties, the names of the attributes making up the style.
+    '''
+
+    is_bidi = False
+
+    def update(self, other):
+        ''' Copy every property that is explicitly set on other (i.e. is not
+        inherit) onto this style. '''
+        for name in self.all_properties:
+            incoming = getattr(other, name)
+            if incoming is inherit:
+                continue
+            setattr(self, name, incoming)
+
+    def apply_bidi(self):
+        ''' Mark this style as bidi, which makes convert_border() copy left
+        values to the right and right values to the left. '''
+        self.is_bidi = True
+
+    def convert_spacing(self):
+        ''' Return the CSS border-collapse/border-spacing declarations for
+        self.spacing, or an empty dict if spacing is not set. '''
+        if self.spacing is inherit:
+            return {}
+        if self.spacing in {'auto', '0'}:
+            return {'border-collapse': 'collapse'}
+        return {'border-collapse': 'separate', 'border-spacing': self.spacing}
+
+    def convert_border(self):
+        ''' Return the CSS border and padding declarations for the four
+        sides, with left and right exchanged for bidi styles. '''
+        css = {}
+        for edge in edges:
+            border_to_css(edge, self, css)
+            padding = getattr(self, 'padding_%s' % edge)
+            if padding is not inherit:
+                css['padding-%s' % edge] = '%.3gpt' % padding
+        if not self.is_bidi:
+            return css
+        for template in ('padding-%s', 'border-%s-style', 'border-%s-color', 'border-%s-width'):
+            left_key, right_key = template % 'left', template % 'right'
+            # Both values are read before either is written
+            left_val, right_val = css.get(left_key), css.get(right_key)
+            if left_val is not None:
+                css[right_key] = left_val
+            if right_val is not None:
+                css[left_key] = right_val
+        return css
+
+
+class RowStyle(Style):
+
+    ''' The properties of a table row, read from a trPr element. Without a
+    trPr every property is inherit. '''
+
+    all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)
+
+    def __init__(self, namespace, trPr=None):
+        self.namespace = namespace
+        self._css = None  # filled in on first access of the css property
+        if trPr is None:
+            for name in self.all_properties:
+                setattr(self, name, inherit)
+            return
+        XPath, get = namespace.XPath, namespace.get
+        for name in ('hidden', 'cantSplit'):
+            setattr(self, name, binary_property(trPr, name, XPath, get))
+        for name in ('spacing', 'height'):
+            reader = globals()['read_%s' % name]
+            reader(trPr, self, XPath, get)
+
+    @property
+    def css(self):
+        ''' The CSS declarations for this row as a dict, built once and then
+        cached. '''
+        if self._css is not None:
+            return self._css
+        css = self._css = {}
+        if self.hidden is True:
+            css['display'] = 'none'
+        if self.cantSplit is True:
+            css['page-break-inside'] = 'avoid'
+        if self.height is not inherit:
+            rule, val = self.height
+            if rule != 'auto':
+                prop = 'min-height' if rule == 'atLeast' else 'height'
+                try:
+                    css[prop] = '%.3gpt' % (int(val)/20)
+                except (ValueError, TypeError):
+                    # No usable height value, emit nothing for it
+                    pass
+        css.update(self.convert_spacing())
+        return self._css
+
+
+class CellStyle(Style):
+
+    ''' The properties of a table cell, read from a tcPr element. Without a
+    tcPr every property is inherit. '''
+
+    all_properties = ('background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
+        'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
+    ) + tuple(k % edge for edge in border_edges for k in border_props)
+
+    def __init__(self, namespace, tcPr=None):
+        self.namespace = namespace
+        self._css = None  # filled in on first access of the css property
+        if tcPr is None:
+            for name in self.all_properties:
+                setattr(self, name, inherit)
+            return
+        XPath, get = namespace.XPath, namespace.get
+        for what in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'):
+            reader = globals()['read_%s' % what]
+            reader(tcPr, self, XPath, get)
+        # Not read from tcPr
+        self.row_span = inherit
+
+    @property
+    def css(self):
+        ''' The CSS declarations for this cell as a dict, built once and then
+        cached. '''
+        if self._css is not None:
+            return self._css
+        self._css = css = {}
+        if self.background_color is not inherit:
+            css['background-color'] = self.background_color
+        if self.width not in (inherit, 'auto'):
+            css['width'] = self.width
+        if self.vertical_align is inherit:
+            css['vertical-align'] = 'top'
+        else:
+            css['vertical-align'] = self.vertical_align
+        for edge in edges:
+            key = 'padding-%s' % edge
+            padding = getattr(self, 'cell_padding_%s' % edge)
+            if padding not in (inherit, 'auto'):
+                css[key] = padding
+            elif padding is inherit and edge in {'left', 'right'}:
+                # Left/right padding used when none is specified
+                css[key] = '%.3gpt' % (115/20)
+        # In Word, tables are apparently rendered with some default top and
+        # bottom padding irrespective of the cellMargin values. Simulate
+        # that here.
+        for edge in ('top', 'bottom'):
+            key = 'padding-%s' % edge
+            if css.get(key, '0pt') == '0pt':
+                css[key] = '0.5ex'
+        css.update(self.convert_border())
+        return self._css
+
+
+class TableStyle(Style):
+    '''
+    Table level formatting read from a ``w:tblPr`` element.
+
+    Constructed without a ``tblPr``, every property in ``all_properties``
+    holds the ``inherit`` sentinel. When the ``tblPr`` is a direct child of
+    a ``w:style`` element, the conditional formatting found in that style's
+    ``w:tblStylePr`` children is collected into ``self.overrides``, keyed by
+    the ``w:type`` attribute.
+    '''
+
+    # Names of all the attributes that make up this style; the border
+    # attributes are generated for every (edge, property) combination.
+    all_properties = (
+        'width', 'float', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
+        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
+        'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look', 'bidi',
+    ) + tuple(k % edge for edge in border_edges for k in border_props)
+
+    def __init__(self, namespace, tblPr=None):
+        '''
+        :param namespace: Object providing the ``XPath``, ``get`` and
+            ``is_tag`` helpers used to query the DOCX markup
+        :param tblPr: The ``w:tblPr`` element to read, or None to create a
+            style in which everything is ``inherit``
+        '''
+        self.namespace = namespace
+        if tblPr is None:
+            for p in self.all_properties:
+                setattr(self, p, inherit)
+        else:
+            self.overrides = inherit
+            self.bidi = binary_property(tblPr, 'bidiVisual', namespace.XPath, namespace.get)
+            # The remaining properties are filled in by the module level
+            # read_* functions, looked up by name
+            for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
+                f = globals()['read_%s' % x]
+                f(tblPr, self, self.namespace.XPath, self.namespace.get)
+            parent = tblPr.getparent()
+            if self.namespace.is_tag(parent, 'w:style'):
+                # This tblPr belongs to a named style, so also read the
+                # conditional formatting defined by that style. Each entry
+                # maps a w:type value to a dict with (optional) 'table',
+                # 'row', 'cell', 'para' and 'run' style objects.
+                self.overrides = {}
+                for tblStylePr in self.namespace.XPath('./w:tblStylePr[@w:type]')(parent):
+                    otype = self.namespace.get(tblStylePr, 'w:type')
+                    orides = self.overrides[otype] = {}
+                    for tblPr in self.namespace.XPath('./w:tblPr')(tblStylePr):
+                        orides['table'] = TableStyle(self.namespace, tblPr)
+                    for trPr in self.namespace.XPath('./w:trPr')(tblStylePr):
+                        orides['row'] = RowStyle(self.namespace, trPr)
+                    for tcPr in self.namespace.XPath('./w:tcPr')(tblStylePr):
+                        orides['cell'] = CellStyle(self.namespace, tcPr)
+                    for pPr in self.namespace.XPath('./w:pPr')(tblStylePr):
+                        orides['para'] = ParagraphStyle(self.namespace, pPr)
+                    for rPr in self.namespace.XPath('./w:rPr')(tblStylePr):
+                        orides['run'] = RunStyle(self.namespace, rPr)
+        # Cache for the css property
+        self._css = None
+
+    def resolve_based_on(self, parent):
+        'Fill in every property that is still inherit with the value from parent'
+        for p in self.all_properties:
+            val = getattr(self, p)
+            if val is inherit:
+                setattr(self, p, getattr(parent, p))
+
+    @property
+    def css(self):
+        '''
+        CSS declarations for the table as a dict, built on first access and
+        cached. Reads ``self.page`` when the table floats without an
+        explicit ``tblpXSpec``, so ``page`` must have been set by then.
+        '''
+        if self._css is None:
+            c = self._css = {}
+            if self.width not in (inherit, 'auto'):
+                c['width'] = self.width
+            for x in ('background_color', 'margin_left', 'margin_right'):
+                val = getattr(self, x)
+                if val is not inherit:
+                    c[x.replace('_', '-')] = val
+            # An explicit indent replaces the left margin, unless the left
+            # margin is 'auto'
+            if self.indent not in (inherit, 'auto') and self.margin_left != 'auto':
+                c['margin-left'] = self.indent
+            if self.float is not inherit:
+                # Floating table: the <side>FromText distances become the
+                # margins (value / 20, in pt), replacing any set above
+                for x in ('left', 'top', 'right', 'bottom'):
+                    val = self.float.get('%sFromText' % x, 0)
+                    try:
+                        val = '%.3gpt' % (int(val) / 20)
+                    except (ValueError, TypeError):
+                        val = '0'
+                    c['margin-%s' % x] = val
+                if 'tblpXSpec' in self.float:
+                    c['float'] = 'right' if self.float['tblpXSpec'] in {'right', 'outside'} else 'left'
+                else:
+                    # No symbolic horizontal position, decide the float
+                    # direction from where tblpX falls across the width of
+                    # the page between its margins
+                    page = self.page
+                    page_width = page.width - page.margin_left - page.margin_right
+                    try:
+                        x = int(self.float['tblpX']) / 20
+                    except (KeyError, ValueError, TypeError):
+                        x = 0
+                    c['float'] = 'left' if (x/page_width) < 0.65 else 'right'
+            c.update(self.convert_spacing())
+            # Collapse borders unless convert_spacing() already decided
+            if 'border-collapse' not in c:
+                c['border-collapse'] = 'collapse'
+            c.update(self.convert_border())
+
+        return self._css
+
+
+class Table(object):
+
+    def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
+        '''
+        Resolve the styles of a ``w:tbl`` element and of its rows, cells and
+        paragraphs.
+
+        :param namespace: Object providing the XPath/get helpers
+        :param tbl: The ``w:tbl`` element
+        :param styles: The document styles, queried by style id via ``get()``
+        :param para_map: Dict that is updated in place to map every
+            paragraph directly inside a cell of this table to this Table
+        :param is_sub_table: True when this table is nested in another table
+        '''
+        self.namespace = namespace
+        self.tbl = tbl
+        self.styles = styles
+        self.is_sub_table = is_sub_table
+
+        # Read Table Style
+        # Start with an all-inherit table style, merge in the named style
+        # referenced by w:tblStyle and finally the table's own w:tblPr
+        style = {'table':TableStyle(self.namespace)}
+        for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
+            for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
+                style_id = self.namespace.get(ts, 'w:val')
+                s = styles.get(style_id)
+                if s is not None:
+                    if s.table_style is not None:
+                        style['table'].update(s.table_style)
+                    if s.paragraph_style is not None:
+                        if 'paragraph' in style:
+                            style['paragraph'].update(s.paragraph_style)
+                        else:
+                            style['paragraph'] = s.paragraph_style
+                    if s.character_style is not None:
+                        if 'run' in style:
+                            style['run'].update(s.character_style)
+                        else:
+                            style['run'] = s.character_style
+            style['table'].update(TableStyle(self.namespace, tblPr))
+        self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
+        self.run_style = style.get('run', None)
+        # Conditional formatting (overrides) keyed by override name
+        self.overrides = self.table_style.overrides
+        if self.overrides is inherit:
+            self.overrides = {}
+        if 'wholeTable' in self.overrides and 'table' in self.overrides['wholeTable']:
+            self.table_style.update(self.overrides['wholeTable']['table'])
+
+        # Maps w:tr -> RowStyle, w:tc -> CellStyle and
+        # w:p -> [paragraph style, run style]
+        self.style_map = {}
+        self.paragraphs = []
+        # One list of w:tc elements per row, in document order
+        self.cell_map = []
+
+        rows = self.namespace.XPath('./w:tr')(tbl)
+        for r, tr in enumerate(rows):
+            overrides = self.get_overrides(r, None, len(rows), None)
+            self.resolve_row_style(tr, overrides)
+            cells = self.namespace.XPath('./w:tc')(tr)
+            self.cell_map.append([])
+            for c, tc in enumerate(cells):
+                overrides = self.get_overrides(r, c, len(rows), len(cells))
+                self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
+                self.cell_map[-1].append(tc)
+                for p in self.namespace.XPath('./w:p')(tc):
+                    para_map[p] = self
+                    self.paragraphs.append(p)
+                    self.resolve_para_style(p, overrides)
+
+        self.handle_merged_cells()
+        # Tables nested directly inside the cells of this table, keyed by
+        # their w:tbl element
+        self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}
+
+    @property
+    def bidi(self):
+        'True only when the table style has bidi explicitly turned on'
+        flag = self.table_style.bidi
+        return flag is True
+
+    def override_allowed(self, name):
+        'Check if the named override is allowed by the tblLook element'
+        if name == 'wholeTable' or name.endswith('Cell'):
+            return True
+        look = self.table_style.look
+        # First/last row and column overrides apply only when the matching
+        # bit is set in look
+        for mask, allowed in (
+            (0x0020, 'firstRow'), (0x0040, 'lastRow'),
+            (0x0080, 'firstCol'), (0x0100, 'lastCol'),
+        ):
+            if look & mask and name == allowed:
+                return True
+        if not name.startswith('band'):
+            return False
+        # Banding is allowed as long as the matching bit is *not* set
+        if name.endswith('Horz'):
+            return not bool(look & 0x0200)
+        if name.endswith('Vert'):
+            return not bool(look & 0x0400)
+        return False
+
+    def get_overrides(self, r, c, num_of_rows, num_of_cols_in_row):
+        '''
+        Names of the conditional formatting overrides that apply at the
+        given position, restricted to those allowed by override_allowed().
+        The order of the returned tuple is significant: callers apply the
+        overrides in sequence, so later names win over earlier ones.
+
+        :param r: Zero based row index
+        :param c: Zero based cell index in the row, or None to get only
+            the row level overrides
+        :param num_of_rows: Number of rows in the table
+        :param num_of_cols_in_row: Number of cells in this row (unused
+            when c is None)
+        '''
+        overrides = ['wholeTable']
+
+        def divisor(m, n):
+            # Whole number of times n fits into m
+            return (m - (m % n)) // n
+        if c is not None:
+            odd_column_band = (divisor(c, self.table_style.col_band_size) % 2) == 1
+            overrides.append('band%dVert' % (1 if odd_column_band else 2))
+        odd_row_band = (divisor(r, self.table_style.row_band_size) % 2) == 1
+        overrides.append('band%dHorz' % (1 if odd_row_band else 2))
+
+        # According to the OOXML spec columns should have higher override
+        # priority than rows, but Word seems to do it the other way around.
+        if c is not None:
+            if c == 0:
+                overrides.append('firstCol')
+            if c >= num_of_cols_in_row - 1:
+                overrides.append('lastCol')
+        if r == 0:
+            overrides.append('firstRow')
+        if r >= num_of_rows - 1:
+            overrides.append('lastRow')
+        # The corner cells come last
+        if c is not None:
+            if r == 0:
+                if c == 0:
+                    overrides.append('nwCell')
+                if c == num_of_cols_in_row - 1:
+                    overrides.append('neCell')
+            if r == num_of_rows - 1:
+                if c == 0:
+                    overrides.append('swCell')
+                if c == num_of_cols_in_row - 1:
+                    overrides.append('seCell')
+        return tuple(filter(self.override_allowed, overrides))
+
+    def resolve_row_style(self, tr, overrides):
+        '''
+        Compute the RowStyle for the row tr and store it in style_map. The
+        applicable overrides are merged in order, followed by the
+        properties specified directly on the row.
+        '''
+        row_style = RowStyle(self.namespace)
+        for name in overrides:
+            if name not in self.overrides:
+                continue
+            conditional = self.overrides[name].get('row', None)
+            if conditional is not None:
+                row_style.update(conditional)
+
+        direct_props = self.namespace.XPath('./w:trPr')(tr)
+        for trPr in direct_props:
+            row_style.update(RowStyle(self.namespace, trPr))
+        if self.bidi:
+            row_style.apply_bidi()
+        self.style_map[tr] = row_style
+
+    def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
+        '''
+        Compute the CellStyle for the cell tc and store it in style_map.
+
+        :param tc: The ``w:tc`` element
+        :param overrides: Names of the overrides that apply to this cell,
+            lowest priority first
+        :param row: Zero based row index of the cell
+        :param col: Zero based index of the cell in its row
+        :param rows: Number of rows in the table
+        :param cols_in_row: Number of cells in the row
+        '''
+        cs = CellStyle(self.namespace)
+        for o in overrides:
+            if o in self.overrides:
+                ovr = self.overrides[o]
+                ors = ovr.get('cell', None)
+                if ors is not None:
+                    cs.update(ors)
+
+        # Properties specified directly on the cell are merged last
+        for tcPr in self.namespace.XPath('./w:tcPr')(tc):
+            cs.update(CellStyle(self.namespace, tcPr))
+
+        for x in edges:
+            # Unspecified cell padding comes from the table
+            p = 'cell_padding_%s' % x
+            val = getattr(cs, p)
+            if val is inherit:
+                setattr(cs, p, getattr(self.table_style, p))
+
+            # An edge is an inside edge when there is another cell/row on
+            # the other side of it
+            is_inside_edge = (
+                (x == 'left' and col > 0) or
+                (x == 'top' and row > 0) or
+                (x == 'right' and col < cols_in_row - 1) or
+                (x == 'bottom' and row < rows -1)
+            )
+            inside_edge = ('insideH' if x in {'top', 'bottom'} else 'insideV') if is_inside_edge else None
+            for prop in border_props:
+                if not prop.startswith('border'):
+                    continue
+                eprop = prop % x
+                iprop = (prop % inside_edge) if inside_edge else None
+                val = getattr(cs, eprop)
+                if val is inherit and iprop is not None:
+                    # Use the insideX borders if the main cell borders are not
+                    # specified
+                    val = getattr(cs, iprop)
+                    if val is inherit:
+                        val = getattr(self.table_style, iprop)
+                if not is_inside_edge and val == 'none':
+                    # Cell borders must override table borders even when the
+                    # table border is not null and the cell border is null.
+                    val = 'hidden'
+                setattr(cs, eprop, val)
+
+        if self.bidi:
+            cs.apply_bidi()
+        self.style_map[tc] = cs
+
+    def resolve_para_style(self, p, overrides):
+        '''
+        Store the [paragraph style, run style] pair for the paragraph p in
+        style_map. The pair starts out as clones of the table wide styles,
+        with the 'para' and 'run' parts of the applicable overrides merged
+        in, in order.
+        '''
+        resolved = [clone(self.paragraph_style), clone(self.run_style)]
+
+        for name in overrides:
+            if name not in self.overrides:
+                continue
+            conditional = self.overrides[name]
+            for idx, kind in enumerate(('para', 'run')):
+                style = conditional.get(kind, None)
+                if style is None:
+                    continue
+                if resolved[idx] is None:
+                    resolved[idx] = style
+                else:
+                    resolved[idx].update(style)
+        self.style_map[p] = resolved
+
+    def handle_merged_cells(self):
+        '''
+        Turn runs of vertically (vMerge) and horizontally (hMerge) merged
+        cells into row_span/col_span values on the style of the first cell
+        of each run, removing the other cells of the run from the tree.
+
+        A run starts at a cell whose merge value is 'restart' and is
+        extended by every directly following cell whose value is
+        'continue'. Cells that already have a col_span are not considered
+        for horizontal merging.
+        '''
+        if not self.cell_map:
+            return
+
+        def detach(tc):
+            # A cell that continues both a vertical and a horizontal run is
+            # reached twice. The second time it has already been removed
+            # and has no parent, so there is nothing left to do.
+            parent = tc.getparent()
+            if parent is not None:
+                parent.remove(tc)
+
+        # Handle vMerge
+        max_col_num = max(len(r) for r in self.cell_map)
+        for c in range(max_col_num):
+            # The c-th cell of every row, None for rows that are too short
+            cells = [row[c] if c < len(row) else None for row in self.cell_map]
+            runs = [[]]
+            for cell in cells:
+                try:
+                    s = self.style_map[cell]
+                except KeyError:  # cell is None
+                    s = CellStyle(self.namespace)
+                if s.vMerge == 'restart':
+                    runs.append([cell])
+                elif s.vMerge == 'continue':
+                    runs[-1].append(cell)
+                else:
+                    runs.append([])
+            for run in runs:
+                if len(run) > 1:
+                    self.style_map[run[0]].row_span = len(run)
+                    for tc in run[1:]:
+                        detach(tc)
+
+        # Handle hMerge
+        for cells in self.cell_map:
+            runs = [[]]
+            for cell in cells:
+                try:
+                    s = self.style_map[cell]
+                except KeyError:  # cell is None
+                    s = CellStyle(self.namespace)
+                if s.col_span is not inherit:
+                    # An explicit column span takes precedence over hMerge
+                    runs.append([])
+                    continue
+                if s.hMerge == 'restart':
+                    runs.append([cell])
+                elif s.hMerge == 'continue':
+                    runs[-1].append(cell)
+                else:
+                    runs.append([])
+
+            for run in runs:
+                if len(run) > 1:
+                    self.style_map[run[0]].col_span = len(run)
+                    for tc in run[1:]:
+                        detach(tc)
+
+    def __iter__(self):
+        '''
+        Yield the paragraphs of this table, followed by the paragraphs of
+        each of its nested tables.
+        '''
+        for para in self.paragraphs:
+            yield para
+        nested_paras = (para for sub in itervalues(self.sub_tables) for para in sub)
+        for para in nested_paras:
+            yield para
+
+    def apply_markup(self, rmap, page, parent=None):
+        '''
+        Build the HTML <table> for this table, move the already converted
+        paragraphs into its cells and set the CSS classes.
+
+        :param rmap: Maps DOCX paragraph elements to their HTML elements
+        :param page: Page properties, stored on the table style where the
+            css property reads them to position floating tables
+        :param parent: HTML element to append the table to. When None, the
+            table is inserted just before the HTML element of its first
+            paragraph; if the table has no paragraphs nothing is done.
+        '''
+        table = TABLE('\n\t\t')
+        if self.bidi:
+            table.set('dir', 'rtl')
+        self.table_style.page = page
+        # Maps the created <tr>/<td> elements to their styles
+        style_map = {}
+        if parent is None:
+            try:
+                first_para = rmap[next(iter(self))]
+            except StopIteration:
+                return
+            parent = first_para.getparent()
+            idx = parent.index(first_para)
+            parent.insert(idx, table)
+        else:
+            parent.append(table)
+        for row in self.namespace.XPath('./w:tr')(self.tbl):
+            tr = TR('\n\t\t\t')
+            style_map[tr] = self.style_map[row]
+            tr.tail = '\n\t\t'
+            table.append(tr)
+            for tc in self.namespace.XPath('./w:tc')(row):
+                td = TD()
+                style_map[td] = s = self.style_map[tc]
+                if s.col_span is not inherit:
+                    td.set('colspan', unicode_type(s.col_span))
+                if s.row_span is not inherit:
+                    td.set('rowspan', unicode_type(s.row_span))
+                td.tail = '\n\t\t\t'
+                tr.append(td)
+                for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
+                    if x.tag.endswith('}p'):
+                        # Appending moves the paragraph into the cell
+                        td.append(rmap[x])
+                    else:
+                        self.sub_tables[x].apply_markup(rmap, page, parent=td)
+            # The whitespace set via text/tail only pretty prints the markup
+            if len(tr):
+                tr[-1].tail = '\n\t\t'
+        if len(table):
+            table[-1].tail = '\n\t'
+
+        table_style = self.table_style.css
+        if table_style:
+            table.set('class', self.styles.register(table_style, 'table'))
+        for elem, style in iteritems(style_map):
+            css = style.css
+            if css:
+                elem.set('class', self.styles.register(css, elem.tag))
+
+
+class Tables(object):
+    '''
+    The top level tables of a document, together with a map from every
+    paragraph inside a table to the Table that contains it.
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.tables = []
+        self.para_map = {}
+        self.sub_tables = set()
+
+    def register(self, tbl, styles):
+        'Add the table tbl, unless it is nested in an already added table'
+        if tbl in self.sub_tables:
+            return
+        table = Table(self.namespace, tbl, styles, self.para_map)
+        self.tables.append(table)
+        self.sub_tables.update(table.sub_tables)
+
+    def apply_markup(self, object_map, page_map):
+        'Generate the HTML markup for all the registered tables'
+        reverse_map = {}
+        for key, value in iteritems(object_map):
+            reverse_map[value] = key
+        for table in self.tables:
+            page = page_map[table.tbl]
+            table.apply_markup(reverse_map, page)
+
+    def _text_styles(self, p):
+        # The (paragraph style, run style) pair for p, (None, None) when p
+        # is not inside a table
+        table = self.para_map.get(p, None)
+        if table is None:
+            return (None, None)
+        return table.style_map.get(p, (None, None))
+
+    def para_style(self, p):
+        'The table supplied paragraph style for p, or None'
+        return self._text_styles(p)[0]
+
+    def run_style(self, p):
+        'The table supplied run style for p, or None'
+        return self._text_styles(p)[1]
--- a/ebook_converter/ebooks/docx/theme.py
+++ b/ebook_converter/ebooks/docx/theme.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+
+class Theme(object):
+    '''
+    The major and minor latin fonts of the document theme, defaulting to
+    Cambria and Calibri until a theme is read.
+    '''
+
+    def __init__(self, namespace):
+        self.namespace = namespace
+        self.major_latin_font = 'Cambria'
+        self.minor_latin_font = 'Calibri'
+
+    def __call__(self, root):
+        'Read the latin typefaces from the a:fontScheme elements in root'
+        targets = (
+            ('./a:majorFont', 'major_latin_font'),
+            ('./a:minorFont', 'minor_latin_font'),
+        )
+        for scheme in self.namespace.XPath('//a:fontScheme')(root):
+            for expr, attr in targets:
+                for font in self.namespace.XPath(expr)(scheme):
+                    for latin in self.namespace.XPath('./a:latin[@typeface]')(font):
+                        setattr(self, attr, latin.get('typeface'))
+
+    def resolve_font_family(self, ff):
+        '''
+        Map a theme font reference (a name starting with |) to the major
+        or minor latin font. Any other name is returned unchanged.
+        '''
+        if not ff.startswith('|'):
+            return ff
+        # Drop the first and last characters that delimit the reference
+        theme_ref = ff[1:-1]
+        if theme_ref.startswith('major'):
+            return self.major_latin_font
+        return self.minor_latin_font
--- a/ebook_converter/ebooks/docx/to_html.py
+++ b/ebook_converter/ebooks/docx/to_html.py
@@ -0,0 +1,839 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys, os, re, math, errno, uuid, numbers
+from collections import OrderedDict, defaultdict
+
+from lxml import html
+from lxml.html.builder import (
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
+
+from calibre import guess_type
+from calibre.ebooks.docx.container import DOCX, fromstring
+from calibre.ebooks.docx.names import XML, generate_anchor
+from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
+from calibre.ebooks.docx.numbering import Numbering
+from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
+from calibre.ebooks.docx.images import Images
+from calibre.ebooks.docx.tables import Tables
+from calibre.ebooks.docx.footnotes import Footnotes
+from calibre.ebooks.docx.cleanup import cleanup_markup
+from calibre.ebooks.docx.theme import Theme
+from calibre.ebooks.docx.toc import create_toc
+from calibre.ebooks.docx.fields import Fields
+from calibre.ebooks.docx.settings import Settings
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+from polyglot.builtins import iteritems, itervalues, filter, getcwd, map, unicode_type
+
+
+NBSP = '\xa0'
+
+
+class Text:
+    '''
+    Accumulates text destined for an attribute of an element. Text is
+    collected in buf and written to the attribute attr of the current
+    element when the next element is added.
+    '''
+
+    def __init__(self, elem, attr, buf):
+        self.elem = elem
+        self.attr = attr
+        self.buf = buf
+        self.elems = [elem]
+
+    def add_elem(self, elem):
+        '''
+        Flush the buffered text to the current element and make elem the
+        current element, with subsequent text going to its tail.
+        '''
+        self.elems.append(elem)
+        flushed = ''.join(self.buf)
+        setattr(self.elem, self.attr, flushed)
+        self.elem = elem
+        self.attr = 'tail'
+        self.buf = []
+
+    def __iter__(self):
+        return iter(self.elems)
+
+
+def html_lang(docx_lang):
+    '''
+    Convert the language docx_lang into the code returned by
+    lang_as_iso639_1(). Returns None when the language cannot be
+    canonicalized, is 'und' or has no such code.
+    '''
+    canonical = canonicalize_lang(docx_lang)
+    if not canonical or canonical == 'und':
+        return None
+    short_code = lang_as_iso639_1(canonical)
+    if short_code:
+        return short_code
+    return None
+
+
+class Convert(object):
+
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
+        '''
+        :param path_or_stream: The DOCX file, passed on to DOCX()
+        :param dest_dir: Directory the output is written to, defaults to
+            the current working directory
+        :param log: Passed on to DOCX(); the log actually used is the one
+            the DOCX object exposes
+        :param detect_cover: Passed on to cleanup_markup()
+        :param notes_text: Heading for the footnotes/endnotes section,
+            defaults to the translation of 'Notes'
+        :param notes_nopb: Passed on to the CSS generation
+        :param nosupsub: Passed on to the CSS generation
+        '''
+        self.docx = DOCX(path_or_stream, log=log)
+        self.namespace = self.docx.namespace
+        self.ms_pat = re.compile(r'\s{2,}')
+        self.ws_pat = re.compile(r'[\n\r\t]')
+        self.log = self.docx.log
+        self.detect_cover = detect_cover
+        self.notes_text = notes_text or _('Notes')
+        self.notes_nopb = notes_nopb
+        self.nosupsub = nosupsub
+        self.dest_dir = dest_dir or getcwd()
+        self.mi = self.docx.metadata
+        self.body = BODY()
+        self.theme = Theme(self.namespace)
+        self.settings = Settings(self.namespace)
+        self.tables = Tables(self.namespace)
+        self.fields = Fields(self.namespace)
+        self.styles = Styles(self.namespace, self.tables)
+        self.images = Images(self.namespace, self.log)
+        # Maps generated HTML elements to the DOCX elements they came from
+        self.object_map = OrderedDict()
+        self.html = HTML(
+            HEAD(
+                META(charset='utf-8'),
+                TITLE(self.mi.title or _('Unknown')),
+                LINK(rel='stylesheet', type='text/css', href='docx.css'),
+            ),
+            self.body
+        )
+        # The whitespace below only pretty prints the generated markup
+        self.html.text='\n\t'
+        self.html[0].text='\n\t\t'
+        self.html[0].tail='\n'
+        for child in self.html[0]:
+            child.tail = '\n\t\t'
+        self.html[0][-1].tail = '\n\t'
+        self.html[1].text = self.html[1].tail = '\n'
+        # Set the document language, if the metadata has a usable one
+        lang = html_lang(self.mi.language)
+        if lang:
+            self.html.set('lang', lang)
+            self.doc_lang = lang
+        else:
+            self.doc_lang = None
+
+    def __call__(self):
+        '''
+        Run the conversion: convert the paragraphs of the document and of
+        its notes to HTML, apply table, numbering and frame markup, assign
+        CSS classes, clean up the markup and write the result.
+
+        :return: Whatever self.write() returns
+        '''
+        doc = self.docx.document
+        relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.resolve_alternate_content(doc)
+        self.fields(doc, self.log)
+        self.read_styles(relationships_by_type)
+        self.images(relationships_by_id)
+        # State that is filled in while converting
+        self.layers = OrderedDict()
+        self.framed = [[]]
+        self.frame_map = {}
+        self.framed_map = {}
+        self.anchor_map = {}
+        self.link_map = defaultdict(list)
+        self.link_source_map = {}
+        self.toc_anchor = None
+        self.block_runs = []
+        paras = []
+
+        self.log.debug('Converting Word markup to HTML')
+
+        self.read_page_properties(doc)
+        self.current_rels = relationships_by_id
+        # page_map contains both paragraphs and tables, only the paragraphs
+        # are converted here
+        for wp, page_properties in iteritems(self.page_map):
+            self.current_page = page_properties
+            if wp.tag.endswith('}p'):
+                p = self.convert_p(wp)
+                self.body.append(p)
+                paras.append(wp)
+
+        self.read_block_anchors(doc)
+        self.styles.apply_contextual_spacing(paras)
+        self.mark_block_runs(paras)
+        # Apply page breaks at the start of every section, except the first
+        # section (since that will be the start of the file)
+        self.styles.apply_section_page_breaks(self.section_starts[1:])
+
+        notes_header = None
+        # Notes are converted with their own relationships, remember the
+        # ones of the main document so they can be restored afterwards
+        orig_rid_map = self.images.rid_map
+        if self.footnotes.has_notes:
+            self.body.append(H1(self.notes_text))
+            notes_header = self.body[-1]
+            notes_header.set('class', 'notes-header')
+            for anchor, text, note in self.footnotes:
+                # Every note is a <dl> with a back link in the <dt> and the
+                # content of the note in the <dd>
+                dl = DL(id=anchor)
+                dl.set('class', 'footnote')
+                self.body.append(dl)
+                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
+                dl[-1][0].tail = ']'
+                dl.append(DD())
+                paras = []
+                self.images.rid_map = self.current_rels = note.rels[0]
+                for wp in note:
+                    if wp.tag.endswith('}tbl'):
+                        self.tables.register(wp, self.styles)
+                        self.page_map[wp] = self.current_page
+                    else:
+                        p = self.convert_p(wp)
+                        dl[-1].append(p)
+                        paras.append(wp)
+                self.styles.apply_contextual_spacing(paras)
+                self.mark_block_runs(paras)
+
+        for p, wp in iteritems(self.object_map):
+            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
+                # Paragraph uses tabs for indentation, convert to text-indent
+                parent = p[0]
+                # Collect the leading tabs, up to and including the first
+                # tab that is followed by text
+                tabs = []
+                for child in parent:
+                    if child.get('class', None) == 'tab':
+                        tabs.append(child)
+                        if child.tail:
+                            break
+                    else:
+                        break
+                indent = len(tabs) * self.settings.default_tab_stop
+                style = self.styles.resolve(wp)
+                # Only possible when there is no text-indent yet or the
+                # existing one is in pt, so that the two can be added up
+                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
+                    if style.text_indent is not inherit:
+                        indent = float(style.text_indent[:-2]) + indent
+                    style.text_indent = '%.3gpt' % indent
+                    parent.text = tabs[-1].tail or ''
+                    list(map(parent.remove, tabs))
+
+        self.images.rid_map = orig_rid_map
+
+        self.resolve_links()
+
+        self.styles.cascade(self.layers)
+
+        self.tables.apply_markup(self.object_map, self.page_map)
+
+        # Collect the elements that are part of a numbered list. The
+        # calibre_num_id attribute has the form level:num_id
+        numbered = []
+        for html_obj, obj in iteritems(self.object_map):
+            raw = obj.get('calibre_num_id', None)
+            if raw is not None:
+                lvl, num_id = raw.partition(':')[0::2]
+                try:
+                    lvl = int(lvl)
+                except (TypeError, ValueError):
+                    lvl = 0
+                numbered.append((html_obj, num_id, lvl))
+        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
+        self.apply_frames()
+
+        # Put every child of <body> on its own line
+        if len(self.body) > 0:
+            self.body.text = '\n\t'
+            for child in self.body:
+                child.tail = '\n\t'
+            self.body[-1].tail = '\n'
+
+        self.log.debug('Converting styles to CSS')
+        self.styles.generate_classes()
+        for html_obj, obj in iteritems(self.object_map):
+            style = self.styles.resolve(obj)
+            if style is not None:
+                css = style.css
+                if css:
+                    cls = self.styles.class_name(css)
+                    if cls:
+                        html_obj.set('class', cls)
+        for html_obj, css in iteritems(self.framed_map):
+            cls = self.styles.class_name(css)
+            if cls:
+                html_obj.set('class', cls)
+
+        if notes_header is not None:
+            # Give the notes header the tag and class of the first heading
+            # found among the children of <body>
+            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
+                notes_header.tag = h.tag
+                cls = h.get('class', None)
+                if cls and cls != 'notes-header':
+                    notes_header.set('class', '%s notes-header' % cls)
+                break
+
+        self.fields.polish_markup(self.object_map)
+
+        self.log.debug('Cleaning up redundant markup generated by Word')
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
+
+        return self.write(doc)
+
+    def read_page_properties(self, doc):
+        '''
+        Populate self.page_map, mapping every paragraph and table in doc to
+        the page properties of its section, and self.section_starts with
+        the first block of every section. Tables are also registered with
+        self.tables.
+
+        A section ends at the paragraph that contains a w:sectPr. Blocks
+        after the last such paragraph use the w:sectPr of the body.
+        '''
+        self.page_map = OrderedDict()
+        self.section_starts = []
+        pending = []
+
+        for block in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
+            if block.tag.endswith('}tbl'):
+                self.tables.register(block, self.styles)
+                sect = ()
+            else:
+                sect = tuple(self.namespace.descendants(block, 'w:sectPr'))
+            pending.append(block)
+            if not sect:
+                continue
+            # This paragraph closes the section made up of the pending blocks
+            props = PageProperties(self.namespace, sect)
+            for member in pending:
+                self.page_map[member] = props
+            self.section_starts.append(pending[0])
+            pending = []
+
+        if not pending:
+            return
+        self.section_starts.append(pending[0])
+        body_sect = self.namespace.XPath('./w:body/w:sectPr')(doc)
+        props = PageProperties(self.namespace, body_sect)
+        for member in pending:
+            self.page_map[member] = props
+
+    def resolve_alternate_content(self, doc):
+        '''
+        Remove the mc:Choice children of every mc:AlternateContent element
+        in doc that also has an mc:Fallback child.
+        '''
+        # For proprietary extensions in Word documents use the fallback, spec
+        # compliant form
+        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
+        for alt in self.namespace.descendants(doc, 'mc:AlternateContent'):
+            choices = self.namespace.XPath('./mc:Choice')(alt)
+            if not self.namespace.XPath('./mc:Fallback')(alt):
+                continue
+            for choice in choices:
+                alt.remove(choice)
+
+    def read_styles(self, relationships_by_type):
+        '''
+        Read the settings, footnotes, endnotes, font table, theme, styles
+        and numbering parts of the document. Creates self.numbering,
+        self.footnotes and self.fonts. A part that cannot be read is
+        skipped with a warning.
+
+        :param relationships_by_type: Maps relationship types of the main
+            document to the names of the related parts
+        '''
+
+        def get_name(rtype, defname):
+            # Name of the part with the relationship type rtype. When the
+            # document has no such relationship, fall back to the part
+            # named defname next to the main document, if it exists.
+            name = relationships_by_type.get(rtype, None)
+            if name is None:
+                cname = self.docx.document_name.split('/')
+                cname[-1] = defname
+                candidate = '/'.join(cname)
+                if self.docx.exists(candidate):
+                    name = candidate
+            # Some names have the word/ prefix doubled, drop the extra one
+            if name and name.startswith('word/word') and not self.docx.exists(name):
+                name = name.partition('/')[2]
+            return name
+
+        nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
+        sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
+        sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
+        fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
+        tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
+        foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
+        enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
+        numbering = self.numbering = Numbering(self.namespace)
+        footnotes = self.footnotes = Footnotes(self.namespace)
+        fonts = self.fonts = Fonts(self.namespace)
+
+        foraw = enraw = None
+        forel, enrel = ({}, {}), ({}, {})
+        if sename is not None:
+            try:
+                seraw = self.docx.read(sename)
+            except KeyError:
+                self.log.warn('Settings %s do not exist' % sename)
+            except EnvironmentError as e:
+                if e.errno != errno.ENOENT:
+                    raise
+                self.log.warn('Settings %s file missing' % sename)
+            else:
+                self.settings(fromstring(seraw))
+
+        if foname is not None:
+            try:
+                foraw = self.docx.read(foname)
+            except KeyError:
+                self.log.warn('Footnotes %s do not exist' % foname)
+            else:
+                forel = self.docx.get_relationships(foname)
+        if enname is not None:
+            try:
+                enraw = self.docx.read(enname)
+            except KeyError:
+                self.log.warn('Endnotes %s do not exist' % enname)
+            else:
+                enrel = self.docx.get_relationships(enname)
+        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)
+
+        if fname is not None:
+            embed_relationships = self.docx.get_relationships(fname)[0]
+            try:
+                raw = self.docx.read(fname)
+            except KeyError:
+                self.log.warn('Fonts table %s does not exist' % fname)
+            else:
+                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)
+
+        if tname is not None:
+            try:
+                raw = self.docx.read(tname)
+            except KeyError:
+                self.log.warn('Theme %s does not exist' % tname)
+            else:
+                self.theme(fromstring(raw))
+
+        # The styles must always be initialized, even without a styles part
+        styles_loaded = False
+        if sname is not None:
+            try:
+                raw = self.docx.read(sname)
+            except KeyError:
+                self.log.warn('Styles %s do not exist' % sname)
+            else:
+                self.styles(fromstring(raw), fonts, self.theme)
+                styles_loaded = True
+        if not styles_loaded:
+            self.styles(None, fonts, self.theme)
+
+        if nname is not None:
+            try:
+                raw = self.docx.read(nname)
+            except KeyError:
+                self.log.warn('Numbering styles %s do not exist' % nname)
+            else:
+                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
+
+        self.styles.resolve_numbering(numbering)
+
+    def write(self, doc):
+        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
+        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
+        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
+            f.write(raw)
+        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
+        if css:
+            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
+                f.write(css.encode('utf-8'))
+
+        opf = OPFCreator(self.dest_dir, self.mi)
+        opf.toc = toc
+        opf.create_manifest_from_files_in([self.dest_dir])
+        for item in opf.manifest:
+            if item.media_type == 'text/html':
+                item.media_type = guess_type('a.xhtml')[0]
+        opf.create_spine(['index.html'])
+        if self.cover_image is not None:
+            opf.guide.set_cover(self.cover_image)
+
+        def process_guide(E, guide):
+            if self.toc_anchor is not None:
+                guide.append(E.reference(
+                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
+        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
+        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
+            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
+        if os.path.getsize(toc_file) == 0:
+            os.remove(toc_file)
+        return os.path.join(self.dest_dir, 'metadata.opf')
+
+    def read_block_anchors(self, doc):
+        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+        if doc_anchors:
+            current_bm = set()
+            rmap = {v:k for k, v in iteritems(self.object_map)}
+            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+                if p.tag.endswith('}p'):
+                    if current_bm and p in rmap:
+                        para = rmap[p]
+                        if 'id' not in para.attrib:
+                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
+                        for name in current_bm:
+                            self.anchor_map[name] = para.get('id')
+                        current_bm = set()
+                elif p in doc_anchors:
+                    anchor = self.namespace.get(p, 'w:name')
+                    if anchor:
+                        current_bm.add(anchor)
+
+    def convert_p(self, p):
+        '''
+        Convert the paragraph element ``p`` into a new HTML block element and
+        return it.
+
+        Besides converting the runs directly owned by ``p``, this records
+        bookkeeping on ``self``: ``object_map`` (HTML element -> source
+        element), ``layers``, ``frame_map``, ``anchor_map``, ``link_map``,
+        ``link_source_map`` and ``toc_anchor``.
+        '''
+        dest = P()
+        self.object_map[dest] = p
+        style = self.styles.resolve_paragraph(p)
+        self.layers[p] = []
+        self.frame_map[p] = style.frame
+        self.add_frame(dest, style.frame)
+
+        # Anchor generated for a bookmark/TOC field that has not yet been
+        # attached to an HTML element
+        current_anchor = None
+        # Last <w:hyperlink> seen; runs converted while it is set are looked
+        # up for an enclosing hyperlink
+        current_hyperlink = None
+        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')
+
+        def p_parent(x):
+            # Ensure that nested <w:p> tags are handled. These can occur if a
+            # textbox is present inside a paragraph.
+            # Returns the nearest ancestor whose tag ends with '}p', or None
+            # when the top of the tree is reached.
+            while True:
+                x = x.getparent()
+                try:
+                    if x.tag.endswith('}p'):
+                        return x
+                except AttributeError:
+                    break
+
+        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
+            # Skip anything that belongs to a nested paragraph
+            if p_parent(x) is not p:
+                continue
+            if x.tag.endswith('}r'):
+                span = self.convert_run(x)
+                if current_anchor is not None:
+                    # Put the pending anchor on the paragraph itself if this
+                    # is its first run, otherwise on the run's span
+                    (dest if len(dest) == 0 else span).set('id', current_anchor)
+                    current_anchor = None
+                if current_hyperlink is not None:
+                    try:
+                        hl = hl_xpath(x)[0]
+                        self.link_map[hl].append(span)
+                        self.link_source_map[hl] = self.current_rels
+                        x.set('is-link', '1')
+                    except IndexError:
+                        # This run has no hyperlink ancestor, so the
+                        # hyperlink seen earlier has ended
+                        current_hyperlink = None
+                dest.append(span)
+                self.layers[p].append(x)
+            elif x.tag.endswith('}bookmarkStart'):
+                anchor = self.namespace.get(x, 'w:name')
+                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
+                    # _GoBack is a special bookmark inserted by Word 2010 for
+                    # the return to previous edit feature, we ignore it
+                    old_anchor = current_anchor
+                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
+                    if old_anchor is not None:
+                        # The previous anchor was not applied to any element
+                        # so redirect every name that pointed at it to the
+                        # new anchor
+                        for a, t in tuple(iteritems(self.anchor_map)):
+                            if t == old_anchor:
+                                self.anchor_map[a] = current_anchor
+            elif x.tag.endswith('}hyperlink'):
+                current_hyperlink = x
+            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
+                # A Table of Contents field: register an anchor for it under
+                # a random key and remember it as the ToC anchor
+                old_anchor = current_anchor
+                anchor = unicode_type(uuid.uuid4())
+                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
+                self.toc_anchor = current_anchor
+                if old_anchor is not None:
+                    # The previous anchor was not applied to any element
+                    for a, t in tuple(iteritems(self.anchor_map)):
+                        if t == old_anchor:
+                            self.anchor_map[a] = current_anchor
+        if current_anchor is not None:
+            # This paragraph had no <w:r> descendants
+            dest.set('id', current_anchor)
+            current_anchor = None
+
+        # Paragraph styles named "heading N" become <h1>..<h6> (N is clamped
+        # to the range 1-6); the level is also stored in a data attribute
+        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
+        if m is not None:
+            n = min(6, max(1, int(m.group(1))))
+            dest.tag = 'h%d' % n
+            dest.set('data-heading-level', unicode_type(n))
+
+        if style.bidi is True:
+            dest.set('dir', 'rtl')
+
+        # Collect groups of adjacent spans that share the same border so that
+        # the border can be drawn once around the whole group.
+        # NOTE(review): a span whose border differs only flushes the pending
+        # group and is not itself used to start a new one, and the group still
+        # pending when the loop ends is never flushed -- confirm this is
+        # intended.
+        border_runs = []
+        common_borders = []
+        for span in dest:
+            run = self.object_map[span]
+            # Note: this rebinds ``style`` from the paragraph style to a run
+            # style
+            style = self.styles.resolve_run(run)
+            if not border_runs or border_runs[-1][1].same_border(style):
+                border_runs.append((span, style))
+            elif border_runs:
+                if len(border_runs) > 1:
+                    common_borders.append(border_runs)
+                border_runs = []
+
+        for border_run in common_borders:
+            spans = []
+            bs = {}
+            for span, style in border_run:
+                style.get_border_css(bs)
+                style.clear_border_css()
+                spans.append(span)
+            if bs:
+                # Move the shared border onto a single wrapper <span>
+                cls = self.styles.register(bs, 'text_border')
+                wrapper = self.wrap_elems(spans, SPAN())
+                wrapper.set('class', cls)
+
+        # ``style`` is only evaluated here when dest has no children, in which
+        # case the loops above did not run and it is still the paragraph style
+        if not dest.text and len(dest) == 0 and not style.has_visible_border():
+            # Empty paragraph add a non-breaking space so that it is rendered
+            # by WebKit
+            dest.text = NBSP
+
+        # If the last element in a block is a <br> the <br> is not rendered in
+        # HTML, unless it is followed by a trailing space. Word, on the other
+        # hand inserts a blank line for trailing <br>s.
+        if len(dest) > 0 and not dest[-1].tail:
+            if dest[-1].tag == 'br':
+                dest[-1].tail = NBSP
+            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
+                dest[-1][-1].tail = NBSP
+
+        return dest
+
+    def wrap_elems(self, elems, wrapper):
+        p = elems[0].getparent()
+        idx = p.index(elems[0])
+        p.insert(idx, wrapper)
+        wrapper.tail = elems[-1].tail
+        elems[-1].tail = None
+        for elem in elems:
+            try:
+                p.remove(elem)
+            except ValueError:
+                # Probably a hyperlink that spans multiple
+                # paragraphs,theoretically we should break this up into
+                # multiple hyperlinks, but I can't be bothered.
+                elem.getparent().remove(elem)
+            wrapper.append(elem)
+        return wrapper
+
+    def resolve_links(self):
+        '''
+        Turn the spans/images recorded as belonging to links into ``<a>``
+        elements. Three sources are processed in order: ``self.link_map``
+        (hyperlink elements), ``self.fields.hyperlink_fields`` (hyperlink
+        fields) and ``self.images.links`` (linked images).
+
+        Also (re)builds ``self.resolved_link_map``, which maps each hyperlink
+        element from ``self.link_map`` to its resulting ``<a>`` element.
+        '''
+        self.resolved_link_map = {}
+        for hyperlink, spans in iteritems(self.link_map):
+            relationships_by_id = self.link_source_map[hyperlink]
+            span = spans[0]
+            if len(spans) > 1:
+                # Several runs make up this link, group them in one element
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            self.resolved_link_map[hyperlink] = span
+            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
+            if tgt:
+                span.set('target', tgt)
+            tt = self.namespace.get(hyperlink, 'w:tooltip')
+            if tt:
+                span.set('title', tt)
+            # A relationship id takes precedence over an internal anchor
+            rid = self.namespace.get(hyperlink, 'r:id')
+            if rid and rid in relationships_by_id:
+                span.set('href', relationships_by_id[rid])
+                continue
+            anchor = self.namespace.get(hyperlink, 'w:anchor')
+            if anchor and anchor in self.anchor_map:
+                span.set('href', '#' + self.anchor_map[anchor])
+                continue
+            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
+                          (rid, anchor))
+            # hrefs that point nowhere give epubcheck a hernia. The element
+            # should be styled explicitly by Word anyway.
+            # span.set('href', '#')
+        # Reverse lookup: source element -> generated HTML element
+        rmap = {v:k for k, v in iteritems(self.object_map)}
+        for hyperlink, runs in self.fields.hyperlink_fields:
+            # Here ``hyperlink`` is accessed like a mapping with optional
+            # 'target', 'title', 'url' and 'anchor' keys
+            spans = [rmap[r] for r in runs if r in rmap]
+            if not spans:
+                continue
+            span = spans[0]
+            if len(spans) > 1:
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            tgt = hyperlink.get('target', None)
+            if tgt:
+                span.set('target', tgt)
+            tt = hyperlink.get('title', None)
+            if tt:
+                span.set('title', tt)
+            url = hyperlink.get('url', None)
+            if url is None:
+                anchor = hyperlink.get('anchor', None)
+                if anchor in self.anchor_map:
+                    span.set('href', '#' + self.anchor_map[anchor])
+                    continue
+                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
+            else:
+                # A url that is the name of a known anchor is treated as an
+                # internal link
+                if url in self.anchor_map:
+                    span.set('href', '#' + self.anchor_map[url])
+                    continue
+                span.set('href', url)
+
+        for img, link, relationships_by_id in self.images.links:
+            parent = img.getparent()
+            # The position must be read before A(img) is created
+            idx = parent.index(img)
+            a = A(img)
+            # The text following the image now has to follow the <a>
+            a.tail, img.tail = img.tail, None
+            parent.insert(idx, a)
+            tgt = link.get('target', None)
+            if tgt:
+                a.set('target', tgt)
+            tt = link.get('title', None)
+            if tt:
+                a.set('title', tt)
+            rid = link['id']
+            if rid in relationships_by_id:
+                dest = relationships_by_id[rid]
+                if dest.startswith('#'):
+                    # Internal link, no href is set if the anchor is unknown
+                    if dest[1:] in self.anchor_map:
+                        a.set('href', '#' + self.anchor_map[dest[1:]])
+                else:
+                    a.set('href', dest)
+
+    def convert_run(self, run):
+        '''
+        Convert the run element ``run`` into a new ``<span>`` and return it.
+        The span is registered in ``self.object_map``.
+
+        Text is accumulated through the ``Text`` helper (defined elsewhere):
+        plain strings are appended to ``text.buf`` and every child element is
+        added with ``text.add_elem()`` followed by appending ``text.elem`` to
+        the span.
+        NOTE(review): presumably add_elem() flushes the buffered text and
+        makes the new element the target for following text -- confirm
+        against the Text helper.
+        '''
+        ans = SPAN()
+        self.object_map[ans] = run
+        text = Text(ans, 'text', [])
+
+        for child in run:
+            if self.namespace.is_tag(child, 'w:t'):
+                if not child.text:
+                    continue
+                space = child.get(XML('space'), None)
+                preserve = False
+                ctext = child.text
+                if space != 'preserve':
+                    # Remove leading and trailing whitespace. Word ignores
+                    # leading and trailing whitespace without preserve
+                    ctext = ctext.strip(' \n\r\t')
+                # Only use a <span> with white-space:pre-wrap if this element
+                # actually needs it, i.e. if it has more than one
+                # consecutive space or it has newlines or tabs.
+                multi_spaces = self.ms_pat.search(ctext) is not None
+                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
+                if preserve:
+                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
+                    ans.append(text.elem)
+                else:
+                    text.buf.append(ctext)
+            elif self.namespace.is_tag(child, 'w:cr'):
+                text.add_elem(BR())
+                ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:br'):
+                typ = self.namespace.get(child, 'w:type')
+                if typ in {'column', 'page'}:
+                    # Column and page breaks both become page breaks
+                    br = BR(style='page-break-after:always')
+                else:
+                    # NOTE(review): unlike w:type above, this attribute is
+                    # read without a namespace prefix -- confirm that is the
+                    # attribute intended.
+                    clear = child.get('clear', None)
+                    if clear in {'all', 'left', 'right'}:
+                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
+                    else:
+                        br = BR()
+                text.add_elem(br)
+                ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
+                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
+                    text.add_elem(img)
+                    ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
+                anchor, name = self.footnotes.get_ref(child)
+                if anchor and name:
+                    # Link to the note; the 'back_' id is the target for
+                    # linking back from the note (presumably -- the back link
+                    # is created elsewhere)
+                    l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
+                    l.set('class', 'noteref')
+                    text.add_elem(l)
+                    ans.append(text.elem)
+            elif self.namespace.is_tag(child, 'w:tab'):
+                # Emulate a tab with a run of non-breaking spaces whose
+                # length is derived from the default tab stop
+                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
+                text.add_elem(SPAN(NBSP * spaces))
+                ans.append(text.elem)
+                ans[-1].set('class', 'tab')
+            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
+                # U+2011 NON-BREAKING HYPHEN
+                text.buf.append('\u2011')
+            elif self.namespace.is_tag(child, 'w:softHyphen'):
+                # U+00AD SOFT HYPHEN
+                text.buf.append('\u00ad')
+        if text.buf:
+            # Flush whatever text is still buffered
+            setattr(text.elem, text.attr, ''.join(text.buf))
+
+        style = self.styles.resolve_run(run)
+        if style.vert_align in {'superscript', 'subscript'}:
+            # Only mark runs that actually have content
+            if ans.text or len(ans):
+                ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
+        if style.lang is not inherit:
+            lang = html_lang(style.lang)
+            # The language is only set when it differs from the document's
+            if lang is not None and lang != self.doc_lang:
+                ans.set('lang', lang)
+        if style.rtl is True:
+            ans.set('dir', 'rtl')
+        if is_symbol_font(style.font_family):
+            # Remap the text written in a symbol font and replace the font
+            # family on the resolved style with a generic one
+            for elem in text:
+                if elem.text:
+                    elem.text = map_symbol_text(elem.text, style.font_family)
+                if elem.tail:
+                    elem.tail = map_symbol_text(elem.tail, style.font_family)
+            style.font_family = 'sans-serif'
+        return ans
+
+    def add_frame(self, html_obj, style):
+        last_run = self.framed[-1]
+        if style is inherit:
+            if last_run:
+                self.framed.append([])
+            return
+
+        if last_run:
+            if last_run[-1][1] == style:
+                last_run.append((html_obj, style))
+            else:
+                self.framed[-1].append((html_obj, style))
+        else:
+            last_run.append((html_obj, style))
+
+    def apply_frames(self):
+        for run in filter(None, self.framed):
+            style = run[0][1]
+            paras = tuple(x[0] for x in run)
+            parent = paras[0].getparent()
+            idx = parent.index(paras[0])
+            frame = DIV(*paras)
+            parent.insert(idx, frame)
+            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
+            self.styles.register(css, 'frame')
+
+        if not self.block_runs:
+            return
+        rmap = {v:k for k, v in iteritems(self.object_map)}
+        for border_style, blocks in self.block_runs:
+            paras = tuple(rmap[p] for p in blocks)
+            for p in paras:
+                if p.tag == 'li':
+                    has_li = True
+                    break
+            else:
+                has_li = False
+            parent = paras[0].getparent()
+            if parent.tag in ('ul', 'ol'):
+                ul = parent
+                parent = ul.getparent()
+                idx = parent.index(ul)
+                frame = DIV(ul)
+            elif has_li:
+                def top_level_tag(x):
+                    while True:
+                        q = x.getparent()
+                        if q is parent or q is None:
+                            break
+                        x = q
+                    return x
+                paras = tuple(map(top_level_tag, paras))
+                idx = parent.index(paras[0])
+                frame = DIV(*paras)
+            else:
+                idx = parent.index(paras[0])
+                frame = DIV(*paras)
+            parent.insert(idx, frame)
+            self.framed_map[frame] = css = border_style.css
+            self.styles.register(css, 'frame')
+
+    def mark_block_runs(self, paras):
+        '''
+        Find runs of two or more consecutive paragraphs in ``paras`` that have
+        the same ``self.frame_map`` entry and identical borders, and adjust
+        their resolved styles so that each run can be rendered as one block.
+        Runs with a visible border are recorded in ``self.block_runs`` as
+        ``(border_style, run)``.
+        '''
+
+        def process_run(run):
+            # Largest numeric left/right margin of any paragraph in the run
+            max_left = max_right = 0
+            # Decided by the first paragraph only (the run shares its borders)
+            has_visible_border = None
+            for p in run:
+                style = self.styles.resolve_paragraph(p)
+                if has_visible_border is None:
+                    has_visible_border = style.has_visible_border()
+                if isinstance(style.margin_left, numbers.Number):
+                    max_left = max(style.margin_left, max_left)
+                if isinstance(style.margin_right, numbers.Number):
+                    max_right = max(style.margin_right, max_right)
+                if has_visible_border:
+                    # The side margins are moved to the border style below
+                    style.margin_left = style.margin_right = inherit
+                if p is not run[0]:
+                    style.padding_top = 0
+                else:
+                    # First paragraph: it supplies the border style for the
+                    # whole run, which also takes over its top margin
+                    border_style = style.clone_border_styles()
+                    if has_visible_border:
+                        border_style.margin_top, style.margin_top = style.margin_top, inherit
+                if p is not run[-1]:
+                    style.padding_bottom = 0
+                else:
+                    # Last paragraph: its bottom margin moves to the border
+                    # style
+                    if has_visible_border:
+                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
+                style.clear_borders()
+                if p is not run[-1]:
+                    style.apply_between_border()
+            if has_visible_border:
+                border_style.margin_left, border_style.margin_right = max_left,max_right
+                self.block_runs.append((border_style, run))
+
+        run = []
+        for p in paras:
+            # Extend the current run while the frame and the borders match
+            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
+                style = self.styles.resolve_paragraph(p)
+                last_style = self.styles.resolve_paragraph(run[-1])
+                if style.has_identical_borders(last_style):
+                    run.append(p)
+                    continue
+            # The run ended; single paragraphs need no processing
+            if len(run) > 1:
+                process_run(run)
+            run = [p]
+        if len(run) > 1:
+            process_run(run)
+
+
+if __name__ == '__main__':
+    import shutil
+    from calibre.utils.logging import default_log
+    default_log.filter_level = default_log.DEBUG
+    dest_dir = os.path.join(getcwd(), 'docx_input')
+    if os.path.exists(dest_dir):
+        shutil.rmtree(dest_dir)
+    os.mkdir(dest_dir)
+    Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()
--- a/ebook_converter/ebooks/docx/toc.py
+++ b/ebook_converter/ebooks/docx/toc.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import namedtuple
+from itertools import count
+
+from lxml.etree import tostring
+
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
+from polyglot.builtins import iteritems, range
+
+
+def from_headings(body, log, namespace, num_levels=3):
+    ' Create a TOC from headings in the document '
+    tocroot = TOC()
+    all_heading_nodes = body.xpath('//*[@data-heading-level]')
+    level_prev = {i+1:None for i in range(num_levels)}
+    level_prev[0] = tocroot
+    level_item_map = {i:frozenset(
+        x for x in all_heading_nodes if int(x.get('data-heading-level')) == i)
+        for i in range(1, num_levels+1)}
+    item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems}
+
+    idcount = count()
+
+    def ensure_id(elem):
+        ans = elem.get('id', None)
+        if not ans:
+            ans = 'toc_id_%d' % (next(idcount) + 1)
+            elem.set('id', ans)
+        return ans
+
+    for item in all_heading_nodes:
+        lvl = plvl = item_level_map.get(item, None)
+        if lvl is None:
+            continue
+        parent = None
+        while parent is None:
+            plvl -= 1
+            parent = level_prev[plvl]
+        lvl = plvl + 1
+        elem_id = ensure_id(item)
+        text = elem_to_toc_text(item)
+        toc = parent.add_item('index.html', elem_id, text)
+        level_prev[lvl] = toc
+        for i in range(lvl+1, num_levels+1):
+            level_prev[i] = None
+
+    if len(tuple(tocroot.flat())) > 1:
+        log('Generating Table of Contents from headings')
+        return tocroot
+
+
+def structure_toc(entries):
+    indent_vals = sorted({x.indent for x in entries})
+    last_found = [None for i in indent_vals]
+    newtoc = TOC()
+
+    if len(indent_vals) > 6:
+        for x in entries:
+            newtoc.add_item('index.html', x.anchor, x.text)
+        return newtoc
+
+    def find_parent(level):
+        candidates = last_found[:level]
+        for x in reversed(candidates):
+            if x is not None:
+                return x
+        return newtoc
+
+    for item in entries:
+        level = indent_vals.index(item.indent)
+        parent = find_parent(level)
+        last_found[level] = parent.add_item('index.html', item.anchor,
+                    item.text)
+        for i in range(level+1, len(last_found)):
+            last_found[i] = None
+
+    return newtoc
+
+
+def link_to_txt(a, styles, object_map):
+    if len(a) > 1:
+        for child in a:
+            run = object_map.get(child, None)
+            if run is not None:
+                rs = styles.resolve(run)
+                if rs.css.get('display', None) == 'none':
+                    a.remove(child)
+
+    return tostring(a, method='text', with_tail=False, encoding='unicode').strip()
+
+
+def from_toc(docx, link_map, styles, object_map, log, namespace):
+    XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
+    toc_level = None
+    level = 0
+    TI = namedtuple('TI', 'text anchor indent')
+    toc = []
+    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
+        n = tag.tag.rpartition('}')[-1]
+        if n == 'fldChar':
+            t = get(tag, 'w:fldCharType')
+            if t == 'begin':
+                level += 1
+            elif t == 'end':
+                level -= 1
+                if toc_level is not None and level < toc_level:
+                    break
+        elif n == 'instrText':
+            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
+                toc_level = level
+        elif n == 'hyperlink':
+            if toc_level is not None and level >= toc_level and tag in link_map:
+                a = link_map[tag]
+                href = a.get('href', None)
+                txt = link_to_txt(a, styles, object_map)
+                p = ancestor(tag, 'w:p')
+                if txt and href and p is not None:
+                    ps = styles.resolve_paragraph(p)
+                    try:
+                        ml = int(ps.margin_left[:-2])
+                    except (TypeError, ValueError, AttributeError):
+                        ml = 0
+                    if ps.text_align in {'center', 'right'}:
+                        ml = 0
+                    toc.append(TI(txt, href[1:], ml))
+    if toc:
+        log('Found Word Table of Contents, using it to generate the Table of Contents')
+        return structure_toc(toc)
+
+
+def create_toc(docx, body, link_map, styles, object_map, log, namespace):
+    ans = from_toc(docx, link_map, styles, object_map, log, namespace) or from_headings(body, log, namespace)
+    # Remove heading level attributes
+    for h in body.xpath('//*[@data-heading-level]'):
+        del h.attrib['data-heading-level']
+    return ans