1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-27 22:03:32 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
class InvalidDOCX(ValueError):
    ''' Error raised for input that cannot be handled as a DOCX file. '''

View File

@@ -0,0 +1,478 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import numbers
from collections import OrderedDict
from polyglot.builtins import iteritems
class Inherit(object):
    ''' Sentinel type meaning "no explicit value here".

    An instance is equal only to itself. The ordering operators are defined
    so that the sentinel is never less than anything and is greater than
    every object except itself.
    '''

    def __eq__(self, other):
        return other is self

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so it has to be spelled out to
        # keep the two operators consistent.
        return other is not self

    def __hash__(self):
        return id(self)

    def __lt__(self, other):
        return False

    def __gt__(self, other):
        return other is not self

    def __ge__(self, other):
        # Equal to itself and greater than everything else.
        return True

    def __le__(self, other):
        return other is self


# The single shared sentinel instance used throughout this module.
inherit = Inherit()
def binary_property(parent, name, XPath, get):
    ''' Read the on/off child element w:<name> of parent.

    Returns inherit when parent has no such child. Otherwise the w:val
    attribute of the first match is fetched (with 'on' passed to get() as the
    default) and the result is True for 'on', '1' or 'true' and False for
    anything else. '''
    matches = XPath('./w:%s' % name)(parent)
    if not matches:
        return inherit
    return get(matches[0], 'w:val', 'on') in {'on', '1', 'true'}
def simple_color(col, auto='black'):
    ''' Return '#' + col for a six character value; return auto when col is
    empty, is the literal 'auto' or has any other length. '''
    if col and col != 'auto' and len(col) == 6:
        return '#' + col
    return auto
def simple_float(val, mult=1.0):
    ''' Return float(val) * mult, or None when the conversion fails. '''
    try:
        result = float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        result = None
    return result
def twips(val, mult=0.05):
    ''' Interpret val as a plain number of twentieths of a point, or as a
    number carrying the suffix pt, and return the length in pts.

    Returns None when val cannot be parsed. The pt suffix is only honoured
    when mult still has its default value. '''
    try:
        return float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        pass
    if val and val.endswith('pt') and mult == 0.05:
        # Strip the suffix and re-parse; the number is already in pts.
        return twips(val[:-2], mult=1.0)
    return None
# Maps the border style names read from the w:val attribute of DOCX border
# elements to CSS border-style keywords. The lookups in this file use
# LINE_STYLES.get(name, 'solid'), so names missing here become 'solid'.
LINE_STYLES = { # {{{
'basicBlackDashes': 'dashed',
'basicBlackDots': 'dotted',
'basicBlackSquares': 'dashed',
'basicThinLines': 'solid',
'dashDotStroked': 'groove',
'dashed': 'dashed',
'dashSmallGap': 'dashed',
'dotDash': 'dashed',
'dotDotDash': 'dashed',
'dotted': 'dotted',
'double': 'double',
'inset': 'inset',
'nil': 'none',
'none': 'none',
'outset': 'outset',
'single': 'solid',
'thick': 'solid',
'thickThinLargeGap': 'double',
'thickThinMediumGap': 'double',
'thickThinSmallGap' : 'double',
'thinThickLargeGap': 'double',
'thinThickMediumGap': 'double',
'thinThickSmallGap': 'double',
'thinThickThinLargeGap': 'double',
'thinThickThinMediumGap': 'double',
'thinThickThinSmallGap': 'double',
'threeDEmboss': 'ridge',
'threeDEngrave': 'groove',
'triple': 'double',
} # }}}
# Read from XML {{{
# Attribute name templates for one border edge; '%s' is replaced by an edge
# name. read_single_border() zips these with (padding, width, style, color),
# so the order here matters.
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
# Edge names read for paragraph borders. 'between' is kept last because
# callers use border_edges[:-1] to get just the four box edges.
border_edges = ('left', 'top', 'right', 'bottom', 'between')
def read_single_border(parent, edge, XPath, get):
    ''' Read the w:<edge> border children of parent.

    Returns a dict keyed by the border_props templates (not yet filled in
    with the edge name) holding padding, width, style and color. A value is
    None when the corresponding attribute was absent or unparseable. When
    several matching elements exist, later ones override earlier ones. '''
    color = style = width = padding = None
    for elem in XPath('./w:%s' % edge)(parent):
        raw_color = get(elem, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(elem, 'w:val')
        if raw_style is not None:
            style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(elem, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(elem, 'w:sz')
        if raw_size is not None:
            # we don't care about art borders (they are only used for page borders)
            try:
                # Clamp to the range 2..96 before dividing by 8
                width = min(96, max(2, float(raw_size))) / 8
            except (ValueError, TypeError):
                pass
    return dict(zip(border_props, (padding, width, style, color)))
def read_border(parent, dest, XPath, get, border_edges=border_edges, name='pBdr'):
    ''' Read the w:<name> border container(s) under parent into dest.

    Every edge/property combination is set as an attribute on dest. Those
    for which no value could be read are set to inherit. '''
    values = {}
    for edge in border_edges:
        for template in border_props:
            values[template % edge] = inherit

    for border in XPath('./w:' + name)(parent):
        for edge in border_edges:
            single = read_single_border(border, edge, XPath, get)
            for template, val in iteritems(single):
                if val is not None:
                    values[template % edge] = val

    for attr, val in iteritems(values):
        setattr(dest, attr, val)
def border_to_css(edge, style, css):
    ''' Copy the border settings of one edge from style into the css mapping.

    Reads border_<edge>_style/color/width from style and writes the matching
    border-<edge>-* keys into css, skipping values that are inherit or None.
    Numeric widths are written in pt. '''
    line = getattr(style, 'border_%s_style' % edge)
    color = getattr(style, 'border_%s_color' % edge)
    width = getattr(style, 'border_%s_width' % edge)

    numeric = isinstance(width, numbers.Number)
    if numeric:
        # WebKit needs at least 1pt to render borders and 3pt to render double borders
        minimum = 3 if line == 'double' else 1
        width = max(width, minimum)

    if line is not inherit and line is not None:
        css['border-%s-style' % edge] = line
    if color is not inherit and color is not None:
        css['border-%s-color' % edge] = color
    if width is not inherit and width is not None:
        css['border-%s-width' % edge] = ('%.3gpt' % width) if numeric else width
def read_indent(parent, dest, XPath, get):
    ''' Read w:ind children of parent and set margin_left, margin_right and
    text_indent on dest (each is inherit when nothing usable was found).

    The *Chars attributes are scaled by 0.01 and written in em; the plain
    attributes are scaled by 0.05 and written in pt. For text_indent a
    hanging indent (negated) takes precedence over a first line indent. '''

    def first_length(candidates):
        # Only the first attribute that is present is considered. If it is
        # present but unparseable the result is None; there is no fall through
        # to the next candidate.
        for raw, mult, unit in candidates:
            if raw is not None:
                amount = simple_float(raw, mult)
                return None if amount is None else '%.3g%s' % (amount, unit)
        return None

    def negated(raw):
        return raw if raw is None else '-' + raw

    margin_left = margin_right = text_indent = inherit
    for ind in XPath('./w:ind')(parent):
        left = first_length((
            (get(ind, 'w:leftChars'), 0.01, 'em'),
            (get(ind, 'w:left'), 0.05, 'pt'),
        ))
        if left is not None:
            margin_left = left

        right = first_length((
            (get(ind, 'w:rightChars'), 0.01, 'em'),
            (get(ind, 'w:right'), 0.05, 'pt'),
        ))
        if right is not None:
            margin_right = right

        first = first_length((
            (negated(get(ind, 'w:hangingChars')), 0.01, 'em'),
            (negated(get(ind, 'w:hanging')), 0.05, 'pt'),
            (get(ind, 'w:firstLineChars'), 0.01, 'em'),
            (get(ind, 'w:firstLine'), 0.05, 'pt'),
        ))
        if first is not None:
            text_indent = first

    dest.margin_left = margin_left
    dest.margin_right = margin_right
    dest.text_indent = text_indent
def read_justification(parent, dest, XPath, get):
    ''' Read w:jc children of parent and set text_align on dest.

    'both', 'distribute' and any value containing 'thai' or 'kashida' become
    'justify'. 'left', 'center', 'right', 'start' and 'end' are used as is.
    Any other value is ignored. text_align is inherit when nothing matched. '''
    ans = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
            ans = 'justify'
        elif val in {'left', 'center', 'right', 'start', 'end'}:
            # 'start' and 'end' are passed through unchanged. A former extra
            # branch mapping them to left/right could never run because this
            # branch already matches them, so it has been removed.
            ans = val
    dest.text_align = ans
def read_spacing(parent, dest, XPath, get):
    ''' Read w:spacing children of parent and set margin_top, margin_bottom
    and line_height on dest (each is inherit when nothing usable was found).

    For before/after spacing the *Lines attribute (scaled by 0.02, in ex)
    wins over the plain attribute (scaled by 0.05, in pt), and neither is
    used when the matching *Autospacing attribute is on. '''
    truthy = {'on', '1', 'true'}
    fixed_rules = {'exact', 'atLeast'}

    def gap(spacing, which):
        if get(spacing, 'w:%sAutospacing' % which) in truthy:
            return None
        lines = get(spacing, 'w:%sLines' % which)
        if lines is not None:
            amount, unit = simple_float(lines, 0.02), 'ex'
        else:
            plain = get(spacing, 'w:' + which)
            if plain is None:
                return None
            amount, unit = simple_float(plain, 0.05), 'pt'
        return None if amount is None else '%.3g%s' % (amount, unit)

    margin_top = margin_bottom = line_height = inherit
    for spacing in XPath('./w:spacing')(parent):
        bottom = gap(spacing, 'after')
        if bottom is not None:
            margin_bottom = bottom

        top = gap(spacing, 'before')
        if top is not None:
            margin_top = top

        line = get(spacing, 'w:line')
        rule = get(spacing, 'w:lineRule', 'auto')
        if line is not None:
            if rule in fixed_rules:
                # A fixed height, written in pt
                height, unit = simple_float(line, 0.05), 'pt'
            else:
                # A unitless multiplier
                height, unit = simple_float(line, 1/240.0), ''
            if height is not None:
                line_height = '%.3g%s' % (height, unit)

    dest.margin_top = margin_top
    dest.margin_bottom = margin_bottom
    dest.line_height = line_height
def read_shd(parent, dest, XPath, get):
    ''' Set background_color on dest from the w:fill attribute of the w:shd
    children of parent. An 'auto' or malformed fill becomes 'transparent';
    the value is inherit when there is no usable w:shd. '''
    background = inherit
    for shd in XPath('./w:shd[@w:fill]')(parent):
        fill = get(shd, 'w:fill')
        if not fill:
            continue
        background = simple_color(fill, auto='transparent')
    dest.background_color = background
def read_numbering(parent, dest, XPath, get):
    ''' Set numbering_id (a string) and numbering_level (an int) on dest from
    the w:numPr children of parent. Either is inherit when not specified;
    a level that is not an integer is ignored. '''
    level = numbering_id = inherit
    for num_pr in XPath('./w:numPr')(parent):
        for ilvl in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(ilvl, 'w:val'))
            except (ValueError, TypeError):
                pass
        for num in XPath('./w:numId[@w:val]')(num_pr):
            numbering_id = get(num, 'w:val')
    dest.numbering_id = numbering_id
    dest.numbering_level = level
class Frame(object):
    ''' Frame settings read from a w:framePr element, and their CSS form.

    fp is the element, get the attribute accessor. XPath is accepted for
    symmetry with the other readers but is not used. '''

    all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
            'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')

    @staticmethod
    def _int_attr(get, fp, name, default, divisor=None):
        ''' Integer attribute name of fp, optionally divided by divisor;
        default when the attribute is missing or not an integer. '''
        try:
            val = int(get(fp, name))
        except (ValueError, TypeError):
            return default
        return val if divisor is None else val / divisor

    def __init__(self, fp, XPath, get):
        num = self._int_attr
        self.drop_cap = get(fp, 'w:dropCap', 'none')
        # Lengths are stored divided by 20 and later written out in pt
        self.h = num(get, fp, 'w:h', 0, 20.0)
        # Unlike the other lengths, a missing width is None, not 0
        self.w = num(get, fp, 'w:w', None, 20.0)
        self.x = num(get, fp, 'w:x', 0, 20.0)
        self.y = num(get, fp, 'w:y', 0, 20.0)

        self.h_anchor = get(fp, 'w:hAnchor', 'page')
        self.h_rule = get(fp, 'w:hRule', 'auto')
        self.v_anchor = get(fp, 'w:vAnchor', 'page')
        self.wrap = get(fp, 'w:wrap', 'around')
        self.x_align = get(fp, 'w:xAlign')
        self.y_align = get(fp, 'w:yAlign')

        self.h_space = num(get, fp, 'w:hSpace', 0, 20.0)
        self.v_space = num(get, fp, 'w:vSpace', 0, 20.0)
        self.lines = num(get, fp, 'w:lines', 1)

    def css(self, page):
        ''' Return a dict of CSS properties for this frame. page must have a
        width attribute, used to decide the float side when no explicit
        horizontal alignment is set. '''
        is_dropcap = self.drop_cap in {'drop', 'margin'}
        ans = {'overflow': 'hidden'}

        if is_dropcap:
            ans['float'] = 'left'
            ans['margin'] = '0'
            ans['padding-right'] = '0.2em'
        else:
            if self.h_rule != 'auto':
                t = 'min-height' if self.h_rule == 'atLeast' else 'height'
                ans[t] = '%.3gpt' % self.h
            if self.w is not None:
                ans['width'] = '%.3gpt' % self.w
            ans['padding-top'] = ans['padding-bottom'] = '%.3gpt' % self.v_space
            if self.wrap not in {None, 'none'}:
                ans['padding-left'] = ans['padding-right'] = '%.3gpt' % self.h_space
                if self.x_align is None:
                    # Float towards whichever half of the page the frame starts in
                    fl = 'left' if self.x/page.width < 0.5 else 'right'
                else:
                    fl = 'right' if self.x_align == 'right' else 'left'
                ans['float'] = fl
        return ans

    def __eq__(self, other):
        # Attributes missing on other are treated as inherit, so a non-Frame
        # compares unequal.
        for x in self.all_attributes:
            if getattr(other, x, inherit) != getattr(self, x):
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)
def read_frame(parent, dest, XPath, get):
    ''' Set dest.frame to a Frame built from the last w:framePr child of
    parent, or to inherit when there is none. '''
    frame = inherit
    for frame_pr in XPath('./w:framePr')(parent):
        frame = Frame(frame_pr, XPath, get)
    dest.frame = frame
# }}}
class ParagraphStyle(object):
    ''' Paragraph level formatting, optionally read from a w:pPr element.

    Every name in all_properties is an instance attribute holding either a
    concrete value or the inherit sentinel. The css and border_key
    properties are computed once and then cached on the instance. '''

    all_properties = (
        'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
        'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
        'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
        'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',

        # Border margins padding
        'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
        'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
        'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
        'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
        'border_between_width', 'border_between_style', 'border_between_color', 'padding_between',
        'margin_left', 'margin_top', 'margin_right', 'margin_bottom',

        # Misc.
        'text_indent', 'text_align', 'line_height', 'background_color',
        'numbering_id', 'numbering_level', 'font_family', 'font_size', 'color', 'frame',
        'cs_font_size', 'cs_font_family',
    )

    def __init__(self, namespace, pPr=None):
        self.namespace = namespace
        self.linked_style = None
        if pPr is None:
            # Nothing to read: everything is inherited
            for name in self.all_properties:
                setattr(self, name, inherit)
        else:
            XPath, get = namespace.XPath, namespace.get
            toggles = (
                'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
                'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
                'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
                'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
            )
            for name in toggles:
                setattr(self, name, binary_property(pPr, name, XPath, get))

            # The readers set their attributes directly on this object
            for key in ('border', 'indent', 'justification', 'spacing', 'shd', 'numbering', 'frame'):
                read_funcs[key](pPr, self, XPath, get)

            for elem in XPath('./w:pStyle[@w:val]')(pPr):
                self.linked_style = get(elem, 'w:val')

            # These are never read from the w:pPr element itself
            self.font_family = self.font_size = self.color = self.cs_font_size = self.cs_font_family = inherit

        self._css = None
        self._border_key = None

    def update(self, other):
        ''' Overwrite properties with every non-inherit value from other. '''
        for name in self.all_properties:
            incoming = getattr(other, name)
            if incoming is not inherit:
                setattr(self, name, incoming)
        if other.linked_style is not None:
            self.linked_style = other.linked_style

    def resolve_based_on(self, parent):
        ''' Fill in properties that are still inherit from parent. '''
        for name in self.all_properties:
            if getattr(self, name) is inherit:
                setattr(self, name, getattr(parent, name))

    @property
    def css(self):
        ''' An OrderedDict of CSS properties for this style (cached). '''
        if self._css is not None:
            return self._css

        c = self._css = OrderedDict()
        page_breaks = (
            ('keepLines', 'page-break-inside', 'avoid'),
            ('pageBreakBefore', 'page-break-before', 'always'),
            ('keepNext', 'page-break-after', 'avoid'),
        )
        for flag, prop, value in page_breaks:
            if getattr(self, flag) is True:
                c[prop] = value

        for edge in ('left', 'top', 'right', 'bottom'):
            border_to_css(edge, self, c)
            padding = getattr(self, 'padding_%s' % edge)
            if padding is not inherit:
                c['padding-%s' % edge] = '%.3gpt' % padding
            margin = getattr(self, 'margin_%s' % edge)
            if margin is not inherit:
                c['margin-%s' % edge] = margin

        if self.line_height not in {inherit, '1'}:
            c['line-height'] = self.line_height

        for name in ('text_indent', 'background_color', 'font_family', 'font_size', 'color'):
            val = getattr(self, name)
            if val is inherit:
                continue
            if name == 'font_size':
                val = '%.3gpt' % val
            c[name.replace('_', '-')] = val

        align = self.text_align
        if align is not inherit:
            if self.bidi is True:
                # Swap left and right for bidi paragraphs
                align = {'left': 'right', 'right': 'left'}.get(align, align)
            c['text-align'] = align

        return c

    @property
    def border_key(self):
        ''' A tuple of all border related values, for comparing the borders
        of two styles (cached). '''
        if self._border_key is None:
            self._border_key = tuple(
                getattr(self, template % edge)
                for edge in border_edges for template in border_props)
        return self._border_key

    def has_identical_borders(self, other_style):
        return getattr(other_style, 'border_key', None) == self.border_key

    def clear_borders(self):
        ''' Reset width, color and style of the four box edges to inherit. '''
        for edge in border_edges[:-1]:
            for part in ('width', 'color', 'style'):
                setattr(self, 'border_%s_%s' % (edge, part), inherit)

    def clone_border_styles(self):
        ''' Return a new ParagraphStyle carrying only this style's width,
        color and style for the four box edges. '''
        clone = ParagraphStyle(self.namespace)
        for edge in border_edges[:-1]:
            for part in ('width', 'color', 'style'):
                name = 'border_%s_%s' % (edge, part)
                setattr(clone, name, getattr(self, name))
        return clone

    def apply_between_border(self):
        ''' Use the "between" border as the bottom border. '''
        for part in ('width', 'color', 'style'):
            between = getattr(self, 'border_between_%s' % part)
            setattr(self, 'border_bottom_%s' % part, between)

    def has_visible_border(self):
        ''' True if any box edge has a non-zero width and a style other than
        'none'. '''
        for edge in border_edges[:-1]:
            width = getattr(self, 'border_%s_width' % edge)
            line = getattr(self, 'border_%s_style' % edge)
            if width is inherit or not width:
                continue
            if line is not inherit and line != 'none':
                return True
        return False
# Registry of the module level names starting with 'read_' that exist at this
# point, keyed by the name with that prefix removed (e.g. 'border' maps to
# read_border). ParagraphStyle.__init__ looks its readers up here, so this
# line has to stay after all the read_* definitions.
read_funcs = {k[5:]:v for k, v in iteritems(globals()) if k.startswith('read_')}

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.block_styles import ( # noqa
inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
# Read from XML {{{
def read_text_border(parent, dest, XPath, get):
    ''' Read the w:bdr children of parent and set border_color, border_style,
    border_width and padding on dest.

    When the first w:bdr element carries any attribute at all, color, style
    and width start from simple_color('auto'), 'none' and 1 instead of
    inherit. Attributes found on the elements then override these. '''
    color = line = width = padding = inherit
    borders = XPath('./w:bdr')(parent)
    if borders and borders[0].attrib:
        color, line, width = simple_color('auto'), 'none', 1

    for border in borders:
        raw_color = get(border, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(border, 'w:val')
        if raw_style is not None:
            line = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(border, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(border, 'w:sz')
        if raw_size is not None:
            # we don't care about art borders (they are only used for page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                width = min(96, max(8, float(raw_size))) / 8
            except (ValueError, TypeError):
                pass

    dest.border_color = color
    dest.border_style = line
    dest.border_width = width
    dest.padding = padding
def read_color(parent, dest, XPath, get):
    ''' Set dest.color from the w:color children of parent (inherit when
    there is none with a non-empty w:val). '''
    color = inherit
    for elem in XPath('./w:color[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if raw:
            color = simple_color(raw)
    dest.color = color
def convert_highlight_color(val):
    ''' Translate the highlight color names listed below to hex colors.
    Any other value is returned unchanged. '''
    hex_for_name = {
        'darkBlue': '#000080',
        'darkCyan': '#008080',
        'darkGray': '#808080',
        'darkGreen': '#008000',
        'darkMagenta': '#800080',
        'darkRed': '#800000',
        'darkYellow': '#808000',
        'lightGray': '#c0c0c0',
    }
    return hex_for_name.get(val, val)
def read_highlight(parent, dest, XPath, get):
    ''' Set dest.highlight from the w:highlight children of parent: 'none'
    becomes 'transparent', other values go through
    convert_highlight_color(). inherit when nothing usable is found. '''
    highlight = inherit
    for elem in XPath('./w:highlight[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        highlight = 'transparent' if raw == 'none' else convert_highlight_color(raw)
    dest.highlight = highlight
def read_lang(parent, dest, XPath, get):
    ''' Set dest.lang from the w:lang children of parent.

    A value that parses as a hexadecimal number is looked up in the lcid
    table and used only if the table has an entry for it. Any other value is
    used as is. inherit when nothing usable is found. '''
    lang = inherit
    for elem in XPath('./w:lang[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        try:
            code = int(raw, 16)
        except (ValueError, TypeError):
            # Not a number, so take it to be a language code already
            lang = raw
            continue
        # NOTE(review): short codes made only of hex digits (e.g. 'de', 'fa')
        # also end up here and are dropped unless lcid has that number --
        # confirm whether that is intended.
        from calibre.ebooks.docx.lcid import lcid
        mapped = lcid.get(code, None)
        if mapped:
            lang = mapped
    dest.lang = lang
def read_letter_spacing(parent, dest, XPath, get):
    ''' Set dest.letter_spacing to the w:val of the w:spacing children of
    parent scaled by 0.05, or inherit when none parses as a number. '''
    spacing = inherit
    for elem in XPath('./w:spacing[@w:val]')(parent):
        parsed = simple_float(get(elem, 'w:val'), 0.05)
        if parsed is None:
            continue
        spacing = parsed
    dest.letter_spacing = spacing
def read_underline(parent, dest, XPath, get):
    ''' Set dest.text_decoration from the w:u children of parent: 'none'
    stays 'none', every other non-empty value becomes 'underline'. inherit
    when there is no usable w:u. '''
    decoration = inherit
    for elem in XPath('./w:u[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        decoration = 'none' if raw == 'none' else 'underline'
    dest.text_decoration = decoration
def read_vert_align(parent, dest, XPath, get):
    ''' Set dest.vert_align to 'baseline', 'subscript' or 'superscript' from
    the w:vertAlign children of parent. Other values are ignored, leaving
    inherit. '''
    recognised = {'baseline', 'subscript', 'superscript'}
    align = inherit
    for elem in XPath('./w:vertAlign[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if raw in recognised:
            align = raw
    dest.vert_align = align
def read_position(parent, dest, XPath, get):
    ''' Set dest.position to half the numeric w:val of the w:position
    children of parent, or inherit when none parses as a number. '''
    ans = inherit
    for col in XPath('./w:position[@w:val]')(parent):
        val = get(col, 'w:val')
        try:
            ans = float(val)/2.0
        except (ValueError, TypeError):
            # Only conversion failures are expected here; anything else
            # should surface, as in the other readers in this file.
            pass
    dest.position = ans
def read_font(parent, dest, XPath, get):
    ''' Set font_family and font_size on dest.

    The family comes from the w:rFonts children of parent. A w:asciiTheme
    value is wrapped in '|' characters and takes precedence over w:ascii.
    The size is the first parseable w:sz value scaled by 0.5. Either is
    inherit when not found. '''
    family = inherit
    for fonts in XPath('./w:rFonts')(parent):
        theme = get(fonts, 'w:asciiTheme')
        name = ('|%s|' % theme) if theme else get(fonts, 'w:ascii')
        if name:
            family = name
    dest.font_family = family

    size = inherit
    for elem in XPath('./w:sz[@w:val]')(parent):
        parsed = simple_float(get(elem, 'w:val'), 0.5)
        if parsed is not None:
            # The first usable size wins
            size = parsed
            break
    dest.font_size = size
def read_font_cs(parent, dest, XPath, get):
    ''' Set cs_font_family and cs_font_size on dest.

    The family comes from the w:rFonts children of parent. A w:csTheme value
    is wrapped in '|' characters and takes precedence over w:cs. The size is
    the first parseable size value scaled by 0.5. Either is inherit when not
    found. '''
    ff = inherit
    for col in XPath('./w:rFonts')(parent):
        val = get(col, 'w:csTheme')
        if val:
            val = '|%s|' % val
        else:
            val = get(col, 'w:cs')
        if val:
            ff = val
    dest.cs_font_family = ff

    size = inherit
    # NOTE(review): WordprocessingML spells this element w:szCs; confirm
    # whether the XPath below, which uses 'szCS', can ever match.
    for col in XPath('./w:szCS[@w:val]')(parent):
        val = simple_float(get(col, 'w:val'), 0.5)
        if val is not None:
            size = val
            break
    # This used to be stored in font_size, which overwrote the value set by
    # read_font() and left cs_font_size undefined on dest.
    dest.cs_font_size = size
# }}}
class RunStyle(object):
    ''' Character (run) level formatting, optionally read from a w:rPr
    element.

    Every name in all_properties is an instance attribute holding either a
    concrete value or the inherit sentinel. The css property is computed
    once and then cached on the instance. '''

    all_properties = {
        'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
        'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', 'webHidden',

        'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
        'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', 'font_family', 'position',
        'cs_font_size', 'cs_font_family'
    }

    toggle_properties = {
        'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish',
    }

    def __init__(self, namespace, rPr=None):
        self.namespace = namespace
        self.linked_style = None
        if rPr is None:
            # Nothing to read: everything is inherited
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            X, g = namespace.XPath, namespace.get
            for p in (
                'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
                'smallCaps', 'strike', 'vanish', 'webHidden',
            ):
                setattr(self, p, binary_property(rPr, p, X, g))

            # Each reader sets its attributes directly on this object
            read_font(rPr, self, X, g)
            read_font_cs(rPr, self, X, g)
            read_text_border(rPr, self, X, g)
            read_color(rPr, self, X, g)
            read_highlight(rPr, self, X, g)
            read_shd(rPr, self, X, g)
            read_letter_spacing(rPr, self, X, g)
            read_underline(rPr, self, X, g)
            read_vert_align(rPr, self, X, g)
            read_position(rPr, self, X, g)
            read_lang(rPr, self, X, g)

            for s in X('./w:rStyle[@w:val]')(rPr):
                self.linked_style = g(s, 'w:val')

        self._css = None

    def update(self, other):
        ''' Overwrite properties with every non-inherit value from other. '''
        for prop in self.all_properties:
            nval = getattr(other, prop)
            if nval is not inherit:
                setattr(self, prop, nval)
        if other.linked_style is not None:
            self.linked_style = other.linked_style

    def resolve_based_on(self, parent):
        ''' Fill in properties that are still inherit from parent. '''
        for p in self.all_properties:
            val = getattr(self, p)
            if val is inherit:
                setattr(self, p, getattr(parent, p))

    def get_border_css(self, ans):
        ''' Add the border-color/style/width entries that are not inherit to
        the mapping ans. The same mapping is returned for convenience. '''
        for x in ('color', 'style', 'width'):
            val = getattr(self, 'border_'+x)
            if x == 'width' and val is not inherit:
                val = '%.3gpt' % val
            if val is not inherit:
                ans['border-%s' % x] = val
        return ans

    def clear_border_css(self):
        ''' Reset border color, style and width to inherit. '''
        for x in ('color', 'style', 'width'):
            setattr(self, 'border_'+x, inherit)

    @property
    def css(self):
        ''' An OrderedDict of CSS properties for this style (cached). '''
        if self._css is None:
            c = self._css = OrderedDict()
            td = set()
            if self.text_decoration is not inherit:
                td.add(self.text_decoration)
            if self.strike and self.strike is not inherit:
                td.add('line-through')
            if self.dstrike and self.dstrike is not inherit:
                td.add('line-through')
            if td:
                # Sorted so that the generated value does not depend on set
                # iteration order
                c['text-decoration'] = ' '.join(sorted(td))
            if self.caps is True:
                c['text-transform'] = 'uppercase'
            if self.i is True:
                c['font-style'] = 'italic'
            if self.shadow and self.shadow is not inherit:
                c['text-shadow'] = '2px 2px'
            if self.smallCaps is True:
                c['font-variant'] = 'small-caps'
            if self.vanish is True or self.webHidden is True:
                c['display'] = 'none'

            self.get_border_css(c)
            if self.padding is not inherit:
                c['padding'] = '%.3gpt' % self.padding

            for x in ('color', 'background_color'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = val

            for x in ('letter_spacing', 'font_size'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = '%.3gpt' % val

            if self.position is not inherit:
                c['vertical-align'] = '%.3gpt' % self.position

            # A highlight overrides any background color set above
            if self.highlight is not inherit and self.highlight != 'transparent':
                c['background-color'] = self.highlight

            # The inherit sentinel is truthy, so it has to be excluded
            # explicitly, as is done for strike and shadow above
            if self.b and self.b is not inherit:
                c['font-weight'] = 'bold'

            if self.font_family is not inherit:
                c['font-family'] = self.font_family

        return self._css

    def same_border(self, other):
        ''' True if this style and other produce the same border CSS. '''
        return self.get_border_css({}) == other.get_border_css({})

View File

@@ -0,0 +1,235 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from polyglot.builtins import itervalues, range
# A no-break space; cleanup_markup() uses it as the text of otherwise empty
# page-break paragraphs.
NBSP = '\xa0'
def mergeable(previous, current):
    ''' Return True if the element current can be merged into previous.

    That requires that neither has tail text, that current has no id, that
    both agree on the class, style, lang and dir attributes and that current
    is the sibling directly following previous. '''
    if previous.tail or current.tail or current.get('id', False):
        return False
    for attr in ('class', 'style', 'lang', 'dir'):
        if previous.get(attr) != current.get(attr):
            return False
    # None when previous has no following sibling at all
    following = next(previous.itersiblings(), None)
    return following is current
def append_text(parent, text):
    ''' Add text at the very end of the content of parent: after the last
    child if there is one, otherwise after the existing text of parent. '''
    if len(parent) == 0:
        parent.text = (parent.text or '') + text
        return
    last = parent[-1]
    last.tail = (last.tail or '') + text
def merge(parent, span):
    ''' Move the text, the children and the tail text of span to the end of
    parent, then remove span from the tree. '''
    text, tail = span.text, span.tail
    if text:
        append_text(parent, text)
    # Appending moves each child out of span, so iterate over a snapshot
    for child in list(span):
        parent.append(child)
    if tail:
        append_text(parent, tail)
    span.getparent().remove(span)
def merge_run(run):
    ''' Merge every element of run after the first into the first one. '''
    target, rest = run[0], run[1:]
    for span in rest:
        merge(target, span)
def liftable(css):
    ''' A <span> is liftable if all its styling would work just as well if it
    is specified on the parent element. This is decided from the part of
    each property name before the first '-'. '''
    inheritable = {'text', 'font', 'letter', 'color', 'background'}
    return all(name.partition('-')[0] in inheritable for name in css)
def add_text(elem, attr, text):
    ''' Append text to the attribute attr of elem, treating a missing
    (None or empty) current value as the empty string. '''
    setattr(elem, attr, (getattr(elem, attr) or '') + text)
def lift(span):
    ''' Replace an element by its content (text, children and tail). '''
    parent = span.getparent()
    idx = parent.index(span)
    last_child = span[-1] if len(span) else None

    def attach_before_idx(text):
        # Put text directly in front of whatever sits at position idx: at the
        # end of the parent's own text for the first position, otherwise
        # after the preceding sibling.
        if idx == 0:
            add_text(parent, 'text', text)
        else:
            add_text(parent[idx - 1], 'tail', text)

    if span.text:
        attach_before_idx(span.text)

    # Inserting in reverse at the same index keeps the children in order
    for child in reversed(span):
        parent.insert(idx, child)
    parent.remove(span)

    if span.tail:
        if last_child is None:
            # No children were moved, so the tail directly follows the text
            attach_before_idx(span.tail)
        else:
            add_text(last_child, 'tail', span.tail)
def before_count(root, tag, limit=10):
    ''' Count the descendants of the first <body> that come before tag.

    Returns limit when there is no <body>, when more than limit elements
    precede tag, or when tag is not found among the descendants at all, so
    the result is always a number. '''
    body = root.xpath('//body[1]')
    if not body:
        return limit
    ans = 0
    for elem in body[0].iterdescendants():
        if elem is tag:
            return ans
        ans += 1
        if ans > limit:
            return limit
    # tag is not inside the body. This used to fall off the end and return
    # None, which callers compare against a number.
    return limit
def wrap_contents(tag_name, elem):
    ''' Wrap the whole content of elem (its text and all its children) in a
    new child element named tag_name. '''
    wrapper = elem.makeelement(tag_name)
    wrapper.text = elem.text
    elem.text = ''
    # Iterate over a snapshot since the children are removed as we go
    for child in list(elem):
        elem.remove(child)
        wrapper.append(child)
    elem.append(wrapper)
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    ''' Tidy the generated HTML tree rooted at root, in place.

    The steps, in order: wrap vertically aligned spans, move trailing <hr>s
    out of their paragraphs, merge adjacent identically styled spans,
    normalise dir attributes, fold lone spans into their parent block, turn
    bold/italic-only spans into <b>/<i>, drop unstyled spans and convert
    <br> based page breaks.

    If detect_cover is true and the first image of the document looks like
    a cover, the <img> is removed from the tree and the path of the image
    file (inside dest_dir) is returned. In every other case the return
    value is None. '''
    # Apply vertical-align
    for span in root.xpath('//span[@data-docx-vert]'):
        wrap_contents(span.attrib.pop('data-docx-vert'), span)

    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # Only possible when the <hr> is the very last thing in the block
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]
    # The run still open when the loop ends also has to be merged; it used
    # to be silently dropped.
    if len(current_run) > 1:
        merge_run(current_run)

    # Process dir attributes
    # NOTE(review): presumably the values of styles.classes are
    # (class name, css dict) pairs -- confirm against the styles object.
    class_map = dict(itervalues(styles.classes))
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
        # Ensure that children of rtl parents that are not rtl have an
        # explicit dir set. Also, remove dir from children if it is the same as
        # that of the parent.
        if len(parent):
            parent_dir = parent.get('dir')
            for child in parent.iterchildren('span'):
                child_dir = child.get('dir')
                if parent_dir == 'rtl' and child_dir != 'rtl':
                    child_dir = 'ltr'
                    child.set('dir', child_dir)
                if child_dir and child_dir == parent_dir:
                    child.attrib.pop('dir')

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            span_dir = span.get('dir')
            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                if span.get('dir'):
                    parent.set('dir', span.get('dir'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.imghdr import identify
                try:
                    # NOTE(review): lopen is not defined or imported in this
                    # file; presumably the host application provides it as a
                    # builtin -- confirm.
                    with lopen(path, 'rb') as imf:
                        fmt, width, height = identify(imf)
                except Exception:
                    # Best effort: an unreadable image is just not a cover
                    width, height, fmt = 0, 0, None  # noqa
                del fmt
                try:
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path

View File

@@ -0,0 +1,268 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil
from lxml import etree
from calibre import walk, guess_type
from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.docx import InvalidDOCX
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile
from calibre.utils.xml_parse import safe_xml_fromstring
def fromstring(raw, parser=None):
    ''' Parse raw with safe_xml_fromstring() and return the result. The
    parser argument is accepted but not used. '''
    root = safe_xml_fromstring(raw)
    return root
# Read metadata {{{
def read_doc_props(raw, mi, XPath):
    ''' Fill the metadata object mi from the document properties XML in raw.

    Sets title, tags, authors (with author_sort), comments and languages on
    mi, each only when a usable value is present in the XML. '''
    root = fromstring(raw)

    def has_text(elem):
        # True when the element has text that is not just whitespace
        return bool(elem.text and elem.text.strip())

    titles = XPath('//dc:title')(root)
    if titles and has_text(titles[0]):
        mi.title = titles[0].text.strip()

    tags = []
    for subject in XPath('//dc:subject')(root):
        if has_text(subject):
            # A subject is a single tag, so it must not contain the separator
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if has_text(keywords):
            # Keywords are split on whitespace first and then on commas
            for word in keywords.text.split():
                tags.extend(part.strip() for part in word.split(',') if part.strip())
    if tags:
        mi.tags = tags

    authors = []
    for creator in XPath('//dc:creator')(root):
        if has_text(creator):
            authors.extend(string_to_authors(creator.text))
    if authors:
        mi.authors = authors
        mi.author_sort = authors_to_sort_string(authors)

    descriptions = XPath('//dc:description')(root)
    if descriptions:
        text = etree.tostring(descriptions[0], method='text', encoding='unicode')
        text = text.replace('_x000d_', '')  # Word 2007 mangles newlines in the summary
        mi.comments = text.strip()

    languages = []
    for elem in XPath('//dc:language')(root):
        if has_text(elem):
            canonical = canonicalize_lang(elem.text)
            if canonical:
                languages.append(canonical)
    if languages:
        mi.languages = languages
def read_app_props(raw, mi):
    ''' Set mi.publisher from the first Company element (in any namespace)
    of the application properties XML in raw, if it has non-blank text. '''
    root = fromstring(raw)
    companies = root.xpath('//*[local-name()="Company"]')
    if not companies:
        return
    name = companies[0].text
    if name and name.strip():
        mi.publisher = name.strip()
def read_default_style_language(raw, mi, XPath):
    ''' Set mi.languages to the first default run language in the styles XML
    in raw that canonicalize_lang() recognises. mi is left untouched when
    there is none. '''
    root = fromstring(raw)
    candidates = XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root)
    for candidate in candidates:
        canonical = canonicalize_lang(candidate)
        if not canonical:
            continue
        mi.languages = [canonical]
        break
# }}}
class DOCX(object):
    """Access to the parts of a DOCX package.

    The package is either unpacked into a temporary directory
    (``extract=True``) or read in place through a ``ZipFile``
    (``extract=False``). On construction the content types and the
    package-level relationships are parsed; ``InvalidDOCX`` is raised when
    either of those parts is missing.
    """

    def __init__(self, path_or_stream, log=None, extract=True):
        """Open the package.

        :param path_or_stream: A filesystem path or an object with ``read``.
        :param log: Logger to use, ``default_log`` when not given.
        :param extract: Unpack to a temporary directory instead of reading
            directly from the ZIP file.
        """
        self.docx_is_transitional = True
        opened_here = not hasattr(path_or_stream, 'read')
        stream = open(path_or_stream, 'rb') if opened_here else path_or_stream
        # Only a file this object opened itself is ever closed by it; a
        # stream handed in by the caller stays under the caller's control.
        self._owned_stream = stream if opened_here else None
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        try:
            if extract:
                self.extract(stream)
                # Everything is on disk now, the source file is not needed
                self._close_owned_stream()
            else:
                self.init_zipfile(stream)
            self.read_content_types()
            self.read_package_relationships()
        except Exception:
            self._close_owned_stream()
            raise
        self.namespace = DOCXNamespace(self.docx_is_transitional)

    def _close_owned_stream(self):
        """Close the file opened by ``__init__``, if any. Safe to call repeatedly."""
        stream = getattr(self, '_owned_stream', None)
        if stream is not None:
            self._owned_stream = None
            stream.close()

    def init_zipfile(self, stream):
        """Prepare for reading parts directly from the ZIP ``stream``."""
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        """Unpack ``stream`` into a temporary directory and index the files.

        ``self.names`` maps each part name (``/`` separated, relative to the
        package root) to the path of the extracted file.
        """
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not mistaken for a damaged archive
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        """Return True if the package has a part called ``name``."""
        return name in self.names

    def read(self, name):
        """Return the raw bytes of the part ``name``.

        Raises ``KeyError`` in extracted mode when there is no such part.
        """
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        """Parse ``[Content_Types].xml`` into the per-part and per-extension maps."""
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        """Return the content type of ``name``.

        An explicit override wins, then the default for the file extension,
        then whatever ``guess_type`` reports.
        """
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        """Parse ``_rels/.rels`` into type->target and target->type maps.

        Also records whether the package is transitional, based on the
        relationship type used for ``word/document.xml``.
        """
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            if target == 'word/document.xml':
                self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        """Name of the main document part.

        Taken from the package relationships, falling back to any part
        called ``document.xml``. Raises ``InvalidDOCX`` if neither exists.
        """
        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        """The parsed main document part."""
        return fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        """The ``(by_id, by_type)`` relationships of the main document."""
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        """Return ``(by_id, by_type)`` target maps for the part ``name``.

        Internal targets are made relative to the package root; external
        targets and fragment-only targets are returned unchanged. Both maps
        are empty when the part has no relationships file.
        """
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target

        return by_id, by_type

    def get_document_properties_names(self):
        """Yield the names of the core and the app properties parts, in that order.

        Each name comes from the package relationships, falling back to a
        case-insensitive match on the conventional ``docProps/`` location.
        ``None`` is yielded for a part that cannot be found.
        """
        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        yield name
        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        yield name

    @property
    def metadata(self):
        """Metadata read from the core properties, default style and app properties."""
        mi = Metadata(_('Unknown'))
        dp_name, ap_name = self.get_document_properties_names()
        if dp_name:
            try:
                raw = self.read(dp_name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi, self.namespace.XPath)
        if mi.is_null('language'):
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi, self.namespace.XPath)

        # ap_name already carries the docProps/app.xml fallback. It must not
        # be recomputed from the relationships alone, which would lose it.
        if ap_name:
            try:
                raw = self.read(ap_name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        """Release the ZIP file or remove the temporary directory."""
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
        self._close_owned_stream()
if __name__ == '__main__':
    # Debugging aid: print the metadata of the file named last on the
    # command line, reading it straight from the ZIP container
    print(DOCX(sys.argv[-1], extract=False).metadata)

View File

@@ -0,0 +1,276 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.docx.index import process_index, polish_index_markup
from polyglot.builtins import iteritems, native_string_type
class Field(object):
    """One Word field, assembled piece by piece while the document is scanned.

    ``start`` and ``end`` are the delimiting elements, ``contents`` the
    elements seen inside the field. The first word of the instruction text
    becomes ``name``; the remainder is available as ``instructions`` once
    ``finalize()`` has been called.
    """

    def __init__(self, start):
        self.start, self.end = start, None
        self.contents, self.buf = [], []
        self.instructions = self.name = None

    def add_instr(self, elem):
        """Add the text of an instruction element."""
        self.add_raw(elem.text)

    def add_raw(self, raw):
        """Add a piece of instruction text; empty pieces are ignored."""
        if raw:
            if self.name is None:
                # Only leading whitespace is removed: partial index entries
                # can end in a significant space (<>Summary <> ... <>Hearing<>).
                # No case of a significant leading space is known so far.
                head, _sep, raw = raw.lstrip().partition(' ')
                self.name = head
            self.buf.append(raw)

    def finalize(self):
        """Join the collected pieces into ``instructions`` and drop the buffer."""
        self.instructions = ''.join(self.buf)
        del self.buf
# Token types produced by the field instruction scanner
WORD = 0
FLAG = 1


def _flag_token(scanner, text):
    # A flag of the form \x
    return text, FLAG


def _quoted_word_token(scanner, text):
    # A quoted word, reported without the surrounding quotes
    return text[1:-1], WORD


def _bare_word_token(scanner, text):
    # A non-quoted word, must not start with a backslash or a space or a quote
    return text, WORD


scanner = re.Scanner([
    (r'\\\S{1}', _flag_token),
    (r'"[^"]*"', _quoted_word_token),
    (r'[^\s\\"]\S*', _bare_word_token),
    (r'\s+', None),  # Whitespace between tokens is discarded
], flags=re.DOTALL)

# Marker for flags that the field map does not know about
null = object()
def parser(name, field_map, default_field_name=None):
    """Build a function that parses the instruction text of one field type.

    :param name: Field type, only used to name the returned function.
    :param field_map: Space separated ``flag:option`` pairs mapping a single
        character flag to the key used for it in the result.
    :param default_field_name: Key under which a word that does not follow
        a flag is stored.
    :return: ``parse(raw, log=None)``, returning a dict of options. A flag
        without a value maps to ``None``; unknown flags and their values
        are dropped.
    """
    flag_names = {}
    for spec in field_map.split():
        flag, option = spec.split(':')
        flag_names[flag] = option

    def parse(raw, log=None):
        ans = {}
        pending = None
        # Hide escaped backslashes and quotes from the scanner
        protected = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
        tokens = scanner.scan(protected)[0]
        for token, token_type in tokens:
            token = token.replace('\x01', '\\').replace('\x02', '"')
            if token_type is FLAG:
                pending = flag_names.get(token[1], null)
                if pending is not None:
                    ans[pending] = None
            elif token_type is WORD:
                key = default_field_name if pending is None else pending
                ans[key] = token
                pending = None
        # Whatever was collected for unknown flags is discarded
        ans.pop(null, None)
        return ans

    parse.__name__ = native_string_type('parse_' + name)

    return parse
# Instruction parsers for the supported field types. Each field map lists
# the flags of that field as flag:option pairs.
parse_hyperlink = parser(
    'hyperlink',
    field_map='l:anchor m:image-map n:target o:title t:target',
    default_field_name='url',
)

parse_xe = parser(
    'xe',
    field_map='b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi',
    default_field_name='text',
)

parse_index = parser(
    'index',
    field_map=(
        'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
        ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
        ' l:page-number-separator p:letter-range s:sequence-name r:run-together y:yomi z:langcode'
    ),
)

parse_ref = parser(
    'ref',
    field_map='d:separator f:footnote h:hyperlink n:number p:position r:relative-number t:suppress w:number-full-context',
)

parse_noteref = parser(
    'noteref',
    field_map='f:footnote h:hyperlink p:position',
)
class Fields(object):
    """Finds the fields of a document and parses the supported field types.

    Calling the instance scans a document. The results are stored in one
    ``<type>_fields`` list per supported type (hyperlink, xe, index, ref,
    noteref).
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.fields = []
        self.index_bookmark_prefix = 'index-'
        self.index_bookmark_counter = 0

    def __call__(self, doc, log):
        """Collect all fields in ``doc`` and parse those of a supported type."""
        all_ids = frozenset(self.namespace.XPath('//*/@w:id')(doc))
        c = 0
        # Change the bookmark prefix for as long as it clashes with an id
        # already present in the document
        while self.index_bookmark_prefix in all_ids:
            c += 1
            self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % c)
        self._collect_fields(doc)
        self._parse_fields(log)

    def _collect_fields(self, doc):
        # Walk the relevant elements in document order and build Field objects
        ns = self.namespace
        open_fields = []  # Complex fields that were begun but not yet ended
        relevant = ns.XPath(
            '//*[name()="w:p" or name()="w:r" or'
            ' name()="w:instrText" or'
            ' (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end") or'
            ' name()="w:fldSimple")]')
        for elem in relevant(doc):
            tag = elem.tag
            if tag.endswith('}fldChar'):
                if ns.get(elem, 'w:fldCharType') == 'begin':
                    field = Field(elem)
                    open_fields.append(field)
                    self.fields.append(field)
                elif open_fields:
                    # An end marker without a matching begin is ignored
                    open_fields.pop().end = elem
            elif tag.endswith('}instrText'):
                if open_fields:
                    open_fields[-1].add_instr(elem)
            elif tag.endswith('}fldSimple'):
                field = Field(elem)
                instr = ns.get(elem, 'w:instr')
                if instr:
                    field.add_raw(instr)
                self.fields.append(field)
                field.contents.extend(ns.XPath('descendant::w:r')(elem))
            elif open_fields:
                open_fields[-1].contents.append(elem)

    def _parse_fields(self, log):
        # Dispatch every collected field to the handler for its type
        field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')
        parsers, field_parsers = {}, {}
        for ftype in field_types:
            handler = getattr(self, 'parse_' + ftype)
            instruction_parser = globals()['parse_%s' % ftype]
            # Field names are accepted fully upper or fully lower case
            for key in (ftype.upper(), ftype):
                parsers[key] = handler
                field_parsers[key] = instruction_parser
        for ftype in field_types:
            setattr(self, '%s_fields' % ftype, [])
        unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'}  # The TOC and PAGEREF fields are handled separately

        for field in self.fields:
            field.finalize()
            if not field.instructions:
                continue
            func = parsers.get(field.name, None)
            if func is not None:
                func(field, field_parsers[field.name], log)
            elif field.name not in unknown_fields:
                log.warn('Encountered unknown field: %s, ignoring it.' % field.name)
                # Warn only once per unknown field name
                unknown_fields.add(field.name)

    def get_runs(self, field):
        """Return the runs of ``field`` as one list per paragraph.

        Runs are grouped per paragraph because only spans inside a single
        paragraph can be wrapped in an <a>. Empty groups are left out.
        """
        groups = [[]]
        for x in field.contents:
            if x.tag.endswith('}p'):
                groups.append([])
            elif x.tag.endswith('}r'):
                groups[-1].append(x)
        return [group for group in groups if group]

    def parse_hyperlink(self, field, parse_func, log):
        """Record a HYPERLINK field; a target flag without a value means ``_blank``."""
        hl = parse_func(field.instructions, log)
        if not hl:
            return
        if 'target' in hl and hl['target'] is None:
            hl['target'] = '_blank'
        self.hyperlink_fields.extend((hl, runs) for runs in self.get_runs(field))

    def parse_ref(self, field, parse_func, log):
        """Record a REF/NOTEREF field as a link to its bookmark.

        Only references that carry the hyperlink flag are supported, others
        are reported through ``log`` and ignored.
        """
        ref = parse_func(field.instructions, log)
        dest = ref.get(None, None)
        if dest is None or 'hyperlink' not in ref:
            log.warn('Unsupported reference field (%s), ignoring: %r' % (field.name, ref))
            return
        self.hyperlink_fields.extend(({'anchor':dest}, runs) for runs in self.get_runs(field))

    parse_noteref = parse_ref

    def parse_xe(self, field, parse_func, log):
        """Record an XE (index entry) field and bracket it with a bookmark."""
        if None in (field.start, field.end):
            return
        xe = parse_func(field.instructions, log)
        if not xe:
            return
        expand = self.namespace.expand

        def w(x):
            return expand('w:' + x)

        # A synthetic bookmark is inserted around the index item so that it
        # can be linked to later
        self.index_bookmark_counter += 1
        bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)
        parent = field.start.getparent()
        bm = parent.makeelement(w('bookmarkStart'))
        bm.set(w('id'), bmark)
        bm.set(w('name'), bmark)
        parent.insert(parent.index(field.start), bm)
        parent = field.end.getparent()
        bm = parent.makeelement(w('bookmarkEnd'))
        bm.set(w('id'), bmark)
        parent.insert(parent.index(field.end) + 1, bm)
        xe['start_elem'] = field.start
        self.xe_fields.append(xe)

    def parse_index(self, field, parse_func, log):
        """Record an INDEX field together with the links and blocks it produces."""
        if not field.contents:
            return
        idx = parse_func(field.instructions, log)
        hyperlinks, blocks = process_index(field, idx, self.xe_fields, log, self.namespace.XPath, self.namespace.expand)
        if not blocks:
            return
        self.hyperlink_fields.extend(({'anchor':anchor}, [run]) for anchor, run in hyperlinks)
        self.index_fields.append((idx, blocks))

    def polish_markup(self, object_map):
        """Polish the markup of every recorded index.

        ``object_map`` is inverted to look up, for each index block, the
        object that is handed to ``polish_index_markup``.
        """
        if not self.index_fields:
            return
        rmap = dict((v, k) for k, v in iteritems(object_map))
        for idx, blocks in self.index_fields:
            polish_index_markup(idx, [rmap[b] for b in blocks])
def test_parse_fields(return_tests=False):
    """Run the self tests of the field instruction parsers.

    With ``return_tests`` set the test suite is returned instead of run.
    """
    import unittest

    class TestParseFields(unittest.TestCase):

        def check(self, parse, cases):
            # Each case pairs raw instruction text with the expected result
            for raw, expected in cases:
                self.assertEqual(parse(raw, None), expected)

        def test_hyperlink(self):
            self.check(parse_hyperlink, (
                (r'\l anchor1', {'anchor':'anchor1'}),
                (r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'}),
                (r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}),
                (r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'}),
                (r'xxxx \y yyyy', {'url': 'xxxx'}),
            ))

        def test_xe(self):
            self.check(parse_xe, (
                (r'"some name"', {'text':'some name'}),
                (r'name \b \i', {'text':'name', 'bold':None, 'italic':None}),
                (r'xxx \y a', {'text':'xxx', 'yomi':'a'}),
            ))

        def test_index(self):
            self.check(parse_index, (
                (r'', {}),
                (r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'}),
            ))

    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
    if return_tests:
        return suite
    unittest.TextTestRunner(verbosity=4).run(suite)


if __name__ == '__main__':
    # Running the module directly runs its self tests
    test_parse_fields()

View File

@@ -0,0 +1,197 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re
from collections import namedtuple
from calibre.ebooks.docx.block_styles import binary_property, inherit
from calibre.utils.filenames import ascii_filename
from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
from calibre.utils.icu import ord_string
from polyglot.builtins import codepoint_to_chr, iteritems, range
# An embedded font file: the package part holding it, its obfuscation key
# and whether the font has been subsetted
Embed = namedtuple('Embed', ['name', 'key', 'subsetted'])
def has_system_fonts(name):
    """Return True if the font scanner knows at least one font of family ``name``."""
    try:
        matches = font_scanner.fonts_for_family(name)
    except NoFonts:
        return False
    return bool(matches)
def get_variant(bold=False, italic=False):
    """Return the variant name (Regular, Italic, Bold, BoldItalic) for the given flags."""
    by_bold = {
        False: {False: 'Regular', True: 'Italic'},
        True: {False: 'Bold', True: 'BoldItalic'},
    }
    return by_bold[bold][italic]
def find_fonts_matching(fonts, style='normal', stretch='normal'):
    """Yield, in order, the fonts whose font-style and font-stretch both match."""
    for candidate in fonts:
        if candidate['font-style'] != style:
            continue
        if candidate['font-stretch'] != stretch:
            continue
        yield candidate
def weight_key(font):
    """Sort key: distance of the font's weight from the normal weight of 400.

    The keywords normal and bold count as 400 and 700. Any other weight
    that is not a number sorts after everything else.
    """
    w = font['font-weight']
    try:
        numeric = int(w)
    except Exception:
        numeric = {'normal': 400, 'bold': 700}.get(w, 1000000)
    return abs(numeric - 400)
def get_best_font(fonts, style, stretch):
    """Return the matching font whose weight is closest to normal.

    Returns None when nothing matches or the font data cannot be examined.
    """
    try:
        return min(find_fonts_matching(fonts, style, stretch), key=weight_key)
    except Exception:
        return None
class Family(object):
    """A font family as declared by a ``w:font`` element of the font table."""

    def __init__(self, elem, embed_relationships, XPath, get):
        """Read the declaration ``elem``.

        :param embed_relationships: Maps relationship ids to the package
            parts holding embedded font data.
        :param XPath: Factory turning an expression into a callable.
        :param get: Function reading a namespaced attribute of an element.
        """
        self.name = self.family_name = get(elem, 'w:name')
        self.alt_names = tuple(get(alt, 'w:val') for alt in XPath('./w:altName')(elem))
        # Use the first alternate name that is installed, but only when the
        # primary name is not installed itself
        if self.alt_names and not has_system_fonts(self.name):
            for alt_name in self.alt_names:
                if has_system_fonts(alt_name):
                    self.family_name = alt_name
                    break

        self.embedded = {}
        for variant in ('Regular', 'Bold', 'Italic', 'BoldItalic'):
            for node in XPath('./w:embed%s[@r:id]' % variant)(elem):
                rid = get(node, 'r:id')
                if rid not in embed_relationships:
                    continue
                key = get(node, 'w:fontKey')
                subsetted = get(node, 'w:subsetted') in {'1', 'true', 'on'}
                self.embedded[variant] = Embed(embed_relationships[rid], key, subsetted)

        self.generic_family = 'auto'
        for node in XPath('./w:family[@w:val]')(elem):
            self.generic_family = get(node, 'w:val', 'auto')

        # The font counts as TrueType unless it is explicitly marked otherwise
        ntt = binary_property(elem, 'notTrueType', XPath, get)
        self.is_ttf = ntt is inherit or not ntt

        self.panose1 = None
        self.panose_name = None
        for node in XPath('./w:panose1[@w:val]')(elem):
            try:
                digits = get(node, 'w:val')
                # The value is a string of two-digit hexadecimal numbers
                v = tuple(int(digits[i:i+2], 16) for i in range(0, len(digits), 2))
            except (TypeError, ValueError, IndexError):
                continue
            self.panose1 = v
            self.panose_name = panose_to_css_generic_family(v)

        declared = {'roman':'serif', 'swiss':'sans-serif', 'modern':'monospace',
                    'decorative':'fantasy', 'script':'cursive'}.get(self.generic_family, None)
        # Preference: declared family class, then the PANOSE guess, then serif
        self.css_generic_family = declared or self.panose_name or 'serif'
SYMBOL_MAPS = { # {{{
'Wingdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖉', '', '', '👓', '🕭', '🕮', '🕯', '🕿', '', '🖂', '🖃', '📪', '📫', '📬', '📭', '🗀', '🗁', '🗎', '🗏', '🗐', '🗄', '', '🖮', '🖰', '🖲', '🖳', '🖴', '🖫', '🖬', '', '', '🖎', '', '🖏', '👍', '👎', '', '', '', '🖗', '🖐', '', '😐', '', '💣', '🕱', '🏳', '🏱', '', '', '🌢', '', '🕆', '', '🕈', '', '', '', '', '🕉', '', '', '', '', '', '', '', '', '', '', '', '', '', '🙰', '🙵', '', '🔾', '', '🞏', '🞐', '', '', '🞟', '', '', '', '🞙', '', '', '', '🏵', '🏶', '🙶', '🙷', ' ', '🄋', '', '', '', '', '', '', '', '', '', '', '🄌', '', '', '', '', '', '', '', '', '', '', '🙢', '🙠', '🙡', '🙣', '🙦', '🙤', '🙥', '🙧', '', '', '', '', '🞆', '🞈', '🞊', '🞋', '🔿', '', '🞎', '🟀', '🟁', '', '🟋', '🟏', '🟓', '🟑', '', '', '', '', '', '', '', '🕐', '🕑', '🕒', '🕓', '🕔', '🕕', '🕖', '🕗', '🕘', '🕙', '🕚', '🕛', '', '', '', '', '', '', '', '', '🙪', '🙫', '🙕', '🙔', '🙗', '🙖', '🙐', '🙑', '🙒', '🙓', '', '', '', '', '', '', '', '', '', '', '🡨', '🡪', '🡩', '🡫', '🡬', '🡭', '🡯', '🡮', '🡸', '🡺', '🡹', '🡻', '🡼', '🡽', '🡿', '🡾', '', '', '', '', '', '', '', '', '', '', '🢬', '🢭', '🗶', '', '🗷', '🗹', ' '), # noqa
'Wingdings 2': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖊', '🖋', '🖌', '🖍', '', '', '🕾', '🕽', '🗅', '🗆', '🗇', '🗈', '🗉', '🗊', '🗋', '🗌', '🗍', '📋', '🗑', '🗔', '🖵', '🖶', '🖷', '🖸', '🖭', '🖯', '🖱', '🖒', '🖓', '🖘', '🖙', '🖚', '🖛', '👈', '👉', '🖜', '🖝', '🖞', '🖟', '🖠', '🖡', '👆', '👇', '🖢', '🖣', '🖑', '🗴', '🗸', '🗵', '', '', '', '', '⮿', '🛇', '', '🙱', '🙴', '🙲', '🙳', '', '🙹', '🙺', '🙻', '🙦', '🙤', '🙥', '🙧', '🙚', '🙘', '🙙', '🙛', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', '', '🌕', '', '', '⸿', '', '🕇', '🕜', '🕝', '🕞', '🕟', '🕠', '🕡', '🕢', '🕣', '🕤', '🕥', '🕦', '🕧', '🙨', '🙩', '', '🞄', '', '', '', '🞅', '🞇', '🞉', '', '⦿', '🞌', '🞍', '', '', '', '🞑', '🞒', '🞓', '🞔', '', '🞕', '🞖', '🞗', '🞘', '', '', '', '🞚', '', '🞛', '🞜', '🞝', '🞞', '', '', '', '🞠', '', '', '', '', '', '', '', '', '', '', '', '', '🞡', '🞢', '🞣', '🞤', '🞥', '🞦', '🞧', '🞨', '🞩', '🞪', '🞫', '🞬', '🞭', '🞮', '🞯', '🞰', '🞱', '🞲', '🞳', '🞴', '🞵', '🞶', '🞷', '🞸', '🞹', '🞺', '🞻', '🞼', '🞽', '🞾', '🞿', '🟀', '🟂', '🟄', '🟆', '🟉', '🟊', '', '🟌', '🟎', '🟐', '🟒', '', '🟃', '🟇', '', '🟍', '🟔', '', '', '', '', ' ', ' ', ' ', ' ', ' ', ' ',), # noqa
'Wingdings 3': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '⭿', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '🢠', '🢡', '🢢', '🢣', '🢤', '🢥', '🢦', '🢧', '🢨', '🢩', '🢪', '🢫', '🡐', '🡒', '🡑', '🡓', '🡔', '🡕', '🡗', '🡖', '🡘', '🡙', '', '', '', '', '', '', '', '', '', '', '', '', '🞀', '🞂', '🞁', ' ', '🞃', '', '', '', '', '', '', '', '', '🠐', '🠒', '🠑', '🠓', '🠔', '🠖', '🠕', '🠗', '🠘', '🠚', '🠙', '🠛', '🠜', '🠞', '🠝', '🠟', '🠀', '🠂', '🠁', '🠃', '🠄', '🠆', '🠅', '🠇', '🠈', '🠊', '🠉', '🠋', '🠠', '🠢', '🠤', '🠦', '🠨', '🠪', '🠬', '🢜', '🢝', '🢞', '🢟', '🠮', '🠰', '🠲', '🠴', '🠶', '🠸', '🠺', '🠹', '🠻', '🢘', '🢚', '🢙', '🢛', '🠼', '🠾', '🠽', '🠿', '🡀', '🡂', '🡁', '🡃', '🡄', '🡆', '🡅', '🡇', '', '', '', '', '', '', '', '', '🡠', '🡢', '🡡', '🡣', '🡤', '🡥', '🡧', '🡦', '🡰', '🡲', '🡱', '🡳', '🡴', '🡵', '🡷', '🡶', '🢀', '🢂', '🢁', '🢃', '🢄', '🢅', '🢇', '🢆', '🢐', '🢒', '🢑', '🢓', '🢔', '🢕', '🢗', '🢖', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',), # noqa
'Webdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🕷', '🕸', '🕲', '🕶', '🏆', '🎖', '🖇', '🗨', '🗩', '🗰', '🗱', '🌶', '🎗', '🙾', '🙼', '🗕', '🗖', '🗗', '', '', '', '', '', '', '', '', '', '', '', '🗚', '🗳', '🛠', '🏗', '🏘', '🏙', '🏚', '🏜', '🏭', '🏛', '🏠', '🏖', '🏝', '🛣', '🔍', '🏔', '👁', '👂', '🏞', '🏕', '🛤', '🏟', '🛳', '🕬', '🕫', '🕨', '🔈', '🎔', '🎕', '🗬', '🙽', '🗭', '🗪', '🗫', '', '', '🚲', '', '🛡', '📦', '🛱', '', '🚑', '🛈', '🛩', '🛰', '🟈', '🕴', '', '🛥', '🚔', '🗘', '🗙', '', '🛲', '🚇', '🚍', '', '', '', '🚭', '🗮', '', '🗯', '🗲', ' ', '🚹', '🚺', '🛉', '🛊', '🚼', '👽', '🏋', '', '🏂', '🏌', '🏊', '🏄', '🏍', '🏎', '🚘', '🗠', '🛢', '📠', '🏷', '📣', '👪', '🗡', '🗢', '🗣', '', '🖄', '🖅', '🖃', '🖆', '🖹', '🖺', '🖻', '🕵', '🕰', '🖽', '🖾', '📋', '🗒', '🗓', '🕮', '📚', '🗞', '🗟', '🗃', '🗂', '🖼', '🎭', '🎜', '🎘', '🎙', '🎧', '💿', '🎞', '📷', '🎟', '🎬', '📽', '📹', '📾', '📻', '🎚', '🎛', '📺', '💻', '🖥', '🖦', '🖧', '🍹', '🎮', '🎮', '🕻', '🕼', '🖁', '🖀', '🖨', '🖩', '🖿', '🖪', '🗜', '🔒', '🔓', '🗝', '📥', '📤', '🕳', '🌣', '🌤', '🌥', '🌦', '', '🌨', '🌧', '🌩', '🌪', '🌬', '🌫', '🌜', '🌡', '🛋', '🛏', '🍽', '🍸', '🛎', '🛍', '', '', '🛆', '🖈', '🎓', '🗤', '🗥', '🗦', '🗧', '🛪', '🐿', '🐦', '🐟', '🐕', '🐈', '🙬', '🙮', '🙭', '🙯', '🗺', '🌍', '🌏', '🌎', '🕊',), # noqa
'Symbol': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '!', '', '#', '', '%', '&', '', '(', ')', '*', '+', ',', '', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '', 'Α', 'Β', 'Χ', 'Δ', 'Ε', 'Φ', 'Γ', 'Η', 'Ι', 'ϑ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Θ', 'Ρ', 'Σ', 'Τ', 'Υ', 'ς', 'Ω', 'Ξ', 'Ψ', 'Ζ', '[', '', ']', '', '_', '', 'α', 'β', 'χ', 'δ', 'ε', 'φ', 'γ', 'η', 'ι', 'ϕ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'θ', 'ρ', 'σ', 'τ', 'υ', 'ϖ', 'ω', 'ξ', 'ψ', 'ζ', '{', '|', '}', '~', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', 'ϒ', '', '', '', '', 'ƒ', '', '', '', '', '', '', '', '', '', '°', '±', '', '', '×', '', '', '', '÷', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '®', '©', '', '', '', '', '¬', '', '', '', '', '', '', '', '', '', '®', '©', '', '', '', '', '', '', '', '', '', '', '', '', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ',), # noqa
} # }}}
SYMBOL_FONT_NAMES = frozenset(n.lower() for n in SYMBOL_MAPS)
def is_symbol_font(family):
    """Return True if ``family`` names a symbol font, ignoring case.

    Values without a ``lower()`` method, such as None, are not symbol fonts.
    """
    try:
        folded = family.lower()
    except AttributeError:
        return False
    return folded in SYMBOL_FONT_NAMES
def do_map(m, points):
    """Yield the replacement text for each code point in ``points``.

    A code point strictly above 0xf000 whose offset from 0xf000 is a valid
    index into ``m`` is replaced by that entry of ``m``; every other code
    point is yielded as the character it denotes.
    """
    first = 0xf000
    end = first + len(m)
    for point in points:
        yield m[point - first] if first < point < end else codepoint_to_chr(point)
def map_symbol_text(text, font):
    """Translate ``text`` written in the symbol font ``font`` to plain text.

    :param text: The text to translate; bytes are decoded as UTF-8.
    :param font: Name of a symbol font. An exact match against SYMBOL_MAPS
        is preferred; otherwise the name is matched ignoring case, as
        ``is_symbol_font()`` does when deciding that a font is a symbol font.
    :raises KeyError: If ``font`` is not a known symbol font.
    """
    try:
        m = SYMBOL_MAPS[font]
    except KeyError:
        try:
            wanted = font.lower()
        except AttributeError:
            raise KeyError(font)
        matches = [table for name, table in SYMBOL_MAPS.items() if name.lower() == wanted]
        if not matches:
            raise KeyError(font)
        m = matches[0]
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return ''.join(do_map(m, ord_string(text)))
class Fonts(object):
    """The fonts declared by a document and the embedded fonts it uses."""

    def __init__(self, namespace):
        self.namespace = namespace
        self.fonts = {}
        # (declared name, variant) pairs requested through family_for()
        self.used = set()

    def __call__(self, root, embed_relationships, docx, dest_dir):
        """Read every named ``w:font`` declaration below ``root``."""
        for elem in self.namespace.XPath('//w:font[@w:name]')(root):
            self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)

    def family_for(self, name, bold=False, italic=False):
        """Return the CSS font-family value for the declared font ``name``.

        Undeclared fonts map to ``serif``. The requested variant is
        remembered so that embed_fonts() can write it out later. Symbol
        fonts are returned as a bare name without a generic fallback.
        """
        f = self.fonts.get(name, None)
        if f is None:
            return 'serif'
        variant = get_variant(bold, italic)
        self.used.add((name, variant))
        # An embedded variant is addressed by the declared name, anything
        # else by the name of the font that was found on the system
        name = f.name if variant in f.embedded else f.family_name
        if is_symbol_font(name):
            return name
        return '"%s", %s' % (name.replace('"', ''), f.css_generic_family)

    def embed_fonts(self, dest_dir, docx):
        """Write the used embedded fonts to ``dest_dir``/fonts.

        :return: The @font-face rules for the fonts that were written, an
            empty string if there are none.
        """
        defs = []
        dest_dir = os.path.join(dest_dir, 'fonts')
        for name, variant in self.used:
            f = self.fonts[name]
            if variant not in f.embedded:
                continue
            if not os.path.exists(dest_dir):
                os.mkdir(dest_dir)
            fname = self.write(name, dest_dir, docx, variant)
            if fname is None:
                continue
            # Built as a list so that the order of the declarations does not
            # depend on dictionary ordering
            d = ['font-family: "%s"' % name.replace('"', ''), 'src: url("fonts/%s")' % fname]
            if 'Bold' in variant:
                d.append('font-weight: bold')
            if 'Italic' in variant:
                d.append('font-style: italic')
            defs.append('@font-face {\n\t%s\n}\n' % ';\n\t'.join(d))
        return '\n'.join(defs)

    def write(self, name, dest_dir, docx, variant):
        """Write one embedded font file, removing its obfuscation.

        :return: The name of the written file, or None if the data is not a
            TrueType/OpenType font.
        """
        f = self.fonts[name]
        ef = f.embedded[variant]
        raw = docx.read(ef.name)
        prefix = raw[:32]
        if ef.key:
            key = re.sub(r'[^A-Fa-f0-9]', '', ef.key)
            key = bytearray(reversed(tuple(int(key[i:i+2], 16) for i in range(0, len(key), 2))))
            # A key without any hex digits cannot be applied (it used to
            # cause a ZeroDivisionError); the data is then taken as is
            if key:
                # Only the first 32 bytes are XORed with the reversed key
                prefix = bytearray(prefix)
                prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
        if not is_truetype_font(prefix):
            return None
        ext = 'otf' if prefix.startswith(b'OTTO') else 'ttf'
        fname = ascii_filename('%s - %s.%s' % (name, variant, ext))
        with open(os.path.join(dest_dir, fname), 'wb') as dest:
            dest.write(prefix)
            dest.write(raw[32:])

        return fname

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from polyglot.builtins import iteritems, unicode_type
class Note(object):
    """A single footnote or endnote.

    ``type`` is the ``w:type`` of the note element, ``normal`` when the
    attribute is absent. Iterating over the note yields its ``w:p`` and
    ``w:tbl`` descendants.
    """

    def __init__(self, namespace, parent, rels):
        self.namespace = namespace
        self.parent = parent
        self.rels = rels
        self.type = namespace.get(parent, 'w:type', 'normal')

    def __iter__(self):
        for block in self.namespace.descendants(self.parent, 'w:p', 'w:tbl'):
            yield block
class Footnotes(object):
    """The footnotes and endnotes of a document.

    Notes are registered by id when the instance is called. Referenced
    notes are numbered in the order get_ref() is asked for them, and
    iterating yields ``(anchor, number, note)`` in that order.
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.footnotes, self.endnotes = {}, {}
        self.counter = 0
        self.notes = OrderedDict()

    def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
        """Register the notes below the ``footnotes`` and ``endnotes`` roots.

        Either root may be None. Notes with an empty id are skipped.
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        sources = (
            (footnotes, './w:footnote[@w:id]', footnotes_rels, self.footnotes),
            (endnotes, './w:endnote[@w:id]', endnotes_rels, self.endnotes),
        )
        for root, expr, rels, registry in sources:
            if root is None:
                continue
            for node in XPath(expr)(root):
                fid = get(node, 'w:id')
                if fid:
                    registry[fid] = Note(self.namespace, node, rels)

    def get_ref(self, ref):
        """Return ``(anchor, number)`` for the reference element ``ref``.

        ``(None, None)`` is returned when the referenced note is unknown or
        is not of type ``normal``.
        """
        fid = self.namespace.get(ref, 'w:id')
        is_footnote = ref.tag.endswith('}footnoteReference')
        registry = self.footnotes if is_footnote else self.endnotes
        note = registry.get(fid, None)
        if note is None or note.type != 'normal':
            return None, None
        self.counter += 1
        number = unicode_type(self.counter)
        anchor = 'note_%d' % self.counter
        self.notes[anchor] = (number, note)
        return anchor, number

    def __iter__(self):
        for anchor, (counter, note) in iteritems(self.notes):
            yield anchor, counter, note

    @property
    def has_notes(self):
        """True once at least one note has been referenced."""
        return bool(self.notes)

View File

@@ -0,0 +1,343 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from lxml.html.builder import IMG, HR
from calibre.constants import iswindows
from calibre.ebooks.docx.names import barename
from calibre.utils.filenames import ascii_filename
from calibre.utils.img import resize_to_fit, image_to_data
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, itervalues
class LinkedImageNotFound(ValueError):
    """Raised when the data of an image cannot be found; ``fname`` names the image."""

    def __init__(self, fname):
        super(LinkedImageNotFound, self).__init__(fname)
        self.fname = fname
def image_filename(x):
    """Return an ASCII file name for ``x`` with spaces and ``#`` replaced by ``_``."""
    name = ascii_filename(x)
    for unwanted in (' ', '#'):
        name = name.replace(unwanted, '_')
    return name
def emu_to_pt(x):
    """Convert a length in EMU to points (12700 EMU per point)."""
    emu_per_pt = 12700
    return x / emu_per_pt
def pt_to_emu(x):
    """Convert a length in points to EMU, truncated to an integer."""
    emu_per_pt = 12700
    return int(x * emu_per_pt)
def get_image_properties(parent, XPath, get):
width = height = None
for extent in XPath('./wp:extent')(parent):
try:
width = emu_to_pt(int(extent.get('cx')))
except (TypeError, ValueError):
pass
try:
height = emu_to_pt(int(extent.get('cy')))
except (TypeError, ValueError):
pass
ans = {}
if width is not None:
ans['width'] = '%.3gpt' % width
if height is not None:
ans['height'] = '%.3gpt' % height
alt = None
title = None
for docPr in XPath('./wp:docPr')(parent):
alt = docPr.get('descr') or alt
title = docPr.get('title') or title
if docPr.get('hidden', None) in {'true', 'on', '1'}:
ans['display'] = 'none'
return ans, alt, title
def get_image_margins(elem):
ans = {}
for w, css in iteritems({'L':'left', 'T':'top', 'R':'right', 'B':'bottom'}):
val = elem.get('dist%s' % w, None)
if val is not None:
try:
val = emu_to_pt(val)
except (TypeError, ValueError):
continue
ans['padding-%s' % css] = '%.3gpt' % val
return ans
def get_hpos(anchor, page_width, XPath, get, width_frac):
for ph in XPath('./wp:positionH')(anchor):
rp = ph.get('relativeFrom', None)
if rp == 'leftMargin':
return 0 + width_frac
if rp == 'rightMargin':
return 1 + width_frac
al = None
almap = {'left':0, 'center':0.5, 'right':1}
for align in XPath('./wp:align')(ph):
al = almap.get(align.text)
if al is not None:
if rp == 'page':
return al
return al + width_frac
for po in XPath('./wp:posOffset')(ph):
try:
pos = emu_to_pt(int(po.text))
except (TypeError, ValueError):
continue
return pos/page_width + width_frac
for sp in XPath('./wp:simplePos')(anchor):
try:
x = emu_to_pt(sp.get('x', None))
except (TypeError, ValueError):
continue
return x/page_width + width_frac
return 0
class Images(object):
def __init__(self, namespace, log):
self.namespace = namespace
self.rid_map = {}
self.used = {}
self.resized = {}
self.names = set()
self.all_images = set()
self.links = []
self.log = log
def __call__(self, relationships_by_id):
self.rid_map = relationships_by_id
def read_image_data(self, fname, base=None):
if fname.startswith('file://'):
src = fname[len('file://'):]
if iswindows and src and src[0] == '/':
src = src[1:]
if not src or not os.path.exists(src):
raise LinkedImageNotFound(src)
with open(src, 'rb') as rawsrc:
raw = rawsrc.read()
else:
try:
raw = self.docx.read(fname)
except KeyError:
raise LinkedImageNotFound(fname)
base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
if ext == 'emf':
# For an example, see: https://bugs.launchpad.net/bugs/1224849
self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
from calibre.utils.wmf.emf import emf_unwrap
try:
raw = emf_unwrap(raw)
except Exception:
self.log.exception('Failed to extract embedded raster image from EMF')
else:
ext = 'png'
base = base.rpartition('.')[0]
if not base:
base = 'image'
base += '.' + ext
return raw, base
def unique_name(self, base):
exists = frozenset(itervalues(self.used))
c = 1
name = base
while name in exists:
n, e = base.rpartition('.')[0::2]
name = '%s-%d.%s' % (n, c, e)
c += 1
return name
def resize_image(self, raw, base, max_width, max_height):
resized, img = resize_to_fit(raw, max_width, max_height)
if resized:
base, ext = os.path.splitext(base)
base = base + '-%dx%d%s' % (max_width, max_height, ext)
raw = image_to_data(img, fmt=ext[1:])
return raw, base, resized
def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
rid_map = self.rid_map if rid_map is None else rid_map
fname = rid_map[rid]
key = (fname, max_width, max_height)
ans = self.used.get(key)
if ans is not None:
return ans
raw, base = self.read_image_data(fname, base=base)
resized = False
if max_width is not None and max_height is not None:
raw, base, resized = self.resize_image(raw, base, max_width, max_height)
name = self.unique_name(base)
self.used[key] = name
if max_width is not None and max_height is not None and not resized:
okey = (fname, None, None)
if okey in self.used:
return self.used[okey]
self.used[okey] = name
with open(os.path.join(self.dest_dir, name), 'wb') as f:
f.write(raw)
self.all_images.add('images/' + name)
return name
def pic_to_img(self, pic, alt, parent, title):
XPath, get = self.namespace.XPath, self.namespace.get
name = None
link = None
for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
link = {'id':get(hl, 'r:id')}
tgt = hl.get('tgtFrame', None)
if tgt:
link['target'] = tgt
title = hl.get('tooltip', None)
if title:
link['title'] = title
for pr in XPath('descendant::pic:cNvPr')(pic):
name = pr.get('name', None)
if name:
name = image_filename(name)
alt = pr.get('descr') or alt
for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
rid = get(a, 'r:embed')
if not rid:
rid = get(a, 'r:link')
if rid and rid in self.rid_map:
try:
src = self.generate_filename(rid, name)
except LinkedImageNotFound as err:
self.log.warn('Linked image: %s not found, ignoring' % err.fname)
continue
img = IMG(src='images/%s' % src)
img.set('alt', alt or 'Image')
if title:
img.set('title', title)
if link is not None:
self.links.append((img, link, self.rid_map))
return img
def drawing_to_html(self, drawing, page):
XPath, get = self.namespace.XPath, self.namespace.get
# First process the inline pictures
for inline in XPath('./wp:inline')(drawing):
style, alt, title = get_image_properties(inline, XPath, get)
for pic in XPath('descendant::pic:pic')(inline):
ans = self.pic_to_img(pic, alt, inline, title)
if ans is not None:
if style:
ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
yield ans
# Now process the floats
for anchor in XPath('./wp:anchor')(drawing):
style, alt, title = get_image_properties(anchor, XPath, get)
self.get_float_properties(anchor, style, page)
for pic in XPath('descendant::pic:pic')(anchor):
ans = self.pic_to_img(pic, alt, anchor, title)
if ans is not None:
if style:
ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
yield ans
def pict_to_html(self, pict, page):
    '''
    Yield HTML elements for a pict element: an ``hr`` when its single child
    is flagged with ``o:hr``, followed by an ``img`` for every
    ``v:imagedata`` whose relationship id is in ``self.rid_map``.
    '''
    XPath, get = self.namespace.XPath, self.namespace.get
    # First see if we have an <hr>
    if len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}:
        rule = pict[0]
        css = {}
        hr = HR()
        try:
            pct = float(get(rule, 'o:hrpct'))
        except (ValueError, TypeError, AttributeError):
            pct = None
        if pct is not None and pct > 0:
            css['width'] = '%.3g%%' % pct
        align = get(rule, 'o:hralign', 'center')
        if align == 'left':
            css['margin-left'] = '0'
            css['margin-right'] = 'auto'
        elif align == 'right':
            css['margin-left'] = 'auto'
            css['margin-right'] = '0'
        if css:
            hr.set('style', '; '.join(['%s:%s' % (k, v) for k, v in iteritems(css)]))
        yield hr

    for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
        rid = get(imagedata, 'r:id')
        if rid not in self.rid_map:
            continue
        try:
            src = self.generate_filename(rid)
        except LinkedImageNotFound as err:
            self.log.warn('Linked image: %s not found, ignoring' % err.fname)
            continue
        img = IMG(src='images/%s' % src, style="display:block")
        img.set('alt', get(imagedata, 'o:title') or 'Image')
        yield img
def get_float_properties(self, anchor, style, page):
    '''
    Add float/margin/padding CSS for a floating image to ``style`` (modified
    in place), based on the wrap element found in ``anchor`` and on the
    horizontal position returned by ``get_hpos``.
    '''
    XPath, get = self.namespace.XPath, self.namespace.get
    style.setdefault('display', 'block')
    padding = get_image_margins(anchor)
    # The width value is stripped of its two character unit suffix
    width = float(style.get('width', '100pt')[:-2])

    page_width = page.width - page.margin_left - page.margin_right
    if page_width <= 0:
        # Ignore margins
        page_width = page.width

    hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))

    # The last wrap* child of the anchor decides how text flows
    wrap_elem = None
    dofloat = False
    for child in reversed(anchor):
        kind = barename(child.tag)
        if kind not in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
            continue
        wrap_elem = child
        dofloat = kind not in {'wrapNone', 'wrapTopAndBottom'}
        break

    if wrap_elem is not None:
        padding.update(get_image_margins(wrap_elem))
        wt = wrap_elem.get('wrapText', None)
        if wt == 'right':
            hpos = 0
        elif wt == 'left':
            hpos = 1
        if dofloat:
            style['float'] = 'left' if hpos < 0.65 else 'right'
        else:
            if hpos < 0.34:
                ml, mr = None, None
            elif hpos > 0.65:
                ml, mr = 'auto', None
            else:
                ml, mr = 'auto', 'auto'
            if ml is not None:
                style['margin-left'] = ml
            if mr is not None:
                style['margin-right'] = mr

    style.update(padding)
def to_html(self, elem, page, docx, dest_dir):
    '''
    Yield the HTML elements for a drawing or pict element.

    Makes sure the ``images`` directory exists inside ``dest_dir`` and
    records it, together with ``docx``, on ``self`` before converting.

    :raises OSError: if the images directory cannot be created.
    '''
    dest = os.path.join(dest_dir, 'images')
    try:
        os.mkdir(dest)
    except OSError:
        # The directory already being there (possibly created between a
        # check and the mkdir call) is fine, anything else is a real error
        if not os.path.isdir(dest):
            raise
    self.dest_dir, self.docx = dest, docx
    if elem.tag.endswith('}drawing'):
        converter = self.drawing_to_html
    else:
        converter = self.pict_to_html
    for tag in converter(elem, page):
        yield tag

View File

@@ -0,0 +1,273 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from operator import itemgetter
from lxml import etree
from calibre.utils.icu import partition_by_first_letter, sort_key
from polyglot.builtins import iteritems, filter
def get_applicable_xe_fields(index, xe_fields, XPath, expand):
    '''
    Return the XE (index entry) fields that belong to ``index``.

    Fields are filtered by the ``entry-type``, ``letter-range`` and
    ``bookmark`` values of ``index``, in that order.

    :param index: Mapping with the parsed index field switches.
    :param xe_fields: List of mappings describing the XE fields.
    :param XPath: Factory returning compiled XPath expressions.
    :param expand: Callable expanding a prefixed name, e.g. ``w:name``.
    :return: The list of applicable fields.
    '''
    iet = index.get('entry-type', None)
    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]

    lr = index.get('letter-range', None)
    if lr is not None:
        sl, el = lr.partition('-')[0::2]
        sl, el = sl.strip(), el.strip()
        if sl and el:
            def inrange(text):
                # Entries without any text cannot be inside a letter range
                return bool(text) and sl <= text[0] <= el
            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]

    bmark = index.get('bookmark', None)
    if bmark is None or not xe_fields:
        # Nothing (left) to restrict by bookmark
        return xe_fields
    attr = expand('w:name')
    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
    ancestors = XPath('ancestor::w:bookmarkStart')

    def contained(xe):
        # Check if the xe field is contained inside a bookmark with the
        # specified name
        # NOTE(review): w:bookmarkStart is normally an empty marker element,
        # so this ancestor test may never match -- confirm with a real file
        return bool(set(ancestors(xe['start_elem'])) & bookmarks)

    return [xe for xe in xe_fields if contained(xe)]
def make_block(expand, style, parent, pos):
    '''
    Insert a new paragraph containing a single run with a single,
    whitespace preserving, text element into ``parent`` at index ``pos``.

    :param style: Paragraph style id to reference, or None for no style.
    :return: ``(paragraph, text_element)``
    '''
    para = parent.makeelement(expand('w:p'))
    parent.insert(pos, para)
    if style is not None:
        props = para.makeelement(expand('w:pPr'))
        para.append(props)
        style_ref = props.makeelement(expand('w:pStyle'))
        props.append(style_ref)
        style_ref.set(expand('w:val'), style)
    run = para.makeelement(expand('w:r'))
    para.append(run)
    text_elem = run.makeelement(expand('w:t'))
    text_elem.set(expand('xml:space'), 'preserve')
    run.append(text_elem)
    return para, text_elem
def add_xe(xe, t, expand):
    '''
    Fill the text element ``t`` with the text of the index entry ``xe``.

    If the entry has a ``page-number-text``, it is appended to the paragraph
    as an extra run. A line break is inserted after ``t`` in its run.

    :return: ``(xe['anchor'], run)`` where run is the parent of ``t``.
    '''
    run = t.getparent()
    position = run.index(t)
    t.text = xe.get('text') or ' '
    page_text = xe.get('page-number-text', None)
    if page_text:
        para = t.getparent().getparent()
        extra_run = para.makeelement(expand('w:r'))
        para.append(extra_run)
        extra_text = extra_run.makeelement(expand('w:t'))
        extra_text.set(expand('xml:space'), 'preserve')
        extra_text.text = ' [%s]' % page_text
        extra_run.append(extra_text)
    # put separate entries on separate lines
    line_break = run.makeelement(expand('w:br'))
    run.insert(position + 1, line_break)
    return xe['anchor'], run
def process_index(field, index, xe_fields, log, XPath, expand):
    '''
    We remove all the word generated index markup and replace it with our own
    that is more suitable for an ebook.

    :param field: The index field, its ``contents`` are the elements
        generated by word for the index.
    :param index: Mapping with the parsed index field switches.
    :param xe_fields: All XE fields, filtered here via
        :func:`get_applicable_xe_fields`.
    :return: ``(hyperlinks, blocks)``: the ``(anchor, run)`` pairs returned
        by :func:`add_xe` and the paragraphs created for the entries. Both
        are empty when there is nothing to generate.
    '''
    styles = []
    heading_text = index.get('heading', None)
    heading_style = 'IndexHeading'
    start_pos = None
    for elem in field.contents:
        if elem.tag.endswith('}p'):
            # NOTE(review): pStyle carries no w: prefix in this expression,
            # so it may never match a w:pStyle element -- confirm
            s = XPath('descendant::pStyle/@w:val')(elem)
            if s:
                styles.append(s[0])
            p = elem.getparent()
            if start_pos is None:
                # Our own markup goes where the first removed paragraph was
                start_pos = (p, p.index(elem))
            p.remove(elem)

    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
    if not xe_fields or start_pos is None:
        # Without entries, or without a paragraph marking the insertion
        # point, there is nothing we can generate
        return [], []
    if heading_text is not None:
        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
        items = []
        for key, fields in iteritems(groups):
            items.append(key)
            items.extend(fields)
        if styles:
            heading_style = styles[0]
    else:
        items = sorted(xe_fields, key=lambda x:sort_key(x['text']))

    hyperlinks = []
    blocks = []
    # Every block is inserted at the same position, so go through the items
    # in reverse to end up with them in order
    for item in reversed(items):
        # Group keys are the only items that are not dicts
        is_heading = not isinstance(item, dict)
        style = heading_style if is_heading else None
        p, t = make_block(expand, style, *start_pos)
        if is_heading:
            text = heading_text
            if text.lower().startswith('a'):
                # Replace the leading letter of the heading with the group key
                text = item + text[1:]
            t.text = text
        else:
            hyperlinks.append(add_xe(item, t, expand))
            blocks.append(p)

    return hyperlinks, blocks
def split_up_block(block, a, text, parts, ldict):
    '''
    Turn an index entry of the form ``main:sub:subsub`` into one span per
    part, each indented by 1.5em per level.

    All parts but the last become plain text spans appended to the parent of
    the link ``a``. The last part becomes the text of ``a``, which is moved
    into a final span. The level of every created span is recorded in
    ``ldict`` (0 for the first part).
    '''
    prefixes = parts[:-1]
    a.text = parts[-1]
    parent = a.getparent()
    style = 'display:block; margin-left: %.3gem'
    for level, prefix in enumerate(prefixes):
        span = parent.makeelement('span', style=style % (1.5 * level))
        ldict[span] = level
        parent.append(span)
        span.text = prefix
    # The link sits one level below the last prefix. This used to be taken
    # from len() of the loop variable, i.e. the length of the last prefix
    # string instead of the number of prefixes.
    leaf_level = len(prefixes)
    span = parent.makeelement('span', style=style % (leaf_level * 1.5))
    parent.append(span)
    span.append(a)
    ldict[span] = leaf_level
"""
The merge algorithm is a little tricky.
We start with a list of elementary blocks. Each is an HtmlElement, a p node
with a list of child nodes. The last child may be a link, and the earlier ones are
just text.
The list is in reverse order from what we want in the index.
There is a dictionary ldict which records the level of each child node.
Now we want to do a reduce-like operation, combining all blocks with the same
top level index entry into a single block representing the structure of all
references, subentries, etc. under that top entry.
Here's the algorithm.
Given a block p and the next block n, and the top level entries p1 and n1 in each
block, which we assume have the same text:
Start with (p, p1) and (n, n1).
Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
If there are no more levels in n, and we have a link in nk,
then add the link from nk to the links for pk.
This might be the first link for pk, or we might get a list of references.
Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
the same text, it must follow pk, it must come before we find any other p entries at
the same level as pk, and it must have the same level as nk+1.
If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
If there is no matching entry, then because of the original reversed order we want
to insert nk+1 and all following entries from n into p immediately following pk.
"""
def find_match(prev_block, pind, nextent, ldict):
    '''
    Look, among the entries following ``prev_block[pind]``, for an entry one
    level below it that has the same text as ``nextent``.

    The search stops at the first entry that is not deeper than
    ``prev_block[pind]``. Entries nested more than one level deeper are
    skipped.

    :return: The index of the match in ``prev_block`` or -1.
    '''
    curlevel = ldict.get(prev_block[pind], -1)
    if curlevel < 0:
        return -1
    for idx in range(pind + 1, len(prev_block)):
        candidate = prev_block[idx]
        level = ldict.get(candidate, -1)
        if level <= curlevel:
            # Reached a sibling or an ancestor, no match below pind
            break
        if level <= curlevel + 1 and candidate.text_content() == nextent.text_content():
            return idx
    return -1
def add_link(pent, nent, ldict):
    '''
    Move the first link found in ``nent`` into ``pent``.

    If ``pent`` already contains links, the new one is placed after the last
    of them, separated by a comma. Otherwise it replaces the text of
    ``pent``. Nothing happens when ``nent`` has no link.
    '''
    new_links = nent.xpath('descendant::a[1]')
    if not new_links:
        # If there is no link, leave it as text
        return
    new_link = new_links[0]
    existing = pent.xpath('descendant::a')
    if not existing:
        # substitute the new link for plain text in pent
        pent.text = ""
        pent.append(new_link)
        return
    # Put on same line with a comma
    last = existing[-1]
    last.tail = ', '
    holder = last.getparent()
    holder.insert(holder.index(last) + 1, new_link)
def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
    # Merge next_block into prev_block, given that the entries
    # prev_block[pind] and next_block[nind] match. next_path is the list of
    # path parts of next_block; ldict maps entries to their level (it is
    # only passed on to find_match() and add_link()).

    # First elements match. Any more in next?
    if len(next_path) == (nind + 1):
        # No deeper levels in next_block: only its link is carried over
        nextent = next_block[nind]
        add_link(prev_block[pind], nextent, ldict)
        return

    # Descend one level in next_block and look for the same entry below
    # prev_block[pind]
    nind = nind + 1
    nextent = next_block[nind]
    prevent = find_match(prev_block, pind, nextent, ldict)
    if prevent > 0:
        # Found it, continue merging one level deeper
        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
        return

    # Want to insert elements into previous block
    while nind < len(next_block):
        # insert takes it out of old
        # (so len(next_block) shrinks and nind itself never has to advance;
        # this relies on the element being moved, not copied, by insert())
        pind = pind + 1
        prev_block.insert(pind, next_block[nind])

    # Drop the block whose remaining entries were moved out
    next_block.getparent().remove(next_block)
def polish_index_markup(index, blocks):
    '''
    Clean up the HTML generated for the index entries in ``blocks``.

    Every block gets the ``index-entry`` class, its link is wrapped in
    spans according to the ``main:sub`` structure of the link text and
    consecutive blocks with the same main entry are merged into one.

    Blocks without a link, or whose link text has no usable parts, are left
    alone and never merged. An empty ``blocks`` list is a no-op.
    '''
    if not blocks:
        return
    # Blocks are in reverse order at this point
    path_map = {}
    ldict = {}
    for block in blocks:
        cls = block.get('class', '') or ''
        block.set('class', (cls + ' index-entry').lstrip())
        a = block.xpath('descendant::a[1]')
        text = ''
        if a:
            text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
        if ':' in text:
            stripped = (x.strip() for x in text.split(':'))
            # Can be empty when the text consists only of separators
            path_map[block] = parts = [x for x in stripped if x]
            if len(parts) > 1:
                split_up_block(block, a[0], text, parts, ldict)
        elif a:
            # try using a span all the time
            path_map[block] = [text]
            parent = a[0].getparent()
            span = parent.makeelement('span', style='display:block; margin-left: 0em')
            parent.append(span)
            span.append(a[0])
            ldict[span] = 0
        else:
            # No link in this block, nothing to wrap or merge
            path_map[block] = []

        for br in block.xpath('descendant::br'):
            br.tail = None

    # We want a single block for each main entry
    prev_block = blocks[0]
    for block in blocks[1:]:
        pp, pn = path_map[prev_block], path_map[block]
        # Empty paths mark blocks that cannot take part in a merge
        if pp and pn and pp[0] == pn[0]:
            merge_blocks(prev_block, block, 0, 0, pn, ldict)
        else:
            prev_block = block

View File

@@ -0,0 +1,144 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml.etree import XPath as X
from calibre.utils.filenames import ascii_text
from polyglot.builtins import iteritems
# Names {{{
# Relationship type URIs, keyed by a short symbolic name
TRANSITIONAL_NAMES = {
'DOCUMENT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
'DOCPROPS' : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
'APPPROPS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
'STYLES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
'FONTS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
'IMAGES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
'LINKS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
'ENDNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
'THEMES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
'SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
}
# The same names with the officeDocument/2006 prefix replaced by the
# purl.oclc.org one. Values without that prefix (DOCPROPS) stay unchanged.
STRICT_NAMES = {
k:v.replace('http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument')
for k, v in iteritems(TRANSITIONAL_NAMES)
}
# XML namespace URIs, keyed by the prefix used for them in this code
TRANSITIONAL_NAMESPACES = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
'o': 'urn:schemas-microsoft-com:office:office',
# ve and mc are two prefixes for the same namespace
've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
# Text Content
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'w10': 'urn:schemas-microsoft-com:office:word',
'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
'xml': 'http://www.w3.org/XML/1998/namespace',
# Drawing
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
'mv': 'urn:schemas-microsoft-com:mac:vml',
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
'v': 'urn:schemas-microsoft-com:vml',
'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
# Properties (core and extended)
'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
'dc': 'http://purl.org/dc/elements/1.1/',
'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
# Content Types
'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
# Package Relationships
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
# Dublin Core document properties
'dcmitype': 'http://purl.org/dc/dcmitype/',
'dcterms': 'http://purl.org/dc/terms/'
}
# The same prefixes with the officeDocument, wordprocessingml and drawingml
# namespaces rewritten to their purl.oclc.org equivalents; all other
# namespaces are left as they are.
STRICT_NAMESPACES = {
k:v.replace(
'http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument').replace(
'http://schemas.openxmlformats.org/wordprocessingml/2006', 'http://purl.oclc.org/ooxml/wordprocessingml').replace(
'http://schemas.openxmlformats.org/drawingml/2006', 'http://purl.oclc.org/ooxml/drawingml')
for k, v in iteritems(TRANSITIONAL_NAMESPACES)
}
# }}}
def barename(x):
    ''' Return the part of ``x`` after the last ``}``, i.e. a tag name
    without its ``{namespace}`` prefix. '''
    return x.rsplit('}', 1)[-1]
def XML(x):
    ''' Return ``x`` qualified with the ``xml`` namespace, in
    ``{namespace}name`` form. '''
    xml_ns = TRANSITIONAL_NAMESPACES['xml']
    return '{%s}%s' % (xml_ns, x)
def generate_anchor(name, existing):
    '''
    Return an anchor id derived from ``name`` that is not in ``existing``.

    The id is ``id_`` followed by the ASCII letters, digits and underscores
    of ``name`` (leading underscores removed). If it is already taken, a
    numeric suffix ``_1``, ``_2``, ... is added. ``existing`` is not
    modified.
    '''
    cleaned = re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
    base = 'id_' + cleaned
    candidate = base
    suffix = 0
    while candidate in existing:
        suffix += 1
        candidate = '%s_%d' % (base, suffix)
    return candidate
class DOCXNamespace(object):
    '''
    Namespace aware helpers for querying and building DOCX XML, using either
    the transitional or the strict set of names and namespaces.
    '''

    def __init__(self, transitional=True):
        self.xpath_cache = {}
        if transitional:
            namespaces, names = TRANSITIONAL_NAMESPACES, TRANSITIONAL_NAMES
        else:
            namespaces, names = STRICT_NAMESPACES, STRICT_NAMES
        # Copies, so the module level tables are never modified
        self.namespaces = namespaces.copy()
        self.names = names.copy()

    def XPath(self, expr):
        ''' Return the compiled XPath for ``expr``, compiling it on first use. '''
        compiled = self.xpath_cache.get(expr, None)
        if compiled is not None:
            return compiled
        compiled = self.xpath_cache[expr] = X(expr, namespaces=self.namespaces)
        return compiled

    def is_tag(self, x, q):
        ''' Check if ``x`` (an element or a tag name) has the tag named by
        the prefixed name ``q``. '''
        tag = getattr(x, 'tag', x)
        prefix, _, name = q.partition(':')
        wanted = '{%s}%s' % (self.namespaces.get(prefix, None), name)
        return wanted == tag

    def expand(self, name, sep=':'):
        ''' Convert ``prefix<sep>tag`` into ``{namespace}tag``. Names
        without a prefix or without a tag are returned unqualified. '''
        ns, _, tag = name.partition(sep)
        if ns and tag:
            return '{%s}%s' % (self.namespaces[ns], tag)
        return tag or ns

    def get(self, x, attr, default=None):
        ''' Return the value of the (prefixed) attribute ``attr`` of ``x``. '''
        return x.attrib.get(self.expand(attr), default)

    def ancestor(self, elem, name):
        ''' Return the nearest ancestor of ``elem`` named ``name`` or None. '''
        found = self.XPath('ancestor::%s[1]' % name)(elem)
        return found[0] if found else None

    def children(self, elem, *args):
        ''' Return the children of ``elem`` matching any of ``args``. '''
        expr = '|'.join(['child::%s' % a for a in args])
        return self.XPath(expr)(elem)

    def descendants(self, elem, *args):
        ''' Return the descendants of ``elem`` matching any of ``args``. '''
        expr = '|'.join(['descendant::%s' % a for a in args])
        return self.XPath(expr)(elem)

    def makeelement(self, root, tag, append=True, **attrs):
        ''' Create an element via ``root.makeelement`` and, unless
        ``append`` is False, append it to ``root``. Attribute names use
        ``_`` in place of ``:``, e.g. ``w_val``. '''
        expanded_tag = self.expand(tag)
        expanded_attrs = {self.expand(k, sep='_'):v for k, v in iteritems(attrs)}
        ans = root.makeelement(expanded_tag, **expanded_attrs)
        if append:
            root.append(ans)
        return ans

View File

@@ -0,0 +1,388 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re, string
from collections import Counter, defaultdict
from functools import partial
from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle, inherit
from calibre.ebooks.metadata import roman
from polyglot.builtins import iteritems, unicode_type
# Maps w:numFmt values to the list-style-type used for them. Values that are
# not listed here fall back to 'decimal' (see Level.read_from_xml).
STYLE_MAP = {
'aiueo': 'hiragana',
'aiueoFullWidth': 'hiragana',
'hebrew1': 'hebrew',
'iroha': 'katakana-iroha',
'irohaFullWidth': 'katakana-iroha',
'lowerLetter': 'lower-alpha',
'lowerRoman': 'lower-roman',
'none': 'none',
'upperLetter': 'upper-alpha',
'upperRoman': 'upper-roman',
'chineseCounting': 'cjk-ideographic',
'decimalZero': 'decimal-leading-zero',
}
def alphabet(val, lower=True):
    ''' Return the ASCII letter for the 1-based counter value ``val``
    (1 is a, 26 is z), wrapping around after the end of the alphabet. '''
    letters = string.ascii_lowercase if lower else string.ascii_uppercase
    position = abs(val - 1) % len(letters)
    return letters[position]
# Maps a list-style-type to the callable that formats a counter value for
# it. Types that are not listed are formatted as plain decimal numbers (see
# Level.format_template).
alphabet_map = {
    'lower-alpha':alphabet, 'upper-alpha':partial(alphabet, lower=False),
    'lower-roman':lambda x:roman(x).lower(), 'upper-roman':roman,
    # Pad to two digits: 1 -> 01, 9 -> 09, 10 -> 10. Blindly prefixing a
    # zero turned 10 into 010.
    'decimal-leading-zero': lambda x: '%02d' % x
}
class Level(object):
    '''
    A single level of a numbering definition, read from a ``w:lvl`` element:
    start value, restart rule, number format, templates and the paragraph
    and character styles attached to the level.
    '''

    def __init__(self, namespace, lvl=None):
        self.namespace = namespace
        self.restart = None
        self.start = 0
        self.fmt = 'decimal'
        self.para_link = None
        self.paragraph_style = self.character_style = None
        self.is_numbered = False
        self.num_template = None
        self.bullet_template = None
        self.pic_id = None

        if lvl is not None:
            self.read_from_xml(lvl)

    def copy(self):
        ''' Return a new Level with the same attribute values. The values
        themselves (e.g. the style objects) are shared, not copied. '''
        ans = Level(self.namespace)
        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
            setattr(ans, x, getattr(self, x))
        return ans

    def format_template(self, counter, ilvl, template):
        '''
        Fill in the ``%N`` placeholders of ``template`` from ``counter``.

        ``%N`` refers to the counter of level N-1. Placeholders for levels
        deeper than ``ilvl``, or without a counter, become empty. The result
        ends with a no-break space.
        '''
        def sub(m):
            x = int(m.group(1)) - 1
            if x > ilvl or x not in counter:
                return ''
            # Counters of the other levels are used minus one
            val = counter[x] - (0 if x == ilvl else 1)
            formatter = alphabet_map.get(self.fmt, lambda x: '%d' % x)
            return formatter(val)
        return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'

    def read_from_xml(self, lvl, override=False):
        '''
        Update this level from the ``w:lvl`` element ``lvl``. Only the
        properties present in ``lvl`` are changed.

        ``override`` is accepted but not used by this method.
        '''
        XPath, get = self.namespace.XPath, self.namespace.get
        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
            try:
                self.restart = int(get(lr, 'w:val'))
            except (TypeError, ValueError):
                pass

        for lr in XPath('./w:start[@w:val]')(lvl):
            try:
                self.start = int(get(lr, 'w:val'))
            except (TypeError, ValueError):
                pass

        for rPr in XPath('./w:rPr')(lvl):
            ps = RunStyle(self.namespace, rPr)
            if self.character_style is None:
                self.character_style = ps
            else:
                self.character_style.update(ps)

        lt = None
        for lr in XPath('./w:lvlText[@w:val]')(lvl):
            lt = get(lr, 'w:val')

        for lr in XPath('./w:numFmt[@w:val]')(lvl):
            val = get(lr, 'w:val')
            if val == 'bullet':
                self.is_numbered = False
                cs = self.character_style
                # Known bullet characters, or any text in a symbol font, map
                # to a builtin bullet. Other text becomes a bullet template.
                if lt in {'\uf0a7', 'o'} or (
                        cs is not None and cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
                    self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
                else:
                    self.bullet_template = lt
                for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
                    self.pic_id = get(lpid, 'w:val')
            else:
                self.is_numbered = True
                self.fmt = STYLE_MAP.get(val, 'decimal')
                # A level text of the form %N. needs no template
                if lt and re.match(r'%\d+\.$', lt) is None:
                    self.num_template = lt

        for lr in XPath('./w:pStyle[@w:val]')(lvl):
            self.para_link = get(lr, 'w:val')

        for pPr in XPath('./w:pPr')(lvl):
            ps = ParagraphStyle(self.namespace, pPr)
            if self.paragraph_style is None:
                self.paragraph_style = ps
            else:
                self.paragraph_style.update(ps)

    def css(self, images, pic_map, rid_map):
        '''
        Return the CSS properties of the list as a dict: the
        ``list-style-type`` and, for picture bullets whose image can be
        generated, the ``list-style-image``.
        '''
        ans = {'list-style-type': self.fmt}
        if self.pic_id:
            rid = pic_map.get(self.pic_id, None)
            if rid:
                try:
                    fname = images.generate_filename(rid, rid_map=rid_map, max_width=20, max_height=20)
                except Exception:
                    # Best effort: without the image the list simply uses
                    # its list-style-type
                    fname = None
                else:
                    ans['list-style-image'] = 'url("images/%s")' % fname
        return ans

    def char_css(self):
        '''
        Return the CSS of the character style of this level, without the
        ``font-family``. Empty when the level has no character style.
        '''
        try:
            css = self.character_style.css
        except AttributeError:
            return {}
        # Strip the font family from a copy: popping it from the mapping
        # returned by the style would silently change the style itself.
        css = css.copy()
        css.pop('font-family', None)
        return css
class NumberingDefinition(object):
    '''
    The levels of one numbering definition, keyed by their integer level
    number, together with the id of the abstract numbering definition.
    '''

    def __init__(self, namespace, parent=None, an_id=None):
        self.namespace = namespace
        XPath, get = self.namespace.XPath, self.namespace.get
        self.levels = {}
        self.abstract_numbering_definition_id = an_id
        if parent is None:
            return
        for lvl in XPath('./w:lvl')(parent):
            try:
                ilvl = int(get(lvl, 'w:ilvl', 0))
            except (TypeError, ValueError):
                # Unparseable level numbers are treated as level 0
                ilvl = 0
            self.levels[ilvl] = Level(namespace, lvl)

    def copy(self):
        ''' Return a new definition with a copy of every level. '''
        clone = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
        for key, lvl in self.levels.items():
            clone.levels[key] = lvl.copy()
        return clone
class Numbering(object):
def __init__(self, namespace):
    ''' Create an empty collection, filled in by calling the object with
    the root element of the numbering definitions. '''
    self.namespace = namespace
    # abstract numbering id -> definition, numbering id -> instance
    self.definitions, self.instances = {}, {}
    # One counter per abstract numbering id, created on first use
    self.counters = defaultdict(Counter)
    self.starts, self.pic_map = {}, {}
def __call__(self, root, styles, rid_map):
    '''
    Read all numbering style definitions

    :param root: Root element containing the ``w:abstractNum``, ``w:num``
        and ``w:numPicBullet`` elements.
    :param styles: Object whose ``numbering_style_links`` maps style ids to
        numbering ids.
    :param rid_map: Relationship id map, stored for later use.
    '''
    XPath, get = self.namespace.XPath, self.namespace.get
    self.rid_map = rid_map
    for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
        npbid = get(npb, 'w:numPicBulletId')
        for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
            rid = get(idata, 'r:id')
            self.pic_map[npbid] = rid
    # Abstract definitions that only link to a numbering style can be
    # resolved only after the instances have been created
    lazy_load = {}
    for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
        an_id = get(an, 'w:abstractNumId')
        nsl = XPath('./w:numStyleLink[@w:val]')(an)
        if nsl:
            lazy_load[an_id] = get(nsl[0], 'w:val')
        else:
            nd = NumberingDefinition(self.namespace, an, an_id=an_id)
            self.definitions[an_id] = nd

    def create_instance(n, definition):
        # Copy the definition and apply the level overrides of w:num n
        nd = definition.copy()
        start_overrides = {}
        for lo in XPath('./w:lvlOverride')(n):
            try:
                ilvl = int(get(lo, 'w:ilvl'))
            except (ValueError, TypeError):
                ilvl = None
            for so in XPath('./w:startOverride[@w:val]')(lo):
                try:
                    start_override = int(get(so, 'w:val'))
                except (TypeError, ValueError):
                    pass
                else:
                    start_overrides[ilvl] = start_override
            for lvl in XPath('./w:lvl')(lo)[:1]:
                if ilvl is None:
                    # Fall back to the level number of the w:lvl itself.
                    # Levels are keyed by int, so convert it.
                    try:
                        ilvl = int(get(lvl, 'w:ilvl'))
                    except (ValueError, TypeError):
                        # No usable level number, nothing to override
                        continue
                alvl = nd.levels.get(ilvl, None)
                if alvl is None:
                    # The override defines a level the definition lacks.
                    # Store it, it used to be created and then thrown away.
                    alvl = nd.levels[ilvl] = Level(self.namespace)
                alvl.read_from_xml(lvl, override=True)
        for ilvl, so in start_overrides.items():
            try:
                # Use the value recorded for this level, not whichever
                # start override happened to be parsed last
                nd.levels[ilvl].start = so
            except KeyError:
                pass
        return nd

    next_pass = {}
    for n in XPath('./w:num[@w:numId]')(root):
        an_id = None
        num_id = get(n, 'w:numId')
        for an in XPath('./w:abstractNumId[@w:val]')(n):
            an_id = get(an, 'w:val')
        d = self.definitions.get(an_id, None)
        if d is None:
            # Possibly a style linked definition, retry below
            next_pass[num_id] = (an_id, n)
            continue
        self.instances[num_id] = create_instance(n, d)

    numbering_links = styles.numbering_style_links
    for an_id, style_link in lazy_load.items():
        instance = self.instances.get(numbering_links.get(style_link, None), None)
        if instance is None:
            # The link cannot be resolved, leave the definition out
            # instead of failing on the whole document
            continue
        self.definitions[an_id] = instance.copy()

    for num_id, (an_id, n) in next_pass.items():
        d = self.definitions.get(an_id, None)
        if d is not None:
            self.instances[num_id] = create_instance(n, d)

    for num_id, d in self.instances.items():
        self.starts[num_id] = {lvl:d.levels[lvl].start for lvl in d.levels}
def get_pstyle(self, num_id, style_id):
    ''' Return the number of the level of the numbering instance
    ``num_id`` that is linked to the paragraph style ``style_id``, or None
    if there is no such instance or level. '''
    definition = self.instances.get(num_id, None)
    if definition is None:
        return None
    for ilvl, lvl in definition.levels.items():
        if lvl.para_link == style_id:
            return ilvl
    return None
def get_para_style(self, num_id, lvl):
    ''' Return the paragraph style of level ``lvl`` of the numbering
    instance ``num_id``, or None if the instance or the level is missing. '''
    definition = self.instances.get(num_id, None)
    if definition is None:
        return None
    level = definition.levels.get(lvl, None)
    return getattr(level, 'paragraph_style', None)
def update_counter(self, counter, levelnum, levels):
    '''
    Advance the counter of level ``levelnum`` and reset the counters of the
    levels that restart after it.

    A level is reset to its start value when its ``restart`` equals
    ``levelnum + 1`` or, if it has no ``restart``, when it is the level
    directly below ``levelnum``.
    '''
    counter[levelnum] += 1
    trigger = levelnum + 1
    for ilvl, lvl in levels.items():
        restart = lvl.restart
        if restart is None:
            resets = ilvl == trigger
        else:
            resets = restart == trigger
        if resets:
            counter[ilvl] = lvl.start
def apply_markup(self, items, body, styles, object_map, images):
    # Turn the numbered paragraphs in items, an iterable of
    # (paragraph element, numbering id, level number) tuples, into list
    # markup inside body.
    seen_instances = set()
    # Pass 1: mark every numbered paragraph as an <li> carrying its counter
    # value and temporary list-* attributes that are consumed by commit()
    for p, num_id, ilvl in items:
        d = self.instances.get(num_id, None)
        if d is not None:
            lvl = d.levels.get(ilvl, None)
            if lvl is not None:
                an_id = d.abstract_numbering_definition_id
                # Counters are kept per abstract numbering definition
                counter = self.counters[an_id]
                if ilvl not in counter or num_id not in seen_instances:
                    # (Re)start from the start value recorded for this
                    # instance and level
                    counter[ilvl] = self.starts[num_id][ilvl]
                    seen_instances.add(num_id)
                p.tag = 'li'
                p.set('value', '%s' % counter[ilvl])
                p.set('list-lvl', unicode_type(ilvl))
                p.set('list-id', num_id)
                if lvl.num_template is not None:
                    val = lvl.format_template(counter, ilvl, lvl.num_template)
                    p.set('list-template', val)
                elif lvl.bullet_template is not None:
                    val = lvl.format_template(counter, ilvl, lvl.bullet_template)
                    p.set('list-template', val)
                self.update_counter(counter, ilvl, d.levels)

    # Longest template text seen per list level. It is filled in by commit()
    # but not read anywhere else in this method.
    templates = {}

    def commit(current_run):
        # Wrap a run of consecutive <li> elements of the same list and level
        # in an <ol>/<ul>, then empty current_run
        if not current_run:
            return
        start = current_run[0]
        parent = start.getparent()
        idx = parent.index(start)

        d = self.instances[start.get('list-id')]
        ilvl = int(start.get('list-lvl'))
        lvl = d.levels[ilvl]
        lvlid = start.get('list-id') + start.get('list-lvl')
        has_template = 'list-template' in start.attrib
        wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
        if has_template:
            # Marked via lvlid for the conversion to a table layout at the
            # end of apply_markup()
            wrap.set('lvlid', lvlid)
        else:
            wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
        ccss = lvl.char_css()
        if ccss:
            ccss = styles.register(ccss, 'bullet')
        parent.insert(idx, wrap)
        last_val = None
        for child in current_run:
            wrap.append(child)
            child.tail = '\n\t'
            if has_template:
                # Move the content of the item into a span ...
                span = SPAN()
                span.text = child.text
                child.text = None
                for gc in child:
                    span.append(gc)
                child.append(span)
                # ... and put a span with the template text in front of it
                span = SPAN(child.get('list-template'))
                if ccss:
                    span.set('class', ccss)
                last = templates.get(lvlid, '')
                if span.text and len(span.text) > len(last):
                    templates[lvlid] = span.text
                child.insert(0, span)
            for attr in ('list-lvl', 'list-id', 'list-template'):
                child.attrib.pop(attr, None)
            val = int(child.get('value'))
            # The value is dropped when it simply continues the sequence,
            # when the list is a <ul>, or when a list starts at 1
            if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
                child.attrib.pop('value')
            last_val = val
        current_run[-1].tail = '\n'
        del current_run[:]

    # Pass 2: group the <li> elements of every parent into runs
    parents = set()
    for child in body.iterdescendants('li'):
        parents.add(child.getparent())

    for parent in parents:
        current_run = []
        for child in parent:
            if child.tag == 'li':
                if current_run:
                    last = current_run[-1]
                    # A change of list or level ends the current run
                    if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
                        commit(current_run)
                current_run.append(child)
            else:
                # Any other element ends the current run as well
                commit(current_run)
        commit(current_run)

    # Convert the list items that use custom text for bullets into tables
    # so that they display correctly
    for wrap in body.xpath('//ol[@lvlid]'):
        wrap.attrib.pop('lvlid')
        wrap.tag = 'div'
        wrap.set('style', 'display:table')
        for i, li in enumerate(wrap.iterchildren('li')):
            li.tag = 'div'
            li.attrib.pop('value', None)
            li.set('style', 'display:table-row')
            obj = object_map[li]
            bs = styles.para_cache[obj]
            if i == 0:
                # The left margin of the first item becomes the padding of
                # the whole table
                wrap.set('style', 'display:table; padding-left:%s' %
                         bs.css.get('margin-left', '0'))
            bs.css.pop('margin-left', None)
            for child in li:
                child.set('style', 'display:table-cell')

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
class Settings(object):
    '''
    Document settings. Currently only the default tab stop, stored as the
    ``w:defaultTabStop`` value divided by 20.
    '''

    def __init__(self, namespace):
        self.default_tab_stop = 720 / 20
        self.namespace = namespace

    def __call__(self, root):
        ''' Read the settings from ``root``. Values that cannot be parsed
        are ignored. '''
        ns = self.namespace
        for dts in ns.XPath('//w:defaultTabStop[@w:val]')(root):
            try:
                raw = int(ns.get(dts, 'w:val'))
            except (ValueError, TypeError, AttributeError):
                continue
            self.default_tab_stop = raw / 20

View File

@@ -0,0 +1,504 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap
from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle
from polyglot.builtins import iteritems, itervalues
class PageProperties(object):
    '''
    Class representing page level properties (page size/margins) read from
    sectPr elements.
    '''

    def __init__(self, namespace, elems=()):
        self.width, self.height = 595.28, 841.89  # pts, A4
        self.margin_left = self.margin_right = 72  # pts

        def read(source, mapping):
            # Copy every attribute that twips() can parse onto self
            raw = [(attr, namespace.get(source, name)) for attr, name in mapping]
            for attr, value in raw:
                value = twips(value)
                if value is not None:
                    setattr(self, attr, value)

        for sectPr in elems:
            for pgSz in namespace.XPath('./w:pgSz')(sectPr):
                read(pgSz, (('width', 'w:w'), ('height', 'w:h')))
            for pgMar in namespace.XPath('./w:pgMar')(sectPr):
                read(pgMar, (('margin_left', 'w:left'), ('margin_right', 'w:right')))
class Style(object):
    '''
    Class representing a <w:style> element. Can contain block, character, etc. styles.
    '''

    def __init__(self, namespace, elem):
        self.namespace = namespace
        self.name_path = namespace.XPath('./w:name[@w:val]')
        self.based_on_path = namespace.XPath('./w:basedOn[@w:val]')
        self.resolved = False
        self.style_id = namespace.get(elem, 'w:styleId')
        self.style_type = namespace.get(elem, 'w:type')
        names = self.name_path(elem)
        self.name = namespace.get(names[-1], 'w:val') if names else None
        based_on = self.based_on_path(elem)
        self.based_on = namespace.get(based_on[0], 'w:val') if based_on else None
        if self.style_type == 'numbering':
            # Numbering styles never inherit from another style
            self.based_on = None
        self.is_default = namespace.get(elem, 'w:default') in {'1', 'on', 'true'}

        self.paragraph_style = self.character_style = self.table_style = None
        # Defined for every style type, so that it can be read without
        # getattr(). It is only ever filled in for numbering and paragraph
        # styles.
        self.numbering_style_link = None

        if self.style_type in {'paragraph', 'character', 'table'}:
            if self.style_type == 'table':
                for tblPr in namespace.XPath('./w:tblPr')(elem):
                    self.table_style = self._merged(self.table_style, TableStyle(namespace, tblPr))

            if self.style_type in {'paragraph', 'table'}:
                for pPr in namespace.XPath('./w:pPr')(elem):
                    self.paragraph_style = self._merged(self.paragraph_style, ParagraphStyle(namespace, pPr))

            for rPr in namespace.XPath('./w:rPr')(elem):
                self.character_style = self._merged(self.character_style, RunStyle(namespace, rPr))

        if self.style_type in {'numbering', 'paragraph'}:
            for x in namespace.XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
                self.numbering_style_link = namespace.get(x, 'w:val')

    @staticmethod
    def _merged(current, new):
        ''' Return ``new`` if there is no ``current`` style yet, otherwise
        ``current`` updated with ``new``. '''
        if current is None:
            return new
        current.update(new)
        return current

    def resolve_based_on(self, parent):
        '''
        Resolve the table, paragraph and character styles of this style
        against those of ``parent``. A missing style is created empty first,
        when the parent has one of that kind.
        '''
        if parent.table_style is not None:
            if self.table_style is None:
                self.table_style = TableStyle(self.namespace)
            self.table_style.resolve_based_on(parent.table_style)
        if parent.paragraph_style is not None:
            if self.paragraph_style is None:
                self.paragraph_style = ParagraphStyle(self.namespace)
            self.paragraph_style.resolve_based_on(parent.paragraph_style)
        if parent.character_style is not None:
            if self.character_style is None:
                self.character_style = RunStyle(self.namespace)
            self.character_style.resolve_based_on(parent.character_style)
class Styles(object):
'''
Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
'''
def __init__(self, namespace, tables):
    ''' Create an empty collection, filled in by calling the object with
    the root element of the styles. '''
    self.namespace, self.tables = namespace, tables
    # style id -> Style, in document order
    self.id_map = OrderedDict()
    # Caches of already computed styles
    self.para_cache, self.para_char_cache, self.run_cache = {}, {}, {}
    self.classes = {}
    self.counter = Counter()
    # style type -> default Style, style id -> numbering id
    self.default_styles, self.numbering_style_links = {}, {}
    self.default_paragraph_style = None
    self.default_character_style = None
def __iter__(self):
    ''' Iterate over all styles, in the order they were added. '''
    return iter(self.id_map.values())
def __getitem__(self, key):
    ''' Return the style with the id ``key``, raising KeyError if there is
    no such style. '''
    style = self.id_map[key]
    return style
def __len__(self):
    ''' Return the number of styles that have a style id. '''
    count = len(self.id_map)
    return count
def get(self, key, default=None):
    ''' Return the style with the id ``key``, or ``default`` if there is no
    such style. '''
    lookup = self.id_map.get
    return lookup(key, default)
def __call__(self, root, fonts, theme):
    '''
    Read all styles and the document defaults from ``root`` (which may be
    None) and resolve the ``basedOn`` inheritance between the styles.

    ``fonts`` and ``theme`` are stored on the object for later use.
    '''
    self.fonts, self.theme = fonts, theme
    self.default_paragraph_style = self.default_character_style = None
    if root is not None:
        for s in self.namespace.XPath('//w:style')(root):
            s = Style(self.namespace, s)
            if s.style_id:
                self.id_map[s.style_id] = s
            if s.is_default:
                self.default_styles[s.style_type] = s
            if getattr(s, 'numbering_style_link', None) is not None:
                self.numbering_style_links[s.style_id] = s.numbering_style_link

        for dd in self.namespace.XPath('./w:docDefaults')(root):
            for pd in self.namespace.XPath('./w:pPrDefault')(dd):
                for pPr in self.namespace.XPath('./w:pPr')(pd):
                    ps = ParagraphStyle(self.namespace, pPr)
                    if self.default_paragraph_style is None:
                        self.default_paragraph_style = ps
                    else:
                        self.default_paragraph_style.update(ps)
            for pd in self.namespace.XPath('./w:rPrDefault')(dd):
                for pPr in self.namespace.XPath('./w:rPr')(pd):
                    ps = RunStyle(self.namespace, pPr)
                    if self.default_character_style is None:
                        self.default_character_style = ps
                    else:
                        self.default_character_style.update(ps)

    # Styles whose resolution is under way. A basedOn chain that loops back
    # onto one of them would otherwise recurse without end.
    in_progress = set()

    def resolve(s, p):
        in_progress.add(id(s))
        try:
            # A style based on itself has nothing to inherit
            if p is not None and p is not s:
                if not p.resolved and id(p) not in in_progress:
                    # Parents are resolved first
                    resolve(p, self.get(p.based_on))
                s.resolve_based_on(p)
            s.resolved = True
        finally:
            in_progress.discard(id(s))

    for s in self:
        if not s.resolved:
            resolve(s, self.get(s.based_on))
def para_val(self, parent_styles, direct_formatting, attr):
val = getattr(direct_formatting, attr)
if val is inherit:
for ps in reversed(parent_styles):
pval = getattr(ps, attr)
if pval is not inherit:
val = pval
break
return val
def run_val(self, parent_styles, direct_formatting, attr):
val = getattr(direct_formatting, attr)
if val is not inherit:
return val
if attr in direct_formatting.toggle_properties:
# The spec (section 17.7.3) does not make sense, so we follow the behavior
# of Word, which seems to only consider the document default if the
# property has not been defined in any styles.
vals = [int(getattr(rs, attr)) for rs in parent_styles if rs is not self.default_character_style and getattr(rs, attr) is not inherit]
if vals:
return sum(vals) % 2 == 1
if self.default_character_style is not None:
return getattr(self.default_character_style, attr) is True
return False
for rs in reversed(parent_styles):
rval = getattr(rs, attr)
if rval is not inherit:
return rval
return val
def resolve_paragraph(self, p):
ans = self.para_cache.get(p, None)
if ans is None:
linked_style = None
ans = self.para_cache[p] = ParagraphStyle(self.namespace)
ans.style_name = None
direct_formatting = None
is_section_break = False
for pPr in self.namespace.XPath('./w:pPr')(p):
ps = ParagraphStyle(self.namespace, pPr)
if direct_formatting is None:
direct_formatting = ps
else:
direct_formatting.update(ps)
if self.namespace.XPath('./w:sectPr')(pPr):
is_section_break = True
if direct_formatting is None:
direct_formatting = ParagraphStyle(self.namespace)
parent_styles = []
if self.default_paragraph_style is not None:
parent_styles.append(self.default_paragraph_style)
ts = self.tables.para_style(p)
if ts is not None:
parent_styles.append(ts)
default_para = self.default_styles.get('paragraph', None)
if direct_formatting.linked_style is not None:
ls = linked_style = self.get(direct_formatting.linked_style)
if ls is not None:
ans.style_name = ls.name
ps = ls.paragraph_style
if ps is not None:
parent_styles.append(ps)
if ls.character_style is not None:
self.para_char_cache[p] = ls.character_style
elif default_para is not None:
if default_para.paragraph_style is not None:
parent_styles.append(default_para.paragraph_style)
if default_para.character_style is not None:
self.para_char_cache[p] = default_para.character_style
def has_numbering(block_style):
num_id, lvl = getattr(block_style, 'numbering_id', inherit), getattr(block_style, 'numbering_level', inherit)
return num_id is not None and num_id is not inherit and lvl is not None and lvl is not inherit
is_numbering = has_numbering(direct_formatting)
is_section_break = is_section_break and not self.namespace.XPath('./w:r')(p)
if is_numbering and not is_section_break:
num_id, lvl = direct_formatting.numbering_id, direct_formatting.numbering_level
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
if (
not is_numbering and not is_section_break and linked_style is not None and has_numbering(linked_style.paragraph_style)
):
num_id, lvl = linked_style.paragraph_style.numbering_id, linked_style.paragraph_style.numbering_level
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
is_numbering = True
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
for attr in ans.all_properties:
if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
ans.linked_style = direct_formatting.linked_style
return ans
def resolve_run(self, r):
ans = self.run_cache.get(r, None)
if ans is None:
p = self.namespace.XPath('ancestor::w:p[1]')(r)
p = p[0] if p else None
ans = self.run_cache[r] = RunStyle(self.namespace)
direct_formatting = None
for rPr in self.namespace.XPath('./w:rPr')(r):
rs = RunStyle(self.namespace, rPr)
if direct_formatting is None:
direct_formatting = rs
else:
direct_formatting.update(rs)
if direct_formatting is None:
direct_formatting = RunStyle(self.namespace)
parent_styles = []
default_char = self.default_styles.get('character', None)
if self.default_character_style is not None:
parent_styles.append(self.default_character_style)
pstyle = self.para_char_cache.get(p, None)
if pstyle is not None:
parent_styles.append(pstyle)
# As best as I can understand the spec, table overrides should be
# applied before paragraph overrides, but word does it
# this way, see the December 2007 table header in the demo
# document.
ts = self.tables.run_style(p)
if ts is not None:
parent_styles.append(ts)
if direct_formatting.linked_style is not None:
ls = getattr(self.get(direct_formatting.linked_style), 'character_style', None)
if ls is not None:
parent_styles.append(ls)
elif default_char is not None and default_char.character_style is not None:
parent_styles.append(default_char.character_style)
for attr in ans.all_properties:
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
if ans.font_family is not inherit:
ff = self.theme.resolve_font_family(ans.font_family)
ans.font_family = self.fonts.family_for(ff, ans.b, ans.i)
return ans
def resolve(self, obj):
if obj.tag.endswith('}p'):
return self.resolve_paragraph(obj)
if obj.tag.endswith('}r'):
return self.resolve_run(obj)
def cascade(self, layers):
self.body_font_family = 'serif'
self.body_font_size = '10pt'
self.body_color = 'black'
def promote_property(char_styles, block_style, prop):
vals = {getattr(s, prop) for s in char_styles}
if len(vals) == 1:
# All the character styles have the same value
for s in char_styles:
setattr(s, prop, inherit)
setattr(block_style, prop, next(iter(vals)))
for p, runs in iteritems(layers):
has_links = '1' in {r.get('is-link', None) for r in runs}
char_styles = [self.resolve_run(r) for r in runs]
block_style = self.resolve_paragraph(p)
for prop in ('font_family', 'font_size', 'cs_font_family', 'cs_font_size', 'color'):
if has_links and prop == 'color':
# We cannot promote color as browser rendering engines will
# override the link color setting it to blue, unless the
# color is specified on the link element itself
continue
promote_property(char_styles, block_style, prop)
for s in char_styles:
if s.text_decoration == 'none':
# The default text decoration is 'none'
s.text_decoration = inherit
def promote_most_common(block_styles, prop, default):
c = Counter()
for s in block_styles:
val = getattr(s, prop)
if val is not inherit:
c[val] += 1
val = None
if c:
val = c.most_common(1)[0][0]
for s in block_styles:
oval = getattr(s, prop)
if oval is inherit:
if default != val:
setattr(s, prop, default)
elif oval == val:
setattr(s, prop, inherit)
return val
block_styles = tuple(self.resolve_paragraph(p) for p in layers)
ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
if ff is not None:
self.body_font_family = ff
fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
if fs is not None:
self.body_font_size = '%.3gpt' % fs
color = promote_most_common(block_styles, 'color', self.body_color)
if color is not None:
self.body_color = color
def resolve_numbering(self, numbering):
# When a numPr element appears inside a paragraph style, the lvl info
# must be discarded and pStyle used instead.
self.numbering = numbering
for style in self:
ps = style.paragraph_style
if ps is not None and ps.numbering_id is not inherit:
lvl = numbering.get_pstyle(ps.numbering_id, style.style_id)
if lvl is None:
ps.numbering_id = ps.numbering_level = inherit
else:
ps.numbering_level = lvl
def apply_contextual_spacing(self, paras):
last_para = None
for p in paras:
if last_para is not None:
ls = self.resolve_paragraph(last_para)
ps = self.resolve_paragraph(p)
if ls.linked_style is not None and ls.linked_style == ps.linked_style:
if ls.contextualSpacing is True:
ls.margin_bottom = 0
if ps.contextualSpacing is True:
ps.margin_top = 0
last_para = p
def apply_section_page_breaks(self, paras):
for p in paras:
ps = self.resolve_paragraph(p)
ps.pageBreakBefore = True
def register(self, css, prefix):
h = hash(frozenset(iteritems(css)))
ans, _ = self.classes.get(h, (None, None))
if ans is None:
self.counter[prefix] += 1
ans = '%s_%d' % (prefix, self.counter[prefix])
self.classes[h] = (ans, css)
return ans
def generate_classes(self):
for bs in itervalues(self.para_cache):
css = bs.css
if css:
self.register(css, 'block')
for bs in itervalues(self.run_cache):
css = bs.css
if css:
self.register(css, 'text')
def class_name(self, css):
h = hash(frozenset(iteritems(css)))
return self.classes.get(h, (None, None))[0]
def generate_css(self, dest_dir, docx, notes_nopb, nosupsub):
ef = self.fonts.embed_fonts(dest_dir, docx)
s = '''\
body { font-family: %s; font-size: %s; color: %s }
/* In word all paragraphs have zero margins unless explicitly specified in a style */
p, h1, h2, h3, h4, h5, h6, div { margin: 0; padding: 0 }
/* In word headings only have bold font if explicitly specified,
similarly the font size is the body font size, unless explicitly set. */
h1, h2, h3, h4, h5, h6 { font-weight: normal; font-size: 1rem }
/* Setting padding-left to zero breaks rendering of lists, so we only set the other values to zero and leave padding-left for the user-agent */
ul, ol { margin: 0; padding-top: 0; padding-bottom: 0; padding-right: 0 }
/* The word hyperlink styling will set text-decoration to underline if needed */
a { text-decoration: none }
sup.noteref a { text-decoration: none }
h1.notes-header { page-break-before: always }
dl.footnote dt { font-size: large }
dl.footnote dt a { text-decoration: none }
'''
if not notes_nopb:
s += '''\
dl.footnote { page-break-after: always }
dl.footnote:last-of-type { page-break-after: avoid }
'''
s = s + '''\
span.tab { white-space: pre }
p.index-entry { text-indent: 0pt; }
p.index-entry a:visited { color: blue }
p.index-entry a:hover { color: red }
'''
if nosupsub:
s = s + '''\
sup { vertical-align: top }
sub { vertical-align: bottom }
'''
prefix = textwrap.dedent(s) % (self.body_font_family, self.body_font_size, self.body_color)
if ef:
prefix = ef + '\n' + prefix
ans = []
for (cls, css) in sorted(itervalues(self.classes), key=lambda x:x[0]):
b = ('\t%s: %s;' % (k, v) for k, v in iteritems(css))
b = '\n'.join(b)
ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
return prefix + '\n' + '\n'.join(ans)

View File

@@ -0,0 +1,700 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle, border_to_css
from calibre.ebooks.docx.char_styles import RunStyle
from polyglot.builtins import filter, iteritems, itervalues, range, unicode_type
# Read from XML {{{
# Expose the imported shading reader under the same read_* naming scheme
# as the other property readers defined below.
read_shd = rs
# The four sides of a table or cell, in the order they are processed.
edges = ('left', 'top', 'right', 'bottom')
def _read_width(elem, get):
ans = inherit
try:
w = int(get(elem, 'w:w'))
except (TypeError, ValueError):
w = 0
typ = get(elem, 'w:type', 'auto')
if typ == 'nil':
ans = '0'
elif typ == 'auto':
ans = 'auto'
elif typ == 'dxa':
ans = '%.3gpt' % (w/20)
elif typ == 'pct':
ans = '%.3g%%' % (w/50)
return ans
def read_width(parent, dest, XPath, get):
    ''' Set dest.width from the last w:tblW child of parent, or to inherit if there is none. '''
    widths = [_read_width(tblW, get) for tblW in XPath('./w:tblW')(parent)]
    dest.width = widths[-1] if widths else inherit
def read_cell_width(parent, dest, XPath, get):
    ''' Set dest.width from the last w:tcW child of parent, or to inherit if there is none. '''
    widths = [_read_width(tcW, get) for tcW in XPath('./w:tcW')(parent)]
    dest.width = widths[-1] if widths else inherit
def read_padding(parent, dest, XPath, get):
    '''
    Set dest.cell_padding_<side> for every side in edges, reading the
    margins from w:tblCellMar (when parent is a tblPr) or from w:tcMar.
    Sides that are not specified are set to inherit.
    '''
    if parent.tag.endswith('}tblPr'):
        container = 'tblCellMar'
    else:
        container = 'tcMar'
    padding = dict.fromkeys(edges, inherit)
    for mar in XPath('./w:%s' % container)(parent):
        for side in edges:
            for node in XPath('./w:%s' % side)(mar):
                padding[side] = _read_width(node, get)
    for side in edges:
        setattr(dest, 'cell_padding_%s' % side, padding[side])
def read_justification(parent, dest, XPath, get):
    '''
    Translate w:jc into auto margins on dest: left leaves the right margin
    free, right leaves the left margin free and center frees both. Margins
    that are not affected are set to inherit.
    '''
    margin_left = margin_right = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if val in ('left', 'center'):
            margin_right = 'auto'
        if val in ('right', 'center'):
            margin_left = 'auto'
    dest.margin_left = margin_left
    dest.margin_right = margin_right
def read_spacing(parent, dest, XPath, get):
    ''' Set dest.spacing from the last w:tblCellSpacing child of parent, or to inherit if there is none. '''
    values = [_read_width(cs, get) for cs in XPath('./w:tblCellSpacing')(parent)]
    dest.spacing = values[-1] if values else inherit
def read_float(parent, dest, XPath, get):
    '''
    Set dest.float to a dict of the attributes of the last w:tblpPr child
    of parent, keyed by attribute name with any {namespace} prefix
    removed. Set it to inherit when there is no such child.
    '''
    nodes = list(XPath('./w:tblpPr')(parent))
    if nodes:
        positioning = {}
        for qname, value in nodes[-1].attrib.items():
            positioning[qname.rpartition('}')[-1]] = value
    else:
        positioning = inherit
    dest.float = positioning
def read_indent(parent, dest, XPath, get):
    ''' Set dest.indent from the last w:tblInd child of parent, or to inherit if there is none. '''
    values = [_read_width(ind, get) for ind in XPath('./w:tblInd')(parent)]
    dest.indent = values[-1] if values else inherit
# Border positions read for tables and cells: the four outer sides plus the
# insideH/insideV borders.
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
def read_borders(parent, dest, XPath, get):
    '''
    Read the border properties for every entry of border_edges into dest,
    from w:tblBorders when parent is a tblPr and from w:tcBorders otherwise.
    '''
    if parent.tag.endswith('}tblPr'):
        container = 'tblBorders'
    else:
        container = 'tcBorders'
    read_border(parent, dest, XPath, get, border_edges, container)
def read_height(parent, dest, XPath, get):
    '''
    Set dest.height to a (rule, value) tuple taken from the last
    w:trHeight child of parent whose w:hRule (default auto) is one of
    auto, atLeast or exact. Set it to inherit if there is no such child.
    '''
    known_rules = frozenset(('auto', 'atLeast', 'exact'))
    heights = []
    for rh in XPath('./w:trHeight')(parent):
        rule = get(rh, 'w:hRule', 'auto')
        if rule in known_rules:
            heights.append((rule, get(rh, 'w:val')))
    dest.height = heights[-1] if heights else inherit
def read_vertical_align(parent, dest, XPath, get):
    '''
    Set dest.vertical_align to the CSS equivalent of the last w:vAlign
    child of parent. Unknown values map to middle; without any w:vAlign
    child the property is set to inherit.
    '''
    css_values = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}
    nodes = list(XPath('./w:vAlign')(parent))
    if nodes:
        dest.vertical_align = css_values.get(get(nodes[-1], 'w:val'), 'middle')
    else:
        dest.vertical_align = inherit
def read_col_span(parent, dest, XPath, get):
    '''
    Set dest.col_span to the integer value of the last w:gridSpan child of
    parent that has a parseable w:val, or to inherit if there is none.
    '''
    spans = []
    for gs in XPath('./w:gridSpan')(parent):
        try:
            spans.append(int(get(gs, 'w:val')))
        except (TypeError, ValueError):
            pass  # ignore unparseable values, earlier ones stay in effect
    dest.col_span = spans[-1] if spans else inherit
def read_merge(parent, dest, XPath, get):
    '''
    Set dest.hMerge and dest.vMerge from the last w:hMerge/w:vMerge child
    of parent. A merge element without w:val means continue; a missing
    element leaves the property as inherit.
    '''
    for name in ('hMerge', 'vMerge'):
        values = [get(m, 'w:val', 'continue') for m in XPath('./w:%s' % name)(parent)]
        setattr(dest, name, values[-1] if values else inherit)
def read_band_size(parent, dest, XPath, get):
    '''
    Set dest.col_band_size and dest.row_band_size from the
    w:tblStyleColBandSize/w:tblStyleRowBandSize children of parent.

    The default is 1. Values that are not positive integers are ignored,
    since the band size is later used as a divisor when working out which
    band a row or column belongs to.
    '''
    for axis in ('Col', 'Row'):
        size = 1
        for node in XPath('./w:tblStyle%sBandSize' % axis)(parent):
            try:
                val = int(get(node, 'w:val'))
            except (TypeError, ValueError):
                continue
            if val > 0:
                size = val
        setattr(dest, '%s_band_size' % axis.lower(), size)
def read_look(parent, dest, XPath, get):
    '''
    Set dest.look to the w:val of the last w:tblLook child of parent that
    parses as a hexadecimal integer, or to 0 if there is none.
    '''
    flags = 0
    for node in XPath('./w:tblLook')(parent):
        raw = get(node, 'w:val')
        try:
            flags = int(raw, 16)
        except (ValueError, TypeError):
            pass  # keep whatever was read so far
    dest.look = flags
# }}}
def clone(style):
    '''
    Return a new object of the same type as style carrying the same
    property values. Returns None when style is None or when its type
    cannot be constructed from just a namespace.
    '''
    if style is not None:
        try:
            duplicate = type(style)(style.namespace)
        except TypeError:
            duplicate = None
        else:
            duplicate.update(style)
        return duplicate
    return None
class Style(object):

    ''' Behaviour shared by the row, cell and table style classes. '''

    is_bidi = False

    def update(self, other):
        ''' Copy every property that other defines (is not inherit) onto self. '''
        for prop in self.all_properties:
            nval = getattr(other, prop)
            if nval is not inherit:
                setattr(self, prop, nval)

    def apply_bidi(self):
        ''' Mark this style as belonging to a right-to-left table. '''
        self.is_bidi = True

    def convert_spacing(self):
        ''' Return the CSS border-collapse/border-spacing rules for self.spacing. '''
        ans = {}
        if self.spacing is not inherit:
            if self.spacing in {'auto', '0'}:
                ans['border-collapse'] = 'collapse'
            else:
                ans['border-collapse'] = 'separate'
                ans['border-spacing'] = self.spacing
        return ans

    def convert_border(self):
        '''
        Return the CSS border and padding rules for the four sides. For
        bidi styles the left and right rules are exchanged.
        '''
        c = {}
        for x in edges:
            border_to_css(x, self, c)
            val = getattr(self, 'padding_%s' % x)
            if val is not inherit:
                c['padding-%s' % x] = '%.3gpt' % val
        if self.is_bidi:
            self._mirror_horizontal(c)
        return c

    @staticmethod
    def _mirror_horizontal(c):
        '''
        Exchange the left and right variants of the padding and border
        rules in c, in place. A rule present on only one side moves to the
        other side, rather than being duplicated onto both.
        '''
        for a in ('padding-%s', 'border-%s-style', 'border-%s-color', 'border-%s-width'):
            left, right = a % 'left', a % 'right'
            l, r = c.pop(left, None), c.pop(right, None)
            if l is not None:
                c[right] = l
            if r is not None:
                c[left] = r
class RowStyle(Style):

    ''' Properties of a table row, optionally read from a w:trPr element. '''

    all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)

    def __init__(self, namespace, trPr=None):
        self.namespace = namespace
        if trPr is not None:
            xpath, get = namespace.XPath, namespace.get
            for name in ('hidden', 'cantSplit'):
                setattr(self, name, binary_property(trPr, name, xpath, get))
            for reader in (read_spacing, read_height):
                reader(trPr, self, namespace.XPath, namespace.get)
        else:
            for name in self.all_properties:
                setattr(self, name, inherit)
        self._css = None

    @property
    def css(self):
        ''' The CSS rules for this row as a dict, computed on first access. '''
        if self._css is not None:
            return self._css
        # Publish the dict before filling it, so the result is cached
        rules = self._css = {}
        if self.hidden is True:
            rules['display'] = 'none'
        if self.cantSplit is True:
            rules['page-break-inside'] = 'avoid'
        if self.height is not inherit:
            rule, val = self.height
            if rule != 'auto':
                try:
                    points = int(val)/20
                except (ValueError, TypeError):
                    pass
                else:
                    key = 'min-height' if rule == 'atLeast' else 'height'
                    rules[key] = '%.3gpt' % points
        rules.update(self.convert_spacing())
        return rules
class CellStyle(Style):

    ''' Properties of a table cell, optionally read from a w:tcPr element. '''

    all_properties = (
        'background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, namespace, tcPr=None):
        self.namespace = namespace
        if tcPr is not None:
            readers = (
                read_borders, read_shd, read_padding, read_cell_width,
                read_vertical_align, read_col_span, read_merge,
            )
            for reader in readers:
                reader(tcPr, self, namespace.XPath, namespace.get)
            # The row span is never read from the markup
            self.row_span = inherit
        else:
            for name in self.all_properties:
                setattr(self, name, inherit)
        self._css = None

    @property
    def css(self):
        ''' The CSS rules for this cell as a dict, computed on first access. '''
        if self._css is not None:
            return self._css
        rules = self._css = {}
        if self.background_color is not inherit:
            rules['background-color'] = self.background_color
        if self.width not in (inherit, 'auto'):
            rules['width'] = self.width
        if self.vertical_align is inherit:
            rules['vertical-align'] = 'top'
        else:
            rules['vertical-align'] = self.vertical_align
        for side in edges:
            key = 'padding-%s' % side
            val = getattr(self, 'cell_padding_%s' % side)
            if val not in (inherit, 'auto'):
                rules[key] = val
            elif val is inherit and side in {'left', 'right'}:
                rules[key] = '%.3gpt' % (115/20)
        # In Word, tables are apparently rendered with some default top and
        # bottom padding irrespective of the cellMargin values. Simulate
        # that here.
        for side in ('top', 'bottom'):
            key = 'padding-%s' % side
            if rules.get(key, '0pt') == '0pt':
                rules[key] = '0.5ex'
        rules.update(self.convert_border())
        return rules
class TableStyle(Style):

    '''
    Properties of a table, optionally read from a w:tblPr element. When
    the tblPr belongs to a w:style element, the conditional formatting
    (w:tblStylePr) of that style is collected in self.overrides.
    '''

    all_properties = (
        'width', 'float', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
        'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look', 'bidi',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, namespace, tblPr=None):
        self.namespace = namespace
        if tblPr is None:
            for name in self.all_properties:
                setattr(self, name, inherit)
        else:
            self.overrides = inherit
            self.bidi = binary_property(tblPr, 'bidiVisual', namespace.XPath, namespace.get)
            readers = (
                read_width, read_float, read_padding, read_shd, read_justification,
                read_spacing, read_indent, read_borders, read_band_size, read_look,
            )
            for reader in readers:
                reader(tblPr, self, self.namespace.XPath, self.namespace.get)
            owner = tblPr.getparent()
            if self.namespace.is_tag(owner, 'w:style'):
                self._read_overrides(owner)
        self._css = None

    def _read_overrides(self, style_elem):
        ''' Collect the w:tblStylePr conditional formats of style_elem, keyed by their w:type. '''
        ns = self.namespace
        self.overrides = {}
        # (key in the override dict, child element to look for, style class)
        kinds = (
            ('table', './w:tblPr', TableStyle),
            ('row', './w:trPr', RowStyle),
            ('cell', './w:tcPr', CellStyle),
            ('para', './w:pPr', ParagraphStyle),
            ('run', './w:rPr', RunStyle),
        )
        for tblStylePr in ns.XPath('./w:tblStylePr[@w:type]')(style_elem):
            otype = ns.get(tblStylePr, 'w:type')
            orides = self.overrides[otype] = {}
            for key, expr, factory in kinds:
                for props in ns.XPath(expr)(tblStylePr):
                    orides[key] = factory(ns, props)

    def resolve_based_on(self, parent):
        ''' Fill every property that is still inherit with the value from parent. '''
        for name in self.all_properties:
            if getattr(self, name) is inherit:
                setattr(self, name, getattr(parent, name))

    def _float_css(self, rules):
        ''' Add the margin and float rules of a floating table to rules. '''
        positioning = self.float
        for side in ('left', 'top', 'right', 'bottom'):
            distance = positioning.get('%sFromText' % side, 0)
            try:
                distance = '%.3gpt' % (int(distance) / 20)
            except (ValueError, TypeError):
                distance = '0'
            rules['margin-%s' % side] = distance
        if 'tblpXSpec' in positioning:
            if positioning['tblpXSpec'] in {'right', 'outside'}:
                rules['float'] = 'right'
            else:
                rules['float'] = 'left'
            return
        # No explicit side given: decide from the horizontal position
        # relative to the text width of the page
        page = self.page
        page_width = page.width - page.margin_left - page.margin_right
        try:
            x = int(positioning['tblpX']) / 20
        except (KeyError, ValueError, TypeError):
            x = 0
        rules['float'] = 'left' if (x/page_width) < 0.65 else 'right'

    @property
    def css(self):
        ''' The CSS rules for this table as a dict, computed on first access. '''
        if self._css is not None:
            return self._css
        rules = self._css = {}
        if self.width not in (inherit, 'auto'):
            rules['width'] = self.width
        for name in ('background_color', 'margin_left', 'margin_right'):
            val = getattr(self, name)
            if val is not inherit:
                rules[name.replace('_', '-')] = val
        if self.indent not in (inherit, 'auto') and self.margin_left != 'auto':
            rules['margin-left'] = self.indent
        if self.float is not inherit:
            self._float_css(rules)
        rules.update(self.convert_spacing())
        if 'border-collapse' not in rules:
            rules['border-collapse'] = 'collapse'
        rules.update(self.convert_border())
        return rules
class Table(object):

    '''
    A single w:tbl element: resolves the styles of its rows, cells and
    paragraphs, handles merged cells and generates the HTML markup.
    '''

    def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
        '''
        :param tbl: The w:tbl element.
        :param styles: Style collection, queried with get() for the table style and used to register CSS classes.
        :param para_map: Mapping that is updated to point every paragraph directly inside a cell of this table at this Table.
        :param is_sub_table: True when this table is nested inside a cell of another table.
        '''
        self.namespace = namespace
        self.tbl = tbl
        self.styles = styles
        self.is_sub_table = is_sub_table

        # Read Table Style
        style = {'table':TableStyle(self.namespace)}
        for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
            for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
                style_id = self.namespace.get(ts, 'w:val')
                s = styles.get(style_id)
                if s is not None:
                    if s.table_style is not None:
                        style['table'].update(s.table_style)
                    if s.paragraph_style is not None:
                        if 'paragraph' in style:
                            style['paragraph'].update(s.paragraph_style)
                        else:
                            style['paragraph'] = s.paragraph_style
                    if s.character_style is not None:
                        if 'run' in style:
                            style['run'].update(s.character_style)
                        else:
                            style['run'] = s.character_style
            # Direct table properties take precedence over the named style
            style['table'].update(TableStyle(self.namespace, tblPr))
        self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
        self.run_style = style.get('run', None)
        self.overrides = self.table_style.overrides
        if self.overrides is inherit:
            self.overrides = {}
        if 'wholeTable' in self.overrides and 'table' in self.overrides['wholeTable']:
            self.table_style.update(self.overrides['wholeTable']['table'])

        self.style_map = {}
        self.paragraphs = []
        self.cell_map = []

        rows = self.namespace.XPath('./w:tr')(tbl)
        for r, tr in enumerate(rows):
            overrides = self.get_overrides(r, None, len(rows), None)
            self.resolve_row_style(tr, overrides)
            cells = self.namespace.XPath('./w:tc')(tr)
            self.cell_map.append([])
            for c, tc in enumerate(cells):
                overrides = self.get_overrides(r, c, len(rows), len(cells))
                self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
                self.cell_map[-1].append(tc)
                for p in self.namespace.XPath('./w:p')(tc):
                    para_map[p] = self
                    self.paragraphs.append(p)
                    self.resolve_para_style(p, overrides)

        self.handle_merged_cells()
        self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}

    @property
    def bidi(self):
        return self.table_style.bidi is True

    def override_allowed(self, name):
        'Check if the named override is allowed by the tblLook element'
        if name.endswith('Cell') or name == 'wholeTable':
            return True
        look = self.table_style.look
        try:
            look = int(look)
        except (TypeError, ValueError):
            # No usable tblLook value was read for this table
            look = 0
        if (look & 0x0020 and name == 'firstRow') or (look & 0x0040 and name == 'lastRow') or \
           (look & 0x0080 and name == 'firstCol') or (look & 0x0100 and name == 'lastCol'):
            return True
        if name.startswith('band'):
            if name.endswith('Horz'):
                return not bool(look & 0x0200)
            if name.endswith('Vert'):
                return not bool(look & 0x0400)
        return False

    @staticmethod
    def _band_index(position, band_size):
        '''
        Zero-based index of the band that contains the row/column at
        position. Band sizes that are not positive integers count as 1, so
        that a missing or malformed size cannot abort the conversion.
        '''
        try:
            band_size = int(band_size)
        except (TypeError, ValueError):
            band_size = 1
        if band_size < 1:
            band_size = 1
        return position // band_size

    def get_overrides(self, r, c, num_of_rows, num_of_cols_in_row):
        'List of possible overrides for the given para'
        overrides = ['wholeTable']

        if c is not None:
            odd_column_band = (self._band_index(c, self.table_style.col_band_size) % 2) == 1
            overrides.append('band%dVert' % (1 if odd_column_band else 2))
        odd_row_band = (self._band_index(r, self.table_style.row_band_size) % 2) == 1
        overrides.append('band%dHorz' % (1 if odd_row_band else 2))

        # According to the OOXML spec columns should have higher override
        # priority than rows, but Word seems to do it the other way around.
        if c is not None:
            if c == 0:
                overrides.append('firstCol')
            if c >= num_of_cols_in_row - 1:
                overrides.append('lastCol')
        if r == 0:
            overrides.append('firstRow')
        if r >= num_of_rows - 1:
            overrides.append('lastRow')
        if c is not None:
            if r == 0:
                if c == 0:
                    overrides.append('nwCell')
                if c == num_of_cols_in_row - 1:
                    overrides.append('neCell')
            if r == num_of_rows - 1:
                if c == 0:
                    overrides.append('swCell')
                if c == num_of_cols_in_row - 1:
                    overrides.append('seCell')
        return tuple(filter(self.override_allowed, overrides))

    def resolve_row_style(self, tr, overrides):
        ''' Compute the RowStyle for tr from the applicable overrides and its own w:trPr and store it in style_map. '''
        rs = RowStyle(self.namespace)
        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                ors = ovr.get('row', None)
                if ors is not None:
                    rs.update(ors)

        for trPr in self.namespace.XPath('./w:trPr')(tr):
            rs.update(RowStyle(self.namespace, trPr))
        if self.bidi:
            rs.apply_bidi()
        self.style_map[tr] = rs

    def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
        '''
        Compute the CellStyle for tc from the applicable overrides and its
        own w:tcPr, fill in padding and borders from the table style where
        the cell does not define them, and store it in style_map.
        '''
        cs = CellStyle(self.namespace)
        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                ors = ovr.get('cell', None)
                if ors is not None:
                    cs.update(ors)

        for tcPr in self.namespace.XPath('./w:tcPr')(tc):
            cs.update(CellStyle(self.namespace, tcPr))

        for x in edges:
            p = 'cell_padding_%s' % x
            val = getattr(cs, p)
            if val is inherit:
                setattr(cs, p, getattr(self.table_style, p))

            # An edge is "inside" when another cell lies beyond it
            is_inside_edge = (
                (x == 'left' and col > 0) or
                (x == 'top' and row > 0) or
                (x == 'right' and col < cols_in_row - 1) or
                (x == 'bottom' and row < rows -1)
            )
            inside_edge = ('insideH' if x in {'top', 'bottom'} else 'insideV') if is_inside_edge else None
            for prop in border_props:
                if not prop.startswith('border'):
                    continue
                eprop = prop % x
                iprop = (prop % inside_edge) if inside_edge else None
                val = getattr(cs, eprop)
                if val is inherit and iprop is not None:
                    # Use the insideX borders if the main cell borders are not
                    # specified
                    val = getattr(cs, iprop)
                    if val is inherit:
                        val = getattr(self.table_style, iprop)
                if not is_inside_edge and val == 'none':
                    # Cell borders must override table borders even when the
                    # table border is not null and the cell border is null.
                    val = 'hidden'
                setattr(cs, eprop, val)

        if self.bidi:
            cs.apply_bidi()
        self.style_map[tc] = cs

    def resolve_para_style(self, p, overrides):
        '''
        Store in style_map[p] a two item list holding the paragraph style
        and the run style that this table contributes to the paragraph p.
        '''
        text_styles = [clone(self.paragraph_style), clone(self.run_style)]

        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                for i, name in enumerate(('para', 'run')):
                    ops = ovr.get(name, None)
                    if ops is not None:
                        if text_styles[i] is None:
                            text_styles[i] = ops
                        else:
                            text_styles[i].update(ops)
        self.style_map[p] = text_styles

    @staticmethod
    def _remove_cell(tc):
        # A cell that takes part in both a vertical and a horizontal merge
        # has already been detached by the time the second pass reaches it
        parent = tc.getparent()
        if parent is not None:
            parent.remove(tc)

    def handle_merged_cells(self):
        '''
        Turn runs of vMerge/hMerge cells into row_span/col_span values on
        the first cell of each run and remove the remaining cells of the
        run from the document tree.
        '''
        if not self.cell_map:
            return
        # Handle vMerge
        max_col_num = max(len(r) for r in self.cell_map)
        for c in range(max_col_num):
            cells = [row[c] if c < len(row) else None for row in self.cell_map]
            runs = [[]]
            for cell in cells:
                try:
                    s = self.style_map[cell]
                except KeyError:  # cell is None
                    s = CellStyle(self.namespace)
                if s.vMerge == 'restart':
                    runs.append([cell])
                elif s.vMerge == 'continue':
                    runs[-1].append(cell)
                else:
                    runs.append([])
            for run in runs:
                if len(run) > 1:
                    self.style_map[run[0]].row_span = len(run)
                    for tc in run[1:]:
                        self._remove_cell(tc)

        # Handle hMerge
        for cells in self.cell_map:
            runs = [[]]
            for cell in cells:
                try:
                    s = self.style_map[cell]
                except KeyError:  # cell is None
                    s = CellStyle(self.namespace)
                if s.col_span is not inherit:
                    # An explicit gridSpan takes precedence over hMerge
                    runs.append([])
                    continue
                if s.hMerge == 'restart':
                    runs.append([cell])
                elif s.hMerge == 'continue':
                    runs[-1].append(cell)
                else:
                    runs.append([])

            for run in runs:
                if len(run) > 1:
                    self.style_map[run[0]].col_span = len(run)
                    for tc in run[1:]:
                        self._remove_cell(tc)

    def __iter__(self):
        ''' Iterate over the paragraphs of this table, followed by those of its sub-tables. '''
        for p in self.paragraphs:
            yield p
        for t in itervalues(self.sub_tables):
            for p in t:
                yield p

    def apply_markup(self, rmap, page, parent=None):
        '''
        Build the HTML table for this table and move the HTML of its
        paragraphs into the generated cells.

        :param rmap: Mapping of document paragraph element to its HTML element.
        :param page: Page properties, stored on the table style.
        :param parent: HTML element to append the table to. When None, the
            table is inserted in front of the HTML of its first paragraph;
            nothing is done if the table has no paragraphs at all.
        '''
        table = TABLE('\n\t\t')
        if self.bidi:
            table.set('dir', 'rtl')
        self.table_style.page = page
        style_map = {}
        if parent is None:
            try:
                first_para = rmap[next(iter(self))]
            except StopIteration:
                return
            parent = first_para.getparent()
            idx = parent.index(first_para)
            parent.insert(idx, table)
        else:
            parent.append(table)
        for row in self.namespace.XPath('./w:tr')(self.tbl):
            tr = TR('\n\t\t\t')
            style_map[tr] = self.style_map[row]
            tr.tail = '\n\t\t'
            table.append(tr)
            for tc in self.namespace.XPath('./w:tc')(row):
                td = TD()
                style_map[td] = s = self.style_map[tc]
                if s.col_span is not inherit:
                    td.set('colspan', unicode_type(s.col_span))
                if s.row_span is not inherit:
                    td.set('rowspan', unicode_type(s.row_span))
                td.tail = '\n\t\t\t'
                tr.append(td)
                for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
                    if x.tag.endswith('}p'):
                        td.append(rmap[x])
                    else:
                        self.sub_tables[x].apply_markup(rmap, page, parent=td)
            if len(tr):
                tr[-1].tail = '\n\t\t'
        if len(table):
            table[-1].tail = '\n\t'

        table_style = self.table_style.css
        if table_style:
            table.set('class', self.styles.register(table_style, 'table'))
        for elem, style in iteritems(style_map):
            css = style.css
            if css:
                elem.set('class', self.styles.register(css, elem.tag))
class Tables(object):

    ''' Registry of the top level tables of a document and of the paragraphs they contain. '''

    def __init__(self, namespace):
        self.tables = []
        self.para_map = {}  # paragraph element -> Table containing it
        self.sub_tables = set()  # nested w:tbl elements, handled by their parent Table
        self.namespace = namespace

    def register(self, tbl, styles):
        ''' Create a Table for tbl, unless tbl is already known as a nested table. '''
        if tbl in self.sub_tables:
            return
        table = Table(self.namespace, tbl, styles, self.para_map)
        self.tables.append(table)
        self.sub_tables.update(table.sub_tables)

    def apply_markup(self, object_map, page_map):
        ''' Generate the HTML for every registered table. '''
        rmap = {}
        for html_obj, docx_obj in iteritems(object_map):
            rmap[docx_obj] = html_obj
        for table in self.tables:
            table.apply_markup(rmap, page_map[table.tbl])

    def para_style(self, p):
        ''' Paragraph style contributed by the table containing p, or None. '''
        table = self.para_map.get(p, None)
        if table is None:
            return None
        return table.style_map.get(p, (None, None))[0]

    def run_style(self, p):
        ''' Run style contributed by the table containing p, or None. '''
        table = self.para_map.get(p, None)
        if table is None:
            return None
        return table.style_map.get(p, (None, None))[1]

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
class Theme(object):

    ''' The major and minor latin fonts of the document theme. '''

    def __init__(self, namespace):
        self.major_latin_font = 'Cambria'
        self.minor_latin_font = 'Calibri'
        self.namespace = namespace

    def __call__(self, root):
        ''' Read the latin typefaces of every a:fontScheme under root. '''
        XPath = self.namespace.XPath
        targets = (
            ('./a:majorFont', 'major_latin_font'),
            ('./a:minorFont', 'minor_latin_font'),
        )
        for scheme in XPath('//a:fontScheme')(root):
            for expr, attr in targets:
                for font in XPath(expr)(scheme):
                    for latin in XPath('./a:latin[@typeface]')(font):
                        setattr(self, attr, latin.get('typeface'))

    def resolve_font_family(self, ff):
        '''
        Map a theme font reference (a name starting with |) to the major or
        minor latin font. Any other name is returned unchanged.
        '''
        if not ff.startswith('|'):
            return ff
        if ff[1:-1].startswith('major'):
            return self.major_latin_font
        return self.minor_latin_font

View File

@@ -0,0 +1,839 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re, math, errno, uuid, numbers
from collections import OrderedDict, defaultdict
from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
from calibre import guess_type
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XML, generate_anchor
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, itervalues, filter, getcwd, map, unicode_type
# U+00A0, the no-break space character
NBSP = '\xa0'
class Text:

    '''
    Accumulates text destined for an attribute of an element (buf is
    written to elem.attr) and remembers every element it was pointed at.
    '''

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf
        self.elems = [elem]

    def add_elem(self, elem):
        '''
        Write the buffered text to the current element, then continue
        with an empty buffer that targets the tail of elem.
        '''
        self.elems.append(elem)
        setattr(self.elem, self.attr, ''.join(self.buf))
        self.elem = elem
        self.attr = 'tail'
        self.buf = []

    def __iter__(self):
        return iter(self.elems)
def html_lang(docx_lang):
    ''' Convert a language value from the DOCX into the form returned by
    lang_as_iso639_1(). Returns None when the language cannot be
    canonicalized, is ``und`` or has no such form. '''
    canonical = canonicalize_lang(docx_lang)
    if not canonical or canonical == 'und':
        return None
    short = lang_as_iso639_1(canonical)
    if not short:
        return None
    return short
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
    ''' Open the DOCX container and build the empty HTML skeleton that the
    conversion fills in.

    :param path_or_stream: Passed straight to DOCX()
    :param dest_dir: Output directory, defaults to the current directory
    :param log: Passed to DOCX(); the log actually used is ``self.docx.log``
    :param detect_cover: Forwarded to cleanup_markup() at the end of conversion
    :param notes_text: Heading for the notes section, defaults to _('Notes')
    :param notes_nopb: Forwarded to Styles.generate_css()
    :param nosupsub: Forwarded to Styles.generate_css()
    '''
    docx = self.docx = DOCX(path_or_stream, log=log)
    ns = self.namespace = docx.namespace
    self.ms_pat = re.compile(r'\s{2,}')
    self.ws_pat = re.compile(r'[\n\r\t]')
    self.log = docx.log
    self.detect_cover = detect_cover
    self.notes_text = notes_text or _('Notes')
    self.notes_nopb = notes_nopb
    self.nosupsub = nosupsub
    self.dest_dir = dest_dir or getcwd()
    self.mi = docx.metadata
    body = self.body = BODY()

    # Helper objects, all sharing the namespace of this document
    self.theme = Theme(ns)
    self.settings = Settings(ns)
    self.tables = Tables(ns)
    self.fields = Fields(ns)
    self.styles = Styles(ns, self.tables)
    self.images = Images(ns, self.log)
    self.object_map = OrderedDict()

    head = HEAD(
        META(charset='utf-8'),
        TITLE(self.mi.title or _('Unknown')),
        LINK(rel='stylesheet', type='text/css', href='docx.css'),
    )
    root = self.html = HTML(head, body)

    # Whitespace so that the serialized skeleton is indented
    root.text = '\n\t'
    head.text = '\n\t\t'
    head.tail = '\n'
    for child in head:
        child.tail = '\n\t\t'
    head[-1].tail = '\n\t'
    body.text = body.tail = '\n'

    lang = html_lang(self.mi.language)
    if lang:
        root.set('lang', lang)
    self.doc_lang = lang if lang else None
def __call__(self):
    ''' Run the whole conversion.

    Converts the paragraphs of the main document and of the notes into
    HTML under ``self.body``, then resolves links, styles, tables,
    numbering and frames, assigns CSS classes and finally returns the
    result of ``self.write(doc)``. '''
    doc = self.docx.document
    relationships_by_id, relationships_by_type = self.docx.document_relationships
    self.resolve_alternate_content(doc)
    self.fields(doc, self.log)
    self.read_styles(relationships_by_type)
    self.images(relationships_by_id)
    # Per-conversion state, reset on every call
    self.layers = OrderedDict()
    self.framed = [[]]
    self.frame_map = {}
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    self.link_source_map = {}
    self.toc_anchor = None
    self.block_runs = []
    paras = []
    self.log.debug('Converting Word markup to HTML')
    self.read_page_properties(doc)
    self.current_rels = relationships_by_id
    # page_map also contains tables, only paragraphs are converted here
    for wp, page_properties in iteritems(self.page_map):
        self.current_page = page_properties
        if wp.tag.endswith('}p'):
            p = self.convert_p(wp)
            self.body.append(p)
            paras.append(wp)
    self.read_block_anchors(doc)
    self.styles.apply_contextual_spacing(paras)
    self.mark_block_runs(paras)
    # Apply page breaks at the start of every section, except the first
    # section (since that will be the start of the file)
    self.styles.apply_section_page_breaks(self.section_starts[1:])
    notes_header = None
    # The image relationship map is swapped per note below and restored
    # once the tab handling that follows is done
    orig_rid_map = self.images.rid_map
    if self.footnotes.has_notes:
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        for anchor, text, note in self.footnotes:
            dl = DL(id=anchor)
            dl.set('class', 'footnote')
            self.body.append(dl)
            # NOTE(review): the empty string prefixed to text has no effect,
            # it looks like a character was lost here -- confirm against
            # the upstream source
            dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text)))
            dl[-1][0].tail = ']'
            dl.append(DD())
            paras = []
            self.images.rid_map = self.current_rels = note.rels[0]
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    p = self.convert_p(wp)
                    dl[-1].append(p)
                    paras.append(wp)
            self.styles.apply_contextual_spacing(paras)
            self.mark_block_runs(paras)
    for p, wp in iteritems(self.object_map):
        if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
            # Paragraph uses tabs for indentation, convert to text-indent
            parent = p[0]
            tabs = []
            # Collect the leading tabs, stopping after the first tab that
            # is followed by text
            for child in parent:
                if child.get('class', None) == 'tab':
                    tabs.append(child)
                    if child.tail:
                        break
                else:
                    break
            indent = len(tabs) * self.settings.default_tab_stop
            style = self.styles.resolve(wp)
            # Only done when there is no text-indent or it is expressed in pt
            if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                if style.text_indent is not inherit:
                    indent = float(style.text_indent[:-2]) + indent
                style.text_indent = '%.3gpt' % indent
                parent.text = tabs[-1].tail or ''
                list(map(parent.remove, tabs))
    self.images.rid_map = orig_rid_map
    self.resolve_links()
    self.styles.cascade(self.layers)
    self.tables.apply_markup(self.object_map, self.page_map)
    numbered = []
    for html_obj, obj in iteritems(self.object_map):
        # The value has the form level:num_id
        raw = obj.get('calibre_num_id', None)
        if raw is not None:
            lvl, num_id = raw.partition(':')[0::2]
            try:
                lvl = int(lvl)
            except (TypeError, ValueError):
                lvl = 0
            numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()
    # Put every top-level child of <body> on its own line
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'
    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in iteritems(self.object_map):
        style = self.styles.resolve(obj)
        if style is not None:
            css = style.css
            if css:
                cls = self.styles.class_name(css)
                if cls:
                    html_obj.set('class', cls)
    for html_obj, css in iteritems(self.framed_map):
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)
    if notes_header is not None:
        # Make the notes heading look like the first h1/h2/h3 child of
        # <body>: same tag and, if that heading has one, the same class
        for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
            notes_header.tag = h.tag
            cls = h.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break
    self.fields.polish_markup(self.object_map)
    self.log.debug('Cleaning up redundant markup generated by Word')
    self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
    return self.write(doc)
def read_page_properties(self, doc):
    ''' Build ``self.page_map``, mapping every paragraph and table of doc
    to the PageProperties of its section, and ``self.section_starts``,
    the first block of every section.

    A paragraph containing a ``w:sectPr`` closes the section made up of
    itself and all blocks collected since the previous section. Blocks
    left over at the end use the ``w:sectPr`` of the document body.
    Tables are also registered with ``self.tables``. '''
    ns = self.namespace
    self.page_map = OrderedDict()
    self.section_starts = []
    pending = []
    for block in ns.descendants(doc, 'w:p', 'w:tbl'):
        if block.tag.endswith('}tbl'):
            self.tables.register(block, self.styles)
            pending.append(block)
            continue
        sect = tuple(ns.descendants(block, 'w:sectPr'))
        pending.append(block)
        if not sect:
            continue
        props = PageProperties(ns, sect)
        for member in pending:
            self.page_map[member] = props
        self.section_starts.append(pending[0])
        pending = []
    if not pending:
        return
    self.section_starts.append(pending[0])
    body_sect = ns.XPath('./w:body/w:sectPr')(doc)
    props = PageProperties(ns, body_sect)
    for member in pending:
        self.page_map[member] = props
def resolve_alternate_content(self, doc):
    ''' Remove the ``mc:Choice`` children of every ``mc:AlternateContent``
    element that also has an ``mc:Fallback`` child. '''
    # For proprietary extensions in Word documents use the fallback, spec
    # compliant form
    # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
    ns = self.namespace
    for alternate in ns.descendants(doc, 'mc:AlternateContent'):
        choices = ns.XPath('./mc:Choice')(alternate)
        if not ns.XPath('./mc:Fallback')(alternate):
            # Nothing to fall back to, keep the choices
            continue
        for choice in choices:
            alternate.remove(choice)
def read_styles(self, relationships_by_type):
    ''' Load the auxiliary parts of the document: settings, footnotes and
    endnotes, the font table, the theme, the styles and the numbering
    definitions.

    Creates ``self.numbering``, ``self.footnotes`` and ``self.fonts``. A
    part that cannot be read is logged as a warning and skipped; the
    styles are always initialised, with None when no styles part could be
    read.

    :param relationships_by_type: Mapping of relationship type to the name
        of the target part, for the main document
    '''

    def get_name(rtype, defname):
        # Use the part named by the relationship. Without a relationship,
        # fall back to a part called defname stored next to the main
        # document, if it exists.
        name = relationships_by_type.get(rtype, None)
        if name is None:
            cname = self.docx.document_name.split('/')
            cname[-1] = defname
            candidate = '/'.join(cname)
            if self.docx.exists(candidate):
                name = candidate
        # Drop a doubled leading directory when the name as given does not
        # exist in the container
        if name and name.startswith('word/word') and not self.docx.exists(name):
            name = name.partition('/')[2]
        return name

    names = self.namespace.names
    nname = get_name(names['NUMBERING'], 'numbering.xml')
    sname = get_name(names['STYLES'], 'styles.xml')
    sename = get_name(names['SETTINGS'], 'settings.xml')
    fname = get_name(names['FONTS'], 'fontTable.xml')
    tname = get_name(names['THEMES'], 'theme1.xml')
    foname = get_name(names['FOOTNOTES'], 'footnotes.xml')
    enname = get_name(names['ENDNOTES'], 'endnotes.xml')
    numbering = self.numbering = Numbering(self.namespace)
    footnotes = self.footnotes = Footnotes(self.namespace)
    fonts = self.fonts = Fonts(self.namespace)

    foraw = enraw = None
    forel, enrel = ({}, {}), ({}, {})
    if sename is not None:
        try:
            seraw = self.docx.read(sename)
        except KeyError:
            self.log.warn('Settings %s do not exist' % sename)
        except EnvironmentError as e:
            if e.errno != errno.ENOENT:
                raise
            self.log.warn('Settings %s file missing' % sename)
        else:
            self.settings(fromstring(seraw))

    if foname is not None:
        try:
            foraw = self.docx.read(foname)
        except KeyError:
            self.log.warn('Footnotes %s do not exist' % foname)
        else:
            forel = self.docx.get_relationships(foname)
    if enname is not None:
        try:
            enraw = self.docx.read(enname)
        except KeyError:
            self.log.warn('Endnotes %s do not exist' % enname)
        else:
            enrel = self.docx.get_relationships(enname)
    footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

    if fname is not None:
        embed_relationships = self.docx.get_relationships(fname)[0]
        try:
            raw = self.docx.read(fname)
        except KeyError:
            self.log.warn('Fonts table %s does not exist' % fname)
        else:
            fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

    if tname is not None:
        try:
            raw = self.docx.read(tname)
        except KeyError:
            # Report the theme part itself, not the styles part
            self.log.warn('Theme %s does not exist' % tname)
        else:
            self.theme(fromstring(raw))

    styles_loaded = False
    if sname is not None:
        try:
            raw = self.docx.read(sname)
        except KeyError:
            self.log.warn('Styles %s do not exist' % sname)
        else:
            self.styles(fromstring(raw), fonts, self.theme)
            styles_loaded = True
    if not styles_loaded:
        self.styles(None, fonts, self.theme)

    if nname is not None:
        try:
            raw = self.docx.read(nname)
        except KeyError:
            self.log.warn('Numbering styles %s do not exist' % nname)
        else:
            numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

    self.styles.resolve_numbering(numbering)
def write(self, doc):
    ''' Write ``index.html``, ``docx.css`` (when there is any CSS),
    ``metadata.opf`` and ``toc.ncx`` into the destination directory. An
    empty ``toc.ncx`` is deleted again.

    Returns the path of the generated ``metadata.opf``. '''
    dest = self.dest_dir
    toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
    raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
    with lopen(os.path.join(dest, 'index.html'), 'wb') as f:
        f.write(raw)
    css = self.styles.generate_css(dest, self.docx, self.notes_nopb, self.nosupsub)
    if css:
        with lopen(os.path.join(dest, 'docx.css'), 'wb') as f:
            f.write(css.encode('utf-8'))

    opf = OPFCreator(dest, self.mi)
    opf.toc = toc
    opf.create_manifest_from_files_in([dest])
    # List the HTML file with the media type guessed for .xhtml files
    for item in opf.manifest:
        if item.media_type == 'text/html':
            item.media_type = guess_type('a.xhtml')[0]
    opf.create_spine(['index.html'])
    if self.cover_image is not None:
        opf.guide.set_cover(self.cover_image)

    def process_guide(E, guide):
        # Add a guide reference to the table of contents, if one was found
        if self.toc_anchor is None:
            return
        guide.append(E.reference(
            href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))

    toc_file = os.path.join(dest, 'toc.ncx')
    opf_file = os.path.join(dest, 'metadata.opf')
    with lopen(opf_file, 'wb') as of, open(toc_file, 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
    if os.path.getsize(toc_file) == 0:
        os.remove(toc_file)
    return opf_file
def read_block_anchors(self, doc):
    ''' Attach the named bookmarks that are direct children of the
    document body to the next converted paragraph.

    All such bookmarks seen since the last converted paragraph are mapped
    in ``self.anchor_map`` to the id of that paragraph; an id is generated
    for the paragraph if it has none. '''
    ns = self.namespace
    body_bookmarks = frozenset(ns.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not body_bookmarks:
        return
    pending = set()
    html_for = {v: k for k, v in iteritems(self.object_map)}
    for node in ns.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if node.tag.endswith('}p'):
            if not pending or node not in html_for:
                continue
            para = html_for[node]
            if 'id' not in para.attrib:
                para.set('id', generate_anchor(next(iter(pending)), frozenset(itervalues(self.anchor_map))))
            for bookmark in pending:
                self.anchor_map[bookmark] = para.get('id')
            pending = set()
        elif node in body_bookmarks:
            bookmark = ns.get(node, 'w:name')
            if bookmark:
                pending.add(bookmark)
def convert_p(self, p):
    ''' Convert the paragraph p into an HTML element and return it.

    The runs of p are converted with convert_run() and appended as
    children. Bookmarks become ids, runs inside hyperlinks are recorded in
    ``self.link_map`` and a paragraph whose style is named ``heading N``
    becomes an ``<hN>`` element carrying a ``data-heading-level``
    attribute. '''
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.frame_map[p] = style.frame
    self.add_frame(dest, style.frame)
    # Anchor waiting to be applied to the next run (or to dest)
    current_anchor = None
    current_hyperlink = None
    hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

    def p_parent(x):
        # Ensure that nested <w:p> tags are handled. These can occur if a
        # textbox is present inside a paragraph.
        while True:
            x = x.getparent()
            try:
                if x.tag.endswith('}p'):
                    return x
            except AttributeError:
                break

    for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
        # Skip anything that belongs to a nested paragraph
        if p_parent(x) is not p:
            continue
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # The first run gives its anchor to the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = self.namespace.get(x, 'w:name')
            if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                # _GoBack is a special bookmark inserted by Word 2010 for
                # the return to previous edit feature, we ignore it
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x
        elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
            # A TOC field: remember its anchor in self.toc_anchor. It is
            # stored in anchor_map under a random key.
            old_anchor = current_anchor
            anchor = unicode_type(uuid.uuid4())
            self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
            self.toc_anchor = current_anchor
            if old_anchor is not None:
                # The previous anchor was not applied to any element
                for a, t in tuple(iteritems(self.anchor_map)):
                    if t == old_anchor:
                        self.anchor_map[a] = current_anchor
    if current_anchor is not None:
        # This paragraph had no <w:r> descendants
        dest.set('id', current_anchor)
        current_anchor = None
    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        # Clamp the level to the range of HTML heading tags
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n
        dest.set('data-heading-level', unicode_type(n))
    if style.bidi is True:
        dest.set('dir', 'rtl')
    # Group consecutive spans that have the same border, so that each group
    # can be wrapped in a single bordered <span>. Note that style is
    # rebound to run styles from here on.
    # NOTE(review): the span that ends a group is not used to start the
    # next one, and the group still open when the loop ends is never added
    # to common_borders -- confirm that this is intended
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(style):
            border_runs.append((span, style))
        elif border_runs:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, style in border_run:
            style.get_border_css(bs)
            style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)
    if not dest.text and len(dest) == 0 and not style.has_visible_border():
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP
    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        if dest[-1].tag == 'br':
            dest[-1].tail = NBSP
        elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
            dest[-1][-1].tail = NBSP
    return dest
def wrap_elems(self, elems, wrapper):
    ''' Move elems into wrapper and put wrapper where the first of them
    used to be. The tail of the last element is transferred to wrapper.

    Returns wrapper. '''
    first, last = elems[0], elems[-1]
    container = first.getparent()
    container.insert(container.index(first), wrapper)
    wrapper.tail, last.tail = last.tail, None
    for node in elems:
        try:
            container.remove(node)
        except ValueError:
            # Probably a hyperlink that spans multiple
            # paragraphs,theoretically we should break this up into
            # multiple hyperlinks, but I can't be bothered.
            node.getparent().remove(node)
        wrapper.append(node)
    return wrapper
def resolve_links(self):
    ''' Turn the recorded hyperlinks into ``<a>`` elements.

    Handles, in this order: ``w:hyperlink`` elements recorded in
    ``self.link_map`` (which also fills ``self.resolved_link_map``),
    hyperlink fields from ``self.fields`` and linked images from
    ``self.images``. '''
    self.resolved_link_map = {}
    for hyperlink, spans in iteritems(self.link_map):
        relationships_by_id = self.link_source_map[hyperlink]
        # A link covering several spans gets a wrapper element, which is
        # then the one that becomes the <a>
        span = spans[0]
        if len(spans) > 1:
            span = self.wrap_elems(spans, SPAN())
        span.tag = 'a'
        self.resolved_link_map[hyperlink] = span
        tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
        if tgt:
            span.set('target', tgt)
        tt = self.namespace.get(hyperlink, 'w:tooltip')
        if tt:
            span.set('title', tt)
        # A relationship target takes precedence over an anchor
        rid = self.namespace.get(hyperlink, 'r:id')
        if rid and rid in relationships_by_id:
            span.set('href', relationships_by_id[rid])
            continue
        anchor = self.namespace.get(hyperlink, 'w:anchor')
        if anchor and anchor in self.anchor_map:
            span.set('href', '#' + self.anchor_map[anchor])
            continue
        self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                      (rid, anchor))
        # hrefs that point nowhere give epubcheck a hernia. The element
        # should be styled explicitly by Word anyway.
        # span.set('href', '#')
    rmap = {v:k for k, v in iteritems(self.object_map)}
    for hyperlink, runs in self.fields.hyperlink_fields:
        # Only runs that were actually converted can be linked
        spans = [rmap[r] for r in runs if r in rmap]
        if not spans:
            continue
        span = spans[0]
        if len(spans) > 1:
            span = self.wrap_elems(spans, SPAN())
        span.tag = 'a'
        tgt = hyperlink.get('target', None)
        if tgt:
            span.set('target', tgt)
        tt = hyperlink.get('title', None)
        if tt:
            span.set('title', tt)
        url = hyperlink.get('url', None)
        if url is None:
            anchor = hyperlink.get('anchor', None)
            if anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
        else:
            # A URL that is the name of a known anchor is an internal link
            if url in self.anchor_map:
                span.set('href', '#' + self.anchor_map[url])
                continue
            span.set('href', url)
    for img, link, relationships_by_id in self.images.links:
        # Wrap the image in an <a> that takes over its position and tail
        parent = img.getparent()
        idx = parent.index(img)
        a = A(img)
        a.tail, img.tail = img.tail, None
        parent.insert(idx, a)
        tgt = link.get('target', None)
        if tgt:
            a.set('target', tgt)
        tt = link.get('title', None)
        if tt:
            a.set('title', tt)
        rid = link['id']
        if rid in relationships_by_id:
            dest = relationships_by_id[rid]
            if dest.startswith('#'):
                # Internal link, only set when the anchor is known
                if dest[1:] in self.anchor_map:
                    a.set('href', '#' + self.anchor_map[dest[1:]])
            else:
                a.set('href', dest)
def convert_run(self, run):
    ''' Convert the run into a ``<span>`` and return it.

    Text, line and page breaks, images, note references, tabs and special
    hyphens found among the children of run are converted in document
    order. Afterwards attributes that depend on the resolved run style
    are set: ``data-docx-vert``, ``lang`` and ``dir``. Text in a symbol
    font is mapped with map_symbol_text() and the font family of the
    style is changed to ``sans-serif``. '''
    ans = SPAN()
    self.object_map[ans] = run
    text = Text(ans, 'text', [])

    for child in run:
        if self.namespace.is_tag(child, 'w:t'):
            if not child.text:
                continue
            space = child.get(XML('space'), None)
            ctext = child.text
            if space != 'preserve':
                # Remove leading and trailing whitespace. Word ignores
                # leading and trailing whitespace without preserve
                ctext = ctext.strip(' \n\r\t')
            # Only use a <span> with white-space:pre-wrap if this element
            # actually needs it, i.e. if it has more than one
            # consecutive space or it has newlines or tabs.
            preserve = self.ms_pat.search(ctext) is not None or self.ws_pat.search(ctext) is not None
            if preserve:
                text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                ans.append(text.elem)
            else:
                text.buf.append(ctext)
        elif self.namespace.is_tag(child, 'w:cr'):
            text.add_elem(BR())
            ans.append(text.elem)
        elif self.namespace.is_tag(child, 'w:br'):
            typ = self.namespace.get(child, 'w:type')
            if typ in {'column', 'page'}:
                br = BR(style='page-break-after:always')
            else:
                # Read clear the same way as w:type above, as a namespaced
                # attribute. The un-namespaced lookup is kept as a fallback
                # so that anything it matched before still matches.
                clear = self.namespace.get(child, 'w:clear') or child.get('clear', None)
                if clear in {'all', 'left', 'right'}:
                    br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                else:
                    br = BR()
            text.add_elem(br)
            ans.append(text.elem)
        elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
            for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                text.add_elem(img)
                ans.append(text.elem)
        elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
            anchor, name = self.footnotes.get_ref(child)
            if anchor and name:
                # The id is the target of the back link of the note
                l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
                l.set('class', 'noteref')
                text.add_elem(l)
                ans.append(text.elem)
        elif self.namespace.is_tag(child, 'w:tab'):
            # A tab becomes a run of non-breaking spaces, their number
            # derived from the default tab stop
            spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
            text.add_elem(SPAN(NBSP * spaces))
            ans.append(text.elem)
            ans[-1].set('class', 'tab')
        elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
            text.buf.append('\u2011')
        elif self.namespace.is_tag(child, 'w:softHyphen'):
            text.buf.append('\u00ad')
    # Flush the text that follows the last element
    if text.buf:
        setattr(text.elem, text.attr, ''.join(text.buf))

    style = self.styles.resolve_run(run)
    if style.vert_align in {'superscript', 'subscript'}:
        if ans.text or len(ans):
            ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
    if style.lang is not inherit:
        lang = html_lang(style.lang)
        if lang is not None and lang != self.doc_lang:
            ans.set('lang', lang)
    if style.rtl is True:
        ans.set('dir', 'rtl')
    if is_symbol_font(style.font_family):
        for elem in text:
            if elem.text:
                elem.text = map_symbol_text(elem.text, style.font_family)
            if elem.tail:
                elem.tail = map_symbol_text(elem.tail, style.font_family)
        style.font_family = 'sans-serif'
    return ans
def add_frame(self, html_obj, style):
    ''' Record html_obj as part of the frame described by style.

    ``self.framed`` is a list of runs, each a list of (html_obj, style)
    pairs that apply_frames() wraps in a single element using the style
    of the first pair. An unframed paragraph (style is inherit) ends the
    current run. A paragraph whose frame style differs from that of the
    previous paragraph starts a run of its own; previously it was added
    to the existing run and so silently took on the frame of that run. '''
    last_run = self.framed[-1]
    if style is inherit:
        if last_run:
            self.framed.append([])
        return

    if last_run and not (last_run[-1][1] == style):
        # A different frame directly follows the previous one
        self.framed.append([(html_obj, style)])
    else:
        last_run.append((html_obj, style))
def apply_frames(self):
    ''' Wrap every run of framed paragraphs, and every run of paragraphs
    recorded in ``self.block_runs``, in a ``<div>``. The CSS for each
    ``<div>`` is stored in ``self.framed_map`` and registered with the
    styles. '''
    for frame_run in filter(None, self.framed):
        frame_style = frame_run[0][1]
        members = tuple(entry[0] for entry in frame_run)
        first = members[0]
        container = first.getparent()
        pos = container.index(first)
        frame = DIV(*members)
        container.insert(pos, frame)
        css = frame_style.css(self.page_map[self.object_map[first]])
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')

    if not self.block_runs:
        return

    def climb(node, stop):
        # The ancestor of node that is a child of stop, or the topmost
        # ancestor when stop is never reached
        while True:
            above = node.getparent()
            if above is stop or above is None:
                return node
            node = above

    html_for = {v: k for k, v in iteritems(self.object_map)}
    for border_style, blocks in self.block_runs:
        members = tuple(html_for[block] for block in blocks)
        has_li = any(member.tag == 'li' for member in members)
        container = members[0].getparent()
        if container.tag in ('ul', 'ol'):
            # Wrap the whole list rather than its items
            the_list = container
            container = the_list.getparent()
            pos = container.index(the_list)
            frame = DIV(the_list)
        else:
            if has_li:
                members = tuple(climb(member, container) for member in members)
            pos = container.index(members[0])
            frame = DIV(*members)
        container.insert(pos, frame)
        css = border_style.css
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')
def mark_block_runs(self, paras):
    ''' Find runs of two or more consecutive paragraphs in paras that are
    in the same frame and have identical borders, and move their borders
    onto a shared border style.

    Runs with a visible border are recorded in ``self.block_runs`` as
    (border_style, run) pairs. '''

    def process_run(run):
        max_left = max_right = 0
        # Decided by the first paragraph of the run
        has_visible_border = None
        for p in run:
            style = self.styles.resolve_paragraph(p)
            if has_visible_border is None:
                has_visible_border = style.has_visible_border()
            if isinstance(style.margin_left, numbers.Number):
                max_left = max(style.margin_left, max_left)
            if isinstance(style.margin_right, numbers.Number):
                max_right = max(style.margin_right, max_right)
            if has_visible_border:
                # The horizontal margins move to the shared border style
                style.margin_left = style.margin_right = inherit
            if p is not run[0]:
                style.padding_top = 0
            else:
                # The first paragraph provides the border of the run, and
                # its top margin when the border is visible
                border_style = style.clone_border_styles()
                if has_visible_border:
                    border_style.margin_top, style.margin_top = style.margin_top, inherit
            if p is not run[-1]:
                style.padding_bottom = 0
            else:
                # The last paragraph provides the bottom margin
                if has_visible_border:
                    border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
            style.clear_borders()
            if p is not run[-1]:
                style.apply_between_border()
        if has_visible_border:
            border_style.margin_left, border_style.margin_right = max_left,max_right
            self.block_runs.append((border_style, run))

    run = []
    for p in paras:
        if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
            style = self.styles.resolve_paragraph(p)
            last_style = self.styles.resolve_paragraph(run[-1])
            if style.has_identical_borders(last_style):
                run.append(p)
                continue
        # p does not continue the current run
        if len(run) > 1:
            process_run(run)
        run = [p]
    if len(run) > 1:
        process_run(run)
if __name__ == '__main__':
    # Convert the file named by the last command line argument into a
    # freshly created docx_input directory under the current directory
    import shutil
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    dest_dir = os.path.join(getcwd(), 'docx_input')
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.mkdir(dest_dir)
    converter = Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)
    converter()

View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple
from itertools import count
from lxml.etree import tostring
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
from polyglot.builtins import iteritems, range
def from_headings(body, log, namespace, num_levels=3):
    ''' Create a TOC from headings in the document

    Only headings whose ``data-heading-level`` is between 1 and num_levels
    are used. Each one is nested under the most recent heading of a lower
    level, and is given an id if it has none. Returns the TOC, or None
    when it would have at most one node. '''
    tocroot = TOC()
    headings = body.xpath('//*[@data-heading-level]')

    # Most recent TOC node at each depth, depth 0 being the root
    latest = {0: tocroot}
    for depth in range(1, num_levels + 1):
        latest[depth] = None

    heading_depth = {}
    if num_levels > 0:
        for node in headings:
            depth = int(node.get('data-heading-level'))
            if 1 <= depth <= num_levels:
                heading_depth[node] = depth

    ids = count(1)
    for node in headings:
        depth = heading_depth.get(node, None)
        if depth is None:
            continue
        # Find the nearest shallower depth that has a node
        parent_depth = depth - 1
        while latest[parent_depth] is None:
            parent_depth -= 1
        parent = latest[parent_depth]
        depth = parent_depth + 1
        node_id = node.get('id', None)
        if not node_id:
            node_id = 'toc_id_%d' % next(ids)
            node.set('id', node_id)
        latest[depth] = parent.add_item('index.html', node_id, elem_to_toc_text(node))
        # Anything deeper belongs to an earlier branch
        for deeper in range(depth + 1, num_levels + 1):
            latest[deeper] = None

    if len(tuple(tocroot.flat())) > 1:
        log('Generating Table of Contents from headings')
        return tocroot
    return None
def structure_toc(entries):
    ''' Build a TOC from entries that have ``text``, ``anchor`` and
    ``indent`` attributes, using the distinct indent values, in ascending
    order, as nesting levels. With more than six distinct indents a flat
    TOC is returned instead. '''
    indent_vals = sorted({entry.indent for entry in entries})
    newtoc = TOC()

    if len(indent_vals) > 6:
        for entry in entries:
            newtoc.add_item('index.html', entry.anchor, entry.text)
        return newtoc

    level_of = {indent: pos for pos, indent in enumerate(indent_vals)}
    # Most recent TOC node at each level
    last_found = [None] * len(indent_vals)
    for entry in entries:
        level = level_of[entry.indent]
        # The parent is the closest shallower level that has a node
        parent = newtoc
        for candidate in reversed(last_found[:level]):
            if candidate is not None:
                parent = candidate
                break
        last_found[level] = parent.add_item('index.html', entry.anchor, entry.text)
        last_found[level + 1:] = [None] * (len(last_found) - level - 1)

    return newtoc
def link_to_txt(a, styles, object_map):
    ''' Return the text of the link element a, stripped of surrounding
    whitespace and excluding its tail.

    When a has more than one child, children whose resolved style has
    ``display: none`` are removed from a first, so their text is left
    out. Note that this modifies a. '''
    if len(a) > 1:
        # Iterate over a snapshot, as children are removed from a inside
        # the loop
        for child in tuple(a):
            run = object_map.get(child, None)
            if run is None:
                continue
            rs = styles.resolve(run)
            if rs is not None and rs.css.get('display', None) == 'none':
                a.remove(child)

    return tostring(a, method='text', with_tail=False, encoding='unicode').strip()
def from_toc(docx, link_map, styles, object_map, log, namespace):
    ''' Create a TOC from the first Word TOC field found in docx.

    The hyperlinks inside the field that were resolved (those present in
    link_map) become the entries, with the left margin of their paragraph
    used as the indent. Returns the result of structure_toc(), or None
    when no entries were found. '''
    XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
    TI = namedtuple('TI', 'text anchor indent')
    entries = []
    # Nesting depth of field characters, and the depth of the TOC field
    depth = 0
    toc_depth = None
    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
        kind = tag.tag.rpartition('}')[-1]
        if kind == 'fldChar':
            char_type = get(tag, 'w:fldCharType')
            if char_type == 'begin':
                depth += 1
            elif char_type == 'end':
                depth -= 1
                if toc_depth is not None and depth < toc_depth:
                    # The TOC field has ended
                    break
            continue
        if kind == 'instrText':
            if depth > 0 and tag.text and tag.text.strip().startswith('TOC '):
                toc_depth = depth
            continue
        if kind != 'hyperlink':
            continue
        if toc_depth is None or depth < toc_depth or tag not in link_map:
            continue
        a = link_map[tag]
        href = a.get('href', None)
        txt = link_to_txt(a, styles, object_map)
        p = ancestor(tag, 'w:p')
        if not (txt and href and p is not None):
            continue
        ps = styles.resolve_paragraph(p)
        try:
            # Drop the last two characters of the margin before parsing it
            ml = int(ps.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            ml = 0
        if ps.text_align in {'center', 'right'}:
            ml = 0
        # The first character of href is dropped to get the anchor
        entries.append(TI(txt, href[1:], ml))
    if not entries:
        return None
    log('Found Word Table of Contents, using it to generate the Table of Contents')
    return structure_toc(entries)
def create_toc(docx, body, link_map, styles, object_map, log, namespace):
    ''' Create the table of contents, from the Word TOC field if that
    gives a usable result, otherwise from the headings. The
    ``data-heading-level`` attributes are removed from body afterwards. '''
    ans = from_toc(docx, link_map, styles, object_map, log, namespace)
    if not ans:
        ans = from_headings(body, log, namespace)
    # Remove heading level attributes
    for heading in body.xpath('//*[@data-heading-level]'):
        del heading.attrib['data-heading-level']
    return ans