mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-07 04:04:12 +01:00
502 lines
21 KiB
Python
502 lines
21 KiB
Python
import textwrap
|
|
from collections import OrderedDict, Counter
|
|
|
|
from ebook_converter.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
|
|
from ebook_converter.ebooks.docx.char_styles import RunStyle
|
|
from ebook_converter.ebooks.docx.tables import TableStyle
|
|
from ebook_converter.polyglot.builtins import iteritems, itervalues
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
|
|
class PageProperties(object):

    '''
    Page level properties (page size and left/right margins) collected from
    sectPr elements. All values are stored in points.
    '''

    # For each child element of a sectPr: the XPath that selects it and the
    # (instance attribute, XML attribute) pairs that are read from it.
    _SOURCES = (
        ('./w:pgSz', (('width', 'w:w'), ('height', 'w:h'))),
        ('./w:pgMar', (('margin_left', 'w:left'), ('margin_right', 'w:right'))),
    )

    def __init__(self, namespace, elems=()):
        '''
        :param namespace: object providing ``XPath(expr)`` and
            ``get(element, attribute_name)``
        :param elems: iterable of sectPr elements; later elements override
            values taken from earlier ones
        '''
        # Defaults used when the document does not say otherwise.
        self.width = 595.28   # pts, A4
        self.height = 841.89  # pts, A4
        self.margin_left = 72   # pts
        self.margin_right = 72  # pts

        for sectPr in elems:
            for path, fields in self._SOURCES:
                for node in namespace.XPath(path)(sectPr):
                    # Read every raw attribute first, then convert and store
                    raw_values = [namespace.get(node, xml_attr) for _, xml_attr in fields]
                    for (attr, _), raw in zip(fields, raw_values):
                        converted = twips(raw)
                        # A None result from twips() leaves the current value alone
                        if converted is not None:
                            setattr(self, attr, converted)
|
|
|
|
|
|
class Style(object):

    '''
    Class representing a <w:style> element. Depending on the style type it
    can hold a paragraph (block) style, a character style and a table style.
    '''

    def __init__(self, namespace, elem):
        '''
        :param namespace: object providing ``XPath(expr)`` and
            ``get(element, attribute_name)``
        :param elem: the <w:style> element to read
        '''
        xpath, attr = namespace.XPath, namespace.get

        self.namespace = namespace
        self.name_path = xpath('./w:name[@w:val]')
        self.based_on_path = xpath('./w:basedOn[@w:val]')
        self.resolved = False
        self.style_id = attr(elem, 'w:styleId')
        self.style_type = attr(elem, 'w:type')
        kind = self.style_type

        # If several w:name children are present the last one is used
        name_nodes = self.name_path(elem)
        self.name = attr(name_nodes[-1], 'w:val') if name_nodes else None

        # The first w:basedOn child names the parent style, but numbering
        # styles never get a parent.
        parent_nodes = self.based_on_path(elem)
        self.based_on = attr(parent_nodes[0], 'w:val') if parent_nodes else None
        if kind == 'numbering':
            self.based_on = None

        self.is_default = attr(elem, 'w:default') in {'1', 'on', 'true'}

        self.paragraph_style = None
        self.character_style = None
        self.table_style = None

        if kind in {'paragraph', 'character', 'table'}:
            if kind == 'table':
                self.table_style = self._merged(
                    TableStyle(namespace, node) for node in xpath('./w:tblPr')(elem))
            if kind in {'paragraph', 'table'}:
                self.paragraph_style = self._merged(
                    ParagraphStyle(namespace, node) for node in xpath('./w:pPr')(elem))

            # Run properties are read for all three of these style types
            self.character_style = self._merged(
                RunStyle(namespace, node) for node in xpath('./w:rPr')(elem))

        if kind in {'numbering', 'paragraph'}:
            # Only these style types get the attribute at all; when several
            # numId elements match, the last one wins.
            self.numbering_style_link = None
            for num_id in xpath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
                self.numbering_style_link = attr(num_id, 'w:val')

    @staticmethod
    def _merged(styles):
        ''' Return the first style from ``styles`` after update()-ing it with
        each following one, or None when ``styles`` is empty. '''
        merged = None
        for style in styles:
            if merged is None:
                merged = style
            else:
                merged.update(style)
        return merged

    def resolve_based_on(self, parent):
        '''
        Resolve the table, paragraph and character styles of this style
        against those of ``parent``. A sub-style missing here but present on
        the parent is created empty first.
        '''
        inherited = parent.table_style
        if inherited is not None:
            own = self.table_style
            if own is None:
                own = self.table_style = TableStyle(self.namespace)
            own.resolve_based_on(inherited)

        inherited = parent.paragraph_style
        if inherited is not None:
            own = self.paragraph_style
            if own is None:
                own = self.paragraph_style = ParagraphStyle(self.namespace)
            own.resolve_based_on(inherited)

        inherited = parent.character_style
        if inherited is not None:
            own = self.character_style
            if own is None:
                own = self.character_style = RunStyle(self.namespace)
            own.resolve_based_on(inherited)
|
|
|
|
|
|
class Styles(object):

    '''
    Collection of all styles defined in the document. Used to get the final
    styles applicable to elements in the document markup.

    An instance is callable: calling it loads every <w:style> element and the
    document defaults, then resolves basedOn inheritance between the styles.
    '''

    def __init__(self, namespace, tables):
        '''
        :param namespace: object providing ``XPath(expr)`` and
            ``get(element, attribute_name)``
        :param tables: object providing ``para_style(p)`` and
            ``run_style(p)``
        '''
        self.namespace = namespace
        self.id_map = OrderedDict()   # style id -> Style, in document order
        self.para_cache = {}          # paragraph element -> resolved paragraph style
        self.para_char_cache = {}     # paragraph element -> character style from its style
        self.run_cache = {}           # run element -> resolved run style
        self.classes = {}             # frozenset of css items -> (class name, css)
        self.counter = Counter()      # class name prefix -> number of names handed out
        self.default_styles = {}      # style type -> Style flagged as default
        self.tables = tables
        self.numbering_style_links = {}  # style id -> numbering_style_link
        self.default_paragraph_style = self.default_character_style = None

    def __iter__(self):
        ''' Iterate over the Style objects in the order they were added. '''
        return iter(self.id_map.values())

    def __getitem__(self, key):
        ''' Return the style with id ``key``; raises KeyError if unknown. '''
        return self.id_map[key]

    def __len__(self):
        return len(self.id_map)

    def get(self, key, default=None):
        ''' Return the style with id ``key`` or ``default`` if unknown. '''
        return self.id_map.get(key, default)

    def __call__(self, root, fonts, theme):
        '''
        Load all styles found under ``root`` and resolve their inheritance.

        :param root: root element of the styles markup, or None to load
            nothing
        :param fonts: stored as ``self.fonts``
        :param theme: stored as ``self.theme``
        '''
        self.fonts, self.theme = fonts, theme
        self.default_paragraph_style = self.default_character_style = None
        if root is not None:
            for s in self.namespace.XPath('//w:style')(root):
                s = Style(self.namespace, s)
                if s.style_id:
                    self.id_map[s.style_id] = s
                if s.is_default:
                    self.default_styles[s.style_type] = s
                if getattr(s, 'numbering_style_link', None) is not None:
                    self.numbering_style_links[s.style_id] = s.numbering_style_link

            # Document wide defaults; several matching elements are merged
            # in document order.
            for dd in self.namespace.XPath('./w:docDefaults')(root):
                for pd in self.namespace.XPath('./w:pPrDefault')(dd):
                    for pPr in self.namespace.XPath('./w:pPr')(pd):
                        ps = ParagraphStyle(self.namespace, pPr)
                        if self.default_paragraph_style is None:
                            self.default_paragraph_style = ps
                        else:
                            self.default_paragraph_style.update(ps)
                for rd in self.namespace.XPath('./w:rPrDefault')(dd):
                    for rPr in self.namespace.XPath('./w:rPr')(rd):
                        rs = RunStyle(self.namespace, rPr)
                        if self.default_character_style is None:
                            self.default_character_style = rs
                        else:
                            self.default_character_style.update(rs)

        # ids of the styles on the current resolution chain, used to detect
        # cyclic basedOn references
        resolving = set()

        def resolve(s, p):
            # Resolve style s against its parent p, resolving p first if needed.
            resolving.add(id(s))
            # A parent that is already on the chain closes a basedOn cycle
            # (including a style based on itself). That link is ignored
            # rather than followed, as following it would recurse forever.
            if p is not None and id(p) not in resolving:
                if not p.resolved:
                    resolve(p, self.get(p.based_on))
                s.resolve_based_on(p)
            s.resolved = True
            resolving.discard(id(s))

        for s in self:
            if not s.resolved:
                resolve(s, self.get(s.based_on))

    def para_val(self, parent_styles, direct_formatting, attr):
        '''
        Return the value of ``attr`` for a paragraph: the direct formatting
        value if it is set, otherwise the value from the last style in
        ``parent_styles`` that sets it, otherwise ``inherit``.
        '''
        val = getattr(direct_formatting, attr)
        if val is inherit:
            for ps in reversed(parent_styles):
                pval = getattr(ps, attr)
                if pval is not inherit:
                    val = pval
                    break
        return val

    def run_val(self, parent_styles, direct_formatting, attr):
        '''
        Return the value of ``attr`` for a run. Direct formatting wins. For
        toggle properties the values set by the parent styles are combined
        (see below); for all other properties the last style in
        ``parent_styles`` that sets the property wins.
        '''
        val = getattr(direct_formatting, attr)
        if val is not inherit:
            return val
        if attr in direct_formatting.toggle_properties:
            # The spec (section 17.7.3) does not make sense, so we follow the behavior
            # of Word, which seems to only consider the document default if the
            # property has not been defined in any styles.
            vals = [
                int(getattr(rs, attr)) for rs in parent_styles
                if rs is not self.default_character_style and getattr(rs, attr) is not inherit
            ]
            if vals:
                # An odd number of styles turning the property on leaves it on
                return sum(vals) % 2 == 1
            if self.default_character_style is not None:
                return getattr(self.default_character_style, attr) is True
            return False
        for rs in reversed(parent_styles):
            rval = getattr(rs, attr)
            if rval is not inherit:
                return rval
        return val

    def resolve_paragraph(self, p):
        '''
        Return the final paragraph style for the paragraph element ``p``.
        Results are cached per element. As side effects, the character style
        contributed by the paragraph's style is recorded in
        ``self.para_char_cache`` and numbered paragraphs get a
        ``calibre_num_id`` attribute set on the element.

        Uses ``self.numbering``, which is set by resolve_numbering().
        '''
        ans = self.para_cache.get(p, None)
        if ans is None:
            linked_style = None
            ans = self.para_cache[p] = ParagraphStyle(self.namespace)
            ans.style_name = None
            direct_formatting = None
            is_section_break = False
            for pPr in self.namespace.XPath('./w:pPr')(p):
                ps = ParagraphStyle(self.namespace, pPr)
                if direct_formatting is None:
                    direct_formatting = ps
                else:
                    direct_formatting.update(ps)
                if self.namespace.XPath('./w:sectPr')(pPr):
                    is_section_break = True

            if direct_formatting is None:
                direct_formatting = ParagraphStyle(self.namespace)

            # Styles are appended in increasing priority: para_val() searches
            # this list from the end.
            parent_styles = []
            if self.default_paragraph_style is not None:
                parent_styles.append(self.default_paragraph_style)
            ts = self.tables.para_style(p)
            if ts is not None:
                parent_styles.append(ts)

            default_para = self.default_styles.get('paragraph', None)
            if direct_formatting.linked_style is not None:
                ls = linked_style = self.get(direct_formatting.linked_style)
                if ls is not None:
                    ans.style_name = ls.name
                    ps = ls.paragraph_style
                    if ps is not None:
                        parent_styles.append(ps)
                    if ls.character_style is not None:
                        self.para_char_cache[p] = ls.character_style
            elif default_para is not None:
                # No style named on the paragraph: use the default paragraph style
                if default_para.paragraph_style is not None:
                    parent_styles.append(default_para.paragraph_style)
                if default_para.character_style is not None:
                    self.para_char_cache[p] = default_para.character_style

            def has_numbering(block_style):
                num_id = getattr(block_style, 'numbering_id', inherit)
                lvl = getattr(block_style, 'numbering_level', inherit)
                return num_id is not None and num_id is not inherit and lvl is not None and lvl is not inherit

            is_numbering = has_numbering(direct_formatting)
            # Only a paragraph without any runs counts as a section break here
            is_section_break = is_section_break and not self.namespace.XPath('./w:r')(p)

            if is_numbering and not is_section_break:
                num_id, lvl = direct_formatting.numbering_id, direct_formatting.numbering_level
                p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
                ps = self.numbering.get_para_style(num_id, lvl)
                if ps is not None:
                    parent_styles.append(ps)
            if (
                not is_numbering and not is_section_break and
                linked_style is not None and has_numbering(linked_style.paragraph_style)
            ):
                # Numbering that comes from the paragraph's style rather than
                # from its direct formatting
                num_id = linked_style.paragraph_style.numbering_id
                lvl = linked_style.paragraph_style.numbering_level
                p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
                is_numbering = True
                ps = self.numbering.get_para_style(num_id, lvl)
                if ps is not None:
                    parent_styles.append(ps)

            for attr in ans.all_properties:
                if not (is_numbering and attr == 'text_indent'):  # skip text-indent for lists
                    setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
            ans.linked_style = direct_formatting.linked_style
        return ans

    def resolve_run(self, r):
        '''
        Return the final run style for the run element ``r``. Results are
        cached per element. The character style recorded for the enclosing
        paragraph in ``self.para_char_cache`` is used if present.
        '''
        ans = self.run_cache.get(r, None)
        if ans is None:
            p = self.namespace.XPath('ancestor::w:p[1]')(r)
            p = p[0] if p else None
            ans = self.run_cache[r] = RunStyle(self.namespace)
            direct_formatting = None
            for rPr in self.namespace.XPath('./w:rPr')(r):
                rs = RunStyle(self.namespace, rPr)
                if direct_formatting is None:
                    direct_formatting = rs
                else:
                    direct_formatting.update(rs)

            if direct_formatting is None:
                direct_formatting = RunStyle(self.namespace)

            # Styles are appended in increasing priority
            parent_styles = []
            default_char = self.default_styles.get('character', None)
            if self.default_character_style is not None:
                parent_styles.append(self.default_character_style)
            pstyle = self.para_char_cache.get(p, None)
            if pstyle is not None:
                parent_styles.append(pstyle)
            # As best as I can understand the spec, table overrides should be
            # applied before paragraph overrides, but word does it
            # this way, see the December 2007 table header in the demo
            # document.
            ts = self.tables.run_style(p)
            if ts is not None:
                parent_styles.append(ts)
            if direct_formatting.linked_style is not None:
                ls = getattr(self.get(direct_formatting.linked_style), 'character_style', None)
                if ls is not None:
                    parent_styles.append(ls)
            elif default_char is not None and default_char.character_style is not None:
                parent_styles.append(default_char.character_style)

            for attr in ans.all_properties:
                setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))

            if ans.font_family is not inherit:
                ff = self.theme.resolve_font_family(ans.font_family)
                ans.font_family = self.fonts.family_for(ff, ans.b, ans.i)

        return ans

    def resolve(self, obj):
        '''
        Return the resolved style for a paragraph or run element, chosen by
        the end of ``obj.tag``. Returns None for any other element.
        '''
        if obj.tag.endswith('}p'):
            return self.resolve_paragraph(obj)
        if obj.tag.endswith('}r'):
            return self.resolve_run(obj)

    def cascade(self, layers):
        '''
        Move shared formatting upwards: properties that have the same value
        on all runs of a paragraph are moved to the paragraph, then the most
        common font family, font size and color of the paragraphs become the
        body defaults (``self.body_font_family``, ``self.body_font_size``
        and ``self.body_color``).

        :param layers: mapping of paragraph element -> its run elements
        '''
        self.body_font_family = 'serif'
        self.body_font_size = '10pt'
        self.body_color = 'black'

        def promote_property(char_styles, block_style, prop):
            vals = {getattr(s, prop) for s in char_styles}
            if len(vals) == 1:
                # All the character styles have the same value
                for s in char_styles:
                    setattr(s, prop, inherit)
                setattr(block_style, prop, next(iter(vals)))

        for p, runs in layers.items():
            has_links = '1' in {r.get('is-link', None) for r in runs}
            char_styles = [self.resolve_run(r) for r in runs]
            block_style = self.resolve_paragraph(p)
            for prop in ('font_family', 'font_size', 'cs_font_family', 'cs_font_size', 'color'):
                if has_links and prop == 'color':
                    # We cannot promote color as browser rendering engines will
                    # override the link color setting it to blue, unless the
                    # color is specified on the link element itself
                    continue
                promote_property(char_styles, block_style, prop)
            for s in char_styles:
                if s.text_decoration == 'none':
                    # The default text decoration is 'none'
                    s.text_decoration = inherit

        def promote_most_common(block_styles, prop, default):
            # Return the most common explicit value of prop (None if no style
            # sets it). Styles using that value are switched to inherit;
            # styles that were inheriting get the old default set explicitly
            # when it differs from the promoted value.
            c = Counter()
            for s in block_styles:
                val = getattr(s, prop)
                if val is not inherit:
                    c[val] += 1
            val = None
            if c:
                val = c.most_common(1)[0][0]
                for s in block_styles:
                    oval = getattr(s, prop)
                    if oval is inherit:
                        if default != val:
                            setattr(s, prop, default)
                    elif oval == val:
                        setattr(s, prop, inherit)
            return val

        block_styles = tuple(self.resolve_paragraph(p) for p in layers)

        ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
        if ff is not None:
            self.body_font_family = ff

        # The numeric default is taken from the first two characters of '10pt'
        fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
        if fs is not None:
            self.body_font_size = '%.3gpt' % fs

        color = promote_most_common(block_styles, 'color', self.body_color)
        if color is not None:
            self.body_color = color

    def resolve_numbering(self, numbering):
        '''
        Store ``numbering`` as ``self.numbering`` and fix up the numbering
        level of every paragraph style that has a numbering id.
        '''
        # When a numPr element appears inside a paragraph style, the lvl info
        # must be discarded and pStyle used instead.
        self.numbering = numbering
        for style in self:
            ps = style.paragraph_style
            if ps is not None and ps.numbering_id is not inherit:
                lvl = numbering.get_pstyle(ps.numbering_id, style.style_id)
                if lvl is None:
                    ps.numbering_id = ps.numbering_level = inherit
                else:
                    ps.numbering_level = lvl

    def apply_contextual_spacing(self, paras):
        '''
        For each pair of consecutive paragraphs that name the same style,
        zero the bottom margin of the first and/or the top margin of the
        second if the respective paragraph has contextualSpacing set to True.
        '''
        last_para = None
        for p in paras:
            if last_para is not None:
                ls = self.resolve_paragraph(last_para)
                ps = self.resolve_paragraph(p)
                if ls.linked_style is not None and ls.linked_style == ps.linked_style:
                    if ls.contextualSpacing is True:
                        ls.margin_bottom = 0
                    if ps.contextualSpacing is True:
                        ps.margin_top = 0
            last_para = p

    def apply_section_page_breaks(self, paras):
        ''' Set pageBreakBefore on the resolved style of every paragraph in
        ``paras``. '''
        for p in paras:
            ps = self.resolve_paragraph(p)
            ps.pageBreakBefore = True

    @staticmethod
    def _css_key(css):
        ''' Key under which ``css`` is stored in ``self.classes``. The
        frozenset itself is used rather than its hash, so two different
        rule sets can never end up sharing a class name. '''
        return frozenset(css.items())

    def register(self, css, prefix):
        '''
        Return the class name for the ``css`` dictionary, allocating a new
        name of the form ``<prefix>_<n>`` if this set of rules has not been
        seen before.
        '''
        key = self._css_key(css)
        ans, _ = self.classes.get(key, (None, None))
        if ans is None:
            self.counter[prefix] += 1
            ans = '%s_%d' % (prefix, self.counter[prefix])
            self.classes[key] = (ans, css)
        return ans

    def generate_classes(self):
        ''' Register a class for the non-empty css of every paragraph
        ('block' prefix) and run ('text' prefix) style resolved so far. '''
        for bs in self.para_cache.values():
            css = bs.css
            if css:
                self.register(css, 'block')
        for bs in self.run_cache.values():
            css = bs.css
            if css:
                self.register(css, 'text')

    def class_name(self, css):
        ''' Return the class name registered for ``css`` or None. '''
        return self.classes.get(self._css_key(css), (None, None))[0]

    def generate_css(self, dest_dir, docx, notes_nopb, nosupsub):
        '''
        Return the complete stylesheet as a string: the output of
        ``self.fonts.embed_fonts(dest_dir, docx)`` if it is non-empty, a
        fixed set of base rules using the body font/size/color, and one rule
        per registered class, sorted by class name.

        :param notes_nopb: if true, the page break rules for footnotes are
            left out
        :param nosupsub: if true, rules changing the vertical alignment of
            sup and sub are added
        '''
        ef = self.fonts.embed_fonts(dest_dir, docx)

        s = '''\
            body { font-family: %s; font-size: %s; color: %s }

            /* In word all paragraphs have zero margins unless explicitly specified in a style */
            p, h1, h2, h3, h4, h5, h6, div { margin: 0; padding: 0 }
            /* In word headings only have bold font if explicitly specified,
                similarly the font size is the body font size, unless explicitly set. */
            h1, h2, h3, h4, h5, h6 { font-weight: normal; font-size: 1rem }
            /* Setting padding-left to zero breaks rendering of lists, so we only set the other values to zero and leave padding-left for the user-agent */
            ul, ol { margin: 0; padding-top: 0; padding-bottom: 0; padding-right: 0 }

            /* The word hyperlink styling will set text-decoration to underline if needed */
            a { text-decoration: none }

            sup.noteref a { text-decoration: none }

            h1.notes-header { page-break-before: always }

            dl.footnote dt { font-size: large }

            dl.footnote dt a { text-decoration: none }

            '''

        if not notes_nopb:
            s += '''\
            dl.footnote { page-break-after: always }
            dl.footnote:last-of-type { page-break-after: avoid }
            '''

        s = s + '''\
            span.tab { white-space: pre }

            p.index-entry { text-indent: 0pt; }
            p.index-entry a:visited { color: blue }
            p.index-entry a:hover { color: red }
            '''

        if nosupsub:
            s = s + '''\
            sup { vertical-align: top }
            sub { vertical-align: bottom }
            '''

        prefix = textwrap.dedent(s) % (self.body_font_family, self.body_font_size, self.body_color)
        if ef:
            prefix = ef + '\n' + prefix

        ans = []
        for (cls, css) in sorted(self.classes.values(), key=lambda x: x[0]):
            b = ('\t%s: %s;' % (k, v) for k, v in css.items())
            b = '\n'.join(b)
            # Only the semicolon after the last declaration is removed
            ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
        return prefix + '\n' + '\n'.join(ans)
|