1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-15 18:43:30 +02:00

Added docx writer related modules

This commit is contained in:
2020-04-13 16:33:15 +02:00
parent ae80ae5640
commit 98b2dd8d4f
29 changed files with 5956 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap, os
from lxml import etree
from lxml.builder import ElementMaker
from calibre import guess_type
from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.pdf.render.common import PAPER_SIZES
from calibre.utils.date import utcnow
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.zipfile import ZipFile
from polyglot.builtins import iteritems, map, unicode_type, native_string_type
def xml2str(root, pretty_print=False, with_tail=False):
    """Serialize the lxml tree ``root`` as UTF-8 with an XML declaration.

    When the installed lxml provides ``etree.cleanup_namespaces`` it is run
    on ``root`` first. ``pretty_print`` and ``with_tail`` are forwarded
    unchanged to ``etree.tostring``.
    """
    if hasattr(etree, 'cleanup_namespaces'):
        etree.cleanup_namespaces(root)
    serialize_options = dict(
        encoding='utf-8',
        xml_declaration=True,
        pretty_print=pretty_print,
        with_tail=with_tail,
    )
    return etree.tostring(root, **serialize_options)
def page_size(opts):
    """Return the ``(width, height)`` of the output page.

    The named size ``opts.docx_page_size`` is always looked up in
    ``PAPER_SIZES``. If ``opts.docx_custom_page_size`` is not None it is
    split on the first ``'x'`` and both halves, converted to float,
    replace the named size.
    """
    width, height = PAPER_SIZES[opts.docx_page_size]
    custom = opts.docx_custom_page_size
    if custom is not None:
        width_text, _separator, height_text = custom.partition('x')
        width, height = float(width_text), float(height_text)
    return width, height
def page_margin(opts, which):
    """Return the margin for the page edge named ``which``.

    ``opts.docx_page_margin_<which>`` is used unless it equals 0.0, in which
    case ``opts.margin_<which>`` is returned instead.
    """
    docx_specific = getattr(opts, 'docx_page_margin_' + which)
    if docx_specific == 0.0:
        # Zero means "not set for DOCX": use the generic margin option
        return getattr(opts, 'margin_' + which)
    return docx_specific
def page_effective_area(opts):
    """Return the page ``(width, height)`` left after subtracting margins.

    The left and right margins are removed from the page width, the top and
    bottom margins from the page height.
    """
    width, height = page_size(opts)
    horizontal_margins = page_margin(opts, 'left') + page_margin(opts, 'right')
    vertical_margins = page_margin(opts, 'top') + page_margin(opts, 'bottom')
    return width - horizontal_margins, height - vertical_margins  # in pts
def create_skeleton(opts, namespaces=None):
    """Build the empty document and styles trees for a new DOCX.

    ``namespaces`` maps prefixes to namespace URIs; a falsy value is replaced
    by ``DOCXNamespace().namespaces``. The document body receives one
    ``sectPr`` carrying the page size and margins from ``opts``, each
    multiplied by 20 and truncated to an integer (presumably twentieths of
    a point, verify against the WordprocessingML schema).

    Returns the tuple ``(document, styles, body)``.
    """
    namespaces = namespaces or DOCXNamespace().namespaces

    def qualified(local_name):
        # Clark notation name in the "w" namespace
        return '{%s}%s' % (namespaces['w'], local_name)

    def maker_for(prefixes):
        # ElementMaker whose nsmap is limited to the requested prefixes
        nsmap = {prefix: uri for prefix, uri in iteritems(namespaces) if prefix in prefixes}
        return ElementMaker(namespace=nsmap['w'], nsmap=nsmap)

    E = maker_for({'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'})
    body = E.body()
    doc = E.document(body)

    page_width, page_height = page_size(opts)
    page_width, page_height = int(20 * page_width), int(20 * page_height)

    def margin_attribute(edge):
        amount = page_margin(opts, edge)
        return qualified(edge), unicode_type(int(amount * 20))

    section = E.sectPr(
        E.pgSz(**{qualified('w'): unicode_type(page_width), qualified('h'): unicode_type(page_height)}),
        E.pgMar(**dict(margin_attribute(edge) for edge in 'left top right bottom'.split())),
        E.cols(**{qualified('space'): '720'}),
        E.docGrid(**{qualified('linePitch'): "360"}),
    )
    body.append(section)

    # The styles part declares a smaller set of namespaces
    E = maker_for(tuple('wra') + ('wp',))
    run_defaults = E.rPrDefault(
        E.rPr(
            E.rFonts(**{
                qualified('asciiTheme'): "minorHAnsi",
                qualified('eastAsiaTheme'): "minorEastAsia",
                qualified('hAnsiTheme'): "minorHAnsi",
                qualified('cstheme'): "minorBidi",
            }),
            E.sz(**{qualified('val'): '22'}),
            E.szCs(**{qualified('val'): '22'}),
            E.lang(**{qualified('val'): 'en-US', qualified('eastAsia'): "en-US", qualified('bidi'): "ar-SA"})
        )
    )
    paragraph_defaults = E.pPrDefault(
        E.pPr(
            E.spacing(**{qualified('after'): "0", qualified('line'): "276", qualified('lineRule'): "auto"})
        )
    )
    styles = E.styles(E.docDefaults(run_defaults, paragraph_defaults))
    return doc, styles, body
def update_doc_props(root, mi, namespace):
    """Write metadata from ``mi`` into the core-properties element ``root``.

    Title and creator are always written. Keywords, description and
    language are written only when ``mi.tags``, ``mi.comments`` and
    ``mi.languages`` respectively are truthy. An existing child with the
    same tag is removed before the new one is appended.
    """
    def replace_property(name, text=None, ns='dc'):
        fresh = root.makeelement('{%s}%s' % (namespace.namespaces[ns], name))
        # Snapshot the children so removal does not disturb the iteration
        stale = [child for child in tuple(root) if child.tag == fresh.tag]
        for child in stale:
            root.remove(child)
        fresh.text = text
        root.append(fresh)
        return fresh

    replace_property('title', mi.title)
    replace_property('creator', authors_to_string(mi.authors))
    if mi.tags:
        replace_property('keywords', ', '.join(mi.tags), ns='cp')
    if mi.comments:
        replace_property('description', mi.comments)
    if mi.languages:
        primary = canonicalize_lang(mi.languages[0])
        # Prefer the two-letter code, falling back to the canonical form
        replace_property('language', lang_as_iso639_1(primary) or primary)
class DocumentRelationships(object):
    """Registry of the relationships of the main document part.

    Every distinct ``(target, rtype, target_mode)`` triple receives an
    ``rId<N>`` identifier, numbered in order of first registration. The
    styles, numbering, web settings and font table parts are registered at
    construction time.
    """

    def __init__(self, namespace):
        self.rmap = {}
        self.namespace = namespace
        names = namespace.names
        standard_parts = {
            names['STYLES']: 'styles.xml',
            names['NUMBERING']: 'numbering.xml',
            names['WEB_SETTINGS']: 'webSettings.xml',
            names['FONTS']: 'fontTable.xml',
        }
        for rel_type, part in iteritems(standard_parts):
            self.add_relationship(part, rel_type)

    def get_relationship_id(self, target, rtype, target_mode=None):
        """Return the id registered for the triple, or None if there is none."""
        lookup_key = (target, rtype, target_mode)
        return self.rmap.get(lookup_key)

    def add_relationship(self, target, rtype, target_mode=None):
        """Return the id for the triple, registering it first if needed."""
        rid = self.get_relationship_id(target, rtype, target_mode)
        if rid is None:
            # The right-hand side is evaluated before the new entry is stored
            rid = self.rmap[(target, rtype, target_mode)] = 'rId%d' % (len(self.rmap) + 1)
        return rid

    def add_image(self, target):
        """Register ``target`` with the IMAGES relationship type."""
        image_type = self.namespace.names['IMAGES']
        return self.add_relationship(target, image_type)

    def serialize(self):
        """Return the registered relationships serialized via ``xml2str``."""
        pr_uri = self.namespace.namespaces['pr']
        E = ElementMaker(namespace=pr_uri, nsmap={None: pr_uri})
        root = E.Relationships()
        for triple, rid in iteritems(self.rmap):
            target, rtype, target_mode = triple
            entry = E.Relationship(Id=rid, Type=rtype, Target=target)
            if target_mode is not None:
                entry.set('TargetMode', target_mode)
            root.append(entry)
        return xml2str(root)
class DOCX(object):
    """In-memory DOCX package that can be written out as a zip file.

    The instance owns the font table, numbering and relationship trees.
    ``document`` and ``styles`` must be set (for example by
    ``create_empty_document``) before ``write`` is called.
    """

    def __init__(self, opts, log):
        self.namespace = DOCXNamespace()
        namespaces = self.namespace.namespaces
        self.opts, self.log = opts, log
        # Replaced by convert_metadata(); initialised here so that
        # appproperties can be read before any metadata has been supplied
        self.mi = None
        self.document_relationships = DocumentRelationships(self.namespace)
        self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
        self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
        self.embedded_fonts = E.Relationships()
        self.fonts = {}  # zip member name -> font data
        self.images = {}  # zip member name -> callable returning image data

    # Boilerplate {{{
    @property
    def contenttypes(self):
        """Serialized ``[Content_Types].xml`` for the package."""
        E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
        types = E.Types()
        for partname, mt in iteritems({
            "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
            "/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
            "/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
            "/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
            "/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
            "/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
            "/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
            "/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
            "/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
            "/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
            "/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
        }):
            types.append(E.Override(PartName=partname, ContentType=mt))
        # Extensions that already have a Default entry
        added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
        for ext in added:
            types.append(E.Default(Extension=ext, ContentType=guess_type('a.'+ext)[0]))
        for ext, mt in iteritems({
            "rels": "application/vnd.openxmlformats-package.relationships+xml",
            "odttf": "application/vnd.openxmlformats-officedocument.obfuscatedFont",
        }):
            added.add(ext)
            types.append(E.Default(Extension=ext, ContentType=mt))
        # Image extensions not covered above get an entry only when their
        # MIME type can be guessed
        for fname in self.images:
            ext = fname.rpartition(os.extsep)[-1]
            if ext not in added:
                added.add(ext)
                mt = guess_type('a.' + ext)[0]
                if mt:
                    types.append(E.Default(Extension=ext, ContentType=mt))
        return xml2str(types)

    @property
    def appproperties(self):
        """Serialized ``docProps/app.xml``.

        A ``Company`` element is added only when metadata has been supplied
        and it has a truthy ``publisher``.
        """
        E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
        props = E.Properties(
            E.Application(__appname__),
            E.AppVersion('%02d.%04d' % numeric_version[:2]),
            E.DocSecurity('0'),
            E.HyperlinksChanged('false'),
            E.LinksUpToDate('true'),
            E.ScaleCrop('false'),
            E.SharedDoc('false'),
        )
        if self.mi is not None and self.mi.publisher:
            props.append(E.Company(self.mi.publisher))
        return xml2str(props)

    @property
    def containerrels(self):
        """UTF-8 encoded ``_rels/.rels`` pointing at the three root parts."""
        return textwrap.dedent('''\
<?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names)).encode('utf-8')

    @property
    def websettings(self):
        """Serialized ``word/webSettings.xml``."""
        E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
        ws = E.webSettings(
            E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
        return xml2str(ws)
    # }}}

    def convert_metadata(self, mi):
        """Store ``mi`` on the instance and return serialized core properties.

        The created and modified timestamps are both set to the current UTC
        time with one second resolution.
        """
        namespaces = self.namespace.namespaces
        E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
        cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
        # Format explicitly rather than trimming isoformat() at the last '.':
        # isoformat() omits the fraction when microsecond == 0, which used to
        # reduce the whole timestamp to a bare 'Z'.
        ts = utcnow().strftime(native_string_type('%Y-%m-%dT%H:%M:%S')) + 'Z'
        for name in 'created modified'.split():
            stamp = cp.makeelement('{%s}%s' % (namespaces['dcterms'], name), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
            stamp.text = ts
            cp.append(stamp)
        self.mi = mi
        update_doc_props(cp, self.mi, self.namespace)
        return xml2str(cp)

    def create_empty_document(self, mi):
        """Install a fresh document and styles skeleton (``mi`` is unused)."""
        self.document, self.styles = create_skeleton(self.opts)[:2]

    def write(self, path_or_stream, mi, create_empty_document=False):
        """Write the package to ``path_or_stream`` as a zip archive.

        ``mi`` supplies the document metadata. When ``create_empty_document``
        is true the document and styles trees are first replaced by an empty
        skeleton.
        """
        if create_empty_document:
            self.create_empty_document(mi)
        with ZipFile(path_or_stream, 'w') as zf:
            zf.writestr('[Content_Types].xml', self.contenttypes)
            zf.writestr('_rels/.rels', self.containerrels)
            # convert_metadata() must run before appproperties so that the
            # publisher from mi is available
            zf.writestr('docProps/core.xml', self.convert_metadata(mi))
            zf.writestr('docProps/app.xml', self.appproperties)
            zf.writestr('word/webSettings.xml', self.websettings)
            zf.writestr('word/document.xml', xml2str(self.document))
            zf.writestr('word/styles.xml', xml2str(self.styles))
            zf.writestr('word/numbering.xml', xml2str(self.numbering))
            zf.writestr('word/fontTable.xml', xml2str(self.font_table))
            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
            zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
            for fname, data_getter in iteritems(self.images):
                zf.writestr(fname, data_getter())
            for fname, data in iteritems(self.fonts):
                zf.writestr(fname, data)
if __name__ == '__main__':
    # Quick manual check: dump the web settings part of an empty package
    print(DOCX(None, None).websettings)

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from uuid import uuid4
from calibre.ebooks.oeb.base import OEB_STYLES
from calibre.ebooks.oeb.transforms.subset import find_font_face_rules
from polyglot.builtins import range
def obfuscate_font_data(data, key):
    """Return ``data`` with its first 32 bytes XORed against ``key``.

    ``key`` must expose a ``bytes`` attribute (as ``uuid.UUID`` does). The
    key bytes are used in reverse order and repeated as needed. Bytes past
    the first 32 are returned unchanged.
    """
    mask = bytearray(reversed(key.bytes))
    head = bytearray(data[:32])
    scrambled = bytearray(
        value ^ mask[index % len(mask)] for index, value in enumerate(head))
    return bytes(scrambled) + data[32:]
class FontsManager(object):
    """Populates the font table and the embedded font parts of a DOCX."""

    def __init__(self, namespace, oeb, opts):
        self.namespace = namespace
        self.oeb, self.log, self.opts = oeb, oeb.log, opts

    def serialize(self, text_styles, fonts, embed_relationships, font_data_map):
        """Emit ``w:font`` entries and embed the matching ``@font-face`` fonts.

        text_styles: iterable of objects with a ``font_family`` attribute.
        fonts: parent element that receives one ``w:font`` child per family.
        embed_relationships: parent element that receives one
            ``Relationship`` child per embedded font file.
        font_data_map: dict filled with ``'word/fonts/fontN.odttf'`` mapped
            to the obfuscated font data.
        """
        makeelement = self.namespace.makeelement
        # Keep the first spelling seen of each family, compared
        # case-insensitively
        font_families, seen = set(), set()
        for ts in text_styles:
            if ts.font_family:
                lf = ts.font_family.lower()
                if lf not in seen:
                    seen.add(lf)
                    font_families.add(ts.font_family)
        family_map = {}
        for family in sorted(font_families):
            family_map[family] = makeelement(fonts, 'w:font', w_name=family)

        embedded_fonts = []
        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES and hasattr(item.data, 'cssRules'):
                embedded_fonts.extend(find_font_face_rules(item, self.oeb))

        num = 0
        face_map = defaultdict(set)  # family -> face tags already embedded
        rel_map = {}  # font item -> (relationship id, obfuscation key)
        for ef in embedded_fonts:
            ff = ef['font-family'][0]
            if ff not in font_families:
                continue
            num += 1
            bold = ef['weight'] > 400
            italic = ef['font-style'] != 'normal'
            if bold and italic:
                tag = 'BoldItalic'
            elif bold:
                tag = 'Bold'
            elif italic:
                tag = 'Italic'
            else:
                tag = 'Regular'
            if tag in face_map[ff]:
                continue
            face_map[ff].add(tag)
            font = family_map[ff]
            item = ef['item']
            entry = rel_map.get(item)
            if entry is None:
                key = uuid4()
                rid = 'rId%d' % num
                fname = 'fonts/font%d.odttf' % num
                makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname)
                font_data_map['word/' + fname] = obfuscate_font_data(item.data, key)
                rel_map[item] = entry = (rid, key)
            # A font file is obfuscated once, so every face that reuses it
            # must advertise the key that was actually applied to the data
            rid, key = entry
            makeelement(font, 'w:embed' + tag, r_id=rid,
                        w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(),
                        w_subsetted="true" if self.opts.subset_embedded_fonts else "false")

View File

@@ -0,0 +1,617 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from collections import Counter
from calibre.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area
from calibre.ebooks.docx.writer.styles import StylesManager, FloatSpec
from calibre.ebooks.docx.writer.links import LinksManager
from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table
from calibre.ebooks.docx.writer.lists import ListsManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import unicode_type, string_or_bytes
def lang_for_tag(tag):
    """Return the language code declared on ``tag``, or None.

    The plain ``lang`` attribute is consulted before ``xml:lang``. The first
    one for which ``lang_as_iso639_1`` returns a truthy value wins.
    """
    attribute_names = ('lang', '{http://www.w3.org/XML/1998/namespace}lang')
    # Lazy generator: the second attribute is only looked at when needed
    codes = (lang_as_iso639_1(tag.get(name)) for name in attribute_names)
    return next((code for code in codes if code), None)
class Style(St):
    """Stylizer style with a lazily computed, cached ``letterSpacing``."""

    def __init__(self, *args, **kwargs):
        St.__init__(self, *args, **kwargs)
        self._letterSpacing = None  # cache, filled on first access

    @property
    def letterSpacing(self):
        """The ``letter-spacing`` value: ``'normal'`` or the converted length.

        The value is computed on first access and then cached.
        """
        # The check used to be "is not None", which never ran the
        # computation and made this property always return None
        if self._letterSpacing is None:
            val = self._get('letter-spacing')
            if val == 'normal':
                self._letterSpacing = val
            else:
                self._letterSpacing = self._unit_convert(val)
        return self._letterSpacing
class Stylizer(Sz):
    """Stylizer that hands out ``Style`` objects for unknown elements."""

    def style(self, element):
        """Return the stored style for ``element``, or a new ``Style``."""
        try:
            found = self._styles[element]
        except KeyError:
            found = Style(element, self)
        return found
class TextRun(object):
    """A run of text, breaks and images that share one text style.

    ``texts`` holds ``(payload, flag, bookmark)`` tuples. The payload is a
    string for text, None for a break (the flag is then the ``clear``
    value), or a drawing element for an image.
    """

    ws_pat = None  # compiled on first instantiation, shared via the class

    def __init__(self, namespace, style, first_html_parent, lang=None):
        self.first_html_parent = first_html_parent
        if self.ws_pat is None:
            TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
        self.style = style
        self.texts = []
        self.link = None
        self.lang = lang
        self.parent_style = None
        self.makeelement = namespace.makeelement
        self.descendant_style = None

    def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
        """Append ``text`` and make ``link`` the link of this run.

        Unless whitespace is preserved, every whitespace sequence is
        collapsed to a single space first.
        """
        if not preserve_whitespace:
            text = self.ws_pat.sub(' ', text)
            if text != text.strip():
                # If preserve_whitespace is False, Word ignores leading and
                # trailing whitespace
                preserve_whitespace = True
        entry = (text, preserve_whitespace, bookmark)
        self.texts.append(entry)
        self.link = link

    def add_break(self, clear='none', bookmark=None):
        """Append a line break with the given ``clear`` value."""
        entry = (None, clear, bookmark)
        self.texts.append(entry)

    def add_image(self, drawing, bookmark=None):
        """Append the ``drawing`` element."""
        entry = (drawing, None, bookmark)
        self.texts.append(entry)

    def serialize(self, p, links_manager):
        """Append this run as a ``w:r`` element under paragraph ``p``.

        When the run has a link, the ``w:r`` is placed inside the element
        returned by ``links_manager.serialize_hyperlink``.
        """
        make = self.makeelement
        if self.link is None:
            container = p
        else:
            container = links_manager.serialize_hyperlink(p, self.link)
        r = make(container, 'w:r')

        # Run properties are built detached and attached only if non-empty
        rpr = make(r, 'w:rPr', append=False)
        style_id = getattr(self.descendant_style, 'id', None)
        if style_id is not None:
            make(rpr, 'w:rStyle', w_val=style_id)
        if self.lang:
            make(rpr, 'w:lang', w_bidi=self.lang, w_val=self.lang, w_eastAsia=self.lang)
        if len(rpr):
            r.append(rpr)

        for payload, flag, bookmark in self.texts:
            has_bookmark = bookmark is not None
            if has_bookmark:
                bid = links_manager.bookmark_id
                make(r, 'w:bookmarkStart', w_id=unicode_type(bid), w_name=bookmark)
            if payload is None:
                make(r, 'w:br', w_clear=flag)
            elif hasattr(payload, 'xpath'):
                # An element (image drawing) is appended as is
                r.append(payload)
            else:
                t = make(r, 'w:t')
                t.text = payload or ''
                if flag:
                    t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
            if has_bookmark:
                make(r, 'w:bookmarkEnd', w_id=unicode_type(bid))

    def __repr__(self):
        return repr(self.texts)

    def is_empty(self):
        """True when there are no entries, or just one empty, unpreserved text."""
        entries = self.texts
        return not entries or (len(entries) == 1 and entries[0][:2] == ('', False))

    @property
    def style_weight(self):
        """Total number of characters in the text entries of this run."""
        return sum(
            len(payload) for payload, _flag, _bookmark in self.texts
            if isinstance(payload, unicode_type))
class Block(object):
    """A paragraph-level unit made of text runs, serialized as ``w:p``."""

    def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None):
        self.force_not_empty = False
        self.namespace = namespace
        self.bookmarks = set()
        self.list_tag = (html_block, style) if is_list_item else None
        self.is_first_block = False
        self.numbering_id = None
        self.parent_items = None
        self.html_block = html_block
        self.html_tag = barename(html_block.tag)
        self.float_spec = float_spec
        if float_spec is not None:
            # Register with the float so it knows which blocks it spans
            float_spec.blocks.append(self)
        self.html_style = style
        self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
        self.styles_manager, self.links_manager = styles_manager, links_manager
        self.keep_next = False
        self.runs = []
        self.skipped = False
        self.linked_style = None
        self.page_break_before = style['page-break-before'] == 'always'
        self.keep_lines = style['page-break-inside'] == 'avoid'
        self.page_break_after = False
        self.block_lang = None

    def resolve_skipped(self, next_block):
        """Mark this block as skipped when it is only an empty wrapper.

        That is the case when the block is empty and ``next_block`` was
        created from the first child of this block's HTML element. A list
        tag on this block is then handed over to ``next_block``.
        """
        if not self.is_empty():
            return
        wraps_next = len(self.html_block) > 0 and self.html_block[0] is next_block.html_block
        if wraps_next:
            self.skipped = True
            if self.list_tag is not None:
                next_block.list_tag = self.list_tag

    def _last_or_new_run(self):
        # The most recent run, or a new one styled after the block itself
        if self.runs:
            return self.runs[-1]
        run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
        self.runs.append(run)
        return run

    def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False, bookmark=None, link=None, lang=None):
        """Add ``text`` styled by ``style``, reusing the last run if possible.

        The last run is reused when its text style, link and language all
        match. With a preserving ``white-space`` value, or ``pre-line``, the
        text is split into lines and a break is added after each line.
        """
        ws = style['white-space']
        keep_ws = ws in {'pre', 'pre-wrap', '-o-pre-wrap'}
        ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style)
        last = self.runs[-1] if self.runs else None
        if last is not None and ts == last.style and link == last.link and lang == last.lang:
            run = last
        else:
            owner = self.html_block if html_parent is None else html_parent
            run = TextRun(self.namespace, ts, owner, lang=lang)
            self.runs.append(run)
        if ignore_leading_whitespace and not keep_ws:
            text = text.lstrip()
        if keep_ws or ws == 'pre-line':
            for line in text.splitlines():
                run.add_text(line, keep_ws, bookmark=bookmark, link=link)
                # Only the first line carries the bookmark
                bookmark = None
                run.add_break()
        else:
            run.add_text(text, keep_ws, bookmark=bookmark, link=link)

    def add_break(self, clear='none', bookmark=None):
        """Add a line break to the last run, creating a run if needed."""
        self._last_or_new_run().add_break(clear=clear, bookmark=bookmark)

    def add_image(self, drawing, bookmark=None):
        """Add ``drawing`` to the last run, creating a run if needed."""
        self._last_or_new_run().add_image(drawing, bookmark=bookmark)

    def serialize(self, body):
        """Append this block to ``body`` as a ``w:p`` element."""
        make = self.namespace.makeelement
        p = make(body, 'w:p')
        open_bookmark_ids = []
        for name in self.bookmarks:
            bookmark_id = unicode_type(self.links_manager.bookmark_id)
            open_bookmark_ids.append(bookmark_id)
            make(p, 'w:bookmarkStart', w_id=bookmark_id, w_name=name)
        if self.block_lang:
            rpr = make(p, 'w:rPr')
            make(rpr, 'w:lang', w_val=self.block_lang, w_bidi=self.block_lang, w_eastAsia=self.block_lang)

        ppr = make(p, 'w:pPr')
        if self.keep_next:
            make(ppr, 'w:keepNext')
        if self.float_spec is not None:
            self.float_spec.serialize(self, ppr)
        if self.numbering_id is not None:
            num_id, level = self.numbering_id[0], self.numbering_id[1]
            numpr = make(ppr, 'w:numPr')
            make(numpr, 'w:ilvl', w_val=unicode_type(level))
            make(numpr, 'w:numId', w_val=unicode_type(num_id))
        # A linked style takes precedence over the block's own style
        if self.linked_style is not None:
            make(ppr, 'w:pStyle', w_val=self.linked_style.id)
        elif self.style.id:
            make(ppr, 'w:pStyle', w_val=self.style.id)
        if self.is_first_block:
            make(ppr, 'w:pageBreakBefore', w_val='off')
        elif self.page_break_before:
            make(ppr, 'w:pageBreakBefore', w_val='on')
        if self.keep_lines:
            make(ppr, 'w:keepLines', w_val='on')

        for run in self.runs:
            run.serialize(p, self.links_manager)
        for bookmark_id in open_bookmark_ids:
            make(p, 'w:bookmarkEnd', w_id=bookmark_id)

    def __repr__(self):
        return 'Block(%r)' % self.runs
    __str__ = __repr__

    def is_empty(self):
        """True unless forced non-empty or some run is not empty."""
        if self.force_not_empty:
            return False
        return all(run.is_empty() for run in self.runs)
class Blocks(object):
    """Collects the blocks and tables produced while walking HTML files.

    Used as a context manager around the processing of one ``<body>``:
    entering records where that body's blocks start, exiting removes an
    empty leading block and applies the page break and top bookmark.
    """

    def __init__(self, namespace, styles_manager, links_manager):
        self.top_bookmark = None
        self.namespace = namespace
        self.styles_manager = styles_manager
        self.links_manager = links_manager
        self.all_blocks = []  # every block, in document order
        self.pos = 0  # index in all_blocks of the current body's first block
        self.current_block = None
        self.items = []  # top-level blocks and tables
        self.tables = []  # stack of tables that are still open
        self.current_table = None
        self.open_html_blocks = set()
        self.html_tag_start_blocks = {}
        # top-level block/table -> index in items. Reset on entering and
        # leaving the context; initialised here so the instance is also
        # usable outside of a with statement
        self.block_map = {}

    def current_or_new_block(self, html_tag, tag_style):
        """Return the current block, starting a new one if there is none."""
        return self.current_block or self.start_new_block(html_tag, tag_style)

    def end_current_block(self):
        """Close the current block, filing it in the open table row or at top level."""
        if self.current_block is not None:
            self.all_blocks.append(self.current_block)
            if self.current_table is not None and self.current_table.current_row is not None:
                self.current_table.add_block(self.current_block)
            else:
                self.block_map[self.current_block] = len(self.items)
                self.items.append(self.current_block)
                self.current_block.parent_items = self.items
        self.current_block = None

    def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
        """End the current block and start a new one for ``html_block``.

        The background colour of the parent's block style, if the parent
        element started a block and has one, is passed on as ``parent_bg``.
        """
        parent_bg = None
        if html_block is not None:
            p = html_block.getparent()
            b = self.html_tag_start_blocks.get(p)
            if b is not None:
                ps = self.styles_manager.styles_for_html_blocks.get(p)
                if ps is not None and ps.background_color is not None:
                    parent_bg = ps.background_color
        self.end_current_block()
        self.current_block = Block(
            self.namespace, self.styles_manager, self.links_manager, html_block, style,
            is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item,
            parent_bg=parent_bg)
        self.html_tag_start_blocks[html_block] = self.current_block
        self.open_html_blocks.add(html_block)
        return self.current_block

    def start_new_table(self, html_tag, tag_style=None):
        """Open a new table and push it on the table stack."""
        self.current_table = Table(self.namespace, html_tag, tag_style)
        self.tables.append(self.current_table)

    def start_new_row(self, html_tag, tag_style):
        """Start a row, implicitly opening a table if none is open."""
        if self.current_table is None:
            self.start_new_table(html_tag)
        self.current_table.start_new_row(html_tag, tag_style)

    def start_new_cell(self, html_tag, tag_style):
        """Start a cell, implicitly opening a table if none is open."""
        if self.current_table is None:
            self.start_new_table(html_tag)
        self.current_table.start_new_cell(html_tag, tag_style)

    def finish_tag(self, html_tag):
        """Close whatever block and/or table ``html_tag`` opened."""
        if self.current_block is not None and html_tag in self.open_html_blocks:
            start_block = self.html_tag_start_blocks.get(html_tag)
            if start_block is not None and start_block.html_style['page-break-after'] == 'always':
                self.current_block.page_break_after = True
            self.end_current_block()
            self.open_html_blocks.discard(html_tag)

        if self.current_table is not None:
            table_finished = self.current_table.finish_tag(html_tag)
            if table_finished:
                table = self.tables[-1]
                del self.tables[-1]
                if self.tables:
                    # Nested table: attach it to the enclosing table
                    self.current_table = self.tables[-1]
                    self.current_table.add_table(table)
                else:
                    self.current_table = None
                    self.block_map[table] = len(self.items)
                    self.items.append(table)

    def serialize(self, body):
        """Serialize every top-level item into ``body``."""
        for item in self.items:
            item.serialize(body)

    def delete_block_at(self, pos=None):
        """Remove the block at index ``pos`` (default ``self.pos``) of ``all_blocks``.

        The block's bookmarks and page break flags are handed to the block
        that follows it, if there is one.
        """
        pos = self.pos if pos is None else pos
        block = self.all_blocks[pos]
        del self.all_blocks[pos]
        bpos = self.block_map.pop(block, None)
        if bpos is not None:
            del self.items[bpos]
        else:
            items = self.items if block.parent_items is None else block.parent_items
            items.remove(block)
        block.parent_items = None
        if block.float_spec is not None:
            block.float_spec.blocks.remove(block)
        try:
            next_block = self.all_blocks[pos]
            next_block.bookmarks.update(block.bookmarks)
            for attr in 'page_break_after page_break_before'.split():
                setattr(next_block, attr, getattr(block, attr))
        except (IndexError, KeyError):
            # The deleted block was the last one: nothing to hand over to
            pass

    def __enter__(self):
        self.pos = len(self.all_blocks)
        self.block_map = {}

    def __exit__(self, etype, value, traceback):
        if value is not None:
            return  # Since there was an exception, the data structures are not in a consistent state
        if self.current_block is not None:
            self.all_blocks.append(self.current_block)
        self.current_block = None
        if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty():
            # Delete the empty block corresponding to the <body> tag when the
            # body tag has no inline content before its first sub-block
            self.delete_block_at(self.pos)
        if self.pos > 0 and self.pos < len(self.all_blocks):
            # Insert a page break corresponding to the start of the html file
            self.all_blocks[self.pos].page_break_before = True
        if self.top_bookmark is not None and self.pos < len(self.all_blocks):
            # Only attach the bookmark when this body contributed a block;
            # indexing unconditionally raised IndexError for an empty body
            self.all_blocks[self.pos].bookmarks.add(self.top_bookmark)
        self.top_bookmark = None
        self.block_map = {}

    def apply_page_break_after(self):
        """Turn ``page_break_after`` into ``page_break_before`` on the next block.

        Only applied when both blocks are top-level items.
        """
        for i, block in enumerate(self.all_blocks):
            if block.page_break_after and i < len(self.all_blocks) - 1:
                next_block = self.all_blocks[i + 1]
                if next_block.parent_items is block.parent_items and block.parent_items is self.items:
                    next_block.page_break_before = True

    def resolve_language(self):
        """Hoist the most common run language of each block onto the block.

        Runs that use the block language get their own language cleared, and
        a block language equal to the document language is dropped.
        """
        default_lang = self.styles_manager.document_lang
        for block in self.all_blocks:
            count = Counter()
            for run in block.runs:
                count[run.lang] += 1
            if count:
                block.block_lang = bl = count.most_common(1)[0][0]
                for run in block.runs:
                    if run.lang == bl:
                        run.lang = None
                if bl == default_lang:
                    block.block_lang = None

    def __repr__(self):
        # Used to reference a non-existent self.runs attribute and raise
        return 'Blocks(%r)' % self.all_blocks
class Convert(object):
# Word does not apply default styling to hyperlinks, so we ensure they get
# default styling (the conversion pipeline does not apply any styling to
# them).
base_css = '''
a[href] { text-decoration: underline; color: blue }
'''
def __init__(self, oeb, docx, mi, add_cover, add_toc):
    """Store the conversion inputs and size the output profile.

    The output profile of ``docx.opts`` gets ``width_pts`` and
    ``height_pts`` set to the page area that remains inside the margins.
    """
    self.oeb = oeb
    self.docx = docx
    self.add_cover = add_cover
    self.add_toc = add_toc
    self.log = docx.log
    self.opts = docx.opts
    self.mi = mi
    self.cover_img = None
    profile = self.opts.output_profile
    area_width, area_height = page_effective_area(self.opts)
    profile.width_pts = area_width
    profile.height_pts = area_height
def __call__(self):
    """Run the whole conversion and write the result.

    SVG images are rasterized, the managers are created, every spine item
    is processed, skipped blocks are removed, and the blocks, lists and
    styles are finalized before ``self.write()`` is called.
    """
    from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
    self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
    self.svg_rasterizer(self.oeb, self.opts)

    namespace = self.docx.namespace
    self.styles_manager = StylesManager(namespace, self.log, self.mi.language)
    self.links_manager = LinksManager(namespace, self.docx.document_relationships, self.log)
    self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
    self.lists_manager = ListsManager(self.docx)
    self.fonts_manager = FontsManager(namespace, self.oeb, self.opts)
    self.blocks = Blocks(namespace, self.styles_manager, self.links_manager)
    self.current_link = self.current_lang = None

    for spine_item in self.oeb.spine:
        self.log.debug('Processing', spine_item.href)
        self.process_item(spine_item)
    if self.add_toc:
        self.links_manager.process_toc_links(self.oeb)

    if self.add_cover and self.oeb.metadata.cover:
        cover_id = unicode_type(self.oeb.metadata.cover[0])
        if cover_id in self.oeb.manifest.ids:
            cover_item = self.oeb.manifest.ids[cover_id]
            self.cover_img = self.images_manager.read_image(cover_item.href)

    all_blocks = self.blocks.all_blocks
    # Pair each block with its successor; the last block has none
    skipped_positions = []
    for index, (block, following) in enumerate(zip(all_blocks, all_blocks[1:])):
        block.resolve_skipped(following)
        if block.skipped:
            skipped_positions.append(index)
    # Delete from the end so the remaining positions stay valid
    for index in reversed(skipped_positions):
        self.blocks.delete_block_at(index)

    self.blocks.all_blocks[0].is_first_block = True
    self.blocks.apply_page_break_after()
    self.blocks.resolve_language()

    if self.cover_img is not None:
        self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts))
    self.lists_manager.finalize(all_blocks)
    self.styles_manager.finalize(all_blocks)
    self.write()
def process_item(self, item):
    """Convert every ``<body>`` of the spine item ``item`` into blocks.

    The stylizer cached by the SVG rasterizer is reused when there is one,
    otherwise a new ``Stylizer`` is built for the item.
    """
    self.current_item = item
    stylizer = self.svg_rasterizer.stylizer_cache.get(item)
    if stylizer is None:
        stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css)
    resolver = item.abshref
    self.abshref = resolver
    self.images_manager.abshref = resolver
    self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang
    bodies = XPath('//h:body')(item.data)
    for index, body in enumerate(bodies):
        with self.blocks:
            self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body)
            self.process_tag(body, stylizer, is_first_tag=index == 0)
def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
    """Recursively convert ``html_tag``, its children and its tail text.

    The computed ``display`` value decides whether the tag becomes an
    inline run, a list item, part of a table, or a new block. The contents
    of script, style, title, meta and hidden tags are ignored, but their
    tail text is still processed.

    is_first_tag: true for the first ``<body>`` of an item; such a tag is
        never treated as floating and its tail is ignored.
    float_spec: the float specification inherited from an enclosing
        floating tag, if any.
    """
    tagname = barename(html_tag.tag)
    tag_style = stylizer.style(html_tag)
    ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
    display = tag_style._get('display')
    is_block = False

    if not ignore_tag_contents:
        # Link and language apply to this subtree only and are restored below
        previous_link = self.current_link
        if tagname == 'a' and html_tag.get('href'):
            self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
        previous_lang = self.current_lang
        tag_lang = lang_for_tag(html_tag)
        if tag_lang:
            self.current_lang = tag_lang

        is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
        if float_spec is None and is_float:
            float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

        if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we don't want to start a new paragraph
            if is_float and float_spec.is_dropcaps:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                float_spec = None
            else:
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
        elif display.startswith('table') or display == 'inline-table':
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, tag_style)
        else:
            if tagname == 'img' and is_float:
                # Image is floating so don't start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                if tagname == 'hr':
                    # Every border except the top one is switched off
                    for edge in 'right bottom left'.split():
                        tag_style.set('border-%s-style' % edge, 'none')
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

        for child in html_tag.iterchildren():
            if isinstance(getattr(child, 'tag', None), string_or_bytes):
                self.process_tag(child, stylizer, float_spec=float_spec)
            else:  # Comment/PI/etc.
                tail = getattr(child, 'tail', None)
                if tail:
                    block = self.create_block_from_parent(html_tag, stylizer)
                    block.add_text(tail, tag_style, is_parent_style=False, link=self.current_link, lang=self.current_lang)

        # Must be read before finish_tag(), which discards the tag from
        # open_html_blocks
        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        self.current_link = previous_link
        self.current_lang = previous_lang

    # Now, process the tail if any

    if display == 'table-row':
        return  # We ignore the tail for these tags

    ignore_whitespace_tail = is_block or display.startswith('table')
    if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
        # Ignore trailing space after a block tag, as otherwise it will
        # become a new empty paragraph
        block = self.create_block_from_parent(html_tag, stylizer)
        block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link, lang=self.current_lang)
def create_block_from_parent(self, html_tag, stylizer):
    """Return the block that content belonging to ``html_tag``'s parent goes into.

    The blocks container is asked for the current block (or a new one) for
    the parent element, using the parent's computed style.
    """
    container = html_tag.getparent()
    container_style = stylizer.style(container)
    target = self.blocks.current_or_new_block(container, container_style)
    # Do not inherit page-break-before from parent
    target.page_break_before = False
    return target
def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
    """Start a new block for a block-level tag and seed it with its own content.

    A bookmark is attached when the tag carries an ``id`` or ``name``.
    ``img`` tags are handed to the images manager as block images; any
    other tag contributes its leading text, if it has some.
    """
    new_block = self.blocks.start_new_block(
        html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
    anchor = html_tag.get('id') or html_tag.get('name')
    if anchor:
        new_block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
    if tagname == 'img':
        self.images_manager.add_image(html_tag, new_block, stylizer, as_block=True)
        return
    leading_text = html_tag.text
    if leading_text:
        new_block.add_text(
            leading_text, tag_style, ignore_leading_whitespace=True, is_parent_style=True,
            link=self.current_link, lang=self.current_lang)
        return
    # A text-less <li> whose first child is a non-empty nested list must
    # still be kept as a block of its own
    if tagname == 'li' and len(html_tag) and barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]):
        new_block.force_not_empty = True
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
    """Add the content of an inline tag to the block of its parent element.

    ``br`` becomes a break (skipped when it is the last child and has no
    tail), ``img`` becomes an inline image, and any other tag contributes
    its leading text. A tag with an ``id``/``name`` but no text still adds
    an empty text run so that the bookmark is not lost.
    """
    anchor = html_tag.get('id') or html_tag.get('name') or None
    bmark = self.bookmark_for_anchor(anchor, html_tag) if anchor else None

    if tagname == 'br':
        # A trailing <br> with nothing after it produces no break
        if not html_tag.tail and html_tag is tuple(html_tag.getparent().iterchildren('*'))[-1]:
            return
        block = self.create_block_from_parent(html_tag, stylizer)
        clear_values = {'both':'all', 'left':'left', 'right':'right'}
        block.add_break(clear=clear_values.get(tag_style['clear'], 'none'), bookmark=bmark)
        return

    if tagname == 'img':
        block = self.create_block_from_parent(html_tag, stylizer)
        self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        return

    text = html_tag.text
    if not text:
        if not bmark:
            return
        text = ''
    block = self.create_block_from_parent(html_tag, stylizer)
    block.add_text(text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
def bookmark_for_anchor(self, anchor, html_tag):
    """Ask the links manager for the bookmark of ``anchor`` in the current item."""
    item = self.current_item
    return self.links_manager.bookmark_for_anchor(anchor, item, html_tag)
def write(self):
    """Serialize everything collected during conversion into ``self.docx``.

    The body is filled from the blocks first; the table of contents and
    the cover are then added to it (both are optional), and finally the
    styles, images, fonts and list numbering are serialized.
    """
    document, styles, body = create_skeleton(self.opts)
    docx = self.docx
    docx.document, docx.styles = document, styles
    self.blocks.serialize(body)
    body.append(body[0])  # Move <sectPr> to the end
    links = self.links_manager
    if links.toc:
        links.serialize_toc(body, self.styles_manager.primary_heading_style)
    cover = self.cover_img
    if cover is not None:
        self.images_manager.write_cover_block(body, cover)
    self.styles_manager.serialize(docx.styles)
    self.images_manager.serialize(docx.images)
    self.fonts_manager.serialize(self.styles_manager.text_styles, docx.font_table, docx.embedded_fonts, docx.fonts)
    self.lists_manager.serialize(docx.numbering)

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import posixpath
from collections import namedtuple
from functools import partial
from polyglot.builtins import iteritems, itervalues, map, unicode_type
from lxml import etree
from calibre import fit_image
from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.docx.images import pt_to_emu
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import identify
# Record kept for every image added to the document: its relationship id,
# file name inside the package, pixel dimensions, format and manifest item.
Image = namedtuple('Image', ('rid', 'fname', 'width', 'height', 'fmt', 'item'))
def as_num(x):
    """Return ``x`` converted to a float, or 0 if ``float()`` cannot convert it."""
    try:
        number = float(x)
    except Exception:
        return 0
    return number
def get_image_margins(style):
    """Return the ``distL``/``distR``/``distT``/``distB`` values for an image.

    For every edge the padding and margin of ``style`` are added together
    (values that are not numbers count as 0), passed through pt_to_emu()
    and stored as text.
    """
    distances = {}
    for edge in ('Left', 'Right', 'Top', 'Bottom'):
        padding = as_num(getattr(style, 'padding' + edge))
        margin = as_num(getattr(style, 'margin' + edge))
        distances['dist' + edge[0]] = unicode_type(pt_to_emu(padding + margin))
    return distances
class ImagesManager(object):
    """Track the images used by the book and build DOCX drawing markup for them.

    Images are registered on first use (see read_image()), given a unique
    file name under ``media/`` and a relationship id, and handed to the
    package writer through serialize().

    NOTE(review): add_image() calls ``self.abshref``, which is not defined
    in this class; it is presumably assigned by the caller before images
    are added -- confirm.
    """

    def __init__(self, oeb, document_relationships, opts):
        self.oeb, self.log = oeb, oeb.log
        # Page dimensions from the output profile; images are fitted into them
        self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts
        self.images = {}  # href -> Image
        self.seen_filenames = set()  # lower-cased, extension-less names already handed out
        self.document_relationships = document_relationships
        self.count = 0  # number of drawings created so far, used as the wp:docPr id

    def read_image(self, href):
        """Return the Image record for ``href``, registering it on first use.

        Returns None when the manifest has no item for ``href`` or the
        item's data is not bytes. An image that cannot be identified is
        replaced by the bundled blank image.
        """
        if href not in self.images:
            item = self.oeb.manifest.hrefs.get(href)
            if item is None or not isinstance(item.data, bytes):
                return
            try:
                fmt, width, height = identify(item.data)
            except Exception:
                self.log.warning('Replacing corrupted image with blank: %s' % href)
                item.data = I('blank.png', data=True, allow_user_override=False)
                fmt, width, height = identify(item.data)
            image_fname = 'media/' + self.create_filename(href, fmt)
            image_rid = self.document_relationships.add_image(image_fname)
            self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
            item.unload_data_from_memory()
        return self.images[href]

    def add_image(self, img, block, stylizer, bookmark=None, as_block=False):
        """Add the ``<img>`` element ``img`` to ``block``.

        Returns the relationship id of the image, or None when the element
        has no ``src`` or the image could not be read.
        """
        src = img.get('src')
        if not src:
            return
        href = self.abshref(src)
        try:
            rid = self.read_image(href).rid
        except AttributeError:
            # read_image() returned None: nothing usable to embed
            return
        drawing = self.create_image_markup(img, stylizer, href, as_block=as_block)
        block.add_image(drawing, bookmark=bookmark)
        return rid

    @staticmethod
    def _alignment_from_auto_margins(margin_left, margin_right, default):
        """Map the ``auto`` margins of a block image to a horizontal alignment.

        Both margins ``auto`` centers the image, only ``margin-left: auto``
        pushes it to the right and only ``margin-right: auto`` leaves it on
        the left (this last case used to be reported as ``'right'``).
        ``default`` is returned when neither margin is ``auto``.
        """
        left_auto, right_auto = margin_left == 'auto', margin_right == 'auto'
        if left_auto and right_auto:
            return 'center'
        if left_auto:
            return 'right'
        if right_auto:
            return 'left'
        return default

    def create_image_markup(self, html_img, stylizer, href, as_block=False):
        """Build and return the ``w:drawing`` element for ``html_img``.

        The image is emitted inline unless it floats, is aligned through
        auto margins, or is the only content of a block-level parent; in
        those cases it is emitted as an anchored drawing. ``href`` must
        already have been registered with read_image().
        """
        # TODO: img inside a link (clickable image)
        style = stylizer.style(html_img)
        floating = style['float']
        if floating not in {'left', 'right'}:
            floating = None
        if as_block:
            floating = self._alignment_from_auto_margins(
                style._get('margin-left'), style._get('margin-right'), floating)
        else:
            parent = html_img.getparent()
            if len(parent) == 1 and not (parent.text or '').strip() and not (html_img.tail or '').strip():
                pstyle = stylizer.style(parent)
                if 'block' in pstyle['display']:
                    # We have an inline image alone inside a block
                    as_block = True
                    floating = pstyle['float']
                    if floating not in {'left', 'right'}:
                        floating = None
                        if pstyle['text-align'] in ('center', 'right'):
                            floating = pstyle['text-align']
                    floating = floating or 'left'
        fake_margins = floating is None
        self.count += 1
        img = self.images[href]
        name = urlunquote(posixpath.basename(href))
        width, height = style.img_size(img.width, img.height)
        scaled, width, height = fit_image(width, height, self.page_width, self.page_height)
        width, height = map(pt_to_emu, (width, height))

        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces

        root = etree.Element('root', nsmap=namespaces)
        ans = makeelement(root, 'w:drawing', append=False)
        if floating is None:
            parent = makeelement(ans, 'wp:inline')
        else:
            parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
            # The attributes and elements below are boilerplate that Word
            # requires, even though the DOCX specs define defaults for all
            # of them
            parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
            parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
            makeelement(parent, 'wp:simplePos', x='0', y='0')
            makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
            makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
        makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
        if fake_margins:
            # DOCX does not support setting margins for inline images, so we
            # fake it by using effect extents to simulate margins
            makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))})
        else:
            makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
        if floating is not None:
            # Word requires the wrap element to come after the extent settings
            if as_block:
                makeelement(parent, 'wp:wrapTopAndBottom')
            else:
                makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
        self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height)
        return ans

    def create_docx_image_markup(self, parent, name, alt, img_rid, width, height):
        """Append the ``wp:docPr`` and ``a:graphic`` picture markup to ``parent``.

        ``width`` and ``height`` are written unchanged into ``a:ext``; the
        callers in this class pass values already converted by pt_to_emu().
        """
        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
        makeelement(parent, 'wp:docPr', id=unicode_type(self.count), name=name, descr=alt)
        makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1")
        g = makeelement(parent, 'a:graphic')
        gd = makeelement(g, 'a:graphicData', uri=namespaces['pic'])
        pic = makeelement(gd, 'pic:pic')
        nvPicPr = makeelement(pic, 'pic:nvPicPr')
        makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt)
        makeelement(nvPicPr, 'pic:cNvPicPr')
        bf = makeelement(pic, 'pic:blipFill')
        makeelement(bf, 'a:blip', r_embed=img_rid)
        makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
        spPr = makeelement(pic, 'pic:spPr')
        xfrm = makeelement(spPr, 'a:xfrm')
        makeelement(xfrm, 'a:off', x='0', y='0'), makeelement(xfrm, 'a:ext', cx=unicode_type(width), cy=unicode_type(height))
        makeelement(makeelement(spPr, 'a:prstGeom', prst='rect'), 'a:avLst')

    def create_filename(self, href, fmt):
        """Return a file name for ``href`` that is unique ignoring case.

        The name is the ASCII base name of ``href`` without its extension,
        cut to 75 characters (``image`` if nothing is left), with a number
        appended when it was already used, plus ``fmt`` as the extension.
        """
        fname = ascii_filename(urlunquote(posixpath.basename(href)))
        fname = posixpath.splitext(fname)[0]
        fname = fname[:75].rstrip('.') or 'image'
        num = 0
        base = fname
        while fname.lower() in self.seen_filenames:
            num += 1
            fname = base + unicode_type(num)
        self.seen_filenames.add(fname.lower())
        fname += os.extsep + fmt.lower()
        return fname

    def serialize(self, images_map):
        """Register a lazy data getter for every image under ``word/<fname>``."""
        for img in itervalues(self.images):
            images_map['word/' + img.fname] = partial(self.get_data, img.item)

    def get_data(self, item):
        """Return ``item.data`` and unload the item's data afterwards."""
        try:
            return item.data
        finally:
            item.unload_data_from_memory(False)

    def create_cover_markup(self, img, preserve_aspect_ratio, width, height):
        """Build and return the page-centered ``w:drawing`` for the cover.

        With ``preserve_aspect_ratio`` the shorter side of the
        ``width`` x ``height`` box is recomputed from the image's
        proportions before the values are converted with pt_to_emu().
        """
        self.count += 1
        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
        if preserve_aspect_ratio:
            if img.width >= img.height:
                ar = img.height / img.width
                height = ar * width
            else:
                ar = img.width / img.height
                width = ar * height

        root = etree.Element('root', nsmap=namespaces)
        ans = makeelement(root, 'w:drawing', append=False)
        parent = makeelement(ans, 'wp:anchor', **{'dist'+edge:'0' for edge in 'LRTB'})
        parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
        parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
        makeelement(parent, 'wp:simplePos', x='0', y='0')
        makeelement(makeelement(parent, 'wp:positionH', relativeFrom='page'), 'wp:align').text = 'center'
        makeelement(makeelement(parent, 'wp:positionV', relativeFrom='page'), 'wp:align').text = 'center'
        width, height = map(pt_to_emu, (width, height))
        makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
        makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
        makeelement(parent, 'wp:wrapTopAndBottom')
        self.create_docx_image_markup(parent, 'cover.jpg', _('Cover'), img.rid, width, height)
        return ans

    def write_cover_block(self, body, cover_image):
        """Insert a paragraph holding ``cover_image`` at the start of ``body``.

        The first ``pageBreakBefore`` element found is switched on first;
        an IndexError is raised if the document has none.
        """
        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
        pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
        pbb.set('{%s}val' % namespaces['w'], 'on')
        p = makeelement(body, 'w:p', append=False)
        body.insert(0, p)
        r = makeelement(p, 'w:r')
        r.append(cover_image)

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath, re
from uuid import uuid4
from calibre.utils.filenames import ascii_text
from polyglot.builtins import unicode_type
from polyglot.urllib import urlparse
def start_text(tag, prefix_len=0, top_level=True):
    """Return roughly the first 50 characters of the text inside ``tag``.

    The text of ``tag`` and of its element children (with their tails) is
    collected until the limit of ``50 - prefix_len`` characters is reached.
    Only the top-level call truncates the result and appends ``...``.
    """
    limit = 50 - prefix_len
    pieces = [tag.text or '']
    size = len(pieces[0])
    if size < limit:
        for child in tag.iterchildren('*'):
            piece = start_text(child, size, top_level=False) + (child.tail or '')
            pieces.append(piece)
            size += len(piece)
            if size >= limit:
                break
    text = ''.join(pieces)
    if not top_level or len(text) <= limit:
        return text
    return text[:limit] + '...'
class TOCItem(object):
    """One entry of the generated table of contents."""

    def __init__(self, title, bmark, level):
        self.title = title
        self.bmark = bmark
        self.level = level
        # Set by the links manager on the first and last entries, which
        # open and close the TOC field
        self.is_first = False
        self.is_last = False

    def serialize(self, body, makeelement):
        """Build the paragraph for this entry and insert it at the start of ``body``.

        The first entry also emits the beginning of the ``TOC`` field and
        the last entry emits its end.
        """
        para = makeelement(body, 'w:p', append=False)
        props = makeelement(para, 'w:pPr')
        makeelement(props, 'w:pStyle', w_val="Normal")
        makeelement(props, 'w:ind', w_left='0', w_firstLineChars='0', w_firstLine='0', w_leftChars=unicode_type(200 * self.level))

        def field_char(kind):
            # A run holding a single field character of the given type
            makeelement(makeelement(para, 'w:r'), 'w:fldChar', w_fldCharType=kind)

        if self.is_first:
            makeelement(props, 'w:pageBreakBefore', w_val='off')
            field_char('begin')
            instr_run = makeelement(para, 'w:r')
            makeelement(instr_run, 'w:instrText').text = r' TOC \h '
            instr_run[0].set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
            field_char('separate')

        link = makeelement(para, 'w:hyperlink', w_anchor=self.bmark)
        link_run = makeelement(link, 'w:r')
        run_props = makeelement(link_run, 'w:rPr')
        makeelement(run_props, 'w:color', w_val='0000FF', w_themeColor='hyperlink')
        makeelement(run_props, 'w:u', w_val='single')
        makeelement(link_run, 'w:t').text = self.title

        if self.is_last:
            field_char('end')
        body.insert(0, para)
def sanitize_bookmark_name(base):
    """Turn ``base`` into a bookmark name made of ASCII letters, digits and ``_``."""
    # Max length allowed by Word appears to be 40, we use 32 to leave some
    # space for making the name unique
    ascii_name = ascii_text(base)
    cleaned = re.sub(r'[^0-9a-zA-Z]', '_', ascii_name)
    return cleaned[:32].rstrip('_')
class LinksManager(object):
    """Manage bookmarks, hyperlinks and the table of contents of the document."""

    def __init__(self, namespace, document_relationships, log):
        self.namespace = namespace
        self.log = log
        self.document_relationships = document_relationships
        # Random anchor name that stands for "the top of a document"
        self.top_anchor = unicode_type(uuid4().hex)
        self.anchor_map = {}  # (href, anchor) -> bookmark name
        self.used_bookmark_names = set()
        self.bmark_id = 0
        self.document_hrefs = set()  # hrefs whose top anchor has been registered
        self.external_links = {}  # url -> relationship id
        self.toc = []

    def bookmark_for_anchor(self, anchor, current_item, html_tag):
        """Return the bookmark name for ``anchor`` in ``current_item``, creating it if needed.

        New names are derived from the start of the tag's text (or from the
        item's file name for the top anchor), sanitized and made unique.
        """
        key = (current_item.href, anchor)
        try:
            return self.anchor_map[key]
        except KeyError:
            pass
        if anchor == self.top_anchor:
            label = 'Top of %s' % posixpath.basename(current_item.href)
            self.document_hrefs.add(current_item.href)
        else:
            label = start_text(html_tag).strip() or anchor
        base = sanitize_bookmark_name(label)
        candidate, suffix = base, 0
        while candidate in self.used_bookmark_names:
            suffix += 1
            candidate = '%s_%d' % (base, suffix)
        self.anchor_map[key] = candidate
        self.used_bookmark_names.add(candidate)
        return candidate

    @property
    def bookmark_id(self):
        """Return a fresh bookmark id; every access increments the counter."""
        self.bmark_id += 1
        return self.bmark_id

    def _bookmark_for_target(self, href, fragment):
        """Return the bookmark for ``href#fragment``, falling back to the top of ``href``."""
        key = (href, fragment or self.top_anchor)
        if key not in self.anchor_map:
            key = (href, self.top_anchor)
        return self.anchor_map[key]

    def serialize_hyperlink(self, parent, link):
        """Create a ``w:hyperlink`` under ``parent`` for ``link`` and return it.

        ``link`` is an ``(item, url, tooltip)`` triple. Links without a
        scheme are resolved against ``item`` and point at a bookmark;
        http, https and ftp links become external relationships. For any
        other link ``parent`` itself is returned.
        """
        item, url, tooltip = link
        purl = urlparse(url)

        def make_link(attr, value):
            kw = {}
            if value is not None:
                kw[attr] = value
            if tooltip:
                kw['w_tooltip'] = tooltip
            return self.namespace.makeelement(parent, 'w:hyperlink', **kw)

        if not purl.scheme:
            href = item.abshref(purl.path)
            if href in self.document_hrefs:
                return make_link('w_anchor', self._bookmark_for_target(href, purl.fragment))
            self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
        if purl.scheme in {'http', 'https', 'ftp'}:
            if url not in self.external_links:
                self.external_links[url] = self.document_relationships.add_relationship(url, self.namespace.names['LINKS'], target_mode='External')
            return make_link('r_id', self.external_links[url])
        return parent

    def process_toc_node(self, toc, level=0):
        """Add a TOCItem for ``toc`` (when its target is known) and recurse into its children."""
        target = toc.href
        if target:
            purl = urlparse(target)
            if purl.path in self.document_hrefs:
                bmark = self._bookmark_for_target(purl.path, purl.fragment)
                self.toc.append(TOCItem(toc.title, bmark, level))
        for child in toc:
            self.process_toc_node(child, level + 1)

    def process_toc_links(self, oeb):
        """Rebuild ``self.toc`` from ``oeb.toc``; a TOC with at most one entry is ignored."""
        self.toc = []
        if not (oeb.toc and oeb.toc.count() > 1):
            return
        for child in oeb.toc:
            self.process_toc_node(child)
        if self.toc:
            self.toc[0].is_first = True
            self.toc[-1].is_last = True

    def serialize_toc(self, body, primary_heading_style):
        """Insert the table of contents and its heading at the start of ``body``.

        The first ``pageBreakBefore`` element found is switched on so that
        the content following the TOC starts on a new page.
        """
        first_break = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
        first_break.set('{%s}val' % self.namespace.namespaces['w'], 'on')
        # Every entry inserts itself at index 0, hence the reversed order
        for entry in reversed(self.toc):
            entry.serialize(body, self.namespace.makeelement)
        title = __('Table of Contents')
        makeelement = self.namespace.makeelement
        heading = makeelement(body, 'w:p', append=False)
        heading_props = makeelement(heading, 'w:pPr')
        if primary_heading_style is not None:
            makeelement(heading_props, 'w:pStyle', w_val=primary_heading_style.id)
        makeelement(heading_props, 'w:pageBreakBefore', w_val='off')
        makeelement(makeelement(heading, 'w:r'), 'w:t').text = title
        body.insert(0, heading)

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from operator import attrgetter
from polyglot.builtins import iteritems, itervalues, unicode_type
# CSS list-style-type keywords that are treated as lists
LIST_STYLES = frozenset((
    'disc', 'circle', 'square',
    'decimal', 'decimal-leading-zero',
    'lower-roman', 'upper-roman',
    'lower-greek',
    'lower-alpha', 'lower-latin', 'upper-alpha', 'upper-latin',
    'hiragana', 'hebrew', 'katakana-iroha', 'cjk-ideographic',
))

# CSS list-style-type -> value used in the DOCX numbering definition. For
# 'circle' and 'square' the value is the bullet character itself.
STYLE_MAP = {
    # bullets
    'disc': 'bullet',
    'circle': 'o',
    'square': '\uf0a7',
    # numbers
    'decimal': 'decimal',
    'decimal-leading-zero': 'decimalZero',
    'lower-roman': 'lowerRoman',
    'upper-roman': 'upperRoman',
    # letters
    'lower-alpha': 'lowerLetter',
    'lower-latin': 'lowerLetter',
    'upper-alpha': 'upperLetter',
    'upper-latin': 'upperLetter',
    # other scripts
    'hiragana': 'aiueo',
    'hebrew': 'hebrew1',
    'katakana-iroha': 'iroha',
    'cjk-ideographic': 'chineseCounting',
}
def find_list_containers(list_tag, tag_style):
    """Return the ancestors of ``list_tag`` that declare a list style, innermost first.

    An ancestor counts when its own style sets ``list-style-type`` to one
    of the values in LIST_STYLES.
    """
    stylizer = tag_style._stylizer
    containers = []
    current = list_tag
    ancestor = current.getparent()
    while ancestor is not None and ancestor is not current:
        current = ancestor
        declared = stylizer.style(current)._style.get('list-style-type', None) or ''
        if declared.lower() in LIST_STYLES:
            containers.append(current)
        ancestor = current.getparent()
    return containers
class NumberingDefinition(object):
    """The numbering definition shared by all list items below one top-most container."""

    def __init__(self, top_most, stylizer, namespace):
        self.namespace = namespace
        self.top_most = top_most
        self.stylizer = stylizer
        # ilvl -> list of (container, list_tag, block, list_type, tag_style)
        self.level_map = defaultdict(list)
        self.num_id = None

    def finalize(self):
        """Build ``self.levels``, one Level per nesting level, in level order.

        The container and list type of a level are those of the last item
        recorded for it.
        """
        tags_by_level = defaultdict(list)
        container_by_level, type_by_level = {}, {}
        for ilvl, entries in self.level_map.items():
            for container, list_tag, block, list_type, tag_style in entries:
                tags_by_level[ilvl].append(list_tag)
                container_by_level[ilvl] = container
                type_by_level[ilvl] = list_type
        levels = []
        for ilvl in sorted(self.level_map):
            levels.append(Level(type_by_level[ilvl], container_by_level[ilvl], tags_by_level[ilvl], ilvl=ilvl))
        self.levels = tuple(levels)

    # Note: no __eq__ is defined, so two definitions only compare equal when
    # they are the same object, even if their hashes match.
    def __hash__(self):
        return hash(self.levels)

    def link_blocks(self):
        """Store ``(num_id + 1, ilvl)`` as the numbering id of every block in this definition."""
        for ilvl, entries in self.level_map.items():
            for entry in entries:
                entry[2].numbering_id = (self.num_id + 1, ilvl)

    def serialize(self, parent):
        """Append the ``w:abstractNum`` element of this definition to ``parent``."""
        makeelement = self.namespace.makeelement
        abstract_num = makeelement(parent, 'w:abstractNum', w_abstractNumId=unicode_type(self.num_id))
        makeelement(abstract_num, 'w:multiLevelType', w_val='hybridMultilevel')
        makeelement(abstract_num, 'w:name', w_val='List %d' % (self.num_id + 1))
        for level in self.levels:
            level.serialize(abstract_num, makeelement)
class Level(object):
    """One nesting level of a numbering definition."""

    def __init__(self, list_type, container, items, ilvl=0):
        self.ilvl = ilvl
        # The start number comes from the container's ``start`` attribute,
        # overridden by the ``value`` attribute of the first item, and
        # defaults to 1 when neither is a valid integer
        self.start = 1
        try:
            self.start = int(container.get('start'))
        except Exception:
            pass
        if items:
            try:
                self.start = int(items[0].get('value'))
            except Exception:
                pass
        is_bullet = list_type in {'disc', 'circle', 'square'}
        if is_bullet:
            self.num_fmt = 'bullet'
            if list_type == 'disc':
                self.lvl_text = '\uf0b7'
            else:
                self.lvl_text = STYLE_MAP[list_type]
        else:
            self.lvl_text = '%{}.'.format(self.ilvl + 1)
            self.num_fmt = STYLE_MAP.get(list_type, 'decimal')

    def __hash__(self):
        return hash((self.start, self.num_fmt, self.lvl_text))

    def serialize(self, parent, makeelement):
        """Append the ``w:lvl`` element describing this level to ``parent``."""
        lvl = makeelement(parent, 'w:lvl', w_ilvl=unicode_type(self.ilvl))
        makeelement(lvl, 'w:start', w_val=unicode_type(self.start))
        makeelement(lvl, 'w:numFmt', w_val=self.num_fmt)
        makeelement(lvl, 'w:lvlText', w_val=self.lvl_text)
        makeelement(lvl, 'w:lvlJc', w_val='left')
        para_props = makeelement(lvl, 'w:pPr')
        makeelement(para_props, 'w:ind', w_hanging='360', w_left=unicode_type(1152 + self.ilvl * 360))
        if self.num_fmt == 'bullet':
            # Each bullet character is paired with the font it is drawn in
            bullet_fonts = {'\uf0b7':'Symbol', '\uf0a7':'Wingdings'}
            ff = bullet_fonts.get(self.lvl_text, 'Courier New')
            run_props = makeelement(lvl, 'w:rPr')
            makeelement(run_props, 'w:rFonts', w_ascii=ff, w_hAnsi=ff, w_hint="default")
class ListsManager(object):
    """Build the numbering definitions for all list items in the document."""

    def __init__(self, docx):
        self.namespace = docx.namespace
        self.lists = {}
        # Filled in by finalize(); initialised here so that serialize() is a
        # harmless no-op when there are no lists or finalize() has not run
        self.definitions = []

    def finalize(self, all_blocks):
        """Group list blocks by their top-most list container and number them.

        Blocks without a list tag, with a list style outside LIST_STYLES,
        or without any list container are skipped. The resulting
        definitions are stored in ``self.definitions``, ordered by
        ``num_id``, and every block is linked to its definition.
        """
        lists = {}
        for block in all_blocks:
            if block.list_tag is None:
                continue
            list_tag, tag_style = block.list_tag
            list_type = (tag_style['list-style-type'] or '').lower()
            if list_type not in LIST_STYLES:
                continue
            container_tags = find_list_containers(list_tag, tag_style)
            if not container_tags:
                continue
            top_most = container_tags[-1]
            if top_most not in lists:
                lists[top_most] = NumberingDefinition(top_most, tag_style._stylizer, self.namespace)
            definition = lists[top_most]
            # Containers are ordered innermost first, so their count gives
            # the nesting depth
            ilvl = len(container_tags) - 1
            definition.level_map[ilvl].append((container_tags[0], list_tag, block, list_type, tag_style))

        for definition in lists.values():
            definition.finalize()

        definitions = {}
        for defn in lists.values():
            try:
                defn = definitions[defn]
            except KeyError:
                definitions[defn] = defn
                defn.num_id = len(definitions) - 1
            defn.link_blocks()
        self.definitions = sorted(definitions.values(), key=attrgetter('num_id'))

    def serialize(self, parent):
        """Append all ``w:abstractNum`` elements to ``parent``, followed by one ``w:num`` each."""
        for defn in self.definitions:
            defn.serialize(parent)
        makeelement = self.namespace.makeelement
        for defn in self.definitions:
            n = makeelement(parent, 'w:num', w_numId=unicode_type(defn.num_id + 1))
            makeelement(n, 'w:abstractNumId', w_val=unicode_type(defn.num_id))

View File

@@ -0,0 +1,768 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import numbers
from collections import Counter, defaultdict
from operator import attrgetter
from lxml import etree
from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import iteritems, filter, unicode_type
from tinycss.css21 import CSS21Parser
# Shared parser instance, used by parse_css_font_family()
css_parser = CSS21Parser()
# Names of the four edges, used to build per-edge CSS property and
# attribute names
border_edges = ('left', 'top', 'right', 'bottom')
# Attribute name templates; presumably each is filled in with an edge name
# from border_edges -- TODO confirm against the code that uses them
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
# Sentinel stored by TextStyle while merging per-edge values to record that
# the edges do not agree
ignore = object()
def parse_css_font_family(raw):
    """Yield the font family names listed in the CSS ``font-family`` value ``raw``.

    Only string and identifier tokens are names. Iteration stops at the
    keyword ``inherit``. Nothing is yielded when ``raw`` does not parse
    into a declaration.
    """
    decl, errs = css_parser.parse_style_attr('font-family:' + raw)
    if not decl:
        return
    for token in decl[0].value:
        # Compare against the exact token types. The previous substring test
        # (token.type in 'STRING IDENT') also accepted any type whose name
        # happens to be a substring, such as the whitespace type 'S'.
        if token.type in ('STRING', 'IDENT'):
            val = token.value
            if val == 'inherit':
                break
            yield val
def css_font_family_to_docx(raw):
    """Return the font name to use for the CSS ``font-family`` value ``raw``.

    Only the first family is considered; generic CSS families are replaced
    by concrete fonts. Returns None when ``raw`` names no family.
    """
    generic = {'serif':'Cambria', 'sansserif':'Candara', 'sans-serif':'Candara', 'fantasy':'Comic Sans', 'cursive':'Segoe Script'}
    for family in parse_css_font_family(raw):
        break
    else:
        return None
    return generic.get(family.lower(), family)
def bmap(x):
    """Return ``'on'`` for a truthy ``x`` and ``'off'`` otherwise."""
    if x:
        return 'on'
    return 'off'
def is_dropcaps(html_tag, tag_style):
    """Return True if ``html_tag`` looks like a drop cap.

    That is: it has fewer than two children, fewer than five characters of
    text (the tail is not counted) and is floated left.
    """
    if len(html_tag) >= 2:
        return False
    text = etree.tostring(html_tag, method='text', encoding='unicode', with_tail=False)
    if len(text) >= 5:
        return False
    return tag_style['float'] == 'left'
class CombinedStyle(object):
    """A paragraph style combining a block style (``bs``) with a run style (``rs``)."""

    def __init__(self, bs, rs, blocks, namespace):
        self.bs = bs
        self.rs = rs
        self.blocks = blocks
        self.namespace = namespace
        # These are assigned later, outside this class
        self.id = None
        self.name = None
        self.seq = None
        self.outline_level = None

    def apply(self):
        """Link every block to this style and give its runs ``rs`` as parent style."""
        run_style = self.rs
        for block in self.blocks:
            block.linked_style = self
            for run in block.runs:
                run.parent_style = run_style

    def serialize(self, styles, normal_style):
        """Append the ``w:style`` element for this style to ``styles``.

        Every style other than ``normal_style`` is based on it, and the
        style with ``seq`` 0 is marked as the default.
        """
        makeelement = self.namespace.makeelement
        style_el = makeelement(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
        makeelement(style_el, 'w:name', w_val=self.name)
        makeelement(style_el, 'w:qFormat')
        if self is not normal_style:
            makeelement(style_el, 'w:basedOn', w_val=normal_style.id)
        if self.seq == 0:
            style_el.set('{%s}%s' % (self.namespace.namespaces['w'], 'default'), '1')
        para_props = makeelement(style_el, 'w:pPr')
        self.bs.serialize_properties(para_props, normal_style.bs)
        if self.outline_level is not None:
            makeelement(para_props, 'w:outlineLvl', w_val=unicode_type(self.outline_level + 1))
        run_props = makeelement(style_el, 'w:rPr')
        self.rs.serialize_properties(run_props, normal_style.rs)
class FloatSpec(object):
    """Frame properties (``w:framePr``) shared by the blocks of a floated element.

    A float that looks like a drop cap (see is_dropcaps()) is serialized as
    a drop cap; any other float is serialized as a frame whose alignment,
    size and spacing come from the tag's style.
    """

    def __init__(self, namespace, html_tag, tag_style):
        self.makeelement = namespace.makeelement
        self.is_dropcaps = is_dropcaps(html_tag, tag_style)
        # Not filled in here; serialize() compares blocks against the first
        # and last entries
        self.blocks = []
        if self.is_dropcaps:
            self.dropcaps_lines = 3
        else:
            # The attributes in this branch exist only for non-dropcaps
            # floats, and serialize() reads them only in that case
            self.x_align = tag_style['float']
            self.w = self.h = None
            # Sizes are multiplied by 20 -- presumably points to twentieths
            # of a point, TODO confirm
            if tag_style._get('width') != 'auto':
                self.w = int(20 * max(tag_style['min-width'], tag_style['width']))
            if tag_style._get('height') == 'auto':
                self.h_rule = 'auto'
            else:
                # A positive min-height takes precedence over height
                if tag_style['min-height'] > 0:
                    self.h_rule, self.h = 'atLeast', tag_style['min-height']
                else:
                    self.h_rule, self.h = 'exact', tag_style['height']
                self.h = int(20 * self.h)
            # A single spacing value per axis: the larger of the two margins
            self.h_space = int(20 * max(tag_style['margin-right'], tag_style['margin-left']))
            self.v_space = int(20 * max(tag_style['margin-top'], tag_style['margin-bottom']))

        # serialize() reads padding_<edge> and border_<edge>_* attributes
        # from self; presumably this call sets them -- TODO confirm
        read_css_block_borders(self, tag_style)

    def serialize(self, block, parent):
        """Append the frame, indent, spacing and border elements for ``block`` to ``parent``."""
        if self.is_dropcaps:
            attrs = dict(w_dropCap='drop', w_lines=unicode_type(self.dropcaps_lines), w_wrap='around', w_vAnchor='text', w_hAnchor='text')
        else:
            attrs = dict(
                w_wrap='around', w_vAnchor='text', w_hAnchor='text', w_xAlign=self.x_align, w_y='1',
                w_hSpace=unicode_type(self.h_space), w_vSpace=unicode_type(self.v_space), w_hRule=self.h_rule
            )
            if self.w is not None:
                attrs['w_w'] = unicode_type(self.w)
            if self.h is not None:
                attrs['w_h'] = unicode_type(self.h)
        self.makeelement(parent, 'w:framePr', **attrs)
        # Margins are already applied by the frame style, so override them to
        # be zero on individual blocks
        self.makeelement(parent, 'w:ind', w_left='0', w_leftChars='0', w_right='0', w_rightChars='0')
        attrs = {}
        # Spacing is zeroed before the first block and after the last block
        if block is self.blocks[0]:
            attrs.update(dict(w_before='0', w_beforeLines='0'))
        if block is self.blocks[-1]:
            attrs.update(dict(w_after='0', w_afterLines='0'))
        if attrs:
            self.makeelement(parent, 'w:spacing', **attrs)
        # Similarly apply the same border and padding properties to all blocks
        # in this floatspec
        bdr = self.makeelement(parent, 'w:pBdr')
        for edge in border_edges:
            padding = getattr(self, 'padding_' + edge)
            width = getattr(self, 'border_%s_width' % edge)
            bstyle = getattr(self, 'border_%s_style' % edge)
            self.makeelement(
                bdr, 'w:'+edge, w_space=unicode_type(padding), w_val=bstyle, w_sz=unicode_type(width), w_color=getattr(self, 'border_%s_color' % edge))
class DOCXStyle(object):
    """Base class for DOCX styles that are compared and hashed by their properties.

    Subclasses list the attribute names that define the style in
    ``ALL_PROPS`` and must set those attributes before calling
    ``DOCXStyle.__init__``, because the hash is computed there.
    """

    ALL_PROPS = ()
    TYPE = 'paragraph'

    def __init__(self, namespace):
        self.namespace = namespace
        # Helper that qualifies a name with the "w" namespace
        self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
        self.id = self.name = None
        self.next_style = None
        self.calculate_hash()

    def calculate_hash(self):
        """Cache the hash of the current values of all ``ALL_PROPS`` attributes."""
        self._hash = hash(tuple(
            getattr(self, x) for x in self.ALL_PROPS))

    def makeelement(self, parent, name, **attrs):
        """Create an element in the "w" namespace; it is not appended to ``parent``."""
        return parent.makeelement(self.w(name), **{self.w(k):v for k, v in iteritems(attrs)})

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        for x in self.ALL_PROPS:
            if getattr(self, x) != getattr(other, x, None):
                return False
        return True

    def __ne__(self, other):
        return not self == other

    def __repr__(self):
        # serialize() needs a normal style as its second argument, and
        # etree.tostring() returns bytes unless asked for unicode; the
        # previous implementation got both wrong and always raised. This is
        # a debugging aid, so fall back to a plain description when the
        # style cannot be serialized (for example while id/name are unset).
        try:
            root = etree.Element(self.__class__.__name__, nsmap={'w':self.namespace.namespaces['w']})
            return etree.tostring(self.serialize(root, self), encoding='unicode', pretty_print=True)
        except Exception:
            return '<%s id=%r name=%r>' % (self.__class__.__name__, self.id, self.name)
    __str__ = __repr__

    def serialize(self, styles, normal_style):
        """Append the ``style`` element for this style to ``styles`` and return it.

        Every style other than ``normal_style`` gets a ``basedOn``
        reference to it.
        """
        makeelement = self.makeelement
        style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
        style.append(makeelement(style, 'name', val=self.name))
        if self is not normal_style:
            style.append(makeelement(style, 'basedOn', val=normal_style.id))
        styles.append(style)
        return style
# CSS border-style keyword -> DOCX border value
LINE_STYLES = {
    # no visible border
    'none': 'none',
    'hidden': 'none',
    # plain lines
    'dotted': 'dotted',
    'dashed': 'dashed',
    'solid': 'single',
    'double': 'double',
    # 3D effects
    'groove': 'threeDEngrave',
    'ridge': 'threeDEmboss',
    'inset': 'inset',
    'outset': 'outset',
}
class TextStyle(DOCXStyle):
ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
'background_color', 'underline', 'strike', 'dstrike', 'caps',
'shadow', 'small_caps', 'spacing', 'vertical_align', 'padding',
'border_style', 'border_width', 'border_color')
TYPE = 'character'
def __init__(self, namespace, css, is_parent_style=False):
    """Derive the run properties from the computed CSS style ``css``.

    With ``is_parent_style`` true, the background color and the inline
    border/padding properties are not read from ``css`` and keep their
    defaults.
    """
    self.font_family = css_font_family_to_docx(css['font-family'])
    try:
        self.font_size = max(0, int(float(css['font-size']) * 2))  # stylizer normalizes all font sizes into pts
    except (ValueError, TypeError, AttributeError):
        self.font_size = None

    fw = css['font-weight']
    # Bold is either a keyword or a numeric weight of at least 700
    self.bold = (fw.lower() if hasattr(fw, 'lower') else fw) in {'bold', 'bolder'} or int_or_zero(fw) >= 700
    self.italic = css['font-style'].lower() in {'italic', 'oblique'}
    self.color = convert_color(css['color'])
    self.background_color = None if is_parent_style else convert_color(css.backgroundColor)
    td = set((css.effective_text_decoration or '').split())
    self.underline = 'underline' in td
    # line-through together with overline is rendered as a double strike
    self.dstrike = 'line-through' in td and 'overline' in td
    self.strike = not self.dstrike and 'line-through' in td
    self.text_transform = css['text-transform']  # TODO: If lowercase or capitalize, transform the actual text
    self.caps = self.text_transform == 'uppercase'
    self.small_caps = css['font-variant'].lower() in {'small-caps', 'smallcaps'}
    self.shadow = css['text-shadow'] not in {'none', None}
    try:
        self.spacing = int(float(css['letter-spacing']) * 20)
    except (ValueError, TypeError, AttributeError):
        self.spacing = None
    va = css.first_vertical_align
    if isinstance(va, numbers.Number):
        # Numeric alignment is stored as text, doubled in the same way as
        # the font size above
        self.vertical_align = unicode_type(int(va * 2))
    else:
        val = {
            'top':'superscript', 'text-top':'superscript', 'sup':'superscript', 'super':'superscript',
            'bottom':'subscript', 'text-bottom':'subscript', 'sub':'subscript'}.get(va)
        self.vertical_align = val or 'baseline'

    # None means "no edge seen yet"; while merging, the ``ignore`` sentinel
    # records that the edges disagree
    self.padding = self.border_color = self.border_width = self.border_style = None
    if not is_parent_style:
        # DOCX does not support individual borders/padding for inline content
        for edge in border_edges:
            # In DOCX padding can only be a positive integer
            try:
                padding = max(0, int(css['padding-' + edge]))
            except ValueError:
                padding = 0
            if self.padding is None:
                self.padding = padding
            elif self.padding != padding:
                self.padding = ignore
            val = css['border-%s-width' % edge]
            if not isinstance(val, numbers.Number):
                val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
            # Scaled by 8 and clamped to the range 2..96
            val = min(96, max(2, int(val * 8)))
            if self.border_width is None:
                self.border_width = val
            elif self.border_width != val:
                self.border_width = ignore
            color = convert_color(css['border-%s-color' % edge])
            if self.border_color is None:
                self.border_color = color
            elif self.border_color != color:
                self.border_color = ignore
            style = LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none')
            if self.border_style is None:
                self.border_style = style
            elif self.border_style != style:
                self.border_style = ignore

    # Properties that were never set, or on which the edges disagree, fall
    # back to "no border"
    if self.padding in (None, ignore):
        self.padding = 0
    if self.border_width in (None, ignore):
        self.border_width = 0
    if self.border_style in (None, ignore):
        self.border_style = 'none'
    if self.border_color in (None, ignore):
        self.border_color = 'auto'
    if self.border_style == 'none':
        self.border_width, self.border_color = 0, 'auto'

    # Must come last: the base class computes the hash from the attributes
    # set above
    DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
    """Set the border attributes on ``bdr`` and return it.

    An attribute is written when this style is the normal style itself or
    when its value differs from the normal style's value.
    """
    w = self.w
    write_all = self is normal_style
    # (attribute on the style, XML attribute name, converter to text)
    spec = (
        ('padding', 'space', unicode_type),
        ('border_width', 'sz', unicode_type),
        ('border_style', 'val', None),
        ('border_color', 'color', None),
    )
    for attr, xml_name, convert in spec:
        value = getattr(self, attr)
        if write_all or value != getattr(normal_style, attr):
            bdr.set(w(xml_name), value if convert is None else convert(value))
    return bdr
def serialize(self, styles, normal_style):
    '''
    Serialize this text style: the base class creates the style element
    and an ``rPr`` child carrying the run properties is attached only if
    it ended up with at least one child. Returns the style element.
    '''
    root = DOCXStyle.serialize(self, styles, normal_style)
    run_props = self.makeelement(root, 'rPr')
    self.serialize_properties(run_props, normal_style)
    if len(run_props):
        root.append(run_props)
    return root
def serialize_properties(self, rPr, normal_style):
    '''
    Append the run property elements of this style to ``rPr``.

    If this style is ``normal_style`` every property is emitted, otherwise
    only the properties whose value differs from ``normal_style``.
    '''
    make = self.makeelement
    is_normal = self is normal_style

    def differs(attr):
        return is_normal or getattr(self, attr) != getattr(normal_style, attr)

    def emit(tag, **attrs):
        rPr.append(make(rPr, tag, **attrs))

    if differs('font_family'):
        emit('rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()})

    # Each of these is written twice: the plain element and its "Cs" variant
    for tag, attr, convert in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
        if differs(attr):
            val = getattr(self, attr)
            for suffix in ('', 'Cs'):
                emit(tag + suffix, val=convert(val))

    if differs('color'):
        emit('color', val=self.color or 'auto')
    if differs('background_color'):
        emit('shd', fill=self.background_color or 'auto')
    if differs('underline'):
        emit('u', val='single' if self.underline else 'none')

    # Simple on/off properties, in the order they are written out
    for tag, attr in (
            ('dstrike', 'dstrike'), ('strike', 'strike'), ('caps', 'caps'),
            ('smallCaps', 'small_caps'), ('shadow', 'shadow'),
    ):
        if differs(attr):
            emit(tag, val=bmap(getattr(self, attr)))

    if differs('spacing'):
        emit('spacing', val=unicode_type(self.spacing or 0))

    named_alignments = {'superscript', 'subscript'}
    if is_normal:
        # The base style only ever carries a named alignment
        emit('vertAlign', val=self.vertical_align if self.vertical_align in named_alignments else 'baseline')
    elif self.vertical_align != normal_style.vertical_align:
        if self.vertical_align in named_alignments | {'baseline'}:
            emit('vertAlign', val=self.vertical_align)
        else:
            # Anything that is not a named alignment is written as a position
            emit('position', val=self.vertical_align)

    bdr = self.serialize_borders(make(rPr, 'bdr'), normal_style)
    if bdr.attrib:
        rPr.append(bdr)
class DescendantTextStyle(object):
    '''
    A character style that records only the run properties by which
    ``child_style`` differs from ``parent_style``.

    The differences are stored in ``self.properties`` as a tuple of
    ``(element name, frozenset of attribute items)`` pairs. Equality and
    hashing are based on that tuple, so instances built from different
    style pairs that produce the same differences compare equal and can be
    shared via a dict.
    '''

    def __init__(self, parent_style, child_style):
        self.id = self.name = None
        self.makeelement = child_style.makeelement

        collected = []

        def add(name, **props):
            collected.append((name, frozenset(iteritems(props))))

        def changed(attr):
            return getattr(parent_style, attr) != getattr(child_style, attr)

        if parent_style.font_family != child_style.font_family:
            add('rFonts', **{k:child_style.font_family for k in 'ascii cs eastAsia hAnsi'.split()})

        for name, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')):
            if changed(attr):
                # bold, italic are toggle properties
                val = 'on' if attr in {'bold', 'italic'} else unicode_type(getattr(child_style, attr))
                for suffix in ('', 'Cs'):
                    add(name + suffix, val=val)

        if changed('color'):
            add('color', val=child_style.color or 'auto')
        if changed('background_color'):
            add('shd', fill=child_style.background_color or 'auto')
        if changed('underline'):
            add('u', val='single' if child_style.underline else 'none')
        if changed('dstrike'):
            add('dstrike', val=bmap(child_style.dstrike))
        # The following are toggle properties, so a change is always 'on'
        for name, attr in (('strike', 'strike'), ('caps', 'caps'), ('smallCaps', 'small_caps'), ('shadow', 'shadow')):
            if changed(attr):
                add(name, val='on')
        if changed('spacing'):
            add('spacing', val=unicode_type(child_style.spacing or 0))
        if changed('vertical_align'):
            val = child_style.vertical_align
            if val in {'superscript', 'subscript', 'baseline'}:
                add('vertAlign', val=val)
            else:
                add('position', val=val)

        bdr = {}
        if changed('padding'):
            bdr['space'] = unicode_type(child_style.padding)
        if changed('border_width'):
            bdr['sz'] = unicode_type(child_style.border_width)
        if changed('border_style'):
            bdr['val'] = child_style.border_style
        if changed('border_color'):
            bdr['color'] = child_style.border_color
        if bdr:
            add('bdr', **bdr)

        self.properties = tuple(collected)
        self._hash = hash(self.properties)

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        # Objects without a ``properties`` attribute are simply "not equal"
        # rather than an AttributeError
        try:
            return self.properties == other.properties
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        try:
            return self.properties != other.properties
        except AttributeError:
            return NotImplemented

    def serialize(self, styles):
        '''
        Append a ``style`` element of type ``character`` for this style to
        ``styles`` and return it. ``self.id`` and ``self.name`` must have
        been assigned beforehand.
        '''
        makeelement = self.makeelement
        style = makeelement(styles, 'style', styleId=self.id, type='character')
        style.append(makeelement(style, 'name', val=self.name))
        rpr = makeelement(style, 'rPr')
        style.append(rpr)
        for name, attrs in self.properties:
            rpr.append(makeelement(style, name, **dict(attrs)))
        styles.append(style)
        return style
def read_css_block_borders(self, css, store_css_style=False):
    '''
    Set the per-edge padding, margin and border attributes on ``self``
    from ``css``.

    For every edge in ``border_edges`` the attributes ``padding_<edge>``,
    ``margin_<edge>``, ``css_margin_<edge>``, ``border_<edge>_width``,
    ``border_<edge>_color`` and ``border_<edge>_style`` are assigned. If
    ``store_css_style`` is true the lower-cased CSS border style is also
    kept as ``border_<edge>_css_style``. When ``css`` is None fixed
    defaults are used.
    '''
    for edge in border_edges:
        width_attr = 'border_%s_width' % edge
        color_attr = 'border_%s_color' % edge
        style_attr = 'border_%s_style' % edge
        css_style_attr = 'border_%s_css_style' % edge

        if css is None:
            setattr(self, 'padding_' + edge, 0)
            setattr(self, 'margin_' + edge, 0)
            setattr(self, 'css_margin_' + edge, '')
            setattr(self, width_attr, 2)
            setattr(self, color_attr, None)
            setattr(self, style_attr, 'none')
            if store_css_style:
                setattr(self, css_style_attr, 'none')
            continue

        # In DOCX padding can only be a positive integer
        try:
            padding = max(0, int(css['padding-' + edge]))
        except ValueError:
            padding = 0  # invalid value for padding
        setattr(self, 'padding_' + edge, padding)

        # In DOCX margin must be a positive integer in twips (twentieth of a point)
        try:
            margin = max(0, int(css['margin-' + edge] * 20))
        except ValueError:
            margin = 0  # for e.g.: margin: auto
        setattr(self, 'margin_' + edge, margin)
        setattr(self, 'css_margin_' + edge, css._style.get('margin-' + edge, ''))

        width = css['border-%s-width' % edge]
        if not isinstance(width, numbers.Number):
            # Keyword widths; anything unrecognized counts as zero
            width = {'thin':0.2, 'medium':1, 'thick':2}.get(width, 0)
        setattr(self, width_attr, min(96, max(2, int(width * 8))))

        setattr(self, color_attr, convert_color(css['border-%s-color' % edge]) or 'auto')

        css_border_style = css['border-%s-style' % edge].lower()
        setattr(self, style_attr, LINE_STYLES.get(css_border_style, 'none'))
        if store_css_style:
            setattr(self, css_style_attr, css_border_style)
class BlockStyle(DOCXStyle):
    '''
    Paragraph level style: alignment, indents, spacing, background and the
    four borders, read from a CSS style object (or defaults when the CSS
    object is None).
    '''

    ALL_PROPS = tuple(
        ['text_align', 'css_text_indent', 'text_indent', 'line_height', 'background_color'] +
        ['margin_' + edge for edge in border_edges] +
        ['css_margin_' + edge for edge in border_edges] +
        [x%edge for edge in border_edges for x in border_props]
    )

    def __init__(self, namespace, css, html_block, is_table_cell=False, parent_bg=None):
        read_css_block_borders(self, css)
        if is_table_cell:
            # Blocks in table cells carry no borders, padding or margins
            for edge in border_edges:
                setattr(self, 'border_%s_style' % edge, 'none')
                setattr(self, 'border_%s_width' % edge, 0)
                setattr(self, 'padding_' + edge, 0)
                setattr(self, 'margin_' + edge, 0)

        if css is None:
            self.text_indent = 0
            self.css_text_indent = None
            self.line_height = 280
            self.background_color = None
            self.text_align = 'left'
        else:
            try:
                indent, css_indent = int(css['text-indent'] * 20), css._get('text-indent')
            except (TypeError, ValueError):
                indent, css_indent = 0, None
            self.text_indent, self.css_text_indent = indent, css_indent

            try:
                line_height = max(0, int(css.lineHeight * 20))
            except (TypeError, ValueError):
                line_height = max(0, int(1.2 * css.fontSize * 20))
            self.line_height = line_height

            if is_table_cell:
                self.background_color = None
            else:
                own_bg = convert_color(css['background-color'])
                self.background_color = parent_bg if own_bg is None else own_bg

            try:
                preserve_whitespace = css['white-space'].lower() in {'pre', 'pre-wrap'}
            except Exception:
                preserve_whitespace = False

            alignments = {
                'start':'left', 'left':'left', 'end':'right', 'right':'right',
                'center':'center', 'justify':'both', 'centre':'center'}
            try:
                aval = css['text-align'].lower()
                if preserve_whitespace:
                    aval = 'start'
                self.text_align = alignments.get(aval, 'left')
            except AttributeError:
                self.text_align = 'left'

        DOCXStyle.__init__(self, namespace)

    def serialize_borders(self, bdr, normal_style):
        '''
        Add one child per edge to ``bdr`` for every edge that has something
        to say relative to ``normal_style``. Returns ``bdr``.
        '''
        w = self.w
        is_normal = self is normal_style
        for edge in border_edges:
            e = bdr.makeelement(w(edge))

            padding = getattr(self, 'padding_' + edge)
            if (is_normal and padding > 0) or (padding != getattr(normal_style, 'padding_' + edge)):
                e.set(w('space'), unicode_type(padding))

            width = getattr(self, 'border_%s_width' % edge)
            bstyle = getattr(self, 'border_%s_style' % edge)
            visible_on_normal = is_normal and width > 0 and bstyle != 'none'
            if visible_on_normal or width != getattr(
                    normal_style, 'border_%s_width' % edge) or bstyle != getattr(
                    normal_style, 'border_%s_style' % edge):
                e.set(w('val'), bstyle)
                e.set(w('sz'), unicode_type(width))
                e.set(w('color'), getattr(self, 'border_%s_color' % edge))

            if e.attrib:
                bdr.append(e)
        return bdr

    def serialize(self, styles, normal_style):
        '''
        Serialize this block style; the ``pPr`` child is attached only if
        it has at least one child. Returns the style element.
        '''
        root = DOCXStyle.serialize(self, styles, normal_style)
        para_props = self.makeelement(root, 'pPr')
        self.serialize_properties(para_props, normal_style)
        if len(para_props):
            root.append(para_props)
        return root

    def serialize_properties(self, pPr, normal_style):
        '''
        Append spacing, indent, shading, border, justification and
        next-style elements to ``pPr``, emitting only what is needed
        relative to ``normal_style``.
        '''
        makeelement, w = self.makeelement, self.w
        is_normal = self is normal_style

        # Vertical margins become paragraph spacing
        spacing = makeelement(pPr, 'spacing')
        for edge, attr in iteritems({'top':'before', 'bottom':'after'}):
            css_margin = getattr(self, 'css_margin_' + edge)
            css_val, css_unit = parse_css_length(css_margin)
            if css_unit in ('em', 'ex'):
                lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
                if (is_normal and lines > 0) or css_margin != getattr(normal_style, 'css_margin_' + edge):
                    spacing.set(w(attr + 'Lines'), unicode_type(lines))
            else:
                val = getattr(self, 'margin_' + edge)
                if (is_normal and val > 0) or val != getattr(normal_style, 'margin_' + edge):
                    spacing.set(w(attr), unicode_type(val))
        if is_normal or self.line_height != normal_style.line_height:
            spacing.set(w('line'), unicode_type(self.line_height))
            spacing.set(w('lineRule'), 'atLeast')
        if spacing.attrib:
            pPr.append(spacing)

        # Horizontal margins and text-indent become paragraph indents
        ind = makeelement(pPr, 'ind')
        for edge in ('left', 'right'):
            css_margin = getattr(self, 'css_margin_' + edge)
            css_val, css_unit = parse_css_length(css_margin)
            if css_unit in ('em', 'ex'):
                chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
                if (is_normal and chars > 0) or css_margin != getattr(normal_style, 'css_margin_' + edge):
                    ind.set(w(edge + 'Chars'), unicode_type(chars))
            else:
                val = getattr(self, 'margin_' + edge)
                if (is_normal and val > 0) or val != getattr(normal_style, 'margin_' + edge):
                    ind.set(w(edge), unicode_type(val))
                    ind.set(w(edge + 'Chars'), '0')  # This is needed to override any declaration in the parent style

        css_val, css_unit = parse_css_length(self.css_text_indent)
        if css_unit in ('em', 'ex'):
            chars = int(css_val * (50 if css_unit == 'ex' else 100))
            css_indent_changed = self.css_text_indent != normal_style.css_text_indent
            if css_val >= 0:
                if (is_normal and chars > 0) or css_indent_changed:
                    ind.set(w('firstLineChars'), unicode_type(chars))
            else:
                if (is_normal and chars < 0) or css_indent_changed:
                    ind.set(w('hangingChars'), unicode_type(abs(chars)))
        else:
            val = self.text_indent
            indent_changed = self.text_indent != normal_style.text_indent
            if val >= 0:
                if (is_normal and val > 0) or indent_changed:
                    ind.set(w('firstLine'), unicode_type(val))
                    ind.set(w('firstLineChars'), '0')  # This is needed to override any declaration in the parent style
            else:
                if (is_normal and val < 0) or indent_changed:
                    ind.set(w('hanging'), unicode_type(abs(val)))
                    ind.set(w('hangingChars'), '0')
        if ind.attrib:
            pPr.append(ind)

        if (is_normal and self.background_color) or self.background_color != normal_style.background_color:
            pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))

        pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
        if len(pbdr):
            pPr.append(pbdr)

        if is_normal or self.text_align != normal_style.text_align:
            pPr.append(makeelement(pPr, 'jc', val=self.text_align))

        if not is_normal and self.next_style is not None:
            pPr.append(makeelement(pPr, 'next', val=self.next_style))
class StylesManager(object):
    '''
    Creates and interns the text and block styles used by a document,
    assigns them ids/names once all blocks are known (``finalize``) and
    writes them out (``serialize``).
    '''

    def __init__(self, namespace, log, document_lang):
        self.namespace = namespace
        self.document_lang = lang_as_iso639_1(document_lang) or 'en'
        self.log = log
        self.block_styles, self.text_styles = {}, {}
        self.styles_for_html_blocks = {}

    def create_text_style(self, css_style, is_parent_style=False):
        '''Return the shared TextStyle equal to the one described by ``css_style``.'''
        candidate = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
        return self.text_styles.setdefault(candidate, candidate)

    def create_block_style(self, css_style, html_block, is_table_cell=False, parent_bg=None):
        '''
        Return the shared BlockStyle equal to the one described by the
        arguments and remember it as the style of ``html_block``.
        '''
        candidate = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
        shared = self.block_styles.setdefault(candidate, candidate)
        self.styles_for_html_blocks[html_block] = shared
        return shared

    def finalize(self, all_blocks):
        '''
        Decide on the final set of styles for ``all_blocks`` and give each
        an id, a name and a sequence number.

        ``all_blocks`` is iterated more than once. Sets
        ``pure_block_styles``, ``combined_styles``,
        ``descendant_text_styles`` and ``primary_heading_style`` (and
        ``normal_style`` / ``normal_pure_block_style`` when there is at
        least one style of the respective kind).
        '''
        block_counts = Counter()
        used_pairs = defaultdict(list)
        heading_pairs = defaultdict(list)
        heading_tags = frozenset('h1 h2 h3 h4 h5 h6'.split())
        pure_block_styles = set()

        for block in all_blocks:
            bs = block.style
            block_counts[bs] += 1
            local_run_counts = Counter()
            for run in block.runs:
                local_run_counts[run.style] += run.style_weight
            if local_run_counts:
                # Pair the block style with the dominant run style of the block
                rs = local_run_counts.most_common(1)[0][0]
                used_pairs[(bs, rs)].append(block)
                if block.html_tag in heading_tags:
                    heading_pairs[block.html_tag].append((bs, rs))
            else:
                pure_block_styles.add(bs)

        self.pure_block_styles = sorted(pure_block_styles, key=block_counts.__getitem__)
        block_name_template = '%0{}d Block'.format(len(unicode_type(max(1, len(pure_block_styles) - 1))))
        for i, bs in enumerate(self.pure_block_styles):
            bs.id = bs.name = block_name_template % i
            bs.seq = i
        if self.pure_block_styles:
            self.normal_pure_block_style = self.pure_block_styles[0]

        counts = Counter()
        smap = {}
        for (bs, rs), blocks in iteritems(used_pairs):
            combined = CombinedStyle(bs, rs, blocks, self.namespace)
            smap[(bs, rs)] = combined
            counts[combined] += sum(1 for b in blocks if not b.is_empty())

        # The most used not yet claimed style of every heading tag becomes
        # the style for that outline level
        for level, heading_tag in enumerate(sorted(heading_pairs)):
            by_usage = sorted((smap[k] for k in heading_pairs[heading_tag]), key=counts.__getitem__)
            unclaimed = [s for s in by_usage if s.outline_level is None]
            if unclaimed:
                unclaimed[-1].outline_level = level

        para_name_template = 'Para %0{}d'.format(len(unicode_type(max(1, len(counts) - 1))))
        heading_styles = []
        for i, (style, count) in enumerate(counts.most_common()):
            if i == 0:
                self.normal_style = style
                label = 'Normal'
            elif style.outline_level is None:
                label = para_name_template % i
            else:
                label = 'Heading %d' % (style.outline_level + 1)
                heading_styles.append(style)
            style.id = style.name = label
            style.seq = i
        self.combined_styles = sorted(counts, key=attrgetter('seq'))
        for combined in self.combined_styles:
            combined.apply()

        descendant_style_map = {}
        ds_counts = Counter()
        for block in all_blocks:
            for run in block.runs:
                if run.parent_style is not run.style and run.parent_style and run.style:
                    ds = DescendantTextStyle(run.parent_style, run.style)
                    if ds.properties:
                        run.descendant_style = descendant_style_map.setdefault(ds, ds)
                        ds_counts[run.descendant_style] += run.style_weight

        text_name_template = '%0{}d Text'.format(len(unicode_type(max(1, len(ds_counts) - 1))))
        for i, (text_style, count) in enumerate(ds_counts.most_common()):
            text_style.id = 'Text%d' % i
            text_style.name = text_name_template % i
            text_style.seq = i
        self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq'))

        self.log.debug('%d Text Styles %d Combined styles' % (
            len(self.descendant_text_styles), len(self.combined_styles)))

        self.primary_heading_style = None
        if heading_styles:
            self.primary_heading_style = min(heading_styles, key=attrgetter('outline_level'))
        else:
            # No headings: fall back to the combined style with the largest font
            largest = 0
            for s in self.combined_styles:
                if s.rs.font_size > largest:
                    self.primary_heading_style = s
                    largest = s.rs.font_size

    def serialize(self, styles):
        '''
        Set every attribute of the first ``lang`` element under ``styles``
        to the document language, then serialize all styles into ``styles``.
        '''
        lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
        for key in tuple(lang.attrib):
            lang.attrib[key] = self.document_lang
        for combined in self.combined_styles:
            combined.serialize(styles, self.normal_style)
        for text_style in self.descendant_text_styles:
            text_style.serialize(styles)
        for block_style in sorted(self.pure_block_styles, key=attrgetter('seq')):
            block_style.serialize(styles, self.normal_pure_block_style)

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple
from calibre.ebooks.docx.writer.utils import convert_color
from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges
from polyglot.builtins import iteritems, range, unicode_type
class Dummy(object):
    '''Empty class whose instances are used as plain attribute holders.'''
# A single border: the CSS border style, the DOCX line style, width, color
# and the level (BLEVEL) of the object that declared it
Border = namedtuple('Border', ['css_style', 'style', 'width', 'color', 'level'])
# Weight of each CSS border style, the first style listed weighs the most
border_style_weight = dict(zip(
    ('double', 'solid', 'dashed', 'dotted', 'ridge', 'outset', 'groove', 'inset'),
    range(100, 92, -1)))
class SpannedCell(object):
    '''
    Placeholder for a grid position that is covered by another cell
    (``spanning_cell``) via a column span (``horizontal=True``) or a row
    span (``horizontal=False``).
    '''

    def __init__(self, spanning_cell, horizontal=True):
        self.spanning_cell, self.horizontal = spanning_cell, horizontal
        self.row_span = self.col_span = 1

    def resolve_borders(self):
        '''Nothing to resolve for a placeholder.'''

    def serialize(self, tr, makeelement):
        '''Write an empty ``w:tc`` that continues the merge started by the spanning cell.'''
        tc = makeelement(tr, 'w:tc')
        tcPr = makeelement(tc, 'w:tcPr')
        axis = 'h' if self.horizontal else 'v'
        makeelement(tcPr, 'w:%sMerge' % axis, w_val='continue')
        makeelement(tc, 'w:p')

    def applicable_borders(self, edge):
        '''The borders of a placeholder are those of the cell that spans it.'''
        return self.spanning_cell.applicable_borders(edge)
def read_css_block_borders(self, css):
    '''
    Read the border and padding declarations of ``css`` and store them on
    ``self``: one ``Border`` tuple per edge as ``border_<edge>`` (tagged
    with ``self.BLEVEL``) plus ``padding_<edge>``.
    '''
    parsed = Dummy()
    rcbb(parsed, css, store_css_style=True)
    for edge in border_edges:
        fields = [
            getattr(parsed, 'border_%s_%s' % (edge, part))
            for part in ('css_style', 'style', 'width', 'color')]
        fields.append(self.BLEVEL)
        setattr(self, 'border_' + edge, Border(*fields))
        setattr(self, 'padding_' + edge, getattr(parsed, 'padding_' + edge))
def as_percent(x):
    '''
    Return the number in front of the trailing ``%`` of ``x`` as a float,
    or None if ``x`` is empty, does not end in ``%`` or is not a number.
    '''
    if not x or not x.endswith('%'):
        return None
    try:
        return float(x.rstrip('%'))
    except Exception:
        return None
def convert_width(tag_style):
    '''
    Translate the CSS width of ``tag_style`` into a ``(type, value)`` pair:
    ``('auto', 0)``, ``('pct', percentage * 50)`` or
    ``('dxa', width * 20)``. Anything that cannot be converted, or a
    ``tag_style`` of None, yields ``('auto', 0)``.
    '''
    automatic = ('auto', 0)
    if tag_style is None:
        return automatic
    declared = tag_style._get('width')
    percent = as_percent(declared)
    if declared == 'auto':
        return automatic
    if percent is not None:
        return ('pct', int(percent * 50))
    try:
        return ('dxa', int(float(tag_style['width']) * 20))
    except Exception:
        return automatic
class Cell(object):
    '''
    A table cell: holds the blocks and nested tables placed in it, its
    spans, width, background, padding and borders, and knows how to write
    itself out as a ``w:tc`` element.
    '''

    BLEVEL = 2

    def __init__(self, row, html_tag, tag_style=None):
        self.row = row
        self.table = self.row.table
        self.html_tag = html_tag
        # Unparseable span values fall back to a span of one
        try:
            self.row_span = max(0, int(html_tag.get('rowspan', 1)))
        except Exception:
            self.row_span = 1
        try:
            self.col_span = max(0, int(html_tag.get('colspan', 1)))
        except Exception:
            self.col_span = 1
        if tag_style is None:
            self.valign = 'center'
        else:
            valign_map = {'top':'top', 'bottom':'bottom', 'middle':'center'}
            self.valign = valign_map.get(tag_style._get('vertical-align'))
        self.items = []
        self.width = convert_width(tag_style)
        self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
        read_css_block_borders(self, tag_style)

    def add_block(self, block):
        self.items.append(block)
        block.parent_items = self.items

    def add_table(self, table):
        self.items.append(table)
        return table

    def serialize(self, parent, makeelement):
        '''Write this cell, its properties and its contents under ``parent``.'''
        tc = makeelement(parent, 'w:tc')
        tcPr = makeelement(tc, 'w:tcPr')
        width_type, width_value = self.width[0], self.width[1]
        makeelement(tcPr, 'w:tcW', w_type=width_type, w_w=unicode_type(width_value))

        # For some reason, Word 2007 refuses to honor <w:shd> at the table or row
        # level, despite what the specs say, so we inherit and apply at the
        # cell level
        fill = self.background_color or self.row.background_color or self.row.table.background_color
        if fill:
            makeelement(tcPr, 'w:shd', w_val="clear", w_color="auto", w_fill=fill)

        borders_elem = makeelement(tcPr, 'w:tcBorders', append=False)
        for edge, border in iteritems(self.borders):
            if border is None:
                continue
            if border.width > 0 and border.style != 'none':
                makeelement(borders_elem, 'w:' + edge, w_val=border.style, w_sz=unicode_type(border.width), w_color=border.color)
        if len(borders_elem) > 0:
            tcPr.append(borders_elem)

        margins_elem = makeelement(tcPr, 'w:tcMar', append=False)
        for edge in border_edges:
            padding = getattr(self, 'padding_' + edge)
            # Row padding is added on the edges where this cell touches the row boundary
            touches_row_edge = (
                edge in {'top', 'bottom'} or
                (edge == 'left' and self is self.row.first_cell) or
                (edge == 'right' and self is self.row.last_cell))
            if touches_row_edge:
                padding += getattr(self.row, 'padding_' + edge)
            if padding > 0:
                makeelement(margins_elem, 'w:' + edge, w_type='dxa', w_w=unicode_type(int(padding * 20)))
        if len(margins_elem) > 0:
            tcPr.append(margins_elem)

        if self.valign is not None:
            makeelement(tcPr, 'w:vAlign', w_val=self.valign)

        if self.row_span > 1:
            makeelement(tcPr, 'w:vMerge', w_val='restart')
        if self.col_span > 1:
            makeelement(tcPr, 'w:hMerge', w_val='restart')

        last_item = None
        for last_item in self.items:
            last_item.serialize(tc)
        if last_item is None or isinstance(last_item, Table):
            # Word 2007 requires the last element in a table cell to be a paragraph
            makeelement(tc, 'w:p')

    def applicable_borders(self, edge):
        '''
        Return the set of borders declared for ``edge`` by this cell and,
        where the cell lies on the corresponding boundary, by its row
        and/or table.
        '''
        if edge == 'left':
            items = {self.table, self.row, self} if self.row.first_cell is self else {self}
        elif edge == 'top':
            items = {self, self.row}
            if self.table.first_row is self.row:
                items.add(self.table)
        elif edge == 'right':
            items = {self.table, self, self.row} if self.row.last_cell is self else {self}
        elif edge == 'bottom':
            items = {self, self.row}
            if self.table.last_row is self.row:
                items.add(self.table)
        return {getattr(x, 'border_' + edge) for x in items}

    def resolve_border(self, edge):
        # In Word cell borders override table borders, and Word ignores row
        # borders, so we consolidate all borders as cell borders
        # In HTML the priority is as described here:
        # http://www.w3.org/TR/CSS21/tables.html#border-conflict-resolution
        neighbor = self.neighbor(edge)
        borders = self.applicable_borders(edge)
        if neighbor is not None:
            opposite = {'left':'right', 'top':'bottom', 'right':'left', 'bottom':'top'}[edge]
            borders |= neighbor.applicable_borders(opposite)

        # A hidden border suppresses every other candidate
        if any(b.css_style == 'hidden' for b in borders):
            return None

        def weight(border):
            return (
                0 if border.css_style == 'none' else 1,
                border.width,
                border_style_weight.get(border.css_style, 0),
                border.level)

        return sorted(borders, key=weight)[-1]

    def resolve_borders(self):
        self.borders = {edge:self.resolve_border(edge) for edge in border_edges}

    def neighbor(self, edge):
        '''
        Return the cell adjacent to this one across ``edge`` or None. If
        the adjacent position has a ``spanning_cell`` attribute, that
        spanning cell is returned instead.
        '''
        col = self.row.cells.index(self)
        found = None
        if edge == 'left':
            if col > 0:
                found = self.row.cells[col-1]
        elif edge == 'right':
            if (col + 1) < len(self.row.cells):
                found = self.row.cells[col+1]
        elif edge in ('top', 'bottom'):
            rows = self.table.rows
            other = rows.index(self.row) + (-1 if edge == 'top' else 1)
            if 0 <= other < len(rows) and col < len(rows[other].cells):
                found = rows[other].cells[col]
        return getattr(found, 'spanning_cell', found)
class Row(object):
    '''
    A table row: collects finished cells in ``cells`` while
    ``current_cell`` is the cell still being filled.
    '''

    BLEVEL = 1

    def __init__(self, table, html_tag, tag_style=None):
        self.table = table
        self.html_tag = html_tag
        self.orig_tag_style = tag_style
        self.cells = []
        self.current_cell = None
        if tag_style is None:
            self.background_color = None
        else:
            self.background_color = convert_color(tag_style.backgroundColor)
        read_css_block_borders(self, tag_style)

    @property
    def first_cell(self):
        if self.cells:
            return self.cells[0]
        return None

    @property
    def last_cell(self):
        if self.cells:
            return self.cells[-1]
        return None

    def start_new_cell(self, html_tag, tag_style):
        self.current_cell = Cell(self, html_tag, tag_style)

    def finish_tag(self, html_tag):
        '''Close the current cell if ``html_tag`` is the tag that opened it.'''
        cell = self.current_cell
        if cell is not None and html_tag is cell.html_tag:
            self.cells.append(cell)
            self.current_cell = None

    def add_block(self, block):
        # Content outside any cell goes into a cell made from the row itself
        if self.current_cell is None:
            self.start_new_cell(self.html_tag, self.orig_tag_style)
        self.current_cell.add_block(block)

    def add_table(self, table):
        # Content outside any cell goes into a cell made from the row itself
        if self.current_cell is None:
            self.current_cell = Cell(self, self.html_tag, self.orig_tag_style)
        return self.current_cell.add_table(table)

    def serialize(self, parent, makeelement):
        '''Write a ``w:tr`` under ``parent`` containing every finished cell.'''
        tr = makeelement(parent, 'w:tr')
        for cell in self.cells:
            cell.serialize(tr, makeelement)
class Table(object):
    '''
    A table: collects rows while the HTML is walked, expands row/column
    spans into placeholder cells once the table's tag is finished and
    writes itself out as a ``w:tbl`` element.
    '''

    BLEVEL = 0

    def __init__(self, namespace, html_tag, tag_style=None):
        self.namespace = namespace
        self.html_tag = html_tag
        self.orig_tag_style = tag_style
        self.rows = []
        self.current_row = None
        self.width = convert_width(tag_style)
        self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
        self.jc = None
        self.float = None
        self.margin_left = self.margin_right = self.margin_top = self.margin_bottom = None
        if tag_style is not None:
            ml, mr = tag_style._get('margin-left'), tag_style.get('margin-right')
            if ml == 'auto':
                # auto on both sides centers, auto on the left only pushes right
                self.jc = 'center' if mr == 'auto' else 'right'
            self.float = tag_style['float']
            for edge in border_edges:
                setattr(self, 'margin_' + edge, tag_style['margin-' + edge])
        read_css_block_borders(self, tag_style)

    @property
    def first_row(self):
        return self.rows[0] if self.rows else None

    @property
    def last_row(self):
        return self.rows[-1] if self.rows else None

    def finish_tag(self, html_tag):
        '''
        Notify the table that ``html_tag`` has been closed. Closes the
        current cell/row if the tag is theirs. Returns True if the tag is
        the table's own tag, in which case spans are expanded and the
        borders of all cells are resolved.
        '''
        if self.current_row is not None:
            self.current_row.finish_tag(html_tag)
            if self.current_row.html_tag is html_tag:
                self.rows.append(self.current_row)
                self.current_row = None
        table_ended = self.html_tag is html_tag
        if table_ended:
            self.expand_spanned_cells()
            for row in self.rows:
                for cell in row.cells:
                    cell.resolve_borders()
        return table_ended

    def expand_spanned_cells(self):
        '''
        Insert SpannedCell placeholders so that every row has an entry for
        each grid position covered by a column or row span.
        '''
        # Expand horizontally
        for row in self.rows:
            for cell in tuple(row.cells):
                idx = row.cells.index(cell)
                if cell.col_span > 1 and (cell is row.cells[-1] or not isinstance(row.cells[idx+1], SpannedCell)):
                    row.cells[idx:idx+1] = [cell] + [SpannedCell(cell, horizontal=True) for i in range(1, cell.col_span)]

        # Expand vertically
        for r, row in enumerate(self.rows):
            for idx, cell in enumerate(row.cells):
                if cell.row_span > 1:
                    # Only the rows actually covered by the span get a
                    # placeholder, not every remaining row of the table
                    for nrow in self.rows[r+1:r+cell.row_span]:
                        sc = SpannedCell(cell, horizontal=False)
                        try:
                            tcell = nrow.cells[idx]
                        except Exception:
                            tcell = None
                        if tcell is None:
                            # Row is too short: pad it up to the spanned column
                            nrow.cells.extend([SpannedCell(nrow.cells[-1], horizontal=True) for i in range(idx - len(nrow.cells))])
                            nrow.cells.append(sc)
                        else:
                            if isinstance(tcell, SpannedCell):
                                # Conflict between rowspan and colspan
                                break
                            else:
                                nrow.cells.insert(idx, sc)

    def start_new_row(self, html_tag, html_style):
        if self.current_row is not None:
            self.rows.append(self.current_row)
        self.current_row = Row(self, html_tag, html_style)

    def start_new_cell(self, html_tag, html_style):
        if self.current_row is None:
            self.start_new_row(html_tag, None)
        self.current_row.start_new_cell(html_tag, html_style)

    def add_block(self, block):
        self.current_row.add_block(block)

    def add_table(self, table):
        if self.current_row is None:
            self.current_row = Row(self, self.html_tag, self.orig_tag_style)
        return self.current_row.add_table(table)

    def serialize(self, parent):
        '''
        Write the table under ``parent``. Rows without cells are skipped
        and nothing at all is written if no row has cells.
        '''
        makeelement = self.namespace.makeelement
        rows = [r for r in self.rows if r.cells]
        if not rows:
            return
        tbl = makeelement(parent, 'w:tbl')
        tblPr = makeelement(tbl, 'w:tblPr')
        makeelement(tblPr, 'w:tblW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
        if self.float in {'left', 'right'}:
            kw = {'w_vertAnchor':'text', 'w_horzAnchor':'text', 'w_tblpXSpec':self.float}
            for edge in border_edges:
                val = getattr(self, 'margin_' + edge) or 0
                # Minimum distance of 2 on the edge opposite the float side
                if {self.float, edge} == {'left', 'right'}:
                    val = max(val, 2)
                kw['w_' + edge + 'FromText'] = unicode_type(max(0, int(val *20)))
            makeelement(tblPr, 'w:tblpPr', **kw)
        if self.jc is not None:
            makeelement(tblPr, 'w:jc', w_val=self.jc)
        for row in rows:
            row.serialize(tbl, makeelement)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.color3 import parse_color_string
def int_or_zero(raw):
    '''Return ``int(raw)``, or 0 when ``raw`` cannot be converted.'''
    try:
        result = int(raw)
    except (ValueError, TypeError, AttributeError):
        result = 0
    return result
# convert_color() {{{
def convert_color(value):
    '''
    Convert a CSS color string into an upper-case ``RRGGBB`` hex string.

    Returns ``'auto'`` for ``currentColor`` (any case) and None for an
    empty value, a string that cannot be parsed as a color, or a color
    whose alpha is below 0.01.
    '''
    if not value:
        return None
    if value.lower() == 'currentcolor':
        return 'auto'
    parsed = parse_color_string(value)
    if parsed is None or parsed.alpha < 0.01:
        return None
    channels = (parsed.red, parsed.green, parsed.blue)
    return '%02X%02X%02X' % tuple(int(channel * 255) for channel in channels)
def test_convert_color(return_tests=False):
    '''
    Build the unit tests for convert_color(). With ``return_tests`` the
    loaded test suite is returned, otherwise it is run with a text runner.
    '''
    import unittest

    # (expected result, input) pairs
    cases = (
        (None, None),
        (None, 'transparent'),
        (None, 'none'),
        (None, '#12j456'),
        ('auto', 'currentColor'),
        ('F0F8FF', 'AliceBlue'),
        ('000000', 'black'),
        ('FF0000', 'red'),
        ('00FF00', 'lime'),
        ('000011', '#001'),
        ('12345D', '#12345d'),
        ('FFFFFF', 'rgb(255, 255, 255)'),
        ('FF0000', 'rgba(255, 0, 0, 23)'),
    )

    class TestColors(unittest.TestCase):

        def test_color_conversion(self):
            for expected, raw in cases:
                self.assertEqual(expected, convert_color(raw))

    tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestColors)
    if return_tests:
        return tests
    unittest.TextTestRunner(verbosity=4).run(tests)
# }}}

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import defaultdict
from calibre.ebooks.oeb.base import urlnormalize, css_text
from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
from polyglot.builtins import iteritems, itervalues, unicode_type, range
from tinycss.fonts3 import parse_font_family
def get_font_properties(rule, default=None):
    '''
    Given a CSS rule, extract normalized font properties from
    it. Note that shorthand font property should already have been expanded
    by the CSS flattening code.

    Returns a dict with the keys ``font-family``, ``src``, ``font-weight``,
    ``font-stretch`` and ``font-style``. Missing or unreadable
    ``font-family``/``src`` become None, the other properties fall back to
    ``default`` when missing, unrecognized or ``inherit``.
    '''
    keyword_props = {'font-weight', 'font-stretch', 'font-style'}
    valid_weights = {
        '100', '200', '300', '400', '500', '600', '700', '800', '900',
        'bolder', 'lighter'}
    valid_styles = {'normal', 'italic', 'oblique'}
    valid_stretches = {
        'normal', 'ultra-condensed', 'extra-condensed', 'condensed',
        'semi-condensed', 'semi-expanded', 'expanded', 'extra-expanded',
        'ultra-expanded'}

    props = {}
    style = rule.style
    for name in ('font-family', 'src', 'font-weight', 'font-stretch', 'font-style'):
        accessor = 'uri' if name == 'src' else 'value'
        try:
            val = getattr(style.getProperty(name).propertyValue[0], accessor)
            if name == 'font-family':
                val = parse_font_family(css_text(style.getProperty(name).propertyValue))
                if val and val[0] == 'inherit':
                    val = None
        except (IndexError, KeyError, AttributeError, TypeError, ValueError):
            val = None if name in {'src', 'font-family'} else default

        if name in keyword_props:
            if val or val == 0:
                val = unicode_type(val).lower()
            if val == 'inherit':
                val = default

        if name == 'font-weight':
            val = {'normal':'400', 'bold':'700'}.get(val, val)
            if val not in valid_weights:
                val = default
            # The default itself may be the keyword
            if val == 'normal':
                val = '400'
        elif name == 'font-style':
            if val not in valid_styles:
                val = default
        elif name == 'font-stretch':
            if val not in valid_stretches:
                val = default
        props[name] = val
    return props
def find_font_face_rules(sheet, oeb):
    '''
    Find all @font-face rules in the given sheet and extract the relevant info from them.
    sheet can be either a ManifestItem or a CSSStyleSheet.

    Returns a list of property dicts (see get_font_properties()) extended
    with the keys ``item``, ``weight``, ``rule`` and ``chars``. Rules
    without a font family or src, or whose src is not in the manifest of
    ``oeb``, are left out.
    '''
    try:
        rules = sheet.data.cssRules
    except AttributeError:
        rules = sheet.cssRules

    found = []
    for rule in rules:
        if rule.type != rule.FONT_FACE_RULE:
            continue
        props = get_font_properties(rule, default='normal')
        if not (props['font-family'] and props['src']):
            continue

        try:
            path = sheet.abshref(props['src'])
        except AttributeError:
            path = props['src']
        item = oeb.manifest.hrefs.get(urlnormalize(path), None)
        if not item:
            continue

        props['item'] = item
        # Relative weights are treated as the regular weight
        if props['font-weight'] in {'bolder', 'lighter'}:
            props['font-weight'] = '400'
        props['weight'] = int(props['font-weight'])
        props['rule'] = rule
        props['chars'] = set()
        found.append(props)
    return found
def elem_style(style_rules, cls, inherited_style):
    '''
    Compute the effective font style of an element.

    ``style_rules`` maps class names to font property dicts, ``cls`` is the
    value of the element's class attribute and ``inherited_style`` is the
    style of the parent. The result is a new dict: the inherited style
    overlaid with the properties of every class, in order, with a relative
    font-weight ('bolder'/'lighter') resolved against the inherited weight.
    '''
    effective = inherited_style.copy()
    for class_name in cls.split():
        effective.update(style_rules.get(class_name, {}))

    requested = effective.get('font-weight', None)
    if requested == 'bolder':
        steps, limit = {
            '100': '400', '200': '400', '300': '400',
            '400': '700', '500': '700',
        }, '900'
    elif requested == 'lighter':
        steps, limit = {
            '600': '400', '700': '400',
            '800': '700', '900': '700',
        }, '100'
    else:
        # Absolute (or missing) weight: nothing to resolve
        return effective

    parent_weight = inherited_style.get('font-weight', '400')
    effective['font-weight'] = steps.get(parent_weight, limit)
    return effective
class SubsetFonts(object):

    '''
    Subset all embedded fonts. Must be run after CSS flattening, as it requires
    CSS normalization and flattening to work.
    '''

    def __call__(self, oeb, log, opts):
        '''
        Subset every embedded font of ``oeb`` to the characters that use it.

        Fonts that are not used at all, or for which no glyphs remain, are
        removed from the manifest together with their @font-face rule. Fonts
        that cannot be subset are left untouched.
        '''
        self.oeb, self.log, self.opts = oeb, log, opts

        self.find_embedded_fonts()
        if not self.embedded_fonts:
            self.log.debug('No embedded fonts found')
            return
        self.find_style_rules()
        self.find_font_usage()

        # totals[0] is the accumulated new size, totals[1] the original size
        totals = [0, 0]

        def remove(font):
            totals[1] += len(font['item'].data)
            self.oeb.manifest.remove(font['item'])
            font['rule'].parentStyleSheet.deleteRule(font['rule'])

        # Several @font-face rules can point at the same file; merge the
        # characters they need so that each file is subset only once.
        fonts = {}
        for font in self.embedded_fonts:
            item, chars = font['item'], font['chars']
            if item.href in fonts:
                fonts[item.href]['chars'] |= chars
            else:
                fonts[item.href] = font

        for font in itervalues(fonts):
            if not font['chars']:
                self.log('The font %s is unused. Removing it.'%font['src'])
                remove(font)
                continue
            try:
                raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
            except NoGlyphs:
                self.log('The font %s has no used glyphs. Removing it.'%font['src'])
                remove(font)
                continue
            except UnsupportedFont as e:
                self.log.warn('The font %s is unsupported for subsetting. %s'%(
                    font['src'], e))
                # Size is unchanged, count it on both sides
                sz = len(font['item'].data)
                totals[0] += sz
                totals[1] += sz
            else:
                font['item'].data = raw
                nlen = sum(itervalues(new_stats))
                olen = sum(itervalues(old_stats))
                self.log('Decreased the font %s to %.1f%% of its original size'%
                        (font['src'], nlen/olen *100))
                totals[0] += nlen
                totals[1] += olen

            font['item'].unload_data_from_memory()

        if totals[0]:
            self.log('Reduced total font size to %.1f%% of original'%
                    (totals[0]/totals[1] * 100))

    def find_embedded_fonts(self):
        '''
        Find all @font-face rules and extract the relevant info from them.
        The result is stored in self.embedded_fonts.
        '''
        self.embedded_fonts = []
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'cssRules'):
                continue
            self.embedded_fonts.extend(find_font_face_rules(item, self.oeb))

    def find_style_rules(self):
        '''
        Extract all font related style information from all stylesheets into a
        dict mapping classes to font properties specified by that class. All
        the heavy lifting has already been done by the CSS flattening code.
        The result is stored in self.style_rules.
        '''
        rules = defaultdict(dict)
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.STYLE_RULE:
                    continue
                props = {k:v for k,v in
                        iteritems(get_font_properties(rule)) if v}
                if not props:
                    continue
                for sel in rule.selectorList:
                    sel = sel.selectorText
                    if sel and sel.startswith('.'):
                        # We dont care about pseudo-selectors as the worst that
                        # can happen is some extra characters will remain in
                        # the font
                        sel = sel.partition(':')[0]
                        rules[sel[1:]].update(props)

        self.style_rules = dict(rules)

    def find_font_usage(self):
        '''
        Walk the body of every document in the manifest and record, on each
        embedded font, the characters that are rendered with it.
        '''
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'xpath'):
                continue
            for body in item.data.xpath('//*[local-name()="body"]'):
                base = {'font-family':['serif'], 'font-weight': '400',
                        'font-style':'normal', 'font-stretch':'normal'}
                self.find_usage_in(body, base)

    def used_font(self, style):
        '''
        Given a style find the embedded font that matches it. Returns None if
        no match is found (can happen if no family matches).

        Candidates are narrowed down by family, then stretch, then style and
        finally weight, preferring the nearest available value at each step.
        '''
        ff = style.get('font-family', [])
        lnames = {unicode_type(x).lower() for x in ff}
        matching_set = []

        # Filter on font-family
        for ef in self.embedded_fonts:
            flnames = {x.lower() for x in ef.get('font-family', [])}
            if not lnames.intersection(flnames):
                continue
            matching_set.append(ef)
        if not matching_set:
            return None

        # Filter on font-stretch
        widths = {x:i for i, x in enumerate(('ultra-condensed',
                'extra-condensed', 'condensed', 'semi-condensed', 'normal',
                'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
                ))}
        normal_width = widths['normal']
        width = widths.get(style.get('font-stretch', 'normal'), normal_width)
        for f in matching_set:
            # The width of a candidate comes from its own @font-face
            # declaration, not from the style being matched.
            f['width'] = widths.get(f.get('font-stretch', 'normal'), normal_width)

        min_dist = min(abs(width-f['width']) for f in matching_set)
        nearest = [f for f in matching_set if abs(width-f['width']) ==
            min_dist]
        # Among equally distant candidates prefer narrower faces for
        # normal/condensed requests and wider faces for expanded requests
        if width <= normal_width:
            lmatches = [f for f in nearest if f['width'] <= width]
        else:
            lmatches = [f for f in nearest if f['width'] >= width]
        matching_set = (lmatches or nearest)

        # Filter on font-style
        fs = style.get('font-style', 'normal')
        order = {
                'oblique':['oblique', 'italic', 'normal'],
                'normal':['normal', 'oblique', 'italic']
            }.get(fs, ['italic', 'oblique', 'normal'])
        for q in order:
            matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
            if matches:
                matching_set = matches
                break

        # Filter on font weight
        fw = int(style.get('font-weight', '400'))
        if fw == 400:
            q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
        elif fw == 500:
            q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
        elif fw < 400:
            # Lighter weights first (descending), then heavier (ascending)
            q = [fw] + list(range(fw-100, 0, -100)) + list(range(fw+100,
                1000, 100))
        else:
            # Heavier weights first (ascending), then lighter (descending)
            q = [fw] + list(range(fw+100, 1000, 100)) + list(range(fw-100,
                0, -100))
        for wt in q:
            matches = [f for f in matching_set if f['weight'] == wt]
            if matches:
                return matches[0]
        return None

    def find_chars(self, elem):
        '''
        Return the set of characters directly contained in ``elem``: its own
        text plus the tail text of each of its children.
        '''
        ans = set()
        if elem.text:
            ans |= set(elem.text)
        for child in elem:
            if child.tail:
                ans |= set(child.tail)
        return ans

    def find_usage_in(self, elem, inherited_style):
        '''
        Recursively record the characters used by ``elem`` and its
        descendants on the embedded fonts that match their effective styles.
        '''
        style = elem_style(self.style_rules, elem.get('class', '') or '', inherited_style)
        for child in elem:
            self.find_usage_in(child, style)
        font = self.used_font(style)
        if font:
            chars = self.find_chars(elem)
            if chars:
                font['chars'] |= chars

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs, zlib, numbers
from io import BytesIO
from datetime import datetime
from calibre.constants import plugins, ispy3
from calibre.utils.logging import default_log
from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr
from polyglot.binary import as_hex_bytes
# Float formatter taken from the first entry of the 'speedup' plugin; used
# below to turn Python floats into their PDF text representation
pdf_float = plugins['speedup'][0].pdf_float
# Line terminator written by the serializers in this module
EOL = b'\n'
# Sizes {{{
# All lengths are expressed in units of 1/72 inch (inch == 72.0)
inch = 72.0
cm = inch / 2.54
mm = cm * 0.1
pica = 12.0
didot = 0.375 * mm
cicero = 12 * didot

# Paper sizes are (width, height) tuples in portrait orientation. Each ISO
# size is derived from the 4-size by repeatedly doubling/halving the long
# side, which swaps the roles of width and height at every step.
_W, _H = (21*cm, 29.7*cm)

A6 = (_W*.5, _H*.5)
A5 = (_H*.5, _W)
A4 = (_W, _H)
A3 = (_H, _W*2)
A2 = (_W*2, _H*2)
A1 = (_H*2, _W*4)
A0 = (_W*4, _H*4)

LETTER = (8.5*inch, 11*inch)
LEGAL = (8.5*inch, 14*inch)
ELEVENSEVENTEEN = (11*inch, 17*inch)

_BW, _BH = (25*cm, 35.3*cm)
B6 = (_BW*.5, _BH*.5)
B5 = (_BH*.5, _BW)
B4 = (_BW, _BH)
# B3 is 35.3cm x 50cm and B1 is 70.6cm x 100cm, following the same
# progression as the A series above
B3 = (_BH, _BW*2)
B2 = (_BW*2, _BH*2)
B1 = (_BH*2, _BW*4)
B0 = (_BW*4, _BH*4)

# Lookup of the sizes above by lower case name
PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2'
               ' b3 b4 b5 b6 letter legal').split()}
# }}}
def fmtnum(o):
    '''
    Return the text form of the number ``o``: floats are formatted with
    pdf_float(), everything else is converted with unicode_type().
    '''
    return pdf_float(o) if isinstance(o, float) else unicode_type(o)
def serialize(o, stream):
    '''
    Write the PDF representation of ``o`` to ``stream``.

    Supported are floats, bools, integers, None, datetime objects and any
    object that provides a ``pdf_serialize(stream)`` method.

    :raises ValueError: if ``o`` is of an unsupported type
    '''
    if isinstance(o, float):
        stream.write_raw(pdf_float(o).encode('ascii'))
    elif isinstance(o, bool):
        # Must check bool before int as bools are subclasses of int
        stream.write_raw(b'true' if o else b'false')
    elif isinstance(o, numbers.Integral):
        stream.write_raw(unicode_type(o).encode('ascii') if ispy3 else bytes(o))
    elif hasattr(o, 'pdf_serialize'):
        o.pdf_serialize(stream)
    elif o is None:
        stream.write_raw(b'null')
    elif isinstance(o, datetime):
        # PDF date string: (D:YYYYMMDDHHmmSS+HH'mm')
        val = o.strftime("D:%Y%m%d%H%M%%02d")%min(59, o.second)
        # %z is empty for naive datetimes, in which case no UTC offset is
        # written. Only the hour and minute part of the offset is used.
        offset = o.strftime('%z')
        if offset:
            val += "%s'%s'"%(offset[:3], offset[3:5])
        stream.write(("(%s)"%val).encode('ascii'))
    else:
        raise ValueError('Unknown object: %r'%o)
class Name(unicode_type):

    '''
    A PDF name object, serialized as a solidus followed by the name.
    '''

    # Bytes inside the printable range that still have to be written as a
    # hex code: the number sign introduces hex codes and the rest are PDF
    # delimiter characters that would otherwise terminate the name.
    _ESCAPED = frozenset(bytearray(b'#()<>[]{}/%'))

    def pdf_serialize(self, stream):
        '''
        Write this name to ``stream``. Bytes that cannot be written as
        themselves are written as ``#`` followed by a two digit hex code.

        :raises ValueError: if the name is longer than 126 bytes
        :raises UnicodeEncodeError: if the name is not pure ASCII
        '''
        raw = self.encode('ascii')
        if len(raw) > 126:
            raise ValueError('Name too long: %r'%self)
        escaped = self._ESCAPED
        buf = bytearray(b'/')
        for x in bytearray(raw):
            if 33 < x < 126 and x not in escaped:
                buf.append(x)
            else:
                # The hex code must always be exactly two digits long
                buf.extend('#{:02x}'.format(x).encode('ascii'))
        stream.write(bytes(buf))
def escape_pdf_string(bytestring):
    '''
    Escape ``bytestring`` so that it can be placed between the parentheses of
    a PDF literal string.

    Backslashes and the control characters LF, CR, FF, BS and TAB are replaced
    by their backslash escape sequences. Parentheses are escaped only when
    they are unbalanced. The input object itself is returned when nothing
    needs escaping.
    '''
    indices = []  # positions of '(' that have not been closed yet
    bad = []  # (position, escape character) of every byte to escape
    ba = bytearray(bytestring)
    bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('t'), 92:ord('\\')}
    for i, num in enumerate(ba):
        if num == 40:  # (
            indices.append((i, 40))
        elif num == 41:  # )
            if indices:
                indices.pop()
            else:
                bad.append((i, 41))
        elif num in bad_map:  # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
            bad.append((i, bad_map[num]))

    # Replace from the end so that earlier positions stay valid
    bad = sorted(indices + bad, reverse=True)
    if not bad:
        return bytestring
    for i, repl in bad:
        ba[i:i+1] = (92, repl)  # 92 = ord('\')
    return bytes(ba)
class String(unicode_type):

    '''
    A PDF text string, written as latin1 when possible and as UTF-16 (big
    endian, with byte order mark) otherwise.
    '''

    def pdf_serialize(self, stream):
        bom = codecs.BOM_UTF16_BE
        try:
            raw = self.encode('latin1')
        except UnicodeEncodeError:
            use_utf16 = True
        else:
            # latin1 data that starts with the byte order mark has to be
            # written as UTF-16 as well
            use_utf16 = raw.startswith(bom)
        if use_utf16:
            raw = bom + self.encode('utf-16-be')
        stream.write(b'('+escape_pdf_string(raw)+b')')
class UTF16String(unicode_type):

    '''
    A PDF text string that is always written as UTF-16 (big endian, with byte
    order mark).
    '''

    def pdf_serialize(self, stream):
        raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
        # The hex form is disabled as the parentheses based strings give
        # easier to debug PDF files
        use_hex_form = False
        if use_hex_form:
            payload = b'<' + as_hex_bytes(raw) + b'>'
        else:
            payload = b'('+escape_pdf_string(raw)+b')'
        stream.write(payload)
class Dictionary(dict):

    '''
    A PDF dictionary, written with one entry per line. The entries are
    sorted by key, except that Type and Subtype are moved to the front.
    '''

    def pdf_serialize(self, stream):
        leading = {'Type':'1', 'Subtype':'2'}

        def sort_key(name):
            # Digits sort before letters, which puts Type and Subtype first
            return leading.get(name, name) + name

        stream.write(b'<<' + EOL)
        for name in sorted(self, key=sort_key):
            serialize(Name(name), stream)
            stream.write(b' ')
            serialize(self[name], stream)
            stream.write(EOL)
        stream.write(b'>>' + EOL)
class InlineDictionary(Dictionary):

    '''
    A PDF dictionary that is written on a single line, with its entries in
    iteration order.
    '''

    def pdf_serialize(self, stream):
        stream.write(b'<< ')
        for entry in iteritems(self):
            name, value = entry
            serialize(Name(name), stream)
            stream.write(b' ')
            serialize(value, stream)
            stream.write(b' ')
        stream.write(b'>>')
class Array(list):

    '''
    A PDF array: the serialized items separated by single spaces and
    enclosed in square brackets.
    '''

    def pdf_serialize(self, stream):
        stream.write(b'[')
        need_separator = False
        for entry in self:
            if need_separator:
                stream.write(b' ')
            serialize(entry, stream)
            need_separator = True
        stream.write(b']')
class Stream(BytesIO):

    '''
    A PDF stream object. Data is collected with the usual file methods and
    written out, optionally deflated, by pdf_serialize().
    '''

    def __init__(self, compress=False):
        BytesIO.__init__(self)
        self.compress = compress
        # Names of the filters that were already applied to the data
        self.filters = Array()

    def add_extra_keys(self, d):
        '''
        Hook that is called with the stream dictionary ``d`` just before it is
        written. Does nothing in this class.
        '''
        pass

    def pdf_serialize(self, stream):
        '''
        Write the stream dictionary followed by the (optionally compressed)
        data to ``stream``.
        '''
        raw = self.getvalue()
        dl = len(raw)
        # Work on a copy, so that serializing the same object more than once
        # does not add FlateDecode to self.filters again every time
        filters = Array(self.filters)
        if self.compress:
            filters.append(Name('FlateDecode'))
            raw = zlib.compress(raw)

        d = InlineDictionary({'Length':len(raw), 'DL':dl})
        self.add_extra_keys(d)
        if filters:
            d['Filter'] = filters
        serialize(d, stream)
        stream.write(EOL+b'stream'+EOL)
        stream.write(raw)
        stream.write(EOL+b'endstream'+EOL)

    def write_line(self, raw=b''):
        '''
        Write ``raw`` followed by a line terminator.
        '''
        self.write(raw if isinstance(raw, bytes) else raw.encode('ascii'))
        self.write(EOL)

    def write(self, raw):
        '''
        Write ``raw``; anything that is not bytes is encoded as ASCII first.
        '''
        super(Stream, self).write(raw if isinstance(raw, bytes) else
                                  raw.encode('ascii'))

    def write_raw(self, raw):
        '''
        Write ``raw`` as is, without any type check or encoding.
        '''
        BytesIO.write(self, raw)
class Reference(object):

    '''
    An indirect reference to the object ``obj`` that has the object number
    ``num``. It is written as ``<num> 0 R``.
    '''

    def __init__(self, num, obj):
        self.num = num
        self.obj = obj

    def pdf_serialize(self, stream):
        stream.write(('%d 0 R'%self.num).encode('ascii'))

    def __repr__(self):
        return '%d 0 R'%self.num

    def __str__(self):
        # The string form is the same as the repr
        return repr(self)
# }}}
def current_log(newlog=None):
    '''
    Return the log currently in use, falling back to default_log when none
    has been set. If a truthy ``newlog`` is passed, it becomes the current
    log first.
    '''
    if newlog:
        current_log.ans = newlog
    active = current_log.ans
    return active if active else default_log


# The current log is stored as an attribute of the function itself
current_log.ans = None