Mirror of https://github.com/gryf/ebook-converter.git

Added docx writer related modules

commit 98b2dd8d4f
parent ae80ae5640
2020-04-13 16:33:15 +02:00
29 changed files with 5956 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap, os
from lxml import etree
from lxml.builder import ElementMaker
from calibre import guess_type
from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.pdf.render.common import PAPER_SIZES
from calibre.utils.date import utcnow
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.zipfile import ZipFile
from polyglot.builtins import iteritems, map, unicode_type, native_string_type
def xml2str(root, pretty_print=False, with_tail=False):
if hasattr(etree, 'cleanup_namespaces'):
etree.cleanup_namespaces(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print, with_tail=with_tail)
return ans
def page_size(opts):
width, height = PAPER_SIZES[opts.docx_page_size]
if opts.docx_custom_page_size is not None:
width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2])
return width, height
def page_margin(opts, which):
val = getattr(opts, 'docx_page_margin_' + which)
if val == 0.0:
val = getattr(opts, 'margin_' + which)
return val
def page_effective_area(opts):
width, height = page_size(opts)
width -= page_margin(opts, 'left') + page_margin(opts, 'right')
height -= page_margin(opts, 'top') + page_margin(opts, 'bottom')
return width, height # in pts
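# A rough worked example of the sizing logic above: PAPER_SIZES gives page
# dimensions in points (an A4 page is about 595 x 842 pts), and a custom size
# such as '612x792' is split on the 'x' and parsed as floats. With 72 pt
# margins on every side the effective area would then be roughly 451 x 698 pts.
# Values stay in points here; create_skeleton() below multiplies by 20 to get
# the twentieths of a point (twips) that WordprocessingML expects.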
def create_skeleton(opts, namespaces=None):
namespaces = namespaces or DOCXNamespace().namespaces
def w(x):
return '{%s}%s' % (namespaces['w'], x)
dn = {k:v for k, v in iteritems(namespaces) if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
doc = E.document()
body = E.body()
doc.append(body)
width, height = page_size(opts)
width, height = int(20 * width), int(20 * height)
def margin(which):
val = page_margin(opts, which)
return w(which), unicode_type(int(val * 20))
body.append(E.sectPr(
E.pgSz(**{w('w'):unicode_type(width), w('h'):unicode_type(height)}),
E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
E.cols(**{w('space'):'720'}),
E.docGrid(**{w('linePitch'):"360"}),
))
dn = {k:v for k, v in iteritems(namespaces) if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
styles = E.styles(
E.docDefaults(
E.rPrDefault(
E.rPr(
E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
E.sz(**{w('val'):'22'}),
E.szCs(**{w('val'):'22'}),
E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
)
),
E.pPrDefault(
E.pPr(
E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
)
)
)
)
return doc, styles, body
def update_doc_props(root, mi, namespace):
def setm(name, text=None, ns='dc'):
ans = root.makeelement('{%s}%s' % (namespace.namespaces[ns], name))
for child in tuple(root):
if child.tag == ans.tag:
root.remove(child)
ans.text = text
root.append(ans)
return ans
setm('title', mi.title)
setm('creator', authors_to_string(mi.authors))
if mi.tags:
setm('keywords', ', '.join(mi.tags), ns='cp')
if mi.comments:
setm('description', mi.comments)
if mi.languages:
l = canonicalize_lang(mi.languages[0])
setm('language', lang_as_iso639_1(l) or l)
class DocumentRelationships(object):
def __init__(self, namespace):
self.rmap = {}
self.namespace = namespace
for typ, target in iteritems({
namespace.names['STYLES']: 'styles.xml',
namespace.names['NUMBERING']: 'numbering.xml',
namespace.names['WEB_SETTINGS']: 'webSettings.xml',
namespace.names['FONTS']: 'fontTable.xml',
}):
self.add_relationship(target, typ)
def get_relationship_id(self, target, rtype, target_mode=None):
return self.rmap.get((target, rtype, target_mode))
def add_relationship(self, target, rtype, target_mode=None):
ans = self.get_relationship_id(target, rtype, target_mode)
if ans is None:
ans = 'rId%d' % (len(self.rmap) + 1)
self.rmap[(target, rtype, target_mode)] = ans
return ans
def add_image(self, target):
return self.add_relationship(target, self.namespace.names['IMAGES'])
def serialize(self):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
relationships = E.Relationships()
for (target, rtype, target_mode), rid in iteritems(self.rmap):
r = E.Relationship(Id=rid, Type=rtype, Target=target)
if target_mode is not None:
r.set('TargetMode', target_mode)
relationships.append(r)
return xml2str(relationships)
class DOCX(object):
def __init__(self, opts, log):
self.namespace = DOCXNamespace()
namespaces = self.namespace.namespaces
self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships(self.namespace)
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
self.embedded_fonts = E.Relationships()
self.fonts = {}
self.images = {}
# Boilerplate {{{
@property
def contenttypes(self):
E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
types = E.Types()
for partname, mt in iteritems({
"/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
"/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
"/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
"/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
"/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
"/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
"/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
"/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
"/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
"/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
"/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
}):
types.append(E.Override(PartName=partname, ContentType=mt))
added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
for ext in added:
types.append(E.Default(Extension=ext, ContentType=guess_type('a.'+ext)[0]))
for ext, mt in iteritems({
"rels": "application/vnd.openxmlformats-package.relationships+xml",
"odttf": "application/vnd.openxmlformats-officedocument.obfuscatedFont",
}):
added.add(ext)
types.append(E.Default(Extension=ext, ContentType=mt))
for fname in self.images:
ext = fname.rpartition(os.extsep)[-1]
if ext not in added:
added.add(ext)
mt = guess_type('a.' + ext)[0]
if mt:
types.append(E.Default(Extension=ext, ContentType=mt))
return xml2str(types)
@property
def appproperties(self):
E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
props = E.Properties(
E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]),
E.DocSecurity('0'),
E.HyperlinksChanged('false'),
E.LinksUpToDate('true'),
E.ScaleCrop('false'),
E.SharedDoc('false'),
)
if self.mi.publisher:
props.append(E.Company(self.mi.publisher))
return xml2str(props)
@property
def containerrels(self):
return textwrap.dedent('''\
<?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names)).encode('utf-8')
@property
def websettings(self):
E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
ws = E.webSettings(
E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
return xml2str(ws)
# }}}
def convert_metadata(self, mi):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat(native_string_type('T')).rpartition('.')[0] + 'Z'
for x in 'created modified'.split():
x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
x.text = ts
cp.append(x)
self.mi = mi
update_doc_props(cp, self.mi, self.namespace)
return xml2str(cp)
def create_empty_document(self, mi):
self.document, self.styles = create_skeleton(self.opts)[:2]
def write(self, path_or_stream, mi, create_empty_document=False):
if create_empty_document:
self.create_empty_document(mi)
with ZipFile(path_or_stream, 'w') as zf:
zf.writestr('[Content_Types].xml', self.contenttypes)
zf.writestr('_rels/.rels', self.containerrels)
zf.writestr('docProps/core.xml', self.convert_metadata(mi))
zf.writestr('docProps/app.xml', self.appproperties)
zf.writestr('word/webSettings.xml', self.websettings)
zf.writestr('word/document.xml', xml2str(self.document))
zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/numbering.xml', xml2str(self.numbering))
zf.writestr('word/fontTable.xml', xml2str(self.font_table))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
for fname, data_getter in iteritems(self.images):
zf.writestr(fname, data_getter())
for fname, data in iteritems(self.fonts):
zf.writestr(fname, data)
if __name__ == '__main__':
d = DOCX(None, None)
print(d.websettings)

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from uuid import uuid4
from calibre.ebooks.oeb.base import OEB_STYLES
from calibre.ebooks.oeb.transforms.subset import find_font_face_rules
from polyglot.builtins import range
def obfuscate_font_data(data, key):
prefix = bytearray(data[:32])
key = bytearray(reversed(key.bytes))
prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
return prefix + data[32:]
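# obfuscate_font_data() implements the standard OOXML embedded-font
# obfuscation: the first 32 bytes of the font file are XORed with the 16 bytes
# of the font's GUID taken in reverse order (so the key repeats twice), and the
# rest of the data is left untouched. The same GUID, formatted in braces and
# upper-cased, is written out as w:fontKey in serialize() below so that Word
# can reverse the transformation when it loads the document.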
class FontsManager(object):
def __init__(self, namespace, oeb, opts):
self.namespace = namespace
self.oeb, self.log, self.opts = oeb, oeb.log, opts
def serialize(self, text_styles, fonts, embed_relationships, font_data_map):
makeelement = self.namespace.makeelement
font_families, seen = set(), set()
for ts in text_styles:
if ts.font_family:
lf = ts.font_family.lower()
if lf not in seen:
seen.add(lf)
font_families.add(ts.font_family)
family_map = {}
for family in sorted(font_families):
family_map[family] = makeelement(fonts, 'w:font', w_name=family)
embedded_fonts = []
for item in self.oeb.manifest:
if item.media_type in OEB_STYLES and hasattr(item.data, 'cssRules'):
embedded_fonts.extend(find_font_face_rules(item, self.oeb))
num = 0
face_map = defaultdict(set)
rel_map = {}
for ef in embedded_fonts:
ff = ef['font-family'][0]
if ff not in font_families:
continue
num += 1
bold = ef['weight'] > 400
italic = ef['font-style'] != 'normal'
tag = 'Regular'
if bold or italic:
tag = 'Italic'
if bold and italic:
tag = 'BoldItalic'
elif bold:
tag = 'Bold'
if tag in face_map[ff]:
continue
face_map[ff].add(tag)
font = family_map[ff]
key = uuid4()
item = ef['item']
rid = rel_map.get(item)
if rid is None:
rel_map[item] = rid = 'rId%d' % num
fname = 'fonts/font%d.odttf' % num
makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname)
font_data_map['word/' + fname] = obfuscate_font_data(item.data, key)
makeelement(font, 'w:embed' + tag, r_id=rid,
w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(),
w_subsetted="true" if self.opts.subset_embedded_fonts else "false")

View File

@@ -0,0 +1,617 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from collections import Counter
from calibre.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area
from calibre.ebooks.docx.writer.styles import StylesManager, FloatSpec
from calibre.ebooks.docx.writer.links import LinksManager
from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table
from calibre.ebooks.docx.writer.lists import ListsManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import unicode_type, string_or_bytes
def lang_for_tag(tag):
for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'):
val = lang_as_iso639_1(tag.get(attr))
if val:
return val
class Style(St):
def __init__(self, *args, **kwargs):
St.__init__(self, *args, **kwargs)
self._letterSpacing = None
@property
def letterSpacing(self):
if self._letterSpacing is None:  # lazily compute and cache the converted value
val = self._get('letter-spacing')
if val == 'normal':
self._letterSpacing = val
else:
self._letterSpacing = self._unit_convert(val)
return self._letterSpacing
class Stylizer(Sz):
def style(self, element):
try:
return self._styles[element]
except KeyError:
return Style(element, self)
class TextRun(object):
ws_pat = None
def __init__(self, namespace, style, first_html_parent, lang=None):
self.first_html_parent = first_html_parent
if self.ws_pat is None:
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
self.style = style
self.texts = []
self.link = None
self.lang = lang
self.parent_style = None
self.makeelement = namespace.makeelement
self.descendant_style = None
def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
if not preserve_whitespace:
text = self.ws_pat.sub(' ', text)
if text.strip() != text:
# If preserve_whitespace is False, Word ignores leading and
# trailing whitespace
preserve_whitespace = True
self.texts.append((text, preserve_whitespace, bookmark))
self.link = link
def add_break(self, clear='none', bookmark=None):
self.texts.append((None, clear, bookmark))
def add_image(self, drawing, bookmark=None):
self.texts.append((drawing, None, bookmark))
def serialize(self, p, links_manager):
makeelement = self.makeelement
parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link)
r = makeelement(parent, 'w:r')
rpr = makeelement(r, 'w:rPr', append=False)
if getattr(self.descendant_style, 'id', None) is not None:
makeelement(rpr, 'w:rStyle', w_val=self.descendant_style.id)
if self.lang:
makeelement(rpr, 'w:lang', w_bidi=self.lang, w_val=self.lang, w_eastAsia=self.lang)
if len(rpr) > 0:
r.append(rpr)
for text, preserve_whitespace, bookmark in self.texts:
if bookmark is not None:
bid = links_manager.bookmark_id
makeelement(r, 'w:bookmarkStart', w_id=unicode_type(bid), w_name=bookmark)
if text is None:
makeelement(r, 'w:br', w_clear=preserve_whitespace)
elif hasattr(text, 'xpath'):
r.append(text)
else:
t = makeelement(r, 'w:t')
t.text = text or ''
if preserve_whitespace:
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
if bookmark is not None:
makeelement(r, 'w:bookmarkEnd', w_id=unicode_type(bid))
def __repr__(self):
return repr(self.texts)
def is_empty(self):
if not self.texts:
return True
if len(self.texts) == 1 and self.texts[0][:2] == ('', False):
return True
return False
@property
def style_weight(self):
ans = 0
for text, preserve_whitespace, bookmark in self.texts:
if isinstance(text, unicode_type):
ans += len(text)
return ans
class Block(object):
def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None):
self.force_not_empty = False
self.namespace = namespace
self.bookmarks = set()
self.list_tag = (html_block, style) if is_list_item else None
self.is_first_block = False
self.numbering_id = None
self.parent_items = None
self.html_block = html_block
self.html_tag = barename(html_block.tag)
self.float_spec = float_spec
if float_spec is not None:
float_spec.blocks.append(self)
self.html_style = style
self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
self.styles_manager, self.links_manager = styles_manager, links_manager
self.keep_next = False
self.runs = []
self.skipped = False
self.linked_style = None
self.page_break_before = style['page-break-before'] == 'always'
self.keep_lines = style['page-break-inside'] == 'avoid'
self.page_break_after = False
self.block_lang = None
def resolve_skipped(self, next_block):
if not self.is_empty():
return
if len(self.html_block) > 0 and self.html_block[0] is next_block.html_block:
self.skipped = True
if self.list_tag is not None:
next_block.list_tag = self.list_tag
def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False, bookmark=None, link=None, lang=None):
ws = style['white-space']
preserve_whitespace = ws in {'pre', 'pre-wrap', '-o-pre-wrap'}
ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style)
if self.runs and ts == self.runs[-1].style and link == self.runs[-1].link and lang == self.runs[-1].lang:
run = self.runs[-1]
else:
run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent, lang=lang)
self.runs.append(run)
if ignore_leading_whitespace and not preserve_whitespace:
text = text.lstrip()
if preserve_whitespace or ws == 'pre-line':
for text in text.splitlines():
run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
bookmark = None
run.add_break()
else:
run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
def add_break(self, clear='none', bookmark=None):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_break(clear=clear, bookmark=bookmark)
def add_image(self, drawing, bookmark=None):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_image(drawing, bookmark=bookmark)
def serialize(self, body):
makeelement = self.namespace.makeelement
p = makeelement(body, 'w:p')
end_bookmarks = []
for bmark in self.bookmarks:
end_bookmarks.append(unicode_type(self.links_manager.bookmark_id))
makeelement(p, 'w:bookmarkStart', w_id=end_bookmarks[-1], w_name=bmark)
if self.block_lang:
rpr = makeelement(p, 'w:rPr')
makeelement(rpr, 'w:lang', w_val=self.block_lang, w_bidi=self.block_lang, w_eastAsia=self.block_lang)
ppr = makeelement(p, 'w:pPr')
if self.keep_next:
makeelement(ppr, 'w:keepNext')
if self.float_spec is not None:
self.float_spec.serialize(self, ppr)
if self.numbering_id is not None:
numpr = makeelement(ppr, 'w:numPr')
makeelement(numpr, 'w:ilvl', w_val=unicode_type(self.numbering_id[1]))
makeelement(numpr, 'w:numId', w_val=unicode_type(self.numbering_id[0]))
if self.linked_style is not None:
makeelement(ppr, 'w:pStyle', w_val=self.linked_style.id)
elif self.style.id:
makeelement(ppr, 'w:pStyle', w_val=self.style.id)
if self.is_first_block:
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
elif self.page_break_before:
makeelement(ppr, 'w:pageBreakBefore', w_val='on')
if self.keep_lines:
makeelement(ppr, 'w:keepLines', w_val='on')
for run in self.runs:
run.serialize(p, self.links_manager)
for bmark in end_bookmarks:
makeelement(p, 'w:bookmarkEnd', w_id=bmark)
def __repr__(self):
return 'Block(%r)' % self.runs
__str__ = __repr__
def is_empty(self):
if self.force_not_empty:
return False
for run in self.runs:
if not run.is_empty():
return False
return True
class Blocks(object):
def __init__(self, namespace, styles_manager, links_manager):
self.top_bookmark = None
self.namespace = namespace
self.styles_manager = styles_manager
self.links_manager = links_manager
self.all_blocks = []
self.pos = 0
self.current_block = None
self.items = []
self.tables = []
self.current_table = None
self.open_html_blocks = set()
self.html_tag_start_blocks = {}
def current_or_new_block(self, html_tag, tag_style):
return self.current_block or self.start_new_block(html_tag, tag_style)
def end_current_block(self):
if self.current_block is not None:
self.all_blocks.append(self.current_block)
if self.current_table is not None and self.current_table.current_row is not None:
self.current_table.add_block(self.current_block)
else:
self.block_map[self.current_block] = len(self.items)
self.items.append(self.current_block)
self.current_block.parent_items = self.items
self.current_block = None
def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
parent_bg = None
if html_block is not None:
p = html_block.getparent()
b = self.html_tag_start_blocks.get(p)
if b is not None:
ps = self.styles_manager.styles_for_html_blocks.get(p)
if ps is not None and ps.background_color is not None:
parent_bg = ps.background_color
self.end_current_block()
self.current_block = Block(
self.namespace, self.styles_manager, self.links_manager, html_block, style,
is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item,
parent_bg=parent_bg)
self.html_tag_start_blocks[html_block] = self.current_block
self.open_html_blocks.add(html_block)
return self.current_block
def start_new_table(self, html_tag, tag_style=None):
self.current_table = Table(self.namespace, html_tag, tag_style)
self.tables.append(self.current_table)
def start_new_row(self, html_tag, tag_style):
if self.current_table is None:
self.start_new_table(html_tag)
self.current_table.start_new_row(html_tag, tag_style)
def start_new_cell(self, html_tag, tag_style):
if self.current_table is None:
self.start_new_table(html_tag)
self.current_table.start_new_cell(html_tag, tag_style)
def finish_tag(self, html_tag):
if self.current_block is not None and html_tag in self.open_html_blocks:
start_block = self.html_tag_start_blocks.get(html_tag)
if start_block is not None and start_block.html_style['page-break-after'] == 'always':
self.current_block.page_break_after = True
self.end_current_block()
self.open_html_blocks.discard(html_tag)
if self.current_table is not None:
table_finished = self.current_table.finish_tag(html_tag)
if table_finished:
table = self.tables[-1]
del self.tables[-1]
if self.tables:
self.current_table = self.tables[-1]
self.current_table.add_table(table)
else:
self.current_table = None
self.block_map[table] = len(self.items)
self.items.append(table)
def serialize(self, body):
for item in self.items:
item.serialize(body)
def delete_block_at(self, pos=None):
pos = self.pos if pos is None else pos
block = self.all_blocks[pos]
del self.all_blocks[pos]
bpos = self.block_map.pop(block, None)
if bpos is not None:
del self.items[bpos]
else:
items = self.items if block.parent_items is None else block.parent_items
items.remove(block)
block.parent_items = None
if block.float_spec is not None:
block.float_spec.blocks.remove(block)
try:
next_block = self.all_blocks[pos]
next_block.bookmarks.update(block.bookmarks)
for attr in 'page_break_after page_break_before'.split():
setattr(next_block, attr, getattr(block, attr))
except (IndexError, KeyError):
pass
def __enter__(self):
self.pos = len(self.all_blocks)
self.block_map = {}
def __exit__(self, etype, value, traceback):
if value is not None:
return # Since there was an exception, the data structures are not in a consistent state
if self.current_block is not None:
self.all_blocks.append(self.current_block)
self.current_block = None
if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty():
# Delete the empty block corresponding to the <body> tag when the
# body tag has no inline content before its first sub-block
self.delete_block_at(self.pos)
if self.pos > 0 and self.pos < len(self.all_blocks):
# Insert a page break corresponding to the start of the html file
self.all_blocks[self.pos].page_break_before = True
if self.top_bookmark is not None:
self.all_blocks[self.pos].bookmarks.add(self.top_bookmark)
self.top_bookmark = None
self.block_map = {}
def apply_page_break_after(self):
for i, block in enumerate(self.all_blocks):
if block.page_break_after and i < len(self.all_blocks) - 1:
next_block = self.all_blocks[i + 1]
if next_block.parent_items is block.parent_items and block.parent_items is self.items:
next_block.page_break_before = True
def resolve_language(self):
default_lang = self.styles_manager.document_lang
for block in self.all_blocks:
count = Counter()
for run in block.runs:
count[run.lang] += 1
if count:
block.block_lang = bl = count.most_common(1)[0][0]
for run in block.runs:
if run.lang == bl:
run.lang = None
if bl == default_lang:
block.block_lang = None
def __repr__(self):
return 'Blocks(%r)' % self.all_blocks
class Convert(object):
# Word does not apply default styling to hyperlinks, so we ensure they get
# default styling (the conversion pipeline does not apply any styling to
# them).
base_css = '''
a[href] { text-decoration: underline; color: blue }
'''
def __init__(self, oeb, docx, mi, add_cover, add_toc):
self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
self.log, self.opts = docx.log, docx.opts
self.mi = mi
self.cover_img = None
p = self.opts.output_profile
p.width_pts, p.height_pts = page_effective_area(self.opts)
def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
self.lists_manager = ListsManager(self.docx)
self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
self.current_link = self.current_lang = None
for item in self.oeb.spine:
self.log.debug('Processing', item.href)
self.process_item(item)
if self.add_toc:
self.links_manager.process_toc_links(self.oeb)
if self.add_cover and self.oeb.metadata.cover and unicode_type(self.oeb.metadata.cover[0]) in self.oeb.manifest.ids:
cover_id = unicode_type(self.oeb.metadata.cover[0])
item = self.oeb.manifest.ids[cover_id]
self.cover_img = self.images_manager.read_image(item.href)
all_blocks = self.blocks.all_blocks
remove_blocks = []
for i, block in enumerate(all_blocks):
try:
nb = all_blocks[i+1]
except IndexError:
break
block.resolve_skipped(nb)
if block.skipped:
remove_blocks.append((i, block))
for pos, block in reversed(remove_blocks):
self.blocks.delete_block_at(pos)
self.blocks.all_blocks[0].is_first_block = True
self.blocks.apply_page_break_after()
self.blocks.resolve_language()
if self.cover_img is not None:
self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts))
self.lists_manager.finalize(all_blocks)
self.styles_manager.finalize(all_blocks)
self.write()
def process_item(self, item):
self.current_item = item
stylizer = self.svg_rasterizer.stylizer_cache.get(item)
if stylizer is None:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css)
self.abshref = self.images_manager.abshref = item.abshref
self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang
for i, body in enumerate(XPath('//h:body')(item.data)):
with self.blocks:
self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body)
self.process_tag(body, stylizer, is_first_tag=i == 0)
def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
tagname = barename(html_tag.tag)
tag_style = stylizer.style(html_tag)
ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
display = tag_style._get('display')
is_block = False
if not ignore_tag_contents:
previous_link = self.current_link
if tagname == 'a' and html_tag.get('href'):
self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
previous_lang = self.current_lang
tag_lang = lang_for_tag(html_tag)
if tag_lang:
self.current_lang = tag_lang
is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
if float_spec is None and is_float:
float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)
if display in {'inline', 'inline-block'} or tagname == 'br': # <br> has display:block but we don't want to start a new paragraph
if is_float and float_spec.is_dropcaps:
self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
float_spec = None
else:
self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
elif display == 'list-item':
self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
elif display.startswith('table') or display == 'inline-table':
if display == 'table-cell':
self.blocks.start_new_cell(html_tag, tag_style)
self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
elif display == 'table-row':
self.blocks.start_new_row(html_tag, tag_style)
elif display in {'table', 'inline-table'}:
self.blocks.end_current_block()
self.blocks.start_new_table(html_tag, tag_style)
else:
if tagname == 'img' and is_float:
# Image is floating so don't start a new paragraph for it
self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
else:
if tagname == 'hr':
for edge in 'right bottom left'.split():
tag_style.set('border-%s-style' % edge, 'none')
self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
for child in html_tag.iterchildren():
if isinstance(getattr(child, 'tag', None), string_or_bytes):
self.process_tag(child, stylizer, float_spec=float_spec)
else: # Comment/PI/etc.
tail = getattr(child, 'tail', None)
if tail:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text(tail, tag_style, is_parent_style=False, link=self.current_link, lang=self.current_lang)
is_block = html_tag in self.blocks.open_html_blocks
self.blocks.finish_tag(html_tag)
if is_block and tag_style['page-break-after'] == 'avoid':
self.blocks.all_blocks[-1].keep_next = True
self.current_link = previous_link
self.current_lang = previous_lang
# Now, process the tail if any
if display == 'table-row':
return # We ignore the tail for these tags
ignore_whitespace_tail = is_block or display.startswith('table')
if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
# Ignore trailing space after a block tag, as otherwise it will
# become a new empty paragraph
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link, lang=self.current_lang)
def create_block_from_parent(self, html_tag, stylizer):
parent = html_tag.getparent()
block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
# Do not inherit page-break-before from parent
block.page_break_before = False
return block
def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
block = self.blocks.start_new_block(
html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
anchor = html_tag.get('id') or html_tag.get('name')
if anchor:
block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
if tagname == 'img':
self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
else:
text = html_tag.text
if text:
block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang)
elif tagname == 'li' and len(html_tag) and barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]):
block.force_not_empty = True
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
anchor = html_tag.get('id') or html_tag.get('name') or None
bmark = None
if anchor:
bmark = self.bookmark_for_anchor(anchor, html_tag)
if tagname == 'br':
if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
elif tagname == 'img':
block = self.create_block_from_parent(html_tag, stylizer)
self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
else:
if html_tag.text:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
elif bmark:
block = self.create_block_from_parent(html_tag, stylizer)
block.add_text('', tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
def bookmark_for_anchor(self, anchor, html_tag):
return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)
def write(self):
self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
self.blocks.serialize(body)
body.append(body[0]) # Move <sectPr> to the end
if self.links_manager.toc:
self.links_manager.serialize_toc(body, self.styles_manager.primary_heading_style)
if self.cover_img is not None:
self.images_manager.write_cover_block(body, self.cover_img)
self.styles_manager.serialize(self.docx.styles)
self.images_manager.serialize(self.docx.images)
self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
self.lists_manager.serialize(self.docx.numbering)

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import posixpath
from collections import namedtuple
from functools import partial
from polyglot.builtins import iteritems, itervalues, map, unicode_type
from lxml import etree
from calibre import fit_image
from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.docx.images import pt_to_emu
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import identify
Image = namedtuple('Image', 'rid fname width height fmt item')
def as_num(x):
try:
return float(x)
except Exception:
pass
return 0
def get_image_margins(style):
ans = {}
for edge in 'Left Right Top Bottom'.split():
val = as_num(getattr(style, 'padding' + edge)) + as_num(getattr(style, 'margin' + edge))
ans['dist' + edge[0]] = unicode_type(pt_to_emu(val))
return ans
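# pt_to_emu converts points to English Metric Units, the unit DrawingML uses
# for all lengths (1 pt = 12700 EMU, 1 inch = 914400 EMU). A 9 pt margin, for
# example, becomes a distL/distR/distT/distB value of 114300.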
class ImagesManager(object):
def __init__(self, oeb, document_relationships, opts):
self.oeb, self.log = oeb, oeb.log
self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts
self.images = {}
self.seen_filenames = set()
self.document_relationships = document_relationships
self.count = 0
def read_image(self, href):
if href not in self.images:
item = self.oeb.manifest.hrefs.get(href)
if item is None or not isinstance(item.data, bytes):
return
try:
fmt, width, height = identify(item.data)
except Exception:
self.log.warning('Replacing corrupted image with blank: %s' % href)
item.data = I('blank.png', data=True, allow_user_override=False)
fmt, width, height = identify(item.data)
image_fname = 'media/' + self.create_filename(href, fmt)
image_rid = self.document_relationships.add_image(image_fname)
self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
item.unload_data_from_memory()
return self.images[href]
def add_image(self, img, block, stylizer, bookmark=None, as_block=False):
src = img.get('src')
if not src:
return
href = self.abshref(src)
try:
rid = self.read_image(href).rid
except AttributeError:
return
drawing = self.create_image_markup(img, stylizer, href, as_block=as_block)
block.add_image(drawing, bookmark=bookmark)
return rid
def create_image_markup(self, html_img, stylizer, href, as_block=False):
# TODO: img inside a link (clickable image)
style = stylizer.style(html_img)
floating = style['float']
if floating not in {'left', 'right'}:
floating = None
if as_block:
ml, mr = style._get('margin-left'), style._get('margin-right')
if ml == 'auto':
floating = 'center' if mr == 'auto' else 'right'
if mr == 'auto':
floating = 'center' if ml == 'auto' else 'left'
else:
parent = html_img.getparent()
if len(parent) == 1 and not (parent.text or '').strip() and not (html_img.tail or '').strip():
pstyle = stylizer.style(parent)
if 'block' in pstyle['display']:
# We have an inline image alone inside a block
as_block = True
floating = pstyle['float']
if floating not in {'left', 'right'}:
floating = None
if pstyle['text-align'] in ('center', 'right'):
floating = pstyle['text-align']
floating = floating or 'left'
fake_margins = floating is None
self.count += 1
img = self.images[href]
name = urlunquote(posixpath.basename(href))
width, height = style.img_size(img.width, img.height)
scaled, width, height = fit_image(width, height, self.page_width, self.page_height)
width, height = map(pt_to_emu, (width, height))
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
root = etree.Element('root', nsmap=namespaces)
ans = makeelement(root, 'w:drawing', append=False)
if floating is None:
parent = makeelement(ans, 'wp:inline')
else:
parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
# The following attribute settings are boilerplate that Word requires, even
# though the DOCX specs define defaults for all of them
parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
makeelement(parent, 'wp:simplePos', x='0', y='0')
makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
if fake_margins:
# DOCX does not support setting margins for inline images, so we
# fake it by using effect extents to simulate margins
makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))})
else:
makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
if floating is not None:
# The idiotic Word requires this to be after the extent settings
if as_block:
makeelement(parent, 'wp:wrapTopAndBottom')
else:
makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height)
return ans
def create_docx_image_markup(self, parent, name, alt, img_rid, width, height):
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
makeelement(parent, 'wp:docPr', id=unicode_type(self.count), name=name, descr=alt)
makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1")
g = makeelement(parent, 'a:graphic')
gd = makeelement(g, 'a:graphicData', uri=namespaces['pic'])
pic = makeelement(gd, 'pic:pic')
nvPicPr = makeelement(pic, 'pic:nvPicPr')
makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt)
makeelement(nvPicPr, 'pic:cNvPicPr')
bf = makeelement(pic, 'pic:blipFill')
makeelement(bf, 'a:blip', r_embed=img_rid)
makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
spPr = makeelement(pic, 'pic:spPr')
xfrm = makeelement(spPr, 'a:xfrm')
makeelement(xfrm, 'a:off', x='0', y='0'), makeelement(xfrm, 'a:ext', cx=unicode_type(width), cy=unicode_type(height))
makeelement(makeelement(spPr, 'a:prstGeom', prst='rect'), 'a:avLst')
def create_filename(self, href, fmt):
fname = ascii_filename(urlunquote(posixpath.basename(href)))
fname = posixpath.splitext(fname)[0]
fname = fname[:75].rstrip('.') or 'image'
num = 0
base = fname
while fname.lower() in self.seen_filenames:
num += 1
fname = base + unicode_type(num)
self.seen_filenames.add(fname.lower())
fname += os.extsep + fmt.lower()
return fname
def serialize(self, images_map):
for img in itervalues(self.images):
images_map['word/' + img.fname] = partial(self.get_data, img.item)
def get_data(self, item):
try:
return item.data
finally:
item.unload_data_from_memory(False)
def create_cover_markup(self, img, preserve_aspect_ratio, width, height):
self.count += 1
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
if preserve_aspect_ratio:
if img.width >= img.height:
ar = img.height / img.width
height = ar * width
else:
ar = img.width / img.height
width = ar * height
root = etree.Element('root', nsmap=namespaces)
ans = makeelement(root, 'w:drawing', append=False)
parent = makeelement(ans, 'wp:anchor', **{'dist'+edge:'0' for edge in 'LRTB'})
parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
makeelement(parent, 'wp:simplePos', x='0', y='0')
makeelement(makeelement(parent, 'wp:positionH', relativeFrom='page'), 'wp:align').text = 'center'
makeelement(makeelement(parent, 'wp:positionV', relativeFrom='page'), 'wp:align').text = 'center'
width, height = map(pt_to_emu, (width, height))
makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
makeelement(parent, 'wp:wrapTopAndBottom')
self.create_docx_image_markup(parent, 'cover.jpg', _('Cover'), img.rid, width, height)
return ans
def write_cover_block(self, body, cover_image):
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
pbb.set('{%s}val' % namespaces['w'], 'on')
p = makeelement(body, 'w:p', append=False)
body.insert(0, p)
r = makeelement(p, 'w:r')
r.append(cover_image)

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath, re
from uuid import uuid4
from calibre.utils.filenames import ascii_text
from polyglot.builtins import unicode_type
from polyglot.urllib import urlparse
def start_text(tag, prefix_len=0, top_level=True):
ans = tag.text or ''
limit = 50 - prefix_len
if len(ans) < limit:
for child in tag.iterchildren('*'):
ans += start_text(child, len(ans), top_level=False) + (child.tail or '')
if len(ans) >= limit:
break
if top_level and len(ans) > limit:
ans = ans[:limit] + '...'
return ans
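# start_text() collects up to roughly 50 characters of text from the start of
# a tag, descending into its children when the tag's own text is short, and
# appends '...' when the result had to be truncated. It is used below to give
# bookmarks a human-readable name.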
class TOCItem(object):
def __init__(self, title, bmark, level):
self.title, self.bmark, self.level = title, bmark, level
self.is_first = self.is_last = False
def serialize(self, body, makeelement):
p = makeelement(body, 'w:p', append=False)
ppr = makeelement(p, 'w:pPr')
makeelement(ppr, 'w:pStyle', w_val="Normal")
makeelement(ppr, 'w:ind', w_left='0', w_firstLineChars='0', w_firstLine='0', w_leftChars=unicode_type(200 * self.level))
if self.is_first:
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
r = makeelement(p, 'w:r')
makeelement(r, 'w:fldChar', w_fldCharType='begin')
r = makeelement(p, 'w:r')
makeelement(r, 'w:instrText').text = r' TOC \h '
r[0].set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
r = makeelement(p, 'w:r')
makeelement(r, 'w:fldChar', w_fldCharType='separate')
hl = makeelement(p, 'w:hyperlink', w_anchor=self.bmark)
r = makeelement(hl, 'w:r')
rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:color', w_val='0000FF', w_themeColor='hyperlink')
makeelement(rpr, 'w:u', w_val='single')
makeelement(r, 'w:t').text = self.title
if self.is_last:
r = makeelement(p, 'w:r')
makeelement(r, 'w:fldChar', w_fldCharType='end')
body.insert(0, p)
def sanitize_bookmark_name(base):
# Max length allowed by Word appears to be 40, we use 32 to leave some
# space for making the name unique
return re.sub(r'[^0-9a-zA-Z]', '_', ascii_text(base))[:32].rstrip('_')
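# For example, an anchor whose starting text is "Chapter 1: The Beginning"
# would become the bookmark name "Chapter_1__The_Beginning"; if that name is
# already taken, bookmark_for_anchor() below appends _1, _2, ... to keep the
# names unique.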
class LinksManager(object):
def __init__(self, namespace, document_relationships, log):
self.namespace = namespace
self.log = log
self.document_relationships = document_relationships
self.top_anchor = unicode_type(uuid4().hex)
self.anchor_map = {}
self.used_bookmark_names = set()
self.bmark_id = 0
self.document_hrefs = set()
self.external_links = {}
self.toc = []
def bookmark_for_anchor(self, anchor, current_item, html_tag):
key = (current_item.href, anchor)
if key in self.anchor_map:
return self.anchor_map[key]
if anchor == self.top_anchor:
name = ('Top of %s' % posixpath.basename(current_item.href))
self.document_hrefs.add(current_item.href)
else:
name = start_text(html_tag).strip() or anchor
name = sanitize_bookmark_name(name)
i, bname = 0, name
while name in self.used_bookmark_names:
i += 1
name = bname + ('_%d' % i)
self.anchor_map[key] = name
self.used_bookmark_names.add(name)
return name
@property
def bookmark_id(self):
self.bmark_id += 1
return self.bmark_id
def serialize_hyperlink(self, parent, link):
item, url, tooltip = link
purl = urlparse(url)
href = purl.path
def make_link(parent, anchor=None, id=None, tooltip=None):
kw = {}
if anchor is not None:
kw['w_anchor'] = anchor
elif id is not None:
kw['r_id'] = id
if tooltip:
kw['w_tooltip'] = tooltip
return self.namespace.makeelement(parent, 'w:hyperlink', **kw)
if not purl.scheme:
href = item.abshref(href)
if href in self.document_hrefs:
key = (href, purl.fragment or self.top_anchor)
if key in self.anchor_map:
bmark = self.anchor_map[key]
else:
bmark = self.anchor_map[(href, self.top_anchor)]
return make_link(parent, anchor=bmark, tooltip=tooltip)
else:
self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
if purl.scheme in {'http', 'https', 'ftp'}:
if url not in self.external_links:
self.external_links[url] = self.document_relationships.add_relationship(url, self.namespace.names['LINKS'], target_mode='External')
return make_link(parent, id=self.external_links[url], tooltip=tooltip)
return parent
def process_toc_node(self, toc, level=0):
href = toc.href
if href:
purl = urlparse(href)
href = purl.path
if href in self.document_hrefs:
key = (href, purl.fragment or self.top_anchor)
if key in self.anchor_map:
bmark = self.anchor_map[key]
else:
bmark = self.anchor_map[(href, self.top_anchor)]
self.toc.append(TOCItem(toc.title, bmark, level))
for child in toc:
self.process_toc_node(child, level+1)
def process_toc_links(self, oeb):
self.toc = []
has_toc = oeb.toc and oeb.toc.count() > 1
if not has_toc:
return
for child in oeb.toc:
self.process_toc_node(child)
if self.toc:
self.toc[0].is_first = True
self.toc[-1].is_last = True
def serialize_toc(self, body, primary_heading_style):
pbb = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
pbb.set('{%s}val' % self.namespace.namespaces['w'], 'on')
for block in reversed(self.toc):
block.serialize(body, self.namespace.makeelement)
title = __('Table of Contents')
makeelement = self.namespace.makeelement
p = makeelement(body, 'w:p', append=False)
ppr = makeelement(p, 'w:pPr')
if primary_heading_style is not None:
makeelement(ppr, 'w:pStyle', w_val=primary_heading_style.id)
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
makeelement(makeelement(p, 'w:r'), 'w:t').text = title
body.insert(0, p)

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from operator import attrgetter
from polyglot.builtins import iteritems, itervalues, unicode_type
LIST_STYLES = frozenset(
'disc circle square decimal decimal-leading-zero lower-roman upper-roman'
' lower-greek lower-alpha lower-latin upper-alpha upper-latin hiragana hebrew'
' katakana-iroha cjk-ideographic'.split())
STYLE_MAP = {
'disc': 'bullet',
'circle': 'o',
'square': '\uf0a7',
'decimal': 'decimal',
'decimal-leading-zero': 'decimalZero',
'lower-roman': 'lowerRoman',
'upper-roman': 'upperRoman',
'lower-alpha': 'lowerLetter',
'lower-latin': 'lowerLetter',
'upper-alpha': 'upperLetter',
'upper-latin': 'upperLetter',
'hiragana': 'aiueo',
'hebrew': 'hebrew1',
'katakana-iroha': 'iroha',
'cjk-ideographic': 'chineseCounting',
}
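# STYLE_MAP translates CSS list-style-type keywords into the w:numFmt values
# Word uses for numbered lists. Bullet lists are handled separately in Level
# below: disc, circle and square all use numFmt 'bullet', with the glyph
# '\uf0b7' (Symbol), 'o' (Courier New) or '\uf0a7' (Wingdings) respectively.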
def find_list_containers(list_tag, tag_style):
node = list_tag
stylizer = tag_style._stylizer
ans = []
while True:
parent = node.getparent()
if parent is None or parent is node:
break
node = parent
style = stylizer.style(node)
lst = (style._style.get('list-style-type', None) or '').lower()
if lst in LIST_STYLES:
ans.append(node)
return ans
class NumberingDefinition(object):
def __init__(self, top_most, stylizer, namespace):
self.namespace = namespace
self.top_most = top_most
self.stylizer = stylizer
self.level_map = defaultdict(list)
self.num_id = None
def finalize(self):
items_for_level = defaultdict(list)
container_for_level = {}
type_for_level = {}
for ilvl, items in iteritems(self.level_map):
for container, list_tag, block, list_type, tag_style in items:
items_for_level[ilvl].append(list_tag)
container_for_level[ilvl] = container
type_for_level[ilvl] = list_type
self.levels = tuple(
Level(type_for_level[ilvl], container_for_level[ilvl], items_for_level[ilvl], ilvl=ilvl)
for ilvl in sorted(self.level_map)
)
def __hash__(self):
return hash(self.levels)
def link_blocks(self):
for ilvl, items in iteritems(self.level_map):
for container, list_tag, block, list_type, tag_style in items:
block.numbering_id = (self.num_id + 1, ilvl)
def serialize(self, parent):
makeelement = self.namespace.makeelement
an = makeelement(parent, 'w:abstractNum', w_abstractNumId=unicode_type(self.num_id))
makeelement(an, 'w:multiLevelType', w_val='hybridMultilevel')
makeelement(an, 'w:name', w_val='List %d' % (self.num_id + 1))
for level in self.levels:
level.serialize(an, makeelement)
class Level(object):
def __init__(self, list_type, container, items, ilvl=0):
self.ilvl = ilvl
try:
self.start = int(container.get('start'))
except Exception:
self.start = 1
if items:
try:
self.start = int(items[0].get('value'))
except Exception:
pass
if list_type in {'disc', 'circle', 'square'}:
self.num_fmt = 'bullet'
self.lvl_text = '\uf0b7' if list_type == 'disc' else STYLE_MAP[list_type]
else:
self.lvl_text = '%{}.'.format(self.ilvl + 1)
self.num_fmt = STYLE_MAP.get(list_type, 'decimal')
def __hash__(self):
return hash((self.start, self.num_fmt, self.lvl_text))
def serialize(self, parent, makeelement):
lvl = makeelement(parent, 'w:lvl', w_ilvl=unicode_type(self.ilvl))
makeelement(lvl, 'w:start', w_val=unicode_type(self.start))
makeelement(lvl, 'w:numFmt', w_val=self.num_fmt)
makeelement(lvl, 'w:lvlText', w_val=self.lvl_text)
makeelement(lvl, 'w:lvlJc', w_val='left')
makeelement(makeelement(lvl, 'w:pPr'), 'w:ind', w_hanging='360', w_left=unicode_type(1152 + self.ilvl * 360))
if self.num_fmt == 'bullet':
ff = {'\uf0b7':'Symbol', '\uf0a7':'Wingdings'}.get(self.lvl_text, 'Courier New')
makeelement(makeelement(lvl, 'w:rPr'), 'w:rFonts', w_ascii=ff, w_hAnsi=ff, w_hint="default")
class ListsManager(object):
def __init__(self, docx):
self.namespace = docx.namespace
self.lists = {}
def finalize(self, all_blocks):
lists = {}
for block in all_blocks:
if block.list_tag is not None:
list_tag, tag_style = block.list_tag
list_type = (tag_style['list-style-type'] or '').lower()
if list_type not in LIST_STYLES:
continue
container_tags = find_list_containers(list_tag, tag_style)
if not container_tags:
continue
top_most = container_tags[-1]
if top_most not in lists:
lists[top_most] = NumberingDefinition(top_most, tag_style._stylizer, self.namespace)
l = lists[top_most]
ilvl = len(container_tags) - 1
l.level_map[ilvl].append((container_tags[0], list_tag, block, list_type, tag_style))
[nd.finalize() for nd in itervalues(lists)]
definitions = {}
for defn in itervalues(lists):
try:
defn = definitions[defn]
except KeyError:
definitions[defn] = defn
defn.num_id = len(definitions) - 1
defn.link_blocks()
self.definitions = sorted(itervalues(definitions), key=attrgetter('num_id'))
def serialize(self, parent):
for defn in self.definitions:
defn.serialize(parent)
makeelement = self.namespace.makeelement
for defn in self.definitions:
n = makeelement(parent, 'w:num', w_numId=unicode_type(defn.num_id + 1))
makeelement(n, 'w:abstractNumId', w_val=unicode_type(defn.num_id))

View File

@@ -0,0 +1,768 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import numbers
from collections import Counter, defaultdict
from operator import attrgetter
from lxml import etree
from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import iteritems, filter, unicode_type
from tinycss.css21 import CSS21Parser
css_parser = CSS21Parser()
border_edges = ('left', 'top', 'right', 'bottom')
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
ignore = object()
def parse_css_font_family(raw):
decl, errs = css_parser.parse_style_attr('font-family:' + raw)
if decl:
for token in decl[0].value:
if token.type in 'STRING IDENT':
val = token.value
if val == 'inherit':
break
yield val
def css_font_family_to_docx(raw):
generic = {'serif':'Cambria', 'sansserif':'Candara', 'sans-serif':'Candara', 'fantasy':'Comic Sans', 'cursive':'Segoe Script'}
for ff in parse_css_font_family(raw):
return generic.get(ff.lower(), ff)
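# For example, a CSS declaration of "Liberation Serif", serif maps to
# "Liberation Serif" (the first recognised family wins), while a bare generic
# keyword such as sans-serif falls back to the substitute listed above
# (Candara).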
def bmap(x):
return 'on' if x else 'off'
def is_dropcaps(html_tag, tag_style):
return len(html_tag) < 2 and len(etree.tostring(html_tag, method='text', encoding='unicode', with_tail=False)) < 5 and tag_style['float'] == 'left'
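# A tag is treated as a drop cap when it floats left, has at most one child
# element and renders to fewer than five characters of text; FloatSpec below
# then emits a w:framePr with w:dropCap="drop" spanning three lines instead of
# a normal floating frame.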
class CombinedStyle(object):
def __init__(self, bs, rs, blocks, namespace):
self.bs, self.rs, self.blocks = bs, rs, blocks
self.namespace = namespace
self.id = self.name = self.seq = None
self.outline_level = None
def apply(self):
for block in self.blocks:
block.linked_style = self
for run in block.runs:
run.parent_style = self.rs
def serialize(self, styles, normal_style):
makeelement = self.namespace.makeelement
w = lambda x: '{%s}%s' % (self.namespace.namespaces['w'], x)
block = makeelement(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
makeelement(block, 'w:name', w_val=self.name)
makeelement(block, 'w:qFormat')
if self is not normal_style:
makeelement(block, 'w:basedOn', w_val=normal_style.id)
if self.seq == 0:
block.set(w('default'), '1')
pPr = makeelement(block, 'w:pPr')
self.bs.serialize_properties(pPr, normal_style.bs)
if self.outline_level is not None:
makeelement(pPr, 'w:outlineLvl', w_val=unicode_type(self.outline_level + 1))
rPr = makeelement(block, 'w:rPr')
self.rs.serialize_properties(rPr, normal_style.rs)
class FloatSpec(object):
def __init__(self, namespace, html_tag, tag_style):
self.makeelement = namespace.makeelement
self.is_dropcaps = is_dropcaps(html_tag, tag_style)
self.blocks = []
if self.is_dropcaps:
self.dropcaps_lines = 3
else:
self.x_align = tag_style['float']
self.w = self.h = None
if tag_style._get('width') != 'auto':
self.w = int(20 * max(tag_style['min-width'], tag_style['width']))
if tag_style._get('height') == 'auto':
self.h_rule = 'auto'
else:
if tag_style['min-height'] > 0:
self.h_rule, self.h = 'atLeast', tag_style['min-height']
else:
self.h_rule, self.h = 'exact', tag_style['height']
self.h = int(20 * self.h)
self.h_space = int(20 * max(tag_style['margin-right'], tag_style['margin-left']))
self.v_space = int(20 * max(tag_style['margin-top'], tag_style['margin-bottom']))
read_css_block_borders(self, tag_style)
def serialize(self, block, parent):
if self.is_dropcaps:
attrs = dict(w_dropCap='drop', w_lines=unicode_type(self.dropcaps_lines), w_wrap='around', w_vAnchor='text', w_hAnchor='text')
else:
attrs = dict(
w_wrap='around', w_vAnchor='text', w_hAnchor='text', w_xAlign=self.x_align, w_y='1',
w_hSpace=unicode_type(self.h_space), w_vSpace=unicode_type(self.v_space), w_hRule=self.h_rule
)
if self.w is not None:
attrs['w_w'] = unicode_type(self.w)
if self.h is not None:
attrs['w_h'] = unicode_type(self.h)
self.makeelement(parent, 'w:framePr', **attrs)
# Margins are already applied by the frame style, so override them to
# be zero on individual blocks
self.makeelement(parent, 'w:ind', w_left='0', w_leftChars='0', w_right='0', w_rightChars='0')
attrs = {}
if block is self.blocks[0]:
attrs.update(dict(w_before='0', w_beforeLines='0'))
if block is self.blocks[-1]:
attrs.update(dict(w_after='0', w_afterLines='0'))
if attrs:
self.makeelement(parent, 'w:spacing', **attrs)
# Similarly apply the same border and padding properties to all blocks
# in this floatspec
bdr = self.makeelement(parent, 'w:pBdr')
for edge in border_edges:
padding = getattr(self, 'padding_' + edge)
width = getattr(self, 'border_%s_width' % edge)
bstyle = getattr(self, 'border_%s_style' % edge)
self.makeelement(
bdr, 'w:'+edge, w_space=unicode_type(padding), w_val=bstyle, w_sz=unicode_type(width), w_color=getattr(self, 'border_%s_color' % edge))
class DOCXStyle(object):
ALL_PROPS = ()
TYPE = 'paragraph'
def __init__(self, namespace):
self.namespace = namespace
self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
self.id = self.name = None
self.next_style = None
self.calculate_hash()
def calculate_hash(self):
self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
def makeelement(self, parent, name, **attrs):
return parent.makeelement(self.w(name), **{self.w(k):v for k, v in iteritems(attrs)})
def __hash__(self):
return self._hash
def __eq__(self, other):
for x in self.ALL_PROPS:
if getattr(self, x) != getattr(other, x, None):
return False
return True
def __ne__(self, other):
return not self == other
def __repr__(self):
return etree.tostring(self.serialize(etree.Element(self.__class__.__name__, nsmap={'w':self.namespace.namespaces['w']})), pretty_print=True)
__str__ = __repr__
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
style.append(makeelement(style, 'name', val=self.name))
if self is not normal_style:
style.append(makeelement(style, 'basedOn', val=normal_style.id))
styles.append(style)
return style
LINE_STYLES = {
'none' : 'none',
'hidden': 'none',
'dotted': 'dotted',
'dashed': 'dashed',
'solid' : 'single',
'double': 'double',
'groove': 'threeDEngrave',
'ridge' : 'threeDEmboss',
'inset' : 'inset',
'outset': 'outset',
}
class TextStyle(DOCXStyle):
ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
'background_color', 'underline', 'strike', 'dstrike', 'caps',
'shadow', 'small_caps', 'spacing', 'vertical_align', 'padding',
'border_style', 'border_width', 'border_color')
TYPE = 'character'
def __init__(self, namespace, css, is_parent_style=False):
self.font_family = css_font_family_to_docx(css['font-family'])
try:
self.font_size = max(0, int(float(css['font-size']) * 2)) # stylizer normalizes all font sizes into pts
except (ValueError, TypeError, AttributeError):
self.font_size = None
fw = css['font-weight']
self.bold = (fw.lower() if hasattr(fw, 'lower') else fw) in {'bold', 'bolder'} or int_or_zero(fw) >= 700
self.italic = css['font-style'].lower() in {'italic', 'oblique'}
self.color = convert_color(css['color'])
self.background_color = None if is_parent_style else convert_color(css.backgroundColor)
td = set((css.effective_text_decoration or '').split())
self.underline = 'underline' in td
self.dstrike = 'line-through' in td and 'overline' in td
self.strike = not self.dstrike and 'line-through' in td
self.text_transform = css['text-transform'] # TODO: If lowercase or capitalize, transform the actual text
self.caps = self.text_transform == 'uppercase'
self.small_caps = css['font-variant'].lower() in {'small-caps', 'smallcaps'}
self.shadow = css['text-shadow'] not in {'none', None}
try:
self.spacing = int(float(css['letter-spacing']) * 20)
except (ValueError, TypeError, AttributeError):
self.spacing = None
va = css.first_vertical_align
if isinstance(va, numbers.Number):
self.vertical_align = unicode_type(int(va * 2))
else:
val = {
'top':'superscript', 'text-top':'superscript', 'sup':'superscript', 'super':'superscript',
'bottom':'subscript', 'text-bottom':'subscript', 'sub':'subscript'}.get(va)
self.vertical_align = val or 'baseline'
self.padding = self.border_color = self.border_width = self.border_style = None
if not is_parent_style:
# DOCX does not support individual borders/padding for inline content
for edge in border_edges:
# In DOCX padding can only be a positive integer
try:
padding = max(0, int(css['padding-' + edge]))
except ValueError:
padding = 0
if self.padding is None:
self.padding = padding
elif self.padding != padding:
self.padding = ignore
val = css['border-%s-width' % edge]
if not isinstance(val, numbers.Number):
val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
val = min(96, max(2, int(val * 8)))
if self.border_width is None:
self.border_width = val
elif self.border_width != val:
self.border_width = ignore
color = convert_color(css['border-%s-color' % edge])
if self.border_color is None:
self.border_color = color
elif self.border_color != color:
self.border_color = ignore
style = LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none')
if self.border_style is None:
self.border_style = style
elif self.border_style != style:
self.border_style = ignore
if self.padding in (None, ignore):
self.padding = 0
if self.border_width in (None, ignore):
self.border_width = 0
if self.border_style in (None, ignore):
self.border_style = 'none'
if self.border_color in (None, ignore):
self.border_color = 'auto'
if self.border_style == 'none':
self.border_width, self.border_color = 0, 'auto'
DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
w = self.w
is_normal_style = self is normal_style
if is_normal_style or self.padding != normal_style.padding:
bdr.set(w('space'), unicode_type(self.padding))
if is_normal_style or self.border_width != normal_style.border_width:
bdr.set(w('sz'), unicode_type(self.border_width))
if is_normal_style or self.border_style != normal_style.border_style:
bdr.set(w('val'), self.border_style)
if is_normal_style or self.border_color != normal_style.border_color:
bdr.set(w('color'), self.border_color)
return bdr
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'rPr')
self.serialize_properties(style, normal_style)
if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, rPr, normal_style):
makeelement = self.makeelement
is_normal_style = self is normal_style
if is_normal_style or self.font_family != normal_style.font_family:
rPr.append(makeelement(
rPr, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))
for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
val = getattr(self, attr)
if is_normal_style or getattr(normal_style, attr) != val:
for suffix in ('', 'Cs'):
rPr.append(makeelement(rPr, name + suffix, val=vmap(val)))
def check_attr(attr):
val = getattr(self, attr)
return is_normal_style or (val != getattr(normal_style, attr))
if check_attr('color'):
rPr.append(makeelement(rPr, 'color', val=self.color or 'auto'))
if check_attr('background_color'):
rPr.append(makeelement(rPr, 'shd', fill=self.background_color or 'auto'))
if check_attr('underline'):
rPr.append(makeelement(rPr, 'u', val='single' if self.underline else 'none'))
if check_attr('dstrike'):
rPr.append(makeelement(rPr, 'dstrike', val=bmap(self.dstrike)))
if check_attr('strike'):
rPr.append(makeelement(rPr, 'strike', val=bmap(self.strike)))
if check_attr('caps'):
rPr.append(makeelement(rPr, 'caps', val=bmap(self.caps)))
if check_attr('small_caps'):
rPr.append(makeelement(rPr, 'smallCaps', val=bmap(self.small_caps)))
if check_attr('shadow'):
rPr.append(makeelement(rPr, 'shadow', val=bmap(self.shadow)))
if check_attr('spacing'):
rPr.append(makeelement(rPr, 'spacing', val=unicode_type(self.spacing or 0)))
if is_normal_style:
rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align if self.vertical_align in {'superscript', 'subscript'} else 'baseline'))
elif self.vertical_align != normal_style.vertical_align:
if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align))
else:
rPr.append(makeelement(rPr, 'position', val=self.vertical_align))
bdr = self.serialize_borders(makeelement(rPr, 'bdr'), normal_style)
if bdr.attrib:
rPr.append(bdr)
class DescendantTextStyle(object):
def __init__(self, parent_style, child_style):
self.id = self.name = None
self.makeelement = child_style.makeelement
p = []
def add(name, **props):
p.append((name, frozenset(iteritems(props))))
def vals(attr):
return getattr(parent_style, attr), getattr(child_style, attr)
def check(attr):
pval, cval = vals(attr)
return pval != cval
if parent_style.font_family != child_style.font_family:
add('rFonts', **{k:child_style.font_family for k in 'ascii cs eastAsia hAnsi'.split()})
for name, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')):
pval, cval = vals(attr)
if pval != cval:
val = 'on' if attr in {'bold', 'italic'} else unicode_type(cval) # bold, italic are toggle properties
for suffix in ('', 'Cs'):
add(name + suffix, val=val)
if check('color'):
add('color', val=child_style.color or 'auto')
if check('background_color'):
add('shd', fill=child_style.background_color or 'auto')
if check('underline'):
add('u', val='single' if child_style.underline else 'none')
if check('dstrike'):
add('dstrike', val=bmap(child_style.dstrike))
if check('strike'):
add('strike', val='on') # toggle property
if check('caps'):
add('caps', val='on') # toggle property
if check('small_caps'):
add('smallCaps', val='on') # toggle property
if check('shadow'):
add('shadow', val='on') # toggle property
if check('spacing'):
add('spacing', val=unicode_type(child_style.spacing or 0))
if check('vertical_align'):
val = child_style.vertical_align
if val in {'superscript', 'subscript', 'baseline'}:
add('vertAlign', val=val)
else:
add('position', val=val)
bdr = {}
if check('padding'):
bdr['space'] = unicode_type(child_style.padding)
if check('border_width'):
bdr['sz'] = unicode_type(child_style.border_width)
if check('border_style'):
bdr['val'] = child_style.border_style
if check('border_color'):
bdr['color'] = child_style.border_color
if bdr:
add('bdr', **bdr)
self.properties = tuple(p)
self._hash = hash(self.properties)
def __hash__(self):
return self._hash
def __eq__(self, other):
return self.properties == other.properties
def __ne__(self, other):
return self.properties != other.properties
def serialize(self, styles):
makeelement = self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type='character')
style.append(makeelement(style, 'name', val=self.name))
rpr = makeelement(style, 'rPr')
style.append(rpr)
for name, attrs in self.properties:
rpr.append(makeelement(style, name, **dict(attrs)))
styles.append(style)
return style
def read_css_block_borders(self, css, store_css_style=False):
for edge in border_edges:
if css is None:
setattr(self, 'padding_' + edge, 0)
setattr(self, 'margin_' + edge, 0)
setattr(self, 'css_margin_' + edge, '')
setattr(self, 'border_%s_width' % edge, 2)
setattr(self, 'border_%s_color' % edge, None)
setattr(self, 'border_%s_style' % edge, 'none')
if store_css_style:
setattr(self, 'border_%s_css_style' % edge, 'none')
else:
# In DOCX padding can only be a positive integer
try:
setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge])))
except ValueError:
setattr(self, 'padding_' + edge, 0) # invalid value for padding
# In DOCX margin must be a positive integer in twips (twentieth of a point)
try:
setattr(self, 'margin_' + edge, max(0, int(css['margin-' + edge] * 20)))
except ValueError:
setattr(self, 'margin_' + edge, 0) # for e.g.: margin: auto
setattr(self, 'css_margin_' + edge, css._style.get('margin-' + edge, ''))
val = css['border-%s-width' % edge]
if not isinstance(val, numbers.Number):
val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
val = min(96, max(2, int(val * 8)))
setattr(self, 'border_%s_width' % edge, val)
setattr(self, 'border_%s_color' % edge, convert_color(css['border-%s-color' % edge]) or 'auto')
setattr(self, 'border_%s_style' % edge, LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none'))
if store_css_style:
setattr(self, 'border_%s_css_style' % edge, css['border-%s-style' % edge].lower())
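# Worked example of the unit conversions above (assuming the stylizer has
# already normalized lengths to points):
#   margin-top: 12pt        -> margin_top = 240        (twips, 1pt == 20 twips)
#   border-left-width: 1pt  -> border_left_width = 8   (eighths of a point, clamped to 2..96)
#   border-left-width: thin -> 0.2pt                   -> clamped up to 2
#   padding-bottom: 5pt     -> padding_bottom = 5      (whole points)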
class BlockStyle(DOCXStyle):
ALL_PROPS = tuple(
'text_align css_text_indent text_indent line_height background_color'.split(
) + ['margin_' + edge for edge in border_edges
] + ['css_margin_' + edge for edge in border_edges
] + [x%edge for edge in border_edges for x in border_props]
)
def __init__(self, namespace, css, html_block, is_table_cell=False, parent_bg=None):
read_css_block_borders(self, css)
if is_table_cell:
for edge in border_edges:
setattr(self, 'border_%s_style' % edge, 'none')
setattr(self, 'border_%s_width' % edge, 0)
setattr(self, 'padding_' + edge, 0)
setattr(self, 'margin_' + edge, 0)
if css is None:
self.text_indent = 0
self.css_text_indent = None
self.line_height = 280
self.background_color = None
self.text_align = 'left'
else:
try:
self.text_indent = int(css['text-indent'] * 20)
self.css_text_indent = css._get('text-indent')
except (TypeError, ValueError):
self.text_indent = 0
self.css_text_indent = None
try:
self.line_height = max(0, int(css.lineHeight * 20))
except (TypeError, ValueError):
self.line_height = max(0, int(1.2 * css.fontSize * 20))
self.background_color = None if is_table_cell else convert_color(css['background-color'])
if not is_table_cell and self.background_color is None:
self.background_color = parent_bg
try:
ws = css['white-space'].lower()
preserve_whitespace = ws in {'pre', 'pre-wrap'}
except Exception:
preserve_whitespace = False
try:
aval = css['text-align'].lower()
if preserve_whitespace:
aval = 'start'
self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get(
aval, 'left')
except AttributeError:
self.text_align = 'left'
DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
w = self.w
for edge in border_edges:
e = bdr.makeelement(w(edge))
padding = getattr(self, 'padding_' + edge)
if (self is normal_style and padding > 0) or (padding != getattr(normal_style, 'padding_' + edge)):
e.set(w('space'), unicode_type(padding))
width = getattr(self, 'border_%s_width' % edge)
bstyle = getattr(self, 'border_%s_style' % edge)
if (self is normal_style and width > 0 and bstyle != 'none'
) or width != getattr(normal_style, 'border_%s_width' % edge
) or bstyle != getattr(normal_style, 'border_%s_style' % edge):
e.set(w('val'), bstyle)
e.set(w('sz'), unicode_type(width))
e.set(w('color'), getattr(self, 'border_%s_color' % edge))
if e.attrib:
bdr.append(e)
return bdr
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'pPr')
self.serialize_properties(style, normal_style)
if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, pPr, normal_style):
makeelement, w = self.makeelement, self.w
spacing = makeelement(pPr, 'spacing')
for edge, attr in iteritems({'top':'before', 'bottom':'after'}):
getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self))
if css_unit in ('em', 'ex'):
lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
if (self is normal_style and lines > 0) or getter(self) != getter(normal_style):
spacing.set(w(attr + 'Lines'), unicode_type(lines))
else:
getter = attrgetter('margin_' + edge)
val = getter(self)
if (self is normal_style and val > 0) or val != getter(normal_style):
spacing.set(w(attr), unicode_type(val))
if self is normal_style or self.line_height != normal_style.line_height:
spacing.set(w('line'), unicode_type(self.line_height))
spacing.set(w('lineRule'), 'atLeast')
if spacing.attrib:
pPr.append(spacing)
ind = makeelement(pPr, 'ind')
for edge in ('left', 'right'):
getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self))
if css_unit in ('em', 'ex'):
chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
if (self is normal_style and chars > 0) or getter(self) != getter(normal_style):
ind.set(w(edge + 'Chars'), unicode_type(chars))
else:
getter = attrgetter('margin_' + edge)
val = getter(self)
if (self is normal_style and val > 0) or val != getter(normal_style):
ind.set(w(edge), unicode_type(val))
ind.set(w(edge + 'Chars'), '0') # This is needed to override any declaration in the parent style
css_val, css_unit = parse_css_length(self.css_text_indent)
if css_unit in ('em', 'ex'):
chars = int(css_val * (50 if css_unit == 'ex' else 100))
if css_val >= 0:
if (self is normal_style and chars > 0) or self.css_text_indent != normal_style.css_text_indent:
ind.set(w('firstLineChars'), unicode_type(chars))
else:
if (self is normal_style and chars < 0) or self.css_text_indent != normal_style.css_text_indent:
ind.set(w('hangingChars'), unicode_type(abs(chars)))
else:
val = self.text_indent
if val >= 0:
if (self is normal_style and val > 0) or self.text_indent != normal_style.text_indent:
ind.set(w('firstLine'), unicode_type(val))
ind.set(w('firstLineChars'), '0') # This is needed to override any declaration in the parent style
else:
if (self is normal_style and val < 0) or self.text_indent != normal_style.text_indent:
ind.set(w('hanging'), unicode_type(abs(val)))
ind.set(w('hangingChars'), '0')
if ind.attrib:
pPr.append(ind)
if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color:
pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))
pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
if len(pbdr):
pPr.append(pbdr)
if self is normal_style or self.text_align != normal_style.text_align:
pPr.append(makeelement(pPr, 'jc', val=self.text_align))
if self is not normal_style and self.next_style is not None:
pPr.append(makeelement(pPr, 'next', val=self.next_style))
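# Conversion sketch for the spacing/indent logic above (assuming the stylizer
# reports lengths in points, with the raw em/ex values kept in the css_* fields):
#   margin-top: 1.5em -> <w:spacing w:beforeLines="150"/>  (hundredths of a line)
#   margin-left: 2em  -> <w:ind w:leftChars="200"/>        (hundredths of a character)
#   text-indent: 18pt -> <w:ind w:firstLine="360"/>        (twips)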
class StylesManager(object):
def __init__(self, namespace, log, document_lang):
self.namespace = namespace
self.document_lang = lang_as_iso639_1(document_lang) or 'en'
self.log = log
self.block_styles, self.text_styles = {}, {}
self.styles_for_html_blocks = {}
def create_text_style(self, css_style, is_parent_style=False):
ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
existing = self.text_styles.get(ans, None)
if existing is None:
self.text_styles[ans] = ans
else:
ans = existing
return ans
def create_block_style(self, css_style, html_block, is_table_cell=False, parent_bg=None):
ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
existing = self.block_styles.get(ans, None)
if existing is None:
self.block_styles[ans] = ans
else:
ans = existing
self.styles_for_html_blocks[html_block] = ans
return ans
def finalize(self, all_blocks):
block_counts, run_counts = Counter(), Counter()
block_rmap, run_rmap = defaultdict(list), defaultdict(list)
used_pairs = defaultdict(list)
heading_styles = defaultdict(list)
headings = frozenset('h1 h2 h3 h4 h5 h6'.split())
pure_block_styles = set()
for block in all_blocks:
bs = block.style
block_counts[bs] += 1
block_rmap[block.style].append(block)
local_run_counts = Counter()
for run in block.runs:
count = run.style_weight
run_counts[run.style] += count
local_run_counts[run.style] += count
run_rmap[run.style].append(run)
if local_run_counts:
rs = local_run_counts.most_common(1)[0][0]
used_pairs[(bs, rs)].append(block)
if block.html_tag in headings:
heading_styles[block.html_tag].append((bs, rs))
else:
pure_block_styles.add(bs)
self.pure_block_styles = sorted(pure_block_styles, key=block_counts.__getitem__)
bnum = len(unicode_type(max(1, len(pure_block_styles) - 1)))
for i, bs in enumerate(self.pure_block_styles):
bs.id = bs.name = '%0{}d Block'.format(bnum) % i
bs.seq = i
if i == 0:
self.normal_pure_block_style = bs
counts = Counter()
smap = {}
for (bs, rs), blocks in iteritems(used_pairs):
s = CombinedStyle(bs, rs, blocks, self.namespace)
smap[(bs, rs)] = s
counts[s] += sum(1 for b in blocks if not b.is_empty())
for i, heading_tag in enumerate(sorted(heading_styles)):
styles = sorted((smap[k] for k in heading_styles[heading_tag]), key=counts.__getitem__)
styles = list(filter(lambda s:s.outline_level is None, styles))
if styles:
heading_style = styles[-1]
heading_style.outline_level = i
snum = len(unicode_type(max(1, len(counts) - 1)))
heading_styles = []
for i, (style, count) in enumerate(counts.most_common()):
if i == 0:
self.normal_style = style
style.id = style.name = 'Normal'
else:
if style.outline_level is None:
val = 'Para %0{}d'.format(snum) % i
else:
val = 'Heading %d' % (style.outline_level + 1)
heading_styles.append(style)
style.id = style.name = val
style.seq = i
self.combined_styles = sorted(counts, key=attrgetter('seq'))
[ls.apply() for ls in self.combined_styles]
descendant_style_map = {}
ds_counts = Counter()
for block in all_blocks:
for run in block.runs:
if run.parent_style is not run.style and run.parent_style and run.style:
ds = DescendantTextStyle(run.parent_style, run.style)
if ds.properties:
run.descendant_style = descendant_style_map.get(ds)
if run.descendant_style is None:
run.descendant_style = descendant_style_map[ds] = ds
ds_counts[run.descendant_style] += run.style_weight
rnum = len(unicode_type(max(1, len(ds_counts) - 1)))
for i, (text_style, count) in enumerate(ds_counts.most_common()):
text_style.id = 'Text%d' % i
text_style.name = '%0{}d Text'.format(rnum) % i
text_style.seq = i
self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq'))
self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, (
self.descendant_text_styles, self.combined_styles))))
self.primary_heading_style = None
if heading_styles:
heading_styles.sort(key=attrgetter('outline_level'))
self.primary_heading_style = heading_styles[0]
else:
ms = 0
for s in self.combined_styles:
if s.rs.font_size > ms:
self.primary_heading_style = s
ms = s.rs.font_size
def serialize(self, styles):
lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
for k in tuple(lang.attrib):
lang.attrib[k] = self.document_lang
for style in self.combined_styles:
style.serialize(styles, self.normal_style)
for style in self.descendant_text_styles:
style.serialize(styles)
for style in sorted(self.pure_block_styles, key=attrgetter('seq')):
style.serialize(styles, self.normal_pure_block_style)

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple
from calibre.ebooks.docx.writer.utils import convert_color
from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges
from polyglot.builtins import iteritems, range, unicode_type
class Dummy(object):
pass
Border = namedtuple('Border', 'css_style style width color level')
border_style_weight = {
x:100-i for i, x in enumerate(('double', 'solid', 'dashed', 'dotted', 'ridge', 'outset', 'groove', 'inset'))}
class SpannedCell(object):
def __init__(self, spanning_cell, horizontal=True):
self.spanning_cell = spanning_cell
self.horizontal = horizontal
self.row_span = self.col_span = 1
def resolve_borders(self):
pass
def serialize(self, tr, makeelement):
tc = makeelement(tr, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:%sMerge' % ('h' if self.horizontal else 'v'), w_val='continue')
makeelement(tc, 'w:p')
def applicable_borders(self, edge):
return self.spanning_cell.applicable_borders(edge)
def read_css_block_borders(self, css):
obj = Dummy()
rcbb(obj, css, store_css_style=True)
for edge in border_edges:
setattr(self, 'border_' + edge, Border(
getattr(obj, 'border_%s_css_style' % edge),
getattr(obj, 'border_%s_style' % edge),
getattr(obj, 'border_%s_width' % edge),
getattr(obj, 'border_%s_color' % edge),
self.BLEVEL
))
setattr(self, 'padding_' + edge, getattr(obj, 'padding_' + edge))
def as_percent(x):
if x and x.endswith('%'):
try:
return float(x.rstrip('%'))
except Exception:
pass
def convert_width(tag_style):
if tag_style is not None:
w = tag_style._get('width')
wp = as_percent(w)
if w == 'auto':
return ('auto', 0)
elif wp is not None:
return ('pct', int(wp * 50))
else:
try:
return ('dxa', int(float(tag_style['width']) * 20))
except Exception:
pass
return ('auto', 0)
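# convert_width() maps CSS widths onto the DOCX (w:type, w:w) pair, e.g.
# (assuming the stylizer reports absolute widths in points):
#   width: auto  -> ('auto', 0)
#   width: 50%   -> ('pct', 2500)   (fiftieths of a percent)
#   width: 100pt -> ('dxa', 2000)   (twips)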
class Cell(object):
BLEVEL = 2
def __init__(self, row, html_tag, tag_style=None):
self.row = row
self.table = self.row.table
self.html_tag = html_tag
try:
self.row_span = max(0, int(html_tag.get('rowspan', 1)))
except Exception:
self.row_span = 1
try:
self.col_span = max(0, int(html_tag.get('colspan', 1)))
except Exception:
self.col_span = 1
if tag_style is None:
self.valign = 'center'
else:
self.valign = {'top':'top', 'bottom':'bottom', 'middle':'center'}.get(tag_style._get('vertical-align'))
self.items = []
self.width = convert_width(tag_style)
self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
read_css_block_borders(self, tag_style)
def add_block(self, block):
self.items.append(block)
block.parent_items = self.items
def add_table(self, table):
self.items.append(table)
return table
def serialize(self, parent, makeelement):
tc = makeelement(parent, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
# For some reason, Word 2007 refuses to honor <w:shd> at the table or row
# level, despite what the specs say, so we inherit and apply at the
# cell level
bc = self.background_color or self.row.background_color or self.row.table.background_color
if bc:
makeelement(tcPr, 'w:shd', w_val="clear", w_color="auto", w_fill=bc)
b = makeelement(tcPr, 'w:tcBorders', append=False)
for edge, border in iteritems(self.borders):
if border is not None and border.width > 0 and border.style != 'none':
makeelement(b, 'w:' + edge, w_val=border.style, w_sz=unicode_type(border.width), w_color=border.color)
if len(b) > 0:
tcPr.append(b)
m = makeelement(tcPr, 'w:tcMar', append=False)
for edge in border_edges:
padding = getattr(self, 'padding_' + edge)
if edge in {'top', 'bottom'} or (edge == 'left' and self is self.row.first_cell) or (edge == 'right' and self is self.row.last_cell):
padding += getattr(self.row, 'padding_' + edge)
if padding > 0:
makeelement(m, 'w:' + edge, w_type='dxa', w_w=unicode_type(int(padding * 20)))
if len(m) > 0:
tcPr.append(m)
if self.valign is not None:
makeelement(tcPr, 'w:vAlign', w_val=self.valign)
if self.row_span > 1:
makeelement(tcPr, 'w:vMerge', w_val='restart')
if self.col_span > 1:
makeelement(tcPr, 'w:hMerge', w_val='restart')
item = None
for item in self.items:
item.serialize(tc)
if item is None or isinstance(item, Table):
# Word 2007 requires the last element in a table cell to be a paragraph
makeelement(tc, 'w:p')
def applicable_borders(self, edge):
if edge == 'left':
items = {self.table, self.row, self} if self.row.first_cell is self else {self}
elif edge == 'top':
items = ({self.table} if self.table.first_row is self.row else set()) | {self, self.row}
elif edge == 'right':
items = {self.table, self, self.row} if self.row.last_cell is self else {self}
elif edge == 'bottom':
items = ({self.table} if self.table.last_row is self.row else set()) | {self, self.row}
return {getattr(x, 'border_' + edge) for x in items}
def resolve_border(self, edge):
# In Word cell borders override table borders, and Word ignores row
# borders, so we consolidate all borders as cell borders
# In HTML the priority is as described here:
# http://www.w3.org/TR/CSS21/tables.html#border-conflict-resolution
neighbor = self.neighbor(edge)
borders = self.applicable_borders(edge)
if neighbor is not None:
nedge = {'left':'right', 'top':'bottom', 'right':'left', 'bottom':'top'}[edge]
borders |= neighbor.applicable_borders(nedge)
for b in borders:
if b.css_style == 'hidden':
return None
def weight(border):
return (
0 if border.css_style == 'none' else 1,
border.width,
border_style_weight.get(border.css_style, 0),
border.level)
border = sorted(borders, key=weight)[-1]
return border
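# Worked example of the weight() ordering above: with two competing,
# non-hidden borders such as
#   Border(css_style='solid',  style='single', width=24, color='000000', level=2)
#   Border(css_style='double', style='double', width=8,  color='000000', level=0)
# the solid border wins because width is compared before style weight and
# nesting level; a 'hidden' border on either side suppresses the edge entirely.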
def resolve_borders(self):
self.borders = {edge:self.resolve_border(edge) for edge in border_edges}
def neighbor(self, edge):
idx = self.row.cells.index(self)
ans = None
if edge == 'left':
ans = self.row.cells[idx-1] if idx > 0 else None
elif edge == 'right':
ans = self.row.cells[idx+1] if (idx + 1) < len(self.row.cells) else None
elif edge == 'top':
ridx = self.table.rows.index(self.row)
if ridx > 0 and idx < len(self.table.rows[ridx-1].cells):
ans = self.table.rows[ridx-1].cells[idx]
elif edge == 'bottom':
ridx = self.table.rows.index(self.row)
if ridx + 1 < len(self.table.rows) and idx < len(self.table.rows[ridx+1].cells):
ans = self.table.rows[ridx+1].cells[idx]
return getattr(ans, 'spanning_cell', ans)
class Row(object):
BLEVEL = 1
def __init__(self, table, html_tag, tag_style=None):
self.table = table
self.html_tag = html_tag
self.orig_tag_style = tag_style
self.cells = []
self.current_cell = None
self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
read_css_block_borders(self, tag_style)
@property
def first_cell(self):
return self.cells[0] if self.cells else None
@property
def last_cell(self):
return self.cells[-1] if self.cells else None
def start_new_cell(self, html_tag, tag_style):
self.current_cell = Cell(self, html_tag, tag_style)
def finish_tag(self, html_tag):
if self.current_cell is not None:
if html_tag is self.current_cell.html_tag:
self.cells.append(self.current_cell)
self.current_cell = None
def add_block(self, block):
if self.current_cell is None:
self.start_new_cell(self.html_tag, self.orig_tag_style)
self.current_cell.add_block(block)
def add_table(self, table):
if self.current_cell is None:
self.current_cell = Cell(self, self.html_tag, self.orig_tag_style)
return self.current_cell.add_table(table)
def serialize(self, parent, makeelement):
tr = makeelement(parent, 'w:tr')
for cell in self.cells:
cell.serialize(tr, makeelement)
class Table(object):
BLEVEL = 0
def __init__(self, namespace, html_tag, tag_style=None):
self.namespace = namespace
self.html_tag = html_tag
self.orig_tag_style = tag_style
self.rows = []
self.current_row = None
self.width = convert_width(tag_style)
self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
self.jc = None
self.float = None
self.margin_left = self.margin_right = self.margin_top = self.margin_bottom = None
if tag_style is not None:
ml, mr = tag_style._get('margin-left'), tag_style._get('margin-right')
if ml == 'auto':
self.jc = 'center' if mr == 'auto' else 'right'
self.float = tag_style['float']
for edge in border_edges:
setattr(self, 'margin_' + edge, tag_style['margin-' + edge])
read_css_block_borders(self, tag_style)
@property
def first_row(self):
return self.rows[0] if self.rows else None
@property
def last_row(self):
return self.rows[-1] if self.rows else None
def finish_tag(self, html_tag):
if self.current_row is not None:
self.current_row.finish_tag(html_tag)
if self.current_row.html_tag is html_tag:
self.rows.append(self.current_row)
self.current_row = None
table_ended = self.html_tag is html_tag
if table_ended:
self.expand_spanned_cells()
for row in self.rows:
for cell in row.cells:
cell.resolve_borders()
return table_ended
def expand_spanned_cells(self):
# Expand horizontally
for row in self.rows:
for cell in tuple(row.cells):
idx = row.cells.index(cell)
if cell.col_span > 1 and (cell is row.cells[-1] or not isinstance(row.cells[idx+1], SpannedCell)):
row.cells[idx:idx+1] = [cell] + [SpannedCell(cell, horizontal=True) for i in range(1, cell.col_span)]
# Expand vertically
for r, row in enumerate(self.rows):
for idx, cell in enumerate(row.cells):
if cell.row_span > 1:
for nrow in self.rows[r+1:]:
sc = SpannedCell(cell, horizontal=False)
try:
tcell = nrow.cells[idx]
except Exception:
tcell = None
if tcell is None:
nrow.cells.extend([SpannedCell(nrow.cells[-1], horizontal=True) for i in range(idx - len(nrow.cells))])
nrow.cells.append(sc)
else:
if isinstance(tcell, SpannedCell):
# Conflict between rowspan and colspan
break
else:
nrow.cells.insert(idx, sc)
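# Expansion sketch: for <tr><td colspan="2">A</td><td>B</td></tr> the row's
# cell list becomes [A, SpannedCell(A, horizontal=True), B], so every row ends
# up with one entry per grid column; a rowspan likewise inserts a vertical
# SpannedCell at the same column index into each following row it covers.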
def start_new_row(self, html_tag, html_style):
if self.current_row is not None:
self.rows.append(self.current_row)
self.current_row = Row(self, html_tag, html_style)
def start_new_cell(self, html_tag, html_style):
if self.current_row is None:
self.start_new_row(html_tag, None)
self.current_row.start_new_cell(html_tag, html_style)
def add_block(self, block):
self.current_row.add_block(block)
def add_table(self, table):
if self.current_row is None:
self.current_row = Row(self, self.html_tag, self.orig_tag_style)
return self.current_row.add_table(table)
def serialize(self, parent):
makeelement = self.namespace.makeelement
rows = [r for r in self.rows if r.cells]
if not rows:
return
tbl = makeelement(parent, 'w:tbl')
tblPr = makeelement(tbl, 'w:tblPr')
makeelement(tblPr, 'w:tblW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
if self.float in {'left', 'right'}:
kw = {'w_vertAnchor':'text', 'w_horzAnchor':'text', 'w_tblpXSpec':self.float}
for edge in border_edges:
val = getattr(self, 'margin_' + edge) or 0
if {self.float, edge} == {'left', 'right'}:
val = max(val, 2)
kw['w_' + edge + 'FromText'] = unicode_type(max(0, int(val *20)))
makeelement(tblPr, 'w:tblpPr', **kw)
if self.jc is not None:
makeelement(tblPr, 'w:jc', w_val=self.jc)
for row in rows:
row.serialize(tbl, makeelement)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.color3 import parse_color_string
def int_or_zero(raw):
try:
return int(raw)
except (ValueError, TypeError, AttributeError):
return 0
# convert_color() {{{
def convert_color(value):
if not value:
return
if value.lower() == 'currentcolor':
return 'auto'
val = parse_color_string(value)
if val is None:
return
if val.alpha < 0.01:
return
return '%02X%02X%02X' % (int(val.red * 255), int(val.green * 255), int(val.blue * 255))
def test_convert_color(return_tests=False):
import unittest
class TestColors(unittest.TestCase):
def test_color_conversion(self):
ae = self.assertEqual
cc = convert_color
ae(None, cc(None))
ae(None, cc('transparent'))
ae(None, cc('none'))
ae(None, cc('#12j456'))
ae('auto', cc('currentColor'))
ae('F0F8FF', cc('AliceBlue'))
ae('000000', cc('black'))
ae('FF0000', cc('red'))
ae('00FF00', cc('lime'))
ae('000011', cc('#001'))
ae('12345D', cc('#12345d'))
ae('FFFFFF', cc('rgb(255, 255, 255)'))
ae('FF0000', cc('rgba(255, 0, 0, 23)'))
tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestColors)
if return_tests:
return tests
unittest.TextTestRunner(verbosity=4).run(tests)
# }}}

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import defaultdict
from calibre.ebooks.oeb.base import urlnormalize, css_text
from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
from polyglot.builtins import iteritems, itervalues, unicode_type, range
from tinycss.fonts3 import parse_font_family
def get_font_properties(rule, default=None):
'''
Given a CSS rule, extract normalized font properties from
it. Note that the shorthand font property should already have been expanded
by the CSS flattening code.
'''
props = {}
s = rule.style
for q in ('font-family', 'src', 'font-weight', 'font-stretch',
'font-style'):
g = 'uri' if q == 'src' else 'value'
try:
val = s.getProperty(q).propertyValue[0]
val = getattr(val, g)
if q == 'font-family':
val = parse_font_family(css_text(s.getProperty(q).propertyValue))
if val and val[0] == 'inherit':
val = None
except (IndexError, KeyError, AttributeError, TypeError, ValueError):
val = None if q in {'src', 'font-family'} else default
if q in {'font-weight', 'font-stretch', 'font-style'}:
val = unicode_type(val).lower() if (val or val == 0) else val
if val == 'inherit':
val = default
if q == 'font-weight':
val = {'normal':'400', 'bold':'700'}.get(val, val)
if val not in {'100', '200', '300', '400', '500', '600', '700',
'800', '900', 'bolder', 'lighter'}:
val = default
if val == 'normal':
val = '400'
elif q == 'font-style':
if val not in {'normal', 'italic', 'oblique'}:
val = default
elif q == 'font-stretch':
if val not in {'normal', 'ultra-condensed', 'extra-condensed',
'condensed', 'semi-condensed', 'semi-expanded',
'expanded', 'extra-expanded', 'ultra-expanded'}:
val = default
props[q] = val
return props
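# Sketch of the normalized dict returned for a typical @font-face rule
# (family name and src are illustrative):
#   {'font-family': ['Foo Serif'], 'src': 'fonts/foo.ttf',
#    'font-weight': '700', 'font-style': 'normal', 'font-stretch': 'normal'}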
def find_font_face_rules(sheet, oeb):
'''
Find all @font-face rules in the given sheet and extract the relevant info from them.
sheet can be either a ManifestItem or a CSSStyleSheet.
'''
ans = []
try:
rules = sheet.data.cssRules
except AttributeError:
rules = sheet.cssRules
for i, rule in enumerate(rules):
if rule.type != rule.FONT_FACE_RULE:
continue
props = get_font_properties(rule, default='normal')
if not props['font-family'] or not props['src']:
continue
try:
path = sheet.abshref(props['src'])
except AttributeError:
path = props['src']
ff = oeb.manifest.hrefs.get(urlnormalize(path), None)
if not ff:
continue
props['item'] = ff
if props['font-weight'] in {'bolder', 'lighter'}:
props['font-weight'] = '400'
props['weight'] = int(props['font-weight'])
props['rule'] = rule
props['chars'] = set()
ans.append(props)
return ans
def elem_style(style_rules, cls, inherited_style):
'''
Find the effective style for the given element.
'''
classes = cls.split()
style = inherited_style.copy()
for cls in classes:
style.update(style_rules.get(cls, {}))
wt = style.get('font-weight', None)
pwt = inherited_style.get('font-weight', '400')
if wt == 'bolder':
style['font-weight'] = {
'100':'400',
'200':'400',
'300':'400',
'400':'700',
'500':'700',
}.get(pwt, '900')
elif wt == 'lighter':
style['font-weight'] = {
'600':'400', '700':'400',
'800':'700', '900':'700'}.get(pwt, '100')
return style
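# Relative font-weight resolution used above, e.g. with an inherited weight
# of '400': 'bolder' -> '700' and 'lighter' -> '100'; with an inherited
# weight of '800': 'bolder' -> '900' and 'lighter' -> '700'.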
class SubsetFonts(object):
'''
Subset all embedded fonts. Must be run after CSS flattening, as it requires
CSS normalization and flattening to work.
'''
def __call__(self, oeb, log, opts):
self.oeb, self.log, self.opts = oeb, log, opts
self.find_embedded_fonts()
if not self.embedded_fonts:
self.log.debug('No embedded fonts found')
return
self.find_style_rules()
self.find_font_usage()
totals = [0, 0]
def remove(font):
totals[1] += len(font['item'].data)
self.oeb.manifest.remove(font['item'])
font['rule'].parentStyleSheet.deleteRule(font['rule'])
fonts = {}
for font in self.embedded_fonts:
item, chars = font['item'], font['chars']
if item.href in fonts:
fonts[item.href]['chars'] |= chars
else:
fonts[item.href] = font
for font in itervalues(fonts):
if not font['chars']:
self.log('The font %s is unused. Removing it.'%font['src'])
remove(font)
continue
try:
raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
except NoGlyphs:
self.log('The font %s has no used glyphs. Removing it.'%font['src'])
remove(font)
continue
except UnsupportedFont as e:
self.log.warn('The font %s is unsupported for subsetting. %s'%(
font['src'], e))
sz = len(font['item'].data)
totals[0] += sz
totals[1] += sz
else:
font['item'].data = raw
nlen = sum(itervalues(new_stats))
olen = sum(itervalues(old_stats))
self.log('Decreased the font %s to %.1f%% of its original size'%
(font['src'], nlen/olen *100))
totals[0] += nlen
totals[1] += olen
font['item'].unload_data_from_memory()
if totals[0]:
self.log('Reduced total font size to %.1f%% of original'%
(totals[0]/totals[1] * 100))
def find_embedded_fonts(self):
'''
Find all @font-face rules and extract the relevant info from them.
'''
self.embedded_fonts = []
for item in self.oeb.manifest:
if not hasattr(item.data, 'cssRules'):
continue
self.embedded_fonts.extend(find_font_face_rules(item, self.oeb))
def find_style_rules(self):
'''
Extract all font related style information from all stylesheets into a
dict mapping classes to font properties specified by that class. All
the heavy lifting has already been done by the CSS flattening code.
'''
rules = defaultdict(dict)
for item in self.oeb.manifest:
if not hasattr(item.data, 'cssRules'):
continue
for i, rule in enumerate(item.data.cssRules):
if rule.type != rule.STYLE_RULE:
continue
props = {k:v for k,v in
iteritems(get_font_properties(rule)) if v}
if not props:
continue
for sel in rule.selectorList:
sel = sel.selectorText
if sel and sel.startswith('.'):
# We don't care about pseudo-selectors as the worst that
# can happen is some extra characters will remain in
# the font
sel = sel.partition(':')[0]
rules[sel[1:]].update(props)
self.style_rules = dict(rules)
def find_font_usage(self):
for item in self.oeb.manifest:
if not hasattr(item.data, 'xpath'):
continue
for body in item.data.xpath('//*[local-name()="body"]'):
base = {'font-family':['serif'], 'font-weight': '400',
'font-style':'normal', 'font-stretch':'normal'}
self.find_usage_in(body, base)
def used_font(self, style):
'''
Given a style find the embedded font that matches it. Returns None if
no match is found (can happen if no family matches).
'''
ff = style.get('font-family', [])
lnames = {unicode_type(x).lower() for x in ff}
matching_set = []
# Filter on font-family
for ef in self.embedded_fonts:
flnames = {x.lower() for x in ef.get('font-family', [])}
if not lnames.intersection(flnames):
continue
matching_set.append(ef)
if not matching_set:
return None
# Filter on font-stretch
widths = {x:i for i, x in enumerate(('ultra-condensed',
'extra-condensed', 'condensed', 'semi-condensed', 'normal',
'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
))}
width = widths[style.get('font-stretch', 'normal')]
for f in matching_set:
f['width'] = widths[f.get('font-stretch', 'normal')]
min_dist = min(abs(width-f['width']) for f in matching_set)
nearest = [f for f in matching_set if abs(width-f['width']) ==
min_dist]
if width <= 4:
lmatches = [f for f in nearest if f['width'] <= width]
else:
lmatches = [f for f in nearest if f['width'] >= width]
matching_set = (lmatches or nearest)
# Filter on font-style
fs = style.get('font-style', 'normal')
order = {
'oblique':['oblique', 'italic', 'normal'],
'normal':['normal', 'oblique', 'italic']
}.get(fs, ['italic', 'oblique', 'normal'])
for q in order:
matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
if matches:
matching_set = matches
break
# Filter on font weight
fw = int(style.get('font-weight', '400'))
if fw == 400:
q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
elif fw == 500:
q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
elif fw < 400:
q = [fw] + list(range(fw-100, -100, -100)) + list(range(fw+100, 1000, 100))
else:
q = [fw] + list(range(fw+100, 1000, 100)) + list(range(fw-100, -100, -100))
for wt in q:
matches = [f for f in matching_set if f['weight'] == wt]
if matches:
return matches[0]
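# Weight fallback sketch: for a requested font-weight of 300 the preference
# order tried above is
#   [300, 200, 100, 0, 400, 500, 600, 700, 800, 900]
# i.e. lighter candidates first, then progressively heavier ones, mirroring
# the CSS font matching heuristic.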
def find_chars(self, elem):
ans = set()
if elem.text:
ans |= set(elem.text)
for child in elem:
if child.tail:
ans |= set(child.tail)
return ans
def find_usage_in(self, elem, inherited_style):
style = elem_style(self.style_rules, elem.get('class', '') or '', inherited_style)
for child in elem:
self.find_usage_in(child, style)
font = self.used_font(style)
if font:
chars = self.find_chars(elem)
if chars:
font['chars'] |= chars

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs, zlib, numbers
from io import BytesIO
from datetime import datetime
from calibre.constants import plugins, ispy3
from calibre.utils.logging import default_log
from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr
from polyglot.binary import as_hex_bytes
pdf_float = plugins['speedup'][0].pdf_float
EOL = b'\n'
# Sizes {{{
inch = 72.0
cm = inch / 2.54
mm = cm * 0.1
pica = 12.0
didot = 0.375 * mm
cicero = 12 * didot
_W, _H = (21*cm, 29.7*cm)
A6 = (_W*.5, _H*.5)
A5 = (_H*.5, _W)
A4 = (_W, _H)
A3 = (_H, _W*2)
A2 = (_W*2, _H*2)
A1 = (_H*2, _W*4)
A0 = (_W*4, _H*4)
LETTER = (8.5*inch, 11*inch)
LEGAL = (8.5*inch, 14*inch)
ELEVENSEVENTEEN = (11*inch, 17*inch)
_BW, _BH = (25*cm, 35.3*cm)
B6 = (_BW*.5, _BH*.5)
B5 = (_BH*.5, _BW)
B4 = (_BW, _BH)
B3 = (_BH, _BW*2)
B2 = (_BW*2, _BH*2)
B1 = (_BH*2, _BW*4)
B0 = (_BW*4, _BH*4)
PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2'
' b3 b4 b5 b6 letter legal').split()}
# }}}
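# All of the sizes above are in PostScript points, e.g.
#   PAPER_SIZES['letter'] == (612.0, 792.0)
#   PAPER_SIZES['a4']     ~= (595.28, 841.89)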
def fmtnum(o):
if isinstance(o, float):
return pdf_float(o)
return unicode_type(o)
def serialize(o, stream):
if isinstance(o, float):
stream.write_raw(pdf_float(o).encode('ascii'))
elif isinstance(o, bool):
# Must check bool before int as bools are subclasses of int
stream.write_raw(b'true' if o else b'false')
elif isinstance(o, numbers.Integral):
stream.write_raw(unicode_type(o).encode('ascii') if ispy3 else bytes(o))
elif hasattr(o, 'pdf_serialize'):
o.pdf_serialize(stream)
elif o is None:
stream.write_raw(b'null')
elif isinstance(o, datetime):
val = o.strftime("D:%Y%m%d%H%M%%02d%z")%min(59, o.second)
if datetime.tzinfo is not None:
val = "(%s'%s')"%(val[:-2], val[-2:])
stream.write(val.encode('ascii'))
else:
raise ValueError('Unknown object: %r'%o)
class Name(unicode_type):
def pdf_serialize(self, stream):
raw = self.encode('ascii')
if len(raw) > 126:
raise ValueError('Name too long: %r'%self)
raw = bytearray(raw)
sharp = ord(b'#')
buf = (
codepoint_to_chr(x).encode('ascii') if 33 < x < 126 and x != sharp else
'#{:x}'.format(x).encode('ascii') for x in raw)
stream.write(b'/'+b''.join(buf))
def escape_pdf_string(bytestring):
indices = []
bad = []
ba = bytearray(bytestring)
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('t'), 92:ord('\\')}
for i, num in enumerate(ba):
if num == 40: # (
indices.append((i, 40))
elif num == 41: # )
if indices:
indices.pop()
else:
bad.append((i, 41))
elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
bad.append((i, bad_map[num]))
bad = sorted(indices + bad, reverse=True)
if not bad:
return bytestring
for i, repl in bad:
ba[i:i+1] = (92, repl) # 92 = ord('\')
return bytes(ba)
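# Worked example: balanced parentheses are left alone, while unbalanced ones
# and the control characters from Table 3.2 are backslash-escaped:
#   escape_pdf_string(b'a(b)c')  -> b'a(b)c'
#   escape_pdf_string(b'a(b\nc') -> b'a\\(b\\nc'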
class String(unicode_type):
def pdf_serialize(self, stream):
try:
raw = self.encode('latin1')
if raw.startswith(codecs.BOM_UTF16_BE):
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
except UnicodeEncodeError:
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
stream.write(b'('+escape_pdf_string(raw)+b')')
class UTF16String(unicode_type):
def pdf_serialize(self, stream):
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
if False:
# Disabled as the parentheses based strings give easier to debug
# PDF files
stream.write(b'<' + as_hex_bytes(raw) + b'>')
else:
stream.write(b'('+escape_pdf_string(raw)+b')')
class Dictionary(dict):
def pdf_serialize(self, stream):
stream.write(b'<<' + EOL)
sorted_keys = sorted(self,
key=lambda x:({'Type':'1', 'Subtype':'2'}.get(
x, x)+x))
for k in sorted_keys:
serialize(Name(k), stream)
stream.write(b' ')
serialize(self[k], stream)
stream.write(EOL)
stream.write(b'>>' + EOL)
class InlineDictionary(Dictionary):
def pdf_serialize(self, stream):
stream.write(b'<< ')
for k, v in iteritems(self):
serialize(Name(k), stream)
stream.write(b' ')
serialize(v, stream)
stream.write(b' ')
stream.write(b'>>')
class Array(list):
def pdf_serialize(self, stream):
stream.write(b'[')
for i, o in enumerate(self):
if i != 0:
stream.write(b' ')
serialize(o, stream)
stream.write(b']')
class Stream(BytesIO):
def __init__(self, compress=False):
BytesIO.__init__(self)
self.compress = compress
self.filters = Array()
def add_extra_keys(self, d):
pass
def pdf_serialize(self, stream):
raw = self.getvalue()
dl = len(raw)
filters = self.filters
if self.compress:
filters.append(Name('FlateDecode'))
raw = zlib.compress(raw)
d = InlineDictionary({'Length':len(raw), 'DL':dl})
self.add_extra_keys(d)
if filters:
d['Filter'] = filters
serialize(d, stream)
stream.write(EOL+b'stream'+EOL)
stream.write(raw)
stream.write(EOL+b'endstream'+EOL)
def write_line(self, raw=b''):
self.write(raw if isinstance(raw, bytes) else raw.encode('ascii'))
self.write(EOL)
def write(self, raw):
super(Stream, self).write(raw if isinstance(raw, bytes) else
raw.encode('ascii'))
def write_raw(self, raw):
BytesIO.write(self, raw)
class Reference(object):
def __init__(self, num, obj):
self.num, self.obj = num, obj
def pdf_serialize(self, stream):
raw = '%d 0 R'%self.num
stream.write(raw.encode('ascii'))
def __repr__(self):
return '%d 0 R'%self.num
def __str__(self):
return repr(self)
# }}}
def current_log(newlog=None):
if newlog:
current_log.ans = newlog
return current_log.ans or default_log
current_log.ans = None

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from datetime import datetime, timedelta
def align_block(raw, multiple=4, pad=b'\0'):
'''
Return raw with enough pad bytes appended to ensure its length is a
multiple of `multiple` (4 by default).
'''
extra = len(raw) % multiple
if extra == 0:
return raw
return raw + pad*(multiple - extra)
class UnknownTable(object):
def __init__(self, raw):
self.raw = raw
def __call__(self):
return self.raw
def __len__(self):
return len(self.raw)
class DateTimeProperty(object):
def __init__(self, name):
self.name = name
def __get__(self, obj, type=None):
return datetime(1904, 1, 1) + timedelta(seconds=getattr(obj,
self.name))
def __set__(self, obj, val):
td = val - datetime(1904, 1, 1)
setattr(obj, self.name, int(td.total_seconds()))
class FixedProperty(object):
def __init__(self, name):
self.name = name
def __get__(self, obj, type=None):
val = getattr(obj, self.name)
return val / 0x10000
def __set__(self, obj, val):
setattr(obj, self.name, int(round(val*(0x10000))))
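# FixedProperty exposes the 16.16 fixed-point numbers used in sfnt tables as
# floats, e.g. a stored value of 0x18000 reads back as 1.5, and assigning 1.5
# stores int(round(1.5 * 0x10000)) == 0x18000.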
def max_power_of_two(x):
"""
Return the highest exponent of two, so that
(2 ** exponent) <= x
"""
exponent = 0
while x:
x = x >> 1
exponent += 1
return max(exponent - 1, 0)
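# e.g. max_power_of_two(1024) == 10 and max_power_of_two(5) == 2,
# since 2 ** 2 == 4 <= 5 < 8 == 2 ** 3.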
def load_font(stream_or_path):
raw = stream_or_path
if hasattr(raw, 'read'):
raw = raw.read()
from calibre.utils.fonts.sfnt.container import Sfnt
return Sfnt(raw)

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# cff_standard_strings {{{
# The 391 Standard Strings as used in the CFF format.
# from Adobe Technical Note #5176, version 1.0, 18 March 1998
cff_standard_strings = [
'.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', 'dollar', 'percent',
'ampersand', 'quoteright', 'parenleft', 'parenright', 'asterisk', 'plus',
'comma', 'hyphen', 'period', 'slash', 'zero', 'one', 'two', 'three', 'four',
'five', 'six', 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'bracketleft', 'backslash', 'bracketright', 'asciicircum', 'underscore',
'quoteleft', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'braceleft',
'bar', 'braceright', 'asciitilde', 'exclamdown', 'cent', 'sterling',
'fraction', 'yen', 'florin', 'section', 'currency', 'quotesingle',
'quotedblleft', 'guillemotleft', 'guilsinglleft', 'guilsinglright', 'fi', 'fl',
'endash', 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
'quotesinglbase', 'quotedblbase', 'quotedblright', 'guillemotright',
'ellipsis', 'perthousand', 'questiondown', 'grave', 'acute', 'circumflex',
'tilde', 'macron', 'breve', 'dotaccent', 'dieresis', 'ring', 'cedilla',
'hungarumlaut', 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', 'oslash', 'oe',
'germandbls', 'onesuperior', 'logicalnot', 'mu', 'trademark', 'Eth', 'onehalf',
'plusminus', 'Thorn', 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
'threequarters', 'twosuperior', 'registered', 'minus', 'eth', 'multiply',
'threesuperior', 'copyright', 'Aacute', 'Acircumflex', 'Adieresis', 'Agrave',
'Aring', 'Atilde', 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', 'Oacute',
'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', 'Scaron', 'Uacute',
'Ucircumflex', 'Udieresis', 'Ugrave', 'Yacute', 'Ydieresis', 'Zcaron',
'aacute', 'acircumflex', 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla',
'eacute', 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', 'odieresis',
'ograve', 'otilde', 'scaron', 'uacute', 'ucircumflex', 'udieresis', 'ugrave',
'yacute', 'ydieresis', 'zcaron', 'exclamsmall', 'Hungarumlautsmall',
'dollaroldstyle', 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', 'onedotenleader',
'zerooldstyle', 'oneoldstyle', 'twooldstyle', 'threeoldstyle', 'fouroldstyle',
'fiveoldstyle', 'sixoldstyle', 'sevenoldstyle', 'eightoldstyle',
'nineoldstyle', 'commasuperior', 'threequartersemdash', 'periodsuperior',
'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', 'dsuperior',
'esuperior', 'isuperior', 'lsuperior', 'msuperior', 'nsuperior', 'osuperior',
'rsuperior', 'ssuperior', 'tsuperior', 'ff', 'ffi', 'ffl', 'parenleftinferior',
'parenrightinferior', 'Circumflexsmall', 'hyphensuperior', 'Gravesmall',
'Asmall', 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', 'Hsmall',
'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', 'Nsmall', 'Osmall', 'Psmall',
'Qsmall', 'Rsmall', 'Ssmall', 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall',
'Ysmall', 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', 'Zcaronsmall',
'Dieresissmall', 'Brevesmall', 'Caronsmall', 'Dotaccentsmall', 'Macronsmall',
'figuredash', 'hypheninferior', 'Ogoneksmall', 'Ringsmall', 'Cedillasmall',
'questiondownsmall', 'oneeighth', 'threeeighths', 'fiveeighths',
'seveneighths', 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', 'threeinferior',
'fourinferior', 'fiveinferior', 'sixinferior', 'seveninferior',
'eightinferior', 'nineinferior', 'centinferior', 'dollarinferior',
'periodinferior', 'commainferior', 'Agravesmall', 'Aacutesmall',
'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', 'Aringsmall', 'AEsmall',
'Ccedillasmall', 'Egravesmall', 'Eacutesmall', 'Ecircumflexsmall',
'Edieresissmall', 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', 'Oacutesmall',
'Ocircumflexsmall', 'Otildesmall', 'Odieresissmall', 'OEsmall', 'Oslashsmall',
'Ugravesmall', 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', '001.001', '001.002',
'001.003', 'Black', 'Bold', 'Book', 'Light', 'Medium', 'Regular', 'Roman',
'Semibold'
]
# }}}
STANDARD_CHARSETS = [ # {{{
# ISOAdobe
(".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar",
"percent", "ampersand", "quoteright", "parenleft", "parenright",
"asterisk", "plus", "comma", "hyphen", "period", "slash", "zero",
"one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
"colon", "semicolon", "less", "equal", "greater", "question", "at",
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
"O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
"bracketleft", "backslash", "bracketright", "asciicircum",
"underscore", "quoteleft", "a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "braceleft", "bar", "braceright", "asciitilde",
"exclamdown", "cent", "sterling", "fraction", "yen", "florin",
"section", "currency", "quotesingle", "quotedblleft", "guillemotleft",
"guilsinglleft", "guilsinglright", "fi", "fl", "endash", "dagger",
"daggerdbl", "periodcentered", "paragraph", "bullet", "quotesinglbase",
"quotedblbase", "quotedblright", "guillemotright", "ellipsis",
"perthousand", "questiondown", "grave", "acute", "circumflex", "tilde",
"macron", "breve", "dotaccent", "dieresis", "ring", "cedilla",
"hungarumlaut", "ogonek", "caron", "emdash", "AE", "ordfeminine",
"Lslash", "Oslash", "OE", "ordmasculine", "ae", "dotlessi", "lslash",
"oslash", "oe", "germandbls", "onesuperior", "logicalnot", "mu",
"trademark", "Eth", "onehalf", "plusminus", "Thorn", "onequarter",
"divide", "brokenbar", "degree", "thorn", "threequarters",
"twosuperior", "registered", "minus", "eth", "multiply",
"threesuperior", "copyright", "Aacute", "Acircumflex", "Adieresis",
"Agrave", "Aring", "Atilde", "Ccedilla", "Eacute", "Ecircumflex",
"Edieresis", "Egrave", "Iacute", "Icircumflex", "Idieresis", "Igrave",
"Ntilde", "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde",
"Scaron", "Uacute", "Ucircumflex", "Udieresis", "Ugrave", "Yacute",
"Ydieresis", "Zcaron", "aacute", "acircumflex", "adieresis", "agrave",
"aring", "atilde", "ccedilla", "eacute", "ecircumflex", "edieresis",
"egrave", "iacute", "icircumflex", "idieresis", "igrave", "ntilde",
"oacute", "ocircumflex", "odieresis", "ograve", "otilde", "scaron",
"uacute", "ucircumflex", "udieresis", "ugrave", "yacute", "ydieresis",
"zcaron"),
# Expert
("notdef", "space", "exclamsmall", "Hungarumlautsmall", "dollaroldstyle",
"dollarsuperior", "ampersandsmall", "Acutesmall", "parenleftsuperior",
"parenrightsuperior", "twodotenleader", "onedotenleader", "comma",
"hyphen", "period", "fraction", "zerooldstyle", "oneoldstyle",
"twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle",
"sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle",
"colon", "semicolon", "commasuperior", "threequartersemdash",
"periodsuperior", "questionsmall", "asuperior", "bsuperior",
"centsuperior", "dsuperior", "esuperior", "isuperior", "lsuperior",
"msuperior", "nsuperior", "osuperior", "rsuperior", "ssuperior",
"tsuperior", "ff", "fi", "fl", "ffi", "ffl", "parenleftinferior",
"parenrightinferior", "Circumflexsmall", "hyphensuperior",
"Gravesmall", "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall",
"Fsmall", "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall",
"Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall",
"Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall",
"colonmonetary", "onefitted", "rupiah", "Tildesmall",
"exclamdownsmall", "centoldstyle", "Lslashsmall", "Scaronsmall",
"Zcaronsmall", "Dieresissmall", "Brevesmall", "Caronsmall",
"Dotaccentsmall", "Macronsmall", "figuredash", "hypheninferior",
"Ogoneksmall", "Ringsmall", "Cedillasmall", "onequarter", "onehalf",
"threequarters", "questiondownsmall", "oneeighth", "threeeighths",
"fiveeighths", "seveneighths", "onethird", "twothirds", "zerosuperior",
"onesuperior", "twosuperior", "threesuperior", "foursuperior",
"fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior",
"ninesuperior", "zeroinferior", "oneinferior", "twoinferior",
"threeinferior", "fourinferior", "fiveinferior", "sixinferior",
"seveninferior", "eightinferior", "nineinferior", "centinferior",
"dollarinferior", "periodinferior", "commainferior", "Agravesmall",
"Aacutesmall", "Acircumflexsmall", "Atildesmall", "Adieresissmall",
"Aringsmall", "AEsmall", "Ccedillasmall", "Egravesmall", "Eacutesmall",
"Ecircumflexsmall", "Edieresissmall", "Igravesmall", "Iacutesmall",
"Icircumflexsmall", "Idieresissmall", "Ethsmall", "Ntildesmall",
"Ogravesmall", "Oacutesmall", "Ocircumflexsmall", "Otildesmall",
"Odieresissmall", "OEsmall", "Oslashsmall", "Ugravesmall",
"Uacutesmall", "Ucircumflexsmall", "Udieresissmall", "Yacutesmall",
"Thornsmall", "Ydieresissmall"),
# Expert Subset
(".notdef", "space", "dollaroldstyle", "dollarsuperior",
"parenleftsuperior", "parenrightsuperior", "twodotenleader",
"onedotenleader", "comma", "hyphen", "period", "fraction",
"zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle",
"fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle",
"eightoldstyle", "nineoldstyle", "colon", "semicolon",
"commasuperior", "threequartersemdash", "periodsuperior",
"asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior",
"isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior",
"rsuperior", "ssuperior", "tsuperior", "ff", "fi", "fl", "ffi",
"ffl", "parenleftinferior", "parenrightinferior", "hyphensuperior",
"colonmonetary", "onefitted", "rupiah", "centoldstyle",
"figuredash", "hypheninferior", "onequarter", "onehalf",
"threequarters", "oneeighth", "threeeighths", "fiveeighths",
"seveneighths", "onethird", "twothirds", "zerosuperior",
"onesuperior", "twosuperior", "threesuperior", "foursuperior",
"fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior",
"ninesuperior", "zeroinferior", "oneinferior", "twoinferior",
"threeinferior", "fourinferior", "fiveinferior", "sixinferior",
"seveninferior", "eightinferior", "nineinferior", "centinferior",
"dollarinferior", "periodinferior", "commainferior"),
] # }}}
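# Illustrative sketch, not part of the committed file: a string index (SID)
# below 391 resolves directly into cff_standard_strings, and a predefined
# charset id (0, 1 or 2) selects one of the three tuples above.
assert cff_standard_strings[13] == 'comma'     # SID 13 in the CFF spec
assert STANDARD_CHARSETS[0][3] == 'quotedbl'   # glyph 3 of the ISOAdobe charset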

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import pack, unpack_from
from polyglot.builtins import range, unicode_type
t1_operand_encoding = [None] * 256
t1_operand_encoding[0:32] = (32) * ["do_operator"]
t1_operand_encoding[32:247] = (247 - 32) * ["read_byte"]
t1_operand_encoding[247:251] = (251 - 247) * ["read_small_int1"]
t1_operand_encoding[251:255] = (255 - 251) * ["read_small_int2"]
t1_operand_encoding[255] = "read_long_int"
t2_operand_encoding = t1_operand_encoding[:]
t2_operand_encoding[28] = "read_short_int"
t2_operand_encoding[255] = "read_fixed_1616"
cff_dict_operand_encoding = t2_operand_encoding[:]
cff_dict_operand_encoding[29] = "read_long_int"
cff_dict_operand_encoding[30] = "read_real_number"
cff_dict_operand_encoding[255] = "reserved"
real_nibbles = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'.', 'E', 'E-', None, '-']
real_nibbles_map = {x:i for i, x in enumerate(real_nibbles)}
class ByteCode(dict):
def read_byte(self, b0, data, index):
return b0 - 139, index
def read_small_int1(self, b0, data, index):
b1 = ord(data[index:index+1])
return (b0-247)*256 + b1 + 108, index+1
def read_small_int2(self, b0, data, index):
b1 = ord(data[index:index+1])
return -(b0-251)*256 - b1 - 108, index+1
def read_short_int(self, b0, data, index):
value, = unpack_from(b">h", data, index)
return value, index+2
def read_long_int(self, b0, data, index):
value, = unpack_from(b">l", data, index)
return value, index+4
def read_fixed_1616(self, b0, data, index):
value, = unpack_from(b">l", data, index)
return value / 65536.0, index+4
def read_real_number(self, b0, data, index):
number = ''
while True:
b = ord(data[index:index+1])
index = index + 1
nibble0 = (b & 0xf0) >> 4
nibble1 = b & 0x0f
if nibble0 == 0xf:
break
number = number + real_nibbles[nibble0]
if nibble1 == 0xf:
break
number = number + real_nibbles[nibble1]
return float(number), index
def write_float(self, f, encoding='ignored'):
s = unicode_type(f).upper()
if s[:2] == "0.":
s = s[1:]
elif s[:3] == "-0.":
s = "-" + s[2:]
nibbles = []
while s:
c = s[0]
s = s[1:]
if c == "E" and s[:1] == "-":
s = s[1:]
c = "E-"
nibbles.append(real_nibbles_map[c])
nibbles.append(0xf)
if len(nibbles) % 2:
nibbles.append(0xf)
d = bytearray([30])
for i in range(0, len(nibbles), 2):
d.append(nibbles[i] << 4 | nibbles[i+1])
return bytes(d)
def write_int(self, value, encoding="cff"):
four_byte_op = {'cff':29, 't1':255}.get(encoding, None)
if -107 <= value <= 107:
code = bytes(bytearray([value + 139]))
elif 108 <= value <= 1131:
value = value - 108
code = bytes(bytearray([(value >> 8) + 247, (value & 0xFF)]))
elif -1131 <= value <= -108:
value = -value - 108
code = bytes(bytearray([(value >> 8) + 251, (value & 0xFF)]))
elif four_byte_op is None:
# T2 only supports 2 byte ints
code = bytes(bytearray([28])) + pack(b">h", value)
else:
code = bytes(bytearray([four_byte_op])) + pack(b">l", value)
return code
def write_offset(self, value):
return bytes(bytearray([29])) + pack(b">l", value)
def write_number(self, value, encoding="cff"):
f = self.write_float if isinstance(value, float) else self.write_int
return f(value, encoding)
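# Illustrative sketch, not part of the committed file: round-tripping the
# operand encodings defined above.
bc = ByteCode()
code = bc.write_int(1000)                      # two-byte form: bytes([250, 124])
b0 = ord(code[0:1])
assert t1_operand_encoding[b0] == 'read_small_int1'
assert bc.read_small_int1(b0, code, 1) == (1000, 2)
real = bc.write_float(0.039625)                # nibble-encoded real, 0xf-terminated
assert bc.read_real_number(0x1e, real, 1) == (0.039625, len(real))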
class Dict(ByteCode):
operand_encoding = cff_dict_operand_encoding
TABLE = ()
FILTERED = frozenset()
OFFSETS = frozenset()
def __init__(self):
ByteCode.__init__(self)
self.operators = {op:(name, arg) for op, name, arg, default in
self.TABLE}
self.defaults = {name:default for op, name, arg, default in self.TABLE}
def safe_get(self, name):
return self.get(name, self.defaults[name])
def decompile(self, strings, global_subrs, data):
self.strings = strings
self.global_subrs = global_subrs
self.stack = []
index = 0
while index < len(data):
b0 = ord(data[index:index+1])
index += 1
handler = getattr(self, self.operand_encoding[b0])
value, index = handler(b0, data, index)
if value is not None:
self.stack.append(value)
def do_operator(self, b0, data, index):
if b0 == 12:
op = (b0, ord(data[index:index+1]))
index += 1
else:
op = b0
operator, arg_type = self.operators[op]
self.handle_operator(operator, arg_type)
return None, index
def handle_operator(self, operator, arg_type):
if isinstance(arg_type, tuple):
value = ()
for i in range(len(arg_type)-1, -1, -1):
arg = arg_type[i]
arghandler = getattr(self, 'arg_' + arg)
value = (arghandler(operator),) + value
else:
arghandler = getattr(self, 'arg_' + arg_type)
value = arghandler(operator)
self[operator] = value
def arg_number(self, name):
return self.stack.pop()
def arg_SID(self, name):
return self.strings[self.stack.pop()]
def arg_array(self, name):
ans = self.stack[:]
del self.stack[:]
return ans
def arg_delta(self, name):
out = []
current = 0
for v in self.stack:
current = current + v
out.append(current)
del self.stack[:]
return out
def compile(self, strings):
data = []
for op, name, arg, default in self.TABLE:
if name in self.FILTERED:
continue
val = self.safe_get(name)
opcode = bytes(bytearray(op if isinstance(op, tuple) else [op]))
if val != self.defaults[name]:
self.encoding_offset = name in self.OFFSETS
if isinstance(arg, tuple):
if len(val) != len(arg):
raise ValueError('Invalid argument %s for operator: %s'
%(val, op))
for typ, v in zip(arg, val):
if typ == 'SID':
val = strings(val)
data.append(getattr(self, 'encode_'+typ)(v))
else:
if arg == 'SID':
val = strings(val)
data.append(getattr(self, 'encode_'+arg)(val))
data.append(opcode)
self.raw = b''.join(data)
return self.raw
def encode_number(self, val):
if self.encoding_offset:
return self.write_offset(val)
return self.write_number(val)
def encode_SID(self, val):
return self.write_int(val)
def encode_array(self, val):
return b''.join(map(self.encode_number, val))
def encode_delta(self, value):
out = []
last = 0
for v in value:
out.append(v - last)
last = v
return self.encode_array(out)
class TopDict(Dict):
TABLE = (
# opcode name argument type default
((12, 30), 'ROS', ('SID','SID','number'), None,),
((12, 20), 'SyntheticBase', 'number', None,),
(0, 'version', 'SID', None,),
(1, 'Notice', 'SID', None,),
((12, 0), 'Copyright', 'SID', None,),
(2, 'FullName', 'SID', None,),
((12, 38), 'FontName', 'SID', None,),
(3, 'FamilyName', 'SID', None,),
(4, 'Weight', 'SID', None,),
((12, 1), 'isFixedPitch', 'number', 0,),
((12, 2), 'ItalicAngle', 'number', 0,),
((12, 3), 'UnderlinePosition', 'number', None,),
((12, 4), 'UnderlineThickness', 'number', 50,),
((12, 5), 'PaintType', 'number', 0,),
((12, 6), 'CharstringType', 'number', 2,),
((12, 7), 'FontMatrix', 'array', [0.001,0,0,0.001,0,0],),
(13, 'UniqueID', 'number', None,),
(5, 'FontBBox', 'array', [0,0,0,0],),
((12, 8), 'StrokeWidth', 'number', 0,),
(14, 'XUID', 'array', None,),
((12, 21), 'PostScript', 'SID', None,),
((12, 22), 'BaseFontName', 'SID', None,),
((12, 23), 'BaseFontBlend', 'delta', None,),
((12, 31), 'CIDFontVersion', 'number', 0,),
((12, 32), 'CIDFontRevision', 'number', 0,),
((12, 33), 'CIDFontType', 'number', 0,),
((12, 34), 'CIDCount', 'number', 8720,),
(15, 'charset', 'number', 0,),
((12, 35), 'UIDBase', 'number', None,),
(16, 'Encoding', 'number', 0,),
(18, 'Private', ('number','number'), None,),
((12, 37), 'FDSelect', 'number', None,),
((12, 36), 'FDArray', 'number', None,),
(17, 'CharStrings', 'number', None,),
)
# We will not write these operators out
FILTERED = {'ROS', 'SyntheticBase', 'UniqueID', 'XUID',
'CIDFontVersion', 'CIDFontRevision', 'CIDFontType', 'CIDCount',
'UIDBase', 'Encoding', 'FDSelect', 'FDArray'}
OFFSETS = {'charset', 'Encoding', 'CharStrings', 'Private'}
class PrivateDict(Dict):
TABLE = (
# opcode name argument type default
(6, 'BlueValues', 'delta', None,),
(7, 'OtherBlues', 'delta', None,),
(8, 'FamilyBlues', 'delta', None,),
(9, 'FamilyOtherBlues', 'delta', None,),
((12, 9), 'BlueScale', 'number', 0.039625,),
((12, 10), 'BlueShift', 'number', 7,),
((12, 11), 'BlueFuzz', 'number', 1,),
(10, 'StdHW', 'number', None,),
(11, 'StdVW', 'number', None,),
((12, 12), 'StemSnapH', 'delta', None,),
((12, 13), 'StemSnapV', 'delta', None,),
((12, 14), 'ForceBold', 'number', 0,),
((12, 15), 'ForceBoldThreshold', 'number', None,), # deprecated
((12, 16), 'lenIV', 'number', None,), # deprecated
((12, 17), 'LanguageGroup', 'number', 0,),
((12, 18), 'ExpansionFactor', 'number', 0.06,),
((12, 19), 'initialRandomSeed', 'number', 0,),
(20, 'defaultWidthX', 'number', 0,),
(21, 'nominalWidthX', 'number', 0,),
(19, 'Subrs', 'number', None,),
)
OFFSETS = {'Subrs'}
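# Illustrative sketch, not part of the committed file: 'delta' operands such as
# BlueValues are stored as successive differences, and arg_delta() above
# rebuilds the absolute values when decompiling.
pd = PrivateDict()
pd.stack = [-20, 20, 466, 10]                  # operands as read from the DICT
assert pd.arg_delta('BlueValues') == [-20, 0, 466, 476]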

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, unpack, calcsize
from functools import partial
from calibre.utils.fonts.sfnt import UnknownTable
from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
from calibre.utils.fonts.sfnt.cff.dict_data import TopDict, PrivateDict
from calibre.utils.fonts.sfnt.cff.constants import (cff_standard_strings,
STANDARD_CHARSETS)
from polyglot.builtins import iteritems, itervalues, range
# Useful links
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5176.CFF.pdf
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5177.Type2.pdf
class CFF(object):
def __init__(self, raw):
(self.major_version, self.minor_version, self.header_size,
self.offset_size) = unpack_from(b'>4B', raw)
if (self.major_version, self.minor_version) != (1, 0):
raise UnsupportedFont('The CFF table has unknown version: '
'(%d, %d)'%(self.major_version, self.minor_version))
offset = self.header_size
# Read Names Index
self.font_names = Index(raw, offset)
offset = self.font_names.pos
if len(self.font_names) > 1:
raise UnsupportedFont('CFF table has more than one font.')
# Read Top Dict
self.top_index = Index(raw, offset)
self.top_dict = TopDict()
offset = self.top_index.pos
# Read strings
self.strings = Strings(raw, offset)
offset = self.strings.pos
# Read global subroutines
self.global_subrs = Subrs(raw, offset)
offset = self.global_subrs.pos
# Decompile Top Dict
self.top_dict.decompile(self.strings, self.global_subrs, self.top_index[0])
self.is_CID = 'ROS' in self.top_dict
if self.is_CID:
raise UnsupportedFont('Subsetting of CID keyed fonts is not supported')
# Read CharStrings (Glyph definitions)
try:
offset = self.top_dict['CharStrings']
except KeyError:
raise ValueError('This font has no CharStrings')
cs_type = self.top_dict.safe_get('CharstringType')
if cs_type != 2:
raise UnsupportedFont('This font has unsupported CharstringType: '
'%s'%cs_type)
self.char_strings = CharStringsIndex(raw, offset)
self.num_glyphs = len(self.char_strings)
# Read Private Dict
self.private_dict = self.private_subrs = None
pd = self.top_dict.safe_get('Private')
if pd:
size, offset = pd
self.private_dict = PrivateDict()
self.private_dict.decompile(self.strings, self.global_subrs,
raw[offset:offset+size])
if 'Subrs' in self.private_dict:
self.private_subrs = Subrs(raw, offset +
self.private_dict['Subrs'])
# Read charset (Glyph names)
self.charset = Charset(raw, self.top_dict.safe_get('charset'),
self.strings, self.num_glyphs, self.is_CID)
# import pprint
# pprint.pprint(self.top_dict)
# pprint.pprint(self.private_dict)
class Index(list):
def __init__(self, raw, offset, prepend=()):
list.__init__(self)
self.extend(prepend)
count = unpack_from(b'>H', raw, offset)[0]
offset += 2
self.pos = offset
if count > 0:
self.offset_size = unpack_from(b'>B', raw, offset)[0]
offset += 1
if self.offset_size == 3:
offsets = [unpack(b'>L', b'\0' + raw[i:i+3])[0]
for i in range(offset, offset+3*(count+1), 3)]
else:
fmt = {1:'B', 2:'H', 4:'L'}[self.offset_size]
fmt = ('>%d%s'%(count+1, fmt)).encode('ascii')
offsets = unpack_from(fmt, raw, offset)
offset += self.offset_size * (count+1) - 1
for i in range(len(offsets)-1):
off, noff = offsets[i:i+2]
obj = raw[offset+off:offset+noff]
self.append(obj)
try:
self.pos = offset + offsets[-1]
except IndexError:
self.pos = offset
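# Minimal sketch, not part of the committed file, of the CFF INDEX layout this
# class parses: 16-bit count, one-byte offSize, count+1 one-based offsets, then
# the concatenated object data.
raw = b'\x00\x02' + b'\x01' + b'\x01\x04\x08' + b'abcdefg'
idx = Index(raw, 0)
assert list(idx) == [b'abc', b'defg']
assert idx.pos == len(raw)                     # first byte past the INDEX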
class Strings(Index):
def __init__(self, raw, offset):
super(Strings, self).__init__(raw, offset, prepend=[x.encode('ascii')
for x in cff_standard_strings])
class Charset(list):
def __init__(self, raw, offset, strings, num_glyphs, is_CID):
super(Charset, self).__init__()
self.standard_charset = offset if offset in {0, 1, 2} else None
if is_CID and self.standard_charset is not None:
raise ValueError("CID font must not use a standard charset")
if self.standard_charset is None:
self.append(b'.notdef')
fmt = unpack_from(b'>B', raw, offset)[0]
offset += 1
f = {0:self.parse_fmt0, 1:self.parse_fmt1,
2:partial(self.parse_fmt1, is_two_byte=True)}.get(fmt, None)
if f is None:
raise UnsupportedFont('This font uses unsupported charset '
'table format: %d'%fmt)
f(raw, offset, strings, num_glyphs, is_CID)
def parse_fmt0(self, raw, offset, strings, num_glyphs, is_CID):
fmt = ('>%dH'%(num_glyphs-1)).encode('ascii')
ids = unpack_from(fmt, raw, offset)
if is_CID:
ids = ('cid%05d'%x for x in ids)
else:
ids = (strings[x] for x in ids)
self.extend(ids)
def parse_fmt1(self, raw, offset, strings, num_glyphs, is_CID,
is_two_byte=False):
fmt = b'>2H' if is_two_byte else b'>HB'
sz = calcsize(fmt)
count = 1
while count < num_glyphs:
first, nleft = unpack_from(fmt, raw, offset)
offset += sz
count += nleft + 1
self.extend('cid%05d'%x if is_CID else strings[x] for x in
range(first, first + nleft+1))
def lookup(self, glyph_id):
if self.standard_charset is None:
return self[glyph_id]
return STANDARD_CHARSETS[self.standard_charset][glyph_id].encode('ascii')
def safe_lookup(self, glyph_id):
try:
return self.lookup(glyph_id)
except (KeyError, IndexError, ValueError):
return None
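# Sketch, not part of the committed file: a charset offset of 0, 1 or 2 selects
# one of the predefined charsets, so lookup() reads STANDARD_CHARSETS directly
# instead of parsing table data.
cs = Charset(b'', 0, None, 0, False)           # offset 0 == ISOAdobe charset
assert cs.lookup(3) == b'quotedbl'
assert cs.safe_lookup(10 ** 6) is None         # out-of-range ids return None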
class Subrs(Index):
pass
class CharStringsIndex(Index):
pass
class CFFTable(UnknownTable):
def decompile(self):
self.cff = CFF(self.raw)
def subset(self, character_map, extra_glyphs):
from calibre.utils.fonts.sfnt.cff.writer import Subset
# Map codes from the cmap table to glyph names, this will be used to
# reconstruct character_map for the subset font
charset_map = {code:self.cff.charset.safe_lookup(glyph_id) for code,
glyph_id in iteritems(character_map)}
charset = set(itervalues(charset_map))
charset.discard(None)
if not charset and character_map:
raise NoGlyphs('This font has no glyphs for the specified characters')
charset |= {
self.cff.charset.safe_lookup(glyph_id) for glyph_id in extra_glyphs}
charset.discard(None)
s = Subset(self.cff, charset)
# Rebuild character_map with the glyph ids from the subset font
character_map.clear()
for code, charname in iteritems(charset_map):
glyph_id = s.charname_map.get(charname, None)
if glyph_id:
character_map[code] = glyph_id
# Check that raw is parseable
CFF(s.raw)
self.raw = s.raw

View File

@@ -0,0 +1,290 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# Note that the code for creating a BMP table (cmap format 4) is taken with
# thanks from the fonttools project (BSD licensed).
from struct import unpack_from, calcsize, pack
from collections import OrderedDict
from calibre.utils.fonts.utils import read_bmp_prefix
from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import range
def split_range(start_code, end_code, cmap): # {{{
# Try to split a range of character codes into subranges with consecutive
# glyph IDs in such a way that the cmap4 subtable can be stored "most"
# efficiently.
if start_code == end_code:
return [], [end_code]
last_id = cmap[start_code]
last_code = start_code
in_order = None
ordered_begin = None
sub_ranges = []
# Gather subranges in which the glyph IDs are consecutive.
for code in range(start_code + 1, end_code + 1):
glyph_id = cmap[code]
if glyph_id - 1 == last_id:
if in_order is None or not in_order:
in_order = 1
ordered_begin = last_code
else:
if in_order:
in_order = 0
sub_ranges.append((ordered_begin, last_code))
ordered_begin = None
last_id = glyph_id
last_code = code
if in_order:
sub_ranges.append((ordered_begin, last_code))
assert last_code == end_code
# Now filter out those new subranges that would only make the data bigger.
# A new segment cost 8 bytes, not using a new segment costs 2 bytes per
# character.
new_ranges = []
for b, e in sub_ranges:
if b == start_code and e == end_code:
break # the whole range, we're fine
if b == start_code or e == end_code:
threshold = 4 # split costs one more segment
else:
threshold = 8 # split costs two more segments
if (e - b + 1) > threshold:
new_ranges.append((b, e))
sub_ranges = new_ranges
if not sub_ranges:
return [], [end_code]
if sub_ranges[0][0] != start_code:
sub_ranges.insert(0, (start_code, sub_ranges[0][0] - 1))
if sub_ranges[-1][1] != end_code:
sub_ranges.append((sub_ranges[-1][1] + 1, end_code))
# Fill the "holes" in the segments list -- those are the segments in which
# the glyph IDs are _not_ consecutive.
i = 1
while i < len(sub_ranges):
if sub_ranges[i-1][1] + 1 != sub_ranges[i][0]:
sub_ranges.insert(i, (sub_ranges[i-1][1] + 1, sub_ranges[i][0] - 1))
i = i + 1
i = i + 1
# Transform the ranges into start_code/end_code lists.
start = []
end = []
for b, e in sub_ranges:
start.append(b)
end.append(e)
start.pop(0)
assert len(start) + 1 == len(end)
return start, end
# }}}
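# Worked sketch, not part of the committed file: when the glyph ids for an
# entire code range are already consecutive, split_range() keeps it as a single
# segment; only a sufficiently long consecutive run in the middle of a range
# would be split out into its own segment by the threshold logic above.
demo_cmap = {code: code + 100 for code in range(10, 30)}
assert split_range(10, 29, demo_cmap) == ([], [29])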
def set_id_delta(id_delta): # {{{
# The lowest gid in glyphIndexArray, after subtracting id_delta, must be 1.
# id_delta is a short, and must be between -32K and 32K
# startCode can be between 0 and 64K-1, and the first glyph index can be between 1 and 64K-1
# This means that we have a problem because we may need to assign id_delta
# values between -(64K-2) and 64K-1.
# Since the final gid is reconstructed from the glyphIndexArray GID by
#   (unsigned short)finalGID = (gid + id_delta) % 0x10000,
# we can get from a startCode of 0 to a final GID of 64K-1 by subtracting 1 and
# casting the negative number to an unsigned short.
# Similarly, we can get from a startCode of 64K-1 to a final GID of 1 by adding
# 2, because of the modulo arithmetic.
if id_delta > 0x7FFF:
id_delta = id_delta - 0x10000
elif id_delta < -0x7FFF:
id_delta = id_delta + 0x10000
return id_delta
# }}}
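# Worked sketch, not part of the committed file, of the wrap-around described
# above: a start code near 64K mapping to glyph 1 needs a delta outside the
# signed-short range, and the modulo arithmetic makes the folded value work.
start_code, first_gid = 0xFFF0, 1
delta = set_id_delta(first_gid - start_code)   # -65519 folds to 17
assert -0x8000 <= delta <= 0x7FFF
assert (start_code + delta) % 0x10000 == first_gid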
class BMPTable(object):
def __init__(self, raw):
self.raw = raw
(self.start_count, self.end_count, self.range_offset, self.id_delta,
self.glyph_id_len, self.glyph_id_map, self.array_len) = \
read_bmp_prefix(raw, 0)
def get_glyph_ids(self, codes):
for code in codes:
found = False
for i, ec in enumerate(self.end_count):
if ec >= code:
sc = self.start_count[i]
if sc <= code:
found = True
ro = self.range_offset[i]
if ro == 0:
glyph_id = self.id_delta[i] + code
else:
idx = ro//2 + (code - sc) + i - self.array_len
glyph_id = self.glyph_id_map[idx]
if glyph_id != 0:
glyph_id += self.id_delta[i]
yield glyph_id % 0x10000
break
if not found:
yield 0
def get_glyph_map(self, glyph_ids):
ans = {}
for i, ec in enumerate(self.end_count):
sc = self.start_count[i]
for code in range(sc, ec+1):
ro = self.range_offset[i]
if ro == 0:
glyph_id = self.id_delta[i] + code
else:
idx = ro//2 + (code - sc) + i - self.array_len
glyph_id = self.glyph_id_map[idx]
if glyph_id != 0:
glyph_id += self.id_delta[i]
glyph_id %= 0x10000
if glyph_id in glyph_ids and code not in ans:
ans[code] = glyph_id
return ans
class CmapTable(UnknownTable):
def __init__(self, *args, **kwargs):
super(CmapTable, self).__init__(*args, **kwargs)
self.version, self.num_tables = unpack_from(b'>HH', self.raw)
self.tables = {}
offset = 4
sz = calcsize(b'>HHL')
recs = []
for i in range(self.num_tables):
platform, encoding, table_offset = unpack_from(b'>HHL', self.raw,
offset)
offset += sz
recs.append((platform, encoding, table_offset))
self.bmp_table = None
for i in range(len(recs)):
platform, encoding, offset = recs[i]
try:
next_offset = recs[i+1][-1]
except IndexError:
next_offset = len(self.raw)
table = self.raw[offset:next_offset]
if table:
fmt = unpack_from(b'>H', table)[0]
if platform == 3 and encoding == 1 and fmt == 4:
self.bmp_table = BMPTable(table)
def get_character_map(self, chars):
'''
Get a mapping of character codes to glyph ids in the font.
'''
if self.bmp_table is None:
raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
' Most likely a special purpose font.')
chars = sorted(set(chars))
ans = OrderedDict()
for i, glyph_id in enumerate(self.bmp_table.get_glyph_ids(chars)):
if glyph_id > 0:
ans[chars[i]] = glyph_id
return ans
def get_glyph_map(self, glyph_ids):
'''
Get a mapping of character codes to glyph ids for the specified glyph
ids.
'''
if self.bmp_table is None:
raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
' Most likely a special purpose font.')
glyph_ids = frozenset(glyph_ids)
return self.bmp_table.get_glyph_map(glyph_ids)
def set_character_map(self, cmap):
self.version, self.num_tables = 0, 1
fmt = b'>7H'
codes = sorted(cmap)
if not codes:
start_code = [0xffff]
end_code = [0xffff]
else:
last_code = codes[0]
end_code = []
start_code = [last_code]
for code in codes[1:]:
if code == last_code + 1:
last_code = code
continue
start, end = split_range(start_code[-1], last_code, cmap)
start_code.extend(start)
end_code.extend(end)
start_code.append(code)
last_code = code
end_code.append(last_code)
start_code.append(0xffff)
end_code.append(0xffff)
id_delta = []
id_range_offset = []
glyph_index_array = []
for i in range(len(end_code)-1): # skip the closing codes (0xffff)
indices = list(cmap[char_code] for char_code in range(start_code[i], end_code[i] + 1))
if indices == list(range(indices[0], indices[0] + len(indices))):
# indices is a contiguous list
id_delta_temp = set_id_delta(indices[0] - start_code[i])
id_delta.append(id_delta_temp)
id_range_offset.append(0)
else:
id_delta.append(0)
id_range_offset.append(2 * (len(end_code) + len(glyph_index_array) - i))
glyph_index_array.extend(indices)
id_delta.append(1) # 0xffff + 1 == 0. So this end code maps to .notdef
id_range_offset.append(0)
seg_count = len(end_code)
max_exponent = max_power_of_two(seg_count)
search_range = 2 * (2 ** max_exponent)
entry_selector = max_exponent
range_shift = 2 * seg_count - search_range
char_code_array = end_code + [0] + start_code
char_code_array = pack(b'>%dH'%len(char_code_array), *char_code_array)
id_delta_array = pack(b'>%dh'%len(id_delta), *id_delta)
rest_array = id_range_offset + glyph_index_array
rest_array = pack(b'>%dH'%len(rest_array), *rest_array)
data = char_code_array + id_delta_array + rest_array
length = calcsize(fmt) + len(data)
header = pack(fmt, 4, length, 0, 2*seg_count, search_range, entry_selector, range_shift)
self.bmp_table = header + data
fmt = b'>4HL'
offset = calcsize(fmt)
self.raw = pack(fmt, self.version, self.num_tables, 3, 1, offset) + self.bmp_table
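# Usage sketch, not part of the committed file, assuming the UnknownTable base
# class takes the raw table bytes as its only constructor argument (as its use
# in the Sfnt container later in this commit suggests).
demo = CmapTable(b'\x00\x00\x00\x00')          # version 0, zero subtables
demo.set_character_map({0x41: 36, 0x42: 37})   # 'A' -> glyph 36, 'B' -> glyph 37
# demo.raw now holds a single format 4 (platform 3, encoding 1) subtable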

View File

@@ -0,0 +1,252 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize
from collections import OrderedDict, namedtuple
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import range, iteritems
class Unpackable(object):
def __init__(self, raw, offset):
self.raw, self.offset = raw, offset
self.start_pos = offset
def unpack(self, fmt, single_special=True):
fmt = fmt.encode('ascii') if not isinstance(fmt, bytes) else fmt
ans = unpack_from(b'>'+fmt, self.raw, self.offset)
if single_special and len(ans) == 1:
ans = ans[0]
self.offset += calcsize(fmt)
return ans
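# Sketch, not part of the committed file, of the little byte-cursor helper
# above: single values are unwrapped, tuples are returned as-is, and the offset
# advances by the size of each format.
data = Unpackable(b'\x00\x03abcd', 0)
assert data.unpack('H') == 3
assert data.unpack('2s2s', single_special=False) == (b'ab', b'cd')
assert data.offset == 6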
class SimpleListTable(list):
'A table that contains a list of subtables'
child_class = None
def __init__(self, raw, offset):
list.__init__(self)
data = Unpackable(raw, offset)
self.read_extra_header(data)
count = data.unpack('H')
for i in range(count):
offset = data.unpack('H')
self.append(self.child_class(raw, data.start_pos + offset))
self.read_extra_footer(data)
def read_extra_header(self, data):
pass
def read_extra_footer(self, data):
pass
class ListTable(OrderedDict):
'A table that contains an ordered mapping of table tag to subtable'
child_class = None
def __init__(self, raw, offset):
OrderedDict.__init__(self)
data = Unpackable(raw, offset)
self.read_extra_header(data)
count = data.unpack('H')
for i in range(count):
tag, coffset = data.unpack('4sH')
self[tag] = self.child_class(raw, data.start_pos + coffset)
self.read_extra_footer(data)
def read_extra_header(self, data):
pass
def read_extra_footer(self, data):
pass
def dump(self, prefix=''):
print(prefix, self.__class__.__name__, sep='')
prefix += ' '
for tag, child in iteritems(self):
print(prefix, tag, sep='')
child.dump(prefix=prefix+' ')
class IndexTable(list):
def __init__(self, raw, offset):
data = Unpackable(raw, offset)
self.read_extra_header(data)
count = data.unpack('H')
for i in range(count):
self.append(data.unpack('H'))
def read_extra_header(self, data):
pass
def dump(self, prefix=''):
print(prefix, self.__class__.__name__, sep='')
class LanguageSystemTable(IndexTable):
def read_extra_header(self, data):
self.lookup_order, self.required_feature_index = data.unpack('2H')
if self.lookup_order != 0:
raise UnsupportedFont('This LanguageSystemTable has an unknown'
' lookup order: 0x%x'%self.lookup_order)
class ScriptTable(ListTable):
child_class = LanguageSystemTable
def __init__(self, raw, offset):
ListTable.__init__(self, raw, offset)
def read_extra_header(self, data):
start_pos = data.offset
default_offset = data.unpack('H')
self[b'default'] = (LanguageSystemTable(data.raw, start_pos +
default_offset) if default_offset else None)
class ScriptListTable(ListTable):
child_class = ScriptTable
class FeatureTable(IndexTable):
def read_extra_header(self, data):
self.feature_params = data.unpack('H')
if False and self.feature_params != 0:
# Source code pro sets this to non NULL
raise UnsupportedFont(
'This FeatureTable has non NULL FeatureParams: 0x%x'%self.feature_params)
class FeatureListTable(ListTable):
child_class = FeatureTable
class LookupTable(SimpleListTable):
def read_extra_header(self, data):
self.lookup_type, self.lookup_flag = data.unpack('2H')
self.set_child_class()
def set_child_class(self):
raise NotImplementedError()
def read_extra_footer(self, data):
if self.lookup_flag & 0x0010:
self.mark_filtering_set = data.unpack('H')
def ExtensionSubstitution(raw, offset, subtable_map={}):
data = Unpackable(raw, offset)
subst_format, extension_lookup_type, offset = data.unpack('2HL')
if subst_format != 1:
raise UnsupportedFont('ExtensionSubstitution has unknown format: 0x%x'%subst_format)
return subtable_map[extension_lookup_type](raw, offset+data.start_pos)
CoverageRange = namedtuple('CoverageRange', 'start end start_coverage_index')
class Coverage(object):
def __init__(self, raw, offset, parent_table_name):
data = Unpackable(raw, offset)
self.format, count = data.unpack('2H')
if self.format not in {1, 2}:
raise UnsupportedFont('Unknown Coverage format: 0x%x in %s'%(
self.format, parent_table_name))
if self.format == 1:
self.glyph_ids = data.unpack('%dH'%count, single_special=False)
self.glyph_ids_map = {gid:i for i, gid in
enumerate(self.glyph_ids)}
else:
self.ranges = []
ranges = data.unpack('%dH'%(3*count), single_special=False)
for i in range(count):
start, end, start_coverage_index = ranges[i*3:(i+1)*3]
self.ranges.append(CoverageRange(start, end, start_coverage_index))
def coverage_indices(self, glyph_ids):
'''Return map of glyph_id -> coverage index. Map contains only those
glyph_ids that are covered by this table and that are present in
glyph_ids.'''
ans = OrderedDict()
for gid in glyph_ids:
if self.format == 1:
idx = self.glyph_ids_map.get(gid, None)
if idx is not None:
ans[gid] = idx
else:
for start, end, start_coverage_index in self.ranges:
if start <= gid <= end:
ans[gid] = start_coverage_index + (gid-start)
return ans
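# Sketch, not part of the committed file: a format 1 coverage table is just a
# sorted glyph list, and the coverage index is the position within that list.
cov_raw = b'\x00\x01\x00\x03' + b'\x00\x05\x00\x09\x00\x0b'   # glyphs 5, 9, 11
cov = Coverage(cov_raw, 0, 'demo')
assert cov.coverage_indices({9, 11, 99}) == {9: 1, 11: 2}     # 99 is not covered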
class UnknownLookupSubTable(object):
formats = {}
def __init__(self, raw, offset):
data = Unpackable(raw, offset)
self.format = data.unpack('H')
if self.format not in self.formats:
raise UnsupportedFont('Unknown format for Lookup Subtable %s: 0x%x'%(
self.__class__.__name__, self.format))
if self.has_initial_coverage:
coverage_offset = data.unpack('H') + data.start_pos
self.coverage = Coverage(raw, coverage_offset, self.__class__.__name__)
self.initialize(data)
@property
def has_initial_coverage(self):
return True
def all_substitutions(self, glyph_ids):
''' Return a set of all glyph ids that could be substituted for any
subset of the specified glyph ids (which must be a set)'''
raise NotImplementedError()
def read_sets(self, data, read_item=None, set_is_index=False):
count = data.unpack('H')
sets = data.unpack('%dH'%count, single_special=False)
coverage_to_items_map = []
for offset in sets:
# Read items in the set
data.offset = start_pos = offset + data.start_pos
count = data.unpack('H')
item_offsets = data.unpack('%dH'%count, single_special=False)
items = []
for offset in item_offsets:
data.offset = offset + start_pos
if set_is_index:
items.append(offset)
else:
items.append(read_item(data))
coverage_to_items_map.append(items)
return coverage_to_items_map

View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
# License: GPLv3 Copyright: 2012, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import OrderedDict
from io import BytesIO
from struct import calcsize, pack
from calibre.utils.fonts.sfnt import UnknownTable, align_block, max_power_of_two
from calibre.utils.fonts.sfnt.cff.table import CFFTable
from calibre.utils.fonts.sfnt.cmap import CmapTable
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.glyf import GlyfTable
from calibre.utils.fonts.sfnt.gsub import GSUBTable
from calibre.utils.fonts.sfnt.head import (
HeadTable, HorizontalHeader, OS2Table, PostTable, VerticalHeader
)
from calibre.utils.fonts.sfnt.kern import KernTable
from calibre.utils.fonts.sfnt.loca import LocaTable
from calibre.utils.fonts.sfnt.maxp import MaxpTable
from calibre.utils.fonts.utils import checksum_of_block, get_tables, verify_checksums
# OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm
class Sfnt(object):
TABLE_MAP = {
b'head' : HeadTable,
b'hhea' : HorizontalHeader,
b'vhea' : VerticalHeader,
b'maxp' : MaxpTable,
b'loca' : LocaTable,
b'glyf' : GlyfTable,
b'cmap' : CmapTable,
b'CFF ' : CFFTable,
b'kern' : KernTable,
b'GSUB' : GSUBTable,
b'OS/2' : OS2Table,
b'post' : PostTable,
}
def __init__(self, raw_or_get_table):
self.tables = {}
if isinstance(raw_or_get_table, bytes):
raw = raw_or_get_table
self.sfnt_version = raw[:4]
if self.sfnt_version not in {b'\x00\x01\x00\x00', b'OTTO', b'true',
b'type1'}:
raise UnsupportedFont('Font has unknown sfnt version: %r'%self.sfnt_version)
for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw):
self.tables[table_tag] = self.TABLE_MAP.get(
table_tag, UnknownTable)(table)
else:
for table_tag in {
b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name', b'OS/2',
b'post', b'cvt ', b'fpgm', b'glyf', b'loca', b'prep', b'CFF ',
b'VORG', b'EBDT', b'EBLC', b'EBSC', b'BASE', b'GSUB', b'GPOS',
b'GDEF', b'JSTF', b'gasp', b'hdmx', b'kern', b'LTSH', b'PCLT',
b'VDMX', b'vhea', b'vmtx', b'MATH'}:
table = bytes(raw_or_get_table(table_tag))
if table:
self.tables[table_tag] = self.TABLE_MAP.get(
table_tag, UnknownTable)(table)
if not self.tables:
raise UnsupportedFont('This font has no tables')
self.sfnt_version = (b'\0\x01\0\0' if b'glyf' in self.tables
else b'OTTO')
def __getitem__(self, key):
return self.tables[key]
def __contains__(self, key):
return key in self.tables
def __delitem__(self, key):
del self.tables[key]
def __iter__(self):
'''Iterate over the table tags in order.'''
for x in sorted(self.tables):
yield x
# Although the optimal order is not alphabetical, the OTF spec says
# they should be alphabetical, so we stick with that. See
# http://partners.adobe.com/public/developer/opentype/index_recs.html
# for optimal order.
# keys = list(self.tables)
# order = {x:i for i, x in enumerate((b'head', b'hhea', b'maxp', b'OS/2',
# b'hmtx', b'LTSH', b'VDMX', b'hdmx', b'cmap', b'fpgm', b'prep',
# b'cvt ', b'loca', b'glyf', b'CFF ', b'kern', b'name', b'post',
# b'gasp', b'PCLT', b'DSIG'))}
# keys.sort(key=lambda x:order.get(x, 1000))
# for x in keys:
# yield x
def pop(self, key, default=None):
return self.tables.pop(key, default)
def get(self, key, default=None):
return self.tables.get(key, default)
def sizes(self):
ans = OrderedDict()
for tag in self:
ans[tag] = len(self[tag])
return ans
def __call__(self, stream=None):
stream = BytesIO() if stream is None else stream
def spack(*args):
stream.write(pack(*args))
stream.seek(0)
# Write header
num_tables = len(self.tables)
ln2 = max_power_of_two(num_tables)
srange = (2**ln2) * 16
spack(b'>4s4H',
self.sfnt_version, num_tables, srange, ln2, num_tables * 16 - srange)
# Write tables
head_offset = None
table_data = []
offset = stream.tell() + (calcsize(b'>4s3L') * num_tables)
sizes = OrderedDict()
for tag in self:
table = self.tables[tag]
raw = table()
table_len = len(raw)
if tag == b'head':
head_offset = offset
raw = raw[:8] + b'\0\0\0\0' + raw[12:]
raw = align_block(raw)
checksum = checksum_of_block(raw)
spack(b'>4s3L', tag, checksum, offset, table_len)
offset += len(raw)
table_data.append(raw)
sizes[tag] = table_len
for x in table_data:
stream.write(x)
checksum = checksum_of_block(stream.getvalue())
q = (0xB1B0AFBA - checksum) & 0xffffffff
stream.seek(head_offset + 8)
spack(b'>L', q)
return stream.getvalue(), sizes
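# Worked sketch, not part of the committed file, assuming max_power_of_two(n)
# returns floor(log2(n)) as its use above implies: the binary-search fields of
# the table directory for a 12-table font.
demo_tables = 12
demo_ln2 = max_power_of_two(demo_tables)       # 3
assert (2 ** demo_ln2) * 16 == 128             # searchRange
assert demo_tables * 16 - 128 == 64            # rangeShift
# checkSumAdjustment in 'head' is then chosen so the whole-file checksum comes
# out to the sfnt magic constant 0xB1B0AFBA, as computed at the end of __call__.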
def test_roundtrip(ff=None):
if ff is None:
data = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
else:
with open(ff, 'rb') as f:
data = f.read()
rd = Sfnt(data)()[0]
verify_checksums(rd)
if data[:12] != rd[:12]:
raise ValueError('Roundtripping failed, font header not the same')
if len(data) != len(rd):
raise ValueError('Roundtripping failed, size different (%d vs. %d)'%
(len(data), len(rd)))
if __name__ == '__main__':
import sys
test_roundtrip(sys.argv[-1])

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class UnsupportedFont(ValueError):
pass
class NoGlyphs(ValueError):
pass

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from collections import OrderedDict
from calibre.utils.fonts.sfnt import UnknownTable
from polyglot.builtins import iteritems
ARG_1_AND_2_ARE_WORDS = 0x0001 # if set args are words otherwise they are bytes
ARGS_ARE_XY_VALUES = 0x0002 # if set args are xy values, otherwise they are points
ROUND_XY_TO_GRID = 0x0004 # for the xy values if above is true
WE_HAVE_A_SCALE = 0x0008 # Sx = Sy, otherwise scale == 1.0
NON_OVERLAPPING = 0x0010 # set to same value for all components (obsolete!)
MORE_COMPONENTS = 0x0020 # indicates at least one more glyph after this one
WE_HAVE_AN_X_AND_Y_SCALE = 0x0040 # Sx, Sy
WE_HAVE_A_TWO_BY_TWO = 0x0080 # t00, t01, t10, t11
WE_HAVE_INSTRUCTIONS = 0x0100 # instructions follow
USE_MY_METRICS = 0x0200 # apply these metrics to parent glyph
OVERLAP_COMPOUND = 0x0400 # used by Apple in GX fonts
SCALED_COMPONENT_OFFSET = 0x0800 # composite designed to have the component offset scaled (designed for Apple)
UNSCALED_COMPONENT_OFFSET = 0x1000 # composite designed not to have the component offset scaled (designed for MS)
class SimpleGlyph(object):
def __init__(self, num_of_countours, raw):
self.num_of_countours = num_of_countours
self.raw = raw
# The list of glyph indices referred to by this glyph, will always be
# empty for a simple glyph and not empty for a composite glyph
self.glyph_indices = []
self.is_composite = False
def __len__(self):
return len(self.raw)
def __call__(self):
return self.raw
class CompositeGlyph(SimpleGlyph):
def __init__(self, num_of_countours, raw):
super(CompositeGlyph, self).__init__(num_of_countours, raw)
self.is_composite = True
flags = MORE_COMPONENTS
offset = 10
while flags & MORE_COMPONENTS:
flags, glyph_index = unpack_from(b'>HH', raw, offset)
self.glyph_indices.append(glyph_index)
offset += 4
if flags & ARG_1_AND_2_ARE_WORDS:
offset += 4
else:
offset += 2
if flags & WE_HAVE_A_SCALE:
offset += 2
elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
offset += 4
elif flags & WE_HAVE_A_TWO_BY_TWO:
offset += 8
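# Sketch, not part of the committed file: a minimal composite glyph record --
# numberOfContours == -1, an 8-byte bounding box, then a single component with
# ARGS_ARE_XY_VALUES set and MORE_COMPONENTS clear, referencing glyph 7.
demo_glyph = CompositeGlyph(-1, b'\xff\xff' + b'\x00' * 8 + b'\x00\x02\x00\x07\x00\x00')
assert demo_glyph.is_composite and demo_glyph.glyph_indices == [7]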
class GlyfTable(UnknownTable):
def glyph_data(self, offset, length, as_raw=False):
raw = self.raw[offset:offset+length]
if as_raw:
return raw
num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0
if num_of_countours >= 0:
return SimpleGlyph(num_of_countours, raw)
return CompositeGlyph(num_of_countours, raw)
def update(self, sorted_glyph_map):
ans = OrderedDict()
offset = 0
block = []
for glyph_id, glyph in iteritems(sorted_glyph_map):
raw = glyph()
pad = 4 - (len(raw) % 4)
if pad < 4:
raw += b'\0' * pad
ans[glyph_id] = offset, len(raw)
offset += len(raw)
block.append(raw)
self.raw = b''.join(block)
return ans

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from functools import partial
from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.common import (ScriptListTable, FeatureListTable,
SimpleListTable, LookupTable, ExtensionSubstitution,
UnknownLookupSubTable)
from polyglot.builtins import iteritems, itervalues
class SingleSubstitution(UnknownLookupSubTable):
formats = {1, 2}
def initialize(self, data):
if self.format == 1:
self.delta = data.unpack('h')
else:
count = data.unpack('H')
self.substitutes = data.unpack('%dH'%count, single_special=False)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
if self.format == 1:
return {gid + self.delta for gid in gid_index_map}
return {self.substitutes[i] for i in itervalues(gid_index_map)}
class MultipleSubstitution(UnknownLookupSubTable):
formats = {1}
def initialize(self, data):
self.coverage_to_subs_map = self.read_sets(data, set_is_index=True)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
ans = set()
for index in itervalues(gid_index_map):
glyphs = set(self.coverage_to_subs_map[index])
ans |= glyphs
return ans
class AlternateSubstitution(MultipleSubstitution):
pass
class LigatureSubstitution(UnknownLookupSubTable):
formats = {1}
def initialize(self, data):
self.coverage_to_lig_map = self.read_sets(data, self.read_ligature)
def read_ligature(self, data):
lig_glyph, count = data.unpack('HH')
components = data.unpack('%dH'%(count-1), single_special=False)
return (lig_glyph, components)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
ans = set()
for start_glyph_id, index in iteritems(gid_index_map):
for glyph_id, components in self.coverage_to_lig_map[index]:
components = (start_glyph_id,) + components
if set(components).issubset(glyph_ids):
ans.add(glyph_id)
return ans
class ContexttualSubstitution(UnknownLookupSubTable):
formats = {1, 2, 3}
@property
def has_initial_coverage(self):
return self.format != 3
def initialize(self, data):
pass # TODO
def all_substitutions(self, glyph_ids):
# This table only defines substitutions in terms of other tables
return set()
class ChainingContextualSubstitution(UnknownLookupSubTable):
formats = {1, 2, 3}
@property
def has_initial_coverage(self):
return self.format != 3
def initialize(self, data):
pass # TODO
def all_substitutions(self, glyph_ids):
# This table only defines substitutions in terms of other tables
return set()
class ReverseChainSingleSubstitution(UnknownLookupSubTable):
formats = {1}
def initialize(self, data):
backtrack_count = data.unpack('H')
backtrack_offsets = data.unpack('%dH'%backtrack_count,
single_special=False)
lookahead_count = data.unpack('H')
lookahead_offsets = data.unpack('%dH'%lookahead_count,
single_special=False)
backtrack_offsets = [data.start_pos + x for x in backtrack_offsets]
lookahead_offsets = [data.start_pos + x for x in lookahead_offsets]
backtrack_offsets, lookahead_offsets # TODO: Use these
count = data.unpack('H')
self.substitutes = data.unpack('%dH'%count)
def all_substitutions(self, glyph_ids):
gid_index_map = self.coverage.coverage_indices(glyph_ids)
return {self.substitutes[i] for i in itervalues(gid_index_map)}
subtable_map = {
1: SingleSubstitution,
2: MultipleSubstitution,
3: AlternateSubstitution,
4: LigatureSubstitution,
5: ContexttualSubstitution,
6: ChainingContextualSubstitution,
8: ReverseChainSingleSubstitution,
}
class GSUBLookupTable(LookupTable):
def set_child_class(self):
if self.lookup_type == 7:
self.child_class = partial(ExtensionSubstitution,
subtable_map=subtable_map)
else:
self.child_class = subtable_map[self.lookup_type]
class LookupListTable(SimpleListTable):
child_class = GSUBLookupTable
class GSUBTable(UnknownTable):
version = FixedProperty('_version')
def decompile(self):
(self._version, self.scriptlist_offset, self.featurelist_offset,
self.lookuplist_offset) = unpack_from(b'>L3H', self.raw)
if self._version != 0x10000:
raise UnsupportedFont('The GSUB table has unknown version: 0x%x'%
self._version)
self.script_list_table = ScriptListTable(self.raw,
self.scriptlist_offset)
# self.script_list_table.dump()
self.feature_list_table = FeatureListTable(self.raw,
self.featurelist_offset)
# self.feature_list_table.dump()
self.lookup_list_table = LookupListTable(self.raw,
self.lookuplist_offset)
def all_substitutions(self, glyph_ids):
glyph_ids = frozenset(glyph_ids)
ans = set(glyph_ids)
for lookup_table in self.lookup_list_table:
for subtable in lookup_table:
glyphs = subtable.all_substitutions(ans)
if glyphs:
ans |= glyphs
return ans - {glyph_ids}
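# Usage sketch, not part of the committed file: 'sfnt' is assumed to be an Sfnt
# instance (see the container module earlier in this commit) and 'glyph_ids' a
# set of glyph ids taken from its cmap table; both names are hypothetical.
def _gsub_closure_sketch(sfnt, glyph_ids):
    gsub = sfnt.get(b'GSUB')
    if gsub is None:
        return set(glyph_ids)
    gsub.decompile()
    # glyphs reachable through GSUB lookups (ligatures, alternates, ...)
    return set(glyph_ids) | gsub.all_substitutions(glyph_ids)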

View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, pack, calcsize
from calibre.utils.fonts.sfnt import UnknownTable, DateTimeProperty, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.loca import read_array
from polyglot.builtins import zip
class HeadTable(UnknownTable):
created = DateTimeProperty('_created')
modified = DateTimeProperty('_modified')
version_number = FixedProperty('_version_number')
font_revision = FixedProperty('_font_revision')
def __init__(self, *args, **kwargs):
super(HeadTable, self).__init__(*args, **kwargs)
field_types = (
'_version_number' , 'l',
'_font_revision' , 'l',
'checksum_adjustment' , 'L',
'magic_number' , 'L',
'flags' , 'H',
'units_per_em' , 'H',
'_created' , 'q',
'_modified' , 'q',
'x_min' , 'h',
'y_min' , 'h',
'x_max' , 'h',
'y_max' , 'h',
'mac_style' , 'H',
'lowest_rec_ppem' , 'H',
'font_direction_hint' , 'h',
'index_to_loc_format' , 'h',
'glyph_data_format' , 'h'
)
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
def update(self):
vals = [getattr(self, f) for f in self._fields]
self.raw = pack(self._fmt, *vals)
class HorizontalHeader(UnknownTable):
version_number = FixedProperty('_version_number')
def read_data(self, hmtx):
if hasattr(self, 'ascender'):
return
field_types = (
'_version_number' , 'l',
'ascender', 'h',
'descender', 'h',
'line_gap', 'h',
'advance_width_max', 'H',
'min_left_side_bearing', 'h',
'min_right_side_bearing', 'h',
'x_max_extent', 'h',
'caret_slope_rise', 'h',
'caret_slop_run', 'h',
'caret_offset', 'h',
'r1', 'h',
'r2', 'h',
'r3', 'h',
'r4', 'h',
'metric_data_format', 'h',
'number_of_h_metrics', 'H',
)
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
raw = hmtx.raw
num = self.number_of_h_metrics
if len(raw) < 4*num:
raise UnsupportedFont('The hmtx table has insufficient data')
long_hor_metric = raw[:4*num]
a = read_array(long_hor_metric)
self.advance_widths = a[0::2]
a = read_array(long_hor_metric, 'h')
self.left_side_bearings = a[1::2]
class VerticalHeader(UnknownTable):
version_number = FixedProperty('_version_number')
def read_data(self, vmtx):
if hasattr(self, 'ascender'):
return
field_types = (
'_version_number' , 'l',
'ascender', 'h',
'descender', 'h',
'line_gap', 'h',
'advance_height_max', 'H',
'min_top_side_bearing', 'h',
'min_bottom_side_bearing', 'h',
'y_max_extent', 'h',
'caret_slope_rise', 'h',
'caret_slop_run', 'h',
'caret_offset', 'h',
'r1', 'h',
'r2', 'h',
'r3', 'h',
'r4', 'h',
'metric_data_format', 'h',
'number_of_v_metrics', 'H',
)
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
raw = vmtx.raw
num = self.number_of_v_metrics
if len(raw) < 4*num:
raise UnsupportedFont('The vmtx table has insufficient data')
long_hor_metric = raw[:4*num]
a = read_array(long_hor_metric)
self.advance_heights = a[0::2]
a = read_array(long_hor_metric, 'h')
self.top_side_bearings = a[1::2]
class OS2Table(UnknownTable):
def read_data(self):
if hasattr(self, 'char_width'):
return
ver, = unpack_from(b'>H', self.raw)
field_types = [
'version' , 'H',
'average_char_width', 'h',
'weight_class', 'H',
'width_class', 'H',
'fs_type', 'H',
'subscript_x_size', 'h',
'subscript_y_size', 'h',
'subscript_x_offset', 'h',
'subscript_y_offset', 'h',
'superscript_x_size', 'h',
'superscript_y_size', 'h',
'superscript_x_offset', 'h',
'superscript_y_offset', 'h',
'strikeout_size', 'h',
'strikeout_position', 'h',
'family_class', 'h',
'panose', '10s',
'ranges', '16s',
'vendor_id', '4s',
'selection', 'H',
'first_char_index', 'H',
'last_char_index', 'H',
'typo_ascender', 'h',
'typo_descender', 'h',
'typo_line_gap', 'h',
'win_ascent', 'H',
'win_descent', 'H',
]
if ver > 1:
field_types += [
'code_page_range', '8s',
'x_height', 'h',
'cap_height', 'h',
'default_char', 'H',
'break_char', 'H',
'max_context', 'H',
]
self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
self._fields = field_types[0::2]
for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
setattr(self, f, val)
def zero_fstype(self):
prefix = calcsize(b'>HhHH')
self.raw = self.raw[:prefix] + b'\0\0' + self.raw[prefix+2:]
self.fs_type = 0
class PostTable(UnknownTable):
version_number = FixedProperty('_version')
italic_angle = FixedProperty('_italic_angle')
def read_data(self):
if hasattr(self, 'underline_position'):
return
(self._version, self._italic_angle, self.underline_position,
self.underline_thickness) = unpack_from(b'>llhh', self.raw)

View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize, pack, error as struct_error
from calibre.utils.fonts.sfnt import (UnknownTable, FixedProperty,
max_power_of_two)
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import range
class KernTable(UnknownTable):
version = FixedProperty('_version')
def __init__(self, *args, **kwargs):
super(KernTable, self).__init__(*args, **kwargs)
self._version, self.num_tables = unpack_from(b'>HH', self.raw)
if self._version == 1 and len(self.raw) >= 8:
self._version, self.num_tables = unpack_from(b'>LL', self.raw)
self.headerfmt = b'>HH' if self._version == 0 else b'>LL'
def restrict_to_glyphs(self, glyph_ids):
if self._version not in {0, 0x10000}:
raise UnsupportedFont('kern table has version: %x'%self._version)
offset = 4 if (self._version == 0) else 8
tables = []
for i in range(self.num_tables):
if self._version == 0:
version, length, coverage = unpack_from(b'>3H', self.raw, offset)
table_format = version
else:
length, coverage = unpack_from(b'>LH', self.raw, offset)
table_format = coverage & 0xff
raw = self.raw[offset:offset+length]
if table_format == 0:
raw = self.restrict_format_0(raw, glyph_ids)
if not raw:
continue
tables.append(raw)
offset += length
self.raw = pack(self.headerfmt, self._version, len(tables)) + b''.join(tables)
def restrict_format_0(self, raw, glyph_ids):
if self._version == 0:
version, length, coverage, npairs = unpack_from(b'>4H', raw)
headerfmt = b'>3H'
else:
length, coverage, tuple_index, npairs = unpack_from(b'>L3H', raw)
headerfmt = b'>L2H'
offset = calcsize(headerfmt + b'4H')
entries = []
entrysz = calcsize(b'>2Hh')
for i in range(npairs):
try:
left, right, value = unpack_from(b'>2Hh', raw, offset)
except struct_error:
offset = len(raw)
break # Buggy kern table
if left in glyph_ids and right in glyph_ids:
entries.append(pack(b'>2Hh', left, right, value))
offset += entrysz
if offset != len(raw):
raise UnsupportedFont('This font has extra data at the end of'
' a Format 0 kern subtable')
npairs = len(entries)
if npairs == 0:
return b''
entry_selector = max_power_of_two(npairs)
search_range = (2 ** entry_selector) * 6
range_shift = (npairs - (2 ** entry_selector)) * 6
entries = b''.join(entries)
length = calcsize(headerfmt + b'4H') + len(entries)
if self._version == 0:
header = pack(headerfmt, version, length, coverage)
else:
header = pack(headerfmt, length, coverage, tuple_index)
return header + pack(b'>4H', npairs, search_range, entry_selector,
range_shift) + entries
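The binary-search fields recomputed at the end of restrict_format_0() follow the sfnt convention for 6-byte kern pair records; judging from its use above, max_power_of_two() returns the exponent of the largest power of two not exceeding npairs. A worked example with five surviving pairs:

npairs = 5
entry_selector = 2                                # 2**2 == 4 is the largest power of two <= 5
search_range = (2 ** entry_selector) * 6          # 24
range_shift = (npairs - 2 ** entry_selector) * 6  # 6, i.e. npairs*6 - search_range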

View File

@@ -0,0 +1,91 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import array, sys
from operator import itemgetter
from itertools import repeat
from calibre.utils.fonts.sfnt import UnknownTable
from polyglot.builtins import iteritems, range
def four_byte_type_code():
for c in 'IL':
a = array.array(c)
if a.itemsize == 4:
return c
def read_array(data, fmt='H'):
ans = array.array(fmt, data)
if sys.byteorder != 'big':
ans.byteswap()
return ans
class LocaTable(UnknownTable):
def load_offsets(self, head_table, maxp_table):
fmt = 'H' if head_table.index_to_loc_format == 0 else four_byte_type_code()
locs = read_array(self.raw, fmt)
self.offset_map = locs.tolist()
if fmt == 'H':
self.offset_map = [2*i for i in self.offset_map]
self.fmt = fmt
def glyph_location(self, glyph_id):
offset = self.offset_map[glyph_id]
next_offset = self.offset_map[glyph_id+1]
return offset, next_offset - offset
def update(self, resolved_glyph_map):
        '''
        Update this table to contain pointers only to the glyphs in
        resolved_glyph_map, which must be a map of glyph_ids to (offset, sz).
        Note that the loca table is generated for all glyphs from 0 to the
        largest glyph id that is either in resolved_glyph_map or was present
        originally. The pointers for glyphs that have no data are set to
        zero; this preserves glyph ids.
        '''
current_max_glyph_id = len(self.offset_map) - 2
max_glyph_id = max(resolved_glyph_map or (0,))
max_glyph_id = max(max_glyph_id, current_max_glyph_id)
self.offset_map = list(repeat(0, max_glyph_id + 2))
glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in
iteritems(resolved_glyph_map)]
glyphs.sort(key=itemgetter(1))
for glyph_id, offset, sz in glyphs:
self.offset_map[glyph_id] = offset
self.offset_map[glyph_id+1] = offset + sz
        # Fix all zero entries to be the same as the previous entry: a zero
        # i-th entry means glyph i-1 has no data, so its range becomes empty.
for i in range(1, len(self.offset_map)):
if self.offset_map[i] == 0:
self.offset_map[i] = self.offset_map[i-1]
vals = self.offset_map
max_offset = max(vals) if vals else 0
if max_offset < 0x20000 and all(l % 2 == 0 for l in vals):
self.fmt = 'H'
vals = array.array(self.fmt, (i // 2 for i in vals))
else:
self.fmt = four_byte_type_code()
vals = array.array(self.fmt, vals)
if sys.byteorder != "big":
vals.byteswap()
self.raw = vals.tostring()
subset = update
def dump_glyphs(self, sfnt):
if not hasattr(self, 'offset_map'):
self.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
for i in range(len(self.offset_map)-1):
off, noff = self.offset_map[i], self.offset_map[i+1]
if noff != off:
print('Glyph id:', i, 'size:', noff-off)
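The offset_map used throughout LocaTable has num_glyphs + 1 entries; glyph i occupies bytes [offset_map[i], offset_map[i+1]) of the glyf table, so equal neighbouring entries mean an empty glyph. The serialisation at the end of update() then amounts to the following sketch (made-up offsets; tobytes() shown as the Python 3 counterpart of the tostring() call used above):

import array, sys

offset_map = [0, 0, 36, 36, 100]   # glyphs 0 and 2 are empty; glyph 1 is 36 bytes, glyph 3 is 64
vals = array.array('H', (off // 2 for off in offset_map))  # short format stores offset/2
if sys.byteorder != 'big':
    vals.byteswap()                # loca data is big-endian on disk
raw = vals.tobytes() if hasattr(vals, 'tobytes') else vals.tostring()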

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, pack
from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from polyglot.builtins import zip
class MaxpTable(UnknownTable):
version = FixedProperty('_version')
def __init__(self, *args, **kwargs):
super(MaxpTable, self).__init__(*args, **kwargs)
self._fmt = b'>lH'
self._version, self.num_glyphs = unpack_from(self._fmt, self.raw)
self.fields = ('_version', 'num_glyphs')
if self.version > 1.0:
raise UnsupportedFont('This font has a maxp table with version: %s'
%self.version)
if self.version == 1.0:
self.fields = ('_version', 'num_glyphs', 'max_points',
'max_contours', 'max_composite_points',
'max_composite_contours', 'max_zones',
'max_twilight_points', 'max_storage', 'max_function_defs',
'max_instruction_defs', 'max_stack_elements',
'max_size_of_instructions', 'max_component_elements',
'max_component_depth')
self._fmt = b'>lH' + b'H'*(len(self.fields)-2)
vals = unpack_from(self._fmt, self.raw)
for f, val in zip(self.fields, vals):
setattr(self, f, val)
def update(self):
vals = [getattr(self, f) for f in self.fields]
self.raw = pack(self._fmt, *vals)
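The maxp version is again a 16.16 fixed value (1.0 is stored as 0x00010000), and only a version 1.0 table carries the thirteen extra fields, so the struct format grows accordingly. As a size check:

from struct import calcsize

extra_fields = 13                  # max_points .. max_component_depth
fmt = b'>lH' + b'H' * extra_fields
assert calcsize(fmt) == 32         # the size of a version 1.0 maxp table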

View File

@@ -0,0 +1,380 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import traceback
from collections import OrderedDict
from operator import itemgetter
from functools import partial
from calibre.utils.icu import safe_chr, ord_string
from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
from polyglot.builtins import unicode_type, range, iteritems, itervalues, map
# TrueType outlines {{{
def resolve_glyphs(loca, glyf, character_map, extra_glyphs):
unresolved_glyphs = set(itervalues(character_map)) | extra_glyphs
unresolved_glyphs.add(0) # We always want the .notdef glyph
resolved_glyphs = {}
while unresolved_glyphs:
glyph_id = unresolved_glyphs.pop()
try:
offset, length = loca.glyph_location(glyph_id)
except (IndexError, ValueError, KeyError, TypeError):
continue
glyph = glyf.glyph_data(offset, length)
resolved_glyphs[glyph_id] = glyph
for gid in glyph.glyph_indices:
if gid not in resolved_glyphs:
unresolved_glyphs.add(gid)
return OrderedDict(sorted(iteritems(resolved_glyphs), key=itemgetter(0)))
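resolve_glyphs() is a transitive closure: composite glyphs pull in every glyph they reference, whether or not a character maps to it, and glyph 0 (.notdef) is always kept. The same idea on toy data, with plain dicts standing in for the loca/glyf lookups:

# glyph 5 is a composite built from glyphs 7 and 8; 'A' maps to glyph 5
character_map = {ord('A'): 5}
components = {0: [], 5: [7, 8], 7: [], 8: []}

resolved, pending = {}, {0} | set(character_map.values())
while pending:
    gid = pending.pop()
    resolved[gid] = components[gid]
    pending.update(c for c in components[gid] if c not in resolved)

assert set(resolved) == {0, 5, 7, 8}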
def subset_truetype(sfnt, character_map, extra_glyphs):
loca = sfnt[b'loca']
glyf = sfnt[b'glyf']
try:
head, maxp = sfnt[b'head'], sfnt[b'maxp']
except KeyError:
raise UnsupportedFont('This font does not contain head and/or maxp tables')
loca.load_offsets(head, maxp)
resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs)
if not resolved_glyphs or set(resolved_glyphs) == {0}:
raise NoGlyphs('This font has no glyphs for the specified character '
'set, subsetting it is pointless')
# Keep only character codes that have resolved glyphs
for code, glyph_id in tuple(iteritems(character_map)):
if glyph_id not in resolved_glyphs:
del character_map[code]
# Update the glyf table
glyph_offset_map = glyf.update(resolved_glyphs)
# Update the loca table
loca.subset(glyph_offset_map)
head.index_to_loc_format = 0 if loca.fmt == 'H' else 1
head.update()
maxp.num_glyphs = len(loca.offset_map) - 1
# }}}
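subset_truetype() mutates the Sfnt in place and leaves head, loca and maxp mutually consistent. A hedged usage sketch (the font path is illustrative; get_character_map() is the cmap call used further down in subset()):

from calibre.utils.fonts.sfnt.container import Sfnt

with open('SomeFont.ttf', 'rb') as f:       # hypothetical font file
    sfnt = Sfnt(f.read())
character_map = sfnt[b'cmap'].get_character_map({ord('A'), ord('B')})
subset_truetype(sfnt, character_map, extra_glyphs=set())
raw, sizes = sfnt()                         # serialise the reduced font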
def subset_postscript(sfnt, character_map, extra_glyphs):
cff = sfnt[b'CFF ']
cff.decompile()
cff.subset(character_map, extra_glyphs)
def do_warn(warnings, *args):
for arg in args:
for line in arg.splitlines():
if warnings is None:
print(line)
else:
warnings.append(line)
if warnings is None:
print()
else:
warnings.append('')
def pdf_subset(sfnt, glyphs):
for tag in tuple(sfnt.tables):
if tag not in {b'hhea', b'head', b'hmtx', b'maxp',
b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca',
b'prep', b'CFF ', b'VORG'}:
# Remove non core tables since they are unused in PDF rendering
del sfnt[tag]
if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines
subset_truetype(sfnt, {}, glyphs)
elif b'CFF ' in sfnt:
# PostScript Outlines
subset_postscript(sfnt, {}, glyphs)
else:
raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines')
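pdf_subset() differs from subset() below in that it is driven by glyph ids rather than characters, which is what a PDF renderer has after shaping. Illustratively (raw_font_bytes is a placeholder for the font file's contents):

sfnt = Sfnt(raw_font_bytes)
pdf_subset(sfnt, {0, 36, 37, 38})           # keep .notdef plus three shaped glyph ids
subset_font_bytes, table_sizes = sfnt()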
def safe_ord(x):
return ord_string(unicode_type(x))[0]
def subset(raw, individual_chars, ranges=(), warnings=None):
warn = partial(do_warn, warnings)
chars = set(map(safe_ord, individual_chars))
for r in ranges:
chars |= set(range(safe_ord(r[0]), safe_ord(r[1])+1))
# Always add the space character for ease of use from the command line
if safe_ord(' ') not in chars:
chars.add(safe_ord(' '))
sfnt = Sfnt(raw)
old_sizes = sfnt.sizes()
# Remove the Digital Signature table since it is useless in a subset
# font anyway
sfnt.pop(b'DSIG', None)
# Remove non core tables as they aren't likely to be used by renderers
# anyway
core_tables = {b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name',
b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca', b'prep',
b'CFF ', b'VORG', b'EBDT', b'EBLC', b'EBSC', b'BASE', b'GSUB',
b'GPOS', b'GDEF', b'JSTF', b'gasp', b'hdmx', b'kern', b'LTSH',
b'PCLT', b'VDMX', b'vhea', b'vmtx', b'MATH'}
for tag in list(sfnt):
if tag not in core_tables:
del sfnt[tag]
try:
cmap = sfnt[b'cmap']
except KeyError:
raise UnsupportedFont('This font has no cmap table')
# Get mapping of chars to glyph ids for all specified chars
character_map = cmap.get_character_map(chars)
extra_glyphs = set()
if b'GSUB' in sfnt:
# Parse all substitution rules to ensure that glyphs that can be
# substituted for the specified set of glyphs are not removed
gsub = sfnt[b'GSUB']
try:
gsub.decompile()
extra_glyphs = gsub.all_substitutions(itervalues(character_map))
except UnsupportedFont as e:
            warn('Unsupported GSUB table: %s'%e)
except Exception:
warn('Failed to decompile GSUB table:', traceback.format_exc())
if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines
subset_truetype(sfnt, character_map, extra_glyphs)
elif b'CFF ' in sfnt:
# PostScript Outlines
subset_postscript(sfnt, character_map, extra_glyphs)
else:
raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines')
# Restrict the cmap table to only contain entries for the resolved glyphs
cmap.set_character_map(character_map)
if b'kern' in sfnt:
try:
sfnt[b'kern'].restrict_to_glyphs(frozenset(itervalues(character_map)))
except UnsupportedFont as e:
warn('kern table unsupported, ignoring: %s'%e)
except Exception:
warn('Subsetting of kern table failed, ignoring:',
traceback.format_exc())
raw, new_sizes = sfnt()
return raw, old_sizes, new_sizes
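subset() is the module's main entry point: it takes the raw font bytes, an iterable of individual characters, optional (start, end) character ranges and an optional warnings list, and returns the new bytes plus per-table sizes before and after. For example (font paths are illustrative):

with open('LiberationSerif-Regular.ttf', 'rb') as f:
    raw = f.read()
warnings = []
new_raw, old_sizes, new_sizes = subset(raw, 'Hello', ranges=[('0', '9')],
                                       warnings=warnings)
with open('LiberationSerif-Subset.ttf', 'wb') as f:
    f.write(new_raw)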
# CLI {{{
def option_parser():
import textwrap
from calibre.utils.config import OptionParser
parser = OptionParser(usage=textwrap.dedent('''\
%prog [options] input_font_file output_font_file characters_to_keep
Subset the specified font, keeping only the glyphs for the characters in
characters_to_keep. characters_to_keep is a comma separated list of characters of
the form: a,b,c,A-Z,0-9,xyz
You can specify ranges in the list of characters, as shown above.
'''))
parser.add_option('-c', '--codes', default=False, action='store_true',
help='If specified, the list of characters is interpreted as '
'numeric unicode codes instead of characters. So to specify the '
'characters a,b you would use 97,98 or U+0061,U+0062')
parser.prog = 'subset-font'
return parser
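Because main() below slices args[1:], the parser is effectively handed the full argv, and the command line can also be exercised programmatically; the first list element only plays the role of the program name. Two illustrative calls, assuming calibre's OptionParser follows optparse semantics:

main(['subset-font', 'LiberationSerif-Regular.ttf', 'subset.ttf', 'a-z,A-Z,0-9'])
main(['subset-font', '-c', 'LiberationSerif-Regular.ttf', 'subset.ttf', 'U+0020-U+007E'])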
def print_stats(old_stats, new_stats):
from calibre import prints
prints('========= Table comparison (original vs. subset) =========')
prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size',
' New Percent')
prints('='*80)
old_total = sum(itervalues(old_stats))
new_total = sum(itervalues(new_stats))
tables = sorted(old_stats, key=lambda x:old_stats[x],
reverse=True)
for table in tables:
osz = old_stats[table]
op = osz/old_total * 100
nsz = new_stats.get(table, 0)
np = nsz/new_total * 100
suffix = ' | same size'
if nsz != osz:
suffix = ' | reduced to %.1f %%'%(nsz/osz * 100)
prints('%4s'%table, ' ', '%10s'%osz, ' ', '%5.1f %%'%op, ' ',
'%10s'%nsz, ' ', '%5.1f %%'%np, suffix)
prints('='*80)
def main(args):
import sys, time
from calibre import prints
parser = option_parser()
opts, args = parser.parse_args(args)
    if len(args) != 4:
parser.print_help()
raise SystemExit(1)
iff, off, chars = args[1:]
with open(iff, 'rb') as f:
orig = f.read()
    chars = chars.split(',')
individual, ranges = set(), set()
def not_single(c):
if len(c) > 1:
prints(c, 'is not a single character', file=sys.stderr)
raise SystemExit(1)
def conv_code(c):
if c.upper()[:2] in ('U+', '0X'):
c = int(c[2:], 16)
return safe_chr(int(c))
for c in chars:
if '-' in c:
parts = [x.strip() for x in c.split('-')]
if len(parts) != 2:
prints('Invalid range:', c, file=sys.stderr)
raise SystemExit(1)
if opts.codes:
parts = tuple(map(conv_code, parts))
tuple(map(not_single, parts))
ranges.add(tuple(parts))
else:
if opts.codes:
c = conv_code(c)
not_single(c)
individual.add(c)
st = time.time()
sf, old_stats, new_stats = subset(orig, individual, ranges)
taken = time.time() - st
reduced = (len(sf)/len(orig)) * 100
def sz(x):
return '%gKB'%(len(x)/1024.)
print_stats(old_stats, new_stats)
prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced))
prints('Subsetting took %g seconds'%taken)
with open(off, 'wb') as f:
f.write(sf)
prints('Subset font written to:', off)
if __name__ == '__main__':
try:
import init_calibre
init_calibre
except ImportError:
pass
import sys
main(sys.argv)
# }}}
# Tests {{{
def test_mem():
from calibre.utils.mem import memory
import gc
gc.collect()
start_mem = memory()
raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
calls = 1000
for i in range(calls):
subset(raw, (), (('a', 'z'),))
del raw
for i in range(3):
gc.collect()
print('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB')
def test():
raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
if len(sf) > 0.3 * len(raw):
raise Exception('Subsetting failed')
def all():
from calibre.utils.fonts.scanner import font_scanner
failed = []
unsupported = []
warnings = {}
total = 0
averages = []
for family in font_scanner.find_font_families():
for font in font_scanner.fonts_for_family(family):
raw = font_scanner.get_font_data(font)
print('Subsetting', font['full_name'], end='\t')
total += 1
try:
w = []
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')),
(), w)
if w:
warnings[font['full_name'] + ' (%s)'%font['path']] = w
except NoGlyphs:
print('No glyphs!')
continue
except UnsupportedFont as e:
unsupported.append((font['full_name'], font['path'], unicode_type(e)))
print('Unsupported!')
continue
except Exception as e:
print('Failed!')
failed.append((font['full_name'], font['path'], unicode_type(e)))
else:
averages.append(sum(itervalues(new_stats))/sum(itervalues(old_stats)) * 100)
print('Reduced to:', '%.1f'%averages[-1] , '%')
if unsupported:
print('\n\nUnsupported:')
for name, path, err in unsupported:
print(name, path, err)
print()
if warnings:
print('\n\nWarnings:')
for name, w in iteritems(warnings):
if w:
print(name)
print('', '\n\t'.join(w), sep='\t')
if failed:
print('\n\nFailures:')
for name, path, err in failed:
print(name, path, err)
print()
print('Average reduction to: %.1f%%'%(sum(averages)/len(averages)))
print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
len(failed), 'Warnings:', len(warnings))
# }}}