def page_margin(opts, which):
    """Return the page margin to use for one edge of the DOCX page.

    ``which`` is the edge name that gets appended to the option names (the
    callers in this file pass 'left', 'right', 'top' and 'bottom').

    The DOCX specific option ``docx_page_margin_<which>`` wins when it is
    set. A value equal to 0.0 there is treated as "not set", in which case
    the generic ``margin_<which>`` option is returned instead.
    """
    docx_margin = getattr(opts, 'docx_page_margin_' + which)
    if docx_margin != 0.0:
        return docx_margin
    # No DOCX specific margin: fall back to the generic conversion margin
    return getattr(opts, 'margin_' + which)
class DocumentRelationships(object):
    """Registry of the relationships declared for the main document part.

    Every relationship is identified by the triple
    ``(target, relationship type, target mode)`` and is given an id of the
    form ``rId<N>``, where N is the number of relationships registered so
    far plus one. Registering the same triple twice returns the same id.
    """

    def __init__(self, namespace):
        self.namespace = namespace
        # (target, rtype, target_mode) -> relationship id
        self.rmap = {}
        names = namespace.names
        # Parts that every generated document refers to
        default_parts = {
            names['STYLES']: 'styles.xml',
            names['NUMBERING']: 'numbering.xml',
            names['WEB_SETTINGS']: 'webSettings.xml',
            names['FONTS']: 'fontTable.xml',
        }
        for rtype, target in iteritems(default_parts):
            self.add_relationship(target, rtype)

    def get_relationship_id(self, target, rtype, target_mode=None):
        """Return the id registered for this relationship, or None."""
        key = (target, rtype, target_mode)
        return self.rmap.get(key)

    def add_relationship(self, target, rtype, target_mode=None):
        """Register a relationship (if needed) and return its id."""
        existing = self.get_relationship_id(target, rtype, target_mode)
        if existing is not None:
            return existing
        new_id = 'rId%d' % (len(self.rmap) + 1)
        self.rmap[(target, rtype, target_mode)] = new_id
        return new_id

    def add_image(self, target):
        """Register ``target`` as an image relationship and return its id."""
        image_type = self.namespace.names['IMAGES']
        return self.add_relationship(target, image_type)

    def serialize(self):
        """Return the relationships as a serialized ``Relationships`` XML document."""
        namespaces = self.namespace.namespaces
        E = ElementMaker(namespace=namespaces['pr'], nsmap={None: namespaces['pr']})
        root = E.Relationships()
        for key, rid in iteritems(self.rmap):
            target, rtype, target_mode = key
            rel = E.Relationship(Id=rid, Type=rtype, Target=target)
            # TargetMode is only written for relationships that specify one
            if target_mode is not None:
                rel.set('TargetMode', target_mode)
            root.append(rel)
        return xml2str(root)
etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'}) + E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) + self.embedded_fonts = E.Relationships() + self.fonts = {} + self.images = {} + + # Boilerplate {{{ + @property + def contenttypes(self): + E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']}) + types = E.Types() + for partname, mt in iteritems({ + "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml", + "/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml", + "/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml", + "/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml", + "/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml", + "/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml", + "/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml", + "/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml", + "/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml", + "/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml", + "/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml", + }): + types.append(E.Override(PartName=partname, ContentType=mt)) + added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'} + for ext in added: + types.append(E.Default(Extension=ext, ContentType=guess_type('a.'+ext)[0])) + for ext, mt in iteritems({ + "rels": "application/vnd.openxmlformats-package.relationships+xml", + "odttf": "application/vnd.openxmlformats-officedocument.obfuscatedFont", + }): + added.add(ext) + 
@property
def appproperties(self):
    """Serialized extended ("app") properties of the document.

    Builds a ``Properties`` element in the 'ep' namespace holding the
    application name, its version (first two components of
    ``numeric_version``) and a fixed set of flags. A ``Company`` element is
    added when the book metadata has a publisher.

    Returns the XML as produced by ``xml2str()``.
    """
    ep_ns = self.namespace.namespaces['ep']
    E = ElementMaker(namespace=ep_ns, nsmap={None: ep_ns})
    props = E.Properties(
        E.Application(__appname__),
        E.AppVersion('%02d.%04d' % numeric_version[:2]),
        E.DocSecurity('0'),
        E.HyperlinksChanged('false'),
        E.LinksUpToDate('true'),
        E.ScaleCrop('false'),
        E.SharedDoc('false'),
    )
    # self.mi is only assigned by convert_metadata(). Reading this property
    # before that call used to fail with AttributeError; without metadata
    # there is simply no publisher to record.
    mi = getattr(self, 'mi', None)
    publisher = mi.publisher if mi is not None else None
    if publisher:
        props.append(E.Company(publisher))
    return xml2str(props)
def obfuscate_font_data(data, key):
    """Return ``data`` with its first 32 bytes XOR-ed against ``key``.

    ``key`` must expose a ``bytes`` attribute (the caller in this file
    passes a ``uuid4()``). The key bytes are used in reversed order and are
    repeated as often as needed to cover the prefix. Everything after the
    first 32 bytes is returned unchanged, so applying the function twice
    with the same key gives back the original data.
    """
    mask = bytearray(reversed(key.bytes))
    head = bytearray(data[:32])
    for pos, value in enumerate(head):
        head[pos] = value ^ mask[pos % len(mask)]
    return bytes(head) + data[32:]
serialize(self, text_styles, fonts, embed_relationships, font_data_map): + makeelement = self.namespace.makeelement + font_families, seen = set(), set() + for ts in text_styles: + if ts.font_family: + lf = ts.font_family.lower() + if lf not in seen: + seen.add(lf) + font_families.add(ts.font_family) + family_map = {} + for family in sorted(font_families): + family_map[family] = makeelement(fonts, 'w:font', w_name=family) + + embedded_fonts = [] + for item in self.oeb.manifest: + if item.media_type in OEB_STYLES and hasattr(item.data, 'cssRules'): + embedded_fonts.extend(find_font_face_rules(item, self.oeb)) + + num = 0 + face_map = defaultdict(set) + rel_map = {} + for ef in embedded_fonts: + ff = ef['font-family'][0] + if ff not in font_families: + continue + num += 1 + bold = ef['weight'] > 400 + italic = ef['font-style'] != 'normal' + tag = 'Regular' + if bold or italic: + tag = 'Italic' + if bold and italic: + tag = 'BoldItalic' + elif bold: + tag = 'Bold' + if tag in face_map[ff]: + continue + face_map[ff].add(tag) + font = family_map[ff] + key = uuid4() + item = ef['item'] + rid = rel_map.get(item) + if rid is None: + rel_map[item] = rid = 'rId%d' % num + fname = 'fonts/font%d.odttf' % num + makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname) + font_data_map['word/' + fname] = obfuscate_font_data(item.data, key) + makeelement(font, 'w:embed' + tag, r_id=rid, + w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(), + w_subsetted="true" if self.opts.subset_embedded_fonts else "false") diff --git a/ebook_converter/ebooks/docx/writer/from_html.py b/ebook_converter/ebooks/docx/writer/from_html.py new file mode 100644 index 0000000..f2618b1 --- /dev/null +++ b/ebook_converter/ebooks/docx/writer/from_html.py @@ -0,0 +1,617 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = 
@property
def letterSpacing(self):
    """Return the value of the CSS ``letter-spacing`` property for this style.

    The value is resolved on first access and cached in
    ``self._letterSpacing``. The keyword ``'normal'`` is returned as is;
    any other value is passed through ``self._unit_convert()``.
    """
    # The cache starts out as None, so the value must be computed while it
    # is still None. The previous test (``is not None``) was inverted: the
    # body never ran and the property always returned None.
    if self._letterSpacing is None:
        val = self._get('letter-spacing')
        if val == 'normal':
            self._letterSpacing = val
        else:
            self._letterSpacing = self._unit_convert(val)
    return self._letterSpacing
def is_empty(self):
    """Report whether this run carries no content.

    Entries of ``self.texts`` are ``(text, preserve_whitespace, bookmark)``
    tuples. The run is empty when it has no entries at all, or when its
    only entry is an empty string added without whitespace preservation
    (the bookmark of that entry is not taken into account).
    """
    entries = self.texts
    if not entries:
        return True
    return len(entries) == 1 and entries[0][:2] == ('', False)
def resolve_skipped(self, next_block):
    """Mark this block as skipped when it is an empty wrapper of ``next_block``.

    Nothing happens unless this block is empty and the first child of its
    HTML element is the element ``next_block`` was created for. In that
    case ``self.skipped`` is set and, if this block is a list item
    (``self.list_tag`` is set), the list tag is handed over to
    ``next_block``.
    """
    if not self.is_empty():
        return
    own_element = self.html_block
    if len(own_element) < 1 or own_element[0] is not next_block.html_block:
        return
    self.skipped = True
    if self.list_tag is not None:
        # Keep the list item information alive on the block that remains
        next_block.list_tag = self.list_tag
def add_break(self, clear='none', bookmark=None):
    """Append a line break to the last run of this block.

    When the block has no runs yet, a run using the text style derived
    from the block's own HTML style is created first. ``clear`` and
    ``bookmark`` are passed on unchanged to the run.
    """
    if not self.runs:
        text_style = self.styles_manager.create_text_style(self.html_style)
        self.runs.append(TextRun(self.namespace, text_style, self.html_block))
    self.runs[-1].add_break(clear=clear, bookmark=bookmark)
def is_empty(self):
    """Report whether this block has no content.

    A block flagged with ``force_not_empty`` is never empty. Otherwise the
    block is empty when every one of its runs is empty (which includes the
    case of having no runs at all).
    """
    if self.force_not_empty:
        return False
    return all(run.is_empty() for run in self.runs)
def start_new_row(self, html_tag, tag_style):
    """Start a new row in the current table.

    If no table is open, one is started first for ``html_tag`` (without a
    style), so a row that is not inside a table element still ends up in a
    table.
    """
    table = self.current_table
    if table is None:
        self.start_new_table(html_tag)
        # start_new_table() installs the table it creates as current_table
        table = self.current_table
    table.start_new_row(html_tag, tag_style)
def __repr__(self):
    """Return a debugging representation listing all collected blocks.

    The previous implementation formatted ``self.runs``. That attribute is
    never set on this container (its ``__init__`` creates ``all_blocks``,
    ``items`` and ``tables``, but no ``runs``), so ``repr()`` always raised
    AttributeError. The list of all blocks is shown instead, labelled with
    this class's own name.
    """
    # The 1-tuple makes sure the list is formatted as a single value
    return 'Blocks(%r)' % (self.all_blocks,)
def __init__(self, oeb, docx, mi, add_cover, add_toc):
    """Store the conversion inputs and size the output profile.

    The logger and the conversion options are taken from ``docx``. The
    output profile found in the options gets its ``width_pts`` and
    ``height_pts`` set to the effective page area (as computed by
    ``page_effective_area()``).
    """
    self.oeb = oeb
    self.docx = docx
    self.add_cover = add_cover
    self.add_toc = add_toc
    self.log = docx.log
    self.opts = docx.opts
    self.mi = mi
    self.cover_img = None
    profile = self.opts.output_profile
    effective_width, effective_height = page_effective_area(self.opts)
    profile.width_pts = effective_width
    profile.height_pts = effective_height
def process_item(self, item):
    """Convert the body (or bodies) of one item into blocks.

    The stylizer cached by the SVG rasterizer for this item is reused when
    there is one, otherwise a new ``Stylizer`` is built for the item. Each
    ``body`` element found in the item is processed inside the ``Blocks``
    context manager, after registering the bookmark for the top of the
    file on it. Only the first body is processed with ``is_first_tag``
    set.
    """
    self.current_item = item

    stylizer = self.svg_rasterizer.stylizer_cache.get(item)
    if stylizer is None:
        stylizer = Stylizer(
            item.data, item.href, self.oeb, self.opts,
            profile=self.opts.output_profile, base_css=self.base_css)

    resolver = item.abshref
    self.abshref = resolver
    self.images_manager.abshref = resolver

    # The language declared on the root element wins over the document one
    self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang

    bodies = XPath('//h:body')(item.data)
    for index, body in enumerate(bodies):
        with self.blocks:
            links = self.links_manager
            self.blocks.top_bookmark = links.bookmark_for_anchor(links.top_anchor, self.current_item, body)
            self.process_tag(body, stylizer, is_first_tag=(index == 0))
has display:block but we dont want to start a new paragraph + if is_float and float_spec.is_dropcaps: + self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec) + float_spec = None + else: + self.add_inline_tag(tagname, html_tag, tag_style, stylizer) + elif display == 'list-item': + self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True) + elif display.startswith('table') or display == 'inline-table': + if display == 'table-cell': + self.blocks.start_new_cell(html_tag, tag_style) + self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True) + elif display == 'table-row': + self.blocks.start_new_row(html_tag, tag_style) + elif display in {'table', 'inline-table'}: + self.blocks.end_current_block() + self.blocks.start_new_table(html_tag, tag_style) + else: + if tagname == 'img' and is_float: + # Image is floating so dont start a new paragraph for it + self.add_inline_tag(tagname, html_tag, tag_style, stylizer) + else: + if tagname == 'hr': + for edge in 'right bottom left'.split(): + tag_style.set('border-%s-style' % edge, 'none') + self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec) + + for child in html_tag.iterchildren(): + if isinstance(getattr(child, 'tag', None), string_or_bytes): + self.process_tag(child, stylizer, float_spec=float_spec) + else: # Comment/PI/etc. 
def create_block_from_parent(self, html_tag, stylizer):
    """Return the block that text belonging to *html_tag*'s parent goes into.

    The block is obtained from ``self.blocks.current_or_new_block`` using
    the parent element and the parent's computed style.
    """
    container = html_tag.getparent()
    container_style = stylizer.style(container)
    target = self.blocks.current_or_new_block(container, container_style)
    # Do not inherit page-break-before from parent
    target.page_break_before = False
    return target
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
    """Add the content of an inline tag to the block of its parent.

    Handles three cases: line breaks (``br``), images (``img``) and any
    other inline tag, whose leading text is added as a run. If the tag has
    an ``id`` or ``name`` attribute, a bookmark for it is attached to
    whatever gets added.
    """
    anchor = html_tag.get('id') or html_tag.get('name')
    bookmark = self.bookmark_for_anchor(anchor, html_tag) if anchor else None

    if tagname == 'br':
        # A br with no tail text that is also the last element child of its
        # parent produces no break at all
        is_trailing = not html_tag.tail and html_tag is tuple(html_tag.getparent().iterchildren('*'))[-1]
        if is_trailing:
            return
        block = self.create_block_from_parent(html_tag, stylizer)
        clear_values = {'both':'all', 'left':'left', 'right':'right'}
        block.add_break(clear=clear_values.get(tag_style['clear'], 'none'), bookmark=bookmark)
        return

    if tagname == 'img':
        block = self.create_block_from_parent(html_tag, stylizer)
        self.images_manager.add_image(html_tag, block, stylizer, bookmark=bookmark)
        return

    text = html_tag.text
    if not text and not bookmark:
        return
    # With no text, an empty run is still added so that the bookmark has
    # something to attach to
    block = self.create_block_from_parent(html_tag, stylizer)
    block.add_text(
        text or '', tag_style, is_parent_style=False, bookmark=bookmark,
        link=self.current_link, lang=self.current_lang)
def as_num(x):
    """Return *x* converted to a float, or the integer 0 if that fails."""
    try:
        number = float(x)
    except Exception:
        # Best effort by design: anything that is not a number counts as zero
        return 0
    return number
def create_image_markup(self, html_img, stylizer, href, as_block=False):
    """Build and return the ``w:drawing`` element for the image at *href*.

    :param html_img: The ``img`` element being converted.
    :param stylizer: Used to look up the computed style of the image and
        of its parent.
    :param href: Key into ``self.images``; the image must already have
        been read.
    :param as_block: True when the image is a block of its own rather
        than inline content.

    The image is emitted as ``wp:inline`` when no alignment applies and as
    an aligned ``wp:anchor`` otherwise.
    """
    # TODO: img inside a link (clickable image)
    style = stylizer.style(html_img)
    floating = style['float']
    if floating not in {'left', 'right'}:
        floating = None
    if as_block:
        ml, mr = style._get('margin-left'), style._get('margin-right')
        # An auto margin absorbs the free space on its side: both auto
        # centers the image, only the left one pushes it to the right and
        # only the right one keeps it on the left.
        if ml == 'auto' and mr == 'auto':
            floating = 'center'
        elif ml == 'auto':
            floating = 'right'
        elif mr == 'auto':
            floating = 'left'
    else:
        parent = html_img.getparent()
        if len(parent) == 1 and not (parent.text or '').strip() and not (html_img.tail or '').strip():
            pstyle = stylizer.style(parent)
            if 'block' in pstyle['display']:
                # We have an inline image alone inside a block
                as_block = True
                floating = pstyle['float']
                if floating not in {'left', 'right'}:
                    floating = None
                if pstyle['text-align'] in ('center', 'right'):
                    floating = pstyle['text-align']
                floating = floating or 'left'
    fake_margins = floating is None
    self.count += 1
    img = self.images[href]
    name = urlunquote(posixpath.basename(href))
    width, height = style.img_size(img.width, img.height)
    _scaled, width, height = fit_image(width, height, self.page_width, self.page_height)
    width, height = map(pt_to_emu, (width, height))

    makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces

    root = etree.Element('root', nsmap=namespaces)
    ans = makeelement(root, 'w:drawing', append=False)
    if floating is None:
        parent = makeelement(ans, 'wp:inline')
    else:
        parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
        # The next three lines are boilerplate that Word requires, even
        # though the DOCX specs define defaults for all of them
        parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
        parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
        makeelement(parent, 'wp:simplePos', x='0', y='0')
        makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
        makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
    makeelement(parent, 'wp:extent', cx=unicode_type(width), cy=unicode_type(height))
    if fake_margins:
        # DOCX does not support setting margins for inline images, so we
        # fake it by using effect extents to simulate margins
        makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))})
    else:
        makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
    if floating is not None:
        # Word requires the wrap element to come after the extent settings
        if as_block:
            makeelement(parent, 'wp:wrapTopAndBottom')
        else:
            makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
    self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height)
    return ans
def create_filename(self, href, fmt):
    """Return a file name for the image at *href*, unique ignoring case.

    The name is the ASCII form of the href's base name without its
    extension, cut to 75 characters (``image`` if nothing is left), with a
    counter appended while it clashes with a name already handed out, plus
    the lower-cased *fmt* as extension.
    """
    decoded = urlunquote(posixpath.basename(href))
    stem = posixpath.splitext(ascii_filename(decoded))[0]
    stem = stem[:75].rstrip('.') or 'image'

    candidate, counter = stem, 0
    while candidate.lower() in self.seen_filenames:
        counter += 1
        candidate = stem + unicode_type(counter)
    # Only the extension-less, lower-cased name is recorded as taken
    self.seen_filenames.add(candidate.lower())
    return candidate + os.extsep + fmt.lower()
def write_cover_block(self, body, cover_image):
    """Insert a paragraph holding *cover_image* at the start of *body*.

    The first ``pageBreakBefore`` element returned by the XPath query is
    switched on before the new paragraph is inserted.
    """
    namespace = self.document_relationships.namespace
    new_element = namespace.makeelement

    first_break = body[0].xpath('//*[local-name()="pageBreakBefore"]')[0]
    first_break.set('{%s}val' % namespace.namespaces['w'], 'on')

    paragraph = new_element(body, 'w:p', append=False)
    body.insert(0, paragraph)
    new_element(paragraph, 'w:r').append(cover_image)
def sanitize_bookmark_name(base, default='bookmark'):
    """Turn *base* into a string usable as a bookmark name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, the result is cut to 32 characters and trailing
    underscores are removed.

    :param base: Text to derive the name from.
    :param default: Returned when nothing usable is left, for example when
        *base* consists only of punctuation. Without it such input would
        produce an empty bookmark name.
    :return: A non-empty name, provided *default* is non-empty.
    """
    # Max length allowed by Word appears to be 40, we use 32 to leave some
    # space for making the name unique
    name = re.sub(r'[^0-9a-zA-Z]', '_', ascii_text(base))[:32].rstrip('_')
    return name or default
def serialize_hyperlink(self, parent, link):
    """Create the ``w:hyperlink`` element for *link* under *parent*.

    *link* is an ``(item, url, tooltip)`` tuple. URLs without a scheme are
    resolved against *item* and linked to a bookmark; ``http``, ``https``
    and ``ftp`` URLs become external relationships. In every other case no
    hyperlink is created and *parent* itself is returned.
    """
    item, url, tooltip = link
    parts = urlparse(url)
    scheme = parts.scheme

    def hyperlink(attr_name, attr_value):
        attrs = {}
        if attr_value is not None:
            attrs[attr_name] = attr_value
        if tooltip:
            attrs['w_tooltip'] = tooltip
        return self.namespace.makeelement(parent, 'w:hyperlink', **attrs)

    if not scheme:
        target = item.abshref(parts.path)
        if target not in self.document_hrefs:
            self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
            return parent
        # Unknown fragments fall back to the bookmark at the top of the file
        key = (target, parts.fragment or self.top_anchor)
        if key not in self.anchor_map:
            key = (target, self.top_anchor)
        return hyperlink('w_anchor', self.anchor_map[key])

    if scheme not in {'http', 'https', 'ftp'}:
        return parent

    if url not in self.external_links:
        self.external_links[url] = self.document_relationships.add_relationship(
            url, self.namespace.names['LINKS'], target_mode='External')
    return hyperlink('r_id', self.external_links[url])
def process_toc_links(self, oeb):
    """Rebuild ``self.toc`` from the table of contents of *oeb*.

    Nothing is collected unless ``oeb.toc`` is truthy and its ``count()``
    is greater than one. The first and last collected entries are flagged
    with ``is_first`` and ``is_last``.
    """
    self.toc = []
    source = oeb.toc
    if not source or not source.count() > 1:
        return
    for entry in source:
        self.process_toc_node(entry)
    if not self.toc:
        return
    first, last = self.toc[0], self.toc[-1]
    first.is_first = True
    last.is_last = True
def find_list_containers(list_tag, tag_style):
    """Return the ancestors of *list_tag* that declare a known list style.

    An ancestor qualifies when the ``list-style-type`` found in its own
    style declarations is one of ``LIST_STYLES``. The result is ordered
    from the nearest ancestor outwards.
    """
    stylizer = tag_style._stylizer
    containers = []
    current = list_tag
    ancestor = current.getparent()
    while ancestor is not None and ancestor is not current:
        declared = stylizer.style(ancestor)._style.get('list-style-type', None)
        if (declared or '').lower() in LIST_STYLES:
            containers.append(ancestor)
        current = ancestor
        ancestor = current.getparent()
    return containers
class Level(object):
    """One level (``w:lvl``) of a numbering definition."""

    def __init__(self, list_type, container, items, ilvl=0):
        """Work out start number, number format and label text.

        The start number comes from the ``value`` of the first entry in
        *items* when that is an integer, else from the ``start`` of
        *container*, else it is 1.
        """
        self.ilvl = ilvl

        start = 1
        try:
            start = int(container.get('start'))
        except Exception:
            pass
        if items:
            try:
                start = int(items[0].get('value'))
            except Exception:
                pass
        self.start = start

        if list_type in {'disc', 'circle', 'square'}:
            self.num_fmt = 'bullet'
            if list_type == 'disc':
                self.lvl_text = '\uf0b7'
            else:
                self.lvl_text = STYLE_MAP[list_type]
        else:
            self.lvl_text = '%{}.'.format(self.ilvl + 1)
            self.num_fmt = STYLE_MAP.get(list_type, 'decimal')

    def __hash__(self):
        key = (self.start, self.num_fmt, self.lvl_text)
        return hash(key)

    def serialize(self, parent, makeelement):
        """Write this level as a ``w:lvl`` child of *parent*."""
        lvl = makeelement(parent, 'w:lvl', w_ilvl=unicode_type(self.ilvl))
        makeelement(lvl, 'w:start', w_val=unicode_type(self.start))
        makeelement(lvl, 'w:numFmt', w_val=self.num_fmt)
        makeelement(lvl, 'w:lvlText', w_val=self.lvl_text)
        makeelement(lvl, 'w:lvlJc', w_val='left')

        ppr = makeelement(lvl, 'w:pPr')
        left_indent = 1152 + self.ilvl * 360
        makeelement(ppr, 'w:ind', w_hanging='360', w_left=unicode_type(left_indent))

        if self.num_fmt != 'bullet':
            return
        # The font is chosen to match the bullet character
        if self.lvl_text == '\uf0b7':
            font = 'Symbol'
        elif self.lvl_text == '\uf0a7':
            font = 'Wingdings'
        else:
            font = 'Courier New'
        rpr = makeelement(lvl, 'w:rPr')
        makeelement(rpr, 'w:rFonts', w_ascii=font, w_hAnsi=font, w_hint="default")
def parse_css_font_family(raw):
    """Yield the family names in the CSS ``font-family`` value *raw*.

    Only string and identifier tokens are yielded, in order. Iteration
    stops at the keyword ``inherit``. Nothing is yielded when the parser
    returns no declaration.
    """
    decl, _errs = css_parser.parse_style_attr('font-family:' + raw)
    if not decl:
        return
    for token in decl[0].value:
        # Test against a tuple of type names. A substring test against
        # 'STRING IDENT' also matches the whitespace token type 'S', which
        # would yield whitespace as a font name.
        if token.type in ('STRING', 'IDENT'):
            val = token.value
            if val == 'inherit':
                break
            yield val
max(tag_style['min-width'], tag_style['width'])) + if tag_style._get('height') == 'auto': + self.h_rule = 'auto' + else: + if tag_style['min-height'] > 0: + self.h_rule, self.h = 'atLeast', tag_style['min-height'] + else: + self.h_rule, self.h = 'exact', tag_style['height'] + self.h = int(20 * self.h) + self.h_space = int(20 * max(tag_style['margin-right'], tag_style['margin-left'])) + self.v_space = int(20 * max(tag_style['margin-top'], tag_style['margin-bottom'])) + + read_css_block_borders(self, tag_style) + + def serialize(self, block, parent): + if self.is_dropcaps: + attrs = dict(w_dropCap='drop', w_lines=unicode_type(self.dropcaps_lines), w_wrap='around', w_vAnchor='text', w_hAnchor='text') + else: + attrs = dict( + w_wrap='around', w_vAnchor='text', w_hAnchor='text', w_xAlign=self.x_align, w_y='1', + w_hSpace=unicode_type(self.h_space), w_vSpace=unicode_type(self.v_space), w_hRule=self.h_rule + ) + if self.w is not None: + attrs['w_w'] = unicode_type(self.w) + if self.h is not None: + attrs['w_h'] = unicode_type(self.h) + self.makeelement(parent, 'w:framePr', **attrs) + # Margins are already applied by the frame style, so override them to + # be zero on individual blocks + self.makeelement(parent, 'w:ind', w_left='0', w_leftChars='0', w_right='0', w_rightChars='0') + attrs = {} + if block is self.blocks[0]: + attrs.update(dict(w_before='0', w_beforeLines='0')) + if block is self.blocks[-1]: + attrs.update(dict(w_after='0', w_afterLines='0')) + if attrs: + self.makeelement(parent, 'w:spacing', **attrs) + # Similarly apply the same border and padding properties to all blocks + # in this floatspec + bdr = self.makeelement(parent, 'w:pBdr') + for edge in border_edges: + padding = getattr(self, 'padding_' + edge) + width = getattr(self, 'border_%s_width' % edge) + bstyle = getattr(self, 'border_%s_style' % edge) + self.makeelement( + bdr, 'w:'+edge, w_space=unicode_type(padding), w_val=bstyle, w_sz=unicode_type(width), w_color=getattr(self, 'border_%s_color' % 
class DOCXStyle(object):
    """Base class for styles written to the DOCX styles part.

    Subclasses list the attributes that define the style in ``ALL_PROPS``.
    Equality and hashing are based on exactly those attributes, so the
    attributes must be set before ``DOCXStyle.__init__`` runs.
    """

    ALL_PROPS = ()
    TYPE = 'paragraph'

    def __init__(self, namespace):
        self.namespace = namespace
        # Qualifies a local name with the ``w`` namespace
        self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
        self.id = self.name = None
        self.next_style = None
        self.calculate_hash()

    def calculate_hash(self):
        """Cache the hash of the current ``ALL_PROPS`` values."""
        self._hash = hash(tuple(
            getattr(self, x) for x in self.ALL_PROPS))

    def makeelement(self, parent, name, **attrs):
        """Create an element via ``parent.makeelement`` with the tag and
        all attribute names qualified by the ``w`` namespace."""
        return parent.makeelement(self.w(name), **{self.w(k):v for k, v in iteritems(attrs)})

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        for x in self.ALL_PROPS:
            if getattr(self, x) != getattr(other, x, None):
                return False
        return True

    def __ne__(self, other):
        return not self == other

    def __repr__(self):
        # A plain textual form: it works before id and name are assigned
        # and always returns text. Serializing to XML here needs a
        # normal_style argument and non-None id/name values.
        parts = ['id=%r' % (getattr(self, 'id', None),), 'name=%r' % (getattr(self, 'name', None),)]
        parts.extend('%s=%r' % (x, getattr(self, x, None)) for x in self.ALL_PROPS)
        return '%s(%s)' % (self.__class__.__name__, ', '.join(parts))
    __str__ = __repr__

    def serialize(self, styles, normal_style):
        """Append a ``style`` element for this style to *styles* and return it.

        A ``basedOn`` child pointing at *normal_style* is added unless this
        style is *normal_style* itself.
        """
        makeelement = self.makeelement
        style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
        style.append(makeelement(style, 'name', val=self.name))
        if self is not normal_style:
            style.append(makeelement(style, 'basedOn', val=normal_style.id))
        styles.append(style)
        return style
= max(0, int(float(css['font-size']) * 2)) # stylizer normalizes all font sizes into pts + except (ValueError, TypeError, AttributeError): + self.font_size = None + + fw = css['font-weight'] + self.bold = (fw.lower() if hasattr(fw, 'lower') else fw) in {'bold', 'bolder'} or int_or_zero(fw) >= 700 + self.italic = css['font-style'].lower() in {'italic', 'oblique'} + self.color = convert_color(css['color']) + self.background_color = None if is_parent_style else convert_color(css.backgroundColor) + td = set((css.effective_text_decoration or '').split()) + self.underline = 'underline' in td + self.dstrike = 'line-through' in td and 'overline' in td + self.strike = not self.dstrike and 'line-through' in td + self.text_transform = css['text-transform'] # TODO: If lowercase or capitalize, transform the actual text + self.caps = self.text_transform == 'uppercase' + self.small_caps = css['font-variant'].lower() in {'small-caps', 'smallcaps'} + self.shadow = css['text-shadow'] not in {'none', None} + try: + self.spacing = int(float(css['letter-spacing']) * 20) + except (ValueError, TypeError, AttributeError): + self.spacing = None + va = css.first_vertical_align + if isinstance(va, numbers.Number): + self.vertical_align = unicode_type(int(va * 2)) + else: + val = { + 'top':'superscript', 'text-top':'superscript', 'sup':'superscript', 'super':'superscript', + 'bottom':'subscript', 'text-bottom':'subscript', 'sub':'subscript'}.get(va) + self.vertical_align = val or 'baseline' + + self.padding = self.border_color = self.border_width = self.border_style = None + if not is_parent_style: + # DOCX does not support individual borders/padding for inline content + for edge in border_edges: + # In DOCX padding can only be a positive integer + try: + padding = max(0, int(css['padding-' + edge])) + except ValueError: + padding = 0 + if self.padding is None: + self.padding = padding + elif self.padding != padding: + self.padding = ignore + val = css['border-%s-width' % edge] + if not 
isinstance(val, numbers.Number): + val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0) + val = min(96, max(2, int(val * 8))) + if self.border_width is None: + self.border_width = val + elif self.border_width != val: + self.border_width = ignore + color = convert_color(css['border-%s-color' % edge]) + if self.border_color is None: + self.border_color = color + elif self.border_color != color: + self.border_color = ignore + style = LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none') + if self.border_style is None: + self.border_style = style + elif self.border_style != style: + self.border_style = ignore + + if self.padding in (None, ignore): + self.padding = 0 + if self.border_width in (None, ignore): + self.border_width = 0 + if self.border_style in (None, ignore): + self.border_style = 'none' + if self.border_color in (None, ignore): + self.border_color = 'auto' + if self.border_style == 'none': + self.border_width, self.border_color = 0, 'auto' + + DOCXStyle.__init__(self, namespace) + + def serialize_borders(self, bdr, normal_style): + w = self.w + is_normal_style = self is normal_style + if is_normal_style or self.padding != normal_style.padding: + bdr.set(w('space'), unicode_type(self.padding)) + if is_normal_style or self.border_width != normal_style.border_width: + bdr.set(w('sz'), unicode_type(self.border_width)) + if is_normal_style or self.border_style != normal_style.border_style: + bdr.set(w('val'), self.border_style) + if is_normal_style or self.border_color != normal_style.border_color: + bdr.set(w('color'), self.border_color) + return bdr + + def serialize(self, styles, normal_style): + makeelement = self.makeelement + style_root = DOCXStyle.serialize(self, styles, normal_style) + style = makeelement(style_root, 'rPr') + self.serialize_properties(style, normal_style) + if len(style) > 0: + style_root.append(style) + return style_root + + def serialize_properties(self, rPr, normal_style): + makeelement = self.makeelement + is_normal_style 
= self is normal_style + if is_normal_style or self.font_family != normal_style.font_family: + rPr.append(makeelement( + rPr, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()})) + + for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)): + val = getattr(self, attr) + if is_normal_style or getattr(normal_style, attr) != val: + for suffix in ('', 'Cs'): + rPr.append(makeelement(rPr, name + suffix, val=vmap(val))) + + def check_attr(attr): + val = getattr(self, attr) + return is_normal_style or (val != getattr(normal_style, attr)) + + if check_attr('color'): + rPr.append(makeelement(rPr, 'color', val=self.color or 'auto')) + if check_attr('background_color'): + rPr.append(makeelement(rPr, 'shd', fill=self.background_color or 'auto')) + if check_attr('underline'): + rPr.append(makeelement(rPr, 'u', val='single' if self.underline else 'none')) + if check_attr('dstrike'): + rPr.append(makeelement(rPr, 'dstrike', val=bmap(self.dstrike))) + if check_attr('strike'): + rPr.append(makeelement(rPr, 'strike', val=bmap(self.strike))) + if check_attr('caps'): + rPr.append(makeelement(rPr, 'caps', val=bmap(self.caps))) + if check_attr('small_caps'): + rPr.append(makeelement(rPr, 'smallCaps', val=bmap(self.small_caps))) + if check_attr('shadow'): + rPr.append(makeelement(rPr, 'shadow', val=bmap(self.shadow))) + if check_attr('spacing'): + rPr.append(makeelement(rPr, 'spacing', val=unicode_type(self.spacing or 0))) + if is_normal_style: + rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align if self.vertical_align in {'superscript', 'subscript'} else 'baseline')) + elif self.vertical_align != normal_style.vertical_align: + if self.vertical_align in {'superscript', 'subscript', 'baseline'}: + rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align)) + else: + rPr.append(makeelement(rPr, 'position', val=self.vertical_align)) + + bdr = self.serialize_borders(makeelement(rPr, 'bdr'), normal_style) + if 
bdr.attrib: + rPr.append(bdr) + + +class DescendantTextStyle(object): + + def __init__(self, parent_style, child_style): + self.id = self.name = None + self.makeelement = child_style.makeelement + + p = [] + + def add(name, **props): + p.append((name, frozenset(iteritems(props)))) + + def vals(attr): + return getattr(parent_style, attr), getattr(child_style, attr) + + def check(attr): + pval, cval = vals(attr) + return pval != cval + + if parent_style.font_family != child_style.font_family: + add('rFonts', **{k:child_style.font_family for k in 'ascii cs eastAsia hAnsi'.split()}) + + for name, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')): + pval, cval = vals(attr) + if pval != cval: + val = 'on' if attr in {'bold', 'italic'} else unicode_type(cval) # bold, italic are toggle properties + for suffix in ('', 'Cs'): + add(name + suffix, val=val) + + if check('color'): + add('color', val=child_style.color or 'auto') + if check('background_color'): + add('shd', fill=child_style.background_color or 'auto') + if check('underline'): + add('u', val='single' if child_style.underline else 'none') + if check('dstrike'): + add('dstrike', val=bmap(child_style.dstrike)) + if check('strike'): + add('strike', val='on') # toggle property + if check('caps'): + add('caps', val='on') # toggle property + if check('small_caps'): + add('smallCaps', val='on') # toggle property + if check('shadow'): + add('shadow', val='on') # toggle property + if check('spacing'): + add('spacing', val=unicode_type(child_style.spacing or 0)) + if check('vertical_align'): + val = child_style.vertical_align + if val in {'superscript', 'subscript', 'baseline'}: + add('vertAlign', val=val) + else: + add('position', val=val) + + bdr = {} + if check('padding'): + bdr['space'] = unicode_type(child_style.padding) + if check('border_width'): + bdr['sz'] = unicode_type(child_style.border_width) + if check('border_style'): + bdr['val'] = child_style.border_style + if check('border_color'): + 
def read_css_block_borders(self, css, store_css_style=False):
    '''
    Set the per-edge padding, margin and border attributes on self.

    For every edge in border_edges this sets ``padding_<edge>``,
    ``margin_<edge>``, ``css_margin_<edge>``, ``border_<edge>_width``,
    ``border_<edge>_color`` and ``border_<edge>_style``, plus
    ``border_<edge>_css_style`` when store_css_style is True.

    :param css: The style to read from, or None to apply the defaults
    :param store_css_style: Also record the lower-cased CSS border style
    '''
    named_widths = {'thin':0.2, 'medium':1, 'thick':2}
    for edge in border_edges:
        padding_attr, margin_attr = 'padding_' + edge, 'margin_' + edge
        css_margin_attr = 'css_margin_' + edge
        width_attr = 'border_%s_width' % edge
        color_attr = 'border_%s_color' % edge
        style_attr = 'border_%s_style' % edge
        css_style_attr = 'border_%s_css_style' % edge

        if css is None:
            for attr, value in (
                    (padding_attr, 0), (margin_attr, 0), (css_margin_attr, ''),
                    (width_attr, 2), (color_attr, None), (style_attr, 'none')):
                setattr(self, attr, value)
            if store_css_style:
                setattr(self, css_style_attr, 'none')
            continue

        # In DOCX padding can only be a positive integer
        try:
            padding = max(0, int(css['padding-' + edge]))
        except ValueError:
            padding = 0  # invalid value for padding
        setattr(self, padding_attr, padding)

        # In DOCX margin must be a positive integer in twips (twentieth of a point)
        try:
            margin = max(0, int(css['margin-' + edge] * 20))
        except ValueError:
            margin = 0  # for e.g.: margin: auto
        setattr(self, margin_attr, margin)
        setattr(self, css_margin_attr, css._style.get('margin-' + edge, ''))

        width = css['border-%s-width' % edge]
        if not isinstance(width, numbers.Number):
            # Keyword widths; anything unrecognized counts as zero
            width = named_widths.get(width, 0)
        # The stored width is clamped to the range 2..96
        setattr(self, width_attr, min(96, max(2, int(width * 8))))

        color = convert_color(css['border-%s-color' % edge])
        setattr(self, color_attr, color or 'auto')

        css_border_style = css['border-%s-style' % edge].lower()
        setattr(self, style_attr, LINE_STYLES.get(css_border_style, 'none'))
        if store_css_style:
            setattr(self, css_style_attr, css_border_style)
except AttributeError: + self.text_align = 'left' + + DOCXStyle.__init__(self, namespace) + + def serialize_borders(self, bdr, normal_style): + w = self.w + for edge in border_edges: + e = bdr.makeelement(w(edge)) + padding = getattr(self, 'padding_' + edge) + if (self is normal_style and padding > 0) or (padding != getattr(normal_style, 'padding_' + edge)): + e.set(w('space'), unicode_type(padding)) + width = getattr(self, 'border_%s_width' % edge) + bstyle = getattr(self, 'border_%s_style' % edge) + if (self is normal_style and width > 0 and bstyle != 'none' + ) or width != getattr(normal_style, 'border_%s_width' % edge + ) or bstyle != getattr(normal_style, 'border_%s_style' % edge): + e.set(w('val'), bstyle) + e.set(w('sz'), unicode_type(width)) + e.set(w('color'), getattr(self, 'border_%s_color' % edge)) + if e.attrib: + bdr.append(e) + return bdr + + def serialize(self, styles, normal_style): + makeelement = self.makeelement + style_root = DOCXStyle.serialize(self, styles, normal_style) + style = makeelement(style_root, 'pPr') + self.serialize_properties(style, normal_style) + if len(style) > 0: + style_root.append(style) + return style_root + + def serialize_properties(self, pPr, normal_style): + makeelement, w = self.makeelement, self.w + spacing = makeelement(pPr, 'spacing') + for edge, attr in iteritems({'top':'before', 'bottom':'after'}): + getter = attrgetter('css_margin_' + edge) + css_val, css_unit = parse_css_length(getter(self)) + if css_unit in ('em', 'ex'): + lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100))) + if (self is normal_style and lines > 0) or getter(self) != getter(normal_style): + spacing.set(w(attr + 'Lines'), unicode_type(lines)) + else: + getter = attrgetter('margin_' + edge) + val = getter(self) + if (self is normal_style and val > 0) or val != getter(normal_style): + spacing.set(w(attr), unicode_type(val)) + + if self is normal_style or self.line_height != normal_style.line_height: + spacing.set(w('line'), 
unicode_type(self.line_height)) + spacing.set(w('lineRule'), 'atLeast') + + if spacing.attrib: + pPr.append(spacing) + + ind = makeelement(pPr, 'ind') + for edge in ('left', 'right'): + getter = attrgetter('css_margin_' + edge) + css_val, css_unit = parse_css_length(getter(self)) + if css_unit in ('em', 'ex'): + chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100))) + if (self is normal_style and chars > 0) or getter(self) != getter(normal_style): + ind.set(w(edge + 'Chars'), unicode_type(chars)) + else: + getter = attrgetter('margin_' + edge) + val = getter(self) + if (self is normal_style and val > 0) or val != getter(normal_style): + ind.set(w(edge), unicode_type(val)) + ind.set(w(edge + 'Chars'), '0') # This is needed to override any declaration in the parent style + css_val, css_unit = parse_css_length(self.css_text_indent) + if css_unit in ('em', 'ex'): + chars = int(css_val * (50 if css_unit == 'ex' else 100)) + if css_val >= 0: + if (self is normal_style and chars > 0) or self.css_text_indent != normal_style.css_text_indent: + ind.set(w('firstLineChars'), unicode_type(chars)) + else: + if (self is normal_style and chars < 0) or self.css_text_indent != normal_style.css_text_indent: + ind.set(w('hangingChars'), unicode_type(abs(chars))) + else: + val = self.text_indent + if val >= 0: + if (self is normal_style and val > 0) or self.text_indent != normal_style.text_indent: + ind.set(w('firstLine'), unicode_type(val)) + ind.set(w('firstLineChars'), '0') # This is needed to override any declaration in the parent style + else: + if (self is normal_style and val < 0) or self.text_indent != normal_style.text_indent: + ind.set(w('hanging'), unicode_type(abs(val))) + ind.set(w('hangingChars'), '0') + if ind.attrib: + pPr.append(ind) + + if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color: + pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto')) + + pbdr = 
class StylesManager(object):
    '''
    Creates, de-duplicates, names and serializes the styles of a document.

    Text and block styles are interned so that equal styles share a single
    object. finalize() then assigns ids and names based on how often each
    style is used.
    '''

    def __init__(self, namespace, log, document_lang):
        self.namespace = namespace
        self.document_lang = lang_as_iso639_1(document_lang) or 'en'
        self.log = log
        # Map each style to itself so an equal, existing instance can be found
        self.block_styles, self.text_styles = {}, {}
        self.styles_for_html_blocks = {}

    def create_text_style(self, css_style, is_parent_style=False):
        ''' Return the TextStyle for css_style, reusing an equal existing one. '''
        ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
        existing = self.text_styles.get(ans, None)
        if existing is None:
            self.text_styles[ans] = ans
        else:
            ans = existing
        return ans

    def create_block_style(self, css_style, html_block, is_table_cell=False, parent_bg=None):
        ''' Return the BlockStyle for css_style, reusing an equal existing
        one, and record it as the style of html_block. '''
        ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
        existing = self.block_styles.get(ans, None)
        if existing is None:
            self.block_styles[ans] = ans
        else:
            ans = existing
        self.styles_for_html_blocks[html_block] = ans
        return ans

    def finalize(self, all_blocks):
        '''
        Assign ids, names and sequence numbers to all styles in use.

        Sets pure_block_styles, combined_styles, descendant_text_styles and
        primary_heading_style. normal_pure_block_style and normal_style are
        set only when at least one style of the corresponding kind exists.
        '''
        block_counts = Counter()
        used_pairs = defaultdict(list)
        heading_styles = defaultdict(list)
        headings = frozenset('h1 h2 h3 h4 h5 h6'.split())
        pure_block_styles = set()

        for block in all_blocks:
            bs = block.style
            block_counts[bs] += 1
            local_run_counts = Counter()
            for run in block.runs:
                local_run_counts[run.style] += run.style_weight
            if local_run_counts:
                # Pair the block style with the block's dominant run style
                rs = local_run_counts.most_common(1)[0][0]
                used_pairs[(bs, rs)].append(block)
                if block.html_tag in headings:
                    heading_styles[block.html_tag].append((bs, rs))
            else:
                # Blocks without runs only need a paragraph style
                pure_block_styles.add(bs)

        self.pure_block_styles = sorted(pure_block_styles, key=block_counts.__getitem__)
        bnum = len(unicode_type(max(1, len(pure_block_styles) - 1)))
        for i, bs in enumerate(self.pure_block_styles):
            bs.id = bs.name = '%0{}d Block'.format(bnum) % i
            bs.seq = i
            if i == 0:
                self.normal_pure_block_style = bs

        counts = Counter()
        smap = {}
        for (bs, rs), blocks in iteritems(used_pairs):
            s = CombinedStyle(bs, rs, blocks, self.namespace)
            smap[(bs, rs)] = s
            counts[s] += sum(1 for b in blocks if not b.is_empty())
        for i, heading_tag in enumerate(sorted(heading_styles)):
            styles = sorted((smap[k] for k in heading_styles[heading_tag]), key=counts.__getitem__)
            styles = [s for s in styles if s.outline_level is None]
            if styles:
                # The most used style without an outline level gets this level
                heading_style = styles[-1]
                heading_style.outline_level = i

        snum = len(unicode_type(max(1, len(counts) - 1)))
        heading_styles = []
        for i, (style, count) in enumerate(counts.most_common()):
            if i == 0:
                self.normal_style = style
                style.id = style.name = 'Normal'
            else:
                if style.outline_level is None:
                    val = 'Para %0{}d'.format(snum) % i
                else:
                    val = 'Heading %d' % (style.outline_level + 1)
                    heading_styles.append(style)
                style.id = style.name = val
            style.seq = i
        self.combined_styles = sorted(counts, key=attrgetter('seq'))
        for ls in self.combined_styles:
            ls.apply()

        descendant_style_map = {}
        ds_counts = Counter()
        for block in all_blocks:
            for run in block.runs:
                if run.parent_style is not run.style and run.parent_style and run.style:
                    ds = DescendantTextStyle(run.parent_style, run.style)
                    if ds.properties:
                        run.descendant_style = descendant_style_map.get(ds)
                        if run.descendant_style is None:
                            run.descendant_style = descendant_style_map[ds] = ds
                        ds_counts[run.descendant_style] += run.style_weight
        rnum = len(unicode_type(max(1, len(ds_counts) - 1)))
        for i, (text_style, count) in enumerate(ds_counts.most_common()):
            text_style.id = 'Text%d' % i
            text_style.name = '%0{}d Text'.format(rnum) % i
            text_style.seq = i
        self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq'))

        self.log.debug('%d Text Styles %d Combined styles' % (
            len(self.descendant_text_styles), len(self.combined_styles)))

        self.primary_heading_style = self._select_primary_heading_style(heading_styles)

    def _select_primary_heading_style(self, heading_styles):
        ''' Return the heading style with the lowest outline level or, when
        there are no heading styles, the combined style with the largest
        font size (None if no style has a positive font size). '''
        if heading_styles:
            heading_styles.sort(key=attrgetter('outline_level'))
            return heading_styles[0]
        ans, largest = None, 0
        for style in self.combined_styles:
            # font_size is None when the CSS font size could not be parsed,
            # and None cannot be ordered against an int on Python 3
            size = style.rs.font_size or 0
            if size > largest:
                ans, largest = style, size
        return ans

    def serialize(self, styles):
        ''' Write the document language and all finalized styles into the
        styles element tree. '''
        lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
        for k in tuple(lang.attrib):
            lang.attrib[k] = self.document_lang
        for style in self.combined_styles:
            style.serialize(styles, self.normal_style)
        for style in self.descendant_text_styles:
            style.serialize(styles)
        for style in sorted(self.pure_block_styles, key=attrgetter('seq')):
            style.serialize(styles, self.normal_pure_block_style)
def as_percent(x):
    '''
    Convert a percentage string such as ``'50%'`` into a float.

    Returns None when x is empty, does not end with a percent sign, or the
    text left after stripping the trailing percent signs is not a number.
    '''
    if not x or not x.endswith('%'):
        return None
    try:
        number = float(x.rstrip('%'))
    except Exception:
        return None
    return number
'middle':'center'}.get(tag_style._get('vertical-align')) + self.items = [] + self.width = convert_width(tag_style) + self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor) + read_css_block_borders(self, tag_style) + + def add_block(self, block): + self.items.append(block) + block.parent_items = self.items + + def add_table(self, table): + self.items.append(table) + return table + + def serialize(self, parent, makeelement): + tc = makeelement(parent, 'w:tc') + tcPr = makeelement(tc, 'w:tcPr') + makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=unicode_type(self.width[1])) + # For some reason, Word 2007 refuses to honor at the table or row + # level, despite what the specs say, so we inherit and apply at the + # cell level + bc = self.background_color or self.row.background_color or self.row.table.background_color + if bc: + makeelement(tcPr, 'w:shd', w_val="clear", w_color="auto", w_fill=bc) + + b = makeelement(tcPr, 'w:tcBorders', append=False) + for edge, border in iteritems(self.borders): + if border is not None and border.width > 0 and border.style != 'none': + makeelement(b, 'w:' + edge, w_val=border.style, w_sz=unicode_type(border.width), w_color=border.color) + if len(b) > 0: + tcPr.append(b) + + m = makeelement(tcPr, 'w:tcMar', append=False) + for edge in border_edges: + padding = getattr(self, 'padding_' + edge) + if edge in {'top', 'bottom'} or (edge == 'left' and self is self.row.first_cell) or (edge == 'right' and self is self.row.last_cell): + padding += getattr(self.row, 'padding_' + edge) + if padding > 0: + makeelement(m, 'w:' + edge, w_type='dxa', w_w=unicode_type(int(padding * 20))) + if len(m) > 0: + tcPr.append(m) + + if self.valign is not None: + makeelement(tcPr, 'w:vAlign', w_val=self.valign) + + if self.row_span > 1: + makeelement(tcPr, 'w:vMerge', w_val='restart') + if self.col_span > 1: + makeelement(tcPr, 'w:hMerge', w_val='restart') + + item = None + for item in self.items: + item.serialize(tc) 
+ if item is None or isinstance(item, Table): + # Word 2007 requires the last element in a table cell to be a paragraph + makeelement(tc, 'w:p') + + def applicable_borders(self, edge): + if edge == 'left': + items = {self.table, self.row, self} if self.row.first_cell is self else {self} + elif edge == 'top': + items = ({self.table} if self.table.first_row is self.row else set()) | {self, self.row} + elif edge == 'right': + items = {self.table, self, self.row} if self.row.last_cell is self else {self} + elif edge == 'bottom': + items = ({self.table} if self.table.last_row is self.row else set()) | {self, self.row} + return {getattr(x, 'border_' + edge) for x in items} + + def resolve_border(self, edge): + # In Word cell borders override table borders, and Word ignores row + # borders, so we consolidate all borders as cell borders + # In HTML the priority is as described here: + # http://www.w3.org/TR/CSS21/tables.html#border-conflict-resolution + neighbor = self.neighbor(edge) + borders = self.applicable_borders(edge) + if neighbor is not None: + nedge = {'left':'right', 'top':'bottom', 'right':'left', 'bottom':'top'}[edge] + borders |= neighbor.applicable_borders(nedge) + + for b in borders: + if b.css_style == 'hidden': + return None + + def weight(border): + return ( + 0 if border.css_style == 'none' else 1, + border.width, + border_style_weight.get(border.css_style, 0), + border.level) + border = sorted(borders, key=weight)[-1] + return border + + def resolve_borders(self): + self.borders = {edge:self.resolve_border(edge) for edge in border_edges} + + def neighbor(self, edge): + idx = self.row.cells.index(self) + ans = None + if edge == 'left': + ans = self.row.cells[idx-1] if idx > 0 else None + elif edge == 'right': + ans = self.row.cells[idx+1] if (idx + 1) < len(self.row.cells) else None + elif edge == 'top': + ridx = self.table.rows.index(self.row) + if ridx > 0 and idx < len(self.table.rows[ridx-1].cells): + ans = self.table.rows[ridx-1].cells[idx] + elif 
class Table(object):
    '''
    A table being built from HTML, serialized as a ``w:tbl`` element.

    Rows and cells are added through the start_new_*/add_* methods while the
    HTML is walked. finish_tag() closes the current row, and once the table's
    own tag is finished it expands spanned cells and resolves cell borders.
    '''

    BLEVEL = 0

    def __init__(self, namespace, html_tag, tag_style=None):
        self.namespace = namespace
        self.html_tag = html_tag
        self.orig_tag_style = tag_style
        self.rows = []
        self.current_row = None
        self.width = convert_width(tag_style)
        self.background_color = None if tag_style is None else convert_color(tag_style.backgroundColor)
        self.jc = None
        self.float = None
        self.margin_left = self.margin_right = self.margin_top = self.margin_bottom = None
        if tag_style is not None:
            ml, mr = tag_style._get('margin-left'), tag_style.get('margin-right')
            if ml == 'auto':
                # An auto left margin pushes the table right, or centers it
                # when the right margin is auto as well
                self.jc = 'center' if mr == 'auto' else 'right'
            self.float = tag_style['float']
            for edge in border_edges:
                setattr(self, 'margin_' + edge, tag_style['margin-' + edge])
        read_css_block_borders(self, tag_style)

    @property
    def first_row(self):
        return self.rows[0] if self.rows else None

    @property
    def last_row(self):
        return self.rows[-1] if self.rows else None

    def finish_tag(self, html_tag):
        '''
        Notify the table that html_tag has been fully processed.

        Returns True when html_tag is the table's own tag, meaning the table
        is complete.
        '''
        if self.current_row is not None:
            self.current_row.finish_tag(html_tag)
            if self.current_row.html_tag is html_tag:
                self.rows.append(self.current_row)
                self.current_row = None
        table_ended = self.html_tag is html_tag
        if table_ended:
            self.expand_spanned_cells()
            for row in self.rows:
                for cell in row.cells:
                    cell.resolve_borders()
        return table_ended

    def _rows_spanned_by(self, r, cell):
        ''' Return the rows below row index r that cell extends into. '''
        return self.rows[r+1:r+cell.row_span]

    def expand_spanned_cells(self):
        ''' Insert SpannedCell placeholders for every position covered by a
        cell with a column or row span greater than one. '''
        # Expand horizontally
        for row in self.rows:
            for cell in tuple(row.cells):
                idx = row.cells.index(cell)
                if cell.col_span > 1 and (cell is row.cells[-1] or not isinstance(row.cells[idx+1], SpannedCell)):
                    row.cells[idx:idx+1] = [cell] + [SpannedCell(cell, horizontal=True) for i in range(1, cell.col_span)]

        # Expand vertically
        for r, row in enumerate(self.rows):
            for idx, cell in enumerate(row.cells):
                if cell.row_span > 1:
                    # Only the row_span - 1 rows directly below are covered.
                    # Iterating over all remaining rows would merge the cell
                    # down to the end of the table.
                    for nrow in self._rows_spanned_by(r, cell):
                        sc = SpannedCell(cell, horizontal=False)
                        try:
                            tcell = nrow.cells[idx]
                        except IndexError:
                            tcell = None
                        if tcell is None:
                            # The row is too short, pad it up to this column
                            nrow.cells.extend([SpannedCell(nrow.cells[-1], horizontal=True) for i in range(idx - len(nrow.cells))])
                            nrow.cells.append(sc)
                        else:
                            if isinstance(tcell, SpannedCell):
                                # Conflict between rowspan and colspan
                                break
                            else:
                                nrow.cells.insert(idx, sc)

    def start_new_row(self, html_tag, html_style):
        if self.current_row is not None:
            self.rows.append(self.current_row)
        self.current_row = Row(self, html_tag, html_style)

    def start_new_cell(self, html_tag, html_style):
        if self.current_row is None:
            # A cell outside of any row gets an implicit, unstyled row
            self.start_new_row(html_tag, None)
        self.current_row.start_new_cell(html_tag, html_style)

    def add_block(self, block):
        self.current_row.add_block(block)

    def add_table(self, table):
        if self.current_row is None:
            self.current_row = Row(self, self.html_tag, self.orig_tag_style)
        return self.current_row.add_table(table)

    def serialize(self, parent):
        ''' Append this table to parent. Nothing is written when no row has
        any cells. '''
        makeelement = self.namespace.makeelement
        rows = [r for r in self.rows if r.cells]
        if not rows:
            return
        tbl = makeelement(parent, 'w:tbl')
        tblPr = makeelement(tbl, 'w:tblPr')
        makeelement(tblPr, 'w:tblW', w_type=self.width[0], w_w=unicode_type(self.width[1]))
        if self.float in {'left', 'right'}:
            kw = {'w_vertAnchor':'text', 'w_horzAnchor':'text', 'w_tblpXSpec':self.float}
            for edge in border_edges:
                val = getattr(self, 'margin_' + edge) or 0
                if {self.float, edge} == {'left', 'right'}:
                    # The side opposite the float gets a margin of at least 2
                    val = max(val, 2)
                kw['w_' + edge + 'FromText'] = unicode_type(max(0, int(val *20)))
            makeelement(tblPr, 'w:tblpPr', **kw)
        if self.jc is not None:
            makeelement(tblPr, 'w:jc', w_val=self.jc)
        for row in rows:
            row.serialize(tbl, makeelement)
def get_font_properties(rule, default=None):
    '''
    Extract the normalized font properties of a CSS rule.

    Returns a dict with the keys font-family, src, font-weight, font-stretch
    and font-style. A missing or unreadable font-family or src becomes None.
    For the other three, default is used when the value is missing, invalid
    or ``inherit``. Note that shorthand font property should already have
    been expanded by the CSS flattening code.
    '''
    keyword_props = {'font-weight', 'font-stretch', 'font-style'}
    weight_aliases = {'normal':'400', 'bold':'700'}
    valid_weights = {'100', '200', '300', '400', '500', '600', '700',
                     '800', '900', 'bolder', 'lighter'}
    valid_styles = {'normal', 'italic', 'oblique'}
    valid_stretches = {'normal', 'ultra-condensed', 'extra-condensed',
                       'condensed', 'semi-condensed', 'semi-expanded',
                       'expanded', 'extra-expanded', 'ultra-expanded'}

    declarations = rule.style
    ans = {}
    for prop in ('font-family', 'src', 'font-weight', 'font-stretch',
                 'font-style'):
        try:
            first = declarations.getProperty(prop).propertyValue[0]
            # src is read as a URI, everything else as a plain value
            value = getattr(first, 'uri' if prop == 'src' else 'value')
            if prop == 'font-family':
                value = parse_font_family(css_text(declarations.getProperty(prop).propertyValue))
                if value and value[0] == 'inherit':
                    value = None
        except (IndexError, KeyError, AttributeError, TypeError, ValueError):
            value = default if prop in keyword_props else None

        if prop in keyword_props:
            if value or value == 0:
                value = unicode_type(value).lower()
            if value == 'inherit':
                value = default

        if prop == 'font-weight':
            value = weight_aliases.get(value, value)
            if value not in valid_weights:
                value = default
            # default itself may be the keyword normal
            if value == 'normal':
                value = '400'
        elif prop == 'font-style':
            if value not in valid_styles:
                value = default
        elif prop == 'font-stretch':
            if value not in valid_stretches:
                value = default
        ans[prop] = value
    return ans
def elem_style(style_rules, cls, inherited_style):
    '''
    Compute the effective font style of an element.

    The inherited style is copied (never mutated) and overlaid, in order,
    with the properties of every class named in cls. A resulting relative
    font-weight (bolder/lighter) is resolved against the inherited weight.
    '''
    effective = inherited_style.copy()
    for class_name in cls.split():
        effective.update(style_rules.get(class_name, {}))

    requested = effective.get('font-weight', None)
    parent_weight = inherited_style.get('font-weight', '400')

    if requested == 'bolder':
        steps_up = {
            '100':'400',
            '200':'400',
            '300':'400',
            '400':'700',
            '500':'700',
        }
        # Anything at 600 or above saturates at the heaviest weight
        effective['font-weight'] = steps_up.get(parent_weight, '900')
    elif requested == 'lighter':
        steps_down = {
            '600':'400', '700':'400',
            '800':'700', '900':'700'}
        # Anything at 500 or below saturates at the lightest weight
        effective['font-weight'] = steps_down.get(parent_weight, '100')

    return effective
+ ''' + + def __call__(self, oeb, log, opts): + self.oeb, self.log, self.opts = oeb, log, opts + + self.find_embedded_fonts() + if not self.embedded_fonts: + self.log.debug('No embedded fonts found') + return + self.find_style_rules() + self.find_font_usage() + + totals = [0, 0] + + def remove(font): + totals[1] += len(font['item'].data) + self.oeb.manifest.remove(font['item']) + font['rule'].parentStyleSheet.deleteRule(font['rule']) + + fonts = {} + for font in self.embedded_fonts: + item, chars = font['item'], font['chars'] + if item.href in fonts: + fonts[item.href]['chars'] |= chars + else: + fonts[item.href] = font + + for font in itervalues(fonts): + if not font['chars']: + self.log('The font %s is unused. Removing it.'%font['src']) + remove(font) + continue + try: + raw, old_stats, new_stats = subset(font['item'].data, font['chars']) + except NoGlyphs: + self.log('The font %s has no used glyphs. Removing it.'%font['src']) + remove(font) + continue + except UnsupportedFont as e: + self.log.warn('The font %s is unsupported for subsetting. %s'%( + font['src'], e)) + sz = len(font['item'].data) + totals[0] += sz + totals[1] += sz + else: + font['item'].data = raw + nlen = sum(itervalues(new_stats)) + olen = sum(itervalues(old_stats)) + self.log('Decreased the font %s to %.1f%% of its original size'% + (font['src'], nlen/olen *100)) + totals[0] += nlen + totals[1] += olen + + font['item'].unload_data_from_memory() + + if totals[0]: + self.log('Reduced total font size to %.1f%% of original'% + (totals[0]/totals[1] * 100)) + + def find_embedded_fonts(self): + ''' + Find all @font-face rules and extract the relevant info from them. 
def used_font(self, style):
    '''
    Given a style find the embedded font that matches it. Returns None if
    no match is found (can happen if no family matches).

    Candidates from self.embedded_fonts are narrowed successively by
    font-family, font-stretch, font-style and finally font-weight. As a side
    effect every family-matched font dict gets a 'width' key holding the
    numeric rank of its own font-stretch.

    :param style: dict of effective font properties for an element
        ('font-family', 'font-stretch', 'font-style', 'font-weight').
    :return: The matching entry of self.embedded_fonts, or None.
    '''
    ff = style.get('font-family', [])
    lnames = {unicode_type(x).lower() for x in ff}

    # Filter on font-family
    matching_set = [
        ef for ef in self.embedded_fonts
        if lnames.intersection(x.lower() for x in ef.get('font-family', []))]
    if not matching_set:
        return None

    # Filter on font-stretch
    widths = {x:i for i, x in enumerate(('ultra-condensed',
        'extra-condensed', 'condensed', 'semi-condensed', 'normal',
        'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
        ))}

    width = widths[style.get('font-stretch', 'normal')]
    for f in matching_set:
        # The rank must come from the font itself, not from the requested
        # style, otherwise every candidate is at distance zero and the
        # stretch filter is a no-op.
        f['width'] = widths[f.get('font-stretch', 'normal')]

    min_dist = min(abs(width-f['width']) for f in matching_set)
    nearest = [f for f in matching_set if abs(width-f['width']) ==
        min_dist]
    if width <= 4:
        # Normal or condensed request: prefer narrower faces
        lmatches = [f for f in nearest if f['width'] <= width]
    else:
        # Expanded request: prefer wider faces
        lmatches = [f for f in nearest if f['width'] >= width]
    matching_set = (lmatches or nearest)

    # Filter on font-style
    fs = style.get('font-style', 'normal')
    order = {
            'oblique':['oblique', 'italic', 'normal'],
            'normal':['normal', 'oblique', 'italic']
        }.get(fs, ['italic', 'oblique', 'normal'])
    for q in order:
        matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
        if matches:
            matching_set = matches
            break

    # Filter on font weight
    fw = int(style.get('font-weight', '400'))
    lighter = list(range(fw-100, 0, -100))  # fw-100 down to 100
    bolder = list(range(fw+100, 1000, 100))  # fw+100 up to 900
    if fw == 400:
        q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
    elif fw == 500:
        q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
    elif fw < 400:
        # Light text: try lighter faces first, then progressively bolder
        q = [fw] + lighter + bolder
    else:
        # Bold text: try bolder faces first, then progressively lighter
        q = [fw] + bolder + lighter
    for wt in q:
        matches = [f for f in matching_set if f['weight'] == wt]
        if matches:
            return matches[0]
    return None
# Sizes {{{
# All sizes are (width, height) tuples in PDF points (1/72 inch), in
# portrait orientation.
inch = 72.0
cm = inch / 2.54
mm = cm * 0.1
pica = 12.0
didot = 0.375 * mm
cicero = 12 * didot

_W, _H = (21*cm, 29.7*cm)

A6 = (_W*.5, _H*.5)
A5 = (_H*.5, _W)
A4 = (_W, _H)
A3 = (_H, _W*2)
A2 = (_W*2, _H*2)
A1 = (_H*2, _W*4)
A0 = (_W*4, _H*4)

LETTER = (8.5*inch, 11*inch)
LEGAL = (8.5*inch, 14*inch)
ELEVENSEVENTEEN = (11*inch, 17*inch)

# The B series follows the same halving pattern as the A series, starting
# from B4 = 25cm x 35.3cm
_BW, _BH = (25*cm, 35.3*cm)
B6 = (_BW*.5, _BH*.5)
B5 = (_BH*.5, _BW)
B4 = (_BW, _BH)
B3 = (_BH, _BW*2)
B2 = (_BW*2, _BH*2)
B1 = (_BH*2, _BW*4)
B0 = (_BW*4, _BH*4)

PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2'
               ' b3 b4 b5 b6 letter legal').split()}

# }}}


def fmtnum(o):
    '''
    Format a number as text: floats go through pdf_float(), everything else
    through unicode_type().
    '''
    if isinstance(o, float):
        return pdf_float(o)
    return unicode_type(o)


def serialize(o, stream):
    '''
    Write the PDF representation of o to stream.

    Handles floats, bools, integers, None, datetimes and any object that
    provides a pdf_serialize(stream) method.

    :param o: The object to serialize.
    :param stream: Output object providing write() and write_raw().
    :raises ValueError: If o is of an unsupported type.
    '''
    if isinstance(o, float):
        stream.write_raw(pdf_float(o).encode('ascii'))
    elif isinstance(o, bool):
        # Must check bool before int as bools are subclasses of int
        stream.write_raw(b'true' if o else b'false')
    elif isinstance(o, numbers.Integral):
        stream.write_raw(unicode_type(o).encode('ascii') if ispy3 else bytes(o))
    elif hasattr(o, 'pdf_serialize'):
        o.pdf_serialize(stream)
    elif o is None:
        stream.write_raw(b'null')
    elif isinstance(o, datetime):
        # Seconds are clamped to 59 so that a leap second cannot produce an
        # out of range value. %z expands to +HHMM for aware datetimes and to
        # the empty string for naive ones.
        val = o.strftime("D:%Y%m%d%H%M%%02d%z")%min(59, o.second)
        if o.utcoffset() is not None:
            # Rewrite the trailing +HHMM as the PDF form +HH'mm'
            val = "(%s'%s')"%(val[:-2], val[-2:])
        else:
            # No offset to rewrite, the date is still a PDF string literal
            val = "(%s)"%val
        stream.write(val.encode('ascii'))
    else:
        raise ValueError('Unknown object: %r'%o)
in PDF 1.7 spec + bad.append((i, bad_map[num])) + bad = sorted(indices + bad, reverse=True) + if not bad: + return bytestring + for i, repl in bad: + ba[i:i+1] = (92, repl) # 92 = ord('\') + return bytes(ba) + + +class String(unicode_type): + + def pdf_serialize(self, stream): + try: + raw = self.encode('latin1') + if raw.startswith(codecs.BOM_UTF16_BE): + raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be') + except UnicodeEncodeError: + raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be') + stream.write(b'('+escape_pdf_string(raw)+b')') + + +class UTF16String(unicode_type): + + def pdf_serialize(self, stream): + raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be') + if False: + # Disabled as the parentheses based strings give easier to debug + # PDF files + stream.write(b'<' + as_hex_bytes(raw) + b'>') + else: + stream.write(b'('+escape_pdf_string(raw)+b')') + + +class Dictionary(dict): + + def pdf_serialize(self, stream): + stream.write(b'<<' + EOL) + sorted_keys = sorted(self, + key=lambda x:({'Type':'1', 'Subtype':'2'}.get( + x, x)+x)) + for k in sorted_keys: + serialize(Name(k), stream) + stream.write(b' ') + serialize(self[k], stream) + stream.write(EOL) + stream.write(b'>>' + EOL) + + +class InlineDictionary(Dictionary): + + def pdf_serialize(self, stream): + stream.write(b'<< ') + for k, v in iteritems(self): + serialize(Name(k), stream) + stream.write(b' ') + serialize(v, stream) + stream.write(b' ') + stream.write(b'>>') + + +class Array(list): + + def pdf_serialize(self, stream): + stream.write(b'[') + for i, o in enumerate(self): + if i != 0: + stream.write(b' ') + serialize(o, stream) + stream.write(b']') + + +class Stream(BytesIO): + + def __init__(self, compress=False): + BytesIO.__init__(self) + self.compress = compress + self.filters = Array() + + def add_extra_keys(self, d): + pass + + def pdf_serialize(self, stream): + raw = self.getvalue() + dl = len(raw) + filters = self.filters + if self.compress: + filters.append(Name('FlateDecode')) + raw = 
class Reference(object):

    '''
    An indirect reference to a numbered PDF object, rendered as "<num> 0 R".
    '''

    def __init__(self, num, obj):
        self.num = num
        self.obj = obj

    def __repr__(self):
        return '%d 0 R'%self.num

    def __str__(self):
        return repr(self)

    def pdf_serialize(self, stream):
        # Only the object number is written, never the referenced object
        stream.write(('%d 0 R'%self.num).encode('ascii'))
class FixedProperty(object):

    '''
    Descriptor that exposes a 16.16 fixed point number, stored as an integer
    in the attribute called name, as an ordinary number.
    '''

    def __init__(self, name):
        self.name = name

    def __get__(self, obj, type=None):
        val = getattr(obj, self.name)
        return val / 0x10000

    def __set__(self, obj, val):
        # Store the converted value on the instance. Merely returning it (a
        # descriptor's __set__ return value is ignored) would silently drop
        # the assignment.
        setattr(obj, self.name, int(round(val*(0x10000))))
python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +# cff_standard_strings {{{ +# The 391 Standard Strings as used in the CFF format. +# from Adobe Technical None #5176, version 1.0, 18 March 1998 + +cff_standard_strings = [ +'.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', 'dollar', 'percent', +'ampersand', 'quoteright', 'parenleft', 'parenright', 'asterisk', 'plus', +'comma', 'hyphen', 'period', 'slash', 'zero', 'one', 'two', 'three', 'four', +'five', 'six', 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal', +'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', +'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +'bracketleft', 'backslash', 'bracketright', 'asciicircum', 'underscore', +'quoteleft', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', +'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'braceleft', +'bar', 'braceright', 'asciitilde', 'exclamdown', 'cent', 'sterling', +'fraction', 'yen', 'florin', 'section', 'currency', 'quotesingle', +'quotedblleft', 'guillemotleft', 'guilsinglleft', 'guilsinglright', 'fi', 'fl', +'endash', 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet', +'quotesinglbase', 'quotedblbase', 'quotedblright', 'guillemotright', +'ellipsis', 'perthousand', 'questiondown', 'grave', 'acute', 'circumflex', +'tilde', 'macron', 'breve', 'dotaccent', 'dieresis', 'ring', 'cedilla', +'hungarumlaut', 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash', +'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', 'oslash', 'oe', +'germandbls', 'onesuperior', 'logicalnot', 'mu', 'trademark', 'Eth', 'onehalf', +'plusminus', 'Thorn', 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn', +'threequarters', 
'twosuperior', 'registered', 'minus', 'eth', 'multiply', +'threesuperior', 'copyright', 'Aacute', 'Acircumflex', 'Adieresis', 'Agrave', +'Aring', 'Atilde', 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave', +'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', 'Oacute', +'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', 'Scaron', 'Uacute', +'Ucircumflex', 'Udieresis', 'Ugrave', 'Yacute', 'Ydieresis', 'Zcaron', +'aacute', 'acircumflex', 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', +'eacute', 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex', +'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', 'odieresis', +'ograve', 'otilde', 'scaron', 'uacute', 'ucircumflex', 'udieresis', 'ugrave', +'yacute', 'ydieresis', 'zcaron', 'exclamsmall', 'Hungarumlautsmall', +'dollaroldstyle', 'dollarsuperior', 'ampersandsmall', 'Acutesmall', +'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', 'onedotenleader', +'zerooldstyle', 'oneoldstyle', 'twooldstyle', 'threeoldstyle', 'fouroldstyle', +'fiveoldstyle', 'sixoldstyle', 'sevenoldstyle', 'eightoldstyle', +'nineoldstyle', 'commasuperior', 'threequartersemdash', 'periodsuperior', +'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', 'dsuperior', +'esuperior', 'isuperior', 'lsuperior', 'msuperior', 'nsuperior', 'osuperior', +'rsuperior', 'ssuperior', 'tsuperior', 'ff', 'ffi', 'ffl', 'parenleftinferior', +'parenrightinferior', 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', +'Asmall', 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', 'Hsmall', +'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', 'Nsmall', 'Osmall', 'Psmall', +'Qsmall', 'Rsmall', 'Ssmall', 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', +'Ysmall', 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall', +'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', 'Zcaronsmall', +'Dieresissmall', 'Brevesmall', 'Caronsmall', 'Dotaccentsmall', 'Macronsmall', +'figuredash', 'hypheninferior', 
'Ogoneksmall', 'Ringsmall', 'Cedillasmall', +'questiondownsmall', 'oneeighth', 'threeeighths', 'fiveeighths', +'seveneighths', 'onethird', 'twothirds', 'zerosuperior', 'foursuperior', +'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior', +'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', 'threeinferior', +'fourinferior', 'fiveinferior', 'sixinferior', 'seveninferior', +'eightinferior', 'nineinferior', 'centinferior', 'dollarinferior', +'periodinferior', 'commainferior', 'Agravesmall', 'Aacutesmall', +'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', 'Aringsmall', 'AEsmall', +'Ccedillasmall', 'Egravesmall', 'Eacutesmall', 'Ecircumflexsmall', +'Edieresissmall', 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall', +'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', 'Oacutesmall', +'Ocircumflexsmall', 'Otildesmall', 'Odieresissmall', 'OEsmall', 'Oslashsmall', +'Ugravesmall', 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall', +'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', '001.001', '001.002', +'001.003', 'Black', 'Bold', 'Book', 'Light', 'Medium', 'Regular', 'Roman', +'Semibold' +] +# }}} + + +STANDARD_CHARSETS = [ # {{{ +# ISOAdobe +(".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar", + "percent", "ampersand", "quoteright", "parenleft", "parenright", + "asterisk", "plus", "comma", "hyphen", "period", "slash", "zero", + "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", + "colon", "semicolon", "less", "equal", "greater", "question", "at", + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", + "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", + "bracketleft", "backslash", "bracketright", "asciicircum", + "underscore", "quoteleft", "a", "b", "c", "d", "e", "f", "g", "h", "i", + "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", + "x", "y", "z", "braceleft", "bar", "braceright", "asciitilde", + "exclamdown", "cent", "sterling", "fraction", 
"yen", "florin", + "section", "currency", "quotesingle", "quotedblleft", "guillemotleft", + "guilsinglleft", "guilsinglright", "fi", "fl", "endash", "dagger", + "daggerdbl", "periodcentered", "paragraph", "bullet", "quotesinglbase", + "quotedblbase", "quotedblright", "guillemotright", "ellipsis", + "perthousand", "questiondown", "grave", "acute", "circumflex", "tilde", + "macron", "breve", "dotaccent", "dieresis", "ring", "cedilla", + "hungarumlaut", "ogonek", "caron", "emdash", "AE", "ordfeminine", + "Lslash", "Oslash", "OE", "ordmasculine", "ae", "dotlessi", "lslash", + "oslash", "oe", "germandbls", "onesuperior", "logicalnot", "mu", + "trademark", "Eth", "onehalf", "plusminus", "Thorn", "onequarter", + "divide", "brokenbar", "degree", "thorn", "threequarters", + "twosuperior", "registered", "minus", "eth", "multiply", + "threesuperior", "copyright", "Aacute", "Acircumflex", "Adieresis", + "Agrave", "Aring", "Atilde", "Ccedilla", "Eacute", "Ecircumflex", + "Edieresis", "Egrave", "Iacute", "Icircumflex", "Idieresis", "Igrave", + "Ntilde", "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde", + "Scaron", "Uacute", "Ucircumflex", "Udieresis", "Ugrave", "Yacute", + "Ydieresis", "Zcaron", "aacute", "acircumflex", "adieresis", "agrave", + "aring", "atilde", "ccedilla", "eacute", "ecircumflex", "edieresis", + "egrave", "iacute", "icircumflex", "idieresis", "igrave", "ntilde", + "oacute", "ocircumflex", "odieresis", "ograve", "otilde", "scaron", + "uacute", "ucircumflex", "udieresis", "ugrave", "yacute", "ydieresis", + "zcaron"), + +# Expert +("notdef", "space", "exclamsmall", "Hungarumlautsmall", "dollaroldstyle", + "dollarsuperior", "ampersandsmall", "Acutesmall", "parenleftsuperior", + "parenrightsuperior", "twodotenleader", "onedotenleader", "comma", + "hyphen", "period", "fraction", "zerooldstyle", "oneoldstyle", + "twooldstyle", "threeoldstyle", "fouroldstyle", "fiveoldstyle", + "sixoldstyle", "sevenoldstyle", "eightoldstyle", "nineoldstyle", + "colon", 
"semicolon", "commasuperior", "threequartersemdash", + "periodsuperior", "questionsmall", "asuperior", "bsuperior", + "centsuperior", "dsuperior", "esuperior", "isuperior", "lsuperior", + "msuperior", "nsuperior", "osuperior", "rsuperior", "ssuperior", + "tsuperior", "ff", "fi", "fl", "ffi", "ffl", "parenleftinferior", + "parenrightinferior", "Circumflexsmall", "hyphensuperior", + "Gravesmall", "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", + "Fsmall", "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall", + "Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", + "Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall", + "colonmonetary", "onefitted", "rupiah", "Tildesmall", + "exclamdownsmall", "centoldstyle", "Lslashsmall", "Scaronsmall", + "Zcaronsmall", "Dieresissmall", "Brevesmall", "Caronsmall", + "Dotaccentsmall", "Macronsmall", "figuredash", "hypheninferior", + "Ogoneksmall", "Ringsmall", "Cedillasmall", "onequarter", "onehalf", + "threequarters", "questiondownsmall", "oneeighth", "threeeighths", + "fiveeighths", "seveneighths", "onethird", "twothirds", "zerosuperior", + "onesuperior", "twosuperior", "threesuperior", "foursuperior", + "fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior", + "ninesuperior", "zeroinferior", "oneinferior", "twoinferior", + "threeinferior", "fourinferior", "fiveinferior", "sixinferior", + "seveninferior", "eightinferior", "nineinferior", "centinferior", + "dollarinferior", "periodinferior", "commainferior", "Agravesmall", + "Aacutesmall", "Acircumflexsmall", "Atildesmall", "Adieresissmall", + "Aringsmall", "AEsmall", "Ccedillasmall", "Egravesmall", "Eacutesmall", + "Ecircumflexsmall", "Edieresissmall", "Igravesmall", "Iacutesmall", + "Icircumflexsmall", "Idieresissmall", "Ethsmall", "Ntildesmall", + "Ogravesmall", "Oacutesmall", "Ocircumflexsmall", "Otildesmall", + "Odieresissmall", "OEsmall", "Oslashsmall", "Ugravesmall", + "Uacutesmall", "Ucircumflexsmall", "Udieresissmall", 
"Yacutesmall", + "Thornsmall", "Ydieresissmall"), + +# Expert Subset +(".notdef", "space", "dollaroldstyle", "dollarsuperior", + "parenleftsuperior", "parenrightsuperior", "twodotenleader", + "onedotenleader", "comma", "hyphen", "period", "fraction", + "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle", + "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle", + "eightoldstyle", "nineoldstyle", "colon", "semicolon", + "commasuperior", "threequartersemdash", "periodsuperior", + "asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior", + "isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior", + "rsuperior", "ssuperior", "tsuperior", "ff", "fi", "fl", "ffi", + "ffl", "parenleftinferior", "parenrightinferior", "hyphensuperior", + "colonmonetary", "onefitted", "rupiah", "centoldstyle", + "figuredash", "hypheninferior", "onequarter", "onehalf", + "threequarters", "oneeighth", "threeeighths", "fiveeighths", + "seveneighths", "onethird", "twothirds", "zerosuperior", + "onesuperior", "twosuperior", "threesuperior", "foursuperior", + "fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior", + "ninesuperior", "zeroinferior", "oneinferior", "twoinferior", + "threeinferior", "fourinferior", "fiveinferior", "sixinferior", + "seveninferior", "eightinferior", "nineinferior", "centinferior", + "dollarinferior", "periodinferior", "commainferior"), +] # }}} + diff --git a/ebook_converter/utils/fonts/sfnt/cff/dict_data.py b/ebook_converter/utils/fonts/sfnt/cff/dict_data.py new file mode 100644 index 0000000..80a87f1 --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/cff/dict_data.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import pack, unpack_from +from polyglot.builtins import 
# Dispatch tables mapping the first byte of an encoded token to the name of
# the ByteCode/Dict method that decodes it.
t1_operand_encoding = [None] * 256
t1_operand_encoding[0:32] = (32) * ["do_operator"]
t1_operand_encoding[32:247] = (247 - 32) * ["read_byte"]
t1_operand_encoding[247:251] = (251 - 247) * ["read_small_int1"]
t1_operand_encoding[251:255] = (255 - 251) * ["read_small_int2"]
t1_operand_encoding[255] = "read_long_int"

t2_operand_encoding = t1_operand_encoding[:]
t2_operand_encoding[28] = "read_short_int"
t2_operand_encoding[255] = "read_fixed_1616"

cff_dict_operand_encoding = t2_operand_encoding[:]
cff_dict_operand_encoding[29] = "read_long_int"
cff_dict_operand_encoding[30] = "read_real_number"
cff_dict_operand_encoding[255] = "reserved"

# Nibble values used by the packed real number encoding. Index 13 is
# unused and 15 (0xf) terminates a number.
real_nibbles = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        '.', 'E', 'E-', None, '-']
real_nibbles_map = {x:i for i, x in enumerate(real_nibbles)}


class ByteCode(dict):

    '''
    Readers and writers for the operand encodings listed in the tables above.

    Every read_* method takes the already consumed first byte b0, the data
    and the index of the next unread byte, and returns (value, new_index).
    Every write_* method returns the encoded bytes.
    '''

    def read_byte(self, b0, data, index):
        return b0 - 139, index

    def read_small_int1(self, b0, data, index):
        b1 = ord(data[index:index+1])
        return (b0-247)*256 + b1 + 108, index+1

    def read_small_int2(self, b0, data, index):
        b1 = ord(data[index:index+1])
        return -(b0-251)*256 - b1 - 108, index+1

    def read_short_int(self, b0, data, index):
        value, = unpack_from(b">h", data, index)
        return value, index+2

    def read_long_int(self, b0, data, index):
        value, = unpack_from(b">l", data, index)
        return value, index+4

    def read_fixed_1616(self, b0, data, index):
        value, = unpack_from(b">l", data, index)
        return value / 65536.0, index+4

    def read_real_number(self, b0, data, index):
        # Two nibbles per byte, the nibble 0xf ends the number
        number = ''
        while True:
            b = ord(data[index:index+1])
            index = index + 1
            nibble0 = (b & 0xf0) >> 4
            nibble1 = b & 0x0f
            if nibble0 == 0xf:
                break
            number = number + real_nibbles[nibble0]
            if nibble1 == 0xf:
                break
            number = number + real_nibbles[nibble1]
        return float(number), index

    def write_float(self, f, encoding='ignored'):
        '''
        Encode the float f as a packed real number (lead byte 30).

        :raises KeyError: If the text form of f contains a character that
            has no nibble (for example inf or nan).
        '''
        s = unicode_type(f).upper()
        # Drop the redundant zero before the decimal point
        if s[:2] == "0.":
            s = s[1:]
        elif s[:3] == "-0.":
            s = "-" + s[2:]
        nibbles = []
        while s:
            c = s[0]
            s = s[1:]
            if c == "E":
                sign = s[:1]
                if sign == "-":
                    s = s[1:]
                    c = "E-"
                elif sign == "+":
                    # Large floats are formatted as 1E+16. There is no
                    # nibble for the plus sign, a bare E already means a
                    # positive exponent.
                    s = s[1:]
            nibbles.append(real_nibbles_map[c])
        nibbles.append(0xf)
        if len(nibbles) % 2:
            # Pad to a whole number of bytes
            nibbles.append(0xf)
        d = bytearray([30])
        for i in range(0, len(nibbles), 2):
            d.append(nibbles[i] << 4 | nibbles[i+1])
        return bytes(d)

    def write_int(self, value, encoding="cff"):
        '''
        Encode the integer value using the shortest form available. The
        encoding ("cff", "t1", anything else meaning T2) selects the lead
        byte used for values that need more than two bytes.
        '''
        four_byte_op = {'cff':29, 't1':255}.get(encoding, None)

        if -107 <= value <= 107:
            code = bytes(bytearray([value + 139]))
        elif 108 <= value <= 1131:
            value = value - 108
            code = bytes(bytearray([(value >> 8) + 247, (value & 0xFF)]))
        elif -1131 <= value <= -108:
            value = -value - 108
            code = bytes(bytearray([(value >> 8) + 251, (value & 0xFF)]))
        elif four_byte_op is None:
            # T2 only supports 2 byte ints
            code = bytes(bytearray([28])) + pack(b">h", value)
        else:
            code = bytes(bytearray([four_byte_op])) + pack(b">l", value)
        return code

    def write_offset(self, value):
        # Always the fixed width five byte form, whatever the value
        return bytes(bytearray([29])) + pack(b">l", value)

    def write_number(self, value, encoding="cff"):
        f = self.write_float if isinstance(value, float) else self.write_int
        return f(value, encoding)
def compile(self, strings):
    """Serialize this dict to its binary form and return the bytes.

    Every row of ``self.TABLE`` whose name is not in ``self.FILTERED`` and
    whose current value differs from its default is written as the encoded
    operand(s) followed by the operator opcode.  The result is also stored
    on ``self.raw``.

    :param strings: callable mapping a string value to its SID; it is
        applied to every operand whose argument type is ``'SID'``.
    :raises ValueError: if a tuple-typed operator holds a value with a
        different number of items than its argument type declares.
    """
    data = []
    for op, name, arg, default in self.TABLE:
        if name in self.FILTERED:
            continue
        val = self.safe_get(name)
        if val == self.defaults[name]:
            # Values equal to the default are implied and never written.
            continue
        # Escaped operators are stored as a two item tuple of bytes.
        opcode = bytes(bytearray(op if isinstance(op, tuple) else [op]))
        # encode_number() consults this flag to pick the fixed-width
        # offset encoding for operators listed in OFFSETS.
        self.encoding_offset = name in self.OFFSETS
        if isinstance(arg, tuple):
            if len(val) != len(arg):
                raise ValueError('Invalid argument %s for operator: %s'
                                 %(val, op))
            for typ, v in zip(arg, val):
                if typ == 'SID':
                    # Convert the individual element, not the whole tuple.
                    v = strings(v)
                data.append(getattr(self, 'encode_'+typ)(v))
        else:
            if arg == 'SID':
                val = strings(val)
            data.append(getattr(self, 'encode_'+arg)(val))
        data.append(opcode)
    self.raw = b''.join(data)
    return self.raw
class TopDict(Dict):
    """Operator table for the Top DICT of a CFF font.

    Each TABLE row is ``(opcode, name, argument type, default)``.  An opcode
    is either a single byte value or a ``(12, n)`` tuple for the two-byte
    escaped operators.  The argument type names select the ``arg_*`` and
    ``encode_*`` handlers of the base class; a tuple of type names means the
    operator takes that many operands.
    """

    TABLE = (
        # opcode, name, argument type, default
        ((12, 30), 'ROS', ('SID','SID','number'), None,),
        ((12, 20), 'SyntheticBase', 'number', None,),
        (0, 'version', 'SID', None,),
        (1, 'Notice', 'SID', None,),
        ((12, 0), 'Copyright', 'SID', None,),
        (2, 'FullName', 'SID', None,),
        ((12, 38), 'FontName', 'SID', None,),
        (3, 'FamilyName', 'SID', None,),
        (4, 'Weight', 'SID', None,),
        ((12, 1), 'isFixedPitch', 'number', 0,),
        ((12, 2), 'ItalicAngle', 'number', 0,),
        ((12, 3), 'UnderlinePosition', 'number', None,),
        ((12, 4), 'UnderlineThickness', 'number', 50,),
        ((12, 5), 'PaintType', 'number', 0,),
        ((12, 6), 'CharstringType', 'number', 2,),
        ((12, 7), 'FontMatrix', 'array', [0.001,0,0,0.001,0,0],),
        (13, 'UniqueID', 'number', None,),
        (5, 'FontBBox', 'array', [0,0,0,0],),
        ((12, 8), 'StrokeWidth', 'number', 0,),
        (14, 'XUID', 'array', None,),
        ((12, 21), 'PostScript', 'SID', None,),
        ((12, 22), 'BaseFontName', 'SID', None,),
        ((12, 23), 'BaseFontBlend', 'delta', None,),
        ((12, 31), 'CIDFontVersion', 'number', 0,),
        ((12, 32), 'CIDFontRevision', 'number', 0,),
        ((12, 33), 'CIDFontType', 'number', 0,),
        ((12, 34), 'CIDCount', 'number', 8720,),
        (15, 'charset', 'number', 0,),
        ((12, 35), 'UIDBase', 'number', None,),
        (16, 'Encoding', 'number', 0,),
        (18, 'Private', ('number','number'), None,),
        ((12, 37), 'FDSelect', 'number', None,),
        ((12, 36), 'FDArray', 'number', None,),
        (17, 'CharStrings', 'number', None,),
    )

    # We will not write these operators out: compile() skips every name
    # listed here, even when the parsed font carried a value for it.
    FILTERED = {'ROS', 'SyntheticBase', 'UniqueID', 'XUID',
                'CIDFontVersion', 'CIDFontRevision', 'CIDFontType', 'CIDCount',
                'UIDBase', 'Encoding', 'FDSelect', 'FDArray'}
    # Operators whose numeric operands are written with the fixed-width
    # offset encoding (write_offset) rather than the shortest integer form.
    # 'Encoding' appears here as well, but it is also FILTERED, so it is
    # never actually emitted.
    OFFSETS = {'charset', 'Encoding', 'CharStrings', 'Private'}
class CFF(object):
    """Parsed view of a raw CFF table.

    The constructor walks the table front to back: header, Name INDEX,
    Top DICT INDEX, String INDEX, Global Subr INDEX, then follows the
    offsets stored in the Top DICT to the CharStrings INDEX, the Private
    DICT (and its local subroutines) and the charset.

    :raises UnsupportedFont: for a version other than 1.0, more than one
        font in the table, a CID keyed font, or a CharstringType other
        than 2.
    :raises ValueError: if the Top DICT has no CharStrings entry.
    """

    def __init__(self, raw):
        # The header is four single bytes: major, minor, header size and
        # offset size.
        (self.major_version, self.minor_version, self.header_size,
         self.offset_size) = unpack_from(b'>4B', raw)
        if (self.major_version, self.minor_version) != (1, 0):
            raise UnsupportedFont('The CFF table has unknown version: '
                    '(%d, %d)'%(self.major_version, self.minor_version))
        offset = self.header_size

        # Read Names Index
        # Each Index records in .pos where the structure after it begins,
        # which is how the cursor is advanced from one section to the next.
        self.font_names = Index(raw, offset)
        offset = self.font_names.pos
        if len(self.font_names) > 1:
            raise UnsupportedFont('CFF table has more than one font.')

        # Read Top Dict
        # Only the raw bytes are collected here; decompiling is deferred
        # until the strings and global subroutines it needs are available.
        self.top_index = Index(raw, offset)
        self.top_dict = TopDict()
        offset = self.top_index.pos

        # Read strings
        self.strings = Strings(raw, offset)
        offset = self.strings.pos

        # Read global subroutines
        self.global_subrs = Subrs(raw, offset)
        offset = self.global_subrs.pos

        # Decompile Top Dict
        self.top_dict.decompile(self.strings, self.global_subrs, self.top_index[0])
        # The presence of the ROS operator is what marks a CID keyed font.
        self.is_CID = 'ROS' in self.top_dict
        if self.is_CID:
            raise UnsupportedFont('Subsetting of CID keyed fonts is not supported')

        # Read CharStrings (Glyph definitions)
        try:
            offset = self.top_dict['CharStrings']
        except KeyError:
            raise ValueError('This font has no CharStrings')
        cs_type = self.top_dict.safe_get('CharstringType')
        if cs_type != 2:
            raise UnsupportedFont('This font has unsupported CharstringType: '
                    '%s'%cs_type)
        self.char_strings = CharStringsIndex(raw, offset)
        self.num_glyphs = len(self.char_strings)

        # Read Private Dict
        self.private_dict = self.private_subrs = None
        pd = self.top_dict.safe_get('Private')
        if pd:
            # The Private operator carries (size, offset) of the dict data.
            size, offset = pd
            self.private_dict = PrivateDict()
            self.private_dict.decompile(self.strings, self.global_subrs,
                    raw[offset:offset+size])
            if 'Subrs' in self.private_dict:
                # The Subrs value is added to the Private DICT's own offset,
                # not to the start of the table.
                self.private_subrs = Subrs(raw, offset +
                        self.private_dict['Subrs'])

        # Read charset (Glyph names)
        self.charset = Charset(raw, self.top_dict.safe_get('charset'),
                self.strings, self.num_glyphs, self.is_CID)
class Index(list):
    """A CFF INDEX structure, exposed as a list of its raw byte objects.

    After construction ``self.pos`` holds the position in *raw* just past
    the INDEX.  ``self.offset_size`` is only set for a non-empty INDEX.
    Items from *prepend* are placed in the list before the parsed objects.
    """

    def __init__(self, raw, offset, prepend=()):
        list.__init__(self)
        self.extend(prepend)

        (count,) = unpack_from(b'>H', raw, offset)
        cursor = offset + 2
        self.pos = cursor
        if count <= 0:
            # An empty INDEX consists of the two count bytes only.
            return

        (self.offset_size,) = unpack_from(b'>B', raw, cursor)
        cursor += 1
        entries = count + 1
        if self.offset_size == 3:
            # struct has no 3 byte integer, so widen each one to 4 bytes.
            offsets = [unpack(b'>L', b'\0' + raw[at:at+3])[0]
                       for at in range(cursor, cursor + 3*entries, 3)]
        else:
            code = {1:'B', 2:'H', 4:'L'}[self.offset_size]
            layout = ('>%d%s'%(entries, code)).encode('ascii')
            offsets = unpack_from(layout, raw, cursor)

        # Stored offsets are relative to the byte preceding the object data.
        base = cursor + self.offset_size * entries - 1
        for begin, end in zip(offsets, offsets[1:]):
            self.append(raw[base+begin:base+end])

        self.pos = base + (offsets[-1] if offsets else 0)
class CFFTable(UnknownTable):
    """The ``CFF `` table of an sfnt font, with support for subsetting."""

    def decompile(self):
        """Parse ``self.raw`` and keep the result on ``self.cff``."""
        self.cff = CFF(self.raw)

    def subset(self, character_map, extra_glyphs):
        """Replace ``self.raw`` with a subset holding only the needed glyphs.

        *character_map* (character code -> glyph id) is rewritten in place
        so that it refers to glyph ids of the subset font.  Glyphs named by
        the ids in *extra_glyphs* are kept as well.

        :raises NoGlyphs: if *character_map* is non-empty but none of its
            glyph ids resolves to a glyph name.
        """
        from calibre.utils.fonts.sfnt.cff.writer import Subset
        name_for = self.cff.charset.safe_lookup

        # Remember which glyph name each code maps to; this is what lets
        # character_map be rebuilt against the subset font afterwards.
        names_by_code = {}
        for code, glyph_id in iteritems(character_map):
            names_by_code[code] = name_for(glyph_id)

        wanted = {name for name in itervalues(names_by_code)
                  if name is not None}
        if character_map and not wanted:
            raise NoGlyphs('This font has no glyphs for the specified characters')
        for glyph_id in extra_glyphs:
            name = name_for(glyph_id)
            if name is not None:
                wanted.add(name)

        subsetter = Subset(self.cff, wanted)

        # Codes whose glyph is missing from the subset (or maps to glyph
        # id 0) are dropped from the map.
        character_map.clear()
        for code, name in iteritems(names_by_code):
            new_id = subsetter.charname_map.get(name)
            if new_id:
                character_map[code] = new_id

        # Parsing the output acts as a sanity check before it is adopted.
        CFF(subsetter.raw)

        self.raw = subsetter.raw
def split_range(start_code, end_code, cmap):  # {{{
    """Split a run of character codes into cmap format 4 segments.

    Looks for sub-ranges of ``start_code..end_code`` whose glyph ids in
    *cmap* are consecutive and keeps only those long enough to be worth a
    segment of their own.  Returns ``(start, end)`` lists of segment start
    and end codes; *start* omits the first segment's start code and so is
    always one item shorter than *end*.
    """
    if start_code == end_code:
        return [], [end_code]

    # Gather subranges in which the glyph IDs are consecutive.
    runs = []
    run_begin = None  # first code of the run in progress, if any
    prev_code = start_code
    prev_id = cmap[start_code]
    for code in range(start_code + 1, end_code + 1):
        glyph_id = cmap[code]
        if glyph_id - 1 == prev_id:
            if run_begin is None:
                run_begin = prev_code
        elif run_begin is not None:
            runs.append((run_begin, prev_code))
            run_begin = None
        prev_code, prev_id = code, glyph_id
    if run_begin is not None:
        runs.append((run_begin, prev_code))
    assert prev_code == end_code

    # Now filter out those new subranges that would only make the data bigger.
    # A new segment cost 8 bytes, not using a new segment costs 2 bytes per
    # character.
    kept = []
    for first, last in runs:
        at_start = first == start_code
        at_end = last == end_code
        if at_start and at_end:
            break  # the whole range, we're fine
        # A run touching an edge costs one extra segment, an inner run two.
        threshold = 4 if (at_start or at_end) else 8
        if last - first + 1 > threshold:
            kept.append((first, last))

    if not kept:
        return [], [end_code]

    if kept[0][0] != start_code:
        kept.insert(0, (start_code, kept[0][0] - 1))
    if kept[-1][1] != end_code:
        kept.append((kept[-1][1] + 1, end_code))

    # Fill the "holes" in the segments list -- those are the segments in which
    # the glyph IDs are _not_ consecutive.
    segments = [kept[0]]
    for segment in kept[1:]:
        gap_begin = segments[-1][1] + 1
        if gap_begin != segment[0]:
            segments.append((gap_begin, segment[0] - 1))
        segments.append(segment)

    # Transform the ranges into start_code/end_code lists.
    start = [first for first, _ in segments[1:]]
    end = [last for _, last in segments]

    assert len(start) + 1 == len(end)
    return start, end
# }}}
class BMPTable(object):
    """Lookup helper over the segment arrays of a format 4 cmap subtable."""

    def __init__(self, raw):
        self.raw = raw
        prefix = read_bmp_prefix(raw, 0)
        (self.start_count, self.end_count, self.range_offset, self.id_delta,
         self.glyph_id_len, self.glyph_id_map, self.array_len) = prefix

    def _segment_glyph(self, seg, code):
        # Glyph id that segment number `seg` assigns to `code`.
        range_offset = self.range_offset[seg]
        if range_offset == 0:
            glyph_id = self.id_delta[seg] + code
        else:
            index = (range_offset//2 + (code - self.start_count[seg]) + seg -
                     self.array_len)
            glyph_id = self.glyph_id_map[index]
            # A stored 0 means "missing glyph"; the delta is not applied.
            if glyph_id != 0:
                glyph_id += self.id_delta[seg]
        return glyph_id % 0x10000

    def get_glyph_ids(self, codes):
        """Yield one glyph id per entry of *codes*, 0 for unmapped codes."""
        for code in codes:
            glyph_id = 0
            for seg, last in enumerate(self.end_count):
                if last >= code and self.start_count[seg] <= code:
                    glyph_id = self._segment_glyph(seg, code)
                    break
            yield glyph_id

    def get_glyph_map(self, glyph_ids):
        """Return a dict of character code -> glyph id for *glyph_ids*.

        Every code of every segment is visited; the first glyph id found
        for a code wins.
        """
        code_to_glyph = {}
        for seg, last in enumerate(self.end_count):
            first = self.start_count[seg]
            for code in range(first, last + 1):
                glyph_id = self._segment_glyph(seg, code)
                if glyph_id in glyph_ids and code not in code_to_glyph:
                    code_to_glyph[code] = glyph_id
        return code_to_glyph
def get_character_map(self, chars):
    '''
    Get a mapping of character codes to glyph ids in the font.

    The codes in *chars* are de-duplicated and sorted; codes whose glyph id
    is not positive are left out of the returned OrderedDict.
    '''
    table = self.bmp_table
    if table is None:
        raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                ' Most likely a special purpose font.')
    codes = sorted(set(chars))
    return OrderedDict(
        (code, glyph_id)
        for code, glyph_id in zip(codes, table.get_glyph_ids(codes))
        if glyph_id > 0)
class Unpackable(object):
    """Sequential reader of big-endian struct fields from a bytes buffer.

    ``offset`` is the read cursor and moves forward with every unpack();
    ``start_pos`` keeps the position the reader was created at, so callers
    can resolve offsets stored relative to the start of their table.
    """

    def __init__(self, raw, offset):
        self.raw, self.offset = raw, offset
        self.start_pos = offset

    def unpack(self, fmt, single_special=True):
        """Read the fields described by *fmt* at the cursor and advance it.

        :param fmt: struct format (str or bytes) without a byte order
            prefix; big-endian is always used.
        :param single_special: when true, a format yielding exactly one
            value returns that value instead of a one item tuple.
        :return: tuple of unpacked values, or the bare value (see above).
        """
        fmt = fmt.encode('ascii') if not isinstance(fmt, bytes) else fmt
        # Use the very same prefixed format for reading and for sizing.
        # Without the prefix calcsize() reports the native size, which
        # includes alignment padding (and a platform dependent 'L'), so a
        # mixed format such as '2HL' would advance the cursor too far.
        fmt = b'>' + fmt
        ans = unpack_from(fmt, self.raw, self.offset)
        if single_special and len(ans) == 1:
            ans = ans[0]
        self.offset += calcsize(fmt)
        return ans
class Coverage(object):
    """A Coverage table: maps covered glyph ids to coverage indices.

    Format 1 stores an explicit list of glyph ids (``glyph_ids`` and the
    reverse lookup ``glyph_ids_map``); format 2 stores ``ranges`` of
    ``CoverageRange(start, end, start_coverage_index)``.
    """

    def __init__(self, raw, offset, parent_table_name):
        reader = Unpackable(raw, offset)
        self.format, count = reader.unpack('2H')

        if self.format == 1:
            self.glyph_ids = reader.unpack('%dH'%count, single_special=False)
            self.glyph_ids_map = dict(
                (gid, position) for position, gid in enumerate(self.glyph_ids))
        elif self.format == 2:
            self.ranges = []
            # The records arrive as one flat tuple, three values per range.
            flat = reader.unpack('%dH'%(3*count), single_special=False)
            self.ranges.extend(
                CoverageRange(*flat[at:at+3]) for at in range(0, 3*count, 3))
        else:
            raise UnsupportedFont('Unknown Coverage format: 0x%x in %s'%(
                self.format, parent_table_name))

    def coverage_indices(self, glyph_ids):
        '''Return map of glyph_id -> coverage index. Map contains only those
        glyph_ids that are covered by this table and that are present in
        glyph_ids.'''
        covered = OrderedDict()
        for gid in glyph_ids:
            if self.format != 1:
                # Every range is examined, so if ranges overlap the last
                # matching one determines the index.
                for low, high, first_index in self.ranges:
                    if low <= gid <= high:
                        covered[gid] = first_index + (gid-low)
                continue
            position = self.glyph_ids_map.get(gid)
            if position is not None:
                covered[gid] = position
        return covered
class Sfnt(object):
    """An sfnt font held as a mapping of table tag -> table object.

    Build it from the raw bytes of a font file, or from a callable that
    returns the raw data of a table given its tag.  Calling the instance
    serializes the font again.
    """

    # Tables that have a dedicated class; every other tag is wrapped in
    # UnknownTable.
    TABLE_MAP = {
        b'head' : HeadTable,
        b'hhea' : HorizontalHeader,
        b'vhea' : VerticalHeader,
        b'maxp' : MaxpTable,
        b'loca' : LocaTable,
        b'glyf' : GlyfTable,
        b'cmap' : CmapTable,
        b'CFF ' : CFFTable,
        b'kern' : KernTable,
        b'GSUB' : GSUBTable,
        b'OS/2' : OS2Table,
        b'post' : PostTable,
    }

    def __init__(self, raw_or_get_table):
        """
        :param raw_or_get_table: the bytes of a complete font, or a callable
            taking a table tag and returning that table's data (empty when
            the font has no such table).
        :raises UnsupportedFont: if the sfnt version tag is not recognized,
            or if the callable yields no tables at all.
        """
        self.tables = {}
        if isinstance(raw_or_get_table, bytes):
            raw = raw_or_get_table
            self.sfnt_version = raw[:4]
            # The version is exactly four bytes, so the tag for PostScript
            # fonts in an sfnt wrapper is b'typ1'; a five byte b'type1'
            # could never compare equal to raw[:4].
            if self.sfnt_version not in {b'\x00\x01\x00\x00', b'OTTO', b'true',
                    b'typ1'}:
                raise UnsupportedFont('Font has unknown sfnt version: %r'%self.sfnt_version)
            for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw):
                self.tables[table_tag] = self.TABLE_MAP.get(
                    table_tag, UnknownTable)(table)
        else:
            for table_tag in {
                b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name', b'OS/2',
                b'post', b'cvt ', b'fpgm', b'glyf', b'loca', b'prep', b'CFF ',
                b'VORG', b'EBDT', b'EBLC', b'EBSC', b'BASE', b'GSUB', b'GPOS',
                b'GDEF', b'JSTF', b'gasp', b'hdmx', b'kern', b'LTSH', b'PCLT',
                b'VDMX', b'vhea', b'vmtx', b'MATH'}:
                table = bytes(raw_or_get_table(table_tag))
                if table:
                    self.tables[table_tag] = self.TABLE_MAP.get(
                        table_tag, UnknownTable)(table)
            if not self.tables:
                raise UnsupportedFont('This font has no tables')
            # There is no header to read the version from, so infer it from
            # the kind of outlines present.
            self.sfnt_version = (b'\0\x01\0\0' if b'glyf' in self.tables
                                    else b'OTTO')

    def __getitem__(self, key):
        return self.tables[key]

    def __contains__(self, key):
        return key in self.tables

    def __delitem__(self, key):
        del self.tables[key]

    def __iter__(self):
        '''Iterate over the table tags in order.'''
        # Although the optimal order is not alphabetical, the OTF spec says
        # they should be alphabetical, so we stick with that. See
        # http://partners.adobe.com/public/developer/opentype/index_recs.html
        # for optimal order.
        for x in sorted(self.tables):
            yield x

    def pop(self, key, default=None):
        return self.tables.pop(key, default)

    def get(self, key, default=None):
        return self.tables.get(key, default)

    def sizes(self):
        """Return an OrderedDict of table tag -> len() of the table."""
        ans = OrderedDict()
        for tag in self:
            ans[tag] = len(self[tag])
        return ans

    def __call__(self, stream=None):
        """Serialize the font.

        :param stream: optional seekable binary stream to write into; it
            must also provide getvalue().  A new BytesIO is used when
            omitted.
        :return: ``(font bytes, OrderedDict of tag -> unpadded table size)``
        """
        stream = BytesIO() if stream is None else stream

        def spack(*args):
            stream.write(pack(*args))

        stream.seek(0)

        # Write header
        num_tables = len(self.tables)
        ln2 = max_power_of_two(num_tables)
        srange = (2**ln2) * 16
        spack(b'>4s4H',
            self.sfnt_version, num_tables, srange, ln2, num_tables * 16 - srange)

        # Write tables
        head_offset = None
        table_data = []
        # Table data begins right after the directory (one record per table).
        offset = stream.tell() + (calcsize(b'>4s3L') * num_tables)
        sizes = OrderedDict()
        for tag in self:
            table = self.tables[tag]
            raw = table()
            table_len = len(raw)
            if tag == b'head':
                head_offset = offset
                # Zero bytes 8-11 of head before checksumming; the final
                # value is patched in below once the whole font is written.
                raw = raw[:8] + b'\0\0\0\0' + raw[12:]
            raw = align_block(raw)
            checksum = checksum_of_block(raw)
            spack(b'>4s3L', tag, checksum, offset, table_len)
            offset += len(raw)
            table_data.append(raw)
            sizes[tag] = table_len

        for x in table_data:
            stream.write(x)

        checksum = checksum_of_block(stream.getvalue())
        q = (0xB1B0AFBA - checksum) & 0xffffffff
        # NOTE(review): assumes a head table was written; without one
        # head_offset is still None here.
        stream.seek(head_offset + 8)
        spack(b'>L', q)

        return stream.getvalue(), sizes
same') + if len(data) != len(rd): + raise ValueError('Roundtripping failed, size different (%d vs. %d)'% + (len(data), len(rd))) + + +if __name__ == '__main__': + import sys + test_roundtrip(sys.argv[-1]) diff --git a/ebook_converter/utils/fonts/sfnt/errors.py b/ebook_converter/utils/fonts/sfnt/errors.py new file mode 100644 index 0000000..e7d0e3c --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/errors.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +class UnsupportedFont(ValueError): + pass + + +class NoGlyphs(ValueError): + pass + diff --git a/ebook_converter/utils/fonts/sfnt/glyf.py b/ebook_converter/utils/fonts/sfnt/glyf.py new file mode 100644 index 0000000..313b7e6 --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/glyf.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from +from collections import OrderedDict + +from calibre.utils.fonts.sfnt import UnknownTable +from polyglot.builtins import iteritems + +ARG_1_AND_2_ARE_WORDS = 0x0001 # if set args are words otherwise they are bytes +ARGS_ARE_XY_VALUES = 0x0002 # if set args are xy values, otherwise they are points +ROUND_XY_TO_GRID = 0x0004 # for the xy values if above is true +WE_HAVE_A_SCALE = 0x0008 # Sx = Sy, otherwise scale == 1.0 +NON_OVERLAPPING = 0x0010 # set to same value for all components (obsolete!) 
+MORE_COMPONENTS = 0x0020 # indicates at least one more glyph after this one +WE_HAVE_AN_X_AND_Y_SCALE = 0x0040 # Sx, Sy +WE_HAVE_A_TWO_BY_TWO = 0x0080 # t00, t01, t10, t11 +WE_HAVE_INSTRUCTIONS = 0x0100 # instructions follow +USE_MY_METRICS = 0x0200 # apply these metrics to parent glyph +OVERLAP_COMPOUND = 0x0400 # used by Apple in GX fonts +SCALED_COMPONENT_OFFSET = 0x0800 # composite designed to have the component offset scaled (designed for Apple) +UNSCALED_COMPONENT_OFFSET = 0x1000 # composite designed not to have the component offset scaled (designed for MS) + + +class SimpleGlyph(object): + + def __init__(self, num_of_countours, raw): + self.num_of_countours = num_of_countours + self.raw = raw + # The list of glyph indices referred to by this glyph, will always be + # empty for a simple glyph and not empty for a composite glyph + self.glyph_indices = [] + self.is_composite = False + + def __len__(self): + return len(self.raw) + + def __call__(self): + return self.raw + + +class CompositeGlyph(SimpleGlyph): + + def __init__(self, num_of_countours, raw): + super(CompositeGlyph, self).__init__(num_of_countours, raw) + self.is_composite = True + + flags = MORE_COMPONENTS + offset = 10 + while flags & MORE_COMPONENTS: + flags, glyph_index = unpack_from(b'>HH', raw, offset) + self.glyph_indices.append(glyph_index) + offset += 4 + if flags & ARG_1_AND_2_ARE_WORDS: + offset += 4 + else: + offset += 2 + if flags & WE_HAVE_A_SCALE: + offset += 2 + elif flags & WE_HAVE_AN_X_AND_Y_SCALE: + offset += 4 + elif flags & WE_HAVE_A_TWO_BY_TWO: + offset += 8 + + +class GlyfTable(UnknownTable): + + def glyph_data(self, offset, length, as_raw=False): + raw = self.raw[offset:offset+length] + if as_raw: + return raw + num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0 + if num_of_countours >= 0: + return SimpleGlyph(num_of_countours, raw) + return CompositeGlyph(num_of_countours, raw) + + def update(self, sorted_glyph_map): + ans = OrderedDict() + offset = 0 + block = [] 
+ for glyph_id, glyph in iteritems(sorted_glyph_map): + raw = glyph() + pad = 4 - (len(raw) % 4) + if pad < 4: + raw += b'\0' * pad + ans[glyph_id] = offset, len(raw) + offset += len(raw) + block.append(raw) + self.raw = b''.join(block) + return ans diff --git a/ebook_converter/utils/fonts/sfnt/gsub.py b/ebook_converter/utils/fonts/sfnt/gsub.py new file mode 100644 index 0000000..249e134 --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/gsub.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from +from functools import partial + +from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty +from calibre.utils.fonts.sfnt.errors import UnsupportedFont +from calibre.utils.fonts.sfnt.common import (ScriptListTable, FeatureListTable, + SimpleListTable, LookupTable, ExtensionSubstitution, + UnknownLookupSubTable) +from polyglot.builtins import iteritems, itervalues + + +class SingleSubstitution(UnknownLookupSubTable): + + formats = {1, 2} + + def initialize(self, data): + if self.format == 1: + self.delta = data.unpack('h') + else: + count = data.unpack('H') + self.substitutes = data.unpack('%dH'%count, single_special=False) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + if self.format == 1: + return {gid + self.delta for gid in gid_index_map} + return {self.substitutes[i] for i in itervalues(gid_index_map)} + + +class MultipleSubstitution(UnknownLookupSubTable): + + formats = {1} + + def initialize(self, data): + self.coverage_to_subs_map = self.read_sets(data, set_is_index=True) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + ans = set() + for index in itervalues(gid_index_map): 
+ glyphs = set(self.coverage_to_subs_map[index]) + ans |= glyphs + return ans + + +class AlternateSubstitution(MultipleSubstitution): + pass + + +class LigatureSubstitution(UnknownLookupSubTable): + + formats = {1} + + def initialize(self, data): + self.coverage_to_lig_map = self.read_sets(data, self.read_ligature) + + def read_ligature(self, data): + lig_glyph, count = data.unpack('HH') + components = data.unpack('%dH'%(count-1), single_special=False) + return (lig_glyph, components) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + ans = set() + for start_glyph_id, index in iteritems(gid_index_map): + for glyph_id, components in self.coverage_to_lig_map[index]: + components = (start_glyph_id,) + components + if set(components).issubset(glyph_ids): + ans.add(glyph_id) + return ans + + +class ContexttualSubstitution(UnknownLookupSubTable): + + formats = {1, 2, 3} + + @property + def has_initial_coverage(self): + return self.format != 3 + + def initialize(self, data): + pass # TODO + + def all_substitutions(self, glyph_ids): + # This table only defined substitution in terms of other tables + return set() + + +class ChainingContextualSubstitution(UnknownLookupSubTable): + + formats = {1, 2, 3} + + @property + def has_initial_coverage(self): + return self.format != 3 + + def initialize(self, data): + pass # TODO + + def all_substitutions(self, glyph_ids): + # This table only defined substitution in terms of other tables + return set() + + +class ReverseChainSingleSubstitution(UnknownLookupSubTable): + + formats = {1} + + def initialize(self, data): + backtrack_count = data.unpack('H') + backtrack_offsets = data.unpack('%dH'%backtrack_count, + single_special=False) + lookahead_count = data.unpack('H') + lookahead_offsets = data.unpack('%dH'%lookahead_count, + single_special=False) + backtrack_offsets = [data.start_pos + x for x in backtrack_offsets] + lookahead_offsets = [data.start_pos + x for x in 
lookahead_offsets] + backtrack_offsets, lookahead_offsets # TODO: Use these + count = data.unpack('H') + self.substitutes = data.unpack('%dH'%count) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + return {self.substitutes[i] for i in itervalues(gid_index_map)} + + +subtable_map = { + 1: SingleSubstitution, + 2: MultipleSubstitution, + 3: AlternateSubstitution, + 4: LigatureSubstitution, + 5: ContexttualSubstitution, + 6: ChainingContextualSubstitution, + 8: ReverseChainSingleSubstitution, +} + + +class GSUBLookupTable(LookupTable): + + def set_child_class(self): + if self.lookup_type == 7: + self.child_class = partial(ExtensionSubstitution, + subtable_map=subtable_map) + else: + self.child_class = subtable_map[self.lookup_type] + + +class LookupListTable(SimpleListTable): + + child_class = GSUBLookupTable + + +class GSUBTable(UnknownTable): + + version = FixedProperty('_version') + + def decompile(self): + (self._version, self.scriptlist_offset, self.featurelist_offset, + self.lookuplist_offset) = unpack_from(b'>L3H', self.raw) + if self._version != 0x10000: + raise UnsupportedFont('The GSUB table has unknown version: 0x%x'% + self._version) + + self.script_list_table = ScriptListTable(self.raw, + self.scriptlist_offset) + # self.script_list_table.dump() + + self.feature_list_table = FeatureListTable(self.raw, + self.featurelist_offset) + # self.feature_list_table.dump() + + self.lookup_list_table = LookupListTable(self.raw, + self.lookuplist_offset) + + def all_substitutions(self, glyph_ids): + glyph_ids = frozenset(glyph_ids) + ans = set(glyph_ids) + for lookup_table in self.lookup_list_table: + for subtable in lookup_table: + glyphs = subtable.all_substitutions(ans) + if glyphs: + ans |= glyphs + return ans - {glyph_ids} diff --git a/ebook_converter/utils/fonts/sfnt/head.py b/ebook_converter/utils/fonts/sfnt/head.py new file mode 100644 index 0000000..06dd711 --- /dev/null +++ 
b/ebook_converter/utils/fonts/sfnt/head.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from, pack, calcsize + +from calibre.utils.fonts.sfnt import UnknownTable, DateTimeProperty, FixedProperty +from calibre.utils.fonts.sfnt.errors import UnsupportedFont +from calibre.utils.fonts.sfnt.loca import read_array +from polyglot.builtins import zip + + +class HeadTable(UnknownTable): + + created = DateTimeProperty('_created') + modified = DateTimeProperty('_modified') + version_number = FixedProperty('_version_number') + font_revision = FixedProperty('_font_revision') + + def __init__(self, *args, **kwargs): + super(HeadTable, self).__init__(*args, **kwargs) + + field_types = ( + '_version_number' , 'l', + '_font_revision' , 'l', + 'checksum_adjustment' , 'L', + 'magic_number' , 'L', + 'flags' , 'H', + 'units_per_em' , 'H', + '_created' , 'q', + '_modified' , 'q', + 'x_min' , 'h', + 'y_min' , 'h', + 'x_max' , 'h', + 'y_max' , 'h', + 'mac_style' , 'H', + 'lowest_rec_ppem' , 'H', + 'font_direction_hint' , 'h', + 'index_to_loc_format' , 'h', + 'glyph_data_format' , 'h' + ) + + self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii') + self._fields = field_types[0::2] + + for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)): + setattr(self, f, val) + + def update(self): + vals = [getattr(self, f) for f in self._fields] + self.raw = pack(self._fmt, *vals) + + +class HorizontalHeader(UnknownTable): + + version_number = FixedProperty('_version_number') + + def read_data(self, hmtx): + if hasattr(self, 'ascender'): + return + field_types = ( + '_version_number' , 'l', + 'ascender', 'h', + 'descender', 'h', + 'line_gap', 'h', + 'advance_width_max', 'H', + 'min_left_side_bearing', 'h', + 
'min_right_side_bearing', 'h', + 'x_max_extent', 'h', + 'caret_slope_rise', 'h', + 'caret_slop_run', 'h', + 'caret_offset', 'h', + 'r1', 'h', + 'r2', 'h', + 'r3', 'h', + 'r4', 'h', + 'metric_data_format', 'h', + 'number_of_h_metrics', 'H', + ) + + self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii') + self._fields = field_types[0::2] + + for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)): + setattr(self, f, val) + + raw = hmtx.raw + num = self.number_of_h_metrics + if len(raw) < 4*num: + raise UnsupportedFont('The hmtx table has insufficient data') + long_hor_metric = raw[:4*num] + a = read_array(long_hor_metric) + self.advance_widths = a[0::2] + a = read_array(long_hor_metric, 'h') + self.left_side_bearings = a[1::2] + + +class VerticalHeader(UnknownTable): + + version_number = FixedProperty('_version_number') + + def read_data(self, vmtx): + if hasattr(self, 'ascender'): + return + field_types = ( + '_version_number' , 'l', + 'ascender', 'h', + 'descender', 'h', + 'line_gap', 'h', + 'advance_height_max', 'H', + 'min_top_side_bearing', 'h', + 'min_bottom_side_bearing', 'h', + 'y_max_extent', 'h', + 'caret_slope_rise', 'h', + 'caret_slop_run', 'h', + 'caret_offset', 'h', + 'r1', 'h', + 'r2', 'h', + 'r3', 'h', + 'r4', 'h', + 'metric_data_format', 'h', + 'number_of_v_metrics', 'H', + ) + + self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii') + self._fields = field_types[0::2] + + for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)): + setattr(self, f, val) + + raw = vmtx.raw + num = self.number_of_v_metrics + if len(raw) < 4*num: + raise UnsupportedFont('The vmtx table has insufficient data') + long_hor_metric = raw[:4*num] + long_hor_metric = raw[:4*num] + a = read_array(long_hor_metric) + self.advance_heights = a[0::2] + a = read_array(long_hor_metric, 'h') + self.top_side_bearings = a[1::2] + + +class OS2Table(UnknownTable): + + def read_data(self): + if hasattr(self, 'char_width'): + return + ver, = 
unpack_from(b'>H', self.raw) + field_types = [ + 'version' , 'H', + 'average_char_width', 'h', + 'weight_class', 'H', + 'width_class', 'H', + 'fs_type', 'H', + 'subscript_x_size', 'h', + 'subscript_y_size', 'h', + 'subscript_x_offset', 'h', + 'subscript_y_offset', 'h', + 'superscript_x_size', 'h', + 'superscript_y_size', 'h', + 'superscript_x_offset', 'h', + 'superscript_y_offset', 'h', + 'strikeout_size', 'h', + 'strikeout_position', 'h', + 'family_class', 'h', + 'panose', '10s', + 'ranges', '16s', + 'vendor_id', '4s', + 'selection', 'H', + 'first_char_index', 'H', + 'last_char_index', 'H', + 'typo_ascender', 'h', + 'typo_descender', 'h', + 'typo_line_gap', 'h', + 'win_ascent', 'H', + 'win_descent', 'H', + ] + if ver > 1: + field_types += [ + 'code_page_range', '8s', + 'x_height', 'h', + 'cap_height', 'h', + 'default_char', 'H', + 'break_char', 'H', + 'max_context', 'H', + ] + + self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii') + self._fields = field_types[0::2] + + for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)): + setattr(self, f, val) + + def zero_fstype(self): + prefix = calcsize(b'>HhHH') + self.raw = self.raw[:prefix] + b'\0\0' + self.raw[prefix+2:] + self.fs_type = 0 + + +class PostTable(UnknownTable): + + version_number = FixedProperty('_version') + italic_angle = FixedProperty('_italic_angle') + + def read_data(self): + if hasattr(self, 'underline_position'): + return + (self._version, self._italic_angle, self.underline_position, + self.underline_thickness) = unpack_from(b'>llhh', self.raw) diff --git a/ebook_converter/utils/fonts/sfnt/kern.py b/ebook_converter/utils/fonts/sfnt/kern.py new file mode 100644 index 0000000..c4c58a2 --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/kern.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, 
Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from, calcsize, pack, error as struct_error + +from calibre.utils.fonts.sfnt import (UnknownTable, FixedProperty, + max_power_of_two) +from calibre.utils.fonts.sfnt.errors import UnsupportedFont +from polyglot.builtins import range + + +class KernTable(UnknownTable): + + version = FixedProperty('_version') + + def __init__(self, *args, **kwargs): + super(KernTable, self).__init__(*args, **kwargs) + self._version, self.num_tables = unpack_from(b'>HH', self.raw) + if self._version == 1 and len(self.raw) >= 8: + self._version, self.num_tables = unpack_from(b'>LL', self.raw) + self.headerfmt = b'>HH' if self._version == 0 else b'>LL' + + def restrict_to_glyphs(self, glyph_ids): + if self._version not in {0, 0x10000}: + raise UnsupportedFont('kern table has version: %x'%self._version) + offset = 4 if (self._version == 0) else 8 + tables = [] + for i in range(self.num_tables): + if self._version == 0: + version, length, coverage = unpack_from(b'>3H', self.raw, offset) + table_format = version + else: + length, coverage = unpack_from(b'>LH', self.raw, offset) + table_format = coverage & 0xff + raw = self.raw[offset:offset+length] + if table_format == 0: + raw = self.restrict_format_0(raw, glyph_ids) + if not raw: + continue + tables.append(raw) + offset += length + self.raw = pack(self.headerfmt, self._version, len(tables)) + b''.join(tables) + + def restrict_format_0(self, raw, glyph_ids): + if self._version == 0: + version, length, coverage, npairs = unpack_from(b'>4H', raw) + headerfmt = b'>3H' + else: + length, coverage, tuple_index, npairs = unpack_from(b'>L3H', raw) + headerfmt = b'>L2H' + + offset = calcsize(headerfmt + b'4H') + entries = [] + entrysz = calcsize(b'>2Hh') + for i in range(npairs): + try: + left, right, value = unpack_from(b'>2Hh', raw, offset) + except struct_error: + offset = len(raw) + break # Buggy kern table + if left in glyph_ids and right in glyph_ids: + 
entries.append(pack(b'>2Hh', left, right, value)) + offset += entrysz + + if offset != len(raw): + raise UnsupportedFont('This font has extra data at the end of' + ' a Format 0 kern subtable') + + npairs = len(entries) + if npairs == 0: + return b'' + + entry_selector = max_power_of_two(npairs) + search_range = (2 ** entry_selector) * 6 + range_shift = (npairs - (2 ** entry_selector)) * 6 + + entries = b''.join(entries) + length = calcsize(headerfmt + b'4H') + len(entries) + if self._version == 0: + header = pack(headerfmt, version, length, coverage) + else: + header = pack(headerfmt, length, coverage, tuple_index) + return header + pack(b'>4H', npairs, search_range, entry_selector, + range_shift) + entries diff --git a/ebook_converter/utils/fonts/sfnt/loca.py b/ebook_converter/utils/fonts/sfnt/loca.py new file mode 100644 index 0000000..04a9c37 --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/loca.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import array, sys +from operator import itemgetter +from itertools import repeat + +from calibre.utils.fonts.sfnt import UnknownTable +from polyglot.builtins import iteritems, range + + +def four_byte_type_code(): + for c in 'IL': + a = array.array(c) + if a.itemsize == 4: + return c + + +def read_array(data, fmt='H'): + ans = array.array(fmt, data) + if sys.byteorder != 'big': + ans.byteswap() + return ans + + +class LocaTable(UnknownTable): + + def load_offsets(self, head_table, maxp_table): + fmt = 'H' if head_table.index_to_loc_format == 0 else four_byte_type_code() + locs = read_array(self.raw, fmt) + self.offset_map = locs.tolist() + if fmt == 'H': + self.offset_map = [2*i for i in self.offset_map] + self.fmt = fmt + + def glyph_location(self, glyph_id): + offset = 
self.offset_map[glyph_id] + next_offset = self.offset_map[glyph_id+1] + return offset, next_offset - offset + + def update(self, resolved_glyph_map): + ''' + Update this table to contain pointers only to the glyphs in + resolved_glyph_map which must be a map of glyph_ids to (offset, sz) + Note that the loca table is generated for all glyphs from 0 to the + largest glyph that is either in resolved_glyph_map or was present + originally. The pointers to glyphs that have no data will be set to + zero. This preserves glyph ids. + ''' + current_max_glyph_id = len(self.offset_map) - 2 + max_glyph_id = max(resolved_glyph_map or (0,)) + max_glyph_id = max(max_glyph_id, current_max_glyph_id) + self.offset_map = list(repeat(0, max_glyph_id + 2)) + glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in + iteritems(resolved_glyph_map)] + glyphs.sort(key=itemgetter(1)) + for glyph_id, offset, sz in glyphs: + self.offset_map[glyph_id] = offset + self.offset_map[glyph_id+1] = offset + sz + # Fix all zero entries to be the same as the previous entry, which + # means that if the ith entry is zero, the i-1 glyph is not present. 
+ for i in range(1, len(self.offset_map)): + if self.offset_map[i] == 0: + self.offset_map[i] = self.offset_map[i-1] + + vals = self.offset_map + max_offset = max(vals) if vals else 0 + if max_offset < 0x20000 and all(l % 2 == 0 for l in vals): + self.fmt = 'H' + vals = array.array(self.fmt, (i // 2 for i in vals)) + else: + self.fmt = four_byte_type_code() + vals = array.array(self.fmt, vals) + + if sys.byteorder != "big": + vals.byteswap() + self.raw = vals.tostring() + subset = update + + def dump_glyphs(self, sfnt): + if not hasattr(self, 'offset_map'): + self.load_offsets(sfnt[b'head'], sfnt[b'maxp']) + for i in range(len(self.offset_map)-1): + off, noff = self.offset_map[i], self.offset_map[i+1] + if noff != off: + print('Glyph id:', i, 'size:', noff-off) diff --git a/ebook_converter/utils/fonts/sfnt/maxp.py b/ebook_converter/utils/fonts/sfnt/maxp.py new file mode 100644 index 0000000..4d927ee --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/maxp.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from, pack + +from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty +from calibre.utils.fonts.sfnt.errors import UnsupportedFont +from polyglot.builtins import zip + + +class MaxpTable(UnknownTable): + + version = FixedProperty('_version') + + def __init__(self, *args, **kwargs): + super(MaxpTable, self).__init__(*args, **kwargs) + + self._fmt = b'>lH' + self._version, self.num_glyphs = unpack_from(self._fmt, self.raw) + self.fields = ('_version', 'num_glyphs') + + if self.version > 1.0: + raise UnsupportedFont('This font has a maxp table with version: %s' + %self.version) + if self.version == 1.0: + self.fields = ('_version', 'num_glyphs', 'max_points', + 'max_contours', 
'max_composite_points', + 'max_composite_contours', 'max_zones', + 'max_twilight_points', 'max_storage', 'max_function_defs', + 'max_instruction_defs', 'max_stack_elements', + 'max_size_of_instructions', 'max_component_elements', + 'max_component_depth') + self._fmt = b'>lH' + b'H'*(len(self.fields)-2) + + vals = unpack_from(self._fmt, self.raw) + for f, val in zip(self.fields, vals): + setattr(self, f, val) + + def update(self): + vals = [getattr(self, f) for f in self.fields] + self.raw = pack(self._fmt, *vals) diff --git a/ebook_converter/utils/fonts/sfnt/subset.py b/ebook_converter/utils/fonts/sfnt/subset.py new file mode 100644 index 0000000..cdc46f0 --- /dev/null +++ b/ebook_converter/utils/fonts/sfnt/subset.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import traceback +from collections import OrderedDict +from operator import itemgetter +from functools import partial + +from calibre.utils.icu import safe_chr, ord_string +from calibre.utils.fonts.sfnt.container import Sfnt +from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs +from polyglot.builtins import unicode_type, range, iteritems, itervalues, map + +# TrueType outlines {{{ + + +def resolve_glyphs(loca, glyf, character_map, extra_glyphs): + unresolved_glyphs = set(itervalues(character_map)) | extra_glyphs + unresolved_glyphs.add(0) # We always want the .notdef glyph + resolved_glyphs = {} + + while unresolved_glyphs: + glyph_id = unresolved_glyphs.pop() + try: + offset, length = loca.glyph_location(glyph_id) + except (IndexError, ValueError, KeyError, TypeError): + continue + glyph = glyf.glyph_data(offset, length) + resolved_glyphs[glyph_id] = glyph + for gid in glyph.glyph_indices: + if gid not in resolved_glyphs: + 
unresolved_glyphs.add(gid) + + return OrderedDict(sorted(iteritems(resolved_glyphs), key=itemgetter(0))) + + +def subset_truetype(sfnt, character_map, extra_glyphs): + loca = sfnt[b'loca'] + glyf = sfnt[b'glyf'] + + try: + head, maxp = sfnt[b'head'], sfnt[b'maxp'] + except KeyError: + raise UnsupportedFont('This font does not contain head and/or maxp tables') + loca.load_offsets(head, maxp) + + resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs) + if not resolved_glyphs or set(resolved_glyphs) == {0}: + raise NoGlyphs('This font has no glyphs for the specified character ' + 'set, subsetting it is pointless') + + # Keep only character codes that have resolved glyphs + for code, glyph_id in tuple(iteritems(character_map)): + if glyph_id not in resolved_glyphs: + del character_map[code] + + # Update the glyf table + glyph_offset_map = glyf.update(resolved_glyphs) + + # Update the loca table + loca.subset(glyph_offset_map) + head.index_to_loc_format = 0 if loca.fmt == 'H' else 1 + head.update() + maxp.num_glyphs = len(loca.offset_map) - 1 + +# }}} + + +def subset_postscript(sfnt, character_map, extra_glyphs): + cff = sfnt[b'CFF '] + cff.decompile() + cff.subset(character_map, extra_glyphs) + + +def do_warn(warnings, *args): + for arg in args: + for line in arg.splitlines(): + if warnings is None: + print(line) + else: + warnings.append(line) + if warnings is None: + print() + else: + warnings.append('') + + +def pdf_subset(sfnt, glyphs): + for tag in tuple(sfnt.tables): + if tag not in {b'hhea', b'head', b'hmtx', b'maxp', + b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca', + b'prep', b'CFF ', b'VORG'}: + # Remove non core tables since they are unused in PDF rendering + del sfnt[tag] + if b'loca' in sfnt and b'glyf' in sfnt: + # TrueType Outlines + subset_truetype(sfnt, {}, glyphs) + elif b'CFF ' in sfnt: + # PostScript Outlines + subset_postscript(sfnt, {}, glyphs) + else: + raise UnsupportedFont('This font does not contain TrueType ' + 'or 
PostScript outlines') + + +def safe_ord(x): + return ord_string(unicode_type(x))[0] + + +def subset(raw, individual_chars, ranges=(), warnings=None): + warn = partial(do_warn, warnings) + + chars = set(map(safe_ord, individual_chars)) + for r in ranges: + chars |= set(range(safe_ord(r[0]), safe_ord(r[1])+1)) + + # Always add the space character for ease of use from the command line + if safe_ord(' ') not in chars: + chars.add(safe_ord(' ')) + + sfnt = Sfnt(raw) + old_sizes = sfnt.sizes() + + # Remove the Digital Signature table since it is useless in a subset + # font anyway + sfnt.pop(b'DSIG', None) + + # Remove non core tables as they aren't likely to be used by renderers + # anyway + core_tables = {b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name', + b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca', b'prep', + b'CFF ', b'VORG', b'EBDT', b'EBLC', b'EBSC', b'BASE', b'GSUB', + b'GPOS', b'GDEF', b'JSTF', b'gasp', b'hdmx', b'kern', b'LTSH', + b'PCLT', b'VDMX', b'vhea', b'vmtx', b'MATH'} + for tag in list(sfnt): + if tag not in core_tables: + del sfnt[tag] + + try: + cmap = sfnt[b'cmap'] + except KeyError: + raise UnsupportedFont('This font has no cmap table') + + # Get mapping of chars to glyph ids for all specified chars + character_map = cmap.get_character_map(chars) + + extra_glyphs = set() + + if b'GSUB' in sfnt: + # Parse all substitution rules to ensure that glyphs that can be + # substituted for the specified set of glyphs are not removed + gsub = sfnt[b'GSUB'] + try: + gsub.decompile() + extra_glyphs = gsub.all_substitutions(itervalues(character_map)) + except UnsupportedFont as e: + warn('Usupported GSUB table: %s'%e) + except Exception: + warn('Failed to decompile GSUB table:', traceback.format_exc()) + + if b'loca' in sfnt and b'glyf' in sfnt: + # TrueType Outlines + subset_truetype(sfnt, character_map, extra_glyphs) + elif b'CFF ' in sfnt: + # PostScript Outlines + subset_postscript(sfnt, character_map, extra_glyphs) + else: + raise 
# CLI {{{


def option_parser():
    '''
    Build the command line parser used by the ``subset-font`` tool.

    The parser describes three positional arguments (input font file, output
    font file and the list of characters to keep) and a single ``-c/--codes``
    switch that makes the character list be read as numeric unicode codes.

    :return: The configured ``OptionParser`` instance.
    '''
    import textwrap
    from calibre.utils.config import OptionParser
    parser = OptionParser(usage=textwrap.dedent('''\
        %prog [options] input_font_file output_font_file characters_to_keep

        Subset the specified font, keeping only the glyphs for the characters in
        characters_to_keep. characters_to_keep is a comma separated list of characters of
        the form: a,b,c,A-Z,0-9,xyz

        You can specify ranges in the list of characters, as shown above.
        '''))
    parser.add_option('-c', '--codes', default=False, action='store_true',
        help='If specified, the list of characters is interpreted as '
        'numeric unicode codes instead of characters. So to specify the '
        'characters a,b you would use 97,98 or U+0061,U+0062')
    parser.prog = 'subset-font'
    return parser


def print_stats(old_stats, new_stats):
    '''
    Print a per-table size comparison between the original and subset font.

    Tables are listed from largest to smallest original size. For each one the
    original size, its share of the original total, the new size and its share
    of the new total are shown.

    :param old_stats: Mapping of table name to size in the original font.
    :param new_stats: Mapping of table name to size in the subset font. Tables
        missing from it are reported with a new size of 0.
    '''
    from calibre import prints

    def percent(part, whole):
        # A zero total (or zero sized table) must not abort the report with a
        # ZeroDivisionError, report it as 0% instead
        return part/whole * 100 if whole else 0.0

    prints('========= Table comparison (original vs. subset) =========')
    prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size',
           ' New Percent')
    prints('='*80)
    old_total = sum(itervalues(old_stats))
    new_total = sum(itervalues(new_stats))
    tables = sorted(old_stats, key=lambda x:old_stats[x],
            reverse=True)
    for table in tables:
        osz = old_stats[table]
        nsz = new_stats.get(table, 0)
        if nsz == osz:
            suffix = ' | same size'
        elif osz:
            suffix = ' | reduced to %.1f %%'%(nsz/osz * 100)
        else:
            # The original table had no data, so a ratio is meaningless
            suffix = ' | was empty'
        prints('%4s'%table, ' ', '%10s'%osz, ' ', '%5.1f %%'%percent(osz, old_total), ' ',
               '%10s'%nsz, ' ', '%5.1f %%'%percent(nsz, new_total), suffix)
    prints('='*80)


def main(args):
    '''
    Entry point of the ``subset-font`` command line tool.

    Reads the input font, subsets it to the requested characters, prints size
    statistics and writes the result to the output file.

    :param args: The full argument vector including the program name, i.e.
        ``[prog, input_font_file, output_font_file, characters_to_keep]`` plus
        any options.
    :raises SystemExit: With status 1 when the number of positional arguments
        is wrong, a range is malformed or an entry is not exactly one
        character.
    '''
    import sys, time
    from calibre import prints
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) != 4:
        parser.print_help()
        raise SystemExit(1)
    iff, off, chars = args[1:]
    with open(iff, 'rb') as f:
        orig = f.read()

    individual, ranges = set(), set()

    def not_single(c):
        # Every entry has to be exactly one character. Besides over long
        # entries this also rejects the empty strings produced by stray or
        # trailing commas and by open ended ranges such as "a-", which can
        # never name a character.
        if len(c) != 1:
            prints(c, 'is not a single character', file=sys.stderr)
            raise SystemExit(1)

    def conv_code(c):
        # Accept hexadecimal codes (U+0061 or 0x61) as well as plain decimal
        if c.upper()[:2] in ('U+', '0X'):
            c = int(c[2:], 16)
        return safe_chr(int(c))

    for c in chars.split(','):
        if '-' in c:
            parts = [x.strip() for x in c.split('-')]
            if len(parts) != 2:
                prints('Invalid range:', c, file=sys.stderr)
                raise SystemExit(1)
            if opts.codes:
                parts = [conv_code(x) for x in parts]
            for part in parts:
                not_single(part)
            ranges.add(tuple(parts))
        else:
            if opts.codes:
                c = conv_code(c)
            not_single(c)
            individual.add(c)
    st = time.time()
    sf, old_stats, new_stats = subset(orig, individual, ranges)
    taken = time.time() - st
    reduced = (len(sf)/len(orig)) * 100

    def sz(x):
        return '%gKB'%(len(x)/1024.)

    print_stats(old_stats, new_stats)
    prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced))
    prints('Subsetting took %g seconds'%taken)
    with open(off, 'wb') as f:
        f.write(sf)
    prints('Subset font written to:', off)


if __name__ == '__main__':
    try:
        # Optional helper module, its absence is not an error
        import init_calibre
        init_calibre
    except ImportError:
        pass
    import sys
    main(sys.argv)
# }}}

# Tests {{{


def test_mem():
    '''
    Subset the same font many times and print the average growth in the value
    reported by ``memory()`` per call, as a crude leak check.
    '''
    from calibre.utils.mem import memory
    import gc
    gc.collect()
    start_mem = memory()
    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    calls = 1000
    for _ in range(calls):
        subset(raw, (), (('a', 'z'),))
    del raw
    # Collect repeatedly before taking the final measurement
    for _ in range(3):
        gc.collect()
    print('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB')


def test():
    '''
    Smoke test: subsetting a font to three characters must yield a result of
    at most 30% of the original size.

    :raises Exception: If the subset font is larger than that.
    '''
    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
    if len(sf) > 0.3 * len(raw):
        raise Exception('Subsetting failed')


def all():
    '''
    Try to subset every font reported by the font scanner and print a summary
    of unsupported fonts, warnings, failures and the average size reduction.

    NOTE: the name shadows the ``all`` builtin for the rest of the module; it
    is kept because callers refer to it by this name.
    '''
    from calibre.utils.fonts.scanner import font_scanner
    failed = []
    unsupported = []
    warnings = {}
    total = 0
    averages = []
    for family in font_scanner.find_font_families():
        for font in font_scanner.fonts_for_family(family):
            raw = font_scanner.get_font_data(font)
            print('Subsetting', font['full_name'], end='\t')
            total += 1
            try:
                w = []
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')),
                        (), w)
                if w:
                    warnings[font['full_name'] + ' (%s)'%font['path']] = w
            except NoGlyphs:
                print('No glyphs!')
                continue
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], unicode_type(e)))
                print('Unsupported!')
                continue
            except Exception as e:
                print('Failed!')
                failed.append((font['full_name'], font['path'], unicode_type(e)))
            else:
                averages.append(sum(itervalues(new_stats))/sum(itervalues(old_stats)) * 100)
                print('Reduced to:', '%.1f'%averages[-1] , '%')
    if unsupported:
        print('\n\nUnsupported:')
        for name, path, err in unsupported:
            print(name, path, err)
        print()
    if warnings:
        print('\n\nWarnings:')
        for name, w in iteritems(warnings):
            if w:
                print(name)
                print('', '\n\t'.join(w), sep='\t')
    if failed:
        print('\n\nFailures:')
        for name, path, err in failed:
            print(name, path, err)
        print()

    if averages:
        print('Average reduction to: %.1f%%'%(sum(averages)/len(averages)))
    else:
        # Nothing was subset successfully, so there is no average to compute
        # (dividing by len(averages) would raise ZeroDivisionError)
        print('Average reduction to: n/a (no font was subset successfully)')
    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
            len(failed), 'Warnings:', len(warnings))


# }}}