Added docx writer related modules

2026-04-23 22:51:30 +02:00 · 2020-04-13 16:33:15 +02:00
parent ae80ae5640
commit 98b2dd8d4f
29 changed files with 5956 additions and 0 deletions
@@ -0,0 +1,316 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import defaultdict
+
+from calibre.ebooks.oeb.base import urlnormalize, css_text
+from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
+from polyglot.builtins import iteritems, itervalues, unicode_type, range
+from tinycss.fonts3 import parse_font_family
+
+
+def get_font_properties(rule, default=None):
+    '''
+    Given a CSS rule, extract normalized font properties from
+    it. Note that shorthand font property should already have been expanded
+    by the CSS flattening code.
+    '''
+    props = {}
+    s = rule.style
+    for q in ('font-family', 'src', 'font-weight', 'font-stretch',
+            'font-style'):
+        g = 'uri' if q == 'src' else 'value'
+        try:
+            val = s.getProperty(q).propertyValue[0]
+            val = getattr(val, g)
+            if q == 'font-family':
+                val = parse_font_family(css_text(s.getProperty(q).propertyValue))
+                if val and val[0] == 'inherit':
+                    val = None
+        except (IndexError, KeyError, AttributeError, TypeError, ValueError):
+            val = None if q in {'src', 'font-family'} else default
+        if q in {'font-weight', 'font-stretch', 'font-style'}:
+            val = unicode_type(val).lower() if (val or val == 0) else val
+            if val == 'inherit':
+                val = default
+        if q == 'font-weight':
+            val = {'normal':'400', 'bold':'700'}.get(val, val)
+            if val not in {'100', '200', '300', '400', '500', '600', '700',
+                    '800', '900', 'bolder', 'lighter'}:
+                val = default
+            if val == 'normal':
+                val = '400'
+        elif q == 'font-style':
+            if val not in {'normal', 'italic', 'oblique'}:
+                val = default
+        elif q == 'font-stretch':
+            if val not in {'normal', 'ultra-condensed', 'extra-condensed',
+                    'condensed', 'semi-condensed', 'semi-expanded',
+                    'expanded', 'extra-expanded', 'ultra-expanded'}:
+                val = default
+        props[q] = val
+    return props
+
+
+def find_font_face_rules(sheet, oeb):
+    '''
+    Find all @font-face rules in the given sheet and extract the relevant info from them.
+    sheet can be either a ManifestItem or a CSSStyleSheet.
+    '''
+    ans = []
+    try:
+        rules = sheet.data.cssRules
+    except AttributeError:
+        rules = sheet.cssRules
+
+    for i, rule in enumerate(rules):
+        if rule.type != rule.FONT_FACE_RULE:
+            continue
+        props = get_font_properties(rule, default='normal')
+        if not props['font-family'] or not props['src']:
+            continue
+
+        try:
+            path = sheet.abshref(props['src'])
+        except AttributeError:
+            path = props['src']
+        ff = oeb.manifest.hrefs.get(urlnormalize(path), None)
+        if not ff:
+            continue
+        props['item'] = ff
+        if props['font-weight'] in {'bolder', 'lighter'}:
+            props['font-weight'] = '400'
+        props['weight'] = int(props['font-weight'])
+        props['rule'] = rule
+        props['chars'] = set()
+        ans.append(props)
+
+    return ans
+
+
+def elem_style(style_rules, cls, inherited_style):
+    '''
+    Find the effective style for the given element.
+    '''
+    classes = cls.split()
+    style = inherited_style.copy()
+    for cls in classes:
+        style.update(style_rules.get(cls, {}))
+    wt = style.get('font-weight', None)
+    pwt = inherited_style.get('font-weight', '400')
+    if wt == 'bolder':
+        style['font-weight'] = {
+                '100':'400',
+                '200':'400',
+                '300':'400',
+                '400':'700',
+                '500':'700',
+                }.get(pwt, '900')
+    elif wt == 'lighter':
+        style['font-weight'] = {
+                '600':'400', '700':'400',
+                '800':'700', '900':'700'}.get(pwt, '100')
+
+    return style
+
+
+class SubsetFonts(object):
+
+    '''
+    Subset all embedded fonts. Must be run after CSS flattening, as it requires
+    CSS normalization and flattening to work.
+    '''
+
+    def __call__(self, oeb, log, opts):
+        self.oeb, self.log, self.opts = oeb, log, opts
+
+        self.find_embedded_fonts()
+        if not self.embedded_fonts:
+            self.log.debug('No embedded fonts found')
+            return
+        self.find_style_rules()
+        self.find_font_usage()
+
+        totals = [0, 0]
+
+        def remove(font):
+            totals[1] += len(font['item'].data)
+            self.oeb.manifest.remove(font['item'])
+            font['rule'].parentStyleSheet.deleteRule(font['rule'])
+
+        fonts = {}
+        for font in self.embedded_fonts:
+            item, chars = font['item'], font['chars']
+            if item.href in fonts:
+                fonts[item.href]['chars'] |= chars
+            else:
+                fonts[item.href] = font
+
+        for font in itervalues(fonts):
+            if not font['chars']:
+                self.log('The font %s is unused. Removing it.'%font['src'])
+                remove(font)
+                continue
+            try:
+                raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
+            except NoGlyphs:
+                self.log('The font %s has no used glyphs. Removing it.'%font['src'])
+                remove(font)
+                continue
+            except UnsupportedFont as e:
+                self.log.warn('The font %s is unsupported for subsetting. %s'%(
+                    font['src'], e))
+                sz = len(font['item'].data)
+                totals[0] += sz
+                totals[1] += sz
+            else:
+                font['item'].data = raw
+                nlen = sum(itervalues(new_stats))
+                olen = sum(itervalues(old_stats))
+                self.log('Decreased the font %s to %.1f%% of its original size'%
+                        (font['src'], nlen/olen *100))
+                totals[0] += nlen
+                totals[1] += olen
+
+            font['item'].unload_data_from_memory()
+
+        if totals[0]:
+            self.log('Reduced total font size to %.1f%% of original'%
+                    (totals[0]/totals[1] * 100))
+
+    def find_embedded_fonts(self):
+        '''
+        Find all @font-face rules and extract the relevant info from them.
+        '''
+        self.embedded_fonts = []
+        for item in self.oeb.manifest:
+            if not hasattr(item.data, 'cssRules'):
+                continue
+            self.embedded_fonts.extend(find_font_face_rules(item, self.oeb))
+
+    def find_style_rules(self):
+        '''
+        Extract all font related style information from all stylesheets into a
+        dict mapping classes to font properties specified by that class. All
+        the heavy lifting has already been done by the CSS flattening code.
+        '''
+        rules = defaultdict(dict)
+        for item in self.oeb.manifest:
+            if not hasattr(item.data, 'cssRules'):
+                continue
+            for i, rule in enumerate(item.data.cssRules):
+                if rule.type != rule.STYLE_RULE:
+                    continue
+                props = {k:v for k,v in
+                        iteritems(get_font_properties(rule)) if v}
+                if not props:
+                    continue
+                for sel in rule.selectorList:
+                    sel = sel.selectorText
+                    if sel and sel.startswith('.'):
+                        # We dont care about pseudo-selectors as the worst that
+                        # can happen is some extra characters will remain in
+                        # the font
+                        sel = sel.partition(':')[0]
+                        rules[sel[1:]].update(props)
+
+        self.style_rules = dict(rules)
+
+    def find_font_usage(self):
+        for item in self.oeb.manifest:
+            if not hasattr(item.data, 'xpath'):
+                continue
+            for body in item.data.xpath('//*[local-name()="body"]'):
+                base = {'font-family':['serif'], 'font-weight': '400',
+                        'font-style':'normal', 'font-stretch':'normal'}
+                self.find_usage_in(body, base)
+
+    def used_font(self, style):
+        '''
+        Given a style find the embedded font that matches it. Returns None if
+        no match is found (can happen if no family matches).
+        '''
+        ff = style.get('font-family', [])
+        lnames = {unicode_type(x).lower() for x in ff}
+        matching_set = []
+
+        # Filter on font-family
+        for ef in self.embedded_fonts:
+            flnames = {x.lower() for x in ef.get('font-family', [])}
+            if not lnames.intersection(flnames):
+                continue
+            matching_set.append(ef)
+        if not matching_set:
+            return None
+
+        # Filter on font-stretch
+        widths = {x:i for i, x in enumerate(('ultra-condensed',
+                'extra-condensed', 'condensed', 'semi-condensed', 'normal',
+                'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
+                ))}
+
+        width = widths[style.get('font-stretch', 'normal')]
+        for f in matching_set:
+            f['width'] = widths[style.get('font-stretch', 'normal')]
+
+        min_dist = min(abs(width-f['width']) for f in matching_set)
+        nearest = [f for f in matching_set if abs(width-f['width']) ==
+            min_dist]
+        if width <= 4:
+            lmatches = [f for f in nearest if f['width'] <= width]
+        else:
+            lmatches = [f for f in nearest if f['width'] >= width]
+        matching_set = (lmatches or nearest)
+
+        # Filter on font-style
+        fs = style.get('font-style', 'normal')
+        order = {
+                'oblique':['oblique', 'italic', 'normal'],
+                'normal':['normal', 'oblique', 'italic']
+            }.get(fs, ['italic', 'oblique', 'normal'])
+        for q in order:
+            matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
+            if matches:
+                matching_set = matches
+                break
+
+        # Filter on font weight
+        fw = int(style.get('font-weight', '400'))
+        if fw == 400:
+            q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
+        elif fw == 500:
+            q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
+        elif fw < 400:
+            q = [fw] + list(range(fw-100, -100, -100)) + list(range(fw+100,
+                100, 1000))
+        else:
+            q = [fw] + list(range(fw+100, 100, 1000)) + list(range(fw-100,
+                -100, -100))
+        for wt in q:
+            matches = [f for f in matching_set if f['weight'] == wt]
+            if matches:
+                return matches[0]
+
+    def find_chars(self, elem):
+        ans = set()
+        if elem.text:
+            ans |= set(elem.text)
+        for child in elem:
+            if child.tail:
+                ans |= set(child.tail)
+        return ans
+
+    def find_usage_in(self, elem, inherited_style):
+        style = elem_style(self.style_rules, elem.get('class', '') or '', inherited_style)
+        for child in elem:
+            self.find_usage_in(child, style)
+        font = self.used_font(style)
+        if font:
+            chars = self.find_chars(elem)
+            if chars:
+                font['chars'] |= chars