import traceback
from collections import OrderedDict
from operator import itemgetter
from functools import partial

from ebook_converter.utils.icu import safe_chr, ord_string
from ebook_converter.utils.fonts.sfnt.container import Sfnt
from ebook_converter.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
from ebook_converter.polyglot.builtins import iteritems, itervalues


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


# TrueType outlines {{{

def resolve_glyphs(loca, glyf, character_map, extra_glyphs):
    unresolved_glyphs = set(itervalues(character_map)) | extra_glyphs
    unresolved_glyphs.add(0)  # We always want the .notdef glyph
    resolved_glyphs = {}

    while unresolved_glyphs:
        glyph_id = unresolved_glyphs.pop()
        try:
            offset, length = loca.glyph_location(glyph_id)
        except (IndexError, ValueError, KeyError, TypeError):
            continue
        glyph = glyf.glyph_data(offset, length)
        resolved_glyphs[glyph_id] = glyph
        for gid in glyph.glyph_indices:
            if gid not in resolved_glyphs:
                unresolved_glyphs.add(gid)

    return OrderedDict(sorted(iteritems(resolved_glyphs), key=itemgetter(0)))


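# Note: glyph.glyph_indices yields the component glyph ids referenced by
# composite glyphs, so the worklist above computes the transitive closure of
# every glyph reachable from the character map (plus the mandatory .notdef
# glyph at id 0). For example, keeping only 'é' would also retain the 'e' and
# combining-accent component glyphs, if the font builds 'é' as a composite.
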
def subset_truetype(sfnt, character_map, extra_glyphs):
    loca = sfnt[b'loca']
    glyf = sfnt[b'glyf']

    try:
        head, maxp = sfnt[b'head'], sfnt[b'maxp']
    except KeyError:
        raise UnsupportedFont('This font does not contain head and/or maxp '
                              'tables')
    loca.load_offsets(head, maxp)

    resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs)
    if not resolved_glyphs or set(resolved_glyphs) == {0}:
        raise NoGlyphs('This font has no glyphs for the specified character '
                       'set, subsetting it is pointless')

    # Keep only character codes that have resolved glyphs
    for code, glyph_id in tuple(iteritems(character_map)):
        if glyph_id not in resolved_glyphs:
            del character_map[code]

    # Update the glyf table
    glyph_offset_map = glyf.update(resolved_glyphs)

    # Update the loca table
    loca.subset(glyph_offset_map)
    head.index_to_loc_format = 0 if loca.fmt == 'H' else 1
    head.update()
    maxp.num_glyphs = len(loca.offset_map) - 1

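# head.index_to_loc_format selects the 'loca' offset encoding defined by the
# sfnt spec: 0 means short offsets (16-bit, stored divided by two) and 1 means
# long (32-bit) offsets, matching the struct format ('H' for short) chosen by
# loca.subset() above.
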
# }}}


def subset_postscript(sfnt, character_map, extra_glyphs):
    cff = sfnt[b'CFF ']
    cff.decompile()
    cff.subset(character_map, extra_glyphs)


def do_warn(warnings, *args):
    for arg in args:
        for line in arg.splitlines():
            if warnings is None:
                print(line)
            else:
                warnings.append(line)
    if warnings is None:
        print()
    else:
        warnings.append('')


def pdf_subset(sfnt, glyphs):
    for tag in tuple(sfnt.tables):
        if tag not in {b'hhea', b'head', b'hmtx', b'maxp', b'OS/2', b'post',
                       b'cvt ', b'fpgm', b'glyf', b'loca', b'prep', b'CFF ',
                       b'VORG'}:
            # Remove non core tables since they are unused in PDF rendering
            del sfnt[tag]
    if b'loca' in sfnt and b'glyf' in sfnt:
        # TrueType Outlines
        subset_truetype(sfnt, {}, glyphs)
    elif b'CFF ' in sfnt:
        # PostScript Outlines
        subset_postscript(sfnt, {}, glyphs)
    else:
        raise UnsupportedFont('This font does not contain TrueType '
                              'or PostScript outlines')


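# A minimal usage sketch (assuming the caller already knows which glyph ids
# were actually drawn, e.g. from a PDF text renderer; the ids are
# illustrative):
#
#     sfnt = Sfnt(raw_font_bytes)
#     pdf_subset(sfnt, {3, 68, 69, 70})
#     subset_bytes, table_sizes = sfnt()
#
# Unlike subset() below, which starts from characters, pdf_subset() works on
# glyph ids directly and passes an empty character map.
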
def safe_ord(x):
    return ord_string(str(x))[0]


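# safe_ord() goes through ord_string() rather than the builtin ord() so that
# characters outside the Basic Multilingual Plane resolve to a single code
# point even when they arrive as surrogate pairs (an assumption based on the
# purpose of the icu helper).
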
def subset(raw, individual_chars, ranges=(), warnings=None):
    warn = partial(do_warn, warnings)

    chars = set(map(safe_ord, individual_chars))
    for r in ranges:
        chars |= set(range(safe_ord(r[0]), safe_ord(r[1])+1))

    # Always add the space character for ease of use from the command line
    if safe_ord(' ') not in chars:
        chars.add(safe_ord(' '))

    sfnt = Sfnt(raw)
    old_sizes = sfnt.sizes()

    # Remove the Digital Signature table since it is useless in a subset
    # font anyway
    sfnt.pop(b'DSIG', None)

    # Remove non core tables as they aren't likely to be used by renderers
    # anyway
    core_tables = {b'cmap', b'hhea', b'head', b'hmtx', b'maxp', b'name',
                   b'OS/2', b'post', b'cvt ', b'fpgm', b'glyf', b'loca',
                   b'prep', b'CFF ', b'VORG', b'EBDT', b'EBLC', b'EBSC',
                   b'BASE', b'GSUB', b'GPOS', b'GDEF', b'JSTF', b'gasp',
                   b'hdmx', b'kern', b'LTSH', b'PCLT', b'VDMX', b'vhea',
                   b'vmtx', b'MATH'}
    for tag in list(sfnt):
        if tag not in core_tables:
            del sfnt[tag]

    try:
        cmap = sfnt[b'cmap']
    except KeyError:
        raise UnsupportedFont('This font has no cmap table')

    # Get mapping of chars to glyph ids for all specified chars
    character_map = cmap.get_character_map(chars)

    extra_glyphs = set()

    if b'GSUB' in sfnt:
        # Parse all substitution rules to ensure that glyphs that can be
        # substituted for the specified set of glyphs are not removed
        gsub = sfnt[b'GSUB']
        try:
            gsub.decompile()
            extra_glyphs = gsub.all_substitutions(itervalues(character_map))
        except UnsupportedFont as e:
            warn('Unsupported GSUB table: %s' % e)
        except Exception:
            warn('Failed to decompile GSUB table:', traceback.format_exc())

    if b'loca' in sfnt and b'glyf' in sfnt:
        # TrueType Outlines
        subset_truetype(sfnt, character_map, extra_glyphs)
    elif b'CFF ' in sfnt:
        # PostScript Outlines
        subset_postscript(sfnt, character_map, extra_glyphs)
    else:
        raise UnsupportedFont('This font does not contain TrueType '
                              'or PostScript outlines')

    # Restrict the cmap table to only contain entries for the resolved glyphs
    cmap.set_character_map(character_map)

    if b'kern' in sfnt:
        try:
            sfnt[b'kern'].restrict_to_glyphs(frozenset(itervalues(character_map)))
        except UnsupportedFont as e:
            warn('kern table unsupported, ignoring: %s' % e)
        except Exception:
            warn('Subsetting of kern table failed, ignoring:',
                 traceback.format_exc())

    raw, new_sizes = sfnt()
    return raw, old_sizes, new_sizes


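# A minimal usage sketch of the programmatic API (file names are
# illustrative):
#
#     with open('SomeFont.ttf', 'rb') as f:
#         raw = f.read()
#     new_raw, old_sizes, new_sizes = subset(raw, 'Hello world',
#                                            ranges=(('0', '9'),))
#     with open('SomeFont.subset.ttf', 'wb') as f:
#         f.write(new_raw)
#
# individual_chars is any iterable of single characters, each range is an
# inclusive (start, end) pair, and warnings, if given, must be a list that
# collects messages instead of printing them.
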
# CLI {{{

def option_parser():
    import textwrap
    from ebook_converter.utils.config import OptionParser
    parser = OptionParser(usage=textwrap.dedent('''\
        %prog [options] input_font_file output_font_file characters_to_keep

        Subset the specified font, keeping only the glyphs for the characters in
        characters_to_keep. characters_to_keep is a comma separated list of characters of
        the form: a,b,c,A-Z,0-9,xyz

        You can specify ranges in the list of characters, as shown above.
        '''))
    parser.add_option('-c', '--codes', default=False, action='store_true',
                      help='If specified, the list of characters is interpreted as '
                      'numeric unicode codes instead of characters. So to specify the '
                      'characters a,b you would use 97,98 or U+0061,U+0062')
    parser.prog = 'subset-font'
    return parser


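# Example invocations (paths are illustrative; the command name follows
# parser.prog above):
#
#     subset-font input.ttf output.ttf a,b,c,A-Z,0-9
#     subset-font --codes input.ttf output.ttf U+0041-U+005A,97,0x62
#
# With --codes, values may be given in decimal (97), hex (0x62) or U+ notation
# (U+0041), as handled by conv_code() in main() below.
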
def print_stats(old_stats, new_stats):
    from ebook_converter import prints
    prints('========= Table comparison (original vs. subset) =========')
    prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size',
           ' New Percent')
    prints('='*80)
    old_total = sum(itervalues(old_stats))
    new_total = sum(itervalues(new_stats))
    tables = sorted(old_stats, key=lambda x: old_stats[x], reverse=True)
    for table in tables:
        osz = old_stats[table]
        op = osz/old_total * 100
        nsz = new_stats.get(table, 0)
        np = nsz/new_total * 100
        suffix = ' | same size'
        if nsz != osz:
            suffix = ' | reduced to %.1f %%' % (nsz/osz * 100)
        prints('%4s'%table, ' ', '%10s'%osz, ' ', '%5.1f %%'%op, ' ',
               '%10s'%nsz, ' ', '%5.1f %%'%np, suffix)
    prints('='*80)


def main(args):
    import sys, time
    from ebook_converter import prints
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 4 or len(args) > 4:
        parser.print_help()
        raise SystemExit(1)
    iff, off, chars = args[1:]
    with open(iff, 'rb') as f:
        orig = f.read()

    chars = [x for x in chars.split(',')]
    individual, ranges = set(), set()

    def not_single(c):
        if len(c) > 1:
            prints(c, 'is not a single character', file=sys.stderr)
            raise SystemExit(1)

    def conv_code(c):
        if c.upper()[:2] in ('U+', '0X'):
            c = int(c[2:], 16)
        return safe_chr(int(c))

    for c in chars:
        if '-' in c:
            parts = [x.strip() for x in c.split('-')]
            if len(parts) != 2:
                prints('Invalid range:', c, file=sys.stderr)
                raise SystemExit(1)
            if opts.codes:
                parts = tuple(map(conv_code, parts))
            tuple(map(not_single, parts))
            ranges.add(tuple(parts))
        else:
            if opts.codes:
                c = conv_code(c)
            not_single(c)
            individual.add(c)
    st = time.time()
    sf, old_stats, new_stats = subset(orig, individual, ranges)
    taken = time.time() - st
    reduced = (len(sf)/len(orig)) * 100

    def sz(x):
        return '%gKB' % (len(x)/1024.)
    print_stats(old_stats, new_stats)
    prints('Original size:', sz(orig), 'Subset size:', sz(sf),
           'Reduced to: %g%%' % (reduced))
    prints('Subsetting took %g seconds' % taken)
    with open(off, 'wb') as f:
        f.write(sf)
    prints('Subset font written to:', off)


if __name__ == '__main__':
    try:
        import init_calibre
        init_calibre
    except ImportError:
        pass
    import sys
    main(sys.argv)
# }}}


# Tests {{{

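# The helpers below exercise subset() against fonts found on the system; they
# assume a calibre-style environment where the P() resource loader and
# font_scanner are available, so they may not run in a stripped-down install.
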
def test_mem():
    from ebook_converter.utils.mem import memory
    import gc
    gc.collect()
    start_mem = memory()
    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    calls = 1000
    for i in range(calls):
        subset(raw, (), (('a', 'z'),))
    del raw
    for i in range(3):
        gc.collect()
    print('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB')


def test():
    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
    if len(sf) > 0.3 * len(raw):
        raise Exception('Subsetting failed')


def all():
    from ebook_converter.utils.fonts.scanner import font_scanner
    failed = []
    unsupported = []
    warnings = {}
    total = 0
    averages = []
    for family in font_scanner.find_font_families():
        for font in font_scanner.fonts_for_family(family):
            raw = font_scanner.get_font_data(font)
            print('Subsetting', font['full_name'], end='\t')
            total += 1
            try:
                w = []
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')),
                                                  (), w)
                if w:
                    warnings[font['full_name'] + ' (%s)' % font['path']] = w
            except NoGlyphs:
                print('No glyphs!')
                continue
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], str(e)))
                print('Unsupported!')
                continue
            except Exception as e:
                print('Failed!')
                failed.append((font['full_name'], font['path'], str(e)))
            else:
                averages.append(sum(itervalues(new_stats)) /
                                sum(itervalues(old_stats)) * 100)
                print('Reduced to:', '%.1f' % averages[-1], '%')
    if unsupported:
        print('\n\nUnsupported:')
        for name, path, err in unsupported:
            print(name, path, err)
        print()
    if warnings:
        print('\n\nWarnings:')
        for name, w in iteritems(warnings):
            if w:
                print(name)
                print('', '\n\t'.join(w), sep='\t')
    if failed:
        print('\n\nFailures:')
        for name, path, err in failed:
            print(name, path, err)
        print()

    print('Average reduction to: %.1f%%' % (sum(averages)/len(averages)))
    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
          len(failed), 'Warnings:', len(warnings))
# }}}