Switch to a pure python implementation of font subsetting

2025-08-30 23:00:21 -04:00 · 2012-11-06 11:39:48 +05:30 · 2012-11-06 11:39:48 +05:30 · f54843c547
commit f54843c547
parent 18db66fd77
10 changed files with 383 additions and 13 deletions
--- a/src/calibre/debug.py
+++ b/src/calibre/debug.py
@ -212,7 +212,7 @@ def main(args=sys.argv):
        return

    if len(args) > 1 and args[1] in ('-f', '--subset-font'):
-        from calibre.utils.fonts.subset import main
+        from calibre.utils.fonts.sfnt.subset import main
        main(['subset-font']+args[2:])
        return

--- a/src/calibre/ebooks/oeb/transforms/subset.py
+++ b/src/calibre/ebooks/oeb/transforms/subset.py
@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en'
 from collections import defaultdict

 from calibre.ebooks.oeb.base import urlnormalize
-from calibre.utils.fonts.subset import subset, NoGlyphs, UnsupportedFont
+from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont

 class SubsetFonts(object):

--- a/src/calibre/utils/fonts/sfnt/__init__.py
+++ b/src/calibre/utils/fonts/sfnt/__init__.py
@ -26,6 +26,9 @@ class UnknownTable(object):
    def __call__(self):
        return self.raw

+    def __len__(self):
+        # The size of a table is the length of its raw data
+        return len(self.raw)
+
 class DateTimeProperty(object):

    def __init__(self, name):
@ -46,10 +49,10 @@ class FixedProperty(object):

    def __get__(self, obj, type=None):
        val = getattr(obj, self.name)
-        return val * (2**-16)
+        return val / 0x10000

    def __set__(self, obj, val):
-        return int(round(val*(2**16)))
+        # Store val on the underlying attribute as a 16.16 fixed point
+        # integer, the inverse of the conversion done in __get__. The return
+        # value of a descriptor's __set__ is ignored by Python, so the
+        # converted value has to be assigned explicitly.
+        setattr(obj, self.name, int(round(val*(0x10000))))

 def max_power_of_two(x):
 	"""
@ -62,4 +65,10 @@ def max_power_of_two(x):
 		exponent += 1
 	return max(exponent - 1, 0)

+def load_font(stream_or_path):
+    '''
+    Create an Sfnt object from font data. If stream_or_path has a read()
+    method the data is read from it, otherwise stream_or_path itself is handed
+    to Sfnt as the data.
+
+    NOTE(review): despite the parameter name, a filesystem path is not opened
+    here, it would be passed to Sfnt unchanged -- confirm what callers pass.
+    '''
+    raw = stream_or_path
+    if hasattr(raw, 'read'):
+        raw = raw.read()
+    # NOTE(review): imported locally, presumably to avoid a circular import
+    # with the container module -- confirm
+    from calibre.utils.fonts.sfnt.container import Sfnt
+    return Sfnt(raw)

--- a/src/calibre/utils/fonts/sfnt/cmap.py
+++ b/src/calibre/utils/fonts/sfnt/cmap.py
@ -144,6 +144,7 @@ class CmapTable(UnknownTable):
            except IndexError:
                next_offset = len(self.raw)
            table = self.raw[offset:next_offset]
+            if table:
                fmt = unpack_from(b'>H', table)[0]
                if platform == 3 and encoding == 1 and fmt == 4:
                    self.bmp_table = table
--- a/src/calibre/utils/fonts/sfnt/container.py
+++ b/src/calibre/utils/fonts/sfnt/container.py
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'

 from struct import pack, calcsize
 from io import BytesIO
+from collections import OrderedDict

 from calibre.utils.fonts.utils import (get_tables, checksum_of_block,
        verify_checksums)
@ -18,6 +19,8 @@ from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 from calibre.utils.fonts.sfnt.head import HeadTable
 from calibre.utils.fonts.sfnt.maxp import MaxpTable
 from calibre.utils.fonts.sfnt.loca import LocaTable
+from calibre.utils.fonts.sfnt.glyf import GlyfTable
+from calibre.utils.fonts.sfnt.cmap import CmapTable

 class Sfnt(object):

@ -35,6 +38,8 @@ class Sfnt(object):
                    b'head' : HeadTable,
                    b'maxp' : MaxpTable,
                    b'loca' : LocaTable,
+                    b'glyf' : GlyfTable,
+                    b'cmap' : CmapTable,
                    }.get(table_tag, UnknownTable)(table)

    def __getitem__(self, key):
@ -49,6 +54,12 @@ class Sfnt(object):
    def pop(self, key, default=None):
        return self.tables.pop(key, default)

+    def sizes(self):
+        '''
+        Return an OrderedDict mapping every table tag to the len() of the
+        corresponding table, with the tags in sorted order.
+        '''
+        ans = OrderedDict()
+        for tag in sorted(self.tables):
+            ans[tag] = len(self[tag])
+        return ans
+
    def __call__(self):
        stream = BytesIO()

@ -68,6 +79,7 @@ class Sfnt(object):
        head_offset = None
        table_data = []
        offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables )
+        sizes = OrderedDict()
        for tag in sorted(self.tables):
            table = self.tables[tag]
            raw = table()
@ -80,6 +92,7 @@ class Sfnt(object):
            spack(b'>4s3L', tag, checksum, offset, table_len)
            offset += len(raw)
            table_data.append(raw)
+            sizes[tag] = table_len

        for x in table_data:
            stream.write(x)
@ -89,7 +102,7 @@ class Sfnt(object):
        stream.seek(head_offset + 8)
        spack(b'>L', q)

-        return stream.getvalue()
+        return stream.getvalue(), sizes

 def test_roundtrip(ff=None):
    if ff is None:
@ -97,7 +110,7 @@ def test_roundtrip(ff=None):
    else:
        with open(ff, 'rb') as f:
            data = f.read()
-    rd = Sfnt(data)()
+    rd = Sfnt(data)()[0]
    verify_checksums(rd)
    if data[:12] != rd[:12]:
        raise ValueError('Roundtripping failed, font header not the same')
--- a/src/calibre/utils/fonts/sfnt/errors.py
+++ b/src/calibre/utils/fonts/sfnt/errors.py
@ -10,3 +10,6 @@ __docformat__ = 'restructuredtext en'
 class UnsupportedFont(ValueError):
    pass

+class NoGlyphs(ValueError):
+    # Raised when a font has no glyphs for the characters it is being subset to
+    pass
+
--- a/src/calibre/utils/fonts/sfnt/glyf.py
+++ b/src/calibre/utils/fonts/sfnt/glyf.py
@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from struct import unpack_from
+from collections import OrderedDict
+
+from calibre.utils.fonts.sfnt import UnknownTable
+
+ARG_1_AND_2_ARE_WORDS      = 0x0001  # if set args are words otherwise they are bytes
+ARGS_ARE_XY_VALUES         = 0x0002  # if set args are xy values, otherwise they are points
+ROUND_XY_TO_GRID           = 0x0004  # for the xy values if above is true
+WE_HAVE_A_SCALE            = 0x0008  # Sx = Sy, otherwise scale == 1.0
+NON_OVERLAPPING            = 0x0010  # set to same value for all components (obsolete!)
+MORE_COMPONENTS            = 0x0020  # indicates at least one more glyph after this one
+WE_HAVE_AN_X_AND_Y_SCALE   = 0x0040  # Sx, Sy
+WE_HAVE_A_TWO_BY_TWO       = 0x0080  # t00, t01, t10, t11
+WE_HAVE_INSTRUCTIONS       = 0x0100  # instructions follow
+USE_MY_METRICS             = 0x0200  # apply these metrics to parent glyph
+OVERLAP_COMPOUND           = 0x0400  # used by Apple in GX fonts
+SCALED_COMPONENT_OFFSET    = 0x0800  # composite designed to have the component offset scaled (designed for Apple)
+UNSCALED_COMPONENT_OFFSET  = 0x1000  # composite designed not to have the component offset scaled (designed for MS)
+
+class SimpleGlyph(object):
+    '''
+    A single glyph from the glyf table. The glyph data is kept as the opaque
+    byte string raw and is returned unchanged when the object is called.
+    '''
+
+    def __init__(self, num_of_countours, raw):
+        # num_of_countours is the value read from the first two bytes of raw
+        # by GlyfTable.glyph_data()
+        self.num_of_countours = num_of_countours
+        self.raw = raw
+        # The list of glyph indices referred to by this glyph, will always be
+        # empty for a simple glyph and not empty for a composite glyph
+        self.glyph_indices = []
+        self.is_composite = False
+
+    def __len__(self):
+        # Number of bytes of glyph data
+        return len(self.raw)
+
+    def __call__(self):
+        # Serialize the glyph, the data is never modified so this is just raw
+        return self.raw
+
+class CompositeGlyph(SimpleGlyph):
+    '''
+    A glyph that is built out of other glyphs. On construction the component
+    records in raw are walked and the glyph index of every component is
+    appended to self.glyph_indices. The raw data itself is left untouched.
+    '''
+
+    def __init__(self, num_of_countours, raw):
+        super(CompositeGlyph, self).__init__(num_of_countours, raw)
+        self.is_composite = True
+
+        flags = MORE_COMPONENTS
+        # raw starts with the 10 byte glyph header (numberOfContours, xMin,
+        # yMin, xMax, yMax, each 16 bits), the component records follow it.
+        # Starting at 0 would misread the header as a component record and
+        # add a bogus glyph index (the value of xMin) to glyph_indices.
+        offset = 10
+        while flags & MORE_COMPONENTS:
+            flags, glyph_index = unpack_from(b'>HH', raw, offset)
+            self.glyph_indices.append(glyph_index)
+            # Skip over the flags and glyph index that were just read
+            offset += 4
+            # Skip over the two arguments, which are either words or bytes
+            if flags & ARG_1_AND_2_ARE_WORDS:
+                offset += 4
+            else:
+                offset += 2
+            # Skip over the optional transformation: a single scale, an x and
+            # y scale or a full two by two matrix
+            if flags & WE_HAVE_A_SCALE:
+                offset += 2
+            elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
+                offset += 4
+            elif flags & WE_HAVE_A_TWO_BY_TWO:
+                offset += 8
+
+class GlyfTable(UnknownTable):
+    '''
+    The glyf table. Glyphs are extracted from self.raw by offset and length,
+    and the table can be rebuilt to contain only a given set of glyphs.
+    '''
+
+    def glyph_data(self, offset, length):
+        '''
+        Return the glyph stored in the length bytes of this table starting at
+        offset, as a SimpleGlyph or a CompositeGlyph.
+        '''
+        raw = self.raw[offset:offset+length]
+        # The data starts with a signed 16 bit number of contours, empty data
+        # is treated as a simple glyph with zero contours
+        num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0
+        # A negative number of contours marks a composite glyph
+        if num_of_countours >= 0:
+            return SimpleGlyph(num_of_countours, raw)
+        return CompositeGlyph(num_of_countours, raw)
+
+    def update(self, sorted_glyph_map):
+        '''
+        Replace the contents of this table with the data of the glyphs in
+        sorted_glyph_map (a map of glyph_id to glyph object), written out in
+        the iteration order of the map.
+
+        Returns an OrderedDict mapping glyph_id to (offset, length), the
+        location of that glyph in the new table data.
+        '''
+        ans = OrderedDict()
+        offset = 0
+        block = []
+        for glyph_id, glyph in sorted_glyph_map.iteritems():
+            raw = glyph()
+            ans[glyph_id] = (offset, len(raw))
+            offset += len(raw)
+            block.append(raw)
+        self.raw = b''.join(block)
+        return ans
+
--- a/src/calibre/utils/fonts/sfnt/loca.py
+++ b/src/calibre/utils/fonts/sfnt/loca.py
@ -7,7 +7,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-from struct import calcsize, unpack_from
+from struct import calcsize, unpack_from, pack
+from operator import itemgetter

 from calibre.utils.fonts.sfnt import UnknownTable

@ -23,9 +24,43 @@ class LocaTable(UnknownTable):
        self.offset_map = self.offset_map[:num_glyphs+1]
        if fmt == 'H':
            self.offset_map = [2*i for i in self.offset_map]
+        self.fmt = fmt

    def glyph_location(self, glyph_id):
        offset = self.offset_map[glyph_id]
        next_offset = self.offset_map[glyph_id+1]
        return offset, next_offset - offset

+    def subset(self, resolved_glyph_map):
+        '''
+        Update this table to contain pointers only to the glyphs in
+        resolved_glyph_map which must be a map of glyph_ids to (offset, sz).
+        Both self.offset_map and self.raw are rebuilt, the number of entries
+        in the table does not change.
+        '''
+        # Start with every entry zeroed, so that entries not set below can be
+        # recognized and filled in afterwards
+        self.offset_map = [0 for i in self.offset_map]
+        glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in
+                    resolved_glyph_map.iteritems()]
+        # Process the glyphs in order of increasing offset
+        glyphs.sort(key=itemgetter(1))
+        for glyph_id, offset, sz in glyphs:
+            self.offset_map[glyph_id] = offset
+            self.offset_map[glyph_id+1] = offset + sz
+        # Fix all zero entries to be the same as the previous entry, which
+        # means that if the ith entry is zero, the i-1 glyph is not present.
+        for i in xrange(1, len(self.offset_map)):
+            if self.offset_map[i] == 0:
+                self.offset_map[i] = self.offset_map[i-1]
+
+        vals = self.offset_map
+        # In the 'H' format the stored values are the offsets divided by two
+        # (they are multiplied by two when the offsets are loaded)
+        if self.fmt == 'H':
+            vals = [i//2 for i in self.offset_map]
+
+        self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals)
+
+    def dump_glyphs(self, sfnt):
+        '''
+        Print the id and size of every glyph that has a non-zero size in this
+        table. The offsets are loaded first, using the head and maxp tables of
+        sfnt, if that has not been done already.
+        '''
+        if not hasattr(self, 'offset_map'):
+            self.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
+        for i in xrange(len(self.offset_map)-1):
+            off, noff = self.offset_map[i], self.offset_map[i+1]
+            # Equal consecutive offsets mean there is no data for glyph i
+            if noff != off:
+                print ('Glyph id:', i, 'size:', noff-off)
+
+
--- a/src/calibre/utils/fonts/sfnt/subset.py
+++ b/src/calibre/utils/fonts/sfnt/subset.py
@ -7,23 +7,73 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

+from collections import OrderedDict
+from operator import itemgetter
+
 from calibre.utils.fonts.sfnt.container import Sfnt
-from calibre.utils.fonts.sfnt.errors import UnsupportedFont
+from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
+
+# TrueType outlines {{{
+
+def resolve_glyphs(loca, glyf, character_map):
+    '''
+    Find all the glyphs needed for the glyph ids that are the values of
+    character_map, including glyph 0 and any glyphs referred to by composite
+    glyphs. Glyph ids that cannot be located or that have no data are
+    silently left out.
+
+    Returns an OrderedDict of glyph_id to glyph object, sorted by glyph_id.
+    '''
+    unresolved_glyphs = set(character_map.itervalues())
+    unresolved_glyphs.add(0) # We always want the .notdef glyph
+    resolved_glyphs = {}
+
+    while unresolved_glyphs:
+        glyph_id = unresolved_glyphs.pop()
+        try:
+            offset, length = loca.glyph_location(glyph_id)
+        except (IndexError, ValueError, KeyError, TypeError):
+            # No usable location for this glyph id, ignore it
+            continue
+        if length < 1:
+            continue
+        glyph = glyf.glyph_data(offset, length)
+        if len(glyph) == 0:
+            continue
+        resolved_glyphs[glyph_id] = glyph
+        # Queue up the glyphs this glyph refers to, glyph_indices is empty
+        # for simple glyphs
+        for gid in glyph.glyph_indices:
+            if gid not in resolved_glyphs:
+                unresolved_glyphs.add(gid)
+
+    return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0)))

 def subset_truetype(sfnt, character_map):
+    '''
+    Subset the TrueType outlines in sfnt, in place. The glyf and loca tables
+    are rebuilt to contain only the glyphs needed for character_map (a map of
+    character codes to glyph ids). Entries whose glyph could not be resolved
+    are deleted from character_map.
+
+    Raises UnsupportedFont if the head or maxp table is missing and NoGlyphs
+    if no glyph other than glyph 0 could be resolved.
+    '''
    loca = sfnt[b'loca']
+    glyf = sfnt[b'glyf']
+
    try:
        head, maxp = sfnt[b'head'], sfnt[b'maxp']
    except KeyError:
        raise UnsupportedFont('This font does not contain head and/or maxp tables')
    loca.load_offsets(head, maxp)

+    resolved_glyphs = resolve_glyphs(loca, glyf, character_map)
+    if not resolved_glyphs or set(resolved_glyphs) == {0}:
+        raise NoGlyphs('This font has no glyphs for the specified character '
+                'set, subsetting it is pointless')
+
+    # Keep only character codes that have resolved glyphs
+    # (iterate over a copy, as character_map is modified in the loop)
+    for code, glyph_id in tuple(character_map.iteritems()):
+        if glyph_id not in resolved_glyphs:
+            del character_map[code]
+
+    # Update the glyf table
+    glyph_offset_map = glyf.update(resolved_glyphs)
+
+    # Update the loca table
+    loca.subset(glyph_offset_map)
+
+# }}}
+
 def subset(raw, individual_chars, ranges=()):
    chars = list(map(ord, individual_chars))
    for r in ranges:
        chars += list(xrange(ord(r[0]), ord(r[1])+1))

    sfnt = Sfnt(raw)
+    old_sizes = sfnt.sizes()
+
    # Remove the Digital Signature table since it is useless in a subset
    # font anyway
    sfnt.pop(b'DSIG', None)
@ -35,16 +85,186 @@ def subset(raw, individual_chars, ranges=()):

    # Get mapping of chars to glyph ids for all specified chars
    character_map = cmap.get_character_map(chars)
-    # Restrict the cmap table to only contain entries for the specified chars
-    cmap.set_character_map(character_map)

    if b'loca' in sfnt and b'glyf' in sfnt:
+        # TrueType Outlines
        subset_truetype(sfnt, character_map)
    elif b'CFF ' in sfnt:
+        # PostScript Outlines
        raise UnsupportedFont('This font contains PostScript outlines, '
                'subsetting not supported')
    else:
        raise UnsupportedFont('This font does not contain TrueType '
                'or PostScript outlines')

+    # Restrict the cmap table to only contain entries for the resolved glyphs
+    cmap.set_character_map(character_map)
+
+    raw, new_sizes = sfnt()
+    return raw, old_sizes, new_sizes
+
+# CLI {{{
+def option_parser():
+    '''
+    Create the option parser for the subset-font command line tool.
+    '''
+    import textwrap
+    from calibre.utils.config import OptionParser
+    parser = OptionParser(usage=textwrap.dedent('''\
+            %prog [options] input_font_file output_font_file characters_to_keep
+
+            Subset the specified font, keeping only the glyphs for the characters in
+            characters_to_keep. characters_to_keep is a comma separated list of characters of
+            the form: a,b,c,A-Z,0-9,xyz
+
+            You can specify ranges in the list of characters, as shown above.
+            '''))
+    parser.add_option('-c', '--codes', default=False, action='store_true',
+            help='If specified, the list of characters is interpreted as '
+            'numeric unicode codes instead of characters. So to specify the '
+            'characters a,b you would use 97,98')
+    parser.prog = 'subset-font'
+    return parser
+
+def print_stats(old_stats, new_stats):
+    '''
+    Print a table comparing the table sizes of the original font (old_stats)
+    with those of the subset font (new_stats). Both are maps of table tag to
+    size. One row is printed per table in old_stats, largest table first.
+    '''
+    from calibre import prints
+    prints('========= Table comparison (original vs. subset) =========')
+    prints('Table', ' ', '%10s'%'Size', '  ', 'Percent', '   ', '%10s'%'New Size',
+            ' New Percent')
+    prints('='*80)
+    old_total = sum(old_stats.itervalues())
+    new_total = sum(new_stats.itervalues())
+    tables = sorted(old_stats.iterkeys(), key=lambda x:old_stats[x],
+            reverse=True)
+    for table in tables:
+        osz = old_stats[table]
+        # Share of this table in the total size of the original font
+        op = osz/old_total * 100
+        # Tables missing from the subset font are reported with size zero
+        nsz = new_stats.get(table, 0)
+        np = nsz/new_total * 100
+        suffix = ' | same size'
+        if nsz != osz:
+            suffix = ' | reduced to %.1f %%'%(nsz/osz * 100)
+        prints('%4s'%table, '  ', '%10s'%osz, '  ', '%5.1f %%'%op, '   ',
+                '%10s'%nsz, '  ', '%5.1f %%'%np, suffix)
+    prints('='*80)
+
+
+def main(args):
+    '''
+    Entry point for the subset-font command line tool. args is the full
+    argument list: after option parsing it must consist of a leading entry
+    (which is skipped) followed by the input font file, the output font file
+    and the comma separated list of characters to keep.
+
+    Exits with status 1 if the arguments are invalid.
+    '''
+    import sys, time
+    from calibre import prints
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    if len(args) != 4:
+        parser.print_help()
+        raise SystemExit(1)
+    iff, off, chars = args[1:]
+    with open(iff, 'rb') as f:
+        orig = f.read()
+
+    chars = [x.strip() for x in chars.split(',')]
+    individual, ranges = set(), set()
+
+    def not_single(c):
+        # Empty strings (from input such as "a,,b", "a-" or a bare "-") have
+        # to be rejected as well, otherwise they reach subset() and make
+        # ord() fail with a TypeError
+        if len(c) != 1:
+            prints(c, 'is not a single character', file=sys.stderr)
+            raise SystemExit(1)
+
+    for c in chars:
+        if '-' in c:
+            parts = [x.strip() for x in c.split('-')]
+            if len(parts) != 2:
+                prints('Invalid range:', c, file=sys.stderr)
+                raise SystemExit(1)
+            if opts.codes:
+                parts = tuple(map(unichr, map(int, parts)))
+            # Explicit loop, so that the check does not depend on map()
+            # being evaluated eagerly
+            for part in parts:
+                not_single(part)
+            ranges.add(tuple(parts))
+        else:
+            if opts.codes:
+                c = unichr(int(c))
+            not_single(c)
+            individual.add(c)
+    st = time.time()
+    sf, old_stats, new_stats = subset(orig, individual, ranges)
+    taken = time.time() - st
+    reduced = (len(sf)/len(orig)) * 100
+    def sz(x):
+        return '%gKB'%(len(x)/1024.)
+    print_stats(old_stats, new_stats)
+    prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced))
+    prints('Subsetting took %g seconds'%taken)
+    with open(off, 'wb') as f:
+        f.write(sf)
+    prints('Subset font written to:', off)
+
+# Allow this module to be run directly as a script
+if __name__ == '__main__':
+    try:
+        # init_calibre is optional, a failure to import it is ignored
+        # NOTE(review): the bare name below presumably only exists to avoid
+        # an unused import warning -- confirm
+        import init_calibre
+        init_calibre
+    except ImportError:
+        pass
+    import sys
+    main(sys.argv)
+# }}}
+
+# Tests {{{
+def test_mem():
+    '''
+    Subset the Liberation Serif font 1000 times and print the growth in the
+    value reported by memory() per call.
+    '''
+    from calibre.utils.mem import memory
+    import gc
+    gc.collect()
+    start_mem = memory()
+    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
+    calls = 1000
+    for i in xrange(calls):
+        subset(raw, (), (('a', 'z'),))
+    del raw
+    for i in xrange(3): gc.collect()
+    # NOTE(review): the factor of 1024 assumes memory() reports MB -- confirm
+    print ('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB')
+
+def test():
+    '''
+    Subset the Liberation Serif font to the characters a, b and c and raise
+    an Exception if the result is larger than 30% of the original font.
+    '''
+    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
+    sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
+    if len(sf) > 0.3 * len(raw):
+        raise Exception('Subsetting failed')
+
+def all():
+    '''
+    Try to subset every font reported by the font scanner to the characters
+    a, b and c, printing the result for each font, followed by the details of
+    the unsupported and failed fonts and a summary.
+    '''
+    from calibre.utils.fonts.scanner import font_scanner
+    failed = []
+    unsupported = []
+    total = 0
+    for family in font_scanner.find_font_families():
+        for font in font_scanner.fonts_for_family(family):
+            raw = font_scanner.get_font_data(font)
+            print ('Subsetting', font['full_name'], end='\t')
+            total += 1
+            try:
+                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
+            except NoGlyphs:
+                # Fonts without glyphs for a, b, c count as neither
+                # unsupported nor failed
+                print('No glyphs!')
+                continue
+            except UnsupportedFont as e:
+                unsupported.append((font['full_name'], font['path'], unicode(e)))
+                print ('Unsupported!')
+                continue
+            except Exception as e:
+                # Any other error is recorded as a failure and reported at
+                # the end, instead of aborting the run
+                print ('Failed!')
+                failed.append((font['full_name'], font['path'], unicode(e)))
+            else:
+                print ('Reduced to:', '%.1f'%(
+                        sum(new_stats.itervalues())/sum(old_stats.itervalues())
+                        * 100), '%')
+    if unsupported:
+        print ('\n\nUnsupported:')
+        for name, path, err in unsupported:
+            print (name, path, err)
+            print()
+    if failed:
+        print ('\n\nFailures:')
+        for name, path, err in failed:
+            print (name, path, err)
+            print()
+
+    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
+            len(failed))
+
+
+# }}}
+

--- a/src/calibre/utils/fonts/subset.py
+++ b/src/calibre/utils/fonts/subset.py
@ -120,6 +120,7 @@ def all():
            try:
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
            except NoGlyphs:
+                print ('No glyphs!')
                continue
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], unicode(e)))