Switch to a pure python implementation of font subsetting

2025-07-09 03:04:10 -04:00 · 2012-11-06 11:39:48 +05:30 · 2012-11-06 11:39:48 +05:30 · f54843c547
commit f54843c547
parent 18db66fd77
10 changed files with 383 additions and 13 deletions
--- a/src/calibre/debug.py
+++ b/src/calibre/debug.py
@ -212,7 +212,7 @@ def main(args=sys.argv):
        return
    if len(args) > 1 and args[1] in ('-f', '--subset-font'):
-        from calibre.utils.fonts.subset import main
+        from calibre.utils.fonts.sfnt.subset import main
        main(['subset-font']+args[2:])
        return
--- a/src/calibre/ebooks/oeb/transforms/subset.py
+++ b/src/calibre/ebooks/oeb/transforms/subset.py
@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en'
 from collections import defaultdict
 from calibre.ebooks.oeb.base import urlnormalize
-from calibre.utils.fonts.subset import subset, NoGlyphs, UnsupportedFont
+from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
 class SubsetFonts(object):
--- a/src/calibre/utils/fonts/sfnt/init.py
+++ b/src/calibre/utils/fonts/sfnt/init.py
@ -26,6 +26,9 @@ class UnknownTable(object):
    def __call__(self):
        return self.raw
    def __len__(self):
        return len(self.raw)
 class DateTimeProperty(object):
    def __init__(self, name):
@ -46,10 +49,10 @@ class FixedProperty(object):
    def __get__(self, obj, type=None):
        val = getattr(obj, self.name)
-        return val * (2**-16)
+        return val / 0x10000
    def __set__(self, obj, val):
-        return int(round(val*(2**16)))
+        return int(round(val*(0x10000)))
 def max_power_of_two(x):
 	"""
@ -62,4 +65,10 @@ def max_power_of_two(x):
 		exponent += 1
 	return max(exponent - 1, 0)
 def load_font(stream_or_path):
    raw = stream_or_path
    if hasattr(raw, 'read'):
        raw = raw.read()
    from calibre.utils.fonts.sfnt.container import Sfnt
    return Sfnt(raw)
--- a/src/calibre/utils/fonts/sfnt/cmap.py
+++ b/src/calibre/utils/fonts/sfnt/cmap.py
@ -144,6 +144,7 @@ class CmapTable(UnknownTable):
            except IndexError:
                next_offset = len(self.raw)
            table = self.raw[offset:next_offset]
            if table:
                fmt = unpack_from(b'>H', table)[0]
                if platform == 3 and encoding == 1 and fmt == 4:
                    self.bmp_table = table
--- a/src/calibre/utils/fonts/sfnt/container.py
+++ b/src/calibre/utils/fonts/sfnt/container.py
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
 from struct import pack, calcsize
 from io import BytesIO
 from collections import OrderedDict
 from calibre.utils.fonts.utils import (get_tables, checksum_of_block,
        verify_checksums)
@ -18,6 +19,8 @@ from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 from calibre.utils.fonts.sfnt.head import HeadTable
 from calibre.utils.fonts.sfnt.maxp import MaxpTable
 from calibre.utils.fonts.sfnt.loca import LocaTable
 from calibre.utils.fonts.sfnt.glyf import GlyfTable
 from calibre.utils.fonts.sfnt.cmap import CmapTable
 class Sfnt(object):
@ -35,6 +38,8 @@ class Sfnt(object):
                    b'head' : HeadTable,
                    b'maxp' : MaxpTable,
                    b'loca' : LocaTable,
                    b'glyf' : GlyfTable,
                    b'cmap' : CmapTable,
                    }.get(table_tag, UnknownTable)(table)
    def __getitem__(self, key):
@ -49,6 +54,12 @@ class Sfnt(object):
    def pop(self, key, default=None):
        return self.tables.pop(key, default)
    def sizes(self):
        ans = OrderedDict()
        for tag in sorted(self.tables):
            ans[tag] = len(self[tag])
        return ans
    def __call__(self):
        stream = BytesIO()
@ -68,6 +79,7 @@ class Sfnt(object):
        head_offset = None
        table_data = []
        offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables )
        sizes = OrderedDict()
        for tag in sorted(self.tables):
            table = self.tables[tag]
            raw = table()
@ -80,6 +92,7 @@ class Sfnt(object):
            spack(b'>4s3L', tag, checksum, offset, table_len)
            offset += len(raw)
            table_data.append(raw)
            sizes[tag] = table_len
        for x in table_data:
            stream.write(x)
@ -89,7 +102,7 @@ class Sfnt(object):
        stream.seek(head_offset + 8)
        spack(b'>L', q)
-        return stream.getvalue()
+        return stream.getvalue(), sizes
 def test_roundtrip(ff=None):
    if ff is None:
@ -97,7 +110,7 @@ def test_roundtrip(ff=None):
    else:
        with open(ff, 'rb') as f:
            data = f.read()
-    rd = Sfnt(data)()
+    rd = Sfnt(data)()[0]
    verify_checksums(rd)
    if data[:12] != rd[:12]:
        raise ValueError('Roundtripping failed, font header not the same')
--- a/src/calibre/utils/fonts/sfnt/errors.py
+++ b/src/calibre/utils/fonts/sfnt/errors.py
@ -10,3 +10,6 @@ __docformat__ = 'restructuredtext en'
 class UnsupportedFont(ValueError):
    pass
 class NoGlyphs(ValueError):
    pass
--- a/src/calibre/utils/fonts/sfnt/glyf.py
+++ b/src/calibre/utils/fonts/sfnt/glyf.py
@ -0,0 +1,88 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from struct import unpack_from
 from collections import OrderedDict
 from calibre.utils.fonts.sfnt import UnknownTable
 ARG_1_AND_2_ARE_WORDS      = 0x0001  # if set args are words otherwise they are bytes
 ARGS_ARE_XY_VALUES         = 0x0002  # if set args are xy values, otherwise they are points
 ROUND_XY_TO_GRID           = 0x0004  # for the xy values if above is true
 WE_HAVE_A_SCALE            = 0x0008  # Sx = Sy, otherwise scale == 1.0
 NON_OVERLAPPING            = 0x0010  # set to same value for all components (obsolete!)
 MORE_COMPONENTS            = 0x0020  # indicates at least one more glyph after this one
 WE_HAVE_AN_X_AND_Y_SCALE   = 0x0040  # Sx, Sy
 WE_HAVE_A_TWO_BY_TWO       = 0x0080  # t00, t01, t10, t11
 WE_HAVE_INSTRUCTIONS       = 0x0100  # instructions follow
 USE_MY_METRICS             = 0x0200  # apply these metrics to parent glyph
 OVERLAP_COMPOUND           = 0x0400  # used by Apple in GX fonts
 SCALED_COMPONENT_OFFSET    = 0x0800  # composite designed to have the component offset scaled (designed for Apple)
 UNSCALED_COMPONENT_OFFSET  = 0x1000  # composite designed not to have the component offset scaled (designed for MS)
 class SimpleGlyph(object):
    def __init__(self, num_of_countours, raw):
        self.num_of_countours = num_of_countours
        self.raw = raw
        # The list of glyph indices referred to by this glyph, will always be
        # empty for a simple glyph and not empty for a composite glyph
        self.glyph_indices = []
        self.is_composite = False
    def __len__(self):
        return len(self.raw)
    def __call__(self):
        return self.raw
 class CompositeGlyph(SimpleGlyph):
    def __init__(self, num_of_countours, raw):
        super(CompositeGlyph, self).__init__(num_of_countours, raw)
        self.is_composite = True
        flags = MORE_COMPONENTS
        offset = 0
        while flags & MORE_COMPONENTS:
            flags, glyph_index = unpack_from(b'>HH', raw, offset)
            self.glyph_indices.append(glyph_index)
            offset += 4
            if flags & ARG_1_AND_2_ARE_WORDS:
                offset += 4
            else:
                offset += 2
            if flags & WE_HAVE_A_SCALE:
                offset += 2
            elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
                offset += 4
            elif flags & WE_HAVE_A_TWO_BY_TWO:
                offset += 8
 class GlyfTable(UnknownTable):
    def glyph_data(self, offset, length):
        raw = self.raw[offset:offset+length]
        num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0
        if num_of_countours >= 0:
            return SimpleGlyph(num_of_countours, raw)
        return CompositeGlyph(num_of_countours, raw)
    def update(self, sorted_glyph_map):
        ans = OrderedDict()
        offset = 0
        block = []
        for glyph_id, glyph in sorted_glyph_map.iteritems():
            raw = glyph()
            ans[glyph_id] = (offset, len(raw))
            offset += len(raw)
            block.append(raw)
        self.raw = b''.join(block)
        return ans
--- a/src/calibre/utils/fonts/sfnt/loca.py
+++ b/src/calibre/utils/fonts/sfnt/loca.py
@ -7,7 +7,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-from struct import calcsize, unpack_from
+from struct import calcsize, unpack_from, pack
 from operator import itemgetter
 from calibre.utils.fonts.sfnt import UnknownTable
@ -23,9 +24,43 @@ class LocaTable(UnknownTable):
        self.offset_map = self.offset_map[:num_glyphs+1]
        if fmt == 'H':
            self.offset_map = [2*i for i in self.offset_map]
        self.fmt = fmt
    def glyph_location(self, glyph_id):
        offset = self.offset_map[glyph_id]
        next_offset = self.offset_map[glyph_id+1]
        return offset, next_offset - offset
    def subset(self, resolved_glyph_map):
        '''
        Update this table to contain pointers only to the glyphs in
        resolved_glyph_map which must be a map of glyph_ids to (offset, sz)
        '''
        self.offset_map = [0 for i in self.offset_map]
        glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in
                    resolved_glyph_map.iteritems()]
        glyphs.sort(key=itemgetter(1))
        for glyph_id, offset, sz in glyphs:
            self.offset_map[glyph_id] = offset
            self.offset_map[glyph_id+1] = offset + sz
        # Fix all zero entries to be the same as the previous entry, which
        # means that if the ith entry is zero, the i-1 glyph is not present.
        for i in xrange(1, len(self.offset_map)):
            if self.offset_map[i] == 0:
                self.offset_map[i] = self.offset_map[i-1]
        vals = self.offset_map
        if self.fmt == 'H':
            vals = [i//2 for i in self.offset_map]
        self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals)
    def dump_glyphs(self, sfnt):
        if not hasattr(self, 'offset_map'):
            self.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
        for i in xrange(len(self.offset_map)-1):
            off, noff = self.offset_map[i], self.offset_map[i+1]
            if noff != off:
                print ('Glyph id:', i, 'size:', noff-off)
--- a/src/calibre/utils/fonts/sfnt/subset.py
+++ b/src/calibre/utils/fonts/sfnt/subset.py
@ -7,23 +7,73 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from collections import OrderedDict
 from operator import itemgetter
 from calibre.utils.fonts.sfnt.container import Sfnt
-from calibre.utils.fonts.sfnt.errors import UnsupportedFont
+from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
 # TrueType outlines {{{
 def resolve_glyphs(loca, glyf, character_map):
    unresolved_glyphs = set(character_map.itervalues())
    unresolved_glyphs.add(0) # We always want the .notdef glyph
    resolved_glyphs = {}
    while unresolved_glyphs:
        glyph_id = unresolved_glyphs.pop()
        try:
            offset, length = loca.glyph_location(glyph_id)
        except (IndexError, ValueError, KeyError, TypeError):
            continue
        if length < 1:
            continue
        glyph = glyf.glyph_data(offset, length)
        if len(glyph) == 0:
            continue
        resolved_glyphs[glyph_id] = glyph
        for gid in glyph.glyph_indices:
            if gid not in resolved_glyphs:
                unresolved_glyphs.add(gid)
    return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0)))
 def subset_truetype(sfnt, character_map):
    loca = sfnt[b'loca']
    glyf = sfnt[b'glyf']
    try:
        head, maxp = sfnt[b'head'], sfnt[b'maxp']
    except KeyError:
        raise UnsupportedFont('This font does not contain head and/or maxp tables')
    loca.load_offsets(head, maxp)
    resolved_glyphs = resolve_glyphs(loca, glyf, character_map)
    if not resolved_glyphs or set(resolved_glyphs) == {0}:
        raise NoGlyphs('This font has no glyphs for the specified character '
                'set, subsetting it is pointless')
    # Keep only character codes that have resolved glyphs
    for code, glyph_id in tuple(character_map.iteritems()):
        if glyph_id not in resolved_glyphs:
            del character_map[code]
    # Update the glyf table
    glyph_offset_map = glyf.update(resolved_glyphs)
    # Update the loca table
    loca.subset(glyph_offset_map)
 # }}}
 def subset(raw, individual_chars, ranges=()):
    chars = list(map(ord, individual_chars))
    for r in ranges:
        chars += list(xrange(ord(r[0]), ord(r[1])+1))
    sfnt = Sfnt(raw)
    old_sizes = sfnt.sizes()
    # Remove the Digital Signature table since it is useless in a subset
    # font anyway
    sfnt.pop(b'DSIG', None)
@ -35,16 +85,186 @@ def subset(raw, individual_chars, ranges=()):
    # Get mapping of chars to glyph ids for all specified chars
    character_map = cmap.get_character_map(chars)
    # Restrict the cmap table to only contain entries for the specified chars
    cmap.set_character_map(character_map)
    if b'loca' in sfnt and b'glyf' in sfnt:
        # TrueType Outlines
        subset_truetype(sfnt, character_map)
    elif b'CFF ' in sfnt:
        # PostScript Outlines
        raise UnsupportedFont('This font contains PostScript outlines, '
                'subsetting not supported')
    else:
        raise UnsupportedFont('This font does not contain TrueType '
                'or PostScript outlines')
    # Restrict the cmap table to only contain entries for the resolved glyphs
    cmap.set_character_map(character_map)
    raw, new_sizes = sfnt()
    return raw, old_sizes, new_sizes
 # CLI {{{
 def option_parser():
    import textwrap
    from calibre.utils.config import OptionParser
    parser = OptionParser(usage=textwrap.dedent('''\
            %prog [options] input_font_file output_font_file characters_to_keep
            Subset the specified font, keeping only the glyphs for the characters in
            characters_to_keep. characters_to_keep is a comma separated list of characters of
            the form: a,b,c,A-Z,0-9,xyz
            You can specify ranges in the list of characters, as shown above.
            '''))
    parser.add_option('-c', '--codes', default=False, action='store_true',
            help='If specified, the list of characters is interpreted as '
            'numeric unicode codes instead of characters. So to specify the '
            'characters a,b you would use 97,98')
    parser.prog = 'subset-font'
    return parser
 def print_stats(old_stats, new_stats):
    from calibre import prints
    prints('========= Table comparison (original vs. subset) =========')
    prints('Table', ' ', '%10s'%'Size', '  ', 'Percent', '   ', '%10s'%'New Size',
            ' New Percent')
    prints('='*80)
    old_total = sum(old_stats.itervalues())
    new_total = sum(new_stats.itervalues())
    tables = sorted(old_stats.iterkeys(), key=lambda x:old_stats[x],
            reverse=True)
    for table in tables:
        osz = old_stats[table]
        op = osz/old_total * 100
        nsz = new_stats.get(table, 0)
        np = nsz/new_total * 100
        suffix = ' | same size'
        if nsz != osz:
            suffix = ' | reduced to %.1f %%'%(nsz/osz * 100)
        prints('%4s'%table, '  ', '%10s'%osz, '  ', '%5.1f %%'%op, '   ',
                '%10s'%nsz, '  ', '%5.1f %%'%np, suffix)
    prints('='*80)
 def main(args):
    import sys, time
    from calibre import prints
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 4 or len(args) > 4:
        parser.print_help()
        raise SystemExit(1)
    iff, off, chars = args[1:]
    with open(iff, 'rb') as f:
        orig = f.read()
    chars = [x.strip() for x in chars.split(',')]
    individual, ranges = set(), set()
    def not_single(c):
        if len(c) > 1:
            prints(c, 'is not a single character', file=sys.stderr)
            raise SystemExit(1)
    for c in chars:
        if '-' in c:
            parts = [x.strip() for x in c.split('-')]
            if len(parts) != 2:
                prints('Invalid range:', c, file=sys.stderr)
                raise SystemExit(1)
            if opts.codes:
                parts = tuple(map(unichr, map(int, parts)))
            map(not_single, parts)
            ranges.add(tuple(parts))
        else:
            if opts.codes:
                c = unichr(int(c))
            not_single(c)
            individual.add(c)
    st = time.time()
    sf, old_stats, new_stats = subset(orig, individual, ranges)
    taken = time.time() - st
    reduced = (len(sf)/len(orig)) * 100
    def sz(x):
        return '%gKB'%(len(x)/1024.)
    print_stats(old_stats, new_stats)
    prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced))
    prints('Subsetting took %g seconds'%taken)
    with open(off, 'wb') as f:
        f.write(sf)
    prints('Subset font written to:', off)
 if __name__ == '__main__':
    try:
        import init_calibre
        init_calibre
    except ImportError:
        pass
    import sys
    main(sys.argv)
 # }}}
 # Tests {{{
 def test_mem():
    from calibre.utils.mem import memory
    import gc
    gc.collect()
    start_mem = memory()
    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    calls = 1000
    for i in xrange(calls):
        subset(raw, (), (('a', 'z'),))
    del raw
    for i in xrange(3): gc.collect()
    print ('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB')
 def test():
    raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
    if len(sf) > 0.3 * len(raw):
        raise Exception('Subsetting failed')
 def all():
    from calibre.utils.fonts.scanner import font_scanner
    failed = []
    unsupported = []
    total = 0
    for family in font_scanner.find_font_families():
        for font in font_scanner.fonts_for_family(family):
            raw = font_scanner.get_font_data(font)
            print ('Subsetting', font['full_name'], end='\t')
            total += 1
            try:
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
            except NoGlyphs:
                print('No glyphs!')
                continue
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], unicode(e)))
                print ('Unsupported!')
                continue
            except Exception as e:
                print ('Failed!')
                failed.append((font['full_name'], font['path'], unicode(e)))
            else:
                print ('Reduced to:', '%.1f'%(
                        sum(new_stats.itervalues())/sum(old_stats.itervalues())
                        * 100), '%')
    if unsupported:
        print ('\n\nUnsupported:')
        for name, path, err in unsupported:
            print (name, path, err)
            print()
    if failed:
        print ('\n\nFailures:')
        for name, path, err in failed:
            print (name, path, err)
            print()
    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
            len(failed))
 # }}}
--- a/src/calibre/utils/fonts/subset.py
+++ b/src/calibre/utils/fonts/subset.py
@ -120,6 +120,7 @@ def all():
            try:
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
            except NoGlyphs:
                print ('No glyphs!')
                continue
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], unicode(e)))