Font subsetting: Parse the GSUB table for glyph substitution rules and do not remove any glyphs that could act as substitutes. Keep zero-length glyphs, like the glyphs for non-printable characters, when subsetting TrueType outlines.

2025-06-23 15:30:45 -04:00 · 2012-11-10 17:32:13 +05:30 · 2012-11-10 17:32:13 +05:30 · b4a49e5cdd
commit b4a49e5cdd
parent e494412887
5 changed files with 491 additions and 35 deletions
--- a/src/calibre/utils/fonts/sfnt/cff/table.py
+++ b/src/calibre/utils/fonts/sfnt/cff/table.py
@ -186,7 +186,7 @@ class CFFTable(UnknownTable):
    def decompile(self):
        self.cff = CFF(self.raw)
-    def subset(self, character_map):
+    def subset(self, character_map, extra_glyphs):
        from calibre.utils.fonts.sfnt.cff.writer import Subset
        # Map codes from the cmap table to glyph names, this will be used to
        # reconstruct character_map for the subset font
@ -196,6 +196,9 @@ class CFFTable(UnknownTable):
        charset.discard(None)
        if not charset:
            raise NoGlyphs('This font has no glyphs for the specified characters')
        charset |= {
            self.cff.charset.safe_lookup(glyph_id) for glyph_id in extra_glyphs}
        charset.discard(None)
        s = Subset(self.cff, charset)
        # Rebuild character_map with the glyph ids from the subset font
--- a/src/calibre/utils/fonts/sfnt/common.py
+++ b/src/calibre/utils/fonts/sfnt/common.py
@ -0,0 +1,240 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from struct import unpack_from, calcsize
 from collections import OrderedDict, namedtuple
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 class Unpackable(object):
    '''Cursor over a bytes buffer that reads big-endian struct fields.
    ``offset`` is the current read position and advances on every call to
    unpack(). ``start_pos`` remembers where reading started, so that offsets
    stored relative to the start of a table can be resolved later.'''
    def __init__(self, raw, offset):
        self.raw, self.offset = raw, offset
        self.start_pos = offset
    def unpack(self, fmt, single_special=True):
        '''Read the fields described by ``fmt`` (struct syntax, without a byte
        order prefix) at the current offset and advance past them.
        Returns a tuple of values, or the bare value when exactly one field
        was read and ``single_special`` is true. Raises struct.error if the
        buffer is too short.'''
        fmt = fmt.encode('ascii') if not isinstance(fmt, bytes) else fmt
        # The size must be computed with the same byte order prefix that is
        # used for unpacking. Without it calcsize() uses native sizes and
        # alignment (e.g. 'L' can be 8 bytes, aligned to 8), which would
        # advance the offset by the wrong amount.
        fmt = b'>' + fmt
        ans = unpack_from(fmt, self.raw, self.offset)
        if single_special and len(ans) == 1:
            ans = ans[0]
        self.offset += calcsize(fmt)
        return ans
 class SimpleListTable(list):
    '''A table holding a list of subtables. The data is a 16 bit count
    followed by that many 16 bit offsets, each relative to the start of this
    table. Every offset is handed to child_class to build the subtable.'''
    child_class = None
    def __init__(self, raw, offset):
        list.__init__(self)
        reader = Unpackable(raw, offset)
        self.read_extra_header(reader)
        num_children = reader.unpack('H')
        for _ in xrange(num_children):
            child_pos = reader.start_pos + reader.unpack('H')
            self.append(self.child_class(raw, child_pos))
        self.read_extra_footer(reader)
    def read_extra_header(self, data):
        'Hook for subclasses: consume fields stored before the subtable count'
        return None
    def read_extra_footer(self, data):
        'Hook for subclasses: consume fields stored after the subtable offsets'
        return None
 class ListTable(OrderedDict):
    '''A table that contains an ordered mapping of table tag to subtable.
    The data is a 16 bit count followed by that many (4 byte tag, 16 bit
    offset) records. Offsets are relative to the start of this table and are
    handed to child_class to build each subtable.'''
    child_class = None
    def __init__(self, raw, offset):
        OrderedDict.__init__(self)
        data = Unpackable(raw, offset)
        self.read_extra_header(data)
        count = data.unpack('H')
        for i in xrange(count):
            tag, coffset = data.unpack('4sH')
            self[tag] = self.child_class(raw, data.start_pos + coffset)
        self.read_extra_footer(data)
    def read_extra_header(self, data):
        'Hook for subclasses: consume fields stored before the record count'
        pass
    def read_extra_footer(self, data):
        'Hook for subclasses: consume fields stored after the records'
        pass
    def dump(self, prefix=''):
        'Print this table and, recursively, its children, indented by prefix'
        print (prefix, self.__class__.__name__, sep='')
        prefix += '  '
        for tag in self:
            child = self[tag]
            print (prefix, tag, sep='')
            # Entries can be None (ScriptTable stores None when there is no
            # default language system), there is nothing to recurse into then
            if child is not None:
                child.dump(prefix=prefix+'  ')
 class IndexTable(list):
    '''A list of 16 bit values, stored as a 16 bit count followed by that
    many values.'''
    def __init__(self, raw, offset):
        reader = Unpackable(raw, offset)
        self.read_extra_header(reader)
        remaining = reader.unpack('H')
        while remaining > 0:
            self.append(reader.unpack('H'))
            remaining -= 1
    def read_extra_header(self, data):
        'Hook for subclasses: consume fields stored before the count'
        return None
    def dump(self, prefix=''):
        'Print the name of this class, indented by prefix'
        print(prefix, type(self).__name__, sep='')
 class LanguageSystemTable(IndexTable):
    '''An index list preceded by two 16 bit header fields: the lookup order
    (only 0 is accepted) and the required feature index.'''
    def read_extra_header(self, data):
        lookup_order, required_feature_index = data.unpack('2H')
        self.lookup_order = lookup_order
        self.required_feature_index = required_feature_index
        if lookup_order == 0:
            return
        raise UnsupportedFont('This LanguageSystemTable has an unknown'
                ' lookup order: 0x%x'%self.lookup_order)
 class ScriptTable(ListTable):
    '''Mapping of tag to LanguageSystemTable. The default language system is
    stored under the key b'default'; its value is None when the table has no
    default.'''
    child_class = LanguageSystemTable
    def read_extra_header(self, data):
        # The default offset is relative to the position the reader is at
        # when this hook is called
        table_start = data.offset
        default_offset = data.unpack('H')
        default_table = None
        if default_offset:
            default_table = LanguageSystemTable(data.raw,
                    table_start + default_offset)
        self[b'default'] = default_table
 class ScriptListTable(ListTable):
    'Mapping of tag to ScriptTable'
    child_class = ScriptTable
 class FeatureTable(IndexTable):
    'An index list preceded by a 16 bit FeatureParams field'
    def read_extra_header(self, data):
        params = data.unpack('H')
        self.feature_params = params
        # The check below is deliberately disabled: Source code pro sets this
        # to non NULL
        reject_non_null_params = False
        if reject_non_null_params and params != 0:
            raise UnsupportedFont(
                'This FeatureTable has non NULL FeatureParams: 0x%x'%self.feature_params)
 class FeatureListTable(ListTable):
    'Mapping of tag to FeatureTable'
    child_class = FeatureTable
 class LookupTable(SimpleListTable):
    '''A list of lookup subtables preceded by the lookup type and lookup
    flag. Subclasses implement set_child_class() to choose the class used to
    parse the subtables, based on self.lookup_type.'''
    def read_extra_header(self, data):
        lookup_type, lookup_flag = data.unpack('2H')
        self.lookup_type = lookup_type
        self.lookup_flag = lookup_flag
        self.set_child_class()
    def set_child_class(self):
        'Must be overridden to set self.child_class'
        raise NotImplementedError()
    def read_extra_footer(self, data):
        # When bit 0x0010 of the flag is set, a 16 bit mark filtering set
        # value follows the subtable offsets
        has_mark_filtering_set = self.lookup_flag & 0x0010
        if has_mark_filtering_set:
            self.mark_filtering_set = data.unpack('H')
 def ExtensionSubstitution(raw, offset, subtable_map=None):
    '''Read an extension substitution subtable at ``offset`` in ``raw`` and
    return the subtable it points to.
    ``subtable_map`` maps lookup type to the class used to parse a subtable
    of that type. The extension stores a format (must be 1), the lookup type
    of the real subtable and a 32 bit offset to it, relative to the start of
    the extension subtable.
    Raises UnsupportedFont for an unknown format or a lookup type that is not
    present in ``subtable_map``.'''
    data = Unpackable(raw, offset)
    subst_format, extension_lookup_type, extension_offset = data.unpack('2HL')
    if subst_format != 1:
        raise UnsupportedFont('ExtensionSubstitution has unknown format: 0x%x'%subst_format)
    # Report unknown types as an unsupported font rather than leaking a
    # bare KeyError from the dictionary lookup
    subtable_class = (subtable_map or {}).get(extension_lookup_type)
    if subtable_class is None:
        raise UnsupportedFont('ExtensionSubstitution has unknown lookup type: %d'%
                extension_lookup_type)
    return subtable_class(raw, extension_offset+data.start_pos)
 CoverageRange = namedtuple('CoverageRange', 'start end start_coverage_index')
 class Coverage(object):
    '''A coverage table, assigning a coverage index to each covered glyph id.
    Format 1 stores a list of glyph ids, the coverage index of a glyph being
    its position in the list. Format 2 stores (start, end,
    start_coverage_index) ranges, with both start and end included.'''
    def __init__(self, raw, offset, parent_table_name):
        reader = Unpackable(raw, offset)
        self.format, num_entries = reader.unpack('2H')
        if self.format == 1:
            self.glyph_ids = reader.unpack('%dH'%num_entries, single_special=False)
            self.glyph_ids_map = {}
            for index, gid in enumerate(self.glyph_ids):
                self.glyph_ids_map[gid] = index
        elif self.format == 2:
            fields = reader.unpack('%dH'%(3*num_entries), single_special=False)
            # The fields are consecutive (start, end, start_coverage_index)
            # triples
            self.ranges = [CoverageRange(start, end, start_coverage_index)
                    for start, end, start_coverage_index in
                    zip(fields[0::3], fields[1::3], fields[2::3])]
        else:
            raise UnsupportedFont('Unknown Coverage format: 0x%x in %s'%(
                self.format, parent_table_name))
    def coverage_indices(self, glyph_ids):
        '''Return an ordered map of glyph_id -> coverage index, containing
        only the members of glyph_ids that this table covers, in the order
        in which they are first seen in glyph_ids.'''
        found = OrderedDict()
        if self.format == 1:
            for gid in glyph_ids:
                index = self.glyph_ids_map.get(gid, None)
                if index is None:
                    continue
                found[gid] = index
            return found
        for gid in glyph_ids:
            # All ranges are checked, if several contain gid the last one wins
            for first, last, first_index in self.ranges:
                if first <= gid <= last:
                    found[gid] = first_index + (gid-first)
        return found
 class UnknownLookupSubTable(object):
    '''Base class for lookup subtables. Reads the 16 bit format, validates it
    against the ``formats`` of the subclass, reads the coverage table (unless
    has_initial_coverage is false) and then calls self.initialize(data) with
    the reader positioned just past the fields consumed so far.'''
    formats = {}
    def __init__(self, raw, offset):
        reader = Unpackable(raw, offset)
        self.format = reader.unpack('H')
        if self.format not in self.formats:
            raise UnsupportedFont('Unknown format for Lookup Subtable %s: 0x%x'%(
                self.__class__.__name__, self.format))
        if self.has_initial_coverage:
            # The coverage offset is relative to the start of the subtable
            coverage_pos = reader.unpack('H') + reader.start_pos
            self.coverage = Coverage(raw, coverage_pos, self.__class__.__name__)
        self.initialize(reader)
    @property
    def has_initial_coverage(self):
        'True if a coverage offset directly follows the format field'
        return True
    def all_substitutions(self, glyph_ids):
        ''' Return a set of all glyph ids that could be substituted for any
        subset of the specified glyph ids (which must be a set)'''
        raise NotImplementedError()
    def read_sets(self, data, read_item=None, set_is_index=False):
        '''Read a count followed by that many 16 bit offsets to sets (relative
        to data.start_pos). Each set is itself a count followed by that many
        16 bit entries. Returns one list of items per set, in stored order.
        If set_is_index is true the entries themselves are the items.
        Otherwise each entry is an offset relative to the start of its set:
        data is positioned there and read_item(data) produces the item.'''
        num_sets = data.unpack('H')
        set_offsets = data.unpack('%dH'%num_sets, single_special=False)
        coverage_to_items_map = []
        for set_offset in set_offsets:
            set_start = set_offset + data.start_pos
            data.offset = set_start
            num_entries = data.unpack('H')
            entries = data.unpack('%dH'%num_entries, single_special=False)
            items = []
            for entry in entries:
                data.offset = entry + set_start
                items.append(entry if set_is_index else read_item(data))
            coverage_to_items_map.append(items)
        return coverage_to_items_map
--- a/src/calibre/utils/fonts/sfnt/container.py
+++ b/src/calibre/utils/fonts/sfnt/container.py
@ -22,6 +22,7 @@ from calibre.utils.fonts.sfnt.loca import LocaTable
 from calibre.utils.fonts.sfnt.glyf import GlyfTable
 from calibre.utils.fonts.sfnt.cmap import CmapTable
 from calibre.utils.fonts.sfnt.kern import KernTable
 from calibre.utils.fonts.sfnt.gsub import GSUBTable
 from calibre.utils.fonts.sfnt.cff.table import CFFTable
 # OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm
@ -46,6 +47,7 @@ class Sfnt(object):
                    b'cmap' : CmapTable,
                    b'CFF ' : CFFTable,
                    b'kern' : KernTable,
                    b'GSUB' : GSUBTable,
                    }.get(table_tag, UnknownTable)(table)
    def __getitem__(self, key):
--- a/src/calibre/utils/fonts/sfnt/gsub.py
+++ b/src/calibre/utils/fonts/sfnt/gsub.py
@ -0,0 +1,180 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from struct import unpack_from
 from functools import partial
 from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 from calibre.utils.fonts.sfnt.common import (ScriptListTable, FeatureListTable,
        SimpleListTable, LookupTable, ExtensionSubstitution,
        UnknownLookupSubTable)
 class SingleSubstitution(UnknownLookupSubTable):
    '''Lookup subtable that replaces one glyph with one other glyph.
    Format 1 stores a signed 16 bit delta that is added to the covered glyph
    id. Format 2 stores a list of substitute glyph ids, indexed by coverage
    index.'''
    formats = {1, 2}
    def initialize(self, data):
        if self.format == 1:
            self.delta = data.unpack('h')
        else:
            count = data.unpack('H')
            self.substitutes = data.unpack('%dH'%count, single_special=False)
    def all_substitutions(self, glyph_ids):
        'Return the set of substitutes for the covered members of glyph_ids'
        gid_index_map = self.coverage.coverage_indices(glyph_ids)
        if self.format == 1:
            # Glyph ids are unsigned 16 bit values and the OpenType
            # specification defines this addition as modulo 65536, so wrap
            # around instead of producing negative or out of range ids
            return {(gid + self.delta) % 65536 for gid in gid_index_map}
        return {self.substitutes[i] for i in gid_index_map.itervalues()}
 class MultipleSubstitution(UnknownLookupSubTable):
    '''Lookup subtable that maps each covered glyph to a list of glyph ids,
    stored as one set of ids per coverage index.'''
    formats = {1}
    def initialize(self, data):
        # The set entries are the glyph ids themselves, not offsets
        self.coverage_to_subs_map = self.read_sets(data, set_is_index=True)
    def all_substitutions(self, glyph_ids):
        'Return the union of the glyph lists of all covered members of glyph_ids'
        covered = self.coverage.coverage_indices(glyph_ids)
        substitutes = set()
        for coverage_index in covered.itervalues():
            substitutes.update(self.coverage_to_subs_map[coverage_index])
        return substitutes
 class AlternateSubstitution(MultipleSubstitution):
    'Parsed and queried exactly like MultipleSubstitution'
 class LigatureSubstitution(UnknownLookupSubTable):
    '''Lookup subtable that replaces a sequence of glyphs with a single
    ligature glyph. For each coverage index there is a list of
    (ligature glyph id, components) records.'''
    formats = {1}
    def initialize(self, data):
        self.coverage_to_lig_map = self.read_sets(data, self.read_ligature)
    def read_ligature(self, data):
        '''Read one ligature record at the current position of data and return
        (ligature glyph id, tuple of component glyph ids). The tuple does not
        contain the first component, see below.'''
        lig_glyph, count = data.unpack('HH')
        # Per the OpenType specification the component count includes the
        # first component, which is the covered glyph and is not stored in
        # the component array. Only count - 1 glyph ids follow, reading count
        # of them would run into the data that comes after this record.
        components = data.unpack('%dH'%max(count - 1, 0), single_special=False)
        return (lig_glyph, components)
    def all_substitutions(self, glyph_ids):
        '''Return the set of ligature glyphs all of whose components are
        present in glyph_ids (which must be a set).'''
        gid_index_map = self.coverage.coverage_indices(glyph_ids)
        ans = set()
        for index in gid_index_map.itervalues():
            for glyph_id, components in self.coverage_to_lig_map[index]:
                if set(components).issubset(glyph_ids):
                    ans.add(glyph_id)
        return ans
 class ContexttualSubstitution(UnknownLookupSubTable):
    '''Contextual substitution subtable. Its contents are not parsed yet and
    it never contributes any substitute glyphs itself.'''
    formats = {1, 2, 3}
    @property
    def has_initial_coverage(self):
        # Format 3 is the only format that is not read with a coverage
        # offset directly after the format field
        if self.format == 3:
            return False
        return True
    def initialize(self, data):
        pass # TODO
    def all_substitutions(self, glyph_ids):
        # This table only defined substitution in terms of other tables
        no_glyphs = set()
        return no_glyphs
 class ChainingContextualSubstitution(UnknownLookupSubTable):
    '''Chaining contextual substitution subtable. Its contents are not parsed
    yet and it never contributes any substitute glyphs itself.'''
    formats = {1, 2, 3}
    @property
    def has_initial_coverage(self):
        # Format 3 is the only format that is not read with a coverage
        # offset directly after the format field
        if self.format == 3:
            return False
        return True
    def initialize(self, data):
        pass # TODO
    def all_substitutions(self, glyph_ids):
        # This table only defined substitution in terms of other tables
        no_glyphs = set()
        return no_glyphs
 class ReverseChainSingleSubstitution(UnknownLookupSubTable):
    '''Lookup subtable that stores one substitute glyph id per coverage
    index, after lists of backtrack and lookahead coverage offsets. The
    backtrack and lookahead data is skipped, so every covered glyph is
    treated as substitutable.'''
    formats = {1}
    def initialize(self, data):
        # TODO: Use the backtrack and lookahead coverage tables. Their
        # offsets are relative to data.start_pos. For now they are only read
        # to move past them.
        backtrack_count = data.unpack('H')
        data.unpack('%dH'%backtrack_count, single_special=False)
        lookahead_count = data.unpack('H')
        data.unpack('%dH'%lookahead_count, single_special=False)
        count = data.unpack('H')
        # single_special=False keeps this a tuple even when count == 1,
        # otherwise indexing it in all_substitutions() fails
        self.substitutes = data.unpack('%dH'%count, single_special=False)
    def all_substitutions(self, glyph_ids):
        'Return the set of substitutes for the covered members of glyph_ids'
        gid_index_map = self.coverage.coverage_indices(glyph_ids)
        return {self.substitutes[i] for i in gid_index_map.itervalues()}
 # Maps a GSUB lookup type to the class used to parse subtables of that type.
 # Lookup type 7 is not listed here, GSUBLookupTable.set_child_class() handles
 # it via ExtensionSubstitution, which in turn uses this map.
 subtable_map = {
        1: SingleSubstitution,
        2: MultipleSubstitution,
        3: AlternateSubstitution,
        4: LigatureSubstitution,
        5: ContexttualSubstitution,
        6: ChainingContextualSubstitution,
        8: ReverseChainSingleSubstitution,
 }
 class GSUBLookupTable(LookupTable):
    'A lookup table whose subtables are parsed according to self.lookup_type'
    def set_child_class(self):
        '''Choose the subtable parser for self.lookup_type. Raises
        UnsupportedFont if the lookup type is not known.'''
        if self.lookup_type == 7:
            # Extension subtables point at the real subtable, which is
            # parsed using the same type -> class map
            self.child_class = partial(ExtensionSubstitution,
                    subtable_map=subtable_map)
        else:
            try:
                self.child_class = subtable_map[self.lookup_type]
            except KeyError:
                # Report this like every other unsupported construct instead
                # of leaking a bare KeyError
                raise UnsupportedFont('Unknown GSUB lookup type: %d'%
                        self.lookup_type)
 class LookupListTable(SimpleListTable):
    'The list of GSUBLookupTable objects of a GSUB table'
    child_class = GSUBLookupTable
 class GSUBTable(UnknownTable):
    '''The GSUB table. Call decompile() to parse the script, feature and
    lookup lists before calling all_substitutions().'''
    version = FixedProperty('_version')
    def decompile(self):
        'Parse the table header and the three lists it points to'
        header = unpack_from(b'>L3H', self.raw)
        (self._version, self.scriptlist_offset, self.featurelist_offset,
                self.lookuplist_offset) = header
        if self._version != 0x10000:
            raise UnsupportedFont('The GSUB table has unknown version: 0x%x'%
                    self._version)
        raw = self.raw
        # For debugging, the script and feature list tables have a dump()
        # method
        self.script_list_table = ScriptListTable(raw, self.scriptlist_offset)
        self.feature_list_table = FeatureListTable(raw, self.featurelist_offset)
        self.lookup_list_table = LookupListTable(raw, self.lookuplist_offset)
    def all_substitutions(self, glyph_ids):
        '''Return the set of all glyph ids that any lookup subtable reports as
        a possible substitute for the specified glyph ids. Every subtable is
        queried once with the original glyph ids only.'''
        wanted = frozenset(glyph_ids)
        substitutes = set()
        for lookup_table in self.lookup_list_table:
            for subtable in lookup_table:
                substitutes |= subtable.all_substitutions(wanted)
        return substitutes
--- a/src/calibre/utils/fonts/sfnt/subset.py
+++ b/src/calibre/utils/fonts/sfnt/subset.py
@ -7,16 +7,18 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import traceback
 from collections import OrderedDict
 from operator import itemgetter
 from functools import partial
 from calibre.utils.fonts.sfnt.container import Sfnt
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
 # TrueType outlines {{{
-def resolve_glyphs(loca, glyf, character_map):
+def resolve_glyphs(loca, glyf, character_map, extra_glyphs):
-    unresolved_glyphs = set(character_map.itervalues())
+    unresolved_glyphs = set(character_map.itervalues()) | extra_glyphs
    unresolved_glyphs.add(0) # We always want the .notdef glyph
    resolved_glyphs = {}
@ -26,11 +28,7 @@ def resolve_glyphs(loca, glyf, character_map):
            offset, length = loca.glyph_location(glyph_id)
        except (IndexError, ValueError, KeyError, TypeError):
            continue
        if length < 1:
            continue
        glyph = glyf.glyph_data(offset, length)
        if len(glyph) == 0:
            continue
        resolved_glyphs[glyph_id] = glyph
        for gid in glyph.glyph_indices:
            if gid not in resolved_glyphs:
@ -38,7 +36,7 @@ def resolve_glyphs(loca, glyf, character_map):
    return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0)))
-def subset_truetype(sfnt, character_map):
+def subset_truetype(sfnt, character_map, extra_glyphs):
    loca = sfnt[b'loca']
    glyf = sfnt[b'glyf']
@ -48,7 +46,7 @@ def subset_truetype(sfnt, character_map):
        raise UnsupportedFont('This font does not contain head and/or maxp tables')
    loca.load_offsets(head, maxp)
-    resolved_glyphs = resolve_glyphs(loca, glyf, character_map)
+    resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs)
    if not resolved_glyphs or set(resolved_glyphs) == {0}:
        raise NoGlyphs('This font has no glyphs for the specified character '
                'set, subsetting it is pointless')
@ -66,26 +64,33 @@ def subset_truetype(sfnt, character_map):
 # }}}
-def subset_postscript(sfnt, character_map):
+def subset_postscript(sfnt, character_map, extra_glyphs):
    cff = sfnt[b'CFF ']
    cff.decompile()
-    cff.subset(character_map)
+    cff.subset(character_map, extra_glyphs)
-def subset(raw, individual_chars, ranges=()):
+def do_warn(warnings, *args):
-    chars = list(map(ord, individual_chars))
+    for arg in args:
        for line in arg.splitlines():
            if warnings is None:
                print(line)
            else:
                warnings.append(line)
    if warnings is None:
        print()
    else:
        warnings.append('')
 def subset(raw, individual_chars, ranges=(), warnings=None):
    warn = partial(do_warn, warnings)
    chars = set(map(ord, individual_chars))
    for r in ranges:
-        chars += list(xrange(ord(r[0]), ord(r[1])+1))
+        chars |= set(xrange(ord(r[0]), ord(r[1])+1))
-    # Hack pending parsing of the GSUB table, manually add in a few common
+    # Always add the space character for ease of use from the command line
-    # ligatures
+    if ord(' ') not in chars:
-    ligatures = {'AE':'Æ', 'ae':'æ', 'OE':'Œ', 'IJ':'Ĳ', 'ij':'ĳ', 'ue':'ᵫ',
+        chars.add(ord(' '))
            'ff':'ﬀ', 'fi':'ﬁ', 'fl':'ﬂ', 'ffi':'ﬃ', 'ffl':'ﬄ', 'st':'ﬆ'}
    all_chars = set(chars)
    for ichars, lig in ligatures.iteritems():
        ichars = frozenset(map(ord, ichars))
        if ichars.issubset(all_chars) and ord(lig) not in all_chars:
            all_chars.add(ord(lig))
            chars.append(ord(lig))
    sfnt = Sfnt(raw)
    old_sizes = sfnt.sizes()
@ -113,12 +118,26 @@ def subset(raw, individual_chars, ranges=()):
    # Get mapping of chars to glyph ids for all specified chars
    character_map = cmap.get_character_map(chars)
    extra_glyphs = set()
    if b'GSUB' in sfnt:
        # Parse all substitution rules to ensure that glyphs that can be
        # substituted for the specified set of glyphs are not removed
        gsub = sfnt[b'GSUB']
        try:
            gsub.decompile()
            extra_glyphs = gsub.all_substitutions(character_map.itervalues())
        except UnsupportedFont as e:
            warn('Usupported GSUB table: %s'%e)
        except Exception as e:
            warn('Failed to decompile GSUB table:', traceback.format_exc())
    if b'loca' in sfnt and b'glyf' in sfnt:
        # TrueType Outlines
-        subset_truetype(sfnt, character_map)
+        subset_truetype(sfnt, character_map, extra_glyphs)
    elif b'CFF ' in sfnt:
        # PostScript Outlines
-        subset_postscript(sfnt, character_map)
+        subset_postscript(sfnt, character_map, extra_glyphs)
    else:
        raise UnsupportedFont('This font does not contain TrueType '
                'or PostScript outlines')
@ -130,11 +149,10 @@ def subset(raw, individual_chars, ranges=()):
        try:
            sfnt[b'kern'].restrict_to_glyphs(frozenset(character_map.itervalues()))
        except UnsupportedFont as e:
-            print ('Subsetting of kern table failed, ignoring: %s'%e)
+            warn('kern table unsupported, ignoring: %s'%e)
        except Exception as e:
-            print ('Subsetting of kern table failed, ignoring')
+            warn('Subsetting of kern table failed, ignoring:',
-            import traceback
+                    traceback.format_exc())
            traceback.print_exc()
    raw, new_sizes = sfnt()
    return raw, old_sizes, new_sizes
@ -264,14 +282,20 @@ def all():
    from calibre.utils.fonts.scanner import font_scanner
    failed = []
    unsupported = []
    warnings = {}
    total = 0
    averages = []
    for family in font_scanner.find_font_families():
        for font in font_scanner.fonts_for_family(family):
            raw = font_scanner.get_font_data(font)
            print ('Subsetting', font['full_name'], end='\t')
            total += 1
            try:
-                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
+                w = []
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')),
                        (), w)
                if w:
                    warnings[font['full_name'] + ' (%s)'%font['path']] = w
            except NoGlyphs:
                print('No glyphs!')
                continue
@ -283,22 +307,29 @@ def all():
                print ('Failed!')
                failed.append((font['full_name'], font['path'], unicode(e)))
            else:
-                print ('Reduced to:', '%.1f'%(
+                averages.append(sum(new_stats.itervalues())/sum(old_stats.itervalues())
-                        sum(new_stats.itervalues())/sum(old_stats.itervalues())
+                        * 100)
-                        * 100), '%')
+                print ('Reduced to:', '%.1f'%averages[-1] , '%')
    if unsupported:
        print ('\n\nUnsupported:')
        for name, path, err in unsupported:
            print (name, path, err)
            print()
    if warnings:
        print ('\n\nWarnings:')
    for name, w in warnings.iteritems():
        if w:
            print (name)
            print('', '\n\t'.join(w), sep='\t')
    if failed:
        print ('\n\nFailures:')
        for name, path, err in failed:
            print (name, path, err)
            print()
    print ('Average reduction to: %.1f%%'%( sum(averages)/len(averages)))
    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
-            len(failed))
+            len(failed), 'Warnings:', len(warnings))
 # }}}