Font subsetting: Parse the GSUB table for glyph substitution rules and do not remove any glyphs that could act as substitutes. Keep zero-length glyphs, like the glyphs for non-printable characters, when subsetting TrueType outlines.

This commit is contained in:
Kovid Goyal 2012-11-10 17:32:13 +05:30
parent e494412887
commit b4a49e5cdd
5 changed files with 491 additions and 35 deletions

View File

@ -186,7 +186,7 @@ class CFFTable(UnknownTable):
def decompile(self): def decompile(self):
self.cff = CFF(self.raw) self.cff = CFF(self.raw)
def subset(self, character_map): def subset(self, character_map, extra_glyphs):
from calibre.utils.fonts.sfnt.cff.writer import Subset from calibre.utils.fonts.sfnt.cff.writer import Subset
# Map codes from the cmap table to glyph names, this will be used to # Map codes from the cmap table to glyph names, this will be used to
# reconstruct character_map for the subset font # reconstruct character_map for the subset font
@ -196,6 +196,9 @@ class CFFTable(UnknownTable):
charset.discard(None) charset.discard(None)
if not charset: if not charset:
raise NoGlyphs('This font has no glyphs for the specified characters') raise NoGlyphs('This font has no glyphs for the specified characters')
charset |= {
self.cff.charset.safe_lookup(glyph_id) for glyph_id in extra_glyphs}
charset.discard(None)
s = Subset(self.cff, charset) s = Subset(self.cff, charset)
# Rebuild character_map with the glyph ids from the subset font # Rebuild character_map with the glyph ids from the subset font

View File

@ -0,0 +1,240 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize
from collections import OrderedDict, namedtuple
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
class Unpackable(object):
    '''Sequential big-endian reader over a bytes buffer.

    ``offset`` is the current read position and is advanced by every call to
    :meth:`unpack`. ``start_pos`` remembers the position the reader was
    created at, so callers can resolve offsets relative to it.
    '''

    def __init__(self, raw, offset):
        self.raw, self.offset = raw, offset
        self.start_pos = offset

    def unpack(self, fmt, single_special=True):
        '''Read the values described by ``fmt`` at the current offset.

        :param fmt: struct format codes without a byte-order prefix; ``>``
            (big-endian, standard sizes, no padding) is always prepended.
        :param single_special: when True, a one-element result is returned
            as the bare value instead of a 1-tuple.
        :return: the unpacked tuple, or a single value (see above).
        :raises struct.error: if the buffer is too short for ``fmt``.
        '''
        if not isinstance(fmt, bytes):
            fmt = fmt.encode('ascii')
        full_fmt = b'>' + fmt
        ans = unpack_from(full_fmt, self.raw, self.offset)
        if single_special and len(ans) == 1:
            ans = ans[0]
        # Measure with the same prefixed format that was used for reading.
        # Without the prefix calcsize() uses native sizes and alignment (for
        # example 'HL' is 16 bytes on 64-bit Linux instead of 6), which would
        # move the offset past the data that was actually consumed.
        self.offset += calcsize(full_fmt)
        return ans
class SimpleListTable(list):
    '''A list of subtables.

    The table is read as: optional extra header, a 16-bit count, then one
    16-bit offset per subtable (relative to the start of this table),
    followed by an optional extra footer.
    '''

    # Set by subclasses; called as child_class(raw, absolute_offset)
    child_class = None

    def __init__(self, raw, offset):
        super(SimpleListTable, self).__init__()
        reader = Unpackable(raw, offset)
        self.read_extra_header(reader)
        remaining = reader.unpack('H')
        while remaining > 0:
            relative = reader.unpack('H')
            self.append(self.child_class(raw, reader.start_pos + relative))
            remaining -= 1
        self.read_extra_footer(reader)

    def read_extra_header(self, data):
        'Hook for subclasses: read any fields that precede the count'
        return None

    def read_extra_footer(self, data):
        'Hook for subclasses: read any fields that follow the offsets'
        return None
class ListTable(OrderedDict):
    '''A table that contains an ordered mapping of table tag to subtable.

    The table is read as: optional extra header, a 16-bit count, then for
    each entry a 4-byte tag and a 16-bit offset (relative to the start of
    this table), followed by an optional extra footer. Every entry is
    parsed with ``child_class(raw, absolute_offset)``.
    '''

    # Set by subclasses to the class used to parse each subtable
    child_class = None

    def __init__(self, raw, offset):
        OrderedDict.__init__(self)
        data = Unpackable(raw, offset)
        self.read_extra_header(data)
        count = data.unpack('H')
        for _ in range(count):
            tag, coffset = data.unpack('4sH')
            self[tag] = self.child_class(raw, data.start_pos + coffset)
        self.read_extra_footer(data)

    def read_extra_header(self, data):
        'Hook for subclasses: read any fields that precede the count'
        pass

    def read_extra_footer(self, data):
        'Hook for subclasses: read any fields that follow the records'
        pass

    def dump(self, prefix=''):
        'Print this table and, recursively, its subtables (for debugging)'
        print(prefix, self.__class__.__name__, sep='')
        prefix += ' '
        for tag, child in self.items():
            print(prefix, tag, sep='')
            # A subclass may store None for an optional subtable that is
            # absent from the font; there is nothing further to dump then.
            if child is not None:
                child.dump(prefix=prefix+' ')
class IndexTable(list):
    '''A list of 16-bit values.

    The table is read as: optional extra header, a 16-bit count, then that
    many unsigned 16-bit values, which become the items of this list.
    '''

    def __init__(self, raw, offset):
        reader = Unpackable(raw, offset)
        self.read_extra_header(reader)
        num = reader.unpack('H')
        self.extend(reader.unpack('%dH' % num, single_special=False))

    def read_extra_header(self, data):
        'Hook for subclasses: read any fields that precede the count'
        return None

    def dump(self, prefix=''):
        'Print the name of this table (for debugging)'
        name = self.__class__.__name__
        print(prefix, name, sep='')
class LanguageSystemTable(IndexTable):
    '''An IndexTable preceded by a (lookup order, required feature index)
    header. Only a lookup order of zero is accepted.'''

    def read_extra_header(self, data):
        header = data.unpack('2H')
        self.lookup_order = header[0]
        self.required_feature_index = header[1]
        if self.lookup_order == 0:
            return
        raise UnsupportedFont(
            'This LanguageSystemTable has an unknown lookup order: 0x%x' %
            self.lookup_order)
class ScriptTable(ListTable):
    '''Mapping of tag to LanguageSystemTable.

    The header holds a 16-bit offset to a default LanguageSystemTable. It is
    stored under the key ``b'default'``, with the value None when the offset
    is zero.
    '''

    child_class = LanguageSystemTable

    def __init__(self, raw, offset):
        super(ScriptTable, self).__init__(raw, offset)

    def read_extra_header(self, data):
        # Remember where the table starts before the read advances the offset
        table_start = data.offset
        relative = data.unpack('H')
        default = None
        if relative:
            default = LanguageSystemTable(data.raw, table_start + relative)
        self[b'default'] = default
class ScriptListTable(ListTable):
    'An ordered mapping of tag to ScriptTable'

    child_class = ScriptTable
class FeatureTable(IndexTable):
    '''An IndexTable preceded by a 16-bit FeatureParams field, which is
    stored in ``feature_params``.'''

    def read_extra_header(self, data):
        # FeatureParams is recorded but deliberately not validated: a check
        # that rejected non NULL values was disabled because real fonts
        # (Source Code Pro) set this field to a non NULL value.
        self.feature_params = data.unpack('H')
class FeatureListTable(ListTable):
    'An ordered mapping of tag to FeatureTable'

    child_class = FeatureTable
class LookupTable(SimpleListTable):
    '''A list of lookup subtables with a (lookup type, lookup flag) header.

    Subclasses implement :meth:`set_child_class` to choose the subtable
    parser once ``lookup_type`` is known.
    '''

    def read_extra_header(self, data):
        fields = data.unpack('2H')
        self.lookup_type = fields[0]
        self.lookup_flag = fields[1]
        self.set_child_class()

    def set_child_class(self):
        'Set ``self.child_class`` based on ``self.lookup_type``'
        raise NotImplementedError()

    def read_extra_footer(self, data):
        # The trailing 16-bit field is read only when bit 0x0010 of the
        # lookup flag is set
        has_filter_field = bool(self.lookup_flag & 0x0010)
        if has_filter_field:
            self.mark_filtering_set = data.unpack('H')
def ExtensionSubstitution(raw, offset, subtable_map=None):
    '''Parse an extension subtable and return the subtable it points to.

    The extension subtable holds a format, the lookup type of the real
    subtable and a 32-bit offset to it, relative to the start of the
    extension subtable.

    :param raw: the raw table data.
    :param offset: position of the extension subtable in ``raw``.
    :param subtable_map: mapping of lookup type to the class used to parse
        subtables of that type.
    :return: ``subtable_map[extension_lookup_type](raw, absolute_offset)``.
    :raises UnsupportedFont: if the format is not 1, or the lookup type has
        no entry in ``subtable_map``.
    '''
    data = Unpackable(raw, offset)
    subst_format, extension_lookup_type, offset = data.unpack('2HL')
    if subst_format != 1:
        raise UnsupportedFont('ExtensionSubstitution has unknown format: 0x%x'%subst_format)
    # None instead of a shared mutable {} default; an unknown type is
    # reported as UnsupportedFont rather than leaking a bare KeyError.
    subtable_class = (subtable_map or {}).get(extension_lookup_type)
    if subtable_class is None:
        raise UnsupportedFont(
            'ExtensionSubstitution has unknown lookup type: 0x%x' %
            extension_lookup_type)
    return subtable_class(raw, offset+data.start_pos)
CoverageRange = namedtuple('CoverageRange', 'start end start_coverage_index')


class Coverage(object):
    '''A coverage table: the set of glyph ids a subtable applies to, each
    with a coverage index.

    Format 1 stores an explicit list of glyph ids (index = position in the
    list). Format 2 stores (start, end, start_coverage_index) ranges.
    '''

    def __init__(self, raw, offset, parent_table_name):
        reader = Unpackable(raw, offset)
        header = reader.unpack('2H')
        self.format = header[0]
        count = header[1]
        if self.format != 1 and self.format != 2:
            raise UnsupportedFont('Unknown Coverage format: 0x%x in %s'%(
                self.format, parent_table_name))
        if self.format == 1:
            self.glyph_ids = reader.unpack('%dH'%count, single_special=False)
            self.glyph_ids_map = dict(
                (gid, pos) for pos, gid in enumerate(self.glyph_ids))
            return
        # Format 2: a flat run of 3*count values, three per range
        flat = reader.unpack('%dH'%(3*count), single_special=False)
        self.ranges = [
            CoverageRange(*triple)
            for triple in zip(flat[0::3], flat[1::3], flat[2::3])]

    def coverage_indices(self, glyph_ids):
        '''Return an ordered map of glyph_id -> coverage index, restricted
        to the members of glyph_ids that this table covers. Entries appear
        in the iteration order of glyph_ids.'''
        covered = OrderedDict()
        if self.format == 1:
            for gid in glyph_ids:
                pos = self.glyph_ids_map.get(gid)
                if pos is not None:
                    covered[gid] = pos
            return covered
        for gid in glyph_ids:
            for rng in self.ranges:
                first, last, base = rng
                if first <= gid <= last:
                    covered[gid] = base + (gid - first)
        return covered
class UnknownLookupSubTable(object):
    '''Base class for lookup subtables.

    Reads the 16-bit format, checks it against ``formats``, optionally reads
    the Coverage table that follows it, and then calls ``initialize(data)``,
    which subclasses must provide.
    '''

    # Formats a subclass can parse
    formats = {}

    def __init__(self, raw, offset):
        reader = Unpackable(raw, offset)
        self.format = reader.unpack('H')
        if self.format not in self.formats:
            kind = self.__class__.__name__
            raise UnsupportedFont('Unknown format for Lookup Subtable %s: 0x%x'%(
                kind, self.format))
        if self.has_initial_coverage:
            # The coverage offset is relative to the start of the subtable
            coverage_at = reader.start_pos + reader.unpack('H')
            self.coverage = Coverage(raw, coverage_at, self.__class__.__name__)
        self.initialize(reader)

    @property
    def has_initial_coverage(self):
        'True if a Coverage offset directly follows the format field'
        return True

    def all_substitutions(self, glyph_ids):
        ''' Return a set of all glyph ids that could be substituted for any
        subset of the specified glyph ids (which must be a set)'''
        raise NotImplementedError()

    def read_sets(self, data, read_item=None, set_is_index=False):
        '''Read a counted array of offsets to sets and return a list with
        one list of items per set, in the same order as the offsets.

        Each set is a counted array of 16-bit values. When set_is_index is
        True these values are themselves the items. Otherwise each value is
        an offset from the start of the set, and ``read_item(data)`` is
        called with ``data`` positioned there to produce the item.
        '''
        num_sets = data.unpack('H')
        set_offsets = data.unpack('%dH'%num_sets, single_special=False)
        result = []
        for set_offset in set_offsets:
            set_start = set_offset + data.start_pos
            data.offset = set_start
            num_items = data.unpack('H')
            entries = data.unpack('%dH'%num_items, single_special=False)
            members = []
            for entry in entries:
                data.offset = entry + set_start
                members.append(entry if set_is_index else read_item(data))
            result.append(members)
        return result

View File

@ -22,6 +22,7 @@ from calibre.utils.fonts.sfnt.loca import LocaTable
from calibre.utils.fonts.sfnt.glyf import GlyfTable from calibre.utils.fonts.sfnt.glyf import GlyfTable
from calibre.utils.fonts.sfnt.cmap import CmapTable from calibre.utils.fonts.sfnt.cmap import CmapTable
from calibre.utils.fonts.sfnt.kern import KernTable from calibre.utils.fonts.sfnt.kern import KernTable
from calibre.utils.fonts.sfnt.gsub import GSUBTable
from calibre.utils.fonts.sfnt.cff.table import CFFTable from calibre.utils.fonts.sfnt.cff.table import CFFTable
# OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm # OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm
@ -46,6 +47,7 @@ class Sfnt(object):
b'cmap' : CmapTable, b'cmap' : CmapTable,
b'CFF ' : CFFTable, b'CFF ' : CFFTable,
b'kern' : KernTable, b'kern' : KernTable,
b'GSUB' : GSUBTable,
}.get(table_tag, UnknownTable)(table) }.get(table_tag, UnknownTable)(table)
def __getitem__(self, key): def __getitem__(self, key):

View File

@ -0,0 +1,180 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from functools import partial
from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.common import (ScriptListTable, FeatureListTable,
SimpleListTable, LookupTable, ExtensionSubstitution,
UnknownLookupSubTable)
class SingleSubstitution(UnknownLookupSubTable):
    '''Replace one glyph with one glyph.

    Format 1 adds a signed 16-bit delta to the covered glyph id. Format 2
    stores an explicit array of substitutes, indexed by coverage index.
    '''

    formats = {1, 2}

    def initialize(self, data):
        if self.format == 1:
            self.delta = data.unpack('h')
        else:
            count = data.unpack('H')
            self.substitutes = data.unpack('%dH'%count, single_special=False)

    def all_substitutions(self, glyph_ids):
        'Return the set of substitutes for the covered members of glyph_ids'
        gid_index_map = self.coverage.coverage_indices(glyph_ids)
        if self.format == 1:
            # Glyph ids are unsigned 16-bit values and the delta addition
            # is defined modulo 65536, so wrap instead of producing a
            # negative or out of range glyph id.
            return {(gid + self.delta) & 0xFFFF for gid in gid_index_map}
        return {self.substitutes[i] for i in gid_index_map.values()}
class MultipleSubstitution(UnknownLookupSubTable):
    '''Substitution in which each covered glyph maps to a list of glyph
    ids, stored per coverage index.'''

    formats = {1}

    def initialize(self, data):
        # The values stored in each set are the glyph ids themselves
        self.coverage_to_subs_map = self.read_sets(data, set_is_index=True)

    def all_substitutions(self, glyph_ids):
        'Return every glyph id listed for the covered members of glyph_ids'
        covered = self.coverage.coverage_indices(glyph_ids)
        found = set()
        for idx in covered.values():
            found.update(self.coverage_to_subs_map[idx])
        return found
class AlternateSubstitution(MultipleSubstitution):
    '''Parsed and queried exactly like MultipleSubstitution; nothing is
    overridden.'''
class LigatureSubstitution(UnknownLookupSubTable):
    '''Replace a sequence of glyphs with a single ligature glyph.

    For every covered (first) glyph there is a set of ligatures, each
    stored as ``(ligature glyph id, remaining component glyph ids)``.
    '''

    formats = {1}

    def initialize(self, data):
        self.coverage_to_lig_map = self.read_sets(data, self.read_ligature)

    def read_ligature(self, data):
        '''Read one ligature record at the current position of ``data`` and
        return ``(lig_glyph, components)``.'''
        lig_glyph, count = data.unpack('HH')
        # The component count includes the first component, which is the
        # covered glyph itself and is not stored in the array, so only
        # count - 1 ids follow. Reading count ids would swallow an
        # unrelated value and make the subset test below fail spuriously.
        components = data.unpack('%dH'%max(count - 1, 0), single_special=False)
        return (lig_glyph, components)

    def all_substitutions(self, glyph_ids):
        '''Return the ligature glyphs whose first component is covered and
        whose remaining components are all present in glyph_ids.'''
        gid_index_map = self.coverage.coverage_indices(glyph_ids)
        ans = set()
        for index in gid_index_map.values():
            for glyph_id, components in self.coverage_to_lig_map[index]:
                if set(components).issubset(glyph_ids):
                    ans.add(glyph_id)
        return ans
class ContexttualSubstitution(UnknownLookupSubTable):
    '''Contextual substitution subtable. Only the format (and, except for
    format 3, the coverage table) is read; the rules are not parsed.'''

    formats = {1, 2, 3}

    @property
    def has_initial_coverage(self):
        if self.format == 3:
            return False
        return True

    def initialize(self, data):
        # TODO: the rule data is not parsed yet
        return None

    def all_substitutions(self, glyph_ids):
        # This table only defines substitutions in terms of other tables,
        # so it contributes no glyphs of its own
        no_glyphs = set()
        return no_glyphs
class ChainingContextualSubstitution(UnknownLookupSubTable):
    '''Chaining contextual substitution subtable. Only the format (and,
    except for format 3, the coverage table) is read; the rules are not
    parsed.'''

    formats = {1, 2, 3}

    @property
    def has_initial_coverage(self):
        return not self.format == 3

    def initialize(self, data):
        # TODO: the rule data is not parsed yet
        return

    def all_substitutions(self, glyph_ids):
        # This table only defines substitutions in terms of other tables,
        # so the result is always empty
        return set()
class ReverseChainSingleSubstitution(UnknownLookupSubTable):
    '''Single substitution with backtrack and lookahead context.

    The context offsets are read and stored but not used yet; every covered
    glyph is treated as substitutable.
    '''

    formats = {1}

    def initialize(self, data):
        backtrack_count = data.unpack('H')
        backtrack_offsets = data.unpack('%dH'%backtrack_count,
                single_special=False)
        lookahead_count = data.unpack('H')
        lookahead_offsets = data.unpack('%dH'%lookahead_count,
                single_special=False)
        # TODO: Use these. Stored as positions in the raw table data.
        self.backtrack_offsets = [data.start_pos + x for x in backtrack_offsets]
        self.lookahead_offsets = [data.start_pos + x for x in lookahead_offsets]
        count = data.unpack('H')
        # single_special=False keeps this a tuple even when count == 1;
        # otherwise a lone substitute is returned as a bare int and indexing
        # it in all_substitutions() raises TypeError.
        self.substitutes = data.unpack('%dH'%count, single_special=False)

    def all_substitutions(self, glyph_ids):
        'Return the set of substitutes for the covered members of glyph_ids'
        gid_index_map = self.coverage.coverage_indices(glyph_ids)
        return {self.substitutes[i] for i in gid_index_map.values()}
# Maps a GSUB lookup type number to the class that parses subtables of that
# type. Type 7 has no entry here: GSUBLookupTable.set_child_class handles it
# separately by routing through ExtensionSubstitution.
subtable_map = {
    1: SingleSubstitution,
    2: MultipleSubstitution,
    3: AlternateSubstitution,
    4: LigatureSubstitution,
    5: ContexttualSubstitution,
    6: ChainingContextualSubstitution,
    8: ReverseChainSingleSubstitution,
}
class GSUBLookupTable(LookupTable):
    'A lookup table whose subtables are parsed according to its lookup type'

    def set_child_class(self):
        '''Choose the subtable parser for ``self.lookup_type``.

        :raises UnsupportedFont: if the lookup type is not known.
        '''
        if self.lookup_type == 7:
            # Extension lookups carry the real lookup type inside each
            # subtable, so resolution is deferred to ExtensionSubstitution
            self.child_class = partial(ExtensionSubstitution,
                    subtable_map=subtable_map)
            return
        try:
            self.child_class = subtable_map[self.lookup_type]
        except KeyError:
            # Report an unknown type the same way unknown formats are
            # reported, instead of leaking a bare KeyError
            raise UnsupportedFont(
                'Unknown GSUB lookup type: 0x%x' % self.lookup_type)
class LookupListTable(SimpleListTable):
    'A list of GSUBLookupTable objects'

    child_class = GSUBLookupTable
class GSUBTable(UnknownTable):
    '''The GSUB table. Call :meth:`decompile` to parse ``self.raw`` before
    using :meth:`all_substitutions`.'''

    version = FixedProperty('_version')

    def decompile(self):
        '''Parse the header and the script, feature and lookup lists.

        Raises UnsupportedFont if the table version is not 0x10000.
        '''
        header = unpack_from(b'>L3H', self.raw)
        self._version = header[0]
        self.scriptlist_offset = header[1]
        self.featurelist_offset = header[2]
        self.lookuplist_offset = header[3]
        if self._version != 0x10000:
            raise UnsupportedFont('The GSUB table has unknown version: 0x%x'%
                self._version)
        raw = self.raw
        self.script_list_table = ScriptListTable(raw, self.scriptlist_offset)
        # self.script_list_table.dump()
        self.feature_list_table = FeatureListTable(raw, self.featurelist_offset)
        # self.feature_list_table.dump()
        self.lookup_list_table = LookupListTable(raw, self.lookuplist_offset)

    def all_substitutions(self, glyph_ids):
        '''Return the union, over every subtable of every lookup, of the
        glyph ids that subtable reports as substitutes for glyph_ids.'''
        wanted = frozenset(glyph_ids)
        found = set()
        for lookup in self.lookup_list_table:
            for subtable in lookup:
                found.update(subtable.all_substitutions(wanted))
        return found

View File

@ -7,16 +7,18 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import traceback
from collections import OrderedDict from collections import OrderedDict
from operator import itemgetter from operator import itemgetter
from functools import partial
from calibre.utils.fonts.sfnt.container import Sfnt from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
# TrueType outlines {{{ # TrueType outlines {{{
def resolve_glyphs(loca, glyf, character_map): def resolve_glyphs(loca, glyf, character_map, extra_glyphs):
unresolved_glyphs = set(character_map.itervalues()) unresolved_glyphs = set(character_map.itervalues()) | extra_glyphs
unresolved_glyphs.add(0) # We always want the .notdef glyph unresolved_glyphs.add(0) # We always want the .notdef glyph
resolved_glyphs = {} resolved_glyphs = {}
@ -26,11 +28,7 @@ def resolve_glyphs(loca, glyf, character_map):
offset, length = loca.glyph_location(glyph_id) offset, length = loca.glyph_location(glyph_id)
except (IndexError, ValueError, KeyError, TypeError): except (IndexError, ValueError, KeyError, TypeError):
continue continue
if length < 1:
continue
glyph = glyf.glyph_data(offset, length) glyph = glyf.glyph_data(offset, length)
if len(glyph) == 0:
continue
resolved_glyphs[glyph_id] = glyph resolved_glyphs[glyph_id] = glyph
for gid in glyph.glyph_indices: for gid in glyph.glyph_indices:
if gid not in resolved_glyphs: if gid not in resolved_glyphs:
@ -38,7 +36,7 @@ def resolve_glyphs(loca, glyf, character_map):
return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0))) return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0)))
def subset_truetype(sfnt, character_map): def subset_truetype(sfnt, character_map, extra_glyphs):
loca = sfnt[b'loca'] loca = sfnt[b'loca']
glyf = sfnt[b'glyf'] glyf = sfnt[b'glyf']
@ -48,7 +46,7 @@ def subset_truetype(sfnt, character_map):
raise UnsupportedFont('This font does not contain head and/or maxp tables') raise UnsupportedFont('This font does not contain head and/or maxp tables')
loca.load_offsets(head, maxp) loca.load_offsets(head, maxp)
resolved_glyphs = resolve_glyphs(loca, glyf, character_map) resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs)
if not resolved_glyphs or set(resolved_glyphs) == {0}: if not resolved_glyphs or set(resolved_glyphs) == {0}:
raise NoGlyphs('This font has no glyphs for the specified character ' raise NoGlyphs('This font has no glyphs for the specified character '
'set, subsetting it is pointless') 'set, subsetting it is pointless')
@ -66,26 +64,33 @@ def subset_truetype(sfnt, character_map):
# }}} # }}}
def subset_postscript(sfnt, character_map): def subset_postscript(sfnt, character_map, extra_glyphs):
cff = sfnt[b'CFF '] cff = sfnt[b'CFF ']
cff.decompile() cff.decompile()
cff.subset(character_map) cff.subset(character_map, extra_glyphs)
def subset(raw, individual_chars, ranges=()): def do_warn(warnings, *args):
chars = list(map(ord, individual_chars)) for arg in args:
for line in arg.splitlines():
if warnings is None:
print(line)
else:
warnings.append(line)
if warnings is None:
print()
else:
warnings.append('')
def subset(raw, individual_chars, ranges=(), warnings=None):
warn = partial(do_warn, warnings)
chars = set(map(ord, individual_chars))
for r in ranges: for r in ranges:
chars += list(xrange(ord(r[0]), ord(r[1])+1)) chars |= set(xrange(ord(r[0]), ord(r[1])+1))
# Hack pending parsing of the GSUB table, manually add in a few common # Always add the space character for ease of use from the command line
# ligatures if ord(' ') not in chars:
ligatures = {'AE':'Æ', 'ae':'æ', 'OE':'Œ', 'IJ':'IJ', 'ij':'ij', 'ue':'', chars.add(ord(' '))
'ff':'', 'fi':'', 'fl':'', 'ffi':'', 'ffl':'', 'st':''}
all_chars = set(chars)
for ichars, lig in ligatures.iteritems():
ichars = frozenset(map(ord, ichars))
if ichars.issubset(all_chars) and ord(lig) not in all_chars:
all_chars.add(ord(lig))
chars.append(ord(lig))
sfnt = Sfnt(raw) sfnt = Sfnt(raw)
old_sizes = sfnt.sizes() old_sizes = sfnt.sizes()
@ -113,12 +118,26 @@ def subset(raw, individual_chars, ranges=()):
# Get mapping of chars to glyph ids for all specified chars # Get mapping of chars to glyph ids for all specified chars
character_map = cmap.get_character_map(chars) character_map = cmap.get_character_map(chars)
extra_glyphs = set()
if b'GSUB' in sfnt:
# Parse all substitution rules to ensure that glyphs that can be
# substituted for the specified set of glyphs are not removed
gsub = sfnt[b'GSUB']
try:
gsub.decompile()
extra_glyphs = gsub.all_substitutions(character_map.itervalues())
except UnsupportedFont as e:
warn('Usupported GSUB table: %s'%e)
except Exception as e:
warn('Failed to decompile GSUB table:', traceback.format_exc())
if b'loca' in sfnt and b'glyf' in sfnt: if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines # TrueType Outlines
subset_truetype(sfnt, character_map) subset_truetype(sfnt, character_map, extra_glyphs)
elif b'CFF ' in sfnt: elif b'CFF ' in sfnt:
# PostScript Outlines # PostScript Outlines
subset_postscript(sfnt, character_map) subset_postscript(sfnt, character_map, extra_glyphs)
else: else:
raise UnsupportedFont('This font does not contain TrueType ' raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines') 'or PostScript outlines')
@ -130,11 +149,10 @@ def subset(raw, individual_chars, ranges=()):
try: try:
sfnt[b'kern'].restrict_to_glyphs(frozenset(character_map.itervalues())) sfnt[b'kern'].restrict_to_glyphs(frozenset(character_map.itervalues()))
except UnsupportedFont as e: except UnsupportedFont as e:
print ('Subsetting of kern table failed, ignoring: %s'%e) warn('kern table unsupported, ignoring: %s'%e)
except Exception as e: except Exception as e:
print ('Subsetting of kern table failed, ignoring') warn('Subsetting of kern table failed, ignoring:',
import traceback traceback.format_exc())
traceback.print_exc()
raw, new_sizes = sfnt() raw, new_sizes = sfnt()
return raw, old_sizes, new_sizes return raw, old_sizes, new_sizes
@ -264,14 +282,20 @@ def all():
from calibre.utils.fonts.scanner import font_scanner from calibre.utils.fonts.scanner import font_scanner
failed = [] failed = []
unsupported = [] unsupported = []
warnings = {}
total = 0 total = 0
averages = []
for family in font_scanner.find_font_families(): for family in font_scanner.find_font_families():
for font in font_scanner.fonts_for_family(family): for font in font_scanner.fonts_for_family(family):
raw = font_scanner.get_font_data(font) raw = font_scanner.get_font_data(font)
print ('Subsetting', font['full_name'], end='\t') print ('Subsetting', font['full_name'], end='\t')
total += 1 total += 1
try: try:
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ()) w = []
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')),
(), w)
if w:
warnings[font['full_name'] + ' (%s)'%font['path']] = w
except NoGlyphs: except NoGlyphs:
print('No glyphs!') print('No glyphs!')
continue continue
@ -283,22 +307,29 @@ def all():
print ('Failed!') print ('Failed!')
failed.append((font['full_name'], font['path'], unicode(e))) failed.append((font['full_name'], font['path'], unicode(e)))
else: else:
print ('Reduced to:', '%.1f'%( averages.append(sum(new_stats.itervalues())/sum(old_stats.itervalues())
sum(new_stats.itervalues())/sum(old_stats.itervalues()) * 100)
* 100), '%') print ('Reduced to:', '%.1f'%averages[-1] , '%')
if unsupported: if unsupported:
print ('\n\nUnsupported:') print ('\n\nUnsupported:')
for name, path, err in unsupported: for name, path, err in unsupported:
print (name, path, err) print (name, path, err)
print() print()
if warnings:
print ('\n\nWarnings:')
for name, w in warnings.iteritems():
if w:
print (name)
print('', '\n\t'.join(w), sep='\t')
if failed: if failed:
print ('\n\nFailures:') print ('\n\nFailures:')
for name, path, err in failed: for name, path, err in failed:
print (name, path, err) print (name, path, err)
print() print()
print ('Average reduction to: %.1f%%'%( sum(averages)/len(averages)))
print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:', print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
len(failed)) len(failed), 'Warnings:', len(warnings))
# }}} # }}}