From b4a49e5cdde70a7284ea54e7a664e925ddd8429c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 10 Nov 2012 17:32:13 +0530 Subject: [PATCH] Font subsetting: Parse the GSUB table for glyph substitution rules and do not remove any glyphs that could act as substitutes. Keep zero length glyphs like the glyphs for non printable characters when subsetting TrueType outlines. --- src/calibre/utils/fonts/sfnt/cff/table.py | 5 +- src/calibre/utils/fonts/sfnt/common.py | 240 ++++++++++++++++++++++ src/calibre/utils/fonts/sfnt/container.py | 2 + src/calibre/utils/fonts/sfnt/gsub.py | 180 ++++++++++++++++ src/calibre/utils/fonts/sfnt/subset.py | 99 ++++++--- 5 files changed, 491 insertions(+), 35 deletions(-) create mode 100644 src/calibre/utils/fonts/sfnt/common.py create mode 100644 src/calibre/utils/fonts/sfnt/gsub.py diff --git a/src/calibre/utils/fonts/sfnt/cff/table.py b/src/calibre/utils/fonts/sfnt/cff/table.py index 63d85b65ec..fa3a5207a0 100644 --- a/src/calibre/utils/fonts/sfnt/cff/table.py +++ b/src/calibre/utils/fonts/sfnt/cff/table.py @@ -186,7 +186,7 @@ class CFFTable(UnknownTable): def decompile(self): self.cff = CFF(self.raw) - def subset(self, character_map): + def subset(self, character_map, extra_glyphs): from calibre.utils.fonts.sfnt.cff.writer import Subset # Map codes from the cmap table to glyph names, this will be used to # reconstruct character_map for the subset font @@ -196,6 +196,9 @@ class CFFTable(UnknownTable): charset.discard(None) if not charset: raise NoGlyphs('This font has no glyphs for the specified characters') + charset |= { + self.cff.charset.safe_lookup(glyph_id) for glyph_id in extra_glyphs} + charset.discard(None) s = Subset(self.cff, charset) # Rebuild character_map with the glyph ids from the subset font diff --git a/src/calibre/utils/fonts/sfnt/common.py b/src/calibre/utils/fonts/sfnt/common.py new file mode 100644 index 0000000000..49ba77a28e --- /dev/null +++ b/src/calibre/utils/fonts/sfnt/common.py @@ -0,0 +1,240 @@ 
+#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from, calcsize +from collections import OrderedDict, namedtuple + +from calibre.utils.fonts.sfnt.errors import UnsupportedFont + +class Unpackable(object): + + def __init__(self, raw, offset): + self.raw, self.offset = raw, offset + self.start_pos = offset + + def unpack(self, fmt, single_special=True): + fmt = fmt.encode('ascii') if not isinstance(fmt, bytes) else fmt + ans = unpack_from(b'>'+fmt, self.raw, self.offset) + if single_special and len(ans) == 1: + ans = ans[0] + self.offset += calcsize(fmt) + return ans + +class SimpleListTable(list): + + 'A table that contains a list of subtables' + + child_class = None + + def __init__(self, raw, offset): + list.__init__(self) + + data = Unpackable(raw, offset) + self.read_extra_header(data) + + count = data.unpack('H') + for i in xrange(count): + offset = data.unpack('H') + self.append(self.child_class(raw, data.start_pos + offset)) + self.read_extra_footer(data) + + def read_extra_header(self, data): + pass + + def read_extra_footer(self, data): + pass + +class ListTable(OrderedDict): + + 'A table that contains an ordered mapping of table tag to subtable' + + child_class = None + + def __init__(self, raw, offset): + OrderedDict.__init__(self) + + data = Unpackable(raw, offset) + self.read_extra_header(data) + + count = data.unpack('H') + for i in xrange(count): + tag, coffset = data.unpack('4sH') + self[tag] = self.child_class(raw, data.start_pos + coffset) + + self.read_extra_footer(data) + + def read_extra_header(self, data): + pass + + def read_extra_footer(self, data): + pass + + def dump(self, prefix=''): + print (prefix, self.__class__.__name__, sep='') + prefix += ' ' + for tag, child in self.iteritems(): + 
print (prefix, tag, sep='') + child.dump(prefix=prefix+' ') + + +class IndexTable(list): + + def __init__(self, raw, offset): + data = Unpackable(raw, offset) + self.read_extra_header(data) + + count = data.unpack('H') + for i in xrange(count): + self.append(data.unpack('H')) + + def read_extra_header(self, data): + pass + + def dump(self, prefix=''): + print(prefix, self.__class__.__name__, sep='') + +class LanguageSystemTable(IndexTable): + + def read_extra_header(self, data): + self.lookup_order, self.required_feature_index = data.unpack('2H') + if self.lookup_order != 0: + raise UnsupportedFont('This LanguageSystemTable has an unknown' + ' lookup order: 0x%x'%self.lookup_order) + +class ScriptTable(ListTable): + + child_class = LanguageSystemTable + + def __init__(self, raw, offset): + ListTable.__init__(self, raw, offset) + + def read_extra_header(self, data): + start_pos = data.offset + default_offset = data.unpack('H') + self[b'default'] = (LanguageSystemTable(data.raw, start_pos + + default_offset) if default_offset else None) + +class ScriptListTable(ListTable): + + child_class = ScriptTable + +class FeatureTable(IndexTable): + + def read_extra_header(self, data): + self.feature_params = data.unpack('H') + if False and self.feature_params != 0: + # Source code pro sets this to non NULL + raise UnsupportedFont( + 'This FeatureTable has non NULL FeatureParams: 0x%x'%self.feature_params) + +class FeatureListTable(ListTable): + + child_class = FeatureTable + +class LookupTable(SimpleListTable): + + def read_extra_header(self, data): + self.lookup_type, self.lookup_flag = data.unpack('2H') + self.set_child_class() + + def set_child_class(self): + raise NotImplementedError() + + def read_extra_footer(self, data): + if self.lookup_flag & 0x0010: + self.mark_filtering_set = data.unpack('H') + +def ExtensionSubstitution(raw, offset, subtable_map={}): + data = Unpackable(raw, offset) + subst_format, extension_lookup_type, offset = data.unpack('2HL') + if 
subst_format != 1: + raise UnsupportedFont('ExtensionSubstitution has unknown format: 0x%x'%subst_format) + return subtable_map[extension_lookup_type](raw, offset+data.start_pos) + +CoverageRange = namedtuple('CoverageRange', 'start end start_coverage_index') + +class Coverage(object): + + def __init__(self, raw, offset, parent_table_name): + data = Unpackable(raw, offset) + self.format, count = data.unpack('2H') + + if self.format not in {1, 2}: + raise UnsupportedFont('Unknown Coverage format: 0x%x in %s'%( + self.format, parent_table_name)) + if self.format == 1: + self.glyph_ids = data.unpack('%dH'%count, single_special=False) + self.glyph_ids_map = {gid:i for i, gid in + enumerate(self.glyph_ids)} + else: + self.ranges = [] + ranges = data.unpack('%dH'%(3*count), single_special=False) + for i in xrange(count): + start, end, start_coverage_index = ranges[i*3:(i+1)*3] + self.ranges.append(CoverageRange(start, end, start_coverage_index)) + + def coverage_indices(self, glyph_ids): + '''Return map of glyph_id -> coverage index. 
Map contains only those + glyph_ids that are covered by this table and that are present in + glyph_ids.''' + ans = OrderedDict() + for gid in glyph_ids: + if self.format == 1: + idx = self.glyph_ids_map.get(gid, None) + if idx is not None: + ans[gid] = idx + else: + for start, end, start_coverage_index in self.ranges: + if start <= gid <= end: + ans[gid] = start_coverage_index + (gid-start) + return ans + +class UnknownLookupSubTable(object): + + formats = {} + + def __init__(self, raw, offset): + data = Unpackable(raw, offset) + self.format = data.unpack('H') + if self.format not in self.formats: + raise UnsupportedFont('Unknown format for Lookup Subtable %s: 0x%x'%( + self.__class__.__name__, self.format)) + if self.has_initial_coverage: + coverage_offset = data.unpack('H') + data.start_pos + self.coverage = Coverage(raw, coverage_offset, self.__class__.__name__) + self.initialize(data) + + @property + def has_initial_coverage(self): + return True + + def all_substitutions(self, glyph_ids): + ''' Return a set of all glyph ids that could be substituted for any + subset of the specified glyph ids (which must be a set)''' + raise NotImplementedError() + + def read_sets(self, data, read_item=None, set_is_index=False): + count = data.unpack('H') + sets = data.unpack('%dH'%count, single_special=False) + coverage_to_items_map = [] + for offset in sets: + # Read items in the set + data.offset = start_pos = offset + data.start_pos + count = data.unpack('H') + item_offsets = data.unpack('%dH'%count, single_special=False) + items = [] + for offset in item_offsets: + data.offset = offset + start_pos + if set_is_index: + items.append(offset) + else: + items.append(read_item(data)) + coverage_to_items_map.append(items) + return coverage_to_items_map + diff --git a/src/calibre/utils/fonts/sfnt/container.py b/src/calibre/utils/fonts/sfnt/container.py index cf207d0bd1..92246fe1a9 100644 --- a/src/calibre/utils/fonts/sfnt/container.py +++ 
b/src/calibre/utils/fonts/sfnt/container.py @@ -22,6 +22,7 @@ from calibre.utils.fonts.sfnt.loca import LocaTable from calibre.utils.fonts.sfnt.glyf import GlyfTable from calibre.utils.fonts.sfnt.cmap import CmapTable from calibre.utils.fonts.sfnt.kern import KernTable +from calibre.utils.fonts.sfnt.gsub import GSUBTable from calibre.utils.fonts.sfnt.cff.table import CFFTable # OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm @@ -46,6 +47,7 @@ class Sfnt(object): b'cmap' : CmapTable, b'CFF ' : CFFTable, b'kern' : KernTable, + b'GSUB' : GSUBTable, }.get(table_tag, UnknownTable)(table) def __getitem__(self, key): diff --git a/src/calibre/utils/fonts/sfnt/gsub.py b/src/calibre/utils/fonts/sfnt/gsub.py new file mode 100644 index 0000000000..77d7db8519 --- /dev/null +++ b/src/calibre/utils/fonts/sfnt/gsub.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from +from functools import partial + +from calibre.utils.fonts.sfnt import UnknownTable, FixedProperty +from calibre.utils.fonts.sfnt.errors import UnsupportedFont +from calibre.utils.fonts.sfnt.common import (ScriptListTable, FeatureListTable, + SimpleListTable, LookupTable, ExtensionSubstitution, + UnknownLookupSubTable) + +class SingleSubstitution(UnknownLookupSubTable): + + formats = {1, 2} + + def initialize(self, data): + if self.format == 1: + self.delta = data.unpack('h') + else: + count = data.unpack('H') + self.substitutes = data.unpack('%dH'%count, single_special=False) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + if self.format == 1: + return {gid + self.delta for gid in gid_index_map} + return {self.substitutes[i] for i in gid_index_map.itervalues()} + 
+class MultipleSubstitution(UnknownLookupSubTable): + + formats = {1} + + def initialize(self, data): + self.coverage_to_subs_map = self.read_sets(data, set_is_index=True) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + ans = set() + for index in gid_index_map.itervalues(): + glyphs = set(self.coverage_to_subs_map[index]) + ans |= glyphs + return ans + +class AlternateSubstitution(MultipleSubstitution): + pass + +class LigatureSubstitution(UnknownLookupSubTable): + + formats = {1} + + def initialize(self, data): + self.coverage_to_lig_map = self.read_sets(data, self.read_ligature) + + def read_ligature(self, data): + lig_glyph, count = data.unpack('HH') + components = data.unpack('%dH'%count, single_special=False) + return (lig_glyph, components) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + ans = set() + for index in gid_index_map.itervalues(): + for glyph_id, components in self.coverage_to_lig_map[index]: + if set(components).issubset(glyph_ids): + ans.add(glyph_id) + return ans + +class ContexttualSubstitution(UnknownLookupSubTable): + + formats = {1, 2, 3} + + @property + def has_initial_coverage(self): + return self.format != 3 + + def initialize(self, data): + pass # TODO + + def all_substitutions(self, glyph_ids): + # This table only defined substitution in terms of other tables + return set() + + +class ChainingContextualSubstitution(UnknownLookupSubTable): + + formats = {1, 2, 3} + + @property + def has_initial_coverage(self): + return self.format != 3 + + def initialize(self, data): + pass # TODO + + def all_substitutions(self, glyph_ids): + # This table only defined substitution in terms of other tables + return set() + +class ReverseChainSingleSubstitution(UnknownLookupSubTable): + + formats = {1} + + def initialize(self, data): + backtrack_count = data.unpack('H') + backtrack_offsets = data.unpack('%dH'%backtrack_count, + 
single_special=False) + lookahead_count = data.unpack('H') + lookahead_offsets = data.unpack('%dH'%lookahead_count, + single_special=False) + backtrack_offsets = [data.start_pos + x for x in backtrack_offsets] + lookahead_offsets = [data.start_pos + x for x in lookahead_offsets] + backtrack_offsets, lookahead_offsets # TODO: Use these + count = data.unpack('H') + self.substitutes = data.unpack('%dH'%count) + + def all_substitutions(self, glyph_ids): + gid_index_map = self.coverage.coverage_indices(glyph_ids) + return {self.substitutes[i] for i in gid_index_map.itervalues()} + +subtable_map = { + 1: SingleSubstitution, + 2: MultipleSubstitution, + 3: AlternateSubstitution, + 4: LigatureSubstitution, + 5: ContexttualSubstitution, + 6: ChainingContextualSubstitution, + 8: ReverseChainSingleSubstitution, +} + +class GSUBLookupTable(LookupTable): + + def set_child_class(self): + if self.lookup_type == 7: + self.child_class = partial(ExtensionSubstitution, + subtable_map=subtable_map) + else: + self.child_class = subtable_map[self.lookup_type] + +class LookupListTable(SimpleListTable): + + child_class = GSUBLookupTable + +class GSUBTable(UnknownTable): + + version = FixedProperty('_version') + + def decompile(self): + (self._version, self.scriptlist_offset, self.featurelist_offset, + self.lookuplist_offset) = unpack_from(b'>L3H', self.raw) + if self._version != 0x10000: + raise UnsupportedFont('The GSUB table has unknown version: 0x%x'% + self._version) + + self.script_list_table = ScriptListTable(self.raw, + self.scriptlist_offset) + # self.script_list_table.dump() + + self.feature_list_table = FeatureListTable(self.raw, + self.featurelist_offset) + # self.feature_list_table.dump() + + self.lookup_list_table = LookupListTable(self.raw, + self.lookuplist_offset) + + def all_substitutions(self, glyph_ids): + ans = set() + glyph_ids = frozenset(glyph_ids) + for lookup_table in self.lookup_list_table: + for subtable in lookup_table: + gids = 
subtable.all_substitutions(glyph_ids) + ans |= gids + return ans + diff --git a/src/calibre/utils/fonts/sfnt/subset.py b/src/calibre/utils/fonts/sfnt/subset.py index 829963700e..1b4a351dab 100644 --- a/src/calibre/utils/fonts/sfnt/subset.py +++ b/src/calibre/utils/fonts/sfnt/subset.py @@ -7,16 +7,18 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import traceback from collections import OrderedDict from operator import itemgetter +from functools import partial from calibre.utils.fonts.sfnt.container import Sfnt from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs # TrueType outlines {{{ -def resolve_glyphs(loca, glyf, character_map): - unresolved_glyphs = set(character_map.itervalues()) +def resolve_glyphs(loca, glyf, character_map, extra_glyphs): + unresolved_glyphs = set(character_map.itervalues()) | extra_glyphs unresolved_glyphs.add(0) # We always want the .notdef glyph resolved_glyphs = {} @@ -26,11 +28,7 @@ def resolve_glyphs(loca, glyf, character_map): offset, length = loca.glyph_location(glyph_id) except (IndexError, ValueError, KeyError, TypeError): continue - if length < 1: - continue glyph = glyf.glyph_data(offset, length) - if len(glyph) == 0: - continue resolved_glyphs[glyph_id] = glyph for gid in glyph.glyph_indices: if gid not in resolved_glyphs: @@ -38,7 +36,7 @@ def resolve_glyphs(loca, glyf, character_map): return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0))) -def subset_truetype(sfnt, character_map): +def subset_truetype(sfnt, character_map, extra_glyphs): loca = sfnt[b'loca'] glyf = sfnt[b'glyf'] @@ -48,7 +46,7 @@ def subset_truetype(sfnt, character_map): raise UnsupportedFont('This font does not contain head and/or maxp tables') loca.load_offsets(head, maxp) - resolved_glyphs = resolve_glyphs(loca, glyf, character_map) + resolved_glyphs = resolve_glyphs(loca, glyf, character_map, extra_glyphs) if not resolved_glyphs or set(resolved_glyphs) == {0}: raise 
NoGlyphs('This font has no glyphs for the specified character ' 'set, subsetting it is pointless') @@ -66,26 +64,33 @@ def subset_truetype(sfnt, character_map): # }}} -def subset_postscript(sfnt, character_map): +def subset_postscript(sfnt, character_map, extra_glyphs): cff = sfnt[b'CFF '] cff.decompile() - cff.subset(character_map) + cff.subset(character_map, extra_glyphs) -def subset(raw, individual_chars, ranges=()): - chars = list(map(ord, individual_chars)) +def do_warn(warnings, *args): + for arg in args: + for line in arg.splitlines(): + if warnings is None: + print(line) + else: + warnings.append(line) + if warnings is None: + print() + else: + warnings.append('') + +def subset(raw, individual_chars, ranges=(), warnings=None): + warn = partial(do_warn, warnings) + + chars = set(map(ord, individual_chars)) for r in ranges: - chars += list(xrange(ord(r[0]), ord(r[1])+1)) + chars |= set(xrange(ord(r[0]), ord(r[1])+1)) - # Hack pending parsing of the GSUB table, manually add in a few common - # ligatures - ligatures = {'AE':'Æ', 'ae':'æ', 'OE':'Œ', 'IJ':'IJ', 'ij':'ij', 'ue':'ᵫ', - 'ff':'ff', 'fi':'fi', 'fl':'fl', 'ffi':'ffi', 'ffl':'ffl', 'st':'st'} - all_chars = set(chars) - for ichars, lig in ligatures.iteritems(): - ichars = frozenset(map(ord, ichars)) - if ichars.issubset(all_chars) and ord(lig) not in all_chars: - all_chars.add(ord(lig)) - chars.append(ord(lig)) + # Always add the space character for ease of use from the command line + if ord(' ') not in chars: + chars.add(ord(' ')) sfnt = Sfnt(raw) old_sizes = sfnt.sizes() @@ -113,12 +118,26 @@ def subset(raw, individual_chars, ranges=()): # Get mapping of chars to glyph ids for all specified chars character_map = cmap.get_character_map(chars) + extra_glyphs = set() + + if b'GSUB' in sfnt: + # Parse all substitution rules to ensure that glyphs that can be + # substituted for the specified set of glyphs are not removed + gsub = sfnt[b'GSUB'] + try: + gsub.decompile() + extra_glyphs = 
gsub.all_substitutions(character_map.itervalues()) + except UnsupportedFont as e: + warn('Usupported GSUB table: %s'%e) + except Exception as e: + warn('Failed to decompile GSUB table:', traceback.format_exc()) + if b'loca' in sfnt and b'glyf' in sfnt: # TrueType Outlines - subset_truetype(sfnt, character_map) + subset_truetype(sfnt, character_map, extra_glyphs) elif b'CFF ' in sfnt: # PostScript Outlines - subset_postscript(sfnt, character_map) + subset_postscript(sfnt, character_map, extra_glyphs) else: raise UnsupportedFont('This font does not contain TrueType ' 'or PostScript outlines') @@ -130,11 +149,10 @@ def subset(raw, individual_chars, ranges=()): try: sfnt[b'kern'].restrict_to_glyphs(frozenset(character_map.itervalues())) except UnsupportedFont as e: - print ('Subsetting of kern table failed, ignoring: %s'%e) + warn('kern table unsupported, ignoring: %s'%e) except Exception as e: - print ('Subsetting of kern table failed, ignoring') - import traceback - traceback.print_exc() + warn('Subsetting of kern table failed, ignoring:', + traceback.format_exc()) raw, new_sizes = sfnt() return raw, old_sizes, new_sizes @@ -264,14 +282,20 @@ def all(): from calibre.utils.fonts.scanner import font_scanner failed = [] unsupported = [] + warnings = {} total = 0 + averages = [] for family in font_scanner.find_font_families(): for font in font_scanner.fonts_for_family(family): raw = font_scanner.get_font_data(font) print ('Subsetting', font['full_name'], end='\t') total += 1 try: - sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ()) + w = [] + sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), + (), w) + if w: + warnings[font['full_name'] + ' (%s)'%font['path']] = w except NoGlyphs: print('No glyphs!') continue @@ -283,22 +307,29 @@ def all(): print ('Failed!') failed.append((font['full_name'], font['path'], unicode(e))) else: - print ('Reduced to:', '%.1f'%( - sum(new_stats.itervalues())/sum(old_stats.itervalues()) - * 100), '%') + 
averages.append(sum(new_stats.itervalues())/sum(old_stats.itervalues()) + * 100) + print ('Reduced to:', '%.1f'%averages[-1] , '%') if unsupported: print ('\n\nUnsupported:') for name, path, err in unsupported: print (name, path, err) print() + if warnings: + print ('\n\nWarnings:') + for name, w in warnings.iteritems(): + if w: + print (name) + print('', '\n\t'.join(w), sep='\t') if failed: print ('\n\nFailures:') for name, path, err in failed: print (name, path, err) print() + print ('Average reduction to: %.1f%%'%( sum(averages)/len(averages))) print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:', - len(failed)) + len(failed), 'Warnings:', len(warnings)) # }}}