Switch to a pure python implementation of font subsetting

This commit is contained in:
Kovid Goyal 2012-11-06 11:39:48 +05:30
parent 18db66fd77
commit f54843c547
10 changed files with 383 additions and 13 deletions

View File

@ -212,7 +212,7 @@ def main(args=sys.argv):
return return
if len(args) > 1 and args[1] in ('-f', '--subset-font'): if len(args) > 1 and args[1] in ('-f', '--subset-font'):
from calibre.utils.fonts.subset import main from calibre.utils.fonts.sfnt.subset import main
main(['subset-font']+args[2:]) main(['subset-font']+args[2:])
return return

View File

@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en'
from collections import defaultdict from collections import defaultdict
from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.base import urlnormalize
from calibre.utils.fonts.subset import subset, NoGlyphs, UnsupportedFont from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
class SubsetFonts(object): class SubsetFonts(object):

View File

@ -26,6 +26,9 @@ class UnknownTable(object):
def __call__(self): def __call__(self):
return self.raw return self.raw
def __len__(self):
    """Return the length of the raw data held by this table."""
    return len(self.raw)
class DateTimeProperty(object): class DateTimeProperty(object):
def __init__(self, name): def __init__(self, name):
@ -46,10 +49,10 @@ class FixedProperty(object):
def __get__(self, obj, type=None): def __get__(self, obj, type=None):
val = getattr(obj, self.name) val = getattr(obj, self.name)
return val * (2**-16) return val / 0x10000
def __set__(self, obj, val): def __set__(self, obj, val):
return int(round(val*(2**16))) return int(round(val*(0x10000)))
def max_power_of_two(x): def max_power_of_two(x):
""" """
@ -62,4 +65,10 @@ def max_power_of_two(x):
exponent += 1 exponent += 1
return max(exponent - 1, 0) return max(exponent - 1, 0)
def load_font(stream_or_path):
    """
    Build an :class:`Sfnt` container from font data.

    If ``stream_or_path`` has a ``read()`` method it is called and its result
    is used as the font data, otherwise the argument itself is handed to
    ``Sfnt`` unchanged.

    NOTE(review): despite the parameter name, a value without ``read()`` is
    not opened as a file here, it is passed straight to ``Sfnt`` -- confirm
    that callers pass raw font data rather than a filesystem path.
    """
    data = stream_or_path.read() if hasattr(stream_or_path, 'read') \
        else stream_or_path
    # Imported lazily, inside the function, exactly as the original did
    from calibre.utils.fonts.sfnt.container import Sfnt
    return Sfnt(data)

View File

@ -144,6 +144,7 @@ class CmapTable(UnknownTable):
except IndexError: except IndexError:
next_offset = len(self.raw) next_offset = len(self.raw)
table = self.raw[offset:next_offset] table = self.raw[offset:next_offset]
if table:
fmt = unpack_from(b'>H', table)[0] fmt = unpack_from(b'>H', table)[0]
if platform == 3 and encoding == 1 and fmt == 4: if platform == 3 and encoding == 1 and fmt == 4:
self.bmp_table = table self.bmp_table = table

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
from struct import pack, calcsize from struct import pack, calcsize
from io import BytesIO from io import BytesIO
from collections import OrderedDict
from calibre.utils.fonts.utils import (get_tables, checksum_of_block, from calibre.utils.fonts.utils import (get_tables, checksum_of_block,
verify_checksums) verify_checksums)
@ -18,6 +19,8 @@ from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.head import HeadTable from calibre.utils.fonts.sfnt.head import HeadTable
from calibre.utils.fonts.sfnt.maxp import MaxpTable from calibre.utils.fonts.sfnt.maxp import MaxpTable
from calibre.utils.fonts.sfnt.loca import LocaTable from calibre.utils.fonts.sfnt.loca import LocaTable
from calibre.utils.fonts.sfnt.glyf import GlyfTable
from calibre.utils.fonts.sfnt.cmap import CmapTable
class Sfnt(object): class Sfnt(object):
@ -35,6 +38,8 @@ class Sfnt(object):
b'head' : HeadTable, b'head' : HeadTable,
b'maxp' : MaxpTable, b'maxp' : MaxpTable,
b'loca' : LocaTable, b'loca' : LocaTable,
b'glyf' : GlyfTable,
b'cmap' : CmapTable,
}.get(table_tag, UnknownTable)(table) }.get(table_tag, UnknownTable)(table)
def __getitem__(self, key): def __getitem__(self, key):
@ -49,6 +54,12 @@ class Sfnt(object):
def pop(self, key, default=None): def pop(self, key, default=None):
return self.tables.pop(key, default) return self.tables.pop(key, default)
def sizes(self):
    """
    Return an OrderedDict mapping every table tag to ``len()`` of that
    table, with the tags in sorted order.
    """
    return OrderedDict((tag, len(self[tag])) for tag in sorted(self.tables))
def __call__(self): def __call__(self):
stream = BytesIO() stream = BytesIO()
@ -68,6 +79,7 @@ class Sfnt(object):
head_offset = None head_offset = None
table_data = [] table_data = []
offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables ) offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables )
sizes = OrderedDict()
for tag in sorted(self.tables): for tag in sorted(self.tables):
table = self.tables[tag] table = self.tables[tag]
raw = table() raw = table()
@ -80,6 +92,7 @@ class Sfnt(object):
spack(b'>4s3L', tag, checksum, offset, table_len) spack(b'>4s3L', tag, checksum, offset, table_len)
offset += len(raw) offset += len(raw)
table_data.append(raw) table_data.append(raw)
sizes[tag] = table_len
for x in table_data: for x in table_data:
stream.write(x) stream.write(x)
@ -89,7 +102,7 @@ class Sfnt(object):
stream.seek(head_offset + 8) stream.seek(head_offset + 8)
spack(b'>L', q) spack(b'>L', q)
return stream.getvalue() return stream.getvalue(), sizes
def test_roundtrip(ff=None): def test_roundtrip(ff=None):
if ff is None: if ff is None:
@ -97,7 +110,7 @@ def test_roundtrip(ff=None):
else: else:
with open(ff, 'rb') as f: with open(ff, 'rb') as f:
data = f.read() data = f.read()
rd = Sfnt(data)() rd = Sfnt(data)()[0]
verify_checksums(rd) verify_checksums(rd)
if data[:12] != rd[:12]: if data[:12] != rd[:12]:
raise ValueError('Roundtripping failed, font header not the same') raise ValueError('Roundtripping failed, font header not the same')

View File

@ -10,3 +10,6 @@ __docformat__ = 'restructuredtext en'
class UnsupportedFont(ValueError): class UnsupportedFont(ValueError):
pass pass
class NoGlyphs(ValueError):
    """
    Raised when subsetting would leave no useful glyphs, i.e. the font has
    no glyphs for the requested character set.
    """
    pass

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from collections import OrderedDict
from calibre.utils.fonts.sfnt import UnknownTable
# Flags found in every component record of a composite glyph description
ARG_1_AND_2_ARE_WORDS      = 0x0001  # if set args are words otherwise they are bytes
ARGS_ARE_XY_VALUES         = 0x0002  # if set args are xy values, otherwise they are points
ROUND_XY_TO_GRID           = 0x0004  # for the xy values if above is true
WE_HAVE_A_SCALE            = 0x0008  # Sx = Sy, otherwise scale == 1.0
NON_OVERLAPPING            = 0x0010  # set to same value for all components (obsolete!)
MORE_COMPONENTS            = 0x0020  # indicates at least one more glyph after this one
WE_HAVE_AN_X_AND_Y_SCALE   = 0x0040  # Sx, Sy
WE_HAVE_A_TWO_BY_TWO       = 0x0080  # t00, t01, t10, t11
WE_HAVE_INSTRUCTIONS       = 0x0100  # instructions follow
USE_MY_METRICS             = 0x0200  # apply these metrics to parent glyph
OVERLAP_COMPOUND           = 0x0400  # used by Apple in GX fonts
SCALED_COMPONENT_OFFSET    = 0x0800  # composite designed to have the component offset scaled (designed for Apple)
UNSCALED_COMPONENT_OFFSET  = 0x1000  # composite designed not to have the component offset scaled (designed for MS)

# Every glyph description starts with a header of five 16-bit values
# (numberOfContours, xMin, yMin, xMax, yMax). The component records of a
# composite glyph only begin after it.
_GLYPH_HEADER_SIZE = 10


class SimpleGlyph(object):

    '''
    A glyph that does not reference any other glyphs.

    :param num_of_countours: The numberOfContours value read from the glyph
        header
    :param raw: The complete glyph description (header included) as a byte
        string

    ``len(glyph)`` is the size of the raw data and ``glyph()`` returns it.
    '''

    def __init__(self, num_of_countours, raw):
        self.num_of_countours = num_of_countours
        self.raw = raw
        # The list of glyph indices referred to by this glyph, will always be
        # empty for a simple glyph and not empty for a composite glyph
        self.glyph_indices = []
        self.is_composite = False

    def __len__(self):
        return len(self.raw)

    def __call__(self):
        return self.raw


class CompositeGlyph(SimpleGlyph):

    '''
    A glyph built out of other glyphs. On construction the component records
    are walked and the index of every referenced glyph is collected into
    ``self.glyph_indices``, in the order in which they occur.

    Raises ``struct.error`` if ``raw`` is too short to hold the component
    records announced by the MORE_COMPONENTS flags.
    '''

    def __init__(self, num_of_countours, raw):
        super(CompositeGlyph, self).__init__(num_of_countours, raw)
        self.is_composite = True

        flags = MORE_COMPONENTS
        # raw still contains the glyph header, so the first component record
        # is located after it and not at the start of the data
        offset = _GLYPH_HEADER_SIZE
        while flags & MORE_COMPONENTS:
            flags, glyph_index = unpack_from(b'>HH', raw, offset)
            self.glyph_indices.append(glyph_index)
            offset += 4
            # Skip the two placement arguments (words or bytes)
            if flags & ARG_1_AND_2_ARE_WORDS:
                offset += 4
            else:
                offset += 2
            # Skip the optional transformation, at most one form is present
            if flags & WE_HAVE_A_SCALE:
                offset += 2
            elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
                offset += 4
            elif flags & WE_HAVE_A_TWO_BY_TWO:
                offset += 8
class GlyfTable(UnknownTable):

    '''
    The table holding the glyph descriptions, addressed by (offset, length)
    pairs into its raw data.
    '''

    def glyph_data(self, offset, length):
        '''
        Return a glyph object for the ``length`` bytes of raw data starting
        at ``offset``. A negative contour count in the first two bytes yields
        a CompositeGlyph, anything else (including empty data) a SimpleGlyph.
        '''
        chunk = self.raw[offset:offset+length]
        contour_count = 0
        if chunk:
            contour_count = unpack_from(b'>h', chunk)[0]
        glyph_class = CompositeGlyph if contour_count < 0 else SimpleGlyph
        return glyph_class(contour_count, chunk)

    def update(self, sorted_glyph_map):
        '''
        Replace the raw data of this table with the concatenation of the
        glyphs in ``sorted_glyph_map`` (a mapping of glyph id to glyph
        object), in iteration order.

        Returns an OrderedDict mapping each glyph id to the (offset, size) of
        its data inside the new raw data.
        '''
        locations = OrderedDict()
        pieces = []
        position = 0
        for glyph_id, glyph in sorted_glyph_map.items():
            data = glyph()
            size = len(data)
            locations[glyph_id] = (position, size)
            pieces.append(data)
            position += size
        self.raw = b''.join(pieces)
        return locations

View File

@ -7,7 +7,8 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from struct import calcsize, unpack_from from struct import calcsize, unpack_from, pack
from operator import itemgetter
from calibre.utils.fonts.sfnt import UnknownTable from calibre.utils.fonts.sfnt import UnknownTable
@ -23,9 +24,43 @@ class LocaTable(UnknownTable):
self.offset_map = self.offset_map[:num_glyphs+1] self.offset_map = self.offset_map[:num_glyphs+1]
if fmt == 'H': if fmt == 'H':
self.offset_map = [2*i for i in self.offset_map] self.offset_map = [2*i for i in self.offset_map]
self.fmt = fmt
def glyph_location(self, glyph_id): def glyph_location(self, glyph_id):
offset = self.offset_map[glyph_id] offset = self.offset_map[glyph_id]
next_offset = self.offset_map[glyph_id+1] next_offset = self.offset_map[glyph_id+1]
return offset, next_offset - offset return offset, next_offset - offset
def subset(self, resolved_glyph_map):
    '''
    Rewrite this table so that it only points at the glyphs present in
    resolved_glyph_map, a mapping of glyph_id to (offset, sz). Both
    self.offset_map and self.raw are replaced; the number of entries in
    the offset map does not change.
    '''
    pointers = self.offset_map = [0] * len(self.offset_map)
    by_offset = sorted(
        ((gid, loc[0], loc[1]) for gid, loc in resolved_glyph_map.items()),
        key=itemgetter(1))
    for gid, start, size in by_offset:
        pointers[gid] = start
        pointers[gid + 1] = start + size

    # An entry that is still zero inherits the value of the entry before
    # it, so a glyph that is not present ends up with zero length.
    for idx in range(1, len(pointers)):
        if pointers[idx] == 0:
            pointers[idx] = pointers[idx - 1]

    # In the 'H' format the stored values are the offsets divided by two
    stored = [p // 2 for p in pointers] if self.fmt == 'H' else pointers
    layout = ('>%d%s' % (len(stored), self.fmt)).encode('ascii')
    self.raw = pack(layout, *stored)
def dump_glyphs(self, sfnt):
    '''
    Print the id and size of every glyph whose data is not empty. The
    offsets are loaded from the head and maxp tables of sfnt first, if
    this table has no offset_map yet.
    '''
    if not hasattr(self, 'offset_map'):
        self.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
    offsets = self.offset_map
    consecutive = zip(offsets[:-1], offsets[1:])
    for glyph_id, (start, end) in enumerate(consecutive):
        if end != start:
            print ('Glyph id:', glyph_id, 'size:', end - start)

View File

@ -7,23 +7,73 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from collections import OrderedDict
from operator import itemgetter
from calibre.utils.fonts.sfnt.container import Sfnt from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.utils.fonts.sfnt.errors import UnsupportedFont from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
# TrueType outlines {{{
def resolve_glyphs(loca, glyf, character_map):
    '''
    Find all glyphs needed to render the characters in character_map (a
    mapping of character code to glyph id). Glyph 0 (.notdef) is always
    requested and glyphs referenced by other glyphs are followed as well.

    Glyph ids that cannot be located, or that have no data, are silently left
    out. Returns an OrderedDict of glyph id to glyph object, sorted by glyph
    id.
    '''
    pending = set(character_map.values())
    pending.add(0)  # We always want the .notdef glyph
    found = {}
    while pending:
        current = pending.pop()
        try:
            offset, length = loca.glyph_location(current)
        except (IndexError, ValueError, KeyError, TypeError):
            continue
        if length < 1:
            continue
        glyph = glyf.glyph_data(offset, length)
        if not len(glyph):
            continue
        found[current] = glyph
        # Queue the components of this glyph that have not been seen yet
        pending.update(
            gid for gid in glyph.glyph_indices if gid not in found)

    return OrderedDict(sorted(found.items(), key=itemgetter(0)))
def subset_truetype(sfnt, character_map):
    '''
    Subset the TrueType outlines of sfnt, in place, to the glyphs needed by
    character_map (a mapping of character code to glyph id).

    Entries of character_map whose glyph could not be resolved are removed
    from it. Raises UnsupportedFont if the head or maxp table is missing and
    NoGlyphs if nothing other than glyph 0 could be resolved.
    '''
    loca = sfnt[b'loca']
    glyf = sfnt[b'glyf']

    try:
        head, maxp = sfnt[b'head'], sfnt[b'maxp']
    except KeyError:
        raise UnsupportedFont('This font does not contain head and/or maxp tables')
    loca.load_offsets(head, maxp)

    resolved = resolve_glyphs(loca, glyf, character_map)
    # True both when nothing was resolved and when only glyph 0 was
    if not any(glyph_id != 0 for glyph_id in resolved):
        raise NoGlyphs('This font has no glyphs for the specified character '
                'set, subsetting it is pointless')

    # Keep only character codes that have resolved glyphs
    unmapped = [code for code, glyph_id in character_map.items()
                if glyph_id not in resolved]
    for code in unmapped:
        del character_map[code]

    # Update the glyf table first, the loca table is then rebuilt from the
    # new glyph locations
    loca.subset(glyf.update(resolved))
# }}}
def subset(raw, individual_chars, ranges=()): def subset(raw, individual_chars, ranges=()):
chars = list(map(ord, individual_chars)) chars = list(map(ord, individual_chars))
for r in ranges: for r in ranges:
chars += list(xrange(ord(r[0]), ord(r[1])+1)) chars += list(xrange(ord(r[0]), ord(r[1])+1))
sfnt = Sfnt(raw) sfnt = Sfnt(raw)
old_sizes = sfnt.sizes()
# Remove the Digital Signature table since it is useless in a subset # Remove the Digital Signature table since it is useless in a subset
# font anyway # font anyway
sfnt.pop(b'DSIG', None) sfnt.pop(b'DSIG', None)
@ -35,16 +85,186 @@ def subset(raw, individual_chars, ranges=()):
# Get mapping of chars to glyph ids for all specified chars # Get mapping of chars to glyph ids for all specified chars
character_map = cmap.get_character_map(chars) character_map = cmap.get_character_map(chars)
# Restrict the cmap table to only contain entries for the specified chars
cmap.set_character_map(character_map)
if b'loca' in sfnt and b'glyf' in sfnt: if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines
subset_truetype(sfnt, character_map) subset_truetype(sfnt, character_map)
elif b'CFF ' in sfnt: elif b'CFF ' in sfnt:
# PostScript Outlines
raise UnsupportedFont('This font contains PostScript outlines, ' raise UnsupportedFont('This font contains PostScript outlines, '
'subsetting not supported') 'subsetting not supported')
else: else:
raise UnsupportedFont('This font does not contain TrueType ' raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines') 'or PostScript outlines')
# Restrict the cmap table to only contain entries for the resolved glyphs
cmap.set_character_map(character_map)
raw, new_sizes = sfnt()
return raw, old_sizes, new_sizes
# CLI {{{
def option_parser():
    '''
    Create the command line option parser of the subset-font tool.

    The returned parser documents the three positional arguments in its
    usage text, defines the -c/--codes switch and has its program name set
    to subset-font.
    '''
    import textwrap
    from calibre.utils.config import OptionParser
    # textwrap.dedent() removes whitespace common to the start of all lines
    # of the usage text
    parser = OptionParser(usage=textwrap.dedent('''\
%prog [options] input_font_file output_font_file characters_to_keep
Subset the specified font, keeping only the glyphs for the characters in
characters_to_keep. characters_to_keep is a comma separated list of characters of
the form: a,b,c,A-Z,0-9,xyz
You can specify ranges in the list of characters, as shown above.
'''))
    parser.add_option('-c', '--codes', default=False, action='store_true',
        help='If specified, the list of characters is interpreted as '
        'numeric unicode codes instead of characters. So to specify the '
        'characters a,b you would use 97,98')
    parser.prog = 'subset-font'
    return parser
def print_stats(old_stats, new_stats):
    '''
    Print a per table comparison of the sizes before (old_stats) and after
    (new_stats) subsetting. Both arguments map table tags to sizes. The
    tables are listed from largest to smallest original size; a table missing
    from new_stats is reported with a new size of zero.
    '''
    from calibre import prints
    rule = '='*80
    prints('========= Table comparison (original vs. subset) =========')
    prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size',
            ' New Percent')
    prints(rule)
    old_total = sum(old_stats.values())
    new_total = sum(new_stats.values())
    for tag in sorted(old_stats, key=old_stats.get, reverse=True):
        old_size = old_stats[tag]
        old_percent = old_size/old_total * 100
        new_size = new_stats.get(tag, 0)
        new_percent = new_size/new_total * 100
        if new_size == old_size:
            note = ' | same size'
        else:
            note = ' | reduced to %.1f %%'%(new_size/old_size * 100)
        prints('%4s'%tag, ' ', '%10s'%old_size, ' ', '%5.1f %%'%old_percent, ' ',
                '%10s'%new_size, ' ', '%5.1f %%'%new_percent, note)
    prints(rule)
def main(args):
    '''
    Entry point of the subset-font command line tool.

    args is the full argument list including the program name. After option
    parsing exactly three positional arguments must remain: the input font
    file, the output font file and the comma separated list of characters
    and ranges to keep. The subset font is written to the output file and
    statistics are printed.

    Exits with status 1 (SystemExit) on a wrong number of arguments, an
    invalid range or an item that is not a single character.
    '''
    import sys
    import time
    from calibre import prints

    def fail(*message):
        prints(*message, file=sys.stderr)
        raise SystemExit(1)

    def not_single(c):
        if len(c) > 1:
            fail(c, 'is not a single character')

    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) != 4:
        parser.print_help()
        raise SystemExit(1)
    iff, off, chars = args[1:]
    with open(iff, 'rb') as f:
        orig = f.read()

    individual, ranges = set(), set()
    for spec in [x.strip() for x in chars.split(',')]:
        if '-' not in spec:
            # A single character, or its numeric code with --codes
            char = unichr(int(spec)) if opts.codes else spec
            not_single(char)
            individual.add(char)
            continue
        # A range of the form first-last
        parts = [x.strip() for x in spec.split('-')]
        if len(parts) != 2:
            fail('Invalid range:', spec)
        if opts.codes:
            parts = tuple(map(unichr, map(int, parts)))
        for part in parts:
            not_single(part)
        ranges.add(tuple(parts))

    started = time.time()
    sf, old_stats, new_stats = subset(orig, individual, ranges)
    taken = time.time() - started
    reduced = (len(sf)/len(orig)) * 100

    print_stats(old_stats, new_stats)
    prints('Original size:', '%gKB'%(len(orig)/1024.), 'Subset size:',
            '%gKB'%(len(sf)/1024.), 'Reduced to: %g%%'%(reduced))
    prints('Subsetting took %g seconds'%taken)
    with open(off, 'wb') as f:
        f.write(sf)
    prints('Subset font written to:', off)
if __name__ == '__main__':
    # Executed only when this file is run as a script
    try:
        # init_calibre is optional, a failure to import it is ignored.
        # NOTE(review): presumably it sets up the calibre environment when
        # running from a source checkout -- confirm.
        import init_calibre
        # Bare reference to the imported name, it has no effect of its own
        init_calibre
    except ImportError:
        pass
    import sys
    main(sys.argv)
# }}}
# Tests {{{
def test_mem():
    '''
    Subset the bundled Liberation Serif font 1000 times and print the growth
    in memory usage per call, as reported by memory().
    '''
    from calibre.utils.mem import memory
    import gc
    gc.collect()
    baseline = memory()
    font_data = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    iterations = 1000
    for _ in xrange(iterations):
        subset(font_data, (), (('a', 'z'),))
    del font_data
    # Several collections in a row, before measuring again
    for _ in xrange(3):
        gc.collect()
    leaked = (memory() - baseline)/iterations*1024
    print ('Leaked memory per call:', leaked, 'KB')
def test():
    '''
    Subset the bundled Liberation Serif font to the characters a, b and c and
    raise an Exception if the result is larger than 30% of the original.
    '''
    original = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    wanted = set(('a', 'b', 'c'))
    subsetted, _old_stats, _new_stats = subset(original, wanted, ())
    limit = 0.3 * len(original)
    if len(subsetted) > limit:
        raise Exception('Subsetting failed')
def all():
    '''
    Try to subset every font reported by the font scanner to the characters
    a, b and c, printing the outcome for each font followed by a summary of
    the unsupported fonts and the failures.

    Note that this function shadows the builtin all() inside this module.
    '''
    from calibre.utils.fonts.scanner import font_scanner
    failed = []  # (full name, path, error message) of fonts that raised an unexpected error
    unsupported = []  # (full name, path, error message) of fonts that raised UnsupportedFont
    total = 0
    for family in font_scanner.find_font_families():
        for font in font_scanner.fonts_for_family(family):
            raw = font_scanner.get_font_data(font)
            # The result for this font is printed on the same line
            print ('Subsetting', font['full_name'], end='\t')
            total += 1
            try:
                sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
            except NoGlyphs:
                # Counted in the total, but neither unsupported nor failed
                print('No glyphs!')
                continue
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], unicode(e)))
                print ('Unsupported!')
                continue
            except Exception as e:
                # Any other error is recorded and the scan carries on
                print ('Failed!')
                failed.append((font['full_name'], font['path'], unicode(e)))
            else:
                # Ratio of the summed table sizes after and before subsetting
                print ('Reduced to:', '%.1f'%(
                        sum(new_stats.itervalues())/sum(old_stats.itervalues())
                        * 100), '%')
    # NOTE(review): the nesting of the bare print() calls below was
    # reconstructed, they are assumed to be inside the for loops -- confirm.
    if unsupported:
        print ('\n\nUnsupported:')
        for name, path, err in unsupported:
            print (name, path, err)
            print()
    if failed:
        print ('\n\nFailures:')
        for name, path, err in failed:
            print (name, path, err)
            print()
    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
            len(failed))
# }}}

View File

@ -120,6 +120,7 @@ def all():
try: try:
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ()) sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
except NoGlyphs: except NoGlyphs:
print ('No glyphs!')
continue continue
except UnsupportedFont as e: except UnsupportedFont as e:
unsupported.append((font['full_name'], font['path'], unicode(e))) unsupported.append((font['full_name'], font['path'], unicode(e)))