Switch to a pure python implementation of font subsetting

This commit is contained in:
Kovid Goyal 2012-11-06 11:39:48 +05:30
parent 18db66fd77
commit f54843c547
10 changed files with 383 additions and 13 deletions

View File

@ -212,7 +212,7 @@ def main(args=sys.argv):
return
if len(args) > 1 and args[1] in ('-f', '--subset-font'):
from calibre.utils.fonts.subset import main
from calibre.utils.fonts.sfnt.subset import main
main(['subset-font']+args[2:])
return

View File

@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en'
from collections import defaultdict
from calibre.ebooks.oeb.base import urlnormalize
from calibre.utils.fonts.subset import subset, NoGlyphs, UnsupportedFont
from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
class SubsetFonts(object):

View File

@ -26,6 +26,9 @@ class UnknownTable(object):
def __call__(self):
    """Return the raw data held by this table (``self.raw``)."""
    return self.raw
def __len__(self):
    """Return the length of this table's raw data (``len(self.raw)``)."""
    return len(self.raw)
class DateTimeProperty(object):
def __init__(self, name):
@ -46,10 +49,10 @@ class FixedProperty(object):
def __get__(self, obj, type=None):
    """
    Return the attribute named ``self.name`` of ``obj`` divided by 0x10000.

    The stored value is presumably a 16.16 fixed point integer (confirm
    against the table definitions that use this descriptor).

    :param obj: The instance the descriptor is accessed through.
    :param type: The owner class; unused.
    """
    val = getattr(obj, self.name)
    # Divide by a float so that the result keeps its fractional part even
    # when true division is not enabled in the enclosing module.
    return val / float(0x10000)
def __set__(self, obj, val):
    """
    Store ``val`` on ``obj`` as an integer scaled by 0x10000.

    The scaled value is written to the attribute named ``self.name``,
    which is the attribute ``__get__`` reads from. The return value of a
    descriptor's ``__set__`` is ignored by Python, so the value has to be
    stored explicitly rather than returned.

    :param obj: The instance being assigned to.
    :param val: The number to store; it is multiplied by 0x10000 and
                rounded to the nearest integer.
    """
    setattr(obj, self.name, int(round(val * 0x10000)))
def max_power_of_two(x):
"""
@ -62,4 +65,10 @@ def max_power_of_two(x):
exponent += 1
return max(exponent - 1, 0)
def load_font(stream_or_path):
    """
    Build an ``Sfnt`` container from font data.

    If ``stream_or_path`` has a ``read`` attribute, the result of calling
    ``read()`` is used as the font data; otherwise the argument itself is
    handed to ``Sfnt`` unchanged.

    NOTE(review): despite the parameter name, no file is opened here, so a
    filesystem path would be passed to ``Sfnt`` as-is -- confirm what
    callers actually pass.
    """
    if hasattr(stream_or_path, 'read'):
        data = stream_or_path.read()
    else:
        data = stream_or_path
    # Imported here rather than at module level, as in the original code
    from calibre.utils.fonts.sfnt.container import Sfnt
    return Sfnt(data)

View File

@ -144,6 +144,7 @@ class CmapTable(UnknownTable):
except IndexError:
next_offset = len(self.raw)
table = self.raw[offset:next_offset]
if table:
fmt = unpack_from(b'>H', table)[0]
if platform == 3 and encoding == 1 and fmt == 4:
self.bmp_table = table

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
from struct import pack, calcsize
from io import BytesIO
from collections import OrderedDict
from calibre.utils.fonts.utils import (get_tables, checksum_of_block,
verify_checksums)
@ -18,6 +19,8 @@ from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.head import HeadTable
from calibre.utils.fonts.sfnt.maxp import MaxpTable
from calibre.utils.fonts.sfnt.loca import LocaTable
from calibre.utils.fonts.sfnt.glyf import GlyfTable
from calibre.utils.fonts.sfnt.cmap import CmapTable
class Sfnt(object):
@ -35,6 +38,8 @@ class Sfnt(object):
b'head' : HeadTable,
b'maxp' : MaxpTable,
b'loca' : LocaTable,
b'glyf' : GlyfTable,
b'cmap' : CmapTable,
}.get(table_tag, UnknownTable)(table)
def __getitem__(self, key):
@ -49,6 +54,12 @@ class Sfnt(object):
def pop(self, key, default=None):
    """
    Remove ``key`` from ``self.tables`` and return its value.

    Delegates to ``self.tables.pop``, so ``default`` is returned when the
    key is absent.
    """
    return self.tables.pop(key, default)
def sizes(self):
    """
    Return an OrderedDict mapping every tag in ``self.tables`` to the
    ``len()`` of the corresponding table, with tags in sorted order.
    """
    return OrderedDict((tag, len(self[tag])) for tag in sorted(self.tables))
def __call__(self):
stream = BytesIO()
@ -68,6 +79,7 @@ class Sfnt(object):
head_offset = None
table_data = []
offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables )
sizes = OrderedDict()
for tag in sorted(self.tables):
table = self.tables[tag]
raw = table()
@ -80,6 +92,7 @@ class Sfnt(object):
spack(b'>4s3L', tag, checksum, offset, table_len)
offset += len(raw)
table_data.append(raw)
sizes[tag] = table_len
for x in table_data:
stream.write(x)
@ -89,7 +102,7 @@ class Sfnt(object):
stream.seek(head_offset + 8)
spack(b'>L', q)
return stream.getvalue()
return stream.getvalue(), sizes
def test_roundtrip(ff=None):
if ff is None:
@ -97,7 +110,7 @@ def test_roundtrip(ff=None):
else:
with open(ff, 'rb') as f:
data = f.read()
rd = Sfnt(data)()
rd = Sfnt(data)()[0]
verify_checksums(rd)
if data[:12] != rd[:12]:
raise ValueError('Roundtripping failed, font header not the same')

View File

@ -10,3 +10,6 @@ __docformat__ = 'restructuredtext en'
class UnsupportedFont(ValueError):
    """Raised when a font cannot be processed by the sfnt code."""
class NoGlyphs(ValueError):
    """Raised when a font has no glyphs for the requested characters."""

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from
from collections import OrderedDict
from calibre.utils.fonts.sfnt import UnknownTable
# Flags used in the component records of a composite glyph
ARG_1_AND_2_ARE_WORDS = 0x0001  # if set args are words otherwise they are bytes
ARGS_ARE_XY_VALUES = 0x0002  # if set args are xy values, otherwise they are points
ROUND_XY_TO_GRID = 0x0004  # for the xy values if above is true
WE_HAVE_A_SCALE = 0x0008  # Sx = Sy, otherwise scale == 1.0
NON_OVERLAPPING = 0x0010  # set to same value for all components (obsolete!)
MORE_COMPONENTS = 0x0020  # indicates at least one more glyph after this one
WE_HAVE_AN_X_AND_Y_SCALE = 0x0040  # Sx, Sy
WE_HAVE_A_TWO_BY_TWO = 0x0080  # t00, t01, t10, t11
WE_HAVE_INSTRUCTIONS = 0x0100  # instructions follow
USE_MY_METRICS = 0x0200  # apply these metrics to parent glyph
OVERLAP_COMPOUND = 0x0400  # used by Apple in GX fonts
SCALED_COMPONENT_OFFSET = 0x0800  # composite designed to have the component offset scaled (designed for Apple)
UNSCALED_COMPONENT_OFFSET = 0x1000  # composite designed not to have the component offset scaled (designed for MS)

# Every glyph record starts with five 16 bit fields: numberOfContours, xMin,
# yMin, xMax and yMax. The component records of a composite glyph follow.
GLYPH_HEADER_SIZE = 10


class SimpleGlyph(object):
    """
    A glyph that does not refer to any other glyphs.

    ``raw`` is the complete glyph record (header included). Calling the
    object returns ``raw`` and ``len()`` gives its size.
    """

    def __init__(self, num_of_countours, raw):
        self.num_of_countours = num_of_countours
        self.raw = raw
        # The list of glyph indices referred to by this glyph, will always be
        # empty for a simple glyph and not empty for a composite glyph
        self.glyph_indices = []
        self.is_composite = False

    def __len__(self):
        return len(self.raw)

    def __call__(self):
        return self.raw


class CompositeGlyph(SimpleGlyph):
    """
    A glyph built out of other glyphs.

    On construction the component records in ``raw`` are walked and the
    glyph index of every component is collected in ``glyph_indices``.
    """

    def __init__(self, num_of_countours, raw):
        super(CompositeGlyph, self).__init__(num_of_countours, raw)
        self.is_composite = True

        flags = MORE_COMPONENTS
        # raw starts with the glyph header (its first field is the contour
        # count the caller decoded), so the first component record begins
        # after it. Starting at 0 would interpret header fields as flags
        # and glyph index.
        offset = GLYPH_HEADER_SIZE
        while flags & MORE_COMPONENTS:
            flags, glyph_index = unpack_from(b'>HH', raw, offset)
            self.glyph_indices.append(glyph_index)
            offset += 4
            # Skip the two arguments: words (2 bytes each) or bytes
            if flags & ARG_1_AND_2_ARE_WORDS:
                offset += 4
            else:
                offset += 2
            # Skip the optional transformation data
            if flags & WE_HAVE_A_SCALE:
                offset += 2
            elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
                offset += 4
            elif flags & WE_HAVE_A_TWO_BY_TWO:
                offset += 8
class GlyfTable(UnknownTable):
    """Table whose raw data is a sequence of glyph records."""

    def glyph_data(self, offset, length):
        """
        Return a glyph object for the ``length`` bytes at ``offset``.

        The first signed 16 bit value of the record is the contour count
        (taken as 0 when the slice is empty). A negative count yields a
        CompositeGlyph, anything else a SimpleGlyph.
        """
        raw = self.raw[offset:offset+length]
        if raw:
            num_of_countours = unpack_from(b'>h', raw)[0]
        else:
            num_of_countours = 0
        glyph_class = CompositeGlyph if num_of_countours < 0 else SimpleGlyph
        return glyph_class(num_of_countours, raw)

    def update(self, sorted_glyph_map):
        """
        Replace the raw data with the glyphs in ``sorted_glyph_map``.

        The glyphs are concatenated in iteration order. Returns an
        OrderedDict mapping each glyph id to ``(offset, size)`` of its data
        in the new raw block.
        """
        layout = OrderedDict()
        chunks = []
        position = 0
        for glyph_id, glyph in sorted_glyph_map.iteritems():
            data = glyph()
            size = len(data)
            layout[glyph_id] = (position, size)
            position += size
            chunks.append(data)
        self.raw = b''.join(chunks)
        return layout

View File

@ -7,7 +7,8 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import calcsize, unpack_from
from struct import calcsize, unpack_from, pack
from operator import itemgetter
from calibre.utils.fonts.sfnt import UnknownTable
@ -23,9 +24,43 @@ class LocaTable(UnknownTable):
self.offset_map = self.offset_map[:num_glyphs+1]
if fmt == 'H':
self.offset_map = [2*i for i in self.offset_map]
self.fmt = fmt
def glyph_location(self, glyph_id):
    """
    Return ``(offset, length)`` for ``glyph_id``.

    The length is the distance from this glyph's entry in ``offset_map``
    to the next entry.
    """
    start = self.offset_map[glyph_id]
    return start, self.offset_map[glyph_id + 1] - start
def subset(self, resolved_glyph_map):
    '''
    Update this table to contain pointers only to the glyphs in
    resolved_glyph_map which must be a map of glyph_ids to (offset, sz)
    '''
    offsets = [0] * len(self.offset_map)
    self.offset_map = offsets

    # Lay the glyphs out in order of increasing offset
    placements = sorted(
        ((glyph_id, loc[0], loc[1])
            for glyph_id, loc in resolved_glyph_map.items()),
        key=itemgetter(1))
    for glyph_id, start, size in placements:
        offsets[glyph_id] = start
        offsets[glyph_id + 1] = start + size

    # Fix all zero entries to be the same as the previous entry, which
    # means that if the ith entry is zero, the i-1 glyph is not present.
    for idx in range(1, len(offsets)):
        if offsets[idx] == 0:
            offsets[idx] = offsets[idx - 1]

    # With the 'H' format the stored values are half the actual offsets
    if self.fmt == 'H':
        packed = [value // 2 for value in offsets]
    else:
        packed = offsets
    self.raw = pack(('>%d%s'%(len(packed), self.fmt)).encode('ascii'), *packed)
def dump_glyphs(self, sfnt):
    """
    Print the id and size of every glyph whose size is not zero.

    If the offsets have not been loaded yet, they are loaded first using
    the head and maxp tables of ``sfnt``.
    """
    if not hasattr(self, 'offset_map'):
        self.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
    boundaries = zip(self.offset_map, self.offset_map[1:])
    for glyph_id, (start, end) in enumerate(boundaries):
        if end != start:
            print('Glyph id:', glyph_id, 'size:', end - start)

View File

@ -7,23 +7,73 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import OrderedDict
from operator import itemgetter
from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs
# TrueType outlines {{{
def resolve_glyphs(loca, glyf, character_map):
    """
    Collect the glyphs needed for the glyph ids in ``character_map``.

    Starting from the values of ``character_map`` plus glyph 0, every glyph
    is looked up via ``loca.glyph_location`` and ``glyf.glyph_data``. Glyphs
    referenced through ``glyph_indices`` are followed as well. Glyph ids
    whose lookup fails, or whose data is empty, are silently left out.

    Returns an OrderedDict of glyph id to glyph object, sorted by glyph id.
    """
    pending = set(character_map.values())
    pending.add(0)  # We always want the .notdef glyph
    found = {}

    while pending:
        glyph_id = pending.pop()
        try:
            offset, length = loca.glyph_location(glyph_id)
        except (IndexError, ValueError, KeyError, TypeError):
            continue
        if length < 1:
            continue
        glyph = glyf.glyph_data(offset, length)
        if len(glyph) == 0:
            continue
        found[glyph_id] = glyph
        # Queue any referenced glyphs that have not been resolved yet
        pending.update(
            gid for gid in glyph.glyph_indices if gid not in found)

    return OrderedDict((gid, found[gid]) for gid in sorted(found))
def subset_truetype(sfnt, character_map):
    """
    Restrict the glyf and loca tables of ``sfnt`` to the glyphs needed for
    ``character_map``.

    ``character_map`` is modified in place: entries whose glyph could not
    be resolved are removed.

    Raises UnsupportedFont if the head or maxp table is missing and
    NoGlyphs if nothing other than glyph 0 could be resolved.
    """
    loca, glyf = sfnt[b'loca'], sfnt[b'glyf']

    try:
        head, maxp = sfnt[b'head'], sfnt[b'maxp']
    except KeyError:
        raise UnsupportedFont('This font does not contain head and/or maxp tables')
    loca.load_offsets(head, maxp)

    resolved_glyphs = resolve_glyphs(loca, glyf, character_map)
    if not resolved_glyphs or set(resolved_glyphs) == {0}:
        raise NoGlyphs('This font has no glyphs for the specified character '
                'set, subsetting it is pointless')

    # Keep only character codes that have resolved glyphs
    unmapped = [code for code, glyph_id in character_map.iteritems()
                if glyph_id not in resolved_glyphs]
    for code in unmapped:
        del character_map[code]

    # Update the glyf table, then point the loca table at the new layout
    loca.subset(glyf.update(resolved_glyphs))
# }}}
def subset(raw, individual_chars, ranges=()):
chars = list(map(ord, individual_chars))
for r in ranges:
chars += list(xrange(ord(r[0]), ord(r[1])+1))
sfnt = Sfnt(raw)
old_sizes = sfnt.sizes()
# Remove the Digital Signature table since it is useless in a subset
# font anyway
sfnt.pop(b'DSIG', None)
@ -35,16 +85,186 @@ def subset(raw, individual_chars, ranges=()):
# Get mapping of chars to glyph ids for all specified chars
character_map = cmap.get_character_map(chars)
# Restrict the cmap table to only contain entries for the specified chars
cmap.set_character_map(character_map)
if b'loca' in sfnt and b'glyf' in sfnt:
# TrueType Outlines
subset_truetype(sfnt, character_map)
elif b'CFF ' in sfnt:
# PostScript Outlines
raise UnsupportedFont('This font contains PostScript outlines, '
'subsetting not supported')
else:
raise UnsupportedFont('This font does not contain TrueType '
'or PostScript outlines')
# Restrict the cmap table to only contain entries for the resolved glyphs
cmap.set_character_map(character_map)
raw, new_sizes = sfnt()
return raw, old_sizes, new_sizes
# CLI {{{
def option_parser():
    """
    Create the command line option parser for the subset-font tool.

    The parser has a single ``-c``/``--codes`` flag and its program name
    is set to ``subset-font``.
    """
    import textwrap
    from calibre.utils.config import OptionParser
    usage = textwrap.dedent('''\
%prog [options] input_font_file output_font_file characters_to_keep
Subset the specified font, keeping only the glyphs for the characters in
characters_to_keep. characters_to_keep is a comma separated list of characters of
the form: a,b,c,A-Z,0-9,xyz
You can specify ranges in the list of characters, as shown above.
''')
    cli = OptionParser(usage=usage)
    cli.add_option('-c', '--codes', default=False, action='store_true',
            help='If specified, the list of characters is interpreted as '
            'numeric unicode codes instead of characters. So to specify the '
            'characters a,b you would use 97,98')
    cli.prog = 'subset-font'
    return cli
def print_stats(old_stats, new_stats):
    """
    Print a table comparing table sizes before and after subsetting.

    ``old_stats`` and ``new_stats`` map table names to sizes. One row is
    printed per table in ``old_stats``, largest original size first. A
    table missing from ``new_stats`` is shown with a new size of 0.
    """
    from calibre import prints
    rule = '='*80
    prints('========= Table comparison (original vs. subset) =========')
    prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size',
            ' New Percent')
    prints(rule)
    old_total = sum(old_stats.values())
    new_total = sum(new_stats.values())
    for table in sorted(old_stats, key=old_stats.get, reverse=True):
        osz = old_stats[table]
        op = osz/old_total * 100
        nsz = new_stats.get(table, 0)
        np = nsz/new_total * 100
        if nsz == osz:
            suffix = ' | same size'
        else:
            suffix = ' | reduced to %.1f %%'%(nsz/osz * 100)
        prints('%4s'%table, ' ', '%10s'%osz, ' ', '%5.1f %%'%op, ' ',
                '%10s'%nsz, ' ', '%5.1f %%'%np, suffix)
    prints(rule)
def main(args):
    """
    Command line entry point: subset a font file.

    ``args`` is the full argument list including the program name. After
    option parsing, exactly three positional arguments are required: the
    input font file, the output font file and a comma separated list of
    characters and/or ranges (``a-z``) to keep. With ``--codes`` the
    entries are numeric codes instead of characters.

    Prints size statistics and writes the subset font to the output file.
    Exits via ``SystemExit(1)`` on bad arguments.
    """
    import sys
    import time
    from calibre import prints
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) != 4:
        parser.print_help()
        raise SystemExit(1)
    iff, off, chars = args[1:]
    with open(iff, 'rb') as f:
        orig = f.read()

    chars = [x.strip() for x in chars.split(',')]
    individual, ranges = set(), set()

    def not_single(c):
        # Empty entries (e.g. from a trailing comma or a bare '-') must be
        # rejected as well, otherwise they fail later inside ord()
        if len(c) != 1:
            prints(c, 'is not a single character', file=sys.stderr)
            raise SystemExit(1)

    for c in chars:
        if '-' in c:
            parts = [x.strip() for x in c.split('-')]
            if len(parts) != 2:
                prints('Invalid range:', c, file=sys.stderr)
                raise SystemExit(1)
            if opts.codes:
                parts = tuple(map(unichr, map(int, parts)))
            # An explicit loop: the checks are run for their side effects
            for part in parts:
                not_single(part)
            ranges.add(tuple(parts))
        else:
            if opts.codes:
                c = unichr(int(c))
            not_single(c)
            individual.add(c)

    st = time.time()
    sf, old_stats, new_stats = subset(orig, individual, ranges)
    taken = time.time() - st
    reduced = (len(sf)/len(orig)) * 100

    def sz(x):
        return '%gKB'%(len(x)/1024.)

    print_stats(old_stats, new_stats)
    prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced))
    prints('Subsetting took %g seconds'%taken)
    with open(off, 'wb') as f:
        f.write(sf)
    prints('Subset font written to:', off)
# Run the command line tool when this module is executed as a script
if __name__ == '__main__':
    try:
        # Best effort import for its side effects; presumably sets up the
        # calibre environment when running from a source checkout (confirm).
        # The bare name below only references the module, it has no effect.
        import init_calibre
        init_calibre
    except ImportError:
        pass
    import sys
    main(sys.argv)
# }}}
# Tests {{{
def test_mem():
    """
    Subset the bundled Liberation Serif font 1000 times and print the
    growth in the value reported by ``memory()`` per call.
    """
    from calibre.utils.mem import memory
    import gc
    gc.collect()
    baseline = memory()
    font_data = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    calls = 1000
    for _ in xrange(calls):
        subset(font_data, (), (('a', 'z'),))
    del font_data
    # Collect repeatedly so garbage does not count as a leak
    for _ in xrange(3):
        gc.collect()
    print ('Leaked memory per call:', (memory() - baseline)/calls*1024, 'KB')
def test():
    """
    Subset the bundled Liberation Serif font down to a, b and c and fail
    if the result is larger than 30% of the original.
    """
    original = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    result = subset(original, {'a', 'b', 'c'}, ())
    subset_data = result[0]
    if len(subset_data) > 0.3 * len(original):
        raise Exception('Subsetting failed')
def all():
    """
    Try subsetting every font returned by the font scanner and print a
    report of the fonts that were unsupported or failed.

    NOTE: the name shadows the ``all`` builtin inside this module; it is
    kept because callers may refer to it.
    """
    from calibre.utils.fonts.scanner import font_scanner
    failed, unsupported = [], []
    total = 0

    def report(heading, entries):
        # Print one line per collected (name, path, error) entry
        if entries:
            print (heading)
            for name, path, err in entries:
                print (name, path, err)
            print()

    for family in font_scanner.find_font_families():
        for font in font_scanner.fonts_for_family(family):
            raw = font_scanner.get_font_data(font)
            print ('Subsetting', font['full_name'], end='\t')
            total += 1
            try:
                sf, old_stats, new_stats = subset(raw, {'a', 'b', 'c'}, ())
            except NoGlyphs:
                print('No glyphs!')
            except UnsupportedFont as e:
                unsupported.append((font['full_name'], font['path'], unicode(e)))
                print ('Unsupported!')
            except Exception as e:
                print ('Failed!')
                failed.append((font['full_name'], font['path'], unicode(e)))
            else:
                print ('Reduced to:', '%.1f'%(
                        sum(new_stats.itervalues())/sum(old_stats.itervalues())
                        * 100), '%')

    report('\n\nUnsupported:', unsupported)
    report('\n\nFailures:', failed)

    print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:',
            len(failed))
# }}}

View File

@ -120,6 +120,7 @@ def all():
try:
sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ())
except NoGlyphs:
print ('No glyphs!')
continue
except UnsupportedFont as e:
unsupported.append((font['full_name'], font['path'], unicode(e)))