diff --git a/src/calibre/debug.py b/src/calibre/debug.py index f7fd6f2d72..a06096c593 100644 --- a/src/calibre/debug.py +++ b/src/calibre/debug.py @@ -212,7 +212,7 @@ def main(args=sys.argv): return if len(args) > 1 and args[1] in ('-f', '--subset-font'): - from calibre.utils.fonts.subset import main + from calibre.utils.fonts.sfnt.subset import main main(['subset-font']+args[2:]) return diff --git a/src/calibre/ebooks/oeb/transforms/subset.py b/src/calibre/ebooks/oeb/transforms/subset.py index a3e1b3bd10..9b10aaad06 100644 --- a/src/calibre/ebooks/oeb/transforms/subset.py +++ b/src/calibre/ebooks/oeb/transforms/subset.py @@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en' from collections import defaultdict from calibre.ebooks.oeb.base import urlnormalize -from calibre.utils.fonts.subset import subset, NoGlyphs, UnsupportedFont +from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont class SubsetFonts(object): diff --git a/src/calibre/utils/fonts/sfnt/__init__.py b/src/calibre/utils/fonts/sfnt/__init__.py index 7a40e7fd15..f3e0a2eae8 100644 --- a/src/calibre/utils/fonts/sfnt/__init__.py +++ b/src/calibre/utils/fonts/sfnt/__init__.py @@ -26,6 +26,9 @@ class UnknownTable(object): def __call__(self): return self.raw + def __len__(self): + return len(self.raw) + class DateTimeProperty(object): def __init__(self, name): @@ -46,10 +49,10 @@ class FixedProperty(object): def __get__(self, obj, type=None): val = getattr(obj, self.name) - return val * (2**-16) + return val / 0x10000 def __set__(self, obj, val): - return int(round(val*(2**16))) + setattr(obj, self.name, int(round(val*(0x10000)))) # store the scaled integer; a value returned from __set__ is discarded def max_power_of_two(x): """ @@ -62,4 +65,10 @@ def max_power_of_two(x): exponent += 1 return max(exponent - 1, 0) +def load_font(stream_or_path): + raw = stream_or_path + if hasattr(raw, 'read'): + raw = raw.read() + from calibre.utils.fonts.sfnt.container import Sfnt + return Sfnt(raw) diff --git a/src/calibre/utils/fonts/sfnt/cmap.py b/src/calibre/utils/fonts/sfnt/cmap.py 
index 94b0e0eaf5..a00eb56d6f 100644 --- a/src/calibre/utils/fonts/sfnt/cmap.py +++ b/src/calibre/utils/fonts/sfnt/cmap.py @@ -144,9 +144,10 @@ class CmapTable(UnknownTable): except IndexError: next_offset = len(self.raw) table = self.raw[offset:next_offset] - fmt = unpack_from(b'>H', table)[0] - if platform == 3 and encoding == 1 and fmt == 4: - self.bmp_table = table + if table: + fmt = unpack_from(b'>H', table)[0] + if platform == 3 and encoding == 1 and fmt == 4: + self.bmp_table = table def get_character_map(self, chars): ''' diff --git a/src/calibre/utils/fonts/sfnt/container.py b/src/calibre/utils/fonts/sfnt/container.py index 6b4be41739..73bb787810 100644 --- a/src/calibre/utils/fonts/sfnt/container.py +++ b/src/calibre/utils/fonts/sfnt/container.py @@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en' from struct import pack, calcsize from io import BytesIO +from collections import OrderedDict from calibre.utils.fonts.utils import (get_tables, checksum_of_block, verify_checksums) @@ -18,6 +19,8 @@ from calibre.utils.fonts.sfnt.errors import UnsupportedFont from calibre.utils.fonts.sfnt.head import HeadTable from calibre.utils.fonts.sfnt.maxp import MaxpTable from calibre.utils.fonts.sfnt.loca import LocaTable +from calibre.utils.fonts.sfnt.glyf import GlyfTable +from calibre.utils.fonts.sfnt.cmap import CmapTable class Sfnt(object): @@ -35,6 +38,8 @@ class Sfnt(object): b'head' : HeadTable, b'maxp' : MaxpTable, b'loca' : LocaTable, + b'glyf' : GlyfTable, + b'cmap' : CmapTable, }.get(table_tag, UnknownTable)(table) def __getitem__(self, key): @@ -49,6 +54,12 @@ class Sfnt(object): def pop(self, key, default=None): return self.tables.pop(key, default) + def sizes(self): + ans = OrderedDict() + for tag in sorted(self.tables): + ans[tag] = len(self[tag]) + return ans + def __call__(self): stream = BytesIO() @@ -68,6 +79,7 @@ class Sfnt(object): head_offset = None table_data = [] offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables ) + sizes = 
OrderedDict() for tag in sorted(self.tables): table = self.tables[tag] raw = table() @@ -80,6 +92,7 @@ class Sfnt(object): spack(b'>4s3L', tag, checksum, offset, table_len) offset += len(raw) table_data.append(raw) + sizes[tag] = table_len for x in table_data: stream.write(x) @@ -89,7 +102,7 @@ class Sfnt(object): stream.seek(head_offset + 8) spack(b'>L', q) - return stream.getvalue() + return stream.getvalue(), sizes def test_roundtrip(ff=None): if ff is None: @@ -97,7 +110,7 @@ def test_roundtrip(ff=None): else: with open(ff, 'rb') as f: data = f.read() - rd = Sfnt(data)() + rd = Sfnt(data)()[0] verify_checksums(rd) if data[:12] != rd[:12]: raise ValueError('Roundtripping failed, font header not the same') diff --git a/src/calibre/utils/fonts/sfnt/errors.py b/src/calibre/utils/fonts/sfnt/errors.py index a002192863..c2a918d78b 100644 --- a/src/calibre/utils/fonts/sfnt/errors.py +++ b/src/calibre/utils/fonts/sfnt/errors.py @@ -10,3 +10,6 @@ __docformat__ = 'restructuredtext en' class UnsupportedFont(ValueError): pass +class NoGlyphs(ValueError): + pass + diff --git a/src/calibre/utils/fonts/sfnt/glyf.py b/src/calibre/utils/fonts/sfnt/glyf.py new file mode 100644 index 0000000000..55c2a13767 --- /dev/null +++ b/src/calibre/utils/fonts/sfnt/glyf.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from struct import unpack_from +from collections import OrderedDict + +from calibre.utils.fonts.sfnt import UnknownTable + +ARG_1_AND_2_ARE_WORDS = 0x0001 # if set args are words otherwise they are bytes +ARGS_ARE_XY_VALUES = 0x0002 # if set args are xy values, otherwise they are points +ROUND_XY_TO_GRID = 0x0004 # for the xy values if above is true +WE_HAVE_A_SCALE = 0x0008 # Sx = Sy, otherwise scale == 1.0 +NON_OVERLAPPING = 
0x0010 # set to same value for all components (obsolete!) +MORE_COMPONENTS = 0x0020 # indicates at least one more glyph after this one +WE_HAVE_AN_X_AND_Y_SCALE = 0x0040 # Sx, Sy +WE_HAVE_A_TWO_BY_TWO = 0x0080 # t00, t01, t10, t11 +WE_HAVE_INSTRUCTIONS = 0x0100 # instructions follow +USE_MY_METRICS = 0x0200 # apply these metrics to parent glyph +OVERLAP_COMPOUND = 0x0400 # used by Apple in GX fonts +SCALED_COMPONENT_OFFSET = 0x0800 # composite designed to have the component offset scaled (designed for Apple) +UNSCALED_COMPONENT_OFFSET = 0x1000 # composite designed not to have the component offset scaled (designed for MS) + +class SimpleGlyph(object): + + def __init__(self, num_of_countours, raw): + self.num_of_countours = num_of_countours + self.raw = raw + # The list of glyph indices referred to by this glyph, will always be + # empty for a simple glyph and not empty for a composite glyph + self.glyph_indices = [] + self.is_composite = False + + def __len__(self): + return len(self.raw) + + def __call__(self): + return self.raw + +class CompositeGlyph(SimpleGlyph): + + def __init__(self, num_of_countours, raw): + super(CompositeGlyph, self).__init__(num_of_countours, raw) + self.is_composite = True + + flags = MORE_COMPONENTS + offset = 10 # raw includes the 10-byte glyph header; component records start after it + while flags & MORE_COMPONENTS: + flags, glyph_index = unpack_from(b'>HH', raw, offset) + self.glyph_indices.append(glyph_index) + offset += 4 + if flags & ARG_1_AND_2_ARE_WORDS: + offset += 4 + else: + offset += 2 + if flags & WE_HAVE_A_SCALE: + offset += 2 + elif flags & WE_HAVE_AN_X_AND_Y_SCALE: + offset += 4 + elif flags & WE_HAVE_A_TWO_BY_TWO: + offset += 8 + +class GlyfTable(UnknownTable): + + def glyph_data(self, offset, length): + raw = self.raw[offset:offset+length] + num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0 + if num_of_countours >= 0: + return SimpleGlyph(num_of_countours, raw) + return CompositeGlyph(num_of_countours, raw) + + def update(self, sorted_glyph_map): + ans = OrderedDict() + offset = 0 + 
block = [] + for glyph_id, glyph in sorted_glyph_map.iteritems(): + raw = glyph() + ans[glyph_id] = (offset, len(raw)) + offset += len(raw) + block.append(raw) + self.raw = b''.join(block) + return ans + diff --git a/src/calibre/utils/fonts/sfnt/loca.py b/src/calibre/utils/fonts/sfnt/loca.py index f6ca903b83..062cf561aa 100644 --- a/src/calibre/utils/fonts/sfnt/loca.py +++ b/src/calibre/utils/fonts/sfnt/loca.py @@ -7,7 +7,8 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from struct import calcsize, unpack_from +from struct import calcsize, unpack_from, pack +from operator import itemgetter from calibre.utils.fonts.sfnt import UnknownTable @@ -23,9 +24,43 @@ class LocaTable(UnknownTable): self.offset_map = self.offset_map[:num_glyphs+1] if fmt == 'H': self.offset_map = [2*i for i in self.offset_map] + self.fmt = fmt def glyph_location(self, glyph_id): offset = self.offset_map[glyph_id] next_offset = self.offset_map[glyph_id+1] return offset, next_offset - offset + def subset(self, resolved_glyph_map): + ''' + Update this table to contain pointers only to the glyphs in + resolved_glyph_map which must be a map of glyph_ids to (offset, sz) + ''' + self.offset_map = [0 for i in self.offset_map] + glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in + resolved_glyph_map.iteritems()] + glyphs.sort(key=itemgetter(1)) + for glyph_id, offset, sz in glyphs: + self.offset_map[glyph_id] = offset + self.offset_map[glyph_id+1] = offset + sz + # Fix all zero entries to be the same as the previous entry, which + # means that if the ith entry is zero, the i-1 glyph is not present. 
+ for i in xrange(1, len(self.offset_map)): + if self.offset_map[i] == 0: + self.offset_map[i] = self.offset_map[i-1] + + vals = self.offset_map + if self.fmt == 'H': + vals = [i//2 for i in self.offset_map] + + self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals) + + def dump_glyphs(self, sfnt): + if not hasattr(self, 'offset_map'): + self.load_offsets(sfnt[b'head'], sfnt[b'maxp']) + for i in xrange(len(self.offset_map)-1): + off, noff = self.offset_map[i], self.offset_map[i+1] + if noff != off: + print ('Glyph id:', i, 'size:', noff-off) + + diff --git a/src/calibre/utils/fonts/sfnt/subset.py b/src/calibre/utils/fonts/sfnt/subset.py index 085b6255e4..e3883c180d 100644 --- a/src/calibre/utils/fonts/sfnt/subset.py +++ b/src/calibre/utils/fonts/sfnt/subset.py @@ -7,23 +7,73 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' +from collections import OrderedDict +from operator import itemgetter + from calibre.utils.fonts.sfnt.container import Sfnt -from calibre.utils.fonts.sfnt.errors import UnsupportedFont +from calibre.utils.fonts.sfnt.errors import UnsupportedFont, NoGlyphs + +# TrueType outlines {{{ + +def resolve_glyphs(loca, glyf, character_map): + unresolved_glyphs = set(character_map.itervalues()) + unresolved_glyphs.add(0) # We always want the .notdef glyph + resolved_glyphs = {} + + while unresolved_glyphs: + glyph_id = unresolved_glyphs.pop() + try: + offset, length = loca.glyph_location(glyph_id) + except (IndexError, ValueError, KeyError, TypeError): + continue + if length < 1: + continue + glyph = glyf.glyph_data(offset, length) + if len(glyph) == 0: + continue + resolved_glyphs[glyph_id] = glyph + for gid in glyph.glyph_indices: + if gid not in resolved_glyphs: + unresolved_glyphs.add(gid) + + return OrderedDict(sorted(resolved_glyphs.iteritems(), key=itemgetter(0))) def subset_truetype(sfnt, character_map): loca = sfnt[b'loca'] + glyf = sfnt[b'glyf'] + try: head, maxp = sfnt[b'head'], 
sfnt[b'maxp'] except KeyError: raise UnsupportedFont('This font does not contain head and/or maxp tables') loca.load_offsets(head, maxp) + resolved_glyphs = resolve_glyphs(loca, glyf, character_map) + if not resolved_glyphs or set(resolved_glyphs) == {0}: + raise NoGlyphs('This font has no glyphs for the specified character ' + 'set, subsetting it is pointless') + + # Keep only character codes that have resolved glyphs + for code, glyph_id in tuple(character_map.iteritems()): + if glyph_id not in resolved_glyphs: + del character_map[code] + + # Update the glyf table + glyph_offset_map = glyf.update(resolved_glyphs) + + # Update the loca table + loca.subset(glyph_offset_map) + +# }}} + def subset(raw, individual_chars, ranges=()): chars = list(map(ord, individual_chars)) for r in ranges: chars += list(xrange(ord(r[0]), ord(r[1])+1)) sfnt = Sfnt(raw) + old_sizes = sfnt.sizes() + # Remove the Digital Signature table since it is useless in a subset # font anyway sfnt.pop(b'DSIG', None) @@ -35,16 +85,186 @@ def subset(raw, individual_chars, ranges=()): # Get mapping of chars to glyph ids for all specified chars character_map = cmap.get_character_map(chars) - # Restrict the cmap table to only contain entries for the specified chars - cmap.set_character_map(character_map) if b'loca' in sfnt and b'glyf' in sfnt: + # TrueType Outlines subset_truetype(sfnt, character_map) elif b'CFF ' in sfnt: + # PostScript Outlines raise UnsupportedFont('This font contains PostScript outlines, ' 'subsetting not supported') else: raise UnsupportedFont('This font does not contain TrueType ' 'or PostScript outlines') + # Restrict the cmap table to only contain entries for the resolved glyphs + cmap.set_character_map(character_map) + + raw, new_sizes = sfnt() + return raw, old_sizes, new_sizes + +# CLI {{{ +def option_parser(): + import textwrap + from calibre.utils.config import OptionParser + parser = OptionParser(usage=textwrap.dedent('''\ + %prog [options] input_font_file output_font_file 
characters_to_keep + + Subset the specified font, keeping only the glyphs for the characters in + characters_to_keep. characters_to_keep is a comma separated list of characters of + the form: a,b,c,A-Z,0-9,xyz + + You can specify ranges in the list of characters, as shown above. + ''')) + parser.add_option('-c', '--codes', default=False, action='store_true', + help='If specified, the list of characters is interpreted as ' + 'numeric unicode codes instead of characters. So to specify the ' + 'characters a,b you would use 97,98') + parser.prog = 'subset-font' + return parser + +def print_stats(old_stats, new_stats): + from calibre import prints + prints('========= Table comparison (original vs. subset) =========') + prints('Table', ' ', '%10s'%'Size', ' ', 'Percent', ' ', '%10s'%'New Size', + ' New Percent') + prints('='*80) + old_total = sum(old_stats.itervalues()) + new_total = sum(new_stats.itervalues()) + tables = sorted(old_stats.iterkeys(), key=lambda x:old_stats[x], + reverse=True) + for table in tables: + osz = old_stats[table] + op = osz/old_total * 100 + nsz = new_stats.get(table, 0) + np = nsz/new_total * 100 + suffix = ' | same size' + if nsz != osz: + suffix = ' | reduced to %.1f %%'%(nsz/osz * 100) + prints('%4s'%table, ' ', '%10s'%osz, ' ', '%5.1f %%'%op, ' ', + '%10s'%nsz, ' ', '%5.1f %%'%np, suffix) + prints('='*80) + + +def main(args): + import sys, time + from calibre import prints + parser = option_parser() + opts, args = parser.parse_args(args) + if len(args) < 4 or len(args) > 4: + parser.print_help() + raise SystemExit(1) + iff, off, chars = args[1:] + with open(iff, 'rb') as f: + orig = f.read() + + chars = [x.strip() for x in chars.split(',')] + individual, ranges = set(), set() + + def not_single(c): + if len(c) > 1: + prints(c, 'is not a single character', file=sys.stderr) + raise SystemExit(1) + + for c in chars: + if '-' in c: + parts = [x.strip() for x in c.split('-')] + if len(parts) != 2: + prints('Invalid range:', c, file=sys.stderr) 
+ raise SystemExit(1) + if opts.codes: + parts = tuple(map(unichr, map(int, parts))) + map(not_single, parts) + ranges.add(tuple(parts)) + else: + if opts.codes: + c = unichr(int(c)) + not_single(c) + individual.add(c) + st = time.time() + sf, old_stats, new_stats = subset(orig, individual, ranges) + taken = time.time() - st + reduced = (len(sf)/len(orig)) * 100 + def sz(x): + return '%gKB'%(len(x)/1024.) + print_stats(old_stats, new_stats) + prints('Original size:', sz(orig), 'Subset size:', sz(sf), 'Reduced to: %g%%'%(reduced)) + prints('Subsetting took %g seconds'%taken) + with open(off, 'wb') as f: + f.write(sf) + prints('Subset font written to:', off) + +if __name__ == '__main__': + try: + import init_calibre + init_calibre + except ImportError: + pass + import sys + main(sys.argv) +# }}} + +# Tests {{{ +def test_mem(): + from calibre.utils.mem import memory + import gc + gc.collect() + start_mem = memory() + raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True) + calls = 1000 + for i in xrange(calls): + subset(raw, (), (('a', 'z'),)) + del raw + for i in xrange(3): gc.collect() + print ('Leaked memory per call:', (memory() - start_mem)/calls*1024, 'KB') + +def test(): + raw = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True) + sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ()) + if len(sf) > 0.3 * len(raw): + raise Exception('Subsetting failed') + +def all(): + from calibre.utils.fonts.scanner import font_scanner + failed = [] + unsupported = [] + total = 0 + for family in font_scanner.find_font_families(): + for font in font_scanner.fonts_for_family(family): + raw = font_scanner.get_font_data(font) + print ('Subsetting', font['full_name'], end='\t') + total += 1 + try: + sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ()) + except NoGlyphs: + print('No glyphs!') + continue + except UnsupportedFont as e: + unsupported.append((font['full_name'], font['path'], unicode(e))) + print ('Unsupported!') + continue + 
except Exception as e: + print ('Failed!') + failed.append((font['full_name'], font['path'], unicode(e))) + else: + print ('Reduced to:', '%.1f'%( + sum(new_stats.itervalues())/sum(old_stats.itervalues()) + * 100), '%') + if unsupported: + print ('\n\nUnsupported:') + for name, path, err in unsupported: + print (name, path, err) + print() + if failed: + print ('\n\nFailures:') + for name, path, err in failed: + print (name, path, err) + print() + + print('Total:', total, 'Unsupported:', len(unsupported), 'Failed:', + len(failed)) + + +# }}} + diff --git a/src/calibre/utils/fonts/subset.py b/src/calibre/utils/fonts/subset.py index f000f2d608..777d72971d 100644 --- a/src/calibre/utils/fonts/subset.py +++ b/src/calibre/utils/fonts/subset.py @@ -120,6 +120,7 @@ def all(): try: sf, old_stats, new_stats = subset(raw, set(('a', 'b', 'c')), ()) except NoGlyphs: + print ('No glyphs!') continue except UnsupportedFont as e: unsupported.append((font['full_name'], font['path'], unicode(e)))