Start work on a pure Python implementation of font subsetting, since I really don't like sfntly

2025-07-09 03:04:10 -04:00 · 2012-11-05 23:42:07 +05:30 · 2012-11-05 23:42:07 +05:30 · a7f054ec5c
commit a7f054ec5c
parent f2e6dd1cce
7 changed files with 491 additions and 10 deletions
--- a/src/calibre/utils/fonts/sfnt/init.py
+++ b/src/calibre/utils/fonts/sfnt/init.py
@ -7,6 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from datetime import datetime, timedelta
 def align_block(raw, multiple=4, pad=b'\0'):
    '''
@ -17,5 +18,48 @@ def align_block(raw, multiple=4, pad=b'\0'):
    if extra == 0: return raw
    return raw + pad*(multiple - extra)
 class UnknownTable(object):
    '''
    Generic wrapper around the raw data of a single table. Calling the
    instance returns whatever is currently stored in :attr:`raw`.
    '''

    def __init__(self, raw):
        # The data is kept exactly as passed in, no parsing is done here.
        self.raw = raw

    def __call__(self):
        ''' Return the raw data currently held by this table. '''
        return self.raw
 class DateTimeProperty(object):
    '''
    Descriptor that presents an integer attribute holding a number of seconds
    counted from 1904-01-01 00:00 as a :class:`datetime` object.

    :param name: Name of the attribute on the owning object that stores the
                 integer number of seconds.
    '''

    def __init__(self, name):
        self.name = name

    def __get__(self, obj, type=None):
        # Stored seconds -> datetime
        seconds = getattr(obj, self.name)
        return datetime(1904, 1, 1) + timedelta(seconds=seconds)

    def __set__(self, obj, val):
        # datetime -> whole seconds since the 1904 epoch (fraction truncated)
        elapsed = val - datetime(1904, 1, 1)
        setattr(obj, self.name, int(elapsed.total_seconds()))
 class FixedProperty(object):
    '''
    Descriptor that presents an integer attribute holding a 16.16 fixed point
    number (i.e. the real value multiplied by 2**16) as a float.

    :param name: Name of the attribute on the owning object that stores the
                 fixed point integer.
    '''

    def __init__(self, name):
        self.name = name

    def __get__(self, obj, type=None):
        val = getattr(obj, self.name)
        return val * (2**-16)

    def __set__(self, obj, val):
        # Store the converted value on the object. Merely returning it (the
        # return value of __set__ is ignored) would silently drop the
        # assignment.
        setattr(obj, self.name, int(round(val*(2**16))))
 def max_power_of_two(x):
    """
    Return the highest exponent of two, so that
    (2 ** exponent) <= x

    For x <= 1 (including zero and negative numbers) 0 is returned.
    """
    exponent = 0
    # Test against 1 rather than for truthiness: right shifting a negative
    # number converges to -1, never to 0, which would loop forever.
    while x > 1:
        x >>= 1
        exponent += 1
    return exponent
--- a/src/calibre/utils/fonts/sfnt/cmap.py
+++ b/src/calibre/utils/fonts/sfnt/cmap.py
@ -0,0 +1,235 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 # Note that the code for creating a BMP table (cmap format 4) is taken with
 # thanks from the fonttools project (BSD licensed).
 from struct import unpack_from, calcsize, pack
 from collections import OrderedDict
 from calibre.utils.fonts.utils import get_bmp_glyph_ids
 from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 def split_range(start_code, end_code, cmap): # {{{
    '''
    Try to split the contiguous range of character codes
    [start_code, end_code] into sub-ranges with consecutive glyph ids, in
    such a way that the cmap format 4 subtable can be stored "most"
    efficiently.

    :param cmap: Mapping of character code to glyph id. Must contain every
                 code in the range.
    :return: ``(start, end)``, two lists of segment start and end codes. The
             first start code (which is always start_code) is left out, so
             ``start`` has one entry less than ``end``.
    '''
    if start_code == end_code:
        return [], [end_code]

    # Collect the maximal runs of codes whose glyph ids increase by exactly
    # one. run_begin is None whenever we are not inside such a run.
    runs = []
    run_begin = None
    prev_code, prev_gid = start_code, cmap[start_code]
    for code in range(start_code + 1, end_code + 1):
        gid = cmap[code]
        if gid - 1 == prev_gid:
            if run_begin is None:
                run_begin = prev_code
        elif run_begin is not None:
            runs.append((run_begin, prev_code))
            run_begin = None
        prev_code, prev_gid = code, gid
    if run_begin is not None:
        runs.append((run_begin, prev_code))
    assert prev_code == end_code

    # Drop the runs that would only make the data bigger. A new segment
    # costs 8 bytes, not using a new segment costs 2 bytes per character.
    segments = []
    for first, last in runs:
        if first == start_code and last == end_code:
            break  # the run is the whole range, nothing to split
        if first == start_code or last == end_code:
            min_size = 4  # split costs one more segment
        else:
            min_size = 8  # split costs two more segments
        if (last - first + 1) > min_size:
            segments.append((first, last))

    if not segments:
        return [], [end_code]

    # Make sure the segments reach both ends of the requested range
    if segments[0][0] != start_code:
        segments.insert(0, (start_code, segments[0][0] - 1))
    if segments[-1][1] != end_code:
        segments.append((segments[-1][1] + 1, end_code))

    # Fill the "holes" between the segments -- those are the segments in
    # which the glyph ids are _not_ consecutive.
    filled = [segments[0]]
    for seg in segments[1:]:
        hole_begin = filled[-1][1] + 1
        if hole_begin != seg[0]:
            filled.append((hole_begin, seg[0] - 1))
        filled.append(seg)

    # Transform the segments into start/end code lists, without the very
    # first start code.
    start = [first for first, last in filled][1:]
    end = [last for first, last in filled]
    assert len(start) + 1 == len(end)
    return start, end
 # }}}
 def set_id_delta(id_delta): # {{{
    '''
    Wrap id_delta into the range of a signed 16 bit integer
    (-0x8000 to 0x7FFF inclusive), so that it can be stored as a short.

    :param id_delta: The difference between a glyph id and a character code.
    :return: A value that is equal to id_delta modulo 0x10000 and fits in a
             short, for any id_delta between -0x18000 and 0x17FFF.
    '''
    # The lowest gid in glyphIndexArray, after subtracting id_delta, must be 1.
    # id_delta is a short, and must be between -32K and 32K-1.
    # startCode can be between 0 and 64K-1, and the first glyph index can be
    # between 1 and 64K-1. This means that we have a problem because we can
    # need to assign to id_delta values between -(64K-2) and 64K-1.
    # Since the final gid is reconstructed from the glyphArray gid by:
    #    (short)finalGID = (gid + id_delta) % 0x10000,
    # we can get from a startCode of 0 to a final gid of 64K-1 by subtracting
    # 1, and casting the negative number to an unsigned short.
    # Similarly, we can get from a startCode of 64K-1 to a final gid of 1 by
    # adding 2, because of the modulo arithmetic.
    if id_delta > 0x7FFF:
        id_delta -= 0x10000
    elif id_delta < -0x8000:
        # -0x8000 itself is a valid short and must be left alone, wrapping it
        # would give 0x8000 which cannot be packed as a signed short.
        id_delta += 0x10000
    return id_delta
 # }}}
 class CmapTable(UnknownTable):
    '''
    The cmap table. Only the Windows BMP subtable (platform 3, encoding 1,
    format 4) is used; it is stored in :attr:`bmp_table`, which is None if
    no such subtable was found.
    '''

    def __init__(self, *args, **kwargs):
        super(CmapTable, self).__init__(*args, **kwargs)

        self.version, self.num_tables = unpack_from(b'>HH', self.raw)

        self.tables = {}

        # Read the encoding records: (platform, encoding, subtable offset)
        offset = 4
        sz = calcsize(b'>HHL')
        recs = []
        for i in xrange(self.num_tables):
            platform, encoding, table_offset = unpack_from(b'>HHL', self.raw,
                    offset)
            offset += sz
            recs.append((platform, encoding, table_offset))

        self.bmp_table = None

        for i, (platform, encoding, offset) in enumerate(recs):
            # A subtable is taken to extend up to the offset of the next
            # record (or the end of the data for the last record).
            try:
                next_offset = recs[i+1][-1]
            except IndexError:
                next_offset = len(self.raw)
            table = self.raw[offset:next_offset]
            if len(table) < 2:
                # Records that share an offset with the following record (or
                # that are not in ascending order) yield an empty slice,
                # which has no format field to read.
                continue
            fmt = unpack_from(b'>H', table)[0]
            if platform == 3 and encoding == 1 and fmt == 4:
                self.bmp_table = table

    def get_character_map(self, chars):
        '''
        Get a mapping of character codes to glyph ids in the font.

        :param chars: Iterable of character codes to look up.
        :return: An OrderedDict, sorted by character code, that contains only
                 the codes that map to a glyph id greater than zero.
        :raises UnsupportedFont: If there is no Windows BMP subtable.
        '''
        if self.bmp_table is None:
            raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                    ' Most likely a special purpose font.')
        chars = sorted(set(chars))
        ans = OrderedDict()
        for i, glyph_id in enumerate(get_bmp_glyph_ids(self.bmp_table, 0,
            chars)):
            if glyph_id > 0:
                ans[chars[i]] = glyph_id
        return ans

    def set_character_map(self, cmap):
        '''
        Replace the contents of this table with a single Windows BMP
        (platform 3, encoding 1) format 4 subtable built from cmap. Both
        :attr:`bmp_table` and :attr:`raw` are regenerated.

        :param cmap: Mapping of character code to glyph id.
        '''
        self.version, self.num_tables = 0, 1
        fmt = b'>7H'
        codes = sorted(cmap)

        # Build the segments (runs of contiguous character codes)
        if not codes:
            start_code = [0xffff]
            end_code = [0xffff]
        else:
            last_code = codes[0]
            end_code = []
            start_code = [last_code]
            for code in codes[1:]:
                if code == last_code + 1:
                    last_code = code
                    continue
                # Gap in the character codes: finish the current segment,
                # possibly splitting it further, and open a new one.
                start, end = split_range(start_code[-1], last_code, cmap)
                start_code.extend(start)
                end_code.extend(end)
                start_code.append(code)
                last_code = code
            end_code.append(last_code)
            start_code.append(0xffff)
            end_code.append(0xffff)

        # Build the per segment idDelta/idRangeOffset values and the array
        # of explicit glyph ids.
        id_delta = []
        id_range_offset = []
        glyph_index_array = []
        for i in xrange(len(end_code)-1):  # skip the closing codes (0xffff)
            indices = [cmap[char_code] for char_code in
                    xrange(start_code[i], end_code[i] + 1)]
            # The expected ids must be materialized as a list, a list never
            # compares equal to an xrange object.
            consecutive = list(xrange(indices[0], indices[0] + len(indices)))
            if indices == consecutive:
                # Glyph ids are consecutive, a constant delta is enough
                id_delta.append(set_id_delta(indices[0] - start_code[i]))
                id_range_offset.append(0)
            else:
                # Glyph ids have to be listed explicitly. The offset is in
                # bytes, counted from this segment's idRangeOffset entry.
                id_delta.append(0)
                id_range_offset.append(2 * (len(end_code) +
                    len(glyph_index_array) - i))
                glyph_index_array.extend(indices)
        id_delta.append(1)  # 0xffff + 1 == 0. So this end code maps to .notdef
        id_range_offset.append(0)

        # Binary search parameters for the subtable header
        seg_count = len(end_code)
        max_exponent = max_power_of_two(seg_count)
        search_range = 2 * (2 ** max_exponent)
        entry_selector = max_exponent
        range_shift = 2 * seg_count - search_range

        # Serialize: end codes, a zero pad word, start codes, deltas, range
        # offsets and finally the glyph id array.
        char_code_array = end_code + [0] + start_code
        char_code_array = pack(b'>%dH'%len(char_code_array), *char_code_array)
        id_delta_array = pack(b'>%dh'%len(id_delta), *id_delta)
        rest_array = id_range_offset + glyph_index_array
        rest_array = pack(b'>%dH'%len(rest_array), *rest_array)
        data = char_code_array + id_delta_array + rest_array

        length = calcsize(fmt) + len(data)
        header = pack(fmt, 4, length, 0,
                2*seg_count, search_range, entry_selector, range_shift)
        self.bmp_table = header + data

        # Table header followed by the single encoding record, the subtable
        # starts right after them.
        fmt = b'>4HL'
        offset = calcsize(fmt)
        self.raw = pack(fmt, self.version, self.num_tables, 3, 1, offset) + \
                self.bmp_table
--- a/src/calibre/utils/fonts/sfnt/container.py
+++ b/src/calibre/utils/fonts/sfnt/container.py
@ -7,22 +7,17 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from math import log
 from struct import pack, calcsize
 from io import BytesIO
 from calibre.utils.fonts.utils import (get_tables, checksum_of_block,
        verify_checksums)
-from calibre.utils.fonts.sfnt import align_block
+from calibre.utils.fonts.sfnt import align_block, UnknownTable, max_power_of_two
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont
-class UnknownTable(object):
+from calibre.utils.fonts.sfnt.head import HeadTable
-
+from calibre.utils.fonts.sfnt.maxp import MaxpTable
-    def __init__(self, raw):
+from calibre.utils.fonts.sfnt.loca import LocaTable
        self.raw = raw
    def __call__(self):
        return self.raw
 class Sfnt(object):
@ -37,8 +32,23 @@ class Sfnt(object):
        self.tables = {}
        for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw):
            self.tables[table_tag] = {
                    b'head' : HeadTable,
                    b'maxp' : MaxpTable,
                    b'loca' : LocaTable,
                    }.get(table_tag, UnknownTable)(table)
    def __getitem__(self, key):
        return self.tables[key]
    def __contains__(self, key):
        return key in self.tables
    def __delitem__(self, key):
        del self.tables[key]
    def pop(self, key, default=None):
        return self.tables.pop(key, default)
    def __call__(self):
        stream = BytesIO()
@ -49,7 +59,7 @@ class Sfnt(object):
        # Write header
        num_tables = len(self.tables)
-        ln2 = int(log(num_tables, 2))
+        ln2 = max_power_of_two(num_tables)
        srange = (2**ln2) * 16
        spack(b'>4s4H',
            self.sfnt_version, num_tables, srange, ln2, num_tables * 16 - srange)
--- a/src/calibre/utils/fonts/sfnt/head.py
+++ b/src/calibre/utils/fonts/sfnt/head.py
@ -0,0 +1,53 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from itertools import izip
 from struct import unpack_from, pack
 from calibre.utils.fonts.sfnt import UnknownTable, DateTimeProperty
 class HeadTable(UnknownTable):
    '''
    The head table. Every field is available as an attribute of the same
    name. After changing attributes, call :meth:`update` to regenerate
    :attr:`raw`.
    '''

    # The dates are stored as seconds since 1904, expose them as datetimes
    created = DateTimeProperty('_created')
    modified = DateTimeProperty('_modified')

    def __init__(self, *args, **kwargs):
        super(HeadTable, self).__init__(*args, **kwargs)

        # Alternating field name, struct format character
        field_types = (
                'version_number' , 'L',
                'font_revision'  , 'L',
                'checksum_adjustment' , 'L',
                'magic_number' , 'L',
                'flags' , 'H',
                'units_per_em' , 'H',
                '_created' , 'q',
                '_modified' , 'q',
                # The bounding box is made of signed values, reading it as
                # unsigned turns negative coordinates into large positive
                # ones.
                'x_min' , 'h',
                'y_min' , 'h',
                'x_max' , 'h',
                'y_max' , 'h',
                'mac_style' , 'H',
                'lowest_rec_ppem' , 'H',
                'font_direction_hint' , 'h',
                'index_to_loc_format' , 'h',
                'glyph_data_format'   , 'h'
        )

        self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
        self._fields = field_types[0::2]

        for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
            setattr(self, f, val)

    def update(self):
        ''' Regenerate :attr:`raw` from the current attribute values. '''
        vals = [getattr(self, f) for f in self._fields]
        self.raw = pack(self._fmt, *vals)
--- a/src/calibre/utils/fonts/sfnt/loca.py
+++ b/src/calibre/utils/fonts/sfnt/loca.py
@ -0,0 +1,31 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from struct import calcsize, unpack_from
 from calibre.utils.fonts.sfnt import UnknownTable
 class LocaTable(UnknownTable):
    '''
    The loca table. :meth:`load_offsets` must be called before
    :meth:`glyph_location` can be used.
    '''

    def load_offsets(self, head_table, maxp_table):
        '''
        Parse the raw data into :attr:`offset_map`, keeping at most
        num_glyphs + 1 entries.

        :param head_table: Object whose index_to_loc_format attribute selects
                           between 16 bit entries (when it is 0) and 32 bit
                           entries (otherwise).
        :param maxp_table: Object whose num_glyphs attribute gives the number
                           of glyphs.
        '''
        code = 'H' if head_table.index_to_loc_format == 0 else 'L'
        glyph_count = maxp_table.num_glyphs
        entry_size = calcsize(('>%s'%code).encode('ascii'))
        entry_count = len(self.raw)//entry_size
        spec = ('>%d%s'%(entry_count, code)).encode('ascii')
        self.offset_map = unpack_from(spec, self.raw)
        self.offset_map = self.offset_map[:glyph_count+1]
        if code == 'H':
            # 16 bit entries are stored divided by two
            self.offset_map = [2*i for i in self.offset_map]

    def glyph_location(self, glyph_id):
        '''
        Return the tuple ``(offset, length)`` for the glyph glyph_id,
        computed from its entry and the following entry in
        :attr:`offset_map`.
        '''
        begin = self.offset_map[glyph_id]
        end = self.offset_map[glyph_id+1]
        return begin, end - begin
--- a/src/calibre/utils/fonts/sfnt/maxp.py
+++ b/src/calibre/utils/fonts/sfnt/maxp.py
@ -0,0 +1,58 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from itertools import izip
 from struct import unpack_from, pack
 from calibre.utils.fonts.sfnt import UnknownTable
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 class MaxpTable(UnknownTable):
    '''
    The maxp table. The fields present depend on the table version:
    0x5000 has only the version and the number of glyphs, 0x10000 has
    thirteen more 16 bit fields. After changing attributes, call
    :meth:`update` to regenerate :attr:`raw`.
    '''

    def __init__(self, *args, **kwargs):
        super(MaxpTable, self).__init__(*args, **kwargs)

        # Every version starts with these two fields
        self._fmt = b'>LH'
        self._fields = ('_version', 'num_glyphs')
        self._version, self.num_glyphs = unpack_from(self._fmt, self.raw)

        if self._version >= 0x10000:
            # Switch to the 0x10000 layout and read all fields again. This
            # also restores the actual version number from the data.
            self.version = 0x10000
            vals = unpack_from(self._fmt, self.raw)
            for f, val in zip(self._fields, vals):
                setattr(self, f, val)

    @property
    def fields(self):
        ''' The names of the fields of this table, in storage order. '''
        return self._fields

    @property
    def version(self):
        '''
        The version of this table. Setting it to 0x5000 or 0x10000 also
        selects the matching set of fields and struct format.
        '''
        return self._version

    @version.setter
    def version(self, val):
        if val == 0x5000:
            self._fmt = b'>LH'
            self._fields = ('_version', 'num_glyphs')
        elif val == 0x10000:
            self._fields = ('_version', 'num_glyphs', 'max_points',
                    'max_contours', 'max_composite_points',
                    'max_composite_contours', 'max_zones',
                    'max_twilight_points', 'max_storage', 'max_function_defs',
                    'max_instruction_defs', 'max_stack_elements',
                    'max_size_of_instructions', 'max_component_elements',
                    'max_component_depth')
            self._fmt = b'>LH' + b'H'*(len(self._fields)-2)
        self._version = val

    def update(self):
        '''
        Regenerate :attr:`raw` from the current attribute values.

        :raises UnsupportedFont: If the table version is greater than 0x10000.
        '''
        if self._version > 0x10000:
            raise UnsupportedFont('maxp table with version > 0x10000 not modifiable')
        vals = [getattr(self, f) for f in self._fields]
        self.raw = pack(self._fmt, *vals)
--- a/src/calibre/utils/fonts/sfnt/subset.py
+++ b/src/calibre/utils/fonts/sfnt/subset.py
@ -0,0 +1,50 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre.utils.fonts.sfnt.container import Sfnt
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont
 def subset_truetype(sfnt, character_map):
    '''
    Load the glyph offsets of the loca table of sfnt, using its head and
    maxp tables. character_map is currently not used.

    :raises UnsupportedFont: If sfnt has no head or no maxp table.
    '''
    loca = sfnt[b'loca']

    try:
        head = sfnt[b'head']
        maxp = sfnt[b'maxp']
    except KeyError:
        raise UnsupportedFont('This font does not contain head and/or maxp tables')

    loca.load_offsets(head, maxp)
 def subset(raw, individual_chars, ranges=()):
    '''
    Restrict the cmap table of the font in raw to the specified characters
    and, for fonts with TrueType outlines, start subsetting the outlines.

    :param raw: The font data, passed to :class:`Sfnt`.
    :param individual_chars: Iterable of single characters.
    :param ranges: Iterable of items whose first and second elements are the
                   first and last character of an inclusive range.
    :raises UnsupportedFont: If the font has no cmap table, has PostScript
                             outlines or has no outlines at all.
    '''
    chars = [ord(c) for c in individual_chars]
    for r in ranges:
        chars.extend(xrange(ord(r[0]), ord(r[1])+1))

    sfnt = Sfnt(raw)

    # Remove the Digital Signature table since it is useless in a subset
    # font anyway
    sfnt.pop(b'DSIG', None)

    try:
        cmap = sfnt[b'cmap']
    except KeyError:
        raise UnsupportedFont('This font has no cmap table')

    # Get mapping of chars to glyph ids for all specified chars
    character_map = cmap.get_character_map(chars)
    # Restrict the cmap table to only contain entries for the specified chars
    cmap.set_character_map(character_map)

    if b'loca' in sfnt and b'glyf' in sfnt:
        subset_truetype(sfnt, character_map)
        return

    if b'CFF ' in sfnt:
        raise UnsupportedFont('This font contains PostScript outlines, '
                'subsetting not supported')

    raise UnsupportedFont('This font does not contain TrueType '
            'or PostScript outlines')