Start work on pure python implementation of font subsetting, since I really don't like sfntly

2025-08-30 23:00:21 -04:00 · 2012-11-05 23:42:07 +05:30 · 2012-11-05 23:42:07 +05:30 · a7f054ec5c
commit a7f054ec5c
parent f2e6dd1cce
7 changed files with 491 additions and 10 deletions
--- a/src/calibre/utils/fonts/sfnt/__init__.py
+++ b/src/calibre/utils/fonts/sfnt/__init__.py
@ -7,6 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

+from datetime import datetime, timedelta

 def align_block(raw, multiple=4, pad=b'\0'):
    '''
@ -17,5 +18,48 @@ def align_block(raw, multiple=4, pad=b'\0'):
    if extra == 0: return raw
    return raw + pad*(multiple - extra)

+class UnknownTable(object):
+    '''
+    Opaque wrapper around the raw bytes of a font table.
+
+    Calling an instance returns the wrapped bytes.
+    '''
+
+    def __init__(self, raw):
+        # raw: the table data, stored exactly as passed in
+        self.raw = raw
+
+    def __call__(self):
+        # Return whatever is currently stored in self.raw
+        return self.raw
+
+class DateTimeProperty(object):
+    '''
+    Descriptor that exposes an integer number of seconds since
+    1904-01-01 00:00:00, stored on the instance under the attribute
+    ``name``, as a naive :class:`datetime.datetime`.
+
+    Reading converts the stored integer to a datetime, assigning a datetime
+    stores the corresponding (truncated) number of whole seconds.
+    '''
+
+    # The point in time the stored second counts are relative to
+    _epoch = datetime(1904, 1, 1)
+
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, type=None):
+        if obj is None:
+            # Accessed on the class rather than on an instance: return the
+            # descriptor itself instead of failing in getattr(None, ...)
+            return self
+        return self._epoch + timedelta(seconds=getattr(obj, self.name))
+
+    def __set__(self, obj, val):
+        td = val - self._epoch
+        setattr(obj, self.name, int(td.total_seconds()))
+
+class FixedProperty(object):
+    '''
+    Descriptor that exposes a 16.16 fixed point number, stored as an integer
+    on the instance under the attribute ``name``, as a float.
+
+    Reading divides the stored integer by 2**16, assigning stores the value
+    multiplied by 2**16 and rounded to the nearest integer.
+    '''
+
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, type=None):
+        if obj is None:
+            # Accessed on the class rather than on an instance
+            return self
+        val = getattr(obj, self.name)
+        return val * (2**-16)
+
+    def __set__(self, obj, val):
+        # The converted value has to be written back to the instance; the
+        # return value of __set__ is ignored by Python, so merely returning
+        # it would silently discard the assignment.
+        setattr(obj, self.name, int(round(val*(2**16))))
+
+def max_power_of_two(x):
+    """
+    Return the highest exponent of two, so that
+    (2 ** exponent) <= x
+
+    Returns 0 when no such exponent exists (x < 1).
+    """
+    exponent = 0
+    # Test x > 0 rather than the truthiness of x: right shifting a negative
+    # integer converges to -1, never to 0, so ``while x`` would loop forever.
+    while x > 0:
+        x = x >> 1
+        exponent += 1
+    return max(exponent - 1, 0)


--- a/src/calibre/utils/fonts/sfnt/cmap.py
+++ b/src/calibre/utils/fonts/sfnt/cmap.py
@ -0,0 +1,235 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+# Note that the code for creating a BMP table (cmap format 4) is taken with
+# thanks from the fonttools project (BSD licensed).
+
+from struct import unpack_from, calcsize, pack
+from collections import OrderedDict
+
+from calibre.utils.fonts.utils import get_bmp_glyph_ids
+from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
+from calibre.utils.fonts.sfnt.errors import UnsupportedFont
+
+def split_range(start_code, end_code, cmap): # {{{
+	'''
+	Split the run of character codes ``start_code`` .. ``end_code``
+	(inclusive) into sub-ranges, depending on where the glyph ids looked up
+	in ``cmap`` are consecutive.
+
+	:param cmap: Indexed by every character code in the run, giving the
+	glyph id for that code.
+	:return: ``(start, end)``. ``end`` holds the end code of every
+	resulting sub-range, ``start`` the start code of every sub-range except
+	the first, so ``len(start) + 1 == len(end)``. When the run is not split
+	the result is ``([], [end_code])``.
+	'''
+	# Try to split a range of character codes into subranges with consecutive
+	# glyph IDs in such a way that the cmap4 subtable can be stored "most"
+	# efficiently.
+	if start_code == end_code:
+		return [], [end_code]
+
+	last_id = cmap[start_code]
+	last_code = start_code
+	# in_order is 1 while inside a run of consecutive glyph ids and None/0
+	# while outside of one; ordered_begin is the first code of the current run
+	in_order = None
+	ordered_begin = None
+	sub_ranges = []
+
+	# Gather subranges in which the glyph IDs are consecutive.
+	for code in range(start_code + 1, end_code + 1):
+		glyph_id = cmap[code]
+
+		if glyph_id - 1 == last_id:
+			if in_order is None or not in_order:
+				in_order = 1
+				ordered_begin = last_code
+		else:
+			if in_order:
+				in_order = 0
+				sub_ranges.append((ordered_begin, last_code))
+				ordered_begin = None
+
+		last_id = glyph_id
+		last_code = code
+
+	# Close a run that is still open at the end of the range
+	if in_order:
+		sub_ranges.append((ordered_begin, last_code))
+	assert last_code == end_code
+
+	# Now filter out those new subranges that would only make the data bigger.
+	# A new segment cost 8 bytes, not using a new segment costs 2 bytes per
+	# character.
+	new_ranges = []
+	for b, e in sub_ranges:
+		if b == start_code and e == end_code:
+			break  # the whole range, we're fine
+		if b == start_code or e == end_code:
+			threshold = 4  # split costs one more segment
+		else:
+			threshold = 8  # split costs two more segments
+		if (e - b + 1) > threshold:
+			new_ranges.append((b, e))
+	sub_ranges = new_ranges
+
+	if not sub_ranges:
+		return [], [end_code]
+
+	# Make sure the sub-ranges cover the beginning and the end of the range
+	if sub_ranges[0][0] != start_code:
+		sub_ranges.insert(0, (start_code, sub_ranges[0][0] - 1))
+	if sub_ranges[-1][1] != end_code:
+		sub_ranges.append((sub_ranges[-1][1] + 1, end_code))
+
+	# Fill the "holes" in the segments list -- those are the segments in which
+	# the glyph IDs are _not_ consecutive.
+	i = 1
+	while i < len(sub_ranges):
+		if sub_ranges[i-1][1] + 1 != sub_ranges[i][0]:
+			sub_ranges.insert(i, (sub_ranges[i-1][1] + 1, sub_ranges[i][0] - 1))
+			# step over the range that was just inserted
+			i = i + 1
+		i = i + 1
+
+	# Transform the ranges into start_code/end_code lists.
+	start = []
+	end = []
+	for b, e in sub_ranges:
+		start.append(b)
+		end.append(e)
+	# The first sub-range always begins at start_code, so its start code is
+	# dropped from the result
+	start.pop(0)
+
+	assert len(start) + 1 == len(end)
+	return start, end
+# }}}
+
+def set_id_delta(id_delta): # {{{
+    '''
+    Wrap ``id_delta`` (glyph id minus character code) into the range of a
+    signed 16 bit integer, -0x8000 .. 0x7FFF, using modulo 0x10000
+    arithmetic.
+
+    :param id_delta: Integer between -0xFFFF and 0xFFFF.
+    :return: The equivalent value that can be packed as a signed short.
+    '''
+    # id_delta is stored as a short, so it must be between -32K and 32K-1.
+    # Character codes can be between 0 and 64K-1 and glyph ids between 1 and
+    # 64K-1, which means the differences we need to store lie between
+    # -(64K-2) and 64K-1 and do not always fit.
+    # Since the final glyph id is reconstructed by:
+    #    final_gid = (code + id_delta) % 0x10000
+    # a delta outside the range of a short can be replaced by the delta that
+    # differs from it by 64K: e.g. we can get from a code of 0 to a final
+    # glyph id of 64K-1 by subtracting 1, and from a code of 64K-1 to a
+    # final glyph id of 1 by adding 2.
+
+    if id_delta > 0x7FFF:
+        id_delta = id_delta - 0x10000
+    elif id_delta < -0x8000:
+        # -0x8000 itself is a valid short and must be left alone: wrapping
+        # it would give +0x8000, which cannot be packed as a signed short
+        id_delta = id_delta + 0x10000
+
+    return id_delta
+# }}}
+
+class CmapTable(UnknownTable):
+    '''
+    The cmap (character code to glyph id mapping) table.
+
+    Only the Windows BMP subtable (platform 3, encoding 1, format 4) is
+    recognised. Its raw data is kept in ``self.bmp_table``, which is None if
+    the font contains no such subtable.
+    '''
+
+    def __init__(self, *args, **kwargs):
+        super(CmapTable, self).__init__(*args, **kwargs)
+
+        self.version, self.num_tables = unpack_from(b'>HH', self.raw)
+
+        self.tables = {}
+
+        # Read the encoding records that follow the four byte header. Each
+        # one is (platform id, encoding id, offset of the subtable from the
+        # start of the cmap table).
+        offset = 4
+        sz = calcsize(b'>HHL')
+        recs = []
+        for i in xrange(self.num_tables):
+            platform, encoding, table_offset = unpack_from(b'>HHL', self.raw,
+                    offset)
+            offset += sz
+            recs.append((platform, encoding, table_offset))
+
+        self.bmp_table = None
+
+        for i, (platform, encoding, offset) in enumerate(recs):
+            # A subtable is taken to extend up to the offset of the next
+            # record, or to the end of the data for the last record
+            try:
+                next_offset = recs[i+1][-1]
+            except IndexError:
+                next_offset = len(self.raw)
+            table = self.raw[offset:next_offset]
+            fmt = unpack_from(b'>H', table)[0]
+            if platform == 3 and encoding == 1 and fmt == 4:
+                self.bmp_table = table
+
+    def get_character_map(self, chars):
+        '''
+        Get a mapping of character codes to glyph ids in the font.
+
+        :param chars: Iterable of character codes, duplicates are ignored.
+        :return: An OrderedDict mapping character code to glyph id, in order
+        of increasing character code. Codes for which the glyph id is not
+        greater than zero are left out.
+        :raises UnsupportedFont: If the font has no Windows BMP subtable.
+        '''
+        if self.bmp_table is None:
+            raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
+                    ' Most likely a special purpose font.')
+        chars = sorted(set(chars))
+        ans = OrderedDict()
+        for i, glyph_id in enumerate(get_bmp_glyph_ids(self.bmp_table, 0,
+            chars)):
+            if glyph_id > 0:
+                ans[chars[i]] = glyph_id
+        return ans
+
+    def set_character_map(self, cmap):
+        '''
+        Replace the contents of this table with a single Windows BMP
+        (format 4) subtable that maps exactly the character codes in
+        ``cmap``. Both ``self.bmp_table`` and ``self.raw`` are rebuilt.
+
+        :param cmap: Mapping of character code to glyph id.
+        '''
+        self.version, self.num_tables = 0, 1
+        fmt = b'>7H'
+        codes = sorted(cmap)
+
+        if not codes:
+            start_code = [0xffff]
+            end_code = [0xffff]
+        else:
+            # Build the start_code and end_code lists: split the character
+            # codes into runs of consecutive codes, then let split_range()
+            # split each run further depending on the glyph ids.
+            last_code = codes[0]
+            end_code = []
+            start_code = [last_code]
+
+            for code in codes[1:]:
+                if code == last_code + 1:
+                    last_code = code
+                    continue
+                start, end = split_range(start_code[-1], last_code, cmap)
+                start_code.extend(start)
+                end_code.extend(end)
+                start_code.append(code)
+                last_code = code
+            end_code.append(last_code)
+            # The closing segment
+            start_code.append(0xffff)
+            end_code.append(0xffff)
+
+        id_delta = []
+        id_range_offset = []
+        glyph_index_array = []
+        for i in xrange(len(end_code)-1):  # skip the closing codes (0xffff)
+            indices = [cmap[char_code] for char_code in
+                    xrange(start_code[i], end_code[i] + 1)]
+            # The xrange has to be converted to a list for the comparison: a
+            # list never compares equal to an xrange object
+            if indices == list(xrange(indices[0], indices[0] + len(indices))):
+                # Consecutive glyph ids, a delta is all that is needed
+                id_delta_temp = set_id_delta(indices[0] - start_code[i])
+                id_delta.append(id_delta_temp)
+                id_range_offset.append(0)
+            else:
+                # Glyph ids have to be listed individually. The offset is
+                # in bytes, counted from the position of this segment's
+                # entry in id_range_offset.
+                id_delta.append(0)
+                id_range_offset.append(2 * (len(end_code) +
+                    len(glyph_index_array) - i))
+                glyph_index_array.extend(indices)
+        id_delta.append(1)  # 0xffff + 1 == 0. So this end code maps to .notdef
+        id_range_offset.append(0)
+
+        seg_count = len(end_code)
+        max_exponent = max_power_of_two(seg_count)
+        search_range = 2 * (2 ** max_exponent)
+        entry_selector = max_exponent
+        range_shift = 2 * seg_count - search_range
+
+        # end codes, a zero pad word, then the start codes
+        char_code_array = end_code + [0] + start_code
+        char_code_array = pack(b'>%dH'%len(char_code_array), *char_code_array)
+        id_delta_array = pack(b'>%dh'%len(id_delta), *id_delta)
+        rest_array = id_range_offset + glyph_index_array
+        rest_array = pack(b'>%dH'%len(rest_array), *rest_array)
+        data = char_code_array + id_delta_array + rest_array
+
+        # Subtable header: format (4), length, language (0), then the
+        # segment search parameters
+        length = calcsize(fmt) + len(data)
+        header = pack(fmt, 4, length, 0,
+                2*seg_count, search_range, entry_selector, range_shift)
+        self.bmp_table = header + data
+
+        # Table header plus the single encoding record (platform 3,
+        # encoding 1), the subtable follows directly after it
+        fmt = b'>4HL'
+        offset = calcsize(fmt)
+        self.raw = pack(fmt, self.version, self.num_tables, 3, 1, offset) + \
+                self.bmp_table
+
--- a/src/calibre/utils/fonts/sfnt/container.py
+++ b/src/calibre/utils/fonts/sfnt/container.py
@ -7,22 +7,17 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-from math import log
 from struct import pack, calcsize
 from io import BytesIO

 from calibre.utils.fonts.utils import (get_tables, checksum_of_block,
        verify_checksums)
-from calibre.utils.fonts.sfnt import align_block
+from calibre.utils.fonts.sfnt import align_block, UnknownTable, max_power_of_two
 from calibre.utils.fonts.sfnt.errors import UnsupportedFont

-class UnknownTable(object):
-
-    def __init__(self, raw):
-        self.raw = raw
-
-    def __call__(self):
-        return self.raw
+from calibre.utils.fonts.sfnt.head import HeadTable
+from calibre.utils.fonts.sfnt.maxp import MaxpTable
+from calibre.utils.fonts.sfnt.loca import LocaTable

 class Sfnt(object):

@ -37,8 +32,23 @@ class Sfnt(object):
        self.tables = {}
        for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw):
            self.tables[table_tag] = {
+                    b'head' : HeadTable,
+                    b'maxp' : MaxpTable,
+                    b'loca' : LocaTable,
                    }.get(table_tag, UnknownTable)(table)

+    def __getitem__(self, key):
+        # Look up a table by its tag, KeyError if there is no such table
+        return self.tables[key]
+
+    def __contains__(self, key):
+        # True if a table with the tag ``key`` is present
+        return key in self.tables
+
+    def __delitem__(self, key):
+        # Remove the table with the tag ``key``, KeyError if it is absent
+        del self.tables[key]
+
+    def pop(self, key, default=None):
+        # Remove and return the table with the tag ``key``; ``default`` is
+        # returned if there is no such table
+        return self.tables.pop(key, default)
+
    def __call__(self):
        stream = BytesIO()

@ -49,7 +59,7 @@ class Sfnt(object):

        # Write header
        num_tables = len(self.tables)
-        ln2 = int(log(num_tables, 2))
+        ln2 = max_power_of_two(num_tables)
        srange = (2**ln2) * 16
        spack(b'>4s4H',
            self.sfnt_version, num_tables, srange, ln2, num_tables * 16 - srange)
--- a/src/calibre/utils/fonts/sfnt/head.py
+++ b/src/calibre/utils/fonts/sfnt/head.py
@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from itertools import izip
+from struct import unpack_from, pack
+
+from calibre.utils.fonts.sfnt import UnknownTable, DateTimeProperty
+
+class HeadTable(UnknownTable):
+    '''
+    The head (font header) table.
+
+    Every field of the table is unpacked into an attribute of the same name
+    as listed in ``self._fields``. The creation and modification times are
+    stored as integers in ``_created`` and ``_modified`` and exposed as
+    datetime objects via ``created`` and ``modified``. Call :meth:`update`
+    to rebuild ``self.raw`` after changing any field.
+    '''
+
+    created = DateTimeProperty('_created')
+    modified = DateTimeProperty('_modified')
+
+    def __init__(self, *args, **kwargs):
+        super(HeadTable, self).__init__(*args, **kwargs)
+
+        # Alternating field name, struct format code; in the order in which
+        # the fields are stored in the table
+        field_types = (
+                'version_number' , 'L',
+                'font_revision'  , 'L',
+                'checksum_adjustment' , 'L',
+                'magic_number' , 'L',
+                'flags' , 'H',
+                'units_per_em' , 'H',
+                '_created' , 'q',
+                '_modified' , 'q',
+                # The bounding box values are signed 16 bit integers, glyphs
+                # routinely extend to the left of/below the origin
+                'x_min' , 'h',
+                'y_min' , 'h',
+                'x_max' , 'h',
+                'y_max' , 'h',
+                'mac_style' , 'H',
+                'lowest_rec_ppem' , 'H',
+                'font_direction_hint' , 'h',
+                'index_to_loc_format' , 'h',
+                'glyph_data_format'   , 'h'
+        )
+
+        self._fmt = ('>%s'%(''.join(field_types[1::2]))).encode('ascii')
+        self._fields = field_types[0::2]
+
+        for f, val in zip(self._fields, unpack_from(self._fmt, self.raw)):
+            setattr(self, f, val)
+
+    def update(self):
+        '''
+        Rebuild ``self.raw`` from the current values of the field attributes.
+        '''
+        vals = [getattr(self, f) for f in self._fields]
+        self.raw = pack(self._fmt, *vals)
+
+
--- a/src/calibre/utils/fonts/sfnt/loca.py
+++ b/src/calibre/utils/fonts/sfnt/loca.py
@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from struct import calcsize, unpack_from
+
+from calibre.utils.fonts.sfnt import UnknownTable
+
+class LocaTable(UnknownTable):
+    '''
+    The loca table. :meth:`load_offsets` must be called before
+    :meth:`glyph_location` can be used.
+    '''
+
+    def load_offsets(self, head_table, maxp_table):
+        '''
+        Decode the offsets stored in this table into ``self.offset_map``.
+
+        :param head_table: Its ``index_to_loc_format`` selects the size of
+        the entries: 0 means unsigned shorts, anything else unsigned longs.
+        :param maxp_table: Its ``num_glyphs`` limits the result to at most
+        ``num_glyphs + 1`` entries.
+        '''
+        if head_table.index_to_loc_format == 0:
+            fmt = 'H'
+        else:
+            fmt = 'L'
+        num_glyphs = maxp_table.num_glyphs
+        entry_size = calcsize(('>%s'%fmt).encode('ascii'))
+        count = len(self.raw)//entry_size
+        offsets = unpack_from(('>%d%s'%(count, fmt)).encode('ascii'),
+                self.raw)
+        offsets = offsets[:num_glyphs+1]
+        if fmt == 'H':
+            # Short entries store half of the actual offset
+            offsets = [2*i for i in offsets]
+        self.offset_map = offsets
+
+    def glyph_location(self, glyph_id):
+        '''
+        Return ``(offset, length)`` for the glyph ``glyph_id``, the length
+        being the distance to the offset of the following entry.
+        '''
+        begin = self.offset_map[glyph_id]
+        end = self.offset_map[glyph_id+1]
+        return begin, end - begin
+
--- a/src/calibre/utils/fonts/sfnt/maxp.py
+++ b/src/calibre/utils/fonts/sfnt/maxp.py
@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from itertools import izip
+from struct import unpack_from, pack
+
+from calibre.utils.fonts.sfnt import UnknownTable
+from calibre.utils.fonts.sfnt.errors import UnsupportedFont
+
+class MaxpTable(UnknownTable):
+    '''
+    The maxp (maximum profile) table.
+
+    ``self.fields`` lists the attribute names of the fields in the order in
+    which they are stored, ``self._fmt`` is the matching struct format. Both
+    depend on the table version, see :attr:`version`. Call :meth:`update`
+    to rebuild ``self.raw`` after changing any field.
+    '''
+
+    def __init__(self, *args, **kwargs):
+        super(MaxpTable, self).__init__(*args, **kwargs)
+
+        self._fmt = b'>LH'
+        self._version, self.num_glyphs = unpack_from(self._fmt, self.raw)
+        self.fields = ('_version', 'num_glyphs')
+
+        if self._version >= 0x10000:
+            # Switch to the version 1.0 layout and read all of its fields.
+            # This also puts the version number from the data back into
+            # _version.
+            self.version = 0x10000
+            vals = unpack_from(self._fmt, self.raw)
+            for f, val in zip(self.fields, vals):
+                setattr(self, f, val)
+
+    # NOTE(review): dynamic_property is neither defined nor imported in this
+    # file, presumably it is made available globally by calibre -- confirm
+    @dynamic_property
+    def version(self):
+        def fget(self):
+            return self._version
+        def fset(self, val):
+            # Setting the version also selects the matching field layout.
+            # The same attribute (self.fields) has to be used for every
+            # version, since that is what __init__ and update() read.
+            if val == 0x5000:
+                self._fmt = b'>LH'
+                self.fields = ('_version', 'num_glyphs')
+            elif val == 0x10000:
+                self.fields = ('_version', 'num_glyphs', 'max_points',
+                        'max_contours', 'max_composite_points',
+                        'max_composite_contours', 'max_zones',
+                        'max_twilight_points', 'max_storage', 'max_function_defs',
+                        'max_instruction_defs', 'max_stack_elements',
+                        'max_size_of_instructions', 'max_component_elements',
+                        'max_component_depth')
+                self._fmt = b'>LH' + b'H'*(len(self.fields)-2)
+            self._version = val
+        return property(fget=fget, fset=fset)
+
+    def update(self):
+        '''
+        Rebuild ``self.raw`` from the current values of the field attributes.
+
+        :raises UnsupportedFont: If the table version is greater than
+        0x10000.
+        '''
+        if self._version > 0x10000:
+            raise UnsupportedFont('maxp table with version > 0x10000 not modifiable')
+        vals = [getattr(self, f) for f in self.fields]
+        self.raw = pack(self._fmt, *vals)
+
+
+
--- a/src/calibre/utils/fonts/sfnt/subset.py
+++ b/src/calibre/utils/fonts/sfnt/subset.py
@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.utils.fonts.sfnt.container import Sfnt
+from calibre.utils.fonts.sfnt.errors import UnsupportedFont
+
+def subset_truetype(sfnt, character_map):
+    '''
+    Load the glyph offsets of the loca table of ``sfnt``, using its head
+    and maxp tables. ``character_map`` is not used yet.
+
+    :raises UnsupportedFont: If the head or the maxp table is missing.
+    '''
+    loca_table = sfnt[b'loca']
+    try:
+        head_table = sfnt[b'head']
+        maxp_table = sfnt[b'maxp']
+    except KeyError:
+        raise UnsupportedFont('This font does not contain head and/or maxp tables')
+    loca_table.load_offsets(head_table, maxp_table)
+
+def subset(raw, individual_chars, ranges=()):
+    '''
+    Restrict the font ``raw`` to the specified characters.
+
+    :param raw: The font data, passed to :class:`Sfnt`.
+    :param individual_chars: Iterable of single characters to keep.
+    :param ranges: Iterable of (first character, last character) pairs,
+    every character from first to last (inclusive) is kept.
+    :raises UnsupportedFont: If the font has no cmap table or does not
+    contain TrueType outlines.
+    '''
+    chars = [ord(c) for c in individual_chars]
+    for first, last in ((r[0], r[1]) for r in ranges):
+        chars.extend(xrange(ord(first), ord(last)+1))
+
+    sfnt = Sfnt(raw)
+    # Remove the Digital Signature table since it is useless in a subset
+    # font anyway
+    sfnt.pop(b'DSIG', None)
+
+    try:
+        cmap = sfnt[b'cmap']
+    except KeyError:
+        raise UnsupportedFont('This font has no cmap table')
+
+    # Get mapping of chars to glyph ids for all specified chars
+    character_map = cmap.get_character_map(chars)
+    # Restrict the cmap table to only contain entries for the specified chars
+    cmap.set_character_map(character_map)
+
+    has_truetype_outlines = b'loca' in sfnt and b'glyf' in sfnt
+    if not has_truetype_outlines:
+        if b'CFF ' in sfnt:
+            raise UnsupportedFont('This font contains PostScript outlines, '
+                    'subsetting not supported')
+        raise UnsupportedFont('This font does not contain TrueType '
+                'or PostScript outlines')
+    subset_truetype(sfnt, character_map)
+
+