Much faster custom implementation for checking if a font supports some unicode text

2025-07-09 03:04:10 -04:00 · 2012-10-31 16:48:03 +05:30 · 2012-10-31 16:48:03 +05:30 · d69b24371d
commit d69b24371d
parent 9977bafa67
4 changed files with 143 additions and 41 deletions
--- a/src/calibre/utils/fonts/free_type.py
+++ b/src/calibre/utils/fonts/free_type.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import threading, unicodedata
+import threading
 from functools import wraps
 from future_builtins import map

@ -20,10 +20,6 @@ class ThreadingViolation(Exception):
                'You cannot use the MTP driver from a thread other than the '
                ' thread in which startup() was called')

-def get_printable_characters(text):
-    return u''.join(x for x in unicodedata.normalize('NFC', text)
-            if unicodedata.category(x)[0] not in {'C', 'Z', 'M'})
-
 def same_thread(func):
    @wraps(func)
    def check_thread(self, *args, **kwargs):
@ -55,10 +51,18 @@ class Face(object):
        if not isinstance(text, unicode):
            raise TypeError('%r is not a unicode object'%text)
        if has_non_printable_chars:
+            from calibre.utils.fonts.utils import get_printable_characters
            text = get_printable_characters(text)
        chars = tuple(frozenset(map(ord, text)))
        return self.face.supports_text(chars)

+    @same_thread
+    def glyph_ids(self, text):
+        '''
+        Yield, for each character of text in order, the value returned by
+        self.face.glyph_id() for that character's code point.
+
+        This is a generator, so the type check below only runs (and the
+        TypeError is only raised) once iteration starts.
+
+        :param text: The text to look up, must be a unicode object
+        :raises TypeError: If text is not a unicode object
+        '''
+        if not isinstance(text, unicode):
+            raise TypeError('%r is not a unicode object'%text)
+        for char in text:
+            # NOTE(review): on a narrow Python 2 build non-BMP characters are
+            # iterated as two surrogates -- confirm whether that matters here
+            yield self.face.glyph_id(ord(char))
+
 class FreeType(object):

    def __init__(self):
@ -73,26 +77,4 @@ class FreeType(object):
    def load_font(self, data):
        return Face(self.ft.load_font(data))

-def test():
-    data = P('fonts/calibreSymbols.otf', data=True)
-    ft = FreeType()
-    font = ft.load_font(data)
-    if not font.supports_text('.\u2605★'):
-        raise RuntimeError('Incorrectly returning that text is not supported')
-    if font.supports_text('abc'):
-        raise RuntimeError('Incorrectly claiming that text is supported')
-
-def test_find_font():
-    from calibre.utils.fonts.scanner import font_scanner
-    abcd = '诶比西迪'
-    family = font_scanner.find_font_for_text(abcd)[0]
-    print ('Family for Chinese text:', family)
-    family = font_scanner.find_font_for_text(abcd)[0]
-    abcd = 'لوحة المفاتيح العربية'
-    print ('Family for Arabic text:', family)
-
-
-if __name__ == '__main__':
-    test()
-    test_find_font()

--- a/src/calibre/utils/fonts/freetype.cpp
+++ b/src/calibre/utils/fonts/freetype.cpp
@ -115,6 +115,14 @@ supports_text(Face *self, PyObject *args) {
    return ret;
 }

+/*
+ * Implementation of the glyph_id() method of Face objects: parse a single
+ * unsigned long character code from args and return the index that
+ * FT_Get_Char_Index() gives for it on this face, as a Python integer.
+ * Returns NULL when the argument cannot be parsed.
+ */
+static PyObject*
+glyph_id(Face *self, PyObject *args) {
+    unsigned long code;
+
+    /* "k" converts a Python integer to a C unsigned long */
+    if (!PyArg_ParseTuple(args, "k", &code)) return NULL;
+    return Py_BuildValue("k", (unsigned long)FT_Get_Char_Index(self->face, (FT_ULong)code));
+}
+
 static PyGetSetDef Face_getsetters[] = {
    {(char *)"family_name", 
     (getter)family_name, NULL,
@ -134,6 +142,10 @@ static PyMethodDef Face_methods[] = {
     "supports_text(sequence of unicode character codes) -> Return True iff this font has glyphs for all the specified characters."
    },

+    {"glyph_id", (PyCFunction)glyph_id, METH_VARARGS,
+     "glyph_id(character code) -> Returns the glyph id for the specified character code."
+    },
+
    {NULL}  /* Sentinel */
 };

--- a/src/calibre/utils/fonts/scanner.py
+++ b/src/calibre/utils/fonts/scanner.py
@ -15,7 +15,6 @@ from calibre import walk, prints, as_unicode
 from calibre.constants import (config_dir, iswindows, isosx, plugins, DEBUG,
        isworker)
 from calibre.utils.fonts.metadata import FontMetadata, UnsupportedFont
-from calibre.utils.fonts.utils import panose_to_css_generic_family
 from calibre.utils.icu import sort_key

 class NoFonts(ValueError):
@ -117,17 +116,17 @@ class Scanner(Thread):

        :return: (family name, faces) or None, None
        '''
-        from calibre.utils.fonts.free_type import FreeType, get_printable_characters
-        ft = FreeType()
-        found = {}
+        from calibre.utils.fonts.utils import (supports_text,
+                panose_to_css_generic_family, get_printable_characters)
        if not isinstance(text, unicode):
            raise TypeError(u'%r is not unicode'%text)
        text = get_printable_characters(text)
+        found = {}

        def filter_faces(font):
            try:
-                ftface = ft.load_font(self.get_font_data(font))
-                return ftface.supports_text(text, has_non_printable_chars=False)
+                raw = self.get_font_data(font)
+                return supports_text(raw, text)
            except:
                pass
            return False
--- a/src/calibre/utils/fonts/utils.py
+++ b/src/calibre/utils/fonts/utils.py
@ -14,6 +14,11 @@ from collections import defaultdict
 class UnsupportedFont(ValueError):
    pass

+def get_printable_characters(text):
+    '''
+    Return text normalized to NFC, with every character whose Unicode
+    general category starts with C (other), Z (separator) or M (mark)
+    removed.
+    '''
+    # Imported here rather than at module level, so unicodedata is only
+    # loaded when this function is actually called
+    import unicodedata
+    return u''.join(x for x in unicodedata.normalize('NFC', text)
+            if unicodedata.category(x)[0] not in {'C', 'Z', 'M'})
+
 def is_truetype_font(raw):
    sfnt_version = raw[:4]
    return (sfnt_version in {b'\x00\x01\x00\x00', b'OTTO'}, sfnt_version)
@ -267,16 +272,87 @@ def remove_embed_restriction(raw):
    verify_checksums(raw)
    return raw

+def get_bmp_glyph_ids(table, bmp, codes):
+    length, language, segcount = struct.unpack_from(b'>3H', table, bmp+2)
+    array_len = segcount //2
+    offset = bmp + 7*2
+    array_sz = 2*array_len
+    array = b'>%dH'%array_len
+    end_count = struct.unpack_from(array, table, offset)
+    offset += array_sz + 2
+    start_count = struct.unpack_from(array, table, offset)
+    offset += array_sz
+    id_delta = struct.unpack_from(array.replace(b'H', b'h'), table, offset)
+    offset += array_sz
+    range_offset = struct.unpack_from(array, table, offset)
+    if length + bmp < offset + array_sz:
+        raise ValueError('cmap subtable length is too small')
+    glyph_id_len = (length + bmp - (offset + array_sz))//2
+    glyph_id_map = struct.unpack_from(b'>%dH'%glyph_id_len, table, offset +
+            array_sz)
+
+    for code in codes:
+        found = False
+        for i, ec in enumerate(end_count):
+            if ec >= code:
+                sc = start_count[i]
+                if sc <= code:
+                    found = True
+                    ro = range_offset[i]
+                    if ro == 0:
+                        glyph_id = id_delta[i] + code
+                    else:
+                        idx = ro//2 + (code - sc) + i - array_len
+                        glyph_id = glyph_id_map[idx]
+                        if glyph_id != 0:
+                            glyph_id += id_delta[i]
+                    yield glyph_id % 0x1000
+                    break
+        if not found:
+            yield 0
+
+def get_glyph_ids(raw, text, raw_is_table=False):
+    '''
+    Yield the glyph id for every character in text, using the first cmap
+    subtable with platform id 3 and encoding id 1 that is in format 4.
+
+    This is a generator, so none of the checks below run (and no exception
+    is raised) until iteration starts.
+
+    :param raw: The raw font data, or the raw cmap table if raw_is_table is
+                True
+    :param text: The text to look up, must be a unicode object
+    :param raw_is_table: If True, raw is used directly as the cmap table
+                         instead of being passed to get_table()
+    :raises TypeError: If text is not a unicode object
+    :raises UnsupportedFont: If there is no cmap table or no suitable
+                             format 4 subtable in it
+    '''
+    if not isinstance(text, unicode):
+        raise TypeError('%r is not a unicode object'%text)
+    if raw_is_table:
+        table = raw
+    else:
+        table = get_table(raw, 'cmap')[0]
+        if table is None:
+            raise UnsupportedFont('Not a supported font, has no cmap table')
+    version, num_tables = struct.unpack_from(b'>HH', table)
+    bmp_table = None
+    for i in xrange(num_tables):
+        # The subtable records are 8 bytes each and start right after the
+        # 4 byte header read above
+        platform_id, encoding_id, offset = struct.unpack_from(b'>HHL', table,
+                4 + (i*8))
+        if platform_id == 3 and encoding_id == 1:
+            # The first 16 bit field of a subtable is its format
+            table_format = struct.unpack_from(b'>H', table, offset)[0]
+            if table_format == 4:
+                bmp_table = offset
+                break
+    if bmp_table is None:
+        raise UnsupportedFont('Not a supported font, has no format 4 cmap table')
+
+    for glyph_id in get_bmp_glyph_ids(table, bmp_table, map(ord, text)):
+        yield glyph_id
+
+def supports_text(raw, text, has_only_printable_chars=False):
+    if not isinstance(text, unicode):
+        raise TypeError('%r is not a unicode object'%text)
+    if not has_only_printable_chars:
+        text = get_printable_characters(text)
+    try:
+        for glyph_id in get_glyph_ids(raw, text):
+            if glyph_id == 0:
+                return False
+    except:
+        return False
+    return True
+
 def get_font_for_text(text, candidate_font_data=None):
    ok = False
    if candidate_font_data is not None:
-        from calibre.utils.fonts.free_type import FreeType, FreeTypeError
-        ft = FreeType()
-        try:
-            font = ft.load_font(candidate_font_data)
-            ok = font.supports_text(text)
-        except FreeTypeError:
-            ok = True
+        ok = supports_text(candidate_font_data, text)
    if not ok:
        from calibre.utils.fonts.scanner import font_scanner
        family, faces = font_scanner.find_font_for_text(text)
@ -285,7 +361,40 @@ def get_font_for_text(text, candidate_font_data=None):
                candidate_font_data = f.read()
    return candidate_font_data

+def test_glyph_ids():
+    '''
+    Check that get_glyph_ids() and FreeType (via Face.glyph_ids()) return
+    the same glyph ids for a short sample text in the Liberation Serif
+    Regular font.
+
+    :raises Exception: If the two sets of glyph ids differ
+    '''
+    from calibre.utils.fonts.free_type import FreeType
+    # NOTE(review): P is not defined in the visible code, presumably it is
+    # calibre's resource loader -- confirm
+    data = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
+    ft = FreeType()
+    font = ft.load_font(data)
+    text = u'诶йab'
+    ft_glyphs = tuple(font.glyph_ids(text))
+    glyphs = tuple(get_glyph_ids(data, text))
+    if ft_glyphs != glyphs:
+        raise Exception('My code and FreeType differ on the glyph ids')
+
+def test_supports_text():
+    '''
+    Check that supports_text() reports the calibreSymbols font as
+    supporting a period plus the star character, and as not supporting
+    'abc'.
+
+    :raises RuntimeError: If either check gives the wrong answer
+    '''
+    # NOTE(review): P is not defined in the visible code, presumably it is
+    # calibre's resource loader -- confirm
+    data = P('fonts/calibreSymbols.otf', data=True)
+    # NOTE(review): supports_text() raises TypeError for non-unicode text,
+    # so these literals rely on unicode_literals being in effect -- confirm
+    if not supports_text(data, '.\u2605★'):
+        raise RuntimeError('Incorrectly returning that text is not supported')
+    if supports_text(data, 'abc'):
+        raise RuntimeError('Incorrectly claiming that text is supported')
+
+def test_find_font():
+    from calibre.utils.fonts.scanner import font_scanner
+    abcd = '诶比西迪'
+    family = font_scanner.find_font_for_text(abcd)[0]
+    print ('Family for Chinese text:', family)
+    family = font_scanner.find_font_for_text(abcd)[0]
+    abcd = 'لوحة المفاتيح العربية'
+    print ('Family for Arabic text:', family)
+
+
 def test():
+    '''
+    Run the glyph id, text support and font lookup self tests, in that
+    order.
+    '''
+    test_glyph_ids()
+    test_supports_text()
+    test_find_font()
+
+def main():
    import sys, os
    for f in sys.argv[1:]:
        print (os.path.basename(f))
@ -299,5 +408,5 @@ def test():


 if __name__ == '__main__':
-    test()
+    main()