Much faster custom implementation for checking if a font supports some unicode text

2025-07-09 03:04:10 -04:00 · 2012-10-31 16:48:03 +05:30 · 2012-10-31 16:48:03 +05:30 · d69b24371d
commit d69b24371d
parent 9977bafa67
4 changed files with 143 additions and 41 deletions
--- a/src/calibre/utils/fonts/free_type.py
+++ b/src/calibre/utils/fonts/free_type.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import threading, unicodedata
+import threading
 from functools import wraps
 from future_builtins import map
@ -20,10 +20,6 @@ class ThreadingViolation(Exception):
                'You cannot use the MTP driver from a thread other than the '
                ' thread in which startup() was called')
 def get_printable_characters(text):
    '''
    Return ``text`` normalized to NFC with every character removed whose
    Unicode general category starts with ``C``, ``Z`` or ``M``.
    '''
    skipped_classes = {'C', 'Z', 'M'}
    normalized = unicodedata.normalize('NFC', text)
    return u''.join(ch for ch in normalized
            if unicodedata.category(ch)[0] not in skipped_classes)
 def same_thread(func):
    @wraps(func)
    def check_thread(self, *args, **kwargs):
@ -55,10 +51,18 @@ class Face(object):
        if not isinstance(text, unicode):
            raise TypeError('%r is not a unicode object'%text)
        if has_non_printable_chars:
            from calibre.utils.fonts.utils import get_printable_characters
            text = get_printable_characters(text)
        chars = tuple(frozenset(map(ord, text)))
        return self.face.supports_text(chars)
    @same_thread
    def glyph_ids(self, text):
        '''
        Generate, in order, the value of ``self.face.glyph_id()`` for the
        code point of every character in ``text``.

        :param text: A unicode string.
        :raises TypeError: If ``text`` is not a unicode object. As this body
            is a generator, the check runs when iteration starts.
        '''
        if not isinstance(text, unicode):
            raise TypeError('%r is not a unicode object'%text)
        for codepoint in map(ord, text):
            yield self.face.glyph_id(codepoint)
 class FreeType(object):
    def __init__(self):
@ -73,26 +77,4 @@ class FreeType(object):
    def load_font(self, data):
        '''
        Load a font from ``data`` with ``self.ft.load_font()`` and return the
        result wrapped in a :class:`Face`.
        '''
        loaded = self.ft.load_font(data)
        return Face(loaded)
 def test():
    '''
    Self-test: load the bundled ``calibreSymbols.otf`` font and check that it
    reports support for the symbol sample but not for plain ``abc``.

    :raises RuntimeError: If either check gives the unexpected answer.
    '''
    data = P('fonts/calibreSymbols.otf', data=True)
    font = FreeType().load_font(data)
    has_symbols = font.supports_text('.\u2605★')
    if not has_symbols:
        raise RuntimeError('Incorrectly returning that text is not supported')
    has_latin = font.supports_text('abc')
    if has_latin:
        raise RuntimeError('Incorrectly claiming that text is supported')
 def test_find_font():
    '''
    Print the font family that the font scanner selects for a Chinese sample
    and for an Arabic sample.
    '''
    from calibre.utils.fonts.scanner import font_scanner
    abcd = '诶比西迪'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Chinese text:', family)
    # The sample must be switched to Arabic *before* the second lookup,
    # otherwise the Chinese result is printed under the Arabic label.
    abcd = 'لوحة المفاتيح العربية'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Arabic text:', family)
 if __name__ == '__main__':
    # Run both self-tests when this file is executed directly as a script.
    test()
    test_find_font()
--- a/src/calibre/utils/fonts/freetype.cpp
+++ b/src/calibre/utils/fonts/freetype.cpp
@ -115,6 +115,14 @@ supports_text(Face *self, PyObject *args) {
    return ret;
 }
 static PyObject*
 glyph_id(Face *self, PyObject *args) {
    /* Python method: glyph_id(character code) -> glyph index of that
     * character code in this face, as reported by FT_Get_Char_Index(). */
    unsigned long charcode = 0;
    FT_UInt index;

    if (!PyArg_ParseTuple(args, "k", &charcode)) {
        /* Argument parsing failed; propagate the error to the caller. */
        return NULL;
    }
    index = FT_Get_Char_Index(self->face, (FT_ULong)charcode);
    return Py_BuildValue("k", (unsigned long)index);
 }
 static PyGetSetDef Face_getsetters[] = {
    {(char *)"family_name", 
     (getter)family_name, NULL,
@ -134,6 +142,10 @@ static PyMethodDef Face_methods[] = {
     "supports_text(sequence of unicode character codes) -> Return True iff this font has glyphs for all the specified characters."
    },
    {"glyph_id", (PyCFunction)glyph_id, METH_VARARGS,
     "glyph_id(character code) -> Returns the glyph id for the specified character code."
    },
    {NULL}  /* Sentinel */
 };
--- a/src/calibre/utils/fonts/scanner.py
+++ b/src/calibre/utils/fonts/scanner.py
@ -15,7 +15,6 @@ from calibre import walk, prints, as_unicode
 from calibre.constants import (config_dir, iswindows, isosx, plugins, DEBUG,
        isworker)
 from calibre.utils.fonts.metadata import FontMetadata, UnsupportedFont
 from calibre.utils.fonts.utils import panose_to_css_generic_family
 from calibre.utils.icu import sort_key
 class NoFonts(ValueError):
@ -117,17 +116,17 @@ class Scanner(Thread):
        :return: (family name, faces) or None, None
        '''
-        from calibre.utils.fonts.free_type import FreeType, get_printable_characters
+        from calibre.utils.fonts.utils import (supports_text,
-        ft = FreeType()
+                panose_to_css_generic_family, get_printable_characters)
        found = {}
        if not isinstance(text, unicode):
            raise TypeError(u'%r is not unicode'%text)
        text = get_printable_characters(text)
        found = {}
        def filter_faces(font):
            try:
-                ftface = ft.load_font(self.get_font_data(font))
+                raw = self.get_font_data(font)
-                return ftface.supports_text(text, has_non_printable_chars=False)
+                return supports_text(raw, text)
            except:
                pass
            return False
--- a/src/calibre/utils/fonts/utils.py
+++ b/src/calibre/utils/fonts/utils.py
@ -14,6 +14,11 @@ from collections import defaultdict
 class UnsupportedFont(ValueError):
    '''
    Raised when font data lacks something the routines in this module need,
    for example a cmap table or a format 4 cmap subtable.
    '''
 def get_printable_characters(text):
    '''
    Return ``text`` normalized to NFC with every character removed whose
    Unicode general category starts with ``C``, ``Z`` or ``M``.
    '''
    from unicodedata import category, normalize
    dropped = {'C', 'Z', 'M'}
    composed = normalize('NFC', text)
    return u''.join(ch for ch in composed if category(ch)[0] not in dropped)
 def is_truetype_font(raw):
    '''
    Inspect the first four bytes of ``raw`` (the sfnt version tag).

    :return: A tuple ``(recognized, sfnt_version)`` where ``recognized`` is
        True iff the tag is ``\\x00\\x01\\x00\\x00`` or ``OTTO``, and
        ``sfnt_version`` is the four byte tag itself.
    '''
    known_tags = {b'\x00\x01\x00\x00', b'OTTO'}
    tag = raw[:4]
    recognized = tag in known_tags
    return (recognized, tag)
@ -267,16 +272,87 @@ def remove_embed_restriction(raw):
    verify_checksums(raw)
    return raw
 def get_bmp_glyph_ids(table, bmp, codes):
    '''
    Generate the glyph id for each character code in ``codes`` by looking it
    up in the format 4 cmap subtable located at offset ``bmp`` in ``table``.

    :param table: The raw bytes of the cmap table.
    :param bmp: Offset of the format 4 subtable inside ``table``.
    :param codes: An iterable of integer character codes.
    :return: A generator yielding one glyph id per code, in order. 0 is
        yielded for codes that no segment covers.
    :raises ValueError: If the length declared by the subtable is too small
        to hold its segment arrays. As this is a generator, errors are only
        raised once iteration starts.
    '''
    # The three fields after the 2 byte format field; the third one holds
    # twice the number of segments.
    length, language, segcount = struct.unpack_from(b'>3H', table, bmp+2)
    array_len = segcount //2
    # Skip the 7 unsigned shorts that make up the subtable header
    offset = bmp + 7*2
    array_sz = 2*array_len
    array = b'>%dH'%array_len
    end_count = struct.unpack_from(array, table, offset)
    # The extra 2 bytes skip the pad field between the end and start arrays
    offset += array_sz + 2
    start_count = struct.unpack_from(array, table, offset)
    offset += array_sz
    # The deltas are signed, unlike the other arrays
    id_delta = struct.unpack_from(array.replace(b'H', b'h'), table, offset)
    offset += array_sz
    range_offset = struct.unpack_from(array, table, offset)
    if length + bmp < offset + array_sz:
        raise ValueError('cmap subtable length is too small')
    # Whatever remains of the subtable after the four arrays is the glyph id
    # array
    glyph_id_len = (length + bmp - (offset + array_sz))//2
    glyph_id_map = struct.unpack_from(b'>%dH'%glyph_id_len, table, offset +
            array_sz)
    for code in codes:
        found = False
        for i, ec in enumerate(end_count):
            if ec >= code:
                sc = start_count[i]
                if sc <= code:
                    found = True
                    ro = range_offset[i]
                    if ro == 0:
                        glyph_id = id_delta[i] + code
                    else:
                        # ro is a byte offset counted from the position of
                        # range_offset[i] itself. glyph_id_map starts right
                        # after the range_offset array, so subtract the
                        # (array_len - i) entries that lie in between.
                        idx = ro//2 + (code - sc) + i - array_len
                        glyph_id = glyph_id_map[idx]
                        if glyph_id != 0:
                            glyph_id += id_delta[i]
                    # Glyph id arithmetic is modulo 65536. Reducing modulo
                    # 0x1000 instead would corrupt every glyph id >= 4096.
                    yield glyph_id % 0x10000
                    break
        if not found:
            yield 0
 def get_glyph_ids(raw, text, raw_is_table=False):
    '''
    Generate the glyph id of every character in ``text``, in order, using the
    format 4 cmap subtable registered for platform id 3, encoding id 1.

    :param raw: The raw font data, or the raw cmap table if ``raw_is_table``
        is True.
    :param text: A unicode string.
    :param raw_is_table: If True ``raw`` is used directly as the cmap table.
    :raises TypeError: If ``text`` is not a unicode object.
    :raises UnsupportedFont: If the font has no cmap table or no suitable
        format 4 subtable.

    As this is a generator, the errors above are only raised once iteration
    starts.
    '''
    if not isinstance(text, unicode):
        raise TypeError('%r is not a unicode object'%text)
    table = raw
    if not raw_is_table:
        table = get_table(raw, 'cmap')[0]
        if table is None:
            raise UnsupportedFont('Not a supported font, has no cmap table')
    # Second field of the cmap header is the number of encoding records
    num_tables = struct.unpack_from(b'>HH', table)[1]
    bmp_table = None
    for record in xrange(num_tables):
        # Encoding records are 8 bytes each and follow the 4 byte header
        platform_id, encoding_id, offset = struct.unpack_from(b'>HHL', table,
                4 + 8*record)
        if (platform_id, encoding_id) != (3, 1):
            continue
        if struct.unpack_from(b'>H', table, offset)[0] == 4:
            bmp_table = offset
            break
    if bmp_table is None:
        raise UnsupportedFont('Not a supported font, has no format 4 cmap table')
    codes = map(ord, text)
    for glyph_id in get_bmp_glyph_ids(table, bmp_table, codes):
        yield glyph_id
 def supports_text(raw, text, has_only_printable_chars=False):
    '''
    Check whether the font in ``raw`` has a glyph for every character in
    ``text``.

    :param raw: The raw font data, passed on to :func:`get_glyph_ids`.
    :param text: A unicode string.
    :param has_only_printable_chars: If True, ``text`` is used as is.
        Otherwise it is first filtered with :func:`get_printable_characters`.
    :return: True iff no character maps to glyph id 0. False is also returned
        if looking up the glyph ids fails with an error (for example
        :class:`UnsupportedFont`).
    :raises TypeError: If ``text`` is not a unicode object.
    '''
    if not isinstance(text, unicode):
        raise TypeError('%r is not a unicode object'%text)
    if not has_only_printable_chars:
        text = get_printable_characters(text)
    try:
        return all(glyph_id != 0 for glyph_id in get_glyph_ids(raw, text))
    except Exception:
        # Fonts that cannot be parsed are treated as not supporting the text.
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # are not swallowed.
        return False
 def get_font_for_text(text, candidate_font_data=None):
    ok = False
    if candidate_font_data is not None:
-        from calibre.utils.fonts.free_type import FreeType, FreeTypeError
+        ok = supports_text(candidate_font_data, text)
        ft = FreeType()
        try:
            font = ft.load_font(candidate_font_data)
            ok = font.supports_text(text)
        except FreeTypeError:
            ok = True
    if not ok:
        from calibre.utils.fonts.scanner import font_scanner
        family, faces = font_scanner.find_font_for_text(text)
@ -285,7 +361,40 @@ def get_font_for_text(text, candidate_font_data=None):
                candidate_font_data = f.read()
    return candidate_font_data
 def test_glyph_ids():
    '''
    Self-test: check that :func:`get_glyph_ids` and FreeType return the same
    glyph ids for a mixed sample, using the bundled Liberation Serif font.

    :raises Exception: If the two sets of glyph ids differ.
    '''
    from calibre.utils.fonts.free_type import FreeType
    data = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    font = FreeType().load_font(data)
    text = u'诶йab'
    from_freetype = tuple(font.glyph_ids(text))
    from_cmap = tuple(get_glyph_ids(data, text))
    if from_freetype != from_cmap:
        raise Exception('My code and FreeType differ on the glyph ids')
 def test_supports_text():
    '''
    Self-test: check that :func:`supports_text` accepts the symbol sample and
    rejects plain ``abc`` for the bundled ``calibreSymbols.otf`` font.

    :raises RuntimeError: If either check gives the unexpected answer.
    '''
    data = P('fonts/calibreSymbols.otf', data=True)
    has_symbols = supports_text(data, '.\u2605★')
    if not has_symbols:
        raise RuntimeError('Incorrectly returning that text is not supported')
    has_latin = supports_text(data, 'abc')
    if has_latin:
        raise RuntimeError('Incorrectly claiming that text is supported')
 def test_find_font():
    '''
    Print the font family that the font scanner selects for a Chinese sample
    and for an Arabic sample.
    '''
    from calibre.utils.fonts.scanner import font_scanner
    abcd = '诶比西迪'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Chinese text:', family)
    # The sample must be switched to Arabic *before* the second lookup,
    # otherwise the Chinese result is printed under the Arabic label.
    abcd = 'لوحة المفاتيح العربية'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Arabic text:', family)
 def test():
    '''
    Run the self-tests of this module in order: glyph id lookup, text support
    detection and font finding.
    '''
    for check in (test_glyph_ids, test_supports_text, test_find_font):
        check()
 def main():
    import sys, os
    for f in sys.argv[1:]:
        print (os.path.basename(f))
@ -299,5 +408,5 @@ def test():
 if __name__ == '__main__':
-    test()
+    main()