Much faster custom implementation for checking whether a font supports a given piece of Unicode text

This commit is contained in:
Kovid Goyal 2012-10-31 16:48:03 +05:30
parent 9977bafa67
commit d69b24371d
4 changed files with 143 additions and 41 deletions

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import threading, unicodedata
import threading
from functools import wraps
from future_builtins import map
@ -20,10 +20,6 @@ class ThreadingViolation(Exception):
'You cannot use the MTP driver from a thread other than the '
' thread in which startup() was called')
def get_printable_characters(text):
    '''
    Return ``text`` normalized to NFC with all characters removed whose
    Unicode general category starts with ``C``, ``Z`` or ``M``.

    :param text: the unicode string to filter
    :return: a unicode string containing only the remaining characters
    '''
    normalized = unicodedata.normalize('NFC', text)
    kept = []
    for ch in normalized:
        # Only the first letter (the major class) of the category matters
        major_class = unicodedata.category(ch)[0]
        if major_class in {'C', 'Z', 'M'}:
            continue
        kept.append(ch)
    return u''.join(kept)
def same_thread(func):
@wraps(func)
def check_thread(self, *args, **kwargs):
@ -55,10 +51,18 @@ class Face(object):
if not isinstance(text, unicode):
raise TypeError('%r is not a unicode object'%text)
if has_non_printable_chars:
from calibre.utils.fonts.utils import get_printable_characters
text = get_printable_characters(text)
chars = tuple(frozenset(map(ord, text)))
return self.face.supports_text(chars)
@same_thread
def glyph_ids(self, text):
    '''
    Generate, for every character of ``text`` in order, the glyph id that
    the wrapped face object reports for that character's code.

    :param text: a unicode string
    :raises TypeError: if ``text`` is not a unicode object. As this is a
        generator, the error is raised when iteration starts.
    '''
    if isinstance(text, unicode):
        for codepoint in map(ord, text):
            yield self.face.glyph_id(codepoint)
    else:
        raise TypeError('%r is not a unicode object'%text)
class FreeType(object):
def __init__(self):
@ -73,26 +77,4 @@ class FreeType(object):
def load_font(self, data):
return Face(self.ft.load_font(data))
def test():
    '''
    Self-test: load ``fonts/calibreSymbols.otf`` through FreeType and check
    supports_text() against one sample that must be reported as supported
    and one that must not.

    :raises RuntimeError: if either sample gets the wrong answer
    '''
    raw = P('fonts/calibreSymbols.otf', data=True)
    face = FreeType().load_font(raw)
    # (sample text, expected answer, message raised on a wrong answer)
    expectations = (
        ('.\u2605', True, 'Incorrectly returning that text is not supported'),
        ('abc', False, 'Incorrectly claiming that text is supported'),
    )
    for sample, should_support, complaint in expectations:
        if bool(face.supports_text(sample)) != should_support:
            raise RuntimeError(complaint)
def test_find_font():
    '''
    Print the font family that the font scanner selects for a Chinese
    sample string and for an Arabic sample string.
    '''
    from calibre.utils.fonts.scanner import font_scanner
    abcd = '诶比西迪'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Chinese text:', family)
    # The sample has to be switched to the Arabic text *before* the lookup,
    # otherwise the Chinese text is searched a second time and its result
    # is printed under the Arabic label.
    abcd = 'لوحة المفاتيح العربية'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Arabic text:', family)
# When executed as a script, run the FreeType self-test and then the
# font lookup demonstration.
if __name__ == '__main__':
test()
test_find_font()

View File

@ -115,6 +115,14 @@ supports_text(Face *self, PyObject *args) {
return ret;
}
/*
 * glyph_id(character code) -> glyph index
 *
 * Parses one unsigned long character code from args, asks FreeType which
 * glyph index this face maps it to and returns that index as a Python
 * integer. Returns NULL, with the exception set by PyArg_ParseTuple, when
 * the arguments cannot be parsed.
 */
static PyObject*
glyph_id(Face *self, PyObject *args) {
    unsigned long charcode;
    FT_UInt glyph_index;

    if (!PyArg_ParseTuple(args, "k", &charcode)) {
        return NULL;
    }
    glyph_index = FT_Get_Char_Index(self->face, (FT_ULong)charcode);
    return Py_BuildValue("k", (unsigned long)glyph_index);
}
static PyGetSetDef Face_getsetters[] = {
{(char *)"family_name",
(getter)family_name, NULL,
@ -134,6 +142,10 @@ static PyMethodDef Face_methods[] = {
"supports_text(sequence of unicode character codes) -> Return True iff this font has glyphs for all the specified characters."
},
{"glyph_id", (PyCFunction)glyph_id, METH_VARARGS,
"glyph_id(character code) -> Returns the glyph id for the specified character code."
},
{NULL} /* Sentinel */
};

View File

@ -15,7 +15,6 @@ from calibre import walk, prints, as_unicode
from calibre.constants import (config_dir, iswindows, isosx, plugins, DEBUG,
isworker)
from calibre.utils.fonts.metadata import FontMetadata, UnsupportedFont
from calibre.utils.fonts.utils import panose_to_css_generic_family
from calibre.utils.icu import sort_key
class NoFonts(ValueError):
@ -117,17 +116,17 @@ class Scanner(Thread):
:return: (family name, faces) or None, None
'''
from calibre.utils.fonts.free_type import FreeType, get_printable_characters
ft = FreeType()
found = {}
from calibre.utils.fonts.utils import (supports_text,
panose_to_css_generic_family, get_printable_characters)
if not isinstance(text, unicode):
raise TypeError(u'%r is not unicode'%text)
text = get_printable_characters(text)
found = {}
def filter_faces(font):
try:
ftface = ft.load_font(self.get_font_data(font))
return ftface.supports_text(text, has_non_printable_chars=False)
raw = self.get_font_data(font)
return supports_text(raw, text)
except:
pass
return False

View File

@ -14,6 +14,11 @@ from collections import defaultdict
class UnsupportedFont(ValueError):
    '''Raised when font data cannot be handled by the font utilities.'''
def get_printable_characters(text):
    '''
    Return ``text`` normalized to NFC, keeping only the characters whose
    Unicode general category does not start with ``C``, ``Z`` or ``M``.

    :param text: the unicode string to filter
    :return: the filtered unicode string
    '''
    import unicodedata
    excluded_classes = {'C', 'Z', 'M'}
    category_of = unicodedata.category
    normalized = unicodedata.normalize('NFC', text)
    kept = [ch for ch in normalized
            if category_of(ch)[0] not in excluded_classes]
    return u''.join(kept)
def is_truetype_font(raw):
    '''
    Inspect the first four bytes of ``raw`` (the sfnt version tag).

    :param raw: the font data
    :return: a tuple ``(ok, tag)`` where ``tag`` is ``raw[:4]`` and ``ok``
        is True iff the tag is one of the two recognised version tags
    '''
    recognised_tags = frozenset((b'\x00\x01\x00\x00', b'OTTO'))
    tag = raw[:4]
    return (tag in recognised_tags, tag)
@ -267,16 +272,87 @@ def remove_embed_restriction(raw):
verify_checksums(raw)
return raw
def get_bmp_glyph_ids(table, bmp, codes):
    '''
    Generate the glyph id for every character code in ``codes`` by reading
    a format 4 (segment mapping to delta values) cmap subtable.

    :param table: the raw bytes of the cmap table
    :param bmp: byte offset of the format 4 subtable inside ``table``
    :param codes: iterable of integer character codes
    :return: generator yielding one glyph id per code, in input order. 0 is
        yielded for codes that no segment covers.
    :raises ValueError: if the length declared by the subtable is too small
        to contain its own segment arrays
    '''
    # The third field is stored in the font as segCountX2 (twice the number
    # of segments), which is why it is halved to get the array length.
    length, language, segcount = struct.unpack_from(b'>3H', table, bmp+2)
    array_len = segcount // 2
    array_sz = 2 * array_len
    array = b'>%dH' % array_len

    # The four parallel per-segment arrays follow the 7 uint16 header fields
    offset = bmp + 7*2
    end_count = struct.unpack_from(array, table, offset)
    offset += array_sz + 2  # the extra 2 bytes are the reservedPad field
    start_count = struct.unpack_from(array, table, offset)
    offset += array_sz
    # idDelta is the only signed array
    id_delta = struct.unpack_from(array.replace(b'H', b'h'), table, offset)
    offset += array_sz
    range_offset = struct.unpack_from(array, table, offset)

    # Whatever remains of the subtable after the arrays is glyphIdArray
    glyphs_start = offset + array_sz
    if length + bmp < glyphs_start:
        raise ValueError('cmap subtable length is too small')
    glyph_id_len = (length + bmp - glyphs_start) // 2
    glyph_id_map = struct.unpack_from(b'>%dH' % glyph_id_len, table,
            glyphs_start)

    for code in codes:
        glyph_id = 0  # stays 0 (missing glyph) if no segment contains code
        for i, ec in enumerate(end_count):
            if ec < code:
                continue
            sc = start_count[i]
            if sc > code:
                continue
            ro = range_offset[i]
            if ro == 0:
                glyph_id = id_delta[i] + code
            else:
                # ro is a byte offset measured from the position of
                # range_offset[i] itself; convert it into an index into
                # glyph_id_map, which starts right after range_offset.
                idx = ro//2 + (code - sc) + i - array_len
                glyph_id = glyph_id_map[idx]
                if glyph_id != 0:
                    glyph_id += id_delta[i]
            break
        # Glyph ids are 16 bit values and the idDelta arithmetic is defined
        # modulo 65536. Reducing modulo 0x1000 instead would corrupt every
        # glyph id >= 4096 and turn multiples of 4096 into 0 (missing).
        yield glyph_id % 0x10000
def get_glyph_ids(raw, text, raw_is_table=False):
    '''
    Generate the glyph id for every character of ``text``, using the
    format 4 cmap subtable registered for platform id 3, encoding id 1.

    :param raw: the font data, or the cmap table itself when
        ``raw_is_table`` is True
    :param text: a unicode string
    :param raw_is_table: if True, ``raw`` is used directly as the cmap table
    :raises TypeError: if ``text`` is not a unicode object
    :raises UnsupportedFont: if the font has no cmap table or no suitable
        format 4 subtable

    Since this is a generator, errors are raised once iteration starts.
    '''
    if not isinstance(text, unicode):
        raise TypeError('%r is not a unicode object'%text)

    table = raw
    if not raw_is_table:
        table = get_table(raw, 'cmap')[0]
        if table is None:
            raise UnsupportedFont('Not a supported font, has no cmap table')

    version, num_tables = struct.unpack_from(b'>HH', table)
    # The 8 byte encoding records start right after the 4 byte cmap header
    record_pos = 4
    for _ in xrange(num_tables):
        platform_id, encoding_id, subtable = struct.unpack_from(b'>HHL',
                table, record_pos)
        record_pos += 8
        if (platform_id, encoding_id) != (3, 1):
            continue
        if struct.unpack_from(b'>H', table, subtable)[0] == 4:
            break
    else:
        # Loop finished without finding a usable subtable
        raise UnsupportedFont('Not a supported font, has no format 4 cmap table')

    for glyph_id in get_bmp_glyph_ids(table, subtable, map(ord, text)):
        yield glyph_id
def supports_text(raw, text, has_only_printable_chars=False):
    '''
    Return True iff the font in ``raw`` maps every character of ``text`` to
    a non-zero glyph id.

    :param raw: the font data, passed on to get_glyph_ids()
    :param text: the unicode string to check
    :param has_only_printable_chars: if True, ``text`` is checked as is;
        otherwise it is first filtered with get_printable_characters()
    :raises TypeError: if ``text`` is not a unicode object
    :return: False if any character maps to glyph id 0, or if looking up
        the glyph ids fails for any reason; True otherwise
    '''
    if not isinstance(text, unicode):
        raise TypeError('%r is not a unicode object'%text)
    if not has_only_printable_chars:
        text = get_printable_characters(text)
    try:
        return all(glyph_id != 0 for glyph_id in get_glyph_ids(raw, text))
    except Exception:
        # Best effort: a font that cannot be parsed is treated as not
        # supporting the text. Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        return False
def get_font_for_text(text, candidate_font_data=None):
ok = False
if candidate_font_data is not None:
from calibre.utils.fonts.free_type import FreeType, FreeTypeError
ft = FreeType()
try:
font = ft.load_font(candidate_font_data)
ok = font.supports_text(text)
except FreeTypeError:
ok = True
ok = supports_text(candidate_font_data, text)
if not ok:
from calibre.utils.fonts.scanner import font_scanner
family, faces = font_scanner.find_font_for_text(text)
@ -285,7 +361,40 @@ def get_font_for_text(text, candidate_font_data=None):
candidate_font_data = f.read()
return candidate_font_data
def test_glyph_ids():
    '''
    Self-test: check that get_glyph_ids() returns the same glyph ids as
    FreeType for a mixed-script sample, using
    ``fonts/liberation/LiberationSerif-Regular.ttf``.

    :raises Exception: if the two sets of glyph ids differ
    '''
    from calibre.utils.fonts.free_type import FreeType
    font_bytes = P('fonts/liberation/LiberationSerif-Regular.ttf', data=True)
    face = FreeType().load_font(font_bytes)
    sample = u'诶йab'
    expected = tuple(face.glyph_ids(sample))
    actual = tuple(get_glyph_ids(font_bytes, sample))
    if expected == actual:
        return
    raise Exception('My code and FreeType differ on the glyph ids')
def test_supports_text():
    '''
    Self-test: check supports_text() on ``fonts/calibreSymbols.otf`` with
    one sample that must be reported as supported and one that must not.

    :raises RuntimeError: if either sample gets the wrong answer
    '''
    font_bytes = P('fonts/calibreSymbols.otf', data=True)
    # (sample text, expected answer, message raised on a wrong answer)
    expectations = (
        ('.\u2605', True, 'Incorrectly returning that text is not supported'),
        ('abc', False, 'Incorrectly claiming that text is supported'),
    )
    for sample, should_support, complaint in expectations:
        if bool(supports_text(font_bytes, sample)) != should_support:
            raise RuntimeError(complaint)
def test_find_font():
    '''
    Print the font family that the font scanner selects for a Chinese
    sample string and for an Arabic sample string.
    '''
    from calibre.utils.fonts.scanner import font_scanner
    abcd = '诶比西迪'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Chinese text:', family)
    # The sample has to be switched to the Arabic text *before* the lookup,
    # otherwise the Chinese text is searched a second time and its result
    # is printed under the Arabic label.
    abcd = 'لوحة المفاتيح العربية'
    family = font_scanner.find_font_for_text(abcd)[0]
    print ('Family for Arabic text:', family)
def test():
    '''Run all the self-tests of this module, in order.'''
    for check in (test_glyph_ids, test_supports_text, test_find_font):
        check()
def main():
import sys, os
for f in sys.argv[1:]:
print (os.path.basename(f))
@ -299,5 +408,5 @@ def test():
if __name__ == '__main__':
test()
main()