From 63cba4c884a688b9d3c5c06ddea0a28c49413f6b Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 6 Mar 2014 13:10:48 +0530
Subject: [PATCH] Insert special char: Allow searching for non BMP characters, by using the ICU database of names rather than python's outdated one.

---
 src/calibre/gui2/tweak_book/char_select.py | 13 ++++++-----
 src/calibre/utils/icu.c                    | 27 ++++++++++++++++++++++
 src/calibre/utils/icu.py                   | 11 +++++++++
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/calibre/gui2/tweak_book/char_select.py b/src/calibre/gui2/tweak_book/char_select.py
index c7b83cfab9..9530087210 100644
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal '
 
-import unicodedata, re, os, cPickle, sys, textwrap
+import unicodedata, re, os, cPickle, textwrap
 from bisect import bisect
 from functools import partial
 from collections import defaultdict
@@ -22,7 +22,7 @@ from calibre.gui2 import NONE
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
 from calibre.gui2.tweak_book.widgets import Dialog
-from calibre.utils.icu import safe_chr as chr
+from calibre.utils.icu import safe_chr as chr, icu_unicode_version, character_name_from_code
 
 ROOT = QModelIndex()
 
@@ -35,9 +35,10 @@ non_printing = {
 }
 
 # Searching {{{
+
 def load_search_index():
-    topchar = sys.maxunicode
-    ver = (1, topchar, unicodedata.unidata_version)  # Increment this when you make any changes to the index
+    topchar = 0x10ffff
+    ver = (1, topchar, icu_unicode_version or unicodedata.unidata_version)  # Increment this when you make any changes to the index
     name_map = {}
     path = os.path.join(cache_dir(), 'unicode-name-index.pickle')
     if os.path.exists(path):
@@ -48,7 +49,7 @@ def load_search_index():
     if not name_map:
         name_map = defaultdict(set)
         for x in xrange(1, topchar + 1):
-            for word in unicodedata.name(chr(x), '').split():
+            for word in character_name_from_code(x).split():
                 name_map[word.lower()].add(x)
         from calibre.ebooks.html_entities import html5_entities
         for name, char in html5_entities.iteritems():
@@ -465,7 +466,7 @@ class CategoryModel(QAbstractItemModel):
             category, subcategory = self.category_map[self.starts[ipos]]
         except IndexError:
             category = subcategory = _('Unknown')
-        return category, subcategory, unicodedata.name(chr(char_code), _('Unknown'))
+        return category, subcategory, (character_name_from_code(char_code) or _('Unknown'))
 
 class CategoryDelegate(QStyledItemDelegate):
 
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index f42a9dbaad..3723fc00cc 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -738,6 +738,29 @@ end:
     return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, sz);
 } // }}}
 
+// character_name {{{
+static PyObject *
+icu_character_name_from_code(PyObject *self, PyObject *args) {
+    char name[512] = {0};
+    int32_t sz, alias = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    PyObject *palias = NULL;
+    UChar32 code = 0;
+
+    if (!PyArg_ParseTuple(args, "I|O", &code, &palias)) return NULL;
+
+    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
+
+    if (alias) {
+        sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status);
+    } else {
+        sz = u_charName(code, U_UNICODE_CHAR_NAME, name, 511, &status);
+    }
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to get name for code"); goto end; }
+end:
+    return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, sz);
+} // }}}
+
 // chr {{{
 static PyObject *
 icu_chr(PyObject *self, PyObject *args) {
@@ -786,6 +809,10 @@ static PyMethodDef icu_methods[] = {
      "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string."
     },
 
+    {"character_name_from_code", icu_character_name_from_code, METH_VARARGS,
+     "character_name_from_code(code, alias=False) -> Return the name for the specified unicode code point"
+    },
+
     {"chr", icu_chr, METH_VARARGS,
      "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
     },
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 82c46dac57..22f8f2f2a8 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -140,6 +140,16 @@ def character_name(string):
     except (TypeError, ValueError, KeyError):
         pass
 
+def character_name_from_code(code):
+    try:
+        try:
+            return _icu.character_name_from_code(code).decode('utf-8') or ''
+        except AttributeError:
+            import unicodedata
+            return unicodedata.name(py_safe_chr(code), '')
+    except (TypeError, ValueError, KeyError):
+        return ''
+
 if sys.maxunicode >= 0x10ffff:
     try:
         py_safe_chr = unichr
@@ -212,6 +222,7 @@ def icu_collation_order(collator, a):
 load_icu()
 load_collator()
 _icu_not_ok = _icu is None or _collator is None
+icu_unicode_version = getattr(_icu, 'unicode_version', None)
 
 try:
     senc = sys.getdefaultencoding()