Insert special char: Allow searching for non-BMP characters by using the ICU database of names rather than Python's outdated one.

This commit is contained in:
Kovid Goyal 2014-03-06 13:10:48 +05:30
parent b36c6211b0
commit 63cba4c884
3 changed files with 45 additions and 6 deletions

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import unicodedata, re, os, cPickle, sys, textwrap import unicodedata, re, os, cPickle, textwrap
from bisect import bisect from bisect import bisect
from functools import partial from functools import partial
from collections import defaultdict from collections import defaultdict
@ -22,7 +22,7 @@ from calibre.gui2 import NONE
from calibre.gui2.widgets2 import HistoryLineEdit2 from calibre.gui2.widgets2 import HistoryLineEdit2
from calibre.gui2.tweak_book import tprefs from calibre.gui2.tweak_book import tprefs
from calibre.gui2.tweak_book.widgets import Dialog from calibre.gui2.tweak_book.widgets import Dialog
from calibre.utils.icu import safe_chr as chr from calibre.utils.icu import safe_chr as chr, icu_unicode_version, character_name_from_code
ROOT = QModelIndex() ROOT = QModelIndex()
@ -35,9 +35,10 @@ non_printing = {
} }
# Searching {{{ # Searching {{{
def load_search_index(): def load_search_index():
topchar = sys.maxunicode topchar = 0x10ffff
ver = (1, topchar, unicodedata.unidata_version) # Increment this when you make any changes to the index ver = (1, topchar, icu_unicode_version or unicodedata.unidata_version) # Increment this when you make any changes to the index
name_map = {} name_map = {}
path = os.path.join(cache_dir(), 'unicode-name-index.pickle') path = os.path.join(cache_dir(), 'unicode-name-index.pickle')
if os.path.exists(path): if os.path.exists(path):
@ -48,7 +49,7 @@ def load_search_index():
if not name_map: if not name_map:
name_map = defaultdict(set) name_map = defaultdict(set)
for x in xrange(1, topchar + 1): for x in xrange(1, topchar + 1):
for word in unicodedata.name(chr(x), '').split(): for word in character_name_from_code(x).split():
name_map[word.lower()].add(x) name_map[word.lower()].add(x)
from calibre.ebooks.html_entities import html5_entities from calibre.ebooks.html_entities import html5_entities
for name, char in html5_entities.iteritems(): for name, char in html5_entities.iteritems():
@ -465,7 +466,7 @@ class CategoryModel(QAbstractItemModel):
category, subcategory = self.category_map[self.starts[ipos]] category, subcategory = self.category_map[self.starts[ipos]]
except IndexError: except IndexError:
category = subcategory = _('Unknown') category = subcategory = _('Unknown')
return category, subcategory, unicodedata.name(chr(char_code), _('Unknown')) return category, subcategory, (character_name_from_code(char_code) or _('Unknown'))
class CategoryDelegate(QStyledItemDelegate): class CategoryDelegate(QStyledItemDelegate):

View File

@ -738,6 +738,29 @@ end:
return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, sz); return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, sz);
} // }}} } // }}}
// character_name {{{
// Python binding for character_name_from_code(code, alias=False).
//
// Parses an unsigned integer code point and an optional alias flag from args,
// asks ICU's u_charName() for the matching name and returns it as a Python
// string built with the "s#" format. A truthy alias flag selects
// U_CHAR_NAME_ALIAS, otherwise U_UNICODE_CHAR_NAME is used.
//
// Returns NULL with ValueError set when ICU reports a failure status, and NULL
// whenever any other Python error is pending (for example one raised while
// evaluating the truthiness of the alias argument).
// NOTE(review): code points without a name presumably yield an empty string
// from ICU -- confirm against the u_charName() documentation.
static PyObject *
icu_character_name_from_code(PyObject *self, PyObject *args) {
    char buf[512] = {0};  // zero filled; only 511 bytes are offered to ICU below
    int32_t len = 0;
    int use_alias = 0;
    UErrorCode err = U_ZERO_ERROR;
    PyObject *alias_arg = NULL;
    UChar32 code = 0;

    if (!PyArg_ParseTuple(args, "I|O", &code, &alias_arg)) return NULL;

    // Any non-zero result from PyObject_IsTrue() selects the alias lookup; if it
    // raised, the pending error is reported by the final PyErr_Occurred() check.
    use_alias = (alias_arg != NULL && PyObject_IsTrue(alias_arg));

    len = u_charName(code, use_alias ? U_CHAR_NAME_ALIAS : U_UNICODE_CHAR_NAME, buf, 511, &err);
    if (U_FAILURE(err)) PyErr_SetString(PyExc_ValueError, "Failed to get name for code");

    return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", buf, len);
} // }}}
// chr {{{ // chr {{{
static PyObject * static PyObject *
icu_chr(PyObject *self, PyObject *args) { icu_chr(PyObject *self, PyObject *args) {
@ -786,6 +809,10 @@ static PyMethodDef icu_methods[] = {
"character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string." "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string."
}, },
{"character_name_from_code", icu_character_name_from_code, METH_VARARGS,
"character_name_from_code(code, alias=False) -> Return the name for the specified unicode code point"
},
{"chr", icu_chr, METH_VARARGS, {"chr", icu_chr, METH_VARARGS,
"chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)." "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
}, },

View File

@ -140,6 +140,16 @@ def character_name(string):
except (TypeError, ValueError, KeyError): except (TypeError, ValueError, KeyError):
pass pass
def character_name_from_code(code):
    '''
    Return the unicode name for the code point ``code``, or an empty string.

    The name is looked up via ``_icu.character_name_from_code()`` and decoded
    from UTF-8. If that lookup raises AttributeError (presumably because the
    ICU extension module is unavailable, i.e. ``_icu`` is None), python's own
    ``unicodedata`` database is consulted instead. A TypeError, ValueError or
    KeyError raised by either lookup results in an empty string.
    '''
    result = ''
    try:
        try:
            result = _icu.character_name_from_code(code).decode('utf-8')
        except AttributeError:
            # Fall back to the name database bundled with the interpreter
            from unicodedata import name as lookup_name
            result = lookup_name(py_safe_chr(code), '')
    except (TypeError, ValueError, KeyError):
        result = ''
    # Normalise any falsy result to an empty string
    return result or ''
if sys.maxunicode >= 0x10ffff: if sys.maxunicode >= 0x10ffff:
try: try:
py_safe_chr = unichr py_safe_chr = unichr
@ -212,6 +222,7 @@ def icu_collation_order(collator, a):
load_icu() load_icu()
load_collator() load_collator()
_icu_not_ok = _icu is None or _collator is None _icu_not_ok = _icu is None or _collator is None
icu_unicode_version = getattr(_icu, 'unicode_version', None)
try: try:
senc = sys.getdefaultencoding() senc = sys.getdefaultencoding()