From 63cba4c884a688b9d3c5c06ddea0a28c49413f6b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Mar 2014 13:10:48 +0530
Subject: [PATCH] Insert special char: Allow searching for non BMP characters,
 by using the ICU database of names rather than python's outdated one.

---
 src/calibre/gui2/tweak_book/char_select.py | 13 ++++++-----
 src/calibre/utils/icu.c                    | 27 ++++++++++++++++++++++
 src/calibre/utils/icu.py                   | 11 +++++++++
 3 files changed, 45 insertions(+), 6 deletions(-)
diff --git a/src/calibre/gui2/tweak_book/char_select.py b/src/calibre/gui2/tweak_book/char_select.py
index c7b83cfab9..9530087210 100644
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import unicodedata, re, os, cPickle, sys, textwrap
+import unicodedata, re, os, cPickle, textwrap
 from bisect import bisect
 from functools import partial
 from collections import defaultdict
@@ -22,7 +22,7 @@ from calibre.gui2 import NONE
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
 from calibre.gui2.tweak_book.widgets import Dialog
-from calibre.utils.icu import safe_chr as chr
+from calibre.utils.icu import safe_chr as chr, icu_unicode_version, character_name_from_code
 
 ROOT = QModelIndex()
 
@@ -35,9 +35,10 @@ non_printing = {
 }
 
 # Searching {{{
+
 def load_search_index():
-    topchar = sys.maxunicode
-    ver = (1, topchar, unicodedata.unidata_version)  # Increment this when you make any changes to the index
+    topchar = 0x10ffff
+    ver = (1, topchar, icu_unicode_version or unicodedata.unidata_version)  # Increment this when you make any changes to the index
     name_map = {}
     path = os.path.join(cache_dir(), 'unicode-name-index.pickle')
     if os.path.exists(path):
@@ -48,7 +49,7 @@ def load_search_index():
     if not name_map:
         name_map = defaultdict(set)
         for x in xrange(1, topchar + 1):
-            for word in unicodedata.name(chr(x), '').split():
+            for word in character_name_from_code(x).split():
                 name_map[word.lower()].add(x)
         from calibre.ebooks.html_entities import html5_entities
         for name, char in html5_entities.iteritems():
@@ -465,7 +466,7 @@ class CategoryModel(QAbstractItemModel):
             category, subcategory = self.category_map[self.starts[ipos]]
         except IndexError:
             category = subcategory = _('Unknown')
-        return category, subcategory, unicodedata.name(chr(char_code), _('Unknown'))
+        return category, subcategory, (character_name_from_code(char_code) or _('Unknown'))
 
 class CategoryDelegate(QStyledItemDelegate):
 
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index f42a9dbaad..3723fc00cc 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -738,6 +738,29 @@ end:
     return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, sz);
 } // }}}
 
+// character_name_from_code {{{
+// Python: character_name_from_code(code, alias=False) -> the name u_charName()
+// reports for code (the alias name if alias is true); ValueError if ICU fails.
+static PyObject *
+icu_character_name_from_code(PyObject *self, PyObject *args) {
+    char name[512] = {0};
+    int32_t sz = 0;
+    int alias = 0;
+    unsigned int code = 0;  // the "I" format unit stores through an unsigned int *
+    UErrorCode status = U_ZERO_ERROR;
+    PyObject *palias = NULL;
+
+    if (!PyArg_ParseTuple(args, "I|O", &code, &palias)) return NULL;
+    if (palias != NULL) {
+        alias = PyObject_IsTrue(palias);
+        if (alias < 0) return NULL;  // the truth test itself raised; propagate it
+    }
+    // Offer one byte less than the buffer holds so name always stays NUL terminated
+    sz = u_charName((UChar32)code, alias ? U_CHAR_NAME_ALIAS : U_UNICODE_CHAR_NAME, name, (int32_t)sizeof(name) - 1, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to get name for code"); return NULL; }
+    return Py_BuildValue("s#", name, sz);
+} // }}}
+
 // chr {{{
 static PyObject *
 icu_chr(PyObject *self, PyObject *args) {
@@ -786,6 +809,10 @@ static PyMethodDef icu_methods[] = {
      "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string."
     },
 
+    {"character_name_from_code", icu_character_name_from_code, METH_VARARGS, 
+     "character_name_from_code(code, alias=False) -> Return the name for the specified unicode code point"
+    },
+
     {"chr", icu_chr, METH_VARARGS, 
      "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
     },
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 82c46dac57..22f8f2f2a8 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -140,6 +140,16 @@ def character_name(string):
     except (TypeError, ValueError, KeyError):
         pass
 
+def character_name_from_code(code):  # returns the name for the code point, '' when no name is found
+    try:
+        try:
+            return _icu.character_name_from_code(code).decode('utf-8') or ''  # prefer the ICU name database
+        except AttributeError:  # _icu is None or has no such function: use python's own database
+            import unicodedata
+            return unicodedata.name(py_safe_chr(code), '')
+    except (TypeError, ValueError, KeyError):  # a failed lookup in either backend yields an empty name
+        return ''
+
 if sys.maxunicode >= 0x10ffff:
     try:
         py_safe_chr = unichr
@@ -212,6 +222,7 @@ def icu_collation_order(collator, a):
 load_icu()
 load_collator()
 _icu_not_ok = _icu is None or _collator is None
+icu_unicode_version = getattr(_icu, 'unicode_version', None)
 
 try:
     senc = sys.getdefaultencoding()