From 22ca2bcac474dffef5e5fd55c1c8d0cd56b63e4c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Mar 2014 11:45:11 +0530
Subject: [PATCH] Edit Book: Add support for showing the names of non-BMP
 characters on narrow python builds (windows and OS X)

---
 src/calibre/gui2/tweak_book/char_select.py | 13 +---
 src/calibre/gui2/tweak_book/ui.py          |  5 +-
 src/calibre/utils/icu.c                    | 73 ++++++++++++++++++++++
 src/calibre/utils/icu.py                   | 28 +++++++++
 4 files changed, 106 insertions(+), 13 deletions(-)
diff --git a/src/calibre/gui2/tweak_book/char_select.py b/src/calibre/gui2/tweak_book/char_select.py
index 5a973efa3e..c7b83cfab9 100644
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@@ -17,21 +17,12 @@ from PyQt4.Qt import (
     QStyledItemDelegate, QSplitter, QLabel, QSizePolicy, QIcon, QMimeData,
     QPushButton, QToolButton, QInputMethodEvent)
 
-from calibre.constants import ispy3, plugins, cache_dir
+from calibre.constants import plugins, cache_dir
 from calibre.gui2 import NONE
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
 from calibre.gui2.tweak_book.widgets import Dialog
-
-if not ispy3:
-    if sys.maxunicode >= 0x10FFFF:
-        chr = unichr
-    else:
-        def chr(i):
-            # Narrow builds of python cannot represent code point > 0xffff as a
-            # single character, so we need our own implementation of unichr
-            # that returns them as a surrogate pair
-            return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
+from calibre.utils.icu import safe_chr as chr
 
 ROOT = QModelIndex()
 
diff --git a/src/calibre/gui2/tweak_book/ui.py b/src/calibre/gui2/tweak_book/ui.py
index 872998eef1..6915063535 100644
--- a/src/calibre/gui2/tweak_book/ui.py
+++ b/src/calibre/gui2/tweak_book/ui.py
@@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import unicodedata, os
+import os
 from functools import partial
 from itertools import product
 from future_builtins import map
@@ -33,6 +33,7 @@ from calibre.gui2.tweak_book.toc import TOCViewer
 from calibre.gui2.tweak_book.char_select import CharSelect
 from calibre.gui2.tweak_book.editor.widget import register_text_editor_actions
 from calibre.gui2.tweak_book.editor.insert_resource import InsertImage
+from calibre.utils.icu import character_name
 
 def open_donate():
     open_url(QUrl('http://calibre-ebook.com/donate'))
@@ -188,7 +189,7 @@ class CursorPositionWidget(QWidget):  # {{{
             self.la.setText('')
         else:
             try:
-                name = unicodedata.name(character, None) if character and tprefs['editor_show_char_under_cursor'] else None
+                name = character_name(character) if character and tprefs['editor_show_char_under_cursor'] else None
             except Exception:
                 name = None
             text = _('Line: {0} : {1}').format(line, col)
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index aee47448fd..608b0e62ab 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -1,8 +1,10 @@
 #define UNICODE
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <unicode/uversion.h>
 #include <unicode/utypes.h>
 #include <unicode/uclean.h>
+#include <unicode/utf16.h>
 #include <unicode/ucol.h>
 #include <unicode/ucoleitr.h>
 #include <unicode/ustring.h>
@@ -672,6 +674,7 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
 
 }
 // }}}
+
 // set_default_encoding {{{
 static PyObject *
 icu_get_available_transliterators(PyObject *self, PyObject *args) {
@@ -701,6 +704,59 @@ icu_get_available_transliterators(PyObject *self, PyObject *args) {
 }
 
 // }}}
+
+// character_name {{{
+static PyObject *
+icu_character_name(PyObject *self, PyObject *args) {
+    char *input = NULL, name[512] = {0};
+    int32_t sz, alias = 0;
+    UChar *buf;
+    UErrorCode status = U_ZERO_ERROR;
+    PyObject *palias = NULL;
+    UChar32 code = 0;
+    // Return the name (or the name alias, if the optional second arg is true) of the first code point of the first arg
+    if (!PyArg_ParseTuple(args, "es|O", "UTF-8", &input, &palias)) return NULL;
+
+    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
+    // "es" allocated input as UTF-8 (released at end:); convert it to UTF-16 so U16_GET can read the first code point
+    sz = (int32_t)strlen(input);
+    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
+    if (buf == NULL) { PyErr_NoMemory(); goto end; }
+    u_strFromUTF8(buf, sz*4, &sz, input, sz, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to decode char string"); goto end; }
+    U16_GET(buf, 0, 0, -1, code);
+    if (alias) {
+        sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status);
+    } else {
+        sz = u_charName(code, U_UNICODE_CHAR_NAME, name, 511, &status);
+    }
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to get name for code"); goto end; }
+end:
+    if (buf != NULL) free(buf);
+    PyMem_Free(input);
+    // PY_SSIZE_T_CLEAN is defined, so the length for "s#" must be passed as a Py_ssize_t, not an int32_t
+    return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, (Py_ssize_t)sz);
+} // }}}
+
+// chr {{{
+static PyObject *
+icu_chr(PyObject *self, PyObject *args) {
+    UErrorCode status = U_ZERO_ERROR;
+    UChar32 code = 0;
+    UChar buf[5] = {0};
+    int32_t sz = 0;
+    char utf8[21];
+    // Encode the code point passed as the only arg; the result is a UTF-8 byte string ("s#"), not a unicode object
+    if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
+
+    u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
+    u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
+end:
+    return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", utf8, (Py_ssize_t)sz);  // PY_SSIZE_T_CLEAN: length must be Py_ssize_t
+} // }}}
+
 static PyMethodDef icu_methods[] = {
     {"upper", icu_upper, METH_VARARGS,
         "upper(locale, unicode object) -> upper cased unicode object using locale rules."
@@ -726,6 +782,14 @@ static PyMethodDef icu_methods[] = {
         "get_available_transliterators() -> Return list of available transliterators. This list is rather limited on OS X."
     },
 
+    {"character_name", icu_character_name, METH_VARARGS, 
+     "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string."
+    },
+
+    {"chr", icu_chr, METH_VARARGS, 
+     "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
+    },
+
     {NULL}  /* Sentinel */
 };
 
@@ -735,13 +799,20 @@ PyMODINIT_FUNC
 initicu(void) 
 {
     PyObject* m;
+    UVersionInfo ver, uver;
     UErrorCode status = U_ZERO_ERROR;
+    char version[U_MAX_VERSION_STRING_LENGTH+1] = {0};
+    char uversion[U_MAX_VERSION_STRING_LENGTH+5] = {0};
 
     u_init(&status);
     if (U_FAILURE(status)) {
         PyErr_SetString(PyExc_RuntimeError, u_errorName(status));
         return;
     }
+    u_getVersion(ver);
+    u_versionToString(ver, version);
+    u_getUnicodeVersion(uver);
+    u_versionToString(uver, uversion);
 
     if (PyType_Ready(&icu_CollatorType) < 0)
         return;
@@ -753,6 +824,8 @@ initicu(void)
     PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType);
     // uint8_t must be the same size as char
     PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0);
+    PyModule_AddStringConstant(m, "icu_version", version);
+    PyModule_AddStringConstant(m, "unicode_version", uversion);
 
     ADDUCONST(USET_SPAN_NOT_CONTAINED);
     ADDUCONST(USET_SPAN_CONTAINED);
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 9e0df01b85..a70ec19acd 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -130,6 +130,34 @@ def py_find(pattern, source):
         return pos, len(pattern)
     return -1, -1
 
+def character_name(string):  # name of the first character in string, or None
+    try:
+        try:
+            return _icu.character_name(unicode(string)).decode('utf-8') or None
+        except AttributeError:  # ICU function unavailable, use the stdlib
+            import unicodedata
+            return unicodedata.name(unicode(string)[0], None)
+    except (TypeError, ValueError, KeyError, IndexError):
+        return None  # IndexError: empty string on the unicodedata path
+
+if sys.maxunicode >= 0x10ffff:
+    try:
+        py_safe_chr = unichr  # the builtin already handles non-BMP codes
+    except NameError:
+        py_safe_chr = chr  # no unichr builtin, chr is its equivalent
+else:
+    def py_safe_chr(i):
+        # Narrow builds of python cannot represent code point > 0xffff as a
+        # single character, so we need our own implementation of unichr
+        # that returns them as a surrogate pair (%08x also handles longs)
+        return (b"\\U%08x" % i).decode('unicode-escape')
+
+def safe_chr(code):  # unichr() replacement that also handles non-BMP codes
+    try:
+        return _icu.chr(code).decode('utf-8')  # _icu.chr returns UTF-8 bytes
+    except AttributeError:
+        return py_safe_chr(code)
+
 def icu_find(collator, pattern, source):
     try:
         return collator.find(pattern, source)