From 22ca2bcac474dffef5e5fd55c1c8d0cd56b63e4c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 6 Mar 2014 11:45:11 +0530
Subject: [PATCH] Edit Book: Add support for showing the names of non-BMP characters on narrow python builds (windows and OS X)

---
 src/calibre/gui2/tweak_book/char_select.py | 13 +---
 src/calibre/gui2/tweak_book/ui.py          |  5 +-
 src/calibre/utils/icu.c                    | 79 ++++++++++++++++++++++
 src/calibre/utils/icu.py                   | 30 +++++++++
 4 files changed, 114 insertions(+), 13 deletions(-)

diff --git a/src/calibre/gui2/tweak_book/char_select.py b/src/calibre/gui2/tweak_book/char_select.py
index 5a973efa3e..c7b83cfab9 100644
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@@ -17,21 +17,12 @@ from PyQt4.Qt import (
     QStyledItemDelegate, QSplitter, QLabel, QSizePolicy, QIcon, QMimeData,
     QPushButton, QToolButton, QInputMethodEvent)
 
-from calibre.constants import ispy3, plugins, cache_dir
+from calibre.constants import plugins, cache_dir
 from calibre.gui2 import NONE
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
 from calibre.gui2.tweak_book.widgets import Dialog
-
-if not ispy3:
-    if sys.maxunicode >= 0x10FFFF:
-        chr = unichr
-    else:
-        def chr(i):
-            # Narrow builds of python cannot represent code point > 0xffff as a
-            # single character, so we need our own implementation of unichr
-            # that returns them as a surrogate pair
-            return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
+from calibre.utils.icu import safe_chr as chr
 
 ROOT = QModelIndex()
 
diff --git a/src/calibre/gui2/tweak_book/ui.py b/src/calibre/gui2/tweak_book/ui.py
index 872998eef1..6915063535 100644
--- a/src/calibre/gui2/tweak_book/ui.py
+++ b/src/calibre/gui2/tweak_book/ui.py
@@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal '
 
-import unicodedata, os
+import os
 from functools import partial
 from itertools import product
 from future_builtins import map
@@ -33,6 +33,7 @@ from calibre.gui2.tweak_book.toc import TOCViewer
 from calibre.gui2.tweak_book.char_select import CharSelect
 from calibre.gui2.tweak_book.editor.widget import register_text_editor_actions
 from calibre.gui2.tweak_book.editor.insert_resource import InsertImage
+from calibre.utils.icu import character_name
 
 def open_donate():
     open_url(QUrl('http://calibre-ebook.com/donate'))
@@ -188,7 +189,7 @@ class CursorPositionWidget(QWidget): # {{{
             self.la.setText('')
         else:
             try:
-                name = unicodedata.name(character, None) if character and tprefs['editor_show_char_under_cursor'] else None
+                name = character_name(character) if character and tprefs['editor_show_char_under_cursor'] else None
             except Exception:
                 name = None
             text = _('Line: {0} : {1}').format(line, col)
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index aee47448fd..608b0e62ab 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -1,8 +1,10 @@
 #define UNICODE
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <unicode/uversion.h>
 #include <unicode/utypes.h>
 #include <unicode/uclean.h>
+#include <unicode/utf16.h>
 #include <unicode/ucol.h>
 #include <unicode/ucoleitr.h>
 #include <unicode/ustring.h>
@@ -672,6 +674,7 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
 
 }
 // }}}
+
 // set_default_encoding {{{
 static PyObject *
 icu_get_available_transliterators(PyObject *self, PyObject *args) {
@@ -701,6 +704,65 @@ icu_get_available_transliterators(PyObject *self, PyObject *args) {
 
 }
 // }}}
+
+// character_name {{{
+// Return the name (or, when alias is true, the name alias) of the first code
+// point of the supplied string, as a UTF-8 encoded byte string.
+static PyObject *
+icu_character_name(PyObject *self, PyObject *args) {
+    char *input = NULL, name[512] = {0};
+    int32_t sz, alias = 0;
+    UChar *buf;
+    UErrorCode status = U_ZERO_ERROR;
+    PyObject *palias = NULL;
+    UChar32 code = 0;
+
+    if (!PyArg_ParseTuple(args, "es|O", "UTF-8", &input, &palias)) return NULL;
+
+    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
+
+    sz = (int32_t)strlen(input);
+    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
+    if (buf == NULL) { PyErr_NoMemory(); goto end; }
+    u_strFromUTF8(buf, sz*4, &sz, input, sz, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to decode char string"); goto end; }
+    U16_GET(buf, 0, 0, -1, code);
+    if (alias) {
+        sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status);
+    } else {
+        sz = u_charName(code, U_UNICODE_CHAR_NAME, name, 511, &status);
+    }
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to get name for code"); goto end; }
+end:
+    if (buf != NULL) free(buf);
+    PyMem_Free(input);
+
+    // PY_SSIZE_T_CLEAN is defined, so the length for "s#" must be a Py_ssize_t
+    return (PyErr_Occurred()) ? NULL : Py_BuildValue("s#", name, (Py_ssize_t)sz);
+} // }}}
+
+// chr {{{
+// Return the unicode string for the specified code point. On narrow python
+// builds, non-BMP code points are returned as a surrogate pair.
+static PyObject *
+icu_chr(PyObject *self, PyObject *args) {
+    UErrorCode status = U_ZERO_ERROR;
+    UChar32 code = 0;
+    UChar buf[5] = {0};
+    int32_t sz = 0;
+    char utf8[21];
+
+    if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
+
+    u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
+    u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
+end:
+    // Decode to a unicode object, as documented, instead of returning raw UTF-8 bytes
+    return (PyErr_Occurred()) ? NULL : PyUnicode_DecodeUTF8(utf8, (Py_ssize_t)sz, "strict");
+} // }}}
+
 static PyMethodDef icu_methods[] = {
     {"upper", icu_upper, METH_VARARGS,
         "upper(locale, unicode object) -> upper cased unicode object using locale rules."
@@ -726,6 +788,14 @@ static PyMethodDef icu_methods[] = {
         "get_available_transliterators() -> Return list of available transliterators. This list is rather limited on OS X."
     },
 
+    {"character_name", icu_character_name, METH_VARARGS,
+     "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string."
+    },
+
+    {"chr", icu_chr, METH_VARARGS,
+     "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
+    },
+
     {NULL} /* Sentinel */
 };
 
@@ -735,13 +805,20 @@ PyMODINIT_FUNC
 initicu(void)
 {
     PyObject* m;
+    UVersionInfo ver, uver;
     UErrorCode status = U_ZERO_ERROR;
+    char version[U_MAX_VERSION_STRING_LENGTH+1] = {0};
+    char uversion[U_MAX_VERSION_STRING_LENGTH+5] = {0};
 
     u_init(&status);
     if (U_FAILURE(status)) {
         PyErr_SetString(PyExc_RuntimeError, u_errorName(status));
         return;
     }
+    u_getVersion(ver);
+    u_versionToString(ver, version);
+    u_getUnicodeVersion(uver);
+    u_versionToString(uver, uversion);
 
     if (PyType_Ready(&icu_CollatorType) < 0)
         return;
@@ -753,6 +830,8 @@ initicu(void)
     PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType);
     // uint8_t must be the same size as char
     PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0);
+    PyModule_AddStringConstant(m, "icu_version", version);
+    PyModule_AddStringConstant(m, "unicode_version", uversion);
 
     ADDUCONST(USET_SPAN_NOT_CONTAINED);
     ADDUCONST(USET_SPAN_CONTAINED);
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 9e0df01b85..a70ec19acd 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -130,6 +130,36 @@ def py_find(pattern, source):
         return pos, len(pattern)
     return -1, -1
 
+def character_name(string):
+    ' Return the Unicode name of the first character in string, or None if it has no name '
+    try:
+        try:
+            return _icu.character_name(unicode(string)).decode('utf-8') or None
+        except AttributeError:
+            import unicodedata
+            return unicodedata.name(unicode(string)[0], None)
+    except (TypeError, ValueError, KeyError, IndexError):
+        return None
+
+if sys.maxunicode >= 0x10ffff:
+    try:
+        py_safe_chr = unichr
+    except NameError:
+        py_safe_chr = chr
+else:
+    def py_safe_chr(i):
+        # Narrow builds of python cannot represent code point > 0xffff as a
+        # single character, so we need our own implementation of unichr
+        # that returns them as a surrogate pair
+        return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
+
+def safe_chr(code):
+    ' Like unichr(), but also works for non-BMP code points on narrow python builds '
+    try:
+        return _icu.chr(code)
+    except AttributeError:
+        return py_safe_chr(code)
+
 def icu_find(collator, pattern, source):
     try:
         return collator.find(pattern, source)