From 67367f521d7c640ce046ed614b37f382e55653ec Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 3 Dec 2010 20:18:56 -0700 Subject: [PATCH] Use ICU for sorting --- setup/build_environment.py | 4 + setup/extensions.py | 14 +- setup/installer/windows/freeze.py | 2 +- setup/installer/windows/notes.rst | 9 ++ src/calibre/constants.py | 3 +- src/calibre/library/caches.py | 9 +- src/calibre/utils/icu.c | 220 ++++++++++++++++++++++++++++++ src/calibre/utils/icu.py | 53 +++++++ 8 files changed, 306 insertions(+), 8 deletions(-) create mode 100644 src/calibre/utils/icu.c create mode 100644 src/calibre/utils/icu.py diff --git a/setup/build_environment.py b/setup/build_environment.py index c021ebc6a6..d6581a907d 100644 --- a/setup/build_environment.py +++ b/setup/build_environment.py @@ -91,11 +91,15 @@ podofo_inc = '/usr/include/podofo' podofo_lib = '/usr/lib' chmlib_inc_dirs = chmlib_lib_dirs = [] sqlite_inc_dirs = [] +icu_inc_dirs = [] +icu_lib_dirs = [] if iswindows: prefix = r'C:\cygwin\home\kovid\sw' sw_inc_dir = os.path.join(prefix, 'include') sw_lib_dir = os.path.join(prefix, 'lib') + icu_inc_dirs = [sw_inc_dir] + icu_lib_dirs = [sw_lib_dir] sqlite_inc_dirs = [sw_inc_dir] fc_inc = os.path.join(sw_inc_dir, 'fontconfig') fc_lib = sw_lib_dir diff --git a/setup/extensions.py b/setup/extensions.py index d4ac8e188c..3862cce62a 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -18,7 +18,8 @@ from setup.build_environment import fc_inc, fc_lib, chmlib_inc_dirs, \ QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk, \ magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \ magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, \ - jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs + jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs, \ + icu_lib_dirs MT isunix = islinux or isosx or isfreebsd @@ -56,8 +57,19 @@ pdfreflow_libs = [] if iswindows: pdfreflow_libs = ['advapi32', 'User32', 'Gdi32', 'zlib'] +icu_libs = ['icudata', 'icui18n', 'icuuc', 'icuio'] +if iswindows: + icu_libs = ['icudt', 'icuin', 'icuuc', 'icuio'] + extensions = [ + Extension('icu', + ['calibre/utils/icu.c'], + libraries=icu_libs, + lib_dirs=icu_lib_dirs, + inc_dirs=icu_inc_dirs, + ), + Extension('sqlite_custom', ['calibre/library/sqlite_custom.c'], inc_dirs=sqlite_inc_dirs diff --git a/setup/installer/windows/freeze.py b/setup/installer/windows/freeze.py index 30cc2a97af..7d8ea4d80a 100644 --- a/setup/installer/windows/freeze.py +++ b/setup/installer/windows/freeze.py @@ -199,7 +199,7 @@ class Win32Freeze(Command, WixMixIn): for pat in ('*.dll',): for f in glob.glob(os.path.join(bindir, pat)): ok = True - for ex in ('expatw',): + for ex in ('expatw', 'testplug'): if ex in f.lower(): ok = False if not ok: continue diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst index 9c553c42e8..af4c871dac 100644 --- a/setup/installer/windows/notes.rst +++ b/setup/installer/windows/notes.rst @@ -77,6 +77,15 @@ Test it on the target system with calibre-debug -c "import _imaging, _imagingmath, _imagingft, _imagingcms" +ICU +------- + +Download the win32 msvc9 binary from http://www.icu-project.org/download/4.4.html + +Note that 4.4 is the last version of ICU that can be compiled (is precompiled) with msvc9 + +Put the dlls into sw/bin and the unicode dir into sw/include and the contents of lib int sw/lib + Libunrar ---------- diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 197fe5888a..f9c177e7a8 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -67,7 +67,8 @@ if plugins is None: 'pdfreflow', 'progress_indicator', 'chmlib', - 'chm_extra' + 'chm_extra', + 'icu', ] + \ (['winutil'] if iswindows else []) + \ (['usbobserver'] if isosx else []): diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 7b4c66c8b8..7c1dea792c 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -796,11 +796,13 @@ class SortKey(object): class SortKeyGenerator(object): def __init__(self, fields, field_metadata, data): + from calibre.utils.icu import sort_key self.field_metadata = field_metadata self.orders = [-1 if x[1] else 1 for x in fields] self.entries = [(x[0], field_metadata[x[0]]) for x in fields] self.library_order = tweaks['title_series_sorting'] == 'library_order' self.data = data + self.string_sort_key = sort_key def __call__(self, record): values = tuple(self.itervals(self.data[record])) @@ -821,17 +823,14 @@ class SortKeyGenerator(object): if val is None: val = ('', 1) else: - val = val.lower() if self.library_order: val = title_sort(val) sidx_fm = self.field_metadata[name + '_index'] sidx = record[sidx_fm['rec_index']] - val = (val, sidx) + val = (self.string_sort_key(val), sidx) elif dt in ('text', 'comments', 'composite', 'enumeration'): - if val is None: - val = '' - val = val.lower() + val = self.string_sort_key(val) elif dt == 'bool': val = {True: 1, False: 2, None: 3}.get(val, 3) diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c new file mode 100644 index 0000000000..6e06d54dff --- /dev/null +++ b/src/calibre/utils/icu.c @@ -0,0 +1,220 @@ +#define UNICODE +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include +#include + + +// Collator object definition {{{ +typedef struct { + PyObject_HEAD + // Type-specific fields go here. + UCollator *collator; + +} icu_Collator; + +static void +icu_Collator_dealloc(icu_Collator* self) +{ + if (self->collator != NULL) ucol_close(self->collator); + self->collator = NULL; + self->ob_type->tp_free((PyObject*)self); +} + +static PyObject * +icu_Collator_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + icu_Collator *self; + const char *loc; + UErrorCode status = U_ZERO_ERROR; + + if (!PyArg_ParseTuple(args, "s", &loc)) return NULL; + + self = (icu_Collator *)type->tp_alloc(type, 0); + if (self != NULL) { + self->collator = ucol_open(loc, &status); + if (self->collator == NULL || U_FAILURE(status)) { + PyErr_SetString(PyExc_Exception, "Failed to create collator."); + self->collator = NULL; + Py_DECREF(self); + return NULL; + } + } + + return (PyObject *)self; +} + +// Collator.display_name {{{ +static PyObject * +icu_Collator_display_name(icu_Collator *self, void *closure) { + const char *loc = NULL; + UErrorCode status = U_ZERO_ERROR; + UChar dname[400]; + char buf[100]; + + loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status); + if (loc == NULL || U_FAILURE(status)) { + PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL; + } + ucol_getDisplayName(loc, "en", dname, 100, &status); + if (U_FAILURE(status)) return PyErr_NoMemory(); + + u_strToUTF8(buf, 100, NULL, dname, -1, &status); + if (U_FAILURE(status)) { + PyErr_SetString(PyExc_Exception, "Failed ot convert dname to UTF-8"); return NULL; + } + return Py_BuildValue("s", buf); +} + +// }}} + +// Collator.actual_locale {{{ +static PyObject * +icu_Collator_actual_locale(icu_Collator *self, void *closure) { + const char *loc = NULL; + UErrorCode status = U_ZERO_ERROR; + + loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status); + if (loc == NULL || U_FAILURE(status)) { + PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL; + } + return Py_BuildValue("s", loc); +} + +// }}} + +// Collator.sort_key {{{ +static PyObject * +icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) { + PyObject *o; + Py_ssize_t sz; + wchar_t *buf; + UChar *buf2; + uint8_t *buf3; + PyObject *ans; + UErrorCode status = U_ZERO_ERROR; + + if (!PyArg_ParseTuple(args, "U", &o)) return NULL; + + sz = PyUnicode_GetSize(o); + + buf = (wchar_t*)calloc(sz*2 + 1, sizeof(wchar_t)); + buf2 = (UChar*)calloc(sz*2 + 1, sizeof(UChar)); + buf3 = (uint8_t*)calloc(sz*4 + 1, sizeof(uint8_t)); + + if (buf == NULL || buf2 == NULL || buf3 == NULL) return PyErr_NoMemory(); + + PyUnicode_AsWideChar((PyUnicodeObject *)o, buf, sz); + + u_strFromWCS(buf2, 2*sz+1, NULL, buf, -1, &status); + if (U_SUCCESS(status)) + ucol_getSortKey(self->collator, buf2, -1, buf3, sz*4+1); + + ans = PyBytes_FromString((char *)buf3); + free(buf3); free(buf); free(buf2); + if (ans == NULL) return PyErr_NoMemory(); + + return ans; +} + +static PyMethodDef icu_Collator_methods[] = { + {"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS, + "sort_key(unicode object) -> Return a sort key for the given object as a bytestring. The idea is that these bytestring will sort using the builtin cmp function, just like the original unicode strings would sort in the current locale with ICU." + }, + + {NULL} /* Sentinel */ +}; + +static PyGetSetDef icu_Collator_getsetters[] = { + {(char *)"actual_locale", + (getter)icu_Collator_actual_locale, NULL, + (char *)"Actual locale used by this collator.", + NULL}, + + {(char *)"display_name", + (getter)icu_Collator_display_name, NULL, + (char *)"Display name of this collator in English. The name reflects the actual data source used.", + NULL}, + + {NULL} /* Sentinel */ +}; + +static PyTypeObject icu_CollatorType = { // {{{ + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "icu.Collator", /*tp_name*/ + sizeof(icu_Collator), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)icu_Collator_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "Collator", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + icu_Collator_methods, /* tp_methods */ + 0, /* tp_members */ + icu_Collator_getsetters, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + icu_Collator_new, /* tp_new */ +}; // }}} + +// }} + + +// }}} + + +// Module initialization {{{ + +static PyMethodDef icu_methods[] = { + {NULL} /* Sentinel */ +}; + + +PyMODINIT_FUNC +initicu(void) +{ + PyObject* m; + UErrorCode status = U_ZERO_ERROR; + + u_init(&status); + + + if (PyType_Ready(&icu_CollatorType) < 0) + return; + + m = Py_InitModule3("icu", icu_methods, + "Wrapper for the ICU internationalization library"); + + Py_INCREF(&icu_CollatorType); + PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType); + // uint8_t must be the same size as char + PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0); + +} +// }}} diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py new file mode 100644 index 0000000000..5b432747f0 --- /dev/null +++ b/src/calibre/utils/icu.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from functools import partial + +from calibre.constants import plugins + +_icu = _collator = None + +_none = u'' +_none2 = b'' + +def load_icu(): + global _icu + if _icu is None: + _icu = plugins['icu'][0] + if _icu is None: + print plugins['icu'][1] + else: + if not _icu.ok: + print 'icu not ok' + _icu = None + return _icu + +def load_collator(): + global _collator + from calibre.utils.localization import get_lang + if _collator is None: + icu = load_icu() + if icu is not None: + _collator = icu.Collator(get_lang()) + return _collator + + +def py_sort_key(obj): + if not obj: + return _none + return obj.lower() + +def icu_sort_key(collator, obj): + if not obj: + return _none2 + return collator.sort_key(obj.lower()) + +load_icu() +load_collator() +sort_key = py_sort_key if _icu is None or _collator is None else \ + partial(icu_sort_key, _collator) +