diff --git a/setup/extensions.py b/setup/extensions.py
index 7b2d7dfed0..6a78e0e058 100644
--- a/setup/extensions.py
+++ b/setup/extensions.py
@@ -177,6 +177,11 @@ extensions = [
         inc_dirs=(['calibre/utils/chm'] if iswindows else []) # For stdint.h
     ),
 
+    Extension('matcher',
+        ['calibre/gui2/tweak_book/matcher.c'],
+        inc_dirs=(['calibre/utils/chm'] if iswindows else []) # For stdint.h
+    ),
+
     Extension('podofo',
                     [
                         'calibre/utils/podofo/utils.cpp',
diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index 79f3e71c96..0270dbef6d 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -143,6 +143,7 @@ class Plugins(collections.Mapping):
                 'hunspell',
                 '_patiencediff_c',
                 'bzzdec',
+                'matcher',
             ]
         if iswindows:
             plugins.extend(['winutil', 'wpd', 'winfonts'])
diff --git a/src/calibre/gui2/tweak_book/matcher.c b/src/calibre/gui2/tweak_book/matcher.c
new file mode 100644
index 0000000000..4ba015faf3
--- /dev/null
+++ b/src/calibre/gui2/tweak_book/matcher.c
@@ -0,0 +1,430 @@
+/*
+ * matcher.c
+ * Copyright (C) 2014 Kovid Goyal
+ *
+ * Distributed under terms of the GPL3 license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <float.h>
+#include <stdlib.h>
+#include <string.h>
+#include <search.h>
+
+#ifdef _MSC_VER
+#include "stdint.h"
+// inline does not work with the visual studio C compiler
+#define inline
+#else
+#include <stdint.h>
+#endif
+
+
+typedef uint8_t bool;
+#define TRUE 1
+#define FALSE 0
+// Arguments are parenthesized so the macros are safe with expressions
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+// free(NULL) is a no-op; do/while makes this expand to a single statement
+#define nullfree(x) do { free(x); (x) = NULL; } while (0)
+
+// Algorithm to sort items by subsequence score {{{
+// Per-item state for one scoring run
+typedef struct {
+    char *haystack;             // bytes of the item being scored
+    uint32_t haystack_len;      // number of bytes in haystack
+    char *needle;               // bytes of the query
+    uint32_t needle_len;        // number of bytes in needle
+    double max_score_per_char;  // score of one matched char before penalties
+    double *memo;               // needle_len x haystack_len cache, DBL_MAX == unset
+} MatchInfo;
+
+// One sortable result: the item, its tie-break key and its score
+typedef struct {
+    char *item;
+    char *sort_key;
+    uint32_t sort_key_len;
+    PyObject *py_item;          // borrowed reference, owned by Matcher.py_items
+    double score;
+} Match;
+
+// Score needle[needle_idx:] as a subsequence of haystack[haystack_idx:].
+// last_idx is the position of the previously matched haystack char and score
+// is the score accumulated so far. Returns the best total score found, or 0.0
+// when the rest of the needle cannot be matched.
+static double recursive_match(MatchInfo *m, uint32_t haystack_idx, uint32_t needle_idx, uint32_t last_idx, double score) {
+    double seen_score = 0.0, memoized = DBL_MAX, score_for_char, factor, sub_score;
+    uint32_t i = 0, j = 0, distance;
+    char c, d, last, curr;
+    bool found;
+    // haystack_idx is advanced by the loops below, so the memo slot is fixed
+    // here: the lookup and the store must address the same cell. The row
+    // stride is haystack_len so that distinct (needle_idx, haystack_idx)
+    // pairs never share a cell.
+    const size_t memo_idx = (size_t)needle_idx * m->haystack_len + haystack_idx;
+
+    // do we have a memoized result we can return?
+    memoized = m->memo[memo_idx];
+    if (memoized != DBL_MAX)
+        return memoized;
+
+    // bail early if not enough room (left) in haystack for (rest of) needle
+    if (m->haystack_len - haystack_idx < m->needle_len - needle_idx) {
+        score = 0.0;
+        goto memoize;
+    }
+    for (i = needle_idx; i < m->needle_len; i++) {
+        c = m->needle[i];
+        found = FALSE;
+
+        // similar to above, we'll stop iterating when we know we're too close
+        // to the end of the string to possibly match
+        for (j = haystack_idx;
+             j <= m->haystack_len - (m->needle_len - i);
+             j++, haystack_idx++) {
+            d = m->haystack[j];
+            // only the haystack char is downcased; the needle is compared as given
+            if (d >= 'A' && d <= 'Z') {
+                d += 'a' - 'A'; // add 32 to downcase
+            }
+
+            if (c == d) {
+                found = TRUE;
+
+                // calculate score
+                score_for_char = m->max_score_per_char;
+                distance = j - last_idx;
+
+                if (distance > 1) {
+                    factor = 1.0;
+                    last = m->haystack[j - 1];
+                    curr = m->haystack[j]; // case matters, so get again
+                    if (last == '/')
+                        factor = 0.9;
+                    else if (last == '-' ||
+                            last == '_' ||
+                            last == ' ' ||
+                            (last >= '0' && last <= '9'))
+                        factor = 0.8;
+                    else if (last >= 'a' && last <= 'z' &&
+                            curr >= 'A' && curr <= 'Z')
+                        factor = 0.8;
+                    else if (last == '.')
+                        factor = 0.7;
+                    else
+                        // if no "special" chars behind char, factor diminishes
+                        // as distance from last matched char increases
+                        factor = (1.0 / distance) * 0.75;
+                    score_for_char *= factor;
+                }
+
+                if (++j < m->haystack_len) {
+                    // bump cursor one char to the right and
+                    // use recursion to try and find a better match
+                    sub_score = recursive_match(m, j, i, last_idx, score);
+                    if (sub_score > seen_score)
+                        seen_score = sub_score;
+                }
+                score += score_for_char;
+                // Record where this char matched and step past it, so the
+                // next needle char cannot re-use the same haystack position
+                last_idx = haystack_idx++;
+                break;
+            }
+        }
+
+        if (!found) {
+            score = 0.0;
+            goto memoize;
+        }
+    }
+
+    score = score > seen_score ? score : seen_score;
+
+memoize:
+    m->memo[memo_idx] = score;
+    return score;
+}
+
+// Score every item against needle, storing the result in match_results[i].score.
+// Returns FALSE only if memory could not be allocated.
+static bool match(char **items, uint32_t *item_lengths, uint32_t item_count, char *needle, uint32_t needle_len, Match *match_results) {
+    uint32_t i = 0, maxhl = 0;
+    size_t n = 0, cells = 0;
+    MatchInfo *matches = NULL;
+    bool ok = FALSE;
+    double *memo = NULL;
+
+    // Nothing to score. Returning here also avoids calloc(0, ...), which is
+    // allowed to return NULL and would be misreported as out of memory.
+    if (needle_len == 0 || item_count == 0) {
+        for (i = 0; i < item_count; i++) match_results[i].score = 0.0;
+        return TRUE;
+    }
+
+    matches = (MatchInfo*)calloc(item_count, sizeof(MatchInfo));
+    if (matches == NULL) goto end;
+
+    for (i = 0; i < item_count; i++) {
+        matches[i].haystack = items[i];
+        matches[i].haystack_len = item_lengths[i];
+        matches[i].needle = needle;
+        matches[i].needle_len = needle_len;
+        matches[i].max_score_per_char = (1.0 / matches[i].haystack_len + 1.0 / needle_len) / 2.0;
+        maxhl = MAX(maxhl, matches[i].haystack_len);
+    }
+
+    // One buffer, sized for the longest item, is shared by all items. At
+    // least one cell is allocated because recursive_match() always reads cell
+    // 0, even for an empty haystack.
+    memo = (double*)calloc((size_t)MAX(maxhl, 1) * needle_len, sizeof(double));
+    if (memo == NULL) goto end;
+
+    for (i = 0; i < item_count; i++) {
+        // Only the cells this item can address need to be reset
+        cells = (size_t)MAX(matches[i].haystack_len, 1) * needle_len;
+        for (n = 0; n < cells; n++) memo[n] = DBL_MAX;
+        matches[i].memo = memo;
+        match_results[i].score = recursive_match(&matches[i], 0, 0, 0, 0.0);
+    }
+
+    ok = TRUE;
+end:
+    nullfree(matches);
+    nullfree(memo);
+    return ok;
+}
+
+// Order two Match entries: higher score first, ties broken by sort key.
+// arg is unused; it is kept so existing users of this signature still work.
+int cmp_score(const void *a, const void *b, void *arg)
+{
+    const Match *a_match = (const Match *)a;
+    const Match *b_match = (const Match *)b;
+    int order;
+    (void)arg;
+
+    if (a_match->score > b_match->score)
+        return -1;  // a scores higher, a should appear sooner
+    if (a_match->score < b_match->score)
+        return 1;   // b scores higher, a should appear later
+    order = strncmp(a_match->sort_key, b_match->sort_key, MIN(a_match->sort_key_len, b_match->sort_key_len));
+    if (order != 0)
+        return order;
+    // Keys equal over the compared length: the shorter key sorts first
+    if (a_match->sort_key_len != b_match->sort_key_len)
+        return (a_match->sort_key_len < b_match->sort_key_len) ? -1 : 1;
+    return 0;
+}
+
+// Adapter with the comparator signature of standard qsort(). qsort_r() and
+// qsort_s() are not used because their argument orders differ between glibc,
+// BSD/OS X and MSVC, and no context argument is needed here.
+static int cmp_score_qsort(const void *a, const void *b)
+{
+    return cmp_score(a, b, NULL);
+}
+// }}}
+
+// Matcher object definition {{{
+typedef struct {
+    PyObject_HEAD
+    // Type-specific fields go here.
+    char **items;                 // pointers into the bytes objects of py_items
+    char **sort_items;            // pointers into the bytes objects of py_sort_keys
+    uint32_t item_count;
+    uint32_t *item_lengths;
+    uint32_t *sort_item_lengths;
+    PyObject *py_items;           // owned reference, keeps items[] valid
+    PyObject *py_sort_keys;       // owned reference, keeps sort_items[] valid
+
+} Matcher;
+// Matcher.__init__() {{{
+
+// Release everything a Matcher owns. Py_CLEAR NULLs the pointers, so running
+// this twice (failed __init__ followed by dealloc) does not DECREF twice.
+#define FREE_MATCHER nullfree(self->items); nullfree(self->sort_items); nullfree(self->item_lengths); nullfree(self->sort_item_lengths); Py_CLEAR(self->py_items); Py_CLEAR(self->py_sort_keys);
+static void
+Matcher_dealloc(Matcher* self)
+{
+    FREE_MATCHER
+    self->ob_type->tp_free((PyObject*)self);
+}
+
+// Matcher(items, sort_keys): both arguments are equal-length sequences of
+// byte strings. Returns 0 on success, -1 with a Python exception set on error.
+static int
+Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
+{
+    PyObject *items = NULL, *sort_keys = NULL, *p = NULL;
+    Py_ssize_t count = 0, i = 0;
+    size_t slots = 0;
+
+    if (!PyArg_ParseTuple(args, "OO", &items, &sort_keys)) return -1;
+    // __init__ can be called again on a live object; drop the old state first
+    FREE_MATCHER
+    self->item_count = 0;
+    self->py_items = PySequence_Fast(items, "Must pass in two sequence objects");
+    self->py_sort_keys = PySequence_Fast(sort_keys, "Must pass in two sequence objects");
+    if (self->py_items == NULL || self->py_sort_keys == NULL) goto end;
+    // Measure the materialized sequences: PySequence_Fast() also accepts
+    // iterables that have no length of their own
+    count = PySequence_Fast_GET_SIZE(self->py_items);
+    if (count != PySequence_Fast_GET_SIZE(self->py_sort_keys)) {
+        PyErr_SetString(PyExc_TypeError, "The sequences must have the same length.");
+        goto end;  // never index py_sort_keys past its end
+    }
+
+    // calloc(0, ...) may legitimately return NULL, so always ask for one slot
+    slots = (count > 0) ? (size_t)count : 1;
+    self->items = (char**)calloc(slots, sizeof(char*));
+    self->sort_items = (char**)calloc(slots, sizeof(char*));
+    self->item_lengths = (uint32_t*)calloc(slots, sizeof(uint32_t));
+    self->sort_item_lengths = (uint32_t*)calloc(slots, sizeof(uint32_t));
+    self->item_count = (uint32_t)count;
+
+    if (self->items == NULL || self->sort_items == NULL || self->item_lengths == NULL || self->sort_item_lengths == NULL) {
+        PyErr_NoMemory(); goto end;
+    }
+
+    for (i = 0; i < count; i++) {
+        p = PySequence_Fast_GET_ITEM(self->py_items, i);
+        self->items[i] = PyBytes_AsString(p);
+        if (self->items[i] == NULL) goto end;
+        self->item_lengths[i] = (uint32_t) PyBytes_GET_SIZE(p);
+        p = PySequence_Fast_GET_ITEM(self->py_sort_keys, i);
+        self->sort_items[i] = PyBytes_AsString(p);
+        if (self->sort_items[i] == NULL) goto end;
+        self->sort_item_lengths[i] = (uint32_t) PyBytes_GET_SIZE(p);
+    }
+
+end:
+    if (PyErr_Occurred()) {
+        FREE_MATCHER
+        self->item_count = 0;
+        return -1;
+    }
+    return 0;
+}
+// Matcher.__init__() }}}
+
+// Matcher.get_matches {{{
+// Return a tuple of all items ordered by how well they match the query
+static PyObject *
+Matcher_get_matches(Matcher *self, PyObject *args) {
+    char *needle = NULL;
+    Py_ssize_t qsize = 0;
+    Match *matches = NULL;
+    bool ok = FALSE;
+    uint32_t i = 0;
+    PyObject *items = NULL;
+
+    if (!PyArg_ParseTuple(args, "s#", &needle, &qsize)) return NULL;
+
+    items = PyTuple_New(self->item_count);
+    // At least one slot, since calloc(0, ...) may return NULL without being
+    // out of memory
+    matches = (Match*)calloc(MAX(self->item_count, 1), sizeof(Match));
+    if (items == NULL || matches == NULL) {PyErr_NoMemory(); goto end;}
+    for (i = 0; i < self->item_count; i++) {
+        matches[i].item = self->items[i];
+        matches[i].sort_key = self->sort_items[i];
+        matches[i].sort_key_len = self->sort_item_lengths[i];
+        matches[i].py_item = PySequence_Fast_GET_ITEM(self->py_items, (Py_ssize_t)i);
+    }
+
+    // Scoring and sorting touch no Python objects, so the GIL can be released
+    Py_BEGIN_ALLOW_THREADS;
+    ok = match(self->items, self->item_lengths, self->item_count, needle, (uint32_t)qsize, matches);
+    if (ok) qsort(matches, self->item_count, sizeof(Match), cmp_score_qsort);
+    Py_END_ALLOW_THREADS;
+
+    if (ok) {
+        for (i = 0; i < self->item_count; i++) {
+            PyTuple_SET_ITEM(items, (Py_ssize_t)i, matches[i].py_item);
+            Py_INCREF(matches[i].py_item);
+        }
+    } else { PyErr_NoMemory(); goto end; }
+
+end:
+    nullfree(matches);
+    if (PyErr_Occurred()) { Py_XDECREF(items); return NULL; }
+    return items;
+} // }}}
+
+static PyMethodDef Matcher_methods[] = {
+    {"get_matches", (PyCFunction)Matcher_get_matches, METH_VARARGS,
+    "get_matches(query) -> Return the sorted list of matches for query which must be a UTF-8 encoded string."
+    },
+
+    {NULL}  /* Sentinel */
+};
+
+
+// }}}
+
+static PyTypeObject MatcherType = { // {{{
+    PyObject_HEAD_INIT(NULL)
+    0,                         /*ob_size*/
+    "matcher.Matcher",         /*tp_name*/
+    sizeof(Matcher),           /*tp_basicsize*/
+    0,                         /*tp_itemsize*/
+    (destructor)Matcher_dealloc, /*tp_dealloc*/
+    0,                         /*tp_print*/
+    0,                         /*tp_getattr*/
+    0,                         /*tp_setattr*/
+    0,                         /*tp_compare*/
+    0,                         /*tp_repr*/
+    0,                         /*tp_as_number*/
+    0,                         /*tp_as_sequence*/
+    0,                         /*tp_as_mapping*/
+    0,                         /*tp_hash */
+    0,                         /*tp_call*/
+    0,                         /*tp_str*/
+    0,                         /*tp_getattro*/
+    0,                         /*tp_setattro*/
+    0,                         /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
+    "Matcher",                 /* tp_doc */
+    0,                         /* tp_traverse */
+    0,                         /* tp_clear */
+    0,                         /* tp_richcompare */
+    0,                         /* tp_weaklistoffset */
+    0,                         /* tp_iter */
+    0,                         /* tp_iternext */
+    Matcher_methods,           /* tp_methods */
+    0,                         /* tp_members */
+    0,                         /* tp_getset */
+    0,                         /* tp_base */
+    0,                         /* tp_dict */
+    0,                         /* tp_descr_get */
+    0,                         /* tp_descr_set */
+    0,                         /* tp_dictoffset */
+    (initproc)Matcher_init,    /* tp_init */
+    0,                         /* tp_alloc */
+    0,                         /* tp_new */
+}; // }}}
+
+static PyMethodDef matcher_methods[] = {
+    {NULL, NULL, 0, NULL}
+};
+
+
+PyMODINIT_FUNC
+initmatcher(void) {
+    PyObject *m;
+    MatcherType.tp_new = PyType_GenericNew;
+    if (PyType_Ready(&MatcherType) < 0)
+        return;
+    m = Py_InitModule3("matcher", matcher_methods, "Find subsequence matches");
+    if (m == NULL) return;
+
+    Py_INCREF(&MatcherType);
+    PyModule_AddObject(m, "Matcher", (PyObject *)&MatcherType);
+
+}
+
+
+
diff --git a/src/calibre/gui2/tweak_book/matcher.py b/src/calibre/gui2/tweak_book/matcher.py
new file mode 100644
index 0000000000..604c95a857
--- /dev/null
+++ b/src/calibre/gui2/tweak_book/matcher.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal '
+
+from unicodedata import normalize
+
+from future_builtins import map
+
+from calibre.constants import plugins
+from calibre.utils.icu import primary_sort_key
+
+class Matcher(object):
+    ''' Sort a fixed collection of items by how well they match a query,
+    using the compiled ``matcher`` plugin. '''
+
+    def __init__(self, items):
+        ''' :param items: Iterable of strings; falsy entries are dropped. '''
+        items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
+        items = tuple(map(lambda x: x.encode('utf-8'), items))
+        sort_keys = tuple(map(primary_sort_key, items))
+
+        speedup, err = plugins['matcher']
+        if speedup is None:
+            raise RuntimeError('Failed to load the matcher plugin with error: %s' % err)
+        self.m = speedup.Matcher(items, sort_keys)
+
+    def __call__(self, query):
+        ''' Return all items, as UTF-8 encoded bytes, best match first. '''
+        query = normalize('NFC', unicode(query)).encode('utf-8')
+        return self.m.get_matches(query)
+
+def test_mem():
+    ''' Take gc histograms before and after building and querying 100
+    matchers and pass them to diff_hists(), as a leak check for the plugin. '''
+    from calibre.utils.mem import gc_histogram, diff_hists
+    m = Matcher([])
+    del m
+    def doit(c):
+        m = Matcher([c+'im/one.gif', c+'im/two.gif', c+'text/one.html',])
+        m('one')
+    import gc
+    gc.collect()
+    h1 = gc_histogram()
+    for i in xrange(100):
+        doit(str(i))
+    h2 = gc_histogram()
+    diff_hists(h1, h2)
+
+if __name__ == '__main__':
+    test_mem()