From 3901051e2eff192ade8152f84045118736163de9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 6 Mar 2014 10:11:27 +0530 Subject: [PATCH] Refactor subsequence matcher to not use recursion and support multithreading. Also add a python implementation for easy experimentation with the algorithm. --- src/calibre/gui2/tweak_book/matcher.c | 384 +++++++++++++++---------- src/calibre/gui2/tweak_book/matcher.py | 108 ++++++- 2 files changed, 338 insertions(+), 154 deletions(-) diff --git a/src/calibre/gui2/tweak_book/matcher.c b/src/calibre/gui2/tweak_book/matcher.c index 553f41b4f4..df28c7c4c7 100644 --- a/src/calibre/gui2/tweak_book/matcher.c +++ b/src/calibre/gui2/tweak_book/matcher.c @@ -17,12 +17,8 @@ #ifdef _MSC_VER // inline does not work with the visual studio C compiler #define inline -#define qsort qsort_s -#else -#define qsort qsort_r #endif - typedef unsigned char bool; #define TRUE 1 #define FALSE 0 @@ -31,132 +27,224 @@ typedef unsigned char bool; #define nullfree(x) if(x != NULL) free(x); x = NULL; // Algorithm to sort items by subsequence score {{{ +typedef struct { + double score; + int32_t *positions; +} MemoryItem; + +static MemoryItem*** alloc_memory(int32_t needle_len, int32_t max_haystack_len) { + MemoryItem ***ans = NULL, **d1 = NULL, *d2 = NULL; + size_t num = max_haystack_len * max_haystack_len * needle_len; + size_t position_sz = needle_len * sizeof(int32_t); + size_t sz = (num * (sizeof(MemoryItem) + position_sz)) + (max_haystack_len * sizeof(MemoryItem**)) + (needle_len * sizeof(MemoryItem*)); + int32_t hidx, nidx, last_idx, i, j; + char *base = NULL; + + ans = (MemoryItem***) calloc(sz, 1); + if (ans != NULL) { + d1 = (MemoryItem**)(ans + max_haystack_len); + d2 = (MemoryItem*) (d1 + max_haystack_len * needle_len ); + for (i = 0; i < max_haystack_len; i++) { + ans[i] = d1 + i * needle_len; + for (j = 0; j < needle_len; j++) d1[i*needle_len + j] = d2 + j; + } + + base = ((char*)ans) + (sizeof(MemoryItem**)*max_haystack_len) + (sizeof(MemoryItem*)*needle_len) + (sizeof(MemoryItem)*max_haystack_len); + + for (hidx = 0; hidx < max_haystack_len; hidx++) { + for (nidx = 0; nidx < needle_len; nidx++) { + for (last_idx = 0; last_idx < max_haystack_len; last_idx++) { + ans[hidx][nidx][last_idx].positions = (int32_t*)base; + base += position_sz; + } + } + } + } + return ans; +} + +static void clear_memory(MemoryItem ***mem, int32_t needle_len, int32_t max_haystack_len) { + int32_t hidx, nidx, last_idx; + for (hidx = 0; hidx < max_haystack_len; hidx++) { + for (nidx = 0; nidx < needle_len; nidx++) { + for (last_idx = 0; last_idx < max_haystack_len; last_idx++) { + mem[hidx][nidx][last_idx].score = DBL_MAX; + } + } + } +} + +typedef struct { + int32_t hidx; + int32_t nidx; + int32_t last_idx; + double score; + int32_t *positions; +} StackItem; + +typedef struct { + ssize_t pos; + int32_t needle_len; + size_t size; + StackItem *items; +} Stack; + +static void alloc_stack(Stack *stack, int32_t needle_len, int32_t max_haystack_len) { + StackItem *ans = NULL; + char *base = NULL; + size_t num = max_haystack_len * needle_len; + size_t position_sz = needle_len * sizeof(int32_t); + size_t sz = sizeof(StackItem) + position_sz; + size_t i = 0; + + stack->needle_len = needle_len; + stack->pos = -1; + stack->size = num; + ans = (StackItem*) calloc(num, sz); + if (ans != NULL) { + base = (char*)(ans + num); + for (i = 0; i < num; i++, base += position_sz) ans[i].positions = (int32_t*) base; + stack->items = ans; + } +} + +static void stack_clear(Stack *stack) { stack->pos = -1; } + +static void stack_push(Stack *stack, int32_t hidx, int32_t nidx, int32_t last_idx, double score, int32_t *positions) { + StackItem *si = &(stack->items[++stack->pos]); + si->hidx = hidx; si->nidx = nidx; si->last_idx = last_idx; si->score = score; + memcpy(si->positions, positions, sizeof(*positions) * stack->needle_len); +} + +static void stack_pop(Stack *stack, int32_t *hidx, int32_t *nidx, int32_t *last_idx, double *score, int32_t *positions) { + StackItem *si = &(stack->items[stack->pos--]); + *hidx = si->hidx; *nidx = si->nidx; *last_idx = si->last_idx; *score = si->score; + memcpy(positions, si->positions, sizeof(*positions) * stack->needle_len); +} + typedef struct { UChar *haystack; int32_t haystack_len; UChar *needle; int32_t needle_len; double max_score_per_char; - double **memo; + MemoryItem ***memo; UChar *level1; UChar *level2; UChar *level3; } MatchInfo; typedef struct { - UChar *item; - char *sort_key; - uint32_t sort_key_len; - PyObject *py_item; double score; + int32_t *positions; } Match; -static double recursive_match(MatchInfo *m, int32_t haystack_idx, int32_t needle_idx, int32_t last_idx, double score) { - double seen_score = 0.0, memoized = DBL_MAX, score_for_char, factor, sub_score; - int32_t i = 0, j = 0, distance, curri; - UChar32 c, d, last; - bool found; - // do we have a memoized result we can return? - memoized = m->memo[needle_idx][haystack_idx]; - if (memoized != DBL_MAX) - return memoized; +static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, int32_t distance_from_last_match) { + double factor = 1.0; + double ans = m->max_score_per_char; - // bail early if not enough room (left) in haystack for (rest of) needle - if (m->haystack_len - haystack_idx < m->needle_len - needle_idx) { - score = 0.0; - goto memoize; + if (u_strchr32(m->level1, last) != NULL) + factor = 0.9; + else if (u_strchr32(m->level2, last) != NULL) + factor = 0.8; + else if (u_isULowercase(last) && u_isUUppercase(current)) + factor = 0.8; // CamelCase + else if (u_strchr32(m->level3, last) != NULL) + factor = 0.7; + else + // If last is not a special char, factor diminishes + // as distance from last matched char increases + factor = (1.0 / distance_from_last_match) * 0.75; + return ans * factor; +} + +static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) { + // The positions array stores character positions as byte offsets in string, convert them into character offsets + int32_t i, *end; + + if (score == 0.0) { + for (i = 0; i < char_len; i++) final_positions[i] = -1; + return; } - for (i = needle_idx; i < m->needle_len; ) { - curri = i; - U16_NEXT(m->needle, i, m->needle_len, c); // i now points to the next codepoint - found = FALSE; - // similar to above, we'll stop iterating when we know we're too close - // to the end of the string to possibly match - for (j = haystack_idx; j <= m->haystack_len - (m->needle_len - curri); ) { - haystack_idx = j; - U16_NEXT(m->haystack, j, m->haystack_len, d); // j now points to the next codepoint + end = final_positions + char_len; + for (i = 0; i < byte_len && final_positions < end; i++) { + if (positions[i] == -1) continue; + *final_positions = u_countChar32(string, positions[i]); + final_positions += 1; + } +} - if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == u_foldCase(d, U_FOLD_CASE_DEFAULT)) { - found = TRUE; +static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) { + UChar32 nc, hc, lc; + UChar *p; + double final_score = 0.0, score = 0.0, score_for_char = 0.0; + int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len; + stack_push(stack, 0, 0, 0, 0.0, final_positions); + MemoryItem mem = {0}; - // calculate score - score_for_char = m->max_score_per_char; - distance = haystack_idx - last_idx; - - if (distance > 1) { - factor = 1.0; - U16_GET(m->haystack, haystack_idx - 1, haystack_idx - 1, m->haystack_len, last); - if (u_strchr32(m->level1, last)) - factor = 0.9; - else if (u_strchr32(m->level2, last)) - factor = 0.8; - else if (u_isULowercase(last) && u_isUUppercase(d)) - factor = 0.8; // CamelCase - else if (u_strchr32(m->level3, last)) - factor = 0.7; - else - // if no "special" chars behind char, factor diminishes - // as distance from last matched char increases - factor = (1.0 / distance) * 0.75; - score_for_char *= factor; - } - - if (j < m->haystack_len) { - // bump cursor one char to the right and - // use recursion to try and find a better match - sub_score = recursive_match(m, j, curri, last_idx, score); - if (sub_score > seen_score) - seen_score = sub_score; + while (stack->pos >= 0) { + stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions); + mem = m->memo[hidx][nidx][last_idx]; + if (mem.score == DBL_MAX) { + // No memoized result, calculate the score + for (i = nidx; i < m->needle_len;) { + nidx = i; + U16_NEXT(m->needle, i, m->needle_len, nc); // i now points to next char in needle + if (m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; } + p = u_strchr32(m->haystack + hidx, nc); // TODO: Use primary collation for the find + if (p == NULL) { score = 0.0; break; } + pos = p - m->haystack; + distance = u_countChar32(m->haystack + last_idx, pos - last_idx); + if (distance <= 1) score_for_char = m->max_score_per_char; + else { + U16_GET(m->haystack, 0, pos, m->haystack_len, hc); + j = pos; + U16_PREV(m->haystack, 0, j, lc); // lc is the prev character + score_for_char = calc_score_for_char(m, lc, hc, distance); } + j = pos; + U16_NEXT(m->haystack, j, m->haystack_len, hc); + hidx = j; + if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions); + last_idx = pos; + positions[nidx] = pos; score += score_for_char; - last_idx = haystack_idx + 1; - break; - } - } // for(j) + } // for(i) iterate over needle + mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len); - if (!found) { - score = 0.0; - goto memoize; + } else { + score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len); + } + // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed + if (score > final_score) { + final_score = score; + memcpy(final_positions, positions, sizeof(*positions) * m->needle_len); } } - - score = score > seen_score ? score : seen_score; - -memoize: - m->memo[needle_idx][haystack_idx] = score; - return score; + return final_score; } -static double** alloc_memo(size_t rows, size_t cols) { - double **array, *data; /* Declare this first so we can use it with sizeof. */ - size_t i; - const size_t row_pointers_bytes = rows * sizeof(*array); - const size_t row_elements_bytes = cols * sizeof(**array); - array = malloc(row_pointers_bytes + rows * row_elements_bytes); - if (array != NULL) { - data = (double*)(array + rows); - for(i = 0; i < rows; i++) array[i] = data + i * cols; - } - return array; -} -static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UChar *needle, int32_t needle_len, Match *match_results, UChar *level1, UChar *level2, UChar *level3) { - uint32_t i = 0, maxhl = 0; - int32_t r = 0, c = 0; +static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UChar *needle, Match *match_results, int32_t *final_positions, int32_t needle_char_len, UChar *level1, UChar *level2, UChar *level3) { + Stack stack = {0}; + int32_t i = 0, maxhl = 0; + int32_t r = 0, *positions = NULL; MatchInfo *matches = NULL; bool ok = FALSE; - double **memo = NULL; + MemoryItem ***memo = NULL; + int32_t needle_len = u_strlen(needle); - if (needle_len == 0) { + if (needle_len <= 0 || item_count <= 0) { for (i = 0; i < item_count; i++) match_results[i].score = 0.0; ok = TRUE; goto end; } matches = (MatchInfo*)calloc(item_count, sizeof(MatchInfo)); - if (matches == NULL) goto end; + positions = (int32_t*)calloc(2*needle_len, sizeof(int32_t)); // One set of positions is the final answer and one set is working space + if (matches == NULL || positions == NULL) {PyErr_NoMemory(); goto end;} for (i = 0; i < item_count; i++) { matches[i].haystack = items[i]; @@ -170,36 +258,36 @@ static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UCh maxhl = MAX(maxhl, matches[i].haystack_len); } - memo = alloc_memo(needle_len, maxhl); - if (memo == NULL) {PyErr_NoMemory(); goto end;} + if (maxhl <= 0) { + for (i = 0; i < item_count; i++) match_results[i].score = 0.0; + ok = TRUE; + goto end; + } + + alloc_stack(&stack, needle_len, maxhl); + memo = alloc_memory(needle_len, maxhl); + if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;} for (i = 0; i < item_count; i++) { for (r = 0; r < needle_len; r++) { - for (c = 0; c < maxhl; c++) memo[r][c] = DBL_MAX; + positions[r] = -1; } + stack_clear(&stack); + clear_memory(memo, needle_len, matches[i].haystack_len); matches[i].memo = memo; - match_results[i].score = recursive_match(&matches[i], 0, 0, 0, 0.0); + match_results[i].score = process_item(&matches[i], &stack, positions); + convert_positions(positions, final_positions + i, matches[i].haystack, needle_char_len, needle_len, match_results[i].score); } ok = TRUE; end: + nullfree(positions); + nullfree(stack.items); nullfree(matches); nullfree(memo); return ok; } -int cmp_score(const void *a, const void *b, void *arg) -{ - Match a_match = *(Match *)a; - Match b_match = *(Match *)b; - - if (a_match.score > b_match.score) - return -1; // a scores higher, a should appear sooner - else if (a_match.score < b_match.score) - return 1; // b scores higher, a should appear later - else - return strncmp(a_match.sort_key, b_match.sort_key, MIN(a_match.sort_key_len, b_match.sort_key_len)); -} // }}} // Matcher object definition {{{ @@ -207,17 +295,14 @@ typedef struct { PyObject_HEAD // Type-specific fields go here. UChar **items; - char **sort_items; uint32_t item_count; int32_t *item_lengths; - int32_t *sort_item_lengths; - PyObject *py_items; - PyObject *py_sort_keys; UChar *level1; UChar *level2; UChar *level3; } Matcher; + // Matcher.__init__() {{{ static void free_matcher(Matcher *self) { @@ -225,7 +310,7 @@ static void free_matcher(Matcher *self) { if (self->items != NULL) { for (i = 0; i < self->item_count; i++) { nullfree(self->items[i]); } } - nullfree(self->items); nullfree(self->sort_items); nullfree(self->item_lengths); nullfree(self->sort_item_lengths); Py_XDECREF(self->py_items); Py_XDECREF(self->py_sort_keys); + nullfree(self->items); nullfree(self->item_lengths); nullfree(self->level1); nullfree(self->level2); nullfree(self->level3); } static void @@ -239,66 +324,60 @@ Matcher_dealloc(Matcher* self) static int Matcher_init(Matcher *self, PyObject *args, PyObject *kwds) { - PyObject *items = NULL, *sort_keys = NULL, *p = NULL; + PyObject *items = NULL, *p = NULL, *py_items = NULL; char *utf8 = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL; int32_t i = 0; Py_ssize_t cap = 0, l1s, l2s, l3s; UErrorCode status = U_ZERO_ERROR; - if (!PyArg_ParseTuple(args, "OOs#s#s#", &items, &sort_keys, &level1, &l1s, &level2, &l2s, &level3, &l3s)) return -1; - self->py_items = PySequence_Fast(items, "Must pass in two sequence objects"); - self->py_sort_keys = PySequence_Fast(sort_keys, "Must pass in two sequence objects"); - if (self->py_items == NULL || self->py_sort_keys == NULL) goto end; + if (!PyArg_ParseTuple(args, "Os#s#s#", &items, &level1, &l1s, &level2, &l2s, &level3, &l3s)) return -1; + py_items = PySequence_Fast(items, "Must pass in two sequence objects"); + if (py_items == NULL) goto end; self->item_count = (uint32_t)PySequence_Size(items); - if (self->item_count != (uint32_t)PySequence_Size(sort_keys)) { PyErr_SetString(PyExc_TypeError, "The sequences must have the same length."); } self->items = (UChar**)calloc(self->item_count, sizeof(UChar*)); - self->sort_items = (char**)calloc(self->item_count, sizeof(char*)); self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t)); - self->sort_item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t)); self->level1 = (UChar*)calloc(alloc_uchar(l1s), sizeof(UChar)); self->level2 = (UChar*)calloc(alloc_uchar(l2s), sizeof(UChar)); self->level3 = (UChar*)calloc(alloc_uchar(l3s), sizeof(UChar)); - if (self->items == NULL || self->sort_items == NULL || self->item_lengths == NULL || self->sort_item_lengths == NULL || self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) { + if (self->items == NULL || self->item_lengths == NULL || self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) { PyErr_NoMemory(); goto end; } + u_strFromUTF8Lenient(self->level1, alloc_uchar(l1s), &i, level1, (int32_t)l1s, &status); u_strFromUTF8Lenient(self->level2, alloc_uchar(l2s), &i, level2, (int32_t)l2s, &status); u_strFromUTF8Lenient(self->level3, alloc_uchar(l3s), &i, level3, (int32_t)l3s, &status); if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes for level string from UTF-8 to UTF-16"); goto end; } for (i = 0; i < self->item_count; i++) { - p = PySequence_Fast_GET_ITEM(self->py_items, i); + p = PySequence_Fast_GET_ITEM(py_items, i); utf8 = PyBytes_AsString(p); if (utf8 == NULL) goto end; cap = PyBytes_GET_SIZE(p); self->items[i] = (UChar*)calloc(alloc_uchar(cap), sizeof(UChar)); if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; } - u_strFromUTF8Lenient(self->items[i], alloc_uchar(cap), &(self->item_lengths[i]), utf8, cap, &status); + u_strFromUTF8Lenient(self->items[i], alloc_uchar(cap), NULL, utf8, cap, &status); if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; } - - p = PySequence_Fast_GET_ITEM(self->py_sort_keys, i); - self->sort_items[i] = PyBytes_AsString(p); - if (self->sort_items[i] == NULL) goto end; - self->sort_item_lengths[i] = (uint32_t) PyBytes_GET_SIZE(p); + self->item_lengths[i] = u_strlen(self->items[i]); } end: + Py_XDECREF(py_items); if (PyErr_Occurred()) { free_matcher(self); } return (PyErr_Occurred()) ? -1 : 0; } // Matcher.__init__() }}} -// Matcher.get_matches {{{ +// Matcher.calculate_scores {{{ static PyObject * -Matcher_get_matches(Matcher *self, PyObject *args) { +Matcher_calculate_scores(Matcher *self, PyObject *args) { char *cneedle = NULL; - int32_t qsize = 0; + int32_t qsize = 0, *final_positions = NULL, *p; Match *matches = NULL; bool ok = FALSE; - uint32_t i = 0; - PyObject *items = NULL; + uint32_t i = 0, needle_char_len = 0, j = 0; + PyObject *items = NULL, *score = NULL, *positions = NULL; UErrorCode status = U_ZERO_ERROR; UChar *needle = NULL; @@ -308,39 +387,48 @@ Matcher_get_matches(Matcher *self, PyObject *args) { if (needle == NULL) return PyErr_NoMemory(); u_strFromUTF8Lenient(needle, alloc_uchar(qsize), &qsize, cneedle, qsize, &status); if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; } - + needle_char_len = u_countChar32(needle, -1); items = PyTuple_New(self->item_count); + positions = PyTuple_New(self->item_count); matches = (Match*)calloc(self->item_count, sizeof(Match)); - if (items == NULL || matches == NULL) {PyErr_NoMemory(); goto end;} + final_positions = (int32_t*) calloc(needle_char_len * self->item_count, sizeof(int32_t)); + if (items == NULL || matches == NULL || final_positions == NULL || positions == NULL) {PyErr_NoMemory(); goto end;} + for (i = 0; i < self->item_count; i++) { - matches[i].item = self->items[i]; - matches[i].sort_key = self->sort_items[i]; - matches[i].sort_key_len = self->sort_item_lengths[i]; - matches[i].py_item = PySequence_Fast_GET_ITEM(self->py_items, (Py_ssize_t)i); + score = PyTuple_New(needle_char_len); + if (score == NULL) { PyErr_NoMemory(); goto end; } + PyTuple_SET_ITEM(positions, (Py_ssize_t)i, score); } Py_BEGIN_ALLOW_THREADS; - ok = match(self->items, self->item_lengths, self->item_count, needle, (uint32_t)qsize, matches, self->level1, self->level2, self->level3); - if (ok) qsort(matches, self->item_count, sizeof(Match), cmp_score, NULL); + ok = match(self->items, self->item_lengths, self->item_count, needle, matches, final_positions, needle_char_len, self->level1, self->level2, self->level3); Py_END_ALLOW_THREADS; if (ok) { for (i = 0; i < self->item_count; i++) { - PyTuple_SET_ITEM(items, (Py_ssize_t)i, matches[i].py_item); - Py_INCREF(matches[i].py_item); + score = PyFloat_FromDouble(matches[i].score); + if (score == NULL) { PyErr_NoMemory(); goto end; } + PyTuple_SET_ITEM(items, (Py_ssize_t)i, score); + p = final_positions + i; + for (j = 0; j < needle_char_len; j++) { + score = PyInt_FromLong((long)p[j]); + if (score == NULL) { PyErr_NoMemory(); goto end; } + PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score); + } } } else { PyErr_NoMemory(); goto end; } end: nullfree(needle); nullfree(matches); - if (PyErr_Occurred()) { Py_XDECREF(items); return NULL; } - return items; + nullfree(final_positions); + if (PyErr_Occurred()) { Py_XDECREF(items); items = NULL; Py_XDECREF(positions); positions = NULL; return NULL; } + return Py_BuildValue("NN", items, positions); } // }}} static PyMethodDef Matcher_methods[] = { - {"get_matches", (PyCFunction)Matcher_get_matches, METH_VARARGS, - "get_matches(query) -> Return the sorted list of matches for query which must be a UTF-8 encoded string." + {"calculate_scores", (PyCFunction)Matcher_calculate_scores, METH_VARARGS, + "calculate_scores(query) -> Return the scores for all items given query as a tuple." }, {NULL} /* Sentinel */ diff --git a/src/calibre/gui2/tweak_book/matcher.py b/src/calibre/gui2/tweak_book/matcher.py index 99f3618b03..2800537d13 100644 --- a/src/calibre/gui2/tweak_book/matcher.py +++ b/src/calibre/gui2/tweak_book/matcher.py @@ -8,14 +8,19 @@ __copyright__ = '2014, Kovid Goyal ' from unicodedata import normalize +from itertools import izip from future_builtins import map from calibre.constants import plugins -from calibre.utils.icu import primary_sort_key +from calibre.utils.icu import primary_sort_key, find + +DEFAULT_LEVEL1 = '/' +DEFAULT_LEVEL2 = '-_ 0123456789' +DEFAULT_LEVEL3 = '.' class Matcher(object): - def __init__(self, items, level1='/', level2='-_ 0123456789', level3='.'): + def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) items = tuple(map(lambda x: x.encode('utf-8'), items)) sort_keys = tuple(map(primary_sort_key, items)) @@ -29,6 +34,96 @@ class Matcher(object): query = normalize('NFC', unicode(query)).encode('utf-8') return map(lambda x:x.decode('utf-8'), self.m.get_matches(query)) + +def calc_score_for_char(ctx, prev, current, distance): + factor = 1.0 + ans = ctx.max_score_per_char + + if prev in ctx.level1: + factor = 0.9 + elif prev in ctx.level2 or (icu_lower(prev) == prev and icu_upper(current) == current): + factor = 0.8 + elif prev in ctx.level3: + factor = 0.7 + else: + factor = (1.0 / distance) * 0.75 + + return ans * factor + +def process_item(ctx, haystack, needle): + # non-recursive implementation using a stack + stack = [(0, 0, 0, 0, [-1]*len(needle))] + final_score, final_positions = stack[0][-2:] + push, pop = stack.append, stack.pop + while stack: + hidx, nidx, last_idx, score, positions = pop() + key = (hidx, nidx, last_idx) + mem = ctx.memory.get(key, None) + if mem is None: + for i in xrange(nidx, len(needle)): + n = needle[i] + if (len(haystack) - hidx < len(needle) - i): + score = 0 + break + pos = find(n, haystack[hidx:])[0] + hidx + if pos == -1: + score = 0 + break + + distance = pos - last_idx + score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance) + hidx = pos + 1 + push((hidx, i, last_idx, score, list(positions))) + last_idx = positions[i] = pos + score += score_for_char + ctx.memory[key] = (score, positions) + else: + score, positions = mem + if score > final_score: + final_score = score + final_positions = positions + return final_score, final_positions + +class PyScorer(object): + __slots__ = ('level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory') + + def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): + self.level1, self.level2, self.level3 = level1, level2, level3 + self.max_score_per_char = 0 + self.items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) + + def __call__(self, needle): + for item in self.items: + self.max_score_per_char = (1.0 / len(item) + 1.0 / len(needle)) / 2.0 + self.memory = {} + yield process_item(self, item, needle) + +class CScorer(object): + + def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3): + items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items)) + items = tuple(map(lambda x: x.encode('utf-8'), items)) + + speedup, err = plugins['matcher'] + if speedup is None: + raise RuntimeError('Failed to load the matcher plugin with error: %s' % err) + self.m = speedup.Matcher(items, level1.encode('utf-8'), level2.encode('utf-8'), level3.encode('utf-8')) + + def __call__(self, query): + query = normalize('NFC', unicode(query)).encode('utf-8') + scores, positions = self.m.calculate_scores(query) + for score, pos in izip(scores, positions): + yield score, pos + +def test(): + items = ['m1mn34o/mno'] + s = PyScorer(items) + c = CScorer(items) + for q in (s, c): + print (q) + for item, (score, positions) in izip(items, q('mno')): + print (item, score, positions) + def test_mem(): from calibre.utils.mem import gc_histogram, diff_hists m = Matcher([]) @@ -45,7 +140,8 @@ def test_mem(): diff_hists(h1, h2) if __name__ == '__main__': - m = Matcher(['image/one.png', 'image/two.gif', 'text/one.html']) - for q in ('one', 'ton', 'imo'): - print (q, '->', tuple(m(q))) - test_mem() + test() + # m = Matcher(['image/one.png', 'image/two.gif', 'text/one.html']) + # for q in ('one', 'ONE', 'ton', 'imo'): + # print (q, '->', tuple(m(q))) + # test_mem()