mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor the subsequence matcher to not use recursion and to support multithreading. Also add a Python implementation for easy experimentation with the algorithm.
This commit is contained in:
parent
c634b7c946
commit
3901051e2e
@ -17,12 +17,8 @@
|
|||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
// inline does not work with the visual studio C compiler
|
// inline does not work with the visual studio C compiler
|
||||||
#define inline
|
#define inline
|
||||||
#define qsort qsort_s
|
|
||||||
#else
|
|
||||||
#define qsort qsort_r
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
typedef unsigned char bool;
|
typedef unsigned char bool;
|
||||||
#define TRUE 1
|
#define TRUE 1
|
||||||
#define FALSE 0
|
#define FALSE 0
|
||||||
@ -31,132 +27,224 @@ typedef unsigned char bool;
|
|||||||
#define nullfree(x) if(x != NULL) free(x); x = NULL;
|
#define nullfree(x) if(x != NULL) free(x); x = NULL;
|
||||||
|
|
||||||
// Algorithm to sort items by subsequence score {{{
|
// Algorithm to sort items by subsequence score {{{
|
||||||
|
/*
 * One cell of the memoization table: the score recorded for a
 * (hidx, nidx, last_idx) state together with the needle positions that
 * produced it. A score of DBL_MAX marks a cell as "not computed yet"
 * (see clear_memory()).
 */
typedef struct {
    double score;
    int32_t *positions;  /* needle_len entries, stored inside the table's own allocation */
} MemoryItem;

/*
 * Allocate the three dimensional memoization table, indexed as
 * ans[hidx][nidx][last_idx] with
 *     0 <= hidx, last_idx < max_haystack_len  and  0 <= nidx < needle_len.
 *
 * Everything lives in ONE zero-initialized heap block, laid out as:
 *     [max_haystack_len row-table pointers]
 *     [max_haystack_len * needle_len row pointers]
 *     (padding so the items are correctly aligned)
 *     [max_haystack_len * needle_len * max_haystack_len MemoryItem]
 *     [one positions array of needle_len int32_t per MemoryItem]
 *
 * Returns the table, to be released with a single free(), or NULL if either
 * dimension is not positive, the size computation would overflow, or the
 * allocation fails.
 */
static MemoryItem*** alloc_memory(int32_t needle_len, int32_t max_haystack_len) {
    MemoryItem ***ans = NULL, **rows = NULL, *items = NULL;
    int32_t *positions = NULL;
    size_t nlen, hlen, num_rows, num_items, per_item;
    size_t table_sz, pad, items_off, positions_off, i;
    const size_t size_max = (size_t)-1;

    if (needle_len <= 0 || max_haystack_len <= 0) return NULL;
    nlen = (size_t)needle_len;
    hlen = (size_t)max_haystack_len;

    /* Do all arithmetic in size_t and refuse sizes that cannot be represented. */
    if (hlen > size_max / nlen) return NULL;
    num_rows = hlen * nlen;                      /* number of [hidx][nidx] rows */
    if (num_rows > size_max / hlen) return NULL;
    num_items = num_rows * hlen;                 /* total number of cells */
    if (nlen > (size_max - sizeof(MemoryItem)) / sizeof(int32_t)) return NULL;
    per_item = sizeof(MemoryItem) + nlen * sizeof(int32_t);
    /* Conservative bound: leaves room for the pointer tables and the padding. */
    if (num_items > (size_max / 4) / per_item) return NULL;

    table_sz = hlen * sizeof(MemoryItem**) + num_rows * sizeof(MemoryItem*);
    /* Start the items at a multiple of sizeof(MemoryItem) so they are suitably aligned. */
    pad = table_sz % sizeof(MemoryItem);
    items_off = table_sz + (pad ? sizeof(MemoryItem) - pad : 0);
    positions_off = items_off + num_items * sizeof(MemoryItem);

    ans = (MemoryItem***) calloc(positions_off + num_items * nlen * sizeof(int32_t), 1);
    if (ans == NULL) return NULL;

    rows = (MemoryItem**)(ans + hlen);
    items = (MemoryItem*)(((char*)ans) + items_off);
    positions = (int32_t*)(((char*)ans) + positions_off);

    /* ans[hidx] -> the needle_len row pointers belonging to hidx */
    for (i = 0; i < hlen; i++) ans[i] = rows + i * nlen;
    /* ans[hidx][nidx] -> its own, non-overlapping run of max_haystack_len cells */
    for (i = 0; i < num_rows; i++) rows[i] = items + i * hlen;
    /* every cell gets its own positions array */
    for (i = 0; i < num_items; i++) items[i].positions = positions + i * nlen;

    return ans;
}

/*
 * Mark every cell mem[hidx][nidx][last_idx], for hidx and last_idx below
 * max_haystack_len and nidx below needle_len, as "not computed" by setting
 * its score to DBL_MAX. The positions arrays are left untouched.
 * max_haystack_len may be smaller than the value the table was allocated
 * with; only that sub-range is reset.
 */
static void clear_memory(MemoryItem ***mem, int32_t needle_len, int32_t max_haystack_len) {
    int32_t hidx, nidx, last_idx;

    for (hidx = 0; hidx < max_haystack_len; hidx++) {
        for (nidx = 0; nidx < needle_len; nidx++) {
            for (last_idx = 0; last_idx < max_haystack_len; last_idx++) {
                mem[hidx][nidx][last_idx].score = DBL_MAX;
            }
        }
    }
}
|
||||||
|
|
||||||
|
/*
 * One saved search state: where to resume in the haystack (hidx) and the
 * needle (nidx), the index of the previous match (last_idx), the score
 * accumulated so far and a private copy of the positions found so far.
 */
typedef struct {
    int32_t hidx;
    int32_t nidx;
    int32_t last_idx;
    double score;
    int32_t *positions;  /* needle_len entries, stored inside the stack's own allocation */
} StackItem;

/*
 * Fixed-capacity LIFO of StackItem. pos is the index of the top element,
 * -1 when the stack is empty. items owns the single heap block that also
 * holds every positions array; release it with free(stack->items).
 */
typedef struct {
    ssize_t pos;
    int32_t needle_len;
    size_t size;
    StackItem *items;
} Stack;

/*
 * Initialize *stack with room for max_haystack_len * needle_len states.
 *
 * On success stack->items is non-NULL and stack->size holds the capacity.
 * On failure (non-positive dimension, size overflow or out of memory)
 * stack->items is NULL and stack->size is 0; callers detect failure by
 * testing stack->items.
 */
static void alloc_stack(Stack *stack, int32_t needle_len, int32_t max_haystack_len) {
    StackItem *ans = NULL;
    char *base = NULL;
    size_t num = 0, position_sz = 0, i = 0;

    stack->needle_len = needle_len;
    stack->pos = -1;
    stack->size = 0;
    stack->items = NULL;

    if (needle_len <= 0 || max_haystack_len <= 0) return;
    /* Multiply in size_t, not int32_t, and refuse products that do not fit. */
    if ((size_t)max_haystack_len > ((size_t)-1) / (size_t)needle_len) return;
    num = (size_t)max_haystack_len * (size_t)needle_len;
    if ((size_t)needle_len > (((size_t)-1) - sizeof(StackItem)) / sizeof(int32_t)) return;
    position_sz = (size_t)needle_len * sizeof(int32_t);

    /* calloc() itself rejects num * element_size overflow. */
    ans = (StackItem*) calloc(num, sizeof(StackItem) + position_sz);
    if (ans == NULL) return;

    /* The positions arrays follow the num StackItem structs in the same block. */
    base = (char*)(ans + num);
    for (i = 0; i < num; i++, base += position_sz) ans[i].positions = (int32_t*) base;

    stack->size = num;
    stack->items = ans;
}

/* Discard all saved states; the storage is kept for reuse. */
static void stack_clear(Stack *stack) { stack->pos = -1; }

/*
 * Push a state, copying stack->needle_len entries from positions.
 * Defensive: if the stack is unallocated or already full the push is
 * dropped rather than writing past the end of the preallocated array.
 */
static void stack_push(Stack *stack, int32_t hidx, int32_t nidx, int32_t last_idx, double score, int32_t *positions) {
    StackItem *si = NULL;

    if (stack->items == NULL || (size_t)(stack->pos + 1) >= stack->size) return;
    si = &(stack->items[++stack->pos]);
    si->hidx = hidx; si->nidx = nidx; si->last_idx = last_idx; si->score = score;
    memcpy(si->positions, positions, sizeof(*positions) * stack->needle_len);
}

/*
 * Pop the top state into the output parameters, copying
 * stack->needle_len entries into positions.
 * Defensive: popping an empty stack leaves all outputs untouched instead of
 * reading before the start of the array.
 */
static void stack_pop(Stack *stack, int32_t *hidx, int32_t *nidx, int32_t *last_idx, double *score, int32_t *positions) {
    StackItem *si = NULL;

    if (stack->items == NULL || stack->pos < 0) return;
    si = &(stack->items[stack->pos--]);
    *hidx = si->hidx; *nidx = si->nidx; *last_idx = si->last_idx; *score = si->score;
    memcpy(positions, si->positions, sizeof(*positions) * stack->needle_len);
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
UChar *haystack;
|
UChar *haystack;
|
||||||
int32_t haystack_len;
|
int32_t haystack_len;
|
||||||
UChar *needle;
|
UChar *needle;
|
||||||
int32_t needle_len;
|
int32_t needle_len;
|
||||||
double max_score_per_char;
|
double max_score_per_char;
|
||||||
double **memo;
|
MemoryItem ***memo;
|
||||||
UChar *level1;
|
UChar *level1;
|
||||||
UChar *level2;
|
UChar *level2;
|
||||||
UChar *level3;
|
UChar *level3;
|
||||||
} MatchInfo;
|
} MatchInfo;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
UChar *item;
|
|
||||||
char *sort_key;
|
|
||||||
uint32_t sort_key_len;
|
|
||||||
PyObject *py_item;
|
|
||||||
double score;
|
double score;
|
||||||
|
int32_t *positions;
|
||||||
} Match;
|
} Match;
|
||||||
|
|
||||||
static double recursive_match(MatchInfo *m, int32_t haystack_idx, int32_t needle_idx, int32_t last_idx, double score) {
|
|
||||||
double seen_score = 0.0, memoized = DBL_MAX, score_for_char, factor, sub_score;
|
|
||||||
int32_t i = 0, j = 0, distance, curri;
|
|
||||||
UChar32 c, d, last;
|
|
||||||
bool found;
|
|
||||||
|
|
||||||
// do we have a memoized result we can return?
|
static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, int32_t distance_from_last_match) {
|
||||||
memoized = m->memo[needle_idx][haystack_idx];
|
double factor = 1.0;
|
||||||
if (memoized != DBL_MAX)
|
double ans = m->max_score_per_char;
|
||||||
return memoized;
|
|
||||||
|
|
||||||
// bail early if not enough room (left) in haystack for (rest of) needle
|
if (u_strchr32(m->level1, last) != NULL)
|
||||||
if (m->haystack_len - haystack_idx < m->needle_len - needle_idx) {
|
|
||||||
score = 0.0;
|
|
||||||
goto memoize;
|
|
||||||
}
|
|
||||||
for (i = needle_idx; i < m->needle_len; ) {
|
|
||||||
curri = i;
|
|
||||||
U16_NEXT(m->needle, i, m->needle_len, c); // i now points to the next codepoint
|
|
||||||
found = FALSE;
|
|
||||||
|
|
||||||
// similar to above, we'll stop iterating when we know we're too close
|
|
||||||
// to the end of the string to possibly match
|
|
||||||
for (j = haystack_idx; j <= m->haystack_len - (m->needle_len - curri); ) {
|
|
||||||
haystack_idx = j;
|
|
||||||
U16_NEXT(m->haystack, j, m->haystack_len, d); // j now points to the next codepoint
|
|
||||||
|
|
||||||
if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == u_foldCase(d, U_FOLD_CASE_DEFAULT)) {
|
|
||||||
found = TRUE;
|
|
||||||
|
|
||||||
// calculate score
|
|
||||||
score_for_char = m->max_score_per_char;
|
|
||||||
distance = haystack_idx - last_idx;
|
|
||||||
|
|
||||||
if (distance > 1) {
|
|
||||||
factor = 1.0;
|
|
||||||
U16_GET(m->haystack, haystack_idx - 1, haystack_idx - 1, m->haystack_len, last);
|
|
||||||
if (u_strchr32(m->level1, last))
|
|
||||||
factor = 0.9;
|
factor = 0.9;
|
||||||
else if (u_strchr32(m->level2, last))
|
else if (u_strchr32(m->level2, last) != NULL)
|
||||||
factor = 0.8;
|
factor = 0.8;
|
||||||
else if (u_isULowercase(last) && u_isUUppercase(d))
|
else if (u_isULowercase(last) && u_isUUppercase(current))
|
||||||
factor = 0.8; // CamelCase
|
factor = 0.8; // CamelCase
|
||||||
else if (u_strchr32(m->level3, last))
|
else if (u_strchr32(m->level3, last) != NULL)
|
||||||
factor = 0.7;
|
factor = 0.7;
|
||||||
else
|
else
|
||||||
// if no "special" chars behind char, factor diminishes
|
// If last is not a special char, factor diminishes
|
||||||
// as distance from last matched char increases
|
// as distance from last matched char increases
|
||||||
factor = (1.0 / distance) * 0.75;
|
factor = (1.0 / distance_from_last_match) * 0.75;
|
||||||
score_for_char *= factor;
|
return ans * factor;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
|
||||||
|
// The positions array stores character positions as byte offsets in string, convert them into character offsets
|
||||||
|
int32_t i, *end;
|
||||||
|
|
||||||
|
if (score == 0.0) {
|
||||||
|
for (i = 0; i < char_len; i++) final_positions[i] = -1;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (j < m->haystack_len) {
|
end = final_positions + char_len;
|
||||||
// bump cursor one char to the right and
|
for (i = 0; i < byte_len && final_positions < end; i++) {
|
||||||
// use recursion to try and find a better match
|
if (positions[i] == -1) continue;
|
||||||
sub_score = recursive_match(m, j, curri, last_idx, score);
|
*final_positions = u_countChar32(string, positions[i]);
|
||||||
if (sub_score > seen_score)
|
final_positions += 1;
|
||||||
seen_score = sub_score;
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
|
||||||
|
UChar32 nc, hc, lc;
|
||||||
|
UChar *p;
|
||||||
|
double final_score = 0.0, score = 0.0, score_for_char = 0.0;
|
||||||
|
int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
|
||||||
|
stack_push(stack, 0, 0, 0, 0.0, final_positions);
|
||||||
|
MemoryItem mem = {0};
|
||||||
|
|
||||||
|
while (stack->pos >= 0) {
|
||||||
|
stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
|
||||||
|
mem = m->memo[hidx][nidx][last_idx];
|
||||||
|
if (mem.score == DBL_MAX) {
|
||||||
|
// No memoized result, calculate the score
|
||||||
|
for (i = nidx; i < m->needle_len;) {
|
||||||
|
nidx = i;
|
||||||
|
U16_NEXT(m->needle, i, m->needle_len, nc); // i now points to next char in needle
|
||||||
|
if (m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
|
||||||
|
p = u_strchr32(m->haystack + hidx, nc); // TODO: Use primary collation for the find
|
||||||
|
if (p == NULL) { score = 0.0; break; }
|
||||||
|
pos = p - m->haystack;
|
||||||
|
distance = u_countChar32(m->haystack + last_idx, pos - last_idx);
|
||||||
|
if (distance <= 1) score_for_char = m->max_score_per_char;
|
||||||
|
else {
|
||||||
|
U16_GET(m->haystack, 0, pos, m->haystack_len, hc);
|
||||||
|
j = pos;
|
||||||
|
U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
|
||||||
|
score_for_char = calc_score_for_char(m, lc, hc, distance);
|
||||||
|
}
|
||||||
|
j = pos;
|
||||||
|
U16_NEXT(m->haystack, j, m->haystack_len, hc);
|
||||||
|
hidx = j;
|
||||||
|
if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
|
||||||
|
last_idx = pos;
|
||||||
|
positions[nidx] = pos;
|
||||||
score += score_for_char;
|
score += score_for_char;
|
||||||
last_idx = haystack_idx + 1;
|
} // for(i) iterate over needle
|
||||||
break;
|
mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
|
||||||
}
|
|
||||||
} // for(j)
|
|
||||||
|
|
||||||
if (!found) {
|
} else {
|
||||||
score = 0.0;
|
score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
|
||||||
goto memoize;
|
}
|
||||||
|
// We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
|
||||||
|
if (score > final_score) {
|
||||||
|
final_score = score;
|
||||||
|
memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return final_score;
|
||||||
score = score > seen_score ? score : seen_score;
|
|
||||||
|
|
||||||
memoize:
|
|
||||||
m->memo[needle_idx][haystack_idx] = score;
|
|
||||||
return score;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static double** alloc_memo(size_t rows, size_t cols) {
|
|
||||||
double **array, *data; /* Declare this first so we can use it with sizeof. */
|
|
||||||
size_t i;
|
|
||||||
const size_t row_pointers_bytes = rows * sizeof(*array);
|
|
||||||
const size_t row_elements_bytes = cols * sizeof(**array);
|
|
||||||
array = malloc(row_pointers_bytes + rows * row_elements_bytes);
|
|
||||||
if (array != NULL) {
|
|
||||||
data = (double*)(array + rows);
|
|
||||||
for(i = 0; i < rows; i++) array[i] = data + i * cols;
|
|
||||||
}
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UChar *needle, int32_t needle_len, Match *match_results, UChar *level1, UChar *level2, UChar *level3) {
|
static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UChar *needle, Match *match_results, int32_t *final_positions, int32_t needle_char_len, UChar *level1, UChar *level2, UChar *level3) {
|
||||||
uint32_t i = 0, maxhl = 0;
|
Stack stack = {0};
|
||||||
int32_t r = 0, c = 0;
|
int32_t i = 0, maxhl = 0;
|
||||||
|
int32_t r = 0, *positions = NULL;
|
||||||
MatchInfo *matches = NULL;
|
MatchInfo *matches = NULL;
|
||||||
bool ok = FALSE;
|
bool ok = FALSE;
|
||||||
double **memo = NULL;
|
MemoryItem ***memo = NULL;
|
||||||
|
int32_t needle_len = u_strlen(needle);
|
||||||
|
|
||||||
if (needle_len == 0) {
|
if (needle_len <= 0 || item_count <= 0) {
|
||||||
for (i = 0; i < item_count; i++) match_results[i].score = 0.0;
|
for (i = 0; i < item_count; i++) match_results[i].score = 0.0;
|
||||||
ok = TRUE;
|
ok = TRUE;
|
||||||
goto end;
|
goto end;
|
||||||
}
|
}
|
||||||
|
|
||||||
matches = (MatchInfo*)calloc(item_count, sizeof(MatchInfo));
|
matches = (MatchInfo*)calloc(item_count, sizeof(MatchInfo));
|
||||||
if (matches == NULL) goto end;
|
positions = (int32_t*)calloc(2*needle_len, sizeof(int32_t)); // One set of positions is the final answer and one set is working space
|
||||||
|
if (matches == NULL || positions == NULL) {PyErr_NoMemory(); goto end;}
|
||||||
|
|
||||||
for (i = 0; i < item_count; i++) {
|
for (i = 0; i < item_count; i++) {
|
||||||
matches[i].haystack = items[i];
|
matches[i].haystack = items[i];
|
||||||
@ -170,36 +258,36 @@ static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UCh
|
|||||||
maxhl = MAX(maxhl, matches[i].haystack_len);
|
maxhl = MAX(maxhl, matches[i].haystack_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
memo = alloc_memo(needle_len, maxhl);
|
if (maxhl <= 0) {
|
||||||
if (memo == NULL) {PyErr_NoMemory(); goto end;}
|
for (i = 0; i < item_count; i++) match_results[i].score = 0.0;
|
||||||
|
ok = TRUE;
|
||||||
|
goto end;
|
||||||
|
}
|
||||||
|
|
||||||
|
alloc_stack(&stack, needle_len, maxhl);
|
||||||
|
memo = alloc_memory(needle_len, maxhl);
|
||||||
|
if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;}
|
||||||
|
|
||||||
for (i = 0; i < item_count; i++) {
|
for (i = 0; i < item_count; i++) {
|
||||||
for (r = 0; r < needle_len; r++) {
|
for (r = 0; r < needle_len; r++) {
|
||||||
for (c = 0; c < maxhl; c++) memo[r][c] = DBL_MAX;
|
positions[r] = -1;
|
||||||
}
|
}
|
||||||
|
stack_clear(&stack);
|
||||||
|
clear_memory(memo, needle_len, matches[i].haystack_len);
|
||||||
matches[i].memo = memo;
|
matches[i].memo = memo;
|
||||||
match_results[i].score = recursive_match(&matches[i], 0, 0, 0, 0.0);
|
match_results[i].score = process_item(&matches[i], &stack, positions);
|
||||||
|
convert_positions(positions, final_positions + i, matches[i].haystack, needle_char_len, needle_len, match_results[i].score);
|
||||||
}
|
}
|
||||||
|
|
||||||
ok = TRUE;
|
ok = TRUE;
|
||||||
end:
|
end:
|
||||||
|
nullfree(positions);
|
||||||
|
nullfree(stack.items);
|
||||||
nullfree(matches);
|
nullfree(matches);
|
||||||
nullfree(memo);
|
nullfree(memo);
|
||||||
return ok;
|
return ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
int cmp_score(const void *a, const void *b, void *arg)
|
|
||||||
{
|
|
||||||
Match a_match = *(Match *)a;
|
|
||||||
Match b_match = *(Match *)b;
|
|
||||||
|
|
||||||
if (a_match.score > b_match.score)
|
|
||||||
return -1; // a scores higher, a should appear sooner
|
|
||||||
else if (a_match.score < b_match.score)
|
|
||||||
return 1; // b scores higher, a should appear later
|
|
||||||
else
|
|
||||||
return strncmp(a_match.sort_key, b_match.sort_key, MIN(a_match.sort_key_len, b_match.sort_key_len));
|
|
||||||
}
|
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// Matcher object definition {{{
|
// Matcher object definition {{{
|
||||||
@ -207,17 +295,14 @@ typedef struct {
|
|||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
// Type-specific fields go here.
|
// Type-specific fields go here.
|
||||||
UChar **items;
|
UChar **items;
|
||||||
char **sort_items;
|
|
||||||
uint32_t item_count;
|
uint32_t item_count;
|
||||||
int32_t *item_lengths;
|
int32_t *item_lengths;
|
||||||
int32_t *sort_item_lengths;
|
|
||||||
PyObject *py_items;
|
|
||||||
PyObject *py_sort_keys;
|
|
||||||
UChar *level1;
|
UChar *level1;
|
||||||
UChar *level2;
|
UChar *level2;
|
||||||
UChar *level3;
|
UChar *level3;
|
||||||
|
|
||||||
} Matcher;
|
} Matcher;
|
||||||
|
|
||||||
// Matcher.__init__() {{{
|
// Matcher.__init__() {{{
|
||||||
|
|
||||||
static void free_matcher(Matcher *self) {
|
static void free_matcher(Matcher *self) {
|
||||||
@ -225,7 +310,7 @@ static void free_matcher(Matcher *self) {
|
|||||||
if (self->items != NULL) {
|
if (self->items != NULL) {
|
||||||
for (i = 0; i < self->item_count; i++) { nullfree(self->items[i]); }
|
for (i = 0; i < self->item_count; i++) { nullfree(self->items[i]); }
|
||||||
}
|
}
|
||||||
nullfree(self->items); nullfree(self->sort_items); nullfree(self->item_lengths); nullfree(self->sort_item_lengths); Py_XDECREF(self->py_items); Py_XDECREF(self->py_sort_keys);
|
nullfree(self->items); nullfree(self->item_lengths);
|
||||||
nullfree(self->level1); nullfree(self->level2); nullfree(self->level3);
|
nullfree(self->level1); nullfree(self->level2); nullfree(self->level3);
|
||||||
}
|
}
|
||||||
static void
|
static void
|
||||||
@ -239,66 +324,60 @@ Matcher_dealloc(Matcher* self)
|
|||||||
static int
|
static int
|
||||||
Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
|
Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
|
||||||
{
|
{
|
||||||
PyObject *items = NULL, *sort_keys = NULL, *p = NULL;
|
PyObject *items = NULL, *p = NULL, *py_items = NULL;
|
||||||
char *utf8 = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL;
|
char *utf8 = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL;
|
||||||
int32_t i = 0;
|
int32_t i = 0;
|
||||||
Py_ssize_t cap = 0, l1s, l2s, l3s;
|
Py_ssize_t cap = 0, l1s, l2s, l3s;
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "OOs#s#s#", &items, &sort_keys, &level1, &l1s, &level2, &l2s, &level3, &l3s)) return -1;
|
if (!PyArg_ParseTuple(args, "Os#s#s#", &items, &level1, &l1s, &level2, &l2s, &level3, &l3s)) return -1;
|
||||||
self->py_items = PySequence_Fast(items, "Must pass in two sequence objects");
|
py_items = PySequence_Fast(items, "Must pass in two sequence objects");
|
||||||
self->py_sort_keys = PySequence_Fast(sort_keys, "Must pass in two sequence objects");
|
if (py_items == NULL) goto end;
|
||||||
if (self->py_items == NULL || self->py_sort_keys == NULL) goto end;
|
|
||||||
self->item_count = (uint32_t)PySequence_Size(items);
|
self->item_count = (uint32_t)PySequence_Size(items);
|
||||||
if (self->item_count != (uint32_t)PySequence_Size(sort_keys)) { PyErr_SetString(PyExc_TypeError, "The sequences must have the same length."); }
|
|
||||||
|
|
||||||
self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
|
self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
|
||||||
self->sort_items = (char**)calloc(self->item_count, sizeof(char*));
|
|
||||||
self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
|
self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
|
||||||
self->sort_item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
|
|
||||||
self->level1 = (UChar*)calloc(alloc_uchar(l1s), sizeof(UChar));
|
self->level1 = (UChar*)calloc(alloc_uchar(l1s), sizeof(UChar));
|
||||||
self->level2 = (UChar*)calloc(alloc_uchar(l2s), sizeof(UChar));
|
self->level2 = (UChar*)calloc(alloc_uchar(l2s), sizeof(UChar));
|
||||||
self->level3 = (UChar*)calloc(alloc_uchar(l3s), sizeof(UChar));
|
self->level3 = (UChar*)calloc(alloc_uchar(l3s), sizeof(UChar));
|
||||||
|
|
||||||
if (self->items == NULL || self->sort_items == NULL || self->item_lengths == NULL || self->sort_item_lengths == NULL || self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) {
|
if (self->items == NULL || self->item_lengths == NULL || self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) {
|
||||||
PyErr_NoMemory(); goto end;
|
PyErr_NoMemory(); goto end;
|
||||||
}
|
}
|
||||||
|
|
||||||
u_strFromUTF8Lenient(self->level1, alloc_uchar(l1s), &i, level1, (int32_t)l1s, &status);
|
u_strFromUTF8Lenient(self->level1, alloc_uchar(l1s), &i, level1, (int32_t)l1s, &status);
|
||||||
u_strFromUTF8Lenient(self->level2, alloc_uchar(l2s), &i, level2, (int32_t)l2s, &status);
|
u_strFromUTF8Lenient(self->level2, alloc_uchar(l2s), &i, level2, (int32_t)l2s, &status);
|
||||||
u_strFromUTF8Lenient(self->level3, alloc_uchar(l3s), &i, level3, (int32_t)l3s, &status);
|
u_strFromUTF8Lenient(self->level3, alloc_uchar(l3s), &i, level3, (int32_t)l3s, &status);
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes for level string from UTF-8 to UTF-16"); goto end; }
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes for level string from UTF-8 to UTF-16"); goto end; }
|
||||||
|
|
||||||
for (i = 0; i < self->item_count; i++) {
|
for (i = 0; i < self->item_count; i++) {
|
||||||
p = PySequence_Fast_GET_ITEM(self->py_items, i);
|
p = PySequence_Fast_GET_ITEM(py_items, i);
|
||||||
utf8 = PyBytes_AsString(p);
|
utf8 = PyBytes_AsString(p);
|
||||||
if (utf8 == NULL) goto end;
|
if (utf8 == NULL) goto end;
|
||||||
cap = PyBytes_GET_SIZE(p);
|
cap = PyBytes_GET_SIZE(p);
|
||||||
self->items[i] = (UChar*)calloc(alloc_uchar(cap), sizeof(UChar));
|
self->items[i] = (UChar*)calloc(alloc_uchar(cap), sizeof(UChar));
|
||||||
if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
|
if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
u_strFromUTF8Lenient(self->items[i], alloc_uchar(cap), &(self->item_lengths[i]), utf8, cap, &status);
|
u_strFromUTF8Lenient(self->items[i], alloc_uchar(cap), NULL, utf8, cap, &status);
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; }
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; }
|
||||||
|
self->item_lengths[i] = u_strlen(self->items[i]);
|
||||||
p = PySequence_Fast_GET_ITEM(self->py_sort_keys, i);
|
|
||||||
self->sort_items[i] = PyBytes_AsString(p);
|
|
||||||
if (self->sort_items[i] == NULL) goto end;
|
|
||||||
self->sort_item_lengths[i] = (uint32_t) PyBytes_GET_SIZE(p);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
end:
|
end:
|
||||||
|
Py_XDECREF(py_items);
|
||||||
if (PyErr_Occurred()) { free_matcher(self); }
|
if (PyErr_Occurred()) { free_matcher(self); }
|
||||||
return (PyErr_Occurred()) ? -1 : 0;
|
return (PyErr_Occurred()) ? -1 : 0;
|
||||||
}
|
}
|
||||||
// Matcher.__init__() }}}
|
// Matcher.__init__() }}}
|
||||||
|
|
||||||
// Matcher.get_matches {{{
|
// Matcher.calculate_scores {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
Matcher_get_matches(Matcher *self, PyObject *args) {
|
Matcher_calculate_scores(Matcher *self, PyObject *args) {
|
||||||
char *cneedle = NULL;
|
char *cneedle = NULL;
|
||||||
int32_t qsize = 0;
|
int32_t qsize = 0, *final_positions = NULL, *p;
|
||||||
Match *matches = NULL;
|
Match *matches = NULL;
|
||||||
bool ok = FALSE;
|
bool ok = FALSE;
|
||||||
uint32_t i = 0;
|
uint32_t i = 0, needle_char_len = 0, j = 0;
|
||||||
PyObject *items = NULL;
|
PyObject *items = NULL, *score = NULL, *positions = NULL;
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
UChar *needle = NULL;
|
UChar *needle = NULL;
|
||||||
|
|
||||||
@ -308,39 +387,48 @@ Matcher_get_matches(Matcher *self, PyObject *args) {
|
|||||||
if (needle == NULL) return PyErr_NoMemory();
|
if (needle == NULL) return PyErr_NoMemory();
|
||||||
u_strFromUTF8Lenient(needle, alloc_uchar(qsize), &qsize, cneedle, qsize, &status);
|
u_strFromUTF8Lenient(needle, alloc_uchar(qsize), &qsize, cneedle, qsize, &status);
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; }
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; }
|
||||||
|
needle_char_len = u_countChar32(needle, -1);
|
||||||
items = PyTuple_New(self->item_count);
|
items = PyTuple_New(self->item_count);
|
||||||
|
positions = PyTuple_New(self->item_count);
|
||||||
matches = (Match*)calloc(self->item_count, sizeof(Match));
|
matches = (Match*)calloc(self->item_count, sizeof(Match));
|
||||||
if (items == NULL || matches == NULL) {PyErr_NoMemory(); goto end;}
|
final_positions = (int32_t*) calloc(needle_char_len * self->item_count, sizeof(int32_t));
|
||||||
|
if (items == NULL || matches == NULL || final_positions == NULL || positions == NULL) {PyErr_NoMemory(); goto end;}
|
||||||
|
|
||||||
for (i = 0; i < self->item_count; i++) {
|
for (i = 0; i < self->item_count; i++) {
|
||||||
matches[i].item = self->items[i];
|
score = PyTuple_New(needle_char_len);
|
||||||
matches[i].sort_key = self->sort_items[i];
|
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
matches[i].sort_key_len = self->sort_item_lengths[i];
|
PyTuple_SET_ITEM(positions, (Py_ssize_t)i, score);
|
||||||
matches[i].py_item = PySequence_Fast_GET_ITEM(self->py_items, (Py_ssize_t)i);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Py_BEGIN_ALLOW_THREADS;
|
Py_BEGIN_ALLOW_THREADS;
|
||||||
ok = match(self->items, self->item_lengths, self->item_count, needle, (uint32_t)qsize, matches, self->level1, self->level2, self->level3);
|
ok = match(self->items, self->item_lengths, self->item_count, needle, matches, final_positions, needle_char_len, self->level1, self->level2, self->level3);
|
||||||
if (ok) qsort(matches, self->item_count, sizeof(Match), cmp_score, NULL);
|
|
||||||
Py_END_ALLOW_THREADS;
|
Py_END_ALLOW_THREADS;
|
||||||
|
|
||||||
if (ok) {
|
if (ok) {
|
||||||
for (i = 0; i < self->item_count; i++) {
|
for (i = 0; i < self->item_count; i++) {
|
||||||
PyTuple_SET_ITEM(items, (Py_ssize_t)i, matches[i].py_item);
|
score = PyFloat_FromDouble(matches[i].score);
|
||||||
Py_INCREF(matches[i].py_item);
|
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
|
PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
|
||||||
|
p = final_positions + i;
|
||||||
|
for (j = 0; j < needle_char_len; j++) {
|
||||||
|
score = PyInt_FromLong((long)p[j]);
|
||||||
|
if (score == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
|
PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else { PyErr_NoMemory(); goto end; }
|
} else { PyErr_NoMemory(); goto end; }
|
||||||
|
|
||||||
end:
|
end:
|
||||||
nullfree(needle);
|
nullfree(needle);
|
||||||
nullfree(matches);
|
nullfree(matches);
|
||||||
if (PyErr_Occurred()) { Py_XDECREF(items); return NULL; }
|
nullfree(final_positions);
|
||||||
return items;
|
if (PyErr_Occurred()) { Py_XDECREF(items); items = NULL; Py_XDECREF(positions); positions = NULL; return NULL; }
|
||||||
|
return Py_BuildValue("NN", items, positions);
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
static PyMethodDef Matcher_methods[] = {
|
static PyMethodDef Matcher_methods[] = {
|
||||||
{"get_matches", (PyCFunction)Matcher_get_matches, METH_VARARGS,
|
{"calculate_scores", (PyCFunction)Matcher_calculate_scores, METH_VARARGS,
|
||||||
"get_matches(query) -> Return the sorted list of matches for query which must be a UTF-8 encoded string."
|
"calculate_scores(query) -> Return the scores for all items given query as a tuple."
|
||||||
},
|
},
|
||||||
|
|
||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
|
@ -8,14 +8,19 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
|
|
||||||
from unicodedata import normalize
|
from unicodedata import normalize
|
||||||
|
|
||||||
|
from itertools import izip
|
||||||
from future_builtins import map
|
from future_builtins import map
|
||||||
|
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
from calibre.utils.icu import primary_sort_key
|
from calibre.utils.icu import primary_sort_key, find
|
||||||
|
|
||||||
|
DEFAULT_LEVEL1 = '/'
|
||||||
|
DEFAULT_LEVEL2 = '-_ 0123456789'
|
||||||
|
DEFAULT_LEVEL3 = '.'
|
||||||
|
|
||||||
class Matcher(object):
|
class Matcher(object):
|
||||||
|
|
||||||
def __init__(self, items, level1='/', level2='-_ 0123456789', level3='.'):
|
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
||||||
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
||||||
items = tuple(map(lambda x: x.encode('utf-8'), items))
|
items = tuple(map(lambda x: x.encode('utf-8'), items))
|
||||||
sort_keys = tuple(map(primary_sort_key, items))
|
sort_keys = tuple(map(primary_sort_key, items))
|
||||||
@ -29,6 +34,96 @@ class Matcher(object):
|
|||||||
query = normalize('NFC', unicode(query)).encode('utf-8')
|
query = normalize('NFC', unicode(query)).encode('utf-8')
|
||||||
return map(lambda x:x.decode('utf-8'), self.m.get_matches(query))
|
return map(lambda x:x.decode('utf-8'), self.m.get_matches(query))
|
||||||
|
|
||||||
|
|
||||||
|
def calc_score_for_char(ctx, prev, current, distance):
    '''
    Score a matched character that does not directly follow the previous match.

    :param ctx: object providing level1, level2, level3 and max_score_per_char
    :param prev: the haystack character immediately before the matched one
    :param current: the matched haystack character
    :param distance: distance from the previously matched character
    :return: ctx.max_score_per_char scaled by a factor chosen from prev
    '''
    if prev in ctx.level1:
        weight = 0.9
    elif prev in ctx.level2:
        weight = 0.8
    elif icu_lower(prev) == prev and icu_upper(current) == current:
        # prev is unchanged by lowercasing and current by uppercasing
        weight = 0.8
    elif prev in ctx.level3:
        weight = 0.7
    else:
        # prev is not special: the factor shrinks as the distance grows
        weight = (1.0 / distance) * 0.75

    return ctx.max_score_per_char * weight
|
||||||
|
|
||||||
|
def process_item(ctx, haystack, needle):
    '''
    Find the best scoring way of matching needle as a subsequence of haystack.

    Non-recursive implementation: pending search states are kept on an
    explicit stack as (hidx, nidx, last_idx, score, positions) tuples and
    finished states are memoized in ctx.memory, keyed by
    (hidx, nidx, last_idx).

    :return: (final_score, final_positions), where final_positions[i] is the
             index in haystack matched by needle[i], or -1
    '''
    stack = [(0, 0, 0, 0, [-1]*len(needle))]
    final_score, final_positions = stack[0][-2:]
    push, pop = stack.append, stack.pop
    while stack:
        hidx, nidx, last_idx, score, positions = pop()
        key = (hidx, nidx, last_idx)
        mem = ctx.memory.get(key, None)
        if mem is None:
            for i in xrange(nidx, len(needle)):
                n = needle[i]
                if (len(haystack) - hidx < len(needle) - i):
                    # Not enough haystack left for the rest of the needle
                    score = 0
                    break
                # find() reports a miss as -1 relative to the slice, so test
                # for it BEFORE converting to an index into haystack
                pos = find(n, haystack[hidx:])[0]
                if pos == -1:
                    score = 0
                    break
                pos += hidx

                distance = pos - last_idx
                score_for_char = ctx.max_score_per_char if distance <= 1 else calc_score_for_char(ctx, haystack[pos-1], haystack[pos], distance)
                hidx = pos + 1
                # Save a state that skips this occurrence, to try later matches of needle[i]
                push((hidx, i, last_idx, score, list(positions)))
                last_idx = positions[i] = pos
                score += score_for_char
            ctx.memory[key] = (score, positions)
        else:
            score, positions = mem
        if score > final_score:
            final_score = score
            final_positions = positions
    return final_score, final_positions
|
||||||
|
|
||||||
|
class PyScorer(object):
    '''
    Pure Python scorer, for experimenting with the matching algorithm.
    Calling an instance with a needle yields one process_item() result,
    (score, positions), per item.
    '''
    __slots__ = ('level1', 'level2', 'level3', 'max_score_per_char', 'items', 'memory')

    def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
        self.level1, self.level2, self.level3 = level1, level2, level3
        self.max_score_per_char = 0
        self.memory = {}
        # Materialize the items: map() may be a one-shot iterator (this module
        # imports it from future_builtins), which would leave nothing to
        # score on the second and later queries.
        self.items = tuple(map(lambda x: normalize('NFC', unicode(x)), filter(None, items)))

    def __call__(self, needle):
        for item in self.items:
            # An empty needle cannot match anything; avoid dividing by zero
            # and let process_item() report a score of 0.
            self.max_score_per_char = ((1.0 / len(item) + 1.0 / len(needle)) / 2.0) if needle else 0.0
            self.memory = {}  # memoized results are only valid for a single item
            yield process_item(self, item, needle)
|
||||||
|
|
||||||
|
class CScorer(object):
    '''
    Scorer backed by the compiled matcher plugin. Calling an instance with a
    query yields one (score, positions) pair per item.
    '''

    def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
        # Drop falsy items, NFC-normalize the rest and hand them over as UTF-8 bytes
        encoded = tuple(normalize('NFC', unicode(x)).encode('utf-8') for x in items if x)

        speedup, err = plugins['matcher']
        if speedup is None:
            raise RuntimeError('Failed to load the matcher plugin with error: %s' % err)
        levels = [lvl.encode('utf-8') for lvl in (level1, level2, level3)]
        self.m = speedup.Matcher(encoded, levels[0], levels[1], levels[2])

    def __call__(self, query):
        raw_query = normalize('NFC', unicode(query)).encode('utf-8')
        scores, positions = self.m.calculate_scores(raw_query)
        for pair in izip(scores, positions):
            yield pair[0], pair[1]
|
||||||
|
|
||||||
|
def test():
    ''' Print the results of the Python and the C scorer for one sample item. '''
    items = ['m1mn34o/mno']
    scorers = (PyScorer(items), CScorer(items))
    for scorer in scorers:
        print (scorer)
        for item, result in izip(items, scorer('mno')):
            score, positions = result
            print (item, score, positions)
|
||||||
|
|
||||||
def test_mem():
|
def test_mem():
|
||||||
from calibre.utils.mem import gc_histogram, diff_hists
|
from calibre.utils.mem import gc_histogram, diff_hists
|
||||||
m = Matcher([])
|
m = Matcher([])
|
||||||
@ -45,7 +140,8 @@ def test_mem():
|
|||||||
diff_hists(h1, h2)
|
diff_hists(h1, h2)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
m = Matcher(['image/one.png', 'image/two.gif', 'text/one.html'])
|
test()
|
||||||
for q in ('one', 'ton', 'imo'):
|
# m = Matcher(['image/one.png', 'image/two.gif', 'text/one.html'])
|
||||||
print (q, '->', tuple(m(q)))
|
# for q in ('one', 'ONE', 'ton', 'imo'):
|
||||||
test_mem()
|
# print (q, '->', tuple(m(q)))
|
||||||
|
# test_mem()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user