mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Use the faster py->icu string conversion in the matcher
This commit is contained in:
parent
df6a06c8b7
commit
82b6335a3c
@ -180,10 +180,11 @@ extensions = [
|
|||||||
|
|
||||||
Extension('matcher',
|
Extension('matcher',
|
||||||
['calibre/gui2/tweak_book/matcher.c'],
|
['calibre/gui2/tweak_book/matcher.c'],
|
||||||
|
headers=['calibre/utils/icu_calibre_utils.h'],
|
||||||
libraries=icu_libs,
|
libraries=icu_libs,
|
||||||
lib_dirs=icu_lib_dirs,
|
lib_dirs=icu_lib_dirs,
|
||||||
cflags=icu_cflags,
|
cflags=icu_cflags,
|
||||||
inc_dirs=icu_inc_dirs # + (['calibre/utils/chm'] if iswindows else []) # For stdint.h
|
inc_dirs=icu_inc_dirs + ['calibre/utils']
|
||||||
),
|
),
|
||||||
|
|
||||||
Extension('podofo',
|
Extension('podofo',
|
||||||
|
@ -5,14 +5,8 @@
|
|||||||
* Distributed under terms of the GPL3 license.
|
* Distributed under terms of the GPL3 license.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define PY_SSIZE_T_CLEAN
|
#define NO_ICU_TO_PYTHON
|
||||||
#include <Python.h>
|
#include "icu_calibre_utils.h"
|
||||||
#include <float.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <search.h>
|
|
||||||
#include <unicode/uchar.h>
|
|
||||||
#include <unicode/ustring.h>
|
|
||||||
#include <unicode/utf16.h>
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
// inline does not work with the visual studio C compiler
|
// inline does not work with the visual studio C compiler
|
||||||
@ -325,42 +319,27 @@ Matcher_dealloc(Matcher* self)
|
|||||||
static int
|
static int
|
||||||
Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
|
Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
|
||||||
{
|
{
|
||||||
PyObject *items = NULL, *p = NULL, *py_items = NULL;
|
PyObject *items = NULL, *p = NULL, *py_items = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL;
|
||||||
char *utf8 = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL;
|
|
||||||
int32_t i = 0;
|
int32_t i = 0;
|
||||||
Py_ssize_t cap = 0, l1s, l2s, l3s;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "Os#s#s#", &items, &level1, &l1s, &level2, &l2s, &level3, &l3s)) return -1;
|
if (!PyArg_ParseTuple(args, "OOOO", &items, &level1, &level2, &level3)) return -1;
|
||||||
py_items = PySequence_Fast(items, "Must pass in two sequence objects");
|
py_items = PySequence_Fast(items, "Must pass in two sequence objects");
|
||||||
if (py_items == NULL) goto end;
|
if (py_items == NULL) goto end;
|
||||||
self->item_count = (uint32_t)PySequence_Size(items);
|
self->item_count = (uint32_t)PySequence_Size(items);
|
||||||
|
|
||||||
self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
|
self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
|
||||||
self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
|
self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
|
||||||
self->level1 = (UChar*)calloc(alloc_uchar(l1s), sizeof(UChar));
|
self->level1 = python_to_icu(level1, NULL, 1);
|
||||||
self->level2 = (UChar*)calloc(alloc_uchar(l2s), sizeof(UChar));
|
self->level2 = python_to_icu(level2, NULL, 1);
|
||||||
self->level3 = (UChar*)calloc(alloc_uchar(l3s), sizeof(UChar));
|
self->level3 = python_to_icu(level3, NULL, 1);
|
||||||
|
|
||||||
if (self->items == NULL || self->item_lengths == NULL || self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) {
|
if (self->items == NULL || self->item_lengths == NULL ) { PyErr_NoMemory(); goto end; }
|
||||||
PyErr_NoMemory(); goto end;
|
if (self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) goto end;
|
||||||
}
|
|
||||||
|
|
||||||
u_strFromUTF8Lenient(self->level1, alloc_uchar((int32_t)l1s), &i, level1, (int32_t)l1s, &status);
|
|
||||||
u_strFromUTF8Lenient(self->level2, alloc_uchar((int32_t)l2s), &i, level2, (int32_t)l2s, &status);
|
|
||||||
u_strFromUTF8Lenient(self->level3, alloc_uchar((int32_t)l3s), &i, level3, (int32_t)l3s, &status);
|
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes for level string from UTF-8 to UTF-16"); goto end; }
|
|
||||||
|
|
||||||
for (i = 0; i < (int32_t)self->item_count; i++) {
|
for (i = 0; i < (int32_t)self->item_count; i++) {
|
||||||
p = PySequence_Fast_GET_ITEM(py_items, i);
|
p = PySequence_Fast_GET_ITEM(py_items, i);
|
||||||
utf8 = PyBytes_AsString(p);
|
self->items[i] = python_to_icu(p, self->item_lengths + i, 1);
|
||||||
if (utf8 == NULL) goto end;
|
|
||||||
cap = PyBytes_GET_SIZE(p);
|
|
||||||
self->items[i] = (UChar*)calloc(alloc_uchar(cap), sizeof(UChar));
|
|
||||||
if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
|
if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
u_strFromUTF8Lenient(self->items[i], alloc_uchar((int32_t)cap), NULL, utf8, (int32_t)cap, &status);
|
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; }
|
|
||||||
self->item_lengths[i] = u_strlen(self->items[i]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
end:
|
end:
|
||||||
@ -373,21 +352,17 @@ end:
|
|||||||
// Matcher.calculate_scores {{{
|
// Matcher.calculate_scores {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
Matcher_calculate_scores(Matcher *self, PyObject *args) {
|
Matcher_calculate_scores(Matcher *self, PyObject *args) {
|
||||||
char *cneedle = NULL;
|
int32_t *final_positions = NULL, *p;
|
||||||
int32_t qsize = 0, *final_positions = NULL, *p;
|
|
||||||
Match *matches = NULL;
|
Match *matches = NULL;
|
||||||
bool ok = FALSE;
|
bool ok = FALSE;
|
||||||
uint32_t i = 0, needle_char_len = 0, j = 0;
|
uint32_t i = 0, needle_char_len = 0, j = 0;
|
||||||
PyObject *items = NULL, *score = NULL, *positions = NULL;
|
PyObject *items = NULL, *score = NULL, *positions = NULL, *pneedle = NULL;
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
UChar *needle = NULL;
|
UChar *needle = NULL;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "s#", &cneedle, &qsize)) return NULL;
|
if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL;
|
||||||
|
|
||||||
needle = (UChar*)calloc(alloc_uchar(qsize), sizeof(UChar));
|
needle = python_to_icu(pneedle, NULL, 1);
|
||||||
if (needle == NULL) return PyErr_NoMemory();
|
if (needle == NULL) return NULL;
|
||||||
u_strFromUTF8Lenient(needle, alloc_uchar(qsize), &qsize, cneedle, qsize, &status);
|
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to convert bytes from UTF-8 to UTF-16"); goto end; }
|
|
||||||
needle_char_len = u_countChar32(needle, -1);
|
needle_char_len = u_countChar32(needle, -1);
|
||||||
items = PyTuple_New(self->item_count);
|
items = PyTuple_New(self->item_count);
|
||||||
positions = PyTuple_New(self->item_count);
|
positions = PyTuple_New(self->item_count);
|
||||||
|
@ -101,16 +101,15 @@ class PyScorer(object):
|
|||||||
class CScorer(object):
|
class CScorer(object):
|
||||||
|
|
||||||
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
def __init__(self, items, level1=DEFAULT_LEVEL1, level2=DEFAULT_LEVEL2, level3=DEFAULT_LEVEL3):
|
||||||
items = map(lambda x: normalize('NFC', unicode(x)), filter(None, items))
|
items = tuple(map(lambda x: normalize('NFC', unicode(x)), filter(None, items)))
|
||||||
items = tuple(map(lambda x: x.encode('utf-8'), items))
|
|
||||||
|
|
||||||
speedup, err = plugins['matcher']
|
speedup, err = plugins['matcher']
|
||||||
if speedup is None:
|
if speedup is None:
|
||||||
raise RuntimeError('Failed to load the matcher plugin with error: %s' % err)
|
raise RuntimeError('Failed to load the matcher plugin with error: %s' % err)
|
||||||
self.m = speedup.Matcher(items, level1.encode('utf-8'), level2.encode('utf-8'), level3.encode('utf-8'))
|
self.m = speedup.Matcher(items, unicode(level1), unicode(level2), unicode(level3))
|
||||||
|
|
||||||
def __call__(self, query):
|
def __call__(self, query):
|
||||||
query = normalize('NFC', unicode(query)).encode('utf-8')
|
query = normalize('NFC', unicode(query))
|
||||||
scores, positions = self.m.calculate_scores(query)
|
scores, positions = self.m.calculate_scores(query)
|
||||||
for score, pos in izip(scores, positions):
|
for score, pos in izip(scores, positions):
|
||||||
yield score, pos
|
yield score, pos
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
#if PY_VERSION_HEX < 0x03030000
|
#if PY_VERSION_HEX < 0x03030000
|
||||||
// Roundtripping will need to be implemented differently for python > 3.2 where strings are stored with variable widths
|
// Roundtripping will need to be implemented differently for python > 3.2 where strings are stored with variable widths
|
||||||
|
|
||||||
|
#ifndef NO_PYTHON_TO_ICU
|
||||||
static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
|
static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
|
||||||
UChar *ans = NULL;
|
UChar *ans = NULL;
|
||||||
Py_ssize_t sz = 0;
|
Py_ssize_t sz = 0;
|
||||||
@ -50,12 +51,15 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
|
|||||||
end:
|
end:
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef NO_ICU_TO_PYTHON
|
||||||
static PyObject* icu_to_python(UChar *src, int32_t sz) {
|
static PyObject* icu_to_python(UChar *src, int32_t sz) {
|
||||||
if (sizeof(Py_UNICODE) == 2) // narrow build UTF-16
|
if (sizeof(Py_UNICODE) == 2) // narrow build UTF-16
|
||||||
return PyUnicode_FromUnicode((Py_UNICODE*)src, sz);
|
return PyUnicode_FromUnicode((Py_UNICODE*)src, sz);
|
||||||
return PyUnicode_DecodeUTF16((char*)src, sz*sizeof(UChar), "strict", NULL);
|
return PyUnicode_DecodeUTF16((char*)src, sz*sizeof(UChar), "strict", NULL);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user