mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow directly testing the tokenizer
This commit is contained in:
parent
4127117e8a
commit
6302937c4f
@ -303,6 +303,26 @@ set_ui_language(PyObject *self, PyObject *args) {
|
|||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
py_callback(void *ctx, int flags, const char *text, int text_length, int start_offset, int end_offset) {
|
||||||
|
PyObject *ans = reinterpret_cast<PyObject*>(ctx);
|
||||||
|
pyobject_raii item(Py_BuildValue("{ss# si si si}", "text", text, text_length, "start", start_offset, "end", end_offset, "flags", flags));
|
||||||
|
if (item) PyList_Append(ans, item.ptr());
|
||||||
|
return SQLITE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Python entry point: tokenize(text, remove_diacritics=True, flags=FTS5_TOKENIZE_DOCUMENT)
//
// Runs a Tokenizer over text and returns a new list with one entry per token,
// as produced by py_callback. The tokenizer is configured with
// remove_diacritics set to "2", or "0" when the remove_diacritics argument is
// false. flags is passed through to Tokenizer::tokenize() unchanged.
//
// Returns NULL with a Python exception set if argument parsing fails, the
// result list cannot be allocated, or an exception was raised while tokens
// were being collected.
// NOTE(review): "s#" stores its length as Py_ssize_t when PY_SSIZE_T_CLEAN is
// defined before Python.h is included; text_length is an int here -- confirm
// how this file is built.
static PyObject*
tokenize(PyObject *self, PyObject *args) {
    const char *text; int text_length, remove_diacritics = 1, flags = FTS5_TOKENIZE_DOCUMENT;
    if (!PyArg_ParseTuple(args, "s#|pi", &text, &text_length, &remove_diacritics, &flags)) return NULL;
    const char *targs[2] = {"remove_diacritics", remove_diacritics ? "2" : "0"};
    Tokenizer t(targs, sizeof(targs)/sizeof(targs[0]));
    pyobject_raii ans(PyList_New(0));
    // PyList_New() returns NULL on allocation failure; never hand NULL to the callback.
    if (!ans) return NULL;
    t.tokenize(ans.ptr(), flags, text, text_length, py_callback);
    // The callback reports failures by leaving a Python exception set;
    // returning a value with an exception pending is an error in CPython.
    if (PyErr_Occurred()) return NULL;
    return ans.detach();
}
|
||||||
|
|
||||||
static PyMethodDef methods[] = {
|
static PyMethodDef methods[] = {
|
||||||
{"get_locales_for_break_iteration", get_locales_for_break_iteration, METH_NOARGS,
|
{"get_locales_for_break_iteration", get_locales_for_break_iteration, METH_NOARGS,
|
||||||
"Get list of available locales for break iteration"
|
"Get list of available locales for break iteration"
|
||||||
@ -310,11 +330,21 @@ static PyMethodDef methods[] = {
|
|||||||
{"set_ui_language", set_ui_language, METH_VARARGS,
|
{"set_ui_language", set_ui_language, METH_VARARGS,
|
||||||
"Set the current UI language"
|
"Set the current UI language"
|
||||||
},
|
},
|
||||||
|
{"tokenize", tokenize, METH_VARARGS,
|
||||||
|
"Tokenize a string, useful for testing"
|
||||||
|
},
|
||||||
{NULL, NULL, 0, NULL}
|
{NULL, NULL, 0, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
static int
|
static int
|
||||||
exec_module(PyObject *mod) { return 0; }
|
exec_module(PyObject *mod) {
|
||||||
|
if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_QUERY) != 0) return 1;
|
||||||
|
if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_DOCUMENT) != 0) return 1;
|
||||||
|
if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_PREFIX) != 0) return 1;
|
||||||
|
if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_AUX) != 0) return 1;
|
||||||
|
if (PyModule_AddIntMacro(mod, FTS5_TOKEN_COLOCATED) != 0) return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static PyModuleDef_Slot slots[] = { {Py_mod_exec, (void*)exec_module}, {0, NULL} };
|
static PyModuleDef_Slot slots[] = { {Py_mod_exec, (void*)exec_module}, {0, NULL} };
|
||||||
|
|
||||||
|
@ -47,10 +47,27 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
|||||||
return list(self.execute(stmt, (query,)))
|
return list(self.execute(stmt, (query,)))
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize(text, flags=None, remove_diacritics=True):
    '''
    Run the native tokenizer from the sqlite extension over ``text`` and
    return its result unchanged.

    ``flags`` defaults to ``FTS5_TOKENIZE_DOCUMENT`` when it is None. Note that
    the native function takes ``remove_diacritics`` before ``flags``, the
    opposite of this wrapper's parameter order.
    '''
    # Imported lazily so the extension is only loaded when tokenizing.
    from calibre_extensions.sqlite_extension import (
        FTS5_TOKENIZE_DOCUMENT, tokenize as native_tokenize
    )
    effective_flags = FTS5_TOKENIZE_DOCUMENT if flags is None else flags
    return native_tokenize(text, remove_diacritics, effective_flags)
|
||||||
|
|
||||||
|
|
||||||
class FTSTest(BaseTest):
|
class FTSTest(BaseTest):
|
||||||
ae = BaseTest.assertEqual
|
ae = BaseTest.assertEqual
|
||||||
|
|
||||||
def test_basic_fts(self): # {{{
|
def test_fts_tokenize(self): # {{{
    # Checks the token records produced by tokenize() for a short string
    # containing a non-ASCII character.
    def token(text, start, end, flags=0):
        # Shape of a single token record as returned by tokenize().
        return dict(text=text, start=start, end=end, flags=flags)

    # NOTE(review): offsets look like byte offsets -- 'wörds' is five
    # characters but spans 5..11. The last entry is the same span with the
    # diacritic removed and flags=1 (presumably FTS5_TOKEN_COLOCATED) -- confirm.
    expected = [
        token('some', 0, 4),
        token('wörds', 5, 11),
        token('words', 5, 11, 1),
    ]
    self.ae(tokenize("Some wörds"), expected)
    # }}}
|
||||||
|
|
||||||
|
def test_fts_basic(self): # {{{
|
||||||
conn = TestConn()
|
conn = TestConn()
|
||||||
conn.insert_text('two words, and a period. With another.')
|
conn.insert_text('two words, and a period. With another.')
|
||||||
conn.insert_text('and another re-init')
|
conn.insert_text('and another re-init')
|
||||||
@ -60,6 +77,8 @@ class FTSTest(BaseTest):
|
|||||||
conn = TestConn()
|
conn = TestConn()
|
||||||
conn.insert_text('coộl')
|
conn.insert_text('coộl')
|
||||||
self.ae(conn.term_row_counts(), {'cool': 1, 'coộl': 1})
|
self.ae(conn.term_row_counts(), {'cool': 1, 'coộl': 1})
|
||||||
|
self.ae(conn.search("cool"), [('>coộl<',)])
|
||||||
|
self.ae(conn.search("coộl"), [('>coộl<',)])
|
||||||
conn = TestConn(remove_diacritics=False)
|
conn = TestConn(remove_diacritics=False)
|
||||||
conn.insert_text('coộl')
|
conn.insert_text('coộl')
|
||||||
self.ae(conn.term_row_counts(), {'coộl': 1})
|
self.ae(conn.term_row_counts(), {'coộl': 1})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user