diff --git a/src/calibre/db/sqlite_extension.cpp b/src/calibre/db/sqlite_extension.cpp
index 30d318b640..7080f9c3c1 100644
--- a/src/calibre/db/sqlite_extension.cpp
+++ b/src/calibre/db/sqlite_extension.cpp
@@ -303,6 +303,26 @@ set_ui_language(PyObject *self, PyObject *args) {
     Py_RETURN_NONE;
 }
 
+static int
+py_callback(void *ctx, int flags, const char *text, int text_length, int start_offset, int end_offset) {
+    PyObject *ans = reinterpret_cast<PyObject*>(ctx);
+    pyobject_raii item(Py_BuildValue("{ss# si si si}", "text", text, text_length, "start", start_offset, "end", end_offset, "flags", flags));
+    if (item) PyList_Append(ans, item.ptr());
+    return SQLITE_OK;
+}
+
+static PyObject*
+tokenize(PyObject *self, PyObject *args) {
+    const char *text; int text_length, remove_diacritics = 1, flags = FTS5_TOKENIZE_DOCUMENT;
+    if (!PyArg_ParseTuple(args, "s#|pi", &text, &text_length, &remove_diacritics, &flags)) return NULL;
+    const char *targs[2] = {"remove_diacritics", "2"};
+    if (!remove_diacritics) targs[1] = "0";
+    Tokenizer t(targs, sizeof(targs)/sizeof(targs[0]));
+    pyobject_raii ans(PyList_New(0));
+    t.tokenize(ans.ptr(), flags, text, text_length, py_callback);
+    return ans.detach();
+}
+
 static PyMethodDef methods[] = {
     {"get_locales_for_break_iteration", get_locales_for_break_iteration, METH_NOARGS,
      "Get list of available locales for break iteration"
@@ -310,11 +330,21 @@ static PyMethodDef methods[] = {
     {"set_ui_language", set_ui_language, METH_VARARGS,
      "Set the current UI language"
     },
+    {"tokenize", tokenize, METH_VARARGS,
+     "Tokenize a string, useful for testing"
+    },
     {NULL, NULL, 0, NULL}
 };
 
 static int
-exec_module(PyObject *mod) { return 0; }
+exec_module(PyObject *mod) {
+    if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_QUERY) != 0) return 1;
+    if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_DOCUMENT) != 0) return 1;
+    if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_PREFIX) != 0) return 1;
+    if (PyModule_AddIntMacro(mod, FTS5_TOKENIZE_AUX) != 0) return 1;
+    if (PyModule_AddIntMacro(mod, FTS5_TOKEN_COLOCATED) != 0) return 1;
+    return 0;
+}
 
 static PyModuleDef_Slot slots[] = { {Py_mod_exec, (void*)exec_module}, {0, NULL} };
 
diff --git a/src/calibre/db/tests/fts.py b/src/calibre/db/tests/fts.py
index 497f95caa9..3273634209 100644
--- a/src/calibre/db/tests/fts.py
+++ b/src/calibre/db/tests/fts.py
@@ -47,10 +47,27 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
         return list(self.execute(stmt, (query,)))
 
 
+def tokenize(text, flags=None, remove_diacritics=True):
+    from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
+    if flags is None:
+        flags = FTS5_TOKENIZE_DOCUMENT
+    return tokenize(text, remove_diacritics, flags)
+
+
 class FTSTest(BaseTest):
     ae = BaseTest.assertEqual
 
-    def test_basic_fts(self):  # {{{
+    def test_fts_tokenize(self):  # {{{
+        def t(x, s, e, f=0):
+            return {'text': x, 'start': s, 'end': e, 'flags': f}
+
+        self.ae(
+            tokenize("Some wörds"),
+            [t('some', 0, 4), t('wörds', 5, 11), t('words', 5, 11, 1)]
+        )
+    # }}}
+
+    def test_fts_basic(self):  # {{{
         conn = TestConn()
         conn.insert_text('two words, and a period. With another.')
         conn.insert_text('and another re-init')
@@ -60,6 +77,8 @@ class FTSTest(BaseTest):
         conn = TestConn()
         conn.insert_text('coộl')
         self.ae(conn.term_row_counts(), {'cool': 1, 'coộl': 1})
+        self.ae(conn.search("cool"), [('>coộl<',)])
+        self.ae(conn.search("coộl"), [('>coộl<',)])
         conn = TestConn(remove_diacritics=False)
         conn.insert_text('coộl')
         self.ae(conn.term_row_counts(), {'coộl': 1})
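
For context, here is a minimal sketch of how the new `tokenize` binding could be exercised from a Python prompt, mirroring the test helper in the diff. It assumes a calibre development environment where the compiled `calibre_extensions.sqlite_extension` module is importable; the expected output is taken from the `test_fts_tokenize` case above.

```python
# Sketch only: requires the built calibre_extensions.sqlite_extension module.
from calibre_extensions.sqlite_extension import (
    FTS5_TOKENIZE_DOCUMENT, FTS5_TOKEN_COLOCATED, tokenize,
)

# Signature per the PyArg_ParseTuple format "s#|pi":
# tokenize(text, remove_diacritics=True, flags=FTS5_TOKENIZE_DOCUMENT)
tokens = tokenize("Some wörds", True, FTS5_TOKENIZE_DOCUMENT)

for tok in tokens:
    # Each token is a dict with 'text', 'start', 'end' (UTF-8 byte offsets) and 'flags'.
    colocated = bool(tok['flags'] & FTS5_TOKEN_COLOCATED)
    print(tok['text'], tok['start'], tok['end'], 'colocated' if colocated else '')

# Per the test above, this yields 'some', 'wörds' and a colocated
# 'words' token emitted for the diacritic-folded form.
```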