From c819fcb8704c9f43a06a481f90073e19d235a3ad Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 15 Jun 2021 11:44:31 +0530
Subject: [PATCH] A simple ASCII tokenizer to start with

---
Review notes (text between the "---" line and the first "diff --git" line is
commentary; it is not part of the commit message or of the change itself):

* Tokenizer::ascii_tokenize(): the loop that collects a token now tests
  pos < text_sz before reading text[pos]. The only bound the function is
  given is text_sz, so text ending in an alphanumeric byte made the previous
  loop read past the end of the buffer.
* tok_create(): std::bad_alloc is caught by const reference and the unused
  exception variable name is dropped.
* The std::string::size() values handed to the token callback are cast to
  int explicitly, matching the callback's int parameters.
* All three changes only alter text on existing "+" lines, so hunk line
  counts and the diffstat are unchanged. The commit id in the first line and
  the post-image blob ids on the "index" lines describe the unrevised commit.
* NOTE(review): the copy this was rebuilt from had lost its line breaks,
  indentation, blank lines and every <...> span. Indentation and blank
  context lines were placed to satisfy the hunk counts - confirm them
  against the tree before applying. <...> spans were restored only where
  the visible code determines them: the two reinterpret_cast targets, the
  headers for std::nothrow / std::string / std::locale, and the header that
  provides SQLITE_EXTENSION_INIT1. The two context #include lines at the top
  of the first hunk and the author address could not be recovered and are
  left as found.

 src/calibre/db/sqlite_extension.cpp | 47 +++++++++++++++++++++++------
 src/calibre/db/tests/fts.py         | 20 +++++++++---
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/src/calibre/db/sqlite_extension.cpp b/src/calibre/db/sqlite_extension.cpp
index 86fdbc33fc..1c55e6c99f 100644
--- a/src/calibre/db/sqlite_extension.cpp
+++ b/src/calibre/db/sqlite_extension.cpp
@@ -8,19 +8,44 @@
 #define UNICODE
 #include
 #include
-#include <new>
+#include <string>
+#include <locale>
 #include <sqlite3ext.h>
 SQLITE_EXTENSION_INIT1
 
 typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
 
 class Tokenizer {
+private:
+    std::string ascii_folded_buf;  // reused for every token; holds the lower-cased token text
+
+    int ascii_tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
+        int pos = 0;
+        while (pos < text_sz) {
+            /* Skip any leading divider characters. */
+            while (pos < text_sz && !std::isalnum(text[pos], std::locale::classic())) pos++;
+            if (pos >= text_sz) break;
+            ascii_folded_buf.clear();
+            int start_pos = pos;
+            while (pos < text_sz && std::isalnum(text[pos], std::locale::classic())) {  // never read beyond text_sz
+                char ch = text[pos++];
+                if ('A' <= ch && ch <= 'Z') ch += 'a' - 'A';
+                ascii_folded_buf.push_back(ch);
+            }
+            if (!ascii_folded_buf.empty()) {
+                int rc = callback(callback_ctx, 0, ascii_folded_buf.c_str(), static_cast<int>(ascii_folded_buf.size()), start_pos, start_pos + static_cast<int>(ascii_folded_buf.size()));
+                if (rc != SQLITE_OK) return rc;  // stop at the first non-OK result and hand it back
+            }
+        }
+        return SQLITE_OK;
+    }
 public:
-    Tokenizer(const char **args, int nargs) {
+    Tokenizer(const char **args, int nargs) : ascii_folded_buf() {
+        ascii_folded_buf.reserve(128);
     }
 
     int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
-        return SQLITE_OK;
+        return ascii_tokenize(callback_ctx, flags, text, text_sz, callback);
     }
 };
 
@@ -40,11 +65,15 @@ fts5_api_from_db(sqlite3 *db, fts5_api **ppApi) {
 
 static int
 tok_create(void *sqlite3, const char **azArg, int nArg, Fts5Tokenizer **ppOut) {
-    int rc = SQLITE_OK;
-    Tokenizer *p = new (std::nothrow) Tokenizer(azArg, nArg);
-    if (p) *ppOut = reinterpret_cast<Fts5Tokenizer *>(p);
-    else rc = SQLITE_NOMEM;
-    return rc;
+    try {
+        Tokenizer *p = new Tokenizer(azArg, nArg);
+        *ppOut = reinterpret_cast<Fts5Tokenizer *>(p);
+    } catch (const std::bad_alloc &) {
+        return SQLITE_NOMEM;
+    } catch (...) {
+        return SQLITE_ERROR;
+    }
+    return SQLITE_OK;
 }
 
 static int
@@ -87,7 +116,7 @@ calibre_sqlite_extension_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_ro
         return SQLITE_ERROR;
     }
     fts5_tokenizer tok = {tok_create, tok_delete, tok_tokenize};
-    fts5api->xCreateTokenizer(fts5api, "calibre", reinterpret_cast<void *>(fts5api), &tok, NULL);
+    fts5api->xCreateTokenizer(fts5api, "unicode61", reinterpret_cast<void *>(fts5api), &tok, NULL);
     return SQLITE_OK;
 }
 }
diff --git a/src/calibre/db/tests/fts.py b/src/calibre/db/tests/fts.py
index 6ebb8d02e4..a6f5e1e2c1 100644
--- a/src/calibre/db/tests/fts.py
+++ b/src/calibre/db/tests/fts.py
@@ -4,14 +4,24 @@
 
 
 from calibre.db.tests.base import BaseTest
+from apsw import Connection
+from calibre.constants import plugins
+
+
+class TestConn(Connection):
+
+    def __init__(self):
+        super().__init__(':memory:')
+        plugins.load_apsw_extension(self, 'sqlite_extension')
+        self.cursor().execute("CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 remove_diacritics 2')")
+
+    def insert_text(self, text):
+        self.cursor().execute('INSERT INTO fts_table(t) VALUES (?)', (text,))
 
 
 class FTSTest(BaseTest):
 
     def test_basic_fts(self): # {{{
-        from apsw import Connection
-        from calibre.constants import plugins
-
-        conn = Connection(':memory:')
-        plugins.load_apsw_extension(conn, 'sqlite_extension')
+        conn = TestConn()
+        conn.insert_text('two words, and a period. With another.')
     # }}}