A simple ASCII tokenizer to start with

Kovid Goyal 2021-06-15 11:44:31 +05:30
parent 268d1d991c
commit c819fcb870
2 changed files with 53 additions and 14 deletions


@@ -8,19 +8,44 @@
#define UNICODE
#include <Python.h>
#include <stdlib.h>
#include <new>
#include <string>
#include <locale>
#include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1
typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
class Tokenizer {
private:
std::string ascii_folded_buf;
int ascii_tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
int pos = 0;
while (pos < text_sz) {
/* Skip any leading divider characters. */
while (pos < text_sz && !std::isalnum(text[pos], std::locale::classic())) pos++;
if (pos >= text_sz) break;
ascii_folded_buf.clear();
int start_pos = pos;
while (pos < text_sz && std::isalnum(text[pos], std::locale::classic())) {
char ch = text[pos++];
if ('A' <= ch && ch <= 'Z') ch += 'a' - 'A';  /* fold ASCII upper-case to lower-case */
ascii_folded_buf.push_back(ch);
}
if (!ascii_folded_buf.empty()) {
int rc = callback(callback_ctx, 0, ascii_folded_buf.c_str(), ascii_folded_buf.size(), start_pos, start_pos + ascii_folded_buf.size());
if (rc != SQLITE_OK) return rc;
}
}
return SQLITE_OK;
}
public:
Tokenizer(const char **args, int nargs) {
Tokenizer(const char **args, int nargs) : ascii_folded_buf() {
ascii_folded_buf.reserve(128);
}
int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
return SQLITE_OK;
return ascii_tokenize(callback_ctx, flags, text, text_sz, callback);
}
};
@@ -40,11 +65,15 @@ fts5_api_from_db(sqlite3 *db, fts5_api **ppApi) {
static int
tok_create(void *sqlite3, const char **azArg, int nArg, Fts5Tokenizer **ppOut) {
int rc = SQLITE_OK;
Tokenizer *p = new (std::nothrow) Tokenizer(azArg, nArg);
if (p) *ppOut = reinterpret_cast<Fts5Tokenizer *>(p);
else rc = SQLITE_NOMEM;
return rc;
try {
Tokenizer *p = new Tokenizer(azArg, nArg);
*ppOut = reinterpret_cast<Fts5Tokenizer *>(p);
} catch (std::bad_alloc &ex) {
return SQLITE_NOMEM;
} catch (...) {
return SQLITE_ERROR;
}
return SQLITE_OK;
}
static int
@@ -87,7 +116,7 @@ calibre_sqlite_extension_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_ro
return SQLITE_ERROR;
}
fts5_tokenizer tok = {tok_create, tok_delete, tok_tokenize};
fts5api->xCreateTokenizer(fts5api, "calibre", reinterpret_cast<void *>(fts5api), &tok, NULL);
fts5api->xCreateTokenizer(fts5api, "unicode61", reinterpret_cast<void *>(fts5api), &tok, NULL);
return SQLITE_OK;
}
}
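
For illustration only, here is a minimal Python sketch of what the new ascii_tokenize() method does: skip runs of non-alphanumeric ASCII, lower-case A-Z while collecting each token, and report start/end offsets (for ASCII input these character offsets coincide with the byte offsets the C++ code passes to the callback). The sketch is not part of this commit; the helper names below are invented for the example.

import string

ASCII_ALNUM = set(string.ascii_letters + string.digits)

def ascii_tokenize(text):
    # Mirror of the C++ loop: split on non-alphanumeric ASCII and
    # fold A-Z to a-z, yielding (token, start, end) offsets.
    pos, n = 0, len(text)
    while pos < n:
        # Skip any leading divider characters.
        while pos < n and text[pos] not in ASCII_ALNUM:
            pos += 1
        if pos >= n:
            break
        start, token = pos, []
        while pos < n and text[pos] in ASCII_ALNUM:
            ch = text[pos]
            pos += 1
            if 'A' <= ch <= 'Z':
                ch = ch.lower()
            token.append(ch)
        if token:
            yield ''.join(token), start, pos

print(list(ascii_tokenize('two words, and a period. With another.')))
# -> [('two', 0, 3), ('words', 4, 9), ('and', 11, 14), ('a', 15, 16),
#     ('period', 17, 23), ('with', 25, 29), ('another', 30, 37)]
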


@@ -4,14 +4,24 @@
from calibre.db.tests.base import BaseTest
from apsw import Connection
from calibre.constants import plugins
class TestConn(Connection):
def __init__(self):
super().__init__(':memory:')
plugins.load_apsw_extension(self, 'sqlite_extension')
self.cursor().execute("CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 remove_diacritics 2')")
def insert_text(self, text):
self.cursor().execute('INSERT INTO fts_table(t) VALUES (?)', (text,))
class FTSTest(BaseTest):
def test_basic_fts(self): # {{{
from apsw import Connection
from calibre.constants import plugins
conn = Connection(':memory:')
plugins.load_apsw_extension(conn, 'sqlite_extension')
conn = TestConn()
conn.insert_text('two words, and a period. With another.')
# }}}
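
As a usage sketch (not part of this commit), the indexed text can then be searched with an ordinary FTS5 MATCH query; the snippet below builds on the TestConn helper above and assumes only standard FTS5/apsw behaviour. The search() helper is hypothetical, added here for illustration.

def search(conn, term):
    # FTS5 matches the query term against the tokens produced by the
    # tokenizer configured for fts_table.
    return [text for (text,) in conn.cursor().execute(
        'SELECT t FROM fts_table WHERE fts_table MATCH ?', (term,))]

conn = TestConn()
conn.insert_text('two words, and a period. With another.')
print(search(conn, 'period'))  # expected to return the inserted row
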