mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
A simple ASCII tokenizer to start with
This commit is contained in:
parent
268d1d991c
commit
c819fcb870
@ -8,19 +8,44 @@
|
||||
#define UNICODE
|
||||
#include <Python.h>
|
||||
#include <stdlib.h>
|
||||
#include <new>
|
||||
#include <string>
|
||||
#include <locale>
|
||||
#include <sqlite3ext.h>
|
||||
SQLITE_EXTENSION_INIT1
|
||||
|
||||
typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
|
||||
|
||||
class Tokenizer {
|
||||
private:
|
||||
std::string ascii_folded_buf;
|
||||
|
||||
int ascii_tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
||||
int pos = 0;
|
||||
while (pos < text_sz) {
|
||||
/* Skip any leading divider characters. */
|
||||
while (pos < text_sz && !std::isalnum(text[pos], std::locale::classic())) pos++;
|
||||
if (pos >= text_sz) break;
|
||||
ascii_folded_buf.clear();
|
||||
int start_pos = pos;
|
||||
while (std::isalnum(text[pos], std::locale::classic())) {
|
||||
char ch = text[pos++];
|
||||
if ('A' <= ch && ch <= 'Z') ch += 'a' - 'A';
|
||||
ascii_folded_buf.push_back(ch);
|
||||
}
|
||||
if (!ascii_folded_buf.empty()) {
|
||||
int rc = callback(callback_ctx, 0, ascii_folded_buf.c_str(), ascii_folded_buf.size(), start_pos, start_pos + ascii_folded_buf.size());
|
||||
if (rc != SQLITE_OK) return rc;
|
||||
}
|
||||
}
|
||||
return SQLITE_OK;
|
||||
}
|
||||
public:
|
||||
Tokenizer(const char **args, int nargs) {
|
||||
Tokenizer(const char **args, int nargs) : ascii_folded_buf() {
|
||||
ascii_folded_buf.reserve(128);
|
||||
}
|
||||
|
||||
int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
||||
return SQLITE_OK;
|
||||
return ascii_tokenize(callback_ctx, flags, text, text_sz, callback);
|
||||
}
|
||||
};
|
||||
|
||||
@ -40,11 +65,15 @@ fts5_api_from_db(sqlite3 *db, fts5_api **ppApi) {
|
||||
|
||||
static int
|
||||
tok_create(void *sqlite3, const char **azArg, int nArg, Fts5Tokenizer **ppOut) {
|
||||
int rc = SQLITE_OK;
|
||||
Tokenizer *p = new (std::nothrow) Tokenizer(azArg, nArg);
|
||||
if (p) *ppOut = reinterpret_cast<Fts5Tokenizer *>(p);
|
||||
else rc = SQLITE_NOMEM;
|
||||
return rc;
|
||||
try {
|
||||
Tokenizer *p = new Tokenizer(azArg, nArg);
|
||||
*ppOut = reinterpret_cast<Fts5Tokenizer *>(p);
|
||||
} catch (std::bad_alloc &ex) {
|
||||
return SQLITE_NOMEM;
|
||||
} catch (...) {
|
||||
return SQLITE_ERROR;
|
||||
}
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int
|
||||
@ -87,7 +116,7 @@ calibre_sqlite_extension_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_ro
|
||||
return SQLITE_ERROR;
|
||||
}
|
||||
fts5_tokenizer tok = {tok_create, tok_delete, tok_tokenize};
|
||||
fts5api->xCreateTokenizer(fts5api, "calibre", reinterpret_cast<void *>(fts5api), &tok, NULL);
|
||||
fts5api->xCreateTokenizer(fts5api, "unicode61", reinterpret_cast<void *>(fts5api), &tok, NULL);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
}
|
||||
|
@ -4,14 +4,24 @@
|
||||
|
||||
|
||||
from calibre.db.tests.base import BaseTest
|
||||
from apsw import Connection
|
||||
from calibre.constants import plugins
|
||||
|
||||
|
||||
class TestConn(Connection):
    """In-memory connection with the sqlite extension loaded and an FTS5
    virtual table named ``fts_table`` (single column ``t``) created."""

    def __init__(self):
        super().__init__(':memory:')
        # The extension must be loaded before the virtual table is created.
        plugins.load_apsw_extension(self, 'sqlite_extension')
        create_table_sql = "CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 remove_diacritics 2')"
        cur = self.cursor()
        cur.execute(create_table_sql)

    def insert_text(self, text):
        """Insert ``text`` as a new row of ``fts_table``."""
        params = (text,)
        self.cursor().execute('INSERT INTO fts_table(t) VALUES (?)', params)
|
||||
|
||||
|
||||
class FTSTest(BaseTest):
    """Tests for full text search using the sqlite extension."""

    def test_basic_fts(self):  # {{{
        # TestConn loads the extension and creates the FTS table itself, so
        # no separate raw connection or function-level imports are needed.
        conn = TestConn()
        conn.insert_text('two words, and a period. With another.')
    # }}}
|
||||
|
Loading…
x
Reference in New Issue
Block a user