mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement the unicode61 tokenizer with ICU
Still have to implement removal of diacritics
This commit is contained in:
parent
c9c1029d02
commit
ab313c836f
@ -74,7 +74,10 @@
|
|||||||
{
|
{
|
||||||
"name": "sqlite_extension",
|
"name": "sqlite_extension",
|
||||||
"sources": "calibre/db/sqlite_extension.cpp",
|
"sources": "calibre/db/sqlite_extension.cpp",
|
||||||
"inc_dirs": "!sqlite_inc_dirs"
|
"libraries": "icudata icui18n icuuc icuio",
|
||||||
|
"windows_libraries": "icudt icuin icuuc icuio",
|
||||||
|
"lib_dirs": "!icu_lib_dirs",
|
||||||
|
"inc_dirs": "!icu_inc_dirs !sqlite_inc_dirs"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "lzx",
|
"name": "lzx",
|
||||||
|
@ -10,48 +10,138 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <vector>
|
||||||
#include <sqlite3ext.h>
|
#include <sqlite3ext.h>
|
||||||
|
#include <unicode/unistr.h>
|
||||||
|
#include <unicode/uchar.h>
|
||||||
SQLITE_EXTENSION_INIT1
|
SQLITE_EXTENSION_INIT1
|
||||||
|
|
||||||
typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
|
typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
|
||||||
|
|
||||||
|
|
||||||
|
// UTF-8 decoder taken from: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
//
// Layout of utf8_data:
//   [0, 256)    maps every possible input byte to a character class (0x0..0xb).
//   [256, 400)  is the DFA transition table, indexed as 256 + state*16 + class.
//               There are nine states (0..8), sixteen slots per state.
static const uint8_t utf8_data[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

// Decoder state. Only the two terminal states are named; while a multi-byte
// sequence is in progress the state holds one of the intermediate DFA values
// 2..8. The underlying type is fixed so that those intermediate values are
// valid values of the enumeration (without it, only 0 and 1 would be in range).
enum UTF8State : uint32_t { UTF8_ACCEPT = 0, UTF8_REJECT = 1 };

// Feed one byte to the UTF-8 decoder.
//
//   state  in/out: the decoder state; start with UTF8_ACCEPT.
//   codep  in/out: the code point accumulated so far; it is complete only
//          when the returned state is UTF8_ACCEPT.
//   byte   the next input byte.
//
// Returns the new state: UTF8_ACCEPT when a full code point has been decoded,
// UTF8_REJECT when the input is not valid UTF-8, any other value when more
// bytes are needed. Once rejected the state stays UTF8_REJECT until the
// caller resets *state to UTF8_ACCEPT.
uint32_t
decode_utf8(UTF8State* state, uint32_t* codep, uint8_t byte) {
    const uint32_t type = utf8_data[byte];

    // A lead byte contributes its low payload bits (the mask depends on the
    // byte class); a continuation byte contributes its low six bits.
    *codep = (*state != UTF8_ACCEPT) ?
        (byte & 0x3fu) | (*codep << 6) :
        (0xffu >> type) & byte;

    *state = static_cast<UTF8State>(utf8_data[256 + static_cast<uint32_t>(*state) * 16 + type]);
    return *state;
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
populate_icu_string(const char *text, int text_sz, icu::UnicodeString &str, std::vector<int> &byte_offsets) {
|
||||||
|
UTF8State state = UTF8_ACCEPT, prev = UTF8_ACCEPT;
|
||||||
|
uint32_t codep = 0;
|
||||||
|
for (int i = 0, pos = 0; i < text_sz; i++) {
|
||||||
|
switch(decode_utf8(&state, &codep, text[i])) {
|
||||||
|
case UTF8_ACCEPT: {
|
||||||
|
size_t sz = str.length();
|
||||||
|
str.append((UChar32)codep);
|
||||||
|
sz = str.length() - sz;
|
||||||
|
for (size_t x = 0; x < sz; x++) byte_offsets.push_back(pos);
|
||||||
|
pos = i + 1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case UTF8_REJECT:
|
||||||
|
state = UTF8_ACCEPT;
|
||||||
|
if (prev != UTF8_ACCEPT && i > 0) i--;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
prev = state;
|
||||||
|
}
|
||||||
|
byte_offsets.push_back(text_sz);
|
||||||
|
}
|
||||||
|
|
||||||
class Tokenizer {
|
class Tokenizer {
|
||||||
private:
|
private:
|
||||||
std::string ascii_folded_buf;
|
|
||||||
bool remove_diacritics;
|
bool remove_diacritics;
|
||||||
|
std::vector<int> byte_offsets;
|
||||||
|
token_callback_func current_callback;
|
||||||
|
void *current_callback_ctx;
|
||||||
|
std::string token_buf;
|
||||||
|
|
||||||
int ascii_tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
bool is_token_char(UChar32 ch) const {
|
||||||
int pos = 0;
|
switch(u_charType(ch)) {
|
||||||
while (pos < text_sz) {
|
case U_UPPERCASE_LETTER:
|
||||||
/* Skip any leading divider characters. */
|
case U_LOWERCASE_LETTER:
|
||||||
while (pos < text_sz && !std::isalnum(text[pos], std::locale::classic())) pos++;
|
case U_TITLECASE_LETTER:
|
||||||
if (pos >= text_sz) break;
|
case U_MODIFIER_LETTER:
|
||||||
ascii_folded_buf.clear();
|
case U_OTHER_LETTER:
|
||||||
int start_pos = pos;
|
case U_DECIMAL_DIGIT_NUMBER:
|
||||||
while (std::isalnum(text[pos], std::locale::classic())) {
|
case U_LETTER_NUMBER:
|
||||||
char ch = text[pos++];
|
case U_OTHER_NUMBER:
|
||||||
if ('A' <= ch && ch <= 'Z') ch += 'a' - 'A';
|
case U_PRIVATE_USE_CHAR:
|
||||||
ascii_folded_buf.push_back(ch);
|
return true;
|
||||||
}
|
default:
|
||||||
if (!ascii_folded_buf.empty()) {
|
return false;
|
||||||
int rc = callback(callback_ctx, 0, ascii_folded_buf.c_str(), ascii_folded_buf.size(), start_pos, start_pos + ascii_folded_buf.size());
|
|
||||||
if (rc != SQLITE_OK) return rc;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return SQLITE_OK;
|
|
||||||
|
int send_token(int32_t start_offset, int32_t end_offset, int flags = 0) {
|
||||||
|
return current_callback(current_callback_ctx, flags, token_buf.c_str(), token_buf.size(), byte_offsets[start_offset], byte_offsets[end_offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Tokenizer(const char **args, int nargs) : ascii_folded_buf(), remove_diacritics(false) {
|
Tokenizer(const char **args, int nargs) : remove_diacritics(true), byte_offsets(), token_buf() {
|
||||||
ascii_folded_buf.reserve(128);
|
|
||||||
for (int i = 0; i < nargs; i++) {
|
for (int i = 0; i < nargs; i++) {
|
||||||
if (strcmp(args[i], "remove_diacritics") == 0) {
|
if (strcmp(args[i], "remove_diacritics") == 0) {
|
||||||
remove_diacritics = true;
|
i++;
|
||||||
|
if (i < nargs && strcmp(args[i], "0") == 0) remove_diacritics = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
||||||
return ascii_tokenize(callback_ctx, flags, text, text_sz, callback);
|
current_callback = callback; current_callback_ctx = callback_ctx;
|
||||||
|
icu::UnicodeString str(text_sz, 0, 0);
|
||||||
|
byte_offsets.clear();
|
||||||
|
byte_offsets.reserve(text_sz + 8);
|
||||||
|
populate_icu_string(text, text_sz, str, byte_offsets);
|
||||||
|
str.foldCase(U_FOLD_CASE_DEFAULT);
|
||||||
|
int32_t offset = str.getChar32Start(0);
|
||||||
|
while (offset < str.length()) {
|
||||||
|
// soak up non-token chars
|
||||||
|
while (offset < str.length() && !is_token_char(str.char32At(offset))) offset = str.moveIndex32(offset, 1);
|
||||||
|
if (offset >= str.length()) break;
|
||||||
|
// get the length of the sequence of token chars
|
||||||
|
int32_t start_offset = offset;
|
||||||
|
while (offset < str.length() && is_token_char(str.char32At(offset))) offset = str.moveIndex32(offset, 1);
|
||||||
|
if (offset > start_offset) {
|
||||||
|
icu::UnicodeString token(str, start_offset, offset - start_offset);
|
||||||
|
token.foldCase(U_FOLD_CASE_DEFAULT);
|
||||||
|
token_buf.clear(); token_buf.reserve(4 * (offset - start_offset));
|
||||||
|
token.toUTF8String(token_buf);
|
||||||
|
int rc = send_token(start_offset, offset);
|
||||||
|
if (rc != SQLITE_OK) return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return SQLITE_OK;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -22,8 +22,7 @@ class TestConn(Connection):
|
|||||||
super().__init__(':memory:')
|
super().__init__(':memory:')
|
||||||
plugins.load_apsw_extension(self, 'sqlite_extension')
|
plugins.load_apsw_extension(self, 'sqlite_extension')
|
||||||
options = []
|
options = []
|
||||||
if remove_diacritics:
|
options.append('remove_diacritics'), options.append('2' if remove_diacritics else '0')
|
||||||
options.append('remove_diacritics'), options.append('2')
|
|
||||||
options = ' '.join(options)
|
options = ' '.join(options)
|
||||||
self.execute(f'''
|
self.execute(f'''
|
||||||
CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 {options}');
|
CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 {options}');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user