From ab313c836f0f3664249f14784f6a53824d682660 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 16 Jun 2021 12:51:43 +0530 Subject: [PATCH] Implement the unicode61 tokenizer with ICU Still have to implement removal of diacritics --- setup/extensions.json | 5 +- src/calibre/db/sqlite_extension.cpp | 136 +++++++++++++++++++++++----- src/calibre/db/tests/fts.py | 3 +- 3 files changed, 118 insertions(+), 26 deletions(-) diff --git a/setup/extensions.json b/setup/extensions.json index c9e5305227..511898196a 100644 --- a/setup/extensions.json +++ b/setup/extensions.json @@ -74,7 +74,10 @@ { "name": "sqlite_extension", "sources": "calibre/db/sqlite_extension.cpp", - "inc_dirs": "!sqlite_inc_dirs" + "libraries": "icudata icui18n icuuc icuio", + "windows_libraries": "icudt icuin icuuc icuio", + "lib_dirs": "!icu_lib_dirs", + "inc_dirs": "!icu_inc_dirs !sqlite_inc_dirs" }, { "name": "lzx", diff --git a/src/calibre/db/sqlite_extension.cpp b/src/calibre/db/sqlite_extension.cpp index 8729fb38e7..e31ae5e843 100644 --- a/src/calibre/db/sqlite_extension.cpp +++ b/src/calibre/db/sqlite_extension.cpp @@ -10,48 +10,138 @@ #include #include #include +#include #include +#include +#include SQLITE_EXTENSION_INIT1 typedef int (*token_callback_func)(void *, int, const char *, int, int, int); + +// UTF-8 decode taken from: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + +static const uint8_t utf8_data[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + + +typedef enum UTF8State { UTF8_ACCEPT = 0, UTF8_REJECT = 1} UTF8State; + +uint32_t +decode_utf8(UTF8State* state, uint32_t* codep, uint8_t byte) { + uint32_t type = utf8_data[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = (UTF8State) utf8_data[256 + *state*16 + type]; + return *state; +} + + +static void +populate_icu_string(const char *text, int text_sz, icu::UnicodeString &str, std::vector &byte_offsets) { + UTF8State state = UTF8_ACCEPT, prev = UTF8_ACCEPT; + uint32_t codep = 0; + for (int i = 0, pos = 0; i < text_sz; i++) { + switch(decode_utf8(&state, &codep, text[i])) { + case UTF8_ACCEPT: { + size_t sz = str.length(); + str.append((UChar32)codep); + sz = str.length() - sz; + for (size_t x = 0; x < sz; x++) byte_offsets.push_back(pos); // one entry per UTF-16 unit appended, all pointing at the first byte of this code point + pos = i + 1; + } + break; + case UTF8_REJECT: + state = UTF8_ACCEPT; // malformed input: drop it and restart decoding + if (prev != UTF8_ACCEPT && i > 0) i--; // a byte that broke a multi-byte sequence is re-examined as a possible lead byte + pos = i + 1; break; // dropped bytes must not be counted as part of the next code point + } + prev = state; + } + byte_offsets.push_back(text_sz); +} + class Tokenizer { private: - std::string ascii_folded_buf; bool remove_diacritics; + std::vector byte_offsets; + token_callback_func current_callback; + void *current_callback_ctx; + std::string token_buf; - int ascii_tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) { - int pos = 0; - while (pos < text_sz) { - /* Skip any leading divider characters. 
*/ - while (pos < text_sz && !std::isalnum(text[pos], std::locale::classic())) pos++; - if (pos >= text_sz) break; - ascii_folded_buf.clear(); - int start_pos = pos; - while (std::isalnum(text[pos], std::locale::classic())) { - char ch = text[pos++]; - if ('A' <= ch && ch <= 'Z') ch += 'a' - 'A'; - ascii_folded_buf.push_back(ch); - } - if (!ascii_folded_buf.empty()) { - int rc = callback(callback_ctx, 0, ascii_folded_buf.c_str(), ascii_folded_buf.size(), start_pos, start_pos + ascii_folded_buf.size()); - if (rc != SQLITE_OK) return rc; - } + bool is_token_char(UChar32 ch) const { // letters, numbers and private-use code points are token characters; every other category separates tokens + switch(u_charType(ch)) { + case U_UPPERCASE_LETTER: + case U_LOWERCASE_LETTER: + case U_TITLECASE_LETTER: + case U_MODIFIER_LETTER: + case U_OTHER_LETTER: + case U_DECIMAL_DIGIT_NUMBER: + case U_LETTER_NUMBER: + case U_OTHER_NUMBER: + case U_PRIVATE_USE_CHAR: + return true; + default: + return false; } - return SQLITE_OK; } + + int send_token(int32_t start_offset, int32_t end_offset, int flags = 0) { // offsets index byte_offsets, which holds the input byte position for each unit of the ICU string + return current_callback(current_callback_ctx, flags, token_buf.c_str(), token_buf.size(), byte_offsets[start_offset], byte_offsets[end_offset]); + } + public: - Tokenizer(const char **args, int nargs) : ascii_folded_buf(), remove_diacritics(false) { - ascii_folded_buf.reserve(128); + Tokenizer(const char **args, int nargs) : remove_diacritics(true), byte_offsets(), current_callback(NULL), current_callback_ctx(NULL), token_buf() { for (int i = 0; i < nargs; i++) { if (strcmp(args[i], "remove_diacritics") == 0) { - remove_diacritics = true; + i++; + if (i < nargs && strcmp(args[i], "0") == 0) remove_diacritics = false; } } } int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) { - return ascii_tokenize(callback_ctx, flags, text, text_sz, callback); + current_callback = callback; current_callback_ctx = callback_ctx; + icu::UnicodeString str(text_sz, 0, 0); + byte_offsets.clear(); + byte_offsets.reserve(text_sz + 8); + populate_icu_string(text, text_sz, str, byte_offsets); + 
// str is deliberately left unfolded: byte_offsets is indexed by positions in the unfolded string and folding can change its length; each token is folded below + int32_t offset = str.getChar32Start(0); + while (offset < str.length()) { + // soak up non-token chars + while (offset < str.length() && !is_token_char(str.char32At(offset))) offset = str.moveIndex32(offset, 1); + if (offset >= str.length()) break; + // get the length of the sequence of token chars + int32_t start_offset = offset; + while (offset < str.length() && is_token_char(str.char32At(offset))) offset = str.moveIndex32(offset, 1); + if (offset > start_offset) { + icu::UnicodeString token(str, start_offset, offset - start_offset); + token.foldCase(U_FOLD_CASE_DEFAULT); + token_buf.clear(); token_buf.reserve(4 * (offset - start_offset)); + token.toUTF8String(token_buf); + int rc = send_token(start_offset, offset); + if (rc != SQLITE_OK) return rc; + } + } + return SQLITE_OK; } }; diff --git a/src/calibre/db/tests/fts.py b/src/calibre/db/tests/fts.py index ec985e4525..322d4961cb 100644 --- a/src/calibre/db/tests/fts.py +++ b/src/calibre/db/tests/fts.py @@ -22,8 +22,7 @@ class TestConn(Connection): super().__init__(':memory:') plugins.load_apsw_extension(self, 'sqlite_extension') options = [] - if remove_diacritics: - options.append('remove_diacritics'), options.append('2') + options.append('remove_diacritics'), options.append('2' if remove_diacritics else '0') options = ' '.join(options) self.execute(f''' CREATE VIRTUAL TABLE fts_table USING fts5(t, tokenize = 'unicode61 {options}');