mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Add a UI language based iterator
This commit is contained in:
parent
06d34a2df9
commit
4127117e8a
@ -23,6 +23,7 @@ SQLITE_EXTENSION_INIT1
|
|||||||
typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
|
typedef int (*token_callback_func)(void *, int, const char *, int, int, int);
|
||||||
|
|
||||||
|
|
||||||
|
// Converting SQLITE text to ICU strings {{{
|
||||||
// UTF-8 decode taken from: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
// UTF-8 decode taken from: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||||
|
|
||||||
static const uint8_t utf8_data[] = {
|
static const uint8_t utf8_data[] = {
|
||||||
@ -81,14 +82,19 @@ populate_icu_string(const char *text, int text_sz, icu::UnicodeString &str, std:
|
|||||||
}
|
}
|
||||||
byte_offsets.push_back(text_sz);
|
byte_offsets.push_back(text_sz);
|
||||||
}
|
}
|
||||||
|
// }}}
|
||||||
|
|
||||||
|
static char ui_language[16] = {0};
|
||||||
|
|
||||||
class Tokenizer {
|
class Tokenizer {
|
||||||
private:
|
private:
|
||||||
icu::Transliterator *diacritics_remover;
|
bool remove_diacritics;
|
||||||
|
std::unique_ptr<icu::Transliterator> diacritics_remover;
|
||||||
std::vector<int> byte_offsets;
|
std::vector<int> byte_offsets;
|
||||||
std::string token_buf;
|
std::string token_buf, current_ui_language;
|
||||||
token_callback_func current_callback;
|
token_callback_func current_callback;
|
||||||
void *current_callback_ctx;
|
void *current_callback_ctx;
|
||||||
|
std::unique_ptr<icu::BreakIterator> basic_word_iterator;
|
||||||
|
|
||||||
bool is_token_char(UChar32 ch) const {
|
bool is_token_char(UChar32 ch) const {
|
||||||
switch(u_charType(ch)) {
|
switch(u_charType(ch)) {
|
||||||
@ -113,14 +119,31 @@ private:
|
|||||||
return current_callback(current_callback_ctx, flags, token_buf.c_str(), (int)token_buf.size(), byte_offsets[start_offset], byte_offsets[end_offset]);
|
return current_callback(current_callback_ctx, flags, token_buf.c_str(), (int)token_buf.size(), byte_offsets[start_offset], byte_offsets[end_offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ensure_basic_iterator(void) {
|
||||||
|
if (current_ui_language != ui_language || !basic_word_iterator) {
|
||||||
|
current_ui_language.clear(); current_ui_language = ui_language;
|
||||||
|
const icu::Locale locale = icu::Locale::getDefault();
|
||||||
|
icu::ErrorCode status;
|
||||||
|
if (current_ui_language.empty()) {
|
||||||
|
basic_word_iterator.reset(icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status));
|
||||||
|
} else {
|
||||||
|
basic_word_iterator.reset(icu::BreakIterator::createWordInstance(icu::Locale::createCanonical(ui_language), status));
|
||||||
|
if (status.isFailure()) {
|
||||||
|
basic_word_iterator.reset(icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
int constructor_error;
|
int constructor_error;
|
||||||
Tokenizer(const char **args, int nargs) :
|
Tokenizer(const char **args, int nargs) :
|
||||||
diacritics_remover(NULL),
|
remove_diacritics(true), diacritics_remover(),
|
||||||
byte_offsets(), token_buf(),
|
byte_offsets(), token_buf(), current_ui_language(ui_language),
|
||||||
current_callback(NULL), current_callback_ctx(NULL), constructor_error(SQLITE_OK)
|
current_callback(NULL), current_callback_ctx(NULL), basic_word_iterator(),
|
||||||
|
|
||||||
|
constructor_error(SQLITE_OK)
|
||||||
{
|
{
|
||||||
bool remove_diacritics = true;
|
|
||||||
for (int i = 0; i < nargs; i++) {
|
for (int i = 0; i < nargs; i++) {
|
||||||
if (strcmp(args[i], "remove_diacritics") == 0) {
|
if (strcmp(args[i], "remove_diacritics") == 0) {
|
||||||
i++;
|
i++;
|
||||||
@ -129,19 +152,18 @@ public:
|
|||||||
}
|
}
|
||||||
if (remove_diacritics) {
|
if (remove_diacritics) {
|
||||||
icu::ErrorCode status;
|
icu::ErrorCode status;
|
||||||
diacritics_remover = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
|
diacritics_remover.reset(icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status));
|
||||||
if (status.isFailure()) {
|
if (status.isFailure()) {
|
||||||
fprintf(stderr, "Failed to create ICU transliterator to remove diacritics with error: %s\n", status.errorName());
|
fprintf(stderr, "Failed to create ICU transliterator to remove diacritics with error: %s\n", status.errorName());
|
||||||
constructor_error = SQLITE_INTERNAL;
|
constructor_error = SQLITE_INTERNAL;
|
||||||
|
diacritics_remover.reset(NULL);
|
||||||
|
remove_diacritics = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
~Tokenizer() {
|
|
||||||
if (diacritics_remover) icu::Transliterator::unregister(diacritics_remover->getID());
|
|
||||||
diacritics_remover = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
int tokenize(void *callback_ctx, int flags, const char *text, int text_sz, token_callback_func callback) {
|
||||||
|
ensure_basic_iterator();
|
||||||
current_callback = callback; current_callback_ctx = callback_ctx;
|
current_callback = callback; current_callback_ctx = callback_ctx;
|
||||||
icu::UnicodeString str(text_sz, 0, 0);
|
icu::UnicodeString str(text_sz, 0, 0);
|
||||||
byte_offsets.clear();
|
byte_offsets.clear();
|
||||||
@ -162,7 +184,7 @@ public:
|
|||||||
icu::UnicodeString token(str, start_offset, offset - start_offset);
|
icu::UnicodeString token(str, start_offset, offset - start_offset);
|
||||||
token.foldCase(U_FOLD_CASE_DEFAULT);
|
token.foldCase(U_FOLD_CASE_DEFAULT);
|
||||||
if ((rc = send_token(token, start_offset, offset)) != SQLITE_OK) return rc;
|
if ((rc = send_token(token, start_offset, offset)) != SQLITE_OK) return rc;
|
||||||
if (!for_query && diacritics_remover) {
|
if (!for_query && remove_diacritics) {
|
||||||
icu::UnicodeString tt(token);
|
icu::UnicodeString tt(token);
|
||||||
diacritics_remover->transliterate(tt);
|
diacritics_remover->transliterate(tt);
|
||||||
if (tt != token) {
|
if (tt != token) {
|
||||||
@ -273,10 +295,21 @@ get_locales_for_break_iteration(PyObject *self, PyObject *args) {
|
|||||||
return ans.detach();
|
return ans.detach();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Python entry point: set_ui_language(lang)
//
// Copies the given string into the file-level ui_language buffer. At most
// sizeof(ui_language) - 1 bytes are copied, so longer values are truncated;
// the last byte of the buffer is never written by this function.
// Returns None on success, or NULL (with the Python error set by
// PyArg_ParseTuple) when the argument is not a single string.
static PyObject*
set_ui_language(PyObject *self, PyObject *args) {
    const char *lang = NULL;
    if (PyArg_ParseTuple(args, "s", &lang)) {
        strncpy(ui_language, lang, sizeof(ui_language) - 1);
        Py_RETURN_NONE;
    }
    return NULL;
}
|
||||||
|
|
||||||
static PyMethodDef methods[] = {
|
static PyMethodDef methods[] = {
|
||||||
{"get_locales_for_break_iteration", get_locales_for_break_iteration, METH_NOARGS,
|
{"get_locales_for_break_iteration", get_locales_for_break_iteration, METH_NOARGS,
|
||||||
"Get list of available locales for break iteration"
|
"Get list of available locales for break iteration"
|
||||||
},
|
},
|
||||||
|
{"set_ui_language", set_ui_language, METH_VARARGS,
|
||||||
|
"Set the current UI language"
|
||||||
|
},
|
||||||
{NULL, NULL, 0, NULL}
|
{NULL, NULL, 0, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user