Add currency and other symbols to allowed token characters

This commit is contained in:
Kovid Goyal 2021-06-18 21:04:31 +05:30
parent 2cf31be2ba
commit 6ef1ec1656
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 20 additions and 2 deletions

View File

@@ -104,7 +104,6 @@ struct char_cmp {
typedef std::unique_ptr<icu::BreakIterator> BreakIterator;
class Tokenizer {
private:
bool remove_diacritics;
@@ -125,11 +124,14 @@ private:
case U_DECIMAL_DIGIT_NUMBER:
case U_LETTER_NUMBER:
case U_OTHER_NUMBER:
case U_CURRENCY_SYMBOL:
case U_OTHER_SYMBOL:
case U_PRIVATE_USE_CHAR:
return true;
default:
return false;
break;;
}
return false;
}
int send_token(const icu::UnicodeString &token, int32_t start_offset, int32_t end_offset, int flags = 0) {

View File

@@ -65,6 +65,22 @@ class FTSTest(BaseTest):
tokenize("Some wörds"),
[t('some', 0, 4), t('wörds', 5, 11), t('words', 5, 11, 1)]
)
self.ae(
tokenize("don't 'bug'"),
[t("don't", 0, 5), t('bug', 7, 10)]
)
self.ae(
tokenize("a,b. c"),
[t("a", 0, 1), t('b', 2, 3), t('c', 5, 6)]
)
self.ae(
tokenize("a*b+c"),
[t("a", 0, 1), t('b', 2, 3), t('c', 4, 5)]
)
self.ae(
tokenize("a😀smile"),
[t("a", 0, 1), t('😀', 1, 5), t('smile', 5, 10)]
)
# }}}
def test_fts_basic(self): # {{{