This commit is contained in:
Kovid Goyal 2020-01-06 17:52:29 +05:30
parent 15a0112b47
commit 1eba328f26
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -14,37 +14,36 @@ _iterators = {}
_lock = Lock() _lock = Lock()
def split_into_words(text, lang='en'): def get_iterator(lang):
with _lock: it = _iterators.get(lang)
it = _iterators.get(lang, None)
if it is None: if it is None:
it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang) it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
return it
def split_into_words(text, lang='en'):
with _lock:
it = get_iterator(lang)
it.set_text(text) it.set_text(text)
return [text[p:p+s] for p, s in it.split2()] return [text[p:p+s] for p, s in it.split2()]
def split_into_words_and_positions(text, lang='en'): def split_into_words_and_positions(text, lang='en'):
with _lock: with _lock:
it = _iterators.get(lang, None) it = get_iterator(lang)
if it is None:
it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
it.set_text(text) it.set_text(text)
return it.split2() return it.split2()
def index_of(needle, haystack, lang='en'): def index_of(needle, haystack, lang='en'):
with _lock: with _lock:
it = _iterators.get(lang, None) it = get_iterator(lang)
if it is None:
it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
it.set_text(haystack) it.set_text(haystack)
return it.index(needle) return it.index(needle)
def count_words(text, lang='en'): def count_words(text, lang='en'):
with _lock: with _lock:
it = _iterators.get(lang, None) it = get_iterator(lang)
if it is None:
it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
it.set_text(text) it.set_text(text)
return it.count_words() return it.count_words()