diff --git a/src/calibre/spell/dictionary.py b/src/calibre/spell/dictionary.py index 3b2b55a288..e948dab824 100644 --- a/src/calibre/spell/dictionary.py +++ b/src/calibre/spell/dictionary.py @@ -158,6 +158,7 @@ class Dictionaries(object): def __init__(self): self.remove_hyphenation = re.compile('[\u2010-]+') + self.negative_pat = re.compile('-[.\d+]') self.dictionaries = {} self.word_cache = {} self.ignored_words = set() @@ -327,6 +328,8 @@ class Dictionaries(object): pass else: ans = True + if ans is False and self.negative_pat.match(word) is not None: + ans = True self.word_cache[key] = ans return ans diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 6141fd668d..8169d29822 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -589,6 +589,8 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) { } // }}} +#define IS_HYPHEN_CHAR(x) ((x) == 0x2d || (x) == 0x2010) + // BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { @@ -596,37 +598,53 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { #error Not implemented for python >= 3.3 #endif - UChar *buf = NULL; - int32_t prev = 0, p = 0, sz = 0, ans = -1; + UChar *buf = NULL, *needle = NULL; + int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0; buf = python_to_icu(token, &sz, 1); if (buf == NULL) return NULL; if (sz < 1) goto end; + needle = buf; + if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; } + if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { - prev = p; p = ubrk_next(self->break_iterator); + word_start = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word - if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { - // Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries - if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary - if( - ubrk_isBoundary(self->break_iterator, prev + sz) && - (self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010)) - ) { - ans = prev; break; // Found word surrounded by non-hyphen boundaries - } - if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary + + if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) { + if (word_start > 0 && ( + (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) || + (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1])) + )) continue; + if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue; + + if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; } + + if ( + // Check that the found word is followed by a word boundary + ubrk_isBoundary(self->break_iterator, word_start + sz) && + // If there is a leading hyphen check that the leading + // hyphen is preceded by a word boundary + (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) && + // Check that there is a word boundary *after* the trailing + // hyphen. We cannot rely on ubrk_isBoundary() as that + // always returns true because of the trailing hyphen. + (!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) + ) { ans = word_start; break; } + + if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary() } } + if (leading_hyphen && ans > -1) ans -= 1; #ifdef Py_UNICODE_WIDE if (ans > 0) ans = u_countChar32(self->text, ans); #endif Py_END_ALLOW_THREADS; - end: free(buf); return Py_BuildValue("l", (long)ans); @@ -640,8 +658,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { #error Not implemented for python >= 3.3 #endif - int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0; - int is_hyphen_sep = 0; + int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0; + int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0; UChar sep = 0; PyObject *ans = NULL, *temp = NULL, *t = NULL; @@ -650,26 +668,31 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { - prev = p; p = ubrk_next(self->break_iterator); + word_start = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word - sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; + sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start; if (sz > 0) { // ICU breaks on words containing hyphens, we do not want that, so we recombine manually - is_hyphen_sep = 0; - if (last_pos > 0) { - if (prev - last_pos == 1) { - sep = *(self->text + last_pos); - if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1; + is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0; + if (word_start > 0) { // Look for a leading hyphen + sep = *(self->text + word_start - 1); + if (IS_HYPHEN_CHAR(sep)) { + leading_hyphen = 1; + if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1; } } + if (word_start + sz < self->text_len) { // Look for a trailing hyphen + sep = *(self->text + word_start + sz); + if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1; + } last_pos = p; #ifdef Py_UNICODE_WIDE - sz = u_countChar32(self->text + prev, sz); - prev = u_countChar32(self->text, prev); + sz = u_countChar32(self->text + word_start, sz); + word_start = u_countChar32(self->text, word_start); #endif if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) { - sz = last_sz + sz + 1; + sz = last_sz + sz + trailing_hyphen; last_sz = sz; t = PyInt_FromLong((long)sz); if (t == NULL) { Py_DECREF(ans); ans = NULL; break; } @@ -677,8 +700,9 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { Py_DECREF(PyTuple_GET_ITEM(temp, 1)); PyTuple_SET_ITEM(temp, 1, t); } else { + sz += leading_hyphen + trailing_hyphen; last_sz = sz; - temp = Py_BuildValue("ll", (long)prev, (long)sz); + temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz); if (temp == NULL) { Py_DECREF(ans); ans = NULL; break; } diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index c35612ae5e..9f8edbb715 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -156,7 +156,9 @@ class TestICU(unittest.TestCase): self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q) self.ae(split(u'I I\'m'), ['I', "I'm"]) self.ae(split(u'out-of-the-box'), ['out-of-the-box']) - self.ae(split(u'-one two-'), ['one', 'two']) + self.ae(split(u'-one two-'), ['-one', 'two-']) + self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e']) + self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e']) self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)]) for needle, haystack, pos in ( ('word', 'a word b', 2), @@ -168,11 +170,26 @@ class TestICU(unittest.TestCase): ('one-two', 'one-two-three one-two', 14), ('one', 'onet one', 5), ('two', 'one-two two', 8), + ('two', 'two-one two', 8), + ('-two', 'one-two -two', 8), + ('-two', 'two', -1), ('i', 'i', 0), ('i', 'six i', 4), ('i', '', -1), ('', '', -1), ('', 'i', -1), ('i', 'six clicks', -1), ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)), + ('-a', 'b -a', 2), + ('a-', 'a-b a- d', 4), + ('-a-', 'b -a -a-', 5), + ('-a-', '-a-', 0), + ('-a-', 'a-', -1), + ('-a-', '-a', -1), + ('-a-', 'a', -1), + ('a-', 'a-', 0), + ('-a', '-a', 0), + ('a-b-c-', 'a-b-c-d', -1), + ('a-b-c-', 'a-b-c-.', 0), + ('a-b-c-', 'a-b-c-d a-b-c- d', 8), ): fpos = index_of(needle, haystack) self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))