Edit Book: Fix leading or trailing hyphens on words being ignored when spell checking. Fixes #1370288 [Spell Check will not goto all misspelled words](https://bugs.launchpad.net/calibre/+bug/1370288)

This commit is contained in:
Kovid Goyal 2014-09-30 16:51:12 +05:30
parent 2fdf4de2f6
commit 4153baa4ce
3 changed files with 72 additions and 28 deletions

View File

@ -158,6 +158,7 @@ class Dictionaries(object):
def __init__(self): def __init__(self):
self.remove_hyphenation = re.compile('[\u2010-]+') self.remove_hyphenation = re.compile('[\u2010-]+')
self.negative_pat = re.compile('-[.\d+]')
self.dictionaries = {} self.dictionaries = {}
self.word_cache = {} self.word_cache = {}
self.ignored_words = set() self.ignored_words = set()
@ -327,6 +328,8 @@ class Dictionaries(object):
pass pass
else: else:
ans = True ans = True
if ans is False and self.negative_pat.match(word) is not None:
ans = True
self.word_cache[key] = ans self.word_cache[key] = ans
return ans return ans

View File

@ -589,6 +589,8 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
} // }}} } // }}}
#define IS_HYPHEN_CHAR(x) ((x) == 0x2d || (x) == 0x2010)
// BreakIterator.index {{{ // BreakIterator.index {{{
static PyObject * static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
@ -596,37 +598,53 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
#error Not implemented for python >= 3.3 #error Not implemented for python >= 3.3
#endif #endif
UChar *buf = NULL; UChar *buf = NULL, *needle = NULL;
int32_t prev = 0, p = 0, sz = 0, ans = -1; int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
buf = python_to_icu(token, &sz, 1); buf = python_to_icu(token, &sz, 1);
if (buf == NULL) return NULL; if (buf == NULL) return NULL;
if (sz < 1) goto end; if (sz < 1) goto end;
needle = buf;
if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;
Py_BEGIN_ALLOW_THREADS; Py_BEGIN_ALLOW_THREADS;
p = ubrk_first(self->break_iterator); p = ubrk_first(self->break_iterator);
while (p != UBRK_DONE) { while (p != UBRK_DONE) {
prev = p; p = ubrk_next(self->break_iterator); word_start = p; p = ubrk_next(self->break_iterator);
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
continue; // We are not at the start of a word continue; // We are not at the start of a word
if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
// Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary if (word_start > 0 && (
if( (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
ubrk_isBoundary(self->break_iterator, prev + sz) && (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
(self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010)) )) continue;
) { if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;
ans = prev; break; // Found word surrounded by non-hyphen boundaries
} if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary
if (
// Check that the found word is followed by a word boundary
ubrk_isBoundary(self->break_iterator, word_start + sz) &&
// If there is a leading hyphen check that the leading
// hyphen is preceded by a word boundary
(!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
// Check that there is a word boundary *after* the trailing
// hyphen. We cannot rely on ubrk_isBoundary() as that
// always returns true because of the trailing hyphen.
(!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
) { ans = word_start; break; }
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
} }
} }
if (leading_hyphen && ans > -1) ans -= 1;
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
if (ans > 0) ans = u_countChar32(self->text, ans); if (ans > 0) ans = u_countChar32(self->text, ans);
#endif #endif
Py_END_ALLOW_THREADS; Py_END_ALLOW_THREADS;
end: end:
free(buf); free(buf);
return Py_BuildValue("l", (long)ans); return Py_BuildValue("l", (long)ans);
@ -640,8 +658,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
#error Not implemented for python >= 3.3 #error Not implemented for python >= 3.3
#endif #endif
int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0; int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
int is_hyphen_sep = 0; int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
UChar sep = 0; UChar sep = 0;
PyObject *ans = NULL, *temp = NULL, *t = NULL; PyObject *ans = NULL, *temp = NULL, *t = NULL;
@ -650,26 +668,31 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
p = ubrk_first(self->break_iterator); p = ubrk_first(self->break_iterator);
while (p != UBRK_DONE) { while (p != UBRK_DONE) {
prev = p; p = ubrk_next(self->break_iterator); word_start = p; p = ubrk_next(self->break_iterator);
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
continue; // We are not at the start of a word continue; // We are not at the start of a word
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
if (sz > 0) { if (sz > 0) {
// ICU breaks on words containing hyphens, we do not want that, so we recombine manually // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
is_hyphen_sep = 0; is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
if (last_pos > 0) { if (word_start > 0) { // Look for a leading hyphen
if (prev - last_pos == 1) { sep = *(self->text + word_start - 1);
sep = *(self->text + last_pos); if (IS_HYPHEN_CHAR(sep)) {
if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1; leading_hyphen = 1;
if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
} }
} }
if (word_start + sz < self->text_len) { // Look for a trailing hyphen
sep = *(self->text + word_start + sz);
if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
}
last_pos = p; last_pos = p;
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
sz = u_countChar32(self->text + prev, sz); sz = u_countChar32(self->text + word_start, sz);
prev = u_countChar32(self->text, prev); word_start = u_countChar32(self->text, word_start);
#endif #endif
if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) { if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
sz = last_sz + sz + 1; sz = last_sz + sz + trailing_hyphen;
last_sz = sz; last_sz = sz;
t = PyInt_FromLong((long)sz); t = PyInt_FromLong((long)sz);
if (t == NULL) { Py_DECREF(ans); ans = NULL; break; } if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
@ -677,8 +700,9 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
Py_DECREF(PyTuple_GET_ITEM(temp, 1)); Py_DECREF(PyTuple_GET_ITEM(temp, 1));
PyTuple_SET_ITEM(temp, 1, t); PyTuple_SET_ITEM(temp, 1, t);
} else { } else {
sz += leading_hyphen + trailing_hyphen;
last_sz = sz; last_sz = sz;
temp = Py_BuildValue("ll", (long)prev, (long)sz); temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
if (temp == NULL) { if (temp == NULL) {
Py_DECREF(ans); ans = NULL; break; Py_DECREF(ans); ans = NULL; break;
} }

View File

@ -156,7 +156,9 @@ class TestICU(unittest.TestCase):
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q) self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
self.ae(split(u'I I\'m'), ['I', "I'm"]) self.ae(split(u'I I\'m'), ['I', "I'm"])
self.ae(split(u'out-of-the-box'), ['out-of-the-box']) self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
self.ae(split(u'-one two-'), ['one', 'two']) self.ae(split(u'-one two-'), ['-one', 'two-'])
self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)]) self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
for needle, haystack, pos in ( for needle, haystack, pos in (
('word', 'a word b', 2), ('word', 'a word b', 2),
@ -168,11 +170,26 @@ class TestICU(unittest.TestCase):
('one-two', 'one-two-three one-two', 14), ('one-two', 'one-two-three one-two', 14),
('one', 'onet one', 5), ('one', 'onet one', 5),
('two', 'one-two two', 8), ('two', 'one-two two', 8),
('two', 'two-one two', 8),
('-two', 'one-two -two', 8),
('-two', 'two', -1),
('i', 'i', 0), ('i', 'i', 0),
('i', 'six i', 4), ('i', 'six i', 4),
('i', '', -1), ('', '', -1), ('', 'i', -1), ('i', '', -1), ('', '', -1), ('', 'i', -1),
('i', 'six clicks', -1), ('i', 'six clicks', -1),
('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)), ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
('-a', 'b -a', 2),
('a-', 'a-b a- d', 4),
('-a-', 'b -a -a-', 5),
('-a-', '-a-', 0),
('-a-', 'a-', -1),
('-a-', '-a', -1),
('-a-', 'a', -1),
('a-', 'a-', 0),
('-a', '-a', 0),
('a-b-c-', 'a-b-c-d', -1),
('a-b-c-', 'a-b-c-.', 0),
('a-b-c-', 'a-b-c-d a-b-c- d', 8),
): ):
fpos = index_of(needle, haystack) fpos = index_of(needle, haystack)
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos)) self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))