mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit Book: Fix leading or trailing hyphens on words being ignored when spell checking. Fixes #1370288 [Spell Check will not goto all misspelled words](https://bugs.launchpad.net/calibre/+bug/1370288)
This commit is contained in:
parent
2fdf4de2f6
commit
4153baa4ce
@ -158,6 +158,7 @@ class Dictionaries(object):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.remove_hyphenation = re.compile('[\u2010-]+')
|
self.remove_hyphenation = re.compile('[\u2010-]+')
|
||||||
|
self.negative_pat = re.compile('-[.\d+]')
|
||||||
self.dictionaries = {}
|
self.dictionaries = {}
|
||||||
self.word_cache = {}
|
self.word_cache = {}
|
||||||
self.ignored_words = set()
|
self.ignored_words = set()
|
||||||
@ -327,6 +328,8 @@ class Dictionaries(object):
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
ans = True
|
ans = True
|
||||||
|
if ans is False and self.negative_pat.match(word) is not None:
|
||||||
|
ans = True
|
||||||
self.word_cache[key] = ans
|
self.word_cache[key] = ans
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
@ -589,6 +589,8 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
|
|||||||
|
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
|
#define IS_HYPHEN_CHAR(x) ((x) == 0x2d || (x) == 0x2010)
|
||||||
|
|
||||||
// BreakIterator.index {{{
|
// BreakIterator.index {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
|
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
|
||||||
@ -596,37 +598,53 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
|
|||||||
#error Not implemented for python >= 3.3
|
#error Not implemented for python >= 3.3
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
UChar *buf = NULL;
|
UChar *buf = NULL, *needle = NULL;
|
||||||
int32_t prev = 0, p = 0, sz = 0, ans = -1;
|
int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
|
||||||
|
|
||||||
buf = python_to_icu(token, &sz, 1);
|
buf = python_to_icu(token, &sz, 1);
|
||||||
if (buf == NULL) return NULL;
|
if (buf == NULL) return NULL;
|
||||||
if (sz < 1) goto end;
|
if (sz < 1) goto end;
|
||||||
|
needle = buf;
|
||||||
|
if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
|
||||||
|
if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;
|
||||||
|
|
||||||
Py_BEGIN_ALLOW_THREADS;
|
Py_BEGIN_ALLOW_THREADS;
|
||||||
p = ubrk_first(self->break_iterator);
|
p = ubrk_first(self->break_iterator);
|
||||||
while (p != UBRK_DONE) {
|
while (p != UBRK_DONE) {
|
||||||
prev = p; p = ubrk_next(self->break_iterator);
|
word_start = p; p = ubrk_next(self->break_iterator);
|
||||||
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||||
continue; // We are not at the start of a word
|
continue; // We are not at the start of a word
|
||||||
if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
|
|
||||||
// Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries
|
if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
|
||||||
if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary
|
if (word_start > 0 && (
|
||||||
if(
|
(leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
|
||||||
ubrk_isBoundary(self->break_iterator, prev + sz) &&
|
(!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
|
||||||
(self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010))
|
)) continue;
|
||||||
) {
|
if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;
|
||||||
ans = prev; break; // Found word surrounded by non-hyphen boundaries
|
|
||||||
}
|
if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }
|
||||||
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary
|
|
||||||
|
if (
|
||||||
|
// Check that the found word is followed by a word boundary
|
||||||
|
ubrk_isBoundary(self->break_iterator, word_start + sz) &&
|
||||||
|
// If there is a leading hyphen check that the leading
|
||||||
|
// hyphen is preceded by a word boundary
|
||||||
|
(!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
|
||||||
|
// Check that there is a word boundary *after* the trailing
|
||||||
|
// hyphen. We cannot rely on ubrk_isBoundary() as that
|
||||||
|
// always returns true because of the trailing hyphen.
|
||||||
|
(!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||||
|
) { ans = word_start; break; }
|
||||||
|
|
||||||
|
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (leading_hyphen && ans > -1) ans -= 1;
|
||||||
#ifdef Py_UNICODE_WIDE
|
#ifdef Py_UNICODE_WIDE
|
||||||
if (ans > 0) ans = u_countChar32(self->text, ans);
|
if (ans > 0) ans = u_countChar32(self->text, ans);
|
||||||
#endif
|
#endif
|
||||||
Py_END_ALLOW_THREADS;
|
Py_END_ALLOW_THREADS;
|
||||||
|
|
||||||
|
|
||||||
end:
|
end:
|
||||||
free(buf);
|
free(buf);
|
||||||
return Py_BuildValue("l", (long)ans);
|
return Py_BuildValue("l", (long)ans);
|
||||||
@ -640,8 +658,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
|||||||
#error Not implemented for python >= 3.3
|
#error Not implemented for python >= 3.3
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
|
int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
|
||||||
int is_hyphen_sep = 0;
|
int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
|
||||||
UChar sep = 0;
|
UChar sep = 0;
|
||||||
PyObject *ans = NULL, *temp = NULL, *t = NULL;
|
PyObject *ans = NULL, *temp = NULL, *t = NULL;
|
||||||
|
|
||||||
@ -650,26 +668,31 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
|||||||
|
|
||||||
p = ubrk_first(self->break_iterator);
|
p = ubrk_first(self->break_iterator);
|
||||||
while (p != UBRK_DONE) {
|
while (p != UBRK_DONE) {
|
||||||
prev = p; p = ubrk_next(self->break_iterator);
|
word_start = p; p = ubrk_next(self->break_iterator);
|
||||||
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||||
continue; // We are not at the start of a word
|
continue; // We are not at the start of a word
|
||||||
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
|
sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
|
||||||
if (sz > 0) {
|
if (sz > 0) {
|
||||||
// ICU breaks on words containing hyphens, we do not want that, so we recombine manually
|
// ICU breaks on words containing hyphens, we do not want that, so we recombine manually
|
||||||
is_hyphen_sep = 0;
|
is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
|
||||||
if (last_pos > 0) {
|
if (word_start > 0) { // Look for a leading hyphen
|
||||||
if (prev - last_pos == 1) {
|
sep = *(self->text + word_start - 1);
|
||||||
sep = *(self->text + last_pos);
|
if (IS_HYPHEN_CHAR(sep)) {
|
||||||
if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1;
|
leading_hyphen = 1;
|
||||||
|
if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (word_start + sz < self->text_len) { // Look for a trailing hyphen
|
||||||
|
sep = *(self->text + word_start + sz);
|
||||||
|
if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
|
||||||
|
}
|
||||||
last_pos = p;
|
last_pos = p;
|
||||||
#ifdef Py_UNICODE_WIDE
|
#ifdef Py_UNICODE_WIDE
|
||||||
sz = u_countChar32(self->text + prev, sz);
|
sz = u_countChar32(self->text + word_start, sz);
|
||||||
prev = u_countChar32(self->text, prev);
|
word_start = u_countChar32(self->text, word_start);
|
||||||
#endif
|
#endif
|
||||||
if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
|
if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
|
||||||
sz = last_sz + sz + 1;
|
sz = last_sz + sz + trailing_hyphen;
|
||||||
last_sz = sz;
|
last_sz = sz;
|
||||||
t = PyInt_FromLong((long)sz);
|
t = PyInt_FromLong((long)sz);
|
||||||
if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
|
if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
|
||||||
@ -677,8 +700,9 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
|||||||
Py_DECREF(PyTuple_GET_ITEM(temp, 1));
|
Py_DECREF(PyTuple_GET_ITEM(temp, 1));
|
||||||
PyTuple_SET_ITEM(temp, 1, t);
|
PyTuple_SET_ITEM(temp, 1, t);
|
||||||
} else {
|
} else {
|
||||||
|
sz += leading_hyphen + trailing_hyphen;
|
||||||
last_sz = sz;
|
last_sz = sz;
|
||||||
temp = Py_BuildValue("ll", (long)prev, (long)sz);
|
temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
|
||||||
if (temp == NULL) {
|
if (temp == NULL) {
|
||||||
Py_DECREF(ans); ans = NULL; break;
|
Py_DECREF(ans); ans = NULL; break;
|
||||||
}
|
}
|
||||||
|
@ -156,7 +156,9 @@ class TestICU(unittest.TestCase):
|
|||||||
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
||||||
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
||||||
self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
|
self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
|
||||||
self.ae(split(u'-one two-'), ['one', 'two'])
|
self.ae(split(u'-one two-'), ['-one', 'two-'])
|
||||||
|
self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
|
||||||
|
self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
|
||||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
|
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
|
||||||
for needle, haystack, pos in (
|
for needle, haystack, pos in (
|
||||||
('word', 'a word b', 2),
|
('word', 'a word b', 2),
|
||||||
@ -168,11 +170,26 @@ class TestICU(unittest.TestCase):
|
|||||||
('one-two', 'one-two-three one-two', 14),
|
('one-two', 'one-two-three one-two', 14),
|
||||||
('one', 'onet one', 5),
|
('one', 'onet one', 5),
|
||||||
('two', 'one-two two', 8),
|
('two', 'one-two two', 8),
|
||||||
|
('two', 'two-one two', 8),
|
||||||
|
('-two', 'one-two -two', 8),
|
||||||
|
('-two', 'two', -1),
|
||||||
('i', 'i', 0),
|
('i', 'i', 0),
|
||||||
('i', 'six i', 4),
|
('i', 'six i', 4),
|
||||||
('i', '', -1), ('', '', -1), ('', 'i', -1),
|
('i', '', -1), ('', '', -1), ('', 'i', -1),
|
||||||
('i', 'six clicks', -1),
|
('i', 'six clicks', -1),
|
||||||
('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
|
('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
|
||||||
|
('-a', 'b -a', 2),
|
||||||
|
('a-', 'a-b a- d', 4),
|
||||||
|
('-a-', 'b -a -a-', 5),
|
||||||
|
('-a-', '-a-', 0),
|
||||||
|
('-a-', 'a-', -1),
|
||||||
|
('-a-', '-a', -1),
|
||||||
|
('-a-', 'a', -1),
|
||||||
|
('a-', 'a-', 0),
|
||||||
|
('-a', '-a', 0),
|
||||||
|
('a-b-c-', 'a-b-c-d', -1),
|
||||||
|
('a-b-c-', 'a-b-c-.', 0),
|
||||||
|
('a-b-c-', 'a-b-c-d a-b-c- d', 8),
|
||||||
):
|
):
|
||||||
fpos = index_of(needle, haystack)
|
fpos = index_of(needle, haystack)
|
||||||
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))
|
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user