mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit Book: Fix leading or trailing hyphens on words being ignored when spell checking. Fixes #1370288 [Spell Check will not go to all misspelled words](https://bugs.launchpad.net/calibre/+bug/1370288)
This commit is contained in:
parent
2fdf4de2f6
commit
4153baa4ce
@ -158,6 +158,7 @@ class Dictionaries(object):
|
||||
|
||||
def __init__(self):
|
||||
self.remove_hyphenation = re.compile('[\u2010-]+')
|
||||
self.negative_pat = re.compile('-[.\d+]')
|
||||
self.dictionaries = {}
|
||||
self.word_cache = {}
|
||||
self.ignored_words = set()
|
||||
@ -327,6 +328,8 @@ class Dictionaries(object):
|
||||
pass
|
||||
else:
|
||||
ans = True
|
||||
if ans is False and self.negative_pat.match(word) is not None:
|
||||
ans = True
|
||||
self.word_cache[key] = ans
|
||||
return ans
|
||||
|
||||
|
@ -589,6 +589,8 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
|
||||
|
||||
} // }}}
|
||||
|
||||
#define IS_HYPHEN_CHAR(x) ((x) == 0x2d || (x) == 0x2010)
|
||||
|
||||
// BreakIterator.index {{{
|
||||
static PyObject *
|
||||
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
|
||||
@ -596,37 +598,53 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
|
||||
#error Not implemented for python >= 3.3
|
||||
#endif
|
||||
|
||||
UChar *buf = NULL;
|
||||
int32_t prev = 0, p = 0, sz = 0, ans = -1;
|
||||
UChar *buf = NULL, *needle = NULL;
|
||||
int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
|
||||
|
||||
buf = python_to_icu(token, &sz, 1);
|
||||
if (buf == NULL) return NULL;
|
||||
if (sz < 1) goto end;
|
||||
needle = buf;
|
||||
if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
|
||||
if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;
|
||||
|
||||
Py_BEGIN_ALLOW_THREADS;
|
||||
p = ubrk_first(self->break_iterator);
|
||||
while (p != UBRK_DONE) {
|
||||
prev = p; p = ubrk_next(self->break_iterator);
|
||||
word_start = p; p = ubrk_next(self->break_iterator);
|
||||
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||
continue; // We are not at the start of a word
|
||||
if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
|
||||
// Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries
|
||||
if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary
|
||||
|
||||
if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
|
||||
if (word_start > 0 && (
|
||||
(leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
|
||||
(!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
|
||||
)) continue;
|
||||
if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;
|
||||
|
||||
if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }
|
||||
|
||||
if (
|
||||
ubrk_isBoundary(self->break_iterator, prev + sz) &&
|
||||
(self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010))
|
||||
) {
|
||||
ans = prev; break; // Found word surrounded by non-hyphen boundaries
|
||||
}
|
||||
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary
|
||||
// Check that the found word is followed by a word boundary
|
||||
ubrk_isBoundary(self->break_iterator, word_start + sz) &&
|
||||
// If there is a leading hyphen check that the leading
|
||||
// hyphen is preceded by a word boundary
|
||||
(!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
|
||||
// Check that there is a word boundary *after* the trailing
|
||||
// hyphen. We cannot rely on ubrk_isBoundary() as that
|
||||
// always returns true because of the trailing hyphen.
|
||||
(!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||
) { ans = word_start; break; }
|
||||
|
||||
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
|
||||
}
|
||||
}
|
||||
if (leading_hyphen && ans > -1) ans -= 1;
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
if (ans > 0) ans = u_countChar32(self->text, ans);
|
||||
#endif
|
||||
Py_END_ALLOW_THREADS;
|
||||
|
||||
|
||||
end:
|
||||
free(buf);
|
||||
return Py_BuildValue("l", (long)ans);
|
||||
@ -640,8 +658,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
||||
#error Not implemented for python >= 3.3
|
||||
#endif
|
||||
|
||||
int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
|
||||
int is_hyphen_sep = 0;
|
||||
int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
|
||||
int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
|
||||
UChar sep = 0;
|
||||
PyObject *ans = NULL, *temp = NULL, *t = NULL;
|
||||
|
||||
@ -650,26 +668,31 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
||||
|
||||
p = ubrk_first(self->break_iterator);
|
||||
while (p != UBRK_DONE) {
|
||||
prev = p; p = ubrk_next(self->break_iterator);
|
||||
word_start = p; p = ubrk_next(self->break_iterator);
|
||||
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||
continue; // We are not at the start of a word
|
||||
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
|
||||
sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
|
||||
if (sz > 0) {
|
||||
// ICU breaks on words containing hyphens, we do not want that, so we recombine manually
|
||||
is_hyphen_sep = 0;
|
||||
if (last_pos > 0) {
|
||||
if (prev - last_pos == 1) {
|
||||
sep = *(self->text + last_pos);
|
||||
if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1;
|
||||
is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
|
||||
if (word_start > 0) { // Look for a leading hyphen
|
||||
sep = *(self->text + word_start - 1);
|
||||
if (IS_HYPHEN_CHAR(sep)) {
|
||||
leading_hyphen = 1;
|
||||
if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
|
||||
}
|
||||
}
|
||||
if (word_start + sz < self->text_len) { // Look for a trailing hyphen
|
||||
sep = *(self->text + word_start + sz);
|
||||
if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
|
||||
}
|
||||
last_pos = p;
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
sz = u_countChar32(self->text + prev, sz);
|
||||
prev = u_countChar32(self->text, prev);
|
||||
sz = u_countChar32(self->text + word_start, sz);
|
||||
word_start = u_countChar32(self->text, word_start);
|
||||
#endif
|
||||
if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
|
||||
sz = last_sz + sz + 1;
|
||||
sz = last_sz + sz + trailing_hyphen;
|
||||
last_sz = sz;
|
||||
t = PyInt_FromLong((long)sz);
|
||||
if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
|
||||
@ -677,8 +700,9 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
||||
Py_DECREF(PyTuple_GET_ITEM(temp, 1));
|
||||
PyTuple_SET_ITEM(temp, 1, t);
|
||||
} else {
|
||||
sz += leading_hyphen + trailing_hyphen;
|
||||
last_sz = sz;
|
||||
temp = Py_BuildValue("ll", (long)prev, (long)sz);
|
||||
temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
|
||||
if (temp == NULL) {
|
||||
Py_DECREF(ans); ans = NULL; break;
|
||||
}
|
||||
|
@ -156,7 +156,9 @@ class TestICU(unittest.TestCase):
|
||||
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
||||
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
||||
self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
|
||||
self.ae(split(u'-one two-'), ['one', 'two'])
|
||||
self.ae(split(u'-one two-'), ['-one', 'two-'])
|
||||
self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
|
||||
self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
|
||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
|
||||
for needle, haystack, pos in (
|
||||
('word', 'a word b', 2),
|
||||
@ -168,11 +170,26 @@ class TestICU(unittest.TestCase):
|
||||
('one-two', 'one-two-three one-two', 14),
|
||||
('one', 'onet one', 5),
|
||||
('two', 'one-two two', 8),
|
||||
('two', 'two-one two', 8),
|
||||
('-two', 'one-two -two', 8),
|
||||
('-two', 'two', -1),
|
||||
('i', 'i', 0),
|
||||
('i', 'six i', 4),
|
||||
('i', '', -1), ('', '', -1), ('', 'i', -1),
|
||||
('i', 'six clicks', -1),
|
||||
('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
|
||||
('-a', 'b -a', 2),
|
||||
('a-', 'a-b a- d', 4),
|
||||
('-a-', 'b -a -a-', 5),
|
||||
('-a-', '-a-', 0),
|
||||
('-a-', 'a-', -1),
|
||||
('-a-', '-a', -1),
|
||||
('-a-', 'a', -1),
|
||||
('a-', 'a-', 0),
|
||||
('-a', '-a', 0),
|
||||
('a-b-c-', 'a-b-c-d', -1),
|
||||
('a-b-c-', 'a-b-c-.', 0),
|
||||
('a-b-c-', 'a-b-c-d a-b-c- d', 8),
|
||||
):
|
||||
fpos = index_of(needle, haystack)
|
||||
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))
|
||||
|
Loading…
x
Reference in New Issue
Block a user