Edit Book: Fix leading or trailing hyphens on words being ignored when spell checking. Fixes #1370288 [Spell Check will not goto all misspelled words](https://bugs.launchpad.net/calibre/+bug/1370288)

2025-07-09 03:04:10 -04:00 · 2014-09-30 16:51:12 +05:30 · 2014-09-30 16:51:12 +05:30 · 4153baa4ce
commit 4153baa4ce
parent 2fdf4de2f6
3 changed files with 72 additions and 28 deletions
--- a/src/calibre/spell/dictionary.py
+++ b/src/calibre/spell/dictionary.py
@ -158,6 +158,7 @@ class Dictionaries(object):
    def __init__(self):
        self.remove_hyphenation = re.compile('[\u2010-]+')
        self.negative_pat = re.compile('-[.\d+]')
        self.dictionaries = {}
        self.word_cache = {}
        self.ignored_words = set()
@ -327,6 +328,8 @@ class Dictionaries(object):
                            pass
                    else:
                        ans = True
            if ans is False and self.negative_pat.match(word) is not None:
                ans = True
            self.word_cache[key] = ans
        return ans
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@ -589,6 +589,8 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
 } // }}}
 #define IS_HYPHEN_CHAR(x) ((x) == 0x2d || (x) == 0x2010)
 // BreakIterator.index {{{
 static PyObject *
 icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
@ -596,37 +598,53 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
 #error Not implemented for python >= 3.3
 #endif
-    UChar *buf = NULL;
+    UChar *buf = NULL, *needle = NULL;
-    int32_t prev = 0, p = 0, sz = 0, ans = -1;
+    int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
    buf = python_to_icu(token, &sz, 1);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;
    needle = buf;
    if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
    if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;
    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
-        prev = p; p = ubrk_next(self->break_iterator);
+        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
-        if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
+
-            // Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries
+        if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
-            if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary
+            if (word_start > 0 && (
-            if(
+                    (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
-                ubrk_isBoundary(self->break_iterator, prev + sz) &&
+                    (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
-                (self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010))
+            )) continue;
-            ) {
+            if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;
-                ans = prev; break; // Found word surrounded by non-hyphen boundaries
+
-            } 
+            if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }
-            if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary
+
            if (
                    // Check that the found word is followed by a word boundary
                    ubrk_isBoundary(self->break_iterator, word_start + sz) &&
                    // If there is a leading hyphen check  that the leading
                    // hyphen is preceded by a word boundary
                    (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
                    // Check that there is a word boundary *after* the trailing
                    // hyphen. We cannot rely on ubrk_isBoundary() as that
                    // always returns true because of the trailing hyphen.
                    (!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            ) { ans = word_start; break; }
            if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
        }
    }
    if (leading_hyphen && ans > -1) ans -= 1;
 #ifdef Py_UNICODE_WIDE
    if (ans > 0) ans = u_countChar32(self->text, ans);
 #endif
    Py_END_ALLOW_THREADS;
 end:
    free(buf);
    return Py_BuildValue("l", (long)ans);
@ -640,8 +658,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
 #error Not implemented for python >= 3.3
 #endif
-    int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
+    int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
-    int is_hyphen_sep = 0;
+    int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
    UChar sep = 0;
    PyObject *ans = NULL, *temp = NULL, *t = NULL;
@ -650,26 +668,31 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
-        prev = p; p = ubrk_next(self->break_iterator);
+        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
-        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
+        sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
        if (sz > 0) {
            // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
-            is_hyphen_sep = 0;
+            is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
-            if (last_pos > 0) {
+            if (word_start > 0) { // Look for a leading hyphen
-                if (prev - last_pos == 1) {
+                sep = *(self->text + word_start - 1);
-                    sep = *(self->text + last_pos);
+                if (IS_HYPHEN_CHAR(sep)) {
-                    if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1;
+                    leading_hyphen = 1;
                    if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
                }
            }
            if (word_start + sz < self->text_len) { // Look for a trailing hyphen
                sep = *(self->text + word_start + sz);
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            last_pos = p;
 #ifdef Py_UNICODE_WIDE
-            sz = u_countChar32(self->text + prev, sz);
+            sz = u_countChar32(self->text + word_start, sz);
-            prev = u_countChar32(self->text, prev);
+            word_start = u_countChar32(self->text, word_start);
 #endif
            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
-                sz = last_sz + sz + 1;
+                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
                t = PyInt_FromLong((long)sz);
                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
@ -677,8 +700,9 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
                PyTuple_SET_ITEM(temp, 1, t);
            } else {
                sz += leading_hyphen + trailing_hyphen;
                last_sz = sz;
-                temp = Py_BuildValue("ll", (long)prev, (long)sz); 
+                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz); 
                if (temp == NULL) {
                    Py_DECREF(ans); ans = NULL; break; 
                } 
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@ -156,7 +156,9 @@ class TestICU(unittest.TestCase):
            self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
        self.ae(split(u'I I\'m'), ['I', "I'm"])
        self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
-        self.ae(split(u'-one two-'), ['one', 'two'])
+        self.ae(split(u'-one two-'), ['-one', 'two-'])
        self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
        self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
        self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
        for needle, haystack, pos in (
                ('word', 'a word b', 2),
@ -168,11 +170,26 @@ class TestICU(unittest.TestCase):
                ('one-two', 'one-two-three one-two', 14),
                ('one', 'onet one', 5),
                ('two', 'one-two two', 8),
                ('two', 'two-one two', 8),
                ('-two', 'one-two -two', 8),
                ('-two', 'two', -1),
                ('i', 'i', 0),
                ('i', 'six i', 4),
                ('i', '', -1), ('', '', -1), ('', 'i', -1),
                ('i', 'six clicks', -1),
                ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
                ('-a', 'b -a', 2),
                ('a-', 'a-b a- d', 4),
                ('-a-', 'b -a -a-', 5),
                ('-a-', '-a-', 0),
                ('-a-', 'a-', -1),
                ('-a-', '-a', -1),
                ('-a-', 'a', -1),
                ('a-', 'a-', 0),
                ('-a', '-a', 0),
                ('a-b-c-', 'a-b-c-d', -1),
                ('a-b-c-', 'a-b-c-.', 0),
                ('a-b-c-', 'a-b-c-d a-b-c- d', 8),
        ):
            fpos = index_of(needle, haystack)
            self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))