Fix spurious detection of words starting at hyphen boundaries

2025-08-07 09:01:38 -04:00 · 2014-07-12 11:34:31 +05:30 · 2014-07-12 11:34:31 +05:30 · adbda39a3c
commit adbda39a3c
parent 9f1a5129de
2 changed files with 3 additions and 1 deletion
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -618,7 +618,8 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
-            // Needle is present at text[prev:] we have to check if it is followed by a non-hyphen boundary
+            // Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries
+            if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary
            if(
                ubrk_isBoundary(self->break_iterator, prev + sz) &&
                (self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010))
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -166,6 +166,7 @@ class TestICU(unittest.TestCase):
                ('one', 'one-two one', 8),
                ('one-two', 'one-two-three one-two', 14),
                ('one', 'onet one', 5),
+                ('two', 'one-two two', 8),
                ('i', 'i', 0),
                ('i', 'six i', 4),
                ('i', '', -1), ('', '', -1), ('', 'i', -1),