From adbda39a3c2633a7806a3f7393b6ab175e70e537 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 12 Jul 2014 11:34:31 +0530 Subject: [PATCH] Fix spurious detection of words starting at hyphen boundaries --- src/calibre/utils/icu.c | 3 ++- src/calibre/utils/icu_test.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index c6b6564ff3..ec9a778b58 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -618,7 +618,8 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { - // Needle is present at text[prev:] we have to check if it is followed by a non-hyphen boundary + // Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries + if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary if( ubrk_isBoundary(self->break_iterator, prev + sz) && (self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010)) diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index 4f7a474dba..5ddd0bc345 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -166,6 +166,7 @@ class TestICU(unittest.TestCase): ('one', 'one-two one', 8), ('one-two', 'one-two-three one-two', 14), ('one', 'onet one', 5), + ('two', 'one-two two', 8), ('i', 'i', 0), ('i', 'six i', 4), ('i', '', -1), ('', '', -1), ('', 'i', -1),