Fix spurious detection of words starting at hyphen boundaries

This commit is contained in:
Kovid Goyal 2014-07-12 11:34:31 +05:30
parent 9f1a5129de
commit adbda39a3c
2 changed files with 3 additions and 1 deletion

View File

@@ -618,7 +618,8 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
continue; // We are not at the start of a word continue; // We are not at the start of a word
if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
// Needle is present at text[prev:] we have to check if it is followed by a non-hyphen boundary // Needle is present at text[prev:] we have to check if it is not surrounded by hyphen boundaries
if (prev > 0 && (self->text[prev-1] == 0x2d || self->text[prev-1] == 0x2010)) continue; // At a hyphen boundary
if( if(
ubrk_isBoundary(self->break_iterator, prev + sz) && ubrk_isBoundary(self->break_iterator, prev + sz) &&
(self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010)) (self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010))

View File

@@ -166,6 +166,7 @@ class TestICU(unittest.TestCase):
('one', 'one-two one', 8), ('one', 'one-two one', 8),
('one-two', 'one-two-three one-two', 14), ('one-two', 'one-two-three one-two', 14),
('one', 'onet one', 5), ('one', 'onet one', 5),
('two', 'one-two two', 8),
('i', 'i', 0), ('i', 'i', 0),
('i', 'six i', 4), ('i', 'six i', 4),
('i', '', -1), ('', '', -1), ('', 'i', -1), ('i', '', -1), ('', '', -1), ('', 'i', -1),