From 2f663a2e1746cde6c99982eca34d019425b20677 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 8 Jul 2014 11:45:45 +0530
Subject: [PATCH] Edit book: Spell checker: Treat hyphenated words as single
 words for spell checking

---
 src/calibre/spell/break_iterator.py |  3 +-
 src/calibre/utils/icu.c             | 75 ++++++++++++-----------------
 src/calibre/utils/icu_test.py       |  4 +-
 3 files changed, 35 insertions(+), 47 deletions(-)

diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py
index 8480a73569..a615a5fced 100644
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -20,8 +20,7 @@ def split_into_words(text, lang='en'):
         if it is None:
             it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
         it.set_text(text)
-        return it.split()
-
+        return [text[p:p+s] for p, s in it.split2()]
 
 def split_into_words_and_positions(text, lang='en'):
     with _lock:
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index f2506fd78b..5ae1665578 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -595,37 +595,6 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *args, PyObject *kw
 
 } // }}}
 
-// BreakIterator.split {{{
-static PyObject *
-icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
-    int32_t prev = 0, p = 0, sz = 0;
-    PyObject *ans = NULL, *token = NULL;
-  
-    ans = PyList_New(0);
-    if (ans == NULL) return PyErr_NoMemory();
-
-    p = ubrk_first(self->break_iterator);
-    while (p != UBRK_DONE) {
-        prev = p; p = ubrk_next(self->break_iterator);
-        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
-            continue;  // We are not at the start of a word
-        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
-        if (sz > 0) {
-            token = icu_to_python(self->text + prev, sz);
-            if (token == NULL) {
-                Py_DECREF(ans); ans = NULL; break; 
-            }
-            if (PyList_Append(ans, token) != 0) {
-                Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; 
-            }
-            Py_DECREF(token);
-        }
-    }
-
-    return ans;
-
-} // }}}
-
 // BreakIterator.index {{{
 static PyObject *
 icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
@@ -673,8 +642,10 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwar
 #error Not implemented for python >= 3.3
 #endif
 
-    int32_t prev = 0, p = 0, sz = 0;
-    PyObject *ans = NULL, *temp = NULL;
+    int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
+    int is_hyphen_sep = 0;
+    UChar sep = 0;
+    PyObject *ans = NULL, *temp = NULL, *t = NULL;
   
     ans = PyList_New(0);
     if (ans == NULL) return PyErr_NoMemory();
@@ -686,18 +657,38 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwar
             continue;  // We are not at the start of a word
         sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
         if (sz > 0) {
+            // ICU splits hyphenated words at each hyphen; we do not want that, so we recombine the pieces manually.
+            is_hyphen_sep = 0;
+            if (last_pos > 0) {
+                if (prev - last_pos == 1) {
+                    sep = *(self->text + last_pos);
+                    if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1;
+                }
+            }
+            last_pos = p;
 #ifdef Py_UNICODE_WIDE
             sz = u_countChar32(self->text + prev, sz);
             prev = u_countChar32(self->text, prev);
 #endif
-            temp = Py_BuildValue("II", prev, sz); 
-            if (temp == NULL) {
-                Py_DECREF(ans); ans = NULL; break; 
-            } 
-            if (PyList_Append(ans, temp) != 0) {
-                Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; 
+            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
+                sz = last_sz + sz + 1;
+                last_sz = sz;
+                t = PyInt_FromLong((long)sz);
+                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
+                temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
+                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
+                PyTuple_SET_ITEM(temp, 1, t);
+            } else {
+                last_sz = sz;
+                temp = Py_BuildValue("II", (unsigned int)prev, (unsigned int)sz); 
+                if (temp == NULL) {
+                    Py_DECREF(ans); ans = NULL; break; 
+                } 
+                if (PyList_Append(ans, temp) != 0) {
+                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; 
+                }
+                Py_DECREF(temp);
             }
-            Py_DECREF(temp);
         }
     }
 
@@ -710,10 +701,6 @@ static PyMethodDef icu_BreakIterator_methods[] = {
      "set_text(unicode object) -> Set the text this iterator will operate on"
     },
 
-    {"split", (PyCFunction)icu_BreakIterator_split, METH_VARARGS,
-     "split() -> Split the current text into tokens, returning a list of tokens"
-    },
-
     {"split2", (PyCFunction)icu_BreakIterator_split2, METH_VARARGS,
      "split2() -> Split the current text into tokens, returning a list of 2-tuples of the form (position of token, length of token). The numbers are suitable for indexing python strings regardless of narrow/wide builds."
     },
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index 8d42c59efa..faf7e3834a 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -151,9 +151,11 @@ class TestICU(unittest.TestCase):
     def test_break_iterator(self):
         ' Test the break iterator '
         from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions
-        for q in ('one two three', ' one two three', 'one\ntwo  three ', 'one-two,three'):
+        for q in ('one two three', ' one two three', 'one\ntwo  three ', ):
             self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
         self.ae(split(u'I I\'m'), ['I', "I'm"])
+        self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
+        self.ae(split(u'-one two-'), ['one', 'two'])
         self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)])
         self.ae(0, index_of('i', 'i'))
         self.ae(4, index_of('i', 'six i'))