From 511d3c6ba4a07c9111f8fe3dbf7efb080b6b02ba Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 19 Dec 2019 23:01:05 +0530
Subject: [PATCH] Better fix for ignoring soft-hyphens when checking spelling
 live

---
 .../gui2/tweak_book/editor/syntax/html.py      | 15 +++++++++++++--
 src/calibre/spell/dictionary.py                | 18 +-----------------
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/calibre/gui2/tweak_book/editor/syntax/html.py b/src/calibre/gui2/tweak_book/editor/syntax/html.py
index 49545bb044..6b246e8207 100644
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@@ -11,7 +11,7 @@ from collections import namedtuple
 
 from PyQt5.Qt import QFont, QTextBlockUserData, QTextCharFormat
 
-from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
+from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags, patterns
 from calibre.spell.dictionary import parse_lang_code
 from calibre.spell.break_iterator import split_into_words_and_positions
 from calibre.gui2.tweak_book import dictionaries, tprefs, verify_link
@@ -76,7 +76,18 @@ def spell_property(sfmt, locale):
     return s
 
 
-_speedup.init(spell_property, dictionaries.recognized, split_into_words_and_positions)
+def sanitizing_recognizer():  # build the word-recognition callback that is handed to _speedup.init() below
+    sanitize = patterns().sanitize_invisible_pat.sub  # bind the pattern's sub() once, when the recognizer is created
+    r = dictionaries.recognized  # underlying recognizer that receives the cleaned-up word
+
+    def recognized(word, locale=None):  # takes the same (word, locale=None) arguments it forwards to r
+        word = sanitize('', word).strip()  # delete every pattern match (presumably soft hyphens and similar invisible characters -- confirm against the pattern's definition), then trim surrounding whitespace
+        return r(word, locale)
+
+    return recognized
+
+
+_speedup.init(spell_property, sanitizing_recognizer(), split_into_words_and_positions)
 del spell_property
 check_spelling = _speedup.check_spelling
 
diff --git a/src/calibre/spell/dictionary.py b/src/calibre/spell/dictionary.py
index fe3579a480..d64cea0a45 100644
--- a/src/calibre/spell/dictionary.py
+++ b/src/calibre/spell/dictionary.py
@@ -34,11 +34,6 @@ dprefs.defaults['user_dictionaries'] = [{'name':_('Default'), 'is_active':True,
 not_present = object()
 
 
-def normalize_word(word):
-    # remove soft hyphens
-    return unicode_type(word).replace('\u00ad', '')
-
-
 class UserDictionary(object):
 
     __slots__ = ('name', 'is_active', 'words')
@@ -231,17 +226,14 @@ class Dictionaries(object):
         return ans
 
     def ignore_word(self, word, locale):
-        word = normalize_word(word)
         self.ignored_words.add((word, locale.langcode))
         self.word_cache[(word, locale)] = True
 
     def unignore_word(self, word, locale):
-        word = normalize_word(word)
         self.ignored_words.discard((word, locale.langcode))
         self.word_cache.pop((word, locale), None)
 
     def is_word_ignored(self, word, locale):
-        word = normalize_word(word)
         return (word, locale.langcode) in self.ignored_words
 
     @property
@@ -275,14 +267,12 @@ class Dictionaries(object):
         for d in itervalues(self.dictionaries):
             if d and getattr(d.primary_locale, 'langcode', None) == langcode:
                 for word in words:
-                    word = normalize_word(word)
                     d.obj.add(word)
 
     def remove_user_words(self, words, langcode):
         for d in itervalues(self.dictionaries):
             if d and d.primary_locale.langcode == langcode:
                 for word in words:
-                    word = normalize_word(word)
                     d.obj.remove(word)
 
     def add_to_user_dictionary(self, name, word, locale):
@@ -291,11 +281,9 @@ class Dictionaries(object):
             raise ValueError('Cannot add to the dictionary named: %s as no such dictionary exists' % name)
         wl = len(ud.words)
         if isinstance(word, (set, frozenset)):
-            word = frozenset(map(normalize_word, word))
             ud.words |= word
             self.add_user_words(word, locale.langcode)
         else:
-            word = normalize_word(word)
             ud.words.add((word, locale.langcode))
             self.add_user_words((word,), locale.langcode)
         if len(ud.words) > wl:
@@ -308,7 +296,6 @@ class Dictionaries(object):
         return False
 
     def remove_from_user_dictionaries(self, word, locale):
-        word = normalize_word(word)
         key = (word, locale.langcode)
         changed = False
         for ud in self.active_user_dictionaries:
@@ -324,7 +311,7 @@ class Dictionaries(object):
     def remove_from_user_dictionary(self, name, words):
         changed = False
         removals = defaultdict(set)
-        keys = [(normalize_word(w), l.langcode) for w, l in words]
+        keys = [(w, l.langcode) for w, l in words]
         for d in self.all_user_dictionaries:
             if d.name == name:
                 for key in keys:
@@ -341,7 +328,6 @@ class Dictionaries(object):
         return changed
 
     def word_in_user_dictionary(self, word, locale):
-        word = normalize_word(word)
         key = (word, locale.langcode)
         for ud in self.active_user_dictionaries:
             if key in ud.words:
@@ -377,7 +363,6 @@ class Dictionaries(object):
         return changed
 
     def recognized(self, word, locale=None):
-        word = normalize_word(word)
         locale = locale or self.default_locale
         key = (word, locale)
         ans = self.word_cache.get(key, None)
@@ -406,7 +391,6 @@ class Dictionaries(object):
         return ans
 
     def suggestions(self, word, locale=None):
-        word = normalize_word(word)
         locale = locale or self.default_locale
         d = self.dictionary_for_locale(locale)
         has_unicode_hyphen = '\u2010' in word