Better fix for ignoring soft-hyphens when checking spelling live

Kovid Goyal 2019-12-19 23:01:05 +05:30
parent ae962ae3b1
commit 511d3c6ba4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 14 additions and 19 deletions
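The commit moves soft-hyphen handling out of the dictionary layer and into the live spell-check path: the HTML syntax highlighter now wraps dictionaries.recognized in a sanitizing_recognizer that strips invisible characters from each word once, right before the lookup. A minimal, self-contained sketch of that wrapper pattern follows; the regex is an assumption standing in for calibre's patterns().sanitize_invisible_pat, and plain_recognizer is a toy stand-in for the real dictionary check.

import re

# Assumed stand-in for sanitize_invisible_pat: soft hyphen plus zero-width
# characters that commonly leak into e-book text.
sanitize_invisible_pat = re.compile('[\u00ad\u200b\u200c\u200d]')


def sanitizing_recognizer(recognized):
    # Bind the pattern's sub method once, then clean every word before lookup.
    sanitize = sanitize_invisible_pat.sub

    def wrapper(word, locale=None):
        word = sanitize('', word).strip()
        return recognized(word, locale)
    return wrapper


def plain_recognizer(word, locale=None):
    # Toy dictionary used only for this sketch.
    return word.lower() in {'spelling', 'check'}


check = sanitizing_recognizer(plain_recognizer)
print(check('spell\u00ading'))  # True: the soft hyphen is removed before the lookup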

View File

@@ -11,7 +11,7 @@ from collections import namedtuple
 from PyQt5.Qt import QFont, QTextBlockUserData, QTextCharFormat
-from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
+from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags, patterns
 from calibre.spell.dictionary import parse_lang_code
 from calibre.spell.break_iterator import split_into_words_and_positions
 from calibre.gui2.tweak_book import dictionaries, tprefs, verify_link
@@ -76,7 +76,18 @@ def spell_property(sfmt, locale):
     return s


-_speedup.init(spell_property, dictionaries.recognized, split_into_words_and_positions)
+def sanitizing_recognizer():
+    sanitize = patterns().sanitize_invisible_pat.sub
+    r = dictionaries.recognized
+
+    def recognized(word, locale=None):
+        word = sanitize('', word).strip()
+        return r(word, locale)
+    return recognized
+
+
+_speedup.init(spell_property, sanitizing_recognizer(), split_into_words_and_positions)
 del spell_property
 check_spelling = _speedup.check_spelling
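Note the bound-method trick above: sanitize is patterns().sanitize_invisible_pat.sub, so sanitize('', word) deletes every match from the word before it is handed to the C speedup helper, and the pattern is fetched only once, when sanitizing_recognizer() is called. A quick illustration of that call shape, with an assumed character class in place of calibre's real pattern:

import re

pat = re.compile('[\u00ad\u200b]')  # assumption: soft hyphen + zero-width space
sanitize = pat.sub                  # same bound-method style as in the diff above

print(sanitize('', 'hy\u00adphen\u200bation').strip())  # -> hyphenation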

View File

@@ -34,11 +34,6 @@ dprefs.defaults['user_dictionaries'] = [{'name':_('Default'), 'is_active':True,
 not_present = object()


-def normalize_word(word):
-    # remove soft hyphens
-    return unicode_type(word).replace('\u00ad', '')
-
-
 class UserDictionary(object):

     __slots__ = ('name', 'is_active', 'words')
@@ -231,17 +226,14 @@ class Dictionaries(object):
         return ans

     def ignore_word(self, word, locale):
-        word = normalize_word(word)
         self.ignored_words.add((word, locale.langcode))
         self.word_cache[(word, locale)] = True

     def unignore_word(self, word, locale):
-        word = normalize_word(word)
         self.ignored_words.discard((word, locale.langcode))
         self.word_cache.pop((word, locale), None)

     def is_word_ignored(self, word, locale):
-        word = normalize_word(word)
         return (word, locale.langcode) in self.ignored_words

     @property
@@ -275,14 +267,12 @@ class Dictionaries(object):
         for d in itervalues(self.dictionaries):
             if d and getattr(d.primary_locale, 'langcode', None) == langcode:
                 for word in words:
-                    word = normalize_word(word)
                     d.obj.add(word)

     def remove_user_words(self, words, langcode):
         for d in itervalues(self.dictionaries):
             if d and d.primary_locale.langcode == langcode:
                 for word in words:
-                    word = normalize_word(word)
                     d.obj.remove(word)

     def add_to_user_dictionary(self, name, word, locale):
@@ -291,11 +281,9 @@ class Dictionaries(object):
             raise ValueError('Cannot add to the dictionary named: %s as no such dictionary exists' % name)
         wl = len(ud.words)
         if isinstance(word, (set, frozenset)):
-            word = frozenset(map(normalize_word, word))
             ud.words |= word
             self.add_user_words(word, locale.langcode)
         else:
-            word = normalize_word(word)
             ud.words.add((word, locale.langcode))
             self.add_user_words((word,), locale.langcode)
         if len(ud.words) > wl:
@@ -308,7 +296,6 @@ class Dictionaries(object):
         return False

     def remove_from_user_dictionaries(self, word, locale):
-        word = normalize_word(word)
         key = (word, locale.langcode)
         changed = False
         for ud in self.active_user_dictionaries:
@@ -324,7 +311,7 @@ class Dictionaries(object):
     def remove_from_user_dictionary(self, name, words):
         changed = False
         removals = defaultdict(set)
-        keys = [(normalize_word(w), l.langcode) for w, l in words]
+        keys = [(w, l.langcode) for w, l in words]
         for d in self.all_user_dictionaries:
             if d.name == name:
                 for key in keys:
@@ -341,7 +328,6 @@ class Dictionaries(object):
         return changed

     def word_in_user_dictionary(self, word, locale):
-        word = normalize_word(word)
         key = (word, locale.langcode)
         for ud in self.active_user_dictionaries:
             if key in ud.words:
@@ -377,7 +363,6 @@ class Dictionaries(object):
         return changed

     def recognized(self, word, locale=None):
-        word = normalize_word(word)
         locale = locale or self.default_locale
         key = (word, locale)
         ans = self.word_cache.get(key, None)
@@ -406,7 +391,6 @@ class Dictionaries(object):
         return ans

     def suggestions(self, word, locale=None):
-        word = normalize_word(word)
         locale = locale or self.default_locale
         d = self.dictionary_for_locale(locale)
         has_unicode_hyphen = '\u2010' in word
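With normalize_word gone, the dictionary layer stores, caches and checks words exactly as they are passed in; soft hyphens are now handled only by the editor-side wrapper shown in the first file. A hedged usage sketch of the affected entry point (dictionaries and parse_lang_code are the same objects imported in the first file; the locale handling here is an assumption inferred from the diff):

from calibre.gui2.tweak_book import dictionaries
from calibre.spell.dictionary import parse_lang_code

locale = parse_lang_code('en-US')
word = 'co\u00adoperate'  # contains a soft hyphen

# The dictionary layer now checks the word exactly as given ...
dictionaries.recognized(word, locale)

# ... so a caller that may see soft hyphens strips them first, mirroring
# what sanitizing_recognizer does for the live highlighter (assumption).
dictionaries.recognized(word.replace('\u00ad', ''), locale)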