From 16a5c262ebd93a097831551ea8df6fe399b76c70 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 21 Apr 2014 12:16:19 +0530
Subject: [PATCH] Spellcheck: When checking French text, list all words that
 have the same stem as a single word, so that j'aime and aime are listed as
 a single word.

---
 src/calibre/ebooks/oeb/polish/spell.py | 31 +++++++++++++++++++-------
 src/calibre/gui2/tweak_book/spell.py   |  4 +++-
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py
index 811d405dd0..f599f55579 100644
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@@ -19,7 +19,7 @@ _patterns = None
 
 class Patterns(object):
 
-    __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat')
+    __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat', 'fr_elision_pat')
 
     def __init__(self):
         import regex
@@ -30,6 +30,10 @@ class Patterns(object):
             r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE)
         self.digit_pat = regex.compile(
             r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE)
+        # French words with elided prefixes (l', d', qu', ...) are reduced to the
+        # stem word, so that each word appears only once in the word list
+        self.fr_elision_pat = regex.compile(
+            u"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)
 
 def patterns():
     global _patterns
@@ -39,16 +43,19 @@ def patterns():
 
 class Location(object):
 
-    __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item')
+    __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')
 
-    def __init__(self, file_name=None, sourceline=None, original_word=None, location_node=None, node_item=(None, None)):
-        self.file_name, self.sourceline, self.original_word = file_name, sourceline, original_word
-        self.location_node, self.node_item = location_node, node_item
+    def __init__(self, file_name=None, elided_prefix='', original_word=None, location_node=None, node_item=(None, None)):
+        self.file_name, self.elided_prefix, self.original_word = file_name, elided_prefix, original_word
+        self.location_node, self.node_item, self.sourceline = location_node, node_item, getattr(location_node, 'sourceline', None)
 
     def __repr__(self):
-        return '%s:%s' % (self.file_name, self.sourceline)
+        return '%s @ %s:%s' % (self.original_word, self.file_name, self.sourceline)
     __str__ = __repr__
 
+    def replace(self, new_word):
+        self.original_word = self.elided_prefix + new_word
+
 def filter_words(word):
     if not word:
         return False
@@ -68,9 +75,16 @@ def add_words(text, node, words, file_name, locale, node_item):
     candidates = get_words(text, locale.langcode)
     if candidates:
         p = patterns()
+        is_fr = locale.langcode == 'fra'
         for word in candidates:
             sword = p.sanitize_invisible_pat.sub('', word)
-            loc = Location(file_name, node.sourceline, word, node, node_item)
+            elided_prefix = ''
+            m = p.fr_elision_pat.match(sword) if is_fr else None
+            # Only strip the prefix when something remains after it (not a bare "l'")
+            if m is not None and len(sword) > m.end():
+                elided_prefix = m.group()
+                sword = sword[m.end():]
+            loc = Location(file_name, elided_prefix, word, node, node_item)
             words[(sword, locale)].append(loc)
 
 def add_words_from_attr(node, attr, words, file_name, locale):
@@ -184,7 +198,8 @@ def replace_word(container, new_word, locations, locale):
             text = node.get(attr)
         else:
             text = getattr(node, attr)
-        text, replaced = replace(text, loc.original_word, new_word, locale.langcode)
+        replacement = loc.elided_prefix + new_word
+        text, replaced = replace(text, loc.original_word, replacement, locale.langcode)
         if replaced:
             if is_attr:
                 node.set(attr, text)
diff --git a/src/calibre/gui2/tweak_book/spell.py b/src/calibre/gui2/tweak_book/spell.py
index 153236185b..f25575a04f 100644
--- a/src/calibre/gui2/tweak_book/spell.py
+++ b/src/calibre/gui2/tweak_book/spell.py
@@ -702,7 +702,7 @@ class WordsModel(QAbstractTableModel):
 
     def replace_word(self, w, new_word):
         for location in self.words[w]:
-            location.original_word = new_word
+            location.replace(new_word)
         if w[0] == new_word:
             return w
         new_key = (new_word, w[1])
@@ -1164,6 +1164,7 @@ class SpellCheck(Dialog):
         d.exec_()
 # }}}
 
+# Find next occurrence  {{{
 def find_next(word, locations, current_editor, current_editor_name,
               gui_parent, show_editor, edit_file):
     files = OrderedDict()
@@ -1196,6 +1197,7 @@ def find_next(word, locations, current_editor, current_editor_name,
             show_editor(file_name)
             return True
     return False
+# }}}
 
 if __name__ == '__main__':
     app = QApplication([])