From 16a5c262ebd93a097831551ea8df6fe399b76c70 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Apr 2014 12:16:19 +0530 Subject: [PATCH] Spellcheck: When checking French text list all words that have the same stem as a single word. So j'aime and aime are listed as a single word. --- src/calibre/ebooks/oeb/polish/spell.py | 31 +++++++++++++++++++------- src/calibre/gui2/tweak_book/spell.py | 4 +++- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py index 811d405dd0..f599f55579 100644 --- a/src/calibre/ebooks/oeb/polish/spell.py +++ b/src/calibre/ebooks/oeb/polish/spell.py @@ -19,7 +19,7 @@ _patterns = None class Patterns(object): - __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat') + __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat', 'fr_elision_pat') def __init__(self): import regex @@ -30,6 +30,10 @@ class Patterns(object): r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE) self.digit_pat = regex.compile( r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE) + # French words with prefixes are reduced to the stem word, so that the + # words appear only once in the word list + self.fr_elision_pat = regex.compile( + u"^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE) def patterns(): global _patterns @@ -39,16 +43,19 @@ def patterns(): class Location(object): - __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item') + __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix') - def __init__(self, file_name=None, sourceline=None, original_word=None, location_node=None, node_item=(None, None)): - self.file_name, self.sourceline, self.original_word = file_name, sourceline, original_word - self.location_node, self.node_item = location_node, node_item + def __init__(self, file_name=None, elided_prefix='', original_word=None, location_node=None, node_item=(None, None)): + self.file_name, self.elided_prefix, self.original_word = file_name, elided_prefix, original_word + self.location_node, self.node_item, self.sourceline = location_node, node_item, location_node.sourceline def __repr__(self): - return '%s:%s' % (self.file_name, self.sourceline) + return '%s @ %s:%s' % (self.original_word, self.file_name, self.sourceline) __str__ = __repr__ + def replace(self, new_word): + self.original_word = self.elided_prefix + new_word + def filter_words(word): if not word: return False @@ -68,9 +75,16 @@ def add_words(text, node, words, file_name, locale, node_item): candidates = get_words(text, locale.langcode) if candidates: p = patterns() + is_fr = locale.langcode == 'fra' for word in candidates: sword = p.sanitize_invisible_pat.sub('', word) - loc = Location(file_name, node.sourceline, word, node, node_item) + elided_prefix = '' + if is_fr: + m = p.fr_elision_pat.match(sword) + if m is not None and len(sword) > len(elided_prefix): + elided_prefix = m.group() + sword = sword[len(elided_prefix):] + loc = Location(file_name, elided_prefix, word, node, node_item) words[(sword, locale)].append(loc) def add_words_from_attr(node, attr, words, file_name, locale): @@ -184,7 +198,8 @@ def replace_word(container, new_word, locations, locale): text = node.get(attr) else: text = getattr(node, attr) - text, replaced = replace(text, loc.original_word, new_word, locale.langcode) + replacement = loc.elided_prefix + new_word + text, replaced = replace(text, loc.original_word, replacement, locale.langcode) if replaced: if is_attr: node.set(attr, text) diff --git a/src/calibre/gui2/tweak_book/spell.py b/src/calibre/gui2/tweak_book/spell.py index 153236185b..f25575a04f 100644 --- a/src/calibre/gui2/tweak_book/spell.py +++ b/src/calibre/gui2/tweak_book/spell.py @@ -702,7 +702,7 @@ class WordsModel(QAbstractTableModel): def replace_word(self, w, new_word): for location in self.words[w]: - location.original_word = new_word + location.replace(new_word) if w[0] == new_word: return w new_key = (new_word, w[1]) @@ -1164,6 +1164,7 @@ class SpellCheck(Dialog): d.exec_() # }}} +# Find next occurrence {{{ def find_next(word, locations, current_editor, current_editor_name, gui_parent, show_editor, edit_file): files = OrderedDict() @@ -1196,6 +1197,7 @@ def find_next(word, locations, current_editor, current_editor_name, show_editor(file_name) return True return False +# }}} if __name__ == '__main__': app = QApplication([])