Spell check: Fix 'Show next occurrence' sometimes showing the word in an incorrect location, for example in an attribute where spell check is not performed.

2025-07-09 03:04:10 -04:00 · 2014-04-19 18:33:10 +05:30 · 2014-04-19 18:33:10 +05:30 · b1a45f3147
commit b1a45f3147
parent 36c937c6ba
8 changed files with 163 additions and 56 deletions
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@ -83,28 +83,33 @@ def add_words_from_text(node, attr, words, file_name, locale):
 _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
 opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
 # We can only use barename() for tag names and simple attribute checks so that
 # this code matches up with the syntax highlighter-based spell checking
 def read_words_from_opf(root, words, file_name, book_locale):
-    for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
+    for tag in root.iterdescendants('*'):
-        tagname = barename(tag.tag)
+        if tag.text is not None and barename(tag.tag) in opf_spell_tags:
-        if not tag.text or tagname in {'identifier', 'language', 'date'}:
+            add_words_from_text(tag, 'text', words, file_name, book_locale)
            continue
        add_words_from_text(tag, 'text', words, file_name, book_locale)
        add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
 ncx_spell_tags = {'text'}
 xml_spell_tags = opf_spell_tags | ncx_spell_tags
 def read_words_from_ncx(root, words, file_name, book_locale):
    for tag in root.xpath('//*[local-name()="text"]'):
-        if not tag.text:
+        if tag.text is not None:
-            continue
+            add_words_from_text(tag, 'text', words, file_name, book_locale)
-        add_words_from_text(tag, 'text', words, file_name, book_locale)
+
 html_spell_tags = {'script', 'style', 'link'}
 def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
-    tagname = barename(tag.tag)
+    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
-    if tagname not in {'script', 'style', 'link', 'head'}:
+        add_words_from_text(tag, 'text', words, file_name, locale)
-        if tag.text is not None:
+    for attr in {'alt', 'title'}:
-            add_words_from_text(tag, 'text', words, file_name, locale)
+        add_words_from_attr(tag, attr, words, file_name, locale)
-        for attr in {'alt', 'title'}:
+    if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
            add_words_from_attr(tag, attr, words, file_name, locale)
    if tag.tail is not None:
        add_words_from_text(tag, 'tail', words, file_name, parent_locale)
 def locale_from_tag(tag):
--- a/src/calibre/gui2/tweak_book/editor/smart/init.py
+++ b/src/calibre/gui2/tweak_book/editor/smart/init.py
@ -17,3 +17,6 @@ class NullSmarts(object):
    def get_smart_selection(self, editor, update=True):
        # Default behaviour: hand back the editor's current selection as is.
        # The update argument is accepted but not used by this implementation.
        return editor.selected_text
    def verify_for_spellcheck(self, cursor, highlighter):
        # Default answer: never confirm the cursor location as one where
        # spelling is checked. Neither argument is inspected here.
        return False
--- a/src/calibre/gui2/tweak_book/editor/smart/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smart/html.py
@ -14,6 +14,7 @@ from PyQt4.Qt import QTextEdit
 from calibre import prepare_string_for_xml
 from calibre.gui2 import error_dialog
 from calibre.gui2.tweak_book.editor.syntax.html import ATTR_NAME, ATTR_END
 get_offset = itemgetter(0)
 PARAGRAPH_SEPARATOR = '\u2029'
@ -43,6 +44,20 @@ def next_tag_boundary(block, offset, forward=True):
        offset = -1 if forward else sys.maxint
    return None, None
 def next_attr_boundary(block, offset, forward=True):
    ''' Find the nearest attribute boundary at or after (forward=True) or at
    or before (forward=False) the position (block, offset). Blocks are walked
    in the chosen direction until one is no longer valid. Returns a
    (block, boundary) tuple, or (None, None) if no boundary was found. '''
    while block.isValid():
        user_data = block.userData()
        if user_data is not None:
            # Visit the boundaries of this block in the direction of travel
            for candidate in sorted(user_data.attributes, key=get_offset, reverse=not forward):
                if forward:
                    reached = candidate.offset >= offset
                else:
                    reached = candidate.offset <= offset
                if reached:
                    return block, candidate
        # Nothing suitable in this block, move on and reset the offset so that
        # every boundary in the neighbouring block qualifies
        if forward:
            block, offset = block.next(), -1
        else:
            block, offset = block.previous(), sys.maxint
    return None, None
 def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
    ''' Find the closest containing tag. To find it, we search for the first
    opening tag that does not have a matching closing tag before the specified
@ -79,6 +94,29 @@ def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
        max_tags -= 1
    return None  # Could not find a containing tag
 def find_tag_definition(block, offset):
    ''' Return the <tag | > definition, if any that (block, offset) is inside.

    The result is a (tag, closing) tuple. tag is the tag name, written as
    prefix:name when the tag start has both a prefix and a name, and closing
    is the closing flag recorded on the tag start. If the boundary found by
    searching backwards from (block, offset) is not a tag start, or no
    boundary exists at all, (None, False) is returned. '''
    block, boundary = next_tag_boundary(block, offset, forward=False)
    # next_tag_boundary() returns (None, None) when it runs out of blocks, so
    # check for that before reading any attribute of the boundary
    if boundary is None or not boundary.is_start:
        return None, False
    name, prefix = boundary.name, boundary.prefix
    tag = name or prefix
    if name and prefix:
        tag = prefix + ':' + name
    return tag, boundary.closing
 def find_containing_attribute(block, offset):
    ''' Return the name of the attribute whose value contains
    (block, offset), or None if the position is not inside an attribute
    value. '''
    block, boundary = next_attr_boundary(block, offset, forward=False)
    if block is None:
        return None
    outside_value = boundary.type is ATTR_NAME or boundary.data is ATTR_END
    if outside_value:
        return None  # offset is not inside an attribute value
    # We are inside a value, the boundary preceding the start of the value
    # should be the name of the attribute
    name_block, name_boundary = next_attr_boundary(block, boundary.offset - 1, forward=False)
    if name_block is not None and name_boundary.type == ATTR_NAME:
        return name_boundary.data
    return None
 def find_closing_tag(tag, max_tags=sys.maxint):
    ''' Find the closing tag corresponding to the specified tag. To find it we
    search for the first closing tag after the specified tag that does not
@ -241,3 +279,33 @@ class HTMLSmarts(NullSmarts):
        c.setPosition(pos + 1 + len(name))
        editor.setTextCursor(c)
    def verify_for_spellcheck(self, cursor, highlighter):
        # Return True iff the cursor is in a location where spelling is
        # checked (inside a tag or inside a checked attribute)
        #
        # cursor is expected to have the candidate word selected (anchor at
        # one end, position at the other). highlighter supplies
        # tag_ok_for_spell() and spell_attributes, which decide which tags and
        # attributes are checked.
        block = cursor.block()
        # Both ends of the selection, as offsets relative to cursor.block()
        start_pos = cursor.anchor() - block.position()
        end_pos = cursor.position() - block.position()
        # A selection that starts or ends inside a closing tag definition is
        # never spell checked
        start_tag, closing = find_tag_definition(block, start_pos)
        if closing:
            return False
        end_tag, closing = find_tag_definition(block, end_pos)
        if closing:
            return False
        if start_tag is None and end_tag is None:
            # We are in normal text, check that the containing tag is
            # allowed for spell checking.
            tag = find_closest_containing_tag(block, start_pos)
            if tag is not None and highlighter.tag_ok_for_spell(tag.name.split(':')[-1]):
                return True
            # Otherwise fall through to the attribute check below
        # The two ends of the selection must be in the same tag definition
        if start_tag != end_tag:
            return False
        # Now we check if we are in an allowed attribute
        sa = find_containing_attribute(block, start_pos)
        ea = find_containing_attribute(block, end_pos)
        # Both ends must lie in the value of the same, checked, attribute
        if sa == ea and sa in highlighter.spell_attributes:
            return True
        return False
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@ -31,6 +31,8 @@ class SyntaxHighlighter(QSyntaxHighlighter):
    state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
    create_formats_func = lambda highlighter: {}
    # Names of the attributes whose values are spell checked. None by default,
    # subclasses override this.
    spell_attributes = ()
    # Default predicate deciding whether text inside the tag called name is
    # spell checked: nothing is. It takes (self, name) because it is looked up
    # on the highlighter instance and called with the tag name, the same way as
    # the tag_ok_for_spell() methods defined by subclasses.
    tag_ok_for_spell = lambda self, name: False
    def __init__(self, *args, **kwargs):
        QSyntaxHighlighter.__init__(self, *args, **kwargs)
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -12,6 +12,7 @@ from collections import namedtuple
 from PyQt4.Qt import QFont, QTextBlockUserData
 from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
 from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat
 from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop
 from calibre.gui2.tweak_book.editor.syntax.css import create_formats as create_css_formats, state_map as css_state_map, State as CSSState
@ -46,6 +47,7 @@ CSS = 11
 TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
 TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
 Attr = namedtuple('Attr', 'offset type data')
 class Tag(object):
@ -76,13 +78,14 @@ class State(object):
    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
-                 'css_formats', 'stack', 'sub_parser_state', 'default_lang')
+                 'css_formats', 'stack', 'sub_parser_state', 'default_lang',
                 'attribute_name',)
    def __init__(self):
        self.tags = []
        self.is_bold = self.is_italic = False
        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
-            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
+            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = self.attribute_name = None
        self.parse = NORMAL
    def copy(self):
@ -101,13 +104,14 @@ class State(object):
        return self.stack.index_for(self)
    def __hash__(self):
-        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))
+        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, self.attribute_name, tuple(self.tags)))
    def __eq__(self, other):
        return (
            self.parse == getattr(other, 'parse', -1) and
            self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
            self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
            self.attribute_name == getattr(other, 'attribute_name', False) and
            self.tags == getattr(other, 'tags', None)
        )
@ -194,6 +198,7 @@ class HTMLUserData(QTextBlockUserData):
    def __init__(self):
        QTextBlockUserData.__init__(self)
        # Tag data for this block; starts out empty
        self.tags = []
        # Attribute boundary records for this block; add_attr_data() appends
        # Attr(offset, type, data) tuples to this list
        self.attributes = []
 def add_tag_data(state, tag):
    ud = q = state.get_user_data()
@ -203,6 +208,16 @@ def add_tag_data(state, tag):
    if q is None:
        state.set_user_data(ud)
 ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()
 def add_attr_data(state, data_type, data, offset):
    ''' Record an attribute boundary (offset, data_type, data) in the user
    data obtained from state, creating and storing that user data if none
    exists yet. '''
    existing = state.get_user_data()
    user_data = HTMLUserData() if existing is None else existing
    user_data.attributes.append(Attr(offset, data_type, data))
    if existing is None:
        # Only freshly created user data has to be stored back
        state.set_user_data(user_data)
 def css(state, text, i, formats):
    ' Inside a <style> tag '
    pat = cdata_close_pats['style']
@ -320,7 +335,9 @@ def opening_tag(cdata_tags, state, text, i, formats):
    if m is None:
        return [(1, formats['?'])]
    state.parse = ATTRIBUTE_NAME
-    prefix, name = m.group().partition(':')[0::2]
+    attrname = state.attribute_name = m.group()
    add_attr_data(state, ATTR_NAME, attrname, m.start())
    prefix, name = attrname.partition(':')[0::2]
    if prefix and name:
        return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
    return [(len(prefix), formats['attr'])]
@ -333,11 +350,9 @@ def attribute_name(state, text, i, formats):
    if ch == '=':
        state.parse = ATTRIBUTE_VALUE
        return [(1, formats['attr'])]
    # Standalone attribute with no value
    state.parse = IN_OPENING_TAG
-    if ch in {'>', '/'}:
+    return [(0, None)]
        # Standalone attribute with no value
        return [(0, None)]
    return [(1, formats['no-attr-value'])]
 def attribute_value(state, text, i, formats):
    ' After attribute = '
@ -356,12 +371,14 @@ def attribute_value(state, text, i, formats):
 def quoted_val(state, text, i, formats):
    ' A quoted attribute value '
    quote = "'" if state.parse is not DQ_VAL else '"'
    # Mark the position at which (this part of) the value starts
    add_attr_data(state, ATTR_VALUE, ATTR_START, i)
    pos = text.find(quote, i)
    if pos == -1:
        # No terminating quote in this text, the rest of it is all value
        return [(len(text) - i, formats['string'])]
    num = pos - i + 1
    state.parse = IN_OPENING_TAG
    # Mark the position just past the terminating quote
    add_attr_data(state, ATTR_VALUE, ATTR_END, i + num)
    return [(num, formats['string'])]
 def closing_tag(state, text, i, formats):
@ -447,6 +464,7 @@ class HTMLHighlighter(SyntaxHighlighter):
    state_map = state_map
    create_formats_func = create_formats
    spell_attributes = ('alt', 'title')
    def create_formats(self):
        super(HTMLHighlighter, self).create_formats()
@ -460,9 +478,16 @@ class HTMLHighlighter(SyntaxHighlighter):
        ans = self.default_state.stack.state_for(val) or self.default_state
        return ans.copy()
    def tag_ok_for_spell(self, name):
        # html_spell_tags lists the tags that are excluded from spell
        # checking, so any tag name not in it is OK
        return name not in html_spell_tags
 class XMLHighlighter(HTMLHighlighter):
    # Variant of HTMLHighlighter that uses xml_state_map and its own spell
    # check rules
    state_map = xml_state_map
    # The only attribute whose value is spell checked
    spell_attributes = ('opf:file-as',)
    def tag_ok_for_spell(self, name):
        # Unlike the HTML case this is a whitelist: only tags listed in
        # xml_spell_tags are spell checked
        return name in xml_spell_tags
 if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
--- a/src/calibre/gui2/tweak_book/editor/text.py
+++ b/src/calibre/gui2/tweak_book/editor/text.py
@ -378,28 +378,35 @@ class TextEdit(PlainTextEdit):
            self.saved_matches[save_match] = (pat, m)
        return True
-    def find_word_from_line(self, word, lang, lnum, from_cursor=True):
+    def find_spell_word(self, original_words, lang, from_cursor=True):
        c = self.textCursor()
        c.setPosition(c.position())
-        if not from_cursor or c.blockNumber() != lnum - 1:
+        if not from_cursor:
            lnum = max(1, min(self.blockCount(), lnum))
            c.movePosition(c.Start)
-            c.movePosition(c.NextBlock, n=lnum - 1)
+        c.movePosition(c.End, c.KeepAnchor)
-            c.movePosition(c.StartOfLine)
+
-            offset = c.block().position()
+        def find_word(haystack):
            for w in original_words:
                idx = index_of(w, haystack, lang=lang)
                if idx > -1:
                    return idx, w
            return -1, None
        while True:
            text = unicode(c.selectedText()).rstrip('\0')
            idx, word = find_word(text)
            if idx == -1:
                return False
            c.setPosition(c.anchor() + idx)
            c.setPosition(c.position() + string_length(word), c.KeepAnchor)
            if self.smarts.verify_for_spellcheck(c, self.highlighter):
                self.setTextCursor(c)
                self.centerCursor()
                return True
            c.setPosition(c.position())
            c.movePosition(c.End, c.KeepAnchor)
-        else:
+
-            offset = c.block().position() + c.positionInBlock()
+        return False
            c.movePosition(c.End, c.KeepAnchor)
        text = unicode(c.selectedText()).rstrip('\0')
        idx = index_of(word, text, lang=lang)
        if idx == -1:
            return False
        c.setPosition(offset + idx)
        c.setPosition(c.position() + string_length(word), c.KeepAnchor)
        self.setTextCursor(c)
        self.centerCursor()
        return True
    def replace(self, pat, template, saved_match='gui'):
        c = self.textCursor()
--- a/src/calibre/gui2/tweak_book/editor/widget.py
+++ b/src/calibre/gui2/tweak_book/editor/widget.py
@ -189,8 +189,8 @@ class Editor(QMainWindow):
    def find(self, *args, **kwargs):
        # Thin wrapper: pass all arguments through to self.editor.find() and
        # return its result
        return self.editor.find(*args, **kwargs)
-    def find_word_from_line(self, *args, **kwargs):
+    def find_spell_word(self, *args, **kwargs):
-        return self.editor.find_word_from_line(*args, **kwargs)
+        return self.editor.find_spell_word(*args, **kwargs)
    def replace(self, *args, **kwargs):
        # Thin wrapper: pass all arguments through to self.editor.replace()
        # and return its result
        return self.editor.replace(*args, **kwargs)
--- a/src/calibre/gui2/tweak_book/spell.py
+++ b/src/calibre/gui2/tweak_book/spell.py
@ -1036,10 +1036,10 @@ def find_next(word, locations, current_editor, current_editor_name,
            files[l.file_name].append(l)
        except KeyError:
            files[l.file_name] = [l]
    start_locations = set()
    if current_editor_name not in files:
-        current_editor = current_editor_name = None
+        current_editor_name = None
        locations = [(fname, {l.original_word for l in _locations}, False) for fname, _locations in files.iteritems()]
    else:
        # Re-order the list of locations to search so that we search in the
        # current editor first
@ -1047,20 +1047,17 @@ def find_next(word, locations, current_editor, current_editor_name,
        idx = lfiles.index(current_editor_name)
        before, after = lfiles[:idx], lfiles[idx+1:]
        lfiles = after + before + [current_editor_name]
-        lnum = current_editor.current_line + 1
+        locations = [(current_editor_name, {l.original_word for l in files[current_editor_name]}, True)]
        start_locations = [l for l in files[current_editor_name] if l.sourceline >= lnum]
        locations = list(start_locations)
        for fname in lfiles:
-            locations.extend(files[fname])
+            locations.append((fname, {l.original_word for l in files[fname]}, False))
        start_locations = set(start_locations)
-    for location in locations:
+    for file_name, original_words, from_cursor in locations:
-        ed = editors.get(location.file_name, None)
+        ed = editors.get(file_name, None)
        if ed is None:
-            edit_file(location.file_name)
+            edit_file(file_name)
-            ed = editors[location.file_name]
+            ed = editors[file_name]
-        if ed.find_word_from_line(location.original_word, word[1].langcode, location.sourceline, from_cursor=location in start_locations):
+        if ed.find_spell_word(original_words, word[1].langcode, from_cursor=from_cursor):
-            show_editor(location.file_name)
+            show_editor(file_name)
            return True
    return False