Spell check: Fix 'Show next occurrence' sometimes showing the word in an incorrect location, for example in an attribute where spell check is not performed.

2025-07-09 03:04:10 -04:00 · 2014-04-19 18:33:10 +05:30 · 2014-04-19 18:33:10 +05:30 · b1a45f3147
commit b1a45f3147
parent 36c937c6ba
8 changed files with 163 additions and 56 deletions
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@ -83,28 +83,33 @@ def add_words_from_text(node, attr, words, file_name, locale):

 _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']

+opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
+
+# We can only use barename() for tag names and simple attribute checks so that
+# this code matches up with the syntax highlighter based spell checking.
+
 def read_words_from_opf(root, words, file_name, book_locale):
-    for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
-        tagname = barename(tag.tag)
-        if not tag.text or tagname in {'identifier', 'language', 'date'}:
-            continue
-        add_words_from_text(tag, 'text', words, file_name, book_locale)
+    for tag in root.iterdescendants('*'):
+        if tag.text is not None and barename(tag.tag) in opf_spell_tags:
+            add_words_from_text(tag, 'text', words, file_name, book_locale)
        add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)

+ncx_spell_tags = {'text'}
+xml_spell_tags = opf_spell_tags | ncx_spell_tags
+
 def read_words_from_ncx(root, words, file_name, book_locale):
    for tag in root.xpath('//*[local-name()="text"]'):
-        if not tag.text:
-            continue
-        add_words_from_text(tag, 'text', words, file_name, book_locale)
+        if tag.text is not None:
+            add_words_from_text(tag, 'text', words, file_name, book_locale)
+
+html_spell_tags = {'script', 'style', 'link'}

 def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
-    tagname = barename(tag.tag)
-    if tagname not in {'script', 'style', 'link', 'head'}:
-        if tag.text is not None:
-            add_words_from_text(tag, 'text', words, file_name, locale)
-        for attr in {'alt', 'title'}:
-            add_words_from_attr(tag, attr, words, file_name, locale)
-    if tag.tail is not None:
+    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
+        add_words_from_text(tag, 'text', words, file_name, locale)
+    for attr in {'alt', 'title'}:
+        add_words_from_attr(tag, attr, words, file_name, locale)
+    if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
        add_words_from_text(tag, 'tail', words, file_name, parent_locale)

 def locale_from_tag(tag):
--- a/src/calibre/gui2/tweak_book/editor/smart/init.py
+++ b/src/calibre/gui2/tweak_book/editor/smart/init.py
@ -17,3 +17,6 @@ class NullSmarts(object):
    def get_smart_selection(self, editor, update=True):
        return editor.selected_text

+    def verify_for_spellcheck(self, cursor, highlighter):
+        ''' Return True iff the cursor is at a location where spelling is
+        checked. This base implementation ignores both arguments and always
+        returns False. '''
+        return False
+
--- a/src/calibre/gui2/tweak_book/editor/smart/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smart/html.py
@ -14,6 +14,7 @@ from PyQt4.Qt import QTextEdit

 from calibre import prepare_string_for_xml
 from calibre.gui2 import error_dialog
+from calibre.gui2.tweak_book.editor.syntax.html import ATTR_NAME, ATTR_END

 get_offset = itemgetter(0)
 PARAGRAPH_SEPARATOR = '\u2029'
@ -43,6 +44,20 @@ def next_tag_boundary(block, offset, forward=True):
        offset = -1 if forward else sys.maxint
    return None, None

+def next_attr_boundary(block, offset, forward=True):
+    ''' Find the nearest attribute boundary at or after (forward=True), or at
+    or before (forward=False), the position (block, offset). Blocks are
+    walked in the search direction until a boundary is found or there are no
+    more valid blocks. Returns (block, boundary), or (None, None) if no
+    boundary is found. '''
+    while block.isValid():
+        user_data = block.userData()
+        if user_data is not None:
+            candidates = sorted(user_data.attributes, key=get_offset, reverse=not forward)
+            for attr in candidates:
+                matches = attr.offset >= offset if forward else attr.offset <= offset
+                if matches:
+                    return block, attr
+        if forward:
+            # Any boundary in the following block qualifies
+            block, offset = block.next(), -1
+        else:
+            # Any boundary in the preceding block qualifies
+            block, offset = block.previous(), sys.maxint
+    return None, None
+
 def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
    ''' Find the closest containing tag. To find it, we search for the first
    opening tag that does not have a matching closing tag before the specified
@ -79,6 +94,29 @@ def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
        max_tags -= 1
    return None  # Could not find a containing tag

+def find_tag_definition(block, offset):
+    ''' Return the <tag | > definition, if any, that (block, offset) is inside,
+    as a (tag, closing) tuple. tag is the tag name, written as prefix:name
+    when the tag has both a prefix and a name, and closing is the closing
+    flag of the tag's start boundary. If the position is not inside a tag
+    definition, (None, False) is returned. '''
+    block, boundary = next_tag_boundary(block, offset, forward=False)
+    # next_tag_boundary() returns (None, None) when it finds no boundary, so
+    # boundary must be checked before its fields are accessed
+    if boundary is None or not boundary.is_start:
+        return None, False
+    tag = boundary.name or boundary.prefix
+    if boundary.name and boundary.prefix:
+        tag = boundary.prefix + ':' + tag
+    return tag, boundary.closing
+
+def find_containing_attribute(block, offset):
+    ''' Return the name of the attribute whose value contains the position
+    (block, offset), or None if the position is not inside an attribute
+    value. '''
+    block, nearest = next_attr_boundary(block, offset, forward=False)
+    if block is None:
+        return None
+    inside_value = nearest.type is not ATTR_NAME and nearest.data is not ATTR_END
+    if not inside_value:
+        return None
+    # The boundary preceding the start of the value should be the name
+    block, name_boundary = next_attr_boundary(block, nearest.offset - 1, forward=False)
+    if block is None or name_boundary.type != ATTR_NAME:
+        return None
+    return name_boundary.data
+
 def find_closing_tag(tag, max_tags=sys.maxint):
    ''' Find the closing tag corresponding to the specified tag. To find it we
    search for the first closing tag after the specified tag that does not
@ -241,3 +279,33 @@ class HTMLSmarts(NullSmarts):
        c.setPosition(pos + 1 + len(name))
        editor.setTextCursor(c)

+    def verify_for_spellcheck(self, cursor, highlighter):
+        ''' Return True iff the cursor's selection is in a location where
+        spelling is checked: text inside a tag the highlighter accepts, or the
+        value of one of the highlighter's spell_attributes. '''
+        block = cursor.block()
+        # Offsets of the two ends of the selection, both measured from the
+        # start of the block returned by cursor.block()
+        start_pos = cursor.anchor() - block.position()
+        end_pos = cursor.position() - block.position()
+        # Reject the selection if either end is inside the definition of a
+        # closing tag
+        start_tag, closing = find_tag_definition(block, start_pos)
+        if closing:
+            return False
+        end_tag, closing = find_tag_definition(block, end_pos)
+        if closing:
+            return False
+        if start_tag is None and end_tag is None:
+            # We are in normal text, check that the containing tag is
+            # allowed for spell checking. If it is not, we fall through to
+            # the attribute check below.
+            tag = find_closest_containing_tag(block, start_pos)
+            if tag is not None and highlighter.tag_ok_for_spell(tag.name.split(':')[-1]):
+                return True
+        if start_tag != end_tag:
+            # The two ends are not in the same tag definition (compared by
+            # tag name)
+            return False
+
+        # Now we check if we are in an allowed attribute: both ends must be
+        # inside the value of the same attribute
+        sa = find_containing_attribute(block, start_pos)
+        ea = find_containing_attribute(block, end_pos)
+
+        if sa == ea and sa in highlighter.spell_attributes:
+            return True
+
+        return False
+
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@ -31,6 +31,8 @@ class SyntaxHighlighter(QSyntaxHighlighter):

    state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
    create_formats_func = lambda highlighter: {}
+    spell_attributes = ()
+    # Called as a method, highlighter.tag_ok_for_spell(name), so the lambda
+    # must accept the instance as well as the tag name. By default no tag is
+    # spell checked.
+    tag_ok_for_spell = lambda self, name: False

    def __init__(self, *args, **kwargs):
        QSyntaxHighlighter.__init__(self, *args, **kwargs)
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -12,6 +12,7 @@ from collections import namedtuple

 from PyQt4.Qt import QFont, QTextBlockUserData

+from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
 from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat
 from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop
 from calibre.gui2.tweak_book.editor.syntax.css import create_formats as create_css_formats, state_map as css_state_map, State as CSSState
@ -46,6 +47,7 @@ CSS = 11

 TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
 TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
+Attr = namedtuple('Attr', 'offset type data')

 class Tag(object):

@ -76,13 +78,14 @@ class State(object):

    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
-                 'css_formats', 'stack', 'sub_parser_state', 'default_lang')
+                 'css_formats', 'stack', 'sub_parser_state', 'default_lang',
+                 'attribute_name',)

    def __init__(self):
        self.tags = []
        self.is_bold = self.is_italic = False
        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
-            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
+            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = self.attribute_name = None
        self.parse = NORMAL

    def copy(self):
@ -101,13 +104,14 @@ class State(object):
        return self.stack.index_for(self)

    def __hash__(self):
-        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))
+        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, self.attribute_name, tuple(self.tags)))

    def __eq__(self, other):
        return (
            self.parse == getattr(other, 'parse', -1) and
            self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
            self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
+            self.attribute_name == getattr(other, 'attribute_name', False) and
            self.tags == getattr(other, 'tags', None)
        )

@ -194,6 +198,7 @@ class HTMLUserData(QTextBlockUserData):
    def __init__(self):
        QTextBlockUserData.__init__(self)
        self.tags = []
+        self.attributes = []

 def add_tag_data(state, tag):
    ud = q = state.get_user_data()
@ -203,6 +208,16 @@ def add_tag_data(state, tag):
    if q is None:
        state.set_user_data(ud)

+# Sentinels stored in Attr tuples. ATTR_NAME and ATTR_VALUE are used as the
+# type of a boundary; ATTR_START and ATTR_END are used as the data of an
+# ATTR_VALUE boundary, marking where a quoted value starts and ends.
+ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()
+
+def add_attr_data(state, data_type, data, offset):
+    ''' Append an Attr(offset, data_type, data) boundary to the user data
+    obtained from state. If state has no user data yet, a new HTMLUserData is
+    created, filled in and stored via state.set_user_data(). '''
+    user_data = state.get_user_data()
+    if user_data is not None:
+        user_data.attributes.append(Attr(offset, data_type, data))
+        return
+    user_data = HTMLUserData()
+    user_data.attributes.append(Attr(offset, data_type, data))
+    state.set_user_data(user_data)
+
 def css(state, text, i, formats):
    ' Inside a <style> tag '
    pat = cdata_close_pats['style']
@ -320,7 +335,9 @@ def opening_tag(cdata_tags, state, text, i, formats):
    if m is None:
        return [(1, formats['?'])]
    state.parse = ATTRIBUTE_NAME
-    prefix, name = m.group().partition(':')[0::2]
+    attrname = state.attribute_name = m.group()
+    add_attr_data(state, ATTR_NAME, attrname, m.start())
+    prefix, name = attrname.partition(':')[0::2]
    if prefix and name:
        return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
    return [(len(prefix), formats['attr'])]
@ -333,11 +350,9 @@ def attribute_name(state, text, i, formats):
    if ch == '=':
        state.parse = ATTRIBUTE_VALUE
        return [(1, formats['attr'])]
+    # Standalone attribute with no value
    state.parse = IN_OPENING_TAG
-    if ch in {'>', '/'}:
-        # Standalone attribute with no value
-        return [(0, None)]
-    return [(1, formats['no-attr-value'])]
+    return [(0, None)]

 def attribute_value(state, text, i, formats):
    ' After attribute = '
@ -356,12 +371,14 @@ def attribute_value(state, text, i, formats):
 def quoted_val(state, text, i, formats):
    ' A quoted attribute value '
    quote = '"' if state.parse is DQ_VAL else "'"
+    add_attr_data(state, ATTR_VALUE, ATTR_START, i)
    pos = text.find(quote, i)
    if pos == -1:
        num = len(text) - i
    else:
        num = pos - i + 1
        state.parse = IN_OPENING_TAG
+        add_attr_data(state, ATTR_VALUE, ATTR_END, i + num)
    return [(num, formats['string'])]

 def closing_tag(state, text, i, formats):
@ -447,6 +464,7 @@ class HTMLHighlighter(SyntaxHighlighter):

    state_map = state_map
    create_formats_func = create_formats
+    spell_attributes = ('alt', 'title')

    def create_formats(self):
        super(HTMLHighlighter, self).create_formats()
@ -460,9 +478,16 @@ class HTMLHighlighter(SyntaxHighlighter):
        ans = self.default_state.stack.state_for(val) or self.default_state
        return ans.copy()

+    def tag_ok_for_spell(self, name):
+        ''' Return True iff text inside the tag called name is spell checked.
+        Despite its name, html_spell_tags holds the tags whose text is
+        excluded from spell checking, so any tag not in it is OK. '''
+        return name not in html_spell_tags
+
 class XMLHighlighter(HTMLHighlighter):

    state_map = xml_state_map
+    spell_attributes = ('opf:file-as',)
+
+    def tag_ok_for_spell(self, name):
+        ''' Return True iff text inside the tag called name is spell checked.
+        Only the tags listed in xml_spell_tags are OK. '''
+        return name in xml_spell_tags

 if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
--- a/src/calibre/gui2/tweak_book/editor/text.py
+++ b/src/calibre/gui2/tweak_book/editor/text.py
@ -378,28 +378,35 @@ class TextEdit(PlainTextEdit):
            self.saved_matches[save_match] = (pat, m)
        return True

-    def find_word_from_line(self, word, lang, lnum, from_cursor=True):
+    def find_spell_word(self, original_words, lang, from_cursor=True):
        c = self.textCursor()
        c.setPosition(c.position())
-        if not from_cursor or c.blockNumber() != lnum - 1:
-            lnum = max(1, min(self.blockCount(), lnum))
+        if not from_cursor:
            c.movePosition(c.Start)
-            c.movePosition(c.NextBlock, n=lnum - 1)
-            c.movePosition(c.StartOfLine)
-            offset = c.block().position()
+        c.movePosition(c.End, c.KeepAnchor)
+
+        def find_word(haystack):
+            for w in original_words:
+                idx = index_of(w, haystack, lang=lang)
+                if idx > -1:
+                    return idx, w
+            return -1, None
+
+        while True:
+            text = unicode(c.selectedText()).rstrip('\0')
+            idx, word = find_word(text)
+            if idx == -1:
+                return False
+            c.setPosition(c.anchor() + idx)
+            c.setPosition(c.position() + string_length(word), c.KeepAnchor)
+            if self.smarts.verify_for_spellcheck(c, self.highlighter):
+                self.setTextCursor(c)
+                self.centerCursor()
+                return True
+            c.setPosition(c.position())
            c.movePosition(c.End, c.KeepAnchor)
-        else:
-            offset = c.block().position() + c.positionInBlock()
-            c.movePosition(c.End, c.KeepAnchor)
-        text = unicode(c.selectedText()).rstrip('\0')
-        idx = index_of(word, text, lang=lang)
-        if idx == -1:
-            return False
-        c.setPosition(offset + idx)
-        c.setPosition(c.position() + string_length(word), c.KeepAnchor)
-        self.setTextCursor(c)
-        self.centerCursor()
-        return True
+
+        return False

    def replace(self, pat, template, saved_match='gui'):
        c = self.textCursor()
--- a/src/calibre/gui2/tweak_book/editor/widget.py
+++ b/src/calibre/gui2/tweak_book/editor/widget.py
@ -189,8 +189,8 @@ class Editor(QMainWindow):
    def find(self, *args, **kwargs):
        return self.editor.find(*args, **kwargs)

-    def find_word_from_line(self, *args, **kwargs):
-        return self.editor.find_word_from_line(*args, **kwargs)
+    def find_spell_word(self, *args, **kwargs):
+        return self.editor.find_spell_word(*args, **kwargs)

    def replace(self, *args, **kwargs):
        return self.editor.replace(*args, **kwargs)
--- a/src/calibre/gui2/tweak_book/spell.py
+++ b/src/calibre/gui2/tweak_book/spell.py
@ -1036,10 +1036,10 @@ def find_next(word, locations, current_editor, current_editor_name,
            files[l.file_name].append(l)
        except KeyError:
            files[l.file_name] = [l]
-    start_locations = set()

    if current_editor_name not in files:
-        current_editor = current_editor_name = None
+        current_editor_name = None
+        locations = [(fname, {l.original_word for l in _locations}, False) for fname, _locations in files.iteritems()]
    else:
        # Re-order the list of locations to search so that we search in the
        # current editor first
@ -1047,20 +1047,17 @@ def find_next(word, locations, current_editor, current_editor_name,
        idx = lfiles.index(current_editor_name)
        before, after = lfiles[:idx], lfiles[idx+1:]
        lfiles = after + before + [current_editor_name]
-        lnum = current_editor.current_line + 1
-        start_locations = [l for l in files[current_editor_name] if l.sourceline >= lnum]
-        locations = list(start_locations)
+        locations = [(current_editor_name, {l.original_word for l in files[current_editor_name]}, True)]
        for fname in lfiles:
-            locations.extend(files[fname])
-        start_locations = set(start_locations)
+            locations.append((fname, {l.original_word for l in files[fname]}, False))

-    for location in locations:
-        ed = editors.get(location.file_name, None)
+    for file_name, original_words, from_cursor in locations:
+        ed = editors.get(file_name, None)
        if ed is None:
-            edit_file(location.file_name)
-            ed = editors[location.file_name]
-        if ed.find_word_from_line(location.original_word, word[1].langcode, location.sourceline, from_cursor=location in start_locations):
-            show_editor(location.file_name)
+            edit_file(file_name)
+            ed = editors[file_name]
+        if ed.find_spell_word(original_words, word[1].langcode, from_cursor=from_cursor):
+            show_editor(file_name)
            return True
    return False