Spell check: Fix 'Show next occurrence' sometimes showing the word in an incorrect location, for example in an attribute where spell check is not performed.

This commit is contained in:
Kovid Goyal 2014-04-19 18:33:10 +05:30
parent 36c937c6ba
commit b1a45f3147
8 changed files with 163 additions and 56 deletions

View File

@ -83,28 +83,33 @@ def add_words_from_text(node, attr, words, file_name, locale):
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
# We can only use barename() for tag names and simple attribute checks so that
# this code matches up with the syntax highlighter based spell checking
def read_words_from_opf(root, words, file_name, book_locale):
for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
tagname = barename(tag.tag)
if not tag.text or tagname in {'identifier', 'language', 'date'}:
continue
add_words_from_text(tag, 'text', words, file_name, book_locale)
for tag in root.iterdescendants('*'):
if tag.text is not None and barename(tag.tag) in opf_spell_tags:
add_words_from_text(tag, 'text', words, file_name, book_locale)
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
ncx_spell_tags = {'text'}
xml_spell_tags = opf_spell_tags | ncx_spell_tags
def read_words_from_ncx(root, words, file_name, book_locale):
for tag in root.xpath('//*[local-name()="text"]'):
if not tag.text:
continue
add_words_from_text(tag, 'text', words, file_name, book_locale)
if tag.text is not None:
add_words_from_text(tag, 'text', words, file_name, book_locale)
html_spell_tags = {'script', 'style', 'link'}
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
tagname = barename(tag.tag)
if tagname not in {'script', 'style', 'link', 'head'}:
if tag.text is not None:
add_words_from_text(tag, 'text', words, file_name, locale)
for attr in {'alt', 'title'}:
add_words_from_attr(tag, attr, words, file_name, locale)
if tag.tail is not None:
if tag.text is not None and barename(tag.tag) not in html_spell_tags:
add_words_from_text(tag, 'text', words, file_name, locale)
for attr in {'alt', 'title'}:
add_words_from_attr(tag, attr, words, file_name, locale)
if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
add_words_from_text(tag, 'tail', words, file_name, parent_locale)
def locale_from_tag(tag):

View File

@ -17,3 +17,6 @@ class NullSmarts(object):
def get_smart_selection(self, editor, update=True):
return editor.selected_text
def verify_for_spellcheck(self, cursor, highlighter):
    ''' Report whether the text selected by cursor is at a location where
    spelling is checked. This default implementation ignores both arguments
    and always answers False. '''
    return False

View File

@ -14,6 +14,7 @@ from PyQt4.Qt import QTextEdit
from calibre import prepare_string_for_xml
from calibre.gui2 import error_dialog
from calibre.gui2.tweak_book.editor.syntax.html import ATTR_NAME, ATTR_END
get_offset = itemgetter(0)
PARAGRAPH_SEPARATOR = '\u2029'
@ -43,6 +44,20 @@ def next_tag_boundary(block, offset, forward=True):
offset = -1 if forward else sys.maxint
return None, None
def next_attr_boundary(block, offset, forward=True):
    ''' Find the nearest attribute boundary at or after (forward=True) or at
    or before (forward=False) offset, starting in block and continuing into
    the following/preceding blocks as needed.

    Returns a (block, boundary) tuple, or (None, None) if the blocks run out
    before a boundary is found. '''
    while block.isValid():
        user_data = block.userData()
        if user_data is not None:
            # Visit the boundaries in the direction of the search so that the
            # first match is also the nearest one
            for attr in sorted(user_data.attributes, key=get_offset, reverse=not forward):
                if forward:
                    if attr.offset >= offset:
                        return block, attr
                elif attr.offset <= offset:
                    return block, attr
        # Nothing usable in this block: move on and accept any offset in the
        # next block that is examined
        if forward:
            block, offset = block.next(), -1
        else:
            block, offset = block.previous(), sys.maxint
    return None, None
def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
''' Find the closest containing tag. To find it, we search for the first
opening tag that does not have a matching closing tag before the specified
@ -79,6 +94,29 @@ def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
max_tags -= 1
return None # Could not find a containing tag
def find_tag_definition(block, offset):
    ''' Return the <tag | > definition, if any that (block, offset) is inside.

    The result is a (tag, closing) tuple. tag is the tag name, including its
    namespace prefix if it has one, and closing is the closing flag of the
    tag start boundary. (None, False) is returned when (block, offset) is not
    inside a tag definition. '''
    block, boundary = next_tag_boundary(block, offset, forward=False)
    # next_tag_boundary() returns (None, None) when there is no tag boundary
    # at all before offset, so boundary must be checked before it is used
    if boundary is None or not boundary.is_start:
        return None, False
    tag_start = boundary
    closing = tag_start.closing
    tag = tag_start.name or tag_start.prefix
    if tag_start.name and tag_start.prefix:
        tag = tag_start.prefix + ':' + tag
    return tag, closing
def find_containing_attribute(block, offset):
    ''' Return the name of the attribute whose value contains (block, offset),
    or None if that position is not inside an attribute value. '''
    block, boundary = next_attr_boundary(block, offset, forward=False)
    if block is None:
        return None
    if boundary.type is ATTR_NAME or boundary.data is ATTR_END:
        return None  # offset is not inside an attribute value
    # boundary marks the start of a value. A quoted value that continues over
    # several lines gets one such marker per line, so keep walking backwards
    # past them until the attribute name that owns the value is reached.
    while True:
        block, boundary = next_attr_boundary(block, boundary.offset - 1, forward=False)
        if block is None:
            return None
        if boundary.type is ATTR_NAME:
            return boundary.data
        if boundary.data is ATTR_END:
            return None  # A value start with no attribute name before it
def find_closing_tag(tag, max_tags=sys.maxint):
''' Find the closing tag corresponding to the specified tag. To find it we
search for the first closing tag after the specified tag that does not
@ -241,3 +279,33 @@ class HTMLSmarts(NullSmarts):
c.setPosition(pos + 1 + len(name))
editor.setTextCursor(c)
def verify_for_spellcheck(self, cursor, highlighter):
    ''' Return True iff the selection held by cursor is in a location where
    spelling is checked: either in normal text whose containing tag the
    highlighter accepts for spell checking, or inside the value of one of
    the highlighter's spell checked attributes. '''
    block = cursor.block()
    block_start = block.position()
    start_pos = cursor.anchor() - block_start
    end_pos = cursor.position() - block_start

    # Neither end of the selection may be inside the definition of a closing tag
    start_tag, closing = find_tag_definition(block, start_pos)
    if closing:
        return False
    end_tag, closing = find_tag_definition(block, end_pos)
    if closing:
        return False

    if start_tag is None and end_tag is None:
        # We are in normal text, check that the containing tag is
        # allowed for spell checking.
        container = find_closest_containing_tag(block, start_pos)
        if container is not None:
            local_name = container.name.split(':')[-1]
            if highlighter.tag_ok_for_spell(local_name):
                return True

    if start_tag != end_tag:
        return False

    # Now we check if we are in an allowed attribute, both ends of the
    # selection have to be in the same one
    start_attr = find_containing_attribute(block, start_pos)
    end_attr = find_containing_attribute(block, end_pos)
    if start_attr != end_attr:
        return False
    return start_attr in highlighter.spell_attributes

View File

@ -31,6 +31,8 @@ class SyntaxHighlighter(QSyntaxHighlighter):
state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
create_formats_func = lambda highlighter: {}
spell_attributes = ()
def tag_ok_for_spell(self, name):
    ''' Return True if text inside a tag called name should be spell
    checked. This default never allows spell checking.

    It is invoked on highlighter instances as
    highlighter.tag_ok_for_spell(name), so it has to accept self in addition
    to the tag name. '''
    return False
def __init__(self, *args, **kwargs):
QSyntaxHighlighter.__init__(self, *args, **kwargs)

View File

@ -12,6 +12,7 @@ from collections import namedtuple
from PyQt4.Qt import QFont, QTextBlockUserData
from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat
from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop
from calibre.gui2.tweak_book.editor.syntax.css import create_formats as create_css_formats, state_map as css_state_map, State as CSSState
@ -46,6 +47,7 @@ CSS = 11
TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
Attr = namedtuple('Attr', 'offset type data')
class Tag(object):
@ -76,13 +78,14 @@ class State(object):
__slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
'current_lang', 'parse', 'get_user_data', 'set_user_data',
'css_formats', 'stack', 'sub_parser_state', 'default_lang')
'css_formats', 'stack', 'sub_parser_state', 'default_lang',
'attribute_name',)
def __init__(self):
self.tags = []
self.is_bold = self.is_italic = False
self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
self.css_formats = self.stack = self.sub_parser_state = self.default_lang = self.attribute_name = None
self.parse = NORMAL
def copy(self):
@ -101,13 +104,14 @@ class State(object):
return self.stack.index_for(self)
def __hash__(self):
return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))
return hash((self.parse, self.sub_parser_state, self.tag_being_defined, self.attribute_name, tuple(self.tags)))
def __eq__(self, other):
return (
self.parse == getattr(other, 'parse', -1) and
self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
self.attribute_name == getattr(other, 'attribute_name', False) and
self.tags == getattr(other, 'tags', None)
)
@ -194,6 +198,7 @@ class HTMLUserData(QTextBlockUserData):
def __init__(self):
QTextBlockUserData.__init__(self)
self.tags = []
self.attributes = []
def add_tag_data(state, tag):
ud = q = state.get_user_data()
@ -203,6 +208,16 @@ def add_tag_data(state, tag):
if q is None:
state.set_user_data(ud)
ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()
def add_attr_data(state, data_type, data, offset):
    ''' Append an Attr(offset, data_type, data) record to the user data
    returned by state.get_user_data(). If the state has no user data yet, a
    new HTMLUserData is created for the record and stored with
    state.set_user_data(). '''
    existing = state.get_user_data()
    user_data = HTMLUserData() if existing is None else existing
    user_data.attributes.append(Attr(offset, data_type, data))
    if existing is None:
        state.set_user_data(user_data)
def css(state, text, i, formats):
' Inside a <style> tag '
pat = cdata_close_pats['style']
@ -320,7 +335,9 @@ def opening_tag(cdata_tags, state, text, i, formats):
if m is None:
return [(1, formats['?'])]
state.parse = ATTRIBUTE_NAME
prefix, name = m.group().partition(':')[0::2]
attrname = state.attribute_name = m.group()
add_attr_data(state, ATTR_NAME, attrname, m.start())
prefix, name = attrname.partition(':')[0::2]
if prefix and name:
return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
return [(len(prefix), formats['attr'])]
@ -333,11 +350,9 @@ def attribute_name(state, text, i, formats):
if ch == '=':
state.parse = ATTRIBUTE_VALUE
return [(1, formats['attr'])]
# Standalone attribute with no value
state.parse = IN_OPENING_TAG
if ch in {'>', '/'}:
# Standalone attribute with no value
return [(0, None)]
return [(1, formats['no-attr-value'])]
return [(0, None)]
def attribute_value(state, text, i, formats):
' After attribute = '
@ -356,12 +371,14 @@ def attribute_value(state, text, i, formats):
def quoted_val(state, text, i, formats):
    ''' Inside a quoted attribute value that starts (or continues) at i.
    Records where the value starts and, if the closing quote is in text,
    where it ends. '''
    quote = "'" if state.parse is not DQ_VAL else '"'
    add_attr_data(state, ATTR_VALUE, ATTR_START, i)
    close_pos = text.find(quote, i)
    if close_pos != -1:
        # The value ends in this text, consume up to and including the quote
        num = close_pos - i + 1
        state.parse = IN_OPENING_TAG
        add_attr_data(state, ATTR_VALUE, ATTR_END, i + num)
    else:
        # No closing quote, the rest of the text belongs to the value
        num = len(text) - i
    return [(num, formats['string'])]
def closing_tag(state, text, i, formats):
@ -447,6 +464,7 @@ class HTMLHighlighter(SyntaxHighlighter):
state_map = state_map
create_formats_func = create_formats
spell_attributes = ('alt', 'title')
def create_formats(self):
super(HTMLHighlighter, self).create_formats()
@ -460,9 +478,16 @@ class HTMLHighlighter(SyntaxHighlighter):
ans = self.default_state.stack.state_for(val) or self.default_state
return ans.copy()
def tag_ok_for_spell(self, name):
    ''' Return True if text inside a tag called name should be spell
    checked. html_spell_tags holds the tag names that are excluded, every
    other name is allowed. '''
    excluded = name in html_spell_tags
    return not excluded
class XMLHighlighter(HTMLHighlighter):
    ''' Variant of HTMLHighlighter that uses the XML state map and spell
    checks only the tags named in xml_spell_tags and the opf:file-as
    attribute. '''

    state_map = xml_state_map
    spell_attributes = ('opf:file-as',)

    def tag_ok_for_spell(self, name):
        ''' Return True if text inside a tag called name should be spell
        checked, which is the case only for names in xml_spell_tags. '''
        allowed = name in xml_spell_tags
        return allowed
if __name__ == '__main__':
from calibre.gui2.tweak_book.editor.widget import launch_editor

View File

@ -378,28 +378,35 @@ class TextEdit(PlainTextEdit):
self.saved_matches[save_match] = (pat, m)
return True
def find_word_from_line(self, word, lang, lnum, from_cursor=True):
def find_spell_word(self, original_words, lang, from_cursor=True):
c = self.textCursor()
c.setPosition(c.position())
if not from_cursor or c.blockNumber() != lnum - 1:
lnum = max(1, min(self.blockCount(), lnum))
if not from_cursor:
c.movePosition(c.Start)
c.movePosition(c.NextBlock, n=lnum - 1)
c.movePosition(c.StartOfLine)
offset = c.block().position()
c.movePosition(c.End, c.KeepAnchor)
def find_word(haystack):
for w in original_words:
idx = index_of(w, haystack, lang=lang)
if idx > -1:
return idx, w
return -1, None
while True:
text = unicode(c.selectedText()).rstrip('\0')
idx, word = find_word(text)
if idx == -1:
return False
c.setPosition(c.anchor() + idx)
c.setPosition(c.position() + string_length(word), c.KeepAnchor)
if self.smarts.verify_for_spellcheck(c, self.highlighter):
self.setTextCursor(c)
self.centerCursor()
return True
c.setPosition(c.position())
c.movePosition(c.End, c.KeepAnchor)
else:
offset = c.block().position() + c.positionInBlock()
c.movePosition(c.End, c.KeepAnchor)
text = unicode(c.selectedText()).rstrip('\0')
idx = index_of(word, text, lang=lang)
if idx == -1:
return False
c.setPosition(offset + idx)
c.setPosition(c.position() + string_length(word), c.KeepAnchor)
self.setTextCursor(c)
self.centerCursor()
return True
return False
def replace(self, pat, template, saved_match='gui'):
c = self.textCursor()

View File

@ -189,8 +189,8 @@ class Editor(QMainWindow):
def find(self, *args, **kwargs):
return self.editor.find(*args, **kwargs)
def find_word_from_line(self, *args, **kwargs):
return self.editor.find_word_from_line(*args, **kwargs)
def find_spell_word(self, *args, **kwargs):
return self.editor.find_spell_word(*args, **kwargs)
def replace(self, *args, **kwargs):
return self.editor.replace(*args, **kwargs)

View File

@ -1036,10 +1036,10 @@ def find_next(word, locations, current_editor, current_editor_name,
files[l.file_name].append(l)
except KeyError:
files[l.file_name] = [l]
start_locations = set()
if current_editor_name not in files:
current_editor = current_editor_name = None
current_editor_name = None
locations = [(fname, {l.original_word for l in _locations}, False) for fname, _locations in files.iteritems()]
else:
# Re-order the list of locations to search so that we search in the
# current editor first
@ -1047,20 +1047,17 @@ def find_next(word, locations, current_editor, current_editor_name,
idx = lfiles.index(current_editor_name)
before, after = lfiles[:idx], lfiles[idx+1:]
lfiles = after + before + [current_editor_name]
lnum = current_editor.current_line + 1
start_locations = [l for l in files[current_editor_name] if l.sourceline >= lnum]
locations = list(start_locations)
locations = [(current_editor_name, {l.original_word for l in files[current_editor_name]}, True)]
for fname in lfiles:
locations.extend(files[fname])
start_locations = set(start_locations)
locations.append((fname, {l.original_word for l in files[fname]}, False))
for location in locations:
ed = editors.get(location.file_name, None)
for file_name, original_words, from_cursor in locations:
ed = editors.get(file_name, None)
if ed is None:
edit_file(location.file_name)
ed = editors[location.file_name]
if ed.find_word_from_line(location.original_word, word[1].langcode, location.sourceline, from_cursor=location in start_locations):
show_editor(location.file_name)
edit_file(file_name)
ed = editors[file_name]
if ed.find_spell_word(original_words, word[1].langcode, from_cursor=from_cursor):
show_editor(file_name)
return True
return False