Spell check: Fix 'Show next occurrence' sometimes showing the word in an incorrect location, for example in an attribute where spell check is not performed.

This commit is contained in:
Kovid Goyal 2014-04-19 18:33:10 +05:30
parent 36c937c6ba
commit b1a45f3147
8 changed files with 163 additions and 56 deletions

View File

@ -83,28 +83,33 @@ def add_words_from_text(node, attr, words, file_name, locale):
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf'] _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
# We can only use barename() for tag names and simple attribute checks so that
# this code matches up with the syntax highlighter base spell checking
def read_words_from_opf(root, words, file_name, book_locale): def read_words_from_opf(root, words, file_name, book_locale):
for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']): for tag in root.iterdescendants('*'):
tagname = barename(tag.tag) if tag.text is not None and barename(tag.tag) in opf_spell_tags:
if not tag.text or tagname in {'identifier', 'language', 'date'}: add_words_from_text(tag, 'text', words, file_name, book_locale)
continue
add_words_from_text(tag, 'text', words, file_name, book_locale)
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale) add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
ncx_spell_tags = {'text'}
xml_spell_tags = opf_spell_tags | ncx_spell_tags
def read_words_from_ncx(root, words, file_name, book_locale): def read_words_from_ncx(root, words, file_name, book_locale):
for tag in root.xpath('//*[local-name()="text"]'): for tag in root.xpath('//*[local-name()="text"]'):
if not tag.text: if tag.text is not None:
continue add_words_from_text(tag, 'text', words, file_name, book_locale)
add_words_from_text(tag, 'text', words, file_name, book_locale)
html_spell_tags = {'script', 'style', 'link'}
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale): def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
tagname = barename(tag.tag) if tag.text is not None and barename(tag.tag) not in html_spell_tags:
if tagname not in {'script', 'style', 'link', 'head'}: add_words_from_text(tag, 'text', words, file_name, locale)
if tag.text is not None: for attr in {'alt', 'title'}:
add_words_from_text(tag, 'text', words, file_name, locale) add_words_from_attr(tag, attr, words, file_name, locale)
for attr in {'alt', 'title'}: if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
add_words_from_attr(tag, attr, words, file_name, locale)
if tag.tail is not None:
add_words_from_text(tag, 'tail', words, file_name, parent_locale) add_words_from_text(tag, 'tail', words, file_name, parent_locale)
def locale_from_tag(tag): def locale_from_tag(tag):

View File

@ -17,3 +17,6 @@ class NullSmarts(object):
def get_smart_selection(self, editor, update=True): def get_smart_selection(self, editor, update=True):
return editor.selected_text return editor.selected_text
def verify_for_spellcheck(self, cursor, highlighter):
    # Default implementation: always reports that the cursor is not at a
    # spell-checked location, regardless of the cursor or highlighter given.
    return False

View File

@ -14,6 +14,7 @@ from PyQt4.Qt import QTextEdit
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.gui2 import error_dialog from calibre.gui2 import error_dialog
from calibre.gui2.tweak_book.editor.syntax.html import ATTR_NAME, ATTR_END
get_offset = itemgetter(0) get_offset = itemgetter(0)
PARAGRAPH_SEPARATOR = '\u2029' PARAGRAPH_SEPARATOR = '\u2029'
@ -43,6 +44,20 @@ def next_tag_boundary(block, offset, forward=True):
offset = -1 if forward else sys.maxint offset = -1 if forward else sys.maxint
return None, None return None, None
def next_attr_boundary(block, offset, forward=True):
    ''' Find the nearest attribute boundary at or after (forward) or at or
    before (backward) the specified offset, continuing into following or
    preceding blocks as needed. Returns (block, boundary), or (None, None)
    once an invalid block is reached without a match. '''
    while block.isValid():
        user_data = block.userData()
        if user_data is not None:
            # Visit boundaries in the direction of travel so that the first
            # one satisfying the offset test is the nearest one.
            ordered = sorted(user_data.attributes, key=get_offset, reverse=not forward)
            for candidate in ordered:
                if forward:
                    matched = candidate.offset >= offset
                else:
                    matched = candidate.offset <= offset
                if matched:
                    return block, candidate
        # Nothing usable in this block: step to the adjacent block and reset
        # the offset so that every boundary in it qualifies.
        if forward:
            block, offset = block.next(), -1
        else:
            block, offset = block.previous(), sys.maxint
    return None, None
def find_closest_containing_tag(block, offset, max_tags=sys.maxint): def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
''' Find the closest containing tag. To find it, we search for the first ''' Find the closest containing tag. To find it, we search for the first
opening tag that does not have a matching closing tag before the specified opening tag that does not have a matching closing tag before the specified
@ -79,6 +94,29 @@ def find_closest_containing_tag(block, offset, max_tags=sys.maxint):
max_tags -= 1 max_tags -= 1
return None # Could not find a containing tag return None # Could not find a containing tag
def find_tag_definition(block, offset):
    ''' Return the <tag | > definition, if any that (block, offset) is inside.

    The result is a (tag, closing) tuple. tag is the tag name, including its
    namespace prefix when both a prefix and a name are present, and closing
    is the closing flag of the tag start boundary. (None, False) is returned
    when the nearest preceding tag boundary is not the start of a tag, or
    when there is no preceding tag boundary at all. '''
    block, boundary = next_tag_boundary(block, offset, forward=False)
    # next_tag_boundary() returns (None, None) when it runs out of blocks, so
    # the boundary must be checked before its attributes are accessed.
    if boundary is None or not boundary.is_start:
        return None, False
    tag_start = boundary
    closing = tag_start.closing
    tag = tag_start.name or tag_start.prefix
    if tag_start.name and tag_start.prefix:
        tag = tag_start.prefix + ':' + tag
    return tag, closing
def find_containing_attribute(block, offset):
    ''' Return the name of the attribute whose value contains (block, offset),
    or None if the position is not inside an attribute value. '''
    block, nearest = next_attr_boundary(block, offset, forward=False)
    if block is None:
        return None
    if nearest.type is ATTR_NAME or nearest.data is ATTR_END:
        # The closest preceding boundary is an attribute name or the end of a
        # value, so offset is not inside an attribute value
        return None
    # The closest boundary starts a value; the boundary before it should be
    # the name of the attribute that value belongs to.
    block, name_boundary = next_attr_boundary(block, nearest.offset - 1, forward=False)
    if block is None or name_boundary.type != ATTR_NAME:
        return None
    return name_boundary.data
def find_closing_tag(tag, max_tags=sys.maxint): def find_closing_tag(tag, max_tags=sys.maxint):
''' Find the closing tag corresponding to the specified tag. To find it we ''' Find the closing tag corresponding to the specified tag. To find it we
search for the first closing tag after the specified tag that does not search for the first closing tag after the specified tag that does not
@ -241,3 +279,33 @@ class HTMLSmarts(NullSmarts):
c.setPosition(pos + 1 + len(name)) c.setPosition(pos + 1 + len(name))
editor.setTextCursor(c) editor.setTextCursor(c)
def verify_for_spellcheck(self, cursor, highlighter):
    ''' Return True iff the selection held by cursor is in a location where
    spelling is checked: either in the text of a tag that the highlighter
    accepts for spell checking, or inside the value of one of the
    highlighter's spell checked attributes. '''
    block = cursor.block()
    start_pos = cursor.anchor() - block.position()
    end_pos = cursor.position() - block.position()

    # Neither end of the selection may lie inside a closing tag definition
    start_tag, start_closing = find_tag_definition(block, start_pos)
    if start_closing:
        return False
    end_tag, end_closing = find_tag_definition(block, end_pos)
    if end_closing:
        return False

    if start_tag is None and end_tag is None:
        # Both ends are in normal text, accept if the closest containing
        # tag is one whose text is spell checked.
        container = find_closest_containing_tag(block, start_pos)
        if container is not None:
            local_name = container.name.split(':')[-1]
            if highlighter.tag_ok_for_spell(local_name):
                return True

    if start_tag != end_tag:
        return False

    # Finally, accept only if both ends are inside the same attribute and
    # that attribute is one that is spell checked
    start_attr = find_containing_attribute(block, start_pos)
    end_attr = find_containing_attribute(block, end_pos)
    if start_attr != end_attr:
        return False
    if start_attr in highlighter.spell_attributes:
        return True
    return False

View File

@ -31,6 +31,8 @@ class SyntaxHighlighter(QSyntaxHighlighter):
state_map = {0:lambda state, text, i, formats:[(len(text), None)]} state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
create_formats_func = lambda highlighter: {} create_formats_func = lambda highlighter: {}
spell_attributes = ()
def tag_ok_for_spell(self, name):
    ''' Default policy: no tag is accepted for spell checking. Takes the
    instance as well as the tag name, so that it can be called on a
    highlighter instance in the same way as the subclass overrides. '''
    return False
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
QSyntaxHighlighter.__init__(self, *args, **kwargs) QSyntaxHighlighter.__init__(self, *args, **kwargs)

View File

@ -12,6 +12,7 @@ from collections import namedtuple
from PyQt4.Qt import QFont, QTextBlockUserData from PyQt4.Qt import QFont, QTextBlockUserData
from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat
from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop
from calibre.gui2.tweak_book.editor.syntax.css import create_formats as create_css_formats, state_map as css_state_map, State as CSSState from calibre.gui2.tweak_book.editor.syntax.css import create_formats as create_css_formats, state_map as css_state_map, State as CSSState
@ -46,6 +47,7 @@ CSS = 11
TagStart = namedtuple('TagStart', 'offset prefix name closing is_start') TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
TagEnd = namedtuple('TagEnd', 'offset self_closing is_start') TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
Attr = namedtuple('Attr', 'offset type data')
class Tag(object): class Tag(object):
@ -76,13 +78,14 @@ class State(object):
__slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic', __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
'current_lang', 'parse', 'get_user_data', 'set_user_data', 'current_lang', 'parse', 'get_user_data', 'set_user_data',
'css_formats', 'stack', 'sub_parser_state', 'default_lang') 'css_formats', 'stack', 'sub_parser_state', 'default_lang',
'attribute_name',)
def __init__(self): def __init__(self):
self.tags = [] self.tags = []
self.is_bold = self.is_italic = False self.is_bold = self.is_italic = False
self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \ self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None self.css_formats = self.stack = self.sub_parser_state = self.default_lang = self.attribute_name = None
self.parse = NORMAL self.parse = NORMAL
def copy(self): def copy(self):
@ -101,13 +104,14 @@ class State(object):
return self.stack.index_for(self) return self.stack.index_for(self)
def __hash__(self): def __hash__(self):
return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags))) return hash((self.parse, self.sub_parser_state, self.tag_being_defined, self.attribute_name, tuple(self.tags)))
def __eq__(self, other): def __eq__(self, other):
return ( return (
self.parse == getattr(other, 'parse', -1) and self.parse == getattr(other, 'parse', -1) and
self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
self.tag_being_defined == getattr(other, 'tag_being_defined', False) and self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
self.attribute_name == getattr(other, 'attribute_name', False) and
self.tags == getattr(other, 'tags', None) self.tags == getattr(other, 'tags', None)
) )
@ -194,6 +198,7 @@ class HTMLUserData(QTextBlockUserData):
def __init__(self): def __init__(self):
QTextBlockUserData.__init__(self) QTextBlockUserData.__init__(self)
self.tags = [] self.tags = []
self.attributes = []
def add_tag_data(state, tag): def add_tag_data(state, tag):
ud = q = state.get_user_data() ud = q = state.get_user_data()
@ -203,6 +208,16 @@ def add_tag_data(state, tag):
if q is None: if q is None:
state.set_user_data(ud) state.set_user_data(ud)
ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()
def add_attr_data(state, data_type, data, offset):
    ''' Record an attribute boundary (offset, data_type, data) in the user
    data obtained from state, creating and storing new user data if the
    state has none yet. '''
    existing = state.get_user_data()
    user_data = HTMLUserData() if existing is None else existing
    user_data.attributes.append(Attr(offset, data_type, data))
    if existing is None:
        # Freshly created user data has to be handed back to the state
        state.set_user_data(user_data)
def css(state, text, i, formats): def css(state, text, i, formats):
' Inside a <style> tag ' ' Inside a <style> tag '
pat = cdata_close_pats['style'] pat = cdata_close_pats['style']
@ -320,7 +335,9 @@ def opening_tag(cdata_tags, state, text, i, formats):
if m is None: if m is None:
return [(1, formats['?'])] return [(1, formats['?'])]
state.parse = ATTRIBUTE_NAME state.parse = ATTRIBUTE_NAME
prefix, name = m.group().partition(':')[0::2] attrname = state.attribute_name = m.group()
add_attr_data(state, ATTR_NAME, attrname, m.start())
prefix, name = attrname.partition(':')[0::2]
if prefix and name: if prefix and name:
return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])] return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
return [(len(prefix), formats['attr'])] return [(len(prefix), formats['attr'])]
@ -333,11 +350,9 @@ def attribute_name(state, text, i, formats):
if ch == '=': if ch == '=':
state.parse = ATTRIBUTE_VALUE state.parse = ATTRIBUTE_VALUE
return [(1, formats['attr'])] return [(1, formats['attr'])]
# Standalone attribute with no value
state.parse = IN_OPENING_TAG state.parse = IN_OPENING_TAG
if ch in {'>', '/'}: return [(0, None)]
# Standalone attribute with no value
return [(0, None)]
return [(1, formats['no-attr-value'])]
def attribute_value(state, text, i, formats): def attribute_value(state, text, i, formats):
' After attribute = ' ' After attribute = '
@ -356,12 +371,14 @@ def attribute_value(state, text, i, formats):
def quoted_val(state, text, i, formats): def quoted_val(state, text, i, formats):
' A quoted attribute value ' ' A quoted attribute value '
quote = '"' if state.parse is DQ_VAL else "'" quote = '"' if state.parse is DQ_VAL else "'"
add_attr_data(state, ATTR_VALUE, ATTR_START, i)
pos = text.find(quote, i) pos = text.find(quote, i)
if pos == -1: if pos == -1:
num = len(text) - i num = len(text) - i
else: else:
num = pos - i + 1 num = pos - i + 1
state.parse = IN_OPENING_TAG state.parse = IN_OPENING_TAG
add_attr_data(state, ATTR_VALUE, ATTR_END, i + num)
return [(num, formats['string'])] return [(num, formats['string'])]
def closing_tag(state, text, i, formats): def closing_tag(state, text, i, formats):
@ -447,6 +464,7 @@ class HTMLHighlighter(SyntaxHighlighter):
state_map = state_map state_map = state_map
create_formats_func = create_formats create_formats_func = create_formats
spell_attributes = ('alt', 'title')
def create_formats(self): def create_formats(self):
super(HTMLHighlighter, self).create_formats() super(HTMLHighlighter, self).create_formats()
@ -460,9 +478,16 @@ class HTMLHighlighter(SyntaxHighlighter):
ans = self.default_state.stack.state_for(val) or self.default_state ans = self.default_state.stack.state_for(val) or self.default_state
return ans.copy() return ans.copy()
def tag_ok_for_spell(self, name):
    # True for any tag name that is not a member of html_spell_tags, i.e.
    # that set is used here as an exclusion list.
    return name not in html_spell_tags
class XMLHighlighter(HTMLHighlighter): class XMLHighlighter(HTMLHighlighter):
state_map = xml_state_map state_map = xml_state_map
spell_attributes = ('opf:file-as',)
def tag_ok_for_spell(self, name):
    # True only for tag names that are members of xml_spell_tags, i.e. that
    # set is used here as an inclusion list.
    return name in xml_spell_tags
if __name__ == '__main__': if __name__ == '__main__':
from calibre.gui2.tweak_book.editor.widget import launch_editor from calibre.gui2.tweak_book.editor.widget import launch_editor

View File

@ -378,28 +378,35 @@ class TextEdit(PlainTextEdit):
self.saved_matches[save_match] = (pat, m) self.saved_matches[save_match] = (pat, m)
return True return True
def find_word_from_line(self, word, lang, lnum, from_cursor=True): def find_spell_word(self, original_words, lang, from_cursor=True):
c = self.textCursor() c = self.textCursor()
c.setPosition(c.position()) c.setPosition(c.position())
if not from_cursor or c.blockNumber() != lnum - 1: if not from_cursor:
lnum = max(1, min(self.blockCount(), lnum))
c.movePosition(c.Start) c.movePosition(c.Start)
c.movePosition(c.NextBlock, n=lnum - 1) c.movePosition(c.End, c.KeepAnchor)
c.movePosition(c.StartOfLine)
offset = c.block().position() def find_word(haystack):
for w in original_words:
idx = index_of(w, haystack, lang=lang)
if idx > -1:
return idx, w
return -1, None
while True:
text = unicode(c.selectedText()).rstrip('\0')
idx, word = find_word(text)
if idx == -1:
return False
c.setPosition(c.anchor() + idx)
c.setPosition(c.position() + string_length(word), c.KeepAnchor)
if self.smarts.verify_for_spellcheck(c, self.highlighter):
self.setTextCursor(c)
self.centerCursor()
return True
c.setPosition(c.position())
c.movePosition(c.End, c.KeepAnchor) c.movePosition(c.End, c.KeepAnchor)
else:
offset = c.block().position() + c.positionInBlock() return False
c.movePosition(c.End, c.KeepAnchor)
text = unicode(c.selectedText()).rstrip('\0')
idx = index_of(word, text, lang=lang)
if idx == -1:
return False
c.setPosition(offset + idx)
c.setPosition(c.position() + string_length(word), c.KeepAnchor)
self.setTextCursor(c)
self.centerCursor()
return True
def replace(self, pat, template, saved_match='gui'): def replace(self, pat, template, saved_match='gui'):
c = self.textCursor() c = self.textCursor()

View File

@ -189,8 +189,8 @@ class Editor(QMainWindow):
def find(self, *args, **kwargs): def find(self, *args, **kwargs):
return self.editor.find(*args, **kwargs) return self.editor.find(*args, **kwargs)
def find_word_from_line(self, *args, **kwargs): def find_spell_word(self, *args, **kwargs):
return self.editor.find_word_from_line(*args, **kwargs) return self.editor.find_spell_word(*args, **kwargs)
def replace(self, *args, **kwargs): def replace(self, *args, **kwargs):
return self.editor.replace(*args, **kwargs) return self.editor.replace(*args, **kwargs)

View File

@ -1036,10 +1036,10 @@ def find_next(word, locations, current_editor, current_editor_name,
files[l.file_name].append(l) files[l.file_name].append(l)
except KeyError: except KeyError:
files[l.file_name] = [l] files[l.file_name] = [l]
start_locations = set()
if current_editor_name not in files: if current_editor_name not in files:
current_editor = current_editor_name = None current_editor_name = None
locations = [(fname, {l.original_word for l in _locations}, False) for fname, _locations in files.iteritems()]
else: else:
# Re-order the list of locations to search so that we search in the # Re-order the list of locations to search so that we search in the
# current editor first # current editor first
@ -1047,20 +1047,17 @@ def find_next(word, locations, current_editor, current_editor_name,
idx = lfiles.index(current_editor_name) idx = lfiles.index(current_editor_name)
before, after = lfiles[:idx], lfiles[idx+1:] before, after = lfiles[:idx], lfiles[idx+1:]
lfiles = after + before + [current_editor_name] lfiles = after + before + [current_editor_name]
lnum = current_editor.current_line + 1 locations = [(current_editor_name, {l.original_word for l in files[current_editor_name]}, True)]
start_locations = [l for l in files[current_editor_name] if l.sourceline >= lnum]
locations = list(start_locations)
for fname in lfiles: for fname in lfiles:
locations.extend(files[fname]) locations.append((fname, {l.original_word for l in files[fname]}, False))
start_locations = set(start_locations)
for location in locations: for file_name, original_words, from_cursor in locations:
ed = editors.get(location.file_name, None) ed = editors.get(file_name, None)
if ed is None: if ed is None:
edit_file(location.file_name) edit_file(file_name)
ed = editors[location.file_name] ed = editors[file_name]
if ed.find_word_from_line(location.original_word, word[1].langcode, location.sourceline, from_cursor=location in start_locations): if ed.find_spell_word(original_words, word[1].langcode, from_cursor=from_cursor):
show_editor(location.file_name) show_editor(file_name)
return True return True
return False return False