Edit book: Spell check: Fix words after a comment not being checked. Fixes #1962213 [Spell checker ignores text after HTML comment](https://bugs.launchpad.net/calibre/+bug/1962213)

This commit is contained in:
Kovid Goyal 2022-02-25 05:28:30 +05:30
parent dd0e805838
commit df66924f1b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -163,21 +163,37 @@ opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
def read_words_from_opf(root, words, file_name, book_locale):
    """Collect spell-checkable words from the metadata elements of an OPF tree.

    Every descendant element of ``root`` is visited. When the element's bare
    tag name is in ``opf_spell_tags``, its leading text and the tail text of
    each of its child nodes are handed to the word collectors:
    ``description`` content goes through ``add_words_from_escaped_html``,
    everything else through ``add_words_from_text``. Independently of the tag
    name, the ``_opf_file_as`` attribute of every element is passed to
    ``add_words_from_attr``.
    """
    for element in root.iterdescendants('*'):
        name = barename(element.tag)
        if name in opf_spell_tags:
            is_description = name == 'description'

            # Text that sits directly after the element's opening tag.
            if element.text:
                if is_description:
                    add_words_from_escaped_html(element.text, words, file_name, element, 'text', book_locale)
                else:
                    add_words_from_text(element, 'text', words, file_name, book_locale)

            # Text that follows each child node. Plain iteration is used so
            # that every child node is visited -- presumably including
            # comments, which is what lets words after a comment be seen.
            for node in element:
                if not node.tail:
                    continue
                if is_description:
                    add_words_from_escaped_html(node.tail, words, file_name, node, 'tail', book_locale)
                else:
                    add_words_from_text(node, 'tail', words, file_name, book_locale)

        add_words_from_attr(element, _opf_file_as, words, file_name, book_locale)
def count_chars_in_opf(root, counter, file_name, book_locale):
    """Count characters in the spell-checkable metadata text of an OPF tree.

    Every descendant element of ``root`` is visited. When the element's bare
    tag name is in ``opf_spell_tags``, its leading text and the tail text of
    each of its child nodes are counted: ``description`` content is handed to
    ``count_chars_in_escaped_html``, everything else to
    ``count_chars_in_text``. Independently of the tag name, the
    ``_opf_file_as`` attribute of every element is passed to
    ``count_chars_in_attr``.

    Results are accumulated into ``counter`` by the helpers; nothing is
    returned.
    """
    for tag in root.iterdescendants('*'):
        name = barename(tag.tag)
        if name in opf_spell_tags:
            if name == 'description':
                if tag.text:
                    count_chars_in_escaped_html(tag.text, counter, file_name, tag, 'text', book_locale)
                for child in tag:
                    if child.tail:
                        # The tail belongs to ``child``, so ``child`` is the
                        # node passed along (same as read_words_from_opf).
                        count_chars_in_escaped_html(child.tail, counter, file_name, child, 'tail', book_locale)
            else:
                if tag.text:
                    count_chars_in_text(tag, 'text', counter, file_name, book_locale)
                for child in tag:
                    if child.tail:
                        # Must be ``child``: the helper is given a node plus
                        # the attribute name 'tail', and the text checked by
                        # the guard above is child.tail, not tag.tail.
                        count_chars_in_text(child, 'tail', counter, file_name, book_locale)
        count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale)
@ -201,7 +217,7 @@ html_spell_tags = {'script', 'style', 'link'}
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale): def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
if tag.text is not None and barename(tag.tag) not in html_spell_tags: if tag.text is not None and isinstance(tag.tag, str) and barename(tag.tag) not in html_spell_tags:
add_words_from_text(tag, 'text', words, file_name, locale) add_words_from_text(tag, 'text', words, file_name, locale)
for attr in {'alt', 'title'}: for attr in {'alt', 'title'}:
add_words_from_attr(tag, attr, words, file_name, locale) add_words_from_attr(tag, attr, words, file_name, locale)
@ -210,7 +226,7 @@ def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale): def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
if tag.text is not None and barename(tag.tag) not in html_spell_tags: if tag.text is not None and isinstance(tag.tag, str) and barename(tag.tag) not in html_spell_tags:
count_chars_in_text(tag, 'text', counter, file_name, locale) count_chars_in_text(tag, 'text', counter, file_name, locale)
for attr in {'alt', 'title'}: for attr in {'alt', 'title'}:
count_chars_in_attr(tag, attr, counter, file_name, locale) count_chars_in_attr(tag, attr, counter, file_name, locale)
@ -219,14 +235,15 @@ def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
def locale_from_tag(tag): def locale_from_tag(tag):
if 'lang' in tag.attrib: a = tag.attrib
if 'lang' in a:
try: try:
loc = parse_lang_code(tag.get('lang')) loc = parse_lang_code(tag.get('lang'))
except ValueError: except ValueError:
loc = None loc = None
if loc is not None: if loc is not None:
return loc return loc
if '{http://www.w3.org/XML/1998/namespace}lang' in tag.attrib: if '{http://www.w3.org/XML/1998/namespace}lang' in a:
try: try:
loc = parse_lang_code(tag.get('{http://www.w3.org/XML/1998/namespace}lang')) loc = parse_lang_code(tag.get('{http://www.w3.org/XML/1998/namespace}lang'))
except ValueError: except ValueError:
@ -241,7 +258,7 @@ def read_words_from_html(root, words, file_name, book_locale):
parent, parent_locale = stack.pop() parent, parent_locale = stack.pop()
locale = locale_from_tag(parent) or parent_locale locale = locale_from_tag(parent) or parent_locale
read_words_from_html_tag(parent, words, file_name, parent_locale, locale) read_words_from_html_tag(parent, words, file_name, parent_locale, locale)
stack.extend((tag, locale) for tag in parent.iterchildren('*')) stack.extend((tag, locale) for tag in parent)
def count_chars_in_html(root, counter, file_name, book_locale): def count_chars_in_html(root, counter, file_name, book_locale):
@ -250,7 +267,7 @@ def count_chars_in_html(root, counter, file_name, book_locale):
parent, parent_locale = stack.pop() parent, parent_locale = stack.pop()
locale = locale_from_tag(parent) or parent_locale locale = locale_from_tag(parent) or parent_locale
count_chars_in_html_tag(parent, counter, file_name, parent_locale, locale) count_chars_in_html_tag(parent, counter, file_name, parent_locale, locale)
stack.extend((tag, locale) for tag in parent.iterchildren('*')) stack.extend((tag, locale) for tag in parent)
def group_sort(locations): def group_sort(locations):