Edit Book: Fix escaped HTML tags in the comments metadata being reported as mis-spelled words. Fixes #1441925 [Spell check flags html tags in metadata](https://bugs.launchpad.net/calibre/+bug/1441925)

2026-03-04 16:03:41 -05:00 · 2015-04-12 09:38:48 +05:30 · 2015-04-12 09:38:48 +05:30 · eec1f44dee
commit eec1f44dee
parent 203cb98ac0
1 changed files with 18 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@ -9,10 +9,12 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys
 from collections import defaultdict

+from calibre import replace_entities
 from calibre.spell.break_iterator import split_into_words, index_of
 from calibre.spell.dictionary import parse_lang_code
 from calibre.ebooks.oeb.base import barename
 from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
+from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.toc import find_existing_toc

 _patterns = None
@ -96,6 +98,18 @@ def add_words_from_attr(node, attr, words, file_name, locale):
 def add_words_from_text(node, attr, words, file_name, locale):
    add_words(getattr(node, attr), node, words, file_name, locale, (False, attr))

+def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
+    text = replace_entities(text)
+    root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
+    ewords = defaultdict(list)
+    ewords[None] = 0
+    read_words_from_html(root, ewords, file_name, locale)
+    words[None] += ewords.pop(None)
+    for k, locs in ewords.iteritems():
+        for loc in locs:
+            loc.location_node, loc.node_item = node, (False, attr)
+        words[k].extend(locs)
+
 _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']

 opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
@ -106,7 +120,10 @@ opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
 def read_words_from_opf(root, words, file_name, book_locale):
    for tag in root.iterdescendants('*'):
        if tag.text is not None and barename(tag.tag) in opf_spell_tags:
-            add_words_from_text(tag, 'text', words, file_name, book_locale)
+            if barename(tag.tag) == 'description':
+                add_words_from_escaped_html(tag.text, words, file_name, tag, 'text', book_locale)
+            else:
+                add_words_from_text(tag, 'text', words, file_name, book_locale)
        add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)

 ncx_spell_tags = {'text'}