Edit Book: Fix escaped HTML tags in the comments metadata being reported as mis-spelled words. Fixes #1441925 [Spell check flags html tags in metadata](https://bugs.launchpad.net/calibre/+bug/1441925)

This commit is contained in:
Kovid Goyal 2015-04-12 09:38:48 +05:30
parent 203cb98ac0
commit eec1f44dee

View File

@ -9,10 +9,12 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import sys import sys
from collections import defaultdict from collections import defaultdict
from calibre import replace_entities
from calibre.spell.break_iterator import split_into_words, index_of from calibre.spell.break_iterator import split_into_words, index_of
from calibre.spell.dictionary import parse_lang_code from calibre.spell.dictionary import parse_lang_code
from calibre.ebooks.oeb.base import barename from calibre.ebooks.oeb.base import barename
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.toc import find_existing_toc from calibre.ebooks.oeb.polish.toc import find_existing_toc
_patterns = None _patterns = None
@ -96,6 +98,18 @@ def add_words_from_attr(node, attr, words, file_name, locale):
def add_words_from_text(node, attr, words, file_name, locale): def add_words_from_text(node, attr, words, file_name, locale):
add_words(getattr(node, attr), node, words, file_name, locale, (False, attr)) add_words(getattr(node, attr), node, words, file_name, locale, (False, attr))
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
text = replace_entities(text)
root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
ewords = defaultdict(list)
ewords[None] = 0
read_words_from_html(root, ewords, file_name, locale)
words[None] += ewords.pop(None)
for k, locs in ewords.iteritems():
for loc in locs:
loc.location_node, loc.node_item = node, (False, attr)
words[k].extend(locs)
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf'] _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'} opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
@ -106,7 +120,10 @@ opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
def read_words_from_opf(root, words, file_name, book_locale): def read_words_from_opf(root, words, file_name, book_locale):
for tag in root.iterdescendants('*'): for tag in root.iterdescendants('*'):
if tag.text is not None and barename(tag.tag) in opf_spell_tags: if tag.text is not None and barename(tag.tag) in opf_spell_tags:
add_words_from_text(tag, 'text', words, file_name, book_locale) if barename(tag.tag) == 'description':
add_words_from_escaped_html(tag.text, words, file_name, tag, 'text', book_locale)
else:
add_words_from_text(tag, 'text', words, file_name, book_locale)
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale) add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
ncx_spell_tags = {'text'} ncx_spell_tags = {'text'}