diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py index 3571555d33..986a33e3c5 100644 --- a/src/calibre/ebooks/oeb/polish/spell.py +++ b/src/calibre/ebooks/oeb/polish/spell.py @@ -9,10 +9,12 @@ __copyright__ = '2014, Kovid Goyal ' import sys from collections import defaultdict +from calibre import replace_entities from calibre.spell.break_iterator import split_into_words, index_of from calibre.spell.dictionary import parse_lang_code from calibre.ebooks.oeb.base import barename from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container +from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.toc import find_existing_toc _patterns = None @@ -96,6 +98,18 @@ def add_words_from_attr(node, attr, words, file_name, locale): def add_words_from_text(node, attr, words, file_name, locale): add_words(getattr(node, attr), node, words, file_name, locale, (False, attr)) +def add_words_from_escaped_html(text, words, file_name, node, attr, locale): + text = replace_entities(text) + root = parse('
%s
' % text, decoder=lambda x:x.decode('utf-8')) + ewords = defaultdict(list) + ewords[None] = 0 + read_words_from_html(root, ewords, file_name, locale) + words[None] += ewords.pop(None) + for k, locs in ewords.iteritems(): + for loc in locs: + loc.location_node, loc.node_item = node, (False, attr) + words[k].extend(locs) + _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf'] opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'} @@ -106,7 +120,10 @@ opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'} def read_words_from_opf(root, words, file_name, book_locale): for tag in root.iterdescendants('*'): if tag.text is not None and barename(tag.tag) in opf_spell_tags: - add_words_from_text(tag, 'text', words, file_name, book_locale) + if barename(tag.tag) == 'description': + add_words_from_escaped_html(tag.text, words, file_name, tag, 'text', book_locale) + else: + add_words_from_text(tag, 'text', words, file_name, book_locale) add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale) ncx_spell_tags = {'text'}