From b302cacc1bdb8af49fc1e8d731cb2ecf1588633d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 8 Mar 2018 12:22:38 +0530 Subject: [PATCH] Edit book: Reports: Do not show characters from the HTML markup in the characters report. Fixes #1753788 [Enhancement editor, reports; characters subwindow](https://bugs.launchpad.net/calibre/+bug/1753788) --- src/calibre/ebooks/oeb/polish/report.py | 32 ++++----- src/calibre/ebooks/oeb/polish/spell.py | 87 ++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/report.py b/src/calibre/ebooks/oeb/polish/report.py index b758dcc202..4a954ae660 100644 --- a/src/calibre/ebooks/oeb/polish/report.py +++ b/src/calibre/ebooks/oeb/polish/report.py @@ -7,14 +7,14 @@ __license__ = 'GPL v3' __copyright__ = '2015, Kovid Goyal ' import posixpath, os, time, types -from collections import namedtuple, defaultdict, Counter +from collections import namedtuple, defaultdict from itertools import chain from calibre import prepare_string_for_xml, force_unicode from calibre.ebooks.oeb.base import XPath, xml2text from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS -from calibre.ebooks.oeb.polish.spell import get_all_words -from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr +from calibre.ebooks.oeb.polish.spell import get_all_words, count_all_chars +from calibre.utils.icu import numeric_sort_key, safe_chr from calibre.utils.imghdr import identify from css_selectors import Select, SelectorError @@ -64,6 +64,7 @@ def files_data(container, *args): yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name), get_category(name, container.mime_map.get(name, ''))) + Image = namedtuple('Image', 'name mime_type usage size basename id width height') LinkLocation = namedtuple('LinkLocation', 'name line_number text_on_line') @@ -139,6 +140,7 @@ def create_anchor_map(root, pat, name): ans[anchor] = (LinkLocation(name, 
elem.sourceline, anchor), description_for_anchor(elem)) return ans + Anchor = namedtuple('Anchor', 'id location text') L = namedtuple('Link', 'location text is_external href path_ok anchor_ok anchor ok') @@ -192,6 +194,7 @@ def links_data(container, *args): link = Link(location, text, False, dest, False, False, Anchor(frag, None, None)) yield link + Word = namedtuple('Word', 'id word locale usage') @@ -199,32 +202,19 @@ def words_data(container, book_locale, *args): count, words = get_all_words(container, book_locale, get_word_count=True) return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(words.iteritems()))) + Char = namedtuple('Char', 'id char codepoint usage count') -def chars_data(container, *args): - chars = defaultdict(set) - counter = Counter() - - def count(codepoint): - counter[codepoint] += 1 - - for name, is_linear in container.spine_names: - if container.mime_map.get(name) not in OEB_DOCS: - continue - raw = container.raw_data(name) - counts = Counter(ord_string(raw)) - counter.update(counts) - for codepoint in counts: - chars[codepoint].add(name) - +def chars_data(container, book_locale, *args): + cc = count_all_chars(container, book_locale) nmap = {n:i for i, (n, l) in enumerate(container.spine_names)} def sort_key(name): return nmap.get(name, len(nmap)), numeric_sort_key(name) - for i, (codepoint, usage) in enumerate(chars.iteritems()): - yield Char(i, safe_chr(codepoint), codepoint, sorted(usage, key=sort_key), counter[codepoint]) + for i, (codepoint, usage) in enumerate(cc.chars.iteritems()): + yield Char(i, safe_chr(codepoint), codepoint, sorted(usage, key=sort_key), cc.counter[codepoint]) CSSRule = namedtuple('CSSRule', 'selector location') diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py index 4b97f13080..271c2474fe 100644 --- a/src/calibre/ebooks/oeb/polish/spell.py +++ b/src/calibre/ebooks/oeb/polish/spell.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = 
'2014, Kovid Goyal ' import sys -from collections import defaultdict +from collections import defaultdict, Counter from calibre import replace_entities from calibre.spell.break_iterator import split_into_words, index_of @@ -16,6 +16,7 @@ from calibre.ebooks.oeb.base import barename from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc +from calibre.utils.icu import ord_string _patterns = None @@ -46,6 +47,14 @@ def patterns(): return _patterns +class CharCounter(object): + + def __init__(self): + self.counter = Counter() + self.chars = defaultdict(set) + self.update = self.counter.update + + class Location(object): __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix') @@ -97,16 +106,36 @@ def add_words(text, node, words, file_name, locale, node_item): words[None] += 1 +def add_chars(text, counter, file_name): + if text: + if isinstance(text, bytes): + text = text.decode('utf-8', 'ignore') + counts = Counter(ord_string(text)) + counter.update(counts) + for codepoint in counts: + counter.chars[codepoint].add(file_name) + + def add_words_from_attr(node, attr, words, file_name, locale): text = node.get(attr, None) if text: add_words(text, node, words, file_name, locale, (True, attr)) +def count_chars_in_attr(node, attr, counter, file_name, locale): + text = node.get(attr, None) + if text: + add_chars(text, counter, file_name) + + def add_words_from_text(node, attr, words, file_name, locale): add_words(getattr(node, attr), node, words, file_name, locale, (False, attr)) +def count_chars_in_text(node, attr, counter, file_name, locale): + add_chars(getattr(node, attr), counter, file_name) + + def add_words_from_escaped_html(text, words, file_name, node, attr, locale): text = replace_entities(text) root = parse('
<html><body><div>%s</div></body></html>
' % text, decoder=lambda x:x.decode('utf-8')) @@ -120,6 +149,12 @@ def add_words_from_escaped_html(text, words, file_name, node, attr, locale): words[k].extend(locs) +def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale): + text = replace_entities(text) + root = parse('
<html><body><div>%s</div></body></html>
' % text, decoder=lambda x:x.decode('utf-8')) + count_chars_in_html(root, counter, file_name, locale) + + _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf'] opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'} @@ -137,6 +172,16 @@ def read_words_from_opf(root, words, file_name, book_locale): add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale) +def count_chars_in_opf(root, counter, file_name, book_locale): + for tag in root.iterdescendants('*'): + if tag.text is not None and barename(tag.tag) in opf_spell_tags: + if barename(tag.tag) == 'description': + count_chars_in_escaped_html(tag.text, counter, file_name, tag, 'text', book_locale) + else: + count_chars_in_text(tag, 'text', counter, file_name, book_locale) + count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale) + + ncx_spell_tags = {'text'} xml_spell_tags = opf_spell_tags | ncx_spell_tags @@ -147,6 +192,12 @@ def read_words_from_ncx(root, words, file_name, book_locale): add_words_from_text(tag, 'text', words, file_name, book_locale) +def count_chars_in_ncx(root, counter, file_name, book_locale): + for tag in root.xpath('//*[local-name()="text"]'): + if tag.text is not None: + count_chars_in_text(tag, 'text', counter, file_name, book_locale) + + html_spell_tags = {'script', 'style', 'link'} @@ -159,6 +210,15 @@ def read_words_from_html_tag(tag, words, file_name, parent_locale, locale): add_words_from_text(tag, 'tail', words, file_name, parent_locale) +def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale): + if tag.text is not None and barename(tag.tag) not in html_spell_tags: + count_chars_in_text(tag, 'text', counter, file_name, locale) + for attr in {'alt', 'title'}: + count_chars_in_attr(tag, attr, counter, file_name, locale) + if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags: + count_chars_in_text(tag, 'tail', counter, file_name, parent_locale) + + def 
locale_from_tag(tag): if 'lang' in tag.attrib: try: @@ -185,6 +245,15 @@ def read_words_from_html(root, words, file_name, book_locale): stack.extend((tag, locale) for tag in parent.iterchildren('*')) +def count_chars_in_html(root, counter, file_name, book_locale): + stack = [(root, book_locale)] + while stack: + parent, parent_locale = stack.pop() + locale = locale_from_tag(parent) or parent_locale + count_chars_in_html_tag(parent, counter, file_name, parent_locale, locale) + stack.extend((tag, locale) for tag in parent.iterchildren('*')) + + def group_sort(locations): order = {} for loc in locations: @@ -223,6 +292,22 @@ def get_all_words(container, book_locale, get_word_count=False): return ans +def count_all_chars(container, book_locale): + ans = CharCounter() + file_names, toc = get_checkable_file_names(container) + for file_name in file_names: + if not container.exists(file_name): + continue + root = container.parsed(file_name) + if file_name == container.opf_name: + count_chars_in_opf(root, ans, file_name, book_locale) + elif file_name == toc: + count_chars_in_ncx(root, ans, file_name, book_locale) + elif hasattr(root, 'xpath'): + count_chars_in_html(root, ans, file_name, book_locale) + return ans + + def merge_locations(locs1, locs2): return group_sort(locs1 + locs2)