Edit book: Reports: Do not show characters from the HTML markup in the characters report. Fixes #1753788 [Enhancement editor, reports; characters subwindow](https://bugs.launchpad.net/calibre/+bug/1753788)

This commit is contained in:
Kovid Goyal 2018-03-08 12:22:38 +05:30
parent 9651296a57
commit b302cacc1b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 97 additions and 22 deletions

View File

@ -7,14 +7,14 @@ __license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath, os, time, types import posixpath, os, time, types
from collections import namedtuple, defaultdict, Counter from collections import namedtuple, defaultdict
from itertools import chain from itertools import chain
from calibre import prepare_string_for_xml, force_unicode from calibre import prepare_string_for_xml, force_unicode
from calibre.ebooks.oeb.base import XPath, xml2text from calibre.ebooks.oeb.base import XPath, xml2text
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
from calibre.ebooks.oeb.polish.spell import get_all_words from calibre.ebooks.oeb.polish.spell import get_all_words, count_all_chars
from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr from calibre.utils.icu import numeric_sort_key, safe_chr
from calibre.utils.imghdr import identify from calibre.utils.imghdr import identify
from css_selectors import Select, SelectorError from css_selectors import Select, SelectorError
@ -64,6 +64,7 @@ def files_data(container, *args):
yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name), yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name),
get_category(name, container.mime_map.get(name, ''))) get_category(name, container.mime_map.get(name, '')))
Image = namedtuple('Image', 'name mime_type usage size basename id width height') Image = namedtuple('Image', 'name mime_type usage size basename id width height')
LinkLocation = namedtuple('LinkLocation', 'name line_number text_on_line') LinkLocation = namedtuple('LinkLocation', 'name line_number text_on_line')
@ -139,6 +140,7 @@ def create_anchor_map(root, pat, name):
ans[anchor] = (LinkLocation(name, elem.sourceline, anchor), description_for_anchor(elem)) ans[anchor] = (LinkLocation(name, elem.sourceline, anchor), description_for_anchor(elem))
return ans return ans
Anchor = namedtuple('Anchor', 'id location text') Anchor = namedtuple('Anchor', 'id location text')
L = namedtuple('Link', 'location text is_external href path_ok anchor_ok anchor ok') L = namedtuple('Link', 'location text is_external href path_ok anchor_ok anchor ok')
@ -192,6 +194,7 @@ def links_data(container, *args):
link = Link(location, text, False, dest, False, False, Anchor(frag, None, None)) link = Link(location, text, False, dest, False, False, Anchor(frag, None, None))
yield link yield link
Word = namedtuple('Word', 'id word locale usage') Word = namedtuple('Word', 'id word locale usage')
@ -199,32 +202,19 @@ def words_data(container, book_locale, *args):
count, words = get_all_words(container, book_locale, get_word_count=True) count, words = get_all_words(container, book_locale, get_word_count=True)
return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(words.iteritems()))) return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(words.iteritems())))
Char = namedtuple('Char', 'id char codepoint usage count') Char = namedtuple('Char', 'id char codepoint usage count')
def chars_data(container, book_locale, *args):
    """Yield a Char tuple for every distinct codepoint that occurs in the
    book's textual content (markup itself is excluded by count_all_chars)."""
    cc = count_all_chars(container, book_locale)
    # Map spine file names to their position, so usage lists sort in reading order
    spine_order = {name: idx for idx, (name, is_linear) in enumerate(container.spine_names)}

    def sort_key(name):
        # Files not in the spine sort after all spine files, then naturally by name
        return spine_order.get(name, len(spine_order)), numeric_sort_key(name)

    for i, (codepoint, usage) in enumerate(cc.chars.iteritems()):
        yield Char(i, safe_chr(codepoint), codepoint, sorted(usage, key=sort_key), cc.counter[codepoint])
CSSRule = namedtuple('CSSRule', 'selector location') CSSRule = namedtuple('CSSRule', 'selector location')

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import sys import sys
from collections import defaultdict from collections import defaultdict, Counter
from calibre import replace_entities from calibre import replace_entities
from calibre.spell.break_iterator import split_into_words, index_of from calibre.spell.break_iterator import split_into_words, index_of
@ -16,6 +16,7 @@ from calibre.ebooks.oeb.base import barename
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc
from calibre.utils.icu import ord_string
_patterns = None _patterns = None
@ -46,6 +47,14 @@ def patterns():
return _patterns return _patterns
class CharCounter(object):
    """Mutable accumulator pairing a per-codepoint frequency Counter with a
    per-codepoint set of the file names in which each codepoint occurs."""

    def __init__(self):
        self.chars = defaultdict(set)  # codepoint -> {file names containing it}
        self.counter = Counter()       # codepoint -> total occurrence count
        # Expose Counter.update directly so callers can feed counts in bulk
        self.update = self.counter.update
class Location(object): class Location(object):
__slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix') __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')
@ -97,16 +106,36 @@ def add_words(text, node, words, file_name, locale, node_item):
words[None] += 1 words[None] += 1
def add_chars(text, counter, file_name):
    """Record every codepoint of *text* into *counter* (a CharCounter),
    attributing the occurrences to *file_name*. Empty/None text is a no-op."""
    if not text:
        return
    if isinstance(text, bytes):
        # Tolerate undecodable bytes rather than failing the whole report
        text = text.decode('utf-8', 'ignore')
    freq = Counter(ord_string(text))
    counter.update(freq)
    for codepoint in freq:
        counter.chars[codepoint].add(file_name)
def add_words_from_attr(node, attr, words, file_name, locale):
    """Collect words from the value of attribute *attr* on *node*, if present."""
    value = node.get(attr, None)
    if value:
        add_words(value, node, words, file_name, locale, (True, attr))
def count_chars_in_attr(node, attr, counter, file_name, locale):
    """Count the characters of the value of attribute *attr* on *node*.
    *locale* is unused; the parameter mirrors the word-counting API."""
    # add_chars() ignores missing/empty values, so no guard is needed here
    add_chars(node.get(attr, None), counter, file_name)
def add_words_from_text(node, attr, words, file_name, locale):
    """Collect words from node.text or node.tail (*attr* selects which)."""
    content = getattr(node, attr)
    add_words(content, node, words, file_name, locale, (False, attr))
def count_chars_in_text(node, attr, counter, file_name, locale):
    """Count characters in node.text or node.tail (*attr* selects which).
    *locale* is unused; the parameter mirrors the word-counting API."""
    content = getattr(node, attr)
    add_chars(content, counter, file_name)
def add_words_from_escaped_html(text, words, file_name, node, attr, locale): def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
text = replace_entities(text) text = replace_entities(text)
root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8')) root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
@ -120,6 +149,12 @@ def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
words[k].extend(locs) words[k].extend(locs)
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    """Count characters in escaped HTML (e.g. an OPF <description>) by
    unescaping and parsing it, so the markup itself is not counted.
    *node* and *attr* are unused; kept to mirror the word-counting API."""
    unescaped = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % unescaped,
                 decoder=lambda x: x.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf'] _opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'} opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
@ -137,6 +172,16 @@ def read_words_from_opf(root, words, file_name, book_locale):
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale) add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
def count_chars_in_opf(root, counter, file_name, book_locale):
    """Count characters in the human-readable metadata of an OPF file."""
    for tag in root.iterdescendants('*'):
        tag_name = barename(tag.tag)  # hoisted: same result for both checks below
        if tag.text is not None and tag_name in opf_spell_tags:
            if tag_name == 'description':
                # The description holds escaped HTML; parse it so that the
                # markup itself is not counted as characters
                count_chars_in_escaped_html(tag.text, counter, file_name, tag, 'text', book_locale)
            else:
                count_chars_in_text(tag, 'text', counter, file_name, book_locale)
        count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale)
ncx_spell_tags = {'text'} ncx_spell_tags = {'text'}
xml_spell_tags = opf_spell_tags | ncx_spell_tags xml_spell_tags = opf_spell_tags | ncx_spell_tags
@ -147,6 +192,12 @@ def read_words_from_ncx(root, words, file_name, book_locale):
add_words_from_text(tag, 'text', words, file_name, book_locale) add_words_from_text(tag, 'text', words, file_name, book_locale)
def count_chars_in_ncx(root, counter, file_name, book_locale):
    """Count characters in the textual labels of an NCX table of contents."""
    for tag in root.xpath('//*[local-name()="text"]'):
        if tag.text is None:
            continue
        count_chars_in_text(tag, 'text', counter, file_name, book_locale)
html_spell_tags = {'script', 'style', 'link'} html_spell_tags = {'script', 'style', 'link'}
@ -159,6 +210,15 @@ def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
add_words_from_text(tag, 'tail', words, file_name, parent_locale) add_words_from_text(tag, 'tail', words, file_name, parent_locale)
def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
    """Count the displayable characters contributed by a single HTML element:
    its text, its user-visible attributes, and its tail text."""
    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'text', counter, file_name, locale)
    # User-visible attribute values contribute characters too
    for attr_name in ('alt', 'title'):
        count_chars_in_attr(tag, attr_name, counter, file_name, locale)
    parent = tag.getparent()
    # Tail text belongs to the parent's content, hence the parent locale
    if tag.tail is not None and parent is not None and barename(parent.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'tail', counter, file_name, parent_locale)
def locale_from_tag(tag): def locale_from_tag(tag):
if 'lang' in tag.attrib: if 'lang' in tag.attrib:
try: try:
@ -185,6 +245,15 @@ def read_words_from_html(root, words, file_name, book_locale):
stack.extend((tag, locale) for tag in parent.iterchildren('*')) stack.extend((tag, locale) for tag in parent.iterchildren('*'))
def count_chars_in_html(root, counter, file_name, book_locale):
    """Walk the HTML tree counting characters, tracking the language that
    each element inherits from its nearest ancestor with a lang attribute."""
    pending = [(root, book_locale)]
    while pending:
        elem, inherited_locale = pending.pop()
        current_locale = locale_from_tag(elem) or inherited_locale
        count_chars_in_html_tag(elem, counter, file_name, inherited_locale, current_locale)
        for child in elem.iterchildren('*'):
            pending.append((child, current_locale))
def group_sort(locations): def group_sort(locations):
order = {} order = {}
for loc in locations: for loc in locations:
@ -223,6 +292,22 @@ def get_all_words(container, book_locale, get_word_count=False):
return ans return ans
def count_all_chars(container, book_locale):
    """Return a CharCounter covering the OPF metadata, the NCX ToC and all
    HTML files in *container*, counting only text content, not markup."""
    ans = CharCounter()
    file_names, toc = get_checkable_file_names(container)
    for fname in file_names:
        if not container.exists(fname):
            continue
        root = container.parsed(fname)
        if fname == container.opf_name:
            count_chars_in_opf(root, ans, fname, book_locale)
        elif fname == toc:
            count_chars_in_ncx(root, ans, fname, book_locale)
        elif hasattr(root, 'xpath'):
            # Only parsed XML/HTML trees support xpath; skip everything else
            count_chars_in_html(root, ans, fname, book_locale)
    return ans
def merge_locations(locs1, locs2):
    """Combine two location lists and return them grouped and sorted."""
    combined = locs1 + locs2
    return group_sort(combined)