mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Edit book: Reports: Do not show characters from the HTML markup in the characters report. Fixes #1753788 [Enhancement editor, reports; characters subwindow](https://bugs.launchpad.net/calibre/+bug/1753788)
This commit is contained in:
parent
9651296a57
commit
b302cacc1b
@ -7,14 +7,14 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import posixpath, os, time, types
|
import posixpath, os, time, types
|
||||||
from collections import namedtuple, defaultdict, Counter
|
from collections import namedtuple, defaultdict
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml, force_unicode
|
from calibre import prepare_string_for_xml, force_unicode
|
||||||
from calibre.ebooks.oeb.base import XPath, xml2text
|
from calibre.ebooks.oeb.base import XPath, xml2text
|
||||||
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
|
from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
|
||||||
from calibre.ebooks.oeb.polish.spell import get_all_words
|
from calibre.ebooks.oeb.polish.spell import get_all_words, count_all_chars
|
||||||
from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr
|
from calibre.utils.icu import numeric_sort_key, safe_chr
|
||||||
from calibre.utils.imghdr import identify
|
from calibre.utils.imghdr import identify
|
||||||
from css_selectors import Select, SelectorError
|
from css_selectors import Select, SelectorError
|
||||||
|
|
||||||
@ -64,6 +64,7 @@ def files_data(container, *args):
|
|||||||
yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name),
|
yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name),
|
||||||
get_category(name, container.mime_map.get(name, '')))
|
get_category(name, container.mime_map.get(name, '')))
|
||||||
|
|
||||||
|
|
||||||
Image = namedtuple('Image', 'name mime_type usage size basename id width height')
|
Image = namedtuple('Image', 'name mime_type usage size basename id width height')
|
||||||
|
|
||||||
LinkLocation = namedtuple('LinkLocation', 'name line_number text_on_line')
|
LinkLocation = namedtuple('LinkLocation', 'name line_number text_on_line')
|
||||||
@ -139,6 +140,7 @@ def create_anchor_map(root, pat, name):
|
|||||||
ans[anchor] = (LinkLocation(name, elem.sourceline, anchor), description_for_anchor(elem))
|
ans[anchor] = (LinkLocation(name, elem.sourceline, anchor), description_for_anchor(elem))
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
Anchor = namedtuple('Anchor', 'id location text')
|
Anchor = namedtuple('Anchor', 'id location text')
|
||||||
L = namedtuple('Link', 'location text is_external href path_ok anchor_ok anchor ok')
|
L = namedtuple('Link', 'location text is_external href path_ok anchor_ok anchor ok')
|
||||||
|
|
||||||
@ -192,6 +194,7 @@ def links_data(container, *args):
|
|||||||
link = Link(location, text, False, dest, False, False, Anchor(frag, None, None))
|
link = Link(location, text, False, dest, False, False, Anchor(frag, None, None))
|
||||||
yield link
|
yield link
|
||||||
|
|
||||||
|
|
||||||
Word = namedtuple('Word', 'id word locale usage')
|
Word = namedtuple('Word', 'id word locale usage')
|
||||||
|
|
||||||
|
|
||||||
@ -199,32 +202,19 @@ def words_data(container, book_locale, *args):
|
|||||||
count, words = get_all_words(container, book_locale, get_word_count=True)
|
count, words = get_all_words(container, book_locale, get_word_count=True)
|
||||||
return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(words.iteritems())))
|
return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(words.iteritems())))
|
||||||
|
|
||||||
|
|
||||||
Char = namedtuple('Char', 'id char codepoint usage count')
|
Char = namedtuple('Char', 'id char codepoint usage count')
|
||||||
|
|
||||||
|
|
||||||
def chars_data(container, book_locale, *args):
    '''
    Yield one ``Char`` record for every distinct codepoint counted in the book.

    Counting is delegated to ``count_all_chars()``, which works on the parsed
    files rather than on ``container.raw_data()``, so characters that only
    occur in the markup itself are not reported.

    :param container: The book container; ``spine_names`` is read to order
        the usage lists.
    :param book_locale: Passed through unchanged to ``count_all_chars()``.
    :param args: Accepted and ignored.
    :return: Generator of ``Char(id, char, codepoint, usage, count)`` where
        ``usage`` is the sorted list of file names containing the codepoint.
    '''
    cc = count_all_chars(container, book_locale)

    # Position of every spine item, so usages can be listed in spine order.
    spine_position = {name: i for i, (name, is_linear) in enumerate(container.spine_names)}
    # Files that are not in the spine sort after all spine items.
    not_in_spine = len(spine_position)

    def sort_key(name):
        return spine_position.get(name, not_in_spine), numeric_sort_key(name)

    for i, (codepoint, usage) in enumerate(cc.chars.iteritems()):
        yield Char(i, safe_chr(codepoint), codepoint, sorted(usage, key=sort_key), cc.counter[codepoint])
|
||||||
|
|
||||||
|
|
||||||
CSSRule = namedtuple('CSSRule', 'selector location')
|
CSSRule = namedtuple('CSSRule', 'selector location')
|
||||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict, Counter
|
||||||
|
|
||||||
from calibre import replace_entities
|
from calibre import replace_entities
|
||||||
from calibre.spell.break_iterator import split_into_words, index_of
|
from calibre.spell.break_iterator import split_into_words, index_of
|
||||||
@ -16,6 +16,7 @@ from calibre.ebooks.oeb.base import barename
|
|||||||
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
|
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
|
||||||
from calibre.ebooks.oeb.polish.parsing import parse
|
from calibre.ebooks.oeb.polish.parsing import parse
|
||||||
from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc
|
from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc
|
||||||
|
from calibre.utils.icu import ord_string
|
||||||
|
|
||||||
_patterns = None
|
_patterns = None
|
||||||
|
|
||||||
@ -46,6 +47,14 @@ def patterns():
|
|||||||
return _patterns
|
return _patterns
|
||||||
|
|
||||||
|
|
||||||
|
class CharCounter(object):
    '''
    Accumulator for character statistics.

    ``counter`` maps a codepoint to the number of times it was seen and
    ``chars`` maps a codepoint to the set of file names it was seen in.
    ``update`` is the ``update`` method of ``counter``, exposed directly on
    the instance so callers can merge counts without reaching into it.
    '''

    def __init__(self):
        self.counter = Counter()
        self.chars = defaultdict(set)
        # Alias, not a wrapper: calling it only touches self.counter, never
        # self.chars.
        self.update = self.counter.update
|
||||||
|
|
||||||
|
|
||||||
class Location(object):
|
class Location(object):
|
||||||
|
|
||||||
__slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')
|
__slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')
|
||||||
@ -97,16 +106,36 @@ def add_words(text, node, words, file_name, locale, node_item):
|
|||||||
words[None] += 1
|
words[None] += 1
|
||||||
|
|
||||||
|
|
||||||
|
def add_chars(text, counter, file_name):
    '''
    Record every codepoint of ``text`` in ``counter``.

    :param text: Unicode or byte string; falsy values (``None``, empty) are
        ignored. Bytes are decoded as UTF-8 with undecodable bytes dropped.
    :param counter: Object with an ``update()`` method accepting a mapping of
        codepoint to count and a ``chars`` mapping of codepoint to a set of
        file names (see ``CharCounter``).
    :param file_name: Name recorded against each codepoint found in ``text``.
    '''
    if not text:
        return
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    counts = Counter(ord_string(text))
    counter.update(counts)
    for codepoint in counts:
        counter.chars[codepoint].add(file_name)
|
||||||
|
|
||||||
|
|
||||||
def add_words_from_attr(node, attr, words, file_name, locale):
    '''
    Feed the value of attribute ``attr`` on ``node`` to ``add_words()``.

    Nothing happens when the attribute is missing or empty. The location item
    passed on is ``(True, attr)``.
    '''
    text = node.get(attr, None)
    if text:
        add_words(text, node, words, file_name, locale, (True, attr))
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_attr(node, attr, counter, file_name, locale):
    '''
    Count the characters in the value of attribute ``attr`` on ``node``.

    Nothing happens when the attribute is missing or empty. ``locale`` is
    accepted but not used by this function.
    '''
    text = node.get(attr, None)
    if text:
        add_chars(text, counter, file_name)
|
||||||
|
|
||||||
|
|
||||||
def add_words_from_text(node, attr, words, file_name, locale):
    '''
    Feed the Python attribute ``attr`` of ``node`` (callers in this module
    pass ``'text'`` or ``'tail'``) to ``add_words()``.

    The location item passed on is ``(False, attr)``.
    '''
    add_words(getattr(node, attr), node, words, file_name, locale, (False, attr))
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_text(node, attr, counter, file_name, locale):
    '''
    Count the characters in the Python attribute ``attr`` of ``node``
    (callers in this module pass ``'text'`` or ``'tail'``).

    ``locale`` is accepted but not used by this function.
    '''
    add_chars(getattr(node, attr), counter, file_name)
|
||||||
|
|
||||||
|
|
||||||
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
|
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
|
||||||
text = replace_entities(text)
|
text = replace_entities(text)
|
||||||
root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
|
root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
|
||||||
@ -120,6 +149,12 @@ def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
|
|||||||
words[k].extend(locs)
|
words[k].extend(locs)
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    '''
    Count the characters in ``text``, which holds entity-escaped HTML.

    Entities are replaced first, then the result is wrapped in a minimal
    document, parsed, and walked with ``count_chars_in_html()``. ``node`` and
    ``attr`` are accepted but not used by this function.
    '''
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)
|
||||||
|
|
||||||
|
|
||||||
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
|
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
|
||||||
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
|
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
|
||||||
|
|
||||||
@ -137,6 +172,16 @@ def read_words_from_opf(root, words, file_name, book_locale):
|
|||||||
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
|
add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_opf(root, counter, file_name, book_locale):
    '''
    Count the characters in the OPF metadata under ``root``.

    For every descendant element whose bare tag name is in ``opf_spell_tags``
    and which has text, the text is counted; ``description`` text is treated
    as escaped HTML. The ``file-as`` attribute is counted as well.
    '''
    for tag in root.iterdescendants('*'):
        if tag.text is not None and barename(tag.tag) in opf_spell_tags:
            if barename(tag.tag) == 'description':
                count_chars_in_escaped_html(tag.text, counter, file_name, tag, 'text', book_locale)
            else:
                count_chars_in_text(tag, 'text', counter, file_name, book_locale)
        # NOTE(review): the nesting of this call could not be recovered from
        # the flattened source; it is placed at loop level so file-as is
        # counted for every element - confirm against the upstream file.
        count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale)
|
||||||
|
|
||||||
|
|
||||||
ncx_spell_tags = {'text'}
|
ncx_spell_tags = {'text'}
|
||||||
xml_spell_tags = opf_spell_tags | ncx_spell_tags
|
xml_spell_tags = opf_spell_tags | ncx_spell_tags
|
||||||
|
|
||||||
@ -147,6 +192,12 @@ def read_words_from_ncx(root, words, file_name, book_locale):
|
|||||||
add_words_from_text(tag, 'text', words, file_name, book_locale)
|
add_words_from_text(tag, 'text', words, file_name, book_locale)
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_ncx(root, counter, file_name, book_locale):
    '''
    Count the characters in the text of every element under ``root`` whose
    local name is ``text``, skipping elements that have no text.
    '''
    for tag in root.xpath('//*[local-name()="text"]'):
        if tag.text is not None:
            count_chars_in_text(tag, 'text', counter, file_name, book_locale)
|
||||||
|
|
||||||
|
|
||||||
html_spell_tags = {'script', 'style', 'link'}
|
html_spell_tags = {'script', 'style', 'link'}
|
||||||
|
|
||||||
|
|
||||||
@ -159,6 +210,15 @@ def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
|
|||||||
add_words_from_text(tag, 'tail', words, file_name, parent_locale)
|
add_words_from_text(tag, 'tail', words, file_name, parent_locale)
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
    '''
    Count the characters contributed by a single HTML element.

    Three sources are considered: the element's text (unless the element
    itself is listed in ``html_spell_tags``), its ``alt`` and ``title``
    attributes, and its tail (unless the parent is listed in
    ``html_spell_tags``). Text and attributes are passed ``locale``; the tail
    belongs to the parent and is passed ``parent_locale``.
    '''
    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'text', counter, file_name, locale)
    # A tuple rather than a set literal: same attributes, deterministic order.
    for attr in ('alt', 'title'):
        count_chars_in_attr(tag, attr, counter, file_name, locale)
    parent = tag.getparent()
    if tag.tail is not None and parent is not None and barename(parent.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'tail', counter, file_name, parent_locale)
|
||||||
|
|
||||||
|
|
||||||
def locale_from_tag(tag):
|
def locale_from_tag(tag):
|
||||||
if 'lang' in tag.attrib:
|
if 'lang' in tag.attrib:
|
||||||
try:
|
try:
|
||||||
@ -185,6 +245,15 @@ def read_words_from_html(root, words, file_name, book_locale):
|
|||||||
stack.extend((tag, locale) for tag in parent.iterchildren('*'))
|
stack.extend((tag, locale) for tag in parent.iterchildren('*'))
|
||||||
|
|
||||||
|
|
||||||
|
def count_chars_in_html(root, counter, file_name, book_locale):
    '''
    Count the characters in ``root`` and all of its descendant elements.

    The tree is walked iteratively with an explicit stack. Each entry pairs
    an element with the locale inherited from its parent; an element's own
    locale, when ``locale_from_tag()`` returns one, replaces the inherited
    locale for the element and its children.
    '''
    stack = [(root, book_locale)]
    while stack:
        parent, parent_locale = stack.pop()
        locale = locale_from_tag(parent) or parent_locale
        count_chars_in_html_tag(parent, counter, file_name, parent_locale, locale)
        stack.extend((tag, locale) for tag in parent.iterchildren('*'))
|
||||||
|
|
||||||
|
|
||||||
def group_sort(locations):
|
def group_sort(locations):
|
||||||
order = {}
|
order = {}
|
||||||
for loc in locations:
|
for loc in locations:
|
||||||
@ -223,6 +292,22 @@ def get_all_words(container, book_locale, get_word_count=False):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
def count_all_chars(container, book_locale):
    '''
    Count the characters in every checkable file of ``container``.

    The file list and the ToC file name come from
    ``get_checkable_file_names()``. Files that no longer exist are skipped.
    The OPF and the ToC get their dedicated counters; any other file is
    counted as HTML provided its parsed form has an ``xpath`` attribute.

    :return: A ``CharCounter`` holding the totals and per-codepoint file sets.
    '''
    ans = CharCounter()
    file_names, toc = get_checkable_file_names(container)
    for file_name in file_names:
        if not container.exists(file_name):
            continue
        root = container.parsed(file_name)
        if file_name == container.opf_name:
            count_chars_in_opf(root, ans, file_name, book_locale)
        elif file_name == toc:
            count_chars_in_ncx(root, ans, file_name, book_locale)
        elif hasattr(root, 'xpath'):
            count_chars_in_html(root, ans, file_name, book_locale)
    return ans
|
||||||
|
|
||||||
|
|
||||||
def merge_locations(locs1, locs2):
    '''
    Concatenate two lists of locations and return the result of passing the
    combined list through ``group_sort()``.
    '''
    return group_sort(locs1 + locs2)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user