From 67bcfac6b7694caa617ba8e94b69eb7ffcf59746 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Nov 2022 10:46:23 +0530 Subject: [PATCH] Edit book: Reports: Show the number of words per file in the Files section of the report --- src/calibre/ebooks/oeb/polish/report.py | 15 +++++++++++---- src/calibre/ebooks/oeb/polish/spell.py | 13 ++++++++++++- src/calibre/gui2/tweak_book/reports.py | 10 +++++++--- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/report.py b/src/calibre/ebooks/oeb/polish/report.py index 018acc28f3..05939e464d 100644 --- a/src/calibre/ebooks/oeb/polish/report.py +++ b/src/calibre/ebooks/oeb/polish/report.py @@ -18,7 +18,7 @@ from calibre.utils.imghdr import identify from css_selectors import Select, SelectorError from polyglot.builtins import iteritems -File = namedtuple('File', 'name dir basename size category') +File = namedtuple('File', 'name dir basename size category word_count') def get_category(name, mt): @@ -60,9 +60,10 @@ def safe_img_data(container, name, mt): def files_data(container, *args): + fwc = file_words_counts or {} for name, path in iteritems(container.name_path_map): yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name), - get_category(name, container.mime_map.get(name, ''))) + get_category(name, container.mime_map.get(name, '')), fwc.get(name, -1)) Image = namedtuple('Image', 'name mime_type usage size basename id width height') @@ -198,8 +199,11 @@ def links_data(container, *args): Word = namedtuple('Word', 'id word locale usage') +file_words_counts = None + + def words_data(container, book_locale, *args): - count, words = get_all_words(container, book_locale, get_word_count=True) + count, words = get_all_words(container, book_locale, get_word_count=True, file_words_counts=file_words_counts) return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(iteritems(words)))) @@ -349,12 +353,15 @@ def css_data(container, book_locale, result_data, *args): def gather_data(container, book_locale): + global file_words_counts timing = {} data = {} - for x in 'files chars images links words css'.split(): + file_words_counts = {} + for x in 'chars images links words css files'.split(): st = time.time() data[x] = globals()[x + '_data'](container, book_locale, data) if isinstance(data[x], types.GeneratorType): data[x] = tuple(data[x]) timing[x] = time.time() - st + file_words_counts = None return data, timing diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py index 6679b604ed..92b3f4d015 100644 --- a/src/calibre/ebooks/oeb/polish/spell.py +++ b/src/calibre/ebooks/oeb/polish/spell.py @@ -70,6 +70,9 @@ class Location: self.original_word = self.elided_prefix + new_word +file_word_count = 0 + + def filter_words(word): if not word: return False @@ -80,10 +83,12 @@ def filter_words(word): def get_words(text, lang): + global file_word_count try: ans = split_into_words(str(text), lang) except (TypeError, ValueError): return () + file_word_count += len(ans) return list(filter(filter_words, ans)) @@ -299,7 +304,10 @@ def root_is_excluded_from_spell_check(root): return False -def get_all_words(container, book_locale, get_word_count=False, excluded_files=()): +def get_all_words(container, book_locale, get_word_count=False, excluded_files=(), file_words_counts=None): + global file_word_count + if file_words_counts is None: + file_words_counts = {} words = defaultdict(list) words[None] = 0 file_names, ncx_toc = get_checkable_file_names(container) @@ -309,12 +317,15 @@ def get_all_words(container, book_locale, get_word_count=False, excluded_files=( root = container.parsed(file_name) if root_is_excluded_from_spell_check(root): continue + file_word_count = 0 if file_name == container.opf_name: read_words_from_opf(root, words, file_name, book_locale) elif file_name == ncx_toc: read_words_from_ncx(root, words, file_name, book_locale) elif hasattr(root, 'xpath'): read_words_from_html(root, words, file_name, book_locale) + file_words_counts[file_name] = file_word_count + file_word_count = 0 count = words.pop(None) ans = {k:group_sort(v) for k, v in iteritems(words)} if get_word_count: diff --git a/src/calibre/gui2/tweak_book/reports.py b/src/calibre/gui2/tweak_book/reports.py index 8e3f496bfe..74daaeaed8 100644 --- a/src/calibre/gui2/tweak_book/reports.py +++ b/src/calibre/gui2/tweak_book/reports.py @@ -236,8 +236,8 @@ class FilesView(QTableView): class FilesModel(FileCollection): - COLUMN_HEADERS = (_('Folder'), _('Name'), _('Size (KB)'), _('Type')) - alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft + COLUMN_HEADERS = (_('Folder'), _('Name'), _('Size (KB)'), _('Type'), _('Word count')) + alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight CATEGORY_NAMES = { 'image':_('Image'), 'text': _('Text'), @@ -257,7 +257,7 @@ class FilesModel(FileCollection): self.total_size = sum(map(itemgetter(3), self.files)) self.images_size = sum(map(itemgetter(3), (f for f in self.files if f.category == 'image'))) self.fonts_size = sum(map(itemgetter(3), (f for f in self.files if f.category == 'font'))) - self.sort_keys = tuple((psk(entry.dir), psk(entry.basename), entry.size, psk(self.CATEGORY_NAMES.get(entry.category, ''))) + self.sort_keys = tuple((psk(entry.dir), psk(entry.basename), entry.size, psk(self.CATEGORY_NAMES.get(entry.category, '')), entry.word_count) for entry in self.files) self.endResetModel() @@ -282,6 +282,10 @@ class FilesModel(FileCollection): return '%.2f ' % sz if col == 3: return self.CATEGORY_NAMES.get(entry.category) + if col == 4: + ans = entry.word_count + if ans > -1: + return str(ans) elif role == Qt.ItemDataRole.TextAlignmentRole: return int(Qt.AlignVCenter | self.alignments[index.column()]) # https://bugreports.qt.io/browse/PYSIDE-1974