From 67bcfac6b7694caa617ba8e94b69eb7ffcf59746 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 20 Nov 2022 10:46:23 +0530
Subject: [PATCH] Edit book: Reports: Show the number of words per file in the
 Files section of the report

---
 src/calibre/ebooks/oeb/polish/report.py | 15 +++++++++++----
 src/calibre/ebooks/oeb/polish/spell.py  | 13 ++++++++++++-
 src/calibre/gui2/tweak_book/reports.py  | 10 +++++++---
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/oeb/polish/report.py b/src/calibre/ebooks/oeb/polish/report.py
index 018acc28f3..05939e464d 100644
--- a/src/calibre/ebooks/oeb/polish/report.py
+++ b/src/calibre/ebooks/oeb/polish/report.py
@@ -18,7 +18,7 @@ from calibre.utils.imghdr import identify
 from css_selectors import Select, SelectorError
 from polyglot.builtins import iteritems
 
-File = namedtuple('File', 'name dir basename size category')
+File = namedtuple('File', 'name dir basename size category word_count')
 
 
 def get_category(name, mt):
@@ -60,9 +60,10 @@ def safe_img_data(container, name, mt):
 
 
 def files_data(container, *args):
+    fwc = file_words_counts or {}  # per-file counts from the module global; empty dict when the global is None
     for name, path in iteritems(container.name_path_map):
         yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name),
-                   get_category(name, container.mime_map.get(name, '')))
+                   get_category(name, container.mime_map.get(name, '')), fwc.get(name, -1))  # -1: no word count recorded for this file
 
 
 Image = namedtuple('Image', 'name mime_type usage size basename id width height')
@@ -198,8 +199,11 @@ def links_data(container, *args):
 Word = namedtuple('Word', 'id word locale usage')
 
 
+file_words_counts = None  # gather_data() rebinds this to a dict while it runs, then back to None
+
+
 def words_data(container, book_locale, *args):
-    count, words = get_all_words(container, book_locale, get_word_count=True)
+    count, words = get_all_words(container, book_locale, get_word_count=True, file_words_counts=file_words_counts)  # when a dict is passed, get_all_words() stores per-file counts in it
     return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(iteritems(words))))
 
 
@@ -349,12 +353,15 @@ def css_data(container, book_locale, result_data, *args):
 
 
 def gather_data(container, book_locale):
+    global file_words_counts  # rebound below so words_data() and files_data() see the same dict
     timing = {}
     data = {}
-    for x in 'files chars images links words css'.split():
+    file_words_counts = {}  # fresh per-run store for per-file word counts
+    for x in 'chars images links words css files'.split():  # 'files' now runs after 'words' so the counts exist when files_data() reads them
         st = time.time()
         data[x] = globals()[x + '_data'](container, book_locale, data)
         if isinstance(data[x], types.GeneratorType):
             data[x] = tuple(data[x])
         timing[x] = time.time() - st
+    file_words_counts = None  # drop the per-run counts; NOTE(review): not reached if a *_data call raises (no try/finally)
     return data, timing
diff --git a/src/calibre/ebooks/oeb/polish/spell.py b/src/calibre/ebooks/oeb/polish/spell.py
index 6679b604ed..92b3f4d015 100644
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@@ -70,6 +70,9 @@ class Location:
         self.original_word = self.elided_prefix + new_word
 
 
+file_word_count = 0  # running tally incremented by get_words(); get_all_words() zeroes it around each file
+
+
 def filter_words(word):
     if not word:
         return False
@@ -80,10 +83,12 @@ def filter_words(word):
 
 
 def get_words(text, lang):
+    global file_word_count  # module-level tally updated below
     try:
         ans = split_into_words(str(text), lang)
     except (TypeError, ValueError):
         return ()
+    file_word_count += len(ans)  # counts every split word, before filter_words() removes any
     return list(filter(filter_words, ans))
 
 
@@ -299,7 +304,10 @@ def root_is_excluded_from_spell_check(root):
     return False
 
 
-def get_all_words(container, book_locale, get_word_count=False, excluded_files=()):
+def get_all_words(container, book_locale, get_word_count=False, excluded_files=(), file_words_counts=None):
+    global file_word_count
+    if file_words_counts is None:
+        file_words_counts = {}
     words = defaultdict(list)
     words[None] = 0
     file_names, ncx_toc = get_checkable_file_names(container)
@@ -309,12 +317,15 @@ def get_all_words(container, book_locale, get_word_count=False, excluded_files=(
         root = container.parsed(file_name)
         if root_is_excluded_from_spell_check(root):
             continue
+        file_word_count = 0
         if file_name == container.opf_name:
             read_words_from_opf(root, words, file_name, book_locale)
         elif file_name == ncx_toc:
             read_words_from_ncx(root, words, file_name, book_locale)
         elif hasattr(root, 'xpath'):
             read_words_from_html(root, words, file_name, book_locale)
+        file_words_counts[file_name] = file_word_count
+        file_word_count = 0
     count = words.pop(None)
     ans = {k:group_sort(v) for k, v in iteritems(words)}
     if get_word_count:
diff --git a/src/calibre/gui2/tweak_book/reports.py b/src/calibre/gui2/tweak_book/reports.py
index 8e3f496bfe..74daaeaed8 100644
--- a/src/calibre/gui2/tweak_book/reports.py
+++ b/src/calibre/gui2/tweak_book/reports.py
@@ -236,8 +236,8 @@ class FilesView(QTableView):
 
 class FilesModel(FileCollection):
 
-    COLUMN_HEADERS = (_('Folder'), _('Name'), _('Size (KB)'), _('Type'))
-    alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft
+    COLUMN_HEADERS = (_('Folder'), _('Name'), _('Size (KB)'), _('Type'), _('Word count'))
+    alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight
     CATEGORY_NAMES = {
         'image':_('Image'),
         'text': _('Text'),
@@ -257,7 +257,7 @@ class FilesModel(FileCollection):
         self.total_size = sum(map(itemgetter(3), self.files))
         self.images_size = sum(map(itemgetter(3), (f for f in self.files if f.category == 'image')))
         self.fonts_size = sum(map(itemgetter(3), (f for f in self.files if f.category == 'font')))
-        self.sort_keys = tuple((psk(entry.dir), psk(entry.basename), entry.size, psk(self.CATEGORY_NAMES.get(entry.category, '')))
+        self.sort_keys = tuple((psk(entry.dir), psk(entry.basename), entry.size, psk(self.CATEGORY_NAMES.get(entry.category, '')), entry.word_count)
                                for entry in self.files)
         self.endResetModel()
 
@@ -282,6 +282,10 @@ class FilesModel(FileCollection):
                 return '%.2f ' % sz
             if col == 3:
                 return self.CATEGORY_NAMES.get(entry.category)
+            if col == 4:
+                ans = entry.word_count
+                if ans > -1:
+                    return str(ans)
         elif role == Qt.ItemDataRole.TextAlignmentRole:
             return int(Qt.AlignVCenter | self.alignments[index.column()])  # https://bugreports.qt.io/browse/PYSIDE-1974