Edit book: Reports: Show the number of words per file in the Files section of the report

2025-07-09 03:04:10 -04:00 · 2022-11-20 10:46:23 +05:30 · 2022-11-20 10:46:23 +05:30 · 67bcfac6b7
commit 67bcfac6b7
parent c0fce03703
3 changed files with 30 additions and 8 deletions
--- a/src/calibre/ebooks/oeb/polish/report.py
+++ b/src/calibre/ebooks/oeb/polish/report.py
@@ -18,7 +18,7 @@ from calibre.utils.imghdr import identify
 from css_selectors import Select, SelectorError
 from polyglot.builtins import iteritems

-File = namedtuple('File', 'name dir basename size category')
+File = namedtuple('File', 'name dir basename size category word_count')


 def get_category(name, mt):
@@ -60,9 +60,10 @@ def safe_img_data(container, name, mt):


 def files_data(container, *args):
+    fwc = file_words_counts or {}
    for name, path in iteritems(container.name_path_map):
        yield File(name, posixpath.dirname(name), posixpath.basename(name), safe_size(container, name),
-                   get_category(name, container.mime_map.get(name, '')))
+                   get_category(name, container.mime_map.get(name, '')), fwc.get(name, -1))


 Image = namedtuple('Image', 'name mime_type usage size basename id width height')
@@ -198,8 +199,11 @@ def links_data(container, *args):
 Word = namedtuple('Word', 'id word locale usage')


+file_words_counts = None
+
+
 def words_data(container, book_locale, *args):
-    count, words = get_all_words(container, book_locale, get_word_count=True)
+    count, words = get_all_words(container, book_locale, get_word_count=True, file_words_counts=file_words_counts)
    return (count, tuple(Word(i, word, locale, v) for i, ((word, locale), v) in enumerate(iteritems(words))))


@@ -349,12 +353,15 @@ def css_data(container, book_locale, result_data, *args):


 def gather_data(container, book_locale):
+    global file_words_counts
    timing = {}
    data = {}
-    for x in 'files chars images links words css'.split():
+    file_words_counts = {}
+    for x in 'chars images links words css files'.split():
        st = time.time()
        data[x] = globals()[x + '_data'](container, book_locale, data)
        if isinstance(data[x], types.GeneratorType):
            data[x] = tuple(data[x])
        timing[x] = time.time() - st
+    file_words_counts = None
    return data, timing
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@@ -70,6 +70,9 @@ class Location:
        self.original_word = self.elided_prefix + new_word


+file_word_count = 0
+
+
 def filter_words(word):
    if not word:
        return False
@@ -80,10 +83,12 @@ def filter_words(word):


 def get_words(text, lang):
+    global file_word_count
    try:
        ans = split_into_words(str(text), lang)
    except (TypeError, ValueError):
        return ()
+    file_word_count += len(ans)
    return list(filter(filter_words, ans))


@@ -299,7 +304,10 @@ def root_is_excluded_from_spell_check(root):
    return False


-def get_all_words(container, book_locale, get_word_count=False, excluded_files=()):
+def get_all_words(container, book_locale, get_word_count=False, excluded_files=(), file_words_counts=None):
+    global file_word_count
+    if file_words_counts is None:
+        file_words_counts = {}
    words = defaultdict(list)
    words[None] = 0
    file_names, ncx_toc = get_checkable_file_names(container)
@@ -309,12 +317,15 @@ def get_all_words(container, book_locale, get_word_count=False, excluded_files=(
        root = container.parsed(file_name)
        if root_is_excluded_from_spell_check(root):
            continue
+        file_word_count = 0
        if file_name == container.opf_name:
            read_words_from_opf(root, words, file_name, book_locale)
        elif file_name == ncx_toc:
            read_words_from_ncx(root, words, file_name, book_locale)
        elif hasattr(root, 'xpath'):
            read_words_from_html(root, words, file_name, book_locale)
+        file_words_counts[file_name] = file_word_count
+        file_word_count = 0
    count = words.pop(None)
    ans = {k:group_sort(v) for k, v in iteritems(words)}
    if get_word_count:
--- a/src/calibre/gui2/tweak_book/reports.py
+++ b/src/calibre/gui2/tweak_book/reports.py
@@ -236,8 +236,8 @@ class FilesView(QTableView):

 class FilesModel(FileCollection):

-    COLUMN_HEADERS = (_('Folder'), _('Name'), _('Size (KB)'), _('Type'))
-    alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft
+    COLUMN_HEADERS = (_('Folder'), _('Name'), _('Size (KB)'), _('Type'), _('Word count'))
+    alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight
    CATEGORY_NAMES = {
        'image':_('Image'),
        'text': _('Text'),
@@ -257,7 +257,7 @@ class FilesModel(FileCollection):
        self.total_size = sum(map(itemgetter(3), self.files))
        self.images_size = sum(map(itemgetter(3), (f for f in self.files if f.category == 'image')))
        self.fonts_size = sum(map(itemgetter(3), (f for f in self.files if f.category == 'font')))
-        self.sort_keys = tuple((psk(entry.dir), psk(entry.basename), entry.size, psk(self.CATEGORY_NAMES.get(entry.category, '')))
+        self.sort_keys = tuple((psk(entry.dir), psk(entry.basename), entry.size, psk(self.CATEGORY_NAMES.get(entry.category, '')), entry.word_count)
                               for entry in self.files)
        self.endResetModel()

@@ -282,6 +282,10 @@ class FilesModel(FileCollection):
                return '%.2f ' % sz
            if col == 3:
                return self.CATEGORY_NAMES.get(entry.category)
+            if col == 4:
+                ans = entry.word_count
+                if ans > -1:
+                    return str(ans)
        elif role == Qt.ItemDataRole.TextAlignmentRole:
            return int(Qt.AlignVCenter | self.alignments[index.column()])  # https://bugreports.qt.io/browse/PYSIDE-1974