From a7375ad7d41c52b24745fe36d8824e923f199b7c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 23 Mar 2025 05:43:42 +0530
Subject: [PATCH] Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip
 (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)

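The earlier change forgot to update the extractable-format check in the db
cache, which still used is_fmt_ok() and so dropped archive formats from the
FTS queue. Use is_fmt_extractable() there instead.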
---
 src/calibre/db/cache.py    |  4 ++--
 src/calibre/db/fts/text.py | 36 ++++++++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index 6a76c5e852..083f573ea8 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -528,7 +528,7 @@ class Cache:
 
     @staticmethod
     def dispatch_fts_jobs(queue, stop_dispatch, dbref):
-        from .fts.text import is_fmt_ok
+        from .fts.text import is_fmt_extractable
 
         def do_one():
             self = dbref()
@@ -542,7 +542,7 @@ class Cache:
                 if book_id is None:
                     return False
                 path = self._format_abspath(book_id, fmt)
-            if not path or not is_fmt_ok(fmt):
+            if not path or not is_fmt_extractable(fmt):
                 with self.write_lock:
                     self.backend.remove_dirty_fts(book_id, fmt)
                     self._update_fts_indexing_numbers()
diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py
index 9f009b7a43..a15a26178d 100644
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@@ -79,21 +79,37 @@ def pdftotext(path):
     return clean_ascii_chars(raw).decode('utf-8', 'replace')
 
 
+def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]:
+    '''Return the (path, format) pair to extract text from, with an empty path when it cannot be done.
+
+    For ARCHIVE_FMTS a temporary directory is entered on exit_stack and handed to unarchive().'''
+    if not pathtoebook or is_fmt_ok(input_fmt):
+        return pathtoebook, input_fmt
+    if input_fmt.lower() not in ARCHIVE_FMTS:
+        return '', input_fmt
+    try:
+        tdir = exit_stack.enter_context(TemporaryDirectory())
+        pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
+        input_fmt = input_fmt.upper()
+    except Exception:
+        return '', input_fmt
+    return pathtoebook, input_fmt
+
+
+def is_fmt_extractable(input_fmt: str) -> bool:
+    # True when is_fmt_ok() accepts the format or it is listed in ARCHIVE_FMTS.
+    supported = is_fmt_ok(input_fmt)
+    return True if supported else input_fmt.lower() in ARCHIVE_FMTS
+
+
 def extract_text(pathtoebook):
     input_fmt = pathtoebook.rpartition('.')[-1].upper()
     ans = ''
     input_plugin = is_fmt_ok(input_fmt)
     with contextlib.ExitStack() as exit_stack:
-        if not input_plugin:
-            if input_fmt.lower() in ARCHIVE_FMTS:
-                try:
-                    tdir = exit_stack.enter_context(TemporaryDirectory())
-                    pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
-                    input_fmt = input_fmt.upper()
-                except Exception:
-                    return ans
-            else:
-                return ans
+        pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack)
+        if not pathtoebook:
+            return ans
         input_plugin = plugin_for_input_format(input_fmt)
         if input_fmt == 'PDF':
             ans = pdftotext(pathtoebook)