Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)

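The earlier change forgot to update the extractable-format check in the db cache: it now uses is_fmt_extractable() instead of is_fmt_ok(), so archive formats are no longer removed from the dirty FTS queue.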
2025-12-19 19:45:01 -05:00 · 2025-03-23 05:43:42 +05:30 · 2025-03-23 05:43:42 +05:30 · a7375ad7d4
commit a7375ad7d4
parent 00ddf587c7
2 changed files with 28 additions and 12 deletions
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -528,7 +528,7 @@ class Cache:
    @staticmethod
    def dispatch_fts_jobs(queue, stop_dispatch, dbref):
-        from .fts.text import is_fmt_ok
+        from .fts.text import is_fmt_extractable
        def do_one():
            self = dbref()
@@ -542,7 +542,7 @@ class Cache:
                if book_id is None:
                    return False
                path = self._format_abspath(book_id, fmt)
-            if not path or not is_fmt_ok(fmt):
+            if not path or not is_fmt_extractable(fmt):
                with self.write_lock:
                    self.backend.remove_dirty_fts(book_id, fmt)
                    self._update_fts_indexing_numbers()
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@@ -79,21 +79,37 @@ def pdftotext(path):
    return clean_ascii_chars(raw).decode('utf-8', 'replace')
 def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]:
    '''
    Work out which file and format text extraction should operate on.

    :param pathtoebook: Path of the candidate file; a falsy value is passed
        straight back to the caller.
    :param input_fmt: Format name of the file.
    :param exit_stack: Stack on which the temporary directory used for
        unpacking an archive is registered, so the caller controls when it
        is cleaned up.
    :return: ``(path, fmt)``. The inputs are returned unchanged when the path
        is falsy or ``is_fmt_ok(input_fmt)`` is truthy. For a format listed in
        ``ARCHIVE_FMTS`` the result is the path and upper-cased format
        reported by ``unarchive()``. In every other case, including any
        failure while unpacking, the path is ``''``.
    '''
    # Nothing to resolve: no file at all, or the format is accepted as-is.
    if not pathtoebook or is_fmt_ok(input_fmt):
        return pathtoebook, input_fmt

    # Anything that is neither accepted directly nor an archive is rejected.
    if input_fmt.lower() not in ARCHIVE_FMTS:
        return '', input_fmt

    try:
        unpack_dir = exit_stack.enter_context(TemporaryDirectory())
        pathtoebook, input_fmt = unarchive(pathtoebook, unpack_dir)
        input_fmt = input_fmt.upper()
    except Exception:
        # Best effort: an archive that cannot be unpacked is reported as
        # not extractable rather than raising to the caller.
        return '', input_fmt
    return pathtoebook, input_fmt
 def is_fmt_extractable(input_fmt: str) -> bool:
    '''
    Report whether text extraction can be attempted for ``input_fmt``.

    A format qualifies when ``is_fmt_ok()`` accepts it, or when its
    lower-cased name is listed in ``ARCHIVE_FMTS``.
    '''
    return True if is_fmt_ok(input_fmt) else input_fmt.lower() in ARCHIVE_FMTS
 def extract_text(pathtoebook):
    input_fmt = pathtoebook.rpartition('.')[-1].upper()
    ans = ''
    input_plugin = is_fmt_ok(input_fmt)
    with contextlib.ExitStack() as exit_stack:
-        if not input_plugin:
+        pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack)
-            if input_fmt.lower() in ARCHIVE_FMTS:
+        if not pathtoebook:
-                try:
+            return ans
                    tdir = exit_stack.enter_context(TemporaryDirectory())
                    pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
                    input_fmt = input_fmt.upper()
                except Exception:
                    return ans
            else:
                return ans
        input_plugin = plugin_for_input_format(input_fmt)
        if input_fmt == 'PDF':
            ans = pdftotext(pathtoebook)