From a7375ad7d41c52b24745fe36d8824e923f199b7c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 23 Mar 2025 05:43:42 +0530 Subject: [PATCH] Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891) Forgot to modify the check in db cache for extractable format --- src/calibre/db/cache.py | 4 ++-- src/calibre/db/fts/text.py | 36 ++++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py index 6a76c5e852..083f573ea8 100644 --- a/src/calibre/db/cache.py +++ b/src/calibre/db/cache.py @@ -528,7 +528,7 @@ class Cache: @staticmethod def dispatch_fts_jobs(queue, stop_dispatch, dbref): - from .fts.text import is_fmt_ok + from .fts.text import is_fmt_extractable def do_one(): self = dbref() @@ -542,7 +542,7 @@ class Cache: if book_id is None: return False path = self._format_abspath(book_id, fmt) - if not path or not is_fmt_ok(fmt): + if not path or not is_fmt_extractable(fmt): with self.write_lock: self.backend.remove_dirty_fts(book_id, fmt) self._update_fts_indexing_numbers() diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py index 9f009b7a43..a15a26178d 100644 --- a/src/calibre/db/fts/text.py +++ b/src/calibre/db/fts/text.py @@ -79,21 +79,37 @@ def pdftotext(path): return clean_ascii_chars(raw).decode('utf-8', 'replace') +def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]: + if not pathtoebook: + return pathtoebook, input_fmt + if is_fmt_ok(input_fmt): + return pathtoebook, input_fmt + if input_fmt.lower() in ARCHIVE_FMTS: + try: + tdir = exit_stack.enter_context(TemporaryDirectory()) + pathtoebook, input_fmt = unarchive(pathtoebook, tdir) + input_fmt = input_fmt.upper() + except Exception: + return '', input_fmt + else: + return pathtoebook, input_fmt + return '', input_fmt + + +def is_fmt_extractable(input_fmt: str) -> bool: + if is_fmt_ok(input_fmt): + return True + return input_fmt.lower() in ARCHIVE_FMTS + + def extract_text(pathtoebook): input_fmt = pathtoebook.rpartition('.')[-1].upper() ans = '' input_plugin = is_fmt_ok(input_fmt) with contextlib.ExitStack() as exit_stack: - if not input_plugin: - if input_fmt.lower() in ARCHIVE_FMTS: - try: - tdir = exit_stack.enter_context(TemporaryDirectory()) - pathtoebook, input_fmt = unarchive(pathtoebook, tdir) - input_fmt = input_fmt.upper() - except Exception: - return ans - else: - return ans + pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack) + if not pathtoebook: + return ans input_plugin = plugin_for_input_format(input_fmt) if input_fmt == 'PDF': ans = pdftotext(pathtoebook)