Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)

Forgot to modify the check in db cache for extractable format
This commit is contained in:
Kovid Goyal 2025-03-23 05:43:42 +05:30
parent 00ddf587c7
commit a7375ad7d4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 28 additions and 12 deletions

View File

@ -528,7 +528,7 @@ class Cache:
@staticmethod @staticmethod
def dispatch_fts_jobs(queue, stop_dispatch, dbref): def dispatch_fts_jobs(queue, stop_dispatch, dbref):
from .fts.text import is_fmt_ok from .fts.text import is_fmt_extractable
def do_one(): def do_one():
self = dbref() self = dbref()
@ -542,7 +542,7 @@ class Cache:
if book_id is None: if book_id is None:
return False return False
path = self._format_abspath(book_id, fmt) path = self._format_abspath(book_id, fmt)
if not path or not is_fmt_ok(fmt): if not path or not is_fmt_extractable(fmt):
with self.write_lock: with self.write_lock:
self.backend.remove_dirty_fts(book_id, fmt) self.backend.remove_dirty_fts(book_id, fmt)
self._update_fts_indexing_numbers() self._update_fts_indexing_numbers()

View File

@ -79,21 +79,37 @@ def pdftotext(path):
return clean_ascii_chars(raw).decode('utf-8', 'replace') return clean_ascii_chars(raw).decode('utf-8', 'replace')
def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]:
    """Work out which file and format text extraction should operate on.

    Returns a ``(path, format)`` tuple. An empty path in the result signals
    that no text can be extracted.

    * A falsy ``pathtoebook`` is handed back untouched together with
      ``input_fmt``.
    * When ``is_fmt_ok(input_fmt)`` is truthy the inputs are returned as-is.
    * When the lower-cased format is in ``ARCHIVE_FMTS`` the file is passed to
      ``unarchive`` along with a temporary directory that is registered on
      ``exit_stack``; the path it returns and its upper-cased format are the
      result. Any exception during that step yields an empty path.
    * Every other format yields an empty path.
    """
    # Short-circuit keeps is_fmt_ok() from being called for an empty path.
    if not pathtoebook or is_fmt_ok(input_fmt):
        return pathtoebook, input_fmt

    if input_fmt.lower() not in ARCHIVE_FMTS:
        return '', input_fmt

    try:
        # The temporary directory lives as long as the caller's exit stack.
        workdir = exit_stack.enter_context(TemporaryDirectory())
        pathtoebook, input_fmt = unarchive(pathtoebook, workdir)
        input_fmt = input_fmt.upper()
    except Exception:
        # Best effort: an archive that cannot be unpacked is simply skipped.
        return '', input_fmt
    return pathtoebook, input_fmt
def is_fmt_extractable(input_fmt: str) -> bool:
    """Report whether ``input_fmt`` is a candidate for text extraction.

    True when ``is_fmt_ok(input_fmt)`` is truthy, otherwise whether the
    lower-cased format name is a member of ``ARCHIVE_FMTS``.
    """
    directly_supported = is_fmt_ok(input_fmt)
    return True if directly_supported else input_fmt.lower() in ARCHIVE_FMTS
def extract_text(pathtoebook): def extract_text(pathtoebook):
input_fmt = pathtoebook.rpartition('.')[-1].upper() input_fmt = pathtoebook.rpartition('.')[-1].upper()
ans = '' ans = ''
input_plugin = is_fmt_ok(input_fmt) input_plugin = is_fmt_ok(input_fmt)
with contextlib.ExitStack() as exit_stack: with contextlib.ExitStack() as exit_stack:
if not input_plugin: pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack)
if input_fmt.lower() in ARCHIVE_FMTS: if not pathtoebook:
try: return ans
tdir = exit_stack.enter_context(TemporaryDirectory())
pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
input_fmt = input_fmt.upper()
except Exception:
return ans
else:
return ans
input_plugin = plugin_for_input_format(input_fmt) input_plugin = plugin_for_input_format(input_fmt)
if input_fmt == 'PDF': if input_fmt == 'PDF':
ans = pdftotext(pathtoebook) ans = pdftotext(pathtoebook)