mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)
Forgot to update the check for extractable formats in the db cache
This commit is contained in:
parent
00ddf587c7
commit
a7375ad7d4
@ -528,7 +528,7 @@ class Cache:
|
||||
|
||||
@staticmethod
|
||||
def dispatch_fts_jobs(queue, stop_dispatch, dbref):
|
||||
from .fts.text import is_fmt_ok
|
||||
from .fts.text import is_fmt_extractable
|
||||
|
||||
def do_one():
|
||||
self = dbref()
|
||||
@ -542,7 +542,7 @@ class Cache:
|
||||
if book_id is None:
|
||||
return False
|
||||
path = self._format_abspath(book_id, fmt)
|
||||
if not path or not is_fmt_ok(fmt):
|
||||
if not path or not is_fmt_extractable(fmt):
|
||||
with self.write_lock:
|
||||
self.backend.remove_dirty_fts(book_id, fmt)
|
||||
self._update_fts_indexing_numbers()
|
||||
|
@ -79,21 +79,37 @@ def pdftotext(path):
|
||||
return clean_ascii_chars(raw).decode('utf-8', 'replace')
|
||||
|
||||
|
||||
def can_extract_text(pathtoebook: str, input_fmt: str, exit_stack: contextlib.ExitStack) -> tuple[str, str]:
    """Resolve a (path, format) pair to one that text extraction can use.

    The pair is returned untouched when ``pathtoebook`` is empty or when
    ``is_fmt_ok`` accepts ``input_fmt``. When the lower-cased format is in
    ``ARCHIVE_FMTS``, the path is handed to ``unarchive`` together with a
    temporary directory entered on ``exit_stack``; the path it reports and
    its upper-cased format are returned. In every other case, including
    any exception raised while handling the archive, the returned path is
    the empty string.
    """
    # Nothing to resolve: no path at all, or the format is usable as-is.
    if not pathtoebook or is_fmt_ok(input_fmt):
        return pathtoebook, input_fmt

    # Neither directly usable nor an archive: signal "no usable path".
    if input_fmt.lower() not in ARCHIVE_FMTS:
        return '', input_fmt

    try:
        # The temporary directory is owned by the caller's exit stack, so it
        # stays alive after this function returns.
        workdir = exit_stack.enter_context(TemporaryDirectory())
        pathtoebook, input_fmt = unarchive(pathtoebook, workdir)
        input_fmt = input_fmt.upper()
    except Exception:
        # Best effort: any failure here is reported as "no usable path".
        return '', input_fmt
    return pathtoebook, input_fmt
|
||||
|
||||
|
||||
def is_fmt_extractable(input_fmt: str) -> bool:
    """Tell whether ``input_fmt`` can be used for text extraction.

    Returns ``True`` when ``is_fmt_ok`` accepts the format; otherwise
    returns whether the lower-cased format is listed in ``ARCHIVE_FMTS``.
    """
    # Return a literal True rather than whatever truthy value is_fmt_ok gives.
    return True if is_fmt_ok(input_fmt) else input_fmt.lower() in ARCHIVE_FMTS
|
||||
|
||||
|
||||
def extract_text(pathtoebook):
|
||||
input_fmt = pathtoebook.rpartition('.')[-1].upper()
|
||||
ans = ''
|
||||
input_plugin = is_fmt_ok(input_fmt)
|
||||
with contextlib.ExitStack() as exit_stack:
|
||||
if not input_plugin:
|
||||
if input_fmt.lower() in ARCHIVE_FMTS:
|
||||
try:
|
||||
tdir = exit_stack.enter_context(TemporaryDirectory())
|
||||
pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
|
||||
input_fmt = input_fmt.upper()
|
||||
except Exception:
|
||||
return ans
|
||||
else:
|
||||
return ans
|
||||
pathtoebook, input_fmt = can_extract_text(pathtoebook, input_fmt, exit_stack)
|
||||
if not pathtoebook:
|
||||
return ans
|
||||
input_plugin = plugin_for_input_format(input_fmt)
|
||||
if input_fmt == 'PDF':
|
||||
ans = pdftotext(pathtoebook)
|
||||
|
Loading…
x
Reference in New Issue
Block a user