Avoid FTS overhead when restricting to a subset of books

This is needed because the SQLite highlight()/snippet() functions are very slow on large text.
2025-07-09 03:04:10 -04:00 · 2022-05-05 12:51:20 +05:30 · 2022-05-05 12:51:20 +05:30 · 2bbf8e5824
commit 2bbf8e5824
parent cac2a91df9
4 changed files with 18 additions and 8 deletions
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@ -980,9 +980,9 @@ class DB:
            return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
    def fts_search(self,
-        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
+        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,
    ):
-        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)
+        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,)
    def shutdown_fts(self):
        if self.fts_enabled:
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@ -564,6 +564,7 @@ class Cache:
        highlight_end=None,
        snippet_size=None,
        restrict_to_book_ids=None,
        return_text=True,
        result_type=tuple,
    ):
        return result_type(self.backend.fts_search(
@ -572,6 +573,7 @@ class Cache:
            highlight_start=highlight_start,
            highlight_end=highlight_end,
            snippet_size=snippet_size,
            return_text=return_text,
            restrict_to_book_ids=restrict_to_book_ids,
        ))
--- a/src/calibre/db/fts/connect.py
+++ b/src/calibre/db/fts/connect.py
@ -9,6 +9,7 @@ import hashlib
 import os
 import sys
 from contextlib import suppress
 from itertools import repeat
 from threading import Lock
 from calibre.db import FTSQueryError
@ -139,6 +140,8 @@ class FTS:
        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
        return_text=True,
    ):
        if restrict_to_book_ids is not None and not restrict_to_book_ids:
            return
        fts_engine_query = unicode_normalize(fts_engine_query)
        fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
        if return_text:
@ -153,18 +156,21 @@ class FTS:
            text = ''
        query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
        query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
-        query += f' WHERE "{fts_table}" MATCH ?'
+        query += ' WHERE '
-        data = [fts_engine_query]
+        data = []
        if restrict_to_book_ids:
            pl = ','.join(repeat('?', len(restrict_to_book_ids)))
            query += f' fts_db.books_text.book IN ({pl}) AND '
            data.extend(restrict_to_book_ids)
        query += f' "{fts_table}" MATCH ?'
        data.append(fts_engine_query)
        query += f' ORDER BY {fts_table}.rank '
        conn = self.get_connection()
        try:
            for record in conn.execute(query, tuple(data)):
                book_id = record[1]
                if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
                    continue
                yield {
                    'id': record[0],
-                    'book_id': book_id,
+                    'book_id': record[1],
                    'format': record[2],
                    'text': record[3] if return_text else '',
                }
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@ -137,12 +137,14 @@ class FTSAPITest(BaseTest):
        self.wait_for_fts_to_finish(fts)
        self.assertFalse(fts.all_currently_dirty())
        self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
        self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1})
        self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
        self.ae({x['id'] for x in cache.fts_search('also')}, {2})
        self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
            'some other long text that will [also] help with the testing of search'})
        self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
            '…will [also] help…'})
        self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''})
        fts = cache.reindex_fts()
        self.assertTrue(fts.pool.initialized)
        self.wait_for_fts_to_finish(fts)