From 2bbf8e5824fff4a1cb22bc1a1fe181202cbe638d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 5 May 2022 12:51:20 +0530
Subject: [PATCH] Avoid FTS overhead when restricting to subset

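This is needed because the SQLite highlight()/snippet() functions are
very slow on large text. The restriction to a subset of books is now
applied in the SQL WHERE clause instead of being checked in Python
after each row has already been fetched.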
---
 src/calibre/db/backend.py       |  4 ++--
 src/calibre/db/cache.py         |  2 ++
 src/calibre/db/fts/connect.py   | 18 ++++++++++++------
 src/calibre/db/tests/fts_api.py |  2 ++
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py
index f6398657d9..389db05c84 100644
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@@ -980,9 +980,9 @@ class DB:
             return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
 
     def fts_search(self,
-        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
+        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,
     ):
-        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)
+        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,)
 
     def shutdown_fts(self):
         if self.fts_enabled:
diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index c220ab2f87..dafa4187c4 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -564,6 +564,7 @@ class Cache:
         highlight_end=None,
         snippet_size=None,
         restrict_to_book_ids=None,
+        return_text=True,
         result_type=tuple,
     ):
         return result_type(self.backend.fts_search(
@@ -572,6 +573,7 @@ class Cache:
             highlight_start=highlight_start,
             highlight_end=highlight_end,
             snippet_size=snippet_size,
+            return_text=return_text,
             restrict_to_book_ids=restrict_to_book_ids,
         ))
 
diff --git a/src/calibre/db/fts/connect.py b/src/calibre/db/fts/connect.py
index 5512b9b306..b05fe6c33c 100644
--- a/src/calibre/db/fts/connect.py
+++ b/src/calibre/db/fts/connect.py
@@ -9,6 +9,7 @@ import hashlib
 import os
 import sys
 from contextlib import suppress
+from itertools import repeat
 from threading import Lock
 
 from calibre.db import FTSQueryError
@@ -139,6 +140,8 @@ class FTS:
         fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
         return_text=True,
     ):
+        if restrict_to_book_ids is not None and not restrict_to_book_ids:
+            return
         fts_engine_query = unicode_normalize(fts_engine_query)
         fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
         if return_text:
@@ -153,18 +156,21 @@ class FTS:
             text = ''
         query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
         query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
-        query += f' WHERE "{fts_table}" MATCH ?'
-        data = [fts_engine_query]
+        query += ' WHERE '
+        data = []
+        if restrict_to_book_ids:
+            pl = ','.join(repeat('?', len(restrict_to_book_ids)))
+            query += f' fts_db.books_text.book IN ({pl}) AND '
+            data.extend(restrict_to_book_ids)
+        query += f' "{fts_table}" MATCH ?'
+        data.append(fts_engine_query)
         query += f' ORDER BY {fts_table}.rank '
         conn = self.get_connection()
         try:
             for record in conn.execute(query, tuple(data)):
-                book_id = record[1]
-                if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
-                    continue
                 yield {
                     'id': record[0],
-                    'book_id': book_id,
+                    'book_id': record[1],
                     'format': record[2],
                     'text': record[3] if return_text else '',
                 }
diff --git a/src/calibre/db/tests/fts_api.py b/src/calibre/db/tests/fts_api.py
index 61f1df8eb4..2a0138f1d5 100644
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@@ -137,12 +137,14 @@ class FTSAPITest(BaseTest):
         self.wait_for_fts_to_finish(fts)
         self.assertFalse(fts.all_currently_dirty())
         self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
+        self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1})
         self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
         self.ae({x['id'] for x in cache.fts_search('also')}, {2})
         self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
             'some other long text that will [also] help with the testing of search'})
         self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
             '…will [also] help…'})
+        self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''})
         fts = cache.reindex_fts()
         self.assertTrue(fts.pool.initialized)
         self.wait_for_fts_to_finish(fts)