From 2bbf8e5824fff4a1cb22bc1a1fe181202cbe638d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 May 2022 12:51:20 +0530 Subject: [PATCH] Avoid FTS overhead when restricting to subset This is needed because the highlight()/snippet() sqlite functions are very slow with large text --- src/calibre/db/backend.py | 4 ++-- src/calibre/db/cache.py | 2 ++ src/calibre/db/fts/connect.py | 18 ++++++++++++------ src/calibre/db/tests/fts_api.py | 2 ++ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py index f6398657d9..389db05c84 100644 --- a/src/calibre/db/backend.py +++ b/src/calibre/db/backend.py @@ -980,9 +980,9 @@ class DB: return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg) def fts_search(self, - fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, + fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text, ): - yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,) + yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,) def shutdown_fts(self): if self.fts_enabled: diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py index c220ab2f87..dafa4187c4 100644 --- a/src/calibre/db/cache.py +++ b/src/calibre/db/cache.py @@ -564,6 +564,7 @@ class Cache: highlight_end=None, snippet_size=None, restrict_to_book_ids=None, + return_text=True, result_type=tuple, ): return result_type(self.backend.fts_search( @@ -572,6 +573,7 @@ class Cache: highlight_start=highlight_start, highlight_end=highlight_end, snippet_size=snippet_size, + return_text=return_text, restrict_to_book_ids=restrict_to_book_ids, )) diff --git a/src/calibre/db/fts/connect.py b/src/calibre/db/fts/connect.py index 5512b9b306..b05fe6c33c 100644 --- a/src/calibre/db/fts/connect.py +++ b/src/calibre/db/fts/connect.py @@ -9,6 +9,7 @@ import hashlib import os import sys from contextlib import suppress +from itertools import repeat from threading import Lock from calibre.db import FTSQueryError @@ -139,6 +140,8 @@ class FTS: fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text=True, ): + if restrict_to_book_ids is not None and not restrict_to_book_ids: + return fts_engine_query = unicode_normalize(fts_engine_query) fts_table = 'books_fts' + ('_stemmed' if use_stemming else '') if return_text: @@ -153,18 +156,21 @@ class FTS: text = '' query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text) query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid' - query += f' WHERE "{fts_table}" MATCH ?' - data = [fts_engine_query] + query += ' WHERE ' + data = [] + if restrict_to_book_ids: + pl = ','.join(repeat('?', len(restrict_to_book_ids))) + query += f' fts_db.books_text.book IN ({pl}) AND ' + data.extend(restrict_to_book_ids) + query += f' "{fts_table}" MATCH ?' + data.append(fts_engine_query) query += f' ORDER BY {fts_table}.rank ' conn = self.get_connection() try: for record in conn.execute(query, tuple(data)): - book_id = record[1] - if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids: - continue yield { 'id': record[0], - 'book_id': book_id, + 'book_id': record[1], 'format': record[2], 'text': record[3] if return_text else '', } diff --git a/src/calibre/db/tests/fts_api.py b/src/calibre/db/tests/fts_api.py index 61f1df8eb4..2a0138f1d5 100644 --- a/src/calibre/db/tests/fts_api.py +++ b/src/calibre/db/tests/fts_api.py @@ -137,12 +137,14 @@ class FTSAPITest(BaseTest): self.wait_for_fts_to_finish(fts) self.assertFalse(fts.all_currently_dirty()) self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2}) + self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1}) self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'}) self.ae({x['id'] for x in cache.fts_search('also')}, {2}) self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, { 'some other long text that will [also] help with the testing of search'}) self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, { '…will [also] help…'}) + self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''}) fts = cache.reindex_fts() self.assertTrue(fts.pool.initialized) self.wait_for_fts_to_finish(fts)