Avoid FTS overhead when restricting to subset

This is needed because the highlight()/snippet() SQLite functions are very slow with large text.
Kovid Goyal 2022-05-05 12:51:20 +05:30
parent cac2a91df9
commit 2bbf8e5824
4 changed files with 18 additions and 8 deletions
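
The reasoning is easy to reproduce outside calibre: FTS5's highlight() and snippet() auxiliary functions have to fetch and re-tokenize the stored column text for every row they are evaluated on, so their cost grows with document size, whereas an id-only query touches only the FTS index. A minimal sketch, not calibre code, assuming the Python sqlite3 module is built with FTS5 support:

# Illustration only: compare an FTS5 query that computes snippet() over a large
# document with one that returns row ids alone.
import sqlite3
import time

conn = sqlite3.connect(':memory:')
conn.execute('CREATE VIRTUAL TABLE docs USING fts5(body)')
# One large (~5 MB) document containing a single occurrence of the search term.
big = ('lorem ipsum ' * 200_000) + 'needle ' + ('dolor sit ' * 200_000)
conn.execute('INSERT INTO docs(body) VALUES (?)', (big,))

t = time.perf_counter()
conn.execute(
    "SELECT snippet(docs, 0, '[', ']', '…', 5) FROM docs WHERE docs MATCH ?",
    ('needle',)).fetchall()
print('with snippet():', time.perf_counter() - t)  # has to tokenize the whole document

t = time.perf_counter()
conn.execute('SELECT rowid FROM docs WHERE docs MATCH ?', ('needle',)).fetchall()
print('ids only:      ', time.perf_counter() - t)  # reads only the FTS index

On a document this size the first query is markedly slower than the second; that is the overhead the diff below avoids when the caller does not need the text or has restricted the search to a few books.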

View File

@@ -980,9 +980,9 @@ class DB:
         return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
 
     def fts_search(self,
-        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
+        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,
     ):
-        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)
+        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,)
 
     def shutdown_fts(self):
         if self.fts_enabled:

View File

@@ -564,6 +564,7 @@ class Cache:
         highlight_end=None,
         snippet_size=None,
         restrict_to_book_ids=None,
+        return_text=True,
         result_type=tuple,
     ):
         return result_type(self.backend.fts_search(
@@ -572,6 +573,7 @@ class Cache:
             highlight_start=highlight_start,
             highlight_end=highlight_end,
             snippet_size=snippet_size,
+            return_text=return_text,
             restrict_to_book_ids=restrict_to_book_ids,
         ))
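
With the parameter plumbed through Cache.fts_search(), callers can ask for match metadata only. A hedged usage sketch follows; the query and book ids are invented, and cache is assumed to be a calibre Cache whose FTS index has already been built:

# Hypothetical caller-side usage of the new keyword: no highlight()/snippet()
# work is done inside SQLite when only the matching books/formats are needed.
hits = cache.fts_search(
    'needle',
    restrict_to_book_ids={3, 7, 11},  # search only this subset of books
    return_text=False,                # 'text' in each result comes back as ''
)
for hit in hits:
    print(hit['book_id'], hit['format'])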

View File

@@ -9,6 +9,7 @@ import hashlib
 import os
 import sys
 from contextlib import suppress
+from itertools import repeat
 from threading import Lock
 
 from calibre.db import FTSQueryError
@@ -139,6 +140,8 @@ class FTS:
         fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
         return_text=True,
     ):
+        if restrict_to_book_ids is not None and not restrict_to_book_ids:
+            return
         fts_engine_query = unicode_normalize(fts_engine_query)
         fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
         if return_text:
@@ -153,18 +156,21 @@
             text = ''
         query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
         query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
-        query += f' WHERE "{fts_table}" MATCH ?'
-        data = [fts_engine_query]
+        query += ' WHERE '
+        data = []
+        if restrict_to_book_ids:
+            pl = ','.join(repeat('?', len(restrict_to_book_ids)))
+            query += f' fts_db.books_text.book IN ({pl}) AND '
+            data.extend(restrict_to_book_ids)
+        query += f' "{fts_table}" MATCH ?'
+        data.append(fts_engine_query)
         query += f' ORDER BY {fts_table}.rank '
         conn = self.get_connection()
         try:
             for record in conn.execute(query, tuple(data)):
-                book_id = record[1]
-                if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
-                    continue
                 yield {
                     'id': record[0],
-                    'book_id': book_id,
+                    'book_id': record[1],
                     'format': record[2],
                     'text': record[3] if return_text else '',
                 }
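
The substantive change above is that the book-id restriction now sits in the WHERE clause of the same statement as the MATCH, so SQLite never evaluates the expensive text columns for books outside the requested subset; previously those rows were filtered out in Python after the work had already been done. A sketch of the query shape the new code builds, where the snippet() expression is only a stand-in for whatever text column calibre actually selects:

# Illustration of the query assembly in the diff above (stand-alone, prints only).
from itertools import repeat

def build_query(fts_table, restrict_to_book_ids, return_text):
    # Stand-in for the snippet()/highlight() column selected when text is wanted.
    text = ", snippet({0}, 0, '[', ']', '…', 30)".format(fts_table) if return_text else ''
    query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
    query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
    query += ' WHERE '
    data = []
    if restrict_to_book_ids:
        # The id filter is evaluated inside SQLite, so rows for other books are
        # discarded before any snippet()/highlight() text would be computed.
        pl = ','.join(repeat('?', len(restrict_to_book_ids)))
        query += f' fts_db.books_text.book IN ({pl}) AND '
        data.extend(restrict_to_book_ids)
    query += f' "{fts_table}" MATCH ?'
    data.append('needle')
    return query, data

for return_text in (True, False):
    q, params = build_query('books_fts', (1, 3, 4, 5, 11), return_text)
    print(q)       # ... WHERE  fts_db.books_text.book IN (?,?,?,?,?) AND  "books_fts" MATCH ?
    print(params)  # [1, 3, 4, 5, 11, 'needle']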

View File

@@ -137,12 +137,14 @@ class FTSAPITest(BaseTest):
         self.wait_for_fts_to_finish(fts)
         self.assertFalse(fts.all_currently_dirty())
         self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
+        self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1})
         self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
         self.ae({x['id'] for x in cache.fts_search('also')}, {2})
         self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
             'some other long text that will [also] help with the testing of search'})
         self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
             '…will [also] help…'})
+        self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''})
         fts = cache.reindex_fts()
         self.assertTrue(fts.pool.initialized)
         self.wait_for_fts_to_finish(fts)