Avoid FTS overhead when restricting a search to a subset of books

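This is needed because the highlight()/snippet() SQLite functions
are very slow with large text, so the restriction is now applied in
the SQL query itself instead of discarding rows after they are fetched.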
This commit is contained in:
Kovid Goyal 2022-05-05 12:51:20 +05:30
parent cac2a91df9
commit 2bbf8e5824
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 18 additions and 8 deletions

View File

@ -980,9 +980,9 @@ class DB:
return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg) return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
def fts_search(self, def fts_search(self,
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,
): ):
yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,) yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,)
def shutdown_fts(self): def shutdown_fts(self):
if self.fts_enabled: if self.fts_enabled:

View File

@ -564,6 +564,7 @@ class Cache:
highlight_end=None, highlight_end=None,
snippet_size=None, snippet_size=None,
restrict_to_book_ids=None, restrict_to_book_ids=None,
return_text=True,
result_type=tuple, result_type=tuple,
): ):
return result_type(self.backend.fts_search( return result_type(self.backend.fts_search(
@ -572,6 +573,7 @@ class Cache:
highlight_start=highlight_start, highlight_start=highlight_start,
highlight_end=highlight_end, highlight_end=highlight_end,
snippet_size=snippet_size, snippet_size=snippet_size,
return_text=return_text,
restrict_to_book_ids=restrict_to_book_ids, restrict_to_book_ids=restrict_to_book_ids,
)) ))

View File

@ -9,6 +9,7 @@ import hashlib
import os import os
import sys import sys
from contextlib import suppress from contextlib import suppress
from itertools import repeat
from threading import Lock from threading import Lock
from calibre.db import FTSQueryError from calibre.db import FTSQueryError
@ -139,6 +140,8 @@ class FTS:
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
return_text=True, return_text=True,
): ):
if restrict_to_book_ids is not None and not restrict_to_book_ids:
return
fts_engine_query = unicode_normalize(fts_engine_query) fts_engine_query = unicode_normalize(fts_engine_query)
fts_table = 'books_fts' + ('_stemmed' if use_stemming else '') fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
if return_text: if return_text:
@ -153,18 +156,21 @@ class FTS:
text = '' text = ''
query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text) query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid' query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
query += f' WHERE "{fts_table}" MATCH ?' query += ' WHERE '
data = [fts_engine_query] data = []
if restrict_to_book_ids:
pl = ','.join(repeat('?', len(restrict_to_book_ids)))
query += f' fts_db.books_text.book IN ({pl}) AND '
data.extend(restrict_to_book_ids)
query += f' "{fts_table}" MATCH ?'
data.append(fts_engine_query)
query += f' ORDER BY {fts_table}.rank ' query += f' ORDER BY {fts_table}.rank '
conn = self.get_connection() conn = self.get_connection()
try: try:
for record in conn.execute(query, tuple(data)): for record in conn.execute(query, tuple(data)):
book_id = record[1]
if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
continue
yield { yield {
'id': record[0], 'id': record[0],
'book_id': book_id, 'book_id': record[1],
'format': record[2], 'format': record[2],
'text': record[3] if return_text else '', 'text': record[3] if return_text else '',
} }

View File

@ -137,12 +137,14 @@ class FTSAPITest(BaseTest):
self.wait_for_fts_to_finish(fts) self.wait_for_fts_to_finish(fts)
self.assertFalse(fts.all_currently_dirty()) self.assertFalse(fts.all_currently_dirty())
self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2}) self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1})
self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'}) self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
self.ae({x['id'] for x in cache.fts_search('also')}, {2}) self.ae({x['id'] for x in cache.fts_search('also')}, {2})
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, { self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
'some other long text that will [also] help with the testing of search'}) 'some other long text that will [also] help with the testing of search'})
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, { self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
'…will [also] help…'}) '…will [also] help…'})
self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''})
fts = cache.reindex_fts() fts = cache.reindex_fts()
self.assertTrue(fts.pool.initialized) self.assertTrue(fts.pool.initialized)
self.wait_for_fts_to_finish(fts) self.wait_for_fts_to_finish(fts)