mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Avoid FTS overhead when restricting to subset
This is needed because the highlight()/snippet() SQLite functions are very slow with large text.
This commit is contained in:
parent
cac2a91df9
commit
2bbf8e5824
@ -980,9 +980,9 @@ class DB:
|
||||
return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
|
||||
|
||||
def fts_search(self,
|
||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
|
||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,
|
||||
):
|
||||
yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)
|
||||
yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,)
|
||||
|
||||
def shutdown_fts(self):
|
||||
if self.fts_enabled:
|
||||
|
@ -564,6 +564,7 @@ class Cache:
|
||||
highlight_end=None,
|
||||
snippet_size=None,
|
||||
restrict_to_book_ids=None,
|
||||
return_text=True,
|
||||
result_type=tuple,
|
||||
):
|
||||
return result_type(self.backend.fts_search(
|
||||
@ -572,6 +573,7 @@ class Cache:
|
||||
highlight_start=highlight_start,
|
||||
highlight_end=highlight_end,
|
||||
snippet_size=snippet_size,
|
||||
return_text=return_text,
|
||||
restrict_to_book_ids=restrict_to_book_ids,
|
||||
))
|
||||
|
||||
|
@ -9,6 +9,7 @@ import hashlib
|
||||
import os
|
||||
import sys
|
||||
from contextlib import suppress
|
||||
from itertools import repeat
|
||||
from threading import Lock
|
||||
|
||||
from calibre.db import FTSQueryError
|
||||
@ -139,6 +140,8 @@ class FTS:
|
||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
|
||||
return_text=True,
|
||||
):
|
||||
if restrict_to_book_ids is not None and not restrict_to_book_ids:
|
||||
return
|
||||
fts_engine_query = unicode_normalize(fts_engine_query)
|
||||
fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
|
||||
if return_text:
|
||||
@ -153,18 +156,21 @@ class FTS:
|
||||
text = ''
|
||||
query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
|
||||
query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
|
||||
query += f' WHERE "{fts_table}" MATCH ?'
|
||||
data = [fts_engine_query]
|
||||
query += ' WHERE '
|
||||
data = []
|
||||
if restrict_to_book_ids:
|
||||
pl = ','.join(repeat('?', len(restrict_to_book_ids)))
|
||||
query += f' fts_db.books_text.book IN ({pl}) AND '
|
||||
data.extend(restrict_to_book_ids)
|
||||
query += f' "{fts_table}" MATCH ?'
|
||||
data.append(fts_engine_query)
|
||||
query += f' ORDER BY {fts_table}.rank '
|
||||
conn = self.get_connection()
|
||||
try:
|
||||
for record in conn.execute(query, tuple(data)):
|
||||
book_id = record[1]
|
||||
if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
|
||||
continue
|
||||
yield {
|
||||
'id': record[0],
|
||||
'book_id': book_id,
|
||||
'book_id': record[1],
|
||||
'format': record[2],
|
||||
'text': record[3] if return_text else '',
|
||||
}
|
||||
|
@ -137,12 +137,14 @@ class FTSAPITest(BaseTest):
|
||||
self.wait_for_fts_to_finish(fts)
|
||||
self.assertFalse(fts.all_currently_dirty())
|
||||
self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
|
||||
self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1})
|
||||
self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
|
||||
self.ae({x['id'] for x in cache.fts_search('also')}, {2})
|
||||
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
|
||||
'some other long text that will [also] help with the testing of search'})
|
||||
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
|
||||
'…will [also] help…'})
|
||||
self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''})
|
||||
fts = cache.reindex_fts()
|
||||
self.assertTrue(fts.pool.initialized)
|
||||
self.wait_for_fts_to_finish(fts)
|
||||
|
Loading…
x
Reference in New Issue
Block a user