mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Avoid FTS overhead when restricting to a subset of books
This is needed because the highlight()/snippet() SQLite functions are very slow with large text.
This commit is contained in:
parent
cac2a91df9
commit
2bbf8e5824
@ -980,9 +980,9 @@ class DB:
|
|||||||
return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
|
return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
|
||||||
|
|
||||||
def fts_search(self,
|
def fts_search(self,
|
||||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
|
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,
|
||||||
):
|
):
|
||||||
yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)
|
yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids, return_text,)
|
||||||
|
|
||||||
def shutdown_fts(self):
|
def shutdown_fts(self):
|
||||||
if self.fts_enabled:
|
if self.fts_enabled:
|
||||||
|
@ -564,6 +564,7 @@ class Cache:
|
|||||||
highlight_end=None,
|
highlight_end=None,
|
||||||
snippet_size=None,
|
snippet_size=None,
|
||||||
restrict_to_book_ids=None,
|
restrict_to_book_ids=None,
|
||||||
|
return_text=True,
|
||||||
result_type=tuple,
|
result_type=tuple,
|
||||||
):
|
):
|
||||||
return result_type(self.backend.fts_search(
|
return result_type(self.backend.fts_search(
|
||||||
@ -572,6 +573,7 @@ class Cache:
|
|||||||
highlight_start=highlight_start,
|
highlight_start=highlight_start,
|
||||||
highlight_end=highlight_end,
|
highlight_end=highlight_end,
|
||||||
snippet_size=snippet_size,
|
snippet_size=snippet_size,
|
||||||
|
return_text=return_text,
|
||||||
restrict_to_book_ids=restrict_to_book_ids,
|
restrict_to_book_ids=restrict_to_book_ids,
|
||||||
))
|
))
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ import hashlib
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
|
from itertools import repeat
|
||||||
from threading import Lock
|
from threading import Lock
|
||||||
|
|
||||||
from calibre.db import FTSQueryError
|
from calibre.db import FTSQueryError
|
||||||
@ -139,6 +140,8 @@ class FTS:
|
|||||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
|
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
|
||||||
return_text=True,
|
return_text=True,
|
||||||
):
|
):
|
||||||
|
if restrict_to_book_ids is not None and not restrict_to_book_ids:
|
||||||
|
return
|
||||||
fts_engine_query = unicode_normalize(fts_engine_query)
|
fts_engine_query = unicode_normalize(fts_engine_query)
|
||||||
fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
|
fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
|
||||||
if return_text:
|
if return_text:
|
||||||
@ -153,18 +156,21 @@ class FTS:
|
|||||||
text = ''
|
text = ''
|
||||||
query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
|
query = 'SELECT {0}.id, {0}.book, {0}.format {1} FROM {0} '.format('books_text', text)
|
||||||
query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
|
query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
|
||||||
query += f' WHERE "{fts_table}" MATCH ?'
|
query += ' WHERE '
|
||||||
data = [fts_engine_query]
|
data = []
|
||||||
|
if restrict_to_book_ids:
|
||||||
|
pl = ','.join(repeat('?', len(restrict_to_book_ids)))
|
||||||
|
query += f' fts_db.books_text.book IN ({pl}) AND '
|
||||||
|
data.extend(restrict_to_book_ids)
|
||||||
|
query += f' "{fts_table}" MATCH ?'
|
||||||
|
data.append(fts_engine_query)
|
||||||
query += f' ORDER BY {fts_table}.rank '
|
query += f' ORDER BY {fts_table}.rank '
|
||||||
conn = self.get_connection()
|
conn = self.get_connection()
|
||||||
try:
|
try:
|
||||||
for record in conn.execute(query, tuple(data)):
|
for record in conn.execute(query, tuple(data)):
|
||||||
book_id = record[1]
|
|
||||||
if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
|
|
||||||
continue
|
|
||||||
yield {
|
yield {
|
||||||
'id': record[0],
|
'id': record[0],
|
||||||
'book_id': book_id,
|
'book_id': record[1],
|
||||||
'format': record[2],
|
'format': record[2],
|
||||||
'text': record[3] if return_text else '',
|
'text': record[3] if return_text else '',
|
||||||
}
|
}
|
||||||
|
@ -137,12 +137,14 @@ class FTSAPITest(BaseTest):
|
|||||||
self.wait_for_fts_to_finish(fts)
|
self.wait_for_fts_to_finish(fts)
|
||||||
self.assertFalse(fts.all_currently_dirty())
|
self.assertFalse(fts.all_currently_dirty())
|
||||||
self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
|
self.ae({x['id'] for x in cache.fts_search('help')}, {1, 2})
|
||||||
|
self.ae({x['id'] for x in cache.fts_search('help', restrict_to_book_ids=(1, 3, 4, 5, 11))}, {1})
|
||||||
self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
|
self.ae({x['format'] for x in cache.fts_search('help')}, {'TXT', 'MD'})
|
||||||
self.ae({x['id'] for x in cache.fts_search('also')}, {2})
|
self.ae({x['id'] for x in cache.fts_search('also')}, {2})
|
||||||
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
|
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']')}, {
|
||||||
'some other long text that will [also] help with the testing of search'})
|
'some other long text that will [also] help with the testing of search'})
|
||||||
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
|
self.ae({x['text'] for x in cache.fts_search('also', highlight_start='[', highlight_end=']', snippet_size=3)}, {
|
||||||
'…will [also] help…'})
|
'…will [also] help…'})
|
||||||
|
self.ae({x['text'] for x in cache.fts_search('also', return_text=False)}, {''})
|
||||||
fts = cache.reindex_fts()
|
fts = cache.reindex_fts()
|
||||||
self.assertTrue(fts.pool.initialized)
|
self.assertTrue(fts.pool.initialized)
|
||||||
self.wait_for_fts_to_finish(fts)
|
self.wait_for_fts_to_finish(fts)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user