Get FTS search API working

This commit is contained in:
Kovid Goyal 2022-04-20 16:14:26 +05:30
parent 6f3cd9cc44
commit 4502569b90
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 29 additions and 14 deletions

View File

@ -19,8 +19,8 @@ CREATE TABLE fts_db.books_text ( id INTEGER PRIMARY KEY,
);
-- NOTE(review): the next two statements and the two after them declare the
-- same pair of FTS5 tables (books_fts and books_fts_stemmed). They differ only
-- in the `content` option: 'fts_db.books_text' here, 'books_text' below.
-- Presumably this is the before/after of a single change and only one pair is
-- meant to be executed -- confirm against the real schema file.
CREATE VIRTUAL TABLE fts_db.books_fts USING fts5(searchable_text, content = 'fts_db.books_text', content_rowid = 'id', tokenize = 'calibre remove_diacritics 2');
CREATE VIRTUAL TABLE fts_db.books_fts_stemmed USING fts5(searchable_text, content = 'fts_db.books_text', content_rowid = 'id', tokenize = 'porter calibre remove_diacritics 2');
-- Same two tables, with the content table named without the schema prefix.
-- In each pair the *_stemmed table puts the `porter` tokenizer in front of
-- `calibre`; everything else in the two statements is identical.
CREATE VIRTUAL TABLE fts_db.books_fts USING fts5(searchable_text, content = 'books_text', content_rowid = 'id', tokenize = 'calibre remove_diacritics 2');
CREATE VIRTUAL TABLE fts_db.books_fts_stemmed USING fts5(searchable_text, content = 'books_text', content_rowid = 'id', tokenize = 'porter calibre remove_diacritics 2');
CREATE TRIGGER fts_db.books_fts_insert_trg AFTER INSERT ON fts_db.books_text
BEGIN

View File

@ -971,7 +971,7 @@ class DB:
def commit_fts_result(self, book_id, fmt, fmt_size, fmt_hash, text, err_msg):
    """Delegate to ``self.fts.commit_result`` and return its result.

    All six arguments are passed through positionally and unchanged.
    """
    backend = self.fts
    outcome = backend.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
    return outcome
# NOTE(review): two header lines appear back to back below (`search` and
# `fts_search`). This looks like the before/after pair of a rename with the
# diff markers stripped, so only one of them can be the live definition --
# confirm which name callers use before relying on either.
def search(self,
def fts_search(self,
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
):
# Generator wrapper: forwards all six arguments positionally to
# self.fts.search() and re-yields every item that call produces.
yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)

View File

@ -123,35 +123,32 @@ class FTS:
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
):
fts_engine_query = unicode_normalize(fts_engine_query)
fts_table = 'books_fts_stemmed' if use_stemming else 'books_fts'
fts_table = 'books_fts' + ('_stemmed' if use_stemming else '')
text = 'books_text.searchable_text'
if highlight_start is not None and highlight_end is not None:
if snippet_size is not None:
text = 'snippet({fts_table}, 0, "{highlight_start}", "{highlight_end}", "", {snippet_size})'.format(
fts_table=fts_table, highlight_start=highlight_start, highlight_end=highlight_end,
snippet_size=max(1, min(snippet_size, 64)))
text = f'snippet({fts_table}, 0, "{highlight_start}", "{highlight_end}", "", {max(1, min(snippet_size, 64))})'
else:
text = f'highlight({fts_table}, 0, "{highlight_start}", "{highlight_end}")'
text = f'highlight("{fts_table}", 0, "{highlight_start}", "{highlight_end}")'
query = 'SELECT {0}.id, {0}.book, {0}.format, {1} FROM {0} '
query = query.format('books_text', text)
query += ' JOIN {fts_table} ON books_text.id = {fts_table}.rowid'.format(fts_table=fts_table)
query += f' WHERE {fts_table} MATCH ?'
query += f' JOIN {fts_table} ON fts_db.books_text.id = {fts_table}.rowid'
query += f' WHERE "{fts_table}" MATCH ?'
data = [fts_engine_query]
query += f' ORDER BY {fts_table}.rank '
conn = self.get_connection()
try:
for (rowid, book_id, fmt, user_type, user, annot_data, text) in self.execute(query, tuple(data)):
for (rowid, book_id, fmt, text) in conn.execute(query, tuple(data)):
if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
continue
yield {
'id': rowid,
'book_id': book_id,
'format': fmt,
'user_type': user_type,
'user': user,
'text': text,
}
except apsw.SQLError as e:
raise FTSQueryError(fts_engine_query, query, e)
raise FTSQueryError(fts_engine_query, query, e) from e
def shutdown(self):
    """Call ``shutdown()`` on ``self.pool``; returns ``None``."""
    pool = self.pool
    pool.shutdown()

View File

@ -117,6 +117,24 @@ class FTSAPITest(BaseTest):
for w in workers:
self.assertFalse(w.is_alive())
def test_fts_search(self):
    """Add two short text formats and check what ``cache.fts_search`` returns.

    Covers plain matches (row ids and formats), full-text highlighting with
    custom start/end markers, and a size-limited snippet.
    """
    cache = self.new_library()
    fts = cache.enable_fts()
    self.wait_for_fts_to_finish(fts)
    self.assertFalse(fts.all_currently_dirty())

    # (book id, format, raw bytes) for each format added to the library.
    new_formats = (
        (1, 'TXT', b'some long text to help with testing search.'),
        (2, 'MD', b'some other long text that will also help with the testing of search'),
    )
    for book_id, fmt, raw in new_formats:
        cache.add_format(book_id, fmt, BytesIO(raw))
    self.assertTrue(fts.all_currently_dirty())
    self.wait_for_fts_to_finish(fts)
    self.assertFalse(fts.all_currently_dirty())

    def collect(field, *args, **kwargs):
        # Run one search and gather a single field of every result into a set.
        return {result[field] for result in cache.fts_search(*args, **kwargs)}

    self.ae(collect('id', 'help'), {1, 2})
    self.ae(collect('format', 'help'), {'TXT', 'MD'})
    self.ae(collect('id', 'also'), {2})

    highlighted = collect('text', 'also', highlight_start='[', highlight_end=']')
    self.ae(highlighted, {
        'some other long text that will [also] help with the testing of search'})

    snippet = collect('text', 'also', highlight_start='[', highlight_end=']', snippet_size=3)
    self.ae(snippet, {
        '…will [also] help…'})
def test_fts_triggers(self):
cache = self.init_cache()
# the cache fts jobs will clear dirtied flag so disable it