API for searching the FTS corpus

This commit is contained in:
Kovid Goyal 2022-04-20 11:53:47 +05:30
parent fc80be414c
commit 6f3cd9cc44
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 70 additions and 10 deletions

View File

@ -11,6 +11,14 @@ import numbers
from polyglot.builtins import iteritems
class FTSQueryError(ValueError):
    '''
    Error raised when a full text search query could not be parsed.

    The offending query and the SQL statement it was embedded in are kept
    as the ``query`` and ``sql_statement`` attributes; the underlying
    database error is only included in the exception message.
    '''

    def __init__(self, query, sql_statement, apsw_error):
        message = f'Failed to parse search query: {query} with error: {apsw_error}'
        super().__init__(message)
        self.sql_statement = sql_statement
        self.query = query
def _get_next_series_num_for_list(series_indices, unwrap=True):
from calibre.utils.config_base import tweaks
from math import ceil, floor

View File

@ -23,7 +23,7 @@ from calibre import as_unicode, force_unicode, isbytestring, prints
from calibre.constants import (
filesystem_encoding, iswindows, plugins, preferred_encoding
)
from calibre.db import SPOOL_SIZE
from calibre.db import SPOOL_SIZE, FTSQueryError
from calibre.db.annotations import annot_db_data, unicode_normalize
from calibre.db.delete_service import delete_service
from calibre.db.errors import NoSuchFormat
@ -55,14 +55,6 @@ from polyglot.builtins import (
# }}}
class FTSQueryError(ValueError):
    '''
    Signals that a full text search query failed to parse.

    ``query`` holds the query text and ``sql_statement`` the SQL that was
    being executed; the database error appears only in the message.
    '''

    def __init__(self, query, sql_statement, apsw_error):
        description = f'Failed to parse search query: {query} with error: {apsw_error}'
        super().__init__(description)
        self.sql_statement = sql_statement
        self.query = query
# Names of the recognised custom data types.
CUSTOM_DATA_TYPES = frozenset({
    'rating', 'text', 'comments', 'datetime', 'int',
    'float', 'bool', 'series', 'composite', 'enumeration',
})
# Device names reserved on Windows: the four basic devices plus the
# numbered COM and LPT ports 1 through 9.
WINDOWS_RESERVED_NAMES = frozenset(
    ('CON', 'PRN', 'AUX', 'NUL')
    + tuple(f'COM{port}' for port in range(1, 10))
    + tuple(f'LPT{port}' for port in range(1, 10))
)
@ -979,6 +971,11 @@ class DB:
def commit_fts_result(self, book_id, fmt, fmt_size, fmt_hash, text, err_msg):
    '''
    Hand the outcome of processing one book format to the FTS object.

    All arguments are forwarded unchanged, in order, to
    ``self.fts.commit_result`` and its return value is returned.
    '''
    fts = self.fts
    outcome = fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
    return outcome
def search(
    self, fts_engine_query, use_stemming, highlight_start, highlight_end,
    snippet_size, restrict_to_book_ids,
):
    '''
    Generator that relays the results of ``self.fts.search``.

    The six arguments are passed positionally, unchanged and in the same
    order; every item produced by the FTS object is yielded as-is.
    '''
    forwarded = (
        fts_engine_query, use_stemming, highlight_start, highlight_end,
        snippet_size, restrict_to_book_ids,
    )
    yield from self.fts.search(*forwarded)
def shutdown_fts(self):
    '''Shut down the FTS object, but only when ``self.fts_enabled`` is true.'''
    if not self.fts_enabled:
        # self.fts is deliberately not touched when FTS is disabled
        return
    self.fts.shutdown()

View File

@ -481,6 +481,25 @@ class Cache:
self.queue_next_fts_job()
return existing
@read_api
def fts_search(
    self,
    fts_engine_query,
    use_stemming=True,
    highlight_start=None,
    highlight_end=None,
    snippet_size=None,
    restrict_to_book_ids=None,
):
    '''
    Search the full text search corpus and return all results as a tuple.

    :param fts_engine_query: The query, in the syntax of the FTS engine
    :param use_stemming: Passed to the backend to choose between stemmed
        and unstemmed matching
    :param highlight_start: Marker placed before matched text, or None
    :param highlight_end: Marker placed after matched text, or None
    :param snippet_size: When not None (and both markers are given) the
        backend returns a snippet instead of the full highlighted text
    :param restrict_to_book_ids: Optional container of book ids; results
        for books outside it are dropped by the backend
    :return: A tuple of the results produced by the backend search
    '''
    # The backend exposes this operation as search(); calling a
    # non-existent fts_search() attribute there raised AttributeError.
    # The backend generator is consumed fully here, into a tuple, before
    # this method returns.
    return tuple(self.backend.search(
        fts_engine_query,
        use_stemming=use_stemming,
        highlight_start=highlight_start,
        highlight_end=highlight_end,
        snippet_size=snippet_size,
        restrict_to_book_ids=restrict_to_book_ids,
    ))
# }}}
# Cache Layer API {{{

View File

@ -3,13 +3,15 @@
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import builtins
import builtins, apsw
import hashlib
import os
import sys
from contextlib import suppress
from calibre.utils.date import EPOCH, utcnow
from calibre.db import FTSQueryError
from calibre.db.annotations import unicode_normalize
from .pool import Pool
from .schema_upgrade import SchemaUpgrade
@ -117,5 +119,39 @@ class FTS:
os.remove(path)
return False
def search(self,
    fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
):
    '''
    Run a full text search and yield one dict per matching row.

    :param fts_engine_query: Query text; it is unicode normalized before use
    :param use_stemming: Search the ``books_fts_stemmed`` table when true,
        otherwise ``books_fts``
    :param highlight_start: Marker inserted before matched text
    :param highlight_end: Marker inserted after matched text. Highlighting
        is only done when both markers are not None, otherwise the full
        searchable text is returned
    :param snippet_size: When not None (and highlighting is on) return a
        snippet of about this many tokens, clamped to the range 1-64,
        instead of the whole highlighted text
    :param restrict_to_book_ids: When not None, rows whose book id is not
        in this container are skipped
    :return: Generator of dicts with the keys ``id``, ``book_id``,
        ``format`` and ``text``, ordered by FTS rank
    :raises FTSQueryError: If the database reports an SQL error, typically
        because the query could not be parsed
    '''
    fts_engine_query = unicode_normalize(fts_engine_query)
    fts_table = 'books_fts_stemmed' if use_stemming else 'books_fts'
    text = 'books_text.searchable_text'
    data = []
    if highlight_start is not None and highlight_end is not None:
        # The markers are bound as parameters rather than interpolated
        # inside double quotes: SQLite resolves a double-quoted token as an
        # identifier first, so a marker that matched a column name (or that
        # contained a quote character) would corrupt the statement.
        if snippet_size is not None:
            clamped = max(1, min(snippet_size, 64))
            text = f"snippet({fts_table}, 0, ?, ?, '', {clamped})"
        else:
            text = f'highlight({fts_table}, 0, ?, ?)'
        data.extend((highlight_start, highlight_end))
    query = (
        f'SELECT books_text.id, books_text.book, books_text.format, {text} FROM books_text '
        f' JOIN {fts_table} ON books_text.id = {fts_table}.rowid'
        f' WHERE {fts_table} MATCH ?'
        f' ORDER BY {fts_table}.rank '
    )
    # The MATCH placeholder comes after the ones in the SELECT list
    data.append(fts_engine_query)
    try:
        # Exactly four columns are selected, so exactly four are unpacked
        for (rowid, book_id, fmt, text) in self.execute(query, tuple(data)):
            if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
                continue
            yield {
                'id': rowid,
                'book_id': book_id,
                'format': fmt,
                'text': text,
            }
    except apsw.SQLError as e:
        raise FTSQueryError(fts_engine_query, query, e) from e
# Shut down the pool held in self.pool.
# NOTE(review): this definition sits at the end of the visible hunk, so
# further statements may follow that are not shown here.
def shutdown(self):
self.pool.shutdown()