API for searching the FTS corpus

This commit is contained in:
Kovid Goyal 2022-04-20 11:53:47 +05:30
parent fc80be414c
commit 6f3cd9cc44
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 70 additions and 10 deletions

View File

@ -11,6 +11,14 @@ import numbers
from polyglot.builtins import iteritems from polyglot.builtins import iteritems
class FTSQueryError(ValueError):
    """Raised when a full-text search query cannot be parsed.

    Attributes:
        query: The search expression that failed.
        sql_statement: The SQL statement that was being executed.
        apsw_error: The underlying error object passed in by the caller.
    """

    def __init__(self, query, sql_statement, apsw_error):
        super().__init__(f'Failed to parse search query: {query} with error: {apsw_error}')
        self.query = query
        self.sql_statement = sql_statement
        # Keep the original error object so callers can inspect it directly
        # instead of having to dig it back out of the message text.
        self.apsw_error = apsw_error
def _get_next_series_num_for_list(series_indices, unwrap=True): def _get_next_series_num_for_list(series_indices, unwrap=True):
from calibre.utils.config_base import tweaks from calibre.utils.config_base import tweaks
from math import ceil, floor from math import ceil, floor

View File

@ -23,7 +23,7 @@ from calibre import as_unicode, force_unicode, isbytestring, prints
from calibre.constants import ( from calibre.constants import (
filesystem_encoding, iswindows, plugins, preferred_encoding filesystem_encoding, iswindows, plugins, preferred_encoding
) )
from calibre.db import SPOOL_SIZE from calibre.db import SPOOL_SIZE, FTSQueryError
from calibre.db.annotations import annot_db_data, unicode_normalize from calibre.db.annotations import annot_db_data, unicode_normalize
from calibre.db.delete_service import delete_service from calibre.db.delete_service import delete_service
from calibre.db.errors import NoSuchFormat from calibre.db.errors import NoSuchFormat
@ -55,14 +55,6 @@ from polyglot.builtins import (
# }}} # }}}
class FTSQueryError(ValueError):
    '''Error signalling that a full-text search query failed to parse.

    The offending query and the SQL statement being run are kept on the
    instance as ``query`` and ``sql_statement``.
    '''

    def __init__(self, query, sql_statement, apsw_error):
        message = f'Failed to parse search query: {query} with error: {apsw_error}'
        super().__init__(message)
        self.sql_statement = sql_statement
        self.query = query
CUSTOM_DATA_TYPES = frozenset(('rating', 'text', 'comments', 'datetime', CUSTOM_DATA_TYPES = frozenset(('rating', 'text', 'comments', 'datetime',
'int', 'float', 'bool', 'series', 'composite', 'enumeration')) 'int', 'float', 'bool', 'series', 'composite', 'enumeration'))
WINDOWS_RESERVED_NAMES = frozenset('CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9'.split()) WINDOWS_RESERVED_NAMES = frozenset('CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9'.split())
@ -979,6 +971,11 @@ class DB:
def commit_fts_result(self, book_id, fmt, fmt_size, fmt_hash, text, err_msg): def commit_fts_result(self, book_id, fmt, fmt_size, fmt_hash, text, err_msg):
return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg) return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
def fts_search(self,
    fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
):
    """Lazily yield the results of a full-text search.

    All arguments are forwarded unchanged, in the same order, to
    ``self.fts.search()``; whatever that generator yields is yielded here.

    NOTE(review): the cache layer invokes this as ``backend.fts_search(...)``,
    which is why this is the primary name -- confirm that the cache's
    ``backend`` is an instance of this class.
    """
    yield from self.fts.search(
        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
    )

# Original name, kept as an alias so existing callers keep working.
search = fts_search
def shutdown_fts(self): def shutdown_fts(self):
if self.fts_enabled: if self.fts_enabled:
self.fts.shutdown() self.fts.shutdown()

View File

@ -481,6 +481,25 @@ class Cache:
self.queue_next_fts_job() self.queue_next_fts_job()
return existing return existing
@read_api
def fts_search(
    self, fts_engine_query, use_stemming=True, highlight_start=None,
    highlight_end=None, snippet_size=None, restrict_to_book_ids=None,
):
    '''
    Run a full-text search through the backend and return every result as
    a tuple.

    The query is passed positionally and the remaining options by keyword
    to ``self.backend.fts_search()``. The backend's iterable is fully
    consumed before this method returns.
    '''
    options = {
        'use_stemming': use_stemming,
        'highlight_start': highlight_start,
        'highlight_end': highlight_end,
        'snippet_size': snippet_size,
        'restrict_to_book_ids': restrict_to_book_ids,
    }
    matches = self.backend.fts_search(fts_engine_query, **options)
    return tuple(matches)
# }}} # }}}
# Cache Layer API {{{ # Cache Layer API {{{

View File

@ -3,13 +3,15 @@
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net> # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import builtins import builtins, apsw
import hashlib import hashlib
import os import os
import sys import sys
from contextlib import suppress from contextlib import suppress
from calibre.utils.date import EPOCH, utcnow from calibre.utils.date import EPOCH, utcnow
from calibre.db import FTSQueryError
from calibre.db.annotations import unicode_normalize
from .pool import Pool from .pool import Pool
from .schema_upgrade import SchemaUpgrade from .schema_upgrade import SchemaUpgrade
@ -117,5 +119,39 @@ class FTS:
os.remove(path) os.remove(path)
return False return False
def search(self,
    fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
):
    """Search the indexed book text and yield one dict per matching row.

    :param fts_engine_query: Query expression for the FTS engine; it is
        passed through ``unicode_normalize()`` before being used.
    :param use_stemming: If true, search the ``books_fts_stemmed`` table,
        otherwise ``books_fts``.
    :param highlight_start: Marker placed before each match in the returned
        text. Highlighting happens only when both markers are not None.
    :param highlight_end: Marker placed after each match.
    :param snippet_size: When highlighting, return a snippet of at most this
        many tokens (clamped to 1..64) instead of the whole highlighted
        text. ``None`` means highlight the whole text.
    :param restrict_to_book_ids: Optional container of book ids; rows for
        other books are skipped. ``None`` means no restriction.
    :return: Generator of dicts with the keys ``id``, ``book_id``,
        ``format`` and ``text``, ordered by the FTS rank.
    :raises FTSQueryError: If the database reports an SQL error, which is
        how a malformed ``fts_engine_query`` shows up.
    """
    fts_engine_query = unicode_normalize(fts_engine_query)
    fts_table = 'books_fts_stemmed' if use_stemming else 'books_fts'
    text = 'books_text.searchable_text'
    data = []
    if highlight_start is not None and highlight_end is not None:
        # The markers are bound as parameters rather than pasted into the
        # SQL, so quote characters in them cannot break or alter the query.
        if snippet_size is not None:
            text = f"snippet({fts_table}, 0, ?, ?, '', {max(1, min(snippet_size, 64))})"
        else:
            text = f'highlight({fts_table}, 0, ?, ?)'
        data += [highlight_start, highlight_end]
    # The MATCH placeholder appears after the ones in the column list, so
    # its value has to be bound last.
    data.append(fts_engine_query)
    query = (
        f'SELECT books_text.id, books_text.book, books_text.format, {text} FROM books_text '
        f' JOIN {fts_table} ON books_text.id = {fts_table}.rowid'
        f' WHERE {fts_table} MATCH ?'
        f' ORDER BY {fts_table}.rank '
    )
    try:
        # Exactly four columns are selected, so exactly four are unpacked.
        for rowid, book_id, fmt, matched_text in self.execute(query, tuple(data)):
            if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
                continue
            yield {
                'id': rowid,
                'book_id': book_id,
                'format': fmt,
                'text': matched_text,
            }
    except apsw.SQLError as e:
        raise FTSQueryError(fts_engine_query, query, e) from e
def shutdown(self): def shutdown(self):
self.pool.shutdown() self.pool.shutdown()