From 6f3cd9cc4459f6573050dc15264d80a39762aa80 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 20 Apr 2022 11:53:47 +0530
Subject: [PATCH] API for searching the FTS corpus

---
 src/calibre/db/__init__.py    |  9 +++++++
 src/calibre/db/backend.py     | 16 ++++++------
 src/calibre/db/cache.py       | 32 ++++++++++++++++++++++++
 src/calibre/db/fts/connect.py | 47 +++++++++++++++++++++++++++++++++-
 4 files changed, 94 insertions(+), 10 deletions(-)

diff --git a/src/calibre/db/__init__.py b/src/calibre/db/__init__.py
index 4f920976ce..1b9c618cf6 100644
--- a/src/calibre/db/__init__.py
+++ b/src/calibre/db/__init__.py
@@ -11,6 +11,15 @@ import numbers
 from polyglot.builtins import iteritems
 
 
+class FTSQueryError(ValueError):
+    ''' Raised when running a full text search query fails with an SQL error '''
+
+    def __init__(self, query, sql_statement, apsw_error):
+        ValueError.__init__(self, f'Failed to parse search query: {query} with error: {apsw_error}')
+        self.query = query
+        self.sql_statement = sql_statement
+
+
 def _get_next_series_num_for_list(series_indices, unwrap=True):
     from calibre.utils.config_base import tweaks
     from math import ceil, floor
diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py
index 610bae21f0..371ed71a5c 100644
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@@ -23,7 +23,7 @@ from calibre import as_unicode, force_unicode, isbytestring, prints
 from calibre.constants import (
     filesystem_encoding, iswindows, plugins, preferred_encoding
 )
-from calibre.db import SPOOL_SIZE
+from calibre.db import SPOOL_SIZE, FTSQueryError
 from calibre.db.annotations import annot_db_data, unicode_normalize
 from calibre.db.delete_service import delete_service
 from calibre.db.errors import NoSuchFormat
@@ -55,14 +55,6 @@ from polyglot.builtins import (
 # }}}
 
 
-class FTSQueryError(ValueError):
-
-    def __init__(self, query, sql_statement, apsw_error):
-        ValueError.__init__(self, f'Failed to parse search query: {query} with error: {apsw_error}')
-        self.query = query
-        self.sql_statement = sql_statement
-
-
 CUSTOM_DATA_TYPES = frozenset(('rating', 'text', 'comments', 'datetime',
     'int', 'float', 'bool', 'series', 'composite', 'enumeration'))
 WINDOWS_RESERVED_NAMES = frozenset('CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9'.split())
@@ -979,6 +971,12 @@ class DB:
     def commit_fts_result(self, book_id, fmt, fmt_size, fmt_hash, text, err_msg):
         return self.fts.commit_result(book_id, fmt, fmt_size, fmt_hash, text, err_msg)
 
+    def search(self,
+        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
+    ):
+        ''' Generator over the full text search matches, all arguments are passed unchanged to self.fts.search() '''
+        yield from self.fts.search(fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,)
+
     def shutdown_fts(self):
         if self.fts_enabled:
             self.fts.shutdown()
diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index 821f4734c1..6d7bba33ac 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -481,6 +481,38 @@ class Cache:
                 self.queue_next_fts_job()
         return existing
 
+    @read_api
+    def fts_search(
+        self,
+        fts_engine_query,
+        use_stemming=True,
+        highlight_start=None,
+        highlight_end=None,
+        snippet_size=None,
+        restrict_to_book_ids=None,
+    ):
+        '''
+        Search the full text search corpus. Returns a tuple of dicts, one per
+        matching text record, with the keys ``id``, ``book_id``, ``format`` and ``text``.
+
+        :param fts_engine_query: The query, it is passed to the FTS engine after unicode normalization
+        :param use_stemming: Search the stemmed index instead of the unstemmed one
+        :param highlight_start: Text inserted before each match, only used together with highlight_end
+        :param highlight_end: Text inserted after each match, only used together with highlight_start
+        :param snippet_size: If set along with both highlight markers, return only a snippet
+            of the text around the matches, the size is clamped to the range 1 to 64
+        :param restrict_to_book_ids: If not None, only matches whose book id is in this container are returned
+        :raises FTSQueryError: if running the query fails with an SQL error
+        '''
+        return tuple(self.backend.search(
+            fts_engine_query,
+            use_stemming=use_stemming,
+            highlight_start=highlight_start,
+            highlight_end=highlight_end,
+            snippet_size=snippet_size,
+            restrict_to_book_ids=restrict_to_book_ids,
+        ))
+
     # }}}
 
     # Cache Layer API {{{
diff --git a/src/calibre/db/fts/connect.py b/src/calibre/db/fts/connect.py
index b5cc8e29ca..922dc68ad5 100644
--- a/src/calibre/db/fts/connect.py
+++ b/src/calibre/db/fts/connect.py
@@ -3,13 +3,15 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal
 
 
-import builtins
+import builtins, apsw
 import hashlib
 import os
 import sys
 from contextlib import suppress
 
 from calibre.utils.date import EPOCH, utcnow
+from calibre.db import FTSQueryError
+from calibre.db.annotations import unicode_normalize
 
 from .pool import Pool
 from .schema_upgrade import SchemaUpgrade
@@ -117,5 +119,48 @@ class FTS:
             os.remove(path)
         return False
 
+    def search(self,
+        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, restrict_to_book_ids,
+    ):
+        '''
+        Generator yielding a dict with the keys id, book_id, format and text for every
+        books_text row matching fts_engine_query, ordered by the rank of the FTS table. The
+        text is the full searchable text or, when both highlight markers are given, the highlighted
+        text (only a snippet of it if snippet_size is also given). Raises FTSQueryError on SQL errors.
+        '''
+        fts_engine_query = unicode_normalize(fts_engine_query)
+        fts_table = 'books_fts_stemmed' if use_stemming else 'books_fts'
+        text_expr = 'books_text.searchable_text'
+        data = []
+        if highlight_start is not None and highlight_end is not None:
+            # Bind the markers as parameters, so that quotes inside them cannot break the SQL
+            data += [highlight_start, highlight_end]
+            if snippet_size is not None:
+                # Clamp the snippet size to the range 1 to 64
+                text_expr = "snippet({}, 0, ?, ?, '…', {})".format(fts_table, max(1, min(snippet_size, 64)))
+            else:
+                text_expr = f'highlight({fts_table}, 0, ?, ?)'
+        query = 'SELECT {0}.id, {0}.book, {0}.format, {1} FROM {0} '
+        query = query.format('books_text', text_expr)
+        query += ' JOIN {fts_table} ON books_text.id = {fts_table}.rowid'.format(fts_table=fts_table)
+        query += f' WHERE {fts_table} MATCH ?'
+        # The placeholders in the select list precede the MATCH one, so the query is bound last
+        data.append(fts_engine_query)
+        query += f' ORDER BY {fts_table}.rank '
+        try:
+            # The SELECT has exactly four result columns, so exactly four values are unpacked
+            # NOTE(review): self.execute() is not defined in the visible part of this class, confirm it exists
+            for (rowid, book_id, fmt, text) in self.execute(query, tuple(data)):
+                if restrict_to_book_ids is not None and book_id not in restrict_to_book_ids:
+                    continue
+                yield {
+                    'id': rowid,
+                    'book_id': book_id,
+                    'format': fmt,
+                    'text': text,
+                }
+        except apsw.SQLError as e:
+            raise FTSQueryError(fts_engine_query, query, e) from e
+
     def shutdown(self):
         self.pool.shutdown()