calibredb: A new fts_search command to perform full text searching

This commit is contained in:
Kovid Goyal 2022-08-08 08:26:23 +05:30
parent 9ae2074669
commit 6662e353aa
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 168 additions and 1 deletions

View File

@ -0,0 +1,167 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
import re
# Module-level flag; presumably tells the calibredb command dispatcher that
# this command never modifies the library -- confirm against the dispatcher.
readonly = True
# Interface version of implementation(); see the inline note for when to bump.
version = 0 # change this if you change signature of implementation()
def implementation(db, notify_changes, query, adata):
    """Run a full text search against ``db`` and collect per-book metadata.

    :param db: database object; must provide ``search()``, ``fts_search()``,
        ``safe_read_lock`` and ``_field_for()``.
    :param notify_changes: unused by this command.
    :param query: the full text search expression, passed to ``db.fts_search``.
    :param adata: dict of settings with the keys ``restrict_to``,
        ``include_snippets``, ``use_stemming``, ``start_marker``,
        ``end_marker`` and ``as_tuple``.
    :return: a 2-tuple ``(results, metadata_cache)`` where ``results`` is
        whatever ``db.fts_search`` returns and ``metadata_cache`` maps each
        seen book id to ``{'title': ..., 'authors': ...}``.
    :raises FTSQueryError: re-raised from ``db.fts_search`` after being tagged
        with ``suppress_traceback = True``.
    """
    # A string restriction is a search expression to resolve into book ids;
    # anything else truthy is treated as an iterable of ids.
    book_filter = adata['restrict_to']
    if not book_filter:
        restrict_to = None
    elif isinstance(book_filter, str):
        restrict_to = db.search(book_filter)
    else:
        restrict_to = set(book_filter)

    include_snippets = adata['include_snippets']
    metadata_cache = {}

    def add_metadata(result):
        # Handed to db.fts_search as process_each_result (presumably called
        # once per match): trims the result dict and records the title and
        # authors of the book the first time that book is seen.
        result.pop('id', None)
        if not include_snippets:
            result.pop('text', None)
        book_id = result['book_id']
        if book_id in metadata_cache:
            return result
        with db.safe_read_lock:
            metadata_cache[book_id] = {
                'title': db._field_for('title', book_id),
                'authors': db._field_for('authors', book_id),
            }
        return result

    from calibre.db import FTSQueryError
    try:
        matches = db.fts_search(
            query,
            use_stemming=adata['use_stemming'],
            highlight_start=adata['start_marker'],
            highlight_end=adata['end_marker'],
            return_text=include_snippets,
            restrict_to_book_ids=restrict_to,
            result_type=tuple if adata['as_tuple'] else lambda x: x,
            process_each_result=add_metadata,
            snippet_size=64,
        )
        return matches, metadata_cache
    except FTSQueryError as e:
        # Mark the error so that callers can report it without a traceback.
        e.suppress_traceback = True
        raise e
def option_parser(get_parser, args):
    """Build the command line option parser for ``fts_search``.

    :param get_parser: callable that receives the (translated) usage text and
        returns a parser object exposing ``add_option()``.
    :param args: unused here.
    :return: the parser returned by ``get_parser`` with all options added.
    """
    usage = _(
        '''\
%prog fts_search [options] search expression
Do a full text search on the entire library or a subset of it.
'''
    )
    parser = get_parser(usage)

    # One entry per option, in the order they are registered on the parser.
    option_specs = (
        ('--include-snippets', dict(
            default=False,
            action='store_true',
            help=_('Include snippets of the text surrounding each match. Note that this makes searching much slower.'),
        )),
        ('--match-start-marker', dict(
            default='\x1b[31m',
            help=_('The marker used to indicate the start of a matched word inside a snippet'),
        )),
        ('--match-end-marker', dict(
            default='\x1b[m',
            help=_('The marker used to indicate the end of a matched word inside a snippet'),
        )),
        ('--do-not-match-on-related-words', dict(
            default=True,
            dest='use_stemming',
            action='store_false',
            help=_('Only match on exact words not related words. So correction will not match correcting.'),
        )),
        ('--restrict-to', dict(
            default='',
            help=_('Restrict the searched books, either using a search expression or ids.'
                   ' For example: ids:1,2,3 to restrict by ids or search:tag:foo to restrict to books having the tag foo.'),
        )),
        ('--output-format', dict(
            default='text',
            choices=('text', 'json'),
            help=_('The format to output the search results in. Either "text" for plain text or "json" for JSON output.'),
        )),
    )
    for flag, spec in option_specs:
        parser.add_option(flag, **spec)
    return parser
def output_results_as_text(results, metadata_cache, include_snippets):
    """Print search results to stdout as human readable text.

    :param results: iterable of result dicts; each must have ``book_id`` and
        ``format`` keys, plus a ``text`` key when ``include_snippets`` is true.
    :param metadata_cache: mapping of book id to a dict with ``title`` and
        ``authors`` entries.
    :param include_snippets: when false, one entry is printed per book with
        all of its matching formats; when true, one entry is printed per run
        of consecutive results that share a book id and snippet text.
    """
    from calibre.utils.terminal import geometry
    from calibre.ebooks.metadata import authors_to_string
    # geometry()[0] is presumably the terminal width in columns -- confirm.
    width = max(5, geometry()[0])
    # A horizontal rule across the terminal. The rule character had been
    # lost, leaving '' * width, which is always the empty string.
    separator = '─' * width

    def print_entry(book_id, formats, snippet=None):
        # Print the header for one book, an optional snippet, then the rule.
        m = metadata_cache[book_id]
        print(_('{0} by {1}').format(m['title'], authors_to_string(m['authors'])))
        print(f'Book id: {book_id} Formats: {", ".join(formats)}')
        if snippet is not None:
            print(snippet)
        print(separator)

    if not include_snippets:
        # Group every matching format under its book, keeping first-seen order.
        formats_by_book = {}
        for result in results:
            formats_by_book.setdefault(result['book_id'], []).append(result['format'])
        for book_id, formats in formats_by_book.items():
            print_entry(book_id, formats)
        return

    whitespace_run = re.compile(r'\s+')
    current_id = None
    current_text = ''
    current_formats = []
    for result in results:
        # Collapse whitespace so identical snippets from different formats
        # of the same book compare equal and are merged into one entry.
        text = whitespace_run.sub(' ', result['text'])
        if current_id is not None and result['book_id'] == current_id and text == current_text:
            current_formats.append(result['format'])
            continue
        if current_id is not None:
            print_entry(current_id, current_formats, current_text)
        current_id, current_text, current_formats = result['book_id'], text, [result['format']]
    if current_id is not None:
        print_entry(current_id, current_formats, current_text)
def main(opts, args, dbctx):
    """Command line entry point for ``fts_search``.

    :param opts: parsed options; reads ``restrict_to``, ``match_start_marker``,
        ``match_end_marker``, ``use_stemming``, ``include_snippets`` and
        ``output_format``.
    :param args: positional arguments; joined with spaces to form the search
        expression.
    :param dbctx: database context providing ``option_parser``, ``run()`` and
        ``is_remote``.
    :return: ``0`` on success.
    :raises SystemExit: when no search expression is given, when
        ``--restrict-to`` is malformed, or when the search fails with an
        error that should be reported without a traceback.
    """
    if len(args) < 1:
        dbctx.option_parser.print_help()
        raise SystemExit(_('Error: You must specify the search expression'))
    search_expression = ' '.join(args)
    restrict_to = ''
    if opts.restrict_to:
        q, v = opts.restrict_to.partition(':')[::2]
        if q == 'ids':
            try:
                restrict_to = tuple(set(map(int, v.split(','))))
            except ValueError:
                # Bad user input is reported like the other option errors
                # here rather than surfacing as a raw traceback.
                raise SystemExit(
                    f'The ids: form of --restrict-to must be a comma separated list of integer book ids, not: {v!r}'
                ) from None
        elif q == 'search':
            restrict_to = v
        else:
            raise SystemExit('The --restrict-to option must start with either ids: or search:')
    from calibre.db import FTSQueryError
    try:
        results, metadata_cache = dbctx.run('fts_search', search_expression, {
            'start_marker': opts.match_start_marker, 'end_marker': opts.match_end_marker, 'use_stemming': opts.use_stemming,
            'include_snippets': opts.include_snippets, 'restrict_to': restrict_to, 'as_tuple': dbctx.is_remote
        })
        if opts.output_format == 'json':
            if not dbctx.is_remote:
                # as_tuple is only requested for remote contexts, so the
                # local result is materialised into a tuple here.
                results = tuple(results)
            for r in results:
                m = metadata_cache[r['book_id']]
                r['title'], r['authors'] = m['title'], m['authors']
            import json
            print(json.dumps(results, sort_keys=True, indent=' '))
        else:
            output_results_as_text(results, metadata_cache, opts.include_snippets)
    except FTSQueryError as e:
        raise SystemExit(str(e))
    except Exception as e:
        # Errors tagged with suppress_traceback (see implementation()) are
        # shown as a plain message; anything else propagates.
        if getattr(e, 'suppress_traceback', False):
            raise SystemExit(str(e))
        raise
    return 0

View File

@ -21,7 +21,7 @@ COMMANDS = (
'set_metadata', 'export', 'catalog', 'saved_searches', 'add_custom_column', 'set_metadata', 'export', 'catalog', 'saved_searches', 'add_custom_column',
'custom_columns', 'remove_custom_column', 'set_custom', 'restore_database', 'custom_columns', 'remove_custom_column', 'set_custom', 'restore_database',
'check_library', 'list_categories', 'backup_metadata', 'clone', 'embed_metadata', 'check_library', 'list_categories', 'backup_metadata', 'clone', 'embed_metadata',
'search', 'fts_index' 'search', 'fts_index', 'fts_search',
) )