From 63b164241a5846ca40aa20f361c20a1b29333068 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 20 Jan 2013 14:34:26 +0530
Subject: [PATCH] Start work on implementing search in the new backend. Searching for date columns working.

---
 src/calibre/db/cache.py         |   6 +
 src/calibre/db/fields.py        |  46 ++++++
 src/calibre/db/search.py        | 331 ++++++++++++++++++++++++++++++++
 src/calibre/db/tests/reading.py |  20 +++
 4 files changed, 403 insertions(+)
 create mode 100644 src/calibre/db/search.py

diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index a631f9ea46..88a2196a61 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -13,6 +13,7 @@ from functools import wraps, partial
 
 from calibre.db.locking import create_locks, RecordLock
 from calibre.db.fields import create_field
+from calibre.db.search import Search
 from calibre.db.tables import VirtualTable
 from calibre.db.lazy import FormatMetadata, FormatsList
 from calibre.ebooks.metadata.book.base import Metadata
@@ -50,6 +51,7 @@ class Cache(object):
         self.record_lock = RecordLock(self.read_lock)
         self.format_metadata_cache = defaultdict(dict)
         self.formatter_template_cache = {}
+        self._search_api = Search(self.field_metadata.get_search_terms())
 
         # Implement locking for all simple read/write API methods
         # An unlocked version of the method is stored with the name starting
@@ -409,6 +411,10 @@ class Cache(object):
         else:
             return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
 
+    @read_api
+    def search(self, query, restriction):
+        return self._search_api(self, query, restriction)
+
     # }}}
 
 class SortKey(object):
diff --git a/src/calibre/db/fields.py b/src/calibre/db/fields.py
index 3808052549..43e89cdc6f 100644
--- a/src/calibre/db/fields.py
+++ b/src/calibre/db/fields.py
@@ -9,6 +9,7 @@ __copyright__ = '2011, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
 from threading import Lock
+from collections import defaultdict
 
 from calibre.db.tables import ONE_ONE, MANY_ONE, MANY_MANY
 from calibre.ebooks.metadata import title_sort
@@ -83,6 +84,15 @@ class Field(object):
         '''
         raise NotImplementedError()
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        '''
+        Return a generator that yields items of the form (value, set of book
+        ids that have this value). Here, value is a searchable value. For
+        OneToOneField the set of book ids will contain only a single id, but for
+        other fields it will generally have more than one id. Returned book ids
+        are restricted to the set of ids in candidates.
+        '''
+        raise NotImplementedError()
 
 class OneToOneField(Field):
 
@@ -102,6 +112,11 @@ class OneToOneField(Field):
         return {id_ : self._sort_key(self.table.book_col_map.get(id_,
             self._default_sort_key)) for id_ in all_book_ids}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        cbm = self.table.book_col_map
+        for book_id in candidates:
+            yield cbm.get(book_id, default_value), {book_id}
+
 class CompositeField(OneToOneField):
 
     def __init__(self, *args, **kwargs):
@@ -139,6 +154,9 @@ class CompositeField(OneToOneField):
         return {id_ : sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in
                 all_book_ids}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        for book_id in candidates:
+            yield self.get_value_with_cache(book_id, get_metadata), {book_id}
 
 class OnDeviceField(OneToOneField):
 
@@ -176,6 +194,10 @@ class OnDeviceField(OneToOneField):
         return {id_ : self.for_book(id_) for id_ in
                 all_book_ids}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        for book_id in candidates:
+            yield self.for_book(book_id, default_value=default_value), {book_id}
+
 class ManyToOneField(Field):
 
     def for_book(self, book_id, default_value=None):
@@ -206,6 +228,13 @@ class ManyToOneField(Field):
                 for cid in ans.itervalues()}
         return {id_ : sk_map[cid] for id_, cid in ans.iteritems()}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        cbm = self.table.col_book_map
+        for item_id, val in self.table.id_map.iteritems():
+            book_ids = set(cbm.get(item_id, ())).intersection(candidates)
+            if book_ids:
+                yield val, book_ids
+
 class ManyToManyField(Field):
 
     def __init__(self, *args, **kwargs):
@@ -241,6 +270,12 @@ class ManyToManyField(Field):
                 (self._default_sort_key,))
                 for id_, cids in ans.iteritems()}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        cbm = self.table.col_book_map
+        for item_id, val in self.table.id_map.iteritems():
+            book_ids = set(cbm.get(item_id, ())).intersection(candidates)
+            if book_ids:
+                yield val, book_ids
 
 class IdentifiersField(ManyToManyField):
 
@@ -276,6 +311,17 @@ class FormatsField(ManyToManyField):
     def format_fname(self, book_id, fmt):
         return self.table.fname_map[book_id][fmt.upper()]
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        val_map = defaultdict(set)
+        cbm = self.table.book_col_map
+        for book_id in candidates:
+            vals = cbm.get(book_id, ())
+            for val in vals:
+                val_map[val].add(book_id)
+
+        for val, book_ids in val_map.iteritems():
+            yield val, book_ids
+
 class SeriesField(ManyToOneField):
 
     def sort_key_for_series(self, book_id, get_lang, series_sort_order):
diff --git a/src/calibre/db/search.py b/src/calibre/db/search.py
new file mode 100644
index 0000000000..d304deeb9a
--- /dev/null
+++ b/src/calibre/db/search.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import re
+from functools import partial
+from datetime import timedelta
+
+from calibre.utils.config_base import prefs
+from calibre.utils.date import parse_date, UNDEFINED_DATE, now
+from calibre.utils.search_query_parser import SearchQueryParser, ParseException
+
+# TODO: Thread safety of saved searches
+
+class DateSearch(object): # {{{
+    '''
+    Match a date query against the values of a date field.
+
+    A query is ``true``, ``false``, or an optional relational operator (one
+    of the keys of ``self.operators``) followed by a date, a relative date
+    keyword (today, yesterday, thismonth) or ``<N>daysago``.
+    '''
+
+    def __init__(self):
+        # Maps operator -> (number of characters to strip from the query,
+        # comparison function)
+        self.operators = {
+            '=' : (1, self.eq),
+            '!=' : (2, self.ne),
+            '>' : (1, self.gt),
+            '>=' : (2, self.ge),
+            '<' : (1, self.lt),
+            '<=' : (2, self.le),
+        }
+        self.local_today = { '_today', 'today', icu_lower(_('today')) }
+        self.local_yesterday = { '_yesterday', 'yesterday', icu_lower(_('yesterday')) }
+        self.local_thismonth = { '_thismonth', 'thismonth', icu_lower(_('thismonth')) }
+        self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago'))
+
+    # The comparison functions take the date from the database, the date
+    # from the query and field_count, the number of leading date components
+    # to compare: 1 = year, 2 = year and month, 3 = year, month and day.
+
+    def eq(self, dbdate, query, field_count):
+        if dbdate.year == query.year:
+            if field_count == 1:
+                return True
+            if dbdate.month == query.month:
+                if field_count == 2:
+                    return True
+                return dbdate.day == query.day
+        return False
+
+    def ne(self, *args):
+        return not self.eq(*args)
+
+    def gt(self, dbdate, query, field_count):
+        if dbdate.year > query.year:
+            return True
+        if field_count > 1 and dbdate.year == query.year:
+            if dbdate.month > query.month:
+                return True
+            return (field_count == 3 and dbdate.month == query.month and
+                    dbdate.day > query.day)
+        return False
+
+    def le(self, *args):
+        return not self.gt(*args)
+
+    def lt(self, dbdate, query, field_count):
+        if dbdate.year < query.year:
+            return True
+        if field_count > 1 and dbdate.year == query.year:
+            if dbdate.month < query.month:
+                return True
+            return (field_count == 3 and dbdate.month == query.month and
+                    dbdate.day < query.day)
+        return False
+
+    def ge(self, *args):
+        return not self.lt(*args)
+
+    def __call__(self, query, field_iter):
+        '''
+        Return the set of book ids whose date value matches query.
+
+        :param query: The query text, expected to be already lower cased
+        :param field_iter: A callable taking no arguments that returns an
+                           iterable of (value, set of book ids) pairs
+        :raises ParseException: If the date or the number of days in the
+                                query cannot be converted
+        '''
+        matches = set()
+        if len(query) < 2:
+            return matches
+
+        if query == 'false':
+            for v, book_ids in field_iter():
+                if isinstance(v, (str, unicode)):
+                    v = parse_date(v)
+                if v is None or v <= UNDEFINED_DATE:
+                    matches |= book_ids
+            return matches
+
+        if query == 'true':
+            for v, book_ids in field_iter():
+                if isinstance(v, (str, unicode)):
+                    v = parse_date(v)
+                if v is not None and v > UNDEFINED_DATE:
+                    matches |= book_ids
+            return matches
+
+        # Strip the leading relational operator, if any. The two character
+        # operators must be tried first, as '>=' also starts with '>' and
+        # the iteration order of a dict is arbitrary.
+        relop = self.operators['='][-1]
+        for k in sorted(self.operators, key=len, reverse=True):
+            if query.startswith(k):
+                p, relop = self.operators[k]
+                query = query[p:]
+                break
+
+        if query in self.local_today:
+            qd = now()
+            field_count = 3
+        elif query in self.local_yesterday:
+            qd = now() - timedelta(1)
+            field_count = 3
+        elif query in self.local_thismonth:
+            qd = now()
+            field_count = 2
+        else:
+            m = self.daysago_pat.search(query)
+            if m is not None:
+                num = query[:-len(m.group(1))]
+                try:
+                    qd = now() - timedelta(int(num))
+                except Exception:
+                    raise ParseException(query, len(query), 'Number conversion error')
+                field_count = 3
+            else:
+                try:
+                    qd = parse_date(query, as_utc=False)
+                except Exception:
+                    raise ParseException(query, len(query), 'Date conversion error')
+                # The number of date components present in the query
+                if '-' in query:
+                    field_count = query.count('-') + 1
+                else:
+                    field_count = query.count('/') + 1
+
+        for v, book_ids in field_iter():
+            if isinstance(v, (str, unicode)):
+                v = parse_date(v)
+            if v is not None and relop(v, qd, field_count):
+                matches |= book_ids
+
+        return matches
+# }}}
+
+class Parser(SearchQueryParser):
+    '''
+    Query parser that resolves individual search terms against the fields
+    of a db cache. Only date searches are implemented so far.
+    '''
+
+    def __init__(self, dbcache, all_book_ids, gst, date_search,
+                 limit_search_columns, limit_search_columns_to, locations):
+        self.dbcache, self.all_book_ids = dbcache, all_book_ids
+        self.all_search_locations = frozenset(locations)
+        self.grouped_search_terms = gst
+        self.date_search = date_search
+        self.limit_search_columns, self.limit_search_columns_to = (
+            limit_search_columns, limit_search_columns_to)
+        super(Parser, self).__init__(locations, optimize=True)
+
+    @property
+    def field_metadata(self):
+        return self.dbcache.field_metadata
+
+    def universal_set(self):
+        return self.all_book_ids
+
+    def field_iter(self, name, candidates):
+        get_metadata = partial(self.dbcache._get_metadata, get_user_categories=False)
+        return self.dbcache.fields[name].iter_searchable_values(get_metadata,
+                                                                candidates)
+
+    def get_matches(self, location, query, candidates=None,
+                    allow_recursion=True):
+        '''
+        Return the set of book ids that match query at the search location.
+        Only candidates (all books if it is None) are searched.
+
+        :raises ParseException: If location is a grouped search term and
+                                allow_recursion is False
+        '''
+        # If candidates is not None, it must not be modified. Changing its
+        # value will break query optimization in the search parser
+        matches = set()
+
+        if candidates is None:
+            candidates = self.all_book_ids
+        if not candidates or not query or not query.strip():
+            return matches
+        if location not in self.all_search_locations:
+            return matches
+
+        if (len(location) > 2 and location.startswith('@') and
+                    location[1:] in self.grouped_search_terms):
+            location = location[1:]
+
+        # get metadata key associated with the search term. Eliminates
+        # dealing with plurals and other aliases
+        # original_location = location
+        location = self.field_metadata.search_term_to_field_key(
+            icu_lower(location.strip()))
+        # grouped search terms
+        if isinstance(location, list):
+            if allow_recursion:
+                if query.lower() == 'false':
+                    invert = True
+                    query = 'true'
+                else:
+                    invert = False
+                # Copy once, outside the loop, so that books matched by one
+                # location are not searched again in the remaining ones
+                c = candidates.copy()
+                for loc in location:
+                    m = self.get_matches(loc, query,
+                            candidates=c, allow_recursion=False)
+                    matches |= m
+                    c -= m
+                    if len(c) == 0:
+                        break
+                if invert:
+                    matches = self.all_book_ids - matches
+                return matches
+            raise ParseException(query, len(query), 'Recursive query group detected')
+
+        # If the user has asked to restrict searching over all field, apply
+        # that restriction
+        if (location == 'all' and self.limit_search_columns and
+            self.limit_search_columns_to):
+            terms = set()
+            for l in self.limit_search_columns_to:
+                l = icu_lower(l.strip())
+                if l and l != 'all' and l in self.all_search_locations:
+                    terms.add(l)
+            if terms:
+                c = candidates.copy()
+                for l in terms:
+                    try:
+                        m = self.get_matches(l, query,
+                            candidates=c, allow_recursion=allow_recursion)
+                        matches |= m
+                        c -= m
+                        if len(c) == 0:
+                            break
+                    except Exception:
+                        # Best effort: a failure in one column must not
+                        # prevent searching the remaining columns
+                        pass
+                return matches
+
+        if location in self.field_metadata:
+            fm = self.field_metadata[location]
+            # take care of dates special case
+            if (fm['datatype'] == 'datetime' or
+                (fm['datatype'] == 'composite' and
+                 fm['display'].get('composite_sort', '') == 'date')):
+                if location == 'date':
+                    location = 'timestamp'
+                return self.date_search(
+                    icu_lower(query), partial(self.field_iter, location, candidates))
+
+        return matches
+
+
+class Search(object):
+    '''
+    The search API of the db cache. Call an instance to run a query.
+    '''
+
+    def __init__(self, all_search_locations):
+        self.all_search_locations = all_search_locations
+        self.date_search = DateSearch()
+
+    def change_locations(self, newlocs):
+        self.all_search_locations = newlocs
+
+    def __call__(self, dbcache, query, search_restriction):
+        '''
+        Return the set of ids of all records that match the specified
+        query and restriction. If both are empty, the ids of all books
+        are returned.
+        '''
+        q = ''
+        if not query or not query.strip():
+            q = search_restriction
+        else:
+            q = query
+            if search_restriction:
+                q = u'(%s) and (%s)' % (search_restriction, query)
+
+        all_book_ids = dbcache.all_book_ids(type=set)
+        if not q:
+            return all_book_ids
+
+        # We construct a new parser instance per search as pyparsing is not
+        # thread safe. On my desktop, constructing a SearchQueryParser instance
+        # takes 0.000975 seconds and restoring it from a pickle takes
+        # 0.000974 seconds.
+        sqp = Parser(
+            dbcache, all_book_ids, dbcache.pref('grouped_search_terms'),
+            self.date_search, prefs[ 'limit_search_columns' ],
+            prefs[ 'limit_search_columns_to' ], self.all_search_locations)
+        try:
+            # Parse the combined query, not just the user query, so that the
+            # restriction is actually applied
+            ret = sqp.parse(q)
+        finally:
+            # Drop the reference to the cache held by the parser
+            sqp.dbcache = None
+        return ret
+
diff --git a/src/calibre/db/tests/reading.py b/src/calibre/db/tests/reading.py
index d77d3ac6eb..22d1bba37e 100644
--- a/src/calibre/db/tests/reading.py
+++ b/src/calibre/db/tests/reading.py
@@ -191,6 +191,26 @@ class ReadingTest(BaseTest):
 
     # }}}
 
+    def test_searching(self): # {{{
+        'Test searching returns the same data for both backends'
+        from calibre.library.database2 import LibraryDatabase2
+        old = LibraryDatabase2(self.library_path)
+        oldvals = {query:set(old.search_getting_ids(query, '')) for query in (
+            'date:9/6/2011', 'date:true', 'date:false', 'pubdate:9/2011',
+            '#date:true', 'date:<100daysago', 'date:>9/6/2011',
+            '#date:>9/1/2011', '#date:=2011',
+        )}
+        old = None
+
+        cache = self.init_cache(self.library_path)
+        for query, ans in oldvals.iteritems():
+            nr = cache.search(query, '')
+            self.assertEqual(ans, nr,
+                'Old result: %r != New result: %r for search: %s'%(
+                    ans, nr, query))
+
+    # }}}
+
 def tests():
     return unittest.TestLoader().loadTestsFromTestCase(ReadingTest)
 