Fix #8007 (Search performance on multiple words)

This commit is contained in:
Kovid Goyal 2011-01-08 19:17:39 -07:00
parent 95a9ee5b76
commit 8bcdb0fed7
2 changed files with 81 additions and 30 deletions

View File

@ -181,7 +181,7 @@ class ResultCache(SearchQueryParser): # {{{
self.search_restriction = ''
self.field_metadata = field_metadata
self.all_search_locations = field_metadata.get_search_terms()
SearchQueryParser.__init__(self, self.all_search_locations)
SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
self.build_date_relop_dict()
self.build_numeric_relop_dict()
@ -264,7 +264,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, relop_le]
}
def get_dates_matches(self, location, query):
def get_dates_matches(self, location, query, candidates):
matches = set([])
if len(query) < 2:
return matches
@ -274,13 +274,15 @@ class ResultCache(SearchQueryParser): # {{{
loc = self.field_metadata[location]['rec_index']
if query == 'false':
for item in self._data:
for id_ in candidates:
item = self._data[id_]
if item is None: continue
if item[loc] is None or item[loc] <= UNDEFINED_DATE:
matches.add(item[0])
return matches
if query == 'true':
for item in self._data:
for id_ in candidates:
item = self._data[id_]
if item is None: continue
if item[loc] is not None and item[loc] > UNDEFINED_DATE:
matches.add(item[0])
@ -319,7 +321,8 @@ class ResultCache(SearchQueryParser): # {{{
field_count = query.count('-') + 1
else:
field_count = query.count('/') + 1
for item in self._data:
for id_ in candidates:
item = self._data[id_]
if item is None or item[loc] is None: continue
if relop(item[loc], qd, field_count):
matches.add(item[0])
@ -335,7 +338,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, lambda r, q: r <= q]
}
def get_numeric_matches(self, location, query, val_func = None):
def get_numeric_matches(self, location, query, candidates, val_func = None):
matches = set([])
if len(query) == 0:
return matches
@ -381,7 +384,8 @@ class ResultCache(SearchQueryParser): # {{{
except:
return matches
for item in self._data:
for id_ in candidates:
item = self._data[id_]
if item is None:
continue
v = val_func(item)
@ -393,8 +397,13 @@ class ResultCache(SearchQueryParser): # {{{
matches.add(item[0])
return matches
def get_matches(self, location, query, allow_recursion=True):
def get_matches(self, location, query, allow_recursion=True, candidates=None):
matches = set([])
if candidates is None:
candidates = self.universal_set()
if len(candidates) == 0:
return matches
if query and query.strip():
# get metadata key associated with the search term. Eliminates
# dealing with plurals and other aliases
@ -476,7 +485,8 @@ class ResultCache(SearchQueryParser): # {{{
else:
q = query
for item in self._data:
for id_ in candidates:
    # Index with the loop variable ``id_``; the bare name ``id`` is the
    # builtin function and would never select the candidate's record.
    item = self._data[id_]
if item is None: continue
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak

View File

@ -118,8 +118,9 @@ class SearchQueryParser(object):
failed.append(test[0])
return failed
def __init__(self, locations, test=False):
def __init__(self, locations, test=False, optimize=False):
self._tests_failed = False
self.optimize = optimize
# Define a token
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
locations)
@ -182,38 +183,52 @@ class SearchQueryParser(object):
# empty the list of searches used for recursion testing
self.recurse_level = 0
self.searches_seen = set([])
return self._parse(query)
candidates = self.universal_set()
return self._parse(query, candidates)
# this parse is used internally because it doesn't clear the
# recursive search test list. However, we permit seeing the
# same search a few times because the search might appear within
# another search.
def _parse(self, query):
def _parse(self, query, candidates=None):
self.recurse_level += 1
res = self._parser.parseString(query)[0]
t = self.evaluate(res)
if candidates is None:
candidates = self.universal_set()
t = self.evaluate(res, candidates)
self.recurse_level -= 1
return t
def method(self, group_name):
return getattr(self, 'evaluate_'+group_name)
def evaluate(self, parse_result):
return self.method(parse_result.getName())(parse_result)
def evaluate(self, parse_result, candidates):
return self.method(parse_result.getName())(parse_result, candidates)
def evaluate_and(self, argument):
return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
def evaluate_and(self, argument, candidates):
# RHS checks only those items matched by LHS
# returns result of RHS check: RHmatches(LHmatches(c))
# return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
l = self.evaluate(argument[0], candidates)
return l.intersection(self.evaluate(argument[1], l))
def evaluate_or(self, argument):
return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
def evaluate_or(self, argument, candidates):
# RHS checks only those elements not matched by LHS
# returns LHS union RHS: LHmatches(c) + RHmatches(c-LHmatches(c))
# return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
l = self.evaluate(argument[0], candidates)
return l.union(self.evaluate(argument[1], candidates.difference(l)))
def evaluate_not(self, argument):
return self.universal_set().difference(self.evaluate(argument[0]))
def evaluate_not(self, argument, candidates):
# unary op checks only candidates. Result: list of items matching
# returns: c - matches(c)
# return self.universal_set().difference(self.evaluate(argument[0]))
return candidates.difference(self.evaluate(argument[0], candidates))
def evaluate_parenthesis(self, argument):
return self.evaluate(argument[0])
def evaluate_parenthesis(self, argument, candidates):
return self.evaluate(argument[0], candidates)
def evaluate_token(self, argument):
def evaluate_token(self, argument, candidates):
location = argument[0]
query = argument[1]
if location.lower() == 'search':
@ -224,17 +239,27 @@ class SearchQueryParser(object):
raise ParseException(query, len(query), 'undefined saved search', self)
if self.recurse_level > 5:
self.searches_seen.add(query)
return self._parse(saved_searches().lookup(query))
return self._parse(saved_searches().lookup(query), candidates)
except: # convert all exceptions (e.g., missing key) to a parse error
raise ParseException(query, len(query), 'undefined saved search', self)
return self.get_matches(location, query)
return self._get_matches(location, query, candidates)
def get_matches(self, location, query):
def _get_matches(self, location, query, candidates):
if self.optimize:
return self.get_matches(location, query, candidates=candidates)
else:
return self.get_matches(location, query)
def get_matches(self, location, query, candidates=None):
'''
Should return the set of matches for :param:`location` and :param:`query`.
The search must be performed over all entries if :param:`candidates` is
None, otherwise only over the items in :param:`candidates`.
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
:param:`query` is a string literal.
:param:`candidates` is None or a subset of the set returned by :meth:`universal_set`.
'''
return set([])
@ -561,7 +586,7 @@ class Tester(SearchQueryParser):
def universal_set(self):
return self._universal_set
def get_matches(self, location, query):
def get_matches(self, location, query, candidates=None):
location = location.lower()
if location in self.fields.keys():
getter = operator.itemgetter(self.fields[location])
@ -573,8 +598,13 @@ class Tester(SearchQueryParser):
if not query:
return set([])
query = query.lower()
return set(key for key, val in self.texts.items() \
if query and query in getattr(getter(val), 'lower', lambda : '')())
if candidates:
return set(key for key, val in self.texts.items() \
if key in candidates and query and query
in getattr(getter(val), 'lower', lambda : '')())
else:
return set(key for key, val in self.texts.items() \
if query and query in getattr(getter(val), 'lower', lambda : '')())
@ -592,6 +622,7 @@ class Tester(SearchQueryParser):
def main(args=sys.argv):
print 'testing unoptimized'
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
@ -601,6 +632,16 @@ def main(args=sys.argv):
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
return 1
print '\n\ntesting optimized'
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
'all', 'search'], test=True, optimize=True)
failed = tester.run_tests()
if tester._tests_failed or failed:
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
return 1
return 0
if __name__ == '__main__':