From 5fc450313ceb35aa8e7191c98c23f789d89b34b3 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Thu, 18 Apr 2013 11:33:44 +0200 Subject: [PATCH] Improvements to the new query parser. In particular, handle more degenerate cases where colons are significant in the query. Also make the implementation and the grammar more closely resemble each other. --- src/calibre/utils/search_query_parser.py | 122 ++++++++++++----------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py index ecf3e11ce8..29573cb9c9 100644 --- a/src/calibre/utils/search_query_parser.py +++ b/src/calibre/utils/search_query_parser.py @@ -97,19 +97,28 @@ def saved_searches(): ''' Parse a search expression into a series of potentially recursive operations. -The syntax is a bit twisted. +Note that the interpreter wants binary operators, not n-ary ops. This is why we +recurse instead of iterating when building sequences of the same op. + +The syntax is more than a bit twisted. In particular, the handling of colons +in the base token requires semantic analysis. + +Also note that the query string is lowercased before analysis. This is OK because +calibre's searches are all case-insensitive. 
+ +Grammar: prog ::= or_expression or_expression ::= and_expression [ 'or' or_expression ] -and_expression ::= not_expression [ ( [ 'and' ] and_expression ) | ( '(' or_expression ')' ) ] +and_expression ::= not_expression [ [ 'and' ] and_expression ] -not_expression ::= [ 'not' ] base_token +not_expression ::= [ 'not' ] location_expression -base_token ::= location_expression | ( '(' or_expression ')' ) +location_expression ::= base_token | ( '(' or_expression ')' ) -location_expression ::= [ word [ ':' word ]* +base_token ::= a sequence of letters and colons, perhaps quoted ''' class Parser(object): @@ -119,20 +128,24 @@ class Parser(object): OPCODE = 1 WORD = 2 - EOF = 3 + QUOTED_WORD = 3 + EOF = 4 # Had to translate named constants to numeric values lex_scanner = re.Scanner([ - (r'[():]', lambda x,t: (1, t)), - (r'[^ "():]+', lambda x,t: (2, unicode(t))), - (r'".*?((? 1 and words[0] in self.locations: + loc = words[0] + words = words[1:] + if len(words) == 1 and self.token_type() == self.QUOTED_WORD: + return ['token', loc, self.token(advance=True)] + return ['token', loc, ':'.join(words)] + + return ['token', 'all', ':'.join(words)] class SearchQueryParser(object): ''' @@ -264,6 +269,7 @@ class SearchQueryParser(object): def __init__(self, locations, test=False, optimize=False): self.sqp_initialize(locations, test=test, optimize=optimize) + self.parser = Parser() def sqp_change_locations(self, locations): self.sqp_initialize(locations, optimize=self.optimize) @@ -287,7 +293,7 @@ class SearchQueryParser(object): def _parse(self, query, candidates=None): self.recurse_level += 1 try: - res = Parser().parse(query, self.locations) + res = self.parser.parse(query, self.locations) except RuntimeError: raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query)) if candidates is None: