Improvements to the new query parser. In particular, handle more degenerate cases where colons are significant in the query. Also make the implementation and the grammar more closely resemble each other.

Charles Haley 2013-04-18 11:33:44 +02:00
parent a5b66f0b1c
commit 5fc450313c
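
An illustration of one of the degenerate colon cases (a sketch, not part of the commit; the locations list is invented). Because the reworked lexer below no longer treats ':' as an operator character, a token made entirely of colons lexes as a single WORD; its first colon-separated "word" is empty, hence not a location, so the whole token falls through to the 'all' field:

    Parser().parse(':::', ['author', 'title'])
    # -> ['token', 'all', ':::']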


@@ -97,19 +97,28 @@ def saved_searches():
 '''
 Parse a search expression into a series of potentially recursive operations.
-The syntax is a bit twisted.
+Note that the interpreter wants binary operators, not n-ary ops. This is why we
+recurse instead of iterating when building sequences of the same op.
+The syntax is more than a bit twisted. In particular, the handling of colons
+in the base token requires semantic analysis.
+Also note that the query string is lowercased before analysis. This is OK because
+calibre's searches are all case-insensitive.
+Grammar:
 prog ::= or_expression
 or_expression ::= and_expression [ 'or' or_expression ]
-and_expression ::= not_expression [ ( [ 'and' ] and_expression ) | ( '(' or_expression ')' ) ]
-not_expression ::= [ 'not' ] base_token
-base_token ::= location_expression | ( '(' or_expression ')' )
-location_expression ::= [ word [ ':' word ]*
+and_expression ::= not_expression [ [ 'and' ] and_expression ]
+not_expression ::= [ 'not' ] location_expression
+location_expression ::= base_token | ( '(' or_expression ')' )
+base_token ::= a sequence of letters and colons, perhaps quoted
 '''

 class Parser(object):
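
A worked example of the revised grammar (a sketch; it assumes 'author' is a registered location, and that 'war' and 'peace' are not). 'token' nodes are [op, location, value] triples as produced by base_token() below:

    # 'not author:tolstoy or war and peace' parses to:
    ['or',
        ['not', ['token', 'author', 'tolstoy']],
        ['and', ['token', 'all', 'war'],
                ['token', 'all', 'peace']]]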
@@ -119,20 +128,24 @@ class Parser(object):
     OPCODE = 1
     WORD = 2
-    EOF = 3
+    QUOTED_WORD = 3
+    EOF = 4

     # Had to translate named constants to numeric values
     lex_scanner = re.Scanner([
-            (r'[():]', lambda x,t: (1, t)),
-            (r'[^ "():]+', lambda x,t: (2, unicode(t))),
-            (r'".*?((?<!\\)")', lambda x,t: (2, t[1:-1])),
+            (r'[()]', lambda x,t: (1, t)),
+            (r'[^ "()]+', lambda x,t: (2, unicode(t))),
+            (r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
             (r'\s', None)
     ], flags=re.DOTALL)

-    def token(self):
+    def token(self, advance=False):
         if self.is_eof():
             return None
-        return self.tokens[self.current_token][1]
+        res = self.tokens[self.current_token][1]
+        if advance:
+            self.current_token += 1
+        return res

     def token_type(self):
         if self.is_eof():
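
For reference, a self-contained sketch of how the new lexicon tokenizes (written Python-3 style, so the unicode() call is dropped; re.Scanner is the same undocumented stdlib helper used above):

    import re

    OPCODE, WORD, QUOTED_WORD = 1, 2, 3

    lex_scanner = re.Scanner([
            (r'[()]', lambda x, t: (OPCODE, t)),            # parens only; ':' no longer an opcode
            (r'[^ "()]+', lambda x, t: (WORD, t)),          # words, colons included
            (r'".*?((?<!\\)")', lambda x, t: (QUOTED_WORD, t[1:-1])),  # strip the quotes
            (r'\s', None)                                   # skip whitespace
    ], flags=re.DOTALL)

    tokens, rest = lex_scanner.scan('author:"One Two" (tag:war)')
    # tokens == [(2, 'author:'), (3, 'One Two'), (1, '('), (2, 'tag:war'), (1, ')')]

Note that 'author:' survives as a single WORD token; colons are now resolved semantically in base_token(), not syntactically by the lexer.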
@@ -152,13 +165,11 @@ class Parser(object):
         prog = self.or_expression()
         if not self.is_eof():
             raise ParseException(_('Extra characters at end of search'))
-        # prints(self.tokens, '\n', prog)
+        #prints(self.tokens, '\n', prog)
         return prog

     def or_expression(self):
         lhs = self.and_expression()
-        if self.is_eof():
-            return lhs
         if self.token() == 'or':
             self.advance()
             return ['or', lhs, self.or_expression()]
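
As the new docstring notes, the interpreter wants binary operators, so chains of the same operator nest by right recursion rather than becoming n-ary nodes; e.g. (a sketch, assuming none of a, b, c is a location):

    # 'a or b or c' ->
    ['or', ['token', 'all', 'a'],
           ['or', ['token', 'all', 'b'],
                  ['token', 'all', 'c']]]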
@@ -166,65 +177,59 @@ class Parser(object):
     def and_expression(self):
         lhs = self.not_expression()
-        if self.is_eof():
-            return lhs
         if self.token() == 'and':
             self.advance()
             return ['and', lhs, self.and_expression()]
         # Account for the optional 'and'
-        if self.token_type() == self.WORD and self.token() != 'or':
+        if self.token_type() in [self.WORD, self.QUOTED_WORD] and self.token() != 'or':
             return ['and', lhs, self.and_expression()]
-        elif self.token() == '(':
-            self.advance()
-            rhs = self.or_expression()
-            if self.token() != ')':
-                raise ParseException('missing )')
-            else:
-                self.advance();
-            return ['and', lhs, rhs]
         return lhs

     def not_expression(self):
         if self.token() == 'not':
             self.advance()
             return ['not', self.not_expression()]
-        return self.base_token()
-
-    def base_token(self):
-        if self.token() == '(':
-            self.advance()
-            res = self.or_expression()
-            if self.token() != ')':
-                raise ParseException('missing )')
-            self.advance()
-            return res
-        if self.token_type() != self.WORD:
-            raise ParseException('Invalid syntax. Expected a lookup name or a word')
         return self.location_expression()

     def location_expression(self):
-        loc = self.token()
-        self.advance()
-        if self.token() == ':':
-            if loc in self.locations:
-                val = ''
-            else:
-                val = loc + ':'
-                loc = 'all'
-            self.advance()
-            while True:
-                val += self.token()
-                self.advance()
-                if self.token() == ':':
-                    val += ':'
-                    self.advance()
-                else:
-                    break
-            return ['token', loc, val]
-        return ['token', 'all', loc]
+        if self.token() == '(':
+            self.advance()
+            res = self.or_expression()
+            if self.token(advance=True) != ')':
+                raise ParseException('missing )')
+            return res
+        if self.token_type() not in [ self.WORD, self.QUOTED_WORD ]:
+            raise ParseException('Invalid syntax. Expected a lookup name or a word')
+        return self.base_token()
+
+    def base_token(self):
+        if self.token_type() == self.QUOTED_WORD:
+            return ['token', 'all', self.token(advance=True)]
+
+        words = self.token(advance=True).split(':')
+
+        # The complexity here comes from having colon-separated search
+        # values. That forces us to check that the first "word" in a colon-
+        # separated group is a valid location. If not, then the token must
+        # be reconstructed. We also have the problem that locations can be
+        # followed by quoted strings that appear as the next token, and that
+        # tokens can be a sequence of colons.
+
+        # We have a location if there is more than one word and the first
+        # word is in locations. This check could produce a "wrong" answer if
+        # the search string is something like 'author: "foo"' because it
+        # will be interpreted as 'author:"foo"'. I am choosing to accept the
+        # possible error. The expression should be written '"author:" foo'
+        if len(words) > 1 and words[0] in self.locations:
+            loc = words[0]
+            words = words[1:]
+            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
+                return ['token', loc, self.token(advance=True)]
+            return ['token', loc, ':'.join(words)]
+
+        return ['token', 'all', ':'.join(words)]

 class SearchQueryParser(object):
     '''
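
A few worked inputs against the base_token() logic above (my examples, assuming 'author' is the only registered location):

    # 'author:tolstoy' -> ['token', 'author', 'tolstoy']
    # 'tolstoy:war'    -> ['token', 'all', 'tolstoy:war']   ('tolstoy' is not a location)
    # 'author:"a: b"'  -> ['token', 'author', 'a: b']       (quoted value arrives as the next token)
    # '"author:" foo'  -> ['and', ['token', 'all', 'author:'],
    #                             ['token', 'all', 'foo']]  (the escape hatch named in the comment)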
@@ -264,6 +269,7 @@ class SearchQueryParser(object):
     def __init__(self, locations, test=False, optimize=False):
         self.sqp_initialize(locations, test=test, optimize=optimize)
+        self.parser = Parser()

     def sqp_change_locations(self, locations):
         self.sqp_initialize(locations, optimize=self.optimize)
@@ -287,7 +293,7 @@ class SearchQueryParser(object):
     def _parse(self, query, candidates=None):
         self.recurse_level += 1
         try:
-            res = Parser().parse(query, self.locations)
+            res = self.parser.parse(query, self.locations)
         except RuntimeError:
             raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
         if candidates is None: