mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Improvements to the new query parser. In particular, handle more degenerate cases where colons are significant in the query. Also make the implementation and the grammar more closely resemble each other.
This commit is contained in:
parent
a5b66f0b1c
commit
5fc450313c
@ -97,19 +97,28 @@ def saved_searches():
|
|||||||
'''
|
'''
|
||||||
Parse a search expression into a series of potentially recursive operations.
|
Parse a search expression into a series of potentially recursive operations.
|
||||||
|
|
||||||
The syntax is a bit twisted.
|
Note that the interpreter wants binary operators, not n-ary ops. This is why we
|
||||||
|
recurse instead of iterating when building sequences of the same op.
|
||||||
|
|
||||||
|
The syntax is more than a bit twisted. In particular, the handling of colons
|
||||||
|
in the base token requires semantic analysis.
|
||||||
|
|
||||||
|
Also note that the query string is lowercased before analysis. This is OK because
|
||||||
|
calibre's searches are all case-insensitive.
|
||||||
|
|
||||||
|
Grammar:
|
||||||
|
|
||||||
prog ::= or_expression
|
prog ::= or_expression
|
||||||
|
|
||||||
or_expression ::= and_expression [ 'or' or_expression ]
|
or_expression ::= and_expression [ 'or' or_expression ]
|
||||||
|
|
||||||
and_expression ::= not_expression [ ( [ 'and' ] and_expression ) | ( '(' or_expression ')' ) ]
|
and_expression ::= not_expression [ [ 'and' ] and_expression ]
|
||||||
|
|
||||||
not_expression ::= [ 'not' ] base_token
|
not_expression ::= [ 'not' ] location_expression
|
||||||
|
|
||||||
base_token ::= location_expression | ( '(' or_expression ')' )
|
location_expression ::= base_token | ( '(' or_expression ')' )
|
||||||
|
|
||||||
location_expression ::= [ word [ ':' word ]*
|
base_token ::= a sequence of letters and colons, perhaps quoted
|
||||||
'''
|
'''
|
||||||
class Parser(object):
|
class Parser(object):
|
||||||
|
|
||||||
@ -119,20 +128,24 @@ class Parser(object):
|
|||||||
|
|
||||||
OPCODE = 1
|
OPCODE = 1
|
||||||
WORD = 2
|
WORD = 2
|
||||||
EOF = 3
|
QUOTED_WORD = 3
|
||||||
|
EOF = 4
|
||||||
|
|
||||||
# Had to translate named constants to numeric values
|
# Had to translate named constants to numeric values
|
||||||
lex_scanner = re.Scanner([
|
lex_scanner = re.Scanner([
|
||||||
(r'[():]', lambda x,t: (1, t)),
|
(r'[()]', lambda x,t: (1, t)),
|
||||||
(r'[^ "():]+', lambda x,t: (2, unicode(t))),
|
(r'[^ "()]+', lambda x,t: (2, unicode(t))),
|
||||||
(r'".*?((?<!\\)")', lambda x,t: (2, t[1:-1])),
|
(r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
|
||||||
(r'\s', None)
|
(r'\s', None)
|
||||||
], flags=re.DOTALL)
|
], flags=re.DOTALL)
|
||||||
|
|
||||||
def token(self):
|
def token(self, advance=False):
|
||||||
if self.is_eof():
|
if self.is_eof():
|
||||||
return None
|
return None
|
||||||
return self.tokens[self.current_token][1]
|
res = self.tokens[self.current_token][1]
|
||||||
|
if advance:
|
||||||
|
self.current_token += 1
|
||||||
|
return res
|
||||||
|
|
||||||
def token_type(self):
|
def token_type(self):
|
||||||
if self.is_eof():
|
if self.is_eof():
|
||||||
@ -152,13 +165,11 @@ class Parser(object):
|
|||||||
prog = self.or_expression()
|
prog = self.or_expression()
|
||||||
if not self.is_eof():
|
if not self.is_eof():
|
||||||
raise ParseException(_('Extra characters at end of search'))
|
raise ParseException(_('Extra characters at end of search'))
|
||||||
# prints(self.tokens, '\n', prog)
|
#prints(self.tokens, '\n', prog)
|
||||||
return prog
|
return prog
|
||||||
|
|
||||||
def or_expression(self):
|
def or_expression(self):
|
||||||
lhs = self.and_expression()
|
lhs = self.and_expression()
|
||||||
if self.is_eof():
|
|
||||||
return lhs
|
|
||||||
if self.token() == 'or':
|
if self.token() == 'or':
|
||||||
self.advance()
|
self.advance()
|
||||||
return ['or', lhs, self.or_expression()]
|
return ['or', lhs, self.or_expression()]
|
||||||
@ -166,65 +177,59 @@ class Parser(object):
|
|||||||
|
|
||||||
def and_expression(self):
|
def and_expression(self):
|
||||||
lhs = self.not_expression()
|
lhs = self.not_expression()
|
||||||
if self.is_eof():
|
|
||||||
return lhs
|
|
||||||
if self.token() == 'and':
|
if self.token() == 'and':
|
||||||
self.advance()
|
self.advance()
|
||||||
return ['and', lhs, self.and_expression()]
|
return ['and', lhs, self.and_expression()]
|
||||||
|
|
||||||
# Account for the optional 'and'
|
# Account for the optional 'and'
|
||||||
if self.token_type() == self.WORD and self.token() != 'or':
|
if self.token_type() in [self.WORD, self.QUOTED_WORD] and self.token() != 'or':
|
||||||
return ['and', lhs, self.and_expression()]
|
return ['and', lhs, self.and_expression()]
|
||||||
elif self.token() == '(':
|
|
||||||
self.advance()
|
|
||||||
rhs = self.or_expression()
|
|
||||||
if self.token() != ')':
|
|
||||||
raise ParseException('missing )')
|
|
||||||
else:
|
|
||||||
self.advance();
|
|
||||||
return ['and', lhs, rhs]
|
|
||||||
return lhs
|
return lhs
|
||||||
|
|
||||||
def not_expression(self):
|
def not_expression(self):
|
||||||
if self.token() == 'not':
|
if self.token() == 'not':
|
||||||
self.advance()
|
self.advance()
|
||||||
return ['not', self.not_expression()]
|
return ['not', self.not_expression()]
|
||||||
return self.base_token()
|
|
||||||
|
|
||||||
def base_token(self):
|
|
||||||
if self.token() == '(':
|
|
||||||
self.advance()
|
|
||||||
res = self.or_expression()
|
|
||||||
if self.token() != ')':
|
|
||||||
raise ParseException('missing )')
|
|
||||||
self.advance()
|
|
||||||
return res
|
|
||||||
if self.token_type() != self.WORD:
|
|
||||||
raise ParseException('Invalid syntax. Expected a lookup name or a word')
|
|
||||||
|
|
||||||
return self.location_expression()
|
return self.location_expression()
|
||||||
|
|
||||||
def location_expression(self):
|
def location_expression(self):
|
||||||
loc = self.token()
|
if self.token() == '(':
|
||||||
self.advance()
|
|
||||||
if self.token() == ':':
|
|
||||||
if loc in self.locations:
|
|
||||||
val = ''
|
|
||||||
else:
|
|
||||||
val = loc + ':'
|
|
||||||
loc = 'all'
|
|
||||||
|
|
||||||
self.advance()
|
self.advance()
|
||||||
while True:
|
res = self.or_expression()
|
||||||
val += self.token()
|
if self.token(advance=True) != ')':
|
||||||
self.advance()
|
raise ParseException('missing )')
|
||||||
if self.token() == ':':
|
return res
|
||||||
val += ':'
|
if self.token_type() not in [ self.WORD, self.QUOTED_WORD ]:
|
||||||
self.advance()
|
raise ParseException('Invalid syntax. Expected a lookup name or a word')
|
||||||
else:
|
|
||||||
break
|
return self.base_token()
|
||||||
return ['token', loc, val]
|
|
||||||
return ['token', 'all', loc]
|
def base_token(self):
|
||||||
|
if self.token_type() == self.QUOTED_WORD:
|
||||||
|
return ['token', 'all', self.token(advance=True)]
|
||||||
|
|
||||||
|
words = self.token(advance=True).split(':')
|
||||||
|
|
||||||
|
# The complexity here comes from having colon-separated search
|
||||||
|
# values. That forces us to check that the first "word" in a colon-
|
||||||
|
# separated group is a valid location. If not, then the token must
|
||||||
|
# be reconstructed. We also have the problem that locations can be
|
||||||
|
# followed by quoted strings that appear as the next token. and that
|
||||||
|
# tokens can be a sequence of colons.
|
||||||
|
|
||||||
|
# We have a location if there is more than one word and the first
|
||||||
|
# word is in locations. This check could produce a "wrong" answer if
|
||||||
|
# the search string is something like 'author: "foo"' because it
|
||||||
|
# will be interpreted as 'author:"foo"'. I am choosing to accept the
|
||||||
|
# possible error. The expression should be written '"author:" foo'
|
||||||
|
if len(words) > 1 and words[0] in self.locations:
|
||||||
|
loc = words[0]
|
||||||
|
words = words[1:]
|
||||||
|
if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
|
||||||
|
return ['token', loc, self.token(advance=True)]
|
||||||
|
return ['token', loc, ':'.join(words)]
|
||||||
|
|
||||||
|
return ['token', 'all', ':'.join(words)]
|
||||||
|
|
||||||
class SearchQueryParser(object):
|
class SearchQueryParser(object):
|
||||||
'''
|
'''
|
||||||
@ -264,6 +269,7 @@ class SearchQueryParser(object):
|
|||||||
|
|
||||||
def __init__(self, locations, test=False, optimize=False):
|
def __init__(self, locations, test=False, optimize=False):
|
||||||
self.sqp_initialize(locations, test=test, optimize=optimize)
|
self.sqp_initialize(locations, test=test, optimize=optimize)
|
||||||
|
self.parser = Parser()
|
||||||
|
|
||||||
def sqp_change_locations(self, locations):
|
def sqp_change_locations(self, locations):
|
||||||
self.sqp_initialize(locations, optimize=self.optimize)
|
self.sqp_initialize(locations, optimize=self.optimize)
|
||||||
@ -287,7 +293,7 @@ class SearchQueryParser(object):
|
|||||||
def _parse(self, query, candidates=None):
|
def _parse(self, query, candidates=None):
|
||||||
self.recurse_level += 1
|
self.recurse_level += 1
|
||||||
try:
|
try:
|
||||||
res = Parser().parse(query, self.locations)
|
res = self.parser.parse(query, self.locations)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
|
raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
|
||||||
if candidates is None:
|
if candidates is None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user