mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Improvements to the new query parser. In particular, handle more degenerate cases where colons are significant in the query. Also make the implementation and the grammar more closely resemble each other.
This commit is contained in:
parent
a5b66f0b1c
commit
5fc450313c
@ -97,19 +97,28 @@ def saved_searches():
|
|||||||
'''
|
'''
|
||||||
Parse a search expression into a series of potentially recursive operations.
|
Parse a search expression into a series of potentially recursive operations.
|
||||||
|
|
||||||
The syntax is a bit twisted.
|
Note that the interpreter wants binary operators, not n-ary ops. This is why we
|
||||||
|
recurse instead of iterating when building sequences of the same op.
|
||||||
|
|
||||||
|
The syntax is more than a bit twisted. In particular, the handling of colons
|
||||||
|
in the base token requires semantic analysis.
|
||||||
|
|
||||||
|
Also note that the query string is lowercased before analysis. This is OK because
|
||||||
|
calibre's searches are all case-insensitive.
|
||||||
|
|
||||||
|
Grammar:
|
||||||
|
|
||||||
prog ::= or_expression
|
prog ::= or_expression
|
||||||
|
|
||||||
or_expression ::= and_expression [ 'or' or_expression ]
|
or_expression ::= and_expression [ 'or' or_expression ]
|
||||||
|
|
||||||
and_expression ::= not_expression [ ( [ 'and' ] and_expression ) | ( '(' or_expression ')' ) ]
|
and_expression ::= not_expression [ [ 'and' ] and_expression ]
|
||||||
|
|
||||||
not_expression ::= [ 'not' ] base_token
|
not_expression ::= [ 'not' ] location_expression
|
||||||
|
|
||||||
base_token ::= location_expression | ( '(' or_expression ')' )
|
location_expression ::= base_token | ( '(' or_expression ')' )
|
||||||
|
|
||||||
location_expression ::= [ word [ ':' word ]*
|
base_token ::= a sequence of letters and colons, perhaps quoted
|
||||||
'''
|
'''
|
||||||
class Parser(object):
|
class Parser(object):
|
||||||
|
|
||||||
@ -119,20 +128,24 @@ class Parser(object):
|
|||||||
|
|
||||||
OPCODE = 1
|
OPCODE = 1
|
||||||
WORD = 2
|
WORD = 2
|
||||||
EOF = 3
|
QUOTED_WORD = 3
|
||||||
|
EOF = 4
|
||||||
|
|
||||||
# Had to translate named constants to numeric values
|
# Had to translate named constants to numeric values
|
||||||
lex_scanner = re.Scanner([
|
lex_scanner = re.Scanner([
|
||||||
(r'[():]', lambda x,t: (1, t)),
|
(r'[()]', lambda x,t: (1, t)),
|
||||||
(r'[^ "():]+', lambda x,t: (2, unicode(t))),
|
(r'[^ "()]+', lambda x,t: (2, unicode(t))),
|
||||||
(r'".*?((?<!\\)")', lambda x,t: (2, t[1:-1])),
|
(r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
|
||||||
(r'\s', None)
|
(r'\s', None)
|
||||||
], flags=re.DOTALL)
|
], flags=re.DOTALL)
|
||||||
|
|
||||||
def token(self):
|
def token(self, advance=False):
|
||||||
if self.is_eof():
|
if self.is_eof():
|
||||||
return None
|
return None
|
||||||
return self.tokens[self.current_token][1]
|
res = self.tokens[self.current_token][1]
|
||||||
|
if advance:
|
||||||
|
self.current_token += 1
|
||||||
|
return res
|
||||||
|
|
||||||
def token_type(self):
|
def token_type(self):
|
||||||
if self.is_eof():
|
if self.is_eof():
|
||||||
@ -152,13 +165,11 @@ class Parser(object):
|
|||||||
prog = self.or_expression()
|
prog = self.or_expression()
|
||||||
if not self.is_eof():
|
if not self.is_eof():
|
||||||
raise ParseException(_('Extra characters at end of search'))
|
raise ParseException(_('Extra characters at end of search'))
|
||||||
# prints(self.tokens, '\n', prog)
|
#prints(self.tokens, '\n', prog)
|
||||||
return prog
|
return prog
|
||||||
|
|
||||||
def or_expression(self):
|
def or_expression(self):
|
||||||
lhs = self.and_expression()
|
lhs = self.and_expression()
|
||||||
if self.is_eof():
|
|
||||||
return lhs
|
|
||||||
if self.token() == 'or':
|
if self.token() == 'or':
|
||||||
self.advance()
|
self.advance()
|
||||||
return ['or', lhs, self.or_expression()]
|
return ['or', lhs, self.or_expression()]
|
||||||
@ -166,65 +177,59 @@ class Parser(object):
|
|||||||
|
|
||||||
def and_expression(self):
|
def and_expression(self):
|
||||||
lhs = self.not_expression()
|
lhs = self.not_expression()
|
||||||
if self.is_eof():
|
|
||||||
return lhs
|
|
||||||
if self.token() == 'and':
|
if self.token() == 'and':
|
||||||
self.advance()
|
self.advance()
|
||||||
return ['and', lhs, self.and_expression()]
|
return ['and', lhs, self.and_expression()]
|
||||||
|
|
||||||
# Account for the optional 'and'
|
# Account for the optional 'and'
|
||||||
if self.token_type() == self.WORD and self.token() != 'or':
|
if self.token_type() in [self.WORD, self.QUOTED_WORD] and self.token() != 'or':
|
||||||
return ['and', lhs, self.and_expression()]
|
return ['and', lhs, self.and_expression()]
|
||||||
elif self.token() == '(':
|
|
||||||
self.advance()
|
|
||||||
rhs = self.or_expression()
|
|
||||||
if self.token() != ')':
|
|
||||||
raise ParseException('missing )')
|
|
||||||
else:
|
|
||||||
self.advance();
|
|
||||||
return ['and', lhs, rhs]
|
|
||||||
return lhs
|
return lhs
|
||||||
|
|
||||||
def not_expression(self):
|
def not_expression(self):
|
||||||
if self.token() == 'not':
|
if self.token() == 'not':
|
||||||
self.advance()
|
self.advance()
|
||||||
return ['not', self.not_expression()]
|
return ['not', self.not_expression()]
|
||||||
return self.base_token()
|
|
||||||
|
|
||||||
def base_token(self):
|
|
||||||
if self.token() == '(':
|
|
||||||
self.advance()
|
|
||||||
res = self.or_expression()
|
|
||||||
if self.token() != ')':
|
|
||||||
raise ParseException('missing )')
|
|
||||||
self.advance()
|
|
||||||
return res
|
|
||||||
if self.token_type() != self.WORD:
|
|
||||||
raise ParseException('Invalid syntax. Expected a lookup name or a word')
|
|
||||||
|
|
||||||
return self.location_expression()
|
return self.location_expression()
|
||||||
|
|
||||||
def location_expression(self):
|
def location_expression(self):
|
||||||
loc = self.token()
|
if self.token() == '(':
|
||||||
self.advance()
|
|
||||||
if self.token() == ':':
|
|
||||||
if loc in self.locations:
|
|
||||||
val = ''
|
|
||||||
else:
|
|
||||||
val = loc + ':'
|
|
||||||
loc = 'all'
|
|
||||||
|
|
||||||
self.advance()
|
self.advance()
|
||||||
while True:
|
res = self.or_expression()
|
||||||
val += self.token()
|
if self.token(advance=True) != ')':
|
||||||
self.advance()
|
raise ParseException('missing )')
|
||||||
if self.token() == ':':
|
return res
|
||||||
val += ':'
|
if self.token_type() not in [ self.WORD, self.QUOTED_WORD ]:
|
||||||
self.advance()
|
raise ParseException('Invalid syntax. Expected a lookup name or a word')
|
||||||
else:
|
|
||||||
break
|
return self.base_token()
|
||||||
return ['token', loc, val]
|
|
||||||
return ['token', 'all', loc]
|
def base_token(self):
|
||||||
|
if self.token_type() == self.QUOTED_WORD:
|
||||||
|
return ['token', 'all', self.token(advance=True)]
|
||||||
|
|
||||||
|
words = self.token(advance=True).split(':')
|
||||||
|
|
||||||
|
# The complexity here comes from having colon-separated search
|
||||||
|
# values. That forces us to check that the first "word" in a colon-
|
||||||
|
# separated group is a valid location. If not, then the token must
|
||||||
|
# be reconstructed. We also have the problem that locations can be
|
||||||
|
# followed by quoted strings that appear as the next token. and that
|
||||||
|
# tokens can be a sequence of colons.
|
||||||
|
|
||||||
|
# We have a location if there is more than one word and the first
|
||||||
|
# word is in locations. This check could produce a "wrong" answer if
|
||||||
|
# the search string is something like 'author: "foo"' because it
|
||||||
|
# will be interpreted as 'author:"foo"'. I am choosing to accept the
|
||||||
|
# possible error. The expression should be written '"author:" foo'
|
||||||
|
if len(words) > 1 and words[0] in self.locations:
|
||||||
|
loc = words[0]
|
||||||
|
words = words[1:]
|
||||||
|
if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
|
||||||
|
return ['token', loc, self.token(advance=True)]
|
||||||
|
return ['token', loc, ':'.join(words)]
|
||||||
|
|
||||||
|
return ['token', 'all', ':'.join(words)]
|
||||||
|
|
||||||
class SearchQueryParser(object):
|
class SearchQueryParser(object):
|
||||||
'''
|
'''
|
||||||
@ -264,6 +269,7 @@ class SearchQueryParser(object):
|
|||||||
|
|
||||||
def __init__(self, locations, test=False, optimize=False):
|
def __init__(self, locations, test=False, optimize=False):
|
||||||
self.sqp_initialize(locations, test=test, optimize=optimize)
|
self.sqp_initialize(locations, test=test, optimize=optimize)
|
||||||
|
self.parser = Parser()
|
||||||
|
|
||||||
def sqp_change_locations(self, locations):
|
def sqp_change_locations(self, locations):
|
||||||
self.sqp_initialize(locations, optimize=self.optimize)
|
self.sqp_initialize(locations, optimize=self.optimize)
|
||||||
@ -287,7 +293,7 @@ class SearchQueryParser(object):
|
|||||||
def _parse(self, query, candidates=None):
|
def _parse(self, query, candidates=None):
|
||||||
self.recurse_level += 1
|
self.recurse_level += 1
|
||||||
try:
|
try:
|
||||||
res = Parser().parse(query, self.locations)
|
res = self.parser.parse(query, self.locations)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
|
raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
|
||||||
if candidates is None:
|
if candidates is None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user