From 5fc450313ceb35aa8e7191c98c23f789d89b34b3 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Thu, 18 Apr 2013 11:33:44 +0200
Subject: [PATCH] Improvements to the new query parser. In particular, handle
 more degenerate cases where colons are significant in the query. Also make
 the implementation and the grammer more closely resemble each other.

---
 src/calibre/utils/search_query_parser.py | 122 ++++++++++++-----------
 1 file changed, 64 insertions(+), 58 deletions(-)

diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py
index ecf3e11ce8..29573cb9c9 100644
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@@ -97,19 +97,28 @@ def saved_searches():
 '''
 Parse a search expression into a series of potentially recursive operations.
 
-The syntax is a bit twisted.
+Note that the interpreter wants binary operators, not n-ary ops. This is why we
+recurse instead of iterating when building sequences of the same op.
+
+The syntax is more than a bit twisted. In particular, the handling of colons
+in the base token requires semantic analysis.
+
+Also note that the query string is lowercased before analysis. This is OK because
+calibre's searches are all case-insensitive.
+
+Grammar:
 
 prog ::= or_expression
 
 or_expression ::= and_expression [ 'or' or_expression ]
 
-and_expression ::= not_expression [ ( [ 'and' ] and_expression ) | ( '(' or_expression ')' ) ]
+and_expression ::= not_expression [ [ 'and' ] and_expression ]
 
-not_expression ::= [ 'not' ] base_token
+not_expression ::= [ 'not' ] location_expression
 
-base_token ::= location_expression | ( '(' or_expression ')' )
+location_expression ::= base_token | ( '(' or_expression ')' )
 
-location_expression ::= [ word [ ':'  word ]*
+base_token ::= a sequence of letters and colons, perhaps quoted
 '''
 class Parser(object):
 
@@ -119,20 +128,24 @@ class Parser(object):
 
         OPCODE = 1
         WORD = 2
-        EOF = 3
+        QUOTED_WORD = 3
+        EOF = 4
 
         # Had to translate named constants to numeric values
         lex_scanner = re.Scanner([
-                (r'[():]',            lambda x,t: (1, t)),
-                (r'[^ "():]+',        lambda x,t: (2, unicode(t))),
-                (r'".*?((?<!\\)")',   lambda x,t: (2, t[1:-1])),
+                (r'[()]',             lambda x,t: (1, t)),
+                (r'[^ "()]+',         lambda x,t: (2, unicode(t))),
+                (r'".*?((?<!\\)")',   lambda x,t: (3, t[1:-1])),
                 (r'\s',               None)
         ], flags=re.DOTALL)
 
-        def token(self):
+        def token(self, advance=False):
             if self.is_eof():
                 return None
-            return self.tokens[self.current_token][1]
+            res = self.tokens[self.current_token][1]
+            if advance:
+                self.current_token += 1
+            return res
 
         def token_type(self):
             if self.is_eof():
@@ -152,13 +165,11 @@ class Parser(object):
             prog = self.or_expression()
             if not self.is_eof():
                 raise ParseException(_('Extra characters at end of search'))
-            # prints(self.tokens, '\n', prog)
+            #prints(self.tokens, '\n', prog)
             return prog
 
         def or_expression(self):
             lhs = self.and_expression()
-            if self.is_eof():
-                return lhs
             if self.token() == 'or':
                 self.advance()
                 return ['or', lhs, self.or_expression()]
@@ -166,65 +177,59 @@ class Parser(object):
 
         def and_expression(self):
             lhs = self.not_expression()
-            if self.is_eof():
-                return lhs
             if self.token() == 'and':
                 self.advance()
                 return ['and', lhs, self.and_expression()]
 
             # Account for the optional 'and'
-            if self.token_type() == self.WORD and self.token() != 'or':
+            if self.token_type() in [self.WORD, self.QUOTED_WORD] and self.token() != 'or':
                 return ['and', lhs, self.and_expression()]
-            elif self.token() == '(':
-                self.advance()
-                rhs = self.or_expression()
-                if self.token() != ')':
-                    raise ParseException('missing )')
-                else:
-                    self.advance();
-                    return ['and', lhs, rhs]
             return lhs
 
         def not_expression(self):
             if self.token() == 'not':
                 self.advance()
                 return ['not', self.not_expression()]
-            return self.base_token()
-
-        def base_token(self):
-            if self.token() == '(':
-                self.advance()
-                res = self.or_expression()
-                if self.token() != ')':
-                    raise ParseException('missing )')
-                self.advance()
-                return res
-            if self.token_type() != self.WORD:
-                raise ParseException('Invalid syntax. Expected a lookup name or a word')
-
             return self.location_expression()
 
         def location_expression(self):
-            loc = self.token()
-            self.advance()
-            if self.token() == ':':
-                if loc in self.locations:
-                    val = ''
-                else:
-                    val = loc + ':'
-                    loc = 'all'
-
+            if self.token() == '(':
                 self.advance()
-                while True:
-                    val += self.token()
-                    self.advance()
-                    if self.token() == ':':
-                        val += ':'
-                        self.advance()
-                    else:
-                        break
-                return ['token', loc, val]
-            return ['token', 'all', loc]
+                res = self.or_expression()
+                if self.token(advance=True) != ')':
+                    raise ParseException('missing )')
+                return res
+            if self.token_type() not in [ self.WORD, self.QUOTED_WORD ]:
+                raise ParseException('Invalid syntax. Expected a lookup name or a word')
+
+            return self.base_token()
+
+        def base_token(self):
+            if self.token_type() == self.QUOTED_WORD:
+                return ['token', 'all', self.token(advance=True)]
+
+            words = self.token(advance=True).split(':')
+
+            # The complexity here comes from having colon-separated search
+            # values. That forces us to check that the first "word" in a colon-
+            # separated group is a valid location. If not, then the token must
+            # be reconstructed. We also have the problem that locations can be
+            # followed by quoted strings that appear as the next token. and that
+            # tokens can be a sequence of colons.
+
+            # We have a location if there is more than one word and the first
+            # word is in locations. This check could produce a "wrong" answer if
+            # the search string is something like 'author: "foo"' because it
+            # will be interpreted as 'author:"foo"'. I am choosing to accept the
+            # possible error. The expression should be written '"author:" foo'
+            if len(words) > 1 and words[0] in self.locations:
+                loc = words[0]
+                words = words[1:]
+                if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
+                    return ['token', loc, self.token(advance=True)]
+                return ['token', loc, ':'.join(words)]
+
+            return ['token', 'all', ':'.join(words)]
 
 class SearchQueryParser(object):
     '''
@@ -264,6 +269,7 @@ class SearchQueryParser(object):
 
     def __init__(self, locations, test=False, optimize=False):
         self.sqp_initialize(locations, test=test, optimize=optimize)
+        self.parser = Parser()
 
     def sqp_change_locations(self, locations):
         self.sqp_initialize(locations, optimize=self.optimize)
@@ -287,7 +293,7 @@ class SearchQueryParser(object):
     def _parse(self, query, candidates=None):
         self.recurse_level += 1
         try:
-            res = Parser().parse(query, self.locations)
+            res = self.parser.parse(query, self.locations)
         except RuntimeError:
             raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
         if candidates is None: