mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Test replacement search query parser
This commit is contained in:
		
							parent
							
								
									3631011901
								
							
						
					
					
						commit
						a5b66f0b1c
					
				
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@ -16,11 +16,9 @@ methods :method:`SearchQueryParser.universal_set` and
 | 
				
			|||||||
If this module is run, it will perform a series of unit tests.
 | 
					If this module is run, it will perform a series of unit tests.
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import sys, operator, weakref
 | 
					import sys, operator, weakref, re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from calibre.utils.pyparsing import (CaselessKeyword, Group, Forward,
 | 
					from calibre.utils.pyparsing import ParseException
 | 
				
			||||||
        CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral,
 | 
					 | 
				
			||||||
        Optional, NoMatch, ParseException, QuotedString)
 | 
					 | 
				
			||||||
from calibre.constants import preferred_encoding
 | 
					from calibre.constants import preferred_encoding
 | 
				
			||||||
from calibre.utils.icu import sort_key
 | 
					from calibre.utils.icu import sort_key
 | 
				
			||||||
from calibre import prints
 | 
					from calibre import prints
 | 
				
			||||||
@ -96,6 +94,138 @@ def saved_searches():
 | 
				
			|||||||
    global ss
 | 
					    global ss
 | 
				
			||||||
    return ss
 | 
					    return ss
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					Parse a search expression into a series of potentially recursive operations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The syntax is a bit twisted.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					prog ::= or_expression
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					or_expression ::= and_expression [ 'or' or_expression ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					and_expression ::= not_expression [ ( [ 'and' ] and_expression ) | ( '(' or_expression ')' ) ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					not_expression ::= [ 'not' ] base_token
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					base_token ::= location_expression | ( '(' or_expression ')' )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					location_expression ::= word [ ':' word ]*
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					class Parser(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def __init__(self):
 | 
				
			||||||
 | 
					            self.current_token = 0
 | 
				
			||||||
 | 
					            self.tokens = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        OPCODE = 1
 | 
				
			||||||
 | 
					        WORD = 2
 | 
				
			||||||
 | 
					        EOF = 3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Had to translate named constants to numeric values
 | 
				
			||||||
 | 
					        lex_scanner = re.Scanner([
 | 
				
			||||||
 | 
					                (r'[():]',            lambda x,t: (1, t)),
 | 
				
			||||||
 | 
					                (r'[^ "():]+',        lambda x,t: (2, unicode(t))),
 | 
				
			||||||
 | 
					                (r'".*?((?<!\\)")',   lambda x,t: (2, t[1:-1])),
 | 
				
			||||||
 | 
					                (r'\s',               None)
 | 
				
			||||||
 | 
					        ], flags=re.DOTALL)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def token(self):
 | 
				
			||||||
 | 
					            if self.is_eof():
 | 
				
			||||||
 | 
					                return None
 | 
				
			||||||
 | 
					            return self.tokens[self.current_token][1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def token_type(self):
 | 
				
			||||||
 | 
					            if self.is_eof():
 | 
				
			||||||
 | 
					                return self.EOF
 | 
				
			||||||
 | 
					            return self.tokens[self.current_token][0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def is_eof(self):
 | 
				
			||||||
 | 
					            return self.current_token >= len(self.tokens)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def advance(self):
 | 
				
			||||||
 | 
					            self.current_token += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def parse(self, expr, locations):
 | 
				
			||||||
 | 
					            self.locations = locations
 | 
				
			||||||
 | 
					            self.tokens = self.lex_scanner.scan(icu_lower(expr))[0]
 | 
				
			||||||
 | 
					            self.current_token = 0
 | 
				
			||||||
 | 
					            prog = self.or_expression()
 | 
				
			||||||
 | 
					            if not self.is_eof():
 | 
				
			||||||
 | 
					                raise ParseException(_('Extra characters at end of search'))
 | 
				
			||||||
 | 
					            # prints(self.tokens, '\n', prog)
 | 
				
			||||||
 | 
					            return prog
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def or_expression(self):
 | 
				
			||||||
 | 
					            lhs = self.and_expression()
 | 
				
			||||||
 | 
					            if self.is_eof():
 | 
				
			||||||
 | 
					                return lhs
 | 
				
			||||||
 | 
					            if self.token() == 'or':
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                return ['or', lhs, self.or_expression()]
 | 
				
			||||||
 | 
					            return lhs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def and_expression(self):
 | 
				
			||||||
 | 
					            lhs = self.not_expression()
 | 
				
			||||||
 | 
					            if self.is_eof():
 | 
				
			||||||
 | 
					                return lhs
 | 
				
			||||||
 | 
					            if self.token() == 'and':
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                return ['and', lhs, self.and_expression()]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Account for the optional 'and'
 | 
				
			||||||
 | 
					            if self.token_type() == self.WORD and self.token() != 'or':
 | 
				
			||||||
 | 
					                return ['and', lhs, self.and_expression()]
 | 
				
			||||||
 | 
					            elif self.token() == '(':
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                rhs = self.or_expression()
 | 
				
			||||||
 | 
					                if self.token() != ')':
 | 
				
			||||||
 | 
					                    raise ParseException('missing )')
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    self.advance();
 | 
				
			||||||
 | 
					                    return ['and', lhs, rhs]
 | 
				
			||||||
 | 
					            return lhs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def not_expression(self):
 | 
				
			||||||
 | 
					            if self.token() == 'not':
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                return ['not', self.not_expression()]
 | 
				
			||||||
 | 
					            return self.base_token()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def base_token(self):
 | 
				
			||||||
 | 
					            if self.token() == '(':
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                res = self.or_expression()
 | 
				
			||||||
 | 
					                if self.token() != ')':
 | 
				
			||||||
 | 
					                    raise ParseException('missing )')
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                return res
 | 
				
			||||||
 | 
					            if self.token_type() != self.WORD:
 | 
				
			||||||
 | 
					                raise ParseException('Invalid syntax. Expected a lookup name or a word')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            return self.location_expression()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def location_expression(self):
 | 
				
			||||||
 | 
					            loc = self.token()
 | 
				
			||||||
 | 
					            self.advance()
 | 
				
			||||||
 | 
					            if self.token() == ':':
 | 
				
			||||||
 | 
					                if loc in self.locations:
 | 
				
			||||||
 | 
					                    val = ''
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    val = loc + ':'
 | 
				
			||||||
 | 
					                    loc = 'all'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                self.advance()
 | 
				
			||||||
 | 
					                while True:
 | 
				
			||||||
 | 
					                    val += self.token()
 | 
				
			||||||
 | 
					                    self.advance()
 | 
				
			||||||
 | 
					                    if self.token() == ':':
 | 
				
			||||||
 | 
					                        val += ':'
 | 
				
			||||||
 | 
					                        self.advance()
 | 
				
			||||||
 | 
					                    else:
 | 
				
			||||||
 | 
					                        break
 | 
				
			||||||
 | 
					                return ['token', loc, val]
 | 
				
			||||||
 | 
					            return ['token', 'all', loc]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class SearchQueryParser(object):
 | 
					class SearchQueryParser(object):
 | 
				
			||||||
    '''
 | 
					    '''
 | 
				
			||||||
    Parses a search query.
 | 
					    Parses a search query.
 | 
				
			||||||
@ -139,65 +269,9 @@ class SearchQueryParser(object):
 | 
				
			|||||||
        self.sqp_initialize(locations, optimize=self.optimize)
 | 
					        self.sqp_initialize(locations, optimize=self.optimize)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def sqp_initialize(self, locations, test=False, optimize=False):
 | 
					    def sqp_initialize(self, locations, test=False, optimize=False):
 | 
				
			||||||
 | 
					        self.locations = locations
 | 
				
			||||||
        self._tests_failed = False
 | 
					        self._tests_failed = False
 | 
				
			||||||
        self.optimize = optimize
 | 
					        self.optimize = optimize
 | 
				
			||||||
        # Define a token
 | 
					 | 
				
			||||||
        standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
 | 
					 | 
				
			||||||
                locations)
 | 
					 | 
				
			||||||
        location = NoMatch()
 | 
					 | 
				
			||||||
        for l in standard_locations:
 | 
					 | 
				
			||||||
            location |= l
 | 
					 | 
				
			||||||
        location     = Optional(location, default='all')
 | 
					 | 
				
			||||||
        word_query   = CharsNotIn(u'\t\r\n\u00a0 ' + u'()')
 | 
					 | 
				
			||||||
        #quoted_query = Suppress('"')+CharsNotIn('"')+Suppress('"')
 | 
					 | 
				
			||||||
        quoted_query = QuotedString('"', escChar='\\')
 | 
					 | 
				
			||||||
        query        = quoted_query | word_query
 | 
					 | 
				
			||||||
        Token        = Group(location + query).setResultsName('token')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if test:
 | 
					 | 
				
			||||||
            print 'Testing Token parser:'
 | 
					 | 
				
			||||||
            Token.validate()
 | 
					 | 
				
			||||||
            failed = SearchQueryParser.run_tests(Token, 'token',
 | 
					 | 
				
			||||||
                (
 | 
					 | 
				
			||||||
                 ('tag:asd',           ['tag', 'asd']),
 | 
					 | 
				
			||||||
                 (u'ddsä',              ['all', u'ddsä']),
 | 
					 | 
				
			||||||
                 ('"one \\"two"',         ['all', 'one "two']),
 | 
					 | 
				
			||||||
                 ('title:"one \\"1.5\\" two"',   ['title', 'one "1.5" two']),
 | 
					 | 
				
			||||||
                 ('title:abc"def', ['title', 'abc"def']),
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Or = Forward()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Parenthesis = Group(
 | 
					 | 
				
			||||||
                        Suppress('(') + Or + Suppress(')')
 | 
					 | 
				
			||||||
                        ).setResultsName('parenthesis') | Token
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Not = Forward()
 | 
					 | 
				
			||||||
        Not << (Group(
 | 
					 | 
				
			||||||
            Suppress(CaselessKeyword("not")) + Not
 | 
					 | 
				
			||||||
        ).setResultsName("not") | Parenthesis)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        And = Forward()
 | 
					 | 
				
			||||||
        And << (Group(
 | 
					 | 
				
			||||||
            Not + Suppress(CaselessKeyword("and")) + And
 | 
					 | 
				
			||||||
        ).setResultsName("and") | Group(
 | 
					 | 
				
			||||||
            Not + OneOrMore(~MatchFirst(list(map(CaselessKeyword,
 | 
					 | 
				
			||||||
                ('and', 'or')))) + And)
 | 
					 | 
				
			||||||
        ).setResultsName("and") | Not)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Or << (Group(
 | 
					 | 
				
			||||||
            And + Suppress(CaselessKeyword("or")) + Or
 | 
					 | 
				
			||||||
        ).setResultsName("or") | And)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if test:
 | 
					 | 
				
			||||||
            #Or.validate()
 | 
					 | 
				
			||||||
            self._tests_failed = bool(failed)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self._parser = Or
 | 
					 | 
				
			||||||
        self._parser.setDebug(False)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def parse(self, query):
 | 
					    def parse(self, query):
 | 
				
			||||||
        # empty the list of searches used for recursion testing
 | 
					        # empty the list of searches used for recursion testing
 | 
				
			||||||
@ -213,9 +287,8 @@ class SearchQueryParser(object):
 | 
				
			|||||||
    def _parse(self, query, candidates=None):
 | 
					    def _parse(self, query, candidates=None):
 | 
				
			||||||
        self.recurse_level += 1
 | 
					        self.recurse_level += 1
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            res = self._parser.parseString(query)[0]
 | 
					            res = Parser().parse(query, self.locations)
 | 
				
			||||||
        except RuntimeError:
 | 
					        except RuntimeError:
 | 
				
			||||||
            import repr
 | 
					 | 
				
			||||||
            raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
 | 
					            raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
 | 
				
			||||||
        if candidates is None:
 | 
					        if candidates is None:
 | 
				
			||||||
            candidates = self.universal_set()
 | 
					            candidates = self.universal_set()
 | 
				
			||||||
@ -227,7 +300,7 @@ class SearchQueryParser(object):
 | 
				
			|||||||
        return getattr(self, 'evaluate_'+group_name)
 | 
					        return getattr(self, 'evaluate_'+group_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def evaluate(self, parse_result, candidates):
 | 
					    def evaluate(self, parse_result, candidates):
 | 
				
			||||||
        return self.method(parse_result.getName())(parse_result, candidates)
 | 
					        return self.method(parse_result[0])(parse_result[1:], candidates)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def evaluate_and(self, argument, candidates):
 | 
					    def evaluate_and(self, argument, candidates):
 | 
				
			||||||
        # RHS checks only those items matched by LHS
 | 
					        # RHS checks only those items matched by LHS
 | 
				
			||||||
@ -249,8 +322,8 @@ class SearchQueryParser(object):
 | 
				
			|||||||
        #  return self.universal_set().difference(self.evaluate(argument[0]))
 | 
					        #  return self.universal_set().difference(self.evaluate(argument[0]))
 | 
				
			||||||
        return candidates.difference(self.evaluate(argument[0], candidates))
 | 
					        return candidates.difference(self.evaluate(argument[0], candidates))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def evaluate_parenthesis(self, argument, candidates):
 | 
					#     def evaluate_parenthesis(self, argument, candidates):
 | 
				
			||||||
        return self.evaluate(argument[0], candidates)
 | 
					#         return self.evaluate(argument[0], candidates)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def evaluate_token(self, argument, candidates):
 | 
					    def evaluate_token(self, argument, candidates):
 | 
				
			||||||
        location = argument[0]
 | 
					        location = argument[0]
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user