Merge new search query parser from calibre_temp

This commit is contained in:
commit 4fc41455ea
Binary file not shown. (Before: 1.1 KiB)
@@ -195,13 +195,13 @@ class DateSearch(object): # {{{
             try:
                 qd = now() - timedelta(int(num))
             except:
-                raise ParseException(query, len(query), 'Number conversion error')
+                raise ParseException(_('Number conversion error: {0}').format(num))
             field_count = 3
         else:
             try:
                 qd = parse_date(query, as_utc=False)
             except:
-                raise ParseException(query, len(query), 'Date conversion error')
+                raise ParseException(_('Date conversion error: {0}').format(query))
             if '-' in query:
                 field_count = query.count('-') + 1
             else:
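
The sketch below (not part of the diff) shows how the replacement exception is meant to be used: unlike pyparsing's ParseException(pstr, loc, msg), the new class added later in this commit takes a single, already-translated message and exposes it through a .msg property. The `_` stand-in here is hypothetical; inside calibre it is the translation function.

    # Standalone illustration of the new exception style
    _ = lambda s: s  # stand-in for calibre's translation function

    class ParseException(Exception):

        @property
        def msg(self):
            if len(self.args) > 0:
                return self.args[0]
            return ""

    try:
        raise ParseException(_('Number conversion error: {0}').format('abc'))
    except ParseException as e:
        print(e.msg)  # -> Number conversion error: abc
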
@@ -285,8 +285,8 @@ class NumericSearch(object): # {{{
         try:
             q = cast(query) * mult
         except:
-            raise ParseException(query, len(query),
-                    'Non-numeric value in query: %r'%query)
+            raise ParseException(
+                    _('Non-numeric value in query: {0}').format(query))
 
         for val, book_ids in field_iter():
             if val is None:
@@ -351,8 +351,8 @@ class KeyPairSearch(object): # {{{
         if ':' in query:
             q = [q.strip() for q in query.split(':')]
             if len(q) != 2:
-                raise ParseException(query, len(query),
-                    'Invalid query format for colon-separated search')
+                raise ParseException(
+                    _('Invalid query format for colon-separated search: {0}').format(query))
             keyq, valq = q
             keyq_mkind, keyq = _matchkind(keyq)
             valq_mkind, valq = _matchkind(valq)
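
For orientation, a small standalone sketch (not part of the diff) of the colon-separated form this class expects: roughly speaking, a key/value search such as identifiers:isbn:9780061120084 arrives here with the location already stripped, and must split into exactly one key and one value, otherwise the translatable exception above is raised. The identifier value is invented for the example.

    query = 'isbn:9780061120084'
    q = [part.strip() for part in query.split(':')]
    if len(q) != 2:
        raise ValueError('Invalid query format for colon-separated search: %s' % query)
    keyq, valq = q
    print((keyq, valq))  # -> ('isbn', '9780061120084')
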
@@ -465,7 +465,8 @@ class Parser(SearchQueryParser):
                     if invert:
                         matches = self.all_book_ids - matches
                     return matches
-                raise ParseException(query, len(query), 'Recursive query group detected')
+                raise ParseException(
+                    _('Recursive query group detected: {0}').format(query))
 
             # If the user has asked to restrict searching over all field, apply
             # that restriction
@@ -12,7 +12,7 @@ from PyQt4.Qt import (QAbstractTableModel, Qt, pyqtSignal, QIcon, QImage,
         QModelIndex, QVariant, QDateTime, QColor, QPixmap)
 
 from calibre.gui2 import NONE, UNDEFINED_QDATETIME, error_dialog
-from calibre.utils.pyparsing import ParseException
+from calibre.utils.search_query_parser import ParseException
 from calibre.ebooks.metadata import fmt_sidx, authors_to_string, string_to_authors
 from calibre.ebooks.metadata.book.base import SafeFormat
 from calibre.ptempfile import PersistentTemporaryFile
@@ -13,7 +13,7 @@ from PyQt4.Qt import (
 from calibre.gui2 import error_dialog, question_dialog
 from calibre.gui2.widgets import ComboBoxWithHelp
 from calibre.utils.icu import sort_key
-from calibre.utils.pyparsing import ParseException
+from calibre.utils.search_query_parser import ParseException
 from calibre.utils.search_query_parser import saved_searches
 
 class SelectNames(QDialog): # {{{
@@ -299,7 +299,7 @@ class SearchRestrictionMixin(object):
 
     def __init__(self):
         self.checked = QIcon(I('ok.png'))
-        self.empty = QIcon(I('empty.png'))
+        self.empty = QIcon(I('blank.png'))
         self.search_based_vl_name = None
         self.search_based_vl = None
 
@@ -14,7 +14,7 @@ from threading import Thread
 from calibre.utils.config import tweaks, prefs
 from calibre.utils.date import parse_date, now, UNDEFINED_DATE, clean_date_for_sort
 from calibre.utils.search_query_parser import SearchQueryParser
-from calibre.utils.pyparsing import ParseException
+from calibre.utils.search_query_parser import ParseException
 from calibre.utils.localization import (canonicalize_lang, lang_map, get_udc)
 from calibre.db.search import CONTAINS_MATCH, EQUALS_MATCH, REGEXP_MATCH, _match
 from calibre.ebooks.metadata import title_sort, author_to_author_sort
@@ -366,25 +366,18 @@ class ResultCache(SearchQueryParser): # {{{
         elif query in self.local_thismonth:
             qd = now()
             field_count = 2
-        elif query.endswith(self.local_daysago):
+        elif query.endswith(self.local_daysago) or query.endswith(self.untrans_daysago):
             num = query[0:-self.local_daysago_len]
             try:
                 qd = now() - timedelta(int(num))
             except:
-                raise ParseException(query, len(query), 'Number conversion error', self)
-            field_count = 3
-        elif query.endswith(self.untrans_daysago):
-            num = query[0:-self.untrans_daysago_len]
-            try:
-                qd = now() - timedelta(int(num))
-            except:
-                raise ParseException(query, len(query), 'Number conversion error', self)
+                raise ParseException(_('Number conversion error: {0}').format(num))
             field_count = 3
         else:
             try:
                 qd = parse_date(query, as_utc=False)
             except:
-                raise ParseException(query, len(query), 'Date conversion error', self)
+                raise ParseException(_('Date conversion error: {0}').format(query))
             if '-' in query:
                 field_count = query.count('-') + 1
             else:
@@ -460,8 +453,7 @@ class ResultCache(SearchQueryParser): # {{{
         try:
             q = cast(query) * mult
         except:
-            raise ParseException(query, len(query),
-                    'Non-numeric value in query', self)
+            raise ParseException(_('Non-numeric value in query: {0}').format(query))
 
         for id_ in candidates:
             item = self._data[id_]
@@ -501,12 +493,13 @@ class ResultCache(SearchQueryParser): # {{{
         return matches
 
     def get_keypair_matches(self, location, query, candidates):
+        print query
         matches = set([])
         if query.find(':') >= 0:
             q = [q.strip() for q in query.split(':')]
             if len(q) != 2:
-                raise ParseException(query, len(query),
-                    'Invalid query format for colon-separated search', self)
+                raise ParseException(
+                    _('Invalid query format for colon-separated search: {0}').format(query))
             (keyq, valq) = q
             keyq_mkind, keyq = self._matchkind(keyq)
             valq_mkind, valq = self._matchkind(valq)
@@ -655,7 +648,7 @@ class ResultCache(SearchQueryParser): # {{{
                     if invert:
                         matches = self.universal_set() - matches
                     return matches
-                raise ParseException(query, len(query), 'Recursive query group detected', self)
+                raise ParseException(_('Recursive query group detected: {0}').format(query))
 
         # apply the limit if appropriate
         if location == 'all' and prefs['limit_search_columns'] and \

File diff suppressed because it is too large
@@ -16,11 +16,8 @@ methods :method:`SearchQueryParser.universal_set` and
 If this module is run, it will perform a series of unit tests.
 '''
 
-import sys, operator, weakref
+import sys, operator, weakref, re
 
-from calibre.utils.pyparsing import (CaselessKeyword, Group, Forward,
-        CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral,
-        Optional, NoMatch, ParseException, QuotedString)
 from calibre.constants import preferred_encoding
 from calibre.utils.icu import sort_key
 from calibre import prints
@@ -96,6 +93,151 @@ def saved_searches():
     global ss
     return ss
 
+'''
+Parse a search expression into a series of potentially recursive operations.
+
+Note that the interpreter wants binary operators, not n-ary ops. This is why we
+recurse instead of iterating when building sequences of the same op.
+
+The syntax is more than a bit twisted. In particular, the handling of colons
+in the base token requires semantic analysis.
+
+Also note that the query string is lowercased before analysis. This is OK because
+calibre's searches are all case-insensitive.
+
+Grammar:
+
+prog ::= or_expression
+
+or_expression ::= and_expression [ 'or' or_expression ]
+
+and_expression ::= not_expression [ [ 'and' ] and_expression ]
+
+not_expression ::= [ 'not' ] location_expression
+
+location_expression ::= base_token | ( '(' or_expression ')' )
+
+base_token ::= a sequence of letters and colons, perhaps quoted
+'''
+class Parser(object):
+
+    def __init__(self):
+        self.current_token = 0
+        self.tokens = None
+
+    OPCODE = 1
+    WORD = 2
+    QUOTED_WORD = 3
+    EOF = 4
+
+    # Had to translate named constants to numeric values
+    lex_scanner = re.Scanner([
+            (r'[()]', lambda x,t: (1, t)),
+            (r'[^ "()]+', lambda x,t: (2, unicode(t))),
+            (r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
+            (r'\s', None)
+        ], flags=re.DOTALL)
+
+    def token(self, advance=False):
+        if self.is_eof():
+            return None
+        res = self.tokens[self.current_token][1]
+        if advance:
+            self.current_token += 1
+        return res
+
+    def token_type(self):
+        if self.is_eof():
+            return self.EOF
+        return self.tokens[self.current_token][0]
+
+    def is_eof(self):
+        return self.current_token >= len(self.tokens)
+
+    def advance(self):
+        self.current_token += 1
+
+    def parse(self, expr, locations):
+        self.locations = locations
+        self.tokens = self.lex_scanner.scan(icu_lower(expr))[0]
+        self.current_token = 0
+        prog = self.or_expression()
+        if not self.is_eof():
+            raise ParseException(_('Extra characters at end of search'))
+        #prints(self.tokens, '\n', prog)
+        return prog
+
+    def or_expression(self):
+        lhs = self.and_expression()
+        if self.token() == 'or':
+            self.advance()
+            return ['or', lhs, self.or_expression()]
+        return lhs
+
+    def and_expression(self):
+        lhs = self.not_expression()
+        if self.token() == 'and':
+            self.advance()
+            return ['and', lhs, self.and_expression()]
+
+        # Account for the optional 'and'
+        if self.token_type() in [self.WORD, self.QUOTED_WORD] and self.token() != 'or':
+            return ['and', lhs, self.and_expression()]
+        return lhs
+
+    def not_expression(self):
+        if self.token() == 'not':
+            self.advance()
+            return ['not', self.not_expression()]
+        return self.location_expression()
+
+    def location_expression(self):
+        if self.token() == '(':
+            self.advance()
+            res = self.or_expression()
+            if self.token(advance=True) != ')':
+                raise ParseException(_('missing )'))
+            return res
+        if self.token_type() not in [ self.WORD, self.QUOTED_WORD ]:
+            raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))
+
+        return self.base_token()
+
+    def base_token(self):
+        if self.token_type() == self.QUOTED_WORD:
+            return ['token', 'all', self.token(advance=True)]
+
+        words = self.token(advance=True).split(':')
+
+        # The complexity here comes from having colon-separated search
+        # values. That forces us to check that the first "word" in a colon-
+        # separated group is a valid location. If not, then the token must
+        # be reconstructed. We also have the problem that locations can be
+        # followed by quoted strings that appear as the next token. and that
+        # tokens can be a sequence of colons.
+
+        # We have a location if there is more than one word and the first
+        # word is in locations. This check could produce a "wrong" answer if
+        # the search string is something like 'author: "foo"' because it
+        # will be interpreted as 'author:"foo"'. I am choosing to accept the
+        # possible error. The expression should be written '"author:" foo'
+        if len(words) > 1 and words[0] in self.locations:
+            loc = words[0]
+            words = words[1:]
+            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
+                return ['token', loc, self.token(advance=True)]
+            return ['token', loc, ':'.join(words)]
+
+        return ['token', 'all', ':'.join(words)]
+
+class ParseException(Exception):
+
+    @property
+    def msg(self):
+        if len(self.args) > 0:
+            return self.args[0]
+        return ""
+
 class SearchQueryParser(object):
     '''
     Parses a search query.
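
A quick standalone illustration (not part of the diff) of what the new recursive-descent parser returns. It assumes the Parser class above is importable from calibre.utils.search_query_parser and is running inside calibre, where icu_lower, _ and the lookup names are available; the query and the locations list are made up for the example.

    from calibre.utils.search_query_parser import Parser

    p = Parser()
    tree = p.parse('author:tolkien and not title:"the hobbit"',
                   ['author', 'title'])
    print(tree)
    # Expected shape, given the methods above:
    # ['and', ['token', 'author', 'tolkien'],
    #         ['not', ['token', 'title', 'the hobbit']]]
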
@@ -134,70 +276,15 @@ class SearchQueryParser(object):
 
     def __init__(self, locations, test=False, optimize=False):
         self.sqp_initialize(locations, test=test, optimize=optimize)
+        self.parser = Parser()
 
     def sqp_change_locations(self, locations):
         self.sqp_initialize(locations, optimize=self.optimize)
 
     def sqp_initialize(self, locations, test=False, optimize=False):
         self.locations = locations
         self._tests_failed = False
         self.optimize = optimize
-        # Define a token
-        standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
-                locations)
-        location = NoMatch()
-        for l in standard_locations:
-            location |= l
-        location = Optional(location, default='all')
-        word_query = CharsNotIn(u'\t\r\n\u00a0 ' + u'()')
-        #quoted_query = Suppress('"')+CharsNotIn('"')+Suppress('"')
-        quoted_query = QuotedString('"', escChar='\\')
-        query = quoted_query | word_query
-        Token = Group(location + query).setResultsName('token')
-
-        if test:
-            print 'Testing Token parser:'
-            Token.validate()
-            failed = SearchQueryParser.run_tests(Token, 'token',
-                (
-                    ('tag:asd', ['tag', 'asd']),
-                    (u'ddsä', ['all', u'ddsä']),
-                    ('"one \\"two"', ['all', 'one "two']),
-                    ('title:"one \\"1.5\\" two"', ['title', 'one "1.5" two']),
-                    ('title:abc"def', ['title', 'abc"def']),
-                )
-            )
-
-        Or = Forward()
-
-        Parenthesis = Group(
-            Suppress('(') + Or + Suppress(')')
-        ).setResultsName('parenthesis') | Token
-
-
-        Not = Forward()
-        Not << (Group(
-            Suppress(CaselessKeyword("not")) + Not
-        ).setResultsName("not") | Parenthesis)
-
-        And = Forward()
-        And << (Group(
-            Not + Suppress(CaselessKeyword("and")) + And
-        ).setResultsName("and") | Group(
-            Not + OneOrMore(~MatchFirst(list(map(CaselessKeyword,
-                ('and', 'or')))) + And)
-        ).setResultsName("and") | Not)
-
-        Or << (Group(
-            And + Suppress(CaselessKeyword("or")) + Or
-        ).setResultsName("or") | And)
-
-        if test:
-            #Or.validate()
-            self._tests_failed = bool(failed)
-
-        self._parser = Or
-        self._parser.setDebug(False)
-
 
     def parse(self, query):
         # empty the list of searches used for recursion testing
@@ -213,10 +300,9 @@ class SearchQueryParser(object):
     def _parse(self, query, candidates=None):
         self.recurse_level += 1
         try:
-            res = self._parser.parseString(query)[0]
+            res = self.parser.parse(query, self.locations)
        except RuntimeError:
-            import repr
-            raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
+            raise ParseException(_('Failed to parse query, recursion limit reached: %s')%repr(query))
         if candidates is None:
             candidates = self.universal_set()
         t = self.evaluate(res, candidates)
@@ -227,7 +313,7 @@ class SearchQueryParser(object):
         return getattr(self, 'evaluate_'+group_name)
 
     def evaluate(self, parse_result, candidates):
-        return self.method(parse_result.getName())(parse_result, candidates)
+        return self.method(parse_result[0])(parse_result[1:], candidates)
 
     def evaluate_and(self, argument, candidates):
         # RHS checks only those items matched by LHS
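
The change above switches evaluate() from pyparsing's named ParseResults to the plain nested lists produced by the new Parser (['or'|'and'|'not'|'token', ...]). Below is a standalone sketch (not calibre's code) of how such a tree can be walked by dispatching on node[0]; the tiny in-memory "library" and the match_token callback are invented for the example:

    def evaluate(node, candidates, match_token):
        op, args = node[0], node[1:]
        if op == 'and':
            # RHS only needs to check the items already matched by LHS
            lhs = evaluate(args[0], candidates, match_token)
            return lhs & evaluate(args[1], lhs, match_token)
        if op == 'or':
            return evaluate(args[0], candidates, match_token) | \
                   evaluate(args[1], candidates, match_token)
        if op == 'not':
            return candidates - evaluate(args[0], candidates, match_token)
        return match_token(args[0], args[1], candidates)  # op == 'token'

    books = {1: 'tolkien', 2: 'austen'}
    tree = ['and', ['token', 'author', 'tolkien'],
                   ['not', ['token', 'author', 'austen']]]
    print(evaluate(tree, set(books),
                   lambda loc, q, cands: {i for i in cands if q in books[i]}))
    # -> set([1])
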
@@ -249,8 +335,8 @@ class SearchQueryParser(object):
 #        return self.universal_set().difference(self.evaluate(argument[0]))
         return candidates.difference(self.evaluate(argument[0], candidates))
 
-    def evaluate_parenthesis(self, argument, candidates):
-        return self.evaluate(argument[0], candidates)
+    # def evaluate_parenthesis(self, argument, candidates):
+    #     return self.evaluate(argument[0], candidates)
 
     def evaluate_token(self, argument, candidates):
         location = argument[0]
@@ -260,12 +346,16 @@ class SearchQueryParser(object):
                 query = query[1:]
             try:
                 if query in self.searches_seen:
-                    raise ParseException(query, len(query), 'undefined saved search', self)
+                    raise ParseException(_('Recursive saved search: {0}').format(query))
                 if self.recurse_level > 5:
                     self.searches_seen.add(query)
                 return self._parse(saved_searches().lookup(query), candidates)
+            except ParseException as e:
+                raise e
             except: # convert all exceptions (e.g., missing key) to a parse error
-                raise ParseException(query, len(query), 'undefined saved search', self)
+                import traceback
+                traceback.print_exc()
+                raise ParseException(_('Unknown error in saved search: {0}').format(query))
         return self._get_matches(location, query, candidates)
 
     def _get_matches(self, location, query, candidates):
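
A short standalone sketch (not calibre's code) of why the searches_seen guard above exists: a saved search that refers back to itself would otherwise be expanded forever, so the new code reports it with a dedicated, translatable message instead of the old generic 'undefined saved search'. The saved-search table and names here are invented:

    saved = {'fiction': 'tags:fiction or search:"=good-fiction"',
             'good-fiction': 'rating:>3 and search:"=fiction"'}
    seen = set()

    def expand(name):
        if name in seen:
            raise ValueError('Recursive saved search: {0}'.format(name))
        seen.add(name)
        expr = saved[name]
        # a real implementation re-parses expr; this sketch only follows
        # the first search:"=name" reference to show the cycle being caught
        if 'search:"=' in expr:
            ref = expr.split('search:"=', 1)[1].split('"', 1)[0]
            return expand(ref)
        return expr

    expand('fiction')  # -> ValueError: Recursive saved search: fiction
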
@@ -19,7 +19,7 @@ from calibre.web.feeds.recipes.collection import \
         SchedulerConfig, download_builtin_recipe, update_custom_recipe, \
         add_custom_recipe, remove_custom_recipe, get_custom_recipe, \
         get_builtin_recipe
-from calibre.utils.pyparsing import ParseException
+from calibre.utils.search_query_parser import ParseException
 
 class NewsTreeItem(object):
 