Merge new search query parser from calibre_temp

This commit is contained in:
Charles Haley 2013-04-20 10:13:41 +02:00
commit 4fc41455ea
8 changed files with 180 additions and 3821 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

View File

@ -195,13 +195,13 @@ class DateSearch(object): # {{{
try: try:
qd = now() - timedelta(int(num)) qd = now() - timedelta(int(num))
except: except:
raise ParseException(query, len(query), 'Number conversion error') raise ParseException(_('Number conversion error: {0}').format(num))
field_count = 3 field_count = 3
else: else:
try: try:
qd = parse_date(query, as_utc=False) qd = parse_date(query, as_utc=False)
except: except:
raise ParseException(query, len(query), 'Date conversion error') raise ParseException(_('Date conversion error: {0}').format(query))
if '-' in query: if '-' in query:
field_count = query.count('-') + 1 field_count = query.count('-') + 1
else: else:
@ -285,8 +285,8 @@ class NumericSearch(object): # {{{
try: try:
q = cast(query) * mult q = cast(query) * mult
except: except:
raise ParseException(query, len(query), raise ParseException(
'Non-numeric value in query: %r'%query) _('Non-numeric value in query: {0}').format(query))
for val, book_ids in field_iter(): for val, book_ids in field_iter():
if val is None: if val is None:
@ -351,8 +351,8 @@ class KeyPairSearch(object): # {{{
if ':' in query: if ':' in query:
q = [q.strip() for q in query.split(':')] q = [q.strip() for q in query.split(':')]
if len(q) != 2: if len(q) != 2:
raise ParseException(query, len(query), raise ParseException(
'Invalid query format for colon-separated search') _('Invalid query format for colon-separated search: {0}').format(query))
keyq, valq = q keyq, valq = q
keyq_mkind, keyq = _matchkind(keyq) keyq_mkind, keyq = _matchkind(keyq)
valq_mkind, valq = _matchkind(valq) valq_mkind, valq = _matchkind(valq)
@ -465,7 +465,8 @@ class Parser(SearchQueryParser):
if invert: if invert:
matches = self.all_book_ids - matches matches = self.all_book_ids - matches
return matches return matches
raise ParseException(query, len(query), 'Recursive query group detected') raise ParseException(
_('Recursive query group detected: {0}').format(query))
# If the user has asked to restrict searching over all field, apply # If the user has asked to restrict searching over all field, apply
# that restriction # that restriction

View File

@ -12,7 +12,7 @@ from PyQt4.Qt import (QAbstractTableModel, Qt, pyqtSignal, QIcon, QImage,
QModelIndex, QVariant, QDateTime, QColor, QPixmap) QModelIndex, QVariant, QDateTime, QColor, QPixmap)
from calibre.gui2 import NONE, UNDEFINED_QDATETIME, error_dialog from calibre.gui2 import NONE, UNDEFINED_QDATETIME, error_dialog
from calibre.utils.pyparsing import ParseException from calibre.utils.search_query_parser import ParseException
from calibre.ebooks.metadata import fmt_sidx, authors_to_string, string_to_authors from calibre.ebooks.metadata import fmt_sidx, authors_to_string, string_to_authors
from calibre.ebooks.metadata.book.base import SafeFormat from calibre.ebooks.metadata.book.base import SafeFormat
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile

View File

@ -13,7 +13,7 @@ from PyQt4.Qt import (
from calibre.gui2 import error_dialog, question_dialog from calibre.gui2 import error_dialog, question_dialog
from calibre.gui2.widgets import ComboBoxWithHelp from calibre.gui2.widgets import ComboBoxWithHelp
from calibre.utils.icu import sort_key from calibre.utils.icu import sort_key
from calibre.utils.pyparsing import ParseException from calibre.utils.search_query_parser import ParseException
from calibre.utils.search_query_parser import saved_searches from calibre.utils.search_query_parser import saved_searches
class SelectNames(QDialog): # {{{ class SelectNames(QDialog): # {{{
@ -299,7 +299,7 @@ class SearchRestrictionMixin(object):
def __init__(self): def __init__(self):
self.checked = QIcon(I('ok.png')) self.checked = QIcon(I('ok.png'))
self.empty = QIcon(I('empty.png')) self.empty = QIcon(I('blank.png'))
self.search_based_vl_name = None self.search_based_vl_name = None
self.search_based_vl = None self.search_based_vl = None

View File

@ -14,7 +14,7 @@ from threading import Thread
from calibre.utils.config import tweaks, prefs from calibre.utils.config import tweaks, prefs
from calibre.utils.date import parse_date, now, UNDEFINED_DATE, clean_date_for_sort from calibre.utils.date import parse_date, now, UNDEFINED_DATE, clean_date_for_sort
from calibre.utils.search_query_parser import SearchQueryParser from calibre.utils.search_query_parser import SearchQueryParser
from calibre.utils.pyparsing import ParseException from calibre.utils.search_query_parser import ParseException
from calibre.utils.localization import (canonicalize_lang, lang_map, get_udc) from calibre.utils.localization import (canonicalize_lang, lang_map, get_udc)
from calibre.db.search import CONTAINS_MATCH, EQUALS_MATCH, REGEXP_MATCH, _match from calibre.db.search import CONTAINS_MATCH, EQUALS_MATCH, REGEXP_MATCH, _match
from calibre.ebooks.metadata import title_sort, author_to_author_sort from calibre.ebooks.metadata import title_sort, author_to_author_sort
@ -366,25 +366,18 @@ class ResultCache(SearchQueryParser): # {{{
elif query in self.local_thismonth: elif query in self.local_thismonth:
qd = now() qd = now()
field_count = 2 field_count = 2
elif query.endswith(self.local_daysago): elif query.endswith(self.local_daysago) or query.endswith(self.untrans_daysago):
num = query[0:-self.local_daysago_len] num = query[0:-self.local_daysago_len]
try: try:
qd = now() - timedelta(int(num)) qd = now() - timedelta(int(num))
except: except:
raise ParseException(query, len(query), 'Number conversion error', self) raise ParseException(_('Number conversion error: {0}').format(num))
field_count = 3
elif query.endswith(self.untrans_daysago):
num = query[0:-self.untrans_daysago_len]
try:
qd = now() - timedelta(int(num))
except:
raise ParseException(query, len(query), 'Number conversion error', self)
field_count = 3 field_count = 3
else: else:
try: try:
qd = parse_date(query, as_utc=False) qd = parse_date(query, as_utc=False)
except: except:
raise ParseException(query, len(query), 'Date conversion error', self) raise ParseException(_('Date conversion error: {0}').format(query))
if '-' in query: if '-' in query:
field_count = query.count('-') + 1 field_count = query.count('-') + 1
else: else:
@ -460,8 +453,7 @@ class ResultCache(SearchQueryParser): # {{{
try: try:
q = cast(query) * mult q = cast(query) * mult
except: except:
raise ParseException(query, len(query), raise ParseException(_('Non-numeric value in query: {0}').format(query))
'Non-numeric value in query', self)
for id_ in candidates: for id_ in candidates:
item = self._data[id_] item = self._data[id_]
@ -501,12 +493,13 @@ class ResultCache(SearchQueryParser): # {{{
return matches return matches
def get_keypair_matches(self, location, query, candidates): def get_keypair_matches(self, location, query, candidates):
print query
matches = set([]) matches = set([])
if query.find(':') >= 0: if query.find(':') >= 0:
q = [q.strip() for q in query.split(':')] q = [q.strip() for q in query.split(':')]
if len(q) != 2: if len(q) != 2:
raise ParseException(query, len(query), raise ParseException(
'Invalid query format for colon-separated search', self) _('Invalid query format for colon-separated search: {0}').format(query))
(keyq, valq) = q (keyq, valq) = q
keyq_mkind, keyq = self._matchkind(keyq) keyq_mkind, keyq = self._matchkind(keyq)
valq_mkind, valq = self._matchkind(valq) valq_mkind, valq = self._matchkind(valq)
@ -655,7 +648,7 @@ class ResultCache(SearchQueryParser): # {{{
if invert: if invert:
matches = self.universal_set() - matches matches = self.universal_set() - matches
return matches return matches
raise ParseException(query, len(query), 'Recursive query group detected', self) raise ParseException(_('Recursive query group detected: {0}').format(query))
# apply the limit if appropriate # apply the limit if appropriate
if location == 'all' and prefs['limit_search_columns'] and \ if location == 'all' and prefs['limit_search_columns'] and \

File diff suppressed because it is too large Load Diff

View File

@ -16,11 +16,8 @@ methods :method:`SearchQueryParser.universal_set` and
If this module is run, it will perform a series of unit tests. If this module is run, it will perform a series of unit tests.
''' '''
import sys, operator, weakref import sys, operator, weakref, re
from calibre.utils.pyparsing import (CaselessKeyword, Group, Forward,
CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral,
Optional, NoMatch, ParseException, QuotedString)
from calibre.constants import preferred_encoding from calibre.constants import preferred_encoding
from calibre.utils.icu import sort_key from calibre.utils.icu import sort_key
from calibre import prints from calibre import prints
@ -96,6 +93,151 @@ def saved_searches():
global ss global ss
return ss return ss
'''
Parse a search expression into a series of potentially recursive operations.
Note that the interpreter wants binary operators, not n-ary ops. This is why we
recurse instead of iterating when building sequences of the same op.
The syntax is more than a bit twisted. In particular, the handling of colons
in the base token requires semantic analysis.
Also note that the query string is lowercased before analysis. This is OK because
calibre's searches are all case-insensitive.
Grammar:
prog ::= or_expression
or_expression ::= and_expression [ 'or' or_expression ]
and_expression ::= not_expression [ [ 'and' ] and_expression ]
not_expression ::= ( 'not' not_expression ) | location_expression
location_expression ::= base_token | ( '(' or_expression ')' )
base_token ::= a sequence of letters and colons, perhaps quoted
'''
class Parser(object):
    '''
    Recursive-descent parser for search expressions.

    :meth:`parse` tokenizes a lowercased query and returns a nested list
    describing the expression. Every node is a list whose first element
    names the operation:

        ['or', lhs, rhs]
        ['and', lhs, rhs]
        ['not', operand]
        ['token', location, query_text]

    Operators are always binary (or unary for 'not'); a run of the same
    operator is represented by nesting on the right-hand side.
    '''

    def __init__(self):
        # Index into self.tokens of the token currently being examined.
        self.current_token = 0
        # List of (token_type, text) pairs produced by lex_scanner.
        self.tokens = None

    # Token type tags stored as the first element of each token pair.
    OPCODE = 1
    WORD = 2
    QUOTED_WORD = 3
    EOF = 4

    # Had to translate named constants to numeric values: the lambdas below
    # cannot see the class attributes above, so 1, 2 and 3 stand for OPCODE,
    # WORD and QUOTED_WORD respectively.
    lex_scanner = re.Scanner([
            # A single parenthesis is an operator token.
            (r'[()]', lambda x,t: (1, t)),
            # A bare word: anything up to a space, quote or parenthesis.
            (r'[^ "()]+', lambda x,t: (2, unicode(t))),
            # A quoted string ending at the first quote that is not preceded
            # by a backslash. The surrounding quotes are stripped.
            (r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
            # Whitespace separates tokens and is discarded.
            (r'\s', None)
        ], flags=re.DOTALL)

    def token(self, advance=False):
        '''
        Return the text of the current token, or None at end of input.

        :param advance: if True, move on to the next token after reading.
        '''
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return res

    def token_type(self):
        '''Return the type tag of the current token, or EOF at end of input.'''
        if self.is_eof():
            return self.EOF
        return self.tokens[self.current_token][0]

    def is_eof(self):
        '''Return True when every token has been consumed.'''
        return self.current_token >= len(self.tokens)

    def advance(self):
        '''Skip the current token.'''
        self.current_token += 1

    def _at_open_paren(self):
        # True only for a real '(' operator; a quoted "(" is a search term.
        return self.token_type() == self.OPCODE and self.token() == '('

    def _at_close_paren(self):
        # True only for a real ')' operator; a quoted ")" is a search term.
        return self.token_type() == self.OPCODE and self.token() == ')'

    def parse(self, expr, locations):
        '''
        Parse a search expression into a nested operation list.

        :param expr: the query string; it is lowercased before tokenizing.
        :param locations: container of valid lookup names, used to decide
            whether the text before a colon is a location.
        :return: the parse tree described in the class docstring.
        :raises ParseException: if the expression is malformed or tokens
            remain after a complete expression has been read.
        '''
        self.locations = locations
        # NOTE(review): scan() returns (tokens, unmatched_remainder); the
        # remainder (e.g. text after an unterminated quote) is discarded here.
        self.tokens = self.lex_scanner.scan(icu_lower(expr))[0]
        self.current_token = 0
        prog = self.or_expression()
        if not self.is_eof():
            raise ParseException(_('Extra characters at end of search'))
        return prog

    def or_expression(self):
        '''or_expression ::= and_expression [ 'or' or_expression ]'''
        lhs = self.and_expression()
        # NOTE(review): keywords are matched on token text only, so a quoted
        # "or"/"and"/"not" is also treated as an operator.
        if self.token() == 'or':
            self.advance()
            return ['or', lhs, self.or_expression()]
        return lhs

    def and_expression(self):
        '''and_expression ::= not_expression [ [ 'and' ] and_expression ]'''
        lhs = self.not_expression()
        if self.token() == 'and':
            self.advance()
            return ['and', lhs, self.and_expression()]

        # Account for the optional 'and'. The next operand may begin with a
        # word, a quoted word or a parenthesized group; anything else (')',
        # 'or', end of input) ends this and-expression.
        starts_operand = (
            self.token_type() in [self.WORD, self.QUOTED_WORD] or
            self._at_open_paren())
        if starts_operand and self.token() != 'or':
            return ['and', lhs, self.and_expression()]
        return lhs

    def not_expression(self):
        '''not_expression ::= ( 'not' not_expression ) | location_expression'''
        if self.token() == 'not':
            self.advance()
            return ['not', self.not_expression()]
        return self.location_expression()

    def location_expression(self):
        '''location_expression ::= base_token | ( '(' or_expression ')' )'''
        if self._at_open_paren():
            self.advance()
            res = self.or_expression()
            if not self._at_close_paren():
                raise ParseException(_('missing )'))
            self.advance()
            return res
        if self.token_type() not in [ self.WORD, self.QUOTED_WORD ]:
            raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))

        return self.base_token()

    def base_token(self):
        '''
        Consume one search term and return ['token', location, text].

        The location is 'all' unless the term starts with a known lookup
        name followed by a colon.
        '''
        if self.token_type() == self.QUOTED_WORD:
            return ['token', 'all', self.token(advance=True)]

        words = self.token(advance=True).split(':')

        # The complexity here comes from having colon-separated search
        # values. That forces us to check that the first "word" in a colon-
        # separated group is a valid location. If not, then the token must
        # be reconstructed. We also have the problem that locations can be
        # followed by quoted strings that appear as the next token, and that
        # tokens can be a sequence of colons.

        # We have a location if there is more than one word and the first
        # word is in locations. This check could produce a "wrong" answer if
        # the search string is something like 'author: "foo"' because it
        # will be interpreted as 'author:"foo"'. I am choosing to accept the
        # possible error. The expression should be written '"author:" foo'
        if len(words) > 1 and words[0] in self.locations:
            loc = words[0]
            words = words[1:]
            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
                # location:"quoted value" arrives as two tokens; join them.
                return ['token', loc, self.token(advance=True)]
            return ['token', loc, ':'.join(words)]

        return ['token', 'all', ':'.join(words)]
class ParseException(Exception):
    '''Error raised when a search query cannot be parsed or evaluated.'''

    @property
    def msg(self):
        '''The first constructor argument, or an empty string if none was given.'''
        return self.args[0] if self.args else ""
class SearchQueryParser(object): class SearchQueryParser(object):
''' '''
Parses a search query. Parses a search query.
@ -134,70 +276,15 @@ class SearchQueryParser(object):
def __init__(self, locations, test=False, optimize=False): def __init__(self, locations, test=False, optimize=False):
self.sqp_initialize(locations, test=test, optimize=optimize) self.sqp_initialize(locations, test=test, optimize=optimize)
self.parser = Parser()
def sqp_change_locations(self, locations): def sqp_change_locations(self, locations):
self.sqp_initialize(locations, optimize=self.optimize) self.sqp_initialize(locations, optimize=self.optimize)
def sqp_initialize(self, locations, test=False, optimize=False): def sqp_initialize(self, locations, test=False, optimize=False):
self.locations = locations
self._tests_failed = False self._tests_failed = False
self.optimize = optimize self.optimize = optimize
# Define a token
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
locations)
location = NoMatch()
for l in standard_locations:
location |= l
location = Optional(location, default='all')
word_query = CharsNotIn(u'\t\r\n\u00a0 ' + u'()')
#quoted_query = Suppress('"')+CharsNotIn('"')+Suppress('"')
quoted_query = QuotedString('"', escChar='\\')
query = quoted_query | word_query
Token = Group(location + query).setResultsName('token')
if test:
print 'Testing Token parser:'
Token.validate()
failed = SearchQueryParser.run_tests(Token, 'token',
(
('tag:asd', ['tag', 'asd']),
(u'ddsä', ['all', u'ddsä']),
('"one \\"two"', ['all', 'one "two']),
('title:"one \\"1.5\\" two"', ['title', 'one "1.5" two']),
('title:abc"def', ['title', 'abc"def']),
)
)
Or = Forward()
Parenthesis = Group(
Suppress('(') + Or + Suppress(')')
).setResultsName('parenthesis') | Token
Not = Forward()
Not << (Group(
Suppress(CaselessKeyword("not")) + Not
).setResultsName("not") | Parenthesis)
And = Forward()
And << (Group(
Not + Suppress(CaselessKeyword("and")) + And
).setResultsName("and") | Group(
Not + OneOrMore(~MatchFirst(list(map(CaselessKeyword,
('and', 'or')))) + And)
).setResultsName("and") | Not)
Or << (Group(
And + Suppress(CaselessKeyword("or")) + Or
).setResultsName("or") | And)
if test:
#Or.validate()
self._tests_failed = bool(failed)
self._parser = Or
self._parser.setDebug(False)
def parse(self, query): def parse(self, query):
# empty the list of searches used for recursion testing # empty the list of searches used for recursion testing
@ -213,10 +300,9 @@ class SearchQueryParser(object):
def _parse(self, query, candidates=None): def _parse(self, query, candidates=None):
self.recurse_level += 1 self.recurse_level += 1
try: try:
res = self._parser.parseString(query)[0] res = self.parser.parse(query, self.locations)
except RuntimeError: except RuntimeError:
import repr raise ParseException(_('Failed to parse query, recursion limit reached: %s')%repr(query))
raise ParseException('Failed to parse query, recursion limit reached: %s'%repr(query))
if candidates is None: if candidates is None:
candidates = self.universal_set() candidates = self.universal_set()
t = self.evaluate(res, candidates) t = self.evaluate(res, candidates)
@ -227,7 +313,7 @@ class SearchQueryParser(object):
return getattr(self, 'evaluate_'+group_name) return getattr(self, 'evaluate_'+group_name)
def evaluate(self, parse_result, candidates): def evaluate(self, parse_result, candidates):
return self.method(parse_result.getName())(parse_result, candidates) return self.method(parse_result[0])(parse_result[1:], candidates)
def evaluate_and(self, argument, candidates): def evaluate_and(self, argument, candidates):
# RHS checks only those items matched by LHS # RHS checks only those items matched by LHS
@ -249,8 +335,8 @@ class SearchQueryParser(object):
# return self.universal_set().difference(self.evaluate(argument[0])) # return self.universal_set().difference(self.evaluate(argument[0]))
return candidates.difference(self.evaluate(argument[0], candidates)) return candidates.difference(self.evaluate(argument[0], candidates))
def evaluate_parenthesis(self, argument, candidates): # def evaluate_parenthesis(self, argument, candidates):
return self.evaluate(argument[0], candidates) # return self.evaluate(argument[0], candidates)
def evaluate_token(self, argument, candidates): def evaluate_token(self, argument, candidates):
location = argument[0] location = argument[0]
@ -260,12 +346,16 @@ class SearchQueryParser(object):
query = query[1:] query = query[1:]
try: try:
if query in self.searches_seen: if query in self.searches_seen:
raise ParseException(query, len(query), 'undefined saved search', self) raise ParseException(_('Recursive saved search: {0}').format(query))
if self.recurse_level > 5: if self.recurse_level > 5:
self.searches_seen.add(query) self.searches_seen.add(query)
return self._parse(saved_searches().lookup(query), candidates) return self._parse(saved_searches().lookup(query), candidates)
except ParseException as e:
raise e
except: # convert all exceptions (e.g., missing key) to a parse error except: # convert all exceptions (e.g., missing key) to a parse error
raise ParseException(query, len(query), 'undefined saved search', self) import traceback
traceback.print_exc()
raise ParseException(_('Unknown error in saved search: {0}').format(query))
return self._get_matches(location, query, candidates) return self._get_matches(location, query, candidates)
def _get_matches(self, location, query, candidates): def _get_matches(self, location, query, candidates):

View File

@ -19,7 +19,7 @@ from calibre.web.feeds.recipes.collection import \
SchedulerConfig, download_builtin_recipe, update_custom_recipe, \ SchedulerConfig, download_builtin_recipe, update_custom_recipe, \
add_custom_recipe, remove_custom_recipe, get_custom_recipe, \ add_custom_recipe, remove_custom_recipe, get_custom_recipe, \
get_builtin_recipe get_builtin_recipe
from calibre.utils.pyparsing import ParseException from calibre.utils.search_query_parser import ParseException
class NewsTreeItem(object): class NewsTreeItem(object):