mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Initial stab at supporting smart quotes as delimiters in the search query parser
This commit is contained in:
parent
13d0a7f353
commit
ac3132e541
@ -142,14 +142,16 @@ class Parser(object):
|
|||||||
WORD = 2
|
WORD = 2
|
||||||
QUOTED_WORD = 3
|
QUOTED_WORD = 3
|
||||||
EOF = 4
|
EOF = 4
|
||||||
|
REPLACEMENTS = tuple((u'\\' + x, unichr(i + 1)) for i, x in enumerate(ur'\"()“”'))
|
||||||
|
|
||||||
# Had to translate named constants to numeric values
|
# Had to translate named constants to numeric values
|
||||||
lex_scanner = re.Scanner([
|
lex_scanner = re.Scanner([
|
||||||
(r'[()]', lambda x,t: (Parser.OPCODE, t)),
|
(ur'[()]', lambda x,t: (Parser.OPCODE, t)),
|
||||||
(r'@.+?:[^")\s]+', lambda x,t: (Parser.WORD, unicode(t))),
|
(ur'@.+?:[^"“)\s]+', lambda x,t: (Parser.WORD, unicode(t))),
|
||||||
(r'[^"()\s]+', lambda x,t: (Parser.WORD, unicode(t))),
|
(ur'[^"“()\s]+', lambda x,t: (Parser.WORD, unicode(t))),
|
||||||
(r'".*?((?<!\\)")', lambda x,t: (Parser.QUOTED_WORD, t[1:-1])),
|
(ur'".*?((?<!\\)")', lambda x,t: (Parser.QUOTED_WORD, t[1:-1])),
|
||||||
(r'\s+', None)
|
(ur'“.*?((?<!\\)”)', lambda x,t: (Parser.QUOTED_WORD, t[1:-1])),
|
||||||
|
(ur'\s+', None)
|
||||||
], flags=re.DOTALL)
|
], flags=re.DOTALL)
|
||||||
|
|
||||||
def token(self, advance=False):
|
def token(self, advance=False):
|
||||||
@ -179,21 +181,26 @@ class Parser(object):
|
|||||||
def advance(self):
|
def advance(self):
|
||||||
self.current_token += 1
|
self.current_token += 1
|
||||||
|
|
||||||
def parse(self, expr, locations):
|
def tokenize(self, expr):
|
||||||
self.locations = locations
|
|
||||||
|
|
||||||
# Strip out escaped backslashes, quotes and parens so that the
|
# Strip out escaped backslashes, quotes and parens so that the
|
||||||
# lex scanner doesn't get confused. We put them back later.
|
# lex scanner doesn't get confused. We put them back later.
|
||||||
expr = expr.replace(u'\\\\', u'\x01').replace(u'\\"', u'\x02')
|
for k, v in self.REPLACEMENTS:
|
||||||
expr = expr.replace(u'\\(', u'\x03').replace(u'\\)', u'\x04')
|
expr = expr.replace(k, v)
|
||||||
self.tokens = self.lex_scanner.scan(expr)[0]
|
tokens = self.lex_scanner.scan(expr)[0]
|
||||||
for (i,tok) in enumerate(self.tokens):
|
|
||||||
tt, tv = tok
|
|
||||||
if tt == self.WORD or tt == self.QUOTED_WORD:
|
|
||||||
self.tokens[i] = (tt,
|
|
||||||
tv.replace(u'\x01', u'\\').replace(u'\x02', u'"').
|
|
||||||
replace(u'\x03', u'(').replace(u'\x04', u')'))
|
|
||||||
|
|
||||||
|
def unescape(x):
|
||||||
|
for k, v in self.REPLACEMENTS:
|
||||||
|
x = x.replace(v, k[1:])
|
||||||
|
return x
|
||||||
|
|
||||||
|
return [
|
||||||
|
(tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
|
||||||
|
for tt, tv in tokens
|
||||||
|
]
|
||||||
|
|
||||||
|
def parse(self, expr, locations):
|
||||||
|
self.locations = locations
|
||||||
|
self.tokens = self.tokenize(expr)
|
||||||
self.current_token = 0
|
self.current_token = 0
|
||||||
prog = self.or_expression()
|
prog = self.or_expression()
|
||||||
if not self.is_eof():
|
if not self.is_eof():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user