From c70b70dd76eb1f7fa8f7a587752aef171fc97e5f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 4 Aug 2013 08:20:37 +0530 Subject: [PATCH] Make the punctuation removal faster and more robust --- src/calibre/gui2/store/search/models.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/store/search/models.py b/src/calibre/gui2/store/search/models.py index af2c274bd7..026cb76d8d 100644 --- a/src/calibre/gui2/store/search/models.py +++ b/src/calibre/gui2/store/search/models.py @@ -6,7 +6,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re +import re, string from operator import attrgetter from PyQt4.Qt import (Qt, QAbstractItemModel, QVariant, QPixmap, QModelIndex, QSize, @@ -325,6 +325,9 @@ class SearchFilter(SearchQueryParser): def __init__(self): SearchQueryParser.__init__(self, locations=self.USABLE_LOCATIONS) self.srs = set([]) + # remove joiner words surrounded by space or at string boundaries + self.joiner_pat = re.compile(r'(^|\s)(and|not|or|a|the|is|of)(\s|$)', re.IGNORECASE) + self.punctuation_table = {ord(x):' ' for x in string.punctuation} def add_search_result(self, search_result): self.srs.add(search_result) @@ -449,11 +452,10 @@ class SearchFilter(SearchQueryParser): if locvalue == 'format': vals = accessor(sr).split(',') - elif locvalue in ('author2', 'title2'): + elif locvalue in {'author2', 'title2'}: m = self.IN_MATCH - vals = re.sub(r'(^|\s)(and|not|or|a|the|is|of|,)(\s|$)', ' ', accessor(sr)).split(' ') - vals = [x for x in vals if x] - final_query = query.lower() + vals = [x for x in self.field_trimmer(accessor(sr)).split() if x] + final_query = ' '.join(self.field_trimmer(icu_lower(query)).split()) else: vals = [accessor(sr)] if self._match(final_query, vals, m): @@ -464,3 +466,8 @@ class SearchFilter(SearchQueryParser): traceback.print_exc() return matches + def field_trimmer(self, field): + ''' Remove common joiner words and punctuation to improve matching, + punctuation is removed first, so that a.and.b becomes a b ''' + return self.joiner_pat.sub(' ', field.translate(self.punctuation_table)) +