mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Make the punctuation removal faster and more robust
This commit is contained in:
parent
fefee248cf
commit
c70b70dd76
@ -6,7 +6,7 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re, string
|
||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
|
|
||||||
from PyQt4.Qt import (Qt, QAbstractItemModel, QVariant, QPixmap, QModelIndex, QSize,
|
from PyQt4.Qt import (Qt, QAbstractItemModel, QVariant, QPixmap, QModelIndex, QSize,
|
||||||
@ -325,6 +325,9 @@ class SearchFilter(SearchQueryParser):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
SearchQueryParser.__init__(self, locations=self.USABLE_LOCATIONS)
|
SearchQueryParser.__init__(self, locations=self.USABLE_LOCATIONS)
|
||||||
self.srs = set([])
|
self.srs = set([])
|
||||||
|
# remove joiner words surrounded by space or at string boundaries
|
||||||
|
self.joiner_pat = re.compile(r'(^|\s)(and|not|or|a|the|is|of)(\s|$)', re.IGNORECASE)
|
||||||
|
self.punctuation_table = {ord(x):' ' for x in string.punctuation}
|
||||||
|
|
||||||
def add_search_result(self, search_result):
|
def add_search_result(self, search_result):
|
||||||
self.srs.add(search_result)
|
self.srs.add(search_result)
|
||||||
@ -449,11 +452,10 @@ class SearchFilter(SearchQueryParser):
|
|||||||
|
|
||||||
if locvalue == 'format':
|
if locvalue == 'format':
|
||||||
vals = accessor(sr).split(',')
|
vals = accessor(sr).split(',')
|
||||||
elif locvalue in ('author2', 'title2'):
|
elif locvalue in {'author2', 'title2'}:
|
||||||
m = self.IN_MATCH
|
m = self.IN_MATCH
|
||||||
vals = re.sub(r'(^|\s)(and|not|or|a|the|is|of|,)(\s|$)', ' ', accessor(sr)).split(' ')
|
vals = [x for x in self.field_trimmer(accessor(sr)).split() if x]
|
||||||
vals = [x for x in vals if x]
|
final_query = ' '.join(self.field_trimmer(icu_lower(query)).split())
|
||||||
final_query = query.lower()
|
|
||||||
else:
|
else:
|
||||||
vals = [accessor(sr)]
|
vals = [accessor(sr)]
|
||||||
if self._match(final_query, vals, m):
|
if self._match(final_query, vals, m):
|
||||||
@ -464,3 +466,8 @@ class SearchFilter(SearchQueryParser):
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
def field_trimmer(self, field):
|
||||||
|
''' Remove common joiner words and punctuation to improve matching,
|
||||||
|
punctuation is removed first, so that a.and.b becomes a b '''
|
||||||
|
return self.joiner_pat.sub(' ', field.translate(self.punctuation_table))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user