Start work on implementing search in the new backend. Searching for date columns working.

This commit is contained in:
Kovid Goyal 2013-01-20 14:34:26 +05:30
parent aff8f66fa1
commit 63b164241a
4 changed files with 356 additions and 0 deletions

View File

@ -13,6 +13,7 @@ from functools import wraps, partial
from calibre.db.locking import create_locks, RecordLock from calibre.db.locking import create_locks, RecordLock
from calibre.db.fields import create_field from calibre.db.fields import create_field
from calibre.db.search import Search
from calibre.db.tables import VirtualTable from calibre.db.tables import VirtualTable
from calibre.db.lazy import FormatMetadata, FormatsList from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
@ -50,6 +51,7 @@ class Cache(object):
self.record_lock = RecordLock(self.read_lock) self.record_lock = RecordLock(self.read_lock)
self.format_metadata_cache = defaultdict(dict) self.format_metadata_cache = defaultdict(dict)
self.formatter_template_cache = {} self.formatter_template_cache = {}
self._search_api = Search(self.field_metadata.get_search_terms())
# Implement locking for all simple read/write API methods # Implement locking for all simple read/write API methods
# An unlocked version of the method is stored with the name starting # An unlocked version of the method is stored with the name starting
@ -409,6 +411,10 @@ class Cache(object):
else: else:
return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys)) return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
@read_api
def search(self, query, restriction):
return self._search_api(self, query, restriction)
# }}} # }}}
class SortKey(object): class SortKey(object):

View File

@ -9,6 +9,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from threading import Lock from threading import Lock
from collections import defaultdict
from calibre.db.tables import ONE_ONE, MANY_ONE, MANY_MANY from calibre.db.tables import ONE_ONE, MANY_ONE, MANY_MANY
from calibre.ebooks.metadata import title_sort from calibre.ebooks.metadata import title_sort
@ -83,6 +84,15 @@ class Field(object):
''' '''
raise NotImplementedError() raise NotImplementedError()
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
'''
Return a generator that yields items of the form (value, set of books
ids that have this value). Here, value is a searchable value. For
OneToOneField the set of books ids will contain only a single id, but for
other fields it will generally have more than one id. Returned books_ids
are restricted to the set of ids in candidates.
'''
raise NotImplementedError()
class OneToOneField(Field): class OneToOneField(Field):
@ -102,6 +112,11 @@ class OneToOneField(Field):
return {id_ : self._sort_key(self.table.book_col_map.get(id_, return {id_ : self._sort_key(self.table.book_col_map.get(id_,
self._default_sort_key)) for id_ in all_book_ids} self._default_sort_key)) for id_ in all_book_ids}
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.book_col_map
for book_id in candidates:
yield cbm.get(book_id, default_value), {book_id}
class CompositeField(OneToOneField): class CompositeField(OneToOneField):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -139,6 +154,9 @@ class CompositeField(OneToOneField):
return {id_ : sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in return {id_ : sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in
all_book_ids} all_book_ids}
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
for book_id in candidates:
yield self.get_value_with_cache(book_id, get_metadata), {book_id}
class OnDeviceField(OneToOneField): class OnDeviceField(OneToOneField):
@ -176,6 +194,10 @@ class OnDeviceField(OneToOneField):
return {id_ : self.for_book(id_) for id_ in return {id_ : self.for_book(id_) for id_ in
all_book_ids} all_book_ids}
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
for book_id in candidates:
yield self.for_book(book_id, default_value=default_value), {book_id}
class ManyToOneField(Field): class ManyToOneField(Field):
def for_book(self, book_id, default_value=None): def for_book(self, book_id, default_value=None):
@ -206,6 +228,13 @@ class ManyToOneField(Field):
for cid in ans.itervalues()} for cid in ans.itervalues()}
return {id_ : sk_map[cid] for id_, cid in ans.iteritems()} return {id_ : sk_map[cid] for id_, cid in ans.iteritems()}
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.col_book_map
for item_id, val in self.table.id_map.iteritems():
book_ids = set(cbm.get(item_id, ())).intersection(candidates)
if book_ids:
yield val, book_ids
class ManyToManyField(Field): class ManyToManyField(Field):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -241,6 +270,12 @@ class ManyToManyField(Field):
(self._default_sort_key,)) (self._default_sort_key,))
for id_, cids in ans.iteritems()} for id_, cids in ans.iteritems()}
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.col_book_map
for item_id, val in self.table.id_map.iteritems():
book_ids = set(cbm.get(item_id, ())).intersection(candidates)
if book_ids:
yield val, book_ids
class IdentifiersField(ManyToManyField): class IdentifiersField(ManyToManyField):
@ -276,6 +311,17 @@ class FormatsField(ManyToManyField):
def format_fname(self, book_id, fmt): def format_fname(self, book_id, fmt):
return self.table.fname_map[book_id][fmt.upper()] return self.table.fname_map[book_id][fmt.upper()]
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
val_map = defaultdict(set)
cbm = self.table.book_col_map
for book_id in candidates:
vals = cbm.get(book_id, ())
for val in vals:
val_map[val].add(book_id)
for val, book_ids in val_map.iteritems():
yield val, book_ids
class SeriesField(ManyToOneField): class SeriesField(ManyToOneField):
def sort_key_for_series(self, book_id, get_lang, series_sort_order): def sort_key_for_series(self, book_id, get_lang, series_sort_order):

284
src/calibre/db/search.py Normal file
View File

@ -0,0 +1,284 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from functools import partial
from datetime import timedelta
from calibre.utils.config_base import prefs
from calibre.utils.date import parse_date, UNDEFINED_DATE, now
from calibre.utils.search_query_parser import SearchQueryParser, ParseException
# TODO: Thread safety of saved searches
class DateSearch(object): # {{{
def __init__(self):
self.operators = {
'=' : (1, self.eq),
'!=' : (2, self.ne),
'>' : (1, self.gt),
'>=' : (2, self.ge),
'<' : (1, self.lt),
'<=' : (2, self.le),
}
self.local_today = { '_today', 'today', icu_lower(_('today')) }
self.local_yesterday = { '_yesterday', 'yesterday', icu_lower(_('yesterday')) }
self.local_thismonth = { '_thismonth', 'thismonth', icu_lower(_('thismonth')) }
self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago'))
def eq(self, dbdate, query, field_count):
if dbdate.year == query.year:
if field_count == 1:
return True
if dbdate.month == query.month:
if field_count == 2:
return True
return dbdate.day == query.day
return False
def ne(self, *args):
return not self.eq(*args)
def gt(self, dbdate, query, field_count):
if dbdate.year > query.year:
return True
if field_count > 1 and dbdate.year == query.year:
if dbdate.month > query.month:
return True
return (field_count == 3 and dbdate.month == query.month and
dbdate.day > query.day)
return False
def le(self, *args):
return not self.gt(*args)
def lt(self, dbdate, query, field_count):
if dbdate.year < query.year:
return True
if field_count > 1 and dbdate.year == query.year:
if dbdate.month < query.month:
return True
return (field_count == 3 and dbdate.month == query.month and
dbdate.day < query.day)
return False
def ge(self, *args):
return not self.lt(*args)
def __call__(self, query, field_iter):
matches = set()
if len(query) < 2:
return matches
if query == 'false':
for v, book_ids in field_iter():
if isinstance(v, (str, unicode)):
v = parse_date(v)
if v is None or v <= UNDEFINED_DATE:
matches |= book_ids
return matches
if query == 'true':
for v, book_ids in field_iter():
if isinstance(v, (str, unicode)):
v = parse_date(v)
if v is not None and v > UNDEFINED_DATE:
matches |= book_ids
return matches
relop = None
for k, op in self.operators.iteritems():
if query.startswith(k):
p, relop = op
query = query[p:]
if relop is None:
relop = self.operators['='][-1]
if query in self.local_today:
qd = now()
field_count = 3
elif query in self.local_yesterday:
qd = now() - timedelta(1)
field_count = 3
elif query in self.local_thismonth:
qd = now()
field_count = 2
else:
m = self.daysago_pat.search(query)
if m is not None:
num = query[:-len(m.group(1))]
try:
qd = now() - timedelta(int(num))
except:
raise ParseException(query, len(query), 'Number conversion error')
field_count = 3
else:
try:
qd = parse_date(query, as_utc=False)
except:
raise ParseException(query, len(query), 'Date conversion error')
if '-' in query:
field_count = query.count('-') + 1
else:
field_count = query.count('/') + 1
for v, book_ids in field_iter():
if isinstance(v, (str, unicode)):
v = parse_date(v)
if v is not None and relop(v, qd, field_count):
matches |= book_ids
return matches
# }}}
class Parser(SearchQueryParser):
def __init__(self, dbcache, all_book_ids, gst, date_search,
limit_search_columns, limit_search_columns_to, locations):
self.dbcache, self.all_book_ids = dbcache, all_book_ids
self.all_search_locations = frozenset(locations)
self.grouped_search_terms = gst
self.date_search = date_search
self.limit_search_columns, self.limit_search_columns_to = (
limit_search_columns, limit_search_columns_to)
super(Parser, self).__init__(locations, optimize=True)
@property
def field_metadata(self):
return self.dbcache.field_metadata
def universal_set(self):
return self.all_book_ids
def field_iter(self, name, candidates):
get_metadata = partial(self.dbcache._get_metadata, get_user_categories=False)
return self.dbcache.fields[name].iter_searchable_values(get_metadata,
candidates)
def get_matches(self, location, query, candidates=None,
allow_recursion=True):
# If candidates is not None, it must not be modified. Changing its
# value will break query optimization in the search parser
matches = set()
if candidates is None:
candidates = self.all_book_ids
if not candidates or not query or not query.strip():
return matches
if location not in self.all_search_locations:
return matches
if (len(location) > 2 and location.startswith('@') and
location[1:] in self.grouped_search_terms):
location = location[1:]
# get metadata key associated with the search term. Eliminates
# dealing with plurals and other aliases
# original_location = location
location = self.field_metadata.search_term_to_field_key(
icu_lower(location.strip()))
# grouped search terms
if isinstance(location, list):
if allow_recursion:
if query.lower() == 'false':
invert = True
query = 'true'
else:
invert = False
for loc in location:
c = candidates.copy()
m = self.get_matches(loc, query,
candidates=c, allow_recursion=False)
matches |= m
c -= m
if len(c) == 0:
break
if invert:
matches = self.all_book_ids - matches
return matches
raise ParseException(query, len(query), 'Recursive query group detected')
# If the user has asked to restrict searching over all field, apply
# that restriction
if (location == 'all' and self.limit_search_columns and
self.limit_search_columns_to):
terms = set()
for l in self.limit_search_columns_to:
l = icu_lower(l.strip())
if l and l != 'all' and l in self.all_search_locations:
terms.add(l)
if terms:
c = candidates.copy()
for l in terms:
try:
m = self.get_matches(l, query,
candidates=c, allow_recursion=allow_recursion)
matches |= m
c -= m
if len(c) == 0:
break
except:
pass
return matches
if location in self.field_metadata:
fm = self.field_metadata[location]
# take care of dates special case
if (fm['datatype'] == 'datetime' or
(fm['datatype'] == 'composite' and
fm['display'].get('composite_sort', '') == 'date')):
if location == 'date':
location = 'timestamp'
return self.date_search(
icu_lower(query), partial(self.field_iter, location, candidates))
return matches
class Search(object):
def __init__(self, all_search_locations):
self.all_search_locations = all_search_locations
self.date_search = DateSearch()
def change_locations(self, newlocs):
self.all_search_locations = newlocs
def __call__(self, dbcache, query, search_restriction):
'''
Return the set of ids of all records that match the specified
query and restriction
'''
q = ''
if not query or not query.strip():
q = search_restriction
else:
q = query
if search_restriction:
q = u'(%s) and (%s)' % (search_restriction, query)
all_book_ids = dbcache.all_book_ids(type=set)
if not q:
return all_book_ids
# We construct a new parser instance per search as pyparsing is not
# thread safe. On my desktop, constructing a SearchQueryParser instance
# takes 0.000975 seconds and restoring it from a pickle takes
# 0.000974 seconds.
sqp = Parser(
dbcache, all_book_ids, dbcache.pref('grouped_search_terms'),
self.date_search, prefs[ 'limit_search_columns' ],
prefs[ 'limit_search_columns_to' ], self.all_search_locations)
try:
ret = sqp.parse(query)
finally:
sqp.dbcache = None
return ret

View File

@ -191,6 +191,26 @@ class ReadingTest(BaseTest):
# }}} # }}}
def test_searching(self): # {{{
'Test searching returns the same data for both backends'
from calibre.library.database2 import LibraryDatabase2
old = LibraryDatabase2(self.library_path)
oldvals = {query:set(old.search_getting_ids(query, '')) for query in (
'date:9/6/2011', 'date:true', 'date:false', 'pubdate:9/2011',
'#date:true', 'date:<100daysago', 'date:>9/6/2011',
'#date:>9/1/2011', '#date:=2011',
)}
old = None
cache = self.init_cache(self.library_path)
for query, ans in oldvals.iteritems():
nr = cache.search(query, '')
self.assertEqual(ans, nr,
'Old result: %r != New result: %r for search: %s'%(
ans, nr, query))
# }}}
def tests(): def tests():
return unittest.TestLoader().loadTestsFromTestCase(ReadingTest) return unittest.TestLoader().loadTestsFromTestCase(ReadingTest)