From 63b164241a5846ca40aa20f361c20a1b29333068 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 20 Jan 2013 14:34:26 +0530
Subject: [PATCH] Start work on implementing search in the new backend.
 Searching for date columns working.

---
 src/calibre/db/cache.py         |   6 +
 src/calibre/db/fields.py        |  46 ++++++
 src/calibre/db/search.py        | 284 ++++++++++++++++++++++++++++++++
 src/calibre/db/tests/reading.py |  20 +++
 4 files changed, 356 insertions(+)
 create mode 100644 src/calibre/db/search.py

diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index a631f9ea46..88a2196a61 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -13,6 +13,7 @@ from functools import wraps, partial
 
 from calibre.db.locking import create_locks, RecordLock
 from calibre.db.fields import create_field
+from calibre.db.search import Search
 from calibre.db.tables import VirtualTable
 from calibre.db.lazy import FormatMetadata, FormatsList
 from calibre.ebooks.metadata.book.base import Metadata
@@ -50,6 +51,7 @@ class Cache(object):
         self.record_lock = RecordLock(self.read_lock)
         self.format_metadata_cache = defaultdict(dict)
         self.formatter_template_cache = {}
+        self._search_api = Search(self.field_metadata.get_search_terms())
 
         # Implement locking for all simple read/write API methods
         # An unlocked version of the method is stored with the name starting
@@ -409,6 +411,10 @@ class Cache(object):
         else:
             return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
 
+    @read_api
+    def search(self, query, restriction):
+        return self._search_api(self, query, restriction)
+
     # }}}
 
 class SortKey(object):
diff --git a/src/calibre/db/fields.py b/src/calibre/db/fields.py
index 3808052549..43e89cdc6f 100644
--- a/src/calibre/db/fields.py
+++ b/src/calibre/db/fields.py
@@ -9,6 +9,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 from threading import Lock
+from collections import defaultdict
 
 from calibre.db.tables import ONE_ONE, MANY_ONE, MANY_MANY
 from calibre.ebooks.metadata import title_sort
@@ -83,6 +84,15 @@ class Field(object):
         '''
         raise NotImplementedError()
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        '''
+        Return a generator yielding (value, set of book ids that have this
+        value) pairs, where value is a searchable value. For OneToOneField
+        each set holds a single id; other fields generally yield several ids
+        per value. Yielded book ids are restricted to the ids in candidates.
+        Subclasses must override this; the base implementation raises.
+        '''
+        raise NotImplementedError()
 
 class OneToOneField(Field):
 
@@ -102,6 +112,11 @@ class OneToOneField(Field):
         return {id_ : self._sort_key(self.table.book_col_map.get(id_,
             self._default_sort_key)) for id_ in all_book_ids}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        cbm = self.table.book_col_map
+        for book_id in candidates:
+            yield cbm.get(book_id, default_value), {book_id}
+
 class CompositeField(OneToOneField):
 
     def __init__(self, *args, **kwargs):
@@ -139,6 +154,9 @@ class CompositeField(OneToOneField):
         return {id_ : sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in
                 all_book_ids}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        for book_id in candidates:
+            yield self.get_value_with_cache(book_id, get_metadata), {book_id}
 
 class OnDeviceField(OneToOneField):
 
@@ -176,6 +194,10 @@ class OnDeviceField(OneToOneField):
         return {id_ : self.for_book(id_) for id_ in
                 all_book_ids}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        for book_id in candidates:
+            yield self.for_book(book_id, default_value=default_value), {book_id}
+
 class ManyToOneField(Field):
 
     def for_book(self, book_id, default_value=None):
@@ -206,6 +228,13 @@ class ManyToOneField(Field):
                 for cid in ans.itervalues()}
         return {id_ : sk_map[cid] for id_, cid in ans.iteritems()}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        cbm = self.table.col_book_map
+        for item_id, val in self.table.id_map.iteritems():
+            book_ids = set(cbm.get(item_id, ())).intersection(candidates)
+            if book_ids:
+                yield val, book_ids
+
 class ManyToManyField(Field):
 
     def __init__(self, *args, **kwargs):
@@ -241,6 +270,12 @@ class ManyToManyField(Field):
                         (self._default_sort_key,))
                 for id_, cids in ans.iteritems()}
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        cbm = self.table.col_book_map
+        for item_id, val in self.table.id_map.iteritems():
+            book_ids = set(cbm.get(item_id, ())).intersection(candidates)
+            if book_ids:
+                yield val, book_ids
 
 class IdentifiersField(ManyToManyField):
 
@@ -276,6 +311,17 @@ class FormatsField(ManyToManyField):
     def format_fname(self, book_id, fmt):
         return self.table.fname_map[book_id][fmt.upper()]
 
+    def iter_searchable_values(self, get_metadata, candidates, default_value=None):
+        val_map = defaultdict(set)
+        cbm = self.table.book_col_map
+        for book_id in candidates:
+            vals = cbm.get(book_id, ())
+            for val in vals:
+                val_map[val].add(book_id)
+
+        for val, book_ids in val_map.iteritems():
+            yield val, book_ids
+
 class SeriesField(ManyToOneField):
 
     def sort_key_for_series(self, book_id, get_lang, series_sort_order):
diff --git a/src/calibre/db/search.py b/src/calibre/db/search.py
new file mode 100644
index 0000000000..d304deeb9a
--- /dev/null
+++ b/src/calibre/db/search.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+from functools import partial
+from datetime import timedelta
+
+from calibre.utils.config_base import prefs
+from calibre.utils.date import parse_date, UNDEFINED_DATE, now
+from calibre.utils.search_query_parser import SearchQueryParser, ParseException
+
+# TODO: Thread safety of saved searches
+
+class DateSearch(object): # {{{
+    'Find the book ids whose date value matches a (relational) date query'
+    def __init__(self):
+        self.operators = {  # prefix -> (prefix length, comparison method)
+            '='   : (1, self.eq),
+            '!='  : (2, self.ne),
+            '>'   : (1, self.gt),
+            '>='  : (2, self.ge),
+            '<'   : (1, self.lt),
+            '<='  : (2, self.le),
+        }
+        self.local_today         = { '_today', 'today', icu_lower(_('today')) }
+        self.local_yesterday     = { '_yesterday', 'yesterday', icu_lower(_('yesterday')) }
+        self.local_thismonth     = { '_thismonth', 'thismonth', icu_lower(_('thismonth')) }
+        self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago'))
+
+    def eq(self, dbdate, query, field_count):  # field_count: 1=year, 2=+month, 3=+day
+        if dbdate.year == query.year:
+            if field_count == 1:
+                return True
+            if dbdate.month == query.month:
+                if field_count == 2:
+                    return True
+                return dbdate.day == query.day
+        return False
+
+    def ne(self, *args):
+        return not self.eq(*args)
+
+    def gt(self, dbdate, query, field_count):
+        if dbdate.year > query.year:
+            return True
+        if field_count > 1 and dbdate.year == query.year:
+            if dbdate.month > query.month:
+                return True
+            return (field_count == 3 and dbdate.month == query.month and
+                    dbdate.day > query.day)
+        return False
+
+    def le(self, *args):
+        return not self.gt(*args)
+
+    def lt(self, dbdate, query, field_count):
+        if dbdate.year < query.year:
+            return True
+        if field_count > 1 and dbdate.year == query.year:
+            if dbdate.month < query.month:
+                return True
+            return (field_count == 3 and dbdate.month == query.month and
+                    dbdate.day < query.day)
+        return False
+
+    def ge(self, *args):
+        return not self.lt(*args)
+
+    def __call__(self, query, field_iter):  # field_iter() yields (value, book_ids)
+        matches = set()
+        if len(query) < 2:
+            return matches
+        # 'false'/'true' select the books without/with a defined date
+        if query == 'false':
+            for v, book_ids in field_iter():
+                if isinstance(v, (str, unicode)):
+                    v = parse_date(v)
+                if v is None or v <= UNDEFINED_DATE:
+                    matches |= book_ids
+            return matches
+
+        if query == 'true':
+            for v, book_ids in field_iter():
+                if isinstance(v, (str, unicode)):
+                    v = parse_date(v)
+                if v is not None and v > UNDEFINED_DATE:
+                    matches |= book_ids
+            return matches
+
+        # Try the two-character operators first: '>=' must not be read as '>'
+        relop = self.operators['='][-1]
+        for k in sorted(self.operators, key=len, reverse=True):
+            if query.startswith(k):
+                p, relop = self.operators[k]
+                query = query[p:]
+                break
+
+        if query in self.local_today:
+            qd = now()
+            field_count = 3
+        elif query in self.local_yesterday:
+            qd = now() - timedelta(1)
+            field_count = 3
+        elif query in self.local_thismonth:
+            qd = now()
+            field_count = 2
+        else:
+            m = self.daysago_pat.search(query)
+            if m is not None:
+                num = query[:-len(m.group(1))]
+                try:
+                    qd = now() - timedelta(int(num))
+                except Exception:
+                    raise ParseException(query, len(query), 'Number conversion error')
+                field_count = 3
+            else:
+                try:
+                    qd = parse_date(query, as_utc=False)
+                except Exception:
+                    raise ParseException(query, len(query), 'Date conversion error')
+                if '-' in query:
+                    field_count = query.count('-') + 1
+                else:
+                    field_count = query.count('/') + 1
+
+        for v, book_ids in field_iter():
+            if isinstance(v, (str, unicode)):
+                v = parse_date(v)
+            if v is not None and relop(v, qd, field_count):
+                matches |= book_ids
+
+        return matches
+# }}}
+
+class Parser(SearchQueryParser):
+    'SearchQueryParser subclass whose get_matches() looks terms up in a db cache'
+    def __init__(self, dbcache, all_book_ids, gst, date_search,
+                 limit_search_columns, limit_search_columns_to, locations):
+        self.dbcache, self.all_book_ids = dbcache, all_book_ids
+        self.all_search_locations = frozenset(locations)
+        self.grouped_search_terms = gst
+        self.date_search = date_search
+        self.limit_search_columns, self.limit_search_columns_to = (
+            limit_search_columns, limit_search_columns_to)
+        super(Parser, self).__init__(locations, optimize=True)
+
+    @property
+    def field_metadata(self):
+        return self.dbcache.field_metadata
+
+    def universal_set(self):
+        return self.all_book_ids
+
+    def field_iter(self, name, candidates):
+        get_metadata = partial(self.dbcache._get_metadata, get_user_categories=False)
+        return self.dbcache.fields[name].iter_searchable_values(get_metadata,
+                                                                candidates)
+
+    def get_matches(self, location, query, candidates=None,
+                    allow_recursion=True):
+        # If candidates is not None, it must not be modified. Changing its
+        # value will break query optimization in the search parser
+        matches = set()
+
+        if candidates is None:
+            candidates = self.all_book_ids
+        if not candidates or not query or not query.strip():
+            return matches
+        if location not in self.all_search_locations:
+            return matches
+
+        if (len(location) > 2 and location.startswith('@') and
+                    location[1:] in self.grouped_search_terms):
+            location = location[1:]
+
+        # Get the metadata key associated with the search term. Eliminates
+        # dealing with plurals and other aliases. A list result denotes a
+        # grouped search term, which is expanded below.
+        location = self.field_metadata.search_term_to_field_key(
+            icu_lower(location.strip()))
+        # grouped search terms
+        if isinstance(location, list):
+            if allow_recursion:
+                if query.lower() == 'false':
+                    invert = True
+                    query = 'true'
+                else:
+                    invert = False
+                for loc in location:
+                    c = candidates.copy()  # NOTE(review): reset per loc; c -= m is not carried over
+                    m = self.get_matches(loc, query,
+                            candidates=c, allow_recursion=False)
+                    matches |= m
+                    c -= m
+                    if len(c) == 0:
+                        break
+                if invert:
+                    matches = self.all_book_ids - matches
+                return matches
+            raise ParseException(query, len(query), 'Recursive query group detected')
+
+        # If the user has asked to restrict searching over all fields, apply
+        # that restriction
+        if (location == 'all' and self.limit_search_columns and
+            self.limit_search_columns_to):
+            terms = set()
+            for l in self.limit_search_columns_to:
+                l = icu_lower(l.strip())
+                if l and l != 'all' and l in self.all_search_locations:
+                    terms.add(l)
+            if terms:
+                c = candidates.copy()
+                for l in terms:
+                    try:
+                        m = self.get_matches(l, query,
+                            candidates=c, allow_recursion=allow_recursion)
+                        matches |= m
+                        c -= m
+                        if len(c) == 0:
+                            break
+                    except:  # best effort: a term that fails to search is skipped
+                        pass
+                return matches
+
+        if location in self.field_metadata:
+            fm = self.field_metadata[location]
+            # Date-like fields: datetime columns and composites sorted as dates
+            if (fm['datatype'] == 'datetime' or
+                    (fm['datatype'] == 'composite' and
+                     fm['display'].get('composite_sort', '') == 'date')):
+                if location == 'date':
+                    location = 'timestamp'
+                return self.date_search(
+                    icu_lower(query), partial(self.field_iter, location, candidates))
+        # Only date searching is handled above; any other location matches nothing
+        return matches
+
+
+class Search(object):
+    'Callable search entry point: Search(locations)(dbcache, query, restriction)'
+    def __init__(self, all_search_locations):
+        self.all_search_locations = all_search_locations
+        self.date_search = DateSearch()
+
+    def change_locations(self, newlocs):
+        self.all_search_locations = newlocs
+
+    def __call__(self, dbcache, query, search_restriction):
+        '''
+        Return the set of ids of all records that match the specified
+        query and restriction. An empty query searches the restriction
+        alone; if both are empty all book ids are returned.
+        '''
+        if not query or not query.strip():
+            q = search_restriction
+        else:
+            q = query
+            if search_restriction:
+                q = u'(%s) and (%s)' % (search_restriction, query)
+
+        all_book_ids = dbcache.all_book_ids(type=set)
+        if not q:
+            return all_book_ids
+
+        # A new parser instance is constructed per search as pyparsing is not
+        # thread safe; building one costs about the same as unpickling one.
+        # The combined expression q is parsed, not the raw query, so that
+        # search_restriction is honoured (also when query itself is empty).
+        sqp = Parser(
+            dbcache, all_book_ids, dbcache.pref('grouped_search_terms'),
+            self.date_search, prefs[ 'limit_search_columns' ],
+            prefs[ 'limit_search_columns_to' ], self.all_search_locations)
+        try:
+            ret = sqp.parse(q)
+        finally:
+            sqp.dbcache = None
+        return ret
+
diff --git a/src/calibre/db/tests/reading.py b/src/calibre/db/tests/reading.py
index d77d3ac6eb..22d1bba37e 100644
--- a/src/calibre/db/tests/reading.py
+++ b/src/calibre/db/tests/reading.py
@@ -191,6 +191,26 @@ class ReadingTest(BaseTest):
 
     # }}}
 
+    def test_searching(self): # {{{
+        'Test searching returns the same data for both backends'
+        from calibre.library.database2 import LibraryDatabase2
+        old = LibraryDatabase2(self.library_path)
+        oldvals = {query:set(old.search_getting_ids(query, '')) for query in (
+            'date:9/6/2011', 'date:true', 'date:false', 'pubdate:9/2011',
+            '#date:true', 'date:<100daysago', 'date:>9/6/2011',
+            '#date:>9/1/2011', '#date:=2011',
+        )}
+        old = None
+
+        cache = self.init_cache(self.library_path)
+        for query, ans in oldvals.iteritems():
+            nr = cache.search(query, '')
+            self.assertEqual(ans, nr,
+                'Old result: %r != New result: %r for search: %s'%(
+                    ans, nr, query))
+
+    # }}}
+
 def tests():
     return unittest.TestLoader().loadTestsFromTestCase(ReadingTest)