From d2ff37b5a179ecf10041d8e51a810cb067820ca9 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Tue, 14 Dec 2010 14:11:10 +0000
Subject: [PATCH] Improved get_categories -- approximately 6 times faster

---
 src/calibre/library/database2.py | 219 ++++++++++++++++++++++++++-----
 1 file changed, 187 insertions(+), 32 deletions(-)

diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 1229b60577..0d301ccaff 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -14,7 +14,7 @@ from operator import itemgetter
 
 from PyQt4.QtGui import QImage
 
-
+from calibre import prints
 from calibre.ebooks.metadata import title_sort, author_to_author_sort
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.library.database import LibraryDatabase
@@ -1039,43 +1039,170 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
                     tn=field['table'], col=field['link_column']), (id_,))
         return set(x[0] for x in ans)
 
+########## data structures for get_categories
+
     CATEGORY_SORTS = ('name', 'popularity', 'rating')
 
-    def get_categories(self, sort='name', ids=None, icon_map=None):
-        self.books_list_filter.change([] if not ids else ids)
+    class TCat_Tag(object):
 
-        categories = {}
+        def __init__(self, name, sort):
+            self.n = name
+            self.s = sort
+            self.c  = 0
+            self.rt = 0
+            self.rc = 0
+            self.id = None
+
+        def set_all(self, c, rt, rc, id):
+            self.c  = c
+            self.rt = rt
+            self.rc = rc
+            self.id = id
+
+        def __str__(self):
+            return unicode(self)
+
+        def __unicode__(self):
+            return 'n=%s s=%s c=%d rt=%d rc=%d id=%s'%\
+                            (self.n, self.s, self.c, self.rt, self.rc, self.id)
+
+
+    def get_categories(self, sort='name', ids=None, icon_map=None):
+        start = time.time()
         if icon_map is not None and type(icon_map) != TagsIcons:
             raise TypeError('icon_map passed to get_categories must be of type TagIcons')
+        if sort not in self.CATEGORY_SORTS:
+            raise ValueError('sort ' + sort + ' not a valid value')
+
+        self.books_list_filter.change([] if not ids else ids)
+        id_filter = None if not ids else frozenset(ids)
 
         tb_cats = self.field_metadata
-        #### First, build the standard and custom-column categories ####
+        tcategories = {}
+        tids = {}
+        md = []
+
+        # First, build the maps. We need a category->items map and an
+        # item -> (item_id, sort_val) map to use in the books loop
         for category in tb_cats.keys():
             cat = tb_cats[category]
-            if not cat['is_category'] or cat['kind'] in ['user', 'search']:
+            if not cat['is_category'] or cat['kind'] in ['user', 'search'] \
+                    or category in ['news', 'formats']:
                 continue
-            tn = cat['table']
-            categories[category] = []   #reserve the position in the ordered list
-            if tn is None:              # Nothing to do for the moment
+            # Get the ids for the item values
+            if not cat['is_custom']:
+                funcs = {
+                        'authors'  : self.get_authors_with_ids,
+                        'series'   : self.get_series_with_ids,
+                        'publisher': self.get_publishers_with_ids,
+                        'tags'     : self.get_tags_with_ids,
+                        'rating'   : self.get_ratings_with_ids,
+                    }
+                func = funcs.get(category, None)
+                if func:
+                    list = func()
+                else:
+                    raise ValueError(category + ' has no get with ids function')
+            else:
+                list = self.get_custom_items_with_ids(label=cat['label'])
+            tids[category] = {}
+            if category == 'authors':
+                for l in list:
+                    (id, val, sort_val) = (l[0], l[1], l[2])
+                    tids[category][val] = (id, sort_val)
+            else:
+                for l in list:
+                    (id, val) = (l[0], l[1])
+                    tids[category][val] = (id, val)
+            # add an empty category to the category map
+            tcategories[category] = {}
+            # create a list of category/field_index for the books scan to use.
+            # This saves iterating through field_metadata for each book
+            md.append((category, cat['rec_index'], cat['is_multiple']))
+
+        print 'end phase "collection":', time.time() - start, 'seconds'
+
+        # Now scan every book looking for category items.
+        # Code below is duplicated because it shaves off 10% of the loop time
+        id_dex = self.FIELD_MAP['id']
+        rating_dex = self.FIELD_MAP['rating']
+        for book in self.data.iterall():
+            if id_filter and book[id_dex] not in id_filter:
                 continue
-            cn = cat['column']
-            if ids is None:
-                query = '''SELECT id, {0}, count, avg_rating, sort
-                           FROM tag_browser_{1}'''.format(cn, tn)
-            else:
-                query = '''SELECT id, {0}, count, avg_rating, sort
-                           FROM tag_browser_filtered_{1}'''.format(cn, tn)
-            if sort == 'popularity':
-                query += ' ORDER BY count DESC, sort ASC'
-            elif sort == 'name':
-                query += ' ORDER BY sort COLLATE icucollate'
-            else:
-                query += ' ORDER BY avg_rating DESC, sort ASC'
-            data = self.conn.get(query)
+            rating = book[rating_dex]
+            # We kept track of all possible category field_map positions above
+            for (cat, dex, mult) in md:
+                if book[dex] is None:
+                    continue
+                if not mult:
+                    val = book[dex]
+                    try:
+                        (item_id, sort_val) = tids[cat][val] # let exceptions fly
+                        item = tcategories[cat].get(val, None)
+                        if not item:
+                            item = LibraryDatabase2.TCat_Tag(val, sort_val)
+                            tcategories[cat][val] = item
+                        item.c += 1
+                        item.id = item_id
+                        if rating > 0:
+                            item.rt += rating
+                            item.rc += 1
+                    except:
+                        prints('get_categories: item', val, 'is not in', cat, 'list!')
+                else:
+                    vals = book[dex].split(mult)
+                    for val in vals:
+                        try:
+                            (item_id, sort_val) = tids[cat][val] # let exceptions fly
+                            item = tcategories[cat].get(val, None)
+                            if not item:
+                                item = LibraryDatabase2.TCat_Tag(val, sort_val)
+                                tcategories[cat][val] = item
+                            item.c += 1
+                            item.id = item_id
+                            if rating > 0:
+                                item.rt += rating
+                                item.rc += 1
+                        except:
+                            prints('get_categories: item', val, 'is not in', cat, 'list!')
+
+        print 'end phase "books":', time.time() - start, 'seconds'
+
+        # Now do news
+        tcategories['news'] = {}
+        cat = tb_cats['news']
+        tn = cat['table']
+        cn = cat['column']
+        if ids is None:
+            query = '''SELECT id, {0}, count, avg_rating, sort
+                       FROM tag_browser_{1}'''.format(cn, tn)
+        else:
+            query = '''SELECT id, {0}, count, avg_rating, sort
+                       FROM tag_browser_filtered_{1}'''.format(cn, tn)
+        # results will be sorted later
+        data = self.conn.get(query)
+        for r in data:
+            item = LibraryDatabase2.TCat_Tag(r[1], r[1])
+            item.set_all(c=r[2], rt=r[2]*r[3], rc=r[2], id=r[0])
+            tcategories['news'][r[1]] = item
+
+        print 'end phase "news":', time.time() - start, 'seconds'
+
+        # Build the real category list by iterating over the temporary copy
+        # and building the Tag instances.
+        categories = {}
+        for category in tb_cats.keys():
+            if category not in tcategories:
+                continue
+            cat = tb_cats[category]
+
+            # prepare the place where we will put the array of Tags
+            categories[category] = []
 
             # icon_map is not None if get_categories is to store an icon and
             # possibly a tooltip in the tag structure.
-            icon, tooltip = None, ''
+            icon = None
+            tooltip = ''
             label = tb_cats.key_to_label(category)
             if icon_map:
                 if not tb_cats.is_custom_field(category):
@@ -1087,23 +1214,40 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
                     tooltip = self.custom_column_label_map[label]['name']
 
             datatype = cat['datatype']
-            avgr = itemgetter(3)
-            item_not_zero_func = lambda x: x[2] > 0
+            avgr = lambda x: 0.0 if x.rc == 0 else x.rt/x.rc
+            # Duplicate the build of items below to avoid using a lambda func
+            # in the main Tag loop. Saves a few %
             if datatype == 'rating':
-                # eliminate the zero ratings line as well as count == 0
-                item_not_zero_func = (lambda x: x[1] > 0 and x[2] > 0)
                 formatter = (lambda x:u'\u2605'*int(x/2))
-                avgr = itemgetter(1)
+                avgr = lambda x : x.n
+                # eliminate the zero ratings line as well as count == 0
+                items = [v for v in tcategories[category].values() if v.c > 0 and v.n != 0]
             elif category == 'authors':
                 # Clean up the authors strings to human-readable form
                 formatter = (lambda x: x.replace('|', ','))
+                items = [v for v in tcategories[category].values() if v.c > 0]
             else:
                 formatter = (lambda x:unicode(x))
+                items = [v for v in tcategories[category].values() if v.c > 0]
 
-            categories[category] = [Tag(formatter(r[1]), count=r[2], id=r[0],
-                                        avg=avgr(r), sort=r[4], icon=icon,
+            # sort the list
+            if sort == 'name':
+                kf = lambda x: sort_key(x.s) if isinstance(x.s, unicode) else x.s
+                reverse=False
+            elif sort == 'popularity':
+                kf = lambda x: x.c
+                reverse=True
+            else:
+                kf = avgr
+                reverse=True
+            items.sort(key=kf, reverse=reverse)
+
+            categories[category] = [Tag(formatter(r.n), count=r.c, id=r.id,
+                                        avg=avgr(r), sort=r.s, icon=icon,
                                         tooltip=tooltip, category=category)
-                                    for r in data if item_not_zero_func(r)]
+                                    for r in items]
+
+        print 'end phase "tags list":', time.time() - start, 'seconds'
 
         # Needed for legacy databases that have multiple ratings that
         # map to n stars
@@ -1189,8 +1333,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
                 icon_map['search'] = icon_map['search']
             categories['search'] = items
 
+        t = time.time() - start
+        print 'get_categories ran in:', t, 'seconds'
+
         return categories
 
+    ############# End get_categories
+
     def tags_older_than(self, tag, delta):
         tag = tag.lower().strip()
         now = nowf()
@@ -1486,6 +1635,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
     # Note: we generally do not need to refresh_ids because library_view will
     # refresh everything.
 
+    def get_ratings_with_ids(self):
+        result = self.conn.get('SELECT id,rating FROM ratings')
+        if not result:
+            return []
+        return result
+
     def dirty_books_referencing(self, field, id, commit=True):
         # Get the list of books to dirty -- all books that reference the item
         table = self.field_metadata[field]['table']