Improved get_categories -- approximately 6 times faster

2025-08-11 09:13:57 -04:00 · 2010-12-14 14:11:10 +00:00 · 2010-12-14 14:11:10 +00:00 · d2ff37b5a1
commit d2ff37b5a1
parent f1911aa270
1 changed files with 187 additions and 32 deletions
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -14,7 +14,7 @@ from operator import itemgetter
 from PyQt4.QtGui import QImage
-
+from calibre import prints
 from calibre.ebooks.metadata import title_sort, author_to_author_sort
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.library.database import LibraryDatabase
@ -1039,43 +1039,170 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
                    tn=field['table'], col=field['link_column']), (id_,))
        return set(x[0] for x in ans)
 ########## data structures for get_categories
    CATEGORY_SORTS = ('name', 'popularity', 'rating')
-    def get_categories(self, sort='name', ids=None, icon_map=None):
+    class TCat_Tag(object):
        self.books_list_filter.change([] if not ids else ids)
-        categories = {}
+        def __init__(self, name, sort):
            self.n = name
            self.s = sort
            self.c  = 0
            self.rt = 0
            self.rc = 0
            self.id = None
        def set_all(self, c, rt, rc, id):
            self.c  = c
            self.rt = rt
            self.rc = rc
            self.id = id
        def __str__(self):
            return unicode(self)
        def __unicode__(self):
            return 'n=%s s=%s c=%d rt=%d rc=%d id=%s'%\
                            (self.n, self.s, self.c, self.rt, self.rc, self.id)
    def get_categories(self, sort='name', ids=None, icon_map=None):
        start = time.time()
        if icon_map is not None and type(icon_map) != TagsIcons:
            raise TypeError('icon_map passed to get_categories must be of type TagIcons')
        if sort not in self.CATEGORY_SORTS:
            raise ValueError('sort ' + sort + ' not a valid value')
        self.books_list_filter.change([] if not ids else ids)
        id_filter = None if not ids else frozenset(ids)
        tb_cats = self.field_metadata
-        #### First, build the standard and custom-column categories ####
+        tcategories = {}
        tids = {}
        md = []
        # First, build the maps. We need a category->items map and an
        # item -> (item_id, sort_val) map to use in the books loop
        for category in tb_cats.keys():
            cat = tb_cats[category]
-            if not cat['is_category'] or cat['kind'] in ['user', 'search']:
+            if not cat['is_category'] or cat['kind'] in ['user', 'search'] \
                    or category in ['news', 'formats']:
                continue
-            tn = cat['table']
+            # Get the ids for the item values
-            categories[category] = []   #reserve the position in the ordered list
+            if not cat['is_custom']:
-            if tn is None:              # Nothing to do for the moment
+                funcs = {
                        'authors'  : self.get_authors_with_ids,
                        'series'   : self.get_series_with_ids,
                        'publisher': self.get_publishers_with_ids,
                        'tags'     : self.get_tags_with_ids,
                        'rating'   : self.get_ratings_with_ids,
                    }
                func = funcs.get(category, None)
                if func:
                    list = func()
                else:
                    raise ValueError(category + ' has no get with ids function')
            else:
                list = self.get_custom_items_with_ids(label=cat['label'])
            tids[category] = {}
            if category == 'authors':
                for l in list:
                    (id, val, sort_val) = (l[0], l[1], l[2])
                    tids[category][val] = (id, sort_val)
            else:
                for l in list:
                    (id, val) = (l[0], l[1])
                    tids[category][val] = (id, val)
            # add an empty category to the category map
            tcategories[category] = {}
            # create a list of category/field_index for the books scan to use.
            # This saves iterating through field_metadata for each book
            md.append((category, cat['rec_index'], cat['is_multiple']))
        print 'end phase "collection":', time.time() - start, 'seconds'
        # Now scan every book looking for category items.
        # Code below is duplicated because it shaves off 10% of the loop time
        id_dex = self.FIELD_MAP['id']
        rating_dex = self.FIELD_MAP['rating']
        for book in self.data.iterall():
            if id_filter and book[id_dex] not in id_filter:
                continue
-            cn = cat['column']
+            rating = book[rating_dex]
-            if ids is None:
+            # We kept track of all possible category field_map positions above
-                query = '''SELECT id, {0}, count, avg_rating, sort
+            for (cat, dex, mult) in md:
-                           FROM tag_browser_{1}'''.format(cn, tn)
+                if book[dex] is None:
-            else:
+                    continue
-                query = '''SELECT id, {0}, count, avg_rating, sort
+                if not mult:
-                           FROM tag_browser_filtered_{1}'''.format(cn, tn)
+                    val = book[dex]
-            if sort == 'popularity':
+                    try:
-                query += ' ORDER BY count DESC, sort ASC'
+                        (item_id, sort_val) = tids[cat][val] # let exceptions fly
-            elif sort == 'name':
+                        item = tcategories[cat].get(val, None)
-                query += ' ORDER BY sort COLLATE icucollate'
+                        if not item:
-            else:
+                            item = LibraryDatabase2.TCat_Tag(val, sort_val)
-                query += ' ORDER BY avg_rating DESC, sort ASC'
+                            tcategories[cat][val] = item
-            data = self.conn.get(query)
+                        item.c += 1
                        item.id = item_id
                        if rating > 0:
                            item.rt += rating
                            item.rc += 1
                    except:
                        prints('get_categories: item', val, 'is not in', cat, 'list!')
                else:
                    vals = book[dex].split(mult)
                    for val in vals:
                        try:
                            (item_id, sort_val) = tids[cat][val] # let exceptions fly
                            item = tcategories[cat].get(val, None)
                            if not item:
                                item = LibraryDatabase2.TCat_Tag(val, sort_val)
                                tcategories[cat][val] = item
                            item.c += 1
                            item.id = item_id
                            if rating > 0:
                                item.rt += rating
                                item.rc += 1
                        except:
                            prints('get_categories: item', val, 'is not in', cat, 'list!')
        print 'end phase "books":', time.time() - start, 'seconds'
        # Now do news
        tcategories['news'] = {}
        cat = tb_cats['news']
        tn = cat['table']
        cn = cat['column']
        if ids is None:
            query = '''SELECT id, {0}, count, avg_rating, sort
                       FROM tag_browser_{1}'''.format(cn, tn)
        else:
            query = '''SELECT id, {0}, count, avg_rating, sort
                       FROM tag_browser_filtered_{1}'''.format(cn, tn)
        # results will be sorted later
        data = self.conn.get(query)
        for r in data:
            item = LibraryDatabase2.TCat_Tag(r[1], r[1])
            item.set_all(c=r[2], rt=r[2]*r[3], rc=r[2], id=r[0])
            tcategories['news'][r[1]] = item
        print 'end phase "news":', time.time() - start, 'seconds'
        # Build the real category list by iterating over the temporary copy
        # and building the Tag instances.
        categories = {}
        for category in tb_cats.keys():
            if category not in tcategories:
                continue
            cat = tb_cats[category]
            # prepare the place where we will put the array of Tags
            categories[category] = []
            # icon_map is not None if get_categories is to store an icon and
            # possibly a tooltip in the tag structure.
-            icon, tooltip = None, ''
+            icon = None
            tooltip = ''
            label = tb_cats.key_to_label(category)
            if icon_map:
                if not tb_cats.is_custom_field(category):
@ -1087,23 +1214,40 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
                    tooltip = self.custom_column_label_map[label]['name']
            datatype = cat['datatype']
-            avgr = itemgetter(3)
+            avgr = lambda x: 0.0 if x.rc == 0 else x.rt/x.rc
-            item_not_zero_func = lambda x: x[2] > 0
+            # Duplicate the build of items below to avoid using a lambda func
            # in the main Tag loop. Saves a few %
            if datatype == 'rating':
                # eliminate the zero ratings line as well as count == 0
                item_not_zero_func = (lambda x: x[1] > 0 and x[2] > 0)
                formatter = (lambda x:u'\u2605'*int(x/2))
-                avgr = itemgetter(1)
+                avgr = lambda x : x.n
                # eliminate the zero ratings line as well as count == 0
                items = [v for v in tcategories[category].values() if v.c > 0 and v.n != 0]
            elif category == 'authors':
                # Clean up the authors strings to human-readable form
                formatter = (lambda x: x.replace('|', ','))
                items = [v for v in tcategories[category].values() if v.c > 0]
            else:
                formatter = (lambda x:unicode(x))
                items = [v for v in tcategories[category].values() if v.c > 0]
-            categories[category] = [Tag(formatter(r[1]), count=r[2], id=r[0],
+            # sort the list
-                                        avg=avgr(r), sort=r[4], icon=icon,
+            if sort == 'name':
                kf = lambda x: sort_key(x.s) if isinstance(x.s, unicode) else x.s
                reverse=False
            elif sort == 'popularity':
                kf = lambda x: x.c
                reverse=True
            else:
                kf = avgr
                reverse=True
            items.sort(key=kf, reverse=reverse)
            categories[category] = [Tag(formatter(r.n), count=r.c, id=r.id,
                                        avg=avgr(r), sort=r.s, icon=icon,
                                        tooltip=tooltip, category=category)
-                                    for r in data if item_not_zero_func(r)]
+                                    for r in items]
        print 'end phase "tags list":', time.time() - start, 'seconds'
        # Needed for legacy databases that have multiple ratings that
        # map to n stars
@ -1189,8 +1333,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
                icon_map['search'] = icon_map['search']
            categories['search'] = items
        t = time.time() - start
        print 'get_categories ran in:', t, 'seconds'
        return categories
    ############# End get_categories
    def tags_older_than(self, tag, delta):
        tag = tag.lower().strip()
        now = nowf()
@ -1486,6 +1635,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
    # Note: we generally do not need to refresh_ids because library_view will
    # refresh everything.
    def get_ratings_with_ids(self):
        result = self.conn.get('SELECT id,rating FROM ratings')
        if not result:
            return []
        return result
    def dirty_books_referencing(self, field, id, commit=True):
        # Get the list of books to dirty -- all books that reference the item
        table = self.field_metadata[field]['table']