Speed up get_categories()

This commit is contained in:
Kovid Goyal 2010-12-14 10:55:48 -07:00
commit 9548f98035

View File

@ -10,11 +10,10 @@ import os, sys, shutil, cStringIO, glob, time, functools, traceback, re
from itertools import repeat from itertools import repeat
from math import floor from math import floor
from Queue import Queue from Queue import Queue
from operator import itemgetter
from PyQt4.QtGui import QImage from PyQt4.QtGui import QImage
from calibre import prints
from calibre.ebooks.metadata import title_sort, author_to_author_sort from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.library.database import LibraryDatabase from calibre.library.database import LibraryDatabase
@ -1039,25 +1038,142 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
tn=field['table'], col=field['link_column']), (id_,)) tn=field['table'], col=field['link_column']), (id_,))
return set(x[0] for x in ans) return set(x[0] for x in ans)
########## data structures for get_categories
CATEGORY_SORTS = ('name', 'popularity', 'rating') CATEGORY_SORTS = ('name', 'popularity', 'rating')
def get_categories(self, sort='name', ids=None, icon_map=None): class TCat_Tag(object):
self.books_list_filter.change([] if not ids else ids)
categories = {} def __init__(self, name, sort):
self.n = name
self.s = sort
self.c = 0
self.rt = 0
self.rc = 0
self.id = None
def set_all(self, c, rt, rc, id):
self.c = c
self.rt = rt
self.rc = rc
self.id = id
def __str__(self):
return unicode(self)
def __unicode__(self):
return 'n=%s s=%s c=%d rt=%d rc=%d id=%s'%\
(self.n, self.s, self.c, self.rt, self.rc, self.id)
def get_categories(self, sort='name', ids=None, icon_map=None):
start = last = time.time()
if icon_map is not None and type(icon_map) != TagsIcons: if icon_map is not None and type(icon_map) != TagsIcons:
raise TypeError('icon_map passed to get_categories must be of type TagIcons') raise TypeError('icon_map passed to get_categories must be of type TagIcons')
if sort not in self.CATEGORY_SORTS:
raise ValueError('sort ' + sort + ' not a valid value')
self.books_list_filter.change([] if not ids else ids)
id_filter = None if not ids else frozenset(ids)
tb_cats = self.field_metadata tb_cats = self.field_metadata
#### First, build the standard and custom-column categories #### tcategories = {}
tids = {}
md = []
# First, build the maps. We need a category->items map and an
# item -> (item_id, sort_val) map to use in the books loop
for category in tb_cats.keys(): for category in tb_cats.keys():
cat = tb_cats[category] cat = tb_cats[category]
if not cat['is_category'] or cat['kind'] in ['user', 'search']: if not cat['is_category'] or cat['kind'] in ['user', 'search'] \
or category in ['news', 'formats']:
continue continue
# Get the ids for the item values
if not cat['is_custom']:
funcs = {
'authors' : self.get_authors_with_ids,
'series' : self.get_series_with_ids,
'publisher': self.get_publishers_with_ids,
'tags' : self.get_tags_with_ids,
'rating' : self.get_ratings_with_ids,
}
func = funcs.get(category, None)
if func:
list = func()
else:
raise ValueError(category + ' has no get with ids function')
else:
list = self.get_custom_items_with_ids(label=cat['label'])
tids[category] = {}
if category == 'authors':
for l in list:
(id, val, sort_val) = (l[0], l[1], l[2])
tids[category][val] = (id, sort_val)
else:
for l in list:
(id, val) = (l[0], l[1])
tids[category][val] = (id, val)
# add an empty category to the category map
tcategories[category] = {}
# create a list of category/field_index for the books scan to use.
# This saves iterating through field_metadata for each book
md.append((category, cat['rec_index'], cat['is_multiple']))
print 'end phase "collection":', time.time() - last, 'seconds'
last = time.time()
# Now scan every book looking for category items.
# Code below is duplicated because it shaves off 10% of the loop time
id_dex = self.FIELD_MAP['id']
rating_dex = self.FIELD_MAP['rating']
tag_class = LibraryDatabase2.TCat_Tag
for book in self.data.iterall():
if id_filter and book[id_dex] not in id_filter:
continue
rating = book[rating_dex]
# We kept track of all possible category field_map positions above
for (cat, dex, mult) in md:
if book[dex] is None:
continue
if not mult:
val = book[dex]
try:
(item_id, sort_val) = tids[cat][val] # let exceptions fly
item = tcategories[cat].get(val, None)
if not item:
item = tag_class(val, sort_val)
tcategories[cat][val] = item
item.c += 1
item.id = item_id
if rating > 0:
item.rt += rating
item.rc += 1
except:
prints('get_categories: item', val, 'is not in', cat, 'list!')
else:
vals = book[dex].split(mult)
for val in vals:
try:
(item_id, sort_val) = tids[cat][val] # let exceptions fly
item = tcategories[cat].get(val, None)
if not item:
item = tag_class(val, sort_val)
tcategories[cat][val] = item
item.c += 1
item.id = item_id
if rating > 0:
item.rt += rating
item.rc += 1
except:
prints('get_categories: item', val, 'is not in', cat, 'list!')
print 'end phase "books":', time.time() - last, 'seconds'
last = time.time()
# Now do news
tcategories['news'] = {}
cat = tb_cats['news']
tn = cat['table'] tn = cat['table']
categories[category] = [] #reserve the position in the ordered list
if tn is None: # Nothing to do for the moment
continue
cn = cat['column'] cn = cat['column']
if ids is None: if ids is None:
query = '''SELECT id, {0}, count, avg_rating, sort query = '''SELECT id, {0}, count, avg_rating, sort
@ -1065,17 +1181,32 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
else: else:
query = '''SELECT id, {0}, count, avg_rating, sort query = '''SELECT id, {0}, count, avg_rating, sort
FROM tag_browser_filtered_{1}'''.format(cn, tn) FROM tag_browser_filtered_{1}'''.format(cn, tn)
if sort == 'popularity': # results will be sorted later
query += ' ORDER BY count DESC, sort ASC'
elif sort == 'name':
query += ' ORDER BY sort COLLATE icucollate'
else:
query += ' ORDER BY avg_rating DESC, sort ASC'
data = self.conn.get(query) data = self.conn.get(query)
for r in data:
item = LibraryDatabase2.TCat_Tag(r[1], r[1])
item.set_all(c=r[2], rt=r[2]*r[3], rc=r[2], id=r[0])
tcategories['news'][r[1]] = item
print 'end phase "news":', time.time() - last, 'seconds'
last = time.time()
# Build the real category list by iterating over the temporary copy
# and building the Tag instances.
categories = {}
tag_class = Tag
for category in tb_cats.keys():
if category not in tcategories:
continue
cat = tb_cats[category]
# prepare the place where we will put the array of Tags
categories[category] = []
# icon_map is not None if get_categories is to store an icon and # icon_map is not None if get_categories is to store an icon and
# possibly a tooltip in the tag structure. # possibly a tooltip in the tag structure.
icon, tooltip = None, '' icon = None
tooltip = ''
label = tb_cats.key_to_label(category) label = tb_cats.key_to_label(category)
if icon_map: if icon_map:
if not tb_cats.is_custom_field(category): if not tb_cats.is_custom_field(category):
@ -1087,23 +1218,46 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
tooltip = self.custom_column_label_map[label]['name'] tooltip = self.custom_column_label_map[label]['name']
datatype = cat['datatype'] datatype = cat['datatype']
avgr = itemgetter(3) avgr = lambda x: 0.0 if x.rc == 0 else x.rt/x.rc
item_not_zero_func = lambda x: x[2] > 0 # Duplicate the build of items below to avoid using a lambda func
# in the main Tag loop. Saves a few %
if datatype == 'rating': if datatype == 'rating':
# eliminate the zero ratings line as well as count == 0
item_not_zero_func = (lambda x: x[1] > 0 and x[2] > 0)
formatter = (lambda x:u'\u2605'*int(x/2)) formatter = (lambda x:u'\u2605'*int(x/2))
avgr = itemgetter(1) avgr = lambda x : x.n
# eliminate the zero ratings line as well as count == 0
items = [v for v in tcategories[category].values() if v.c > 0 and v.n != 0]
elif category == 'authors': elif category == 'authors':
# Clean up the authors strings to human-readable form # Clean up the authors strings to human-readable form
formatter = (lambda x: x.replace('|', ',')) formatter = (lambda x: x.replace('|', ','))
items = [v for v in tcategories[category].values() if v.c > 0]
else: else:
formatter = (lambda x:unicode(x)) formatter = (lambda x:unicode(x))
items = [v for v in tcategories[category].values() if v.c > 0]
categories[category] = [Tag(formatter(r[1]), count=r[2], id=r[0], # sort the list
avg=avgr(r), sort=r[4], icon=icon, if sort == 'name':
def get_sort_key(x):
sk = x.s
if isinstance(sk, unicode):
sk = sort_key(sk)
return sk
kf = get_sort_key
reverse=False
elif sort == 'popularity':
kf = lambda x: x.c
reverse=True
else:
kf = avgr
reverse=True
items.sort(key=kf, reverse=reverse)
categories[category] = [tag_class(formatter(r.n), count=r.c, id=r.id,
avg=avgr(r), sort=r.s, icon=icon,
tooltip=tooltip, category=category) tooltip=tooltip, category=category)
for r in data if item_not_zero_func(r)] for r in items]
print 'end phase "tags list":', time.time() - last, 'seconds'
last = time.time()
# Needed for legacy databases that have multiple ratings that # Needed for legacy databases that have multiple ratings that
# map to n stars # map to n stars
@ -1189,8 +1343,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
icon_map['search'] = icon_map['search'] icon_map['search'] = icon_map['search']
categories['search'] = items categories['search'] = items
print 'last phase ran in:', time.time() - last, 'seconds'
print 'get_categories ran in:', time.time() - start, 'seconds'
return categories return categories
############# End get_categories
def tags_older_than(self, tag, delta): def tags_older_than(self, tag, delta):
tag = tag.lower().strip() tag = tag.lower().strip()
now = nowf() now = nowf()
@ -1486,6 +1645,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
# Note: we generally do not need to refresh_ids because library_view will # Note: we generally do not need to refresh_ids because library_view will
# refresh everything. # refresh everything.
def get_ratings_with_ids(self):
result = self.conn.get('SELECT id,rating FROM ratings')
if not result:
return []
return result
def dirty_books_referencing(self, field, id, commit=True): def dirty_books_referencing(self, field, id, commit=True):
# Get the list of books to dirty -- all books that reference the item # Get the list of books to dirty -- all books that reference the item
table = self.field_metadata[field]['table'] table = self.field_metadata[field]['table']