Improved get_categories -- approximately 6 times faster

This commit is contained in:
Charles Haley 2010-12-14 14:11:10 +00:00
parent f1911aa270
commit d2ff37b5a1

View File

@ -14,7 +14,7 @@ from operator import itemgetter
from PyQt4.QtGui import QImage
from calibre import prints
from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.library.database import LibraryDatabase
@ -1039,25 +1039,139 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
tn=field['table'], col=field['link_column']), (id_,))
return set(x[0] for x in ans)
########## data structures for get_categories
CATEGORY_SORTS = ('name', 'popularity', 'rating')
def get_categories(self, sort='name', ids=None, icon_map=None):
self.books_list_filter.change([] if not ids else ids)
class TCat_Tag(object):
categories = {}
def __init__(self, name, sort):
self.n = name
self.s = sort
self.c = 0
self.rt = 0
self.rc = 0
self.id = None
def set_all(self, c, rt, rc, id):
self.c = c
self.rt = rt
self.rc = rc
self.id = id
def __str__(self):
return unicode(self)
def __unicode__(self):
return 'n=%s s=%s c=%d rt=%d rc=%d id=%s'%\
(self.n, self.s, self.c, self.rt, self.rc, self.id)
def get_categories(self, sort='name', ids=None, icon_map=None):
start = time.time()
if icon_map is not None and type(icon_map) != TagsIcons:
raise TypeError('icon_map passed to get_categories must be of type TagIcons')
if sort not in self.CATEGORY_SORTS:
raise ValueError('sort ' + sort + ' not a valid value')
self.books_list_filter.change([] if not ids else ids)
id_filter = None if not ids else frozenset(ids)
tb_cats = self.field_metadata
#### First, build the standard and custom-column categories ####
tcategories = {}
tids = {}
md = []
# First, build the maps. We need a category->items map and an
# item -> (item_id, sort_val) map to use in the books loop
for category in tb_cats.keys():
cat = tb_cats[category]
if not cat['is_category'] or cat['kind'] in ['user', 'search']:
if not cat['is_category'] or cat['kind'] in ['user', 'search'] \
or category in ['news', 'formats']:
continue
# Get the ids for the item values
if not cat['is_custom']:
funcs = {
'authors' : self.get_authors_with_ids,
'series' : self.get_series_with_ids,
'publisher': self.get_publishers_with_ids,
'tags' : self.get_tags_with_ids,
'rating' : self.get_ratings_with_ids,
}
func = funcs.get(category, None)
if func:
list = func()
else:
raise ValueError(category + ' has no get with ids function')
else:
list = self.get_custom_items_with_ids(label=cat['label'])
tids[category] = {}
if category == 'authors':
for l in list:
(id, val, sort_val) = (l[0], l[1], l[2])
tids[category][val] = (id, sort_val)
else:
for l in list:
(id, val) = (l[0], l[1])
tids[category][val] = (id, val)
# add an empty category to the category map
tcategories[category] = {}
# create a list of category/field_index for the books scan to use.
# This saves iterating through field_metadata for each book
md.append((category, cat['rec_index'], cat['is_multiple']))
print 'end phase "collection":', time.time() - start, 'seconds'
# Now scan every book looking for category items.
# Code below is duplicated because it shaves off 10% of the loop time
id_dex = self.FIELD_MAP['id']
rating_dex = self.FIELD_MAP['rating']
for book in self.data.iterall():
if id_filter and book[id_dex] not in id_filter:
continue
rating = book[rating_dex]
# We kept track of all possible category field_map positions above
for (cat, dex, mult) in md:
if book[dex] is None:
continue
if not mult:
val = book[dex]
try:
(item_id, sort_val) = tids[cat][val] # let exceptions fly
item = tcategories[cat].get(val, None)
if not item:
item = LibraryDatabase2.TCat_Tag(val, sort_val)
tcategories[cat][val] = item
item.c += 1
item.id = item_id
if rating > 0:
item.rt += rating
item.rc += 1
except:
prints('get_categories: item', val, 'is not in', cat, 'list!')
else:
vals = book[dex].split(mult)
for val in vals:
try:
(item_id, sort_val) = tids[cat][val] # let exceptions fly
item = tcategories[cat].get(val, None)
if not item:
item = LibraryDatabase2.TCat_Tag(val, sort_val)
tcategories[cat][val] = item
item.c += 1
item.id = item_id
if rating > 0:
item.rt += rating
item.rc += 1
except:
prints('get_categories: item', val, 'is not in', cat, 'list!')
print 'end phase "books":', time.time() - start, 'seconds'
# Now do news
tcategories['news'] = {}
cat = tb_cats['news']
tn = cat['table']
categories[category] = [] #reserve the position in the ordered list
if tn is None: # Nothing to do for the moment
continue
cn = cat['column']
if ids is None:
query = '''SELECT id, {0}, count, avg_rating, sort
@ -1065,17 +1179,30 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
else:
query = '''SELECT id, {0}, count, avg_rating, sort
FROM tag_browser_filtered_{1}'''.format(cn, tn)
if sort == 'popularity':
query += ' ORDER BY count DESC, sort ASC'
elif sort == 'name':
query += ' ORDER BY sort COLLATE icucollate'
else:
query += ' ORDER BY avg_rating DESC, sort ASC'
# results will be sorted later
data = self.conn.get(query)
for r in data:
item = LibraryDatabase2.TCat_Tag(r[1], r[1])
item.set_all(c=r[2], rt=r[2]*r[3], rc=r[2], id=r[0])
tcategories['news'][r[1]] = item
print 'end phase "news":', time.time() - start, 'seconds'
# Build the real category list by iterating over the temporary copy
# and building the Tag instances.
categories = {}
for category in tb_cats.keys():
if category not in tcategories:
continue
cat = tb_cats[category]
# prepare the place where we will put the array of Tags
categories[category] = []
# icon_map is not None if get_categories is to store an icon and
# possibly a tooltip in the tag structure.
icon, tooltip = None, ''
icon = None
tooltip = ''
label = tb_cats.key_to_label(category)
if icon_map:
if not tb_cats.is_custom_field(category):
@ -1087,23 +1214,40 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
tooltip = self.custom_column_label_map[label]['name']
datatype = cat['datatype']
avgr = itemgetter(3)
item_not_zero_func = lambda x: x[2] > 0
avgr = lambda x: 0.0 if x.rc == 0 else x.rt/x.rc
# Duplicate the build of items below to avoid using a lambda func
# in the main Tag loop. Saves a few %
if datatype == 'rating':
# eliminate the zero ratings line as well as count == 0
item_not_zero_func = (lambda x: x[1] > 0 and x[2] > 0)
formatter = (lambda x:u'\u2605'*int(x/2))
avgr = itemgetter(1)
avgr = lambda x : x.n
# eliminate the zero ratings line as well as count == 0
items = [v for v in tcategories[category].values() if v.c > 0 and v.n != 0]
elif category == 'authors':
# Clean up the authors strings to human-readable form
formatter = (lambda x: x.replace('|', ','))
items = [v for v in tcategories[category].values() if v.c > 0]
else:
formatter = (lambda x:unicode(x))
items = [v for v in tcategories[category].values() if v.c > 0]
categories[category] = [Tag(formatter(r[1]), count=r[2], id=r[0],
avg=avgr(r), sort=r[4], icon=icon,
# sort the list
if sort == 'name':
kf = lambda x: sort_key(x.s) if isinstance(x.s, unicode) else x.s
reverse=False
elif sort == 'popularity':
kf = lambda x: x.c
reverse=True
else:
kf = avgr
reverse=True
items.sort(key=kf, reverse=reverse)
categories[category] = [Tag(formatter(r.n), count=r.c, id=r.id,
avg=avgr(r), sort=r.s, icon=icon,
tooltip=tooltip, category=category)
for r in data if item_not_zero_func(r)]
for r in items]
print 'end phase "tags list":', time.time() - start, 'seconds'
# Needed for legacy databases that have multiple ratings that
# map to n stars
@ -1189,8 +1333,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
icon_map['search'] = icon_map['search']
categories['search'] = items
t = time.time() - start
print 'get_categories ran in:', t, 'seconds'
return categories
############# End get_categories
def tags_older_than(self, tag, delta):
tag = tag.lower().strip()
now = nowf()
@ -1486,6 +1635,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
# Note: we generally do not need to refresh_ids because library_view will
# refresh everything.
def get_ratings_with_ids(self):
result = self.conn.get('SELECT id,rating FROM ratings')
if not result:
return []
return result
def dirty_books_referencing(self, field, id, commit=True):
# Get the list of books to dirty -- all books that reference the item
table = self.field_metadata[field]['table']