Much faster sorting code

This commit is contained in:
Kovid Goyal 2010-09-11 19:11:30 -06:00
parent e531b51767
commit 7382552d18

View File

@ -607,16 +607,22 @@ class ResultCache(SearchQueryParser):
y = UNDEFINED_DATE
return cmp(x, y)
if subsort and ans == 0:
return cmp(self._data[x][11].lower(), self._data[y][11].lower())
idx = self.FIELD_MAP['sort']
return cmp(self._data[x][idx].lower(), self._data[y][idx].lower())
return ans
def sort(self, field, ascending, subsort=False):
def sanitize_field_name(self, field):
field = field.lower().strip()
if field in ('author', 'tag', 'comment'):
field += 's'
if field == 'date': field = 'timestamp'
elif field == 'title': field = 'sort'
elif field == 'authors': field = 'author_sort'
if field not in self.field_metadata.iterkeys():
if field in ('author', 'tag', 'comment'):
field += 's'
if field == 'date': field = 'timestamp'
elif field == 'title': field = 'sort'
elif field == 'authors': field = 'author_sort'
return field
def sort(self, field, ascending, subsort=False):
field = self.sanitize_field_name(field)
as_string = field not in ('size', 'rating', 'timestamp')
if self.first_sort:
@ -643,6 +649,164 @@ class ResultCache(SearchQueryParser):
self._map.sort(cmp=fcmp, reverse=not ascending)
self._map_filtered = [id for id in self._map if id in self._map_filtered]
def multisort(self, fields=[], subsort=False):
fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields]
if subsort and 'sort' not in [x[0] for x in fields]:
fields += [('sort', True)]
if not fields:
fields = [('timestamp', False)]
keys = self.field_metadata.keys()
for f, order in fields:
if f not in keys:
raise ValueError(f + ' not an existing field name')
keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
if len(fields) == 1:
self._map.sort(key=keyg, reverse=not fields[0][1])
else:
self._map.sort(key=keyg)
self._map_filtered = [id for id in self._map if id in self._map_filtered]
class SortKey(object):
def __init__(self, orders, values):
self.orders, self.values = orders, values
def __cmp__(self, other):
for i, ascending in enumerate(self.orders):
ans = cmp(self.values[i], other.values[i])
if ans != 0:
if not ascending:
ans *= -1
return ans
return 0
class SortKeyGenerator(object):
def __init__(self, fields, field_metadata, data):
self.field_metadata = field_metadata
self.orders = [x[1] for x in fields]
self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
self.library_order = tweaks['title_series_sorting'] == 'library_order'
self.data = data
def __call__(self, record):
values = tuple(self.itervals(self.data[record]))
if len(values) == 1:
return values[0]
return SortKey(self.orders, values)
def itervals(self, record):
for name, fm in self.entries:
dt = fm['datatype']
val = record[fm['rec_index']]
if dt == 'datetime':
if val is None:
val = UNDEFINED_DATE
elif dt == 'series':
if val is None:
val = ('', 1)
else:
val = val.lower()
if self.library_order:
val = title_sort(val)
sidx_fm = self.field_metadata[name + '_index']
sidx = record[sidx_fm['rec_index']]
val = (val, sidx)
elif dt in ('text', 'comments'):
if val is None:
val = ''
val = val.lower()
yield val
# }}}
if __name__ == '__main__':
# Testing.timing for new multi-sort {{{
import time
from calibre.library import db
db = db()
db.refresh()
fields = db.field_metadata.keys()
print fields
def do_single_sort(meth, field, order):
if meth == 'old':
db.data.sort(field, order)
else:
db.data.multisort([(field, order)])
def test_single_sort(field):
for meth in ('old', 'new'):
ttime = 0
NUM = 10
asc = desc = None
for i in range(NUM):
db.data.sort('id', False)
st = time.time()
do_single_sort(meth, field, True)
asc = db.data._map
do_single_sort(meth, field, False)
desc = db.data._map
ttime += time.time() - st
yield (ttime/NUM, asc, desc)
print 'Running single sort differentials'
for field in fields:
if field in ('search', 'id', 'news', 'flags'): continue
print '\t', field
old, new = test_single_sort(field)
if old[1] != new[1] or old[2] != new[2]:
print '\t\t', 'Sort failure!'
raise SystemExit(1)
print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
def do_multi_sort(meth, ms):
if meth == 'new':
db.data.multisort(ms)
else:
for s in reversed(ms):
db.data.sort(*s)
def test_multi_sort(ms):
for meth in ('old', 'new'):
ttime = 0
NUM = 10
for i in range(NUM):
db.data.sort('id', False)
st = time.time()
do_multi_sort(meth, ms)
ttime += time.time() - st
yield (ttime/NUM, db.data._map)
print 'Running multi-sort differentials'
for ms in [
[('timestamp', False), ('author', True), ('title', False)],
[('size', True), ('tags', True), ('author', False)],
[('series', False), ('title', True)],
[('size', True), ('tags', True), ('author', False), ('pubdate',
True), ('tags', False), ('formats', False), ('uuid', True)],
]:
print '\t', ms
db.data.sort('id', False)
old, new = test_multi_sort(ms)
if old[1] != new[1]:
print '\t\t', 'Sort failure!'
raise SystemExit()
print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
# }}}