newdb: Speed up multisort() by lazy evaluation

The sort keys for the sub-sorted columns are now only evaluated on demand.
The common case is that the keys from the first column will differ;
therefore we can speed up sorting on average by delaying evaluation of
the sort keys for the second and later columns, only evaluating them
when the first-column sort keys of the two books being compared are equal.
The gains will be particularly noticeable for columns where calculating
the sort key is expensive, like composite columns or the series column.
This commit is contained in:
Kovid Goyal 2013-08-28 15:58:32 +05:30
parent 20817c3992
commit f1d71a068e
4 changed files with 170 additions and 106 deletions

View File

@ -10,7 +10,8 @@ __docformat__ = 'restructuredtext en'
import os, traceback, random, shutil, re
from io import BytesIO
from collections import defaultdict
from functools import wraps, partial
from functools import wraps
from future_builtins import zip
from calibre import isbytestring
from calibre.constants import iswindows, preferred_encoding
@ -19,7 +20,7 @@ from calibre.db import SPOOL_SIZE, _get_next_series_num_for_list
from calibre.db.categories import get_categories
from calibre.db.locking import create_locks
from calibre.db.errors import NoSuchFormat
from calibre.db.fields import create_field
from calibre.db.fields import create_field, IDENTITY
from calibre.db.search import Search
from calibre.db.tables import VirtualTable
from calibre.db.write import get_series_values
@ -804,42 +805,59 @@ class Cache(object):
ascending=True or False). The most significant field is the first
2-tuple.
'''
all_book_ids = frozenset(self._all_book_ids() if ids_to_sort is None
else ids_to_sort)
ids_to_sort = all_book_ids if ids_to_sort is None else ids_to_sort
ids_to_sort = self._all_book_ids() if ids_to_sort is None else ids_to_sort
get_metadata = self._get_proxy_metadata
lang_map = self.fields['languages'].book_value_map
virtual_fields = virtual_fields or {}
fm = {'title':'sort', 'authors':'author_sort'}
def sort_key(field):
'Handle series type fields'
def sort_key_func(field):
'Handle series type fields, virtual fields and the id field'
idx = field + '_index'
is_series = idx in self.fields
try:
ans = self.fields[fm.get(field, field)].sort_keys_for_books(
get_metadata, lang_map, all_book_ids)
func = self.fields[fm.get(field, field)].sort_keys_for_books(get_metadata, lang_map)
except KeyError:
if field == 'id':
ans = {bid:bid for bid in all_book_ids}
return IDENTITY
else:
ans = virtual_fields[fm.get(field, field)].sort_keys_for_books(
get_metadata, lang_map, all_book_ids)
return virtual_fields[fm.get(field, field)].sort_keys_for_books(get_metadata, lang_map)
if is_series:
idx_ans = self.fields[idx].sort_keys_for_books(
get_metadata, lang_map, all_book_ids)
ans = {k:(v, idx_ans[k]) for k, v in ans.iteritems()}
return ans
idx_func = self.fields[idx].sort_keys_for_books(get_metadata, lang_map)
def skf(book_id):
return (func(book_id), idx_func(book_id))
return skf
return func
sort_keys = tuple(sort_key(field[0]) for field in fields)
if len(fields) == 1:
return sorted(ids_to_sort, key=sort_key_func(fields[0][0]),
reverse=not fields[0][1])
sort_key_funcs = tuple(sort_key_func(field) for field, order in fields)
orders = tuple(1 if order else -1 for _, order in fields)
Lazy = object() # Lazy load the sort keys for sub-sort fields
if len(sort_keys) == 1:
sk = sort_keys[0]
return sorted(ids_to_sort, key=lambda i:sk[i], reverse=not
fields[0][1])
else:
return sorted(ids_to_sort, key=partial(SortKey, fields, sort_keys))
# Per-book sort key object handed to sorted(); two instances are compared
# column by column via __cmp__. It relies on three names from the enclosing
# scope: Lazy (the "not computed yet" sentinel), sort_key_funcs (one key
# function per sort column) and orders (1 for ascending, -1 for descending).
class SortKey(object):
__slots__ = ('book_id', 'sort_key')
def __init__(self, book_id):
self.book_id = book_id
# Calculate only the first sub-sort key since that will always be used
self.sort_key = [key(book_id) if i == 0 else Lazy for i, key in enumerate(sort_key_funcs)]
def __cmp__(self, other):
# Walk the columns in the order they were given; the first column whose
# keys are not equal decides the result, so later columns are never touched
# unless every earlier column compared equal.
for i, (order, self_key, other_key) in enumerate(zip(orders, self.sort_key, other.sort_key)):
# A Lazy placeholder means this column's key has not been computed for
# that book yet: compute it now and store it back in the list so that
# subsequent comparisons involving the same book reuse it.
if self_key is Lazy:
self_key = self.sort_key[i] = sort_key_funcs[i](self.book_id)
if other_key is Lazy:
other_key = other.sort_key[i] = sort_key_funcs[i](other.book_id)
ans = cmp(self_key, other_key)
if ans != 0:
# Multiplying by -1 flips the comparison for descending columns
return ans * order
# Every column compared equal
return 0
return sorted(ids_to_sort, key=SortKey)
@read_api
def search(self, query, restriction='', virtual_fields=None, book_ids=None):
@ -1713,17 +1731,3 @@ class Cache(object):
# }}}
class SortKey(object):  # {{{
    '''
    Eagerly evaluated multi-column sort key for one book.

    fields is a sequence of (name, ascending) 2-tuples, sort_keys is a
    parallel sequence of book_id -> sort key mappings. Instances are ordered
    column by column, honouring the per-column direction.
    '''

    def __init__(self, fields, sort_keys, book_id):
        directions = []
        for spec in fields:
            directions.append(1 if spec[1] else -1)
        self.orders = tuple(directions)
        self.sort_key = tuple(mapping[book_id] for mapping in sort_keys)

    def __cmp__(self, other):
        for position, direction in enumerate(self.orders):
            result = cmp(self.sort_key[position], other.sort_key[position])
            if result != 0:
                # The first column that differs decides; flip it when the
                # column is sorted descending.
                return result * direction
        return 0
# }}}

View File

@ -25,6 +25,8 @@ from calibre.utils.localization import calibre_langcode_to_name
def bool_sort_key(bools_are_tristate):
    '''
    Return a function that maps a boolean column value to its sort rank.

    True ranks 1 and False ranks 2. None, and any value that is not one of
    True/False/None, ranks 3 when bools_are_tristate is true (so undefined
    values sort after False) and 2 otherwise (so they sort together with
    False).
    '''
    none_rank = 3 if bools_are_tristate else 2
    # Build the lookup table once per key function rather than once per
    # call: the returned function is invoked for every book being sorted.
    rank_of = {True: 1, False: 2, None: none_rank}.get
    return lambda x: rank_of(x, none_rank)
IDENTITY = lambda x: x
class Field(object):
is_many = False
@ -36,7 +38,7 @@ class Field(object):
dt = self.metadata['datatype']
self.has_text_data = dt in {'text', 'comments', 'series', 'enumeration'}
self.table_type = self.table.table_type
self._sort_key = (sort_key if dt in ('text', 'series', 'enumeration') else lambda x: x)
self._sort_key = (sort_key if dt in ('text', 'series', 'enumeration') else IDENTITY)
# This will be compared to the output of sort_key() which is a
# bytestring, therefore it is safer to have it be a bytestring.
@ -112,12 +114,11 @@ class Field(object):
'''
return iter(())
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
def sort_keys_for_books(self, get_metadata, lang_map):
'''
Return a mapping of book_id -> sort_key. The sort key is suitable for
Return a function that maps book_id to sort_key. The sort key is suitable for
use in sorting the list of all books by this field, via the python cmp
method. all_book_ids is the list/set of book ids for which sort_keys
should be generated.
method.
'''
raise NotImplementedError()
@ -165,9 +166,13 @@ class OneToOneField(Field):
def __iter__(self):
return self.table.book_col_map.iterkeys()
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
return {id_: self._sort_key(self.table.book_col_map.get(id_,
self._default_sort_key)) for id_ in all_book_ids}
def sort_keys_for_books(self, get_metadata, lang_map):
    '''
    Return a function mapping a book_id to its sort key for this field.

    Books with no entry in the table's book_col_map get the field's default
    sort key as their value. get_metadata and lang_map are not used here.
    '''
    value_for_book = self.table.book_col_map.get
    fallback = self._default_sort_key
    transform = self._sort_key

    if transform is IDENTITY:
        # Nothing to transform: avoid one function call per book
        def key_for_book(book_id):
            return value_for_book(book_id, fallback)
    else:
        def key_for_book(book_id):
            return transform(value_for_book(book_id, fallback))
    return key_for_book
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.book_col_map
@ -263,9 +268,12 @@ class CompositeField(OneToOneField):
self._render_cache[book_id] = ans
return ans
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
return {id_: self._sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in
all_book_ids}
def sort_keys_for_books(self, get_metadata, lang_map):
    '''
    Return a function mapping a book_id to its sort key for this field.

    The value for a book is obtained from get_value_with_cache(), which is
    passed get_metadata. lang_map is not used here.
    '''
    cached_value = self.get_value_with_cache
    transform = self._sort_key

    if transform is IDENTITY:
        # Nothing to transform: avoid one function call per book
        def key_for_book(book_id):
            return cached_value(book_id, get_metadata)
    else:
        def key_for_book(book_id):
            return transform(cached_value(book_id, get_metadata))
    return key_for_book
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
val_map = defaultdict(set)
@ -362,9 +370,8 @@ class OnDeviceField(OneToOneField):
def __iter__(self):
return iter(())
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
return {id_: self.for_book(id_) for id_ in
all_book_ids}
def sort_keys_for_books(self, get_metadata, lang_map):
    '''
    Return a function mapping a book_id to its sort key for this field.

    The bound for_book method is itself used as the key function;
    get_metadata and lang_map are not used here.
    '''
    key_for_book = self.for_book
    return key_for_book
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
val_map = defaultdict(set)
@ -373,6 +380,27 @@ class OnDeviceField(OneToOneField):
for val, book_ids in val_map.iteritems():
yield val, book_ids
class LazySortMap(object):
    '''
    Callable that maps an item id to its sort key, computing keys on first
    use and remembering them.

    The key for an item is sort_key_func(id_map[item_id]). An item id of
    None maps to default_sort_key from the start; if a KeyError is raised
    while computing a key (for example the id is missing from id_map),
    default_sort_key is used and remembered for that id as well.
    '''

    __slots__ = ('default_sort_key', 'sort_key_func', 'id_map', 'cache')

    def __init__(self, default_sort_key, sort_key_func, id_map):
        self.default_sort_key = default_sort_key
        self.sort_key_func = sort_key_func
        self.id_map = id_map
        self.cache = {None:default_sort_key}

    def __call__(self, item_id):
        memo = self.cache
        try:
            return memo[item_id]
        except KeyError:
            pass  # not computed yet, fall through
        try:
            computed = self.sort_key_func(self.id_map[item_id])
        except KeyError:
            computed = self.default_sort_key
        memo[item_id] = computed
        return computed
class ManyToOneField(Field):
is_many = True
@ -397,13 +425,10 @@ class ManyToOneField(Field):
def __iter__(self):
return self.table.id_map.iterkeys()
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
ans = {id_: self.table.book_col_map.get(id_, None)
for id_ in all_book_ids}
sk_map = {cid: (self._default_sort_key if cid is None else
self._sort_key(self.table.id_map[cid]))
for cid in ans.itervalues()}
return {id_: sk_map[cid] for id_, cid in ans.iteritems()}
def sort_keys_for_books(self, get_metadata, lang_map):
    '''
    Return a function mapping a book_id to its sort key for this field.

    The book's item id is looked up in the table's book_col_map (None when
    the book has no entry) and converted to a sort key through a
    LazySortMap, so each item's key is computed at most once per returned
    function. get_metadata and lang_map are not used here.
    '''
    item_sort_key = LazySortMap(self._default_sort_key, self._sort_key, self.table.id_map)
    item_for_book = self.table.book_col_map.get

    def key_for_book(book_id):
        return item_sort_key(item_for_book(book_id, None))
    return key_for_book
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.col_book_map
@ -447,17 +472,17 @@ class ManyToManyField(Field):
def __iter__(self):
return self.table.id_map.iterkeys()
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
ans = {id_: self.table.book_col_map.get(id_, ())
for id_ in all_book_ids}
all_cids = set()
for cids in ans.itervalues():
all_cids = all_cids.union(set(cids))
sk_map = {cid: self._sort_key(self.table.id_map[cid]) for cid in all_cids}
sort_func = (lambda x:tuple(sorted(x))) if self.sort_sort_key else tuple
return {id_: (sort_func(sk_map[cid] for cid in cids) if cids else
(self._default_sort_key,))
for id_, cids in ans.iteritems()}
def sort_keys_for_books(self, get_metadata, lang_map):
    '''
    Return a function mapping a book_id to its sort key for this field.

    The key is a tuple holding the sort key of every item the book has in
    the table's book_col_map; the tuple is sorted when self.sort_sort_key is
    true and otherwise kept in the order the items are stored. A book with
    no items gets a 1-tuple containing the default sort key. get_metadata
    and lang_map are not used here.
    '''
    item_sort_key = LazySortMap(self._default_sort_key, self._sort_key, self.table.id_map)
    items_for_book = self.table.book_col_map.get
    fallback = (self._default_sort_key,)

    # Decide once, up front, how the per-item keys are collected
    if self.sort_sort_key:
        collect = lambda keys: tuple(sorted(keys))
    else:
        collect = tuple

    def key_for_book(book_id):
        item_ids = items_for_book(book_id, ())
        # An empty tuple is falsy, so books without items get the fallback
        return collect(item_sort_key(item_id) for item_id in item_ids) or fallback
    return key_for_book
def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.col_book_map
@ -491,13 +516,11 @@ class IdentifiersField(ManyToManyField):
ids = default_value
return ids
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
def sort_keys_for_books(self, get_metadata, lang_map):
'Sort by identifier keys'
ans = {id_: self.table.book_col_map.get(id_, ())
for id_ in all_book_ids}
return {id_: (tuple(sorted(cids.iterkeys())) if cids else
(self._default_sort_key,))
for id_, cids in ans.iteritems()}
bcmg = self.table.book_col_map.get
dv = {self._default_sort_key:None}
return lambda book_id: tuple(sorted(bcmg(book_id, dv).iterkeys()))
def iter_searchable_values(self, get_metadata, candidates, default_value=()):
bcm = self.table.book_col_map
@ -566,22 +589,43 @@ class FormatsField(ManyToManyField):
ans.append(c)
return ans
class LazySeriesSortMap(object):
    '''
    Callable that maps an (item id, language) pair to a sort key, computing
    keys on first use and remembering them per pair.

    The key is sort_key_func(id_map[item_id], lang). If a KeyError is raised
    while computing it (for example the id is missing from id_map),
    default_sort_key is used and remembered for that pair instead.
    '''

    __slots__ = ('default_sort_key', 'sort_key_func', 'id_map', 'cache')

    def __init__(self, default_sort_key, sort_key_func, id_map):
        self.default_sort_key = default_sort_key
        self.sort_key_func = sort_key_func
        self.id_map = id_map
        self.cache = {}

    def __call__(self, item_id, lang):
        memo = self.cache
        pair = (item_id, lang)
        try:
            return memo[pair]
        except KeyError:
            pass  # not computed yet, fall through
        try:
            computed = self.sort_key_func(self.id_map[item_id], lang)
        except KeyError:
            computed = self.default_sort_key
        memo[pair] = computed
        return computed
class SeriesField(ManyToOneField):
def sort_key_for_series(self, book_id, lang_map, series_sort_order):
sid = self.table.book_col_map.get(book_id, None)
if sid is None:
return self._default_sort_key
lang = lang_map.get(book_id, None) or None
if lang:
lang = lang[0]
return self._sort_key(title_sort(self.table.id_map[sid],
order=series_sort_order, lang=lang))
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
def sort_keys_for_books(self, get_metadata, lang_map):
sso = tweaks['title_series_sorting']
return {book_id:self.sort_key_for_series(book_id, lang_map, sso) for book_id
in all_book_ids}
ssk = self._sort_key
ts = title_sort
def sk(val, lang):
return ssk(ts(val, order=sso, lang=lang))
sk_map = LazySeriesSortMap(self._default_sort_key, sk, self.table.id_map)
bcmg = self.table.book_col_map.get
lang_map = {k:v[0] if v else None for k, v in lang_map.iteritems()}
def key(book_id):
lang = lang_map.get(book_id, None)
return sk_map(bcmg(book_id, None), lang)
return key
def category_sort_value(self, item_id, book_ids, lang_map):
lang = None

View File

@ -126,6 +126,7 @@ class ReadingTest(BaseTest):
def test_sorting(self): # {{{
'Test sorting'
cache = self.init_cache()
ae = self.assertEqual
for field, order in {
'title' : [2, 1, 3],
'authors': [2, 1, 3],
@ -151,49 +152,63 @@ class ReadingTest(BaseTest):
'#comments':[3, 2, 1],
}.iteritems():
x = list(reversed(order))
self.assertEqual(order, cache.multisort([(field, True)],
ae(order, cache.multisort([(field, True)],
ids_to_sort=x),
'Ascending sort of %s failed'%field)
self.assertEqual(x, cache.multisort([(field, False)],
ae(x, cache.multisort([(field, False)],
ids_to_sort=order),
'Descending sort of %s failed'%field)
# Test subsorting
self.assertEqual([3, 2, 1], cache.multisort([('identifiers', True),
('title', True)]), 'Subsort failed')
# Test sorting of is_multiple fields.
# Author like fields should be sorted by generating sort names from the
# actual values in entry order
for field in ('authors', '#authors'):
self.assertEqual(
ae(
cache.set_field(field, {1:('aa bb', 'bb cc', 'cc dd'), 2:('bb aa', 'xx yy'), 3: ('aa bb', 'bb aa')}), {1, 2, 3})
self.assertEqual([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
self.assertEqual([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))
ae([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
ae([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))
# All other is_multiple fields should be sorted by sorting the values
# for each book and using that as the sort key
for field in ('tags', '#tags'):
self.assertEqual(
ae(
cache.set_field(field, {1:('b', 'a'), 2:('c', 'y'), 3: ('b', 'z')}), {1, 2, 3})
self.assertEqual([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
self.assertEqual([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))
ae([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
ae([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))
# Test tweak to sort dates by visible format
from calibre.utils.date import parse_only_date as p
from calibre.utils.config_base import Tweak
self.assertEqual(cache.set_field('pubdate', {1:p('2001-3-3'), 2:p('2002-2-3'), 3:p('2003-1-3')}), {1, 2, 3})
self.assertEqual([1, 2, 3], cache.multisort([('pubdate', True)]))
ae(cache.set_field('pubdate', {1:p('2001-3-3'), 2:p('2002-2-3'), 3:p('2003-1-3')}), {1, 2, 3})
ae([1, 2, 3], cache.multisort([('pubdate', True)]))
with Tweak('gui_pubdate_display_format', 'MMM'), Tweak('sort_dates_using_visible_fields', True):
c2 = self.init_cache()
self.assertEqual([3, 2, 1], c2.multisort([('pubdate', True)]))
ae([3, 2, 1], c2.multisort([('pubdate', True)]))
# Test bool sorting when not tristate
cache.set_pref('bools_are_tristate', False)
c2 = self.init_cache()
self.assertEqual([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)]))
ae([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)]))
# Test subsorting
ae([3, 2, 1], cache.multisort([('identifiers', True),
('title', True)]), 'Subsort failed')
from calibre.ebooks.metadata.book.base import Metadata
for i in xrange(7):
cache.create_book_entry(Metadata('title%d' % i), apply_import_tags=False)
cache.create_custom_column('one', 'CC1', 'int', False)
cache.create_custom_column('two', 'CC2', 'int', False)
cache.create_custom_column('three', 'CC3', 'int', False)
cache.close()
cache = self.init_cache()
cache.set_field('#one', {(i+(5*m)):m for m in (0, 1) for i in xrange(1, 6)})
cache.set_field('#two', {i+(m*3):m for m in (0, 1, 2) for i in (1, 2, 3)})
cache.set_field('#two', {10:2})
cache.set_field('#three', {i:i for i in xrange(1, 11)})
ae(list(xrange(1, 11)), cache.multisort([('#one', True), ('#two', True)], ids_to_sort=sorted(cache.all_book_ids())))
ae([4, 5, 1, 2, 3, 7,8, 9, 10, 6], cache.multisort([('#one', True), ('#two', False)], ids_to_sort=sorted(cache.all_book_ids())))
ae([5, 4, 3, 2, 1, 10, 9, 8, 7, 6], cache.multisort([('#one', True), ('#two', False), ('#three', False)], ids_to_sort=sorted(cache.all_book_ids())))
# }}}
def test_get_metadata(self): # {{{

View File

@ -30,8 +30,9 @@ class MarkedVirtualField(object):
for book_id in candidates:
yield self.marked_ids.get(book_id, default_value), {book_id}
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
return {bid:self.marked_ids.get(bid, None) for bid in all_book_ids}
def sort_keys_for_books(self, get_metadata, lang_map):
    '''
    Return a function mapping a book_id to its sort key for this field.

    The key is the book's entry in self.marked_ids, or None when the book
    has no entry. get_metadata and lang_map are not used here.
    '''
    mark_for_book = self.marked_ids.get

    def key_for_book(book_id):
        return mark_for_book(book_id, None)
    return key_for_book
class TableRow(object):