newdb: Speed up multisort() by lazy evaluation

The sort keys for the sub-sorted columns are now only evaluated on demand.
The common case is that the keys from the first column will differ,
therefore we can speed up sorting on average by delaying evaluation of
the sort keys for the second and later columns, only evaluating them
when the sort keys of all earlier columns compare equal for the two
books being compared.
The gains will be particularly noticeable for columns where calculating
the sort key is expensive, like composite columns or the series column.
This commit is contained in:
Kovid Goyal 2013-08-28 15:58:32 +05:30
parent 20817c3992
commit f1d71a068e
4 changed files with 170 additions and 106 deletions

View File

@ -10,7 +10,8 @@ __docformat__ = 'restructuredtext en'
import os, traceback, random, shutil, re import os, traceback, random, shutil, re
from io import BytesIO from io import BytesIO
from collections import defaultdict from collections import defaultdict
from functools import wraps, partial from functools import wraps
from future_builtins import zip
from calibre import isbytestring from calibre import isbytestring
from calibre.constants import iswindows, preferred_encoding from calibre.constants import iswindows, preferred_encoding
@ -19,7 +20,7 @@ from calibre.db import SPOOL_SIZE, _get_next_series_num_for_list
from calibre.db.categories import get_categories from calibre.db.categories import get_categories
from calibre.db.locking import create_locks from calibre.db.locking import create_locks
from calibre.db.errors import NoSuchFormat from calibre.db.errors import NoSuchFormat
from calibre.db.fields import create_field from calibre.db.fields import create_field, IDENTITY
from calibre.db.search import Search from calibre.db.search import Search
from calibre.db.tables import VirtualTable from calibre.db.tables import VirtualTable
from calibre.db.write import get_series_values from calibre.db.write import get_series_values
@ -804,42 +805,59 @@ class Cache(object):
ascending=True or False). The most significant field is the first ascending=True or False). The most significant field is the first
2-tuple. 2-tuple.
''' '''
all_book_ids = frozenset(self._all_book_ids() if ids_to_sort is None ids_to_sort = self._all_book_ids() if ids_to_sort is None else ids_to_sort
else ids_to_sort)
ids_to_sort = all_book_ids if ids_to_sort is None else ids_to_sort
get_metadata = self._get_proxy_metadata get_metadata = self._get_proxy_metadata
lang_map = self.fields['languages'].book_value_map lang_map = self.fields['languages'].book_value_map
virtual_fields = virtual_fields or {} virtual_fields = virtual_fields or {}
fm = {'title':'sort', 'authors':'author_sort'} fm = {'title':'sort', 'authors':'author_sort'}
def sort_key(field): def sort_key_func(field):
'Handle series type fields' 'Handle series type fields, virtual fields and the id field'
idx = field + '_index' idx = field + '_index'
is_series = idx in self.fields is_series = idx in self.fields
try: try:
ans = self.fields[fm.get(field, field)].sort_keys_for_books( func = self.fields[fm.get(field, field)].sort_keys_for_books(get_metadata, lang_map)
get_metadata, lang_map, all_book_ids)
except KeyError: except KeyError:
if field == 'id': if field == 'id':
ans = {bid:bid for bid in all_book_ids} return IDENTITY
else: else:
ans = virtual_fields[fm.get(field, field)].sort_keys_for_books( return virtual_fields[fm.get(field, field)].sort_keys_for_books(get_metadata, lang_map)
get_metadata, lang_map, all_book_ids)
if is_series: if is_series:
idx_ans = self.fields[idx].sort_keys_for_books( idx_func = self.fields[idx].sort_keys_for_books(get_metadata, lang_map)
get_metadata, lang_map, all_book_ids) def skf(book_id):
ans = {k:(v, idx_ans[k]) for k, v in ans.iteritems()} return (func(book_id), idx_func(book_id))
return ans return skf
return func
sort_keys = tuple(sort_key(field[0]) for field in fields) if len(fields) == 1:
return sorted(ids_to_sort, key=sort_key_func(fields[0][0]),
reverse=not fields[0][1])
sort_key_funcs = tuple(sort_key_func(field) for field, order in fields)
orders = tuple(1 if order else -1 for _, order in fields)
Lazy = object() # Lazy load the sort keys for sub-sort fields
if len(sort_keys) == 1: class SortKey(object):
sk = sort_keys[0]
return sorted(ids_to_sort, key=lambda i:sk[i], reverse=not __slots__ = ('book_id', 'sort_key')
fields[0][1])
else: def __init__(self, book_id):
return sorted(ids_to_sort, key=partial(SortKey, fields, sort_keys)) self.book_id = book_id
# Calculate only the first sub-sort key since that will always be used
self.sort_key = [key(book_id) if i == 0 else Lazy for i, key in enumerate(sort_key_funcs)]
def __cmp__(self, other):
for i, (order, self_key, other_key) in enumerate(zip(orders, self.sort_key, other.sort_key)):
if self_key is Lazy:
self_key = self.sort_key[i] = sort_key_funcs[i](self.book_id)
if other_key is Lazy:
other_key = other.sort_key[i] = sort_key_funcs[i](other.book_id)
ans = cmp(self_key, other_key)
if ans != 0:
return ans * order
return 0
return sorted(ids_to_sort, key=SortKey)
@read_api @read_api
def search(self, query, restriction='', virtual_fields=None, book_ids=None): def search(self, query, restriction='', virtual_fields=None, book_ids=None):
@ -1713,17 +1731,3 @@ class Cache(object):
# }}} # }}}
# Eager multi-column sort key for a single book. One instance is built per
# book; instances are ordered column by column through __cmp__.
class SortKey(object): # {{{
# fields: sequence of (field_name, ascending) 2-tuples, most significant first.
# sort_keys: one book_id -> key mapping per field, in the same order as fields.
# book_id: the book this key object stands for.
def __init__(self, fields, sort_keys, book_id):
# 1 keeps the natural ordering for a column, -1 inverts it (descending).
self.orders = tuple(1 if f[1] else -1 for f in fields)
# The key for every column is looked up up front, even if a comparison
# never gets past the first column.
self.sort_key = tuple(sk[book_id] for sk in sort_keys)
# Three-way comparison: walk the columns in order and return the result of
# the first column whose keys differ, flipped for descending columns.
def __cmp__(self, other):
for i, order in enumerate(self.orders):
ans = cmp(self.sort_key[i], other.sort_key[i])
if ans != 0:
return ans * order
# Every column compared equal.
return 0
# }}}

View File

@ -25,6 +25,8 @@ from calibre.utils.localization import calibre_langcode_to_name
def bool_sort_key(bools_are_tristate): def bool_sort_key(bools_are_tristate):
return (lambda x:{True: 1, False: 2, None: 3}.get(x, 3)) if bools_are_tristate else lambda x:{True: 1, False: 2, None: 2}.get(x, 2) return (lambda x:{True: 1, False: 2, None: 3}.get(x, 3)) if bools_are_tristate else lambda x:{True: 1, False: 2, None: 2}.get(x, 2)
IDENTITY = lambda x: x
class Field(object): class Field(object):
is_many = False is_many = False
@ -36,7 +38,7 @@ class Field(object):
dt = self.metadata['datatype'] dt = self.metadata['datatype']
self.has_text_data = dt in {'text', 'comments', 'series', 'enumeration'} self.has_text_data = dt in {'text', 'comments', 'series', 'enumeration'}
self.table_type = self.table.table_type self.table_type = self.table.table_type
self._sort_key = (sort_key if dt in ('text', 'series', 'enumeration') else lambda x: x) self._sort_key = (sort_key if dt in ('text', 'series', 'enumeration') else IDENTITY)
# This will be compared to the output of sort_key() which is a # This will be compared to the output of sort_key() which is a
# bytestring, therefore it is safer to have it be a bytestring. # bytestring, therefore it is safer to have it be a bytestring.
@ -112,12 +114,11 @@ class Field(object):
''' '''
return iter(()) return iter(())
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
''' '''
Return a mapping of book_id -> sort_key. The sort key is suitable for Return a function that maps book_id to sort_key. The sort key is suitable for
use in sorting the list of all books by this field, via the python cmp use in sorting the list of all books by this field, via the python cmp
method. all_book_ids is the list/set of book ids for which sort_keys method.
should be generated.
''' '''
raise NotImplementedError() raise NotImplementedError()
@ -165,9 +166,13 @@ class OneToOneField(Field):
def __iter__(self): def __iter__(self):
return self.table.book_col_map.iterkeys() return self.table.book_col_map.iterkeys()
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
return {id_: self._sort_key(self.table.book_col_map.get(id_, bcmg = self.table.book_col_map.get
self._default_sort_key)) for id_ in all_book_ids} dk = self._default_sort_key
sk = self._sort_key
if sk is IDENTITY:
return lambda book_id:bcmg(book_id, dk)
return lambda book_id:sk(bcmg(book_id, dk))
def iter_searchable_values(self, get_metadata, candidates, default_value=None): def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.book_col_map cbm = self.table.book_col_map
@ -263,9 +268,12 @@ class CompositeField(OneToOneField):
self._render_cache[book_id] = ans self._render_cache[book_id] = ans
return ans return ans
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
return {id_: self._sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in gv = self.get_value_with_cache
all_book_ids} sk = self._sort_key
if sk is IDENTITY:
return lambda book_id:gv(book_id, get_metadata)
return lambda book_id:sk(gv(book_id, get_metadata))
def iter_searchable_values(self, get_metadata, candidates, default_value=None): def iter_searchable_values(self, get_metadata, candidates, default_value=None):
val_map = defaultdict(set) val_map = defaultdict(set)
@ -362,9 +370,8 @@ class OnDeviceField(OneToOneField):
def __iter__(self): def __iter__(self):
return iter(()) return iter(())
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
return {id_: self.for_book(id_) for id_ in return self.for_book
all_book_ids}
def iter_searchable_values(self, get_metadata, candidates, default_value=None): def iter_searchable_values(self, get_metadata, candidates, default_value=None):
val_map = defaultdict(set) val_map = defaultdict(set)
@ -373,6 +380,27 @@ class OnDeviceField(OneToOneField):
for val, book_ids in val_map.iteritems(): for val, book_ids in val_map.iteritems():
yield val, book_ids yield val, book_ids
class LazySortMap(object):
    '''
    Callable mapping an item id to its sort key, computed on first use.

    Calling the instance with an ``item_id`` returns
    ``sort_key_func(id_map[item_id])``. The result is cached, so the key
    function runs at most once per item. ``None`` and ids missing from
    ``id_map`` map to ``default_sort_key``.

    Only a missing ``item_id`` falls back to the default. A ``KeyError``
    raised inside ``sort_key_func`` itself propagates to the caller and is
    not cached, instead of silently turning into the default sort key.
    '''

    __slots__ = ('default_sort_key', 'sort_key_func', 'id_map', 'cache')

    def __init__(self, default_sort_key, sort_key_func, id_map):
        self.default_sort_key = default_sort_key
        self.sort_key_func = sort_key_func
        self.id_map = id_map
        # None means "no item for this book"; pre-seed it so that case is
        # always answered from the cache.
        self.cache = {None: default_sort_key}

    def __call__(self, item_id):
        try:
            return self.cache[item_id]
        except KeyError:
            pass
        # Keep the try body to the lookup alone so that errors raised by the
        # key function are not mistaken for a missing item.
        try:
            value = self.id_map[item_id]
        except KeyError:
            val = self.default_sort_key
        else:
            val = self.sort_key_func(value)
        self.cache[item_id] = val
        return val
class ManyToOneField(Field): class ManyToOneField(Field):
is_many = True is_many = True
@ -397,13 +425,10 @@ class ManyToOneField(Field):
def __iter__(self): def __iter__(self):
return self.table.id_map.iterkeys() return self.table.id_map.iterkeys()
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
ans = {id_: self.table.book_col_map.get(id_, None) sk_map = LazySortMap(self._default_sort_key, self._sort_key, self.table.id_map)
for id_ in all_book_ids} bcmg = self.table.book_col_map.get
sk_map = {cid: (self._default_sort_key if cid is None else return lambda book_id:sk_map(bcmg(book_id, None))
self._sort_key(self.table.id_map[cid]))
for cid in ans.itervalues()}
return {id_: sk_map[cid] for id_, cid in ans.iteritems()}
def iter_searchable_values(self, get_metadata, candidates, default_value=None): def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.col_book_map cbm = self.table.col_book_map
@ -447,17 +472,17 @@ class ManyToManyField(Field):
def __iter__(self): def __iter__(self):
return self.table.id_map.iterkeys() return self.table.id_map.iterkeys()
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
ans = {id_: self.table.book_col_map.get(id_, ()) sk_map = LazySortMap(self._default_sort_key, self._sort_key, self.table.id_map)
for id_ in all_book_ids} bcmg = self.table.book_col_map.get
all_cids = set() dsk = (self._default_sort_key,)
for cids in ans.itervalues(): if self.sort_sort_key:
all_cids = all_cids.union(set(cids)) def sk(book_id):
sk_map = {cid: self._sort_key(self.table.id_map[cid]) for cid in all_cids} return tuple(sorted(sk_map(x) for x in bcmg(book_id, ()))) or dsk
sort_func = (lambda x:tuple(sorted(x))) if self.sort_sort_key else tuple else:
return {id_: (sort_func(sk_map[cid] for cid in cids) if cids else def sk(book_id):
(self._default_sort_key,)) return tuple(sk_map(x) for x in bcmg(book_id, ())) or dsk
for id_, cids in ans.iteritems()} return sk
def iter_searchable_values(self, get_metadata, candidates, default_value=None): def iter_searchable_values(self, get_metadata, candidates, default_value=None):
cbm = self.table.col_book_map cbm = self.table.col_book_map
@ -491,13 +516,11 @@ class IdentifiersField(ManyToManyField):
ids = default_value ids = default_value
return ids return ids
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
'Sort by identifier keys' 'Sort by identifier keys'
ans = {id_: self.table.book_col_map.get(id_, ()) bcmg = self.table.book_col_map.get
for id_ in all_book_ids} dv = {self._default_sort_key:None}
return {id_: (tuple(sorted(cids.iterkeys())) if cids else return lambda book_id: tuple(sorted(bcmg(book_id, dv).iterkeys()))
(self._default_sort_key,))
for id_, cids in ans.iteritems()}
def iter_searchable_values(self, get_metadata, candidates, default_value=()): def iter_searchable_values(self, get_metadata, candidates, default_value=()):
bcm = self.table.book_col_map bcm = self.table.book_col_map
@ -566,22 +589,43 @@ class FormatsField(ManyToManyField):
ans.append(c) ans.append(c)
return ans return ans
class LazySeriesSortMap(object):
    '''
    Callable mapping ``(item_id, lang)`` to a sort key, computed on first use.

    Calling the instance returns ``sort_key_func(id_map[item_id], lang)``.
    Results are cached per ``(item_id, lang)`` pair, so the key function
    runs at most once for each combination. An ``item_id`` missing from
    ``id_map`` (including ``None``) maps to ``default_sort_key``.

    Only a missing ``item_id`` falls back to the default. A ``KeyError``
    raised inside ``sort_key_func`` itself propagates to the caller and is
    not cached, instead of silently turning into the default sort key.
    '''

    __slots__ = ('default_sort_key', 'sort_key_func', 'id_map', 'cache')

    def __init__(self, default_sort_key, sort_key_func, id_map):
        self.default_sort_key = default_sort_key
        self.sort_key_func = sort_key_func
        self.id_map = id_map
        self.cache = {}

    def __call__(self, item_id, lang):
        cache_key = (item_id, lang)
        try:
            return self.cache[cache_key]
        except KeyError:
            pass
        # Keep the try body to the lookup alone so that errors raised by the
        # key function are not mistaken for a missing item.
        try:
            value = self.id_map[item_id]
        except KeyError:
            val = self.default_sort_key
        else:
            val = self.sort_key_func(value, lang)
        self.cache[cache_key] = val
        return val
class SeriesField(ManyToOneField): class SeriesField(ManyToOneField):
def sort_key_for_series(self, book_id, lang_map, series_sort_order): def sort_keys_for_books(self, get_metadata, lang_map):
sid = self.table.book_col_map.get(book_id, None)
if sid is None:
return self._default_sort_key
lang = lang_map.get(book_id, None) or None
if lang:
lang = lang[0]
return self._sort_key(title_sort(self.table.id_map[sid],
order=series_sort_order, lang=lang))
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids):
sso = tweaks['title_series_sorting'] sso = tweaks['title_series_sorting']
return {book_id:self.sort_key_for_series(book_id, lang_map, sso) for book_id ssk = self._sort_key
in all_book_ids} ts = title_sort
def sk(val, lang):
return ssk(ts(val, order=sso, lang=lang))
sk_map = LazySeriesSortMap(self._default_sort_key, sk, self.table.id_map)
bcmg = self.table.book_col_map.get
lang_map = {k:v[0] if v else None for k, v in lang_map.iteritems()}
def key(book_id):
lang = lang_map.get(book_id, None)
return sk_map(bcmg(book_id, None), lang)
return key
def category_sort_value(self, item_id, book_ids, lang_map): def category_sort_value(self, item_id, book_ids, lang_map):
lang = None lang = None

View File

@ -126,6 +126,7 @@ class ReadingTest(BaseTest):
def test_sorting(self): # {{{ def test_sorting(self): # {{{
'Test sorting' 'Test sorting'
cache = self.init_cache() cache = self.init_cache()
ae = self.assertEqual
for field, order in { for field, order in {
'title' : [2, 1, 3], 'title' : [2, 1, 3],
'authors': [2, 1, 3], 'authors': [2, 1, 3],
@ -151,49 +152,63 @@ class ReadingTest(BaseTest):
'#comments':[3, 2, 1], '#comments':[3, 2, 1],
}.iteritems(): }.iteritems():
x = list(reversed(order)) x = list(reversed(order))
self.assertEqual(order, cache.multisort([(field, True)], ae(order, cache.multisort([(field, True)],
ids_to_sort=x), ids_to_sort=x),
'Ascending sort of %s failed'%field) 'Ascending sort of %s failed'%field)
self.assertEqual(x, cache.multisort([(field, False)], ae(x, cache.multisort([(field, False)],
ids_to_sort=order), ids_to_sort=order),
'Descending sort of %s failed'%field) 'Descending sort of %s failed'%field)
# Test subsorting
self.assertEqual([3, 2, 1], cache.multisort([('identifiers', True),
('title', True)]), 'Subsort failed')
# Test sorting of is_multiple fields. # Test sorting of is_multiple fields.
# Author like fields should be sorted by generating sort names from the # Author like fields should be sorted by generating sort names from the
# actual values in entry order # actual values in entry order
for field in ('authors', '#authors'): for field in ('authors', '#authors'):
self.assertEqual( ae(
cache.set_field(field, {1:('aa bb', 'bb cc', 'cc dd'), 2:('bb aa', 'xx yy'), 3: ('aa bb', 'bb aa')}), {1, 2, 3}) cache.set_field(field, {1:('aa bb', 'bb cc', 'cc dd'), 2:('bb aa', 'xx yy'), 3: ('aa bb', 'bb aa')}), {1, 2, 3})
self.assertEqual([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3))) ae([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
self.assertEqual([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3))) ae([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))
# All other is_multiple fields should be sorted by sorting the values # All other is_multiple fields should be sorted by sorting the values
# for each book and using that as the sort key # for each book and using that as the sort key
for field in ('tags', '#tags'): for field in ('tags', '#tags'):
self.assertEqual( ae(
cache.set_field(field, {1:('b', 'a'), 2:('c', 'y'), 3: ('b', 'z')}), {1, 2, 3}) cache.set_field(field, {1:('b', 'a'), 2:('c', 'y'), 3: ('b', 'z')}), {1, 2, 3})
self.assertEqual([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3))) ae([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
self.assertEqual([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3))) ae([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))
# Test tweak to sort dates by visible format # Test tweak to sort dates by visible format
from calibre.utils.date import parse_only_date as p from calibre.utils.date import parse_only_date as p
from calibre.utils.config_base import Tweak from calibre.utils.config_base import Tweak
self.assertEqual(cache.set_field('pubdate', {1:p('2001-3-3'), 2:p('2002-2-3'), 3:p('2003-1-3')}), {1, 2, 3}) ae(cache.set_field('pubdate', {1:p('2001-3-3'), 2:p('2002-2-3'), 3:p('2003-1-3')}), {1, 2, 3})
self.assertEqual([1, 2, 3], cache.multisort([('pubdate', True)])) ae([1, 2, 3], cache.multisort([('pubdate', True)]))
with Tweak('gui_pubdate_display_format', 'MMM'), Tweak('sort_dates_using_visible_fields', True): with Tweak('gui_pubdate_display_format', 'MMM'), Tweak('sort_dates_using_visible_fields', True):
c2 = self.init_cache() c2 = self.init_cache()
self.assertEqual([3, 2, 1], c2.multisort([('pubdate', True)])) ae([3, 2, 1], c2.multisort([('pubdate', True)]))
# Test bool sorting when not tristate # Test bool sorting when not tristate
cache.set_pref('bools_are_tristate', False) cache.set_pref('bools_are_tristate', False)
c2 = self.init_cache() c2 = self.init_cache()
self.assertEqual([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)])) ae([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)]))
# Test subsorting
ae([3, 2, 1], cache.multisort([('identifiers', True),
('title', True)]), 'Subsort failed')
from calibre.ebooks.metadata.book.base import Metadata
for i in xrange(7):
cache.create_book_entry(Metadata('title%d' % i), apply_import_tags=False)
cache.create_custom_column('one', 'CC1', 'int', False)
cache.create_custom_column('two', 'CC2', 'int', False)
cache.create_custom_column('three', 'CC3', 'int', False)
cache.close()
cache = self.init_cache()
cache.set_field('#one', {(i+(5*m)):m for m in (0, 1) for i in xrange(1, 6)})
cache.set_field('#two', {i+(m*3):m for m in (0, 1, 2) for i in (1, 2, 3)})
cache.set_field('#two', {10:2})
cache.set_field('#three', {i:i for i in xrange(1, 11)})
ae(list(xrange(1, 11)), cache.multisort([('#one', True), ('#two', True)], ids_to_sort=sorted(cache.all_book_ids())))
ae([4, 5, 1, 2, 3, 7,8, 9, 10, 6], cache.multisort([('#one', True), ('#two', False)], ids_to_sort=sorted(cache.all_book_ids())))
ae([5, 4, 3, 2, 1, 10, 9, 8, 7, 6], cache.multisort([('#one', True), ('#two', False), ('#three', False)], ids_to_sort=sorted(cache.all_book_ids())))
# }}} # }}}
def test_get_metadata(self): # {{{ def test_get_metadata(self): # {{{

View File

@ -30,8 +30,9 @@ class MarkedVirtualField(object):
for book_id in candidates: for book_id in candidates:
yield self.marked_ids.get(book_id, default_value), {book_id} yield self.marked_ids.get(book_id, default_value), {book_id}
def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): def sort_keys_for_books(self, get_metadata, lang_map):
return {bid:self.marked_ids.get(bid, None) for bid in all_book_ids} g = self.marked_ids.get
return lambda book_id:g(book_id, None)
class TableRow(object): class TableRow(object):