From f1d71a068e27d361ca0a5d2e5116bcac0899484f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 28 Aug 2013 15:58:32 +0530 Subject: [PATCH] newdb: Speed up multisort() by lazy evaluation The sort keys for the sub-sorted columns are now only evaluated on demand. The common case is that the keys from the first column will differ; therefore, we can speed up sorting on average by delaying evaluation of the sort keys for the second and later columns, evaluating them only when the sort keys of all preceding columns compare equal for the two books being compared. The gains will be particularly noticeable for columns where calculating the sort key is expensive, like composite columns or the series column. --- src/calibre/db/cache.py | 78 +++++++++-------- src/calibre/db/fields.py | 146 +++++++++++++++++++++----------- src/calibre/db/tests/reading.py | 47 ++++++---- src/calibre/db/view.py | 5 +- 4 files changed, 170 insertions(+), 106 deletions(-) diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py index dbc28a4200..18ef989988 100644 --- a/src/calibre/db/cache.py +++ b/src/calibre/db/cache.py @@ -10,7 +10,8 @@ __docformat__ = 'restructuredtext en' import os, traceback, random, shutil, re from io import BytesIO from collections import defaultdict -from functools import wraps, partial +from functools import wraps +from future_builtins import zip from calibre import isbytestring from calibre.constants import iswindows, preferred_encoding @@ -19,7 +20,7 @@ from calibre.db import SPOOL_SIZE, _get_next_series_num_for_list from calibre.db.categories import get_categories from calibre.db.locking import create_locks from calibre.db.errors import NoSuchFormat -from calibre.db.fields import create_field +from calibre.db.fields import create_field, IDENTITY from calibre.db.search import Search from calibre.db.tables import VirtualTable from calibre.db.write import get_series_values @@ -804,42 +805,59 @@ class Cache(object): ascending=True or False). 
The most significant field is the first 2-tuple. ''' - all_book_ids = frozenset(self._all_book_ids() if ids_to_sort is None - else ids_to_sort) - ids_to_sort = all_book_ids if ids_to_sort is None else ids_to_sort + ids_to_sort = self._all_book_ids() if ids_to_sort is None else ids_to_sort get_metadata = self._get_proxy_metadata lang_map = self.fields['languages'].book_value_map virtual_fields = virtual_fields or {} fm = {'title':'sort', 'authors':'author_sort'} - def sort_key(field): - 'Handle series type fields' + def sort_key_func(field): + 'Handle series type fields, virtual fields and the id field' idx = field + '_index' is_series = idx in self.fields try: - ans = self.fields[fm.get(field, field)].sort_keys_for_books( - get_metadata, lang_map, all_book_ids) + func = self.fields[fm.get(field, field)].sort_keys_for_books(get_metadata, lang_map) except KeyError: if field == 'id': - ans = {bid:bid for bid in all_book_ids} + return IDENTITY else: - ans = virtual_fields[fm.get(field, field)].sort_keys_for_books( - get_metadata, lang_map, all_book_ids) + return virtual_fields[fm.get(field, field)].sort_keys_for_books(get_metadata, lang_map) if is_series: - idx_ans = self.fields[idx].sort_keys_for_books( - get_metadata, lang_map, all_book_ids) - ans = {k:(v, idx_ans[k]) for k, v in ans.iteritems()} - return ans + idx_func = self.fields[idx].sort_keys_for_books(get_metadata, lang_map) + def skf(book_id): + return (func(book_id), idx_func(book_id)) + return skf + return func - sort_keys = tuple(sort_key(field[0]) for field in fields) + if len(fields) == 1: + return sorted(ids_to_sort, key=sort_key_func(fields[0][0]), + reverse=not fields[0][1]) + sort_key_funcs = tuple(sort_key_func(field) for field, order in fields) + orders = tuple(1 if order else -1 for _, order in fields) + Lazy = object() # Lazy load the sort keys for sub-sort fields - if len(sort_keys) == 1: - sk = sort_keys[0] - return sorted(ids_to_sort, key=lambda i:sk[i], reverse=not - fields[0][1]) - else: - 
return sorted(ids_to_sort, key=partial(SortKey, fields, sort_keys)) + class SortKey(object): + + __slots__ = ('book_id', 'sort_key') + + def __init__(self, book_id): + self.book_id = book_id + # Calculate only the first sub-sort key since that will always be used + self.sort_key = [key(book_id) if i == 0 else Lazy for i, key in enumerate(sort_key_funcs)] + + def __cmp__(self, other): + for i, (order, self_key, other_key) in enumerate(zip(orders, self.sort_key, other.sort_key)): + if self_key is Lazy: + self_key = self.sort_key[i] = sort_key_funcs[i](self.book_id) + if other_key is Lazy: + other_key = other.sort_key[i] = sort_key_funcs[i](other.book_id) + ans = cmp(self_key, other_key) + if ans != 0: + return ans * order + return 0 + + return sorted(ids_to_sort, key=SortKey) @read_api def search(self, query, restriction='', virtual_fields=None, book_ids=None): @@ -1713,17 +1731,3 @@ class Cache(object): # }}} -class SortKey(object): # {{{ - - def __init__(self, fields, sort_keys, book_id): - self.orders = tuple(1 if f[1] else -1 for f in fields) - self.sort_key = tuple(sk[book_id] for sk in sort_keys) - - def __cmp__(self, other): - for i, order in enumerate(self.orders): - ans = cmp(self.sort_key[i], other.sort_key[i]) - if ans != 0: - return ans * order - return 0 -# }}} - diff --git a/src/calibre/db/fields.py b/src/calibre/db/fields.py index fc7bee2c51..b59fc58608 100644 --- a/src/calibre/db/fields.py +++ b/src/calibre/db/fields.py @@ -25,6 +25,8 @@ from calibre.utils.localization import calibre_langcode_to_name def bool_sort_key(bools_are_tristate): return (lambda x:{True: 1, False: 2, None: 3}.get(x, 3)) if bools_are_tristate else lambda x:{True: 1, False: 2, None: 2}.get(x, 2) +IDENTITY = lambda x: x + class Field(object): is_many = False @@ -36,7 +38,7 @@ class Field(object): dt = self.metadata['datatype'] self.has_text_data = dt in {'text', 'comments', 'series', 'enumeration'} self.table_type = self.table.table_type - self._sort_key = (sort_key if dt in 
('text', 'series', 'enumeration') else lambda x: x) + self._sort_key = (sort_key if dt in ('text', 'series', 'enumeration') else IDENTITY) # This will be compared to the output of sort_key() which is a # bytestring, therefore it is safer to have it be a bytestring. @@ -112,12 +114,11 @@ class Field(object): ''' return iter(()) - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): + def sort_keys_for_books(self, get_metadata, lang_map): ''' - Return a mapping of book_id -> sort_key. The sort key is suitable for + Return a function that maps book_id to sort_key. The sort key is suitable for use in sorting the list of all books by this field, via the python cmp - method. all_book_ids is the list/set of book ids for which sort_keys - should be generated. + method. ''' raise NotImplementedError() @@ -165,9 +166,13 @@ class OneToOneField(Field): def __iter__(self): return self.table.book_col_map.iterkeys() - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): - return {id_: self._sort_key(self.table.book_col_map.get(id_, - self._default_sort_key)) for id_ in all_book_ids} + def sort_keys_for_books(self, get_metadata, lang_map): + bcmg = self.table.book_col_map.get + dk = self._default_sort_key + sk = self._sort_key + if sk is IDENTITY: + return lambda book_id:bcmg(book_id, dk) + return lambda book_id:sk(bcmg(book_id, dk)) def iter_searchable_values(self, get_metadata, candidates, default_value=None): cbm = self.table.book_col_map @@ -263,9 +268,12 @@ class CompositeField(OneToOneField): self._render_cache[book_id] = ans return ans - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): - return {id_: self._sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in - all_book_ids} + def sort_keys_for_books(self, get_metadata, lang_map): + gv = self.get_value_with_cache + sk = self._sort_key + if sk is IDENTITY: + return lambda book_id:gv(book_id, get_metadata) + return lambda book_id:sk(gv(book_id, get_metadata)) def 
iter_searchable_values(self, get_metadata, candidates, default_value=None): val_map = defaultdict(set) @@ -362,9 +370,8 @@ class OnDeviceField(OneToOneField): def __iter__(self): return iter(()) - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): - return {id_: self.for_book(id_) for id_ in - all_book_ids} + def sort_keys_for_books(self, get_metadata, lang_map): + return self.for_book def iter_searchable_values(self, get_metadata, candidates, default_value=None): val_map = defaultdict(set) @@ -373,6 +380,27 @@ class OnDeviceField(OneToOneField): for val, book_ids in val_map.iteritems(): yield val, book_ids +class LazySortMap(object): + + __slots__ = ('default_sort_key', 'sort_key_func', 'id_map', 'cache') + + def __init__(self, default_sort_key, sort_key_func, id_map): + self.default_sort_key = default_sort_key + self.sort_key_func = sort_key_func + self.id_map = id_map + self.cache = {None:default_sort_key} + + def __call__(self, item_id): + try: + return self.cache[item_id] + except KeyError: + try: + val = self.cache[item_id] = self.sort_key_func(self.id_map[item_id]) + except KeyError: + val = self.cache[item_id] = self.default_sort_key + return val + + class ManyToOneField(Field): is_many = True @@ -397,13 +425,10 @@ class ManyToOneField(Field): def __iter__(self): return self.table.id_map.iterkeys() - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): - ans = {id_: self.table.book_col_map.get(id_, None) - for id_ in all_book_ids} - sk_map = {cid: (self._default_sort_key if cid is None else - self._sort_key(self.table.id_map[cid])) - for cid in ans.itervalues()} - return {id_: sk_map[cid] for id_, cid in ans.iteritems()} + def sort_keys_for_books(self, get_metadata, lang_map): + sk_map = LazySortMap(self._default_sort_key, self._sort_key, self.table.id_map) + bcmg = self.table.book_col_map.get + return lambda book_id:sk_map(bcmg(book_id, None)) def iter_searchable_values(self, get_metadata, candidates, default_value=None): 
cbm = self.table.col_book_map @@ -447,17 +472,17 @@ class ManyToManyField(Field): def __iter__(self): return self.table.id_map.iterkeys() - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): - ans = {id_: self.table.book_col_map.get(id_, ()) - for id_ in all_book_ids} - all_cids = set() - for cids in ans.itervalues(): - all_cids = all_cids.union(set(cids)) - sk_map = {cid: self._sort_key(self.table.id_map[cid]) for cid in all_cids} - sort_func = (lambda x:tuple(sorted(x))) if self.sort_sort_key else tuple - return {id_: (sort_func(sk_map[cid] for cid in cids) if cids else - (self._default_sort_key,)) - for id_, cids in ans.iteritems()} + def sort_keys_for_books(self, get_metadata, lang_map): + sk_map = LazySortMap(self._default_sort_key, self._sort_key, self.table.id_map) + bcmg = self.table.book_col_map.get + dsk = (self._default_sort_key,) + if self.sort_sort_key: + def sk(book_id): + return tuple(sorted(sk_map(x) for x in bcmg(book_id, ()))) or dsk + else: + def sk(book_id): + return tuple(sk_map(x) for x in bcmg(book_id, ())) or dsk + return sk def iter_searchable_values(self, get_metadata, candidates, default_value=None): cbm = self.table.col_book_map @@ -491,13 +516,11 @@ class IdentifiersField(ManyToManyField): ids = default_value return ids - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): + def sort_keys_for_books(self, get_metadata, lang_map): 'Sort by identifier keys' - ans = {id_: self.table.book_col_map.get(id_, ()) - for id_ in all_book_ids} - return {id_: (tuple(sorted(cids.iterkeys())) if cids else - (self._default_sort_key,)) - for id_, cids in ans.iteritems()} + bcmg = self.table.book_col_map.get + dv = {self._default_sort_key:None} + return lambda book_id: tuple(sorted(bcmg(book_id, dv).iterkeys())) def iter_searchable_values(self, get_metadata, candidates, default_value=()): bcm = self.table.book_col_map @@ -566,22 +589,43 @@ class FormatsField(ManyToManyField): ans.append(c) return ans +class 
LazySeriesSortMap(object): + + __slots__ = ('default_sort_key', 'sort_key_func', 'id_map', 'cache') + + def __init__(self, default_sort_key, sort_key_func, id_map): + self.default_sort_key = default_sort_key + self.sort_key_func = sort_key_func + self.id_map = id_map + self.cache = {} + + def __call__(self, item_id, lang): + try: + return self.cache[(item_id, lang)] + except KeyError: + try: + val = self.cache[(item_id, lang)] = self.sort_key_func(self.id_map[item_id], lang) + except KeyError: + val = self.cache[(item_id, lang)] = self.default_sort_key + return val + class SeriesField(ManyToOneField): - def sort_key_for_series(self, book_id, lang_map, series_sort_order): - sid = self.table.book_col_map.get(book_id, None) - if sid is None: - return self._default_sort_key - lang = lang_map.get(book_id, None) or None - if lang: - lang = lang[0] - return self._sort_key(title_sort(self.table.id_map[sid], - order=series_sort_order, lang=lang)) - - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): + def sort_keys_for_books(self, get_metadata, lang_map): sso = tweaks['title_series_sorting'] - return {book_id:self.sort_key_for_series(book_id, lang_map, sso) for book_id - in all_book_ids} + ssk = self._sort_key + ts = title_sort + def sk(val, lang): + return ssk(ts(val, order=sso, lang=lang)) + sk_map = LazySeriesSortMap(self._default_sort_key, sk, self.table.id_map) + bcmg = self.table.book_col_map.get + lang_map = {k:v[0] if v else None for k, v in lang_map.iteritems()} + + def key(book_id): + lang = lang_map.get(book_id, None) + return sk_map(bcmg(book_id, None), lang) + + return key def category_sort_value(self, item_id, book_ids, lang_map): lang = None diff --git a/src/calibre/db/tests/reading.py b/src/calibre/db/tests/reading.py index 2dcd519cad..101d1b12cc 100644 --- a/src/calibre/db/tests/reading.py +++ b/src/calibre/db/tests/reading.py @@ -126,6 +126,7 @@ class ReadingTest(BaseTest): def test_sorting(self): # {{{ 'Test sorting' cache = 
self.init_cache() + ae = self.assertEqual for field, order in { 'title' : [2, 1, 3], 'authors': [2, 1, 3], @@ -151,49 +152,63 @@ class ReadingTest(BaseTest): '#comments':[3, 2, 1], }.iteritems(): x = list(reversed(order)) - self.assertEqual(order, cache.multisort([(field, True)], + ae(order, cache.multisort([(field, True)], ids_to_sort=x), 'Ascending sort of %s failed'%field) - self.assertEqual(x, cache.multisort([(field, False)], + ae(x, cache.multisort([(field, False)], ids_to_sort=order), 'Descending sort of %s failed'%field) - # Test subsorting - self.assertEqual([3, 2, 1], cache.multisort([('identifiers', True), - ('title', True)]), 'Subsort failed') - # Test sorting of is_multiple fields. # Author like fields should be sorted by generating sort names from the # actual values in entry order for field in ('authors', '#authors'): - self.assertEqual( + ae( cache.set_field(field, {1:('aa bb', 'bb cc', 'cc dd'), 2:('bb aa', 'xx yy'), 3: ('aa bb', 'bb aa')}), {1, 2, 3}) - self.assertEqual([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3))) - self.assertEqual([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3))) + ae([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3))) + ae([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3))) # All other is_multiple fields should be sorted by sorting the values # for each book and using that as the sort key for field in ('tags', '#tags'): - self.assertEqual( + ae( cache.set_field(field, {1:('b', 'a'), 2:('c', 'y'), 3: ('b', 'z')}), {1, 2, 3}) - self.assertEqual([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3))) - self.assertEqual([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3))) + ae([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3))) + ae([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3))) # Test tweak to sort dates by visible format from calibre.utils.date import parse_only_date as p from 
calibre.utils.config_base import Tweak - self.assertEqual(cache.set_field('pubdate', {1:p('2001-3-3'), 2:p('2002-2-3'), 3:p('2003-1-3')}), {1, 2, 3}) - self.assertEqual([1, 2, 3], cache.multisort([('pubdate', True)])) + ae(cache.set_field('pubdate', {1:p('2001-3-3'), 2:p('2002-2-3'), 3:p('2003-1-3')}), {1, 2, 3}) + ae([1, 2, 3], cache.multisort([('pubdate', True)])) with Tweak('gui_pubdate_display_format', 'MMM'), Tweak('sort_dates_using_visible_fields', True): c2 = self.init_cache() - self.assertEqual([3, 2, 1], c2.multisort([('pubdate', True)])) + ae([3, 2, 1], c2.multisort([('pubdate', True)])) # Test bool sorting when not tristate cache.set_pref('bools_are_tristate', False) c2 = self.init_cache() - self.assertEqual([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)])) + ae([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)])) + # Test subsorting + ae([3, 2, 1], cache.multisort([('identifiers', True), + ('title', True)]), 'Subsort failed') + from calibre.ebooks.metadata.book.base import Metadata + for i in xrange(7): + cache.create_book_entry(Metadata('title%d' % i), apply_import_tags=False) + cache.create_custom_column('one', 'CC1', 'int', False) + cache.create_custom_column('two', 'CC2', 'int', False) + cache.create_custom_column('three', 'CC3', 'int', False) + cache.close() + cache = self.init_cache() + cache.set_field('#one', {(i+(5*m)):m for m in (0, 1) for i in xrange(1, 6)}) + cache.set_field('#two', {i+(m*3):m for m in (0, 1, 2) for i in (1, 2, 3)}) + cache.set_field('#two', {10:2}) + cache.set_field('#three', {i:i for i in xrange(1, 11)}) + ae(list(xrange(1, 11)), cache.multisort([('#one', True), ('#two', True)], ids_to_sort=sorted(cache.all_book_ids()))) + ae([4, 5, 1, 2, 3, 7,8, 9, 10, 6], cache.multisort([('#one', True), ('#two', False)], ids_to_sort=sorted(cache.all_book_ids()))) + ae([5, 4, 3, 2, 1, 10, 9, 8, 7, 6], cache.multisort([('#one', True), ('#two', False), ('#three', False)], ids_to_sort=sorted(cache.all_book_ids()))) # }}} def 
test_get_metadata(self): # {{{ diff --git a/src/calibre/db/view.py b/src/calibre/db/view.py index 43243318d5..04f205f21f 100644 --- a/src/calibre/db/view.py +++ b/src/calibre/db/view.py @@ -30,8 +30,9 @@ class MarkedVirtualField(object): for book_id in candidates: yield self.marked_ids.get(book_id, default_value), {book_id} - def sort_keys_for_books(self, get_metadata, lang_map, all_book_ids): - return {bid:self.marked_ids.get(bid, None) for bid in all_book_ids} + def sort_keys_for_books(self, get_metadata, lang_map): + g = self.marked_ids.get + return lambda book_id:g(book_id, None) class TableRow(object):