Speed up evaluation of composite columns

Use a ProxyMetadata object that lazily evaluates its fields on demand,
thereby avoiding the overhead of get_metadata() on every composite field
evaluation.
This commit is contained in:
Kovid Goyal 2013-07-24 10:57:04 +05:30
parent f37de3d33c
commit 70f1dbb832
3 changed files with 271 additions and 8 deletions

View File

@ -23,7 +23,7 @@ from calibre.db.fields import create_field
from calibre.db.search import Search from calibre.db.search import Search
from calibre.db.tables import VirtualTable from calibre.db.tables import VirtualTable
from calibre.db.write import get_series_values from calibre.db.write import get_series_values
from calibre.db.lazy import FormatMetadata, FormatsList from calibre.db.lazy import FormatMetadata, FormatsList, ProxyMetadata
from calibre.ebooks import check_ebook_format from calibre.ebooks import check_ebook_format
from calibre.ebooks.metadata import string_to_authors, author_to_author_sort, get_title_sort_pat from calibre.ebooks.metadata import string_to_authors, author_to_author_sort, get_title_sort_pat
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
@ -338,7 +338,7 @@ class Cache(object):
def fast_field_for(self, field_obj, book_id, default_value=None): def fast_field_for(self, field_obj, book_id, default_value=None):
' Same as field_for, except that it avoids the extra lookup to get the field object ' ' Same as field_for, except that it avoids the extra lookup to get the field object '
if field_obj.is_composite: if field_obj.is_composite:
return field_obj.get_value_with_cache(book_id, partial(self._get_metadata, get_user_categories=False)) return field_obj.get_value_with_cache(book_id, self._get_proxy_metadata)
try: try:
return field_obj.for_book(book_id, default_value=default_value) return field_obj.for_book(book_id, default_value=default_value)
except (KeyError, IndexError): except (KeyError, IndexError):
@ -358,8 +358,7 @@ class Cache(object):
return default_value return default_value
if mi is None: if mi is None:
return f.get_value_with_cache(book_id, partial(self._get_metadata, return f.get_value_with_cache(book_id, self._get_proxy_metadata)
get_user_categories=False))
else: else:
return f.render_composite(book_id, mi) return f.render_composite(book_id, mi)
@ -534,6 +533,10 @@ class Cache(object):
return mi return mi
@read_api
def get_proxy_metadata(self, book_id):
    '''
    Return a lazily evaluated metadata object for the book ``book_id``.

    The result is a ProxyMetadata bound to this cache and to ``book_id``.
    '''
    proxy = ProxyMetadata(self, book_id)
    return proxy
@api @api
def cover(self, book_id, def cover(self, book_id,
as_file=False, as_image=False, as_path=False): as_file=False, as_image=False, as_path=False):
@ -781,7 +784,7 @@ class Cache(object):
''' '''
all_book_ids = frozenset(self._all_book_ids() if ids_to_sort is None all_book_ids = frozenset(self._all_book_ids() if ids_to_sort is None
else ids_to_sort) else ids_to_sort)
get_metadata = partial(self._get_metadata, get_user_categories=False) get_metadata = self._get_proxy_metadata
lang_map = self.fields['languages'].book_value_map lang_map = self.fields['languages'].book_value_map
fm = {'title':'sort', 'authors':'author_sort'} fm = {'title':'sort', 'authors':'author_sort'}
@ -1189,7 +1192,7 @@ class Cache(object):
sf = self.fields[field] sf = self.fields[field]
if series: if series:
q = icu_lower(series) q = icu_lower(series)
for val, book_ids in sf.iter_searchable_values(self._get_metadata, frozenset(self._all_book_ids())): for val, book_ids in sf.iter_searchable_values(self._get_proxy_metadata, frozenset(self._all_book_ids())):
if q == icu_lower(val): if q == icu_lower(val):
books = book_ids books = book_ids
break break
@ -1499,7 +1502,7 @@ class Cache(object):
f = self.fields[category] f = self.fields[category]
if hasattr(f, 'get_books_for_val'): if hasattr(f, 'get_books_for_val'):
# Composite field # Composite field
return f.get_books_for_val(item_id_or_composite_value, self._get_metadata, self._all_book_ids()) return f.get_books_for_val(item_id_or_composite_value, self._get_proxy_metadata, self._all_book_ids())
return self._books_for_field(f.name, int(item_id_or_composite_value)) return self._books_for_field(f.name, int(item_id_or_composite_value))
@read_api @read_api

View File

@ -10,14 +10,19 @@ __docformat__ = 'restructuredtext en'
import weakref import weakref
from functools import wraps from functools import wraps
from collections import MutableMapping, MutableSequence from collections import MutableMapping, MutableSequence
from copy import deepcopy
from calibre.ebooks.metadata.book.base import Metadata, SIMPLE_GET, TOP_LEVEL_IDENTIFIERS, NULL_VALUES
from calibre.ebooks.metadata.book.formatter import SafeFormat
from calibre.utils.date import utcnow
# Lazy format metadata retrieval {{{
''' '''
Avoid doing stats on all files in a book when getting metadata for that book. Avoid doing stats on all files in a book when getting metadata for that book.
Speeds up calibre startup with large libraries/libraries on a network share, Speeds up calibre startup with large libraries/libraries on a network share,
with a composite custom column. with a composite custom column.
''' '''
# Lazy format metadata retrieval {{{
def resolved(f): def resolved(f):
@wraps(f) @wraps(f)
def wrapper(self, *args, **kwargs): def wrapper(self, *args, **kwargs):
@ -97,3 +102,232 @@ class FormatsList(MutableBase, MutableSequence):
# }}} # }}}
# Lazy metadata getters {{{
# Shortcuts to the default attribute machinery of object. ProxyMetadata
# overrides both __getattribute__ and __setattr__, so its methods use these
# to read and write the real instance attributes.
ga = object.__getattribute__
sa = object.__setattr__
def simple_getter(field, default_value=None):
    '''
    Build a getter that reads ``field`` from the database, memoized in a cache.

    The returned callable takes ``(dbref, book_id, cache)``. A value already
    stored in ``cache`` under ``field`` is returned as is. Otherwise
    ``dbref()`` is called to obtain the database, the result of its
    ``field_for()`` (with ``default_value`` as the fallback) is stored in
    ``cache`` and returned.
    '''
    def func(dbref, book_id, cache):
        try:
            value = cache[field]
        except KeyError:
            database = dbref()
            value = database.field_for(field, book_id, default_value=default_value)
            cache[field] = value
        return value
    return func
def pp_getter(field, postprocess, default_value=None):
    '''
    Like simple_getter(), but the value read from the database is passed
    through ``postprocess`` before it is cached and returned.

    Values already present in ``cache`` are returned untouched, they are not
    post-processed again.
    '''
    def func(dbref, book_id, cache):
        try:
            return cache[field]
        except KeyError:
            pass
        database = dbref()
        raw = database.field_for(field, book_id, default_value=default_value)
        processed = cache[field] = postprocess(raw)
        return processed
    return func
def adata_getter(field):
    '''
    Build a getter that maps author names to per-author data.

    For ``field == 'author_sort_map'`` the mapping values are taken from the
    ``'sort'`` key of each author record, for any other ``field`` from the
    ``'link'`` key. The author ids and author records are read from the
    database once, under its read lock, and stored in ``cache['adata']``.
    '''
    # The record key to report depends only on ``field``
    wanted = 'link'
    if field == 'author_sort_map':
        wanted = 'sort'

    def func(dbref, book_id, cache):
        try:
            author_ids, adata = cache['adata']
        except KeyError:
            database = dbref()
            with database.read_lock:
                author_ids = database._field_ids_for('authors', book_id)
                adata = database._author_data(author_ids)
            cache['adata'] = (author_ids, adata)
        result = {}
        for author_id in author_ids:
            record = adata[author_id]
            result[record['name']] = record[wanted]
        return result
    return func
def dt_getter(field):
    '''
    Build a getter for a date field, memoized in the cache.

    On a cache miss the database is queried with the current value of
    utcnow() as the default, so the default is computed at lookup time rather
    than when the getter is created.
    '''
    def func(dbref, book_id, cache):
        try:
            value = cache[field]
        except KeyError:
            database = dbref()
            value = database.field_for(field, book_id, default_value=utcnow())
            cache[field] = value
        return value
    return func
def item_getter(field, default_value=None, key=0):
    '''
    Build a getter that returns a single item of a container valued field.

    The container for ``field`` is read from the database on first use (with
    ``default_value`` as the fallback) and stored in ``cache`` under
    ``field``. The getter returns ``container[key]``, or ``default_value``
    when the container has no such item.

    The item is looked up on every call, including when the container was
    already cached (by an earlier call or by another getter that shares the
    same cache key), so the result never depends on the order of access.
    '''
    def func(dbref, book_id, cache):
        try:
            container = cache[field]
        except KeyError:
            db = dbref()
            container = cache[field] = db.field_for(field, book_id, default_value=default_value)
        try:
            return container[key]
        except (IndexError, KeyError, TypeError):
            # TypeError: the stored value is not subscriptable (e.g. None),
            # treat it the same as a missing item
            return default_value
    return func
def fmt_getter(field):
    '''
    Build a getter for the ``formats`` or ``format_metadata`` attribute.

    The mapping of format name to format metadata is built from the database
    on first use, skipping formats whose metadata is empty, and stored in
    ``cache['format_metadata']`` so that later accesses (through either
    attribute) do not query the database again.

    For ``field == 'formats'`` the getter returns the list of format names,
    or None when there are none. Otherwise it returns the mapping itself.
    '''
    def func(dbref, book_id, cache):
        try:
            format_metadata = cache['format_metadata']
        except KeyError:
            db = dbref()
            format_metadata = {}
            for fmt in db.formats(book_id, verify_formats=False):
                m = db.format_metadata(book_id, fmt)
                if m:
                    format_metadata[fmt] = m
            # Remember the result, like every other getter does
            cache['format_metadata'] = format_metadata
        if field == 'formats':
            return list(format_metadata) or None
        return format_metadata
    return func
def approx_fmts_getter(dbref, book_id, cache):
    '''
    Return the value of the database ``formats`` field for the book as a list.

    The list is stored in ``cache['formats']`` and reused on later calls.
    '''
    try:
        known = cache['formats']
    except KeyError:
        database = dbref()
        known = list(database.field_for('formats', book_id))
        cache['formats'] = known
    return known
def series_index_getter(field='series'):
    '''
    Build a getter for the index that belongs to the series field ``field``.

    The series value itself is obtained first, from the matching entry in
    ``getters`` or, when there is none, via custom_getter(). If the series
    value is empty the getter returns None. Otherwise the index is read from
    the database field ``field + '_index'`` (default 1.0) and cached under
    that name.
    '''
    index_key = field + '_index'

    def func(dbref, book_id, cache):
        try:
            series = getters[field](dbref, book_id, cache)
        except KeyError:
            series = custom_getter(field, dbref, book_id, cache)
        if not series:
            return None
        try:
            return cache[index_key]
        except KeyError:
            database = dbref()
            value = database.field_for(index_key, book_id, default_value=1.0)
            cache[index_key] = value
            return value
    return func
def has_cover_getter(dbref, book_id, cache):
    '''
    Return the translated string for Yes if the book has a cover, otherwise
    the empty string. The answer is stored in ``cache['has_cover']``.
    '''
    try:
        return cache['has_cover']
    except KeyError:
        pass
    database = dbref()
    if database.field_for('cover', book_id, default_value=False):
        label = _('Yes')
    else:
        label = ''
    cache['has_cover'] = label
    return label
def fmt_custom(x):
    ' Convert tuples into lists, return every other value unchanged '
    if isinstance(x, tuple):
        return list(x)
    return x


def custom_getter(field, dbref, book_id, cache):
    '''
    Return the value of the custom field ``field`` for the book.

    On a cache miss the value is read from the database, passed through
    fmt_custom() and stored in ``cache`` under ``field``.
    '''
    try:
        value = cache[field]
    except KeyError:
        database = dbref()
        value = fmt_custom(database.field_for(field, book_id))
        cache[field] = value
    return value
def composite_getter(mi, field, metadata, book_id, cache, formatter, template_cache):
    '''
    Return the rendered value of the composite field ``field``.

    On a cache miss the template found at
    ``metadata['display']['composite_template']`` is rendered with
    ``formatter.safe_format()`` against ``mi``, stripped of surrounding
    whitespace and stored in ``cache`` under ``field``. ``book_id`` is
    accepted but not used here.
    '''
    try:
        return cache[field]
    except KeyError:
        pass
    render = formatter.safe_format
    text = render(
        metadata['display']['composite_template'], mi, _('TEMPLATE ERROR'), mi,
        column_name=field, template_cache=template_cache)
    value = cache[field] = text.strip()
    return value
# Getters for the standard fields, keyed by attribute name. Every value is a
# callable taking (dbref, book_id, cache).
getters = {
    'title': simple_getter('title', _('Unknown')),
    'title_sort': simple_getter('sort', _('Unknown')),
    'authors': pp_getter('authors', list, (_('Unknown'),)),
    'author_sort': simple_getter('author_sort', _('Unknown')),
    'uuid': simple_getter('uuid', 'dummy'),
    'book_size': simple_getter('size', 0),
    'ondevice_col': simple_getter('ondevice', ''),
    'languages': pp_getter('languages', list),
    'language': item_getter('languages', default_value=NULL_VALUES['language']),
    'db_approx_formats': approx_fmts_getter,
    'has_cover': has_cover_getter,
    'tags': pp_getter('tags', list, (_('Unknown'),)),
    'series_index': series_index_getter(),
    # Both of these are simply the book id
    'application_id': lambda dbref, book_id, cache: book_id,
    'id': lambda dbref, book_id, cache: book_id,
}

# Fields read straight from the database, with None as the default
for field in ('comments', 'publisher', 'identifiers', 'series', 'rating'):
    getters[field] = simple_getter(field)

# Per author maps
for field in ('author_sort_map', 'author_link_map'):
    getters[field] = adata_getter(field)

# Date fields
for field in ('timestamp', 'pubdate', 'last_modified'):
    getters[field] = dt_getter(field)

# Identifier types exposed as attributes, looked up in the identifiers field
for field in TOP_LEVEL_IDENTIFIERS:
    getters[field] = item_getter('identifiers', key=field)

# Formats and their metadata
for field in ('formats', 'format_metadata'):
    getters[field] = fmt_getter(field)
# }}}
class ProxyMetadata(Metadata):

    '''
    A Metadata subclass whose field values are fetched from the database on
    demand and memoized in a per-object cache, instead of being loaded up
    front.

    The object keeps only a weak reference to the database. Assigning to an
    attribute stores the value in the cache, it does not write to the
    database.
    '''

    def __init__(self, db, book_id):
        # sa() is used because __setattr__ below stores into the cache
        # rather than on the instance
        sa(self, 'template_cache', db.formatter_template_cache)
        sa(self, 'formatter', SafeFormat())
        sa(self, '_db', weakref.ref(db))
        sa(self, '_book_id', book_id)
        sa(self, '_cache', {'user_categories':{}, 'cover_data':(None,None), 'device_collections':[]})
        sa(self, '_user_metadata', db.field_metadata)

    def __getattribute__(self, field):
        '''
        Resolve ``field`` by trying, in order: the module level ``getters``,
        cached values for names in SIMPLE_GET, real attributes of the object,
        custom and composite fields described by the field metadata and
        finally any other cached value.

        :raises AttributeError: if none of the above yields a value
        '''
        getter = getters.get(field, None)
        if getter is not None:
            return getter(ga(self, '_db'), ga(self, '_book_id'), ga(self, '_cache'))
        if field in SIMPLE_GET:
            return ga(self, '_cache').get(field, None)
        try:
            return ga(self, field)
        except AttributeError:
            pass
        um = ga(self, '_user_metadata')
        d = um.get(field, None)
        if d is not None:
            dt = d['datatype']
            if dt != 'composite':
                if field.endswith('_index') and dt == 'float':
                    # field[:-6] strips the '_index' suffix to get the name
                    # of the series field
                    return series_index_getter(field[:-6])(ga(self, '_db'), ga(self, '_book_id'), ga(self, '_cache'))
                return custom_getter(field, ga(self, '_db'), ga(self, '_book_id'), ga(self, '_cache'))
            return composite_getter(self, field, d, ga(self, '_book_id'), ga(self, '_cache'), ga(self, 'formatter'), ga(self, 'template_cache'))
        try:
            return ga(self, '_cache')[field]
        except KeyError:
            raise AttributeError('Metadata object has no attribute named: %r' % field)

    def __setattr__(self, field, val, extra=None):
        '''
        Store ``val`` in the cache under ``field``. If ``extra`` is not None
        it is stored under ``field + '_index'``, which is where get_extra()
        reads it back from.
        '''
        cache = ga(self, '_cache')
        cache[field] = val
        if extra is not None:
            cache[field + '_index'] = extra

    def get_user_metadata(self, field, make_copy=False):
        '''
        Return the field metadata entry for ``field``, or None if there is no
        such entry. With ``make_copy`` a deep copy is returned.
        '''
        um = ga(self, '_user_metadata')
        try:
            ans = um[field]
        except KeyError:
            return None
        if make_copy:
            ans = deepcopy(ans)
        return ans

    def get_extra(self, field, default=None):
        '''
        Return the value of ``field + '_index'``, or ``default`` if it cannot
        be resolved.

        :raises AttributeError: if the field metadata has no entry named
            ``field + '_index'``
        '''
        um = ga(self, '_user_metadata')
        if field + '_index' in um:
            try:
                return getattr(self, field + '_index')
            except AttributeError:
                return default
        raise AttributeError(
                'Metadata object has no attribute named: '+ repr(field))

    def custom_field_keys(self):
        ' Return an iterator over the keys of the custom fields '
        um = ga(self, '_user_metadata')
        return iter(um.custom_field_keys())

View File

@ -460,3 +460,29 @@ class ReadingTest(BaseTest):
test(True, {3}, 'Unknown', 'unknown') test(True, {3}, 'Unknown', 'unknown')
# }}} # }}}
def test_proxy_metadata(self):  # {{{
    ' Test the ProxyMetadata object used for composite columns '
    from calibre.ebooks.metadata.book.base import STANDARD_METADATA_FIELDS
    cache = self.init_cache()

    def normalize(field, value):
        # The order of formats is not significant, so compare them as sets.
        # Every other field is compared as is.
        if field == 'formats' and value is not None:
            return set(value)
        return value

    for book_id in cache.all_book_ids():
        mi = cache.get_metadata(book_id, get_user_categories=False)
        pmi = cache.get_proxy_metadata(book_id)
        self.assertSetEqual(set(mi.custom_field_keys()), set(pmi.custom_field_keys()))

        for field in STANDARD_METADATA_FIELDS | {'#series_index'}:
            self.assertEqual(normalize(field, getattr(mi, field)), normalize(field, getattr(pmi, field)),
                            'Standard field: %s not the same for book %s' % (field, book_id))
            self.assertEqual(mi.format_field(field), pmi.format_field(field),
                            'Standard field format: %s not the same for book %s' % (field, book_id))

        # Custom fields are compared without any normalization. Previously
        # the normalizer left over from the last iteration of the loop above
        # was applied here, which depended on set iteration order.
        for field, meta in cache.field_metadata.custom_iteritems():
            if meta['datatype'] != 'composite':
                self.assertEqual(getattr(mi, field), getattr(pmi, field),
                                'Custom field: %s not the same for book %s' % (field, book_id))
                self.assertEqual(mi.format_field(field), pmi.format_field(field),
                                'Custom field format: %s not the same for book %s' % (field, book_id))
# }}}