Speed up evaluation of composite columns

Use a ProxyMetadata object that lazily evaluates its fields on demand,
thereby avoiding the overhead of get_metadata() on every composite field
evaluation.
This commit is contained in:
Kovid Goyal 2013-07-24 10:57:04 +05:30
parent f37de3d33c
commit 70f1dbb832
3 changed files with 271 additions and 8 deletions

View File

@ -23,7 +23,7 @@ from calibre.db.fields import create_field
from calibre.db.search import Search
from calibre.db.tables import VirtualTable
from calibre.db.write import get_series_values
from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.db.lazy import FormatMetadata, FormatsList, ProxyMetadata
from calibre.ebooks import check_ebook_format
from calibre.ebooks.metadata import string_to_authors, author_to_author_sort, get_title_sort_pat
from calibre.ebooks.metadata.book.base import Metadata
@ -338,7 +338,7 @@ class Cache(object):
def fast_field_for(self, field_obj, book_id, default_value=None):
' Same as field_for, except that it avoids the extra lookup to get the field object '
if field_obj.is_composite:
return field_obj.get_value_with_cache(book_id, partial(self._get_metadata, get_user_categories=False))
return field_obj.get_value_with_cache(book_id, self._get_proxy_metadata)
try:
return field_obj.for_book(book_id, default_value=default_value)
except (KeyError, IndexError):
@ -358,8 +358,7 @@ class Cache(object):
return default_value
if mi is None:
return f.get_value_with_cache(book_id, partial(self._get_metadata,
get_user_categories=False))
return f.get_value_with_cache(book_id, self._get_proxy_metadata)
else:
return f.render_composite(book_id, mi)
@ -534,6 +533,10 @@ class Cache(object):
return mi
@read_api
def get_proxy_metadata(self, book_id):
    ' Return a ProxyMetadata for book_id, built from this cache and the book id only '
    return ProxyMetadata(self, book_id)
@api
def cover(self, book_id,
as_file=False, as_image=False, as_path=False):
@ -781,7 +784,7 @@ class Cache(object):
'''
all_book_ids = frozenset(self._all_book_ids() if ids_to_sort is None
else ids_to_sort)
get_metadata = partial(self._get_metadata, get_user_categories=False)
get_metadata = self._get_proxy_metadata
lang_map = self.fields['languages'].book_value_map
fm = {'title':'sort', 'authors':'author_sort'}
@ -1189,7 +1192,7 @@ class Cache(object):
sf = self.fields[field]
if series:
q = icu_lower(series)
for val, book_ids in sf.iter_searchable_values(self._get_metadata, frozenset(self._all_book_ids())):
for val, book_ids in sf.iter_searchable_values(self._get_proxy_metadata, frozenset(self._all_book_ids())):
if q == icu_lower(val):
books = book_ids
break
@ -1499,7 +1502,7 @@ class Cache(object):
f = self.fields[category]
if hasattr(f, 'get_books_for_val'):
# Composite field
return f.get_books_for_val(item_id_or_composite_value, self._get_metadata, self._all_book_ids())
return f.get_books_for_val(item_id_or_composite_value, self._get_proxy_metadata, self._all_book_ids())
return self._books_for_field(f.name, int(item_id_or_composite_value))
@read_api

View File

@ -10,14 +10,19 @@ __docformat__ = 'restructuredtext en'
import weakref
from functools import wraps
from collections import MutableMapping, MutableSequence
from copy import deepcopy
from calibre.ebooks.metadata.book.base import Metadata, SIMPLE_GET, TOP_LEVEL_IDENTIFIERS, NULL_VALUES
from calibre.ebooks.metadata.book.formatter import SafeFormat
from calibre.utils.date import utcnow
# Lazy format metadata retrieval {{{
'''
Avoid doing stats on all files in a book when getting metadata for that book.
Speeds up calibre startup with large libraries/libraries on a network share,
with a composite custom column.
'''
# Lazy format metadata retrieval {{{
def resolved(f):
@wraps(f)
def wrapper(self, *args, **kwargs):
@ -97,3 +102,232 @@ class FormatsList(MutableBase, MutableSequence):
# }}}
# Lazy metadata getters {{{
# Raw attribute accessors from ``object``. ProxyMetadata overrides both
# __getattribute__ and __setattr__, so its own bookkeeping attributes are
# read and written through these aliases instead.
ga = object.__getattribute__
sa = object.__setattr__
def simple_getter(field, default_value=None):
    '''
    Build a lazy getter for ``field``.

    The returned callable takes ``(dbref, book_id, cache)``. ``cache`` is a
    dict consulted first; on a miss ``dbref()`` is called to obtain the db,
    ``db.field_for(field, book_id, default_value=default_value)`` is
    evaluated, and the result is stored in ``cache[field]``.
    '''
    def getter(dbref, book_id, cache):
        try:
            value = cache[field]
        except KeyError:
            value = dbref().field_for(field, book_id, default_value=default_value)
            cache[field] = value
        return value
    return getter
def pp_getter(field, postprocess, default_value=None):
    '''
    Build a lazy getter for ``field`` whose db value is post-processed.

    The returned callable takes ``(dbref, book_id, cache)``. On a cache miss
    it stores ``postprocess(db.field_for(field, book_id, default_value=...))``
    in ``cache[field]``; cached values are returned as-is, without running
    ``postprocess`` again.
    '''
    def getter(dbref, book_id, cache):
        try:
            value = cache[field]
        except KeyError:
            raw = dbref().field_for(field, book_id, default_value=default_value)
            value = postprocess(raw)
            cache[field] = value
        return value
    return getter
def adata_getter(field):
    '''
    Build a getter that maps author names to per-author data.

    For ``field == 'author_sort_map'`` the map values are each author's
    ``'sort'`` entry; for any other field name they are the ``'link'`` entry.
    The author ids and author data are fetched once (under ``db.read_lock``)
    and kept together in ``cache['adata']``, so both maps share one lookup.
    '''
    value_key = 'link'
    if field == 'author_sort_map':
        value_key = 'sort'

    def author_map(dbref, book_id, cache):
        try:
            author_ids, adata = cache['adata']
        except KeyError:
            db = dbref()
            with db.read_lock:
                author_ids = db._field_ids_for('authors', book_id)
                adata = db._author_data(author_ids)
            cache['adata'] = (author_ids, adata)
        result = {}
        for author_id in author_ids:
            record = adata[author_id]
            result[record['name']] = record[value_key]
        return result
    return author_map
def dt_getter(field):
    '''
    Build a lazy getter for a date field.

    The returned callable takes ``(dbref, book_id, cache)``. On a cache miss
    the value comes from ``db.field_for`` with ``utcnow()`` (evaluated at
    lookup time) passed as the default, and is stored in ``cache[field]``.
    '''
    def getter(dbref, book_id, cache):
        try:
            value = cache[field]
        except KeyError:
            value = dbref().field_for(field, book_id, default_value=utcnow())
            cache[field] = value
        return value
    return getter
def item_getter(field, default_value=None, key=0):
    '''
    Build a lazy getter that returns one item out of a container field.

    The container itself (``db.field_for(field, book_id, ...)``) is what is
    stored in ``cache[field]``; the getter returns ``container[key]``.

    :param field: Name of the container field to look up and cache.
    :param default_value: Passed to ``field_for`` and also returned when
        ``key`` cannot be found in the container.
    :param key: Index or mapping key to extract from the container.
    '''
    def func(dbref, book_id, cache):
        try:
            # A cache hit must still be indexed with ``key``: the cached
            # entry is the whole container (and may have been put there by
            # another getter for the same field), not the extracted item.
            ret = cache[field]
        except KeyError:
            db = dbref()
            ret = cache[field] = db.field_for(field, book_id, default_value=default_value)
        try:
            return ret[key]
        except (IndexError, KeyError, TypeError):
            # TypeError covers a container that is not subscriptable, e.g.
            # a cached None when the book has no value for the field.
            return default_value
    return func
def fmt_getter(field):
    '''
    Build a lazy getter for the ``formats`` / ``format_metadata`` fields.

    Both fields are derived from one mapping of format name to
    ``db.format_metadata(book_id, fmt)``, built from
    ``db.formats(book_id, verify_formats=False)``; formats whose metadata is
    falsy are left out. The mapping is stored in ``cache['format_metadata']``
    so it is built at most once per cache.

    For ``field == 'formats'`` the getter returns a new list of the format
    names, or None when there are none. For any other field it returns the
    mapping itself (the cached object, not a copy).
    '''
    def func(dbref, book_id, cache):
        try:
            format_metadata = cache['format_metadata']
        except KeyError:
            db = dbref()
            format_metadata = {}
            for fmt in db.formats(book_id, verify_formats=False):
                m = db.format_metadata(book_id, fmt)
                if m:
                    format_metadata[fmt] = m
            # Remember the result; without this the lookup above could
            # never succeed and every access would hit the db again.
            cache['format_metadata'] = format_metadata
        if field == 'formats':
            return list(format_metadata) or None
        return format_metadata
    return func
def approx_fmts_getter(dbref, book_id, cache):
    '''
    Return the book's ``formats`` field as a list, cached in
    ``cache['formats']``.

    On a cache miss the value is ``list(db.field_for('formats', book_id))``.
    '''
    try:
        formats = cache['formats']
    except KeyError:
        formats = list(dbref().field_for('formats', book_id))
        cache['formats'] = formats
    return formats
def series_index_getter(field='series'):
    '''
    Build a lazy getter for the index that accompanies a series field.

    The series value is obtained first, through the entry for ``field`` in
    the module-level ``getters`` table, or through ``custom_getter`` if that
    raises KeyError. When the series value is falsy the getter returns None.
    Otherwise it returns ``field + '_index'``, read from the cache or
    fetched with ``db.field_for(..., default_value=1.0)`` and then cached.
    '''
    index_key = field + '_index'

    def series_index(dbref, book_id, cache):
        try:
            series = getters[field](dbref, book_id, cache)
        except KeyError:
            series = custom_getter(field, dbref, book_id, cache)
        if not series:
            return None
        try:
            index = cache[index_key]
        except KeyError:
            index = dbref().field_for(index_key, book_id, default_value=1.0)
            cache[index_key] = index
        return index
    return series_index
def has_cover_getter(dbref, book_id, cache):
    '''
    Return the translated string for 'Yes' if the book's ``cover`` field is
    truthy, otherwise the empty string. The result is cached in
    ``cache['has_cover']``.
    '''
    try:
        label = cache['has_cover']
    except KeyError:
        if dbref().field_for('cover', book_id, default_value=False):
            label = _('Yes')
        else:
            label = ''
        cache['has_cover'] = label
    return label
def fmt_custom(x):
    ' Convert tuples to lists; return any other value unchanged '
    if isinstance(x, tuple):
        return list(x)
    return x


def custom_getter(field, dbref, book_id, cache):
    '''
    Return the value of ``field`` for the book, cached in ``cache[field]``.

    On a cache miss the value is ``db.field_for(field, book_id)`` passed
    through ``fmt_custom`` (so tuple values are stored as lists).
    '''
    try:
        value = cache[field]
    except KeyError:
        value = fmt_custom(dbref().field_for(field, book_id))
        cache[field] = value
    return value
def composite_getter(mi, field, metadata, book_id, cache, formatter, template_cache):
    '''
    Return the rendered value of the composite column ``field``.

    The template is read from ``metadata['display']['composite_template']``
    and rendered with ``formatter.safe_format`` against ``mi``; the stripped
    result is stored in ``cache[field]`` and returned. A cached value is
    returned without rendering. ``book_id`` is accepted but not used here.
    '''
    try:
        return cache[field]
    except KeyError:
        template = metadata['display']['composite_template']
        error_text = _('TEMPLATE ERROR')
        rendered = formatter.safe_format(
            template, mi, error_text, mi,
            column_name=field, template_cache=template_cache)
        value = rendered.strip()
        cache[field] = value
        return value
# Registry of lazy getters keyed by metadata attribute name. Each value is a
# callable taking (dbref, book_id, cache). Entries with their own defaults or
# special handling are listed explicitly; the uniform groups are filled in by
# the loops that follow.
getters = {
    'title': simple_getter('title', _('Unknown')),
    'title_sort': simple_getter('sort', _('Unknown')),
    'authors': pp_getter('authors', list, (_('Unknown'),)),
    'author_sort': simple_getter('author_sort', _('Unknown')),
    'uuid': simple_getter('uuid', 'dummy'),
    'book_size': simple_getter('size', 0),
    'ondevice_col': simple_getter('ondevice', ''),
    'languages': pp_getter('languages', list),
    'language': item_getter('languages', default_value=NULL_VALUES['language']),
    'db_approx_formats': approx_fmts_getter,
    'has_cover': has_cover_getter,
    'tags': pp_getter('tags', list, (_('Unknown'),)),
    'series_index': series_index_getter(),
    # Both of these are simply the book id itself
    'application_id': lambda x, book_id, y: book_id,
    'id': lambda x, book_id, y: book_id,
}

# Plain fields with no default value
for field in ('comments', 'publisher', 'identifiers', 'series', 'rating'):
    getters[field] = simple_getter(field)

# Per-author maps built from shared author data
for field in ('author_sort_map', 'author_link_map'):
    getters[field] = adata_getter(field)

# Date fields
for field in ('timestamp', 'pubdate', 'last_modified'):
    getters[field] = dt_getter(field)

# Identifier types exposed as top-level attributes: each is one key of the
# 'identifiers' field
for field in TOP_LEVEL_IDENTIFIERS:
    getters[field] = item_getter('identifiers', key=field)

# Format list and per-format metadata
for field in ('formats', 'format_metadata'):
    getters[field] = fmt_getter(field)
# }}}
class ProxyMetadata(Metadata):

    '''
    A Metadata subclass whose fields are looked up on demand.

    Attribute reads are answered by the module-level ``getters`` table (or,
    for custom columns, by the db's field metadata) and memoised in a
    per-instance cache dict. Attribute writes go only into that cache.
    '''

    def __init__(self, db, book_id):
        '''
        :param db: The database object. Only a weak reference to it is kept;
            ``db.formatter_template_cache`` and ``db.field_metadata`` are
            read here.
        :param book_id: Id of the book this object describes.
        '''
        # __setattr__ is overridden to write into the cache, so the
        # bookkeeping attributes are set through object.__setattr__ (sa).
        # Metadata.__init__ is not invoked.
        sa(self, 'template_cache', db.formatter_template_cache)
        sa(self, 'formatter', SafeFormat())
        sa(self, '_db', weakref.ref(db))
        sa(self, '_book_id', book_id)
        # The cache starts with fixed values for these three fields
        sa(self, '_cache', {'user_categories':{}, 'cover_data':(None,None), 'device_collections':[]})
        sa(self, '_user_metadata', db.field_metadata)

    def __getattribute__(self, field):
        '''
        Resolve ``field`` by trying, in order: the ``getters`` table, the
        cache (for names in SIMPLE_GET, defaulting to None), ordinary
        attribute lookup, the user (custom column) metadata, and finally
        the cache again. Raises AttributeError if none of these match.
        '''
        getter = getters.get(field, None)
        if getter is not None:
            return getter(ga(self, '_db'), ga(self, '_book_id'), ga(self, '_cache'))
        if field in SIMPLE_GET:
            return ga(self, '_cache').get(field, None)
        try:
            # Methods and the attributes set in __init__ are found here
            return ga(self, field)
        except AttributeError:
            pass
        um = ga(self, '_user_metadata')
        d = um.get(field, None)
        if d is not None:
            dt = d['datatype']
            if dt != 'composite':
                if field.endswith('_index') and dt == 'float':
                    # Strip the 6-character '_index' suffix to get the
                    # name of the series field the index belongs to
                    return series_index_getter(field[:-6])(ga(self, '_db'), ga(self, '_book_id'), ga(self, '_cache'))
                return custom_getter(field, ga(self, '_db'), ga(self, '_book_id'), ga(self, '_cache'))
            # Composite columns are rendered with this object as the
            # template's metadata source
            return composite_getter(self, field, d, ga(self, '_book_id'), ga(self, '_cache'), ga(self, 'formatter'), ga(self, 'template_cache'))
        try:
            return ga(self, '_cache')[field]
        except KeyError:
            raise AttributeError('Metadata object has no attribute named: %r' % field)

    def __setattr__(self, field, val, extra=None):
        '''
        Store ``val`` in the cache under ``field``. If ``extra`` is not
        None it is stored under ``field + '_index'``, the key that
        series_index_getter and get_extra() read.
        '''
        cache = ga(self, '_cache')
        cache[field] = val
        if extra is not None:
            # The extra value, not val, belongs in the _index slot
            cache[field + '_index'] = extra

    def get_user_metadata(self, field, make_copy=False):
        '''
        Return the user metadata entry for ``field`` (a deep copy of it if
        ``make_copy`` is true), or None if there is no such entry.
        '''
        um = ga(self, '_user_metadata')
        try:
            ans = um[field]
        except KeyError:
            pass
        else:
            if make_copy:
                ans = deepcopy(ans)
            return ans

    def get_extra(self, field, default=None):
        '''
        Return the value of ``field + '_index'``.

        Returns ``default`` if that attribute lookup raises AttributeError.
        Raises AttributeError if the user metadata has no entry named
        ``field + '_index'``.
        '''
        um = ga(self, '_user_metadata')
        if field + '_index' in um:
            try:
                return getattr(self, field + '_index')
            except AttributeError:
                return default
        raise AttributeError(
                'Metadata object has no attribute named: '+ repr(field))

    def custom_field_keys(self):
        ' Return an iterator over the custom field keys of the user metadata '
        um = ga(self, '_user_metadata')
        return iter(um.custom_field_keys())

View File

@ -460,3 +460,29 @@ class ReadingTest(BaseTest):
test(True, {3}, 'Unknown', 'unknown')
# }}}
def test_proxy_metadata(self):  # {{{
    ' Test the ProxyMetadata object used for composite columns '
    from calibre.ebooks.metadata.book.base import STANDARD_METADATA_FIELDS
    cache = self.init_cache()

    def comparable(field, value):
        # The formats field is compared as a set, i.e. ignoring order.
        # Every other field, including all custom fields, is compared
        # unchanged.
        if field == 'formats' and value is not None:
            return set(value)
        return value

    for book_id in cache.all_book_ids():
        mi = cache.get_metadata(book_id, get_user_categories=False)
        pmi = cache.get_proxy_metadata(book_id)
        self.assertSetEqual(set(mi.custom_field_keys()), set(pmi.custom_field_keys()))

        for field in STANDARD_METADATA_FIELDS | {'#series_index'}:
            self.assertEqual(comparable(field, getattr(mi, field)), comparable(field, getattr(pmi, field)),
                            'Standard field: %s not the same for book %s' % (field, book_id))
            self.assertEqual(mi.format_field(field), pmi.format_field(field),
                            'Standard field format: %s not the same for book %s' % (field, book_id))

        # The normaliser is chosen per field here, so the custom-field
        # comparison no longer depends on which standard field happened to
        # be iterated last in the loop above.
        for field, meta in cache.field_metadata.custom_iteritems():
            if meta['datatype'] != 'composite':
                self.assertEqual(comparable(field, getattr(mi, field)), comparable(field, getattr(pmi, field)),
                                'Custom field: %s not the same for book %s' % (field, book_id))
                self.assertEqual(mi.format_field(field), pmi.format_field(field),
                                'Custom field format: %s not the same for book %s' % (field, book_id))
# }}}