Start work on new db backend

Kovid Goyal 2011-06-20 16:13:18 -06:00
parent 15d8272efe
commit 50dadb45cf
2 changed files with 471 additions and 0 deletions


@ -0,0 +1,67 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Rewrite of the calibre database backend.
Broad Objectives:
* Use the sqlite db only as a datastore, i.e. do no
  sorting/searching/concatenation or anything else in sqlite. Instead,
  mirror the sqlite tables in memory, create caches and lookup maps from
  them, and create a set_* API that updates the memory caches and the
  sqlite correctly.
* Move from keeping a list of books in memory as a cache to a per-table
  cache. This allows much faster search and sort operations, at the expense
  of slightly slower lookup operations. That slowdown can be mitigated by
  keeping lots of maps and updating them in the set_* API. Also,
  get_categories becomes blazingly fast.
* Separate the database layer from the cache layer more cleanly. Rather
than having the db layer refer to the cache layer and vice versa, the
cache layer will refer to the db layer only and the new API will be
defined on the cache layer.
* Get rid of index_is_id and other poor design decisions
* Minimize the API as much as possible and define it cleanly
* Do not change the on-disk format of metadata.db at all (this is for
  backwards compatibility)
* Get rid of the need for a separate db access thread by switching to apsw
  to access sqlite, which is thread-safe
* The new API will have methods to efficiently do bulk operations and will
  use shared/exclusive/pending locks to serialize access to the in-memory
  data structures, following the same locking scheme sqlite itself uses
  (an illustrative sketch of such a lock follows this docstring).
How this will proceed:
1. Create the new API
2. Create a test suite for it
3. Write a replacement for LibraryDatabase2 that uses the new API
internally
4. Lots of testing of calibre with the new LibraryDatabase2
5. Gradually migrate code to use the (much faster) new API wherever
   possible (the new API will be exposed via db.new_api)
I plan to work on this slowly, in parallel to normal calibre development
work.
Various things that depend on other work before they can be migrated:
1. From initialize_dynamic(): set_saved_searches,
load_user_template_functions. Also add custom
columns/categories/searches info into
self.field_metadata. Finally, implement metadata dirtied
functionality.
'''
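
# What follows is an illustrative sketch only, not part of the plan above:
# one possible shape for the shared/exclusive lock mentioned in the
# objectives. The name SHLock and its methods are hypothetical, and the
# "pending" state sqlite uses to block new readers while a writer waits is
# omitted for brevity.

import threading

class SHLock(object):

    'Allow many concurrent readers or a single writer'

    def __init__(self):
        self._cond = threading.Condition(threading.Lock())
        self._readers = 0
        self._writer = False

    def acquire_shared(self):
        # Readers wait only while a writer holds the lock
        with self._cond:
            while self._writer:
                self._cond.wait()
            self._readers += 1

    def release_shared(self):
        with self._cond:
            self._readers -= 1
            if not self._readers:
                self._cond.notify_all()

    def acquire_exclusive(self):
        # A writer needs the lock to itself: no readers, no other writer
        with self._cond:
            while self._writer or self._readers:
                self._cond.wait()
            self._writer = True

    def release_exclusive(self):
        with self._cond:
            self._writer = False
            self._cond.notify_all()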

src/calibre/db/backend.py Normal file

@ -0,0 +1,404 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# Imports {{{
import os, shutil, uuid, json
from functools import partial
import apsw
from calibre import isbytestring, force_unicode, prints
from calibre.constants import (iswindows, filesystem_encoding,
preferred_encoding)
from calibre.ptempfile import PersistentTemporaryFile
from calibre.library.schema_upgrades import SchemaUpgrade
from calibre.library.field_metadata import FieldMetadata
from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.utils.icu import strcmp
from calibre.utils.config import to_json, from_json, prefs, tweaks
from calibre.utils.date import utcfromtimestamp
# }}}
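# Note: dynamic_property and icu_lower are used below without being
# imported; calibre installs these names as builtins at startup.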
'''
Differences in semantics from pysqlite:
1. execute/executemany/executescript operate in autocommit mode
'''
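# For example (illustrative; conn stands for any Connection instance
# defined below): with apsw each statement takes effect immediately, so
# grouping writes atomically needs an explicit transaction, which apsw
# connections provide as a context manager:
#
#   conn.execute('INSERT INTO preferences(key,val) VALUES(?,?)', (k, v))
#   # ^ already committed at this point
#   with conn:  # everything inside runs as a single transaction
#       conn.execute('DELETE FROM preferences WHERE key=?', (k,))
#       conn.execute('INSERT INTO preferences(key,val) VALUES(?,?)', (k, v))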
class DynamicFilter(object): # {{{
'No longer used, present for legacy compatibility'
def __init__(self, name):
self.name = name
self.ids = frozenset([])
def __call__(self, id_):
return int(id_ in self.ids)
def change(self, ids):
self.ids = frozenset(ids)
# }}}
class DBPrefs(dict): # {{{
'Store preferences as key:value pairs in the db'
def __init__(self, db):
dict.__init__(self)
self.db = db
self.defaults = {}
self.disable_setting = False
for key, val in self.db.conn.get('SELECT key,val FROM preferences'):
try:
val = self.raw_to_object(val)
except:
prints('Failed to read value for:', key, 'from db')
continue
dict.__setitem__(self, key, val)
def raw_to_object(self, raw):
if not isinstance(raw, unicode):
raw = raw.decode(preferred_encoding)
return json.loads(raw, object_hook=from_json)
def to_raw(self, val):
return json.dumps(val, indent=2, default=to_json)
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
return self.defaults[key]
def __delitem__(self, key):
dict.__delitem__(self, key)
self.db.conn.execute('DELETE FROM preferences WHERE key=?', (key,))
def __setitem__(self, key, val):
if self.disable_setting:
return
raw = self.to_raw(val)
self.db.conn.execute('INSERT OR REPLACE INTO preferences (key,val) VALUES (?,?)', (key,
raw))
dict.__setitem__(self, key, val)
def set(self, key, val):
self.__setitem__(key, val)
# }}}
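# Illustrative usage of DBPrefs (db stands for an initialized DB instance):
# values round-trip through JSON via to_json/from_json, so anything those
# hooks understand can be stored:
#
#   db.prefs['categories_using_hierarchy'] = ['tags', 'series']
#   db.prefs.get('categories_using_hierarchy')  # -> ['tags', 'series']
#   del db.prefs['categories_using_hierarchy']  # also deletes the sqlite row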
# Extra collators {{{
def pynocase(one, two, encoding='utf-8'):
if isbytestring(one):
try:
one = one.decode(encoding, 'replace')
except:
pass
if isbytestring(two):
try:
two = two.decode(encoding, 'replace')
except:
pass
return cmp(one.lower(), two.lower())
def _author_to_author_sort(x):
if not x: return ''
return author_to_author_sort(x.replace('|', ','))
def icu_collator(s1, s2):
return strcmp(force_unicode(s1, 'utf-8'), force_unicode(s2, 'utf-8'))
# }}}
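# The collators and functions above are registered on every Connection
# created below, so SQL can use them directly. Illustrative queries
# (table/column names from the standard metadata.db schema):
#
#   SELECT id, name FROM tags ORDER BY name COLLATE icucollate;
#   SELECT id, sort FROM books ORDER BY sort COLLATE PYNOCASE;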
class Connection(apsw.Connection): # {{{
BUSY_TIMEOUT = 2000 # milliseconds
def __init__(self, path):
apsw.Connection.__init__(self, path)
self.setbusytimeout(self.BUSY_TIMEOUT)
        self.execute('pragma cache_size=5000')
        self.execute('pragma temp_store=2')
        # apsw cursors are iterators
        encoding = next(self.execute('pragma encoding'))[0]
        self.createcollation('PYNOCASE', partial(pynocase,
            encoding=encoding))
        self.createscalarfunction('title_sort', title_sort, 1)
        self.createscalarfunction('author_to_author_sort',
                _author_to_author_sort, 1)
        self.createscalarfunction('uuid4', lambda : str(uuid.uuid4()), 0)
        # Dummy functions for dynamically created filters
        self.createscalarfunction('books_list_filter', lambda x: 1, 1)
        self.createcollation('icucollate', icu_collator)
    def create_dynamic_filter(self, name):
        f = DynamicFilter(name)
        self.createscalarfunction(name, f, 1)
        return f
    def get(self, *args, **kw):
        ans = self.cursor().execute(*args)
        if kw.get('all', True):
            return ans.fetchall()
        # all=False: return only the first column of the first row, if any
        for row in ans:
            return row[0]
def execute(self, sql, bindings=None):
cursor = self.cursor()
return cursor.execute(sql, bindings)
def executemany(self, sql, sequence_of_bindings):
return self.cursor().executemany(sql, sequence_of_bindings)
def executescript(self, sql):
with self:
# Use an explicit savepoint so that even if this is called
# while a transaction is active, it is atomic
return self.cursor().execute(sql)
# }}}
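# Illustrative usage of the Connection helpers defined above (the path is
# hypothetical):
#
#   conn = Connection('/tmp/metadata.db')
#   rows = conn.get('SELECT id,title FROM books')              # all rows
#   title = conn.get('SELECT title FROM books WHERE id=?',
#                    (1,), all=False)                          # one value
#   conn.execute('UPDATE books SET title=? WHERE id=?', ('New', 1))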
class DB(SchemaUpgrade):
PATH_LIMIT = 40 if iswindows else 100
WINDOWS_LIBRARY_PATH_LIMIT = 75
# Initialize database {{{
def __init__(self, library_path, default_prefs=None, read_only=False):
try:
if isbytestring(library_path):
library_path = library_path.decode(filesystem_encoding)
except:
import traceback
traceback.print_exc()
self.field_metadata = FieldMetadata()
self.library_path = os.path.abspath(library_path)
self.dbpath = os.path.join(library_path, 'metadata.db')
self.dbpath = os.environ.get('CALIBRE_OVERRIDE_DATABASE_PATH',
self.dbpath)
if iswindows and len(self.library_path) + 4*self.PATH_LIMIT + 10 > 259:
raise ValueError(_(
'Path to library too long. Must be less than'
' %d characters.')%(259-4*self.PATH_LIMIT-10))
exists = self._exists = os.path.exists(self.dbpath)
if not exists:
# Be more strict when creating new libraries as the old calculation
# allowed for max path lengths of 265 chars.
if (iswindows and len(self.library_path) >
self.WINDOWS_LIBRARY_PATH_LIMIT):
raise ValueError(_(
'Path to library too long. Must be less than'
' %d characters.')%self.WINDOWS_LIBRARY_PATH_LIMIT)
if read_only and os.path.exists(self.dbpath):
# Work on only a copy of metadata.db to ensure that
# metadata.db is not changed
pt = PersistentTemporaryFile('_metadata_ro.db')
pt.close()
shutil.copyfile(self.dbpath, pt.name)
self.dbpath = pt.name
self.is_case_sensitive = (not iswindows and
not os.path.exists(self.dbpath.replace('metadata.db',
'MeTAdAtA.dB')))
self._conn = None
if self.user_version == 0:
self.initialize_database()
SchemaUpgrade.__init__(self)
# Guarantee that the library_id is set
self.library_id
self.initialize_prefs(default_prefs)
# Fix legacy triggers and columns
self.conn.executescript('''
DROP TRIGGER IF EXISTS author_insert_trg;
CREATE TEMP TRIGGER author_insert_trg
AFTER INSERT ON authors
BEGIN
UPDATE authors SET sort=author_to_author_sort(NEW.name) WHERE id=NEW.id;
END;
DROP TRIGGER IF EXISTS author_update_trg;
CREATE TEMP TRIGGER author_update_trg
BEFORE UPDATE ON authors
BEGIN
UPDATE authors SET sort=author_to_author_sort(NEW.name)
WHERE id=NEW.id AND name <> NEW.name;
END;
UPDATE authors SET sort=author_to_author_sort(name) WHERE sort IS NULL;
''')
def initialize_prefs(self, default_prefs):
self.prefs = DBPrefs(self)
if default_prefs is not None and not self._exists:
# Only apply default prefs to a new database
for key in default_prefs:
# be sure that prefs not to be copied are listed below
if key not in frozenset(['news_to_be_synced']):
self.prefs[key] = default_prefs[key]
if 'field_metadata' in default_prefs:
fmvals = [f for f in default_prefs['field_metadata'].values()
if f['is_custom']]
for f in fmvals:
self.create_custom_column(f['label'], f['name'],
f['datatype'], f['is_multiple'] is not None,
f['is_editable'], f['display'])
defs = self.prefs.defaults
defs['gui_restriction'] = defs['cs_restriction'] = ''
defs['categories_using_hierarchy'] = []
defs['column_color_rules'] = []
# Migrate the bool tristate tweak
defs['bools_are_tristate'] = \
tweaks.get('bool_custom_columns_are_tristate', 'yes') == 'yes'
if self.prefs.get('bools_are_tristate') is None:
self.prefs.set('bools_are_tristate', defs['bools_are_tristate'])
# Migrate column coloring rules
if self.prefs.get('column_color_name_1', None) is not None:
from calibre.library.coloring import migrate_old_rule
old_rules = []
for i in range(1, 6):
col = self.prefs.get('column_color_name_'+str(i), None)
templ = self.prefs.get('column_color_template_'+str(i), None)
if col and templ:
try:
del self.prefs['column_color_name_'+str(i)]
rules = migrate_old_rule(self.field_metadata, templ)
for templ in rules:
old_rules.append((col, templ))
except:
pass
if old_rules:
self.prefs['column_color_rules'] += old_rules
# Migrate saved search and user categories to db preference scheme
def migrate_preference(key, default):
oldval = prefs[key]
if oldval != default:
self.prefs[key] = oldval
prefs[key] = default
if key not in self.prefs:
self.prefs[key] = default
migrate_preference('user_categories', {})
migrate_preference('saved_searches', {})
# migrate grouped_search_terms
if self.prefs.get('grouped_search_terms', None) is None:
try:
ogst = tweaks.get('grouped_search_terms', {})
ngst = {}
for t in ogst:
ngst[icu_lower(t)] = ogst[t]
self.prefs.set('grouped_search_terms', ngst)
except:
pass
# Rename any user categories with names that differ only in case
        user_cats = self.prefs.get('user_categories', {})
catmap = {}
for uc in user_cats:
ucl = icu_lower(uc)
if ucl not in catmap:
catmap[ucl] = []
catmap[ucl].append(uc)
cats_changed = False
for uc in catmap:
if len(catmap[uc]) > 1:
prints('found user category case overlap', catmap[uc])
cat = catmap[uc][0]
suffix = 1
while icu_lower((cat + unicode(suffix))) in catmap:
suffix += 1
prints('Renaming user category %s to %s'%(cat, cat+unicode(suffix)))
user_cats[cat + unicode(suffix)] = user_cats[cat]
del user_cats[cat]
cats_changed = True
if cats_changed:
self.prefs.set('user_categories', user_cats)
    @property
    def conn(self):
        if self._conn is None:
            # Use the Connection subclass defined above, so that the custom
            # collations/functions and the get()/execute() helpers exist
            self._conn = Connection(self.dbpath)
            if self._exists and self.user_version == 0:
                self._conn.close()
                os.remove(self.dbpath)
                self._conn = Connection(self.dbpath)
        return self._conn
@dynamic_property
def user_version(self):
doc = 'The user version of this database'
def fget(self):
return self.conn.get('pragma user_version;', all=False)
def fset(self, val):
self.conn.execute('pragma user_version=%d'%int(val))
return property(doc=doc, fget=fget, fset=fset)
def initialize_database(self):
metadata_sqlite = P('metadata_sqlite.sql', data=True,
allow_user_override=False).decode('utf-8')
self.conn.executescript(metadata_sqlite)
if self.user_version == 0:
self.user_version = 1
# }}}
# Database layer API {{{
@classmethod
def exists_at(cls, path):
return path and os.path.exists(os.path.join(path, 'metadata.db'))
@dynamic_property
def library_id(self):
doc = ('The UUID for this library. As long as the user only operates'
' on libraries with calibre, it will be unique')
def fget(self):
if getattr(self, '_library_id_', None) is None:
ans = self.conn.get('SELECT uuid FROM library_id', all=False)
if ans is None:
ans = str(uuid.uuid4())
self.library_id = ans
else:
self._library_id_ = ans
return self._library_id_
def fset(self, val):
self._library_id_ = unicode(val)
self.conn.execute('''
DELETE FROM library_id;
INSERT INTO library_id (uuid) VALUES (?);
                ''', (self._library_id_,))
return property(doc=doc, fget=fget, fset=fset)
def last_modified(self):
''' Return last modified time as a UTC datetime object '''
return utcfromtimestamp(os.stat(self.dbpath).st_mtime)
# }}}
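
# Illustrative usage (requires a full calibre environment, since P(),
# prefs and tweaks are only available there; the path is hypothetical):
#
#   db = DB(os.path.expanduser('~/Calibre Library'))
#   print(db.library_id)       # stable UUID identifying this library
#   print(db.last_modified())  # UTC datetime of the last on-disk change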