Start work on new db backend

Kovid Goyal 2011-06-20 16:13:18 -06:00
parent 15d8272efe
commit 50dadb45cf
2 changed files with 471 additions and 0 deletions


@ -0,0 +1,67 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Rewrite of the calibre database backend.
Broad Objectives:
* Use the sqlite db only as a datastore, i.e. do no
  sorting/searching/concatenation or anything else in sqlite. Instead,
  mirror the sqlite tables in memory, create caches and lookup maps from
  them, and create a set_* API that updates the memory caches and the
  sqlite correctly.
* Move from keeping a list of books in memory as a cache to a per-table
  cache. This allows much faster search and sort operations, at the expense
  of slightly slower lookup operations. That slowdown can be mitigated by
  keeping lots of maps and updating them in the set_* API. Also,
  get_categories becomes blazingly fast.
* Separate the database layer from the cache layer more cleanly. Rather
than having the db layer refer to the cache layer and vice versa, the
cache layer will refer to the db layer only and the new API will be
defined on the cache layer.
* Get rid of index_is_id and other poor design decisions
* Minimize the API as much as possible and define it cleanly
* Do not change the on-disk format of metadata.db at all (this is for
  backwards compatibility)
* Get rid of the need for a separate db access thread by switching to apsw
  to access sqlite, which is thread-safe
* The new API will have methods to efficiently do bulk operations and will
  use shared/exclusive/pending locks to serialize access to the in-memory
  data structures, following the same locking scheme sqlite itself uses
  (an illustrative sketch of such a lock follows this docstring).
How this will proceed:
1. Create the new API
2. Create a test suite for it
3. Write a replacement for LibraryDatabase2 that uses the new API
internally
4. Lots of testing of calibre with the new LibraryDatabase2
5. Gradually migrate code to use the (much faster) new API wherever
   possible (the new API will be exposed via db.new_api)
I plan to work on this slowly, in parallel to normal calibre development
work.
Various things that depend on other work before they can be migrated:
1. From initialize_dynamic(): set_saved_searches,
load_user_template_functions. Also add custom
columns/categories/searches info into
self.field_metadata. Finally, implement metadata dirtied
functionality.
'''
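
# What follows is an illustrative sketch only, not part of the plan above:
# one possible shape for the shared/exclusive lock mentioned in the
# objectives. The name SHLock and its methods are hypothetical, and the
# "pending" state sqlite uses to block new readers while a writer waits is
# omitted for brevity.

import threading

class SHLock(object):

    'Allow many concurrent readers or a single writer'

    def __init__(self):
        self._cond = threading.Condition(threading.Lock())
        self._readers = 0
        self._writer = False

    def acquire_shared(self):
        # Readers wait only while a writer holds the lock
        with self._cond:
            while self._writer:
                self._cond.wait()
            self._readers += 1

    def release_shared(self):
        with self._cond:
            self._readers -= 1
            if not self._readers:
                self._cond.notify_all()

    def acquire_exclusive(self):
        # A writer needs the lock to itself: no readers, no other writer
        with self._cond:
            while self._writer or self._readers:
                self._cond.wait()
            self._writer = True

    def release_exclusive(self):
        with self._cond:
            self._writer = False
            self._cond.notify_all()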

src/calibre/db/backend.py Normal file

@ -0,0 +1,404 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# Imports {{{
import os, shutil, uuid, json
from functools import partial
import apsw
from calibre import isbytestring, force_unicode, prints
from calibre.constants import (iswindows, filesystem_encoding,
preferred_encoding)
from calibre.ptempfile import PersistentTemporaryFile
from calibre.library.schema_upgrades import SchemaUpgrade
from calibre.library.field_metadata import FieldMetadata
from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.utils.icu import strcmp
from calibre.utils.config import to_json, from_json, prefs, tweaks
from calibre.utils.date import utcfromtimestamp
# }}}
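# Note: dynamic_property and icu_lower are used below without being
# imported; calibre installs these names as builtins at startup.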
'''
Differences in semantics from pysqlite:
1. execute/executemany/executescript operate in autocommit mode
'''
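# For example (illustrative; conn stands for any Connection instance
# defined below): with apsw each statement takes effect immediately, so
# grouping writes atomically needs an explicit transaction, which apsw
# connections provide as a context manager:
#
#   conn.execute('INSERT INTO preferences(key,val) VALUES(?,?)', (k, v))
#   # ^ already committed at this point
#   with conn:  # everything inside runs as a single transaction
#       conn.execute('DELETE FROM preferences WHERE key=?', (k,))
#       conn.execute('INSERT INTO preferences(key,val) VALUES(?,?)', (k, v))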
class DynamicFilter(object): # {{{
'No longer used, present for legacy compatibility'
def __init__(self, name):
self.name = name
self.ids = frozenset([])
def __call__(self, id_):
return int(id_ in self.ids)
def change(self, ids):
self.ids = frozenset(ids)
# }}}
class DBPrefs(dict): # {{{
'Store preferences as key:value pairs in the db'
def __init__(self, db):
dict.__init__(self)
self.db = db
self.defaults = {}
self.disable_setting = False
for key, val in self.db.conn.get('SELECT key,val FROM preferences'):
try:
val = self.raw_to_object(val)
except:
prints('Failed to read value for:', key, 'from db')
continue
dict.__setitem__(self, key, val)
def raw_to_object(self, raw):
if not isinstance(raw, unicode):
raw = raw.decode(preferred_encoding)
return json.loads(raw, object_hook=from_json)
def to_raw(self, val):
return json.dumps(val, indent=2, default=to_json)
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
return self.defaults[key]
def __delitem__(self, key):
dict.__delitem__(self, key)
self.db.conn.execute('DELETE FROM preferences WHERE key=?', (key,))
def __setitem__(self, key, val):
if self.disable_setting:
return
raw = self.to_raw(val)
self.db.conn.execute('INSERT OR REPLACE INTO preferences (key,val) VALUES (?,?)', (key,
raw))
dict.__setitem__(self, key, val)
def set(self, key, val):
self.__setitem__(key, val)
# }}}
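# Illustrative usage of DBPrefs (db stands for an initialized DB instance):
# values round-trip through JSON via to_json/from_json, so anything those
# hooks understand can be stored:
#
#   db.prefs['categories_using_hierarchy'] = ['tags', 'series']
#   db.prefs.get('categories_using_hierarchy')  # -> ['tags', 'series']
#   del db.prefs['categories_using_hierarchy']  # also deletes the sqlite row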
# Extra collators {{{
def pynocase(one, two, encoding='utf-8'):
if isbytestring(one):
try:
one = one.decode(encoding, 'replace')
except:
pass
if isbytestring(two):
try:
two = two.decode(encoding, 'replace')
except:
pass
return cmp(one.lower(), two.lower())
def _author_to_author_sort(x):
if not x: return ''
return author_to_author_sort(x.replace('|', ','))
def icu_collator(s1, s2):
return strcmp(force_unicode(s1, 'utf-8'), force_unicode(s2, 'utf-8'))
# }}}
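# The collators and functions above are registered on every Connection
# created below, so SQL can use them directly. Illustrative queries
# (table/column names from the standard metadata.db schema):
#
#   SELECT id, name FROM tags ORDER BY name COLLATE icucollate;
#   SELECT id, sort FROM books ORDER BY sort COLLATE PYNOCASE;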
class Connection(apsw.Connection): # {{{
BUSY_TIMEOUT = 2000 # milliseconds
def __init__(self, path):
apsw.Connection.__init__(self, path)
self.setbusytimeout(self.BUSY_TIMEOUT)
        self.execute('pragma cache_size=5000')
        self.execute('pragma temp_store=2')
        # apsw cursors are iterators
        encoding = next(self.execute('pragma encoding'))[0]
        self.createcollation('PYNOCASE', partial(pynocase,
            encoding=encoding))
        self.createscalarfunction('title_sort', title_sort, 1)
        self.createscalarfunction('author_to_author_sort',
                _author_to_author_sort, 1)
        self.createscalarfunction('uuid4', lambda : str(uuid.uuid4()), 0)
        # Dummy functions for dynamically created filters
        self.createscalarfunction('books_list_filter', lambda x: 1, 1)
        self.createcollation('icucollate', icu_collator)
    def create_dynamic_filter(self, name):
        f = DynamicFilter(name)
        self.createscalarfunction(name, f, 1)
        return f
    def get(self, *args, **kw):
        ans = self.cursor().execute(*args)
        if kw.get('all', True):
            return ans.fetchall()
        # all=False: return only the first column of the first row, if any
        for row in ans:
            return row[0]
def execute(self, sql, bindings=None):
cursor = self.cursor()
return cursor.execute(sql, bindings)
def executemany(self, sql, sequence_of_bindings):
return self.cursor().executemany(sql, sequence_of_bindings)
def executescript(self, sql):
with self:
# Use an explicit savepoint so that even if this is called
# while a transaction is active, it is atomic
return self.cursor().execute(sql)
# }}}
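# Illustrative usage of the Connection helpers defined above (the path is
# hypothetical):
#
#   conn = Connection('/tmp/metadata.db')
#   rows = conn.get('SELECT id,title FROM books')              # all rows
#   title = conn.get('SELECT title FROM books WHERE id=?',
#                    (1,), all=False)                          # one value
#   conn.execute('UPDATE books SET title=? WHERE id=?', ('New', 1))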
class DB(SchemaUpgrade):
PATH_LIMIT = 40 if iswindows else 100
WINDOWS_LIBRARY_PATH_LIMIT = 75
# Initialize database {{{
def __init__(self, library_path, default_prefs=None, read_only=False):
try:
if isbytestring(library_path):
library_path = library_path.decode(filesystem_encoding)
except:
import traceback
traceback.print_exc()
self.field_metadata = FieldMetadata()
self.library_path = os.path.abspath(library_path)
self.dbpath = os.path.join(library_path, 'metadata.db')
self.dbpath = os.environ.get('CALIBRE_OVERRIDE_DATABASE_PATH',
self.dbpath)
if iswindows and len(self.library_path) + 4*self.PATH_LIMIT + 10 > 259:
raise ValueError(_(
'Path to library too long. Must be less than'
' %d characters.')%(259-4*self.PATH_LIMIT-10))
exists = self._exists = os.path.exists(self.dbpath)
if not exists:
# Be more strict when creating new libraries as the old calculation
# allowed for max path lengths of 265 chars.
if (iswindows and len(self.library_path) >
self.WINDOWS_LIBRARY_PATH_LIMIT):
raise ValueError(_(
'Path to library too long. Must be less than'
' %d characters.')%self.WINDOWS_LIBRARY_PATH_LIMIT)
if read_only and os.path.exists(self.dbpath):
# Work on only a copy of metadata.db to ensure that
# metadata.db is not changed
pt = PersistentTemporaryFile('_metadata_ro.db')
pt.close()
shutil.copyfile(self.dbpath, pt.name)
self.dbpath = pt.name
self.is_case_sensitive = (not iswindows and
not os.path.exists(self.dbpath.replace('metadata.db',
'MeTAdAtA.dB')))
self._conn = None
if self.user_version == 0:
self.initialize_database()
SchemaUpgrade.__init__(self)
# Guarantee that the library_id is set
self.library_id
self.initialize_prefs(default_prefs)
# Fix legacy triggers and columns
self.conn.executescript('''
DROP TRIGGER IF EXISTS author_insert_trg;
CREATE TEMP TRIGGER author_insert_trg
AFTER INSERT ON authors
BEGIN
UPDATE authors SET sort=author_to_author_sort(NEW.name) WHERE id=NEW.id;
END;
DROP TRIGGER IF EXISTS author_update_trg;
CREATE TEMP TRIGGER author_update_trg
BEFORE UPDATE ON authors
BEGIN
UPDATE authors SET sort=author_to_author_sort(NEW.name)
WHERE id=NEW.id AND name <> NEW.name;
END;
UPDATE authors SET sort=author_to_author_sort(name) WHERE sort IS NULL;
''')
def initialize_prefs(self, default_prefs):
self.prefs = DBPrefs(self)
if default_prefs is not None and not self._exists:
# Only apply default prefs to a new database
for key in default_prefs:
# be sure that prefs not to be copied are listed below
if key not in frozenset(['news_to_be_synced']):
self.prefs[key] = default_prefs[key]
if 'field_metadata' in default_prefs:
fmvals = [f for f in default_prefs['field_metadata'].values()
if f['is_custom']]
for f in fmvals:
self.create_custom_column(f['label'], f['name'],
f['datatype'], f['is_multiple'] is not None,
f['is_editable'], f['display'])
defs = self.prefs.defaults
defs['gui_restriction'] = defs['cs_restriction'] = ''
defs['categories_using_hierarchy'] = []
defs['column_color_rules'] = []
# Migrate the bool tristate tweak
defs['bools_are_tristate'] = \
tweaks.get('bool_custom_columns_are_tristate', 'yes') == 'yes'
if self.prefs.get('bools_are_tristate') is None:
self.prefs.set('bools_are_tristate', defs['bools_are_tristate'])
# Migrate column coloring rules
if self.prefs.get('column_color_name_1', None) is not None:
from calibre.library.coloring import migrate_old_rule
old_rules = []
for i in range(1, 6):
col = self.prefs.get('column_color_name_'+str(i), None)
templ = self.prefs.get('column_color_template_'+str(i), None)
if col and templ:
try:
del self.prefs['column_color_name_'+str(i)]
rules = migrate_old_rule(self.field_metadata, templ)
for templ in rules:
old_rules.append((col, templ))
except:
pass
if old_rules:
self.prefs['column_color_rules'] += old_rules
# Migrate saved search and user categories to db preference scheme
def migrate_preference(key, default):
oldval = prefs[key]
if oldval != default:
self.prefs[key] = oldval
prefs[key] = default
if key not in self.prefs:
self.prefs[key] = default
migrate_preference('user_categories', {})
migrate_preference('saved_searches', {})
# migrate grouped_search_terms
if self.prefs.get('grouped_search_terms', None) is None:
try:
ogst = tweaks.get('grouped_search_terms', {})
ngst = {}
for t in ogst:
ngst[icu_lower(t)] = ogst[t]
self.prefs.set('grouped_search_terms', ngst)
except:
pass
# Rename any user categories with names that differ only in case
        user_cats = self.prefs.get('user_categories', {})
catmap = {}
for uc in user_cats:
ucl = icu_lower(uc)
if ucl not in catmap:
catmap[ucl] = []
catmap[ucl].append(uc)
cats_changed = False
for uc in catmap:
if len(catmap[uc]) > 1:
prints('found user category case overlap', catmap[uc])
cat = catmap[uc][0]
suffix = 1
while icu_lower((cat + unicode(suffix))) in catmap:
suffix += 1
prints('Renaming user category %s to %s'%(cat, cat+unicode(suffix)))
user_cats[cat + unicode(suffix)] = user_cats[cat]
del user_cats[cat]
cats_changed = True
if cats_changed:
self.prefs.set('user_categories', user_cats)
    @property
    def conn(self):
        if self._conn is None:
            # Use the Connection subclass defined above, so that the custom
            # collations/functions and the get()/execute() helpers exist
            self._conn = Connection(self.dbpath)
            if self._exists and self.user_version == 0:
                self._conn.close()
                os.remove(self.dbpath)
                self._conn = Connection(self.dbpath)
        return self._conn
@dynamic_property
def user_version(self):
doc = 'The user version of this database'
def fget(self):
return self.conn.get('pragma user_version;', all=False)
def fset(self, val):
self.conn.execute('pragma user_version=%d'%int(val))
return property(doc=doc, fget=fget, fset=fset)
def initialize_database(self):
metadata_sqlite = P('metadata_sqlite.sql', data=True,
allow_user_override=False).decode('utf-8')
self.conn.executescript(metadata_sqlite)
if self.user_version == 0:
self.user_version = 1
# }}}
# Database layer API {{{
@classmethod
def exists_at(cls, path):
return path and os.path.exists(os.path.join(path, 'metadata.db'))
@dynamic_property
def library_id(self):
doc = ('The UUID for this library. As long as the user only operates'
' on libraries with calibre, it will be unique')
def fget(self):
if getattr(self, '_library_id_', None) is None:
ans = self.conn.get('SELECT uuid FROM library_id', all=False)
if ans is None:
ans = str(uuid.uuid4())
self.library_id = ans
else:
self._library_id_ = ans
return self._library_id_
def fset(self, val):
self._library_id_ = unicode(val)
self.conn.execute('''
DELETE FROM library_id;
INSERT INTO library_id (uuid) VALUES (?);
                ''', (self._library_id_,))
return property(doc=doc, fget=fget, fset=fset)
def last_modified(self):
''' Return last modified time as a UTC datetime object '''
return utcfromtimestamp(os.stat(self.dbpath).st_mtime)
# }}}
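
# Illustrative usage (requires a full calibre environment, since P(),
# prefs and tweaks are only available there; the path is hypothetical):
#
#   db = DB(os.path.expanduser('~/Calibre Library'))
#   print(db.library_id)       # stable UUID identifying this library
#   print(db.last_modified())  # UTC datetime of the last on-disk change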