From 6017bafe48b9ac097af2006535b8e35deac560c5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 24 Aug 2023 08:54:04 +0530 Subject: [PATCH] Use hashes as primary key for resources This will allow rebuild of database as the backup documents will refer to resource hashes not ids (its very difficult to restore resources with ids unchanged). Also this allows docs to be migrated between databases easily. --- resources/notes_sqlite.sql | 12 +++++----- src/calibre/db/backend.py | 8 +++---- src/calibre/db/cache.py | 8 +++---- src/calibre/db/notes/connect.py | 37 +++++++++++++++--------------- src/calibre/db/tests/filesystem.py | 2 +- src/calibre/db/tests/notes.py | 12 +++++----- 6 files changed, 39 insertions(+), 40 deletions(-) diff --git a/resources/notes_sqlite.sql b/resources/notes_sqlite.sql index 729d04f3bf..ac0143e841 100644 --- a/resources/notes_sqlite.sql +++ b/resources/notes_sqlite.sql @@ -6,16 +6,16 @@ CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY AUTOINCREMENT, UNIQUE(item, colname) ); -CREATE TABLE notes_db.resources ( id INTEGER PRIMARY KEY AUTOINCREMENT, - hash TEXT NOT NULL UNIQUE ON CONFLICT FAIL, +CREATE TABLE notes_db.resources ( + hash TEXT NOT NULL PRIMARY KEY ON CONFLICT FAIL, name TEXT NOT NULL UNIQUE ON CONFLICT FAIL -); +) WITHOUT ROWID; CREATE TABLE notes_db.notes_resources_link ( id INTEGER PRIMARY KEY, note INTEGER NOT NULL, - resource INTEGER NOT NULL, + resource TEXT NOT NULL, FOREIGN KEY(note) REFERENCES notes(id), - FOREIGN KEY(resource) REFERENCES resources(id), + FOREIGN KEY(resource) REFERENCES resources(hash), UNIQUE(note, resource) ); @@ -45,7 +45,7 @@ END; CREATE TRIGGER notes_db.notes_db_resources_delete_trg BEFORE DELETE ON notes_db.resources BEGIN - DELETE FROM notes_resources_link WHERE resource=OLD.id; + DELETE FROM notes_resources_link WHERE resource=OLD.hash; END; PRAGMA notes_db.user_version=1; diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py index 922536a484..fd5f0b6d84 100644 --- a/src/calibre/db/backend.py +++ b/src/calibre/db/backend.py @@ -972,9 +972,9 @@ class DB: def notes_for(self, field_name, item_id): return self.notes.get_note(self.conn, field_name, item_id) or '' - def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_ids) -> int: + def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_hashes) -> int: id_val = self.tables[field].id_map[item_id] - return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_ids, searchable_text) + return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_hashes, searchable_text) def unretire_note_for(self, field, item_id) -> int: id_val = self.tables[field].id_map[item_id] @@ -983,8 +983,8 @@ class DB: def add_notes_resource(self, path_or_stream, name) -> int: return self.notes.add_resource(self.conn, path_or_stream, name) - def get_notes_resource(self, resource_id) -> Optional[dict]: - return self.notes.get_resource_data(self.conn, resource_id) + def get_notes_resource(self, resource_hash) -> Optional[dict]: + return self.notes.get_resource_data(self.conn, resource_hash) def notes_resources_used_by(self, field, item_id): conn = self.conn diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py index 667c727832..5e4c0883f8 100644 --- a/src/calibre/db/cache.py +++ b/src/calibre/db/cache.py @@ -681,16 +681,16 @@ class Cache: return self.backend.notes_for(field, item_id) @write_api - def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_ids=()) -> int: - return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_ids) + def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_hashes=()) -> int: + return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_hashes) @write_api def add_notes_resource(self, path_or_stream_or_data, name: str) -> int: return self.backend.add_notes_resource(path_or_stream_or_data, name) @read_api - def get_notes_resource(self, resource_id) -> Optional[dict]: - return self.backend.get_notes_resource(resource_id) + def get_notes_resource(self, resource_hash) -> Optional[dict]: + return self.backend.get_notes_resource(resource_hash) @read_api def notes_resources_used_by(self, field, item_id): diff --git a/src/calibre/db/notes/connect.py b/src/calibre/db/notes/connect.py index 5c9f1c2d53..c07c2c5cb8 100644 --- a/src/calibre/db/notes/connect.py +++ b/src/calibre/db/notes/connect.py @@ -8,7 +8,7 @@ import time import xxhash from contextlib import suppress from itertools import count, repeat -from typing import Optional, Union +from typing import Optional from calibre.constants import iswindows from calibre.db import FTSQueryError @@ -103,15 +103,14 @@ class Notes: self.remove_unreferenced_resources(conn) def remove_unreferenced_resources(self, conn): - for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE id NOT IN (SELECT resource FROM notes_db.notes_resources_link)'): + found = False + for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)'): + found = True remove_with_retry(self.path_for_resource(conn, rhash)) - conn.execute('DELETE FROM notes_db.resources WHERE id NOT IN (SELECT resource FROM notes_db.notes_resources_link)') + if found: + conn.execute('DELETE FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)') - def path_for_resource(self, conn, resource_hash_or_resource_id: Union[str,int]) -> str: - if isinstance(resource_hash_or_resource_id, str): - resource_hash = resource_hash_or_resource_id - else: - resource_hash = conn.get('SELECT hash FROM notes_db.resources WHERE id=?', (resource_hash_or_resource_id,), all=False) + def path_for_resource(self, conn, resource_hash: str) -> str: hashalg, digest = resource_hash.split(':', 1) prefix = digest[:2] # Cant use colons in filenames on windows safely @@ -188,7 +187,7 @@ class Notes: remove_with_retry(srcdir, is_dir=True) return note_id - def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_ids=(), searchable_text=copy_marked_up_text): + def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_hashes=(), searchable_text=copy_marked_up_text): if searchable_text is copy_marked_up_text: searchable_text = marked_up_text note_id = self.note_id_for(conn, field_name, item_id) @@ -199,13 +198,13 @@ class Notes: resources = () if old_resources: resources = conn.get( - 'SELECT hash,name FROM notes_db.resources WHERE id IN ({})'.format(','.join(repeat('?', len(old_resources)))), + 'SELECT hash,name FROM notes_db.resources WHERE hash IN ({})'.format(','.join(repeat('?', len(old_resources)))), tuple(old_resources)) self.retire_entry(field_name, item_id, item_value, resources, note_id) if old_resources: self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False) return -1 - new_resources = frozenset(used_resource_ids) + new_resources = frozenset(used_resource_hashes) resources_to_potentially_remove = old_resources - new_resources resources_to_add = new_resources - old_resources if note_id is None: @@ -232,7 +231,7 @@ class Notes: ): return { 'id': note_id, 'doc': doc, 'searchable_text': searchable_text, - 'resource_ids': frozenset(self.resources_used_by(conn, note_id)), + 'resource_hashes': frozenset(self.resources_used_by(conn, note_id)), } def rename_note(self, conn, field_name, old_item_id, new_item_id, new_item_value): @@ -245,7 +244,7 @@ class Notes: old_note = self.get_note_data(conn, field_name, old_item_id) if not old_note or not old_note['doc']: return - self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_ids'], old_note['searchable_text']) + self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_hashes'], old_note['searchable_text']) def trim_retired_dir(self): items = [] @@ -290,11 +289,11 @@ class Notes: f.write(data) base_name, ext = os.path.splitext(name) c = 0 - for (resource_id, existing_name) in conn.execute('SELECT id,name FROM notes_db.resources WHERE hash=?', (resource_hash,)): + for (existing_name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)): if existing_name != name and update_name: while True: try: - conn.execute('UPDATE notes_db.resources SET name=? WHERE id=?', (name, resource_id)) + conn.execute('UPDATE notes_db.resources SET name=? WHERE hash=?', (name, resource_hash)) break except apsw.ConstraintError: c += 1 @@ -303,15 +302,15 @@ class Notes: else: while True: try: - resource_id = conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?) RETURNING id', (resource_hash, name), all=False) + conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?)', (resource_hash, name), all=False) break except apsw.ConstraintError: c += 1 name = f'{base_name}-{c}{ext}' - return resource_id + return resource_hash - def get_resource_data(self, conn, resource_id) -> Optional[dict]: - for (name, resource_hash) in conn.execute('SELECT name,hash FROM notes_db.resources WHERE id=?', (resource_id,)): + def get_resource_data(self, conn, resource_hash) -> Optional[dict]: + for (name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)): path = self.path_for_resource(conn, resource_hash) path = make_long_path_useable(path) os.listdir(os.path.dirname(path)) diff --git a/src/calibre/db/tests/filesystem.py b/src/calibre/db/tests/filesystem.py index 085e783427..4784236ce5 100644 --- a/src/calibre/db/tests/filesystem.py +++ b/src/calibre/db/tests/filesystem.py @@ -266,7 +266,7 @@ class FilesystemTest(BaseTest): self.assertEqual('recurse', open(os.path.join(bookdir, 'sub', 'recurse')).read()) r1 = cache.add_notes_resource(b'res1', 'res.jpg') r2 = cache.add_notes_resource(b'res2', 'res.jpg') - cache.set_notes_for('authors', 2, 'some notes', resource_ids=(r1, r2)) + cache.set_notes_for('authors', 2, 'some notes', resource_hashes=(r1, r2)) cache.add_format(1, 'TXT', BytesIO(b'testing exim')) cache.fts_indexing_sleep_time = 0.001 cache.enable_fts() diff --git a/src/calibre/db/tests/notes.py b/src/calibre/db/tests/notes.py index 94c9fb1ccf..01ba62fc4a 100644 --- a/src/calibre/db/tests/notes.py +++ b/src/calibre/db/tests/notes.py @@ -16,13 +16,13 @@ def test_notes_api(self: 'NotesTest'): h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg') self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg') - note_id = cache.set_notes_for('authors', authors[0], doc, resource_ids=(h1, h2)) + note_id = cache.set_notes_for('authors', authors[0], doc, resource_hashes=(h1, h2)) self.ae(cache.notes_for('authors', authors[0]), doc) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2})) self.ae(cache.get_notes_resource(h1)['data'], b'resource1') self.ae(cache.get_notes_resource(h2)['data'], b'resource2') doc2 = 'a different note to replace the first one' - self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,))) + self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,))) self.ae(cache.notes_for('authors', authors[0]), doc2) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1})) self.ae(cache.get_notes_resource(h1)['data'], b'resource1') @@ -32,7 +32,7 @@ def test_notes_api(self: 'NotesTest'): # check retirement h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') - self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,h2))) + self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,h2))) self.ae(-1, cache.set_notes_for('authors', authors[0], '')) self.ae(cache.notes_for('authors', authors[0]), '') self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset()) @@ -41,7 +41,7 @@ def test_notes_api(self: 'NotesTest'): h1 = cache.add_notes_resource(b'resource1', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') - nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_ids=(h1, h2)) + nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_hashes=(h1, h2)) self.assertNotEqual(note_id, nnote_id) self.ae(-1, cache.set_notes_for('authors', authors[1], '')) after = os.listdir(notes.retired_dir) @@ -63,7 +63,7 @@ def test_cache_api(self: 'NotesTest'): doc = 'simple notes for an author' h1 = cache.add_notes_resource(b'resource1', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') - cache.set_notes_for('authors', author_id, doc, resource_ids=(h1, h2)) + cache.set_notes_for('authors', author_id, doc, resource_hashes=(h1, h2)) # test renaming to a new author preserves notes cache.rename_items('authors', {author_id: 'renamed author'}) raid = cache.get_item_id('authors', 'renamed author') @@ -92,7 +92,7 @@ def test_cache_api(self: 'NotesTest'): tag_id = cache.get_item_id('#tags', tags[0]) h1 = cache.add_notes_resource(b'resource1t', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2t', 'r1.jpg') - cache.set_notes_for('#tags', tag_id, doc, resource_ids=(h1, h2)) + cache.set_notes_for('#tags', tag_id, doc, resource_hashes=(h1, h2)) self.ae(cache.notes_for('#tags', tag_id), doc) cache.delete_custom_column('tags') cache.close()