Use hashes as primary key for resources

This will allow rebuild of database as the backup documents will refer
to resource hashes not ids (its very difficult to restore resources with
ids unchanged). Also this allows docs to be migrated between databases
easily.
This commit is contained in:
Kovid Goyal 2023-08-24 08:54:04 +05:30
parent 713f1af61e
commit 6017bafe48
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 39 additions and 40 deletions

View File

@ -6,16 +6,16 @@ CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY AUTOINCREMENT,
UNIQUE(item, colname)
);
CREATE TABLE notes_db.resources ( id INTEGER PRIMARY KEY AUTOINCREMENT,
hash TEXT NOT NULL UNIQUE ON CONFLICT FAIL,
CREATE TABLE notes_db.resources (
hash TEXT NOT NULL PRIMARY KEY ON CONFLICT FAIL,
name TEXT NOT NULL UNIQUE ON CONFLICT FAIL
);
) WITHOUT ROWID;
CREATE TABLE notes_db.notes_resources_link ( id INTEGER PRIMARY KEY,
note INTEGER NOT NULL,
resource INTEGER NOT NULL,
resource TEXT NOT NULL,
FOREIGN KEY(note) REFERENCES notes(id),
FOREIGN KEY(resource) REFERENCES resources(id),
FOREIGN KEY(resource) REFERENCES resources(hash),
UNIQUE(note, resource)
);
@ -45,7 +45,7 @@ END;
CREATE TRIGGER notes_db.notes_db_resources_delete_trg BEFORE DELETE ON notes_db.resources
BEGIN
DELETE FROM notes_resources_link WHERE resource=OLD.id;
DELETE FROM notes_resources_link WHERE resource=OLD.hash;
END;
PRAGMA notes_db.user_version=1;

View File

@ -972,9 +972,9 @@ class DB:
def notes_for(self, field_name, item_id):
return self.notes.get_note(self.conn, field_name, item_id) or ''
def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_ids) -> int:
def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_hashes) -> int:
id_val = self.tables[field].id_map[item_id]
return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_ids, searchable_text)
return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_hashes, searchable_text)
def unretire_note_for(self, field, item_id) -> int:
id_val = self.tables[field].id_map[item_id]
@ -983,8 +983,8 @@ class DB:
def add_notes_resource(self, path_or_stream, name) -> int:
return self.notes.add_resource(self.conn, path_or_stream, name)
def get_notes_resource(self, resource_id) -> Optional[dict]:
return self.notes.get_resource_data(self.conn, resource_id)
def get_notes_resource(self, resource_hash) -> Optional[dict]:
return self.notes.get_resource_data(self.conn, resource_hash)
def notes_resources_used_by(self, field, item_id):
conn = self.conn

View File

@ -681,16 +681,16 @@ class Cache:
return self.backend.notes_for(field, item_id)
@write_api
def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_ids=()) -> int:
return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_ids)
def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_hashes=()) -> int:
return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_hashes)
@write_api
def add_notes_resource(self, path_or_stream_or_data, name: str) -> int:
return self.backend.add_notes_resource(path_or_stream_or_data, name)
@read_api
def get_notes_resource(self, resource_id) -> Optional[dict]:
return self.backend.get_notes_resource(resource_id)
def get_notes_resource(self, resource_hash) -> Optional[dict]:
return self.backend.get_notes_resource(resource_hash)
@read_api
def notes_resources_used_by(self, field, item_id):

View File

@ -8,7 +8,7 @@ import time
import xxhash
from contextlib import suppress
from itertools import count, repeat
from typing import Optional, Union
from typing import Optional
from calibre.constants import iswindows
from calibre.db import FTSQueryError
@ -103,15 +103,14 @@ class Notes:
self.remove_unreferenced_resources(conn)
def remove_unreferenced_resources(self, conn):
for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE id NOT IN (SELECT resource FROM notes_db.notes_resources_link)'):
found = False
for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)'):
found = True
remove_with_retry(self.path_for_resource(conn, rhash))
conn.execute('DELETE FROM notes_db.resources WHERE id NOT IN (SELECT resource FROM notes_db.notes_resources_link)')
if found:
conn.execute('DELETE FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)')
def path_for_resource(self, conn, resource_hash_or_resource_id: Union[str,int]) -> str:
if isinstance(resource_hash_or_resource_id, str):
resource_hash = resource_hash_or_resource_id
else:
resource_hash = conn.get('SELECT hash FROM notes_db.resources WHERE id=?', (resource_hash_or_resource_id,), all=False)
def path_for_resource(self, conn, resource_hash: str) -> str:
hashalg, digest = resource_hash.split(':', 1)
prefix = digest[:2]
# Cant use colons in filenames on windows safely
@ -188,7 +187,7 @@ class Notes:
remove_with_retry(srcdir, is_dir=True)
return note_id
def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_ids=(), searchable_text=copy_marked_up_text):
def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_hashes=(), searchable_text=copy_marked_up_text):
if searchable_text is copy_marked_up_text:
searchable_text = marked_up_text
note_id = self.note_id_for(conn, field_name, item_id)
@ -199,13 +198,13 @@ class Notes:
resources = ()
if old_resources:
resources = conn.get(
'SELECT hash,name FROM notes_db.resources WHERE id IN ({})'.format(','.join(repeat('?', len(old_resources)))),
'SELECT hash,name FROM notes_db.resources WHERE hash IN ({})'.format(','.join(repeat('?', len(old_resources)))),
tuple(old_resources))
self.retire_entry(field_name, item_id, item_value, resources, note_id)
if old_resources:
self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False)
return -1
new_resources = frozenset(used_resource_ids)
new_resources = frozenset(used_resource_hashes)
resources_to_potentially_remove = old_resources - new_resources
resources_to_add = new_resources - old_resources
if note_id is None:
@ -232,7 +231,7 @@ class Notes:
):
return {
'id': note_id, 'doc': doc, 'searchable_text': searchable_text,
'resource_ids': frozenset(self.resources_used_by(conn, note_id)),
'resource_hashes': frozenset(self.resources_used_by(conn, note_id)),
}
def rename_note(self, conn, field_name, old_item_id, new_item_id, new_item_value):
@ -245,7 +244,7 @@ class Notes:
old_note = self.get_note_data(conn, field_name, old_item_id)
if not old_note or not old_note['doc']:
return
self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_ids'], old_note['searchable_text'])
self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_hashes'], old_note['searchable_text'])
def trim_retired_dir(self):
items = []
@ -290,11 +289,11 @@ class Notes:
f.write(data)
base_name, ext = os.path.splitext(name)
c = 0
for (resource_id, existing_name) in conn.execute('SELECT id,name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
for (existing_name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
if existing_name != name and update_name:
while True:
try:
conn.execute('UPDATE notes_db.resources SET name=? WHERE id=?', (name, resource_id))
conn.execute('UPDATE notes_db.resources SET name=? WHERE hash=?', (name, resource_hash))
break
except apsw.ConstraintError:
c += 1
@ -303,15 +302,15 @@ class Notes:
else:
while True:
try:
resource_id = conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?) RETURNING id', (resource_hash, name), all=False)
conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?)', (resource_hash, name), all=False)
break
except apsw.ConstraintError:
c += 1
name = f'{base_name}-{c}{ext}'
return resource_id
return resource_hash
def get_resource_data(self, conn, resource_id) -> Optional[dict]:
for (name, resource_hash) in conn.execute('SELECT name,hash FROM notes_db.resources WHERE id=?', (resource_id,)):
def get_resource_data(self, conn, resource_hash) -> Optional[dict]:
for (name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
path = self.path_for_resource(conn, resource_hash)
path = make_long_path_useable(path)
os.listdir(os.path.dirname(path))

View File

@ -266,7 +266,7 @@ class FilesystemTest(BaseTest):
self.assertEqual('recurse', open(os.path.join(bookdir, 'sub', 'recurse')).read())
r1 = cache.add_notes_resource(b'res1', 'res.jpg')
r2 = cache.add_notes_resource(b'res2', 'res.jpg')
cache.set_notes_for('authors', 2, 'some notes', resource_ids=(r1, r2))
cache.set_notes_for('authors', 2, 'some notes', resource_hashes=(r1, r2))
cache.add_format(1, 'TXT', BytesIO(b'testing exim'))
cache.fts_indexing_sleep_time = 0.001
cache.enable_fts()

View File

@ -16,13 +16,13 @@ def test_notes_api(self: 'NotesTest'):
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg')
self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg')
note_id = cache.set_notes_for('authors', authors[0], doc, resource_ids=(h1, h2))
note_id = cache.set_notes_for('authors', authors[0], doc, resource_hashes=(h1, h2))
self.ae(cache.notes_for('authors', authors[0]), doc)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2)['data'], b'resource2')
doc2 = 'a different note to replace the first one'
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,)))
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,)))
self.ae(cache.notes_for('authors', authors[0]), doc2)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
@ -32,7 +32,7 @@ def test_notes_api(self: 'NotesTest'):
# check retirement
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,h2)))
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,h2)))
self.ae(-1, cache.set_notes_for('authors', authors[0], ''))
self.ae(cache.notes_for('authors', authors[0]), '')
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset())
@ -41,7 +41,7 @@ def test_notes_api(self: 'NotesTest'):
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_ids=(h1, h2))
nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_hashes=(h1, h2))
self.assertNotEqual(note_id, nnote_id)
self.ae(-1, cache.set_notes_for('authors', authors[1], ''))
after = os.listdir(notes.retired_dir)
@ -63,7 +63,7 @@ def test_cache_api(self: 'NotesTest'):
doc = 'simple notes for an author'
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
cache.set_notes_for('authors', author_id, doc, resource_ids=(h1, h2))
cache.set_notes_for('authors', author_id, doc, resource_hashes=(h1, h2))
# test renaming to a new author preserves notes
cache.rename_items('authors', {author_id: 'renamed author'})
raid = cache.get_item_id('authors', 'renamed author')
@ -92,7 +92,7 @@ def test_cache_api(self: 'NotesTest'):
tag_id = cache.get_item_id('#tags', tags[0])
h1 = cache.add_notes_resource(b'resource1t', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2t', 'r1.jpg')
cache.set_notes_for('#tags', tag_id, doc, resource_ids=(h1, h2))
cache.set_notes_for('#tags', tag_id, doc, resource_hashes=(h1, h2))
self.ae(cache.notes_for('#tags', tag_id), doc)
cache.delete_custom_column('tags')
cache.close()