Use hashes as primary key for resources

This will allow rebuild of database as the backup documents will refer
to resource hashes not ids (its very difficult to restore resources with
ids unchanged). Also this allows docs to be migrated between databases
easily.
This commit is contained in:
Kovid Goyal 2023-08-24 08:54:04 +05:30
parent 713f1af61e
commit 6017bafe48
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 39 additions and 40 deletions

View File

@ -6,16 +6,16 @@ CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY AUTOINCREMENT,
UNIQUE(item, colname) UNIQUE(item, colname)
); );
CREATE TABLE notes_db.resources ( id INTEGER PRIMARY KEY AUTOINCREMENT, CREATE TABLE notes_db.resources (
hash TEXT NOT NULL UNIQUE ON CONFLICT FAIL, hash TEXT NOT NULL PRIMARY KEY ON CONFLICT FAIL,
name TEXT NOT NULL UNIQUE ON CONFLICT FAIL name TEXT NOT NULL UNIQUE ON CONFLICT FAIL
); ) WITHOUT ROWID;
CREATE TABLE notes_db.notes_resources_link ( id INTEGER PRIMARY KEY, CREATE TABLE notes_db.notes_resources_link ( id INTEGER PRIMARY KEY,
note INTEGER NOT NULL, note INTEGER NOT NULL,
resource INTEGER NOT NULL, resource TEXT NOT NULL,
FOREIGN KEY(note) REFERENCES notes(id), FOREIGN KEY(note) REFERENCES notes(id),
FOREIGN KEY(resource) REFERENCES resources(id), FOREIGN KEY(resource) REFERENCES resources(hash),
UNIQUE(note, resource) UNIQUE(note, resource)
); );
@ -45,7 +45,7 @@ END;
CREATE TRIGGER notes_db.notes_db_resources_delete_trg BEFORE DELETE ON notes_db.resources CREATE TRIGGER notes_db.notes_db_resources_delete_trg BEFORE DELETE ON notes_db.resources
BEGIN BEGIN
DELETE FROM notes_resources_link WHERE resource=OLD.id; DELETE FROM notes_resources_link WHERE resource=OLD.hash;
END; END;
PRAGMA notes_db.user_version=1; PRAGMA notes_db.user_version=1;

View File

@ -972,9 +972,9 @@ class DB:
def notes_for(self, field_name, item_id): def notes_for(self, field_name, item_id):
return self.notes.get_note(self.conn, field_name, item_id) or '' return self.notes.get_note(self.conn, field_name, item_id) or ''
def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_ids) -> int: def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_hashes) -> int:
id_val = self.tables[field].id_map[item_id] id_val = self.tables[field].id_map[item_id]
return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_ids, searchable_text) return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_hashes, searchable_text)
def unretire_note_for(self, field, item_id) -> int: def unretire_note_for(self, field, item_id) -> int:
id_val = self.tables[field].id_map[item_id] id_val = self.tables[field].id_map[item_id]
@ -983,8 +983,8 @@ class DB:
def add_notes_resource(self, path_or_stream, name) -> int: def add_notes_resource(self, path_or_stream, name) -> int:
return self.notes.add_resource(self.conn, path_or_stream, name) return self.notes.add_resource(self.conn, path_or_stream, name)
def get_notes_resource(self, resource_id) -> Optional[dict]: def get_notes_resource(self, resource_hash) -> Optional[dict]:
return self.notes.get_resource_data(self.conn, resource_id) return self.notes.get_resource_data(self.conn, resource_hash)
def notes_resources_used_by(self, field, item_id): def notes_resources_used_by(self, field, item_id):
conn = self.conn conn = self.conn

View File

@ -681,16 +681,16 @@ class Cache:
return self.backend.notes_for(field, item_id) return self.backend.notes_for(field, item_id)
@write_api @write_api
def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_ids=()) -> int: def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_hashes=()) -> int:
return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_ids) return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_hashes)
@write_api @write_api
def add_notes_resource(self, path_or_stream_or_data, name: str) -> int: def add_notes_resource(self, path_or_stream_or_data, name: str) -> int:
return self.backend.add_notes_resource(path_or_stream_or_data, name) return self.backend.add_notes_resource(path_or_stream_or_data, name)
@read_api @read_api
def get_notes_resource(self, resource_id) -> Optional[dict]: def get_notes_resource(self, resource_hash) -> Optional[dict]:
return self.backend.get_notes_resource(resource_id) return self.backend.get_notes_resource(resource_hash)
@read_api @read_api
def notes_resources_used_by(self, field, item_id): def notes_resources_used_by(self, field, item_id):

View File

@ -8,7 +8,7 @@ import time
import xxhash import xxhash
from contextlib import suppress from contextlib import suppress
from itertools import count, repeat from itertools import count, repeat
from typing import Optional, Union from typing import Optional
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.db import FTSQueryError from calibre.db import FTSQueryError
@ -103,15 +103,14 @@ class Notes:
self.remove_unreferenced_resources(conn) self.remove_unreferenced_resources(conn)
def remove_unreferenced_resources(self, conn): def remove_unreferenced_resources(self, conn):
for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE id NOT IN (SELECT resource FROM notes_db.notes_resources_link)'): found = False
for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)'):
found = True
remove_with_retry(self.path_for_resource(conn, rhash)) remove_with_retry(self.path_for_resource(conn, rhash))
conn.execute('DELETE FROM notes_db.resources WHERE id NOT IN (SELECT resource FROM notes_db.notes_resources_link)') if found:
conn.execute('DELETE FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)')
def path_for_resource(self, conn, resource_hash_or_resource_id: Union[str,int]) -> str: def path_for_resource(self, conn, resource_hash: str) -> str:
if isinstance(resource_hash_or_resource_id, str):
resource_hash = resource_hash_or_resource_id
else:
resource_hash = conn.get('SELECT hash FROM notes_db.resources WHERE id=?', (resource_hash_or_resource_id,), all=False)
hashalg, digest = resource_hash.split(':', 1) hashalg, digest = resource_hash.split(':', 1)
prefix = digest[:2] prefix = digest[:2]
# Cant use colons in filenames on windows safely # Cant use colons in filenames on windows safely
@ -188,7 +187,7 @@ class Notes:
remove_with_retry(srcdir, is_dir=True) remove_with_retry(srcdir, is_dir=True)
return note_id return note_id
def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_ids=(), searchable_text=copy_marked_up_text): def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_hashes=(), searchable_text=copy_marked_up_text):
if searchable_text is copy_marked_up_text: if searchable_text is copy_marked_up_text:
searchable_text = marked_up_text searchable_text = marked_up_text
note_id = self.note_id_for(conn, field_name, item_id) note_id = self.note_id_for(conn, field_name, item_id)
@ -199,13 +198,13 @@ class Notes:
resources = () resources = ()
if old_resources: if old_resources:
resources = conn.get( resources = conn.get(
'SELECT hash,name FROM notes_db.resources WHERE id IN ({})'.format(','.join(repeat('?', len(old_resources)))), 'SELECT hash,name FROM notes_db.resources WHERE hash IN ({})'.format(','.join(repeat('?', len(old_resources)))),
tuple(old_resources)) tuple(old_resources))
self.retire_entry(field_name, item_id, item_value, resources, note_id) self.retire_entry(field_name, item_id, item_value, resources, note_id)
if old_resources: if old_resources:
self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False) self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False)
return -1 return -1
new_resources = frozenset(used_resource_ids) new_resources = frozenset(used_resource_hashes)
resources_to_potentially_remove = old_resources - new_resources resources_to_potentially_remove = old_resources - new_resources
resources_to_add = new_resources - old_resources resources_to_add = new_resources - old_resources
if note_id is None: if note_id is None:
@ -232,7 +231,7 @@ class Notes:
): ):
return { return {
'id': note_id, 'doc': doc, 'searchable_text': searchable_text, 'id': note_id, 'doc': doc, 'searchable_text': searchable_text,
'resource_ids': frozenset(self.resources_used_by(conn, note_id)), 'resource_hashes': frozenset(self.resources_used_by(conn, note_id)),
} }
def rename_note(self, conn, field_name, old_item_id, new_item_id, new_item_value): def rename_note(self, conn, field_name, old_item_id, new_item_id, new_item_value):
@ -245,7 +244,7 @@ class Notes:
old_note = self.get_note_data(conn, field_name, old_item_id) old_note = self.get_note_data(conn, field_name, old_item_id)
if not old_note or not old_note['doc']: if not old_note or not old_note['doc']:
return return
self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_ids'], old_note['searchable_text']) self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_hashes'], old_note['searchable_text'])
def trim_retired_dir(self): def trim_retired_dir(self):
items = [] items = []
@ -290,11 +289,11 @@ class Notes:
f.write(data) f.write(data)
base_name, ext = os.path.splitext(name) base_name, ext = os.path.splitext(name)
c = 0 c = 0
for (resource_id, existing_name) in conn.execute('SELECT id,name FROM notes_db.resources WHERE hash=?', (resource_hash,)): for (existing_name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
if existing_name != name and update_name: if existing_name != name and update_name:
while True: while True:
try: try:
conn.execute('UPDATE notes_db.resources SET name=? WHERE id=?', (name, resource_id)) conn.execute('UPDATE notes_db.resources SET name=? WHERE hash=?', (name, resource_hash))
break break
except apsw.ConstraintError: except apsw.ConstraintError:
c += 1 c += 1
@ -303,15 +302,15 @@ class Notes:
else: else:
while True: while True:
try: try:
resource_id = conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?) RETURNING id', (resource_hash, name), all=False) conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?)', (resource_hash, name), all=False)
break break
except apsw.ConstraintError: except apsw.ConstraintError:
c += 1 c += 1
name = f'{base_name}-{c}{ext}' name = f'{base_name}-{c}{ext}'
return resource_id return resource_hash
def get_resource_data(self, conn, resource_id) -> Optional[dict]: def get_resource_data(self, conn, resource_hash) -> Optional[dict]:
for (name, resource_hash) in conn.execute('SELECT name,hash FROM notes_db.resources WHERE id=?', (resource_id,)): for (name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
path = self.path_for_resource(conn, resource_hash) path = self.path_for_resource(conn, resource_hash)
path = make_long_path_useable(path) path = make_long_path_useable(path)
os.listdir(os.path.dirname(path)) os.listdir(os.path.dirname(path))

View File

@ -266,7 +266,7 @@ class FilesystemTest(BaseTest):
self.assertEqual('recurse', open(os.path.join(bookdir, 'sub', 'recurse')).read()) self.assertEqual('recurse', open(os.path.join(bookdir, 'sub', 'recurse')).read())
r1 = cache.add_notes_resource(b'res1', 'res.jpg') r1 = cache.add_notes_resource(b'res1', 'res.jpg')
r2 = cache.add_notes_resource(b'res2', 'res.jpg') r2 = cache.add_notes_resource(b'res2', 'res.jpg')
cache.set_notes_for('authors', 2, 'some notes', resource_ids=(r1, r2)) cache.set_notes_for('authors', 2, 'some notes', resource_hashes=(r1, r2))
cache.add_format(1, 'TXT', BytesIO(b'testing exim')) cache.add_format(1, 'TXT', BytesIO(b'testing exim'))
cache.fts_indexing_sleep_time = 0.001 cache.fts_indexing_sleep_time = 0.001
cache.enable_fts() cache.enable_fts()

View File

@ -16,13 +16,13 @@ def test_notes_api(self: 'NotesTest'):
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg') self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg')
self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg') self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg')
note_id = cache.set_notes_for('authors', authors[0], doc, resource_ids=(h1, h2)) note_id = cache.set_notes_for('authors', authors[0], doc, resource_hashes=(h1, h2))
self.ae(cache.notes_for('authors', authors[0]), doc) self.ae(cache.notes_for('authors', authors[0]), doc)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2})) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1') self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2)['data'], b'resource2') self.ae(cache.get_notes_resource(h2)['data'], b'resource2')
doc2 = 'a different note to replace the first one' doc2 = 'a different note to replace the first one'
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,))) self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,)))
self.ae(cache.notes_for('authors', authors[0]), doc2) self.ae(cache.notes_for('authors', authors[0]), doc2)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1})) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1') self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
@ -32,7 +32,7 @@ def test_notes_api(self: 'NotesTest'):
# check retirement # check retirement
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,h2))) self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,h2)))
self.ae(-1, cache.set_notes_for('authors', authors[0], '')) self.ae(-1, cache.set_notes_for('authors', authors[0], ''))
self.ae(cache.notes_for('authors', authors[0]), '') self.ae(cache.notes_for('authors', authors[0]), '')
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset()) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset())
@ -41,7 +41,7 @@ def test_notes_api(self: 'NotesTest'):
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg') h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_ids=(h1, h2)) nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_hashes=(h1, h2))
self.assertNotEqual(note_id, nnote_id) self.assertNotEqual(note_id, nnote_id)
self.ae(-1, cache.set_notes_for('authors', authors[1], '')) self.ae(-1, cache.set_notes_for('authors', authors[1], ''))
after = os.listdir(notes.retired_dir) after = os.listdir(notes.retired_dir)
@ -63,7 +63,7 @@ def test_cache_api(self: 'NotesTest'):
doc = 'simple notes for an author' doc = 'simple notes for an author'
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg') h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
cache.set_notes_for('authors', author_id, doc, resource_ids=(h1, h2)) cache.set_notes_for('authors', author_id, doc, resource_hashes=(h1, h2))
# test renaming to a new author preserves notes # test renaming to a new author preserves notes
cache.rename_items('authors', {author_id: 'renamed author'}) cache.rename_items('authors', {author_id: 'renamed author'})
raid = cache.get_item_id('authors', 'renamed author') raid = cache.get_item_id('authors', 'renamed author')
@ -92,7 +92,7 @@ def test_cache_api(self: 'NotesTest'):
tag_id = cache.get_item_id('#tags', tags[0]) tag_id = cache.get_item_id('#tags', tags[0])
h1 = cache.add_notes_resource(b'resource1t', 'r1.jpg') h1 = cache.add_notes_resource(b'resource1t', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2t', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2t', 'r1.jpg')
cache.set_notes_for('#tags', tag_id, doc, resource_ids=(h1, h2)) cache.set_notes_for('#tags', tag_id, doc, resource_hashes=(h1, h2))
self.ae(cache.notes_for('#tags', tag_id), doc) self.ae(cache.notes_for('#tags', tag_id), doc)
cache.delete_custom_column('tags') cache.delete_custom_column('tags')
cache.close() cache.close()