Store a name for every resource

This commit is contained in:
Kovid Goyal 2023-08-14 09:52:27 +05:30
parent 0b18f30392
commit 35b7bd3fe8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 101 additions and 66 deletions

View File

@ -6,10 +6,17 @@ CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY,
UNIQUE(item, colname) UNIQUE(item, colname)
); );
CREATE TABLE notes_db.resources ( id INTEGER PRIMARY KEY,
hash TEXT NOT NULL UNIQUE ON CONFLICT FAIL,
name TEXT NOT NULL UNIQUE ON CONFLICT FAIL
);
CREATE TABLE notes_db.notes_resources_link ( id INTEGER PRIMARY KEY, CREATE TABLE notes_db.notes_resources_link ( id INTEGER PRIMARY KEY,
note INTEGER NOT NULL, note INTEGER NOT NULL,
hash TEXT NOT NULL, resource INTEGER NOT NULL,
UNIQUE(note, hash) FOREIGN KEY(note) REFERENCES notes(id),
FOREIGN KEY(resource) REFERENCES resources(id),
UNIQUE(note, resource)
); );
CREATE VIRTUAL TABLE notes_db.notes_fts USING fts5(searchable_text, content = 'notes', content_rowid = 'id', tokenize = 'calibre remove_diacritics 2'); CREATE VIRTUAL TABLE notes_db.notes_fts USING fts5(searchable_text, content = 'notes', content_rowid = 'id', tokenize = 'calibre remove_diacritics 2');
@ -21,7 +28,7 @@ BEGIN
INSERT INTO notes_fts_stemmed(rowid, searchable_text) VALUES (NEW.id, NEW.searchable_text); INSERT INTO notes_fts_stemmed(rowid, searchable_text) VALUES (NEW.id, NEW.searchable_text);
END; END;
CREATE TRIGGER notes_db.notes_db_notes_delete_trg AFTER DELETE ON notes_db.notes CREATE TRIGGER notes_db.notes_db_notes_delete_trg BEFORE DELETE ON notes_db.notes
BEGIN BEGIN
DELETE FROM notes_resources_link WHERE note=OLD.id; DELETE FROM notes_resources_link WHERE note=OLD.id;
INSERT INTO notes_fts(notes_fts, rowid, searchable_text) VALUES('delete', OLD.id, OLD.searchable_text); INSERT INTO notes_fts(notes_fts, rowid, searchable_text) VALUES('delete', OLD.id, OLD.searchable_text);
@ -36,5 +43,9 @@ BEGIN
INSERT INTO notes_fts_stemmed(rowid, searchable_text) VALUES (NEW.id, NEW.searchable_text); INSERT INTO notes_fts_stemmed(rowid, searchable_text) VALUES (NEW.id, NEW.searchable_text);
END; END;
CREATE TRIGGER notes_db.notes_db_resources_delete_trg BEFORE DELETE ON notes_db.resources
BEGIN
DELETE FROM notes_resources_link WHERE resource=OLD.id;
END;
PRAGMA notes_db.user_version=1; PRAGMA notes_db.user_version=1;

View File

@ -17,6 +17,7 @@ import sys
import time import time
import uuid import uuid
from contextlib import closing, suppress from contextlib import closing, suppress
from typing import Optional
from functools import partial from functools import partial
from calibre import as_unicode, force_unicode, isbytestring, prints from calibre import as_unicode, force_unicode, isbytestring, prints
@ -339,10 +340,9 @@ class Connection(apsw.Connection): # {{{
self.fts_dbpath = self.notes_dbpath = None self.fts_dbpath = self.notes_dbpath = None
self.setbusytimeout(self.BUSY_TIMEOUT) self.setbusytimeout(self.BUSY_TIMEOUT)
self.execute('pragma cache_size=-5000') self.execute('PRAGMA cache_size=-5000; PRAGMA temp_store=2; PRAGMA foreign_keys=ON;')
self.execute('pragma temp_store=2')
encoding = next(self.execute('pragma encoding'))[0] encoding = next(self.execute('PRAGMA encoding'))[0]
self.createcollation('PYNOCASE', partial(pynocase, self.createcollation('PYNOCASE', partial(pynocase,
encoding=encoding)) encoding=encoding))
@ -968,14 +968,14 @@ class DB:
def notes_for(self, field_name, item_id): def notes_for(self, field_name, item_id):
return self.notes.get_note(self.conn, field_name, item_id) or '' return self.notes.get_note(self.conn, field_name, item_id) or ''
def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_hashes) -> int: def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_ids) -> int:
return self.notes.set_note(self.conn, field, item_id, doc, resource_hashes, searchable_text) return self.notes.set_note(self.conn, field, item_id, doc, resource_ids, searchable_text)
def add_notes_resource(self, path_or_stream) -> str: def add_notes_resource(self, path_or_stream, name) -> int:
return self.notes.add_resource(path_or_stream) return self.notes.add_resource(self.conn, path_or_stream, name)
def get_notes_resource(self, resource_hash) -> bytes: def get_notes_resource(self, resource_id) -> Optional[dict]:
return self.notes.get_resource(resource_hash) return self.notes.get_resource_data(self.conn, resource_id)
def notes_resources_used_by(self, field, item_id): def notes_resources_used_by(self, field, item_id):
conn = self.conn conn = self.conn
@ -1354,11 +1354,11 @@ class DB:
@property @property
def user_version(self): def user_version(self):
'''The user version of this database''' '''The user version of this database'''
return self.conn.get('pragma user_version;', all=False) return self.conn.get('PRAGMA user_version;', all=False)
@user_version.setter @user_version.setter
def user_version(self, val): def user_version(self, val):
self.execute('pragma user_version=%d'%int(val)) self.execute('PRAGMA user_version=%d'%int(val))
def initialize_database(self): def initialize_database(self):
metadata_sqlite = P('metadata_sqlite.sql', data=True, metadata_sqlite = P('metadata_sqlite.sql', data=True,

View File

@ -20,7 +20,7 @@ from io import DEFAULT_BUFFER_SIZE, BytesIO
from queue import Queue from queue import Queue
from threading import Lock from threading import Lock
from time import monotonic, sleep, time from time import monotonic, sleep, time
from typing import NamedTuple, Tuple from typing import NamedTuple, Tuple, Optional
from calibre import as_unicode, detect_ncpus, isbytestring from calibre import as_unicode, detect_ncpus, isbytestring
from calibre.constants import iswindows, preferred_encoding from calibre.constants import iswindows, preferred_encoding
@ -677,14 +677,14 @@ class Cache:
def notes_for(self, field, item_id) -> str: def notes_for(self, field, item_id) -> str:
return self.backend.notes_for(field, item_id) return self.backend.notes_for(field, item_id)
def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_hashes=()) -> int: def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_ids=()) -> int:
return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_hashes) return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_ids)
def add_notes_resource(self, path_or_stream_or_data) -> str: def add_notes_resource(self, path_or_stream_or_data, name: str) -> int:
return self.backend.add_notes_resource(path_or_stream_or_data) return self.backend.add_notes_resource(path_or_stream_or_data, name)
def get_notes_resource(self, resource_hash) -> bytes: def get_notes_resource(self, resource_id) -> Optional[dict]:
return self.backend.get_notes_resource(resource_hash) return self.backend.get_notes_resource(resource_id)
def notes_resources_used_by(self, field, item_id): def notes_resources_used_by(self, field, item_id):
return frozenset(self.backend.notes_resources_used_by(field, item_id)) return frozenset(self.backend.notes_resources_used_by(field, item_id))

View File

@ -1,9 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPLv3 Copyright: 2023, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2023, Kovid Goyal <kovid at kovidgoyal.net>
import apsw
import os import os
import time import time
import xxhash import xxhash
from typing import Union, Optional
from contextlib import suppress from contextlib import suppress
from itertools import repeat from itertools import repeat
@ -76,7 +78,11 @@ class Notes:
SchemaUpgrade(conn, '\n'.join(triggers)) SchemaUpgrade(conn, '\n'.join(triggers))
conn.notes_dbpath = dbpath conn.notes_dbpath = dbpath
def path_for_resource(self, resource_hash: str) -> str: def path_for_resource(self, conn, resource_hash_or_resource_id: Union[str,int]) -> str:
if isinstance(resource_hash_or_resource_id, str):
resource_hash = resource_hash_or_resource_id
else:
resource_hash = conn.get('SELECT hash FROM notes_db.resources WHERE id=?', (resource_hash_or_resource_id,), all=False)
idx = resource_hash.index(':') idx = resource_hash.index(':')
prefix = resource_hash[idx + 1: idx + 3] prefix = resource_hash[idx + 1: idx + 3]
return os.path.join(self.resources_dir, prefix, resource_hash) return os.path.join(self.resources_dir, prefix, resource_hash)
@ -86,22 +92,21 @@ class Notes:
resources_to_potentially_remove = tuple(resources_to_potentially_remove) resources_to_potentially_remove = tuple(resources_to_potentially_remove)
if delete_from_link_table: if delete_from_link_table:
conn.executemany(''' conn.executemany('''
DELETE FROM notes_db.notes_resources_link WHERE note=? AND hash=? DELETE FROM notes_db.notes_resources_link WHERE note=? AND resource=?
''', tuple((note_id, x) for x in resources_to_potentially_remove)) ''', tuple((note_id, x) for x in resources_to_potentially_remove))
stmt = ''' stmt = '''
WITH resources_table(value) AS (VALUES {}) WITH resources_table(value) AS (VALUES {})
SELECT value FROM resources_table WHERE value NOT IN (SELECT hash FROM notes_db.notes_resources_link) SELECT value FROM resources_table WHERE value NOT IN (SELECT resource FROM notes_db.notes_resources_link)
'''.format(','.join(repeat('(?)', len(resources_to_potentially_remove)))) '''.format(','.join(repeat('(?)', len(resources_to_potentially_remove))))
for (x,) in conn.execute(stmt, resources_to_potentially_remove): for (x,) in conn.execute(stmt, resources_to_potentially_remove):
remove_with_retry(self.path_for_resource(x)) remove_with_retry(self.path_for_resource(conn, x))
def note_id_for(self, conn, field_name, item_id): def note_id_for(self, conn, field_name, item_id):
for (ans,) in conn.execute('SELECT id FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name)): return conn.get('SELECT id FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name), all=False)
return ans
def resources_used_by(self, conn, note_id): def resources_used_by(self, conn, note_id):
if note_id is not None: if note_id is not None:
for (h,) in conn.execute('SELECT hash from notes_db.notes_resources_link WHERE note=?', (note_id,)): for (h,) in conn.execute('SELECT resource from notes_db.notes_resources_link WHERE note=?', (note_id,)):
yield h yield h
def set_backup_for(self, field_name, item_id, marked_up_text='', searchable_text=''): def set_backup_for(self, field_name, item_id, marked_up_text='', searchable_text=''):
@ -122,7 +127,7 @@ class Notes:
os.replace(path, dest) os.replace(path, dest)
self.trim_retired_dir() self.trim_retired_dir()
def set_note(self, conn, field_name, item_id, marked_up_text='', hashes_of_used_resources=(), searchable_text=copy_marked_up_text): def set_note(self, conn, field_name, item_id, marked_up_text='', used_resource_ids=(), searchable_text=copy_marked_up_text):
if searchable_text is copy_marked_up_text: if searchable_text is copy_marked_up_text:
searchable_text = marked_up_text searchable_text = marked_up_text
note_id = self.note_id_for(conn, field_name, item_id) note_id = self.note_id_for(conn, field_name, item_id)
@ -134,27 +139,26 @@ class Notes:
if old_resources: if old_resources:
self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False) self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False)
return return
new_resources = frozenset(hashes_of_used_resources) new_resources = frozenset(used_resource_ids)
resources_to_potentially_remove = old_resources - new_resources resources_to_potentially_remove = old_resources - new_resources
resources_to_add = new_resources - old_resources resources_to_add = new_resources - old_resources
if note_id is None: if note_id is None:
note_id, = next(conn.execute(''' note_id = conn.get('''
INSERT INTO notes_db.notes (item,colname,doc,searchable_text) VALUES (?,?,?,?) RETURNING id; INSERT INTO notes_db.notes (item,colname,doc,searchable_text) VALUES (?,?,?,?) RETURNING id;
''', (item_id, field_name, marked_up_text, searchable_text))) ''', (item_id, field_name, marked_up_text, searchable_text), all=False)
else: else:
conn.execute('UPDATE notes_db.notes SET doc=?,searchable_text=?', (marked_up_text, searchable_text)) conn.execute('UPDATE notes_db.notes SET doc=?,searchable_text=?', (marked_up_text, searchable_text))
if resources_to_potentially_remove: if resources_to_potentially_remove:
self.remove_resources(conn, note_id, resources_to_potentially_remove) self.remove_resources(conn, note_id, resources_to_potentially_remove)
if resources_to_add: if resources_to_add:
conn.executemany(''' conn.executemany('''
INSERT INTO notes_db.notes_resources_link (note,hash) VALUES (?,?); INSERT INTO notes_db.notes_resources_link (note,resource) VALUES (?,?);
''', tuple((note_id, x) for x in resources_to_add)) ''', tuple((note_id, x) for x in resources_to_add))
self.set_backup_for(field_name, item_id, marked_up_text, searchable_text) self.set_backup_for(field_name, item_id, marked_up_text, searchable_text)
return note_id return note_id
def get_note(self, conn, field_name, item_id): def get_note(self, conn, field_name, item_id):
for (doc,) in conn.execute('SELECT doc FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name)): return conn.get('SELECT doc FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name), all=False)
return doc
def get_note_data(self, conn, field_name, item_id): def get_note_data(self, conn, field_name, item_id):
for (note_id, doc, searchable_text) in conn.execute( for (note_id, doc, searchable_text) in conn.execute(
@ -162,7 +166,7 @@ class Notes:
): ):
return { return {
'id': note_id, 'doc': doc, 'searchable_text': searchable_text, 'id': note_id, 'doc': doc, 'searchable_text': searchable_text,
'resource_hashes': frozenset(self.resources_used_by(conn, note_id)), 'resource_ids': frozenset(self.resources_used_by(conn, note_id)),
} }
def rename_note(self, conn, field_name, old_item_id, new_item_id): def rename_note(self, conn, field_name, old_item_id, new_item_id):
@ -175,7 +179,7 @@ class Notes:
old_note = self.get_note_data(conn, field_name, old_item_id) old_note = self.get_note_data(conn, field_name, old_item_id)
if not old_note or not old_note['doc']: if not old_note or not old_note['doc']:
return return
self.set_note(conn, field_name, new_item_id, old_note['doc'], old_note['resource_hashes'], old_note['searchable_text']) self.set_note(conn, field_name, new_item_id, old_note['doc'], old_note['resource_ids'], old_note['searchable_text'])
def trim_retired_dir(self): def trim_retired_dir(self):
mpath_map = {} mpath_map = {}
@ -189,7 +193,7 @@ class Notes:
for path in items[:extra]: for path in items[:extra]:
remove_with_retry(path) remove_with_retry(path)
def add_resource(self, path_or_stream_or_data): def add_resource(self, conn, path_or_stream_or_data, name):
if isinstance(path_or_stream_or_data, bytes): if isinstance(path_or_stream_or_data, bytes):
data = path_or_stream_or_data data = path_or_stream_or_data
elif isinstance(path_or_stream_or_data, str): elif isinstance(path_or_stream_or_data, str):
@ -198,7 +202,7 @@ class Notes:
else: else:
data = f.read() data = f.read()
resource_hash = hash_data(data) resource_hash = hash_data(data)
path = self.path_for_resource(resource_hash) path = self.path_for_resource(conn, resource_hash)
path = make_long_path_useable(path) path = make_long_path_useable(path)
exists = False exists = False
try: try:
@ -207,21 +211,39 @@ class Notes:
pass pass
else: else:
exists = s.st_size == len(data) exists = s.st_size == len(data)
if exists: if not exists:
return resource_hash try:
f = open(path, 'wb')
except FileNotFoundError:
os.makedirs(os.path.dirname(path), exist_ok=True)
f = open(path, 'wb')
with f:
f.write(data)
base_name, ext = os.path.splitext(name)
c = 0
for (resource_id, existing_name) in conn.execute('SELECT id,name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
if existing_name != name:
while True:
try:
conn.execute('UPDATE notes_db.resources SET name=? WHERE id=?', (name, resource_id))
break
except apsw.ConstraintError:
c += 1
name = f'{base_name}-{c}{ext}'
break
else:
while True:
try:
resource_id = conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?) RETURNING id', (resource_hash, name), all=False)
break
except apsw.ConstraintError:
c += 1
name = f'{base_name}-{c}{ext}'
return resource_id
try: def get_resource_data(self, conn, resource_id) -> Optional[dict]:
f = open(path, 'wb') for (name, resource_hash) in conn.execute('SELECT name,hash FROM notes_db.resources WHERE id=?', (resource_id,)):
except FileNotFoundError: path = self.path_for_resource(conn, resource_hash)
os.makedirs(os.path.dirname(path), exist_ok=True) path = make_long_path_useable(path)
f = open(path, 'wb') with suppress(FileNotFoundError), open(path, 'rb') as f:
with f: return {'name': name, 'data': f.read(), 'hash': resource_hash}
f.write(data)
return resource_hash
def get_resource(self, resource_hash) -> bytes:
path = self.path_for_resource(resource_hash)
path = make_long_path_useable(path)
with suppress(FileNotFoundError), open(path, 'rb') as f:
return f.read()
return b''

View File

@ -22,18 +22,20 @@ class NotesTest(BaseTest):
authors = sorted(cache.all_field_ids('authors')) authors = sorted(cache.all_field_ids('authors'))
self.ae(cache.notes_for('authors', authors[0]), '') self.ae(cache.notes_for('authors', authors[0]), '')
doc = 'simple notes for an author' doc = 'simple notes for an author'
h1 = cache.add_notes_resource(b'resource1') h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
note_id = cache.set_notes_for('authors', authors[0], doc, resource_hashes=(h1, h2)) self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg')
self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg')
note_id = cache.set_notes_for('authors', authors[0], doc, resource_ids=(h1, h2))
self.ae(cache.notes_for('authors', authors[0]), doc) self.ae(cache.notes_for('authors', authors[0]), doc)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2})) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2}))
self.ae(cache.get_notes_resource(h1), b'resource1') self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2), b'resource2') self.ae(cache.get_notes_resource(h2)['data'], b'resource2')
doc2 = 'a different note to replace the first one' doc2 = 'a different note to replace the first one'
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_hashes=(h1,))) self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,)))
self.ae(cache.notes_for('authors', authors[0]), doc2) self.ae(cache.notes_for('authors', authors[0]), doc2)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1})) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1}))
self.ae(cache.get_notes_resource(h1), b'resource1') self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2), b'') self.ae(cache.get_notes_resource(h2), None)
self.assertTrue(os.path.exists(notes.path_for_resource(h1))) self.assertTrue(os.path.exists(notes.path_for_resource(cache.backend.conn, h1)))
self.assertFalse(os.path.exists(notes.path_for_resource(h2))) self.assertFalse(os.path.exists(notes.path_for_resource(cache.backend.conn, h2)))