Implement retiring of notes

This commit is contained in:
Kovid Goyal 2023-08-14 19:55:26 +05:30
parent 35b7bd3fe8
commit 0bd1137fa1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 197 additions and 87 deletions

View File

@@ -1,4 +1,4 @@
CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY,
CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY AUTOINCREMENT,
item INTEGER NOT NULL,
colname TEXT NOT NULL COLLATE NOCASE,
doc TEXT NOT NULL DEFAULT '',
@@ -6,7 +6,7 @@ CREATE TABLE notes_db.notes ( id INTEGER PRIMARY KEY,
UNIQUE(item, colname)
);
CREATE TABLE notes_db.resources ( id INTEGER PRIMARY KEY,
CREATE TABLE notes_db.resources ( id INTEGER PRIMARY KEY AUTOINCREMENT,
hash TEXT NOT NULL UNIQUE ON CONFLICT FAIL,
name TEXT NOT NULL UNIQUE ON CONFLICT FAIL
);

View File

@@ -949,16 +949,16 @@ class DB:
from .notes.connect import Notes
self.notes = Notes(self)
def delete_category_items(self, field_name, table_name, items, link_table_name='', link_col_name=''):
bindings = tuple((x,) for x in items)
def delete_category_items(self, field_name, table_name, item_map, link_table_name='', link_col_name=''):
for item_id, item_val in item_map.items():
self.notes.set_note(self.conn, field_name, item_id, item_val or '')
bindings = tuple((x,) for x in item_map)
if link_table_name and link_col_name:
self.executemany(f'DELETE FROM {link_table_name} WHERE {link_col_name}=?', bindings)
self.executemany(f'DELETE FROM {table_name} WHERE id=?', bindings)
for item_id in items:
self.notes.set_note(self.conn, field_name, item_id)
def rename_category_item(self, field_name, table_name, link_table_name, link_col_name, old_item_id, new_item_id):
self.notes.rename_note(self.conn, field_name, old_item_id, new_item_id)
def rename_category_item(self, field_name, table_name, link_table_name, link_col_name, old_item_id, new_item_id, new_item_value):
self.notes.rename_note(self.conn, field_name, old_item_id, new_item_id, new_item_value or '')
# For custom series this means that the series index can
# potentially have duplicates/be incorrect, but there is no way to
# handle that in this context.
@@ -969,7 +969,12 @@ class DB:
return self.notes.get_note(self.conn, field_name, item_id) or ''
def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_ids) -> int:
return self.notes.set_note(self.conn, field, item_id, doc, resource_ids, searchable_text)
id_val = self.tables[field].id_map[item_id]
return self.notes.set_note(self.conn, field, item_id, id_val, doc, resource_ids, searchable_text)
def unretire_note_for(self, field, item_id) -> int:
id_val = self.tables[field].id_map[item_id]
return self.notes.unretire(self.conn, field, item_id, id_val)
def add_notes_resource(self, path_or_stream, name) -> int:
return self.notes.add_resource(self.conn, path_or_stream, name)

View File

@@ -674,20 +674,29 @@ class Cache:
# }}}
# Notes API {{{
@read_api
def notes_for(self, field, item_id) -> str:
return self.backend.notes_for(field, item_id)
@write_api
def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_ids=()) -> int:
return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_ids)
@write_api
def add_notes_resource(self, path_or_stream_or_data, name: str) -> int:
return self.backend.add_notes_resource(path_or_stream_or_data, name)
@read_api
def get_notes_resource(self, resource_id) -> Optional[dict]:
return self.backend.get_notes_resource(resource_id)
@read_api
def notes_resources_used_by(self, field, item_id):
return frozenset(self.backend.notes_resources_used_by(field, item_id))
@write_api
def unretire_note_for(self, field, item_id) -> int:
return self.backend.unretire_note_for(field, item_id)
# }}}
# Cache Layer API {{{

View File

@@ -3,15 +3,16 @@
import apsw
import os
import shutil
import time
import xxhash
from typing import Union, Optional
from contextlib import suppress
from itertools import repeat
from typing import Optional, Union
from calibre.constants import iswindows
from calibre.utils.copy_files import WINDOWS_SLEEP_FOR_RETRY_TIME
from calibre.utils.filenames import make_long_path_useable
from calibre.utils.filenames import copyfile_using_links, make_long_path_useable
from ..constants import NOTES_DIR_NAME
from .schema_upgrade import SchemaUpgrade
@@ -25,22 +26,27 @@ class cmt(str):
copy_marked_up_text = cmt()
SEP = b'\0\x1c\0'
DOC_NAME = 'doc.html'
def hash_data(data: bytes) -> str:
return 'xxh64:' + xxhash.xxh3_64_hexdigest(data)
def remove_with_retry(x):
def hash_key(key: str) -> str:
return xxhash.xxh3_64_hexdigest(key.encode('utf-8'))
def remove_with_retry(x, is_dir=False):
x = make_long_path_useable(x)
f = (shutil.rmtree if is_dir else os.remove)
try:
os.remove(x)
f(x)
except FileNotFoundError:
return
except OSError as e:
if iswindows and e.winerror == winutil.ERROR_SHARING_VIOLATION:
time.sleep(WINDOWS_SLEEP_FOR_RETRY_TIME)
os.remove(x)
f(x)
class Notes:
@@ -109,25 +115,56 @@ class Notes:
for (h,) in conn.execute('SELECT resource from notes_db.notes_resources_link WHERE note=?', (note_id,)):
yield h
def set_backup_for(self, field_name, item_id, marked_up_text='', searchable_text=''):
def set_backup_for(self, field_name, item_id, marked_up_text, searchable_text):
path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id)))
if marked_up_text:
try:
f = open(path, 'wb')
except FileNotFoundError:
os.makedirs(os.path.dirname(path), exist_ok=True)
f = open(path, 'wb')
with f:
f.write(marked_up_text.encode('utf-8'))
f.write(SEP)
f.write(searchable_text.encode('utf-8'))
else:
if os.path.exists(path):
dest = make_long_path_useable(os.path.join(self.retired_dir, f'{item_id}_{field_name}'))
os.replace(path, dest)
self.trim_retired_dir()
try:
f = open(path, 'wb')
except FileNotFoundError:
os.makedirs(os.path.dirname(path), exist_ok=True)
f = open(path, 'wb')
with f:
f.write(marked_up_text.encode('utf-8'))
f.write(SEP)
f.write(searchable_text.encode('utf-8'))
def set_note(self, conn, field_name, item_id, marked_up_text='', used_resource_ids=(), searchable_text=copy_marked_up_text):
def retire_entry(self, field_name, item_id, item_value, resources, note_id):
path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id)))
if os.path.exists(path):
key = (item_value or '').lower()
destdir = os.path.join(self.retired_dir, hash_key(f'{field_name} {key}'))
os.makedirs(make_long_path_useable(destdir), exist_ok=True)
dest = os.path.join(destdir, DOC_NAME)
os.replace(path, make_long_path_useable(dest))
with open(make_long_path_useable(os.path.join(destdir, 'note_id')), 'w') as nif:
nif.write(str(note_id))
for rhash, rname in resources:
rpath = make_long_path_useable(self.path_for_resource(None, rhash))
if os.path.exists(rpath):
rdest = os.path.join(destdir, 'res-'+rname)
copyfile_using_links(rpath, make_long_path_useable(rdest), dest_is_dir=False)
self.trim_retired_dir()
def unretire(self, conn, field_name, item_id, item_value) -> int:
key = (item_value or '').lower()
srcdir = make_long_path_useable(os.path.join(self.retired_dir, hash_key(f'{field_name} {key}')))
note_id = -1
if not os.path.exists(srcdir) or self.note_id_for(conn, field_name, item_id) is not None:
return note_id
with open(os.path.join(srcdir, DOC_NAME), 'rb') as src:
a, b = src.read().partition(SEP)[::2]
marked_up_text, searchable_text = a.decode('utf-8'), b.decode('utf-8')
resources = set()
for x in os.listdir(srcdir):
if x.startswith('res-'):
rname = x.split('-', 1)[1]
with open(os.path.join(srcdir, x), 'rb') as rsrc:
resources.add(self.add_resource(conn, rsrc, rname))
note_id = self.set_note(conn, field_name, item_id, item_value, marked_up_text, resources, searchable_text)
if note_id > -1:
remove_with_retry(srcdir, is_dir=True)
return note_id
def set_note(self, conn, field_name, item_id, item_value, marked_up_text='', used_resource_ids=(), searchable_text=copy_marked_up_text):
if searchable_text is copy_marked_up_text:
searchable_text = marked_up_text
note_id = self.note_id_for(conn, field_name, item_id)
@@ -135,16 +172,21 @@ class Notes:
if not marked_up_text:
if note_id is not None:
conn.execute('DELETE FROM notes_db.notes WHERE id=?', (note_id,))
self.set_backup_for(field_name, item_id)
resources = ()
if old_resources:
resources = conn.get(
'SELECT hash,name FROM notes_db.resources WHERE id IN ({})'.format(','.join(repeat('?', len(old_resources)))),
tuple(old_resources))
self.retire_entry(field_name, item_id, item_value, resources, note_id)
if old_resources:
self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False)
return
return -1
new_resources = frozenset(used_resource_ids)
resources_to_potentially_remove = old_resources - new_resources
resources_to_add = new_resources - old_resources
if note_id is None:
note_id = conn.get('''
INSERT INTO notes_db.notes (item,colname,doc,searchable_text) VALUES (?,?,?,?) RETURNING id;
INSERT INTO notes_db.notes (item,colname,doc,searchable_text) VALUES (?,?,?,?) RETURNING notes.id;
''', (item_id, field_name, marked_up_text, searchable_text), all=False)
else:
conn.execute('UPDATE notes_db.notes SET doc=?,searchable_text=?', (marked_up_text, searchable_text))
@@ -169,7 +211,7 @@ class Notes:
'resource_ids': frozenset(self.resources_used_by(conn, note_id)),
}
def rename_note(self, conn, field_name, old_item_id, new_item_id):
def rename_note(self, conn, field_name, old_item_id, new_item_id, new_item_value):
note_id = self.note_id_for(conn, field_name, old_item_id)
if note_id is None:
return
@@ -179,19 +221,22 @@ class Notes:
old_note = self.get_note_data(conn, field_name, old_item_id)
if not old_note or not old_note['doc']:
return
self.set_note(conn, field_name, new_item_id, old_note['doc'], old_note['resource_ids'], old_note['searchable_text'])
self.set_note(conn, field_name, new_item_id, new_item_value, old_note['doc'], old_note['resource_ids'], old_note['searchable_text'])
def trim_retired_dir(self):
mpath_map = {}
items = []
for d in os.scandir(self.retired_dir):
mpath_map[d.path] = d.stat(follow_symlinks=False).st_mtime_ns
for d in os.scandir(make_long_path_useable(self.retired_dir)):
items.append(d.path)
extra = len(items) - self.max_retired_items
if extra > 0:
items.sort(key=mpath_map.__getitem__)
def key(path):
path = os.path.join(path, 'note_id')
with suppress(OSError):
with open(path) as f:
return os.stat(path, follow_symlinks=False).st_mtime_ns, int(f.read())
items.sort(key=key)
for path in items[:extra]:
remove_with_retry(path)
remove_with_retry(path, is_dir=True)
def add_resource(self, conn, path_or_stream_or_data, name):
if isinstance(path_or_stream_or_data, bytes):
@@ -200,7 +245,7 @@ class Notes:
with open(path_or_stream_or_data, 'rb') as f:
data = f.read()
else:
data = f.read()
data = path_or_stream_or_data.read()
resource_hash = hash_data(data)
path = self.path_for_resource(conn, resource_hash)
path = make_long_path_useable(path)

View File

@@ -248,15 +248,18 @@ class ManyToOneTable(Table):
if len(v) > 1:
main_id = min(v)
v.discard(main_id)
item_map = {}
for item_id in v:
self.id_map.pop(item_id, None)
val = self.id_map.pop(item_id, null)
if val is not null:
item_map[item_id] = val
books = self.col_book_map.pop(item_id, set())
for book_id in books:
self.book_col_map[book_id] = main_id
db.executemany('UPDATE {0} SET {1}=? WHERE {1}=?'.format(
self.link_table, self.metadata['link_column']),
tuple((main_id, x) for x in v))
db.delete_category_items(self.name, self.metadata['table'], v)
db.delete_category_items(self.name, self.metadata['table'], item_map)
def remove_books(self, book_ids, db):
clean = set()
@@ -310,20 +313,26 @@ class ManyToOneTable(Table):
affected_books |= self.remove_items(items_to_process_normally, db)
return affected_books
item_map = {}
for item_id in item_ids:
val = self.id_map.pop(item_id, null)
if val is null:
continue
item_map[item_id] = val
book_ids = self.col_book_map.pop(item_id, set())
for book_id in book_ids:
self.book_col_map.pop(book_id, None)
affected_books.update(book_ids)
db.delete_category_items(self.name, self.metadata['table'], item_ids, self.link_table, self.metadata['link_column'])
db.delete_category_items(self.name, self.metadata['table'], item_map, self.link_table, self.metadata['link_column'])
return affected_books
def rename_item(self, item_id, new_name, db):
rmap = {icu_lower(v):k for k, v in iteritems(self.id_map)}
existing_item = rmap.get(icu_lower(new_name), None)
existing_item = None
q = icu_lower(new_name)
for q_id, q_val in self.id_map.items():
if icu_lower(q_val) == q:
existing_item = q_id
break
table, col, lcol = self.metadata['table'], self.metadata['column'], self.metadata['link_column']
affected_books = self.col_book_map.get(item_id, set())
new_id = item_id
@@ -339,7 +348,7 @@ class ManyToOneTable(Table):
for book_id in books:
self.book_col_map[book_id] = existing_item
self.col_book_map[existing_item].update(books)
db.rename_category_item(self.name, table, self.link_table, lcol, item_id, existing_item)
db.rename_category_item(self.name, table, self.link_table, lcol, item_id, existing_item, self.id_map[new_id])
return affected_books, new_id
def set_links(self, link_map, db):
@@ -403,7 +412,7 @@ class ManyToManyTable(ManyToOneTable):
self.link_table, self.metadata['link_column']), tuple((x,) for x in extra_item_ids))
def remove_books(self, book_ids, db):
clean = set()
clean = {}
for book_id in book_ids:
item_ids = self.book_col_map.pop(book_id, ())
for item_id in item_ids:
@@ -415,11 +424,12 @@ class ManyToManyTable(ManyToOneTable):
else:
if not self.col_book_map[item_id]:
del self.col_book_map[item_id]
if self.id_map.pop(item_id, null) is not null:
clean.add(item_id)
val = self.id_map.pop(item_id, null)
if val is not null:
clean[item_id] = val
if clean and self.do_clean_on_remove:
db.delete_category_items(self.name, self.metadata['table'], clean)
return clean
return set(clean)
def remove_items(self, item_ids, db, restrict_to_book_ids=None):
affected_books = set()
@@ -452,20 +462,26 @@ class ManyToManyTable(ManyToOneTable):
affected_books |= self.remove_items(items_to_process_normally, db)
return affected_books
item_map = {}
for item_id in item_ids:
val = self.id_map.pop(item_id, null)
if val is null:
continue
item_map[item_id] = val
book_ids = self.col_book_map.pop(item_id, set())
for book_id in book_ids:
self.book_col_map[book_id] = tuple(x for x in self.book_col_map.get(book_id, ()) if x != item_id)
affected_books.update(book_ids)
db.delete_category_items(self.name, self.metadata['table'], item_ids, self.link_table, self.metadata['link_column'])
db.delete_category_items(self.name, self.metadata['table'], item_map, self.link_table, self.metadata['link_column'])
return affected_books
def rename_item(self, item_id, new_name, db):
rmap = {icu_lower(v):k for k, v in iteritems(self.id_map)}
existing_item = rmap.get(icu_lower(new_name), None)
existing_item = None
q = icu_lower(new_name)
for q_id, q_val in self.id_map.items():
if icu_lower(q_val) == q:
existing_item = q_id
break
table, col, lcol = self.metadata['table'], self.metadata['column'], self.metadata['link_column']
affected_books = self.col_book_map.get(item_id, set())
new_id = item_id
@@ -486,7 +502,7 @@ class ManyToManyTable(ManyToOneTable):
self.col_book_map[existing_item].update(books)
db.executemany(f'DELETE FROM {self.link_table} WHERE book=? AND {lcol}=?', [
(book_id, existing_item) for book_id in books])
db.rename_category_item(self.name, table, self.link_table, lcol, item_id, existing_item)
db.rename_category_item(self.name, table, self.link_table, lcol, item_id, existing_item, self.id_map[new_id])
return affected_books, new_id
def fix_case_duplicates(self, db):
@@ -500,8 +516,11 @@ class ManyToManyTable(ManyToOneTable):
done_books = set()
main_id = min(v)
v.discard(main_id)
item_map = {}
for item_id in v:
self.id_map.pop(item_id, None)
val = self.id_map.pop(item_id, null)
if val is not null:
item_map[item_id] = val
books = self.col_book_map.pop(item_id, set())
for book_id in books:
if book_id in done_books:
@@ -524,7 +543,7 @@ class ManyToManyTable(ManyToOneTable):
db.executemany(
'INSERT INTO {} (book,{}) VALUES (?,?)'.format(self.link_table, self.metadata['link_column']),
tuple((book_id, x) for x in vals))
db.delete_category_items(self.name, self.metadata['table'], v)
db.delete_category_items(self.name, self.metadata['table'], item_map)
class AuthorsTable(ManyToManyTable):

View File

@@ -7,35 +7,62 @@ import os
from calibre.db.tests.base import BaseTest
def test_notes_api(self: 'NotesTest'):
cache, notes = self.create_notes_db()
authors = sorted(cache.all_field_ids('authors'))
self.ae(cache.notes_for('authors', authors[0]), '')
doc = 'simple notes for an author'
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg')
self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg')
note_id = cache.set_notes_for('authors', authors[0], doc, resource_ids=(h1, h2))
self.ae(cache.notes_for('authors', authors[0]), doc)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2)['data'], b'resource2')
doc2 = 'a different note to replace the first one'
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,)))
self.ae(cache.notes_for('authors', authors[0]), doc2)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2), None)
self.assertTrue(os.path.exists(notes.path_for_resource(cache.backend.conn, h1)))
self.assertFalse(os.path.exists(notes.path_for_resource(cache.backend.conn, h2)))
# check retirement
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,h2)))
self.ae(-1, cache.set_notes_for('authors', authors[0], ''))
self.ae(cache.notes_for('authors', authors[0]), '')
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset())
before = os.listdir(notes.retired_dir)
self.ae(len(before), 1)
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
nnote_id = cache.set_notes_for('authors', authors[1], doc, resource_ids=(h1, h2))
self.assertNotEqual(note_id, nnote_id)
self.ae(-1, cache.set_notes_for('authors', authors[1], ''))
after = os.listdir(notes.retired_dir)
self.ae(len(after), 1)
self.assertNotEqual(before, after)
self.assertGreater(cache.unretire_note_for('authors', authors[1]), nnote_id)
self.assertFalse(os.listdir(notes.retired_dir))
self.ae(cache.notes_for('authors', authors[1]), doc)
self.ae(cache.notes_resources_used_by('authors', authors[1]), frozenset({h1, h2}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2)['data'], b'resource2')
class NotesTest(BaseTest):
ae = BaseTest.assertEqual
def create_notes_db(self):
cache = self.init_cache()
cache.backend.notes.max_retired_items = 1
return cache, cache.backend.notes
def test_notes(self):
def create():
cache = self.init_cache()
cache.backend.notes.max_retired_items = 1
return cache, cache.backend.notes
cache, notes = create()
authors = sorted(cache.all_field_ids('authors'))
self.ae(cache.notes_for('authors', authors[0]), '')
doc = 'simple notes for an author'
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
self.ae(cache.get_notes_resource(h1)['name'], 'r1.jpg')
self.ae(cache.get_notes_resource(h2)['name'], 'r1-1.jpg')
note_id = cache.set_notes_for('authors', authors[0], doc, resource_ids=(h1, h2))
self.ae(cache.notes_for('authors', authors[0]), doc)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1, h2}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2)['data'], b'resource2')
doc2 = 'a different note to replace the first one'
self.ae(note_id, cache.set_notes_for('authors', authors[0], doc2, resource_ids=(h1,)))
self.ae(cache.notes_for('authors', authors[0]), doc2)
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2), None)
self.assertTrue(os.path.exists(notes.path_for_resource(cache.backend.conn, h1)))
self.assertFalse(os.path.exists(notes.path_for_resource(cache.backend.conn, h2)))
test_notes_api(self)

View File

@@ -553,8 +553,12 @@ def get_hardlink_function(src, dest):
if not iswindows:
return os.link
from calibre_extensions import winutil
if src.startswith(long_path_prefix):
src = src[len(long_path_prefix):]
if dest.startswith(long_path_prefix):
dest = dest[len(long_path_prefix):]
root = dest[0] + ':\\'
if src[0].lower() == dest[0].lower() and hasattr(winutil, 'supports_hardlinks') and winutil.supports_hardlinks(root):
if src[0].lower() == dest[0].lower() and winutil.supports_hardlinks(root):
return windows_fast_hardlink
@@ -563,6 +567,7 @@ def copyfile_using_links(path, dest, dest_is_dir=True, filecopyfunc=copyfile):
if dest_is_dir:
dest = os.path.join(dest, os.path.basename(path))
hardlink = get_hardlink_function(path, dest)
path, dest = make_long_path_useable(path), make_long_path_useable(dest)
try:
hardlink(path, dest)
except Exception: