From 3aee66065695fca2f5a153fa500a8a252e94079c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 12 Aug 2023 07:06:45 +0530
Subject: [PATCH] More work on notes

Add a notes API to the database backend and the cache (notes_for,
set_notes_for, add_notes_resource, get_notes_resource) and implement
note storage in db/notes/connect.py:

 * Notes are rows in the attached notes_db database.
 * Resources are files named by their hash, stored under
   resources/<first two hash characters>/.
 * Every note is mirrored to backup/<field>/<item id>; when a note is
   cleared its backup file is moved to retired/, which is trimmed to
   the most recently modified max_retired_items entries.

Deleting a category item now clears its note, and renaming an item onto
another carries the note over when the destination has none. The CI
image installs python-xxhash, which is used to hash resources.
---
Review fixes folded into this version (hunk headers and the diffstat
below were recomputed for the added lines):

 * set_backup_for() called the non-existent trim_retire_dir(); it now
   calls trim_retired_dir().
 * trim_retired_dir() read self.max_retired_items, which was never
   assigned; __init__ now sets it.
 * add_resource() read from an unbound name "f" when given a stream; it
   now reads from the stream argument.
 * set_note() returned the id looked up before the write (None for a new
   note) instead of the id returned by the INSERT.
 * remove_resources() used "(VALUES ...) AS my_values(value)", a derived
   column list that SQLite does not accept; it now uses a CTE.

 setup/arch-ci.sh                   |   2 +-
 src/calibre/db/backend.py          |  15 +++
 src/calibre/db/cache.py            |  15 +++
 src/calibre/db/notes/connect.py    | 191 ++++++++++++++++++++++++++++-
 src/calibre/db/tests/filesystem.py |   3 -
 src/calibre/db/tests/notes.py      |  19 +++
 6 files changed, 240 insertions(+), 5 deletions(-)
 create mode 100644 src/calibre/db/tests/notes.py

diff --git a/setup/arch-ci.sh b/setup/arch-ci.sh
index 9c2b10cee1..56d5ccb699 100755
--- a/setup/arch-ci.sh
+++ b/setup/arch-ci.sh
@@ -5,7 +5,7 @@
 
 set -xe
 
-pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-fonttools uchardet libstemmer poppler tk podofo
+pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-fonttools python-xxhash uchardet libstemmer poppler tk podofo
 
 useradd -m ci
 chown -R ci:users $GITHUB_WORKSPACE
diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py
index c8323ceb66..a8658bbf63 100644
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@@ -954,14 +954,29 @@ class DB:
         if link_table_name and link_col_name:
             self.executemany(f'DELETE FROM {link_table_name} WHERE {link_col_name}=?', bindings)
         self.executemany(f'DELETE FROM {table_name} WHERE id=?', bindings)
+        for item_id in items:
+            self.notes.set_note(self.conn, field_name, item_id)
 
     def rename_category_item(self, field_name, table_name, link_table_name, link_col_name, old_item_id, new_item_id):
+        self.notes.rename_note(self.conn, field_name, old_item_id, new_item_id)
         # For custom series this means that the series index can
         # potentially have duplicates/be incorrect, but there is no way to
         # handle that in this context.
         self.execute(f'UPDATE {link_table_name} SET {link_col_name}=? WHERE {link_col_name}=?; DELETE FROM {table_name} WHERE id=?',
                      (new_item_id, old_item_id, old_item_id))
 
+    def notes_for(self, field_name, item_id):
+        return self.notes.get_note(self.conn, field_name, item_id) or ''
+
+    def set_notes_for(self, field, item_id, doc: str, searchable_text: str, resource_hashes) -> int:
+        return self.notes.set_note(self.conn, field, item_id, doc, resource_hashes, searchable_text)
+
+    def add_notes_resource(self, path_or_stream) -> str:
+        return self.notes.add_resource(path_or_stream)
+
+    def get_notes_resource(self, resource_hash) -> bytes:
+        return self.notes.get_resource(resource_hash)
+
     def initialize_fts(self, dbref):
         self.fts = None
         if not self.prefs['fts_enabled']:
diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index 4c57102ecd..7f465f15a8 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -38,6 +38,7 @@ from calibre.db.listeners import EventDispatcher, EventType
 from calibre.db.locking import (
     DowngradeLockError, LockingError, SafeReadLock, create_locks, try_lock,
 )
+from calibre.db.notes.connect import copy_marked_up_text
 from calibre.db.search import Search
 from calibre.db.tables import VirtualTable
 from calibre.db.utils import type_safe_sort_key_function
@@ -672,6 +673,20 @@ class Cache:
 
     # }}}
 
+    # Notes API {{{
+    def notes_for(self, field, item_id) -> str:
+        return self.backend.notes_for(field, item_id)
+
+    def set_notes_for(self, field, item_id, doc: str, searchable_text: str = copy_marked_up_text, resource_hashes=()) -> int:
+        return self.backend.set_notes_for(field, item_id, doc, searchable_text, resource_hashes)
+
+    def add_notes_resource(self, path_or_stream_or_data) -> str:
+        return self.backend.add_notes_resource(path_or_stream_or_data)
+
+    def get_notes_resource(self, resource_hash) -> bytes:
+        return self.backend.get_notes_resource(resource_hash)
+    # }}}
+
     # Cache Layer API {{{
 
     @write_api
diff --git a/src/calibre/db/notes/connect.py b/src/calibre/db/notes/connect.py
index 99bb33c4aa..eca6054455 100644
--- a/src/calibre/db/notes/connect.py
+++ b/src/calibre/db/notes/connect.py
@@ -2,12 +2,44 @@
 # License: GPLv3 Copyright: 2023, Kovid Goyal
 
 import os
+import time
+import xxhash
+from contextlib import suppress
+from itertools import repeat
 
 from calibre.constants import iswindows
+from calibre.utils.copy_files import WINDOWS_SLEEP_FOR_RETRY_TIME
+from calibre.utils.filenames import make_long_path_useable
 
 from ..constants import NOTES_DIR_NAME
 from .schema_upgrade import SchemaUpgrade
 
+if iswindows:
+    from calibre_extensions import winutil
+
+class cmt(str):
+    pass
+
+
+copy_marked_up_text = cmt()
+SEP = b'\0\x1c\0'
+
+
+def hash_data(data: bytes) -> str:
+    return 'xxh64:' + xxhash.xxh3_64_hexdigest(data)
+
+
+def remove_with_retry(x):
+    x = make_long_path_useable(x)
+    try:
+        os.remove(x)
+    except FileNotFoundError:
+        return
+    except OSError as e:
+        if iswindows and e.winerror == winutil.ERROR_SHARING_VIOLATION:
+            time.sleep(WINDOWS_SLEEP_FOR_RETRY_TIME)
+            os.remove(x)
+
 
 class Notes:
 
@@ -15,13 +47,20 @@ class Notes:
         conn = backend.get_connection()
         libdir = os.path.dirname(os.path.abspath(conn.db_filename('main')))
         notes_dir = os.path.join(libdir, NOTES_DIR_NAME)
+        self.resources_dir = os.path.join(notes_dir, 'resources')
+        self.backup_dir = os.path.join(notes_dir, 'backup')
+        self.retired_dir = os.path.join(notes_dir, 'retired')
+        # Upper bound on the files kept in retired_dir; read by trim_retired_dir()
+        self.max_retired_items = 256
         if not os.path.exists(notes_dir):
             os.makedirs(notes_dir, exist_ok=True)
             if iswindows:
-                import calibre_extensions.winutil as winutil
                 winutil.set_file_attributes(notes_dir, winutil.FILE_ATTRIBUTE_HIDDEN | winutil.FILE_ATTRIBUTE_NOT_CONTENT_INDEXED)
         dbpath = os.path.join(notes_dir, 'notes.db')
         conn.execute("ATTACH DATABASE ? AS notes_db", (dbpath,))
+        os.makedirs(self.resources_dir, exist_ok=True)
+        os.makedirs(self.backup_dir, exist_ok=True)
+        os.makedirs(self.retired_dir, exist_ok=True)
         self.allowed_fields = set()
         triggers = []
         for table in backend.tables.values():
@@ -36,3 +75,153 @@ class Notes:
             )
         SchemaUpgrade(conn, '\n'.join(triggers))
         conn.notes_dbpath = dbpath
+
+    def path_for_resource(self, resource_hash: str) -> str:
+        idx = resource_hash.index(':')
+        prefix = resource_hash[idx + 1: idx + 3]
+        return os.path.join(self.resources_dir, prefix, resource_hash)
+
+    def remove_resources(self, conn, note_id, resources_to_potentially_remove, delete_from_link_table=True):
+        if not isinstance(resources_to_potentially_remove, tuple):
+            resources_to_potentially_remove = tuple(resources_to_potentially_remove)
+        if delete_from_link_table:
+            conn.executemany('''
+                DELETE FROM notes_db.notes_resources_link WHERE note=? AND hash=?
+            ''', tuple((note_id, x) for x in resources_to_potentially_remove))
+        # Name the VALUES column via a CTE; SQLite rejects "(VALUES ...) AS t(col)"
+        for (x,) in conn.execute(
+            '''
+            WITH resources_table(value) AS (VALUES {})
+            SELECT value FROM resources_table WHERE value NOT IN (SELECT hash FROM notes_db.notes_resources_link)
+            '''.format(','.join(repeat('(?)', len(resources_to_potentially_remove)))), resources_to_potentially_remove):
+            remove_with_retry(self.path_for_resource(x))
+
+    def note_id_for(self, conn, field_name, item_id):
+        for (ans,) in conn.execute('SELECT id FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name)):
+            return ans
+
+    def resources_used_by(self, conn, note_id):
+        if note_id is not None:
+            for (h,) in conn.execute('SELECT hash from notes_db.notes_resources_link WHERE note=?', (note_id,)):
+                yield h
+
+    def set_backup_for(self, field_name, item_id, marked_up_text='', searchable_text=''):
+        path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id)))
+        if marked_up_text:
+            try:
+                f = open(path, 'wb')
+            except FileNotFoundError:
+                os.makedirs(os.path.dirname(path), exist_ok=True)
+                f = open(path, 'wb')
+            with f:
+                f.write(marked_up_text.encode('utf-8'))
+                f.write(SEP)
+                f.write(searchable_text.encode('utf-8'))
+        else:
+            if os.path.exists(path):
+                dest = make_long_path_useable(os.path.join(self.retired_dir, f'{item_id}_{field_name}'))
+                os.replace(path, dest)
+                self.trim_retired_dir()
+
+    def set_note(self, conn, field_name, item_id, marked_up_text='', hashes_of_used_resources=(), searchable_text=copy_marked_up_text):
+        if searchable_text is copy_marked_up_text:
+            searchable_text = marked_up_text
+        note_id = self.note_id_for(conn, field_name, item_id)
+        old_resources = frozenset(self.resources_used_by(conn, note_id))
+        if not marked_up_text:
+            if note_id is not None:
+                conn.execute('DELETE FROM notes_db.notes WHERE id=?', (note_id,))
+                self.set_backup_for(field_name, item_id)
+                if old_resources:
+                    self.remove_resources(conn, note_id, old_resources, delete_from_link_table=False)
+            return
+        new_resources = frozenset(hashes_of_used_resources)
+        resources_to_potentially_remove = old_resources - new_resources
+        resources_to_add = new_resources - old_resources
+        inserted_id, = next(conn.execute('''
+            INSERT OR REPLACE INTO notes_db.notes (item,colname,doc,searchable_text) VALUES (?,?,?,?) RETURNING id;
+        ''', (item_id, field_name, marked_up_text, searchable_text)))
+        if resources_to_potentially_remove:
+            self.remove_resources(conn, inserted_id, resources_to_potentially_remove)
+        if resources_to_add:
+            conn.executemany('''
+                INSERT INTO notes_db.notes_resources_link (note,hash) VALUES (?,?);
+            ''', tuple((inserted_id, x) for x in resources_to_add))
+        self.set_backup_for(field_name, item_id, marked_up_text, searchable_text)
+        # INSERT OR REPLACE may assign a new row id, so return the id that was stored
+        return inserted_id
+
+    def get_note(self, conn, field_name, item_id):
+        for (doc,) in conn.execute('SELECT doc FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name)):
+            return doc
+
+    def get_note_data(self, conn, field_name, item_id):
+        for (note_id, doc, searchable_text) in conn.execute(
+            'SELECT id,doc,searchable_text FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name)
+        ):
+            return {
+                'id': note_id, 'doc': doc, 'searchable_text': searchable_text,
+                'resource_hashes': frozenset(self.resources_used_by(conn, note_id)),
+            }
+
+    def rename_note(self, conn, field_name, old_item_id, new_item_id):
+        note_id = self.note_id_for(conn, field_name, old_item_id)
+        if note_id is None:
+            return
+        new_note = self.get_note(conn, field_name, new_item_id)
+        if new_note:
+            return
+        old_note = self.get_note_data(conn, field_name, old_item_id)
+        if not old_note or not old_note['doc']:
+            return
+        self.set_note(conn, field_name, new_item_id, old_note['doc'], old_note['resource_hashes'], old_note['searchable_text'])
+
+    def trim_retired_dir(self):
+        mpath_map = {}
+        items = []
+        for d in os.scandir(self.retired_dir):
+            mpath_map[d.path] = d.stat(follow_symlinks=False).st_mtime_ns
+            items.append(d.path)
+        extra = len(items) - self.max_retired_items
+        if extra > 0:
+            items.sort(key=mpath_map.__getitem__)
+            for path in items[:extra]:
+                remove_with_retry(path)
+
+    def add_resource(self, path_or_stream_or_data):
+        if isinstance(path_or_stream_or_data, bytes):
+            data = path_or_stream_or_data
+        elif isinstance(path_or_stream_or_data, str):
+            with open(path_or_stream_or_data, 'rb') as f:
+                data = f.read()
+        else:
+            # Anything that is not bytes or a path is treated as a readable stream
+            data = path_or_stream_or_data.read()
+        resource_hash = hash_data(data)
+        path = self.path_for_resource(resource_hash)
+        path = make_long_path_useable(path)
+        exists = False
+        try:
+            s = os.stat(path, follow_symlinks=False)
+        except OSError:
+            pass
+        else:
+            exists = s.st_size == len(data)
+        if exists:
+            return resource_hash
+
+        try:
+            f = open(path, 'wb')
+        except FileNotFoundError:
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            f = open(path, 'wb')
+        with f:
+            f.write(data)
+        return resource_hash
+
+    def get_resource(self, resource_hash) -> bytes:
+        path = self.path_for_resource(resource_hash)
+        path = make_long_path_useable(path)
+        with suppress(FileNotFoundError), open(path, 'rb') as f:
+            return f.read()
+        return b''
diff --git a/src/calibre/db/tests/filesystem.py b/src/calibre/db/tests/filesystem.py
index a07a635e58..9e2bce4a6a 100644
--- a/src/calibre/db/tests/filesystem.py
+++ b/src/calibre/db/tests/filesystem.py
@@ -306,6 +306,3 @@ class FilesystemTest(BaseTest):
                 c(r(match_type='not_startswith', query='IGnored.', action='add'), r(query='ignored.md')),
         ):
             q(['added.epub non-book.other'.split()], find_books_in_directory('', True, compiled_rules=rules, listdir_impl=lambda x: files))
-
-    def test_notes_operations(self):
-        cache = self.init_cache()
diff --git a/src/calibre/db/tests/notes.py b/src/calibre/db/tests/notes.py
new file mode 100644
index 0000000000..d48f0112ee
--- /dev/null
+++ b/src/calibre/db/tests/notes.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2023, Kovid Goyal
+
+
+from calibre.db.tests.base import BaseTest
+
+class NotesTest(BaseTest):
+
+    ae = BaseTest.assertEqual
+
+    def test_notes(self):
+        cache = self.init_cache()
+        authors = sorted(cache.all_field_ids('authors'))
+        self.ae(cache.notes_for('authors', authors[0]), '')
+        doc = 'simple notes for an author'
+        h1 = cache.add_notes_resource(b'resource1')
+        h2 = cache.add_notes_resource(b'resource2')
+        cache.set_notes_for('authors', authors[0], doc, resource_hashes=(h1, h2))
+        self.ae(cache.notes_for('authors', authors[0]), doc)