Implement database rebuild for notes data

This commit is contained in:
Kovid Goyal 2023-08-24 20:40:36 +05:30
parent 6017bafe48
commit fe4292c3cc
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 110 additions and 18 deletions

View File

@@ -1020,6 +1020,9 @@ class DB:
os.remove(pt.name) os.remove(pt.name)
self.notes.export_non_db_data(zf) self.notes.export_non_db_data(zf)
def restore_notes(self, report_progress):
self.notes.restore(self.conn, self.tables, report_progress)
def initialize_fts(self, dbref): def initialize_fts(self, dbref):
self.fts = None self.fts = None
if not self.prefs['fts_enabled']: if not self.prefs['fts_enabled']:

View File

@@ -9,6 +9,7 @@ METADATA_FILE_NAME = 'metadata.opf'
DEFAULT_TRASH_EXPIRY_TIME_SECONDS = 14 * 86400 DEFAULT_TRASH_EXPIRY_TIME_SECONDS = 14 * 86400
TRASH_DIR_NAME = '.caltrash' TRASH_DIR_NAME = '.caltrash'
NOTES_DIR_NAME = '.calnotes' NOTES_DIR_NAME = '.calnotes'
NOTES_DB_NAME = 'notes.db'
DATA_DIR_NAME = 'data' DATA_DIR_NAME = 'data'
DATA_FILE_PATTERN = f'{DATA_DIR_NAME}/**/*' DATA_FILE_PATTERN = f'{DATA_DIR_NAME}/**/*'
BOOK_ID_PATH_TEMPLATE = ' ({})' BOOK_ID_PATH_TEMPLATE = ' ({})'

View File

@@ -17,7 +17,7 @@ from calibre.utils.copy_files import WINDOWS_SLEEP_FOR_RETRY_TIME
from calibre.utils.filenames import copyfile_using_links, make_long_path_useable from calibre.utils.filenames import copyfile_using_links, make_long_path_useable
from calibre.utils.icu import lower as icu_lower from calibre.utils.icu import lower as icu_lower
from ..constants import NOTES_DIR_NAME from ..constants import NOTES_DB_NAME, NOTES_DIR_NAME
from .schema_upgrade import SchemaUpgrade from .schema_upgrade import SchemaUpgrade
if iswindows: if iswindows:
@@ -31,6 +31,7 @@ copy_marked_up_text = cmt()
SEP = b'\0\x1c\0' SEP = b'\0\x1c\0'
DOC_NAME = 'doc.html' DOC_NAME = 'doc.html'
def hash_data(data: bytes) -> str: def hash_data(data: bytes) -> str:
return 'xxh64:' + xxhash.xxh3_64_hexdigest(data) return 'xxh64:' + xxhash.xxh3_64_hexdigest(data)
@@ -77,7 +78,7 @@ class Notes:
def reopen(self, backend): def reopen(self, backend):
conn = backend.get_connection() conn = backend.get_connection()
conn.notes_dbpath = os.path.join(self.notes_dir, 'notes.db') conn.notes_dbpath = os.path.join(self.notes_dir, NOTES_DB_NAME)
conn.execute("ATTACH DATABASE ? AS notes_db", (conn.notes_dbpath,)) conn.execute("ATTACH DATABASE ? AS notes_db", (conn.notes_dbpath,))
self.allowed_fields = set() self.allowed_fields = set()
triggers = [] triggers = []
@@ -106,11 +107,11 @@ class Notes:
found = False found = False
for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)'): for (rhash,) in conn.get('SELECT hash FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)'):
found = True found = True
remove_with_retry(self.path_for_resource(conn, rhash)) remove_with_retry(self.path_for_resource(rhash))
if found: if found:
conn.execute('DELETE FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)') conn.execute('DELETE FROM notes_db.resources WHERE hash NOT IN (SELECT resource FROM notes_db.notes_resources_link)')
def path_for_resource(self, conn, resource_hash: str) -> str: def path_for_resource(self, resource_hash: str) -> str:
hashalg, digest = resource_hash.split(':', 1) hashalg, digest = resource_hash.split(':', 1)
prefix = digest[:2] prefix = digest[:2]
# Cant use colons in filenames on windows safely # Cant use colons in filenames on windows safely
@@ -128,7 +129,9 @@ class Notes:
SELECT value FROM resources_table WHERE value NOT IN (SELECT resource FROM notes_db.notes_resources_link) SELECT value FROM resources_table WHERE value NOT IN (SELECT resource FROM notes_db.notes_resources_link)
'''.format(','.join(repeat('(?)', len(resources_to_potentially_remove)))) '''.format(','.join(repeat('(?)', len(resources_to_potentially_remove))))
for (x,) in conn.execute(stmt, resources_to_potentially_remove): for (x,) in conn.execute(stmt, resources_to_potentially_remove):
remove_with_retry(self.path_for_resource(conn, x)) p = self.path_for_resource(x)
remove_with_retry(p)
remove_with_retry(p + '.name')
def note_id_for(self, conn, field_name, item_id): def note_id_for(self, conn, field_name, item_id):
return conn.get('SELECT id FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name), all=False) return conn.get('SELECT id FROM notes_db.notes WHERE item=? AND colname=?', (item_id, field_name), all=False)
@@ -138,7 +141,7 @@ class Notes:
for (h,) in conn.execute('SELECT resource from notes_db.notes_resources_link WHERE note=?', (note_id,)): for (h,) in conn.execute('SELECT resource from notes_db.notes_resources_link WHERE note=?', (note_id,)):
yield h yield h
def set_backup_for(self, field_name, item_id, marked_up_text, searchable_text): def set_backup_for(self, field_name, item_id, marked_up_text, searchable_text, resource_hashes):
path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id))) path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id)))
try: try:
f = open(path, 'wb') f = open(path, 'wb')
@@ -149,6 +152,8 @@ class Notes:
f.write(marked_up_text.encode('utf-8')) f.write(marked_up_text.encode('utf-8'))
f.write(SEP) f.write(SEP)
f.write(searchable_text.encode('utf-8')) f.write(searchable_text.encode('utf-8'))
f.write(SEP)
f.write('\n'.join(resource_hashes).encode('utf-8'))
def retire_entry(self, field_name, item_id, item_value, resources, note_id): def retire_entry(self, field_name, item_id, item_value, resources, note_id):
path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id))) path = make_long_path_useable(os.path.join(self.backup_dir, field_name, str(item_id)))
@@ -161,7 +166,7 @@ class Notes:
with open(make_long_path_useable(os.path.join(destdir, 'note_id')), 'w') as nif: with open(make_long_path_useable(os.path.join(destdir, 'note_id')), 'w') as nif:
nif.write(str(note_id)) nif.write(str(note_id))
for rhash, rname in resources: for rhash, rname in resources:
rpath = make_long_path_useable(self.path_for_resource(None, rhash)) rpath = make_long_path_useable(self.path_for_resource(rhash))
if os.path.exists(rpath): if os.path.exists(rpath):
rdest = os.path.join(destdir, 'res-'+rname) rdest = os.path.join(destdir, 'res-'+rname)
copyfile_using_links(rpath, make_long_path_useable(rdest), dest_is_dir=False) copyfile_using_links(rpath, make_long_path_useable(rdest), dest_is_dir=False)
@@ -174,7 +179,10 @@ class Notes:
if not os.path.exists(srcdir) or self.note_id_for(conn, field_name, item_id) is not None: if not os.path.exists(srcdir) or self.note_id_for(conn, field_name, item_id) is not None:
return note_id return note_id
with open(os.path.join(srcdir, DOC_NAME), 'rb') as src: with open(os.path.join(srcdir, DOC_NAME), 'rb') as src:
a, b = src.read().partition(SEP)[::2] try:
a, b, _ = src.read().split(SEP, 2)
except Exception:
return note_id
marked_up_text, searchable_text = a.decode('utf-8'), b.decode('utf-8') marked_up_text, searchable_text = a.decode('utf-8'), b.decode('utf-8')
resources = set() resources = set()
for x in os.listdir(srcdir): for x in os.listdir(srcdir):
@@ -219,7 +227,7 @@ class Notes:
conn.executemany(''' conn.executemany('''
INSERT INTO notes_db.notes_resources_link (note,resource) VALUES (?,?); INSERT INTO notes_db.notes_resources_link (note,resource) VALUES (?,?);
''', tuple((note_id, x) for x in resources_to_add)) ''', tuple((note_id, x) for x in resources_to_add))
self.set_backup_for(field_name, item_id, marked_up_text, searchable_text) self.set_backup_for(field_name, item_id, marked_up_text, searchable_text, used_resource_hashes)
return note_id return note_id
def get_note(self, conn, field_name, item_id): def get_note(self, conn, field_name, item_id):
@@ -270,7 +278,7 @@ class Notes:
else: else:
data = path_or_stream_or_data.read() data = path_or_stream_or_data.read()
resource_hash = hash_data(data) resource_hash = hash_data(data)
path = self.path_for_resource(conn, resource_hash) path = self.path_for_resource(resource_hash)
path = make_long_path_useable(path) path = make_long_path_useable(path)
exists = False exists = False
try: try:
@@ -294,6 +302,8 @@ class Notes:
while True: while True:
try: try:
conn.execute('UPDATE notes_db.resources SET name=? WHERE hash=?', (name, resource_hash)) conn.execute('UPDATE notes_db.resources SET name=? WHERE hash=?', (name, resource_hash))
with open(path + '.name', 'w') as fn:
fn.write(name)
break break
except apsw.ConstraintError: except apsw.ConstraintError:
c += 1 c += 1
@@ -303,6 +313,8 @@ class Notes:
while True: while True:
try: try:
conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?)', (resource_hash, name), all=False) conn.get('INSERT INTO notes_db.resources (hash,name) VALUES (?,?)', (resource_hash, name), all=False)
with open(path + '.name', 'w') as fn:
fn.write(name)
break break
except apsw.ConstraintError: except apsw.ConstraintError:
c += 1 c += 1
@@ -311,7 +323,7 @@ class Notes:
def get_resource_data(self, conn, resource_hash) -> Optional[dict]: def get_resource_data(self, conn, resource_hash) -> Optional[dict]:
for (name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)): for (name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
path = self.path_for_resource(conn, resource_hash) path = self.path_for_resource(resource_hash)
path = make_long_path_useable(path) path = make_long_path_useable(path)
os.listdir(os.path.dirname(path)) os.listdir(os.path.dirname(path))
with suppress(FileNotFoundError), open(path, 'rb') as f: with suppress(FileNotFoundError), open(path, 'rb') as f:
@@ -359,7 +371,7 @@ class Notes:
def export_non_db_data(self, zf): def export_non_db_data(self, zf):
import zipfile import zipfile
def add_dir(which): def add_dir(which):
for dirpath, _, filenames in os.walk(which): for dirpath, _, filenames in os.walk(make_long_path_useable(which)):
for f in filenames: for f in filenames:
path = os.path.join(dirpath, f) path = os.path.join(dirpath, f)
with open(path, 'rb') as src: with open(path, 'rb') as src:
@@ -371,3 +383,56 @@ class Notes:
def vacuum(self, conn): def vacuum(self, conn):
conn.execute('VACUUM notes_db') conn.execute('VACUUM notes_db')
def restore(self, conn, tables, report_progress):
resources = {}
errors = []
for subdir in os.listdir(make_long_path_useable(self.resources_dir)):
for rf in os.listdir(make_long_path_useable(os.path.join(self.resources_dir, subdir))):
name_path = os.path.join(self.resources_dir, subdir, rf + '.name')
name = 'unnamed'
with suppress(OSError), open(make_long_path_useable(name_path)) as n:
name = n.read()
resources[rf] = name
items = {}
for f in os.listdir(make_long_path_useable(self.backup_dir)):
if f in self.allowed_fields:
items[f] = []
for x in os.listdir(make_long_path_useable(os.path.join(self.backup_dir, f))):
with suppress(Exception):
items[f].append(int(x))
known_resources = frozenset(resources)
conn.executemany('INSERT OR REPLACE INTO notes_db.resources (hash,name) VALUES (?,?)', tuple(resources.items()))
i, total = 0, sum(len(x) for x in items.values())
report_progress(None, total)
for field, entries in items.items():
table = tables[field]
for item_id in entries:
item_val = table.id_map.get(item_id)
i += 1
if item_val is not None:
report_progress(item_val, i)
try:
with open(make_long_path_useable(os.path.join(self.backup_dir, field, str(item_id)))) as f:
raw = f.read()
except OSError as e:
errors.append(_('Failed to read from document for {path} with error: {error}').format(path=item_val, error=e))
continue
try:
doc, searchable_text, res = raw.split(SEP, 2)
except Exception:
errors.append(_('Failed to parse document for: {}').format(item_val))
continue
resources = frozenset(res.splitlines())
missing_resources = resources - known_resources
if missing_resources:
errors.append(_('Some resources for {} were missing').format(item_val))
resources &= known_resources
try:
self.set_note(conn, field, item_id, item_val, doc, resources, searchable_text)
except Exception as e:
errors.append(_('Failed to set note for {path} with error: {error}').format(path=item_val, error=e))
else:
errors.append(_('Could not restore item: {} as not present in database').format(f'{field}/{item_id}'))
report_progress('', i)
return errors

View File

@@ -18,7 +18,7 @@ from threading import Thread
from calibre import force_unicode, isbytestring from calibre import force_unicode, isbytestring
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
from calibre.db.backend import DB, DBPrefs from calibre.db.backend import DB, DBPrefs
from calibre.db.constants import METADATA_FILE_NAME, TRASH_DIR_NAME from calibre.db.constants import METADATA_FILE_NAME, TRASH_DIR_NAME, NOTES_DIR_NAME, NOTES_DB_NAME
from calibre.db.cache import Cache from calibre.db.cache import Cache
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
@@ -53,7 +53,9 @@ def is_ebook_file(filename):
class Restorer(Cache): class Restorer(Cache):
def __init__(self, library_path, default_prefs=None, restore_all_prefs=False, progress_callback=lambda x, y:True): def __init__(self, library_path, default_prefs=None, restore_all_prefs=False, progress_callback=lambda x, y:True):
backend = DB(library_path, default_prefs=default_prefs, restore_all_prefs=restore_all_prefs, progress_callback=progress_callback) backend = DB(
library_path, default_prefs=default_prefs, restore_all_prefs=restore_all_prefs, progress_callback=progress_callback
)
Cache.__init__(self, backend) Cache.__init__(self, backend)
for x in ('update_path', 'mark_as_dirty'): for x in ('update_path', 'mark_as_dirty'):
setattr(self, x, self.no_op) setattr(self, x, self.no_op)
@@ -81,6 +83,7 @@ class Restore(Thread):
self.conflicting_custom_cols = {} self.conflicting_custom_cols = {}
self.failed_restores = [] self.failed_restores = []
self.mismatched_dirs = [] self.mismatched_dirs = []
self.notes_errors = []
self.successes = 0 self.successes = 0
self.tb = None self.tb = None
self.link_maps = {} self.link_maps = {}
@@ -88,7 +91,7 @@ class Restore(Thread):
@property @property
def errors_occurred(self): def errors_occurred(self):
return (self.failed_dirs or self.mismatched_dirs or return (self.failed_dirs or self.mismatched_dirs or
self.conflicting_custom_cols or self.failed_restores) self.conflicting_custom_cols or self.failed_restores or self.notes_errors)
@property @property
def report(self): def report(self):
@@ -122,6 +125,11 @@ class Restore(Thread):
for x in self.mismatched_dirs: for x in self.mismatched_dirs:
ans += '\t' + force_unicode(x, filesystem_encoding) + '\n' ans += '\t' + force_unicode(x, filesystem_encoding) + '\n'
if self.notes_errors:
ans += '\n\n'
ans += 'Failed to restore notes for the following items:\n'
for x in self.notes_errors:
ans += '\t' + x
return ans return ans
def run(self): def run(self):
@@ -285,6 +293,9 @@ class Restore(Thread):
self.progress_callback(None, len(self.books)) self.progress_callback(None, len(self.books))
self.books.sort(key=itemgetter('id')) self.books.sort(key=itemgetter('id'))
shutil.copytree(os.path.join(self.src_library_path, NOTES_DIR_NAME), os.path.join(self.library_path, NOTES_DIR_NAME))
with suppress(FileNotFoundError):
os.remove(os.path.join(self.library_path, NOTES_DIR_NAME, NOTES_DB_NAME))
db = Restorer(self.library_path) db = Restorer(self.library_path)
for i, book in enumerate(self.books): for i, book in enumerate(self.books):
@@ -299,6 +310,7 @@ class Restore(Thread):
for field, lmap in self.link_maps.items(): for field, lmap in self.link_maps.items():
with suppress(Exception): with suppress(Exception):
db.set_link_map(field, {k:v[0] for k, v in lmap.items()}) db.set_link_map(field, {k:v[0] for k, v in lmap.items()})
self.notes_errors = db.backend.restore_notes(self.progress_callback)
db.close() db.close()
def replace_db(self): def replace_db(self):
@@ -310,7 +322,7 @@ class Restore(Thread):
os.remove(save_path) os.remove(save_path)
if os.path.exists(dbpath): if os.path.exists(dbpath):
try: try:
os.rename(dbpath, save_path) os.replace(dbpath, save_path)
except OSError: except OSError:
time.sleep(30) # Wait a little for dropbox or the antivirus or whatever to release the file time.sleep(30) # Wait a little for dropbox or the antivirus or whatever to release the file
shutil.copyfile(dbpath, save_path) shutil.copyfile(dbpath, save_path)

View File

@@ -27,8 +27,8 @@ def test_notes_api(self: 'NotesTest'):
self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1})) self.ae(cache.notes_resources_used_by('authors', authors[0]), frozenset({h1}))
self.ae(cache.get_notes_resource(h1)['data'], b'resource1') self.ae(cache.get_notes_resource(h1)['data'], b'resource1')
self.ae(cache.get_notes_resource(h2), None) self.ae(cache.get_notes_resource(h2), None)
self.assertTrue(os.path.exists(notes.path_for_resource(cache.backend.conn, h1))) self.assertTrue(os.path.exists(notes.path_for_resource(h1)))
self.assertFalse(os.path.exists(notes.path_for_resource(cache.backend.conn, h2))) self.assertFalse(os.path.exists(notes.path_for_resource(h2)))
# check retirement # check retirement
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg') h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')

View File

@@ -401,6 +401,12 @@ class WritingTest(BaseTest):
with open(os.path.join(bookdir, 'sub', 'recurse'), 'w') as f: with open(os.path.join(bookdir, 'sub', 'recurse'), 'w') as f:
f.write('recurse') f.write('recurse')
ebefore = read_all_extra_files() ebefore = read_all_extra_files()
authors = cache.field_for('authors', 1)
author_id = cache.get_item_id('authors', authors[0])
doc = 'simple notes for an author'
h1 = cache.add_notes_resource(b'resource1', 'r1.jpg')
h2 = cache.add_notes_resource(b'resource2', 'r1.jpg')
cache.set_notes_for('authors', author_id, doc, resource_hashes=(h1, h2))
cache.close() cache.close()
from calibre.db.restore import Restore from calibre.db.restore import Restore
restorer = Restore(cl) restorer = Restore(cl)
@@ -412,6 +418,11 @@ class WritingTest(BaseTest):
ae(lbefore, tuple(cache.get_all_link_maps_for_book(i) for i in book_ids)) ae(lbefore, tuple(cache.get_all_link_maps_for_book(i) for i in book_ids))
ae(fbefore, read_all_formats()) ae(fbefore, read_all_formats())
ae(ebefore, read_all_extra_files()) ae(ebefore, read_all_extra_files())
ae(cache.notes_for('authors', author_id), doc)
ae(cache.notes_resources_used_by('authors', author_id), frozenset({h1, h2}))
ae(cache.get_notes_resource(h1)['data'], b'resource1')
ae(cache.get_notes_resource(h2)['data'], b'resource2')
# }}} # }}}
def test_set_cover(self): # {{{ def test_set_cover(self): # {{{