From ebadee8a5e580048486fa03cc5372220b6054f1d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 14 Dec 2015 13:01:27 +0530
Subject: [PATCH] Code to import a previously exported library

---
 src/calibre/db/backend.py          | 16 ++++---
 src/calibre/db/cache.py            | 76 +++++++++++++++++++++++-------
 src/calibre/db/tests/filesystem.py | 14 ++++--
 src/calibre/utils/exim.py          | 74 ++++++++++++++++++++++++++++-
 4 files changed, 152 insertions(+), 28 deletions(-)

diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py
index bc0dfcae7a..e46cdbccb8 100644
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@@ -1354,7 +1354,7 @@ class DB(object):
         with f:
             return True, f.read(), stat.st_mtime
 
-    def set_cover(self, book_id, path, data):
+    def set_cover(self, book_id, path, data, no_processing=False):
         path = os.path.abspath(os.path.join(self.library_path, path))
         if not os.path.exists(path):
             os.makedirs(path)
@@ -1372,11 +1372,15 @@ class DB(object):
                 time.sleep(0.2)
                 os.remove(path)
         else:
-            try:
-                save_cover_data_to(data, path)
-            except (IOError, OSError):
-                time.sleep(0.2)
-                save_cover_data_to(data, path)
+            if no_processing:
+                with open(path, 'wb') as f:
+                    f.write(data)
+            else:
+                try:
+                    save_cover_data_to(data, path)
+                except (IOError, OSError):
+                    time.sleep(0.2)
+                    save_cover_data_to(data, path)
 
     def copy_format_to(self, book_id, fmt, fname, path, dest,
                        windows_atomic_move=None, use_hardlink=False, report_file_size=None):
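A note on the new no_processing flag above: save_cover_data_to() pushes the
bytes through calibre's image processing pipeline, which is wasteful (and
potentially lossy) when restoring a cover that was already processed the
first time it was added, so the import path copies the exported bytes
through verbatim. A minimal standalone sketch of the two write paths;
write_cover and its injected save_cover_data_to parameter are illustrative
only, not calibre API:

    import time

    def write_cover(path, data, save_cover_data_to, no_processing=False):
        # Sketch of the write step in DB.set_cover(); the real helper is
        # injected as an argument only to keep this example self-contained.
        if no_processing:
            # Import path: data is already-processed image bytes, so
            # write them to disk untouched.
            with open(path, 'wb') as f:
                f.write(data)
        else:
            # Normal path: re-encode via the image pipeline. The retry
            # guards against transient I/O failures (e.g. a briefly
            # locked file on Windows).
            try:
                save_cover_data_to(data, path)
            except (IOError, OSError):
                time.sleep(0.2)
                save_cover_data_to(data, path)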
diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index 721ba89bc7..1b0514f8ee 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -1320,6 +1320,24 @@ class Cache(object):
                 self._reload_from_db()
                 raise
 
+    def _do_add_format(self, book_id, fmt, stream, name=None):
+        path = self._field_for('path', book_id)
+        if path is None:
+            # Theoretically, this should never happen, but apparently it
+            # does: http://www.mobileread.com/forums/showthread.php?t=233353
+            self._update_path({book_id}, mark_as_dirtied=False)
+            path = self._field_for('path', book_id)
+
+        path = path.replace('/', os.sep)
+        title = self._field_for('title', book_id, default_value=_('Unknown'))
+        try:
+            author = self._field_for('authors', book_id, default_value=(_('Unknown'),))[0]
+        except IndexError:
+            author = _('Unknown')
+
+        size, fname = self.backend.add_format(book_id, fmt, stream, title, author, path, name)
+        return size, fname
+
     @api
     def add_format(self, book_id, fmt, stream_or_path, replace=True, run_hooks=True, dbapi=None):
         '''
@@ -1343,28 +1361,14 @@ class Cache(object):
                 self.format_metadata_cache[book_id].pop(fmt, None)
             try:
                 name = self.fields['formats'].format_fname(book_id, fmt)
-            except:
+            except Exception:
                 name = None
             if name and not replace:
                 return False
 
-            path = self._field_for('path', book_id)
-            if path is None:
-                # Theoretically, this should never happen, but apparently it
-                # does: http://www.mobileread.com/forums/showthread.php?t=233353
-                self._update_path({book_id}, mark_as_dirtied=False)
-                path = self._field_for('path', book_id)
-
-            path = path.replace('/', os.sep)
-            title = self._field_for('title', book_id, default_value=_('Unknown'))
-            try:
-                author = self._field_for('authors', book_id, default_value=(_('Unknown'),))[0]
-            except IndexError:
-                author = _('Unknown')
             stream = stream_or_path if hasattr(stream_or_path, 'read') else lopen(stream_or_path, 'rb')
-
-            size, fname = self.backend.add_format(book_id, fmt, stream, title, author, path, name)
+            size, fname = self._do_add_format(book_id, fmt, stream, name)
             del stream
             max_size = self.fields['formats'].table.update_fmt(book_id, fmt, fname, size, self.backend)
@@ -2112,7 +2116,7 @@ class Cache(object):
         with lopen(pt.name, 'rb') as f:
             exporter.add_file(f, dbkey)
         os.remove(pt.name)
-        metadata = {'format_data':format_metadata, 'metadata.db':dbkey}
+        metadata = {'format_data':format_metadata, 'metadata.db':dbkey, 'total':total}
         for i, book_id in enumerate(book_ids):
             if progress is not None:
                 progress(self._field_for('title', book_id), i + 1, total)
@@ -2126,7 +2130,43 @@ class Cache(object):
             with exporter.start_file(cover_key) as dest:
                 if not self.copy_cover_to(book_id, dest, report_file_size=dest.ensure_space):
                     dest.discard()
+                else:
+                    format_metadata[book_id]['.cover'] = cover_key
         exporter.set_metadata(library_key, metadata)
+        exporter.commit()
         if progress is not None:
             progress(_('Completed'), total, total)
-    # }}}
+
+def import_library(library_key, importer, library_path, progress=None):
+    from calibre.db.backend import DB
+    metadata = importer.metadata[library_key]
+    total = metadata['total']
+    if progress is not None:
+        progress('metadata.db', 0, total)
+    with open(os.path.join(library_path, 'metadata.db'), 'wb') as f:
+        src = importer.start_file(metadata['metadata.db'], 'metadata.db for ' + library_path)
+        shutil.copyfileobj(src, f)
+        src.close()
+    cache = Cache(DB(library_path, load_user_formatter_functions=False))
+    cache.init()
+    format_data = {int(book_id):data for book_id, data in metadata['format_data'].iteritems()}
+    cache._update_path(set(format_data), mark_as_dirtied=False)
+    for i, (book_id, fmt_key_map) in enumerate(format_data.iteritems()):
+        title = cache._field_for('title', book_id)
+        if progress is not None:
+            progress(title, i + 1, total)
+        for fmt, fmtkey in fmt_key_map.iteritems():
+            if fmt == '.cover':
+                stream = importer.start_file(fmtkey, _('Cover for %s') % title)
+                path = cache._field_for('path', book_id).replace('/', os.sep)
+                cache.backend.set_cover(book_id, path, stream, no_processing=True)
+            else:
+                stream = importer.start_file(fmtkey, _('{0} format for {1}').format(fmt.upper(), title))
+                size, fname = cache._do_add_format(book_id, fmt, stream)
+                cache.fields['formats'].table.update_fmt(book_id, fmt, fname, size, cache.backend)
+            stream.close()
+        cache.dump_metadata({book_id})
+    if progress is not None:
+        progress(_('Completed'), total, total)
+    return cache
+# }}}
diff --git a/src/calibre/db/tests/filesystem.py b/src/calibre/db/tests/filesystem.py
index 50e9ab6798..da4910063f 100644
--- a/src/calibre/db/tests/filesystem.py
+++ b/src/calibre/db/tests/filesystem.py
@@ -145,9 +145,17 @@ class FilesystemTest(BaseTest):
         self.assertEqual(sorted([os.path.basename(fpath)]), sorted(os.listdir(os.path.dirname(fpath))))
 
     def test_export_import(self):
-        from calibre.utils.exim import Exporter
+        from calibre.db.cache import import_library
+        from calibre.utils.exim import Exporter, Importer
         cache = self.init_cache()
-        for part_size in (1024, 100, 1):
-            with TemporaryDirectory('export_lib') as tdir:
+        for part_size in (1 << 30, 100, 1):
+            with TemporaryDirectory('export_lib') as tdir, TemporaryDirectory('import_lib') as idir:
                 exporter = Exporter(tdir, part_size=part_size)
                 cache.export_library('l', exporter)
+                importer = Importer(tdir)
+                ic = import_library('l', importer, idir)
+                self.assertEqual(cache.all_book_ids(), ic.all_book_ids())
+                for book_id in cache.all_book_ids():
+                    self.assertEqual(cache.cover(book_id), ic.cover(book_id), 'Covers not identical for book: %d' % book_id)
+                    for fmt in cache.formats(book_id):
+                        self.assertEqual(cache.format(book_id, fmt), ic.format(book_id, fmt))
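The test above drives the full round trip. Outside the test harness the
flow looks roughly like this; the directory paths are made up, 'l' is
simply the key under which the library is stored in the export set, and
cache is assumed to be an already-open Cache for the source library:

    from calibre.db.cache import import_library
    from calibre.utils.exim import Exporter, Importer

    # cache: an open calibre.db.cache.Cache for the source library
    exporter = Exporter('/tmp/exported')   # writes part-0001.calibre-data, ...
    cache.export_library('l', exporter)    # now also commits the exporter

    importer = Importer('/tmp/exported')   # validates and orders the parts
    new_cache = import_library('l', importer, '/tmp/restored')
    if importer.corrupted_files:           # SHA-1 mismatches found on close()
        raise ValueError('Corrupted files: %r' % importer.corrupted_files)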
diff --git a/src/calibre/utils/exim.py b/src/calibre/utils/exim.py
index c92bf998d5..4b51416ce0 100644
--- a/src/calibre/utils/exim.py
+++ b/src/calibre/utils/exim.py
@@ -58,6 +58,7 @@ class Exporter(object):
     VERSION = 1
     TAIL_FMT = b'!II?'  # part_num, version, is_last
     MDATA_SZ_FMT = b'!Q'
+    EXT = '.calibre-data'
 
     def __init__(self, path_to_export_dir, part_size=(1 << 30)):
         self.part_size = part_size
@@ -78,7 +79,7 @@ class Exporter(object):
 
     def new_part(self):
         self.parts.append(open(os.path.join(
-            self.base, 'part-{:04d}.calibre-data'.format(len(self.parts) + 1)), 'wb'))
+            self.base, 'part-{:04d}{}'.format(len(self.parts) + 1, self.EXT)), 'wb'))
 
     def commit_part(self, is_last=False):
         self.f.write(struct.pack(self.TAIL_FMT, len(self.parts), self.VERSION, is_last))
@@ -112,3 +113,74 @@ class Exporter(object):
 
     def start_file(self, key):
         return FileDest(key, self)
+
+class FileSource(object):
+
+    def __init__(self, f, size, digest, description, importer):
+        self.f, self.size, self.digest, self.description = f, size, digest, description
+        self.end = f.tell() + size
+        self.hasher = hashlib.sha1()
+        self.importer = importer
+
+    def read(self, size=None):
+        if size is not None and size < 1:
+            return b''
+        left = self.end - self.f.tell()
+        amt = min(left, size or left)
+        if amt < 1:
+            return b''
+        ans = self.f.read(amt)
+        self.hasher.update(ans)
+        return ans
+
+    def close(self):
+        if self.hasher.hexdigest() != self.digest:
+            self.importer.corrupted_files.append(self.description)
+        self.hasher = self.f = None
+
+class Importer(object):
+
+    def __init__(self, path_to_export_dir):
+        self.corrupted_files = []
+        part_map = {}
+        tail_size = struct.calcsize(Exporter.TAIL_FMT)
+        for name in os.listdir(path_to_export_dir):
+            if name.lower().endswith(Exporter.EXT):
+                path = os.path.join(path_to_export_dir, name)
+                with open(path, 'rb') as f:
+                    f.seek(-tail_size, os.SEEK_END)
+                    raw = f.read()
+                if len(raw) != tail_size:
+                    raise ValueError('The exported data in %s is not valid, tail too small' % name)
+                part_num, version, is_last = struct.unpack(Exporter.TAIL_FMT, raw)
+                if version > Exporter.VERSION:
+                    raise ValueError('The exported data in %s is not valid, version (%d) is higher than maximum supported version.' % (
+                        name, version))
+                part_map[part_num] = path, is_last
+        nums = sorted(part_map)
+        if not nums:
+            raise ValueError('No exported data found in: %s' % path_to_export_dir)
+        if nums[0] != 1:
+            raise ValueError('The first part of this exported data set is missing')
+        if not part_map[nums[-1]][1]:
+            raise ValueError('The last part of this exported data set is missing')
+        if len(nums) != nums[-1]:
+            raise ValueError('There are some parts of the exported data set missing')
+        self.part_map = {num:path for num, (path, is_last) in part_map.iteritems()}
+        msf = struct.calcsize(Exporter.MDATA_SZ_FMT)
+        offset = tail_size + msf
+        with self.part(nums[-1]) as f:
+            f.seek(-offset, os.SEEK_END)
+            sz, = struct.unpack(Exporter.MDATA_SZ_FMT, f.read(msf))
+            f.seek(- sz - offset, os.SEEK_END)
+            self.metadata = json.loads(f.read(sz))
+            self.file_metadata = self.metadata['file_metadata']
+
+    def part(self, num):
+        return lopen(self.part_map[num], 'rb')
+
+    def start_file(self, key, description):
+        partnum, pos, size, digest = self.file_metadata[key]
+        f = self.part(partnum)
+        f.seek(pos)
+        return FileSource(f, size, digest, description, self)
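For readers following the seek arithmetic in Importer.__init__: every part
file ends with the fixed TAIL_FMT trailer, and the part flagged is_last
additionally carries the JSON metadata block, preceded by its MDATA_SZ_FMT
length, immediately before that trailer. A standalone sketch of parsing
that layout; read_tail is illustrative only, not part of the patch:

    import json, os, struct

    TAIL_FMT = b'!II?'    # part_num, version, is_last
    MDATA_SZ_FMT = b'!Q'  # byte length of the JSON metadata block

    def read_tail(path):
        # Layout of a part file, back to front:
        #   [file data][JSON metadata][!Q size][!II? tail]
        # (the JSON block is only read from the last part)
        tail_size = struct.calcsize(TAIL_FMT)
        msf = struct.calcsize(MDATA_SZ_FMT)
        with open(path, 'rb') as f:
            f.seek(-tail_size, os.SEEK_END)
            part_num, version, is_last = struct.unpack(TAIL_FMT, f.read(tail_size))
            if not is_last:
                return part_num, version, None
            f.seek(-(tail_size + msf), os.SEEK_END)
            sz, = struct.unpack(MDATA_SZ_FMT, f.read(msf))
            f.seek(-(sz + msf + tail_size), os.SEEK_END)
            return part_num, version, json.loads(f.read(sz))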