diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 52cadb4fe..1ca4fef17 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -3,7 +3,6 @@ import json import os import shutil import tempfile -from itertools import chain from itertools import islice from pathlib import Path from typing import TYPE_CHECKING @@ -81,6 +80,87 @@ def serialize_queryset_batched( yield serializers.serialize("python", chunk) +class StreamingManifestWriter: + """Incrementally writes a JSON array to a file, one record at a time. + + Writes to .tmp first; on close(), optionally BLAKE2b-compares + with the existing file (--compare-json) and renames or discards accordingly. + On exception, discard() deletes the tmp file and leaves the original intact. + """ + + def __init__( + self, + path: Path, + *, + compare_json: bool = False, + files_in_export_dir: "set[Path] | None" = None, + ) -> None: + self._path = path.resolve() + self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp") + self._compare_json = compare_json + self._files_in_export_dir: set[Path] = ( + files_in_export_dir if files_in_export_dir is not None else set() + ) + self._file = None + self._first = True + + def open(self) -> None: + self._path.parent.mkdir(parents=True, exist_ok=True) + self._file = self._tmp_path.open("w", encoding="utf-8") + self._file.write("[") + self._first = True + + def write_record(self, record: dict) -> None: + if not self._first: + self._file.write(",\n") + else: + self._first = False + self._file.write( + json.dumps(record, cls=DjangoJSONEncoder, indent=2, ensure_ascii=False), + ) + + def write_batch(self, records: list[dict]) -> None: + for record in records: + self.write_record(record) + + def close(self) -> None: + if self._file is None: + return + self._file.write("\n]") + self._file.close() + self._file = None + self._finalize() + + 
def discard(self) -> None:
+        if self._file is not None:
+            self._file.close()
+            self._file = None
+        if self._tmp_path.exists():
+            self._tmp_path.unlink()
+
+    def _finalize(self) -> None:
+        """Compare with existing file (if --compare-json) then rename or discard tmp."""
+        if self._path in self._files_in_export_dir:
+            self._files_in_export_dir.remove(self._path)
+        if self._compare_json and self._path.exists():
+            existing_hash = hashlib.blake2b(self._path.read_bytes()).hexdigest()
+            new_hash = hashlib.blake2b(self._tmp_path.read_bytes()).hexdigest()
+            if existing_hash == new_hash:
+                self._tmp_path.unlink()
+                return
+        self._tmp_path.rename(self._path)
+
+    def __enter__(self) -> "StreamingManifestWriter":
+        self.open()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if exc_type is not None:
+            self.discard()
+        else:
+            self.close()
+
+
 class Command(CryptMixin, BaseCommand):
     help = (
         "Decrypt and rename all files in our collection into a given target "
@@ -322,95 +402,83 @@ class Command(CryptMixin, BaseCommand):
         if settings.AUDIT_LOG_ENABLED:
             manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()
 
-        with transaction.atomic():
-            manifest_dict = {}
-
-            # Build an overall manifest
-            for key, object_query in manifest_key_to_object_query.items():
-                manifest_dict[key] = list(
-                    chain.from_iterable(
-                        serialize_queryset_batched(
-                            object_query,
-                            batch_size=self.batch_size,
-                        ),
-                    ),
-                )
-
-            self.encrypt_secret_fields(manifest_dict)
-
-            # These are treated specially and included in the per-document manifest
-            # if that setting is enabled. Otherwise, they are just exported to the bulk
-            # manifest
-            document_map: dict[int, Document] = {
-                d.pk: d for d in manifest_key_to_object_query["documents"]
-            }
-            document_manifest = manifest_dict["documents"]
-
-            # 3. 
Export files from each document - for index, document_dict in tqdm.tqdm( - enumerate(document_manifest), - total=len(document_manifest), - disable=self.no_progress_bar, - ): - document = document_map[document_dict["pk"]] - - # 3.1. generate a unique filename - base_name = self.generate_base_name(document) - - # 3.2. write filenames into manifest - original_target, thumbnail_target, archive_target = ( - self.generate_document_targets(document, base_name, document_dict) + # Crypto setup before streaming begins + if self.passphrase: + self.setup_crypto(passphrase=self.passphrase) + elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0: + self.stdout.write( + self.style.NOTICE( + "No passphrase was given, sensitive fields will be in plaintext", + ), ) - # 3.3. write files to target folder - if not self.data_only: - self.copy_document_files( - document, - original_target, - thumbnail_target, - archive_target, - ) - - if self.split_manifest: - manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json") - if self.use_folder_prefix: - manifest_name = Path("json") / manifest_name - manifest_name = (self.target / manifest_name).resolve() - manifest_name.parent.mkdir(parents=True, exist_ok=True) - content = [document_manifest[index]] - content += list( - filter( - lambda d: d["fields"]["document"] == document_dict["pk"], - manifest_dict["notes"], - ), - ) - content += list( - filter( - lambda d: d["fields"]["document"] == document_dict["pk"], - manifest_dict["custom_field_instances"], - ), - ) - - self.check_and_write_json( - content, - manifest_name, - ) - - # These were exported already - if self.split_manifest: - del manifest_dict["documents"] - del manifest_dict["notes"] - del manifest_dict["custom_field_instances"] - - # 4.1 write primary manifest to target folder - manifest = [] - for key, item in manifest_dict.items(): - manifest.extend(item) + document_manifest: list[dict] = [] manifest_path = (self.target / "manifest.json").resolve() - 
self.check_and_write_json( - manifest, + + with StreamingManifestWriter( manifest_path, - ) + compare_json=self.compare_json, + files_in_export_dir=self.files_in_export_dir, + ) as writer: + with transaction.atomic(): + for key, qs in manifest_key_to_object_query.items(): + if key == "documents": + # Accumulate for file-copy loop; written to manifest after + for batch in serialize_queryset_batched( + qs, + batch_size=self.batch_size, + ): + for record in batch: + self._encrypt_record_inline(record) + document_manifest.extend(batch) + elif self.split_manifest and key in ( + "notes", + "custom_field_instances", + ): + # Written per-document in _write_split_manifest + pass + else: + for batch in serialize_queryset_batched( + qs, + batch_size=self.batch_size, + ): + for record in batch: + self._encrypt_record_inline(record) + writer.write_batch(batch) + + document_map: dict[int, Document] = { + d.pk: d for d in Document.objects.order_by("id") + } + + # 3. Export files from each document + for document_dict in tqdm.tqdm( + document_manifest, + total=len(document_manifest), + disable=self.no_progress_bar, + ): + document = document_map[document_dict["pk"]] + + # 3.1. generate a unique filename + base_name = self.generate_base_name(document) + + # 3.2. write filenames into manifest + original_target, thumbnail_target, archive_target = ( + self.generate_document_targets(document, base_name, document_dict) + ) + + # 3.3. 
write files to target folder + if not self.data_only: + self.copy_document_files( + document, + original_target, + thumbnail_target, + archive_target, + ) + + if self.split_manifest: + self._write_split_manifest(document_dict, document, base_name) + else: + writer.write_record(document_dict) # 4.2 write version information to target folder extra_metadata_path = (self.target / "metadata.json").resolve() @@ -532,6 +600,42 @@ class Command(CryptMixin, BaseCommand): archive_target, ) + def _encrypt_record_inline(self, record: dict) -> None: + """Encrypt sensitive fields in a single record, if passphrase is set.""" + if not self.passphrase: + return + fields = self.CRYPT_FIELDS_BY_MODEL.get(record.get("model", "")) + if fields: + for field in fields: + if record["fields"].get(field): + record["fields"][field] = self.encrypt_string( + value=record["fields"][field], + ) + + def _write_split_manifest( + self, + document_dict: dict, + document: Document, + base_name: Path, + ) -> None: + """Write per-document manifest file for --split-manifest mode.""" + content = [document_dict] + content.extend( + serializers.serialize("python", Note.objects.filter(document=document)), + ) + content.extend( + serializers.serialize( + "python", + CustomFieldInstance.objects.filter(document=document), + ), + ) + manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json") + if self.use_folder_prefix: + manifest_name = Path("json") / manifest_name + manifest_name = (self.target / manifest_name).resolve() + manifest_name.parent.mkdir(parents=True, exist_ok=True) + self.check_and_write_json(content, manifest_name) + def check_and_write_json( self, content: list[dict] | dict, @@ -549,14 +653,14 @@ class Command(CryptMixin, BaseCommand): if target in self.files_in_export_dir: self.files_in_export_dir.remove(target) if self.compare_json: - target_checksum = hashlib.md5(target.read_bytes()).hexdigest() + target_checksum = hashlib.blake2b(target.read_bytes()).hexdigest() src_str = 
json.dumps( content, cls=DjangoJSONEncoder, indent=2, ensure_ascii=False, ) - src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest() + src_checksum = hashlib.blake2b(src_str.encode("utf-8")).hexdigest() if src_checksum == target_checksum: perform_write = False @@ -606,28 +710,3 @@ class Command(CryptMixin, BaseCommand): if perform_copy: target.parent.mkdir(parents=True, exist_ok=True) copy_file_with_basic_stats(source, target) - - def encrypt_secret_fields(self, manifest: dict) -> None: - """ - Encrypts certain fields in the export. Currently limited to the mail account password - """ - - if self.passphrase: - self.setup_crypto(passphrase=self.passphrase) - - for crypt_config in self.CRYPT_FIELDS: - exporter_key = crypt_config["exporter_key"] - crypt_fields = crypt_config["fields"] - for manifest_record in manifest[exporter_key]: - for field in crypt_fields: - if manifest_record["fields"][field]: - manifest_record["fields"][field] = self.encrypt_string( - value=manifest_record["fields"][field], - ) - - elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0: - self.stdout.write( - self.style.NOTICE( - "No passphrase was given, sensitive fields will be in plaintext", - ), - ) diff --git a/src/documents/management/commands/mixins.py b/src/documents/management/commands/mixins.py index aa5befd6d..b03e05956 100644 --- a/src/documents/management/commands/mixins.py +++ b/src/documents/management/commands/mixins.py @@ -71,7 +71,7 @@ class CryptMixin: key_size = 32 kdf_algorithm = "pbkdf2_sha256" - CRYPT_FIELDS: CryptFields = [ + CRYPT_FIELDS: list[CryptFields] = [ { "exporter_key": "mail_accounts", "model_name": "paperless_mail.mailaccount", @@ -89,6 +89,10 @@ class CryptMixin: ], }, ] + # O(1) lookup for per-record encryption; derived from CRYPT_FIELDS at class definition time + CRYPT_FIELDS_BY_MODEL: dict[str, list[str]] = { + cfg["model_name"]: cfg["fields"] for cfg in CRYPT_FIELDS + } def get_crypt_params(self) -> dict[str, dict[str, str | 
int]]: return { diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index 2d17aaec6..fb41bfa07 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -753,6 +753,31 @@ class TestExportImport( call_command("document_importer", "--no-progress-bar", self.target) self.assertEqual(Document.objects.count(), 4) + def test_folder_prefix_with_split(self) -> None: + """ + GIVEN: + - Request to export documents to directory + WHEN: + - Option use_folder_prefix is used + - Option split manifest is used + THEN: + - Documents can be imported again + """ + shutil.rmtree(Path(self.dirs.media_dir) / "documents") + shutil.copytree( + Path(__file__).parent / "samples" / "documents", + Path(self.dirs.media_dir) / "documents", + ) + + self._do_export(use_folder_prefix=True, split_manifest=True) + + with paperless_environment(): + self.assertEqual(Document.objects.count(), 4) + Document.objects.all().delete() + self.assertEqual(Document.objects.count(), 0) + call_command("document_importer", "--no-progress-bar", self.target) + self.assertEqual(Document.objects.count(), 4) + def test_import_db_transaction_failed(self) -> None: """ GIVEN: