From 1f066ea54856bee448101eb906cd1a3c9452fc8b Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sat, 7 Mar 2026 13:50:11 -0800 Subject: [PATCH] Cleans up profiling code and tests --- src/documents/profiling.py | 71 ---------- .../tests/test_importer_profile_phase4.py | 128 ------------------ 2 files changed, 199 deletions(-) delete mode 100644 src/documents/profiling.py delete mode 100644 src/documents/tests/test_importer_profile_phase4.py diff --git a/src/documents/profiling.py b/src/documents/profiling.py deleted file mode 100644 index 0c938e6dc..000000000 --- a/src/documents/profiling.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Temporary profiling utilities for comparing implementations. - -Usage in a management command or shell:: - - from documents.profiling import profile_block - - with profile_block("new check_sanity"): - messages = check_sanity() - - with profile_block("old check_sanity"): - messages = check_sanity_old() - -Drop this file when done. -""" - -from __future__ import annotations - -import tracemalloc -from contextlib import contextmanager -from time import perf_counter -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Generator - -from django.db import connection -from django.db import reset_queries -from django.test.utils import override_settings - - -@contextmanager -def profile_block(label: str = "block") -> Generator[None, None, None]: - """Profile memory, wall time, and DB queries for a code block. - - Prints a summary to stdout on exit. Requires no external packages. - Enables DEBUG temporarily to capture Django's query log. - """ - tracemalloc.start() - snapshot_before = tracemalloc.take_snapshot() - - with override_settings(DEBUG=True): - reset_queries() - start = perf_counter() - - yield - - elapsed = perf_counter() - start - queries = list(connection.queries) - - snapshot_after = tracemalloc.take_snapshot() - _, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - # Compare snapshots for top allocations - stats = snapshot_after.compare_to(snapshot_before, "lineno") - - query_time = sum(float(q["time"]) for q in queries) - mem_diff = sum(s.size_diff for s in stats) - - print(f"\n{'=' * 60}") # noqa: T201 - print(f" Profile: {label}") # noqa: T201 - print(f"{'=' * 60}") # noqa: T201 - print(f" Wall time: {elapsed:.4f}s") # noqa: T201 - print(f" Queries: {len(queries)} ({query_time:.4f}s)") # noqa: T201 - print(f" Memory delta: {mem_diff / 1024:.1f} KiB") # noqa: T201 - print(f" Peak memory: {peak / 1024:.1f} KiB") # noqa: T201 - print("\n Top 5 allocations:") # noqa: T201 - for stat in stats[:5]: - print(f" {stat}") # noqa: T201 - print(f"{'=' * 60}\n") # noqa: T201 diff --git a/src/documents/tests/test_importer_profile_phase4.py b/src/documents/tests/test_importer_profile_phase4.py deleted file mode 100644 index 41a9fb379..000000000 --- a/src/documents/tests/test_importer_profile_phase4.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -Phase 4 profiling benchmark: ijson streaming parse vs json.load for manifest files. - -Run with: - uv run pytest src/documents/tests/test_importer_profile_phase4.py \ - -m profiling --override-ini="addopts=" -s -""" - -import json -import shutil -import tempfile -from pathlib import Path - -import pytest -from django.core.management import call_command -from django.test import TestCase - -from documents.management.commands.document_importer import iter_manifest_records -from documents.models import CustomField -from documents.models import CustomFieldInstance -from documents.profiling import profile_block -from documents.tests.factories import DocumentFactory -from documents.tests.utils import DirectoriesMixin -from documents.tests.utils import SampleDirMixin - - -@pytest.mark.profiling -class TestImporterProfilePhase4(DirectoriesMixin, SampleDirMixin, TestCase): - """ - Benchmarks streaming ijson parse vs json.load over exported manifest files. - - Creates 200 documents + 1 custom field + 200 custom field instances, - exports them, then compares the parse step in isolation. - - Does not assert on results — inspect printed profile_block output manually. - """ - - def setUp(self) -> None: - super().setUp() - self.export_dir = Path(tempfile.mkdtemp()) - self.addCleanup(shutil.rmtree, self.export_dir) - - def _create_test_data(self) -> None: - cf = CustomField.objects.create( - name="Phase4 Field", - data_type=CustomField.FieldDataType.STRING, - ) - docs = DocumentFactory.create_batch(200) - for doc in docs: - CustomFieldInstance.objects.create( - field=cf, - document=doc, - value_text=f"value for {doc.pk}", - ) - - def _get_manifest_paths(self) -> list[Path]: - paths = [self.export_dir / "manifest.json"] - paths += list(self.export_dir.glob("**/*-manifest.json")) - return [p for p in paths if p.exists()] - - def test_profile_streaming_vs_json_load(self) -> None: - self._create_test_data() - - call_command( - "document_exporter", - str(self.export_dir), - "--no-progress-bar", - "--data-only", - ) - - manifest_paths = self._get_manifest_paths() - self.assertTrue(manifest_paths, "No manifest files found after export") - - # Baseline: json.load then iterate (original approach — loads all into memory) - with profile_block("baseline: json.load + iterate"): - for path in manifest_paths: - with path.open() as f: - records = json.load(f) - for r in records: - _ = r["model"] # simulate check_manifest_validity - - # New: ijson streaming without accumulation (mirrors check_manifest_validity) - with profile_block("new: ijson streaming (no accumulation)"): - for path in manifest_paths: - for record in iter_manifest_records(path): - _ = record["model"] # process one at a time, no list buildup - - # New: ijson stream-decrypt to temp file (mirrors decrypt_secret_fields) - tmp_path = self.export_dir / "manifest.bench.json" - with profile_block("new: ijson stream to temp file"): - for path in manifest_paths: - with tmp_path.open("w", encoding="utf-8") as out: - out.write("[\n") - first = True - for record in iter_manifest_records(path): - if not first: - out.write(",\n") - json.dump(record, out, ensure_ascii=False) - first = False - out.write("\n]\n") - tmp_path.unlink(missing_ok=True) - - # Baseline: full record list (old _import_files_from_manifest approach) - with profile_block("baseline: full record list (doc records only)"): - _ = [ - record - for path in manifest_paths - for record in iter_manifest_records(path) - if record["model"] == "documents.document" - ] - - # New: slim dict list (current _import_files_from_manifest approach) - from documents.settings import EXPORTER_ARCHIVE_NAME - from documents.settings import EXPORTER_FILE_NAME - from documents.settings import EXPORTER_THUMBNAIL_NAME - - with profile_block("new: slim dict list (4 keys only)"): - _ = [ - { - "pk": record["pk"], - EXPORTER_FILE_NAME: record[EXPORTER_FILE_NAME], - EXPORTER_THUMBNAIL_NAME: record.get(EXPORTER_THUMBNAIL_NAME), - EXPORTER_ARCHIVE_NAME: record.get(EXPORTER_ARCHIVE_NAME), - } - for path in manifest_paths - for record in iter_manifest_records(path) - if record["model"] == "documents.document" - ]