From 1f066ea54856bee448101eb906cd1a3c9452fc8b Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Sat, 7 Mar 2026 13:50:11 -0800
Subject: [PATCH] Cleans up profiling code and tests

---
 src/documents/profiling.py                    |  71 ----------
 .../tests/test_importer_profile_phase4.py     | 128 ------------------
 2 files changed, 199 deletions(-)
 delete mode 100644 src/documents/profiling.py
 delete mode 100644 src/documents/tests/test_importer_profile_phase4.py

diff --git a/src/documents/profiling.py b/src/documents/profiling.py
deleted file mode 100644
index 0c938e6dc..000000000
--- a/src/documents/profiling.py
+++ /dev/null
@@ -1,71 +0,0 @@
-"""
-Temporary profiling utilities for comparing implementations.
-
-Usage in a management command or shell::
-
-    from documents.profiling import profile_block
-
-    with profile_block("new check_sanity"):
-        messages = check_sanity()
-
-    with profile_block("old check_sanity"):
-        messages = check_sanity_old()
-
-Drop this file when done.
-"""
-
-from __future__ import annotations
-
-import tracemalloc
-from contextlib import contextmanager
-from time import perf_counter
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from collections.abc import Generator
-
-from django.db import connection
-from django.db import reset_queries
-from django.test.utils import override_settings
-
-
-@contextmanager
-def profile_block(label: str = "block") -> Generator[None, None, None]:
-    """Profile memory, wall time, and DB queries for a code block.
-
-    Prints a summary to stdout on exit. Requires no external packages.
-    Enables DEBUG temporarily to capture Django's query log.
-    """
-    tracemalloc.start()
-    snapshot_before = tracemalloc.take_snapshot()
-
-    with override_settings(DEBUG=True):
-        reset_queries()
-        start = perf_counter()
-
-        yield
-
-        elapsed = perf_counter() - start
-        queries = list(connection.queries)
-
-    snapshot_after = tracemalloc.take_snapshot()
-    _, peak = tracemalloc.get_traced_memory()
-    tracemalloc.stop()
-
-    # Compare snapshots for top allocations
-    stats = snapshot_after.compare_to(snapshot_before, "lineno")
-
-    query_time = sum(float(q["time"]) for q in queries)
-    mem_diff = sum(s.size_diff for s in stats)
-
-    print(f"\n{'=' * 60}")  # noqa: T201
-    print(f"  Profile: {label}")  # noqa: T201
-    print(f"{'=' * 60}")  # noqa: T201
-    print(f"  Wall time:    {elapsed:.4f}s")  # noqa: T201
-    print(f"  Queries:      {len(queries)} ({query_time:.4f}s)")  # noqa: T201
-    print(f"  Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
-    print(f"  Peak memory:  {peak / 1024:.1f} KiB")  # noqa: T201
-    print("\n  Top 5 allocations:")  # noqa: T201
-    for stat in stats[:5]:
-        print(f"    {stat}")  # noqa: T201
-    print(f"{'=' * 60}\n")  # noqa: T201
diff --git a/src/documents/tests/test_importer_profile_phase4.py b/src/documents/tests/test_importer_profile_phase4.py
deleted file mode 100644
index 41a9fb379..000000000
--- a/src/documents/tests/test_importer_profile_phase4.py
+++ /dev/null
@@ -1,128 +0,0 @@
-"""
-Phase 4 profiling benchmark: ijson streaming parse vs json.load for manifest files.
-
-Run with:
-    uv run pytest src/documents/tests/test_importer_profile_phase4.py \
-        -m profiling --override-ini="addopts=" -s
-"""
-
-import json
-import shutil
-import tempfile
-from pathlib import Path
-
-import pytest
-from django.core.management import call_command
-from django.test import TestCase
-
-from documents.management.commands.document_importer import iter_manifest_records
-from documents.models import CustomField
-from documents.models import CustomFieldInstance
-from documents.profiling import profile_block
-from documents.tests.factories import DocumentFactory
-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import SampleDirMixin
-
-
-@pytest.mark.profiling
-class TestImporterProfilePhase4(DirectoriesMixin, SampleDirMixin, TestCase):
-    """
-    Benchmarks streaming ijson parse vs json.load over exported manifest files.
-
-    Creates 200 documents + 1 custom field + 200 custom field instances,
-    exports them, then compares the parse step in isolation.
-
-    Does not assert on results — inspect printed profile_block output manually.
-    """
-
-    def setUp(self) -> None:
-        super().setUp()
-        self.export_dir = Path(tempfile.mkdtemp())
-        self.addCleanup(shutil.rmtree, self.export_dir)
-
-    def _create_test_data(self) -> None:
-        cf = CustomField.objects.create(
-            name="Phase4 Field",
-            data_type=CustomField.FieldDataType.STRING,
-        )
-        docs = DocumentFactory.create_batch(200)
-        for doc in docs:
-            CustomFieldInstance.objects.create(
-                field=cf,
-                document=doc,
-                value_text=f"value for {doc.pk}",
-            )
-
-    def _get_manifest_paths(self) -> list[Path]:
-        paths = [self.export_dir / "manifest.json"]
-        paths += list(self.export_dir.glob("**/*-manifest.json"))
-        return [p for p in paths if p.exists()]
-
-    def test_profile_streaming_vs_json_load(self) -> None:
-        self._create_test_data()
-
-        call_command(
-            "document_exporter",
-            str(self.export_dir),
-            "--no-progress-bar",
-            "--data-only",
-        )
-
-        manifest_paths = self._get_manifest_paths()
-        self.assertTrue(manifest_paths, "No manifest files found after export")
-
-        # Baseline: json.load then iterate (original approach — loads all into memory)
-        with profile_block("baseline: json.load + iterate"):
-            for path in manifest_paths:
-                with path.open() as f:
-                    records = json.load(f)
-                for r in records:
-                    _ = r["model"]  # simulate check_manifest_validity
-
-        # New: ijson streaming without accumulation (mirrors check_manifest_validity)
-        with profile_block("new: ijson streaming (no accumulation)"):
-            for path in manifest_paths:
-                for record in iter_manifest_records(path):
-                    _ = record["model"]  # process one at a time, no list buildup
-
-        # New: ijson stream-decrypt to temp file (mirrors decrypt_secret_fields)
-        tmp_path = self.export_dir / "manifest.bench.json"
-        with profile_block("new: ijson stream to temp file"):
-            for path in manifest_paths:
-                with tmp_path.open("w", encoding="utf-8") as out:
-                    out.write("[\n")
-                    first = True
-                    for record in iter_manifest_records(path):
-                        if not first:
-                            out.write(",\n")
-                        json.dump(record, out, ensure_ascii=False)
-                        first = False
-                    out.write("\n]\n")
-        tmp_path.unlink(missing_ok=True)
-
-        # Baseline: full record list (old _import_files_from_manifest approach)
-        with profile_block("baseline: full record list (doc records only)"):
-            _ = [
-                record
-                for path in manifest_paths
-                for record in iter_manifest_records(path)
-                if record["model"] == "documents.document"
-            ]
-
-        # New: slim dict list (current _import_files_from_manifest approach)
-        from documents.settings import EXPORTER_ARCHIVE_NAME
-        from documents.settings import EXPORTER_FILE_NAME
-        from documents.settings import EXPORTER_THUMBNAIL_NAME
-
-        with profile_block("new: slim dict list (4 keys only)"):
-            _ = [
-                {
-                    "pk": record["pk"],
-                    EXPORTER_FILE_NAME: record[EXPORTER_FILE_NAME],
-                    EXPORTER_THUMBNAIL_NAME: record.get(EXPORTER_THUMBNAIL_NAME),
-                    EXPORTER_ARCHIVE_NAME: record.get(EXPORTER_ARCHIVE_NAME),
-                }
-                for path in manifest_paths
-                for record in iter_manifest_records(path)
-                if record["model"] == "documents.document"
-            ]