Cleans up profiling code and tests

This commit is contained in:
Trenton H 2026-03-07 13:50:11 -08:00
parent db41c45128
commit 1f066ea548
2 changed files with 0 additions and 199 deletions

View File

@@ -1,71 +0,0 @@
"""
Temporary profiling utilities for comparing implementations.
Usage in a management command or shell::
from documents.profiling import profile_block
with profile_block("new check_sanity"):
messages = check_sanity()
with profile_block("old check_sanity"):
messages = check_sanity_old()
Drop this file when done.
"""
from __future__ import annotations
import tracemalloc
from contextlib import contextmanager
from time import perf_counter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Generator
from django.db import connection
from django.db import reset_queries
from django.test.utils import override_settings
@contextmanager
def profile_block(label: str = "block") -> Generator[None, None, None]:
    """Profile memory, wall time, and DB queries for a code block.

    Prints a summary to stdout on exit. Requires no external packages.
    Enables DEBUG temporarily to capture Django's query log.

    Args:
        label: Human-readable name shown in the printed report header.

    Yields:
        None. Run the code to be profiled inside the ``with`` body.
    """
    tracemalloc.start()
    snapshot_before = tracemalloc.take_snapshot()
    try:
        # DEBUG=True makes Django record every executed query in
        # ``connection.queries``; reset first so we only count ours.
        with override_settings(DEBUG=True):
            reset_queries()
            start = perf_counter()
            yield
            elapsed = perf_counter() - start
            queries = list(connection.queries)
        # Capture allocation state before stopping the tracer.
        snapshot_after = tracemalloc.take_snapshot()
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing: without this, an exception raised inside the
        # profiled block would leave tracemalloc running (and slowing the
        # whole process) for its remaining lifetime.
        tracemalloc.stop()
    # Compare snapshots for top allocations
    stats = snapshot_after.compare_to(snapshot_before, "lineno")
    query_time = sum(float(q["time"]) for q in queries)
    mem_diff = sum(s.size_diff for s in stats)
    print(f"\n{'=' * 60}")  # noqa: T201
    print(f" Profile: {label}")  # noqa: T201
    print(f"{'=' * 60}")  # noqa: T201
    print(f" Wall time: {elapsed:.4f}s")  # noqa: T201
    print(f" Queries: {len(queries)} ({query_time:.4f}s)")  # noqa: T201
    print(f" Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
    print(f" Peak memory: {peak / 1024:.1f} KiB")  # noqa: T201
    print("\n Top 5 allocations:")  # noqa: T201
    for stat in stats[:5]:
        print(f" {stat}")  # noqa: T201
    print(f"{'=' * 60}\n")  # noqa: T201

View File

@@ -1,128 +0,0 @@
"""
Phase 4 profiling benchmark: ijson streaming parse vs json.load for manifest files.
Run with:
uv run pytest src/documents/tests/test_importer_profile_phase4.py \
-m profiling --override-ini="addopts=" -s
"""
import json
import shutil
import tempfile
from pathlib import Path
import pytest
from django.core.management import call_command
from django.test import TestCase
from documents.management.commands.document_importer import iter_manifest_records
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.profiling import profile_block
from documents.tests.factories import DocumentFactory
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import SampleDirMixin
@pytest.mark.profiling
class TestImporterProfilePhase4(DirectoriesMixin, SampleDirMixin, TestCase):
    """Benchmark ijson streaming parse against json.load on exported manifests.

    Builds 200 documents plus one custom field with 200 instances, exports
    them, then compares the parse step in isolation. Makes no assertions on
    the results — inspect the printed profile_block output manually.
    """

    def setUp(self) -> None:
        super().setUp()
        # Fresh export target, removed automatically when the test finishes.
        self.export_dir = Path(tempfile.mkdtemp())
        self.addCleanup(shutil.rmtree, self.export_dir)

    def _create_test_data(self) -> None:
        # One string custom field, attached to every generated document.
        custom_field = CustomField.objects.create(
            name="Phase4 Field",
            data_type=CustomField.FieldDataType.STRING,
        )
        for document in DocumentFactory.create_batch(200):
            CustomFieldInstance.objects.create(
                field=custom_field,
                document=document,
                value_text=f"value for {document.pk}",
            )

    def _get_manifest_paths(self) -> list[Path]:
        # Top-level manifest plus any nested "*-manifest.json" split files.
        candidates = [self.export_dir / "manifest.json"]
        candidates.extend(self.export_dir.glob("**/*-manifest.json"))
        return [candidate for candidate in candidates if candidate.exists()]

    def test_profile_streaming_vs_json_load(self) -> None:
        self._create_test_data()
        call_command(
            "document_exporter",
            str(self.export_dir),
            "--no-progress-bar",
            "--data-only",
        )
        manifest_paths = self._get_manifest_paths()
        self.assertTrue(manifest_paths, "No manifest files found after export")

        # Baseline: json.load then iterate (original approach — loads all into memory)
        with profile_block("baseline: json.load + iterate"):
            for manifest in manifest_paths:
                with manifest.open() as handle:
                    loaded = json.load(handle)
                for entry in loaded:
                    _ = entry["model"]  # simulate check_manifest_validity

        # New: ijson streaming without accumulation (mirrors check_manifest_validity)
        with profile_block("new: ijson streaming (no accumulation)"):
            for manifest in manifest_paths:
                for entry in iter_manifest_records(manifest):
                    _ = entry["model"]  # process one at a time, no list buildup

        # New: ijson stream-decrypt to temp file (mirrors decrypt_secret_fields)
        bench_path = self.export_dir / "manifest.bench.json"
        with profile_block("new: ijson stream to temp file"):
            for manifest in manifest_paths:
                with bench_path.open("w", encoding="utf-8") as sink:
                    sink.write("[\n")
                    # Comma before every record except the first keeps the
                    # emitted file valid JSON.
                    for index, entry in enumerate(iter_manifest_records(manifest)):
                        if index:
                            sink.write(",\n")
                        json.dump(entry, sink, ensure_ascii=False)
                    sink.write("\n]\n")
        bench_path.unlink(missing_ok=True)

        # Baseline: full record list (old _import_files_from_manifest approach)
        with profile_block("baseline: full record list (doc records only)"):
            _ = [
                entry
                for manifest in manifest_paths
                for entry in iter_manifest_records(manifest)
                if entry["model"] == "documents.document"
            ]

        # New: slim dict list (current _import_files_from_manifest approach)
        from documents.settings import EXPORTER_ARCHIVE_NAME
        from documents.settings import EXPORTER_FILE_NAME
        from documents.settings import EXPORTER_THUMBNAIL_NAME

        with profile_block("new: slim dict list (4 keys only)"):
            _ = [
                {
                    "pk": entry["pk"],
                    EXPORTER_FILE_NAME: entry[EXPORTER_FILE_NAME],
                    EXPORTER_THUMBNAIL_NAME: entry.get(EXPORTER_THUMBNAIL_NAME),
                    EXPORTER_ARCHIVE_NAME: entry.get(EXPORTER_ARCHIVE_NAME),
                }
                for manifest in manifest_paths
                for entry in iter_manifest_records(manifest)
                if entry["model"] == "documents.document"
            ]