mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-09 19:33:45 -04:00
Cleans up profiling code and tests
This commit is contained in:
parent
db41c45128
commit
1f066ea548
@ -1,71 +0,0 @@
|
||||
"""
|
||||
Temporary profiling utilities for comparing implementations.
|
||||
|
||||
Usage in a management command or shell::
|
||||
|
||||
from documents.profiling import profile_block
|
||||
|
||||
with profile_block("new check_sanity"):
|
||||
messages = check_sanity()
|
||||
|
||||
with profile_block("old check_sanity"):
|
||||
messages = check_sanity_old()
|
||||
|
||||
Drop this file when done.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tracemalloc
|
||||
from contextlib import contextmanager
|
||||
from time import perf_counter
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
|
||||
from django.db import connection
|
||||
from django.db import reset_queries
|
||||
from django.test.utils import override_settings
|
||||
|
||||
|
||||
@contextmanager
def profile_block(label: str = "block") -> Generator[None, None, None]:
    """Profile memory, wall time, and DB queries for a code block.

    Prints a summary to stdout on exit. Requires no external packages.
    Enables DEBUG temporarily to capture Django's query log.

    Args:
        label: Name shown in the printed summary header.

    Yields:
        None. Put the code to profile inside the ``with`` body.
    """
    tracemalloc.start()
    snapshot_before = tracemalloc.take_snapshot()

    try:
        # DEBUG=True makes Django append every executed query to
        # connection.queries; reset first so only this block's queries count.
        with override_settings(DEBUG=True):
            reset_queries()
            start = perf_counter()

            yield

            elapsed = perf_counter() - start
            queries = list(connection.queries)
    except BaseException:
        # The profiled block raised: stop tracing so tracemalloc is not left
        # running (its bookkeeping overhead would leak into all later code),
        # then propagate without printing a meaningless partial summary.
        tracemalloc.stop()
        raise

    snapshot_after = tracemalloc.take_snapshot()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # Compare snapshots for top allocations
    stats = snapshot_after.compare_to(snapshot_before, "lineno")

    query_time = sum(float(q["time"]) for q in queries)
    mem_diff = sum(s.size_diff for s in stats)

    print(f"\n{'=' * 60}")  # noqa: T201
    print(f" Profile: {label}")  # noqa: T201
    print(f"{'=' * 60}")  # noqa: T201
    print(f" Wall time: {elapsed:.4f}s")  # noqa: T201
    print(f" Queries: {len(queries)} ({query_time:.4f}s)")  # noqa: T201
    print(f" Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
    print(f" Peak memory: {peak / 1024:.1f} KiB")  # noqa: T201
    print("\n Top 5 allocations:")  # noqa: T201
    for stat in stats[:5]:
        print(f" {stat}")  # noqa: T201
    print(f"{'=' * 60}\n")  # noqa: T201
|
||||
@ -1,128 +0,0 @@
|
||||
"""
|
||||
Phase 4 profiling benchmark: ijson streaming parse vs json.load for manifest files.
|
||||
|
||||
Run with:
|
||||
uv run pytest src/documents/tests/test_importer_profile_phase4.py \
|
||||
-m profiling --override-ini="addopts=" -s
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.core.management import call_command
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.management.commands.document_importer import iter_manifest_records
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.profiling import profile_block
|
||||
from documents.tests.factories import DocumentFactory
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import SampleDirMixin
|
||||
|
||||
|
||||
@pytest.mark.profiling
class TestImporterProfilePhase4(DirectoriesMixin, SampleDirMixin, TestCase):
    """
    Benchmarks streaming ijson parse vs json.load over exported manifest files.

    Creates 200 documents + 1 custom field + 200 custom field instances,
    exports them, then compares the parse step in isolation.

    Does not assert on results — inspect printed profile_block output manually.
    """

    def setUp(self) -> None:
        super().setUp()
        # Fresh export target for each test run, cleaned up automatically.
        self.export_dir = Path(tempfile.mkdtemp())
        self.addCleanup(shutil.rmtree, self.export_dir)

    def _create_test_data(self) -> None:
        # One string custom field, instantiated on every generated document.
        custom_field = CustomField.objects.create(
            name="Phase4 Field",
            data_type=CustomField.FieldDataType.STRING,
        )
        for document in DocumentFactory.create_batch(200):
            CustomFieldInstance.objects.create(
                field=custom_field,
                document=document,
                value_text=f"value for {document.pk}",
            )

    def _get_manifest_paths(self) -> list[Path]:
        # Root manifest plus any per-type split manifests; keep only those
        # the exporter actually wrote.
        candidates = [
            self.export_dir / "manifest.json",
            *self.export_dir.glob("**/*-manifest.json"),
        ]
        return [candidate for candidate in candidates if candidate.exists()]

    def test_profile_streaming_vs_json_load(self) -> None:
        self._create_test_data()

        call_command(
            "document_exporter",
            str(self.export_dir),
            "--no-progress-bar",
            "--data-only",
        )

        manifest_paths = self._get_manifest_paths()
        self.assertTrue(manifest_paths, "No manifest files found after export")

        # Baseline: json.load then iterate (original approach — loads all into memory)
        with profile_block("baseline: json.load + iterate"):
            for manifest in manifest_paths:
                with manifest.open() as handle:
                    loaded = json.load(handle)
                for entry in loaded:
                    _ = entry["model"]  # simulate check_manifest_validity

        # New: ijson streaming without accumulation (mirrors check_manifest_validity)
        with profile_block("new: ijson streaming (no accumulation)"):
            for manifest in manifest_paths:
                for streamed in iter_manifest_records(manifest):
                    _ = streamed["model"]  # process one at a time, no list buildup

        # New: ijson stream-decrypt to temp file (mirrors decrypt_secret_fields)
        bench_path = self.export_dir / "manifest.bench.json"
        with profile_block("new: ijson stream to temp file"):
            for manifest in manifest_paths:
                with bench_path.open("w", encoding="utf-8") as sink:
                    sink.write("[\n")
                    # enumerate replaces a "first record" flag: every record
                    # after index 0 is preceded by a separator.
                    for index, streamed in enumerate(iter_manifest_records(manifest)):
                        if index:
                            sink.write(",\n")
                        json.dump(streamed, sink, ensure_ascii=False)
                    sink.write("\n]\n")
        bench_path.unlink(missing_ok=True)

        # Baseline: full record list (old _import_files_from_manifest approach)
        with profile_block("baseline: full record list (doc records only)"):
            _ = [
                streamed
                for manifest in manifest_paths
                for streamed in iter_manifest_records(manifest)
                if streamed["model"] == "documents.document"
            ]

        # New: slim dict list (current _import_files_from_manifest approach)
        from documents.settings import EXPORTER_ARCHIVE_NAME
        from documents.settings import EXPORTER_FILE_NAME
        from documents.settings import EXPORTER_THUMBNAIL_NAME

        with profile_block("new: slim dict list (4 keys only)"):
            _ = [
                {
                    "pk": streamed["pk"],
                    EXPORTER_FILE_NAME: streamed[EXPORTER_FILE_NAME],
                    EXPORTER_THUMBNAIL_NAME: streamed.get(EXPORTER_THUMBNAIL_NAME),
                    EXPORTER_ARCHIVE_NAME: streamed.get(EXPORTER_ARCHIVE_NAME),
                }
                for manifest in manifest_paths
                for streamed in iter_manifest_records(manifest)
                if streamed["model"] == "documents.document"
            ]
|
||||
Loading…
x
Reference in New Issue
Block a user