""" Search performance profiling tests. Run explicitly — excluded from the normal test suite: uv run pytest -m profiling -s -p no:xdist --override-ini="addopts=" -v The ``-s`` flag is required to see profile_block() output. The ``-p no:xdist`` flag disables parallel execution for accurate measurements. Corpus: 5 000 documents generated deterministically from a fixed Faker seed, with realistic variety: 30 correspondents, 15 document types, 50 tags, ~500 notes spread across ~10 % of documents. """ from __future__ import annotations import random import pytest from django.contrib.auth.models import User from faker import Faker from profiling import profile_block from rest_framework.test import APIClient from documents.models import Correspondent from documents.models import Document from documents.models import DocumentType from documents.models import Note from documents.models import Tag from documents.search import get_backend from documents.search import reset_backend from documents.search._backend import SearchMode pytestmark = [pytest.mark.profiling, pytest.mark.search, pytest.mark.django_db] # --------------------------------------------------------------------------- # Corpus parameters # --------------------------------------------------------------------------- DOC_COUNT = 5_000 SEED = 42 NUM_CORRESPONDENTS = 30 NUM_DOC_TYPES = 15 NUM_TAGS = 50 NOTE_FRACTION = 0.10 # ~500 documents get a note PAGE_SIZE = 25 def _build_corpus(rng: random.Random, fake: Faker) -> None: """ Insert the full corpus into the database and index it. Uses bulk_create for the Document rows (fast) then handles the M2M tag relationships and notes individually. Indexes the full corpus with a single backend.rebuild() call. """ import datetime # ---- lookup objects ------------------------------------------------- correspondents = [ Correspondent.objects.create(name=f"profcorp-{i}-{fake.company()}"[:128]) for i in range(NUM_CORRESPONDENTS) ] doc_types = [ DocumentType.objects.create(name=f"proftype-{i}-{fake.word()}"[:128]) for i in range(NUM_DOC_TYPES) ] tags = [ Tag.objects.create(name=f"proftag-{i}-{fake.word()}"[:100]) for i in range(NUM_TAGS) ] note_user = User.objects.create_user(username="profnoteuser", password="x") # ---- bulk-create documents ------------------------------------------ base_date = datetime.date(2018, 1, 1) raw_docs = [] for i in range(DOC_COUNT): day_offset = rng.randint(0, 6 * 365) created = base_date + datetime.timedelta(days=day_offset) raw_docs.append( Document( title=fake.sentence(nb_words=rng.randint(3, 9)).rstrip("."), content="\n\n".join( fake.paragraph(nb_sentences=rng.randint(3, 7)) for _ in range(rng.randint(2, 5)) ), checksum=f"PROF{i:07d}", correspondent=rng.choice(correspondents + [None] * 8), document_type=rng.choice(doc_types + [None] * 4), created=created, ), ) documents = Document.objects.bulk_create(raw_docs) # ---- tags (M2M, post-bulk) ------------------------------------------ for doc in documents: k = rng.randint(0, 5) if k: doc.tags.add(*rng.sample(tags, k)) # ---- notes on ~10 % of docs ----------------------------------------- note_docs = rng.sample(documents, int(DOC_COUNT * NOTE_FRACTION)) for doc in note_docs: Note.objects.create( document=doc, note=fake.sentence(nb_words=rng.randint(6, 20)), user=note_user, ) # ---- build Tantivy index -------------------------------------------- backend = get_backend() qs = Document.objects.select_related( "correspondent", "document_type", "storage_path", "owner", ).prefetch_related("tags", "notes__user", "custom_fields__field") backend.rebuild(qs) class TestSearchProfiling: """ Performance profiling for the Tantivy search backend and DRF API layer. Each test builds a fresh 5 000-document corpus, exercises one hot path, and prints profile_block() measurements to stdout. No correctness assertions — the goal is to surface hot spots and track regressions. """ @pytest.fixture(autouse=True) def _setup(self, tmp_path, settings): index_dir = tmp_path / "index" index_dir.mkdir() settings.INDEX_DIR = index_dir reset_backend() rng = random.Random(SEED) fake = Faker() Faker.seed(SEED) self.user = User.objects.create_superuser( username="profiler", password="admin", ) self.client = APIClient() self.client.force_authenticate(user=self.user) _build_corpus(rng, fake) yield reset_backend() # -- 1. Backend: search_ids relevance --------------------------------- def test_profile_search_ids_relevance(self): """Profile: search_ids() with relevance ordering across several queries.""" backend = get_backend() queries = [ "invoice payment", "annual report", "bank statement", "contract agreement", "receipt", ] with profile_block(f"search_ids — relevance ({len(queries)} queries)"): for q in queries: backend.search_ids(q, user=None) # -- 2. Backend: search_ids with Tantivy-native sort ------------------ def test_profile_search_ids_sorted(self): """Profile: search_ids() sorted by a Tantivy fast field (created).""" backend = get_backend() with profile_block("search_ids — sorted by created (asc + desc)"): backend.search_ids( "the", user=None, sort_field="created", sort_reverse=False, ) backend.search_ids( "the", user=None, sort_field="created", sort_reverse=True, ) # -- 3. Backend: highlight_hits for a page of 25 ---------------------- def test_profile_highlight_hits(self): """Profile: highlight_hits() for a 25-document page.""" backend = get_backend() all_ids = backend.search_ids("report", user=None) page_ids = all_ids[:PAGE_SIZE] with profile_block(f"highlight_hits — {len(page_ids)} docs"): backend.highlight_hits("report", page_ids) # -- 4. Backend: autocomplete ----------------------------------------- def test_profile_autocomplete(self): """Profile: autocomplete() with eight common prefixes.""" backend = get_backend() prefixes = ["inv", "pay", "con", "rep", "sta", "acc", "doc", "fin"] with profile_block(f"autocomplete — {len(prefixes)} prefixes"): for prefix in prefixes: backend.autocomplete(prefix, limit=10) # -- 5. Backend: simple-mode search (TEXT and TITLE) ------------------ def test_profile_search_ids_simple_modes(self): """Profile: search_ids() in TEXT and TITLE simple-search modes.""" backend = get_backend() queries = ["invoice 2023", "annual report", "bank statement"] with profile_block( f"search_ids — TEXT + TITLE modes ({len(queries)} queries each)", ): for q in queries: backend.search_ids(q, user=None, search_mode=SearchMode.TEXT) backend.search_ids(q, user=None, search_mode=SearchMode.TITLE) # -- 6. API: full round-trip, relevance + page 1 ---------------------- def test_profile_api_relevance_search(self): """Profile: full API search round-trip, relevance order, page 1.""" with profile_block( f"API /documents/?query=… relevance (page 1, page_size={PAGE_SIZE})", ): response = self.client.get( f"/api/documents/?query=invoice+payment&page=1&page_size={PAGE_SIZE}", ) assert response.status_code == 200 # -- 7. API: full round-trip, ORM-ordered (title) --------------------- def test_profile_api_orm_sorted_search(self): """Profile: full API search round-trip with ORM-delegated sort (title).""" with profile_block("API /documents/?query=…&ordering=title"): response = self.client.get( f"/api/documents/?query=report&ordering=title&page=1&page_size={PAGE_SIZE}", ) assert response.status_code == 200 # -- 8. API: full round-trip, score sort ------------------------------ def test_profile_api_score_sort(self): """Profile: full API search with ordering=-score (relevance, preserve order).""" with profile_block("API /documents/?query=…&ordering=-score"): response = self.client.get( f"/api/documents/?query=statement&ordering=-score&page=1&page_size={PAGE_SIZE}", ) assert response.status_code == 200 # -- 9. API: full round-trip, with selection_data --------------------- def test_profile_api_with_selection_data(self): """Profile: full API search including include_selection_data=true.""" with profile_block("API /documents/?query=…&include_selection_data=true"): response = self.client.get( f"/api/documents/?query=contract&page=1&page_size={PAGE_SIZE}" "&include_selection_data=true", ) assert response.status_code == 200 assert "selection_data" in response.data # -- 10. API: paginated (page 2) -------------------------------------- def test_profile_api_page_2(self): """Profile: full API search, page 2 — exercises page offset arithmetic.""" with profile_block(f"API /documents/?query=…&page=2&page_size={PAGE_SIZE}"): response = self.client.get( f"/api/documents/?query=the&page=2&page_size={PAGE_SIZE}", ) assert response.status_code == 200