Unify prompts, cover

2025-08-11 09:14:02 -04:00 · 2025-04-28 13:46:22 -07:00 · 2025-04-28 13:46:22 -07:00 · 62fd722019
commit 62fd722019
parent 014eafe3d1
2 changed files with 17 additions and 37 deletions
--- a/src/paperless/ai/ai_classifier.py
+++ b/src/paperless/ai/ai_classifier.py
@@ -21,6 +21,7 @@ def build_prompt_without_rag(document: Document) -> str:
    Never ask for further information, additional content or ask questions. Never include any other text.
    Suggested tags and document types must be strictly based on the content of the document.
    Do not change the field names or the JSON structure, only provide the values. Use double quotes and proper JSON syntax.
    Each field must be a list of plain strings.
    The JSON object must contain the following fields:
    - title: A short, descriptive title
@@ -30,8 +31,6 @@ def build_prompt_without_rag(document: Document) -> str:
    - storage_paths: Suggested folder paths (e.g. "Medical/Insurance")
    - dates: List up to 3 relevant dates in YYYY-MM-DD format
    Respond ONLY in JSON.
    Each field must be a list of plain strings.
    The format of the JSON object is as follows:
    {{
        "title": "xxxxx",
@@ -43,7 +42,6 @@ def build_prompt_without_rag(document: Document) -> str:
    }}
    ---
    FILENAME:
    {filename}
@@ -56,41 +54,9 @@ def build_prompt_without_rag(document: Document) -> str:
 def build_prompt_with_rag(document: Document) -> str:
    context = get_context_for_document(document)
-    content = document.content or ""
+    prompt = build_prompt_without_rag(document)
    filename = document.filename or ""
-    prompt = f"""
+    prompt += f"""
    You are a helpful assistant that extracts structured information from documents.
    You have access to similar documents as context to help improve suggestions.
    Only output valid JSON in the format below. No additional explanations.
    The JSON object must contain:
    - title: A short, human-readable, descriptive title based on the content
    - tags: A list of relevant topics
    - correspondents: People or organizations involved
    - document_types: Type or category of the document
    - storage_paths: Suggested folder paths
    - dates: Up to 3 relevant dates in YYYY-MM-DD
    Respond ONLY in JSON.
    Each field must be a list of plain strings.
    The format of the JSON object is as follows:
    {{
        "title": "xxxxx",
        "tags": ["xxxx", "xxxx"],
        "correspondents": ["xxxx", "xxxx"],
        "document_types": ["xxxx", "xxxx"],
        "storage_paths": ["xxxx", "xxxx"],
        "dates": ["YYYY-MM-DD", "YYYY-MM-DD", "YYYY-MM-DD"],
    }}
    Here is the document:
    FILENAME:
    {filename}
    CONTENT:
    {content[:4000]}
    CONTEXT FROM SIMILAR DOCUMENTS:
    {context[:4000]}
--- a/src/paperless/tests/test_ai_classifier.py
+++ b/src/paperless/tests/test_ai_classifier.py
@@ -6,6 +6,8 @@ import pytest
 from django.test import override_settings
 from documents.models import Document
 from paperless.ai.ai_classifier import build_prompt_with_rag
 from paperless.ai.ai_classifier import build_prompt_without_rag
 from paperless.ai.ai_classifier import get_ai_document_classification
 from paperless.ai.ai_classifier import parse_ai_response
@@ -101,3 +103,15 @@ def test_use_without_rag_if_not_configured(
    mock_run_llm_query.return_value.text = json.dumps({})
    get_ai_document_classification(mock_document)
    mock_build_prompt_without_rag.assert_called_once()
# Verifies that the "CONTEXT FROM SIMILAR DOCUMENTS:" heading appears only in
# the prompt built with RAG and is absent from the prompt built without it.
# The LLM_BACKEND / LLM_MODEL settings are overridden for the duration of the test.
# NOTE(review): mock_document is presumably a pytest fixture defined elsewhere
# in this test module — confirm.
@override_settings(
    LLM_BACKEND="ollama",
    LLM_MODEL="some_model",
 )
 def test_prompt_with_without_rag(mock_document):
    # Plain prompt: the similar-documents heading must not be present.
    prompt = build_prompt_without_rag(mock_document)
    assert "CONTEXT FROM SIMILAR DOCUMENTS:" not in prompt
    # RAG prompt: the similar-documents heading must be present.
    prompt = build_prompt_with_rag(mock_document)
    assert "CONTEXT FROM SIMILAR DOCUMENTS:" in prompt