2025-05-19 14:31:40 -07:00

376 lines
13 KiB
Python

import shutil
from datetime import timedelta
from pathlib import Path
from unittest import mock
from celery import states
from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone
from documents import tasks
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
class TestIndexReindex(DirectoriesMixin, TestCase):
def test_index_reindex(self):
Document.objects.create(
title="test",
content="my document",
checksum="wow",
added=timezone.now(),
created=timezone.now(),
modified=timezone.now(),
)
tasks.index_reindex()
def test_index_optimize(self):
Document.objects.create(
title="test",
content="my document",
checksum="wow",
added=timezone.now(),
created=timezone.now(),
modified=timezone.now(),
)
tasks.index_optimize()
class TestClassifier(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_no_auto_matching(self, load_classifier):
tasks.train_classifier()
load_classifier.assert_not_called()
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_tag(self, load_classifier):
load_classifier.return_value = None
Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertIsNotFile(settings.MODEL_FILE)
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_type(self, load_classifier):
load_classifier.return_value = None
DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertIsNotFile(settings.MODEL_FILE)
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_correspondent(self, load_classifier):
load_classifier.return_value = None
Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertIsNotFile(settings.MODEL_FILE)
def test_train_classifier(self):
c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
doc = Document.objects.create(correspondent=c, content="test", title="test")
self.assertIsNotFile(settings.MODEL_FILE)
with mock.patch(
"documents.classifier.DocumentClassifier.preprocess_content",
) as pre_proc_mock:
pre_proc_mock.side_effect = dummy_preprocess
tasks.train_classifier()
self.assertIsFile(settings.MODEL_FILE)
mtime = Path(settings.MODEL_FILE).stat().st_mtime
tasks.train_classifier()
self.assertIsFile(settings.MODEL_FILE)
mtime2 = Path(settings.MODEL_FILE).stat().st_mtime
self.assertEqual(mtime, mtime2)
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertIsFile(settings.MODEL_FILE)
mtime3 = Path(settings.MODEL_FILE).stat().st_mtime
self.assertNotEqual(mtime2, mtime3)
class TestSanityCheck(DirectoriesMixin, TestCase):
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_success(self, m):
m.return_value = SanityCheckMessages()
self.assertEqual(tasks.sanity_check(), "No issues detected.")
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error(self, m):
messages = SanityCheckMessages()
messages.error(None, "Some error")
m.return_value = messages
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error_no_raise(self, m):
messages = SanityCheckMessages()
messages.error(None, "Some error")
m.return_value = messages
# No exception should be raised
result = tasks.sanity_check(raise_on_error=False)
self.assertEqual(
result,
"Sanity check exited with errors. See log.",
)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m):
messages = SanityCheckMessages()
messages.warning(None, "Some warning")
m.return_value = messages
self.assertEqual(
tasks.sanity_check(),
"Sanity check exited with warnings. See log.",
)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_info(self, m):
messages = SanityCheckMessages()
messages.info(None, "Some info")
m.return_value = messages
self.assertEqual(
tasks.sanity_check(),
"Sanity check exited with infos. See log.",
)
m.assert_called_once()
class TestBulkUpdate(DirectoriesMixin, TestCase):
def test_bulk_update_documents(self):
doc1 = Document.objects.create(
title="test",
content="my document",
checksum="wow",
added=timezone.now(),
created=timezone.now(),
modified=timezone.now(),
)
tasks.bulk_update_documents([doc1.pk])
class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
GIVEN:
- Existing document in trash
WHEN:
- Empty trash task is called without doc_ids
THEN:
- Document is only deleted if it has been in trash for more than delay (default 30 days)
"""
def test_empty_trash(self):
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
added=timezone.now(),
created=timezone.now(),
modified=timezone.now(),
)
doc.delete()
self.assertEqual(Document.global_objects.count(), 1)
self.assertEqual(Document.objects.count(), 0)
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 1)
doc.deleted_at = timezone.now() - timedelta(days=31)
doc.save()
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0)
class TestUpdateContent(DirectoriesMixin, TestCase):
def test_update_content_maybe_archive_file(self):
"""
GIVEN:
- Existing document with archive file
WHEN:
- Update content task is called
THEN:
- Document is reprocessed, content and checksum are updated
"""
sample1 = self.dirs.scratch_dir / "sample.pdf"
shutil.copy(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf",
sample1,
)
sample1_archive = self.dirs.archive_dir / "sample_archive.pdf"
shutil.copy(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf",
sample1_archive,
)
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
archive_checksum="wow",
filename=sample1,
mime_type="application/pdf",
archive_filename=sample1_archive,
)
tasks.update_document_content_maybe_archive_file(doc.pk)
self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")
self.assertNotEqual(Document.objects.get(pk=doc.pk).archive_checksum, "wow")
def test_update_content_maybe_archive_file_no_archive(self):
"""
GIVEN:
- Existing document without archive file
WHEN:
- Update content task is called
THEN:
- Document is reprocessed, content is updated
"""
sample1 = self.dirs.scratch_dir / "sample.pdf"
shutil.copy(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf",
sample1,
)
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
filename=sample1,
mime_type="application/pdf",
)
tasks.update_document_content_maybe_archive_file(doc.pk)
self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")
class TestAIIndex(DirectoriesMixin, TestCase):
@override_settings(
AI_ENABLED=True,
LLM_EMBEDDING_BACKEND="huggingface",
)
def test_ai_index_success(self):
"""
GIVEN:
- Document exists, AI is enabled, llm index backend is set
WHEN:
- llmindex_index task is called
THEN:
- update_llm_index is called, and the task is marked as success
"""
Document.objects.create(
title="test",
content="my document",
checksum="wow",
)
# lazy-loaded so mock the actual function
with mock.patch("paperless_ai.indexing.update_llm_index") as update_llm_index:
update_llm_index.return_value = "LLM index updated successfully."
tasks.llmindex_index()
update_llm_index.assert_called_once()
task = PaperlessTask.objects.get(
task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
)
self.assertEqual(task.status, states.SUCCESS)
self.assertEqual(task.result, "LLM index updated successfully.")
@override_settings(
AI_ENABLED=True,
LLM_EMBEDDING_BACKEND="huggingface",
)
def test_ai_index_failure(self):
"""
GIVEN:
- Document exists, AI is enabled, llm index backend is set
WHEN:
- llmindex_index task is called
THEN:
- update_llm_index raises an exception, and the task is marked as failure
"""
Document.objects.create(
title="test",
content="my document",
checksum="wow",
)
# lazy-loaded so mock the actual function
with mock.patch("paperless_ai.indexing.update_llm_index") as update_llm_index:
update_llm_index.side_effect = Exception("LLM index update failed.")
tasks.llmindex_index()
update_llm_index.assert_called_once()
task = PaperlessTask.objects.get(
task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
)
self.assertEqual(task.status, states.FAILURE)
self.assertIn("LLM index update failed.", task.result)
def test_update_document_in_llm_index(self):
"""
GIVEN:
- Nothing
WHEN:
- update_document_in_llm_index task is called
THEN:
- llm_index_add_or_update_document is called
"""
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
)
with mock.patch(
"documents.tasks.llm_index_add_or_update_document",
) as llm_index_add_or_update_document:
tasks.update_document_in_llm_index(doc)
llm_index_add_or_update_document.assert_called_once_with(doc)
def test_remove_document_from_llm_index(self):
"""
GIVEN:
- Nothing
WHEN:
- remove_document_from_llm_index task is called
THEN:
- llm_index_remove_document is called
"""
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
)
with mock.patch(
"documents.tasks.llm_index_remove_document",
) as llm_index_remove_document:
tasks.remove_document_from_llm_index(doc)
llm_index_remove_document.assert_called_once_with(doc)