paperless-ngx/src/documents/tests/test_document_model.py
Antoine Mérino 8adc26e09d
Enhancement: Limit excessively long content length when computing suggestions (#10656)
This helps prevent excessive processing times on very large documents
by limiting the text analyzed during date parsing, tag prediction,
and correspondent matching.

If the document exceeds 1.2M chars, it is cropped to 1M chars.
2025-09-09 13:02:16 -07:00

133 lines
4.0 KiB
Python

import shutil
import tempfile
from datetime import date
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from faker import Faker
from documents.models import Correspondent
from documents.models import Document
from documents.tasks import empty_trash
class TestDocument(TestCase):
    """Tests for ``Document`` deletion, trash emptying and public file names."""

    def setUp(self) -> None:
        """Redirect ORIGINALS_DIR and THUMBNAIL_DIR to fresh temporary directories."""
        self.originals_dir = tempfile.mkdtemp()
        self.thumb_dir = tempfile.mkdtemp()
        settings_override = override_settings(
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumb_dir,
        )
        settings_override.enable()
        # Without a matching disable() the override would stay active after
        # this test and later tests would see settings pointing at the
        # directories removed in tearDown.
        self.addCleanup(settings_override.disable)

    def tearDown(self) -> None:
        """Remove the temporary directories created in ``setUp``."""
        shutil.rmtree(self.originals_dir)
        shutil.rmtree(self.thumb_dir)

    def _create_document_with_files(self) -> Document:
        """Create a PDF document and touch its source and thumbnail files."""
        document = Document.objects.create(
            correspondent=Correspondent.objects.create(name="Test0"),
            title="Title",
            content="content",
            checksum="checksum",
            mime_type="application/pdf",
        )
        Path(document.source_path).touch()
        Path(document.thumbnail_path).touch()
        return document

    def test_file_deletion(self):
        """Deleting a document and emptying the trash unlinks its files."""
        document = self._create_document_with_files()

        with mock.patch("documents.signals.handlers.Path.unlink") as mock_unlink:
            document.delete()
            empty_trash([document.pk])
            # Two unlink calls expected, presumably one for the original and
            # one for the thumbnail.
            self.assertEqual(mock_unlink.call_count, 2)

    def test_document_soft_delete(self):
        """A deleted document keeps its files until the trash is emptied."""
        document = self._create_document_with_files()

        with mock.patch("documents.signals.handlers.Path.unlink") as mock_unlink:
            # delete() alone must not touch the files and hides the document
            # from the default manager.
            document.delete()
            self.assertEqual(mock_unlink.call_count, 0)
            self.assertEqual(Document.objects.count(), 0)

            # Restoring makes the document visible again.
            document.restore(strict=False)
            self.assertEqual(Document.objects.count(), 1)

            # Only emptying the trash removes the files.
            document.delete()
            empty_trash([document.pk])
            self.assertEqual(mock_unlink.call_count, 2)

    def test_file_name(self):
        """A PDF document gets a ``.pdf`` public file name."""
        doc = Document(
            mime_type="application/pdf",
            title="test",
            created=date(2020, 12, 25),
        )
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf")

    def test_file_name_jpg(self):
        """A JPEG document gets a ``.jpg`` public file name."""
        doc = Document(
            mime_type="image/jpeg",
            title="test",
            created=date(2020, 12, 25),
        )
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg")

    def test_file_name_unknown(self):
        """A ZIP MIME type gets a ``.zip`` public file name."""
        doc = Document(
            mime_type="application/zip",
            title="test",
            created=date(2020, 12, 25),
        )
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip")

    def test_file_name_invalid_type(self):
        """An unrecognised MIME type yields a public file name without extension."""
        doc = Document(
            mime_type="image/jpegasd",
            title="test",
            created=date(2020, 12, 25),
        )
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
def test_suggestion_content():
    """
    Verify that ``Document.suggestion_content`` returns content at the length
    limit unchanged and crops content that is one character over the limit.
    """
    limit = 1_200_000
    head_len = 800_000
    tail_len = 200_000

    # Request somewhat more text than the limit so both cases can be sliced
    # from the same generated string.
    source_text = Faker().text(max_nb_chars=1_201_000)
    at_limit = source_text[:limit]
    over_limit = source_text[: limit + 1]

    doc = Document(
        title="test",
        created=date(2025, 6, 1),
        content=at_limit,
    )

    # Exactly 1.2M chars: nothing is cropped.
    assert doc.suggestion_content == at_limit

    # One char over the limit: expect the first 800K and the last 200K chars,
    # joined by a single space (1M chars of original text in total).
    doc.content = over_limit
    cropped = " ".join((over_limit[:head_len], over_limit[-tail_len:]))
    assert doc.suggestion_content == cropped