# paperless-ngx/src/paperless_text/tests/test_parser.py

import tempfile
from pathlib import Path

from paperless_text.parsers import TextDocumentParser


class TestTextParser:
    """
    Tests for TextDocumentParser thumbnail generation and text parsing.

    The ``text_parser``, ``sample_txt_file`` and ``malformed_txt_file``
    arguments are expected to be supplied by pytest fixtures that are not
    defined in this section of the file.
    """

    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """
        GIVEN:
            - A sample plain text file
        WHEN:
            - A thumbnail is requested
        THEN:
            - The call does not crash
            - The returned path exists and is a regular file
        """
        thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")
        assert thumb.exists()
        assert thumb.is_file()

    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """
        GIVEN:
            - A sample plain text file
        WHEN:
            - The file is parsed
        THEN:
            - The extracted text matches the expected content
            - No archive file is produced
        """
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - The invalid byte shows up as a U+FFFD replacement character
        """

        text_parser.parse(malformed_txt_file, "text/plain")

        # NOTE(review): the expected value previously contained the literal
        # text "<EFBFBD>", which looks like a garbled rendering of U+FFFD
        # (UTF-8 bytes EF BF BD). It is written as an escape here so it cannot
        # be mangled again - confirm against the parser's decoding behaviour.
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
        assert text_parser.get_archive_path() is None

    def test_thumbnail_large_file(self, text_parser: TextDocumentParser):
        """
        GIVEN:
            - A very large text file (>50MB)
        WHEN:
            - A thumbnail is requested
        THEN:
            - A thumbnail is created

        Note: this only checks that a thumbnail file is produced; it does not
        measure how much of the input the parser reads into memory.
        """
        one_mib = "A" * (1024 * 1024)

        tmp = tempfile.NamedTemporaryFile(
            delete=False,
            mode="w",
            encoding="utf-8",
            suffix=".txt",
        )
        large_file = Path(tmp.name)
        try:
            # Write 51 MiB of 'A' in 1 MiB pieces rather than building the
            # whole payload as a single string first.
            with tmp:
                for _ in range(51):
                    tmp.write(one_mib)

            # The file is closed (and therefore fully flushed) before the
            # parser reads it.
            thumb = text_parser.get_thumbnail(large_file, "text/plain")
            assert thumb.exists()
            assert thumb.is_file()
        finally:
            # delete=False means nothing else removes the file; clean up even
            # when an assertion above fails.
            large_file.unlink(missing_ok=True)