mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-11-12 09:36:41 -05:00
62 lines
1.9 KiB
Python
62 lines
1.9 KiB
Python
import tempfile
|
||
from pathlib import Path
|
||
|
||
from paperless_text.parsers import TextDocumentParser
|
||
|
||
|
||
class TestTextParser:
    """
    Tests for TextDocumentParser handling of "text/plain" documents.

    The ``text_parser``, ``sample_txt_file`` and ``malformed_txt_file``
    arguments are pytest fixtures; they are presumably defined in a
    conftest.py that is not visible from this file.
    """

    def test_thumbnail(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """
        GIVEN:
            - A small, valid text file
        WHEN:
            - A thumbnail is requested
        THEN:
            - A thumbnail file is created on disk
        """
        # just make sure that it does not crash
        thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")

        assert thumb.exists()
        assert thumb.is_file()

    def test_parse(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """
        GIVEN:
            - A small, valid text file
        WHEN:
            - The file is parsed
        THEN:
            - The text content is returned unchanged
            - No archive file is produced
        """
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ) -> None:
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced with U+FFFD (REPLACEMENT CHARACTER)
        """
        text_parser.parse(malformed_txt_file, "text/plain")

        # The replacement character is written as an explicit escape so the
        # expectation cannot be corrupted if this file is ever re-encoded.
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
        assert text_parser.get_archive_path() is None

    def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None:
        """
        GIVEN:
            - A very large text file (>50MB)
        WHEN:
            - A thumbnail is requested
        THEN:
            - A thumbnail is created without reading the entire file into memory
        """
        chunk = "A" * (1024 * 1024)  # 1 MiB of 'A'

        with tempfile.NamedTemporaryFile(
            delete=False,
            mode="w",
            encoding="utf-8",
            suffix=".txt",
        ) as tmp:
            large_file = Path(tmp.name)
            # Write 51 MiB in 1 MiB pieces so the test itself does not need
            # to hold the whole payload in a single string.
            for _ in range(51):
                tmp.write(chunk)

        try:
            thumb = text_parser.get_thumbnail(large_file, "text/plain")

            assert thumb.exists()
            assert thumb.is_file()
        finally:
            # delete=False means the file is ours to remove; do it even when
            # thumbnail generation or an assertion fails, so a failing run
            # does not leave a 51 MiB file behind.
            large_file.unlink(missing_ok=True)
|