mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-11 09:14:02 -04:00
Fixhancement: improve text thumbnail generation for large files (#10483)
This commit is contained in:
parent
f09965464a
commit
23501b9060
@ -239,6 +239,7 @@ testpaths = [
|
|||||||
"src/paperless_mail/tests/",
|
"src/paperless_mail/tests/",
|
||||||
"src/paperless_tesseract/tests/",
|
"src/paperless_tesseract/tests/",
|
||||||
"src/paperless_tika/tests",
|
"src/paperless_tika/tests",
|
||||||
|
"src/paperless_text/tests/",
|
||||||
]
|
]
|
||||||
addopts = [
|
addopts = [
|
||||||
"--pythonwarnings=all",
|
"--pythonwarnings=all",
|
||||||
|
@ -16,7 +16,15 @@ class TextDocumentParser(DocumentParser):
|
|||||||
logging_name = "paperless.parsing.text"
|
logging_name = "paperless.parsing.text"
|
||||||
|
|
||||||
def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
|
def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
|
||||||
text = self.read_file_handle_unicode_errors(document_path)
|
# Avoid reading entire file into memory
|
||||||
|
max_chars = 100_000
|
||||||
|
file_size_limit = 50 * 1024 * 1024
|
||||||
|
|
||||||
|
if document_path.stat().st_size > file_size_limit:
|
||||||
|
text = "[File too large to preview]"
|
||||||
|
else:
|
||||||
|
with Path(document_path).open("r", encoding="utf-8", errors="replace") as f:
|
||||||
|
text = f.read(max_chars)
|
||||||
|
|
||||||
img = Image.new("RGB", (500, 700), color="white")
|
img = Image.new("RGB", (500, 700), color="white")
|
||||||
draw = ImageDraw.Draw(img)
|
draw = ImageDraw.Draw(img)
|
||||||
@ -25,7 +33,7 @@ class TextDocumentParser(DocumentParser):
|
|||||||
size=20,
|
size=20,
|
||||||
layout_engine=ImageFont.Layout.BASIC,
|
layout_engine=ImageFont.Layout.BASIC,
|
||||||
)
|
)
|
||||||
draw.text((5, 5), text, font=font, fill="black")
|
draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
|
||||||
|
|
||||||
out_path = self.tempdir / "thumb.webp"
|
out_path = self.tempdir / "thumb.webp"
|
||||||
img.save(out_path, format="WEBP")
|
img.save(out_path, format="WEBP")
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from paperless_text.parsers import TextDocumentParser
|
from paperless_text.parsers import TextDocumentParser
|
||||||
@ -35,3 +36,26 @@ class TestTextParser:
|
|||||||
|
|
||||||
assert text_parser.get_text() == "Pantothens<EFBFBD>ure\n"
|
assert text_parser.get_text() == "Pantothens<EFBFBD>ure\n"
|
||||||
assert text_parser.get_archive_path() is None
|
assert text_parser.get_archive_path() is None
|
||||||
|
|
||||||
|
def test_thumbnail_large_file(self, text_parser: TextDocumentParser):
    """
    GIVEN:
        - A very large text file (>50MB)
    WHEN:
        - A thumbnail is requested
    THEN:
        - A thumbnail is created without reading the entire file into memory
    """
    one_mib_of_a = "A" * (1024 * 1024)
    with tempfile.NamedTemporaryFile(
        delete=False,
        mode="w",
        encoding="utf-8",
        suffix=".txt",
    ) as tmp:
        # Write 51 MiB of 'A' in 1 MiB pieces so the test itself never
        # holds the whole payload in memory at once.
        for _ in range(51):
            tmp.write(one_mib_of_a)
        large_file = Path(tmp.name)

    # delete=False leaves the file on disk once the handle is closed, so it
    # has to be removed even if the parser raises or an assertion fails.
    try:
        thumb = text_parser.get_thumbnail(large_file, "text/plain")
        assert thumb.exists()
        assert thumb.is_file()
    finally:
        large_file.unlink()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user