mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Fixhancement: improve text thumbnail generation for large files (#10483)
This commit is contained in:
		
							parent
							
								
									f09965464a
								
							
						
					
					
						commit
						23501b9060
					
				@ -239,6 +239,7 @@ testpaths = [
 | 
			
		||||
  "src/paperless_mail/tests/",
 | 
			
		||||
  "src/paperless_tesseract/tests/",
 | 
			
		||||
  "src/paperless_tika/tests",
 | 
			
		||||
  "src/paperless_text/tests/",
 | 
			
		||||
]
 | 
			
		||||
addopts = [
 | 
			
		||||
  "--pythonwarnings=all",
 | 
			
		||||
 | 
			
		||||
@ -16,7 +16,15 @@ class TextDocumentParser(DocumentParser):
 | 
			
		||||
    logging_name = "paperless.parsing.text"
 | 
			
		||||
 | 
			
		||||
    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
 | 
			
		||||
        text = self.read_file_handle_unicode_errors(document_path)
 | 
			
		||||
        # Avoid reading entire file into memory
 | 
			
		||||
        max_chars = 100_000
 | 
			
		||||
        file_size_limit = 50 * 1024 * 1024
 | 
			
		||||
 | 
			
		||||
        if document_path.stat().st_size > file_size_limit:
 | 
			
		||||
            text = "[File too large to preview]"
 | 
			
		||||
        else:
 | 
			
		||||
            with Path(document_path).open("r", encoding="utf-8", errors="replace") as f:
 | 
			
		||||
                text = f.read(max_chars)
 | 
			
		||||
 | 
			
		||||
        img = Image.new("RGB", (500, 700), color="white")
 | 
			
		||||
        draw = ImageDraw.Draw(img)
 | 
			
		||||
@ -25,7 +33,7 @@ class TextDocumentParser(DocumentParser):
 | 
			
		||||
            size=20,
 | 
			
		||||
            layout_engine=ImageFont.Layout.BASIC,
 | 
			
		||||
        )
 | 
			
		||||
        draw.text((5, 5), text, font=font, fill="black")
 | 
			
		||||
        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
 | 
			
		||||
 | 
			
		||||
        out_path = self.tempdir / "thumb.webp"
 | 
			
		||||
        img.save(out_path, format="WEBP")
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,4 @@
 | 
			
		||||
import tempfile
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
 | 
			
		||||
from paperless_text.parsers import TextDocumentParser
 | 
			
		||||
@ -35,3 +36,26 @@ class TestTextParser:
 | 
			
		||||
 | 
			
		||||
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
 | 
			
		||||
        assert text_parser.get_archive_path() is None
 | 
			
		||||
 | 
			
		||||
    def test_thumbnail_large_file(self, text_parser: TextDocumentParser):
        """
        GIVEN:
            - A very large text file (>50MB)
        WHEN:
            - A thumbnail is requested
        THEN:
            - A thumbnail is created without reading the entire file into memory
        """
        one_mib_of_a = "A" * (1024 * 1024)

        with tempfile.NamedTemporaryFile(
            delete=False,
            mode="w",
            encoding="utf-8",
            suffix=".txt",
        ) as tmp:
            # Write 51 MB of 'A' in 1 MiB pieces so the test itself never
            # holds the whole payload in memory as a single string.
            for _ in range(51):
                tmp.write(one_mib_of_a)
            large_file = Path(tmp.name)

        # The handle is closed at this point, so all data is flushed to disk
        # and the file can be removed on every platform (an open handle blocks
        # deletion on Windows).
        try:
            thumb = text_parser.get_thumbnail(large_file, "text/plain")
            assert thumb.exists()
            assert thumb.is_file()
        finally:
            # delete=False means nothing else cleans this up; remove the
            # 51 MB file even when the thumbnail call or an assertion fails.
            large_file.unlink(missing_ok=True)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user