mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	Cleans up and improves parser discovery testing, and simplifies determining whether file extensions and mime types are supported
This commit is contained in:
		
							parent
							
								
									a340b9c8a1
								
							
						
					
					
						commit
						d19bf59f47
					
				@ -6,12 +6,12 @@ import re
 | 
			
		||||
import shutil
 | 
			
		||||
import subprocess
 | 
			
		||||
import tempfile
 | 
			
		||||
from functools import cache
 | 
			
		||||
from typing import Iterator
 | 
			
		||||
from typing import Match
 | 
			
		||||
from typing import Optional
 | 
			
		||||
from typing import Set
 | 
			
		||||
 | 
			
		||||
import magic
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.utils import timezone
 | 
			
		||||
from documents.loggers import LoggingMixin
 | 
			
		||||
@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
 | 
			
		||||
logger = logging.getLogger("paperless.parsing")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_mime_type_supported(mime_type) -> bool:
 | 
			
		||||
@cache
 | 
			
		||||
def is_mime_type_supported(mime_type: str) -> bool:
 | 
			
		||||
    """
 | 
			
		||||
    Returns True if the mime type is supported, False otherwise
 | 
			
		||||
    """
 | 
			
		||||
    return get_parser_class_for_mime_type(mime_type) is not None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_default_file_extension(mime_type) -> str:
 | 
			
		||||
@cache
 | 
			
		||||
def get_default_file_extension(mime_type: str) -> str:
 | 
			
		||||
    """
 | 
			
		||||
    Returns the default file extension for a mimetype, or
 | 
			
		||||
    an empty string if it could not be determined
 | 
			
		||||
    """
 | 
			
		||||
    for response in document_consumer_declaration.send(None):
 | 
			
		||||
        parser_declaration = response[1]
 | 
			
		||||
        supported_mime_types = parser_declaration["mime_types"]
 | 
			
		||||
@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
 | 
			
		||||
        return ""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_file_ext_supported(ext) -> bool:
 | 
			
		||||
@cache
 | 
			
		||||
def is_file_ext_supported(ext: str) -> bool:
 | 
			
		||||
    """
 | 
			
		||||
    Returns True if the file extension is supported, False otherwise
 | 
			
		||||
    TODO: Investigate why this really exists, why not use mimetype
 | 
			
		||||
    """
 | 
			
		||||
    if ext:
 | 
			
		||||
        return ext.lower() in get_supported_file_extensions()
 | 
			
		||||
    else:
 | 
			
		||||
@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
 | 
			
		||||
 | 
			
		||||
        for mime_type in supported_mime_types:
 | 
			
		||||
            extensions.update(mimetypes.guess_all_extensions(mime_type))
 | 
			
		||||
            # Python's stdlib might be behind, so also add what the parser
 | 
			
		||||
            # says is the default extension
 | 
			
		||||
            # This makes image/webp supported on Python < 3.11
 | 
			
		||||
            extensions.add(supported_mime_types[mime_type])
 | 
			
		||||
 | 
			
		||||
    return extensions
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_parser_class_for_mime_type(mime_type):
 | 
			
		||||
def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
 | 
			
		||||
    """
 | 
			
		||||
    Returns the best parser (by weight) for the given mimetype or
 | 
			
		||||
    None if no parser exists
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    options = []
 | 
			
		||||
 | 
			
		||||
@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
 | 
			
		||||
    return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_parser_class(path):
 | 
			
		||||
    """
 | 
			
		||||
    Determine the appropriate parser class based on the file
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    mime_type = magic.from_file(path, mime=True)
 | 
			
		||||
 | 
			
		||||
    return get_parser_class_for_mime_type(mime_type)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(
 | 
			
		||||
    input_file,
 | 
			
		||||
    output_file,
 | 
			
		||||
 | 
			
		||||
@ -1,14 +1,8 @@
 | 
			
		||||
import os
 | 
			
		||||
import shutil
 | 
			
		||||
import tempfile
 | 
			
		||||
from tempfile import TemporaryDirectory
 | 
			
		||||
from unittest import mock
 | 
			
		||||
 | 
			
		||||
from django.test import override_settings
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
from documents.parsers import DocumentParser
 | 
			
		||||
from documents.parsers import get_default_file_extension
 | 
			
		||||
from documents.parsers import get_parser_class
 | 
			
		||||
from documents.parsers import get_parser_class_for_mime_type
 | 
			
		||||
from documents.parsers import get_supported_file_extensions
 | 
			
		||||
from documents.parsers import is_file_ext_supported
 | 
			
		||||
@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
 | 
			
		||||
from paperless_text.parsers import TextDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def fake_magic_from_file(file, mime=False):
 | 
			
		||||
 | 
			
		||||
    if mime:
 | 
			
		||||
        if os.path.splitext(file)[1] == ".pdf":
 | 
			
		||||
            return "application/pdf"
 | 
			
		||||
        else:
 | 
			
		||||
            return "unknown"
 | 
			
		||||
    else:
 | 
			
		||||
        return "A verbose string that describes the contents of the file"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
 | 
			
		||||
class TestParserDiscovery(TestCase):
 | 
			
		||||
    @mock.patch("documents.parsers.document_consumer_declaration.send")
 | 
			
		||||
    def test__get_parser_class_1_parser(self, m, *args):
 | 
			
		||||
    def test_get_parser_class_1_parser(self, m, *args):
 | 
			
		||||
        """
 | 
			
		||||
        GIVEN:
 | 
			
		||||
            - Parser declared for a given mimetype
 | 
			
		||||
        WHEN:
 | 
			
		||||
            - Attempt to get parser for the mimetype
 | 
			
		||||
        THEN:
 | 
			
		||||
            - Declared parser class is returned
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        class DummyParser:
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
 | 
			
		||||
            ),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
 | 
			
		||||
        self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.parsers.document_consumer_declaration.send")
 | 
			
		||||
    def test__get_parser_class_n_parsers(self, m, *args):
 | 
			
		||||
    def test_get_parser_class_n_parsers(self, m, *args):
 | 
			
		||||
        """
 | 
			
		||||
        GIVEN:
 | 
			
		||||
            - Two parsers declared for a given mimetype
 | 
			
		||||
            - Second parser has a higher weight
 | 
			
		||||
        WHEN:
 | 
			
		||||
            - Attempt to get parser for the mimetype
 | 
			
		||||
        THEN:
 | 
			
		||||
            - Second parser class is returned
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        class DummyParser1:
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
 | 
			
		||||
            ),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
 | 
			
		||||
        self.assertEqual(
 | 
			
		||||
            get_parser_class_for_mime_type("application/pdf"),
 | 
			
		||||
            DummyParser2,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.parsers.document_consumer_declaration.send")
 | 
			
		||||
    def test__get_parser_class_0_parsers(self, m, *args):
 | 
			
		||||
    def test_get_parser_class_0_parsers(self, m, *args):
 | 
			
		||||
        """
 | 
			
		||||
        GIVEN:
 | 
			
		||||
            - No parsers are declared
 | 
			
		||||
        WHEN:
 | 
			
		||||
            - Attempt to get parser for the mimetype
 | 
			
		||||
        THEN:
 | 
			
		||||
            - No parser class is returned
 | 
			
		||||
        """
 | 
			
		||||
        m.return_value = []
 | 
			
		||||
        with TemporaryDirectory() as tmpdir:
 | 
			
		||||
            self.assertIsNone(get_parser_class("doc.pdf"))
 | 
			
		||||
            self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
 | 
			
		||||
 | 
			
		||||
    @mock.patch("documents.parsers.document_consumer_declaration.send")
 | 
			
		||||
    def test_get_parser_class_no_valid_parser(self, m, *args):
 | 
			
		||||
        """
 | 
			
		||||
        GIVEN:
 | 
			
		||||
            - No parser declared for a given mimetype
 | 
			
		||||
            - Parser declared for a different mimetype
 | 
			
		||||
        WHEN:
 | 
			
		||||
            - Attempt to get parser for the given mimetype
 | 
			
		||||
        THEN:
 | 
			
		||||
            - No parser class is returned
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
def fake_get_thumbnail(self, path, mimetype, file_name):
 | 
			
		||||
    return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
 | 
			
		||||
        class DummyParser:
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
        m.return_value = (
 | 
			
		||||
            (
 | 
			
		||||
                None,
 | 
			
		||||
                {
 | 
			
		||||
                    "weight": 0,
 | 
			
		||||
                    "parser": DummyParser,
 | 
			
		||||
                    "mime_types": {"application/pdf": ".pdf"},
 | 
			
		||||
                },
 | 
			
		||||
            ),
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestParserAvailability(TestCase):
 | 
			
		||||
    def test_file_extensions(self):
 | 
			
		||||
 | 
			
		||||
        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
 | 
			
		||||
            self.assertIn(ext, get_supported_file_extensions())
 | 
			
		||||
        self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
 | 
			
		||||
        self.assertEqual(get_default_file_extension("image/png"), ".png")
 | 
			
		||||
        self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
 | 
			
		||||
        self.assertEqual(get_default_file_extension("text/plain"), ".txt")
 | 
			
		||||
        self.assertEqual(get_default_file_extension("text/csv"), ".csv")
 | 
			
		||||
        supported_mimes_and_exts = [
 | 
			
		||||
            ("application/pdf", ".pdf"),
 | 
			
		||||
            ("image/png", ".png"),
 | 
			
		||||
            ("image/jpeg", ".jpg"),
 | 
			
		||||
            ("image/tiff", ".tif"),
 | 
			
		||||
            ("image/webp", ".webp"),
 | 
			
		||||
            ("text/plain", ".txt"),
 | 
			
		||||
            ("text/csv", ".csv"),
 | 
			
		||||
        ]
 | 
			
		||||
 | 
			
		||||
        supported_exts = get_supported_file_extensions()
 | 
			
		||||
 | 
			
		||||
        for mime_type, ext in supported_mimes_and_exts:
 | 
			
		||||
            self.assertIn(ext, supported_exts)
 | 
			
		||||
            self.assertEqual(get_default_file_extension(mime_type), ext)
 | 
			
		||||
 | 
			
		||||
        # Test no parser declared still returns an extension
 | 
			
		||||
        self.assertEqual(get_default_file_extension("application/zip"), ".zip")
 | 
			
		||||
 | 
			
		||||
        # Test invalid mimetype returns no extension
 | 
			
		||||
        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
 | 
			
		||||
 | 
			
		||||
        self.assertIsInstance(
 | 
			
		||||
@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
 | 
			
		||||
            get_parser_class_for_mime_type("text/plain")(logging_group=None),
 | 
			
		||||
            TextDocumentParser,
 | 
			
		||||
        )
 | 
			
		||||
        self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
 | 
			
		||||
        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
 | 
			
		||||
 | 
			
		||||
        self.assertTrue(is_file_ext_supported(".pdf"))
 | 
			
		||||
        self.assertFalse(is_file_ext_supported(".hsdfh"))
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user