mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-02 18:47:10 -05:00 
			
		
		
		
	Adding more test coverage, in particular around Tika and its parser
This commit is contained in:
		
							parent
							
								
									8154c7b53a
								
							
						
					
					
						commit
						bdcba570cb
					
				@ -121,28 +121,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
 | 
				
			|||||||
        response = self.client.get("/api/documents/", format="json")
 | 
					        response = self.client.get("/api/documents/", format="json")
 | 
				
			||||||
        self.assertEqual(response.status_code, 200)
 | 
					        self.assertEqual(response.status_code, 200)
 | 
				
			||||||
        results_full = response.data["results"]
 | 
					        results_full = response.data["results"]
 | 
				
			||||||
        self.assertTrue("content" in results_full[0])
 | 
					        self.assertIn("content", results_full[0])
 | 
				
			||||||
        self.assertTrue("id" in results_full[0])
 | 
					        self.assertIn("id", results_full[0])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        response = self.client.get("/api/documents/?fields=id", format="json")
 | 
					        response = self.client.get("/api/documents/?fields=id", format="json")
 | 
				
			||||||
        self.assertEqual(response.status_code, 200)
 | 
					        self.assertEqual(response.status_code, 200)
 | 
				
			||||||
        results = response.data["results"]
 | 
					        results = response.data["results"]
 | 
				
			||||||
        self.assertFalse("content" in results[0])
 | 
					        self.assertFalse("content" in results[0])
 | 
				
			||||||
        self.assertTrue("id" in results[0])
 | 
					        self.assertIn("id", results[0])
 | 
				
			||||||
        self.assertEqual(len(results[0]), 1)
 | 
					        self.assertEqual(len(results[0]), 1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        response = self.client.get("/api/documents/?fields=content", format="json")
 | 
					        response = self.client.get("/api/documents/?fields=content", format="json")
 | 
				
			||||||
        self.assertEqual(response.status_code, 200)
 | 
					        self.assertEqual(response.status_code, 200)
 | 
				
			||||||
        results = response.data["results"]
 | 
					        results = response.data["results"]
 | 
				
			||||||
        self.assertTrue("content" in results[0])
 | 
					        self.assertIn("content", results[0])
 | 
				
			||||||
        self.assertFalse("id" in results[0])
 | 
					        self.assertFalse("id" in results[0])
 | 
				
			||||||
        self.assertEqual(len(results[0]), 1)
 | 
					        self.assertEqual(len(results[0]), 1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        response = self.client.get("/api/documents/?fields=id,content", format="json")
 | 
					        response = self.client.get("/api/documents/?fields=id,content", format="json")
 | 
				
			||||||
        self.assertEqual(response.status_code, 200)
 | 
					        self.assertEqual(response.status_code, 200)
 | 
				
			||||||
        results = response.data["results"]
 | 
					        results = response.data["results"]
 | 
				
			||||||
        self.assertTrue("content" in results[0])
 | 
					        self.assertIn("content", results[0])
 | 
				
			||||||
        self.assertTrue("id" in results[0])
 | 
					        self.assertIn("id", results[0])
 | 
				
			||||||
        self.assertEqual(len(results[0]), 2)
 | 
					        self.assertEqual(len(results[0]), 2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        response = self.client.get(
 | 
					        response = self.client.get(
 | 
				
			||||||
@ -152,7 +152,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
 | 
				
			|||||||
        self.assertEqual(response.status_code, 200)
 | 
					        self.assertEqual(response.status_code, 200)
 | 
				
			||||||
        results = response.data["results"]
 | 
					        results = response.data["results"]
 | 
				
			||||||
        self.assertFalse("content" in results[0])
 | 
					        self.assertFalse("content" in results[0])
 | 
				
			||||||
        self.assertTrue("id" in results[0])
 | 
					        self.assertIn("id", results[0])
 | 
				
			||||||
        self.assertEqual(len(results[0]), 1)
 | 
					        self.assertEqual(len(results[0]), 1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        response = self.client.get("/api/documents/?fields=", format="json")
 | 
					        response = self.client.get("/api/documents/?fields=", format="json")
 | 
				
			||||||
 | 
				
			|||||||
@ -25,7 +25,7 @@ class TestImporter(TestCase):
 | 
				
			|||||||
        cmd.manifest = [{"model": "documents.document"}]
 | 
					        cmd.manifest = [{"model": "documents.document"}]
 | 
				
			||||||
        with self.assertRaises(CommandError) as cm:
 | 
					        with self.assertRaises(CommandError) as cm:
 | 
				
			||||||
            cmd._check_manifest()
 | 
					            cmd._check_manifest()
 | 
				
			||||||
        self.assertTrue("The manifest file contains a record" in str(cm.exception))
 | 
					        self.assertIn("The manifest file contains a record", str(cm.exception))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cmd.manifest = [
 | 
					        cmd.manifest = [
 | 
				
			||||||
            {"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
 | 
					            {"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
 | 
				
			||||||
@ -33,6 +33,7 @@ class TestImporter(TestCase):
 | 
				
			|||||||
        # self.assertRaises(CommandError, cmd._check_manifest)
 | 
					        # self.assertRaises(CommandError, cmd._check_manifest)
 | 
				
			||||||
        with self.assertRaises(CommandError) as cm:
 | 
					        with self.assertRaises(CommandError) as cm:
 | 
				
			||||||
            cmd._check_manifest()
 | 
					            cmd._check_manifest()
 | 
				
			||||||
        self.assertTrue(
 | 
					        self.assertIn(
 | 
				
			||||||
            'The manifest file refers to "noexist.pdf"' in str(cm.exception),
 | 
					            'The manifest file refers to "noexist.pdf"',
 | 
				
			||||||
 | 
					            str(cm.exception),
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
				
			|||||||
@ -1,6 +1,8 @@
 | 
				
			|||||||
from tempfile import TemporaryDirectory
 | 
					from tempfile import TemporaryDirectory
 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from django.apps import apps
 | 
				
			||||||
 | 
					from django.test import override_settings
 | 
				
			||||||
from django.test import TestCase
 | 
					from django.test import TestCase
 | 
				
			||||||
from documents.parsers import get_default_file_extension
 | 
					from documents.parsers import get_default_file_extension
 | 
				
			||||||
from documents.parsers import get_parser_class_for_mime_type
 | 
					from documents.parsers import get_parser_class_for_mime_type
 | 
				
			||||||
@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions
 | 
				
			|||||||
from documents.parsers import is_file_ext_supported
 | 
					from documents.parsers import is_file_ext_supported
 | 
				
			||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
 | 
					from paperless_tesseract.parsers import RasterisedDocumentParser
 | 
				
			||||||
from paperless_text.parsers import TextDocumentParser
 | 
					from paperless_text.parsers import TextDocumentParser
 | 
				
			||||||
 | 
					from paperless_tika.parsers import TikaDocumentParser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestParserDiscovery(TestCase):
 | 
					class TestParserDiscovery(TestCase):
 | 
				
			||||||
@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestParserAvailability(TestCase):
 | 
					class TestParserAvailability(TestCase):
 | 
				
			||||||
    def test_file_extensions(self):
 | 
					    def test_tesseract_parser(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Various mime types
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - The parser class is instantiated
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - The Tesseract based parser is returned
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
        supported_mimes_and_exts = [
 | 
					        supported_mimes_and_exts = [
 | 
				
			||||||
            ("application/pdf", ".pdf"),
 | 
					            ("application/pdf", ".pdf"),
 | 
				
			||||||
            ("image/png", ".png"),
 | 
					            ("image/png", ".png"),
 | 
				
			||||||
            ("image/jpeg", ".jpg"),
 | 
					            ("image/jpeg", ".jpg"),
 | 
				
			||||||
            ("image/tiff", ".tif"),
 | 
					            ("image/tiff", ".tif"),
 | 
				
			||||||
            ("image/webp", ".webp"),
 | 
					            ("image/webp", ".webp"),
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        supported_exts = get_supported_file_extensions()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for mime_type, ext in supported_mimes_and_exts:
 | 
				
			||||||
 | 
					            self.assertIn(ext, supported_exts)
 | 
				
			||||||
 | 
					            self.assertEqual(get_default_file_extension(mime_type), ext)
 | 
				
			||||||
 | 
					            self.assertIsInstance(
 | 
				
			||||||
 | 
					                get_parser_class_for_mime_type(mime_type)(logging_group=None),
 | 
				
			||||||
 | 
					                RasterisedDocumentParser,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_text_parser(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Various mime types of a text form
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - The parser class is instantiated
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - The text based parser is returned
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        supported_mimes_and_exts = [
 | 
				
			||||||
            ("text/plain", ".txt"),
 | 
					            ("text/plain", ".txt"),
 | 
				
			||||||
            ("text/csv", ".csv"),
 | 
					            ("text/csv", ".csv"),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
@ -141,23 +173,55 @@ class TestParserAvailability(TestCase):
 | 
				
			|||||||
        for mime_type, ext in supported_mimes_and_exts:
 | 
					        for mime_type, ext in supported_mimes_and_exts:
 | 
				
			||||||
            self.assertIn(ext, supported_exts)
 | 
					            self.assertIn(ext, supported_exts)
 | 
				
			||||||
            self.assertEqual(get_default_file_extension(mime_type), ext)
 | 
					            self.assertEqual(get_default_file_extension(mime_type), ext)
 | 
				
			||||||
 | 
					            self.assertIsInstance(
 | 
				
			||||||
 | 
					                get_parser_class_for_mime_type(mime_type)(logging_group=None),
 | 
				
			||||||
 | 
					                TextDocumentParser,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_tika_parser(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Various mime types of an office document form
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - The parser class is instantiated
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - The Tika/Gotenberg based parser is returned
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        supported_mimes_and_exts = [
 | 
				
			||||||
 | 
					            ("application/vnd.oasis.opendocument.text", ".odt"),
 | 
				
			||||||
 | 
					            ("text/rtf", ".rtf"),
 | 
				
			||||||
 | 
					            ("application/msword", ".doc"),
 | 
				
			||||||
 | 
					            (
 | 
				
			||||||
 | 
					                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 | 
				
			||||||
 | 
					                ".docx",
 | 
				
			||||||
 | 
					            ),
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Force the app ready to notice the settings override
 | 
				
			||||||
 | 
					        with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
 | 
				
			||||||
 | 
					            app = apps.get_app_config("paperless_tika")
 | 
				
			||||||
 | 
					            app.ready()
 | 
				
			||||||
 | 
					            supported_exts = get_supported_file_extensions()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for mime_type, ext in supported_mimes_and_exts:
 | 
				
			||||||
 | 
					            self.assertIn(ext, supported_exts)
 | 
				
			||||||
 | 
					            self.assertEqual(get_default_file_extension(mime_type), ext)
 | 
				
			||||||
 | 
					            self.assertIsInstance(
 | 
				
			||||||
 | 
					                get_parser_class_for_mime_type(mime_type)(logging_group=None),
 | 
				
			||||||
 | 
					                TikaDocumentParser,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_no_parser_for_mime(self):
 | 
				
			||||||
 | 
					        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_default_extension(self):
 | 
				
			||||||
        # Test no parser declared still returns an extension
 | 
					        # Test no parser declared still returns an extension
 | 
				
			||||||
        self.assertEqual(get_default_file_extension("application/zip"), ".zip")
 | 
					        self.assertEqual(get_default_file_extension("application/zip"), ".zip")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Test invalid mimetype returns no extension
 | 
					        # Test invalid mimetype returns no extension
 | 
				
			||||||
        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
 | 
					        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertIsInstance(
 | 
					    def test_file_extension_support(self):
 | 
				
			||||||
            get_parser_class_for_mime_type("application/pdf")(logging_group=None),
 | 
					 | 
				
			||||||
            RasterisedDocumentParser,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        self.assertIsInstance(
 | 
					 | 
				
			||||||
            get_parser_class_for_mime_type("text/plain")(logging_group=None),
 | 
					 | 
				
			||||||
            TextDocumentParser,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self.assertTrue(is_file_ext_supported(".pdf"))
 | 
					        self.assertTrue(is_file_ext_supported(".pdf"))
 | 
				
			||||||
        self.assertFalse(is_file_ext_supported(".hsdfh"))
 | 
					        self.assertFalse(is_file_ext_supported(".hsdfh"))
 | 
				
			||||||
        self.assertFalse(is_file_ext_supported(""))
 | 
					        self.assertFalse(is_file_ext_supported(""))
 | 
				
			||||||
 | 
				
			|||||||
@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
 | 
				
			||||||
class TestWebSockets(TestCase):
 | 
					class TestWebSockets(TestCase):
 | 
				
			||||||
    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
 | 
					 | 
				
			||||||
    async def test_no_auth(self):
 | 
					    async def test_no_auth(self):
 | 
				
			||||||
        communicator = WebsocketCommunicator(application, "/ws/status/")
 | 
					        communicator = WebsocketCommunicator(application, "/ws/status/")
 | 
				
			||||||
        connected, subprotocol = await communicator.connect()
 | 
					        connected, subprotocol = await communicator.connect()
 | 
				
			||||||
        self.assertFalse(connected)
 | 
					        self.assertFalse(connected)
 | 
				
			||||||
        await communicator.disconnect()
 | 
					        await communicator.disconnect()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
 | 
					 | 
				
			||||||
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
 | 
					    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
 | 
				
			||||||
    async def test_auth(self, _authenticated):
 | 
					    async def test_auth(self, _authenticated):
 | 
				
			||||||
        _authenticated.return_value = True
 | 
					        _authenticated.return_value = True
 | 
				
			||||||
@ -33,7 +32,6 @@ class TestWebSockets(TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        await communicator.disconnect()
 | 
					        await communicator.disconnect()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
 | 
					 | 
				
			||||||
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
 | 
					    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
 | 
				
			||||||
    async def test_receive(self, _authenticated):
 | 
					    async def test_receive(self, _authenticated):
 | 
				
			||||||
        _authenticated.return_value = True
 | 
					        _authenticated.return_value = True
 | 
				
			||||||
 | 
				
			|||||||
@ -12,7 +12,7 @@ class StandardPagination(PageNumberPagination):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class FaviconView(View):
 | 
					class FaviconView(View):
 | 
				
			||||||
    def get(self, request, *args, **kwargs):
 | 
					    def get(self, request, *args, **kwargs):  # pragma: nocover
 | 
				
			||||||
        favicon = os.path.join(
 | 
					        favicon = os.path.join(
 | 
				
			||||||
            os.path.dirname(__file__),
 | 
					            os.path.dirname(__file__),
 | 
				
			||||||
            "static",
 | 
					            "static",
 | 
				
			||||||
 | 
				
			|||||||
@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        except Exception:
 | 
					        except Exception:
 | 
				
			||||||
            # TODO catch all for various issues with PDFminer.six.
 | 
					            # TODO catch all for various issues with PDFminer.six.
 | 
				
			||||||
            #  If PDFminer fails, fall back to OCR.
 | 
					            #  If pdftotext fails, fall back to OCR.
 | 
				
			||||||
            self.log(
 | 
					            self.log(
 | 
				
			||||||
                "warning",
 | 
					                "warning",
 | 
				
			||||||
                "Error while getting text from PDF document with " "pdfminer.six",
 | 
					                "Error while getting text from PDF document with " "pdfminer.six",
 | 
				
			||||||
 | 
				
			|||||||
@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
        )
 | 
					        )
 | 
				
			||||||
        self.assertTrue(os.path.isfile(parser.archive_path))
 | 
					        self.assertTrue(os.path.isfile(parser.archive_path))
 | 
				
			||||||
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
 | 
					        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
 | 
				
			||||||
        self.assertFalse("page 3" in parser.get_text().lower())
 | 
					        self.assertNotIn("page 3", parser.get_text().lower())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_PAGES=1, OCR_MODE="force")
 | 
					    @override_settings(OCR_PAGES=1, OCR_MODE="force")
 | 
				
			||||||
    def test_multi_page_analog_pages_force(self):
 | 
					    def test_multi_page_analog_pages_force(self):
 | 
				
			||||||
@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
        )
 | 
					        )
 | 
				
			||||||
        self.assertTrue(os.path.isfile(parser.archive_path))
 | 
					        self.assertTrue(os.path.isfile(parser.archive_path))
 | 
				
			||||||
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
 | 
					        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
 | 
				
			||||||
        self.assertFalse("page 2" in parser.get_text().lower())
 | 
					        self.assertNotIn("page 2", parser.get_text().lower())
 | 
				
			||||||
        self.assertFalse("page 3" in parser.get_text().lower())
 | 
					        self.assertNotIn("page 3", parser.get_text().lower())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_MODE="skip_noarchive")
 | 
					    @override_settings(OCR_MODE="skip_noarchive")
 | 
				
			||||||
    def test_skip_noarchive_withtext(self):
 | 
					    def test_skip_noarchive_withtext(self):
 | 
				
			||||||
@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 | 
					            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 | 
				
			||||||
            self.assertNotIn("deskew", params)
 | 
					            self.assertNotIn("deskew", params)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
 | 
				
			||||||
 | 
					            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 | 
				
			||||||
 | 
					            self.assertIn("max_image_mpixels", params)
 | 
				
			||||||
 | 
					            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
 | 
				
			||||||
 | 
					            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 | 
				
			||||||
 | 
					            self.assertNotIn("max_image_mpixels", params)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_rtl_language_detection(self):
 | 
					    def test_rtl_language_detection(self):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        GIVEN:
 | 
					        GIVEN:
 | 
				
			||||||
 | 
				
			|||||||
@ -3,7 +3,9 @@ import os
 | 
				
			|||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from django.test import override_settings
 | 
				
			||||||
from django.test import TestCase
 | 
					from django.test import TestCase
 | 
				
			||||||
 | 
					from documents.parsers import ParseError
 | 
				
			||||||
from paperless_tika.parsers import TikaDocumentParser
 | 
					from paperless_tika.parsers import TikaDocumentParser
 | 
				
			||||||
from requests import Response
 | 
					from requests import Response
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -54,3 +56,63 @@ class TestTikaParser(TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
 | 
					        self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
 | 
				
			||||||
        self.assertTrue("Some-key" in [m["key"] for m in metadata])
 | 
					        self.assertTrue("Some-key" in [m["key"] for m in metadata])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @mock.patch("paperless_tika.parsers.parser.from_file")
 | 
				
			||||||
 | 
					    @mock.patch("paperless_tika.parsers.requests.post")
 | 
				
			||||||
 | 
					    def test_convert_failure(self, post, from_file):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Document needs to be converted to PDF
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Gotenberg server returns an error
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Parse error is raised
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        from_file.return_value = {
 | 
				
			||||||
 | 
					            "content": "the content",
 | 
				
			||||||
 | 
					            "metadata": {"Creation-Date": "2020-11-21"},
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        response = Response()
 | 
				
			||||||
 | 
					        response._content = b"PDF document"
 | 
				
			||||||
 | 
					        response.status_code = 500
 | 
				
			||||||
 | 
					        post.return_value = response
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        file = os.path.join(self.parser.tempdir, "input.odt")
 | 
				
			||||||
 | 
					        Path(file).touch()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        with self.assertRaises(ParseError):
 | 
				
			||||||
 | 
					            self.parser.convert_to_pdf(file, None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @mock.patch("paperless_tika.parsers.requests.post")
 | 
				
			||||||
 | 
					    def test_request_pdf_a_format(self, post: mock.Mock):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Document needs to be converted to PDF
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Specific PDF/A format requested
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Request to Gotenberg contains the expected PDF/A format string
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        file = os.path.join(self.parser.tempdir, "input.odt")
 | 
				
			||||||
 | 
					        Path(file).touch()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        response = Response()
 | 
				
			||||||
 | 
					        response._content = b"PDF document"
 | 
				
			||||||
 | 
					        response.status_code = 200
 | 
				
			||||||
 | 
					        post.return_value = response
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for setting, expected_key in [
 | 
				
			||||||
 | 
					            ("pdfa", "PDF/A-2b"),
 | 
				
			||||||
 | 
					            ("pdfa-2", "PDF/A-2b"),
 | 
				
			||||||
 | 
					            ("pdfa-1", "PDF/A-1a"),
 | 
				
			||||||
 | 
					            ("pdfa-3", "PDF/A-3b"),
 | 
				
			||||||
 | 
					        ]:
 | 
				
			||||||
 | 
					            with override_settings(OCR_OUTPUT_TYPE=setting):
 | 
				
			||||||
 | 
					                self.parser.convert_to_pdf(file, None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                post.assert_called_once()
 | 
				
			||||||
 | 
					                _, kwargs = post.call_args
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                post.reset_mock()
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user