mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Merge pull request #1442 from paperless-ngx/fix/skip-archive-still-archiving
Bugfix: Prevents an archive file from being created when the skip_noarchive OCR mode is specified
This commit is contained in:
		
						commit
						1692bac3fe
					
				@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
 | 
					            self.log("debug", f"Calling OCRmyPDF with args: {args}")
 | 
				
			||||||
            ocrmypdf.ocr(**args)
 | 
					            ocrmypdf.ocr(**args)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.archive_path = archive_path
 | 
					            # Only create archive file if archiving isn't being skipped
 | 
				
			||||||
 | 
					            if settings.OCR_MODE != "skip_noarchive":
 | 
				
			||||||
 | 
					                self.archive_path = archive_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.text = self.extract_text(sidecar_file, archive_path)
 | 
					            self.text = self.extract_text(sidecar_file, archive_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if not self.text:
 | 
					            if not self.text:
 | 
				
			||||||
 | 
				
			|||||||
@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_MODE="skip_noarchive")
 | 
					    @override_settings(OCR_MODE="skip_noarchive")
 | 
				
			||||||
    def test_skip_noarchive_withtext(self):
 | 
					    def test_skip_noarchive_withtext(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - File with existing text layer
 | 
				
			||||||
 | 
					            - OCR mode set to skip_noarchive
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Document is parsed
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Text from images is extracted
 | 
				
			||||||
 | 
					            - No archive file is created
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
        parser.parse(
 | 
					        parser.parse(
 | 
				
			||||||
            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
 | 
					            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
 | 
				
			||||||
@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_MODE="skip_noarchive")
 | 
					    @override_settings(OCR_MODE="skip_noarchive")
 | 
				
			||||||
    def test_skip_noarchive_notext(self):
 | 
					    def test_skip_noarchive_notext(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - File with text contained in images but no text layer
 | 
				
			||||||
 | 
					            - OCR mode set to skip_noarchive
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Document is parsed
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Text from images is extracted
 | 
				
			||||||
 | 
					            - No archive file is created
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
        parser.parse(
 | 
					        parser.parse(
 | 
				
			||||||
            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
 | 
					            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
 | 
				
			||||||
            "application/pdf",
 | 
					            "application/pdf",
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        self.assertTrue(os.path.isfile(parser.archive_path))
 | 
					
 | 
				
			||||||
        self.assertContainsStrings(
 | 
					        self.assertContainsStrings(
 | 
				
			||||||
            parser.get_text().lower(),
 | 
					            parser.get_text().lower(),
 | 
				
			||||||
            ["page 1", "page 2", "page 3"],
 | 
					            ["page 1", "page 2", "page 3"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.assertIsNone(parser.archive_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_MODE="skip")
 | 
					    @override_settings(OCR_MODE="skip")
 | 
				
			||||||
    def test_multi_page_mixed(self):
 | 
					    def test_multi_page_mixed(self):
 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_MODE="skip_noarchive")
 | 
					    @override_settings(OCR_MODE="skip_noarchive")
 | 
				
			||||||
    def test_multi_page_mixed_no_archive(self):
 | 
					    def test_multi_page_mixed_no_archive(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - File with some text contained in images and some in text layer
 | 
				
			||||||
 | 
					            - OCR mode set to skip_noarchive
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Document is parsed
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Text from images is extracted
 | 
				
			||||||
 | 
					            - No archive file is created
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
        parser.parse(
 | 
					        parser.parse(
 | 
				
			||||||
            os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
 | 
					            os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user