mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-25 07:49:06 -04:00 
			
		
		
		
	Reverts the change around skip_noarchive to align with how it is documented to work
This commit is contained in:
		
							parent
							
								
									53e8d84af2
								
							
						
					
					
						commit
						d1aa08850d
					
				| @ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
| 
 | 
 | ||||||
|         if mime_type == "application/pdf": |         if mime_type == "application/pdf": | ||||||
|             text_original = self.extract_text(None, document_path) |             text_original = self.extract_text(None, document_path) | ||||||
|             original_has_text = text_original and len(text_original) > 50 |             original_has_text = text_original is not None and len(text_original) > 50 | ||||||
|         else: |         else: | ||||||
|             text_original = None |             text_original = None | ||||||
|             original_has_text = False |             original_has_text = False | ||||||
| 
 | 
 | ||||||
|  |         # If the original has text, and the user doesn't want an archive, | ||||||
|  |         # we're done here | ||||||
|         if settings.OCR_MODE == "skip_noarchive" and original_has_text: |         if settings.OCR_MODE == "skip_noarchive" and original_has_text: | ||||||
|             self.log("debug", "Document has text, skipping OCRmyPDF entirely.") |             self.log("debug", "Document has text, skipping OCRmyPDF entirely.") | ||||||
|             self.text = text_original |             self.text = text_original | ||||||
|             return |             return | ||||||
| 
 | 
 | ||||||
|  |         # Either no text was in the original or there should be an archive | ||||||
|  |         # file created, so OCR the file and create an archive with any | ||||||
|  |         # text located via OCR | ||||||
|  | 
 | ||||||
|         import ocrmypdf |         import ocrmypdf | ||||||
|         from ocrmypdf import InputFileError, EncryptedPdfError |         from ocrmypdf import InputFileError, EncryptedPdfError | ||||||
| 
 | 
 | ||||||
| @ -276,9 +282,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|             self.log("debug", f"Calling OCRmyPDF with args: {args}") |             self.log("debug", f"Calling OCRmyPDF with args: {args}") | ||||||
|             ocrmypdf.ocr(**args) |             ocrmypdf.ocr(**args) | ||||||
| 
 | 
 | ||||||
|             # Only create archive file if archiving isn't being skipped |             self.archive_path = archive_path | ||||||
|             if settings.OCR_MODE != "skip_noarchive": |  | ||||||
|                 self.archive_path = archive_path |  | ||||||
| 
 | 
 | ||||||
|             self.text = self.extract_text(sidecar_file, archive_path) |             self.text = self.extract_text(sidecar_file, archive_path) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -341,6 +341,17 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
| 
 | 
 | ||||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") |     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||||
|     def test_multi_page_analog_pages_redo(self): |     def test_multi_page_analog_pages_redo(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with text contained in images but no text layer | ||||||
|  |             - OCR of only pages 1 and 2 requested | ||||||
|  |             - OCR mode set to redo | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text of page 1 and 2 extracted | ||||||
|  |             - An archive file is created | ||||||
|  |         """ | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|         parser.parse( |         parser.parse( | ||||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), |             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), | ||||||
| @ -352,6 +363,17 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
| 
 | 
 | ||||||
|     @override_settings(OCR_PAGES=1, OCR_MODE="force") |     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||||
|     def test_multi_page_analog_pages_force(self): |     def test_multi_page_analog_pages_force(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with text contained in images but no text layer | ||||||
|  |             - OCR of only page 1 requested | ||||||
|  |             - OCR mode set to force | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Only text of page 1 is extracted | ||||||
|  |             - An archive file is created | ||||||
|  |         """ | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|         parser.parse( |         parser.parse( | ||||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), |             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), | ||||||
| @ -395,7 +417,7 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
|             - Document is parsed |             - Document is parsed | ||||||
|         THEN: |         THEN: | ||||||
|             - Text from images is extracted |             - Text from images is extracted | ||||||
|             - No archive file is created |             - An archive file is created with the OCRd text | ||||||
|         """ |         """ | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|         parser.parse( |         parser.parse( | ||||||
| @ -408,15 +430,26 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
|             ["page 1", "page 2", "page 3"], |             ["page 1", "page 2", "page 3"], | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         self.assertIsNone(parser.archive_path) |         self.assertIsNotNone(parser.archive_path) | ||||||
| 
 | 
 | ||||||
|     @override_settings(OCR_MODE="skip") |     @override_settings(OCR_MODE="skip") | ||||||
|     def test_multi_page_mixed(self): |     def test_multi_page_mixed(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with some text contained in images and some in text layer | ||||||
|  |             - OCR mode set to skip | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from images is extracted | ||||||
|  |             - An archive file is created with the OCRd text and the original text | ||||||
|  |         """ | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|         parser.parse( |         parser.parse( | ||||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), |             os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), | ||||||
|             "application/pdf", |             "application/pdf", | ||||||
|         ) |         ) | ||||||
|  |         self.assertIsNotNone(parser.archive_path) | ||||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|         self.assertContainsStrings( |         self.assertContainsStrings( | ||||||
|             parser.get_text().lower(), |             parser.get_text().lower(), | ||||||
| @ -438,7 +471,7 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
|             - Document is parsed |             - Document is parsed | ||||||
|         THEN: |         THEN: | ||||||
|             - Text from images is extracted |             - Text from images is extracted | ||||||
|             - No archive file is created |             - No archive file is created as original file contains text | ||||||
|         """ |         """ | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|         parser.parse( |         parser.parse( | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user