mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Merge remote-tracking branch 'paperless/dev' into feature-consume-eml
This commit is contained in:
		
						commit
						e1fa59122d
					
				
							
								
								
									
										11
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										11
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							| @ -226,7 +226,7 @@ | ||||
|                 "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", | ||||
|                 "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" | ||||
|             ], | ||||
|             "markers": "python_full_version >= '3.6.0'", | ||||
|             "markers": "python_version >= '3.6'", | ||||
|             "version": "==2.1.1" | ||||
|         }, | ||||
|         "click": { | ||||
| @ -242,7 +242,7 @@ | ||||
|                 "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667", | ||||
|                 "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035" | ||||
|             ], | ||||
|             "markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'", | ||||
|             "markers": "python_version < '4' and python_full_version >= '3.6.2'", | ||||
|             "version": "==0.3.0" | ||||
|         }, | ||||
|         "click-plugins": { | ||||
| @ -2191,7 +2191,7 @@ | ||||
|                 "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", | ||||
|                 "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" | ||||
|             ], | ||||
|             "markers": "python_full_version >= '3.6.0'", | ||||
|             "markers": "python_version >= '3.6'", | ||||
|             "version": "==2.1.1" | ||||
|         }, | ||||
|         "click": { | ||||
| @ -2211,6 +2211,9 @@ | ||||
|             "version": "==0.4.5" | ||||
|         }, | ||||
|         "coverage": { | ||||
|             "extras": [ | ||||
|                 "toml" | ||||
|             ], | ||||
|             "hashes": [ | ||||
|                 "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79", | ||||
|                 "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a", | ||||
| @ -2785,7 +2788,7 @@ | ||||
|                 "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", | ||||
|                 "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" | ||||
|             ], | ||||
|             "markers": "python_full_version < '3.11.0a7'", | ||||
|             "markers": "python_version >= '3.7'", | ||||
|             "version": "==2.0.1" | ||||
|         }, | ||||
|         "tornado": { | ||||
|  | ||||
| @ -10,9 +10,12 @@ from typing import Tuple | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from pdf2image import convert_from_path | ||||
| from pdf2image.exceptions import PDFPageCountError | ||||
| from pikepdf import Page | ||||
| from pikepdf import PasswordError | ||||
| from pikepdf import Pdf | ||||
| from pikepdf import PdfImage | ||||
| from pikepdf.models.image import HifiPrintImageNotTranscodableError | ||||
| from PIL import Image | ||||
| from PIL import ImageSequence | ||||
| from pyzbar import pyzbar | ||||
| @ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | ||||
|                     pdfimage = PdfImage(page.images[image_key]) | ||||
| 
 | ||||
|                     if "/CCITTFaxDecode" in pdfimage.filters: | ||||
|                         raise BarcodeImageFormatError() | ||||
|                         raise BarcodeImageFormatError( | ||||
|                             "Unable to decode CCITTFaxDecode images", | ||||
|                         ) | ||||
| 
 | ||||
|                     # Not all images can be transcoded to a PIL image, which | ||||
|                     # is what pyzbar expects to receive | ||||
| @ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | ||||
|                         separator_page_numbers.append(page_num) | ||||
| 
 | ||||
|     def _pdf2image_barcode_scan(pdf_filepath: str): | ||||
|         # use a temporary directory in case the file os too big to handle in memory | ||||
|         # use a temporary directory in case the file is too big to handle in memory | ||||
|         with tempfile.TemporaryDirectory() as path: | ||||
|             pages_from_path = convert_from_path(pdf_filepath, output_folder=path) | ||||
|             for current_page_number, page in enumerate(pages_from_path): | ||||
| @ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | ||||
|         if mime_type == "image/tiff": | ||||
|             pdf_filepath = convert_from_tiff_to_pdf(filepath) | ||||
| 
 | ||||
|         # Choose the scanner | ||||
|         if settings.CONSUMER_USE_LEGACY_DETECTION: | ||||
|             _pdf2image_barcode_scan(pdf_filepath) | ||||
|             logger.debug("Using pdf2image for barcodes") | ||||
|             scanner_function = _pdf2image_barcode_scan | ||||
|         else: | ||||
|             try: | ||||
|                 _pikepdf_barcode_scan(pdf_filepath) | ||||
|             except Exception as e: | ||||
|             logger.debug("Using pikepdf for barcodes") | ||||
|             scanner_function = _pikepdf_barcode_scan | ||||
| 
 | ||||
|                 logger.warning( | ||||
|                     f"Exception using pikepdf for barcodes," | ||||
|                     f" falling back to pdf2image: {e}", | ||||
|                 ) | ||||
|                 # Reset this incase pikepdf got part way through | ||||
|         # Run the scanner | ||||
|         try: | ||||
|             scanner_function(pdf_filepath) | ||||
|         # Neither method can handle password protected PDFs without the | ||||
|         # password being provided.  Log it and continue | ||||
|         except (PasswordError, PDFPageCountError) as e: | ||||
|             logger.warning( | ||||
|                 f"File is likely password protected, not splitting: {e}", | ||||
|             ) | ||||
|         # Handle pikepdf related image decoding issues with a fallback | ||||
|         except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: | ||||
|             logger.warning( | ||||
|                 f"Falling back to pdf2image because: {e}", | ||||
|             ) | ||||
|             try: | ||||
|                 separator_page_numbers = [] | ||||
|                 _pdf2image_barcode_scan(pdf_filepath) | ||||
|             # This file is really borked, allow the consumption to continue | ||||
|             # but it may fail further on | ||||
|             except Exception as e:  # pragma: no cover | ||||
|                 logger.warning( | ||||
|                     f"Exception during barcode scanning: {e}", | ||||
|                 ) | ||||
|         # We're not sure what happened, but allow the consumption to continue | ||||
|         except Exception as e:  # pragma: no cover | ||||
|             logger.warning( | ||||
|                 f"Exception during barcode scanning: {e}", | ||||
|             ) | ||||
| 
 | ||||
|     else: | ||||
|         logger.warning( | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/password-is-test.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/password-is-test.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase): | ||||
|         self.assertEqual(pdf_file, test_file) | ||||
|         self.assertListEqual(separator_page_numbers, [0]) | ||||
| 
 | ||||
|     def test_scan_file_for_separating_barcodes2(self): | ||||
|     def test_scan_file_for_separating_barcodes_none_present(self): | ||||
|         test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") | ||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||
|             test_file, | ||||
| @ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase): | ||||
| 
 | ||||
|         with mock.patch("documents.tasks.async_to_sync"): | ||||
|             self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||
| 
 | ||||
|     def test_scan_file_for_separating_barcodes_password_pikepdf(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Password protected PDF | ||||
|             - pikepdf based scanning | ||||
|         WHEN: | ||||
|             - File is scanned for barcode | ||||
|         THEN: | ||||
|             - Scanning handles the error without raising an exception | ||||
|         """ | ||||
|         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") | ||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||
|             test_file, | ||||
|         ) | ||||
| 
 | ||||
|         self.assertEqual(pdf_file, test_file) | ||||
|         self.assertListEqual(separator_page_numbers, []) | ||||
| 
 | ||||
|     @override_settings(CONSUMER_USE_LEGACY_DETECTION=True) | ||||
|     def test_scan_file_for_separating_barcodes_password_pdf2image(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Password protected PDF | ||||
|             - pdf2image based scanning | ||||
|         WHEN: | ||||
|             - File is scanned for barcode | ||||
|         THEN: | ||||
|             - Scanning handles the error without raising an exception | ||||
|         """ | ||||
|         test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") | ||||
|         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||
|             test_file, | ||||
|         ) | ||||
| 
 | ||||
|         self.assertEqual(pdf_file, test_file) | ||||
|         self.assertListEqual(separator_page_numbers, []) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user