mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	Merge remote-tracking branch 'paperless/dev' into feature-consume-eml
This commit is contained in:
		
						commit
						e1fa59122d
					
				
							
								
								
									
										11
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										11
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							@ -226,7 +226,7 @@
 | 
				
			|||||||
                "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
 | 
					                "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
 | 
				
			||||||
                "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
 | 
					                "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
 | 
				
			||||||
            ],
 | 
					            ],
 | 
				
			||||||
            "markers": "python_full_version >= '3.6.0'",
 | 
					            "markers": "python_version >= '3.6'",
 | 
				
			||||||
            "version": "==2.1.1"
 | 
					            "version": "==2.1.1"
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "click": {
 | 
					        "click": {
 | 
				
			||||||
@ -242,7 +242,7 @@
 | 
				
			|||||||
                "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667",
 | 
					                "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667",
 | 
				
			||||||
                "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035"
 | 
					                "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035"
 | 
				
			||||||
            ],
 | 
					            ],
 | 
				
			||||||
            "markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'",
 | 
					            "markers": "python_version < '4' and python_full_version >= '3.6.2'",
 | 
				
			||||||
            "version": "==0.3.0"
 | 
					            "version": "==0.3.0"
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "click-plugins": {
 | 
					        "click-plugins": {
 | 
				
			||||||
@ -2191,7 +2191,7 @@
 | 
				
			|||||||
                "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
 | 
					                "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
 | 
				
			||||||
                "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
 | 
					                "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
 | 
				
			||||||
            ],
 | 
					            ],
 | 
				
			||||||
            "markers": "python_full_version >= '3.6.0'",
 | 
					            "markers": "python_version >= '3.6'",
 | 
				
			||||||
            "version": "==2.1.1"
 | 
					            "version": "==2.1.1"
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "click": {
 | 
					        "click": {
 | 
				
			||||||
@ -2211,6 +2211,9 @@
 | 
				
			|||||||
            "version": "==0.4.5"
 | 
					            "version": "==0.4.5"
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "coverage": {
 | 
					        "coverage": {
 | 
				
			||||||
 | 
					            "extras": [
 | 
				
			||||||
 | 
					                "toml"
 | 
				
			||||||
 | 
					            ],
 | 
				
			||||||
            "hashes": [
 | 
					            "hashes": [
 | 
				
			||||||
                "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79",
 | 
					                "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79",
 | 
				
			||||||
                "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a",
 | 
					                "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a",
 | 
				
			||||||
@ -2785,7 +2788,7 @@
 | 
				
			|||||||
                "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
 | 
					                "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
 | 
				
			||||||
                "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
 | 
					                "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
 | 
				
			||||||
            ],
 | 
					            ],
 | 
				
			||||||
            "markers": "python_full_version < '3.11.0a7'",
 | 
					            "markers": "python_version >= '3.7'",
 | 
				
			||||||
            "version": "==2.0.1"
 | 
					            "version": "==2.0.1"
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "tornado": {
 | 
					        "tornado": {
 | 
				
			||||||
 | 
				
			|||||||
@ -10,9 +10,12 @@ from typing import Tuple
 | 
				
			|||||||
import magic
 | 
					import magic
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
from pdf2image import convert_from_path
 | 
					from pdf2image import convert_from_path
 | 
				
			||||||
 | 
					from pdf2image.exceptions import PDFPageCountError
 | 
				
			||||||
from pikepdf import Page
 | 
					from pikepdf import Page
 | 
				
			||||||
 | 
					from pikepdf import PasswordError
 | 
				
			||||||
from pikepdf import Pdf
 | 
					from pikepdf import Pdf
 | 
				
			||||||
from pikepdf import PdfImage
 | 
					from pikepdf import PdfImage
 | 
				
			||||||
 | 
					from pikepdf.models.image import HifiPrintImageNotTranscodableError
 | 
				
			||||||
from PIL import Image
 | 
					from PIL import Image
 | 
				
			||||||
from PIL import ImageSequence
 | 
					from PIL import ImageSequence
 | 
				
			||||||
from pyzbar import pyzbar
 | 
					from pyzbar import pyzbar
 | 
				
			||||||
@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
                    pdfimage = PdfImage(page.images[image_key])
 | 
					                    pdfimage = PdfImage(page.images[image_key])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    if "/CCITTFaxDecode" in pdfimage.filters:
 | 
					                    if "/CCITTFaxDecode" in pdfimage.filters:
 | 
				
			||||||
                        raise BarcodeImageFormatError()
 | 
					                        raise BarcodeImageFormatError(
 | 
				
			||||||
 | 
					                            "Unable to decode CCITTFaxDecode images",
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    # Not all images can be transcoded to a PIL image, which
 | 
					                    # Not all images can be transcoded to a PIL image, which
 | 
				
			||||||
                    # is what pyzbar expects to receive
 | 
					                    # is what pyzbar expects to receive
 | 
				
			||||||
@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
                        separator_page_numbers.append(page_num)
 | 
					                        separator_page_numbers.append(page_num)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _pdf2image_barcode_scan(pdf_filepath: str):
 | 
					    def _pdf2image_barcode_scan(pdf_filepath: str):
 | 
				
			||||||
        # use a temporary directory in case the file os too big to handle in memory
 | 
					        # use a temporary directory in case the file is too big to handle in memory
 | 
				
			||||||
        with tempfile.TemporaryDirectory() as path:
 | 
					        with tempfile.TemporaryDirectory() as path:
 | 
				
			||||||
            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
 | 
					            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
 | 
				
			||||||
            for current_page_number, page in enumerate(pages_from_path):
 | 
					            for current_page_number, page in enumerate(pages_from_path):
 | 
				
			||||||
@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
        if mime_type == "image/tiff":
 | 
					        if mime_type == "image/tiff":
 | 
				
			||||||
            pdf_filepath = convert_from_tiff_to_pdf(filepath)
 | 
					            pdf_filepath = convert_from_tiff_to_pdf(filepath)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Choose the scanner
 | 
				
			||||||
        if settings.CONSUMER_USE_LEGACY_DETECTION:
 | 
					        if settings.CONSUMER_USE_LEGACY_DETECTION:
 | 
				
			||||||
            _pdf2image_barcode_scan(pdf_filepath)
 | 
					            logger.debug("Using pdf2image for barcodes")
 | 
				
			||||||
 | 
					            scanner_function = _pdf2image_barcode_scan
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            try:
 | 
					            logger.debug("Using pikepdf for barcodes")
 | 
				
			||||||
                _pikepdf_barcode_scan(pdf_filepath)
 | 
					            scanner_function = _pikepdf_barcode_scan
 | 
				
			||||||
            except Exception as e:
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Run the scanner
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            scanner_function(pdf_filepath)
 | 
				
			||||||
 | 
					        # Neither method can handle password protected PDFs without it being
 | 
				
			||||||
 | 
					        # provided.  Log it and continue
 | 
				
			||||||
 | 
					        except (PasswordError, PDFPageCountError) as e:
 | 
				
			||||||
            logger.warning(
 | 
					            logger.warning(
 | 
				
			||||||
                    f"Exception using pikepdf for barcodes,"
 | 
					                f"File is likely password protected, not splitting: {e}",
 | 
				
			||||||
                    f" falling back to pdf2image: {e}",
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
                # Reset this incase pikepdf got part way through
 | 
					        # Handle pikepdf related image decoding issues with a fallback
 | 
				
			||||||
 | 
					        except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
 | 
				
			||||||
 | 
					            logger.warning(
 | 
				
			||||||
 | 
					                f"Falling back to pdf2image because: {e}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            try:
 | 
				
			||||||
                separator_page_numbers = []
 | 
					                separator_page_numbers = []
 | 
				
			||||||
                _pdf2image_barcode_scan(pdf_filepath)
 | 
					                _pdf2image_barcode_scan(pdf_filepath)
 | 
				
			||||||
 | 
					            # This file is really borked, allow the consumption to continue
 | 
				
			||||||
 | 
					            # but it may fail further on
 | 
				
			||||||
 | 
					            except Exception as e:  # pragma: no cover
 | 
				
			||||||
 | 
					                logger.warning(
 | 
				
			||||||
 | 
					                    f"Exception during barcode scanning: {e}",
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					        # We're not sure what happened, but allow the consumption to continue
 | 
				
			||||||
 | 
					        except Exception as e:  # pragma: no cover
 | 
				
			||||||
 | 
					            logger.warning(
 | 
				
			||||||
 | 
					                f"Exception during barcode scanning: {e}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        logger.warning(
 | 
					        logger.warning(
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/password-is-test.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/password-is-test.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							@ -174,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
 | 
				
			|||||||
        self.assertEqual(pdf_file, test_file)
 | 
					        self.assertEqual(pdf_file, test_file)
 | 
				
			||||||
        self.assertListEqual(separator_page_numbers, [0])
 | 
					        self.assertListEqual(separator_page_numbers, [0])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_scan_file_for_separating_barcodes2(self):
 | 
					    def test_scan_file_for_separating_barcodes_none_present(self):
 | 
				
			||||||
        test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
 | 
					        test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
 | 
				
			||||||
        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
 | 
					        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
 | 
				
			||||||
            test_file,
 | 
					            test_file,
 | 
				
			||||||
@ -585,3 +585,40 @@ class TestBarcode(DirectoriesMixin, TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        with mock.patch("documents.tasks.async_to_sync"):
 | 
					        with mock.patch("documents.tasks.async_to_sync"):
 | 
				
			||||||
            self.assertEqual(tasks.consume_file(dst), "File successfully split")
 | 
					            self.assertEqual(tasks.consume_file(dst), "File successfully split")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_scan_file_for_separating_barcodes_password_pikepdf(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Password protected PDF
 | 
				
			||||||
 | 
					            - pikepdf based scanning
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - File is scanned for barcode
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Scanning handles the error without raising an exception
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
 | 
				
			||||||
 | 
					        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
 | 
				
			||||||
 | 
					            test_file,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.assertEqual(pdf_file, test_file)
 | 
				
			||||||
 | 
					        self.assertListEqual(separator_page_numbers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
 | 
				
			||||||
 | 
					    def test_scan_file_for_separating_barcodes_password_pdf2image(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - Password protected PDF
 | 
				
			||||||
 | 
					            - pdf2image based scanning
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - File is scanned for barcode
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Scanning handles the error without raising an exception
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
 | 
				
			||||||
 | 
					        pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
 | 
				
			||||||
 | 
					            test_file,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.assertEqual(pdf_file, test_file)
 | 
				
			||||||
 | 
					        self.assertListEqual(separator_page_numbers, [])
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user