mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	Merge pull request #1745 from paperless-ngx/fix/1743
Bugfix: Fallback to pdf2image if pikepdf fails
This commit is contained in:
		
						commit
						b13ced93ed
					
				
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @ -57,6 +57,7 @@ celery = {extras = ["redis"], version = "*"} | |||||||
| django-celery-results = "*" | django-celery-results = "*" | ||||||
| setproctitle = "*" | setproctitle = "*" | ||||||
| nltk = "*" | nltk = "*" | ||||||
|  | pdf2image = "*" | ||||||
| 
 | 
 | ||||||
| [dev-packages] | [dev-packages] | ||||||
| coveralls = "*" | coveralls = "*" | ||||||
|  | |||||||
							
								
								
									
										8
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										8
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							| @ -939,6 +939,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.5.2" |             "version": "==2.5.2" | ||||||
|         }, |         }, | ||||||
|  |         "pdf2image": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65", | ||||||
|  |                 "sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb" | ||||||
|  |             ], | ||||||
|  |             "index": "pypi", | ||||||
|  |             "version": "==1.16.0" | ||||||
|  |         }, | ||||||
|         "pdfminer.six": { |         "pdfminer.six": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2", |                 "sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2", | ||||||
|  | |||||||
| @ -9,6 +9,7 @@ from typing import Tuple | |||||||
| 
 | 
 | ||||||
| import magic | import magic | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
|  | from pdf2image import convert_from_path | ||||||
| from pikepdf import Page | from pikepdf import Page | ||||||
| from pikepdf import Pdf | from pikepdf import Pdf | ||||||
| from pikepdf import PdfImage | from pikepdf import PdfImage | ||||||
| @ -19,6 +20,10 @@ from pyzbar import pyzbar | |||||||
| logger = logging.getLogger("paperless.barcodes") | logger = logging.getLogger("paperless.barcodes") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class BarcodeImageFormatError(Exception): | ||||||
|  |     pass | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @lru_cache(maxsize=8) | @lru_cache(maxsize=8) | ||||||
| def supported_file_type(mime_type) -> bool: | def supported_file_type(mime_type) -> bool: | ||||||
|     """ |     """ | ||||||
| @ -108,6 +113,33 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|     which separate the file into new files |     which separate the file into new files | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|  |     def _pikepdf_barcode_scan(pdf_filepath: str): | ||||||
|  |         with Pdf.open(pdf_filepath) as pdf: | ||||||
|  |             for page_num, page in enumerate(pdf.pages): | ||||||
|  |                 for image_key in page.images: | ||||||
|  |                     pdfimage = PdfImage(page.images[image_key]) | ||||||
|  | 
 | ||||||
|  |                     if "/CCITTFaxDecode" in pdfimage.filters: | ||||||
|  |                         raise BarcodeImageFormatError() | ||||||
|  | 
 | ||||||
|  |                     # Not all images can be transcoded to a PIL image, which | ||||||
|  |                     # is what pyzbar expects to receive | ||||||
|  |                     pillow_img = pdfimage.as_pil_image() | ||||||
|  | 
 | ||||||
|  |                     detected_barcodes = barcode_reader(pillow_img) | ||||||
|  | 
 | ||||||
|  |                     if settings.CONSUMER_BARCODE_STRING in detected_barcodes: | ||||||
|  |                         separator_page_numbers.append(page_num) | ||||||
|  | 
 | ||||||
|  |     def _pdf2image_barcode_scan(pdf_filepath: str): | ||||||
|  |         # use a temporary directory in case the file is too big to handle in memory | ||||||
|  |         with tempfile.TemporaryDirectory() as path: | ||||||
|  |             pages_from_path = convert_from_path(pdf_filepath, output_folder=path) | ||||||
|  |             for current_page_number, page in enumerate(pages_from_path): | ||||||
|  |                 current_barcodes = barcode_reader(page) | ||||||
|  |                 if settings.CONSUMER_BARCODE_STRING in current_barcodes: | ||||||
|  |                     separator_page_numbers.append(current_page_number) | ||||||
|  | 
 | ||||||
|     separator_page_numbers = [] |     separator_page_numbers = [] | ||||||
|     pdf_filepath = None |     pdf_filepath = None | ||||||
| 
 | 
 | ||||||
| @ -118,17 +150,17 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis | |||||||
|         if mime_type == "image/tiff": |         if mime_type == "image/tiff": | ||||||
|             pdf_filepath = convert_from_tiff_to_pdf(filepath) |             pdf_filepath = convert_from_tiff_to_pdf(filepath) | ||||||
| 
 | 
 | ||||||
|         pdf = Pdf.open(pdf_filepath) |         try: | ||||||
|  |             _pikepdf_barcode_scan(pdf_filepath) | ||||||
|  |         except Exception as e: | ||||||
| 
 | 
 | ||||||
|         for page_num, page in enumerate(pdf.pages): |             logger.warning( | ||||||
|             for image_key in page.images: |                 f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}", | ||||||
|                 pdfimage = PdfImage(page.images[image_key]) |             ) | ||||||
|                 pillow_img = pdfimage.as_pil_image() |             # Reset this in case pikepdf got part way through | ||||||
|  |             separator_page_numbers = [] | ||||||
|  |             _pdf2image_barcode_scan(pdf_filepath) | ||||||
| 
 | 
 | ||||||
|                 detected_barcodes = barcode_reader(pillow_img) |  | ||||||
| 
 |  | ||||||
|                 if settings.CONSUMER_BARCODE_STRING in detected_barcodes: |  | ||||||
|                     separator_page_numbers.append(page_num) |  | ||||||
|     else: |     else: | ||||||
|         logger.warning( |         logger.warning( | ||||||
|             f"Unsupported file format for barcode reader: {str(mime_type)}", |             f"Unsupported file format for barcode reader: {str(mime_type)}", | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-fax-image.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-fax-image.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -3,6 +3,7 @@ import shutil | |||||||
| import tempfile | import tempfile | ||||||
| from unittest import mock | from unittest import mock | ||||||
| 
 | 
 | ||||||
|  | import pikepdf | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.test import override_settings | from django.test import override_settings | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| @ -218,6 +219,86 @@ class TestBarcode(DirectoriesMixin, TestCase): | |||||||
|         self.assertEqual(pdf_file, test_file) |         self.assertEqual(pdf_file, test_file) | ||||||
|         self.assertListEqual(separator_page_numbers, [1]) |         self.assertListEqual(separator_page_numbers, [1]) | ||||||
| 
 | 
 | ||||||
|  |     def test_scan_file_for_separating_barcodes_pillow_transcode_error(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - A PDF containing an image which cannot be transcoded to a PIL image | ||||||
|  |         WHEN: | ||||||
|  |             - The image tries to be transcoded to a PIL image, but fails | ||||||
|  |         THEN: | ||||||
|  |             - The barcode reader is still called | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         def _build_device_n_pdf(self, save_path: str): | ||||||
|  |             # Based on the pikepdf tests | ||||||
|  |             # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py | ||||||
|  |             pdf = pikepdf.new() | ||||||
|  |             pdf.add_blank_page(page_size=(72, 72)) | ||||||
|  |             imobj = pikepdf.Stream( | ||||||
|  |                 pdf, | ||||||
|  |                 bytes(range(0, 256)), | ||||||
|  |                 BitsPerComponent=8, | ||||||
|  |                 ColorSpace=pikepdf.Array( | ||||||
|  |                     [ | ||||||
|  |                         pikepdf.Name.DeviceN, | ||||||
|  |                         pikepdf.Array([pikepdf.Name.Black]), | ||||||
|  |                         pikepdf.Name.DeviceCMYK, | ||||||
|  |                         pikepdf.Stream( | ||||||
|  |                             pdf, | ||||||
|  |                             b"{0 0 0 4 -1 roll}",  # Colorspace conversion function | ||||||
|  |                             FunctionType=4, | ||||||
|  |                             Domain=[0.0, 1.0], | ||||||
|  |                             Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], | ||||||
|  |                         ), | ||||||
|  |                     ], | ||||||
|  |                 ), | ||||||
|  |                 Width=16, | ||||||
|  |                 Height=16, | ||||||
|  |                 Type=pikepdf.Name.XObject, | ||||||
|  |                 Subtype=pikepdf.Name.Image, | ||||||
|  |             ) | ||||||
|  |             pim = pikepdf.PdfImage(imobj) | ||||||
|  |             self.assertEqual(pim.mode, "DeviceN") | ||||||
|  |             self.assertTrue(pim.is_device_n) | ||||||
|  | 
 | ||||||
|  |             pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do") | ||||||
|  |             pdf.pages[0].Resources = pikepdf.Dictionary( | ||||||
|  |                 XObject=pikepdf.Dictionary(Im0=imobj), | ||||||
|  |             ) | ||||||
|  |             pdf.save(save_path) | ||||||
|  | 
 | ||||||
|  |         with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf: | ||||||
|  |             # Build an offending file | ||||||
|  |             _build_device_n_pdf(self, str(device_n_pdf.name)) | ||||||
|  |             with mock.patch("documents.barcodes.barcode_reader") as reader: | ||||||
|  |                 reader.return_value = list() | ||||||
|  | 
 | ||||||
|  |                 _, _ = barcodes.scan_file_for_separating_barcodes( | ||||||
|  |                     str(device_n_pdf.name), | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |                 reader.assert_called() | ||||||
|  | 
 | ||||||
|  |     def test_scan_file_for_separating_barcodes_fax_decode(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - A PDF containing an image encoded as CCITT Group 4 encoding | ||||||
|  |         WHEN: | ||||||
|  |             - Barcode processing happens with the file | ||||||
|  |         THEN: | ||||||
|  |             - The barcode is still detected | ||||||
|  |         """ | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             self.BARCODE_SAMPLE_DIR, | ||||||
|  |             "barcode-fax-image.pdf", | ||||||
|  |         ) | ||||||
|  |         pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes( | ||||||
|  |             test_file, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(pdf_file, test_file) | ||||||
|  |         self.assertListEqual(separator_page_numbers, [1]) | ||||||
|  | 
 | ||||||
|     def test_scan_file_for_separating_qr_barcodes(self): |     def test_scan_file_for_separating_qr_barcodes(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
|             self.BARCODE_SAMPLE_DIR, |             self.BARCODE_SAMPLE_DIR, | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user