mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-24 23:39:05 -04:00 
			
		
		
		
	Merge pull request #766 from paperless-ngx/feature-barcode-tiff-support
Feature barcode tiff support
This commit is contained in:
		
						commit
						8c8f366e0f
					
				| @ -629,8 +629,19 @@ PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool> | |||||||
|     If no barcodes are detected in the uploaded file, no page separation |     If no barcodes are detected in the uploaded file, no page separation | ||||||
|     will happen. |     will happen. | ||||||
| 
 | 
 | ||||||
|  |     The original document will be removed and the separated pages will be | ||||||
|  |     saved as PDF files. | ||||||
|  | 
 | ||||||
|     Defaults to false. |     Defaults to false. | ||||||
| 
 | 
 | ||||||
|  | PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT=<bool> | ||||||
|  |     Whether TIFF image files should be scanned for barcodes. | ||||||
|  |     This will automatically convert any TIFF image(s) to PDFs for later | ||||||
|  |     processing. | ||||||
|  |     This only has an effect if PAPERLESS_CONSUMER_ENABLE_BARCODES has been | ||||||
|  |     enabled. | ||||||
|  | 
 | ||||||
|  |     Defaults to false. | ||||||
| 
 | 
 | ||||||
| PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT | PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT | ||||||
|   Defines the string to be detected as a separator barcode. |   Defines the string to be detected as a separator barcode. | ||||||
|  | |||||||
| @ -22,6 +22,8 @@ from documents.models import Tag | |||||||
| from documents.sanity_checker import SanityCheckFailedException | from documents.sanity_checker import SanityCheckFailedException | ||||||
| from pdf2image import convert_from_path | from pdf2image import convert_from_path | ||||||
| from pikepdf import Pdf | from pikepdf import Pdf | ||||||
|  | from PIL import Image | ||||||
|  | from PIL import ImageSequence | ||||||
| from pyzbar import pyzbar | from pyzbar import pyzbar | ||||||
| from whoosh.writing import AsyncWriter | from whoosh.writing import AsyncWriter | ||||||
| 
 | 
 | ||||||
| @ -93,9 +95,41 @@ def barcode_reader(image) -> List[str]: | |||||||
|     return barcodes |     return barcodes | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    Converts a given TIFF image file to a pdf inside a fresh temporary
    directory below settings.SCRATCH_DIR.

    The pdf keeps the base name of the input file; every frame of a
    multi-page TIFF becomes one pdf page.

    Returns the path of the new pdf file, or None if the file extension is
    not ".tif"/".tiff" or if the pdf could not be written. No temporary
    directory is left behind in either failure case.
    """
    # Local import: only needed for cleanup on the failure path.
    import shutil

    file_name, file_extension = os.path.splitext(os.path.basename(filepath))
    file_extension = file_extension.lower()

    # Validate the extension *before* creating the temp dir, so that an
    # unsupported file does not leave an empty scratch directory behind.
    if file_extension not in {".tif", ".tiff"}:
        logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
        return None

    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    # use old file name with pdf extension
    newpath = os.path.join(tempdir, file_name + ".pdf")

    with Image.open(filepath) as image:
        # Each frame is converted to RGB before being written to the pdf.
        images = [page.convert("RGB") for page in ImageSequence.Iterator(image)]
        try:
            if len(images) == 1:
                images[0].save(newpath)
            else:
                images[0].save(newpath, save_all=True, append_images=images[1:])
        except OSError as e:
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
            # The caller only gets None back and cannot know about the
            # temp dir, so it has to be removed here.
            shutil.rmtree(tempdir, ignore_errors=True)
            return None
    return newpath
|  | 
 | ||||||
|  | 
 | ||||||
| def scan_file_for_separating_barcodes(filepath: str) -> List[int]: | def scan_file_for_separating_barcodes(filepath: str) -> List[int]: | ||||||
|     """ |     """ | ||||||
|     Scan the provided file for page separating barcodes |     Scan the provided pdf file for page separating barcodes | ||||||
|     Returns a list of pagenumbers, which separate the file |     Returns a list of pagenumbers, which separate the file | ||||||
|     """ |     """ | ||||||
|     separator_page_numbers = [] |     separator_page_numbers = [] | ||||||
| @ -112,7 +146,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> List[int]: | |||||||
| 
 | 
 | ||||||
| def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: | def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: | ||||||
|     """ |     """ | ||||||
|     Separate the provided file on the pages_to_split_on. |     Separate the provided pdf file on the pages_to_split_on. | ||||||
|     The pages which are defined by page_numbers will be removed. |     The pages which are defined by page_numbers will be removed. | ||||||
|     Returns a list of (temporary) filepaths to consume. |     Returns a list of (temporary) filepaths to consume. | ||||||
|     These will need to be deleted later. |     These will need to be deleted later. | ||||||
| @ -195,42 +229,70 @@ def consume_file( | |||||||
|     if settings.CONSUMER_ENABLE_BARCODES: |     if settings.CONSUMER_ENABLE_BARCODES: | ||||||
|         separators = [] |         separators = [] | ||||||
|         document_list = [] |         document_list = [] | ||||||
|         separators = scan_file_for_separating_barcodes(path) |         converted_tiff = None | ||||||
|         if separators: |         if settings.CONSUMER_BARCODE_TIFF_SUPPORT: | ||||||
|             logger.debug(f"Pages with separators found in: {str(path)}") |             supported_extensions = [".pdf", ".tiff", ".tif"] | ||||||
|             document_list = separate_pages(path, separators) |         else: | ||||||
|         if document_list: |             supported_extensions = [".pdf"] | ||||||
|             for n, document in enumerate(document_list): |         file_extension = os.path.splitext(os.path.basename(path))[1].lower() | ||||||
|                 # save to consumption dir |         if file_extension not in supported_extensions: | ||||||
|                 # rename it to the original filename  with number prefix |             # if not supported, skip this routine | ||||||
|                 if override_filename: |             logger.warning( | ||||||
|                     newname = f"{str(n)}_" + override_filename |                 f"Unsupported file format for barcode reader: {str(file_extension)}", | ||||||
|                 else: |             ) | ||||||
|                     newname = None |         else: | ||||||
|                 save_to_dir(document, newname=newname) |             if file_extension in {".tif", ".tiff"}: | ||||||
|             # if we got here, the document was successfully split |                 file_to_process = convert_from_tiff_to_pdf(path) | ||||||
|             # and can safely be deleted |             else: | ||||||
|             logger.debug("Deleting file {}".format(path)) |                 file_to_process = path | ||||||
|             os.unlink(path) | 
 | ||||||
|             # notify the sender, otherwise the progress bar |             separators = scan_file_for_separating_barcodes(file_to_process) | ||||||
|             # in the UI stays stuck | 
 | ||||||
|             payload = { |             if separators: | ||||||
|                 "filename": override_filename, |                 logger.debug( | ||||||
|                 "task_id": task_id, |                     f"Pages with separators found in: {str(path)}", | ||||||
|                 "current_progress": 100, |  | ||||||
|                 "max_progress": 100, |  | ||||||
|                 "status": "SUCCESS", |  | ||||||
|                 "message": "finished", |  | ||||||
|             } |  | ||||||
|             try: |  | ||||||
|                 async_to_sync(get_channel_layer().group_send)( |  | ||||||
|                     "status_updates", |  | ||||||
|                     {"type": "status_update", "data": payload}, |  | ||||||
|                 ) |                 ) | ||||||
|             except OSError as e: |                 document_list = separate_pages(file_to_process, separators) | ||||||
|                 logger.warning("OSError. It could be, the broker cannot be reached.") | 
 | ||||||
|                 logger.warning(str(e)) |             if document_list: | ||||||
|             return "File successfully split" |                 for n, document in enumerate(document_list): | ||||||
|  |                     # save to consumption dir | ||||||
|  |                     # rename it to the original filename  with number prefix | ||||||
|  |                     if override_filename: | ||||||
|  |                         newname = f"{str(n)}_" + override_filename | ||||||
|  |                     else: | ||||||
|  |                         newname = None | ||||||
|  |                     save_to_dir(document, newname=newname) | ||||||
|  |                 # if we got here, the document was successfully split | ||||||
|  |                 # and can safely be deleted | ||||||
|  |                 if converted_tiff: | ||||||
|  |                     logger.debug("Deleting file {}".format(file_to_process)) | ||||||
|  |                     os.unlink(file_to_process) | ||||||
|  |                 logger.debug("Deleting file {}".format(path)) | ||||||
|  |                 os.unlink(path) | ||||||
|  |                 # notify the sender, otherwise the progress bar | ||||||
|  |                 # in the UI stays stuck | ||||||
|  |                 payload = { | ||||||
|  |                     "filename": override_filename, | ||||||
|  |                     "task_id": task_id, | ||||||
|  |                     "current_progress": 100, | ||||||
|  |                     "max_progress": 100, | ||||||
|  |                     "status": "SUCCESS", | ||||||
|  |                     "message": "finished", | ||||||
|  |                 } | ||||||
|  |                 try: | ||||||
|  |                     async_to_sync(get_channel_layer().group_send)( | ||||||
|  |                         "status_updates", | ||||||
|  |                         {"type": "status_update", "data": payload}, | ||||||
|  |                     ) | ||||||
|  |                 except OSError as e: | ||||||
|  |                     logger.warning( | ||||||
|  |                         "OSError. It could be, the broker cannot be reached.", | ||||||
|  |                     ) | ||||||
|  |                     logger.warning(str(e)) | ||||||
|  |                 # consuming stops here, since the original document with | ||||||
|  |                 # the barcodes has been split and will be consumed separately | ||||||
|  |                 return "File successfully split" | ||||||
| 
 | 
 | ||||||
|     # continue with consumption if no barcode was found |     # continue with consumption if no barcode was found | ||||||
|     document = Consumer().try_consume_file( |     document = Consumer().try_consume_file( | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle.tiff
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/simple.tiff
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/simple.tiff
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -204,6 +204,29 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|         img = Image.open(test_file) |         img = Image.open(test_file) | ||||||
|         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) |         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) | ||||||
| 
 | 
 | ||||||
|  |     def test_convert_from_tiff_to_pdf(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "simple.tiff", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff") | ||||||
|  |         shutil.copy(test_file, dst) | ||||||
|  |         target_file = tasks.convert_from_tiff_to_pdf(dst) | ||||||
|  |         file_extension = os.path.splitext(os.path.basename(target_file))[1] | ||||||
|  |         self.assertTrue(os.path.isfile(target_file)) | ||||||
|  |         self.assertEqual(file_extension, ".pdf") | ||||||
|  | 
 | ||||||
|  |     def test_convert_error_from_pdf_to_pdf(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "simple.pdf", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf") | ||||||
|  |         shutil.copy(test_file, dst) | ||||||
|  |         self.assertIsNone(tasks.convert_from_tiff_to_pdf(dst)) | ||||||
|  | 
 | ||||||
|     def test_scan_file_for_separating_barcodes(self): |     def test_scan_file_for_separating_barcodes(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
|             os.path.dirname(__file__), |             os.path.dirname(__file__), | ||||||
| @ -400,11 +423,64 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|             "barcodes", |             "barcodes", | ||||||
|             "patch-code-t-middle.pdf", |             "patch-code-t-middle.pdf", | ||||||
|         ) |         ) | ||||||
|         dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd") |         dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf") | ||||||
|         shutil.copy(test_file, dst) |         shutil.copy(test_file, dst) | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(tasks.consume_file(dst), "File successfully split") |         self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||||
| 
 | 
 | ||||||
|  |     @override_settings( | ||||||
|  |         CONSUMER_ENABLE_BARCODES=True, | ||||||
|  |         CONSUMER_BARCODE_TIFF_SUPPORT=True, | ||||||
|  |     ) | ||||||
|  |     def test_consume_barcode_tiff_file(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.tiff", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff") | ||||||
|  |         shutil.copy(test_file, dst) | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||||
|  | 
 | ||||||
|  |     @override_settings( | ||||||
|  |         CONSUMER_ENABLE_BARCODES=True, | ||||||
|  |         CONSUMER_BARCODE_TIFF_SUPPORT=True, | ||||||
|  |     ) | ||||||
|  |     @mock.patch("documents.consumer.Consumer.try_consume_file") | ||||||
|  |     def test_consume_barcode_unsupported_jpg_file(self, m): | ||||||
|  |         """ | ||||||
|  |         This test assumes barcode and TIFF support are enabled and | ||||||
|  |         the user uploads an unsupported image file (e.g. jpg) | ||||||
|  | 
 | ||||||
|  |         The function shouldn't try to scan for separating barcodes | ||||||
|  |         and continue archiving the file as is. | ||||||
|  |         """ | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "simple.jpg", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg") | ||||||
|  |         shutil.copy(test_file, dst) | ||||||
|  |         with self.assertLogs("paperless.tasks", level="WARNING") as cm: | ||||||
|  |             self.assertIn("Success", tasks.consume_file(dst)) | ||||||
|  |         self.assertEqual( | ||||||
|  |             cm.output, | ||||||
|  |             [ | ||||||
|  |                 "WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg", | ||||||
|  |             ], | ||||||
|  |         ) | ||||||
|  |         m.assert_called_once() | ||||||
|  | 
 | ||||||
|  |         args, kwargs = m.call_args | ||||||
|  |         self.assertIsNone(kwargs["override_filename"]) | ||||||
|  |         self.assertIsNone(kwargs["override_title"]) | ||||||
|  |         self.assertIsNone(kwargs["override_correspondent_id"]) | ||||||
|  |         self.assertIsNone(kwargs["override_document_type_id"]) | ||||||
|  |         self.assertIsNone(kwargs["override_tag_ids"]) | ||||||
|  | 
 | ||||||
|     @mock.patch("documents.tasks.sanity_checker.check_sanity") |     @mock.patch("documents.tasks.sanity_checker.check_sanity") | ||||||
|     def test_sanity_check_success(self, m): |     def test_sanity_check_success(self, m): | ||||||
|         m.return_value = SanityCheckMessages() |         m.return_value = SanityCheckMessages() | ||||||
|  | |||||||
| @ -503,6 +503,10 @@ CONSUMER_ENABLE_BARCODES = __get_boolean( | |||||||
|     "PAPERLESS_CONSUMER_ENABLE_BARCODES", |     "PAPERLESS_CONSUMER_ENABLE_BARCODES", | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
|  | CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean( | ||||||
|  |     "PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT", | ||||||
|  | ) | ||||||
|  | 
 | ||||||
| CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") | CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") | ||||||
| 
 | 
 | ||||||
| OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user