mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	Feature: Switches to a new client to handle communication with Gotenberg (#4391)
Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
This commit is contained in:
		
							parent
							
								
									5f0eba694c
								
							
						
					
					
						commit
						999ae678c2
					
				
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @ -51,6 +51,7 @@ flower = "*" | |||||||
| bleach = "*" | bleach = "*" | ||||||
| zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} | zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} | ||||||
| django-multiselectfield = "*" | django-multiselectfield = "*" | ||||||
|  | gotenberg-client = "*" | ||||||
| 
 | 
 | ||||||
| [dev-packages] | [dev-packages] | ||||||
| # Linting | # Linting | ||||||
|  | |||||||
							
								
								
									
										39
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										39
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							| @ -1,7 +1,7 @@ | |||||||
| { | { | ||||||
|     "_meta": { |     "_meta": { | ||||||
|         "hash": { |         "hash": { | ||||||
|             "sha256": "3025da2940433d347b2fd2ac222852c21f4aa73eeefbd1ee9152cbfd7a7a48e9" |             "sha256": "505bd6b18d31ed64988ef307c12a5acb70f611cafd932a391e985a11bbbc8000" | ||||||
|         }, |         }, | ||||||
|         "pipfile-spec": 6, |         "pipfile-spec": 6, | ||||||
|         "requires": {}, |         "requires": {}, | ||||||
| @ -539,6 +539,15 @@ | |||||||
|             "markers": "python_version >= '3.7'", |             "markers": "python_version >= '3.7'", | ||||||
|             "version": "==2.0.1" |             "version": "==2.0.1" | ||||||
|         }, |         }, | ||||||
|  |         "gotenberg-client": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:4508ecb913ef2d553dd2ceb78e32cee001000ba08c910ba1f9ace38350d1589e", | ||||||
|  |                 "sha256:7a3f8a02caee768391373b3610c6ec25a853cccf391ed6b5d5a1292c3ed15e7e" | ||||||
|  |             ], | ||||||
|  |             "index": "pypi", | ||||||
|  |             "markers": "python_version >= '3.8'", | ||||||
|  |             "version": "==0.3.0" | ||||||
|  |         }, | ||||||
|         "gunicorn": { |         "gunicorn": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0", |                 "sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0", | ||||||
| @ -556,6 +565,13 @@ | |||||||
|             "markers": "python_version >= '3.7'", |             "markers": "python_version >= '3.7'", | ||||||
|             "version": "==0.14.0" |             "version": "==0.14.0" | ||||||
|         }, |         }, | ||||||
|  |         "h2": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d", | ||||||
|  |                 "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb" | ||||||
|  |             ], | ||||||
|  |             "version": "==4.1.0" | ||||||
|  |         }, | ||||||
|         "hiredis": { |         "hiredis": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240", |                 "sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240", | ||||||
| @ -650,6 +666,14 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==2.2.3" |             "version": "==2.2.3" | ||||||
|         }, |         }, | ||||||
|  |         "hpack": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", | ||||||
|  |                 "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_full_version >= '3.6.1'", | ||||||
|  |             "version": "==4.0.0" | ||||||
|  |         }, | ||||||
|         "httpcore": { |         "httpcore": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9", |                 "sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9", | ||||||
| @ -699,6 +723,9 @@ | |||||||
|             "version": "==0.6.0" |             "version": "==0.6.0" | ||||||
|         }, |         }, | ||||||
|         "httpx": { |         "httpx": { | ||||||
|  |             "extras": [ | ||||||
|  |                 "http2" | ||||||
|  |             ], | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100", |                 "sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100", | ||||||
|                 "sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875" |                 "sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875" | ||||||
| @ -714,6 +741,14 @@ | |||||||
|             "markers": "python_version >= '3.8'", |             "markers": "python_version >= '3.8'", | ||||||
|             "version": "==4.8.0" |             "version": "==4.8.0" | ||||||
|         }, |         }, | ||||||
|  |         "hyperframe": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15", | ||||||
|  |                 "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_full_version >= '3.6.1'", | ||||||
|  |             "version": "==6.0.1" | ||||||
|  |         }, | ||||||
|         "idna": { |         "idna": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", |                 "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", | ||||||
| @ -1782,7 +1817,7 @@ | |||||||
|                 "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", |                 "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", | ||||||
|                 "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" |                 "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" | ||||||
|             ], |             ], | ||||||
|             "markers": "python_version < '3.11'", |             "markers": "python_version < '3.10'", | ||||||
|             "version": "==4.8.0" |             "version": "==4.8.0" | ||||||
|         }, |         }, | ||||||
|         "tzdata": { |         "tzdata": { | ||||||
|  | |||||||
| @ -1,13 +1,17 @@ | |||||||
| import re | import re | ||||||
| from html import escape | from html import escape | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | from typing import Optional | ||||||
| 
 | 
 | ||||||
| import httpx |  | ||||||
| from bleach import clean | from bleach import clean | ||||||
| from bleach import linkify | from bleach import linkify | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.utils.timezone import is_naive | from django.utils.timezone import is_naive | ||||||
| from django.utils.timezone import make_aware | from django.utils.timezone import make_aware | ||||||
|  | from gotenberg_client import GotenbergClient | ||||||
|  | from gotenberg_client.options import Margin | ||||||
|  | from gotenberg_client.options import PageSize | ||||||
|  | from gotenberg_client.options import PdfAFormat | ||||||
| from humanize import naturalsize | from humanize import naturalsize | ||||||
| from imap_tools import MailAttachment | from imap_tools import MailAttachment | ||||||
| from imap_tools import MailMessage | from imap_tools import MailMessage | ||||||
| @ -24,11 +28,22 @@ class MailDocumentParser(DocumentParser): | |||||||
|     Gotenberg and sends the html part to a Tika server for text extraction. |     Gotenberg and sends the html part to a Tika server for text extraction. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT |  | ||||||
|     tika_server = settings.TIKA_ENDPOINT |  | ||||||
| 
 |  | ||||||
|     logging_name = "paperless.parsing.mail" |     logging_name = "paperless.parsing.mail" | ||||||
| 
 | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]: | ||||||
|  |         """ | ||||||
|  |         Converts our requested PDF/A output into the Gotenberg API | ||||||
|  |         format | ||||||
|  |         """ | ||||||
|  |         if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||||
|  |             return PdfAFormat.A2b | ||||||
|  |         elif settings.OCR_OUTPUT_TYPE == "pdfa-1":  # pragma: no cover | ||||||
|  |             return PdfAFormat.A1a | ||||||
|  |         elif settings.OCR_OUTPUT_TYPE == "pdfa-3":  # pragma: no cover | ||||||
|  |             return PdfAFormat.A3b | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|     def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): |     def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): | ||||||
|         if not self.archive_path: |         if not self.archive_path: | ||||||
|             self.archive_path = self.generate_pdf( |             self.archive_path = self.generate_pdf( | ||||||
| @ -173,7 +188,7 @@ class MailDocumentParser(DocumentParser): | |||||||
|         self.log.info("Sending content to Tika server") |         self.log.info("Sending content to Tika server") | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             with TikaClient(tika_url=self.tika_server) as client: |             with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: | ||||||
|                 parsed = client.tika.as_text.from_buffer(html, "text/html") |                 parsed = client.tika.as_text.from_buffer(html, "text/html") | ||||||
| 
 | 
 | ||||||
|                 if parsed.content is not None: |                 if parsed.content is not None: | ||||||
| @ -182,7 +197,7 @@ class MailDocumentParser(DocumentParser): | |||||||
|         except Exception as err: |         except Exception as err: | ||||||
|             raise ParseError( |             raise ParseError( | ||||||
|                 f"Could not parse content with tika server at " |                 f"Could not parse content with tika server at " | ||||||
|                 f"{self.tika_server}: {err}", |                 f"{settings.TIKA_ENDPOINT}: {err}", | ||||||
|             ) from err |             ) from err | ||||||
| 
 | 
 | ||||||
|     def generate_pdf(self, mail_message: MailMessage) -> Path: |     def generate_pdf(self, mail_message: MailMessage) -> Path: | ||||||
| @ -195,45 +210,29 @@ class MailDocumentParser(DocumentParser): | |||||||
|         if not mail_message.html: |         if not mail_message.html: | ||||||
|             archive_path.write_bytes(mail_pdf_file.read_bytes()) |             archive_path.write_bytes(mail_pdf_file.read_bytes()) | ||||||
|         else: |         else: | ||||||
|             url_merge = self.gotenberg_server + "/forms/pdfengines/merge" |  | ||||||
| 
 |  | ||||||
|             pdf_of_html_content = self.generate_pdf_from_html( |             pdf_of_html_content = self.generate_pdf_from_html( | ||||||
|                 mail_message.html, |                 mail_message.html, | ||||||
|                 mail_message.attachments, |                 mail_message.attachments, | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|             pdf_collection = { |             with GotenbergClient( | ||||||
|                 "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"), |                 host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||||
|                 "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"), |                 timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||||
|             } |             ) as client, client.merge.merge() as route: | ||||||
|  |                 # Configure requested PDF/A formatting, if any | ||||||
|  |                 pdf_a_format = self._settings_to_gotenberg_pdfa() | ||||||
|  |                 if pdf_a_format is not None: | ||||||
|  |                     route.pdf_format(pdf_a_format) | ||||||
|  | 
 | ||||||
|  |                 route.merge([mail_pdf_file, pdf_of_html_content]) | ||||||
| 
 | 
 | ||||||
|                 try: |                 try: | ||||||
|                 # Open a handle to each file, replacing the tuple |                     response = route.run() | ||||||
|                 for filename in pdf_collection: |  | ||||||
|                     file_multi_part = pdf_collection[filename] |  | ||||||
|                     pdf_collection[filename] = ( |  | ||||||
|                         file_multi_part[0], |  | ||||||
|                         file_multi_part[1].open("rb"), |  | ||||||
|                         file_multi_part[2], |  | ||||||
|                     ) |  | ||||||
| 
 |  | ||||||
|                 response = httpx.post( |  | ||||||
|                     url_merge, |  | ||||||
|                     files=pdf_collection, |  | ||||||
|                     timeout=settings.CELERY_TASK_TIME_LIMIT, |  | ||||||
|                 ) |  | ||||||
|                 response.raise_for_status()  # ensure we notice bad responses |  | ||||||
| 
 |  | ||||||
|                     archive_path.write_bytes(response.content) |                     archive_path.write_bytes(response.content) | ||||||
| 
 |  | ||||||
|                 except Exception as err: |                 except Exception as err: | ||||||
|                     raise ParseError( |                     raise ParseError( | ||||||
|                         f"Error while merging email HTML into PDF: {err}", |                         f"Error while merging email HTML into PDF: {err}", | ||||||
|                     ) from err |                     ) from err | ||||||
|             finally: |  | ||||||
|                 for filename in pdf_collection: |  | ||||||
|                     file_multi_part_handle = pdf_collection[filename][1] |  | ||||||
|                     file_multi_part_handle.close() |  | ||||||
| 
 | 
 | ||||||
|         return archive_path |         return archive_path | ||||||
| 
 | 
 | ||||||
| @ -299,48 +298,29 @@ class MailDocumentParser(DocumentParser): | |||||||
|         Creates a PDF based on the given email, using the email's values in a |         Creates a PDF based on the given email, using the email's values in a | ||||||
|         an HTML template |         an HTML template | ||||||
|         """ |         """ | ||||||
|         url = self.gotenberg_server + "/forms/chromium/convert/html" |  | ||||||
|         self.log.info("Converting mail to PDF") |         self.log.info("Converting mail to PDF") | ||||||
| 
 | 
 | ||||||
|         css_file = Path(__file__).parent / "templates" / "output.css" |         css_file = Path(__file__).parent / "templates" / "output.css" | ||||||
|         email_html_file = self.mail_to_html(mail) |         email_html_file = self.mail_to_html(mail) | ||||||
| 
 | 
 | ||||||
|         with css_file.open("rb") as css_handle, email_html_file.open( |         with GotenbergClient( | ||||||
|             "rb", |             host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||||
|         ) as email_html_handle: |             timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||||
|             files = { |         ) as client, client.chromium.html_to_pdf() as route: | ||||||
|                 "html": ("index.html", email_html_handle, "text/html"), |             # Configure requested PDF/A formatting, if any | ||||||
|                 "css": ("output.css", css_handle, "text/css"), |             pdf_a_format = self._settings_to_gotenberg_pdfa() | ||||||
|             } |             if pdf_a_format is not None: | ||||||
|             headers = {} |                 route.pdf_format(pdf_a_format) | ||||||
|             data = { |  | ||||||
|                 "marginTop": "0.1", |  | ||||||
|                 "marginBottom": "0.1", |  | ||||||
|                 "marginLeft": "0.1", |  | ||||||
|                 "marginRight": "0.1", |  | ||||||
|                 "paperWidth": "8.27", |  | ||||||
|                 "paperHeight": "11.7", |  | ||||||
|                 "scale": "1.0", |  | ||||||
|             } |  | ||||||
| 
 |  | ||||||
|             # Set the output format of the resulting PDF |  | ||||||
|             # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno |  | ||||||
|             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: |  | ||||||
|                 data["pdfFormat"] = "PDF/A-2b" |  | ||||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": |  | ||||||
|                 data["pdfFormat"] = "PDF/A-1a" |  | ||||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": |  | ||||||
|                 data["pdfFormat"] = "PDF/A-3b" |  | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|                 response = httpx.post( |                 response = ( | ||||||
|                     url, |                     route.index(email_html_file) | ||||||
|                     files=files, |                     .resource(css_file) | ||||||
|                     headers=headers, |                     .margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)) | ||||||
|                     data=data, |                     .size(PageSize(height=11.7, width=8.27)) | ||||||
|                     timeout=settings.CELERY_TASK_TIME_LIMIT, |                     .scale(1.0) | ||||||
|  |                     .run() | ||||||
|                 ) |                 ) | ||||||
|                 response.raise_for_status()  # ensure we notice bad responses |  | ||||||
|             except Exception as err: |             except Exception as err: | ||||||
|                 raise ParseError( |                 raise ParseError( | ||||||
|                     f"Error while converting email to PDF: {err}", |                     f"Error while converting email to PDF: {err}", | ||||||
| @ -368,15 +348,25 @@ class MailDocumentParser(DocumentParser): | |||||||
|             text = compiled_close.sub("</div", text) |             text = compiled_close.sub("</div", text) | ||||||
|             return text |             return text | ||||||
| 
 | 
 | ||||||
|         url = self.gotenberg_server + "/forms/chromium/convert/html" |  | ||||||
|         self.log.info("Converting html to PDF") |         self.log.info("Converting html to PDF") | ||||||
| 
 | 
 | ||||||
|         tempdir = Path(self.tempdir) |         tempdir = Path(self.tempdir) | ||||||
| 
 | 
 | ||||||
|         html_clean = clean_html_script(orig_html) |         html_clean = clean_html_script(orig_html) | ||||||
|  |         html_clean_file = tempdir / "index.html" | ||||||
|  |         html_clean_file.write_text(html_clean) | ||||||
| 
 | 
 | ||||||
|         files = {} |         with GotenbergClient( | ||||||
|  |             host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||||
|  |             timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||||
|  |         ) as client, client.chromium.html_to_pdf() as route: | ||||||
|  |             # Configure requested PDF/A formatting, if any | ||||||
|  |             pdf_a_format = self._settings_to_gotenberg_pdfa() | ||||||
|  |             if pdf_a_format is not None: | ||||||
|  |                 route.pdf_format(pdf_a_format) | ||||||
| 
 | 
 | ||||||
|  |             # Add attachments as resources, cleaning the filename and replacing | ||||||
|  |             # it in the index file for inclusion | ||||||
|             for attachment in attachments: |             for attachment in attachments: | ||||||
|                 # Clean the attachment name to be valid |                 # Clean the attachment name to be valid | ||||||
|                 name_cid = f"cid:{attachment.content_id}" |                 name_cid = f"cid:{attachment.content_id}" | ||||||
| @ -386,8 +376,7 @@ class MailDocumentParser(DocumentParser): | |||||||
|                 temp_file = tempdir / name_clean |                 temp_file = tempdir / name_clean | ||||||
|                 temp_file.write_bytes(attachment.payload) |                 temp_file.write_bytes(attachment.payload) | ||||||
| 
 | 
 | ||||||
|             # Store the attachment for upload |                 route.resource(temp_file) | ||||||
|             files[name_clean] = (name_clean, temp_file, attachment.content_type) |  | ||||||
| 
 | 
 | ||||||
|                 # Replace as needed the name with the clean name |                 # Replace as needed the name with the clean name | ||||||
|                 html_clean = html_clean.replace(name_cid, name_clean) |                 html_clean = html_clean.replace(name_cid, name_clean) | ||||||
| @ -395,42 +384,21 @@ class MailDocumentParser(DocumentParser): | |||||||
|             # Now store the cleaned up HTML version |             # Now store the cleaned up HTML version | ||||||
|             html_clean_file = tempdir / "index.html" |             html_clean_file = tempdir / "index.html" | ||||||
|             html_clean_file.write_text(html_clean) |             html_clean_file.write_text(html_clean) | ||||||
|  |             # This is our index file, the main page basically | ||||||
|  |             route.index(html_clean_file) | ||||||
| 
 | 
 | ||||||
|         files["index.html"] = ("index.html", html_clean_file, "text/html") |             # Set page size, margins | ||||||
|  |             route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size( | ||||||
|  |                 PageSize(height=11.7, width=8.27), | ||||||
|  |             ).scale(1.0) | ||||||
| 
 | 
 | ||||||
|         data = { |  | ||||||
|             "marginTop": "0.1", |  | ||||||
|             "marginBottom": "0.1", |  | ||||||
|             "marginLeft": "0.1", |  | ||||||
|             "marginRight": "0.1", |  | ||||||
|             "paperWidth": "8.27", |  | ||||||
|             "paperHeight": "11.7", |  | ||||||
|             "scale": "1.0", |  | ||||||
|         } |  | ||||||
|             try: |             try: | ||||||
|             # Open a handle to each file, replacing the tuple |                 response = route.run() | ||||||
|             for filename in files: |  | ||||||
|                 file_multi_part = files[filename] |  | ||||||
|                 files[filename] = ( |  | ||||||
|                     file_multi_part[0], |  | ||||||
|                     file_multi_part[1].open("rb"), |  | ||||||
|                     file_multi_part[2], |  | ||||||
|                 ) |  | ||||||
| 
 | 
 | ||||||
|             response = httpx.post( |  | ||||||
|                 url, |  | ||||||
|                 files=files, |  | ||||||
|                 data=data, |  | ||||||
|                 timeout=settings.CELERY_TASK_TIME_LIMIT, |  | ||||||
|             ) |  | ||||||
|             response.raise_for_status()  # ensure we notice bad responses |  | ||||||
|             except Exception as err: |             except Exception as err: | ||||||
|             raise ParseError(f"Error while converting document to PDF: {err}") from err |                 raise ParseError( | ||||||
|         finally: |                     f"Error while converting document to PDF: {err}", | ||||||
|             # Ensure all file handles as closed |                 ) from err | ||||||
|             for filename in files: |  | ||||||
|                 file_multi_part_handle = files[filename][1] |  | ||||||
|                 file_multi_part_handle.close() |  | ||||||
| 
 | 
 | ||||||
|         html_pdf = tempdir / "html.pdf" |         html_pdf = tempdir / "html.pdf" | ||||||
|         html_pdf.write_bytes(response.content) |         html_pdf.write_bytes(response.content) | ||||||
|  | |||||||
| @ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): | |||||||
|         ) |         ) | ||||||
|         parsed = self.parser.tika_parse(html) |         parsed = self.parser.tika_parse(html) | ||||||
|         self.assertEqual(expected_text, parsed.strip()) |         self.assertEqual(expected_text, parsed.strip()) | ||||||
|         self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url)) |         self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url)) | ||||||
| 
 | 
 | ||||||
|     def test_tika_parse_exception(self): |     def test_tika_parse_exception(self): | ||||||
|         """ |         """ | ||||||
| @ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) | |||||||
| 
 | 
 | ||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
|             str(request.url), |             str(request.url), | ||||||
|             self.parser.gotenberg_server + "/forms/chromium/convert/html", |             "http://localhost:3000/forms/chromium/convert/html", | ||||||
|         ) |         ) | ||||||
|  | |||||||
| @ -1,11 +1,14 @@ | |||||||
| import os | import os | ||||||
|  | import shutil | ||||||
|  | import subprocess | ||||||
|  | import tempfile | ||||||
|  | from pathlib import Path | ||||||
| from unittest import mock | from unittest import mock | ||||||
| 
 | 
 | ||||||
| import httpx | import httpx | ||||||
| import pytest | import pytest | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from imagehash import average_hash | from imagehash import average_hash | ||||||
| from pdfminer.high_level import extract_text |  | ||||||
| from PIL import Image | from PIL import Image | ||||||
| 
 | 
 | ||||||
| from documents.tests.utils import FileSystemAssertsMixin | from documents.tests.utils import FileSystemAssertsMixin | ||||||
| @ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff | |||||||
| from paperless_mail.tests.test_parsers import BaseMailParserTestCase | from paperless_mail.tests.test_parsers import BaseMailParserTestCase | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def extract_text(pdf_path: Path) -> str: | ||||||
|  |     """ | ||||||
|  |     Using pdftotext from poppler, extracts the text of a PDF into a file, | ||||||
|  |     then reads the file contents and returns it | ||||||
|  |     """ | ||||||
|  |     with tempfile.NamedTemporaryFile( | ||||||
|  |         mode="w+", | ||||||
|  |     ) as tmp: | ||||||
|  |         subprocess.run( | ||||||
|  |             [ | ||||||
|  |                 shutil.which("pdftotext"), | ||||||
|  |                 "-q", | ||||||
|  |                 "-layout", | ||||||
|  |                 "-enc", | ||||||
|  |                 "UTF-8", | ||||||
|  |                 str(pdf_path), | ||||||
|  |                 tmp.name, | ||||||
|  |             ], | ||||||
|  |             check=True, | ||||||
|  |         ) | ||||||
|  |         return tmp.read() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class MailAttachmentMock: | class MailAttachmentMock: | ||||||
|     def __init__(self, payload, content_id): |     def __init__(self, payload, content_id): | ||||||
|         self.payload = payload |         self.payload = payload | ||||||
| @ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): | |||||||
| 
 | 
 | ||||||
|         extracted = extract_text(pdf_path) |         extracted = extract_text(pdf_path) | ||||||
|         expected = ( |         expected = ( | ||||||
|             "first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c" |             "first   PDF   to   be   merged.\n\x0csecond PDF   to   be   merged.\n\x0c" | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(expected, extracted) |         self.assertEqual(expected, extracted) | ||||||
|  | |||||||
| @ -1,9 +1,10 @@ | |||||||
| import os |  | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| 
 | 
 | ||||||
| import httpx | import httpx | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  | from gotenberg_client import GotenbergClient | ||||||
|  | from gotenberg_client.options import PdfAFormat | ||||||
| from tika_client import TikaClient | from tika_client import TikaClient | ||||||
| 
 | 
 | ||||||
| from documents.parsers import DocumentParser | from documents.parsers import DocumentParser | ||||||
| @ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser): | |||||||
| 
 | 
 | ||||||
|         self.archive_path = self.convert_to_pdf(document_path, file_name) |         self.archive_path = self.convert_to_pdf(document_path, file_name) | ||||||
| 
 | 
 | ||||||
|     def convert_to_pdf(self, document_path, file_name): |     def convert_to_pdf(self, document_path: Path, file_name): | ||||||
|         pdf_path = os.path.join(self.tempdir, "convert.pdf") |         pdf_path = Path(self.tempdir) / "convert.pdf" | ||||||
|         gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT |  | ||||||
|         url = gotenberg_server + "/forms/libreoffice/convert" |  | ||||||
| 
 | 
 | ||||||
|         self.log.info(f"Converting {document_path} to PDF as {pdf_path}") |         self.log.info(f"Converting {document_path} to PDF as {pdf_path}") | ||||||
|         with open(document_path, "rb") as document_handle: |  | ||||||
|             files = { |  | ||||||
|                 "files": ( |  | ||||||
|                     "convert" + os.path.splitext(document_path)[-1], |  | ||||||
|                     document_handle, |  | ||||||
|                 ), |  | ||||||
|             } |  | ||||||
|             headers = {} |  | ||||||
|             data = {} |  | ||||||
| 
 | 
 | ||||||
|  |         with GotenbergClient( | ||||||
|  |             host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||||
|  |             timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||||
|  |         ) as client, client.libre_office.to_pdf() as route: | ||||||
|             # Set the output format of the resulting PDF |             # Set the output format of the resulting PDF | ||||||
|             # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno |  | ||||||
|             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: |             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||||
|                 data["pdfFormat"] = "PDF/A-2b" |                 route.pdf_format(PdfAFormat.A2b) | ||||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": |             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": | ||||||
|                 data["pdfFormat"] = "PDF/A-1a" |                 route.pdf_format(PdfAFormat.A1a) | ||||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": |             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": | ||||||
|                 data["pdfFormat"] = "PDF/A-3b" |                 route.pdf_format(PdfAFormat.A3b) | ||||||
|  | 
 | ||||||
|  |             route.convert(document_path) | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|                 response = httpx.post( |                 response = route.run() | ||||||
|                     url, | 
 | ||||||
|                     files=files, |                 pdf_path.write_bytes(response.content) | ||||||
|                     headers=headers, | 
 | ||||||
|                     data=data, |                 return pdf_path | ||||||
|                     timeout=settings.CELERY_TASK_TIME_LIMIT, | 
 | ||||||
|                 ) |  | ||||||
|                 response.raise_for_status()  # ensure we notice bad responses |  | ||||||
|             except Exception as err: |             except Exception as err: | ||||||
|                 raise ParseError( |                 raise ParseError( | ||||||
|                     f"Error while converting document to PDF: {err}", |                     f"Error while converting document to PDF: {err}", | ||||||
|                 ) from err |                 ) from err | ||||||
| 
 |  | ||||||
|         with open(pdf_path, "wb") as file: |  | ||||||
|             file.write(response.content) |  | ||||||
|             file.close() |  | ||||||
| 
 |  | ||||||
|         return pdf_path |  | ||||||
|  | |||||||
| @ -2,12 +2,11 @@ import datetime | |||||||
| import os | import os | ||||||
| import zoneinfo | import zoneinfo | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from unittest import mock |  | ||||||
| 
 | 
 | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from django.test import override_settings | from django.test import override_settings | ||||||
| from httpx import Request | from httpx import codes | ||||||
| from httpx import Response | from httpx._multipart import DataField | ||||||
| from rest_framework import status | from rest_framework import status | ||||||
| 
 | 
 | ||||||
| from documents.parsers import ParseError | from documents.parsers import ParseError | ||||||
| @ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase): | |||||||
|         with self.assertRaises(ParseError): |         with self.assertRaises(ParseError): | ||||||
|             self.parser.convert_to_pdf(file, None) |             self.parser.convert_to_pdf(file, None) | ||||||
| 
 | 
 | ||||||
|     @mock.patch("paperless_tika.parsers.httpx.post") |     def test_request_pdf_a_format(self): | ||||||
|     def test_request_pdf_a_format(self, post: mock.Mock): |  | ||||||
|         """ |         """ | ||||||
|         GIVEN: |         GIVEN: | ||||||
|             - Document needs to be converted to PDF |             - Document needs to be converted to PDF | ||||||
| @ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase): | |||||||
|         file = Path(os.path.join(self.parser.tempdir, "input.odt")) |         file = Path(os.path.join(self.parser.tempdir, "input.odt")) | ||||||
|         file.touch() |         file.touch() | ||||||
| 
 | 
 | ||||||
|         response = Response(status_code=status.HTTP_200_OK) |  | ||||||
|         response.request = Request("POST", "/somewhere/") |  | ||||||
|         post.return_value = response |  | ||||||
| 
 |  | ||||||
|         for setting, expected_key in [ |         for setting, expected_key in [ | ||||||
|             ("pdfa", "PDF/A-2b"), |             ("pdfa", "PDF/A-2b"), | ||||||
|             ("pdfa-2", "PDF/A-2b"), |             ("pdfa-2", "PDF/A-2b"), | ||||||
| @ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase): | |||||||
|             ("pdfa-3", "PDF/A-3b"), |             ("pdfa-3", "PDF/A-3b"), | ||||||
|         ]: |         ]: | ||||||
|             with override_settings(OCR_OUTPUT_TYPE=setting): |             with override_settings(OCR_OUTPUT_TYPE=setting): | ||||||
|  |                 self.httpx_mock.add_response( | ||||||
|  |                     status_code=codes.OK, | ||||||
|  |                     content=b"PDF document", | ||||||
|  |                     method="POST", | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|                 self.parser.convert_to_pdf(file, None) |                 self.parser.convert_to_pdf(file, None) | ||||||
| 
 | 
 | ||||||
|                 post.assert_called_once() |                 request = self.httpx_mock.get_request() | ||||||
|                 _, kwargs = post.call_args |                 found = False | ||||||
|  |                 for field in request.stream.fields: | ||||||
|  |                     if isinstance(field, DataField) and field.name == "pdfFormat": | ||||||
|  |                         self.assertEqual(field.value, expected_key) | ||||||
|  |                         found = True | ||||||
|  |                 self.assertTrue(found) | ||||||
| 
 | 
 | ||||||
|                 self.assertEqual(kwargs["data"]["pdfFormat"], expected_key) |                 self.httpx_mock.reset(assert_all_responses_were_requested=False) | ||||||
| 
 |  | ||||||
|                 post.reset_mock() |  | ||||||
|  | |||||||
| @ -7,7 +7,7 @@ max-line-length = 88 | |||||||
| 
 | 
 | ||||||
| [tool:pytest] | [tool:pytest] | ||||||
| DJANGO_SETTINGS_MODULE=paperless.settings | DJANGO_SETTINGS_MODULE=paperless.settings | ||||||
| addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet --durations=50 | addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 | ||||||
| env = | env = | ||||||
|   PAPERLESS_DISABLE_DBHANDLER=true |   PAPERLESS_DISABLE_DBHANDLER=true | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user