mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	Feature: Switches to a new client to handle communication with Gotenberg (#4391)
Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
This commit is contained in:
		
							parent
							
								
									5f0eba694c
								
							
						
					
					
						commit
						999ae678c2
					
				
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @ -51,6 +51,7 @@ flower = "*" | ||||
| bleach = "*" | ||||
| zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} | ||||
| django-multiselectfield = "*" | ||||
| gotenberg-client = "*" | ||||
| 
 | ||||
| [dev-packages] | ||||
| # Linting | ||||
|  | ||||
							
								
								
									
										39
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										39
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							| @ -1,7 +1,7 @@ | ||||
| { | ||||
|     "_meta": { | ||||
|         "hash": { | ||||
|             "sha256": "3025da2940433d347b2fd2ac222852c21f4aa73eeefbd1ee9152cbfd7a7a48e9" | ||||
|             "sha256": "505bd6b18d31ed64988ef307c12a5acb70f611cafd932a391e985a11bbbc8000" | ||||
|         }, | ||||
|         "pipfile-spec": 6, | ||||
|         "requires": {}, | ||||
| @ -539,6 +539,15 @@ | ||||
|             "markers": "python_version >= '3.7'", | ||||
|             "version": "==2.0.1" | ||||
|         }, | ||||
|         "gotenberg-client": { | ||||
|             "hashes": [ | ||||
|                 "sha256:4508ecb913ef2d553dd2ceb78e32cee001000ba08c910ba1f9ace38350d1589e", | ||||
|                 "sha256:7a3f8a02caee768391373b3610c6ec25a853cccf391ed6b5d5a1292c3ed15e7e" | ||||
|             ], | ||||
|             "index": "pypi", | ||||
|             "markers": "python_version >= '3.8'", | ||||
|             "version": "==0.3.0" | ||||
|         }, | ||||
|         "gunicorn": { | ||||
|             "hashes": [ | ||||
|                 "sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0", | ||||
| @ -556,6 +565,13 @@ | ||||
|             "markers": "python_version >= '3.7'", | ||||
|             "version": "==0.14.0" | ||||
|         }, | ||||
|         "h2": { | ||||
|             "hashes": [ | ||||
|                 "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d", | ||||
|                 "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb" | ||||
|             ], | ||||
|             "version": "==4.1.0" | ||||
|         }, | ||||
|         "hiredis": { | ||||
|             "hashes": [ | ||||
|                 "sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240", | ||||
| @ -650,6 +666,14 @@ | ||||
|             ], | ||||
|             "version": "==2.2.3" | ||||
|         }, | ||||
|         "hpack": { | ||||
|             "hashes": [ | ||||
|                 "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", | ||||
|                 "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095" | ||||
|             ], | ||||
|             "markers": "python_full_version >= '3.6.1'", | ||||
|             "version": "==4.0.0" | ||||
|         }, | ||||
|         "httpcore": { | ||||
|             "hashes": [ | ||||
|                 "sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9", | ||||
| @ -699,6 +723,9 @@ | ||||
|             "version": "==0.6.0" | ||||
|         }, | ||||
|         "httpx": { | ||||
|             "extras": [ | ||||
|                 "http2" | ||||
|             ], | ||||
|             "hashes": [ | ||||
|                 "sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100", | ||||
|                 "sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875" | ||||
| @ -714,6 +741,14 @@ | ||||
|             "markers": "python_version >= '3.8'", | ||||
|             "version": "==4.8.0" | ||||
|         }, | ||||
|         "hyperframe": { | ||||
|             "hashes": [ | ||||
|                 "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15", | ||||
|                 "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914" | ||||
|             ], | ||||
|             "markers": "python_full_version >= '3.6.1'", | ||||
|             "version": "==6.0.1" | ||||
|         }, | ||||
|         "idna": { | ||||
|             "hashes": [ | ||||
|                 "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", | ||||
| @ -1782,7 +1817,7 @@ | ||||
|                 "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", | ||||
|                 "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" | ||||
|             ], | ||||
|             "markers": "python_version < '3.11'", | ||||
|             "markers": "python_version < '3.10'", | ||||
|             "version": "==4.8.0" | ||||
|         }, | ||||
|         "tzdata": { | ||||
|  | ||||
| @ -1,13 +1,17 @@ | ||||
| import re | ||||
| from html import escape | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import httpx | ||||
| from bleach import clean | ||||
| from bleach import linkify | ||||
| from django.conf import settings | ||||
| from django.utils.timezone import is_naive | ||||
| from django.utils.timezone import make_aware | ||||
| from gotenberg_client import GotenbergClient | ||||
| from gotenberg_client.options import Margin | ||||
| from gotenberg_client.options import PageSize | ||||
| from gotenberg_client.options import PdfAFormat | ||||
| from humanize import naturalsize | ||||
| from imap_tools import MailAttachment | ||||
| from imap_tools import MailMessage | ||||
| @ -24,11 +28,22 @@ class MailDocumentParser(DocumentParser): | ||||
|     Gotenberg and sends the html part to a Tika server for text extraction. | ||||
|     """ | ||||
| 
 | ||||
|     gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT | ||||
|     tika_server = settings.TIKA_ENDPOINT | ||||
| 
 | ||||
|     logging_name = "paperless.parsing.mail" | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]: | ||||
|         """ | ||||
|         Converts our requested PDF/A output into the Gotenberg API | ||||
|         format | ||||
|         """ | ||||
|         if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||
|             return PdfAFormat.A2b | ||||
|         elif settings.OCR_OUTPUT_TYPE == "pdfa-1":  # pragma: no cover | ||||
|             return PdfAFormat.A1a | ||||
|         elif settings.OCR_OUTPUT_TYPE == "pdfa-3":  # pragma: no cover | ||||
|             return PdfAFormat.A3b | ||||
|         return None | ||||
| 
 | ||||
|     def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): | ||||
|         if not self.archive_path: | ||||
|             self.archive_path = self.generate_pdf( | ||||
| @ -173,7 +188,7 @@ class MailDocumentParser(DocumentParser): | ||||
|         self.log.info("Sending content to Tika server") | ||||
| 
 | ||||
|         try: | ||||
|             with TikaClient(tika_url=self.tika_server) as client: | ||||
|             with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: | ||||
|                 parsed = client.tika.as_text.from_buffer(html, "text/html") | ||||
| 
 | ||||
|                 if parsed.content is not None: | ||||
| @ -182,7 +197,7 @@ class MailDocumentParser(DocumentParser): | ||||
|         except Exception as err: | ||||
|             raise ParseError( | ||||
|                 f"Could not parse content with tika server at " | ||||
|                 f"{self.tika_server}: {err}", | ||||
|                 f"{settings.TIKA_ENDPOINT}: {err}", | ||||
|             ) from err | ||||
| 
 | ||||
|     def generate_pdf(self, mail_message: MailMessage) -> Path: | ||||
| @ -195,45 +210,29 @@ class MailDocumentParser(DocumentParser): | ||||
|         if not mail_message.html: | ||||
|             archive_path.write_bytes(mail_pdf_file.read_bytes()) | ||||
|         else: | ||||
|             url_merge = self.gotenberg_server + "/forms/pdfengines/merge" | ||||
| 
 | ||||
|             pdf_of_html_content = self.generate_pdf_from_html( | ||||
|                 mail_message.html, | ||||
|                 mail_message.attachments, | ||||
|             ) | ||||
| 
 | ||||
|             pdf_collection = { | ||||
|                 "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"), | ||||
|                 "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"), | ||||
|             } | ||||
|             with GotenbergClient( | ||||
|                 host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||
|                 timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|             ) as client, client.merge.merge() as route: | ||||
|                 # Configure requested PDF/A formatting, if any | ||||
|                 pdf_a_format = self._settings_to_gotenberg_pdfa() | ||||
|                 if pdf_a_format is not None: | ||||
|                     route.pdf_format(pdf_a_format) | ||||
| 
 | ||||
|                 route.merge([mail_pdf_file, pdf_of_html_content]) | ||||
| 
 | ||||
|                 try: | ||||
|                 # Open a handle to each file, replacing the tuple | ||||
|                 for filename in pdf_collection: | ||||
|                     file_multi_part = pdf_collection[filename] | ||||
|                     pdf_collection[filename] = ( | ||||
|                         file_multi_part[0], | ||||
|                         file_multi_part[1].open("rb"), | ||||
|                         file_multi_part[2], | ||||
|                     ) | ||||
| 
 | ||||
|                 response = httpx.post( | ||||
|                     url_merge, | ||||
|                     files=pdf_collection, | ||||
|                     timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|                 ) | ||||
|                 response.raise_for_status()  # ensure we notice bad responses | ||||
| 
 | ||||
|                     response = route.run() | ||||
|                     archive_path.write_bytes(response.content) | ||||
| 
 | ||||
|                 except Exception as err: | ||||
|                     raise ParseError( | ||||
|                         f"Error while merging email HTML into PDF: {err}", | ||||
|                     ) from err | ||||
|             finally: | ||||
|                 for filename in pdf_collection: | ||||
|                     file_multi_part_handle = pdf_collection[filename][1] | ||||
|                     file_multi_part_handle.close() | ||||
| 
 | ||||
|         return archive_path | ||||
| 
 | ||||
| @ -299,48 +298,29 @@ class MailDocumentParser(DocumentParser): | ||||
|         Creates a PDF based on the given email, using the email's values in | ||||
|         an HTML template | ||||
|         """ | ||||
|         url = self.gotenberg_server + "/forms/chromium/convert/html" | ||||
|         self.log.info("Converting mail to PDF") | ||||
| 
 | ||||
|         css_file = Path(__file__).parent / "templates" / "output.css" | ||||
|         email_html_file = self.mail_to_html(mail) | ||||
| 
 | ||||
|         with css_file.open("rb") as css_handle, email_html_file.open( | ||||
|             "rb", | ||||
|         ) as email_html_handle: | ||||
|             files = { | ||||
|                 "html": ("index.html", email_html_handle, "text/html"), | ||||
|                 "css": ("output.css", css_handle, "text/css"), | ||||
|             } | ||||
|             headers = {} | ||||
|             data = { | ||||
|                 "marginTop": "0.1", | ||||
|                 "marginBottom": "0.1", | ||||
|                 "marginLeft": "0.1", | ||||
|                 "marginRight": "0.1", | ||||
|                 "paperWidth": "8.27", | ||||
|                 "paperHeight": "11.7", | ||||
|                 "scale": "1.0", | ||||
|             } | ||||
| 
 | ||||
|             # Set the output format of the resulting PDF | ||||
|             # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno | ||||
|             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||
|                 data["pdfFormat"] = "PDF/A-2b" | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": | ||||
|                 data["pdfFormat"] = "PDF/A-1a" | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": | ||||
|                 data["pdfFormat"] = "PDF/A-3b" | ||||
|         with GotenbergClient( | ||||
|             host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||
|             timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|         ) as client, client.chromium.html_to_pdf() as route: | ||||
|             # Configure requested PDF/A formatting, if any | ||||
|             pdf_a_format = self._settings_to_gotenberg_pdfa() | ||||
|             if pdf_a_format is not None: | ||||
|                 route.pdf_format(pdf_a_format) | ||||
| 
 | ||||
|             try: | ||||
|                 response = httpx.post( | ||||
|                     url, | ||||
|                     files=files, | ||||
|                     headers=headers, | ||||
|                     data=data, | ||||
|                     timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|                 response = ( | ||||
|                     route.index(email_html_file) | ||||
|                     .resource(css_file) | ||||
|                     .margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)) | ||||
|                     .size(PageSize(height=11.7, width=8.27)) | ||||
|                     .scale(1.0) | ||||
|                     .run() | ||||
|                 ) | ||||
|                 response.raise_for_status()  # ensure we notice bad responses | ||||
|             except Exception as err: | ||||
|                 raise ParseError( | ||||
|                     f"Error while converting email to PDF: {err}", | ||||
| @ -368,15 +348,25 @@ class MailDocumentParser(DocumentParser): | ||||
|             text = compiled_close.sub("</div", text) | ||||
|             return text | ||||
| 
 | ||||
|         url = self.gotenberg_server + "/forms/chromium/convert/html" | ||||
|         self.log.info("Converting html to PDF") | ||||
| 
 | ||||
|         tempdir = Path(self.tempdir) | ||||
| 
 | ||||
|         html_clean = clean_html_script(orig_html) | ||||
|         html_clean_file = tempdir / "index.html" | ||||
|         html_clean_file.write_text(html_clean) | ||||
| 
 | ||||
|         files = {} | ||||
|         with GotenbergClient( | ||||
|             host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||
|             timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|         ) as client, client.chromium.html_to_pdf() as route: | ||||
|             # Configure requested PDF/A formatting, if any | ||||
|             pdf_a_format = self._settings_to_gotenberg_pdfa() | ||||
|             if pdf_a_format is not None: | ||||
|                 route.pdf_format(pdf_a_format) | ||||
| 
 | ||||
|             # Add attachments as resources, cleaning the filename and replacing | ||||
|             # it in the index file for inclusion | ||||
|             for attachment in attachments: | ||||
|                 # Clean the attachment name to be valid | ||||
|                 name_cid = f"cid:{attachment.content_id}" | ||||
| @ -386,8 +376,7 @@ class MailDocumentParser(DocumentParser): | ||||
|                 temp_file = tempdir / name_clean | ||||
|                 temp_file.write_bytes(attachment.payload) | ||||
| 
 | ||||
|             # Store the attachment for upload | ||||
|             files[name_clean] = (name_clean, temp_file, attachment.content_type) | ||||
|                 route.resource(temp_file) | ||||
| 
 | ||||
|                 # Replace the cid reference with the clean name wherever it appears | ||||
|                 html_clean = html_clean.replace(name_cid, name_clean) | ||||
| @ -395,42 +384,21 @@ class MailDocumentParser(DocumentParser): | ||||
|             # Now store the cleaned up HTML version | ||||
|             html_clean_file = tempdir / "index.html" | ||||
|             html_clean_file.write_text(html_clean) | ||||
|             # This is our index file, the main page basically | ||||
|             route.index(html_clean_file) | ||||
| 
 | ||||
|         files["index.html"] = ("index.html", html_clean_file, "text/html") | ||||
|             # Set page size, margins | ||||
|             route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size( | ||||
|                 PageSize(height=11.7, width=8.27), | ||||
|             ).scale(1.0) | ||||
| 
 | ||||
|         data = { | ||||
|             "marginTop": "0.1", | ||||
|             "marginBottom": "0.1", | ||||
|             "marginLeft": "0.1", | ||||
|             "marginRight": "0.1", | ||||
|             "paperWidth": "8.27", | ||||
|             "paperHeight": "11.7", | ||||
|             "scale": "1.0", | ||||
|         } | ||||
|             try: | ||||
|             # Open a handle to each file, replacing the tuple | ||||
|             for filename in files: | ||||
|                 file_multi_part = files[filename] | ||||
|                 files[filename] = ( | ||||
|                     file_multi_part[0], | ||||
|                     file_multi_part[1].open("rb"), | ||||
|                     file_multi_part[2], | ||||
|                 ) | ||||
|                 response = route.run() | ||||
| 
 | ||||
|             response = httpx.post( | ||||
|                 url, | ||||
|                 files=files, | ||||
|                 data=data, | ||||
|                 timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|             ) | ||||
|             response.raise_for_status()  # ensure we notice bad responses | ||||
|             except Exception as err: | ||||
|             raise ParseError(f"Error while converting document to PDF: {err}") from err | ||||
|         finally: | ||||
|             # Ensure all file handles are closed | ||||
|             for filename in files: | ||||
|                 file_multi_part_handle = files[filename][1] | ||||
|                 file_multi_part_handle.close() | ||||
|                 raise ParseError( | ||||
|                     f"Error while converting document to PDF: {err}", | ||||
|                 ) from err | ||||
| 
 | ||||
|         html_pdf = tempdir / "html.pdf" | ||||
|         html_pdf.write_bytes(response.content) | ||||
|  | ||||
| @ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): | ||||
|         ) | ||||
|         parsed = self.parser.tika_parse(html) | ||||
|         self.assertEqual(expected_text, parsed.strip()) | ||||
|         self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url)) | ||||
|         self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url)) | ||||
| 
 | ||||
|     def test_tika_parse_exception(self): | ||||
|         """ | ||||
| @ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) | ||||
| 
 | ||||
|         self.assertEqual( | ||||
|             str(request.url), | ||||
|             self.parser.gotenberg_server + "/forms/chromium/convert/html", | ||||
|             "http://localhost:3000/forms/chromium/convert/html", | ||||
|         ) | ||||
|  | ||||
| @ -1,11 +1,14 @@ | ||||
| import os | ||||
| import shutil | ||||
| import subprocess | ||||
| import tempfile | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
| 
 | ||||
| import httpx | ||||
| import pytest | ||||
| from django.test import TestCase | ||||
| from imagehash import average_hash | ||||
| from pdfminer.high_level import extract_text | ||||
| from PIL import Image | ||||
| 
 | ||||
| from documents.tests.utils import FileSystemAssertsMixin | ||||
| @ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff | ||||
| from paperless_mail.tests.test_parsers import BaseMailParserTestCase | ||||
| 
 | ||||
| 
 | ||||
| def extract_text(pdf_path: Path) -> str: | ||||
|     """ | ||||
|     Using pdftotext from poppler, extracts the text of a PDF into a file, | ||||
|     then reads the file contents and returns it | ||||
|     """ | ||||
|     with tempfile.NamedTemporaryFile( | ||||
|         mode="w+", | ||||
|     ) as tmp: | ||||
|         subprocess.run( | ||||
|             [ | ||||
|                 shutil.which("pdftotext"), | ||||
|                 "-q", | ||||
|                 "-layout", | ||||
|                 "-enc", | ||||
|                 "UTF-8", | ||||
|                 str(pdf_path), | ||||
|                 tmp.name, | ||||
|             ], | ||||
|             check=True, | ||||
|         ) | ||||
|         return tmp.read() | ||||
| 
 | ||||
| 
 | ||||
| class MailAttachmentMock: | ||||
|     def __init__(self, payload, content_id): | ||||
|         self.payload = payload | ||||
| @ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): | ||||
| 
 | ||||
|         extracted = extract_text(pdf_path) | ||||
|         expected = ( | ||||
|             "first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c" | ||||
|             "first   PDF   to   be   merged.\n\x0csecond PDF   to   be   merged.\n\x0c" | ||||
|         ) | ||||
| 
 | ||||
|         self.assertEqual(expected, extracted) | ||||
|  | ||||
| @ -1,9 +1,10 @@ | ||||
| import os | ||||
| from pathlib import Path | ||||
| 
 | ||||
| import httpx | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from gotenberg_client import GotenbergClient | ||||
| from gotenberg_client.options import PdfAFormat | ||||
| from tika_client import TikaClient | ||||
| 
 | ||||
| from documents.parsers import DocumentParser | ||||
| @ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser): | ||||
| 
 | ||||
|         self.archive_path = self.convert_to_pdf(document_path, file_name) | ||||
| 
 | ||||
|     def convert_to_pdf(self, document_path, file_name): | ||||
|         pdf_path = os.path.join(self.tempdir, "convert.pdf") | ||||
|         gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT | ||||
|         url = gotenberg_server + "/forms/libreoffice/convert" | ||||
|     def convert_to_pdf(self, document_path: Path, file_name): | ||||
|         pdf_path = Path(self.tempdir) / "convert.pdf" | ||||
| 
 | ||||
|         self.log.info(f"Converting {document_path} to PDF as {pdf_path}") | ||||
|         with open(document_path, "rb") as document_handle: | ||||
|             files = { | ||||
|                 "files": ( | ||||
|                     "convert" + os.path.splitext(document_path)[-1], | ||||
|                     document_handle, | ||||
|                 ), | ||||
|             } | ||||
|             headers = {} | ||||
|             data = {} | ||||
| 
 | ||||
|         with GotenbergClient( | ||||
|             host=settings.TIKA_GOTENBERG_ENDPOINT, | ||||
|             timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|         ) as client, client.libre_office.to_pdf() as route: | ||||
|             # Set the output format of the resulting PDF | ||||
|             # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno | ||||
|             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||
|                 data["pdfFormat"] = "PDF/A-2b" | ||||
|                 route.pdf_format(PdfAFormat.A2b) | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": | ||||
|                 data["pdfFormat"] = "PDF/A-1a" | ||||
|                 route.pdf_format(PdfAFormat.A1a) | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": | ||||
|                 data["pdfFormat"] = "PDF/A-3b" | ||||
|                 route.pdf_format(PdfAFormat.A3b) | ||||
| 
 | ||||
|             route.convert(document_path) | ||||
| 
 | ||||
|             try: | ||||
|                 response = httpx.post( | ||||
|                     url, | ||||
|                     files=files, | ||||
|                     headers=headers, | ||||
|                     data=data, | ||||
|                     timeout=settings.CELERY_TASK_TIME_LIMIT, | ||||
|                 ) | ||||
|                 response.raise_for_status()  # ensure we notice bad responses | ||||
|                 response = route.run() | ||||
| 
 | ||||
|                 pdf_path.write_bytes(response.content) | ||||
| 
 | ||||
|                 return pdf_path | ||||
| 
 | ||||
|             except Exception as err: | ||||
|                 raise ParseError( | ||||
|                     f"Error while converting document to PDF: {err}", | ||||
|                 ) from err | ||||
| 
 | ||||
|         with open(pdf_path, "wb") as file: | ||||
|             file.write(response.content) | ||||
|             file.close() | ||||
| 
 | ||||
|         return pdf_path | ||||
|  | ||||
| @ -2,12 +2,11 @@ import datetime | ||||
| import os | ||||
| import zoneinfo | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
| 
 | ||||
| from django.test import TestCase | ||||
| from django.test import override_settings | ||||
| from httpx import Request | ||||
| from httpx import Response | ||||
| from httpx import codes | ||||
| from httpx._multipart import DataField | ||||
| from rest_framework import status | ||||
| 
 | ||||
| from documents.parsers import ParseError | ||||
| @ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase): | ||||
|         with self.assertRaises(ParseError): | ||||
|             self.parser.convert_to_pdf(file, None) | ||||
| 
 | ||||
|     @mock.patch("paperless_tika.parsers.httpx.post") | ||||
|     def test_request_pdf_a_format(self, post: mock.Mock): | ||||
|     def test_request_pdf_a_format(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Document needs to be converted to PDF | ||||
| @ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase): | ||||
|         file = Path(os.path.join(self.parser.tempdir, "input.odt")) | ||||
|         file.touch() | ||||
| 
 | ||||
|         response = Response(status_code=status.HTTP_200_OK) | ||||
|         response.request = Request("POST", "/somewhere/") | ||||
|         post.return_value = response | ||||
| 
 | ||||
|         for setting, expected_key in [ | ||||
|             ("pdfa", "PDF/A-2b"), | ||||
|             ("pdfa-2", "PDF/A-2b"), | ||||
| @ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase): | ||||
|             ("pdfa-3", "PDF/A-3b"), | ||||
|         ]: | ||||
|             with override_settings(OCR_OUTPUT_TYPE=setting): | ||||
|                 self.httpx_mock.add_response( | ||||
|                     status_code=codes.OK, | ||||
|                     content=b"PDF document", | ||||
|                     method="POST", | ||||
|                 ) | ||||
| 
 | ||||
|                 self.parser.convert_to_pdf(file, None) | ||||
| 
 | ||||
|                 post.assert_called_once() | ||||
|                 _, kwargs = post.call_args | ||||
|                 request = self.httpx_mock.get_request() | ||||
|                 found = False | ||||
|                 for field in request.stream.fields: | ||||
|                     if isinstance(field, DataField) and field.name == "pdfFormat": | ||||
|                         self.assertEqual(field.value, expected_key) | ||||
|                         found = True | ||||
|                 self.assertTrue(found) | ||||
| 
 | ||||
|                 self.assertEqual(kwargs["data"]["pdfFormat"], expected_key) | ||||
| 
 | ||||
|                 post.reset_mock() | ||||
|                 self.httpx_mock.reset(assert_all_responses_were_requested=False) | ||||
|  | ||||
| @ -7,7 +7,7 @@ max-line-length = 88 | ||||
| 
 | ||||
| [tool:pytest] | ||||
| DJANGO_SETTINGS_MODULE=paperless.settings | ||||
| addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet --durations=50 | ||||
| addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 | ||||
| env = | ||||
|   PAPERLESS_DISABLE_DBHANDLER=true | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user