This actually works

[ci skip]
This commit is contained in:
shamoon 2025-04-18 13:03:51 -07:00
parent b6f39b453b
commit 18e77fabf5
No known key found for this signature in database
2 changed files with 15 additions and 7 deletions

View File

@ -317,6 +317,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",

View File

@ -64,6 +64,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
This method uses the Azure AI Document Intelligence API to parse documents
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
@ -72,19 +73,25 @@ class RemoteDocumentParser(RasterisedDocumentParser):
)
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
analyze_request=f,
content_type="application/octet-stream",
output_format="pdf",
body=analyze_request,
output=["pdf"], # request searchable PDF output
content_type="application/json",
)
result = poller.result()
poller.wait()
result_id = poller.details["operation_id"]
# Download the PDF with embedded text
pdf_bytes = client.get_analyze_result_pdf(result.result_id)
self.archive_path = Path(self.tempdir) / "archive.pdf"
self.archive_path.write_bytes(pdf_bytes)
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
subprocess.run(
@ -96,7 +103,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
tmp.name,
],
)
with Path.open(tmp.name, encoding="utf-8") as t:
with Path(tmp.name).open(encoding="utf-8") as t:
return t.read()
def parse(self, document_path: Path, mime_type, file_name=None):