mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-24 02:02:23 -04:00
This actually works
[ci skip]
This commit is contained in:
parent
b6f39b453b
commit
18e77fabf5
@ -317,6 +317,7 @@ INSTALLED_APPS = [
|
|||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
"paperless_text.apps.PaperlessTextConfig",
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
|
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
|
@ -64,6 +64,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
This method uses the Azure AI Vision API to parse documents
|
This method uses the Azure AI Vision API to parse documents
|
||||||
"""
|
"""
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
from azure.core.credentials import AzureKeyCredential
|
from azure.core.credentials import AzureKeyCredential
|
||||||
|
|
||||||
client = DocumentIntelligenceClient(
|
client = DocumentIntelligenceClient(
|
||||||
@ -72,19 +73,25 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
)
|
)
|
||||||
|
|
||||||
with file.open("rb") as f:
|
with file.open("rb") as f:
|
||||||
|
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||||
poller = client.begin_analyze_document(
|
poller = client.begin_analyze_document(
|
||||||
model_id="prebuilt-read",
|
model_id="prebuilt-read",
|
||||||
analyze_request=f,
|
body=analyze_request,
|
||||||
content_type="application/octet-stream",
|
output=["pdf"], # request searchable PDF output
|
||||||
output_format="pdf",
|
content_type="application/json",
|
||||||
)
|
)
|
||||||
|
|
||||||
result = poller.result()
|
poller.wait()
|
||||||
|
result_id = poller.details["operation_id"]
|
||||||
|
|
||||||
# Download the PDF with embedded text
|
# Download the PDF with embedded text
|
||||||
pdf_bytes = client.get_analyze_result_pdf(result.result_id)
|
|
||||||
self.archive_path = Path(self.tempdir) / "archive.pdf"
|
self.archive_path = Path(self.tempdir) / "archive.pdf"
|
||||||
self.archive_path.write_bytes(pdf_bytes)
|
with self.archive_path.open("wb") as f:
|
||||||
|
for chunk in client.get_analyze_result_pdf(
|
||||||
|
model_id="prebuilt-read",
|
||||||
|
result_id=result_id,
|
||||||
|
):
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
@ -96,7 +103,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
tmp.name,
|
tmp.name,
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
with Path.open(tmp.name, encoding="utf-8") as t:
|
with Path(tmp.name).open(encoding="utf-8") as t:
|
||||||
return t.read()
|
return t.read()
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user