Ok, restart implementing this with just azure

[ci skip]
This commit is contained in:
shamoon 2025-04-18 11:38:36 -07:00
parent 62f46b706e
commit c796b0ed0d
No known key found for this signature in database
10 changed files with 253 additions and 0 deletions

View File

@ -1266,3 +1266,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
and OUTLOOK_OAUTH_CLIENT_ID
and OUTLOOK_OAUTH_CLIENT_SECRET,
)
###############################################################################
# Remote Parser #
###############################################################################
# Name of the remote OCR engine to use; None when the variable is unset.
REMOTE_OCR_ENGINE = os.environ.get("PAPERLESS_REMOTE_OCR_ENGINE")
# Credentials and service URL handed to the remote engine; None when unset.
REMOTE_OCR_API_KEY = os.environ.get("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.environ.get("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@ -0,0 +1,4 @@
# Imported here so that Django discovers and registers the system checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
    """Django application config for the ``paperless_remote`` app."""

    name = "paperless_remote"

    def ready(self):
        """
        Connect the remote parser declaration to the document consumer
        declaration signal, then run the base class hook.
        """
        # Imported inside ready() rather than at module level; presumably so
        # the documents app is only imported once apps are loaded - confirm.
        from documents.signals import (
            document_consumer_declaration as consumer_declaration,
        )

        consumer_declaration.connect(remote_consumer_declaration)
        super().ready()

View File

@ -0,0 +1,15 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    System check validating the remote OCR parser settings.

    Nothing is reported unless ``REMOTE_OCR_ENGINE`` is ``"azureai"``. For
    that engine both an endpoint and an API key are needed, matching what
    ``RemoteEngineConfig.engine_is_valid`` requires before the parser will
    handle any document; each missing value produces its own error.

    Args:
        app_configs: Supplied by the Django check framework; unused.
        **kwargs: Supplied by the Django check framework; unused.

    Returns:
        A list of ``Error`` instances, empty when the configuration is fine.
    """
    if settings.REMOTE_OCR_ENGINE != "azureai":
        return []

    errors = []
    if not settings.REMOTE_OCR_ENDPOINT:
        errors.append(
            Error(
                "Azure AI remote parser requires endpoint to be configured.",
            ),
        )
    # Without a key the parser treats the engine as invalid and silently
    # supports no MIME types, so surface the misconfiguration here.
    if not settings.REMOTE_OCR_API_KEY:
        errors.append(
            Error(
                "Azure AI remote parser requires API key to be configured.",
            ),
        )
    return errors

View File

@ -0,0 +1,74 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    Document parser that delegates text extraction to a remote OCR engine.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote engine configuration from the Django settings
        (engine name, API key and endpoint).
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        """
        Return the MIME types this parser handles, or an empty list when no
        valid remote engine is configured.
        """
        if not self.settings.engine_is_valid():
            return []
        return [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Extract text from ``file`` using the Azure AI Vision API.

        Not implemented yet: currently always returns None.
        """
        # TODO: Implement the Azure AI Vision API parsing logic
        return None

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Populate ``self.text`` for the document at ``document_path``.

        With no valid engine configured a warning is logged and the text is
        set to an empty string. ``mime_type`` and ``file_name`` are accepted
        but not used here.
        """
        config = self.settings
        if not config.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        if config.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)

View File

@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
    """
    Create a RemoteDocumentParser, forwarding all arguments unchanged.
    """
    # Function-scope import: the parsers module is only loaded on first use.
    from paperless_remote import parsers

    return parsers.RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
    """
    Return the MIME types reported by a throwaway RemoteDocumentParser.
    """
    from paperless_remote import parsers

    # The parser is built with None as its only constructor argument, exactly
    # as before; it exists solely to ask for its supported MIME types.
    probe = parsers.RemoteDocumentParser(None)
    return probe.supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver describing the remote parser.

    Returns a dict with the parser factory, its weight and the MIME types it
    currently supports. ``sender`` and ``kwargs`` are not used.
    """
    declaration = {
        "parser": get_parser,
        "weight": 5,
    }
    declaration["mime_types"] = get_supported_mime_types()
    return declaration

View File

Binary file not shown.

View File

@ -0,0 +1,29 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
    """Tests for the remote parser system check."""

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        """No engine configured: the check reports nothing."""
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        """Azure engine without an endpoint: exactly one error is reported."""
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 1)
        # Must match the message emitted by the check itself.
        self.assertTrue(
            msgs[0].msg.startswith(
                "Azure AI remote parser requires endpoint to be configured.",
            ),
        )

    @override_settings(
        REMOTE_OCR_ENGINE="something",
        REMOTE_OCR_API_KEY="somekey",
    )
    def test_valid_configuration(self):
        """An engine other than Azure is not validated by the check."""
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

View File

@ -0,0 +1,91 @@
import sys
import uuid
from pathlib import Path
from unittest import mock
import pytest
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for RemoteDocumentParser."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """Assert that all strings appear in content, in the given order."""
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    # NOTE(review): `sys.version_info > (3, 10)` is also true on 3.10.x (the
    # longer tuple compares greater), so this skips on more than 3.11 - confirm
    # whether `>= (3, 11)` was intended before tightening it.
    @pytest.mark.skipif(
        sys.version_info > (3, 10),
        reason="Fails on 3.11 only on CI, for some reason",
    )  # TODO: investigate
    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
    def test_get_text_with_azure(self, mock_azure_client):
        """Parsing a PDF with the Azure engine yields the analysed content."""
        words = ["This", "is", "a", "test", "document."]

        result = mock.Mock()
        result.content = "This is a test document."
        result.pages = [
            mock.Mock(
                width=100,
                height=100,
                # One mocked word per entry, placed at (0, 0), (10, 10), ...
                words=[
                    mock.Mock(
                        content=word,
                        polygon=[
                            mock.Mock(x=offset, y=offset),
                        ],
                    )
                    for offset, word in zip(range(0, 50, 10), words)
                ],
            ),
        ]

        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result

        # "azureai" is the only engine name the parser configuration accepts;
        # any other value makes the parser skip OCR and produce empty text.
        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )