mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	Combine and extend the utilities for calling the live services so they are more robust against failures, report errors more clearly, etc.
This commit is contained in:
		
							parent
							
								
									b9fdf68be3
								
							
						
					
					
						commit
						a1697ff21c
					
				@ -1,14 +1,21 @@
 | 
				
			|||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
 | 
					import time
 | 
				
			||||||
 | 
					import warnings
 | 
				
			||||||
from collections import namedtuple
 | 
					from collections import namedtuple
 | 
				
			||||||
from contextlib import contextmanager
 | 
					from contextlib import contextmanager
 | 
				
			||||||
from os import PathLike
 | 
					from os import PathLike
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					from typing import Any
 | 
				
			||||||
 | 
					from typing import Callable
 | 
				
			||||||
from typing import Iterator
 | 
					from typing import Iterator
 | 
				
			||||||
 | 
					from typing import List
 | 
				
			||||||
from typing import Tuple
 | 
					from typing import Tuple
 | 
				
			||||||
from typing import Union
 | 
					from typing import Union
 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import httpx
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
from django.apps import apps
 | 
					from django.apps import apps
 | 
				
			||||||
from django.db import connection
 | 
					from django.db import connection
 | 
				
			||||||
from django.db.migrations.executor import MigrationExecutor
 | 
					from django.db.migrations.executor import MigrationExecutor
 | 
				
			||||||
@ -78,6 +85,61 @@ def paperless_environment():
 | 
				
			|||||||
            remove_dirs(dirs)
 | 
					            remove_dirs(dirs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def util_call_with_backoff(
					    method_or_callable: Callable,
					    args: Union[List, Tuple],
					    *,
					    skip_on_503=True,
					) -> Tuple[bool, Any]:
					    """
					    Call a function which talks to a live service, retrying on HTTP errors.

					    For whatever reason, the images started during the test pipeline like to
					    segfault sometimes, crash and otherwise fail randomly, when run with the
					    exact files that usually pass.

					    So, this function will try the given method/function up to 3 times, with
					    larger backoff periods between each attempt, in hopes the issue resolves
					    itself during one attempt.

					    This will wait the following:
					        - After failed attempt 1 - 20s
					        - After failed attempt 2 - 40s
					        - After failed attempt 3 - no wait, as no further attempt follows

					    Only httpx.HTTPError (and its subclasses) triggers a retry; any other
					    exception propagates to the caller immediately.

					    Args:
					        method_or_callable: The callable to invoke.
					        args: Positional arguments, unpacked into the callable.
					        skip_on_503: When True and every HTTP status error recorded was a
					            503 Service Unavailable, skip the running test via pytest.skip
					            instead of returning a failure.

					    Returns:
					        A tuple of (succeeded, result).  result is the callable's return
					        value on success and None if every attempt failed.
					    """
					    result = None
					    succeeded = False
					    retry_time = 20.0
					    retry_count = 0
					    status_codes = []
					    max_retry_count = 3

					    while retry_count < max_retry_count and not succeeded:
					        try:
					            result = method_or_callable(*args)

					            succeeded = True
					        except httpx.HTTPError as exc:
					            # exc.request raises RuntimeError if no request was attached to
					            # the exception, which would otherwise hide the real error
					            try:
					                url = exc.request.url
					            except RuntimeError:
					                url = "<unknown url>"
					            warnings.warn(f"HTTP Exception for {url} - {exc}")

					            if isinstance(exc, httpx.HTTPStatusError):
					                status_codes.append(exc.response.status_code)

					            retry_count = retry_count + 1

					            # Only back off when another attempt will actually be made
					            if retry_count < max_retry_count:
					                time.sleep(retry_time)
					                retry_time = retry_time * 2.0

					    if (
					        not succeeded
					        and status_codes
					        and skip_on_503
					        and all(element == httpx.codes.SERVICE_UNAVAILABLE for element in status_codes)
					    ):
					        pytest.skip("Repeated HTTP 503 for service")

					    return succeeded, result
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class DirectoriesMixin:
 | 
					class DirectoriesMixin:
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					    def __init__(self, *args, **kwargs):
 | 
				
			||||||
        super().__init__(*args, **kwargs)
 | 
					        super().__init__(*args, **kwargs)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,5 +1,4 @@
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import time
 | 
					 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import httpx
 | 
					import httpx
 | 
				
			||||||
@ -10,6 +9,7 @@ from pdfminer.high_level import extract_text
 | 
				
			|||||||
from PIL import Image
 | 
					from PIL import Image
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from documents.tests.utils import FileSystemAssertsMixin
 | 
					from documents.tests.utils import FileSystemAssertsMixin
 | 
				
			||||||
 | 
					from documents.tests.utils import util_call_with_backoff
 | 
				
			||||||
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
 | 
					from paperless_mail.tests.test_parsers import BaseMailParserTestCase
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -79,51 +79,6 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
 | 
				
			|||||||
    def imagehash(file, hash_size=18):
 | 
					    def imagehash(file, hash_size=18):
 | 
				
			||||||
        return f"{average_hash(Image.open(file), hash_size)}"
 | 
					        return f"{average_hash(Image.open(file), hash_size)}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def util_call_with_backoff(self, method_or_callable, args):
        """
        Call the given method/function, retrying when it raises an HTTP error.

        For whatever reason, the image started during the test pipeline likes to
        segfault sometimes, when run with the exact files that usually pass.

        So, this function will try the call up to 3 times, with larger backoff
        periods between each attempt, in hopes the issue resolves itself during
        one attempt to parse.

        This will wait the following:
            - After failed attempt 1 - 20s
            - After failed attempt 2 - 40s
            - After failed attempt 3 - no wait, as no further attempt follows

        Only httpx.HTTPError is retried; any other exception propagates as is.
        The test fails via assertTrue if no attempt succeeded.

        Returns the value returned by the callable.
        """
        result = None
        succeeded = False
        retry_time = 20.0
        retry_count = 0
        max_retry_count = 3

        while retry_count < max_retry_count and not succeeded:
            try:
                result = method_or_callable(*args)

                succeeded = True
            except httpx.HTTPError as e:
                # Retry on HTTP errors only
                print(f"{e} during try #{retry_count}", flush=True)

                retry_count = retry_count + 1

                # Only back off when another attempt will actually be made
                if retry_count < max_retry_count:
                    time.sleep(retry_time)
                    retry_time = retry_time * 2.0

        self.assertTrue(
            succeeded,
            "Continued Tika server errors after multiple retries",
        )

        return result
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
 | 
					    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
 | 
				
			||||||
    def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
 | 
					    def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
@ -187,7 +142,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
 | 
				
			|||||||
            self.SAMPLE_DIR / "html.eml",
 | 
					            self.SAMPLE_DIR / "html.eml",
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        pdf_path = self.util_call_with_backoff(
 | 
					        _, pdf_path = util_call_with_backoff(
 | 
				
			||||||
            self.parser.generate_pdf,
 | 
					            self.parser.generate_pdf,
 | 
				
			||||||
            [msg],
 | 
					            [msg],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
@ -210,7 +165,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
 | 
				
			|||||||
            - gotenberg is called and the resulting file is returned and look as expected.
 | 
					            - gotenberg is called and the resulting file is returned and look as expected.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.util_call_with_backoff(
 | 
					        util_call_with_backoff(
 | 
				
			||||||
            self.parser.parse,
 | 
					            self.parser.parse,
 | 
				
			||||||
            [self.SAMPLE_DIR / "html.eml", "message/rfc822"],
 | 
					            [self.SAMPLE_DIR / "html.eml", "message/rfc822"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
				
			|||||||
@ -1,11 +1,11 @@
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import time
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from typing import Final
 | 
					from typing import Final
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from django.test import TestCase
 | 
					from django.test import TestCase
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from documents.tests.utils import util_call_with_backoff
 | 
				
			||||||
from paperless_tika.parsers import TikaDocumentParser
 | 
					from paperless_tika.parsers import TikaDocumentParser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -28,44 +28,6 @@ class TestTikaParserAgainstServer(TestCase):
 | 
				
			|||||||
    def tearDown(self) -> None:
 | 
					    def tearDown(self) -> None:
 | 
				
			||||||
        self.parser.cleanup()
 | 
					        self.parser.cleanup()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def try_parse_with_wait(self, test_file: Path, mime_type: str):
        """
        Parse the given file with self.parser, retrying when parsing raises.

        For whatever reason, the image started during the test pipeline likes to
        segfault sometimes, when run with the exact files that usually pass.

        So, this function will try the parsing up to 3 times, with larger backoff
        periods between each attempt, in hopes the issue resolves itself during
        one attempt to parse.

        This will wait the following:
            - After failed attempt 1 - 20s
            - After failed attempt 2 - 40s
            - After failed attempt 3 - no wait, as no further attempt follows

        The test fails via assertTrue if no attempt succeeded.
        """
        succeeded = False
        retry_time = 20.0
        retry_count = 0
        max_retry_count = 3

        while retry_count < max_retry_count and not succeeded:
            try:
                self.parser.parse(test_file, mime_type)

                succeeded = True
            except Exception as e:
                # Deliberately broad: any failure of the live service is retried
                print(f"{e} during try #{retry_count}", flush=True)

                retry_count = retry_count + 1

                # Only back off when another attempt will actually be made
                if retry_count < max_retry_count:
                    time.sleep(retry_time)
                    retry_time = retry_time * 2.0

        self.assertTrue(
            succeeded,
            "Continued Tika server errors after multiple retries",
        )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def test_basic_parse_odt(self):
 | 
					    def test_basic_parse_odt(self):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        GIVEN:
 | 
					        GIVEN:
 | 
				
			||||||
@ -78,7 +40,10 @@ class TestTikaParserAgainstServer(TestCase):
 | 
				
			|||||||
        """
 | 
					        """
 | 
				
			||||||
        test_file = self.SAMPLE_DIR / Path("sample.odt")
 | 
					        test_file = self.SAMPLE_DIR / Path("sample.odt")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text")
 | 
					        util_call_with_backoff(
 | 
				
			||||||
 | 
					            self.parser.parse,
 | 
				
			||||||
 | 
					            [test_file, "application/vnd.oasis.opendocument.text"],
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(
 | 
					        self.assertEqual(
 | 
				
			||||||
            self.parser.text,
 | 
					            self.parser.text,
 | 
				
			||||||
@ -104,9 +69,12 @@ class TestTikaParserAgainstServer(TestCase):
 | 
				
			|||||||
        """
 | 
					        """
 | 
				
			||||||
        test_file = self.SAMPLE_DIR / Path("sample.docx")
 | 
					        test_file = self.SAMPLE_DIR / Path("sample.docx")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.try_parse_with_wait(
 | 
					        util_call_with_backoff(
 | 
				
			||||||
 | 
					            self.parser.parse,
 | 
				
			||||||
 | 
					            [
 | 
				
			||||||
                test_file,
 | 
					                test_file,
 | 
				
			||||||
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 | 
					                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 | 
				
			||||||
 | 
					            ],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(
 | 
					        self.assertEqual(
 | 
				
			||||||
@ -131,9 +99,9 @@ class TestTikaParserAgainstServer(TestCase):
 | 
				
			|||||||
        """
 | 
					        """
 | 
				
			||||||
        test_file = self.SAMPLE_DIR / "sample.doc"
 | 
					        test_file = self.SAMPLE_DIR / "sample.doc"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.try_parse_with_wait(
 | 
					        util_call_with_backoff(
 | 
				
			||||||
            test_file,
 | 
					            self.parser.parse,
 | 
				
			||||||
            "application/msword",
 | 
					            [test_file, "application/msword"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertIn(
 | 
					        self.assertIn(
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user