mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 02:27:10 -04:00 
			
		
		
		
	removed obsolete tests.
This commit is contained in:
		
							parent
							
								
									ef6690905e
								
							
						
					
					
						commit
						56ce267f89
					
				
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 32 KiB | 
| @ -1,34 +1,9 @@ | |||||||
| import os | import os | ||||||
| from unittest import mock, skipIf | from unittest import mock, skipIf | ||||||
| 
 | 
 | ||||||
| import pyocr |  | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from pyocr.libtesseract.tesseract_raw import \ |  | ||||||
|     TesseractError as OtherTesseractError |  | ||||||
| 
 | 
 | ||||||
| from ..parsers import image_to_string, strip_excess_whitespace | from ..parsers import strip_excess_whitespace | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class FakeTesseract(object): |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise OtherTesseractError("arbitrary status", "message") |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         return "This is test text" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class FakePyOcr(object): |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TestOCR(TestCase): | class TestOCR(TestCase): | ||||||
| @ -45,9 +20,6 @@ class TestOCR(TestCase): | |||||||
|         ) |         ) | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |  | ||||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) |  | ||||||
| 
 |  | ||||||
|     def test_strip_excess_whitespace(self): |     def test_strip_excess_whitespace(self): | ||||||
|         for source, result in self.text_cases: |         for source, result in self.text_cases: | ||||||
|             actual_result = strip_excess_whitespace(source) |             actual_result = strip_excess_whitespace(source) | ||||||
| @ -60,17 +32,3 @@ class TestOCR(TestCase): | |||||||
|                     actual_result |                     actual_result | ||||||
|                 ) |                 ) | ||||||
|             ) |             ) | ||||||
| 
 |  | ||||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) |  | ||||||
|     def test_image_to_string_with_text_free_page(self): |  | ||||||
|         """ |  | ||||||
|         This test is sort of silly, since it's really just reproducing an odd |  | ||||||
|         exception thrown by pyocr when it encounters a page with no text. |  | ||||||
|         Actually running this test against an installation of Tesseract results |  | ||||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I |  | ||||||
|         don't care to dig.  Regardless, if you run the consumer normally, |  | ||||||
|         text-free pages are now handled correctly so long as we work around |  | ||||||
|         this weird exception. |  | ||||||
|         """ |  | ||||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) |  | ||||||
|  | |||||||
| @ -6,41 +6,13 @@ from typing import ContextManager | |||||||
| from unittest import mock | from unittest import mock | ||||||
| 
 | 
 | ||||||
| from django.test import TestCase, override_settings | from django.test import TestCase, override_settings | ||||||
| from pyocr.error import TesseractError |  | ||||||
| 
 | 
 | ||||||
| from documents.parsers import ParseError, run_convert | from documents.parsers import ParseError, run_convert | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf | ||||||
| 
 | 
 | ||||||
| image_to_string_calls = [] | image_to_string_calls = [] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class FakeTesseract(object): |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise TesseractError("arbitrary status", "message") |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_languages(): |  | ||||||
|         return ['eng', 'deu'] |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         image_to_string_calls.append((file_handle.name, lang)) |  | ||||||
|         return file_handle.read() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class FakePyOcr(object): |  | ||||||
| 
 |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def fake_convert(input_file, output_file, **kwargs): | def fake_convert(input_file, output_file, **kwargs): | ||||||
|     with open(input_file) as f: |     with open(input_file) as f: | ||||||
|         lines = f.readlines() |         lines = f.readlines() | ||||||
| @ -50,12 +22,6 @@ def fake_convert(input_file, output_file, **kwargs): | |||||||
|             f2.write(line.strip()) |             f2.write(line.strip()) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def fake_unpaper(pnm): |  | ||||||
|     output = pnm + ".unpaper.pnm" |  | ||||||
|     shutil.copy(pnm, output) |  | ||||||
|     return output |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class FakeImageFile(ContextManager): | class FakeImageFile(ContextManager): | ||||||
|     def __init__(self, fname): |     def __init__(self, fname): | ||||||
|         self.fname = fname |         self.fname = fname | ||||||
| @ -67,92 +33,6 @@ class FakeImageFile(ContextManager): | |||||||
|         return os.path.basename(self.fname) |         return os.path.basename(self.fname) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| fake_image = FakeImageFile |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) |  | ||||||
| class TestRasterisedDocumentParser(TestCase): |  | ||||||
| 
 |  | ||||||
|     def setUp(self): |  | ||||||
|         self.scratch = tempfile.mkdtemp() |  | ||||||
| 
 |  | ||||||
|         global image_to_string_calls |  | ||||||
| 
 |  | ||||||
|         image_to_string_calls = [] |  | ||||||
| 
 |  | ||||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() |  | ||||||
| 
 |  | ||||||
|     def tearDown(self): |  | ||||||
|         shutil.rmtree(self.scratch) |  | ||||||
| 
 |  | ||||||
|     def get_input_file(self, pages): |  | ||||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) |  | ||||||
|         with open(fname, "w") as f: |  | ||||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) |  | ||||||
|         return fname |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_simple_language_match(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0") |  | ||||||
| 
 |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_2_pages(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1") |  | ||||||
| 
 |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_3_pages(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
| 
 |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) |  | ||||||
|     def test_parse_text_lang_detect_failed(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
| 
 |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") |  | ||||||
|     def test_parse_text_lang_not_installed(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") |  | ||||||
| 
 |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") |  | ||||||
|     def test_parse_text_lang_mismatch(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
| 
 |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) |  | ||||||
| 
 |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") |  | ||||||
|     def test_parse_empty_doc(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) |  | ||||||
|         try: |  | ||||||
|             parser.get_text() |  | ||||||
|         except ParseError as e: |  | ||||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) |  | ||||||
|         else: |  | ||||||
|             self.fail("Should raise exception") |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class TestAuxilliaryFunctions(TestCase): | class TestAuxilliaryFunctions(TestCase): | ||||||
| 
 | 
 | ||||||
|     def setUp(self): |     def setUp(self): | ||||||
| @ -173,32 +53,7 @@ class TestAuxilliaryFunctions(TestCase): | |||||||
|     def test_get_text_from_pdf_error(self): |     def test_get_text_from_pdf_error(self): | ||||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) |         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(text.strip(), "") |         self.assertIsNone(text) | ||||||
| 
 |  | ||||||
|     def test_image_to_string(self): |  | ||||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) |  | ||||||
| 
 |  | ||||||
|         self.assertEqual(text, "This is a test document.") |  | ||||||
| 
 |  | ||||||
|     def test_image_to_string_language_unavailable(self): |  | ||||||
|         try: |  | ||||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) |  | ||||||
|         except OCRError as e: |  | ||||||
|             self.assertTrue("Failed loading language" in str(e)) |  | ||||||
|         else: |  | ||||||
|             self.fail("Should raise exception") |  | ||||||
| 
 |  | ||||||
|     @override_settings(OCR_ALWAYS=False) |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") |  | ||||||
|     def test_is_ocred(self, m2, m): |  | ||||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) |  | ||||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ |  | ||||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ |  | ||||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " |  | ||||||
|         parser.get_text() |  | ||||||
|         self.assertEqual(m.call_count, 2) |  | ||||||
|         self.assertEqual(m2.call_count, 0) |  | ||||||
| 
 | 
 | ||||||
|     def test_thumbnail(self): |     def test_thumbnail(self): | ||||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) |         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user