mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 02:27:10 -04:00 
			
		
		
		
	Fix for #154
* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError
This commit is contained in:
		
							parent
							
								
									b88e0fd902
								
							
						
					
					
						commit
						18495ce9da
					
				| @ -1,33 +1,31 @@ | |||||||
| import datetime |  | ||||||
| import hashlib |  | ||||||
| import logging |  | ||||||
| import tempfile |  | ||||||
| import uuid |  | ||||||
| 
 |  | ||||||
| from multiprocessing.pool import Pool |  | ||||||
| 
 |  | ||||||
| import itertools |  | ||||||
| 
 |  | ||||||
| import langdetect |  | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
|  | import uuid | ||||||
|  | import shutil | ||||||
|  | import hashlib | ||||||
|  | import logging | ||||||
|  | import datetime | ||||||
|  | import tempfile | ||||||
|  | import itertools | ||||||
| import subprocess | import subprocess | ||||||
|  | from multiprocessing.pool import Pool | ||||||
| 
 | 
 | ||||||
| import pyocr | import pyocr | ||||||
| import shutil | import langdetect | ||||||
| 
 |  | ||||||
| from PIL import Image | from PIL import Image | ||||||
| 
 |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from pyocr.tesseract import TesseractError |  | ||||||
| 
 |  | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
|  | from pyocr.tesseract import TesseractError | ||||||
|  | from pyocr.libtesseract.tesseract_raw import \ | ||||||
|  |     TesseractError as OtherTesseractError | ||||||
| 
 | 
 | ||||||
| from .models import Tag, Document, FileInfo | from .models import Tag, Document, FileInfo | ||||||
| from .languages import ISO639 |  | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_started, document_consumption_finished) |     document_consumption_started, | ||||||
|  |     document_consumption_finished | ||||||
|  | ) | ||||||
|  | from .languages import ISO639 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class OCRError(Exception): | class OCRError(Exception): | ||||||
| @ -381,7 +379,7 @@ def image_to_string(args): | |||||||
|             try: |             try: | ||||||
|                 orientation = ocr.detect_orientation(f, lang=lang) |                 orientation = ocr.detect_orientation(f, lang=lang) | ||||||
|                 f = f.rotate(orientation["angle"], expand=1) |                 f = f.rotate(orientation["angle"], expand=1) | ||||||
|             except TesseractError: |             except (TesseractError, OtherTesseractError): | ||||||
|                 pass |                 pass | ||||||
|         return ocr.image_to_string(f, lang=lang) |         return ocr.image_to_string(f, lang=lang) | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/no-text.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/no-text.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 32 KiB | 
| @ -1,7 +1,13 @@ | |||||||
| from django.test import TestCase | import os | ||||||
|  | from unittest import mock, skipIf | ||||||
|  | 
 | ||||||
|  | import pyocr | ||||||
|  | from django.test import TestCase | ||||||
|  | from pyocr.libtesseract.tesseract_raw import \ | ||||||
|  |     TesseractError as OtherTesseractError | ||||||
| 
 | 
 | ||||||
| from ..consumer import strip_excess_whitespace |  | ||||||
| from ..models import FileInfo | from ..models import FileInfo | ||||||
|  | from ..consumer import image_to_string, strip_excess_whitespace | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TestAttributes(TestCase): | class TestAttributes(TestCase): | ||||||
| @ -304,6 +310,28 @@ class TestFieldPermutations(TestCase): | |||||||
|                             template.format(**spec), **spec) |                             template.format(**spec), **spec) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class FakeTesseract(object): | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def can_detect_orientation(): | ||||||
|  |         return True | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def detect_orientation(file_handle, lang): | ||||||
|  |         raise OtherTesseractError("arbitrary status", "message") | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def image_to_string(file_handle, lang): | ||||||
|  |         return "This is test text" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class FakePyOcr(object): | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def get_available_tools(): | ||||||
|  |         return [FakeTesseract] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class TestOCR(TestCase): | class TestOCR(TestCase): | ||||||
| 
 | 
 | ||||||
|     text_cases = [ |     text_cases = [ | ||||||
| @ -318,6 +346,9 @@ class TestOCR(TestCase): | |||||||
|         ) |         ) | ||||||
|     ] |     ] | ||||||
|      |      | ||||||
|  |     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||||
|  |     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||||
|  | 
 | ||||||
|     def test_strip_excess_whitespace(self): |     def test_strip_excess_whitespace(self): | ||||||
|         for source, result in self.text_cases: |         for source, result in self.text_cases: | ||||||
|             actual_result = strip_excess_whitespace(source) |             actual_result = strip_excess_whitespace(source) | ||||||
| @ -330,3 +361,18 @@ class TestOCR(TestCase): | |||||||
|                     actual_result |                     actual_result | ||||||
|                 ) |                 ) | ||||||
|             ) |             ) | ||||||
|  | 
 | ||||||
|  |     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||||
|  |     @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES) | ||||||
|  |     @mock.patch("documents.consumer.pyocr", FakePyOcr) | ||||||
|  |     def test_image_to_string_with_text_free_page(self): | ||||||
|  |         """ | ||||||
|  |         This test is sort of silly, since it's really just reproducing an odd | ||||||
|  |         exception thrown by pyocr when it encounters a page with no text. | ||||||
|  |         Actually running this test against an installation of Tesseract results | ||||||
|  |         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||||
|  |         don't care to dig.  Regardless, if you run the consumer normally, | ||||||
|  |         text-free pages are now handled correctly so long as we work around | ||||||
|  |         this weird exception. | ||||||
|  |         """ | ||||||
|  |         image_to_string(["text.png", "en"]) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user