mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Fix for #154
* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError
This commit is contained in:
		
							parent
							
								
									b88e0fd902
								
							
						
					
					
						commit
						18495ce9da
					
				| @ -1,33 +1,31 @@ | ||||
| import datetime | ||||
| import hashlib | ||||
| import logging | ||||
| import tempfile | ||||
| import uuid | ||||
| 
 | ||||
| from multiprocessing.pool import Pool | ||||
| 
 | ||||
| import itertools | ||||
| 
 | ||||
| import langdetect | ||||
| import os | ||||
| import re | ||||
| import uuid | ||||
| import shutil | ||||
| import hashlib | ||||
| import logging | ||||
| import datetime | ||||
| import tempfile | ||||
| import itertools | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
| 
 | ||||
| import pyocr | ||||
| import shutil | ||||
| 
 | ||||
| import langdetect | ||||
| from PIL import Image | ||||
| 
 | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from pyocr.tesseract import TesseractError | ||||
| 
 | ||||
| from paperless.db import GnuPG | ||||
| from pyocr.tesseract import TesseractError | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
| 
 | ||||
| from .models import Tag, Document, FileInfo | ||||
| from .languages import ISO639 | ||||
| from .signals import ( | ||||
|     document_consumption_started, document_consumption_finished) | ||||
|     document_consumption_started, | ||||
|     document_consumption_finished | ||||
| ) | ||||
| from .languages import ISO639 | ||||
| 
 | ||||
| 
 | ||||
| class OCRError(Exception): | ||||
| @ -381,7 +379,7 @@ def image_to_string(args): | ||||
|             try: | ||||
|                 orientation = ocr.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except TesseractError: | ||||
|             except (TesseractError, OtherTesseractError): | ||||
|                 pass | ||||
|         return ocr.image_to_string(f, lang=lang) | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/no-text.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/no-text.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 32 KiB | 
| @ -1,7 +1,13 @@ | ||||
| from django.test import TestCase | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
| 
 | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
| 
 | ||||
| from ..consumer import strip_excess_whitespace | ||||
| from ..models import FileInfo | ||||
| from ..consumer import image_to_string, strip_excess_whitespace | ||||
| 
 | ||||
| 
 | ||||
| class TestAttributes(TestCase): | ||||
| @ -304,6 +310,28 @@ class TestFieldPermutations(TestCase): | ||||
|                             template.format(**spec), **spec) | ||||
| 
 | ||||
| 
 | ||||
class FakeTesseract:
    """
    Test double for an OCR tool object.

    It advertises orientation detection, always fails when asked to detect
    the orientation, and returns a fixed string as the recognised text.
    """

    @staticmethod
    def can_detect_orientation():
        """Claim that orientation detection is supported."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Unconditionally raise the libtesseract flavour of TesseractError."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return the same canned text regardless of the arguments."""
        return "This is test text"
| 
 | ||||
| 
 | ||||
class FakePyOcr:
    """
    Test double patched in place of ``documents.consumer.pyocr``.

    Only ``get_available_tools`` is provided.
    """

    @staticmethod
    def get_available_tools():
        """Return a list whose single entry is the FakeTesseract class."""
        tools = [FakeTesseract]
        return tools
| 
 | ||||
| 
 | ||||
| class TestOCR(TestCase): | ||||
| 
 | ||||
|     text_cases = [ | ||||
| @ -317,6 +345,9 @@ class TestOCR(TestCase): | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|      | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||
| 
 | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
| @ -330,3 +361,18 @@ class TestOCR(TestCase): | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
| 
 | ||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||
|     @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES) | ||||
|     @mock.patch("documents.consumer.pyocr", FakePyOcr) | ||||
|     def test_image_to_string_with_text_free_page(self): | ||||
|         """ | ||||
|         This test is sort of silly, since it's really just reproducing an odd | ||||
|         exception thrown by pyocr when it encounters a page with no text. | ||||
|         Actually running this test against an installation of Tesseract results | ||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||
|         don't care to dig.  Regardless, if you run the consumer normally, | ||||
|         text-free pages are now handled correctly so long as we work around | ||||
|         this weird exception. | ||||
|         """ | ||||
|         image_to_string(["text.png", "en"]) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user