mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 08:12:34 -04:00 
			
		
		
		
	Collapsing excess whitespace after OCR
This commit is contained in:
		
							parent
							
								
									14811a4a49
								
							
						
					
					
						commit
						63de2ca1b0
					
				| @ -283,7 +283,7 @@ class Consumer(object): | |||||||
|             r = " ".join(r) |             r = " ".join(r) | ||||||
| 
 | 
 | ||||||
|         # Strip out excess white space to allow matching to go smoother |         # Strip out excess white space to allow matching to go smoother | ||||||
|         return re.sub(r"\s+", " ", r) |         return strip_excess_whitespace(r) | ||||||
| 
 | 
 | ||||||
|     def _store(self, text, doc, thumbnail): |     def _store(self, text, doc, thumbnail): | ||||||
| 
 | 
 | ||||||
| @ -360,6 +360,13 @@ class Consumer(object): | |||||||
|         return Document.objects.filter(checksum=checksum).exists() |         return Document.objects.filter(checksum=checksum).exists() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def strip_excess_whitespace(text):
    """Normalise the whitespace of *text* while keeping its line breaks.

    Runs of horizontal whitespace (anything matching ``\\s`` except ``\\r``
    and ``\\n``) are collapsed to a single space, and whitespace at the start
    or end of every line is removed. Line-break characters themselves are
    never added, removed or converted.

    :param text: the string to clean up.
    :return: the cleaned-up string.
    """
    # Collapse each run of spaces/tabs/etc. into one plain space.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    # Drop whitespace that opens a line; "^" (no MULTILINE) covers the very
    # first line, the character class covers every following one.
    no_leading_whitespace = re.sub(
        r"(^|[\r\n])[^\S\r\n]+", r"\1", collapsed_spaces)
    # Drop whitespace that closes a line. A lookahead is used instead of
    # "$" so that "\r\n" and bare "\r" line endings are handled as well.
    no_trailing_whitespace = re.sub(
        r"[^\S\r\n]+(?=[\r\n]|\Z)", "", no_leading_whitespace)
    return no_trailing_whitespace
|  | 
 | ||||||
|  | 
 | ||||||
| def image_to_string(args): | def image_to_string(args): | ||||||
|     img, lang = args |     img, lang = args | ||||||
|     ocr = pyocr.get_available_tools()[0] |     ocr = pyocr.get_available_tools()[0] | ||||||
|  | |||||||
| @ -1,5 +1,6 @@ | |||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| 
 | 
 | ||||||
|  | from ..consumer import strip_excess_whitespace | ||||||
| from ..models import FileInfo | from ..models import FileInfo | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -301,3 +302,16 @@ class Permutations(TestCase): | |||||||
|                         } |                         } | ||||||
|                         self._test_guessed_attributes( |                         self._test_guessed_attributes( | ||||||
|                             template.format(**spec), **spec) |                             template.format(**spec), **spec) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TestOCR(TestCase):
    """Checks the whitespace clean-up helper ``strip_excess_whitespace``."""

    # (input text, expected text after clean-up) pairs.
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string",
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце",
        ),
    ]

    def test_strip_excess_whitespace(self):
        """Every source string must be turned into its expected result."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            # assertEqual (unlike a bare assert) still runs under "python -O".
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace(%s) != '%s', but '%s'"
                % (source, result, actual_result)
            )
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user