mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	Merge branch 'synchrone-no_stripping_newlines'
This commit is contained in:
		
						commit
						77fda752ae
					
				| @ -283,7 +283,7 @@ class Consumer(object): | ||||
|             r = " ".join(r) | ||||
| 
 | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return re.sub(r"\s+", " ", r) | ||||
|         return strip_excess_whitespace(r) | ||||
| 
 | ||||
|     def _store(self, text, doc, thumbnail): | ||||
| 
 | ||||
| @ -360,6 +360,14 @@ class Consumer(object): | ||||
|         return Document.objects.filter(checksum=checksum).exists() | ||||
| 
 | ||||
| 
 | ||||
def strip_excess_whitespace(text):
    """Normalise horizontal whitespace in *text* while keeping line breaks.

    Three passes are applied, none of which ever adds or removes a ``\\r``
    or ``\\n`` character:

    1. every run of non-newline whitespace is collapsed to a single space;
    2. whitespace at the start of the text and at the start of each line is
       removed;
    3. whitespace at the end of each line and at the end of the text is
       removed.

    :param text: the string to clean up.
    :return: the cleaned-up string.
    """
    # ``[^\S\r\n]`` means "whitespace that is not a line break".  The
    # patterns are raw strings so that ``\S`` reaches the regex engine intact
    # instead of being treated as an (invalid) string escape.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)

    # The first group matches either the (empty) start of the text or a run
    # of line breaks; putting it back via ``\1`` drops only the whitespace
    # that follows it.
    no_leading_whitespace = re.sub(
        r"(^|[\r\n]+)[^\S\r\n]+", r"\1", collapsed_spaces)

    # The look-ahead keeps the line break itself out of the match, so only
    # the whitespace in front of it (or in front of the end of text) goes.
    no_trailing_whitespace = re.sub(
        r"[^\S\r\n]+(?=[\r\n]|\Z)", "", no_leading_whitespace)

    return no_trailing_whitespace
| 
 | ||||
| 
 | ||||
| def image_to_string(args): | ||||
|     img, lang = args | ||||
|     ocr = pyocr.get_available_tools()[0] | ||||
|  | ||||
| @ -14,7 +14,7 @@ from dateutil import parser | ||||
| from django.conf import settings | ||||
| 
 | ||||
| from .consumer import Consumer | ||||
| from .models import Correspondent, Log | ||||
| from .models import Correspondent | ||||
| 
 | ||||
| 
 | ||||
| class MailFetcherError(Exception): | ||||
|  | ||||
| @ -6,7 +6,6 @@ import time | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
| 
 | ||||
| from ...models import Log | ||||
| from ...consumer import Consumer, ConsumerError | ||||
| from ...mail import MailFetcher, MailFetcherError | ||||
| 
 | ||||
|  | ||||
| @ -1,5 +1,6 @@ | ||||
| from django.test import TestCase | ||||
| 
 | ||||
| from ..consumer import strip_excess_whitespace | ||||
| from ..models import FileInfo | ||||
| 
 | ||||
| 
 | ||||
| @ -133,7 +134,7 @@ class TestAttributes(TestCase): | ||||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| class Permutations(TestCase): | ||||
| class TestFieldPermutations(TestCase): | ||||
| 
 | ||||
|     valid_dates = ( | ||||
|         "20150102030405Z", | ||||
| @ -301,3 +302,31 @@ class Permutations(TestCase): | ||||
|                         } | ||||
|                         self._test_guessed_attributes( | ||||
|                             template.format(**spec), **spec) | ||||
| 
 | ||||
| 
 | ||||
class TestOCR(TestCase):
    """Tests for the whitespace clean-up helper imported from the consumer."""

    # (input text, expected output of strip_excess_whitespace) pairs.
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    def test_strip_excess_whitespace(self):
        """Check every pair in ``text_cases`` against the helper's output."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                # Failure message names the function under test, then shows
                # the input, the expected value and the value actually seen.
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user