mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-24 23:39:05 -04:00 
			
		
		
		
	Cleans up the docs, adds validation of the process count, includes the test descriptions
This commit is contained in:
		
							parent
							
								
									a03a745295
								
							
						
					
					
						commit
						81b9f2d4e0
					
				| @ -582,8 +582,11 @@ duplicate. But the content should be exact or close, allowing detection. | |||||||
| This tool does a fuzzy match over document content, looking for | This tool does a fuzzy match over document content, looking for | ||||||
| those which look close according to a given ratio. | those which look close according to a given ratio. | ||||||
| 
 | 
 | ||||||
|  | At this time, other metadata (such as correspondent or type) is not | ||||||
|  | taken into account by the detection. | ||||||
|  | 
 | ||||||
| ``` | ``` | ||||||
| document_fuzzy_match [--ratio] | document_fuzzy_match [--ratio] [--processes N] | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| | Option      | Required | Default | Description                                                                                                                    | | | Option      | Required | Default | Description                                                                                                                    | | ||||||
|  | |||||||
| @ -27,6 +27,10 @@ class _WorkResult: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _process_and_match(work: _WorkPackage) -> _WorkResult: | def _process_and_match(work: _WorkPackage) -> _WorkResult: | ||||||
|  |     """ | ||||||
|  |     Does basic processing of document content, gets the basic ratio | ||||||
|  |     and returns the result package | ||||||
|  |     """ | ||||||
|     # Normalize the string some, lower case, whitespace, etc |     # Normalize the string some, lower case, whitespace, etc | ||||||
|     first_string = rapidfuzz.utils.default_process(work.first_doc.content) |     first_string = rapidfuzz.utils.default_process(work.first_doc.content) | ||||||
|     second_string = rapidfuzz.utils.default_process(work.second_doc.content) |     second_string = rapidfuzz.utils.default_process(work.second_doc.content) | ||||||
| @ -72,6 +76,9 @@ class Command(BaseCommand): | |||||||
|         if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX: |         if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX: | ||||||
|             raise CommandError("The ratio must be between 0 and 100") |             raise CommandError("The ratio must be between 0 and 100") | ||||||
| 
 | 
 | ||||||
|  |         if options["processes"] < 1: | ||||||
|  |             raise CommandError("There must be at least 1 process") | ||||||
|  | 
 | ||||||
|         all_docs = Document.objects.all().order_by("id") |         all_docs = Document.objects.all().order_by("id") | ||||||
| 
 | 
 | ||||||
|         # Build work packages for processing |         # Build work packages for processing | ||||||
|  | |||||||
| @ -22,15 +22,54 @@ class TestFuzzyMatchCommand(TestCase): | |||||||
|         return stdout.getvalue(), stderr.getvalue() |         return stdout.getvalue(), stderr.getvalue() | ||||||
| 
 | 
 | ||||||
|     def test_invalid_ratio_lower_limit(self): |     def test_invalid_ratio_lower_limit(self): | ||||||
|         with self.assertRaises(CommandError): |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Invalid ratio below lower limit | ||||||
|  |         WHEN: | ||||||
|  |             - Command is called | ||||||
|  |         THEN: | ||||||
|  |             - Error is raised indicating issue | ||||||
|  |         """ | ||||||
|  |         with self.assertRaises(CommandError) as e: | ||||||
|             self.call_command("--ratio", "-1") |             self.call_command("--ratio", "-1") | ||||||
|  |             self.assertIn("The ratio must be between 0 and 100", str(e)) | ||||||
| 
 | 
 | ||||||
|     def test_invalid_ratio_upper_limit(self): |     def test_invalid_ratio_upper_limit(self): | ||||||
|         with self.assertRaises(CommandError): |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Invalid ratio above upper limit | ||||||
|  |         WHEN: | ||||||
|  |             - Command is called | ||||||
|  |         THEN: | ||||||
|  |             - Error is raised indicating issue | ||||||
|  |         """ | ||||||
|  |         with self.assertRaises(CommandError) as e: | ||||||
|             self.call_command("--ratio", "101") |             self.call_command("--ratio", "101") | ||||||
|  |             self.assertIn("The ratio must be between 0 and 100", str(e)) | ||||||
|  | 
 | ||||||
|  |     def test_invalid_process_count(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Invalid process count of 0, below the minimum of 1 | ||||||
|  |         WHEN: | ||||||
|  |             - Command is called | ||||||
|  |         THEN: | ||||||
|  |             - Error is raised indicating issue | ||||||
|  |         """ | ||||||
|  |         with self.assertRaises(CommandError) as e: | ||||||
|  |             self.call_command("--processes", "0") | ||||||
|  |             self.assertIn("There must be at least 1 process", str(e)) | ||||||
| 
 | 
 | ||||||
|     def test_no_matches(self): |     def test_no_matches(self): | ||||||
|         # Content similarity is 82.35 |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - 2 documents exist | ||||||
|  |             - Similarity between content is 82.32 | ||||||
|  |         WHEN: | ||||||
|  |             - Command is called | ||||||
|  |         THEN: | ||||||
|  |             - No matches are found | ||||||
|  |         """ | ||||||
|         Document.objects.create( |         Document.objects.create( | ||||||
|             checksum="BEEFCAFE", |             checksum="BEEFCAFE", | ||||||
|             title="A", |             title="A", | ||||||
| @ -49,6 +88,16 @@ class TestFuzzyMatchCommand(TestCase): | |||||||
|         self.assertEqual(stdout, "No matches found\n") |         self.assertEqual(stdout, "No matches found\n") | ||||||
| 
 | 
 | ||||||
|     def test_with_matches(self): |     def test_with_matches(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - 2 documents exist | ||||||
|  |             - Similarity between content is 86.667 | ||||||
|  |         WHEN: | ||||||
|  |             - Command is called | ||||||
|  |         THEN: | ||||||
|  |             - 1 match is returned from doc 1 to doc 2 | ||||||
|  |             - No match from doc 2 to doc 1 reported | ||||||
|  |         """ | ||||||
|         # Content similarity is 86.667 |         # Content similarity is 86.667 | ||||||
|         Document.objects.create( |         Document.objects.create( | ||||||
|             checksum="BEEFCAFE", |             checksum="BEEFCAFE", | ||||||
| @ -68,6 +117,16 @@ class TestFuzzyMatchCommand(TestCase): | |||||||
|         self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n") |         self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n") | ||||||
| 
 | 
 | ||||||
|     def test_with_3_matches(self): |     def test_with_3_matches(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - 3 documents exist | ||||||
|  |             - All documents have similarity over 85.0 | ||||||
|  |         WHEN: | ||||||
|  |             - Command is called | ||||||
|  |         THEN: | ||||||
|  |             - 3 matches are returned from each document to the others | ||||||
|  |             - No duplication of matches returned | ||||||
|  |         """ | ||||||
|         # Content similarity is 86.667 |         # Content similarity is 86.667 | ||||||
|         Document.objects.create( |         Document.objects.create( | ||||||
|             checksum="BEEFCAFE", |             checksum="BEEFCAFE", | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user