mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 10:37:12 -04:00 
			
		
		
		
	Merge pull request #277 from ishirav/multi-word-match
Add multi-word match
This commit is contained in:
		
						commit
						06117929bb
					
				| @ -80,6 +80,12 @@ text and matching algorithm.  From the help info there: | ||||
|     uses a regex to match the PDF.  If you don't know what a regex is, you | ||||
|     probably don't want this option. | ||||
| 
 | ||||
| When using the "any" or "all" matching algorithms, you can search for terms that | ||||
| consist of multiple words by enclosing them in double quotes. For example, defining | ||||
| a match text of ``"Bank of America" BofA`` using the "any" algorithm will match | ||||
| documents that contain either "Bank of America" or "BofA", but will not match | ||||
| documents containing "Bank of South America". | ||||
| 
 | ||||
| Then just save your tag/correspondent and run another document through the | ||||
| consumer.  Once complete, you should see the newly-created document, | ||||
| automatically tagged with the appropriate data. | ||||
|  | ||||
| @ -89,7 +89,7 @@ class MatchingModel(models.Model): | ||||
|             search_kwargs = {"flags": re.IGNORECASE} | ||||
| 
 | ||||
|         if self.matching_algorithm == self.MATCH_ALL: | ||||
|             for word in self.match.split(" "): | ||||
|             for word in self._split_match(): | ||||
|                 search_result = re.search( | ||||
|                     r"\b{}\b".format(word), text, **search_kwargs) | ||||
|                 if not search_result: | ||||
| @ -97,7 +97,7 @@ class MatchingModel(models.Model): | ||||
|             return True | ||||
| 
 | ||||
|         if self.matching_algorithm == self.MATCH_ANY: | ||||
|             for word in self.match.split(" "): | ||||
|             for word in self._split_match(): | ||||
|                 if re.search(r"\b{}\b".format(word), text, **search_kwargs): | ||||
|                     return True | ||||
|             return False | ||||
| @ -121,6 +121,20 @@ class MatchingModel(models.Model): | ||||
| 
 | ||||
|         raise NotImplementedError("Unsupported matching algorithm") | ||||
| 
 | ||||
|     def _split_match(self): | ||||
|         ''' | ||||
|         Splits the match to invidual keywords, getting rid of unecessary spaces | ||||
|         and grouping quoted words together. | ||||
|         Example: | ||||
|         '  some random  words "with   quotes  " and   spaces' | ||||
|             ==> | ||||
|         ['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces'] | ||||
|         ''' | ||||
|         findterms = re.compile(r'"([^"]+)"|(\S+)').findall | ||||
|         normspace = re.compile(r'\s+').sub | ||||
|         return [normspace(r'\s+', (t[0] or t[1]).strip()) | ||||
|                 for t in findterms(self.match)] | ||||
| 
 | ||||
|     def save(self, *args, **kwargs): | ||||
| 
 | ||||
|         self.match = self.match.lower() | ||||
|  | ||||
| @ -16,9 +16,15 @@ class TestMatching(TestCase): | ||||
|                 matching_algorithm=getattr(klass, algorithm) | ||||
|             ) | ||||
|             for string in true: | ||||
|                 self.assertTrue(instance.matches(string)) | ||||
|                 self.assertTrue( | ||||
|                     instance.matches(string), | ||||
|                     '"%s" should match "%s" but it does not' % (text, string) | ||||
|                 ) | ||||
|             for string in false: | ||||
|                 self.assertFalse(instance.matches(string)) | ||||
|                 self.assertFalse( | ||||
|                     instance.matches(string), | ||||
|                     '"%s" should not match "%s" but it does' % (text, string) | ||||
|                 ) | ||||
| 
 | ||||
|     def test_match_all(self): | ||||
| 
 | ||||
| @ -54,6 +60,21 @@ class TestMatching(TestCase): | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|         self._test_matching( | ||||
|             'brown fox "lazy dogs"', | ||||
|             "MATCH_ALL", | ||||
|             ( | ||||
|                 "the quick brown fox jumped over the lazy dogs", | ||||
|                 "the quick brown fox jumped over the lazy  dogs", | ||||
|             ), | ||||
|             ( | ||||
|                 "the quick fox jumped over the lazy dogs", | ||||
|                 "the quick brown wolf jumped over the lazy dogs", | ||||
|                 "the quick brown fox jumped over the fat dogs", | ||||
|                 "the quick brown fox jumped over the lazy... dogs", | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def test_match_any(self): | ||||
| 
 | ||||
|         self._test_matching( | ||||
| @ -89,6 +110,18 @@ class TestMatching(TestCase): | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|         self._test_matching( | ||||
|             '"brown fox" " lazy  dogs "', | ||||
|             "MATCH_ANY", | ||||
|             ( | ||||
|                 "the quick brown fox", | ||||
|                 "jumped over the lazy  dogs.", | ||||
|             ), | ||||
|             ( | ||||
|                 "the lazy fox jumped over the brown dogs", | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def test_match_literal(self): | ||||
| 
 | ||||
|         self._test_matching( | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user