mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	Merge pull request #220 from jgysland/add-fuzzy-matching
fuzzy matching
This commit is contained in:
		
						commit
						5eb26102d4
					
				@ -6,6 +6,7 @@ django-filter>=1.0
 | 
				
			|||||||
django-flat-responsive>=1.2.0
 | 
					django-flat-responsive>=1.2.0
 | 
				
			||||||
djangorestframework>=3.5.3
 | 
					djangorestframework>=3.5.3
 | 
				
			||||||
filemagic>=1.6
 | 
					filemagic>=1.6
 | 
				
			||||||
 | 
					fuzzywuzzy[speedup]==0.15.0
 | 
				
			||||||
langdetect>=1.0.7
 | 
					langdetect>=1.0.7
 | 
				
			||||||
pyocr>=0.4.6
 | 
					pyocr>=0.4.6
 | 
				
			||||||
python-dateutil>=2.6.0
 | 
					python-dateutil>=2.6.0
 | 
				
			||||||
 | 
				
			|||||||
@ -5,6 +5,7 @@ import re
 | 
				
			|||||||
import uuid
 | 
					import uuid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from collections import OrderedDict
 | 
					from collections import OrderedDict
 | 
				
			||||||
 | 
					from fuzzywuzzy import fuzz
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
from django.core.urlresolvers import reverse
 | 
					from django.core.urlresolvers import reverse
 | 
				
			||||||
@ -21,11 +22,13 @@ class MatchingModel(models.Model):
 | 
				
			|||||||
    MATCH_ALL = 2
 | 
					    MATCH_ALL = 2
 | 
				
			||||||
    MATCH_LITERAL = 3
 | 
					    MATCH_LITERAL = 3
 | 
				
			||||||
    MATCH_REGEX = 4
 | 
					    MATCH_REGEX = 4
 | 
				
			||||||
 | 
					    MATCH_FUZZY = 5
 | 
				
			||||||
    MATCHING_ALGORITHMS = (
 | 
					    MATCHING_ALGORITHMS = (
 | 
				
			||||||
        (MATCH_ANY, "Any"),
 | 
					        (MATCH_ANY, "Any"),
 | 
				
			||||||
        (MATCH_ALL, "All"),
 | 
					        (MATCH_ALL, "All"),
 | 
				
			||||||
        (MATCH_LITERAL, "Literal"),
 | 
					        (MATCH_LITERAL, "Literal"),
 | 
				
			||||||
        (MATCH_REGEX, "Regular Expression"),
 | 
					        (MATCH_REGEX, "Regular Expression"),
 | 
				
			||||||
 | 
					        (MATCH_FUZZY, "Fuzzy Match"),
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = models.CharField(max_length=128, unique=True)
 | 
					    name = models.CharField(max_length=128, unique=True)
 | 
				
			||||||
@ -42,8 +45,11 @@ class MatchingModel(models.Model):
 | 
				
			|||||||
            "provided appear in the PDF, albeit not in the order provided.  A "
 | 
					            "provided appear in the PDF, albeit not in the order provided.  A "
 | 
				
			||||||
            "\"literal\" match means that the text you enter must appear in "
 | 
					            "\"literal\" match means that the text you enter must appear in "
 | 
				
			||||||
            "the PDF exactly as you've entered it, and \"regular expression\" "
 | 
					            "the PDF exactly as you've entered it, and \"regular expression\" "
 | 
				
			||||||
            "uses a regex to match the PDF.  If you don't know what a regex "
 | 
					            "uses a regex to match the PDF.  (If you don't know what a regex "
 | 
				
			||||||
            "is, you probably don't want this option."
 | 
					            "is, you probably don't want this option.)  Finally, a \"fuzzy "
 | 
				
			||||||
 | 
					            "match\" looks for words or phrases that are mostly—but not "
 | 
				
			||||||
 | 
					            "exactly—the same, which can be useful for matching against "
 | 
				
			||||||
 | 
					            "documents containing imperfections that foil accurate OCR."
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -104,6 +110,15 @@ class MatchingModel(models.Model):
 | 
				
			|||||||
            return bool(re.search(
 | 
					            return bool(re.search(
 | 
				
			||||||
                re.compile(self.match, **search_kwargs), text))
 | 
					                re.compile(self.match, **search_kwargs), text))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if self.matching_algorithm == self.MATCH_FUZZY:
 | 
				
			||||||
 | 
					            match = re.sub(r'[^\w\s]', '', self.match)
 | 
				
			||||||
 | 
					            text = re.sub(r'[^\w\s]', '', text)
 | 
				
			||||||
 | 
					            if self.is_insensitive:
 | 
				
			||||||
 | 
					                match = match.lower()
 | 
				
			||||||
 | 
					                text = text.lower()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            return True if fuzz.partial_ratio(match, text) >= 90 else False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        raise NotImplementedError("Unsupported matching algorithm")
 | 
					        raise NotImplementedError("Unsupported matching algorithm")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def save(self, *args, **kwargs):
 | 
					    def save(self, *args, **kwargs):
 | 
				
			||||||
 | 
				
			|||||||
@ -149,6 +149,22 @@ class TestMatching(TestCase):
 | 
				
			|||||||
            )
 | 
					            )
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_match_fuzzy(self):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self._test_matching(
 | 
				
			||||||
 | 
					            "Springfield, Miss.",
 | 
				
			||||||
 | 
					            "MATCH_FUZZY",
 | 
				
			||||||
 | 
					            (
 | 
				
			||||||
 | 
					                "1220 Main Street, Springf eld, Miss.",
 | 
				
			||||||
 | 
					                "1220 Main Street, Spring field, Miss.",
 | 
				
			||||||
 | 
					                "1220 Main Street, Springfeld, Miss.",
 | 
				
			||||||
 | 
					                "1220 Main Street Springfield Miss",
 | 
				
			||||||
 | 
					            ),
 | 
				
			||||||
 | 
					            (
 | 
				
			||||||
 | 
					                "1220 Main Street, Springfield, Mich.",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestApplications(TestCase):
 | 
					class TestApplications(TestCase):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user