mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	add fuzzy matching + tests
This commit is contained in:
		
							parent
							
								
									819a0e1f57
								
							
						
					
					
						commit
						6ce27d225d
					
				@ -6,6 +6,7 @@ django-filter>=1.0
 | 
			
		||||
django-flat-responsive>=1.2.0
 | 
			
		||||
djangorestframework>=3.5.3
 | 
			
		||||
filemagic>=1.6
 | 
			
		||||
fuzzywuzzy[speedup]==0.15.0
 | 
			
		||||
langdetect>=1.0.7
 | 
			
		||||
pyocr>=0.4.6
 | 
			
		||||
python-dateutil>=2.6.0
 | 
			
		||||
 | 
			
		||||
@ -5,6 +5,7 @@ import re
 | 
			
		||||
import uuid
 | 
			
		||||
 | 
			
		||||
from collections import OrderedDict
 | 
			
		||||
from fuzzywuzzy import fuzz
 | 
			
		||||
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.core.urlresolvers import reverse
 | 
			
		||||
@ -21,11 +22,13 @@ class MatchingModel(models.Model):
 | 
			
		||||
    MATCH_ALL = 2
 | 
			
		||||
    MATCH_LITERAL = 3
 | 
			
		||||
    MATCH_REGEX = 4
 | 
			
		||||
    MATCH_FUZZY = 5
 | 
			
		||||
    MATCHING_ALGORITHMS = (
 | 
			
		||||
        (MATCH_ANY, "Any"),
 | 
			
		||||
        (MATCH_ALL, "All"),
 | 
			
		||||
        (MATCH_LITERAL, "Literal"),
 | 
			
		||||
        (MATCH_REGEX, "Regular Expression"),
 | 
			
		||||
        (MATCH_FUZZY, "Fuzzy Match"),
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    name = models.CharField(max_length=128, unique=True)
 | 
			
		||||
@ -42,8 +45,13 @@ class MatchingModel(models.Model):
 | 
			
		||||
            "provided appear in the PDF, albeit not in the order provided.  A "
 | 
			
		||||
            "\"literal\" match means that the text you enter must appear in "
 | 
			
		||||
            "the PDF exactly as you've entered it, and \"regular expression\" "
 | 
			
		||||
            "uses a regex to match the PDF.  If you don't know what a regex "
 | 
			
		||||
            "is, you probably don't want this option."
 | 
			
		||||
            "uses a regex to match the PDF.  (If you don't know what a regex "
 | 
			
		||||
            "is, you probably don't want this option.)  Finally, a \"fuzzy "
 | 
			
		||||
            "match\" strips all punctuation from both the match candidate "
 | 
			
		||||
            "and the OCR'd text and looks for a Levenshtein \"partial ratio\" "
 | 
			
		||||
            "(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, "
 | 
			
		||||
            "which can be useful for matching against documents with "
 | 
			
		||||
            "imperfections that foil accurate OCR."
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
@ -104,6 +112,15 @@ class MatchingModel(models.Model):
 | 
			
		||||
            return bool(re.search(
 | 
			
		||||
                re.compile(self.match, **search_kwargs), text))
 | 
			
		||||
 | 
			
		||||
        if self.matching_algorithm == self.MATCH_FUZZY:
 | 
			
		||||
            match = re.sub(r'[^\w\s]', '', self.match)
 | 
			
		||||
            text = re.sub(r'[^\w\s]', '', text)
 | 
			
		||||
            if self.is_insensitive:
 | 
			
		||||
                match = match.lower()
 | 
			
		||||
                text = text.lower()
 | 
			
		||||
 | 
			
		||||
            return True if fuzz.partial_ratio(match, text) >= 90 else False
 | 
			
		||||
 | 
			
		||||
        raise NotImplementedError("Unsupported matching algorithm")
 | 
			
		||||
 | 
			
		||||
    def save(self, *args, **kwargs):
 | 
			
		||||
 | 
			
		||||
@ -149,6 +149,22 @@ class TestMatching(TestCase):
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def test_match_fuzzy(self):
        # Pattern handed to the shared helper together with the name of the
        # fuzzy-matching algorithm constant.
        pattern = "Springfield, Miss."

        # Variants of the same address: a dropped letter, a stray space, a
        # misspelling, and a version with all punctuation removed.
        # NOTE(review): presumably these are the texts expected to match --
        # confirm against _test_matching's parameter order.
        ocr_variants = (
            "1220 Main Street, Springf eld, Miss.",
            "1220 Main Street, Spring field, Miss.",
            "1220 Main Street, Springfeld, Miss.",
            "1220 Main Street Springfield Miss",
        )

        # Same street and city, but a different state abbreviation.
        # NOTE(review): presumably expected NOT to match -- confirm as above.
        other_address = (
            "1220 Main Street, Springfield, Mich.",
        )

        self._test_matching(
            pattern, "MATCH_FUZZY", ocr_variants, other_address)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestApplications(TestCase):
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user