Merge pull request #277 from ishirav/multi-word-match

Add multi-word match
2025-12-23 13:27:24 -05:00 · 2017-12-27 11:21:27 +01:00 · 2017-12-27 11:21:27 +01:00 · 06117929bb
commit 06117929bb
parent af4623e605 d1c8241947
3 changed files with 57 additions and 4 deletions
--- a/docs/guesswork.rst
+++ b/docs/guesswork.rst
@ -80,6 +80,12 @@ text and matching algorithm.  From the help info there:
    uses a regex to match the PDF.  If you don't know what a regex is, you
    probably don't want this option.

+When using the "any" or "all" matching algorithms, you can search for terms that
+consist of multiple words by enclosing them in double quotes. For example, defining
+a match text of ``"Bank of America" BofA`` using the "any" algorithm will match
+documents that contain either "Bank of America" or "BofA", but will not match
+documents containing "Bank of South America".
+
 Then just save your tag/correspondent and run another document through the
 consumer.  Once complete, you should see the newly-created document,
 automatically tagged with the appropriate data.
--- a/src/documents/models.py
+++ b/src/documents/models.py
@ -89,7 +89,7 @@ class MatchingModel(models.Model):
            search_kwargs = {"flags": re.IGNORECASE}

        if self.matching_algorithm == self.MATCH_ALL:
-            for word in self.match.split(" "):
+            for word in self._split_match():
                search_result = re.search(
                    r"\b{}\b".format(word), text, **search_kwargs)
                if not search_result:
@ -97,7 +97,7 @@ class MatchingModel(models.Model):
            return True

        if self.matching_algorithm == self.MATCH_ANY:
-            for word in self.match.split(" "):
+            for word in self._split_match():
                if re.search(r"\b{}\b".format(word), text, **search_kwargs):
                    return True
            return False
@ -121,6 +121,20 @@ class MatchingModel(models.Model):

        raise NotImplementedError("Unsupported matching algorithm")

+    def _split_match(self):
+        r'''
+        Splits the match into individual keywords, getting rid of unnecessary
+        spaces and grouping quoted words together. Each keyword is returned
+        as a regex fragment: whitespace runs inside a quoted phrase become
+        the pattern ``\s+``. Terms are not regex-escaped. Quoted terms that
+        contain only whitespace are dropped.
+        Example:
+        '  some random  words "with   quotes  " and   spaces'
+            ==>
+        ['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces']
+        '''
+        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
+        normspace = re.compile(r'\s+').sub
+        # Each findall tuple is (quoted, bare); exactly one is non-empty.
+        terms = [(t[0] or t[1]).strip() for t in findterms(self.match)]
+        # The replacement template must be r'\\s+' (an escaped backslash):
+        # a bare r'\s+' template is an unknown escape, which re.sub rejects
+        # with "bad escape" on Python 3.7+.
+        # Empty terms are skipped because an empty pattern between \b anchors
+        # would match almost any text.
+        return [normspace(r'\\s+', term) for term in terms if term]
+
    def save(self, *args, **kwargs):

        self.match = self.match.lower()
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@ -16,9 +16,15 @@ class TestMatching(TestCase):
                matching_algorithm=getattr(klass, algorithm)
            )
            for string in true:
-                self.assertTrue(instance.matches(string))
+                self.assertTrue(
+                    instance.matches(string),
+                    '"%s" should match "%s" but it does not' % (text, string)
+                )
            for string in false:
-                self.assertFalse(instance.matches(string))
+                self.assertFalse(
+                    instance.matches(string),
+                    '"%s" should not match "%s" but it does' % (text, string)
+                )

    def test_match_all(self):

@ -54,6 +60,21 @@ class TestMatching(TestCase):
            )
        )

+        self._test_matching(
+            'brown fox "lazy dogs"',
+            "MATCH_ALL",
+            (
+                "the quick brown fox jumped over the lazy dogs",
+                "the quick brown fox jumped over the lazy  dogs",
+            ),
+            (
+                "the quick fox jumped over the lazy dogs",
+                "the quick brown wolf jumped over the lazy dogs",
+                "the quick brown fox jumped over the fat dogs",
+                "the quick brown fox jumped over the lazy... dogs",
+            )
+        )
+
    def test_match_any(self):

        self._test_matching(
@ -89,6 +110,18 @@ class TestMatching(TestCase):
            )
        )

+        self._test_matching(
+            '"brown fox" " lazy  dogs "',
+            "MATCH_ANY",
+            (
+                "the quick brown fox",
+                "jumped over the lazy  dogs.",
+            ),
+            (
+                "the lazy fox jumped over the brown dogs",
+            )
+        )
+
    def test_match_literal(self):

        self._test_matching(