mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-10-23 23:09:08 -04:00
Adds multiprocessing to the fuzzy matching for a speedup
This commit is contained in:
parent
ce8bf90663
commit
a03a745295
@ -586,6 +586,7 @@ those which look close according to a given ratio.
|
||||
document_fuzzy_match [--ratio]
|
||||
```
|
||||
|
||||
Optional arguments:
|
||||
--ratio - a number between 0 and 100, setting how similar a document must be for it to be reported.
|
||||
Higher numbers mean more similarity.
|
||||
| Option | Required | Default | Description |
|
||||
| ----------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
|
||||
| --processes | No | 4 | Number of processes to use for matching. Setting 1 disables multiple processes |
|
||||
|
@ -1,3 +1,5 @@
|
||||
import dataclasses
|
||||
import multiprocessing
|
||||
from typing import Final
|
||||
|
||||
import rapidfuzz
|
||||
@ -8,8 +10,35 @@ from django.core.management import CommandError
|
||||
from documents.models import Document
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class _WorkPackage:
|
||||
first_doc: Document
|
||||
second_doc: Document
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class _WorkResult:
|
||||
doc_one_pk: int
|
||||
doc_two_pk: int
|
||||
ratio: float
|
||||
|
||||
def __lt__(self, other: "_WorkResult") -> bool:
|
||||
return self.doc_one_pk < other.doc_one_pk
|
||||
|
||||
|
||||
def _process_and_match(work: _WorkPackage) -> _WorkResult:
|
||||
# Normalize the string some, lower case, whitespace, etc
|
||||
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
|
||||
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
|
||||
|
||||
# Basic matching ratio
|
||||
match = rapidfuzz.fuzz.ratio(first_string, second_string)
|
||||
|
||||
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "Manages the document index."
|
||||
help = "Searches for documents where the content almost matches"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
@ -18,6 +47,12 @@ class Command(BaseCommand):
|
||||
type=float,
|
||||
help="Ratio to consider documents a match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--processes",
|
||||
default=4,
|
||||
type=int,
|
||||
help="Number of processes to distribute work amongst",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-progress-bar",
|
||||
default=False,
|
||||
@ -30,8 +65,8 @@ class Command(BaseCommand):
|
||||
RATIO_MAX: Final[float] = 100.0
|
||||
|
||||
opt_ratio = options["ratio"]
|
||||
progress_bar_disable = options["no_progress_bar"]
|
||||
match_pairs = set()
|
||||
checked_pairs: set[tuple[int, int]] = set()
|
||||
work_pkgs: list[_WorkPackage] = []
|
||||
|
||||
# Ratio is a float from 0.0 to 100.0
|
||||
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
|
||||
@ -39,42 +74,54 @@ class Command(BaseCommand):
|
||||
|
||||
all_docs = Document.objects.all().order_by("id")
|
||||
|
||||
messages = []
|
||||
|
||||
for first_doc in tqdm.tqdm(all_docs, disable=progress_bar_disable):
|
||||
# Build work packages for processing
|
||||
for first_doc in all_docs:
|
||||
for second_doc in all_docs:
|
||||
# doc to doc is obviously not useful
|
||||
if first_doc.pk == second_doc.pk:
|
||||
continue
|
||||
# Skip matching which have already been matched together
|
||||
# doc 1 to doc 2 is the same as doc 2 to doc 1
|
||||
if (first_doc.pk, second_doc.pk) in checked_pairs or (
|
||||
second_doc.pk,
|
||||
first_doc.pk,
|
||||
) in checked_pairs:
|
||||
continue
|
||||
checked_pairs.update(
|
||||
[(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)],
|
||||
)
|
||||
|
||||
# Normalize the string some, lower case, whitespace, etc
|
||||
first_string = rapidfuzz.utils.default_process(first_doc.content)
|
||||
second_string = rapidfuzz.utils.default_process(second_doc.content)
|
||||
work_pkgs.append(_WorkPackage(first_doc, second_doc))
|
||||
|
||||
# Basic matching ratio
|
||||
match = rapidfuzz.fuzz.ratio(first_string, second_string)
|
||||
# Don't spin up a pool of 1 process
|
||||
if options["processes"] == 1:
|
||||
results = []
|
||||
for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]):
|
||||
results.append(_process_and_match(work))
|
||||
else:
|
||||
with multiprocessing.Pool(processes=options["processes"]) as pool:
|
||||
results = list(
|
||||
tqdm.tqdm(
|
||||
pool.imap_unordered(_process_and_match, work_pkgs),
|
||||
total=len(work_pkgs),
|
||||
disable=options["no_progress_bar"],
|
||||
),
|
||||
)
|
||||
|
||||
if match >= opt_ratio:
|
||||
# Skip matching which have already been matched together
|
||||
# doc 1 to doc 2 is the same as doc 2 to doc 1
|
||||
if (first_doc.pk, second_doc.pk) in match_pairs or (
|
||||
second_doc.pk,
|
||||
first_doc.pk,
|
||||
) in match_pairs:
|
||||
continue
|
||||
else:
|
||||
match_pairs.add((first_doc.pk, second_doc.pk))
|
||||
match_pairs.add((second_doc.pk, first_doc.pk))
|
||||
|
||||
messages.append(
|
||||
self.style.NOTICE(
|
||||
f"Document {first_doc.pk} fuzzy match"
|
||||
f" to {second_doc.pk} (confidence {match:.3f})",
|
||||
),
|
||||
)
|
||||
# Check results
|
||||
messages = []
|
||||
for result in sorted(results):
|
||||
if result.ratio >= opt_ratio:
|
||||
messages.append(
|
||||
self.style.NOTICE(
|
||||
f"Document {result.doc_one_pk} fuzzy match"
|
||||
f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
|
||||
),
|
||||
)
|
||||
|
||||
if len(messages) == 0:
|
||||
messages.append(
|
||||
self.style.NOTICE("No matches found"),
|
||||
self.style.SUCCESS("No matches found"),
|
||||
)
|
||||
self.stdout.writelines(
|
||||
messages,
|
||||
|
@ -13,6 +13,7 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
stderr = StringIO()
|
||||
call_command(
|
||||
"document_fuzzy_match",
|
||||
"--no-progress-bar",
|
||||
*args,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
@ -63,7 +64,7 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
mime_type="application/pdf",
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command()
|
||||
stdout, _ = self.call_command("--processes", "1")
|
||||
self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
|
||||
|
||||
def test_with_3_matches(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user