mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-04 03:27:12 -05:00 
			
		
		
		
	removed matching model fields, automatic classifier reloading, added automatic_classification field to matching model
This commit is contained in:
		
							parent
							
								
									30134034e2
								
							
						
					
					
						commit
						70bd05450a
					
				@ -102,9 +102,8 @@ class CommonAdmin(admin.ModelAdmin):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class CorrespondentAdmin(CommonAdmin):
 | 
					class CorrespondentAdmin(CommonAdmin):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    list_display = ("name", "match", "matching_algorithm", "document_count", "last_correspondence")
 | 
					    list_display = ("name", "automatic_classification", "document_count", "last_correspondence")
 | 
				
			||||||
    list_filter = ("matching_algorithm",)
 | 
					    list_editable = ("automatic_classification",)
 | 
				
			||||||
    list_editable = ("match", "matching_algorithm")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_queryset(self, request):
 | 
					    def get_queryset(self, request):
 | 
				
			||||||
        qs = super(CorrespondentAdmin, self).get_queryset(request)
 | 
					        qs = super(CorrespondentAdmin, self).get_queryset(request)
 | 
				
			||||||
@ -122,10 +121,9 @@ class CorrespondentAdmin(CommonAdmin):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class TagAdmin(CommonAdmin):
 | 
					class TagAdmin(CommonAdmin):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    list_display = ("name", "colour", "match", "matching_algorithm",
 | 
					    list_display = ("name", "colour", "automatic_classification", "document_count")
 | 
				
			||||||
                    "document_count")
 | 
					    list_filter = ("colour",)
 | 
				
			||||||
    list_filter = ("colour", "matching_algorithm")
 | 
					    list_editable = ("colour", "automatic_classification")
 | 
				
			||||||
    list_editable = ("colour", "match", "matching_algorithm")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_queryset(self, request):
 | 
					    def get_queryset(self, request):
 | 
				
			||||||
        qs = super(TagAdmin, self).get_queryset(request)
 | 
					        qs = super(TagAdmin, self).get_queryset(request)
 | 
				
			||||||
@ -139,9 +137,8 @@ class TagAdmin(CommonAdmin):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class DocumentTypeAdmin(CommonAdmin):
 | 
					class DocumentTypeAdmin(CommonAdmin):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    list_display = ("name", "match", "matching_algorithm", "document_count")
 | 
					    list_display = ("name", "automatic_classification", "document_count")
 | 
				
			||||||
    list_filter = ("matching_algorithm",)
 | 
					    list_editable = ("automatic_classification",)
 | 
				
			||||||
    list_editable = ("match", "matching_algorithm")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_queryset(self, request):
 | 
					    def get_queryset(self, request):
 | 
				
			||||||
        qs = super(DocumentTypeAdmin, self).get_queryset(request)
 | 
					        qs = super(DocumentTypeAdmin, self).get_queryset(request)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,3 +1,4 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
import pickle
 | 
					import pickle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from documents.models import Correspondent, DocumentType, Tag
 | 
					from documents.models import Correspondent, DocumentType, Tag
 | 
				
			||||||
@ -16,6 +17,18 @@ def preprocess_content(content):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class DocumentClassifier(object):
 | 
					class DocumentClassifier(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    classifier_version = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    data_vectorizer = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    tags_binarizer = None
 | 
				
			||||||
 | 
					    correspondent_binarizer = None
 | 
				
			||||||
 | 
					    type_binarizer = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    tags_classifier = None
 | 
				
			||||||
 | 
					    correspondent_classifier = None
 | 
				
			||||||
 | 
					    type_classifier = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def load_classifier():
 | 
					    def load_classifier():
 | 
				
			||||||
        clf = DocumentClassifier()
 | 
					        clf = DocumentClassifier()
 | 
				
			||||||
@ -23,6 +36,8 @@ class DocumentClassifier(object):
 | 
				
			|||||||
        return clf
 | 
					        return clf
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def reload(self):
 | 
					    def reload(self):
 | 
				
			||||||
 | 
					        if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
 | 
				
			||||||
 | 
					            print("reloading classifier")
 | 
				
			||||||
            with open(settings.MODEL_FILE, "rb") as f:
 | 
					            with open(settings.MODEL_FILE, "rb") as f:
 | 
				
			||||||
                self.data_vectorizer = pickle.load(f)
 | 
					                self.data_vectorizer = pickle.load(f)
 | 
				
			||||||
                self.tags_binarizer = pickle.load(f)
 | 
					                self.tags_binarizer = pickle.load(f)
 | 
				
			||||||
@ -32,6 +47,7 @@ class DocumentClassifier(object):
 | 
				
			|||||||
                self.tags_classifier = pickle.load(f)
 | 
					                self.tags_classifier = pickle.load(f)
 | 
				
			||||||
                self.correspondent_classifier = pickle.load(f)
 | 
					                self.correspondent_classifier = pickle.load(f)
 | 
				
			||||||
                self.type_classifier = pickle.load(f)
 | 
					                self.type_classifier = pickle.load(f)
 | 
				
			||||||
 | 
					            self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def save_classifier(self):
 | 
					    def save_classifier(self):
 | 
				
			||||||
        with open(settings.MODEL_FILE, "wb") as f:
 | 
					        with open(settings.MODEL_FILE, "wb") as f:
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										6
									
								
								src/documents/consumer.py
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										6
									
								
								src/documents/consumer.py
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@ -221,12 +221,6 @@ class Consumer:
 | 
				
			|||||||
                storage_type=self.storage_type
 | 
					                storage_type=self.storage_type
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
 | 
					 | 
				
			||||||
        if relevant_tags:
 | 
					 | 
				
			||||||
            tag_names = ", ".join([t.slug for t in relevant_tags])
 | 
					 | 
				
			||||||
            self.log("debug", "Tagging with {}".format(tag_names))
 | 
					 | 
				
			||||||
            document.tags.add(*relevant_tags)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self._write(document, doc, document.source_path)
 | 
					        self._write(document, doc, document.source_path)
 | 
				
			||||||
        self._write(document, thumbnail, document.thumbnail_path)
 | 
					        self._write(document, thumbnail, document.thumbnail_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -42,9 +42,14 @@ class Command(Renderable, BaseCommand):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        # Step 2: vectorize data
 | 
					        # Step 2: vectorize data
 | 
				
			||||||
        logging.getLogger(__name__).info("Vectorizing data...")
 | 
					        logging.getLogger(__name__).info("Vectorizing data...")
 | 
				
			||||||
        clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
 | 
					        clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
 | 
				
			||||||
        data_vectorized = clf.data_vectorizer.fit_transform(data)
 | 
					        data_vectorized = clf.data_vectorizer.fit_transform(data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        print(clf.data_vectorizer.vocabulary_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        logging.getLogger(__name__).info("Shape of vectorized data: {}".format(data_vectorized.shape))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        clf.tags_binarizer = MultiLabelBinarizer()
 | 
					        clf.tags_binarizer = MultiLabelBinarizer()
 | 
				
			||||||
        labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)
 | 
					        labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -46,7 +46,11 @@ class Command(Renderable, BaseCommand):
 | 
				
			|||||||
            documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()
 | 
					            documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        logging.getLogger(__name__).info("Loading classifier")
 | 
					        logging.getLogger(__name__).info("Loading classifier")
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
            clf = DocumentClassifier.load_classifier()
 | 
					            clf = DocumentClassifier.load_classifier()
 | 
				
			||||||
 | 
					        except FileNotFoundError:
 | 
				
			||||||
 | 
					            logging.getLogger(__name__).fatal("Cannot classify documents, classifier model file was not found.")
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for document in documents:
 | 
					        for document in documents:
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										77
									
								
								src/documents/migrations/0024_auto_20180904_1425.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										77
									
								
								src/documents/migrations/0024_auto_20180904_1425.py
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,77 @@
 | 
				
			|||||||
 | 
					# Generated by Django 2.0.8 on 2018-09-04 14:25
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from django.db import migrations, models
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def transfer_automatic_classification(apps, schema_editor):
 | 
				
			||||||
 | 
					    for model_name in ["Tag", "Correspondent", "DocumentType"]:
 | 
				
			||||||
 | 
					        model_class = apps.get_model("documents", model_name)
 | 
				
			||||||
 | 
					        for o in model_class.objects.all():
 | 
				
			||||||
 | 
					            o.automatic_classification = o.match is not None and len(o.match) > 0
 | 
				
			||||||
 | 
					            o.save()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def reverse_automatic_classification(apps, schema_editor):
 | 
				
			||||||
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Migration(migrations.Migration):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    dependencies = [
 | 
				
			||||||
 | 
					        ('documents', '0023_auto_20180823_1155'),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    operations = [
 | 
				
			||||||
 | 
					        migrations.AddField(
 | 
				
			||||||
 | 
					            model_name='correspondent',
 | 
				
			||||||
 | 
					            name='automatic_classification',
 | 
				
			||||||
 | 
					            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.AddField(
 | 
				
			||||||
 | 
					            model_name='documenttype',
 | 
				
			||||||
 | 
					            name='automatic_classification',
 | 
				
			||||||
 | 
					            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.AddField(
 | 
				
			||||||
 | 
					            model_name='tag',
 | 
				
			||||||
 | 
					            name='automatic_classification',
 | 
				
			||||||
 | 
					            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='correspondent',
 | 
				
			||||||
 | 
					            name='is_insensitive',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='correspondent',
 | 
				
			||||||
 | 
					            name='match',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='correspondent',
 | 
				
			||||||
 | 
					            name='matching_algorithm',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='documenttype',
 | 
				
			||||||
 | 
					            name='is_insensitive',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='documenttype',
 | 
				
			||||||
 | 
					            name='match',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='documenttype',
 | 
				
			||||||
 | 
					            name='matching_algorithm',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='tag',
 | 
				
			||||||
 | 
					            name='is_insensitive',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='tag',
 | 
				
			||||||
 | 
					            name='match',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        migrations.RemoveField(
 | 
				
			||||||
 | 
					            model_name='tag',
 | 
				
			||||||
 | 
					            name='matching_algorithm',
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
@ -15,48 +15,15 @@ from django.db import models
 | 
				
			|||||||
from django.template.defaultfilters import slugify
 | 
					from django.template.defaultfilters import slugify
 | 
				
			||||||
from django.utils import timezone
 | 
					from django.utils import timezone
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from reminders.models import Reminder
 | 
					 | 
				
			||||||
from .managers import LogManager
 | 
					from .managers import LogManager
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class MatchingModel(models.Model):
 | 
					class MatchingModel(models.Model):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    MATCH_ANY = 1
 | 
					 | 
				
			||||||
    MATCH_ALL = 2
 | 
					 | 
				
			||||||
    MATCH_LITERAL = 3
 | 
					 | 
				
			||||||
    MATCH_REGEX = 4
 | 
					 | 
				
			||||||
    MATCH_FUZZY = 5
 | 
					 | 
				
			||||||
    MATCHING_ALGORITHMS = (
 | 
					 | 
				
			||||||
        (MATCH_ANY, "Any"),
 | 
					 | 
				
			||||||
        (MATCH_ALL, "All"),
 | 
					 | 
				
			||||||
        (MATCH_LITERAL, "Literal"),
 | 
					 | 
				
			||||||
        (MATCH_REGEX, "Regular Expression"),
 | 
					 | 
				
			||||||
        (MATCH_FUZZY, "Fuzzy Match"),
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    name = models.CharField(max_length=128, unique=True)
 | 
					    name = models.CharField(max_length=128, unique=True)
 | 
				
			||||||
    slug = models.SlugField(blank=True)
 | 
					    slug = models.SlugField(blank=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    match = models.CharField(max_length=256, blank=True)
 | 
					    automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')
 | 
				
			||||||
    matching_algorithm = models.PositiveIntegerField(
 | 
					 | 
				
			||||||
        choices=MATCHING_ALGORITHMS,
 | 
					 | 
				
			||||||
        default=MATCH_ANY,
 | 
					 | 
				
			||||||
        help_text=(
 | 
					 | 
				
			||||||
            "Which algorithm you want to use when matching text to the OCR'd "
 | 
					 | 
				
			||||||
            "PDF.  Here, \"any\" looks for any occurrence of any word "
 | 
					 | 
				
			||||||
            "provided in the PDF, while \"all\" requires that every word "
 | 
					 | 
				
			||||||
            "provided appear in the PDF, albeit not in the order provided.  A "
 | 
					 | 
				
			||||||
            "\"literal\" match means that the text you enter must appear in "
 | 
					 | 
				
			||||||
            "the PDF exactly as you've entered it, and \"regular expression\" "
 | 
					 | 
				
			||||||
            "uses a regex to match the PDF.  (If you don't know what a regex "
 | 
					 | 
				
			||||||
            "is, you probably don't want this option.)  Finally, a \"fuzzy "
 | 
					 | 
				
			||||||
            "match\" looks for words or phrases that are mostly—but not "
 | 
					 | 
				
			||||||
            "exactly—the same, which can be useful for matching against "
 | 
					 | 
				
			||||||
            "documents containg imperfections that foil accurate OCR."
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    is_insensitive = models.BooleanField(default=True)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    class Meta:
 | 
					    class Meta:
 | 
				
			||||||
        abstract = True
 | 
					        abstract = True
 | 
				
			||||||
@ -64,87 +31,8 @@ class MatchingModel(models.Model):
 | 
				
			|||||||
    def __str__(self):
 | 
					    def __str__(self):
 | 
				
			||||||
        return self.name
 | 
					        return self.name
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					 | 
				
			||||||
    def conditions(self):
 | 
					 | 
				
			||||||
        return "{}: \"{}\" ({})".format(
 | 
					 | 
				
			||||||
            self.name, self.match, self.get_matching_algorithm_display())
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @classmethod
 | 
					 | 
				
			||||||
    def match_all(cls, text, tags=None):
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if tags is None:
 | 
					 | 
				
			||||||
            tags = cls.objects.all()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        text = text.lower()
 | 
					 | 
				
			||||||
        for tag in tags:
 | 
					 | 
				
			||||||
            if tag.matches(text):
 | 
					 | 
				
			||||||
                yield tag
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def matches(self, text):
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        search_kwargs = {}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Check that match is not empty
 | 
					 | 
				
			||||||
        if self.match.strip() == "":
 | 
					 | 
				
			||||||
            return False
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if self.is_insensitive:
 | 
					 | 
				
			||||||
            search_kwargs = {"flags": re.IGNORECASE}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if self.matching_algorithm == self.MATCH_ALL:
 | 
					 | 
				
			||||||
            for word in self._split_match():
 | 
					 | 
				
			||||||
                search_result = re.search(
 | 
					 | 
				
			||||||
                    r"\b{}\b".format(word), text, **search_kwargs)
 | 
					 | 
				
			||||||
                if not search_result:
 | 
					 | 
				
			||||||
                    return False
 | 
					 | 
				
			||||||
            return True
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if self.matching_algorithm == self.MATCH_ANY:
 | 
					 | 
				
			||||||
            for word in self._split_match():
 | 
					 | 
				
			||||||
                if re.search(r"\b{}\b".format(word), text, **search_kwargs):
 | 
					 | 
				
			||||||
                    return True
 | 
					 | 
				
			||||||
            return False
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if self.matching_algorithm == self.MATCH_LITERAL:
 | 
					 | 
				
			||||||
            return bool(re.search(
 | 
					 | 
				
			||||||
                r"\b{}\b".format(self.match), text, **search_kwargs))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if self.matching_algorithm == self.MATCH_REGEX:
 | 
					 | 
				
			||||||
            return bool(re.search(
 | 
					 | 
				
			||||||
                re.compile(self.match, **search_kwargs), text))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if self.matching_algorithm == self.MATCH_FUZZY:
 | 
					 | 
				
			||||||
            match = re.sub(r'[^\w\s]', '', self.match)
 | 
					 | 
				
			||||||
            text = re.sub(r'[^\w\s]', '', text)
 | 
					 | 
				
			||||||
            if self.is_insensitive:
 | 
					 | 
				
			||||||
                match = match.lower()
 | 
					 | 
				
			||||||
                text = text.lower()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            return True if fuzz.partial_ratio(match, text) >= 90 else False
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        raise NotImplementedError("Unsupported matching algorithm")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _split_match(self):
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        Splits the match to individual keywords, getting rid of unnecessary
 | 
					 | 
				
			||||||
        spaces and grouping quoted words together.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Example:
 | 
					 | 
				
			||||||
          '  some random  words "with   quotes  " and   spaces'
 | 
					 | 
				
			||||||
            ==>
 | 
					 | 
				
			||||||
          ["some", "random", "words", "with\s+quotes", "and", "spaces"]
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
 | 
					 | 
				
			||||||
        normspace = re.compile(r"\s+").sub
 | 
					 | 
				
			||||||
        return [
 | 
					 | 
				
			||||||
            normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
 | 
					 | 
				
			||||||
            for t in findterms(self.match)
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def save(self, *args, **kwargs):
 | 
					    def save(self, *args, **kwargs):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.match = self.match.lower()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if not self.slug:
 | 
					        if not self.slug:
 | 
				
			||||||
            self.slug = slugify(self.name)
 | 
					            self.slug = slugify(self.name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -16,15 +16,17 @@ def logger(message, group):
 | 
				
			|||||||
    logging.getLogger(__name__).debug(message, extra={"group": group})
 | 
					    logging.getLogger(__name__).debug(message, extra={"group": group})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
classifier = None
 | 
					classifier = DocumentClassifier()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def classify_document(sender, document=None, logging_group=None, **kwargs):
 | 
					def classify_document(sender, document=None, logging_group=None, **kwargs):
 | 
				
			||||||
    global classifier
 | 
					    global classifier
 | 
				
			||||||
    if classifier is None:
 | 
					    try:
 | 
				
			||||||
        classifier = DocumentClassifier.load_classifier()
 | 
					        classifier.reload()
 | 
				
			||||||
 | 
					 | 
				
			||||||
        classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
 | 
					        classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
 | 
				
			||||||
 | 
					    except FileNotFoundError:
 | 
				
			||||||
 | 
					        logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user