mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 19:17:13 -05:00 
			
		
		
		
	some search index optimizations
This commit is contained in:
		
							parent
							
								
									56bd966c02
								
							
						
					
					
						commit
						8bf4241b16
					
				@ -1,7 +1,5 @@
 | 
				
			|||||||
from django.contrib import admin
 | 
					from django.contrib import admin
 | 
				
			||||||
from whoosh.writing import AsyncWriter
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from . import index
 | 
					 | 
				
			||||||
from .models import Correspondent, Document, DocumentType, Tag, \
 | 
					from .models import Correspondent, Document, DocumentType, Tag, \
 | 
				
			||||||
    SavedView, SavedViewFilterRule
 | 
					    SavedView, SavedViewFilterRule
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -84,17 +82,21 @@ class DocumentAdmin(admin.ModelAdmin):
 | 
				
			|||||||
    created_.short_description = "Created"
 | 
					    created_.short_description = "Created"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def delete_queryset(self, request, queryset):
 | 
					    def delete_queryset(self, request, queryset):
 | 
				
			||||||
        ix = index.open_index()
 | 
					        from documents import index
 | 
				
			||||||
        with AsyncWriter(ix) as writer:
 | 
					
 | 
				
			||||||
 | 
					        with index.open_index_writer() as writer:
 | 
				
			||||||
            for o in queryset:
 | 
					            for o in queryset:
 | 
				
			||||||
                index.remove_document(writer, o)
 | 
					                index.remove_document(writer, o)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        super(DocumentAdmin, self).delete_queryset(request, queryset)
 | 
					        super(DocumentAdmin, self).delete_queryset(request, queryset)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def delete_model(self, request, obj):
 | 
					    def delete_model(self, request, obj):
 | 
				
			||||||
 | 
					        from documents import index
 | 
				
			||||||
        index.remove_document_from_index(obj)
 | 
					        index.remove_document_from_index(obj)
 | 
				
			||||||
        super(DocumentAdmin, self).delete_model(request, obj)
 | 
					        super(DocumentAdmin, self).delete_model(request, obj)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def save_model(self, request, obj, form, change):
 | 
					    def save_model(self, request, obj, form, change):
 | 
				
			||||||
 | 
					        from documents import index
 | 
				
			||||||
        index.add_or_update_document(obj)
 | 
					        index.add_or_update_document(obj)
 | 
				
			||||||
        super(DocumentAdmin, self).save_model(request, obj, form, change)
 | 
					        super(DocumentAdmin, self).save_model(request, obj, form, change)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -2,9 +2,7 @@ import itertools
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from django.db.models import Q
 | 
					from django.db.models import Q
 | 
				
			||||||
from django_q.tasks import async_task
 | 
					from django_q.tasks import async_task
 | 
				
			||||||
from whoosh.writing import AsyncWriter
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from documents import index
 | 
					 | 
				
			||||||
from documents.models import Document, Correspondent, DocumentType
 | 
					from documents.models import Document, Correspondent, DocumentType
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
 | 
				
			|||||||
def delete(doc_ids):
 | 
					def delete(doc_ids):
 | 
				
			||||||
    Document.objects.filter(id__in=doc_ids).delete()
 | 
					    Document.objects.filter(id__in=doc_ids).delete()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ix = index.open_index()
 | 
					    from documents import index
 | 
				
			||||||
    with AsyncWriter(ix) as writer:
 | 
					
 | 
				
			||||||
 | 
					    with index.open_index_writer() as writer:
 | 
				
			||||||
        for id in doc_ids:
 | 
					        for id in doc_ids:
 | 
				
			||||||
            index.remove_document_by_id(writer, id)
 | 
					            index.remove_document_by_id(writer, id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -86,6 +86,22 @@ def open_index(recreate=False):
 | 
				
			|||||||
    return create_in(settings.INDEX_DIR, get_schema())
 | 
					    return create_in(settings.INDEX_DIR, get_schema())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@contextmanager
 | 
				
			||||||
 | 
					def open_index_writer(ix=None, optimize=False):
 | 
				
			||||||
 | 
					    if ix:
 | 
				
			||||||
 | 
					        writer = AsyncWriter(ix)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        writer = AsyncWriter(open_index())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        yield writer
 | 
				
			||||||
 | 
					    except Exception as e:
 | 
				
			||||||
 | 
					        logger.exception(str(e))
 | 
				
			||||||
 | 
					        writer.cancel()
 | 
				
			||||||
 | 
					    finally:
 | 
				
			||||||
 | 
					        writer.commit(optimize=optimize)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def update_document(writer, doc):
 | 
					def update_document(writer, doc):
 | 
				
			||||||
    tags = ",".join([t.name for t in doc.tags.all()])
 | 
					    tags = ",".join([t.name for t in doc.tags.all()])
 | 
				
			||||||
    writer.update_document(
 | 
					    writer.update_document(
 | 
				
			||||||
@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def add_or_update_document(document):
 | 
					def add_or_update_document(document):
 | 
				
			||||||
    ix = open_index()
 | 
					    with open_index_writer() as writer:
 | 
				
			||||||
    with AsyncWriter(ix) as writer:
 | 
					 | 
				
			||||||
        update_document(writer, document)
 | 
					        update_document(writer, document)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def remove_document_from_index(document):
 | 
					def remove_document_from_index(document):
 | 
				
			||||||
    ix = open_index()
 | 
					    with open_index_writer() as writer:
 | 
				
			||||||
    with AsyncWriter(ix) as writer:
 | 
					 | 
				
			||||||
        remove_document(writer, document)
 | 
					        remove_document(writer, document)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -11,7 +11,7 @@ from django.dispatch import receiver
 | 
				
			|||||||
from django.utils import timezone
 | 
					from django.utils import timezone
 | 
				
			||||||
from filelock import FileLock
 | 
					from filelock import FileLock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .. import index, matching
 | 
					from .. import matching
 | 
				
			||||||
from ..file_handling import delete_empty_directories, \
 | 
					from ..file_handling import delete_empty_directories, \
 | 
				
			||||||
    create_source_path_directory, \
 | 
					    create_source_path_directory, \
 | 
				
			||||||
    generate_unique_filename
 | 
					    generate_unique_filename
 | 
				
			||||||
@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def add_to_index(sender, document, **kwargs):
 | 
					def add_to_index(sender, document, **kwargs):
 | 
				
			||||||
 | 
					    from documents import index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    index.add_or_update_document(document)
 | 
					    index.add_or_update_document(document)
 | 
				
			||||||
 | 
				
			|||||||
@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
 | 
				
			|||||||
from django.test import TestCase
 | 
					from django.test import TestCase
 | 
				
			||||||
from django.utils import timezone
 | 
					from django.utils import timezone
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from documents import index
 | 
				
			||||||
from documents.admin import DocumentAdmin
 | 
					from documents.admin import DocumentAdmin
 | 
				
			||||||
from documents.models import Document
 | 
					from documents.models import Document
 | 
				
			||||||
from documents.tests.utils import DirectoriesMixin
 | 
					from documents.tests.utils import DirectoriesMixin
 | 
				
			||||||
@ -11,37 +12,52 @@ from documents.tests.utils import DirectoriesMixin
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class TestDocumentAdmin(DirectoriesMixin, TestCase):
 | 
					class TestDocumentAdmin(DirectoriesMixin, TestCase):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_document_from_index(self, doc):
 | 
				
			||||||
 | 
					        ix = index.open_index()
 | 
				
			||||||
 | 
					        with ix.searcher() as searcher:
 | 
				
			||||||
 | 
					            return searcher.document(id=doc.id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def setUp(self) -> None:
 | 
					    def setUp(self) -> None:
 | 
				
			||||||
        super(TestDocumentAdmin, self).setUp()
 | 
					        super(TestDocumentAdmin, self).setUp()
 | 
				
			||||||
        self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
 | 
					        self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("documents.admin.index.add_or_update_document")
 | 
					    def test_save_model(self):
 | 
				
			||||||
    def test_save_model(self, m):
 | 
					 | 
				
			||||||
        doc = Document.objects.create(title="test")
 | 
					        doc = Document.objects.create(title="test")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        doc.title = "new title"
 | 
					        doc.title = "new title"
 | 
				
			||||||
        self.doc_admin.save_model(None, doc, None, None)
 | 
					        self.doc_admin.save_model(None, doc, None, None)
 | 
				
			||||||
        self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
 | 
					        self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
 | 
				
			||||||
        m.assert_called_once()
 | 
					        self.assertEqual(self.get_document_from_index(doc)['title'], "new title")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("documents.admin.index.remove_document")
 | 
					    def test_delete_model(self):
 | 
				
			||||||
    def test_delete_model(self, m):
 | 
					 | 
				
			||||||
        doc = Document.objects.create(title="test")
 | 
					        doc = Document.objects.create(title="test")
 | 
				
			||||||
        self.doc_admin.delete_model(None, doc)
 | 
					        index.add_or_update_document(doc)
 | 
				
			||||||
        self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
 | 
					        self.assertIsNotNone(self.get_document_from_index(doc))
 | 
				
			||||||
        m.assert_called_once()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @mock.patch("documents.admin.index.remove_document")
 | 
					        self.doc_admin.delete_model(None, doc)
 | 
				
			||||||
    def test_delete_queryset(self, m):
 | 
					
 | 
				
			||||||
 | 
					        self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
 | 
				
			||||||
 | 
					        self.assertIsNone(self.get_document_from_index(doc))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_delete_queryset(self):
 | 
				
			||||||
 | 
					        docs = []
 | 
				
			||||||
        for i in range(42):
 | 
					        for i in range(42):
 | 
				
			||||||
            Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
 | 
					            doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
 | 
				
			||||||
 | 
					            docs.append(doc)
 | 
				
			||||||
 | 
					            index.add_or_update_document(doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(Document.objects.count(), 42)
 | 
					        self.assertEqual(Document.objects.count(), 42)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for doc in docs:
 | 
				
			||||||
 | 
					            self.assertIsNotNone(self.get_document_from_index(doc))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.doc_admin.delete_queryset(None, Document.objects.all())
 | 
					        self.doc_admin.delete_queryset(None, Document.objects.all())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(m.call_count, 42)
 | 
					 | 
				
			||||||
        self.assertEqual(Document.objects.count(), 0)
 | 
					        self.assertEqual(Document.objects.count(), 0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for doc in docs:
 | 
				
			||||||
 | 
					            self.assertIsNone(self.get_document_from_index(doc))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_created(self):
 | 
					    def test_created(self):
 | 
				
			||||||
        doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
 | 
					        doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
 | 
				
			||||||
        self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")
 | 
					        self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")
 | 
				
			||||||
 | 
				
			|||||||
@ -32,7 +32,6 @@ from rest_framework.viewsets import (
 | 
				
			|||||||
    ViewSet
 | 
					    ViewSet
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import documents.index as index
 | 
					 | 
				
			||||||
from paperless.db import GnuPG
 | 
					from paperless.db import GnuPG
 | 
				
			||||||
from paperless.views import StandardPagination
 | 
					from paperless.views import StandardPagination
 | 
				
			||||||
from .classifier import load_classifier
 | 
					from .classifier import load_classifier
 | 
				
			||||||
@ -176,10 +175,12 @@ class DocumentViewSet(RetrieveModelMixin,
 | 
				
			|||||||
    def update(self, request, *args, **kwargs):
 | 
					    def update(self, request, *args, **kwargs):
 | 
				
			||||||
        response = super(DocumentViewSet, self).update(
 | 
					        response = super(DocumentViewSet, self).update(
 | 
				
			||||||
            request, *args, **kwargs)
 | 
					            request, *args, **kwargs)
 | 
				
			||||||
 | 
					        from documents import index
 | 
				
			||||||
        index.add_or_update_document(self.get_object())
 | 
					        index.add_or_update_document(self.get_object())
 | 
				
			||||||
        return response
 | 
					        return response
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def destroy(self, request, *args, **kwargs):
 | 
					    def destroy(self, request, *args, **kwargs):
 | 
				
			||||||
 | 
					        from documents import index
 | 
				
			||||||
        index.remove_document_from_index(self.get_object())
 | 
					        index.remove_document_from_index(self.get_object())
 | 
				
			||||||
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
 | 
					        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -501,10 +502,6 @@ class SearchView(APIView):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    permission_classes = (IsAuthenticated,)
 | 
					    permission_classes = (IsAuthenticated,)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					 | 
				
			||||||
        super(SearchView, self).__init__(*args, **kwargs)
 | 
					 | 
				
			||||||
        self.ix = index.open_index()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def add_infos_to_hit(self, r):
 | 
					    def add_infos_to_hit(self, r):
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            doc = Document.objects.get(id=r['id'])
 | 
					            doc = Document.objects.get(id=r['id'])
 | 
				
			||||||
@ -525,6 +522,7 @@ class SearchView(APIView):
 | 
				
			|||||||
                }
 | 
					                }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get(self, request, format=None):
 | 
					    def get(self, request, format=None):
 | 
				
			||||||
 | 
					        from documents import index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if 'query' in request.query_params:
 | 
					        if 'query' in request.query_params:
 | 
				
			||||||
            query = request.query_params['query']
 | 
					            query = request.query_params['query']
 | 
				
			||||||
@ -554,8 +552,10 @@ class SearchView(APIView):
 | 
				
			|||||||
        if page < 1:
 | 
					        if page < 1:
 | 
				
			||||||
            page = 1
 | 
					            page = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        ix = index.open_index()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
 | 
					            with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
 | 
				
			||||||
                return Response(
 | 
					                return Response(
 | 
				
			||||||
                    {'count': len(result_page),
 | 
					                    {'count': len(result_page),
 | 
				
			||||||
                     'page': result_page.pagenum,
 | 
					                     'page': result_page.pagenum,
 | 
				
			||||||
@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    permission_classes = (IsAuthenticated,)
 | 
					    permission_classes = (IsAuthenticated,)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					 | 
				
			||||||
        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
 | 
					 | 
				
			||||||
        self.ix = index.open_index()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get(self, request, format=None):
 | 
					    def get(self, request, format=None):
 | 
				
			||||||
        if 'term' in request.query_params:
 | 
					        if 'term' in request.query_params:
 | 
				
			||||||
            term = request.query_params['term']
 | 
					            term = request.query_params['term']
 | 
				
			||||||
@ -587,7 +583,11 @@ class SearchAutoCompleteView(APIView):
 | 
				
			|||||||
        else:
 | 
					        else:
 | 
				
			||||||
            limit = 10
 | 
					            limit = 10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return Response(index.autocomplete(self.ix, term, limit))
 | 
					        from documents import index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        ix = index.open_index()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return Response(index.autocomplete(ix, term, limit))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class StatisticsView(APIView):
 | 
					class StatisticsView(APIView):
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user