mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 18:22:40 -04:00 
			
		
		
		
	llamaindex vector index, llmindex management command
This commit is contained in:
		
							parent
							
								
									eb1c49090b
								
							
						
					
					
						commit
						959ebdbb85
					
				| @ -11,6 +11,7 @@ for command in decrypt_documents \ | ||||
| 	mail_fetcher \ | ||||
| 	document_create_classifier \ | ||||
| 	document_index \ | ||||
| 	document_llmindex \ | ||||
| 	document_renamer \ | ||||
| 	document_retagger \ | ||||
| 	document_thumbnails \ | ||||
|  | ||||
							
								
								
									
										14
									
								
								docker/rootfs/usr/local/bin/document_llmindex
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										14
									
								
								docker/rootfs/usr/local/bin/document_llmindex
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,14 @@ | ||||
#!/command/with-contenv /usr/bin/bash
# shellcheck shell=bash

# Wrapper around `manage.py document_llmindex`.
# All command-line arguments are forwarded unchanged to the management command.

set -e

cd "${PAPERLESS_SRC_DIR}"

if [[ $(id -u) == 0 ]]; then
	# Invoked as root: run the command as the paperless user instead.
	s6-setuidgid paperless python3 manage.py document_llmindex "$@"
elif [[ $(id -un) == "paperless" ]]; then
	python3 manage.py document_llmindex "$@"
else
	# Nothing was executed, so report the failure on stderr and exit
	# non-zero instead of silently succeeding.
	echo "Unknown user." >&2
	exit 1
fi
| @ -40,6 +40,7 @@ dependencies = [ | ||||
|   "drf-spectacular~=0.28", | ||||
|   "drf-spectacular-sidecar~=2025.4.1", | ||||
|   "drf-writable-nested~=0.7.1", | ||||
|   "faiss-cpu>=1.10", | ||||
|   "filelock~=3.18.0", | ||||
|   "flower~=2.0.1", | ||||
|   "gotenberg-client~=0.10.0", | ||||
| @ -48,8 +49,12 @@ dependencies = [ | ||||
|   "inotifyrecursive~=0.3", | ||||
|   "jinja2~=3.1.5", | ||||
|   "langdetect~=1.0.9", | ||||
|   "llama-index>=0.12.33", | ||||
|   "llama-index-embeddings-huggingface>=0.5.3", | ||||
|   "llama-index-vector-stores-faiss>=0.3", | ||||
|   "nltk~=3.9.1", | ||||
|   "ocrmypdf~=16.10.0", | ||||
|   "openai>=1.76", | ||||
|   "pathvalidate~=3.2.3", | ||||
|   "pdf2image~=1.17.0", | ||||
|   "python-dateutil~=2.9.0", | ||||
| @ -61,6 +66,7 @@ dependencies = [ | ||||
|   "rapidfuzz~=3.13.0", | ||||
|   "redis[hiredis]~=5.2.1", | ||||
|   "scikit-learn~=1.6.1", | ||||
|   "sentence-transformers>=4.1", | ||||
|   "setproctitle~=1.3.4", | ||||
|   "tika-client~=0.9.0", | ||||
|   "tqdm~=4.67.1", | ||||
|  | ||||
							
								
								
									
										19
									
								
								src/documents/management/commands/document_llmindex.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								src/documents/management/commands/document_llmindex.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,19 @@ | ||||
| from django.core.management import BaseCommand | ||||
| from django.db import transaction | ||||
| 
 | ||||
| from documents.management.commands.mixins import ProgressBarMixin | ||||
| from documents.tasks import llm_index_rebuild | ||||
| 
 | ||||
| 
 | ||||
class Command(ProgressBarMixin, BaseCommand):
    """Management command for the LLM vector index.

    Accepts a single positional sub-command; ``rebuild`` is the only one
    currently offered.
    """

    help = "Manages the LLM-based vector index for Paperless."

    def add_arguments(self, parser):
        """Register the positional sub-command and the progress bar options."""
        parser.add_argument("command", choices=["rebuild"])
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        """Run the requested sub-command."""
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            if options["command"] == "rebuild":
                # rebuild=True is required here: with the default
                # (rebuild=False) llm_index_rebuild keeps whatever is already
                # in the index directory instead of starting from scratch.
                llm_index_rebuild(
                    progress_bar_disable=self.no_progress_bar,
                    rebuild=True,
                )
| @ -6,6 +6,7 @@ import uuid | ||||
| from pathlib import Path | ||||
| from tempfile import TemporaryDirectory | ||||
| 
 | ||||
| import faiss | ||||
| import tqdm | ||||
| from celery import Task | ||||
| from celery import shared_task | ||||
| @ -17,6 +18,11 @@ from django.db import transaction | ||||
| from django.db.models.signals import post_save | ||||
| from django.utils import timezone | ||||
| from filelock import FileLock | ||||
| from llama_index.core import Document as LlamaDocument | ||||
| from llama_index.core import StorageContext | ||||
| from llama_index.core import VectorStoreIndex | ||||
| from llama_index.core.settings import Settings | ||||
| from llama_index.vector_stores.faiss import FaissVectorStore | ||||
| from whoosh.writing import AsyncWriter | ||||
| 
 | ||||
| from documents import index | ||||
| @ -54,6 +60,9 @@ from documents.sanity_checker import SanityCheckFailedException | ||||
| from documents.signals import document_updated | ||||
| from documents.signals.handlers import cleanup_document_deletion | ||||
| from documents.signals.handlers import run_workflows | ||||
| from paperless.ai.embedding import build_llm_index_text | ||||
| from paperless.ai.embedding import get_embedding_dim | ||||
| from paperless.ai.embedding import get_embedding_model | ||||
| 
 | ||||
| if settings.AUDIT_LOG_ENABLED: | ||||
|     from auditlog.models import LogEntry | ||||
| @ -517,3 +526,52 @@ def check_scheduled_workflows(): | ||||
|                             workflow_to_run=workflow, | ||||
|                             document=document, | ||||
|                         ) | ||||
| 
 | ||||
| 
 | ||||
def llm_index_rebuild(*, progress_bar_disable=False, rebuild=False):
    """Embed all documents with content and persist the LLM vector index.

    Args:
        progress_bar_disable: When true, the tqdm progress bar is disabled.
        rebuild: When true, ``settings.LLM_INDEX_DIR`` is removed and
            recreated and a fresh FAISS index is created. Otherwise an
            existing index directory is loaded from disk.

    The resulting index is persisted to ``settings.LLM_INDEX_DIR``.
    """
    if rebuild:
        shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True)
        settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)

    # Load the relations read for every document up front instead of issuing
    # separate queries per document inside the loop.
    documents = Document.objects.select_related(
        "correspondent",
        "document_type",
        "storage_path",
    ).prefetch_related("tags")

    embed_model = get_embedding_model()

    if rebuild or not settings.LLM_INDEX_DIR.exists():
        embedding_dim = get_embedding_dim()
        faiss_index = faiss.IndexFlatL2(embedding_dim)
        vector_store = FaissVectorStore(faiss_index)
    else:
        # NOTE(review): in this branch the stored vectors are reloaded and
        # every document is embedded again below, which looks like it would
        # add duplicate entries -- confirm whether callers rely on it.
        vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    Settings.embed_model = embed_model

    llm_docs = []
    for document in tqdm.tqdm(documents, disable=progress_bar_disable):
        if not document.content:
            # Nothing to embed for documents without text content.
            continue
        correspondent = document.correspondent
        document_type = document.document_type
        llm_docs.append(
            LlamaDocument(
                text=build_llm_index_text(document),
                metadata={
                    # "document_id" is the key the similarity query looks up;
                    # "id" is kept so existing readers of it keep working.
                    "document_id": document.id,
                    "id": document.id,
                    "title": document.title,
                    "tags": [t.name for t in document.tags.all()],
                    "correspondent": correspondent.name if correspondent else None,
                    "document_type": document_type.name if document_type else None,
                    "created": document.created.isoformat(),
                    "added": document.added.isoformat(),
                },
            ),
        )

    # Named vector_index so it does not shadow the imported ``index`` module.
    vector_index = VectorStoreIndex.from_documents(
        llm_docs,
        storage_context=storage_context,
    )
    settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)
    vector_index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
|  | ||||
							
								
								
									
										67
									
								
								src/paperless/ai/embedding.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								src/paperless/ai/embedding.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,67 @@ | ||||
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | ||||
| from llama_index.embeddings.openai import OpenAIEmbedding | ||||
| 
 | ||||
| from documents.models import Document | ||||
| from documents.models import Note | ||||
| from paperless.config import AIConfig | ||||
| 
 | ||||
| EMBEDDING_DIMENSIONS = { | ||||
|     "text-embedding-3-small": 1536, | ||||
|     "sentence-transformers/all-MiniLM-L6-v2": 384, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
def get_embedding_model():
    """Build the embedding model selected by the AI configuration.

    Returns an ``OpenAIEmbedding`` for the ``"openai"`` backend and a
    ``HuggingFaceEmbedding`` for the ``"local"`` backend. Each falls back to
    a default model name when none is configured.

    Raises:
        ValueError: If the configured backend is neither of the above.
    """
    ai_config = AIConfig()
    backend = ai_config.llm_embedding_backend

    if backend == "openai":
        return OpenAIEmbedding(
            model=ai_config.llm_embedding_model or "text-embedding-3-small",
            api_key=ai_config.llm_api_key,
        )

    if backend == "local":
        return HuggingFaceEmbedding(
            model_name=ai_config.llm_embedding_model
            or "sentence-transformers/all-MiniLM-L6-v2",
        )

    raise ValueError(
        f"Unsupported embedding backend: {backend}",
    )
| 
 | ||||
| 
 | ||||
def get_embedding_dim() -> int:
    """Return the vector dimension of the configured embedding model.

    When no model is configured, the default model for the configured
    backend is used: the OpenAI default for ``"openai"``, the
    sentence-transformers default for anything else.

    Raises:
        ValueError: If the model has no entry in ``EMBEDDING_DIMENSIONS``.
    """
    ai_config = AIConfig()

    model_name = ai_config.llm_embedding_model
    if not model_name:
        if ai_config.llm_embedding_backend == "openai":
            model_name = "text-embedding-3-small"
        else:
            model_name = "sentence-transformers/all-MiniLM-L6-v2"

    dimension = EMBEDDING_DIMENSIONS.get(model_name)
    if dimension is None:
        raise ValueError(f"Unknown embedding model: {model_name}")
    return dimension
| 
 | ||||
| 
 | ||||
def build_llm_index_text(doc: Document) -> str:
    """Render a document as the plain-text block that gets embedded.

    The text starts with one ``Label: value`` line per metadata item, then
    one line per custom field instance, and ends with a ``Content:`` header
    followed by the document content.
    """
    tag_names = ", ".join(tag.name for tag in doc.tags.all())
    document_type = doc.document_type.name if doc.document_type else ""
    correspondent = doc.correspondent.name if doc.correspondent else ""
    storage_path = doc.storage_path.name if doc.storage_path else ""
    serial_number = doc.archive_serial_number or ""
    notes = ",".join(str(entry.note) for entry in Note.objects.filter(document=doc))

    parts = [
        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
        f"Created: {doc.created}",
        f"Added: {doc.added}",
        f"Modified: {doc.modified}",
        f"Tags: {tag_names}",
        f"Document Type: {document_type}",
        f"Correspondent: {correspondent}",
        f"Storage Path: {storage_path}",
        f"Archive Serial Number: {serial_number}",
        f"Notes: {notes}",
    ]

    # One extra line per custom field instance attached to the document.
    parts.extend(
        f"Custom Field - {field_instance.field.name}: {field_instance}"
        for field_instance in doc.custom_fields.all()
    )

    parts += ["\nContent:\n", doc.content or ""]
    return "\n".join(parts)
							
								
								
									
										52
									
								
								src/paperless/ai/indexing.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								src/paperless/ai/indexing.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,52 @@ | ||||
| import logging | ||||
| 
 | ||||
| import llama_index.core.settings as llama_settings | ||||
| from django.conf import settings | ||||
| from llama_index.core import StorageContext | ||||
| from llama_index.core import VectorStoreIndex | ||||
| from llama_index.core import load_index_from_storage | ||||
| from llama_index.core.retrievers import VectorIndexRetriever | ||||
| from llama_index.vector_stores.faiss import FaissVectorStore | ||||
| 
 | ||||
| from documents.models import Document | ||||
| from paperless.ai.embedding import get_embedding_model | ||||
| 
 | ||||
| logger = logging.getLogger("paperless.ai.indexing") | ||||
| 
 | ||||
| 
 | ||||
def load_index() -> VectorStoreIndex:
    """Load the vector index persisted under ``settings.LLM_INDEX_DIR``.

    Also sets the global LlamaIndex embedding model and chunk size before
    the index is loaded.
    """
    index_dir = settings.LLM_INDEX_DIR
    faiss_store = FaissVectorStore.from_persist_dir(index_dir)

    llama_config = llama_settings.Settings
    llama_config.embed_model = get_embedding_model()
    llama_config.chunk_size = 512

    return load_index_from_storage(
        StorageContext.from_defaults(
            vector_store=faiss_store,
            persist_dir=index_dir,
        ),
    )
| 
 | ||||
| 
 | ||||
def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
    """Run a similarity query and return the matching Document objects.

    Args:
        document: The document whose title and content form the query text.
        top_k: Number of nearest entries requested from the retriever.

    Returns:
        The Document rows whose primary keys appear in the metadata of the
        retrieved entries. Entries without a usable id are ignored.
    """
    # Load index
    index = load_index()
    retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

    # Build query from the document text
    query_text = (document.title or "") + "\n" + (document.content or "")

    # Query
    results = retriever.retrieve(query_text)

    # The index writer stores the primary key under "id", not "document_id",
    # so accept either key; looking up "document_id" alone never matched.
    document_ids = set()
    for result in results:
        metadata = result.metadata
        raw_id = metadata.get("document_id", metadata.get("id"))
        if raw_id is not None:
            document_ids.add(int(raw_id))

    return list(Document.objects.filter(pk__in=document_ids))
| @ -178,6 +178,8 @@ class AIConfig(BaseConfig): | ||||
|     """ | ||||
| 
 | ||||
|     ai_enabled: bool = dataclasses.field(init=False) | ||||
|     llm_embedding_backend: str = dataclasses.field(init=False) | ||||
|     llm_embedding_model: str = dataclasses.field(init=False) | ||||
|     llm_backend: str = dataclasses.field(init=False) | ||||
|     llm_model: str = dataclasses.field(init=False) | ||||
|     llm_api_key: str = dataclasses.field(init=False) | ||||
| @ -187,6 +189,12 @@ class AIConfig(BaseConfig): | ||||
|         app_config = self._get_config_instance() | ||||
| 
 | ||||
|         self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED | ||||
|         self.llm_embedding_backend = ( | ||||
|             app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND | ||||
|         ) | ||||
|         self.llm_embedding_model = ( | ||||
|             app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL | ||||
|         ) | ||||
|         self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND | ||||
|         self.llm_model = app_config.llm_model or settings.LLM_MODEL | ||||
|         self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY | ||||
|  | ||||
| @ -19,6 +19,27 @@ class Migration(migrations.Migration): | ||||
|                 verbose_name="Enables AI features", | ||||
|             ), | ||||
|         ), | ||||
|         migrations.AddField( | ||||
|             model_name="applicationconfiguration", | ||||
|             name="llm_embedding_backend", | ||||
|             field=models.CharField( | ||||
|                 blank=True, | ||||
|                 choices=[("openai", "OpenAI"), ("local", "Local")], | ||||
|                 max_length=32, | ||||
|                 null=True, | ||||
|                 verbose_name="Sets the LLM Embedding backend", | ||||
|             ), | ||||
|         ), | ||||
|         migrations.AddField( | ||||
|             model_name="applicationconfiguration", | ||||
|             name="llm_embedding_model", | ||||
|             field=models.CharField( | ||||
|                 blank=True, | ||||
|                 max_length=32, | ||||
|                 null=True, | ||||
|                 verbose_name="Sets the LLM Embedding model", | ||||
|             ), | ||||
|         ), | ||||
|         migrations.AddField( | ||||
|             model_name="applicationconfiguration", | ||||
|             name="llm_api_key", | ||||
|  | ||||
| @ -74,6 +74,11 @@ class ColorConvertChoices(models.TextChoices): | ||||
|     CMYK = ("CMYK", _("CMYK")) | ||||
| 
 | ||||
| 
 | ||||
class LLMEmbeddingBackend(models.TextChoices):
    """Choices offered for the ``llm_embedding_backend`` field."""

    OPENAI = "openai", _("OpenAI")
    LOCAL = "local", _("Local")
| 
 | ||||
| 
 | ||||
| class LLMBackend(models.TextChoices): | ||||
|     """ | ||||
|     Matches to --llm-backend | ||||
| @ -284,6 +289,21 @@ class ApplicationConfiguration(AbstractSingletonModel): | ||||
|         default=False, | ||||
|     ) | ||||
| 
 | ||||
|     llm_embedding_backend = models.CharField( | ||||
|         verbose_name=_("Sets the LLM embedding backend"), | ||||
|         null=True, | ||||
|         blank=True, | ||||
|         max_length=32, | ||||
|         choices=LLMEmbeddingBackend.choices, | ||||
|     ) | ||||
| 
 | ||||
|     llm_embedding_model = models.CharField( | ||||
|         verbose_name=_("Sets the LLM embedding model"), | ||||
|         null=True, | ||||
|         blank=True, | ||||
|         max_length=32, | ||||
|     ) | ||||
| 
 | ||||
|     llm_backend = models.CharField( | ||||
|         verbose_name=_("Sets the LLM backend"), | ||||
|         null=True, | ||||
|  | ||||
| @ -291,6 +291,7 @@ MODEL_FILE = __get_path( | ||||
|     "PAPERLESS_MODEL_FILE", | ||||
|     DATA_DIR / "classification_model.pickle", | ||||
| ) | ||||
| LLM_INDEX_DIR = DATA_DIR / "llm_index" | ||||
| 
 | ||||
| LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log") | ||||
| 
 | ||||
| @ -1416,7 +1417,12 @@ OUTLOOK_OAUTH_ENABLED = bool( | ||||
| # AI Settings                                                                  # | ||||
| ################################################################################ | ||||
| AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO") | ||||
| LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "openai")  # or "ollama" | ||||
| LLM_EMBEDDING_BACKEND = os.getenv( | ||||
|     "PAPERLESS_LLM_EMBEDDING_BACKEND", | ||||
|     "local", | ||||
| )  # or "openai" | ||||
| LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_LLM_EMBEDDING_MODEL") | ||||
| LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "ollama")  # or "openai" | ||||
| LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL") | ||||
| LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY") | ||||
| LLM_URL = os.getenv("PAPERLESS_LLM_URL") | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user