mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-31 02:27:10 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			257 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			257 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import logging
 | |
| import shutil
 | |
| 
 | |
| import faiss
 | |
| import llama_index.core.settings as llama_settings
 | |
| import tqdm
 | |
| from django.conf import settings
 | |
| from llama_index.core import Document as LlamaDocument
 | |
| from llama_index.core import StorageContext
 | |
| from llama_index.core import VectorStoreIndex
 | |
| from llama_index.core import load_index_from_storage
 | |
| from llama_index.core.node_parser import SimpleNodeParser
 | |
| from llama_index.core.retrievers import VectorIndexRetriever
 | |
| from llama_index.core.schema import BaseNode
 | |
| from llama_index.core.storage.docstore import SimpleDocumentStore
 | |
| from llama_index.core.storage.index_store import SimpleIndexStore
 | |
| from llama_index.vector_stores.faiss import FaissVectorStore
 | |
| 
 | |
| from documents.models import Document
 | |
| from paperless.ai.embedding import build_llm_index_text
 | |
| from paperless.ai.embedding import get_embedding_dim
 | |
| from paperless.ai.embedding import get_embedding_model
 | |
| 
 | |
# Module-level logger for the AI indexing pipeline; handlers/levels are
# configured by the application's logging setup under this dotted name.
logger = logging.getLogger("paperless.ai.indexing")
 | |
| 
 | |
| 
 | |
def get_or_create_storage_context(*, rebuild=False):
    """
    Load or create the StorageContext (vector store, docstore, index store).

    If rebuild=True, deletes and recreates everything from scratch.

    Keyword Args:
        rebuild: When True, wipe ``settings.LLM_INDEX_DIR`` and start fresh.

    Returns:
        A ``StorageContext`` ready for indexing or querying.
    """
    if rebuild:
        shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True)

    # Always ensure the index directory exists: on a fresh install it has
    # never been created, and persist() needs somewhere to write.
    settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)

    # Directory existence alone is not enough — an empty directory (created
    # above but never persisted) would make from_persist_dir() raise. Check
    # for the docstore file that SimpleDocumentStore actually persists.
    has_persisted_index = (settings.LLM_INDEX_DIR / "docstore.json").exists()

    if rebuild or not has_persisted_index:
        embedding_dim = get_embedding_dim()
        # IndexFlatL2 is append-only; deletions are handled at the docstore
        # level elsewhere in this module.
        faiss_index = faiss.IndexFlatL2(embedding_dim)
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        docstore = SimpleDocumentStore()
        index_store = SimpleIndexStore()
    else:
        vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)
        docstore = SimpleDocumentStore.from_persist_dir(settings.LLM_INDEX_DIR)
        index_store = SimpleIndexStore.from_persist_dir(settings.LLM_INDEX_DIR)

    return StorageContext.from_defaults(
        docstore=docstore,
        index_store=index_store,
        vector_store=vector_store,
        persist_dir=settings.LLM_INDEX_DIR,
    )
 | |
| 
 | |
| 
 | |
def get_vector_store_index(storage_context, embed_model):
    """
    Build a ``VectorStoreIndex`` on top of the given storage context,
    embedding new content with *embed_model*.
    """
    index = VectorStoreIndex(
        storage_context=storage_context,
        embed_model=embed_model,
    )
    return index
 | |
| 
 | |
| 
 | |
def build_document_node(document: Document) -> list[BaseNode]:
    """
    Convert a paperless Document into parsed llama-index nodes ready for
    indexing.

    Returns an empty list when the document has no content to index.
    """
    if not document.content:
        return []

    # Hoist the optional relations so the metadata literal stays readable.
    correspondent = document.correspondent
    doc_type = document.document_type

    metadata = {
        "document_id": str(document.id),
        "title": document.title,
        "tags": [tag.name for tag in document.tags.all()],
        "correspondent": correspondent.name if correspondent else None,
        "document_type": doc_type.name if doc_type else None,
        "created": document.created.isoformat() if document.created else None,
        "added": document.added.isoformat() if document.added else None,
        # "modified" is used elsewhere to detect stale index entries.
        "modified": document.modified.isoformat(),
    }

    llama_doc = LlamaDocument(
        text=build_llm_index_text(document),
        metadata=metadata,
    )
    return SimpleNodeParser().get_nodes_from_documents([llama_doc])
 | |
| 
 | |
| 
 | |
def load_or_build_index(storage_context: StorageContext, embed_model, nodes=None):
    """
    Return the VectorStoreIndex persisted in *storage_context*, or build a
    new one from *nodes* when storage is empty.

    Returns None when nothing is persisted and no nodes were supplied.
    """
    try:
        index = load_index_from_storage(storage_context=storage_context)
    except ValueError as err:
        # Raised when the storage context holds no persisted index.
        logger.debug("Failed to load index from storage: %s", err)
        if not nodes:
            return None
        index = VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
            embed_model=embed_model,
        )
    return index
 | |
| 
 | |
| 
 | |
def remove_document_docstore_nodes(document: "Document", index: "VectorStoreIndex"):
    """
    Remove all docstore nodes belonging to *document* from the index.

    Only the docstore entries are deleted: the FAISS IndexFlatL2 vector
    store is append-only, so its vectors cannot be removed in place.

    Args:
        document: The paperless Document whose nodes should be purged.
        index: The loaded VectorStoreIndex to purge them from.
    """
    all_node_ids = list(index.docstore.docs.keys())
    existing_nodes = [
        node.node_id
        for node in index.docstore.get_nodes(all_node_ids)
        # Bug fix: metadata stores the id as a string (see
        # build_document_node), so the raw int never matched and stale
        # nodes were never removed.
        if node.metadata.get("document_id") == str(document.id)
    ]
    for node_id in existing_nodes:
        # Delete from docstore, FAISS IndexFlatL2 are append-only
        index.docstore.delete_document(node_id)
 | |
| 
 | |
| 
 | |
def update_llm_index(*, progress_bar_disable=False, rebuild=False):
    """
    Rebuild or incrementally update the LLM index.

    With ``rebuild=True`` the on-disk index is wiped and recreated from
    every document. Otherwise only new documents and documents whose
    ``modified`` timestamp differs from the indexed copy are re-embedded.

    Keyword Args:
        progress_bar_disable: Suppress the tqdm progress bar when True.
        rebuild: Recreate the index from scratch when True.
    """
    embed_model = get_embedding_model()
    llama_settings.Settings.embed_model = embed_model
    storage_context = get_or_create_storage_context(rebuild=rebuild)

    documents = Document.objects.all()
    if not documents.exists():
        logger.warning("No documents found to index.")
        return

    nodes = []

    if rebuild:
        # Rebuild index from scratch
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            nodes.extend(build_document_node(document))

        VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=not progress_bar_disable,
        )
    else:
        # Update existing index
        index = load_or_build_index(storage_context, embed_model)
        if index is None:
            # Bug fix: nothing is persisted yet, so there is no index to
            # update — fall back to a full build instead of crashing on
            # index.docstore below.
            update_llm_index(
                progress_bar_disable=progress_bar_disable,
                rebuild=True,
            )
            return

        # Bug fix: group stored nodes by source document. The previous
        # doc_id -> single-node mapping silently dropped all but one chunk
        # of multi-chunk documents, leaving stale chunks in the index.
        all_node_ids = list(index.docstore.docs.keys())
        existing_nodes = {}
        for node in index.docstore.get_nodes(all_node_ids):
            existing_nodes.setdefault(
                node.metadata.get("document_id"),
                [],
            ).append(node)

        node_ids_to_remove = []

        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            doc_id = str(document.id)
            document_modified = document.modified.isoformat()
            doc_nodes = existing_nodes.get(doc_id)

            if doc_nodes:
                # Unchanged only if every stored chunk carries the current
                # modification timestamp.
                if all(
                    node.metadata.get("modified") == document_modified
                    for node in doc_nodes
                ):
                    continue

                # Stale copy: drop every chunk and re-embed the document.
                node_ids_to_remove.extend(node.node_id for node in doc_nodes)
                nodes.extend(build_document_node(document))
            else:
                # New document, add it
                nodes.extend(build_document_node(document))

        if node_ids_to_remove or nodes:
            logger.info(
                "Updating LLM index with %d new nodes and removing %d old nodes.",
                len(nodes),
                len(node_ids_to_remove),
            )
            if node_ids_to_remove:
                index.delete_nodes(node_ids_to_remove)
            if nodes:
                index.insert_nodes(nodes)
        else:
            logger.info("No changes detected, skipping llm index rebuild.")

    storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
 | |
| 
 | |
| 
 | |
def llm_index_add_or_update_document(document: Document):
    """
    Add or update a single document in the LLM index.

    If the document is already indexed, its existing nodes are replaced.
    If the document has no content and no index exists, nothing happens.
    """
    embed_model = get_embedding_model()
    llama_settings.Settings.embed_model = embed_model

    storage_context = get_or_create_storage_context(rebuild=False)

    new_nodes = build_document_node(document)

    # Bug fix: the previous code passed new_nodes into load_or_build_index
    # and then insert_nodes()'d them again — when storage was empty the
    # freshly built index already contained them, so every node (and its
    # FAISS vector, which is append-only) was stored twice.
    index = load_or_build_index(storage_context, embed_model)

    if index is None:
        if not new_nodes:
            # No index and nothing to add.
            return
        # First document ever indexed: build the index directly from it.
        VectorStoreIndex(
            nodes=new_nodes,
            storage_context=storage_context,
            embed_model=embed_model,
        )
    else:
        # Replace any previously indexed copy of this document.
        remove_document_docstore_nodes(document, index)
        if new_nodes:
            index.insert_nodes(new_nodes)

    storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
 | |
| 
 | |
| 
 | |
def llm_index_remove_document(document: Document):
    """
    Remove a document's nodes from the LLM index, if an index exists.
    """
    embed_model = get_embedding_model()
    # Bug fix: assign to Settings.embed_model like every other entry point
    # in this module — setting the bare module attribute had no effect on
    # llama-index's global configuration.
    llama_settings.Settings.embed_model = embed_model

    storage_context = get_or_create_storage_context(rebuild=False)

    index = load_or_build_index(storage_context, embed_model)

    if index is None:
        # No index yet, so there is nothing to remove.
        return

    remove_document_docstore_nodes(document, index)

    storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
 | |
| 
 | |
| 
 | |
def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
    """
    Run a similarity query and return the top-k most similar Documents.

    Args:
        document: The query document; its title and content form the query.
        top_k: Maximum number of similar documents to return.

    Returns:
        Matching Document objects, or an empty list when no index exists.
    """
    # Bug fix: load_or_build_index() was called with no arguments although
    # its signature requires a storage context and embed model, so this
    # function always raised TypeError. Set up both, as the other entry
    # points in this module do.
    embed_model = get_embedding_model()
    llama_settings.Settings.embed_model = embed_model
    storage_context = get_or_create_storage_context(rebuild=False)

    index = load_or_build_index(storage_context, embed_model)
    if index is None:
        # Nothing indexed yet — no results rather than a crash.
        return []

    retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

    query_text = (document.title or "") + "\n" + (document.content or "")
    results = retriever.retrieve(query_text)

    document_ids = [
        int(node.metadata["document_id"])
        for node in results
        if "document_id" in node.metadata
    ]

    return list(Document.objects.filter(pk__in=document_ids))
 |