mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-05-24 02:02:23 -04:00)

commit cd4540412a (parent 3d1ed671fa)

    indexing cleanup and tests
@@ -26,7 +26,7 @@ def stream_chat_with_documents(query_str: str, documents: list[Document]):
     client = AIClient()
     index = load_or_build_index()
 
-    doc_ids = [doc.pk for doc in documents]
+    doc_ids = [str(doc.pk) for doc in documents]
 
     # Filter only the node(s) that match the document IDs
     nodes = [
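Side note on this hunk: node metadata stores document IDs as strings (see the `"document_id": str(document.id)` line further down), so the ID list built here must hold strings for the membership test to match. A minimal illustrative sketch of the filter this enables, assuming the docstore's `docs` mapping of node IDs to nodes; the variable names are not all from this file:

    # Sketch: narrow indexed nodes down to the requested documents.
    # Assumes node.metadata["document_id"] is a string, per the metadata hunk below.
    doc_ids = [str(doc.pk) for doc in documents]
    nodes = [
        node
        for node in index.docstore.docs.values()
        if node.metadata.get("document_id") in doc_ids
    ]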
@@ -52,23 +52,10 @@ def get_or_create_storage_context(*, rebuild=False):
     )
 
 
-def get_vector_store_index(storage_context, embed_model):
-    """
-    Returns a VectorStoreIndex given a storage context and embed model.
-    """
-    return VectorStoreIndex(
-        storage_context=storage_context,
-        embed_model=embed_model,
-    )
-
-
 def build_document_node(document: Document) -> list[BaseNode]:
     """
     Given a Document, returns parsed Nodes ready for indexing.
     """
-    if not document.content:
-        return []
-
     text = build_llm_index_text(document)
     metadata = {
         "document_id": str(document.id),
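Two things disappear in this hunk. The `get_vector_store_index` helper is folded into `load_or_build_index` (next hunk), which constructs `VectorStoreIndex` directly. And `build_document_node` no longer short-circuits on empty content, so callers are expected to pass only documents that have text. An illustrative guard at a hypothetical call site, assuming the usual Django queryset API (not code from this commit):

    # Skip content-less documents before node construction, since
    # build_document_node no longer returns [] for them itself.
    nodes = []
    for document in Document.objects.exclude(content=""):
        nodes.extend(build_document_node(document))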
@@ -97,9 +84,10 @@ def load_or_build_index(storage_context: StorageContext, embed_model, nodes=None
     try:
         return load_index_from_storage(storage_context=storage_context)
     except ValueError as e:
-        logger.debug("Failed to load index from storage: %s", e)
+        logger.warning("Failed to load index from storage: %s", e)
         if not nodes:
-            return None
+            logger.info("No nodes provided for index creation.")
+            raise
         return VectorStoreIndex(
             nodes=nodes,
             storage_context=storage_context,
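This hunk is the behavioral core of the cleanup: when the persisted index cannot be loaded and no nodes were supplied, `load_or_build_index` now logs and re-raises the `ValueError` instead of returning `None`. The two hunks below drop their `if index is None` guards accordingly. A sketch of a caller under the new contract (illustrative, not from this commit):

    # The ValueError from a missing index now propagates instead of
    # signalling failure with None, so callers opt in to handling it.
    try:
        index = load_or_build_index(storage_context, embed_model)
    except ValueError:
        logger.warning("LLM index unavailable and no nodes to build it from")
        return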
@@ -116,7 +104,7 @@ def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
     existing_nodes = [
         node.node_id
         for node in index.docstore.get_nodes(all_node_ids)
-        if node.metadata.get("document_id") == document.id
+        if node.metadata.get("document_id") == str(document.id)
     ]
     for node_id in existing_nodes:
         # Delete from docstore, FAISS IndexFlatL2 are append-only
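The hunk cuts off at the comment about FAISS: an `IndexFlatL2` vector store is append-only, so stale vectors cannot be removed from it; deleting the docstore record removes the node from lookups even though its raw vector remains behind. The deletion call itself sits outside the hunk; a plausible sketch of the loop body, assuming llama-index's docstore API:

    for node_id in existing_nodes:
        # Delete from docstore, FAISS IndexFlatL2 are append-only
        # (the raw vector stays behind in FAISS; dropping the docstore
        # record is what takes the node out of circulation)
        index.docstore.delete_document(node_id, raise_error=False)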
@@ -208,9 +196,6 @@ def llm_index_add_or_update_document(document: Document):
 
     index = load_or_build_index(storage_context, embed_model, nodes=new_nodes)
 
-    if index is None:
-        return
-
     remove_document_docstore_nodes(document, index)
 
     index.insert_nodes(new_nodes)
@@ -229,9 +214,6 @@ def llm_index_remove_document(document: Document):
 
     index = load_or_build_index(storage_context, embed_model)
 
-    if index is None:
-        return
-
     remove_document_docstore_nodes(document, index)
 
     storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
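Both call sites lose their `if index is None: return` guard for the same reason: under the new `load_or_build_index` contract there is no `None` result, and a missing index surfaces as an exception wherever these functions are invoked, rather than as a silent no-op. An illustrative wrapper showing where that error would land; the task function here is hypothetical:

    # Hypothetical caller: the propagated ValueError is logged, not swallowed.
    def llm_index_task(document_id: int) -> None:
        document = Document.objects.get(pk=document_id)
        try:
            llm_index_add_or_update_document(document)
        except ValueError:
            logger.exception("LLM index update failed for document %s", document_id)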
@@ -28,7 +28,6 @@ def real_document(db):
 
 @pytest.fixture
 def mock_embed_model():
-    """Mocks the embedding model."""
     with patch("paperless.ai.indexing.get_embedding_model") as mock:
         mock.return_value = FakeEmbedding()
         yield mock
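The fixture patches `paperless.ai.indexing.get_embedding_model` with a `FakeEmbedding`, so the tests never load a real model. That class comes from the test helpers and is not part of this diff; a minimal sketch of what such a fake could look like, assuming llama-index's `BaseEmbedding` interface:

    from llama_index.core.base.embeddings.base import BaseEmbedding


    class FakeEmbedding(BaseEmbedding):
        # Fixed-size deterministic vectors: enough for the FAISS round-trip,
        # useless for real similarity ranking, which these tests don't need.
        def _get_query_embedding(self, query: str) -> list[float]:
            return [0.1] * 384

        def _get_text_embedding(self, text: str) -> list[float]:
            return [0.1] * 384

        async def _aget_query_embedding(self, query: str) -> list[float]:
            return [0.1] * 384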
@@ -57,7 +56,7 @@ def test_build_document_node(real_document):
 
 
 @pytest.mark.django_db
-def test_rebuild_llm_index(
+def test_update_llm_index(
     temp_llm_index_dir,
     real_document,
     mock_embed_model,
@@ -72,6 +71,49 @@ def test_rebuild_llm_index(
     assert any(temp_llm_index_dir.glob("*.json"))
 
 
+def test_get_or_create_storage_context_raises_exception(
+    temp_llm_index_dir,
+    mock_embed_model,
+):
+    with pytest.raises(Exception):
+        indexing.get_or_create_storage_context(rebuild=False)
+
+
+def test_load_or_build_index_builds_when_nodes_given(
+    temp_llm_index_dir,
+    mock_embed_model,
+    real_document,
+):
+    storage_context = MagicMock()
+    with patch(
+        "paperless.ai.indexing.load_index_from_storage",
+        side_effect=ValueError("Index not found"),
+    ):
+        with patch(
+            "paperless.ai.indexing.VectorStoreIndex",
+            return_value=MagicMock(),
+        ) as mock_index_cls:
+            indexing.load_or_build_index(
+                storage_context,
+                mock_embed_model,
+                nodes=[indexing.build_document_node(real_document)],
+            )
+            mock_index_cls.assert_called_once()
+
+
+def test_load_or_build_index_raises_exception_when_no_nodes(
+    temp_llm_index_dir,
+    mock_embed_model,
+):
+    storage_context = MagicMock()
+    with patch(
+        "paperless.ai.indexing.load_index_from_storage",
+        side_effect=ValueError("Index not found"),
+    ):
+        with pytest.raises(Exception):
+            indexing.load_or_build_index(storage_context, mock_embed_model)
+
+
 @pytest.mark.django_db
 def test_add_or_update_document_updates_existing_entry(
     temp_llm_index_dir,
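One detail in the new tests worth noting: `build_document_node` returns a list of nodes, so `nodes=[indexing.build_document_node(real_document)]` wraps that list in another list. The test passes anyway because `VectorStoreIndex` is replaced by a `MagicMock`; against the real class the argument would presumably be passed flat:

    indexing.load_or_build_index(
        storage_context,
        mock_embed_model,
        nodes=indexing.build_document_node(real_document),
    )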
@@ -91,14 +133,18 @@ def test_remove_document_deletes_node_from_docstore(
     mock_embed_model,
 ):
     indexing.update_llm_index(rebuild=True)
-    indexing.llm_index_add_or_update_document(real_document)
-    indexing.llm_index_remove_document(real_document)
+    storage_context = indexing.get_or_create_storage_context()
+    index = indexing.load_or_build_index(storage_context, mock_embed_model)
+    assert len(index.docstore.docs) == 1
 
-    assert any(temp_llm_index_dir.glob("*.json"))
+    indexing.llm_index_remove_document(real_document)
+    storage_context = indexing.get_or_create_storage_context()
+    index = indexing.load_or_build_index(storage_context, mock_embed_model)
+    assert len(index.docstore.docs) == 0
 
 
 @pytest.mark.django_db
-def test_rebuild_llm_index_no_documents(
+def test_update_llm_index_no_documents(
     temp_llm_index_dir,
     mock_embed_model,
 ):