# CV/backend/app/routers/embedding.py
"""Embedding tools"""
import logging
import os
from typing import List

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.internal.bdd_manager import create_collection, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document
from app.settings import settings
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
embedding_router = APIRouter(
prefix="/embeddings",
tags=["documents"],
responses={404: {"description": "Not found"}},
)
user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)
doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)
embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")
doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)
user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)
text_splitter = get_text_chunker()
logger.info("Text splitter initialized.")
def get_vectorstore(vectorstor_type):
if vectorstor_type == "user":
return user_vector_store
if vectorstor_type == "doc":
return doc_vector_store
return None
class DocPathsInput(BaseModel): # TODO move to schema.py
doc_paths: str
vectorstor_type: str
@embedding_router.post("/embedded/")
async def embedding(doc_paths_input: DocPathsInput):
"""
Embeds documents provided via file paths and adds them to the vector store.
Args:
doc_paths_input (DocPathsInput): A Pydantic model containing
a list of document file paths.
Returns:
dict: A response containing the number of documents added to the vector store.
Raises:
HTTPException: If the document parsing or embedding process fails.
"""
logger.info("Received request to embed documents: %s", doc_paths_input.doc_paths)
vector_store = get_vectorstore(doc_paths_input.vectorstor_type)
try:
folder_path = doc_paths_input.doc_paths
logger.info(folder_path)
doc_paths = get_pdf_paths(folder_path)
logger.info(doc_paths)
for path in doc_paths:
try:
logger.info("Parsing document at path: %s", path)
parsed_documents = parse_document(path)
doc_title = path.split("\\")[-1]
logger.info("Document parsed: %s", doc_title)
documents = text_splitter.create_documents(
parsed_documents,
metadatas=[
{"Title": doc_title} for _ in range(len(parsed_documents))
],
)
logger.info(
"Created %d document chunks for: %s", len(documents), doc_title
)
vector_store.add_documents(documents)
logger.info("Documents added to vector store: %s", doc_title)
except Exception as e:
logger.info(
f"An error occured during the parsing of the file {path}: {e}"
)
logger.info("All documents successfully processed and embedded.")
return {
"message": "Documents successfully embedded and stored",
"documents_added": len(doc_paths),
}
except Exception as e:
logger.error("An error occurred during the embedding process: %s", e)
raise HTTPException(status_code=500, detail=f"An error occurred: {e!s}")
class SearchQuery(BaseModel): # TODO move to schema.py
vectorstor_type: str
query: str
k: int = 2
@embedding_router.post("/similarity_search/")
async def search_documents(search_query: SearchQuery):
"""
Search for documents in the vector store based on a query.
Args:
search_query (SearchQuery): A Pydantic model containing the query string and the number of results (k).
Returns:
List[dict]: A list of documents matching the query, including their content and metadata.
Raises:
HTTPException: If the search process fails or no documents are found.
"""
logger.info("Received similarity search query: %s", search_query.query)
vector_store = get_vectorstore(search_query.vectorstor_type)
try:
found_docs = vector_store.similarity_search(
search_query.query, k=search_query.k
)
logger.info(
"Found %d documents for query: %s", len(found_docs), search_query.query
)
if not found_docs:
logger.warning("No documents found for query: %s", search_query.query)
raise HTTPException(
status_code=404, detail="No documents found for the given query."
)
logger.info("Returning results for query: %s", search_query.query)
return [
{
"content": doc.page_content,
"metadata": doc.metadata if hasattr(doc, "metadata") else None,
}
for doc in found_docs
]
except Exception as e:
logger.error("An error occurred during the similarity search: %s", e)
raise HTTPException(
status_code=500, detail=f"An error occurred during the search: {e}"
)