File size: 5,608 Bytes
737f55b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
"""Embedding tools"""
import logging
from typing import List
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from app.internal.bdd_manager import create_collection, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document
from app.settings import settings
# Configure the root logger at INFO level. This runs as a side effect of
# importing this module.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used by the initialization code and endpoints below.
logger = logging.getLogger(__name__)
# Router that groups the embedding endpoints defined in this file: routes are
# mounted under the "/embeddings" prefix, tagged "documents", and a 404
# "Not found" entry is declared in the router-level `responses`.
embedding_router = APIRouter(
    prefix="/embeddings",
    tags=["documents"],
    responses={404: {"description": "Not found"}},
)
# --- Import-time initialization -------------------------------------------
# Everything below executes once, when this module is first imported.
# NOTE(review): create_collection() is called unconditionally on every import;
# presumably it is idempotent when the collection already exists -- confirm.

# Collection holding user-provided content; the name comes from settings.
user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)
# Collection holding reference documents; the name comes from settings.
doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)
# A single embedder (selected by settings.provider) is shared by both stores.
embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")
# One vector store per collection; get_vectorstore() below picks between them.
doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)
user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)
# Chunker used by the embedding endpoint to split parsed text into documents.
text_splitter = get_text_chunker()
logger.info("Text splitter initialized.")
def get_vectorstore(vectorstor_type):
    """
    Pick the module-level vector store that matches a selector string.

    Args:
        vectorstor_type: "user" selects the user store, "doc" selects the
            document store.

    Returns:
        The matching vector store, or None when the selector is neither
        "user" nor "doc".
    """
    selected_store = None
    if vectorstor_type == "user":
        selected_store = user_vector_store
    elif vectorstor_type == "doc":
        selected_store = doc_vector_store
    return selected_store
# Request body for the POST "/embedded/" endpoint defined in this file.
class DocPathsInput(BaseModel): # TODO move to schema.py
    # Despite the plural name this is one string; the endpoint passes it to
    # get_pdf_paths() as a single folder path.
    doc_paths: str
    # Selector given to get_vectorstore(): "user" or "doc".
    vectorstor_type: str
@embedding_router.post("/embedded/")
async def embedding(doc_paths_input: DocPathsInput):
    """
    Embed every PDF found under a folder and add the chunks to a vector store.

    Each file is parsed, chunked and stored independently: a failure on one
    file is logged and counted, and processing continues with the next file.

    Args:
        doc_paths_input (DocPathsInput): Request body. ``doc_paths`` is passed
            to ``get_pdf_paths`` as a single folder path; ``vectorstor_type``
            selects the target store ("user" or "doc").

    Returns:
        dict: ``message``, ``documents_added`` (number of files that were
        parsed and stored without error) and ``documents_failed`` (number of
        files that raised during parsing, chunking or storing).

    Raises:
        HTTPException: 400 if ``vectorstor_type`` does not match a known
            store; 500 if an error occurs outside the per-file handling
            (for example while listing the folder).
    """
    logger.info("Received request to embed documents: %s", doc_paths_input.doc_paths)
    vector_store = get_vectorstore(doc_paths_input.vectorstor_type)
    # get_vectorstore() returns None for an unknown selector. Without this
    # guard every file would fail on `None.add_documents` inside the per-file
    # handler and the endpoint would still report success.
    if vector_store is None:
        logger.error(
            "Unknown vector store type: %s", doc_paths_input.vectorstor_type
        )
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unknown vectorstor_type: {doc_paths_input.vectorstor_type!r}. "
                "Expected 'user' or 'doc'."
            ),
        )
    # NOTE(review): parse_document / add_documents are called synchronously
    # inside an async endpoint; if they block, they stall the event loop --
    # confirm whether they should be offloaded to a worker thread.
    try:
        folder_path = doc_paths_input.doc_paths
        logger.info(folder_path)
        doc_paths = get_pdf_paths(folder_path)
        logger.info(doc_paths)
        added_count = 0
        failed_count = 0
        for path in doc_paths:
            try:
                logger.info("Parsing document at path: %s", path)
                parsed_documents = parse_document(path)
                # Keep only the file name, whichever separator the path uses
                # (the previous code only understood Windows backslashes).
                doc_title = path.split("\\")[-1].split("/")[-1]
                logger.info("Document parsed: %s", doc_title)
                documents = text_splitter.create_documents(
                    parsed_documents,
                    # One independent metadata dict per parsed text.
                    metadatas=[{"Title": doc_title} for _ in parsed_documents],
                )
                logger.info(
                    "Created %d document chunks for: %s", len(documents), doc_title
                )
                vector_store.add_documents(documents)
                logger.info("Documents added to vector store: %s", doc_title)
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch,
                # but the failure is recorded with its traceback.
                failed_count += 1
                logger.exception(
                    "An error occured during the parsing of the file %s: %s",
                    path,
                    e,
                )
            else:
                added_count += 1
        if failed_count:
            logger.warning(
                "Embedding finished: %d document(s) stored, %d failed.",
                added_count,
                failed_count,
            )
        else:
            logger.info("All documents successfully processed and embedded.")
        return {
            "message": "Documents successfully embedded and stored",
            "documents_added": added_count,
            "documents_failed": failed_count,
        }
    except Exception as e:
        logger.exception("An error occurred during the embedding process: %s", e)
        raise HTTPException(
            status_code=500, detail=f"An error occurred: {e!s}"
        ) from e
# Request body for the POST "/similarity_search/" endpoint defined in this file.
class SearchQuery(BaseModel): # TODO move to schema.py
    # Selector given to get_vectorstore(): "user" or "doc".
    vectorstor_type: str
    # Query text passed to the vector store's similarity_search().
    query: str
    # Number of results requested (forwarded as `k=`); defaults to 2.
    k: int = 2
@embedding_router.post("/similarity_search/")
async def search_documents(search_query: SearchQuery):
    """
    Run a similarity search against the selected vector store.

    Args:
        search_query (SearchQuery): Request body with the target store
            selector (``vectorstor_type``), the query string (``query``) and
            the number of results to request (``k``).

    Returns:
        List[dict]: One entry per matching document, with its ``content``
        (``page_content``) and its ``metadata`` (None if the document has no
        ``metadata`` attribute).

    Raises:
        HTTPException: 400 if ``vectorstor_type`` does not match a known
            store; 404 if the search returns no documents; 500 if the search
            itself fails.
    """
    logger.info("Received similarity search query: %s", search_query.query)
    vector_store = get_vectorstore(search_query.vectorstor_type)
    # get_vectorstore() returns None for an unknown selector; report that as
    # a client error instead of failing later with an AttributeError / 500.
    if vector_store is None:
        logger.error("Unknown vector store type: %s", search_query.vectorstor_type)
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unknown vectorstor_type: {search_query.vectorstor_type!r}. "
                "Expected 'user' or 'doc'."
            ),
        )
    try:
        found_docs = vector_store.similarity_search(
            search_query.query, k=search_query.k
        )
        logger.info(
            "Found %d documents for query: %s", len(found_docs), search_query.query
        )
        if not found_docs:
            logger.warning("No documents found for query: %s", search_query.query)
            raise HTTPException(
                status_code=404, detail="No documents found for the given query."
            )
        logger.info("Returning results for query: %s", search_query.query)
        return [
            {
                "content": doc.page_content,
                "metadata": getattr(doc, "metadata", None),
            }
            for doc in found_docs
        ]
    except HTTPException:
        # HTTPException is an Exception subclass: without this clause the 404
        # raised above would be caught below and turned into a 500.
        raise
    except Exception as e:
        logger.exception("An error occurred during the similarity search: %s", e)
        raise HTTPException(
            status_code=500, detail=f"An error occurred during the search: {e}"
        ) from e
|