"""Embedding tools"""

import logging
from pathlib import Path
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.internal.bdd_manager import create_collection, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document
from app.settings import settings


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

embedding_router = APIRouter(
    prefix="/embeddings",
    tags=["documents"],
    responses={404: {"description": "Not found"}},
)

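# Module-level setup: the collections, embedder, vector stores, and text
# splitter below are created once at import time and shared by all requests.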
user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)

doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)

embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")

doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)

user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)

text_splitter = get_text_chunker()
logger.info("Text splitter initialized.")


def get_vectorstore(vectorstore_type: str):
    """Return the vector store matching ``vectorstore_type`` ("user" or "doc")."""
    if vectorstore_type == "user":
        return user_vector_store

    if vectorstore_type == "doc":
        return doc_vector_store

    # Fail fast with a 400 instead of returning None, which would otherwise
    # surface later as an AttributeError inside the endpoint handlers.
    raise HTTPException(
        status_code=400,
        detail=f"Unknown vectorstore_type: {vectorstore_type!r} (expected 'user' or 'doc').",
    )


class DocPathsInput(BaseModel):  # TODO move to schema.py
    doc_paths: str  # path to a folder containing the PDF files to embed
    vectorstore_type: str


@embedding_router.post("/embedded/")
async def embedding(doc_paths_input: DocPathsInput):
    """
    Embeds documents provided via file paths and adds them to the vector store.

    Args:
        doc_paths_input (DocPathsInput): A Pydantic model containing the path
        to a folder of PDF documents and the target vector store type.

    Returns:
        dict: A response containing the number of documents added to the vector store.

    Raises:
        HTTPException: If the embedding process fails. Files that fail to
        parse are logged and skipped rather than aborting the whole run.
    """

    logger.info("Received request to embed documents: %s", doc_paths_input.doc_paths)
    vector_store = get_vectorstore(doc_paths_input.vectorstore_type)

    try:
        folder_path = doc_paths_input.doc_paths
        logger.info("Scanning folder for PDFs: %s", folder_path)
        doc_paths = get_pdf_paths(folder_path)
        logger.info("Found %d PDF file(s): %s", len(doc_paths), doc_paths)
        documents_added = 0
        for path in doc_paths:
            try:
                logger.info("Parsing document at path: %s", path)
                parsed_documents = parse_document(path)
                doc_title = Path(path).name  # portable; replaces path.split("\\")[-1]
                logger.info("Document parsed: %s", doc_title)

                documents = text_splitter.create_documents(
                    parsed_documents,
                    metadatas=[{"Title": doc_title} for _ in parsed_documents],
                )
                logger.info(
                    "Created %d document chunks for: %s", len(documents), doc_title
                )

                vector_store.add_documents(documents)
                documents_added += 1

                logger.info("Documents added to vector store: %s", doc_title)

            except Exception as e:
                logger.error(
                    "An error occurred while parsing the file %s: %s", path, e
                )

        logger.info("All documents successfully processed and embedded.")
        return {
            "message": "Documents successfully embedded and stored",
            "documents_added": len(doc_paths),
        }

    except Exception as e:
        logger.error("An error occurred during the embedding process: %s", e)
        raise HTTPException(
            status_code=500, detail=f"An error occurred: {e!s}"
        ) from e
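

# Example request against the endpoint above (a minimal sketch: the host/port
# and the folder path are illustrative assumptions, not part of this module):
#
#   import httpx
#
#   resp = httpx.post(
#       "http://localhost:8000/embeddings/embedded/",
#       json={"doc_paths": "/data/pdfs", "vectorstore_type": "doc"},
#   )
#   print(resp.json())  # {"message": "...", "documents_added": <count>}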


class SearchQuery(BaseModel):  # TODO move to schema.py
    vectorstore_type: str
    query: str
    k: int = 2


@embedding_router.post("/similarity_search/")
async def search_documents(search_query: SearchQuery):
    """
    Search for documents in the vector store based on a query.

    Args:
        search_query (SearchQuery): A Pydantic model containing the vector
        store type, the query string, and the number of results (k).

    Returns:
        List[dict]: A list of documents matching the query, including their content and metadata.

    Raises:
        HTTPException: If the search process fails or no documents are found.
    """
    logger.info("Received similarity search query: %s", search_query.query)

    vector_store = get_vectorstore(search_query.vectorstore_type)

    try:
        found_docs = vector_store.similarity_search(
            search_query.query, k=search_query.k
        )
        logger.info(
            "Found %d documents for query: %s", len(found_docs), search_query.query
        )

        if not found_docs:
            logger.warning("No documents found for query: %s", search_query.query)
            raise HTTPException(
                status_code=404, detail="No documents found for the given query."
            )

        logger.info("Returning results for query: %s", search_query.query)
        return [
            {
                "content": doc.page_content,
                "metadata": doc.metadata if hasattr(doc, "metadata") else None,
            }
            for doc in found_docs
        ]
    except HTTPException:
        # Re-raise the 404 above unchanged instead of masking it as a 500.
        raise
    except Exception as e:
        logger.error("An error occurred during the similarity search: %s", e)
        raise HTTPException(
            status_code=500, detail=f"An error occurred during the search: {e}"
        ) from e