sami606713 commited on
Commit
27a8994
·
verified ·
1 Parent(s): bdb9574

Upload 17 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ my_faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+ .env
9
+
10
+ # Virtual environments
11
+ .venv
README.md CHANGED
@@ -1,20 +1,46 @@
1
- ---
2
- title: RagChatbot
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: This Chat Bot can answer user query based on knowledegbase
12
- license: mit
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Backend
2
+
3
+ This folder contains the backend services for the Document Chat App.
4
+
5
+ ### `app.py`
6
+ This file is the main entry point for the Streamlit web application. It handles the user interface, chat history management, and interacts with the `agent` to process user queries and generate responses.
7
+
8
+ ### `main.py`
9
+ This script is responsible for processing documents. It loads and extracts data (tables, texts, images) from PDF files in the `data` directory, summarizes them using the `summerizer` module, and then chunks and adds the processed documents to the vector store. It keeps track of processed files in `processed_files.txt` to avoid reprocessing.
10
+
11
+ ### `data/`
12
+ This directory is intended to store the raw PDF documents that need to be processed by the system.
13
+
14
+ ### `vectorStore/`
15
+ This directory stores the generated vector embeddings of the processed documents. These embeddings are used by the `agent` for retrieving relevant information during the chat.
16
+
17
+ ### `agent/`
18
+ This module contains the logic for the conversational agent, which uses the vector store to answer questions based on the processed documents.
19
+
20
+ ### `summerizer/`
21
+ This module provides functionalities for summarizing different types of content (text, images) extracted from the documents.
22
+
23
+ ### `utils/`
24
+ This module contains utility functions, such as `helper.py` for loading and extracting data from documents.
25
+
26
+ ### `tool/`
27
+ This module likely contains tools or functions used by the agent to perform specific tasks.
28
+
29
+ ### `generator.py`
30
+ This file likely contains code related to generating responses or content within the application.
31
+
32
+ ### How to Run
33
+ To run the backend application, you will typically run `app.py` using Streamlit after ensuring all dependencies are installed and documents are processed by `main.py`.
34
+
35
+ ```bash
36
+ streamlit run app.py
37
+ ```
38
+
39
+ ### Running `main.py` for Document Embedding
40
+ To process and embed documents, run the `main.py` script. This script will load PDF files from the `data` directory, extract and summarize their contents, and then add them to the vector store.
41
+
42
+ ```bash
43
+ python main.py
44
+ ```
45
+
46
+ Make sure that the `data` directory contains the PDF files you want to process. The script will log processed files in `processed_files.txt` to avoid reprocessing them.
agent/__pycache__/agent.cpython-311.pyc ADDED
Binary file (3.75 kB). View file
 
agent/agent.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agno.agent import Agent
2
+ from agno.agent import Agent, RunResponse
3
+ from agno.models.groq import Groq
4
+ from agno.tools.reasoning import ReasoningTools
5
+ from agno.tools.thinking import ThinkingTools
6
+ from vectorStore.vectorStore import GetContext
7
+ from agno.models.openai import OpenAIChat
8
+
9
+
10
def RunAgent(query):
    """Answer ``query`` with an agent restricted to retrieved document context.

    The agent is given the ``GetContext`` retrieval tool together with the
    reasoning and thinking toolkits, and is instructed to answer only from
    the context it retrieves.

    Args:
        query: The user's question.

    Returns:
        The content of the agent's response, or the exception message as a
        string if building or running the agent fails.
    """
    # Local import keeps this block self-contained.
    import os

    try:
        agent = Agent(
            tools=[
                GetContext,
                ReasoningTools(add_instructions=True),
                ThinkingTools(add_instructions=True),
            ],
            description=(
                "This agent strictly processes user queries using ONLY the provided context. It must not use external knowledge or assumptions beyond the context. "
                "If the exact answer is not found, it must reason based on the available information to generate a helpful response. "
                "If reasoning is not possible from the given context, the agent must clearly state that it cannot answer the query and prompt the user to try a related query. "
                "At no point should the agent fabricate information or rely on knowledge not present in the provided context."
            ),
            instructions=[
                """
                Role:
                - You are an assistant representing <BOT_NAME>. Your job is to assist users strictly based on the provided context.

                Core Rules:
                1. Use ONLY the provided context to generate responses.
                2. DO NOT use any external knowledge, assumptions, prior training data, or general world knowledge.
                3. If the context does not provide a clear answer, try to infer a reasonable response *only within* the scope of the context.
                4. If a reasonable answer cannot be formed from the context, respond exactly with:
                "Apologies; I am not sure about that. Please head over to <SUPPORT_URL> for some additional help from our team."
                5. NEVER fabricate, guess, or hallucinate any information not clearly supported by the context.
                6. Do NOT suggest or imply you have access to any knowledge beyond the context.
                7. Include the resource/document name at the end of the response.

                Compliance:
                - This is a ZERO-TOLERANCE instruction set.
                - Any use of information outside the provided context is a strict violation.
                """
            ],
            show_tool_calls=True,
            markdown=True,
            # SECURITY: the API key must come from the environment, never from
            # source control. Any key that was ever committed has to be
            # revoked and rotated.
            model=OpenAIChat(id="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
        )

        # Run Agent
        response: RunResponse = agent.run(query, stream=False, structured_outputs=True)

        return response.content
    except Exception as e:
        # Callers display the returned text, so failures are reported as a
        # string rather than raised.
        return str(e)


if __name__ == "__main__":
    query = "tell me about enery and climate changes?"
    response = RunAgent(query=query)
    print(">> Response: ", response)
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from agent.agent import RunAgent
3
+
4
# Page chrome.
st.set_page_config(page_title="Document Chat App", layout="wide")
st.title("📄 Document Chat App")

# The transcript is kept in session state as a list of (speaker, message) pairs.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []


def _render(role, text):
    """Draw one chat bubble for ``role`` containing the markdown ``text``."""
    with st.chat_message(name=role):
        st.markdown(text)


# Replay everything said so far.
for past_role, past_text in st.session_state.chat_history:
    _render(past_role, past_text)

# Chat input
user_input = st.chat_input("Ask something about your document...")

if user_input:
    # Echo the question, ask the agent, then show its answer.
    _render("You", user_input)
    response = RunAgent(query=user_input)
    _render("Bot", response)

    # Record both turns so they are replayed on the next run.
    st.session_state.chat_history.extend(
        [("You", user_input), ("Bot", response)]
    )
app_debug.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 2025-06-16 00:17:48,085 - INFO - Session state initialized
2
+ 2025-06-16 00:17:48,085 - INFO - Debug: App started successfully
3
+ 2025-06-16 00:17:48,087 - INFO - Debug: Directories created successfully
generator.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from agent.agent import RunAgent
2
+
3
if __name__ == "__main__":
    # Ad-hoc smoke test: send one fixed question through the agent and print
    # whatever comes back.
    query = """
    Explain how the crystalline structure of cellulose impacts its enzymatic hydrolysis. What methods are used to overcome this challenge?
    """
    print(">> Response: ", RunAgent(query=query))
main.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from utils.helper import LoadAndExtractData # Uncomment if you want to process files
4
+ from summerizer.imageSummerizer import Image_Summerizer
5
+ from summerizer.textSummerizer import TextSummerizer
6
+ from langchain_core.documents import Document
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from vectorStore.vectorStore import add_to_vector_store
9
+
10
def _read_processed_names(log_path):
    """Return the set of file names already recorded in ``log_path``."""
    if not os.path.exists(log_path):
        return set()
    with open(log_path, 'r') as f:
        return set(f.read().splitlines())


def _aligned_summaries(summaries, items, label):
    """Return ``summaries`` as a list with one entry per item, or raise.

    A summarizer failure can surface as ``None`` or as an error string.
    Indexing either would crash obscurely or attach single characters of the
    error message as "summaries", so the mismatch is reported explicitly.
    """
    if not items:
        return []
    if not isinstance(summaries, (list, tuple)) or len(summaries) != len(items):
        raise ValueError(
            f"{label} summarization failed: expected {len(items)} summaries, "
            f"got {summaries!r}"
        )
    return list(summaries)


def _table_page_content(table):
    """Return a string body for a table element (HTML when available)."""
    html = getattr(getattr(table, "metadata", None), "text_as_html", None)
    return html if html else str(table)


def _ingest_pdf(file_path, file):
    """Extract, summarize, chunk and index the single PDF at ``file_path``."""
    tables, texts, images = LoadAndExtractData(file_path)
    # Guard against an error message arriving in place of the image list.
    if isinstance(images, str):
        raise RuntimeError(f"Could not extract data from {file_path}: {images}")

    print(">> Generating Summaries ")
    text_summary = _aligned_summaries(TextSummerizer(data=texts), texts, "Text")
    tables_summary = _aligned_summaries(TextSummerizer(data=tables), tables, "Table")
    images_summary = _aligned_summaries(Image_Summerizer(data=images), images, "Image")

    print("Text Sumary: ", text_summary)
    print("Table Summary: ", tables_summary)
    print("Image Susmmary: ", images_summary)

    print(">> Summary Generated")

    print(">> Combine Each and every thing into one document")
    # Create Document objects for text chunks
    text_docs = [
        Document(
            page_content=str(text),
            metadata={"type": "text", "summary": summary, "source": file_path, "name": file},
        )
        for text, summary in zip(texts, text_summary)
    ]

    # Create Document objects for tables. page_content must be a string, so
    # the element is converted (HTML representation preferred).
    table_docs = [
        Document(
            page_content=_table_page_content(table),
            metadata={"type": "table", "summary": summary, "source": file_path, "name": file},
        )
        for table, summary in zip(tables, tables_summary)
    ]

    # Create Document objects for images (base64 payload as content)
    image_docs = [
        Document(
            page_content=image,
            metadata={"type": "image", "summary": summary, "source": file_path, "name": file},
        )
        for image, summary in zip(images, images_summary)
    ]

    # Combine all document types into a single list
    docs = text_docs + table_docs + image_docs

    print(">> Splitting Documents")
    document_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Example size, adjust based on your needs
        chunk_overlap=200,  # Example overlap, adjust based on your needs
        length_function=len,
        is_separator_regex=False,
    )

    # Split the documents
    docs_chunks = document_splitter.split_documents(docs)
    print(">> Splitting Done")

    add_to_vector_store(docs_chunks=docs_chunks)


def main():
    """Ingest every not-yet-processed PDF in ``data/`` into the vector store.

    File names listed in ``processed_files.txt`` are skipped, as are files
    that do not end in ``.pdf``. A file name is appended to the log only
    after its chunks have been added to the vector store.

    Returns:
        ``None`` on success, or the error message as a string if any step
        fails (processing stops at the first failure).
    """
    try:
        root_dir = "data"
        processed_log_path = "processed_files.txt"

        # Load already processed file names
        processed_files = _read_processed_names(processed_log_path)

        files = os.listdir(root_dir)
        print(">> Files: ", files)
        print(">> Process Files: ", processed_files)
        print(">> Processing Files ")
        for file in files:
            # Only process PDFs that have not been logged as processed
            if file in processed_files or not file.lower().endswith('.pdf'):
                print(f"!! Skipping already processed or unsupported file: {file}")
                continue

            print(f">> Processing: {file}")
            _ingest_pdf(os.path.join(root_dir, file), file)

            # Append to log file only once indexing has succeeded
            with open(processed_log_path, 'a') as f:
                f.write(file + '\n')

            print(f">> Marked {file} as processed")

    except Exception as e:
        print("Error is:", str(e))
        return str(e)


if __name__ == "__main__":
    main()
my_faiss_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c04a5dd5bbd2a99da72a7ab0084626a7501a920cc1e05034fc3449ecb206330e
3
+ size 56463405
my_faiss_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e088d4825bc2685515fe7ef79aff669292ad3652c9ffbb296f6f7d87827cf00c
3
+ size 10708178
processed_files.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AMMONIA PRODUCTION TECHNOLOGIES CAP 8 STORAGE.pdf
2
+ Ammonias-Double-Edged-Sword-Clean-Energy-or-Catastrophic-Risk.pdf
3
+ Waste to Fuel in NY.pdf
4
+ Charting-a-Greener-Course-Embrace-CCS-on-Maritime-Vessels.pdf
5
+ wind-assisted ship propulsion.pdf
6
+ Clean Energy Market Analysis in the US.pdf
7
+ Clean investment US.pdf
8
+ emission-factors_2014.pdf
9
+ Green Hydrogen the Race to Success-Members.pdf
10
+ GREENHOUSE GAS EMISSIONS FROM BIo ETHANOL AND BIO-DIESEL FUEL SUPPLY.pdf
11
+ Hydrogen Bunkering at Ports by Eliseo Curcio.pdf
12
+ Cheat Sheet Hydrogen (1).pdf
13
+ Comparative Life Cycle Assessment of Bioethanol Production from Different Generations ofBiomass and Waste Feedstocks.pdf
14
+ Comparison of biofuel life-cycle GHG emissions assessment tools.pdf
15
+ Green-Hydrogen-The-Race-to-Success.pdf
16
+ LCFS Vs RFS.pdf
17
+ Post strategy.pdf
18
+ Review of Second Generation Bioethanol Production.pdf
requirements.txt CHANGED
@@ -1,3 +1,22 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ unstructured[all-docs]
2
+ pillow
3
+ lxml
4
+ pillow
5
+ tiktoken
6
+ langchain
7
+ langchain-community
8
+ langchain-openai
9
+ langchain-groq
10
+ python_dotenv
11
+ pymilvus[model]
12
+ pymilvus
13
+ transformers
14
+ torch
15
+ poppler-utils
16
+ tesseract
17
+ pytesseract
18
+ langchain-groq
19
+ langchain-milvus
20
+ agno
21
+ streamlit
22
+ faiss-cpu
summerizer/imageSummerizer.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Image Summerizer
2
+ from utils.helper import Summarizer
3
+
4
# Default instruction sent alongside each image.
prompt_template = """Describe the image in detail. For context,
the image is part of a research paper.
Be specific about graphs, such as bar plots."""


def Image_Summerizer(prompt_template=prompt_template, data=None):
    """Describe each base64-encoded image in ``data``.

    Delegates to ``Summarizer`` in image-message mode without a concurrency
    limit.

    Args:
        prompt_template: Instruction text sent with every image.
        data: Batch of base64-encoded images.

    Returns:
        Whatever ``Summarizer`` returns for the batch.

    Raises:
        Exception: Any error raised by ``Summarizer`` is logged and re-raised
            instead of being silently turned into ``None``.
    """
    try:
        return Summarizer(
            prompt_template=prompt_template,
            data=data,
            config=False,
            set_messages=True,
        )
    except Exception as e:
        print("Error is: ", str(e))
        raise


if __name__ == "__main__":
    pass
summerizer/textSummerizer.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Summerize
2
+ from utils.helper import Summarizer
3
+
4
# Default prompt; ``{element}`` is replaced with the chunk being summarized.
prompt_text = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk: {element}

"""


def TextSummerizer(prompt_template=prompt_text, data=None):
    """Summarize each text or table chunk in ``data``.

    Delegates to ``Summarizer`` in plain-template mode with the concurrency
    limit enabled.

    Args:
        prompt_template: Prompt containing an ``{element}`` placeholder.
        data: Batch of text or table chunks.

    Returns:
        Whatever ``Summarizer`` returns for the batch.

    Raises:
        Exception: Any error raised by ``Summarizer`` is logged and re-raised
            instead of being silently turned into ``None``.
    """
    try:
        return Summarizer(
            prompt_template=prompt_template,
            data=data,
            config=True,
            set_messages=False,
        )
    except Exception as e:
        print("Error is: ", str(e))
        raise


if __name__ == "__main__":
    # The summarizer works on a batch, so the sample text is wrapped in a list.
    print(TextSummerizer(data=["""
    To download and use Poppler as a Python library (or make it accessible to Python), follow these steps based on your operating system. Poppler is not a Python package—it's a C++ PDF rendering library with command-line tools like pdfinfo, pdftotext, and others, which Python libraries like unstructured or pdf2image call internally.
    """]))
utils/helper.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unstructured.partition.pdf import partition_pdf
2
+ from langchain_openai import ChatOpenAI
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
+
9
def get_images_base64(chunks):
    """Collect the base64 payloads of images nested inside composite chunks.

    Only chunks whose type name contains ``CompositeElement`` are inspected;
    within them, every original element whose type name contains ``Image``
    contributes its ``metadata.image_base64`` value, in encounter order.
    """
    return [
        element.metadata.image_base64
        for chunk in chunks
        if "CompositeElement" in str(type(chunk))
        for element in chunk.metadata.orig_elements
        if "Image" in str(type(element))
    ]
18
+
19
+
20
def LoadAndExtractData(file_path):
    """Partition a PDF and split the result into tables, texts and images.

    Args:
        file_path: Path of the PDF to partition.

    Returns:
        A ``(tables, texts, images)`` tuple: the chunks whose type name
        contains ``Table``, the chunks whose type name contains
        ``CompositeElement``, and the base64 payloads of the images embedded
        in the composite chunks.

    Raises:
        Exception: Any extraction error is logged and re-raised. Returning an
            error message in the images slot would make callers treat the
            message as image data.
    """
    try:
        # separate tables from texts
        tables = []
        texts = []

        print(">> Extracting Data")
        data = partition_pdf(
            filename=file_path,
            infer_table_structure=True,  # extract tables
            # NOTE(review): strategy="hi_res" was left disabled in the
            # original; confirm table inference works with the default.
            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,  # if true, will extract base64 for API usage
            chunking_strategy="by_title",  # or 'basic'
            max_characters=10000,  # defaults to 500
            combine_text_under_n_chars=2000,  # defaults to 0
            new_after_n_chars=6000,
        )

        # Extract the tables and text
        print(">> Extracting Text and tables...")
        for chunk in data:
            if "Table" in str(type(chunk)):
                tables.append(chunk)

            if "CompositeElement" in str(type(chunk)):
                texts.append(chunk)
        print(">> Chunks are: ", data)

        # extract the image
        print(">> Extracting Images...")
        images = get_images_base64(data)
        return tables, texts, images
    except Exception as e:
        print("Error is: ", str(e))
        raise
60
+
61
+
62
+
63
+ # Summarizer Function
64
def Summarizer(prompt_template, data, config=True, set_messages=False):
    """Summarize a batch of inputs with a prompt and the ChatOpenAI model.

    Args:
        prompt_template (str): Prompt text. In template mode it must contain
            an ``{element}`` placeholder; in message mode it is sent as the
            text part of a user message next to the image.
        data (list): Inputs to summarize (text/table chunks, or base64 image
            strings in message mode). ``None`` or an empty batch yields
            ``[]``; a single string is treated as a one-item batch.
        config (bool): Whether to run the batch with a concurrency limit of 3.
        set_messages (bool): Whether to build a chat message containing an
            image instead of a plain text template.

    Returns:
        List[str]: One summary per input, in order.

    Raises:
        Exception: Any model or prompt error is logged and re-raised.
            Returning the error text would violate the list contract, since
            callers index the result per input.
    """
    if data is None:
        return []
    if isinstance(data, str):
        # A bare string would otherwise be batched character by character.
        data = [data]
    if not data:
        # Nothing to summarize: avoid building a model client at all.
        return []

    try:
        if set_messages:
            messages = [
                (
                    "user",
                    [
                        {"type": "text", "text": prompt_template},
                        {
                            "type": "image_url",
                            "image_url": {"url": "data:image/jpeg;base64,{image}"},
                        },
                    ],
                )
            ]
            prompt = ChatPromptTemplate.from_messages(messages)
            input_key = "image"
        else:
            prompt = ChatPromptTemplate.from_template(prompt_template)
            input_key = "element"

        model = ChatOpenAI(temperature=0.5, model="gpt-4o-mini")
        summarize_chain = {input_key: lambda x: x} | prompt | model | StrOutputParser()

        if config:
            return summarize_chain.batch(data, {"max_concurrency": 3})
        return summarize_chain.batch(data)
    except Exception as e:
        print("Error is: ", str(e))
        raise
107
+
vectorStore/__pycache__/vectorStore.cpython-311.pyc ADDED
Binary file (4.28 kB). View file
 
vectorStore/vectorStore.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymilvus import MilvusClient
2
+ # from langchain_milvus import Milvus
3
+ from langchain_openai import OpenAIEmbeddings
4
+ from langchain.docstore.document import Document
5
+ from tqdm import tqdm
6
+ from dotenv import load_dotenv
7
+ from typing import List
8
+ import os
9
+
10
+ load_dotenv()
11
+
12
# SECURITY: no credentials in source. OpenAIEmbeddings reads the key from the
# OPENAI_API_KEY environment variable (load_dotenv() above loads a local .env).
# Any key that was ever committed must be revoked and rotated.
embeddings = OpenAIEmbeddings()
13
+
14
+ from typing import List
15
+
16
+ # =============Fais Setup============#
17
+ from typing import List
18
+ from langchain_core.documents import Document
19
+ import faiss
20
+ from langchain_community.docstore.in_memory import InMemoryDocstore
21
+ from langchain_community.vectorstores import FAISS
22
+ from uuid import uuid4
23
+ from tqdm import tqdm
24
+
25
+
26
def add_to_vector_store(docs_chunks: List[Document], batch_size: int = 64, vector_store_path="my_faiss_index"):
    """Embed document chunks and persist them in a FAISS index on disk.

    An index already present at ``vector_store_path`` is loaded and extended;
    otherwise a new flat L2 index is created. Chunks are inserted in batches
    and the index is saved once all batches are in.

    Args:
        docs_chunks (List[Document]): LangChain documents to embed and store.
        batch_size (int): Number of documents inserted per batch.
        vector_store_path: Directory the index is loaded from and saved to.

    Returns:
        dict: ``status``, the ``vector_store`` object and ``num_documents``.
    """
    total = len(docs_chunks)
    print(f">> Starting embedding for {total} documents...\n")

    if not os.path.exists(vector_store_path):
        print(">> Creating the index <<")
        # Size the index from the dimensionality of one sample embedding.
        sample_vector = embeddings.embed_query("hello world")
        vector_store = FAISS(
            embedding_function=embeddings,
            index=faiss.IndexFlatL2(len(sample_vector)),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )
    else:
        print(">> Loading the index <<")
        # NOTE: loading unpickles index.pkl; only load index files you trust.
        vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    # One fresh unique id per document.
    doc_ids = [str(uuid4()) for _ in docs_chunks]

    print(f"\n📦 Preparing to insert {total} documents into FAISS...\n")
    batch_starts = range(0, total, batch_size)
    for start in tqdm(batch_starts, desc="🔍 Embedding & Inserting", unit="batch"):
        stop = start + batch_size
        vector_store.add_documents(
            documents=docs_chunks[start:stop],
            ids=doc_ids[start:stop],
        )

    vector_store.save_local(vector_store_path)
    print("✅ Data insertion successful!\n")

    outcome = {
        "status": "success",
        "vector_store": vector_store,
        "num_documents": total,
    }
    return outcome
75
+
76
def GetContext(query: str):
    """Retrieve the document chunks most relevant to a question.

    Use this tool to look up context from the indexed documents before
    answering a user query.

    Args:
        query (str): The user's question or search text.

    Returns:
        dict: ``{"Context": results}`` where ``results`` is the list of the
        two most similar documents found in the ``my_faiss_index`` store.
    """
    # NOTE: loading unpickles index.pkl; only load index files you trust.
    vector_store = FAISS.load_local(
        "my_faiss_index",
        embeddings,
        allow_dangerous_deserialization=True,
    )

    results = vector_store.similarity_search(query, k=2)

    return {"Context": results}


if __name__ == "__main__":
    pass