# import os | |
# from typing import List | |
# from dotenv import load_dotenv | |
# import chainlit as cl | |
# from langchain_community.embeddings import HuggingFaceEmbeddings | |
# from langchain_text_splitters import RecursiveCharacterTextSplitter | |
# from langchain_community.vectorstores import FAISS | |
# from langchain_community.document_loaders import PyPDFLoader | |
# from langchain.chains import RetrievalQA | |
# from langchain_groq import ChatGroq | |
# from langchain_huggingface import HuggingFaceEmbeddings | |
# # Load environment variables | |
# load_dotenv() | |
# # Initialize embedding model | |
# # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") | |
# openai.api_key = os.getenv("OPENAI_API_KEY") | |
# # Initialize embedding model using OpenAI | |
# embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key,model="text-embedding-3-small") | |
# # Initialize vector store | |
# vector_store = None | |
# # Store PDF file paths | |
# pdf_files = {} | |
# # Define the path for the FAISS index | |
# FAISS_INDEX_PATH = "faiss_index" | |
# def process_pdfs(directory: str) -> None: | |
# """Process all PDFs in the given directory and add them to the vector store.""" | |
# global vector_store, pdf_files | |
# documents = [] | |
# for filename in os.listdir(directory): | |
# if filename.endswith(".pdf"): | |
# file_path = os.path.join(directory, filename) | |
# loader = PyPDFLoader(file_path) | |
# documents.extend(loader.load()) | |
# pdf_files[filename] = file_path | |
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) | |
# texts = text_splitter.split_documents(documents) | |
# if os.path.exists(FAISS_INDEX_PATH): | |
# vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True) | |
# vector_store.add_documents(texts) | |
# else: | |
# vector_store = FAISS.from_documents(texts, embeddings) | |
# # Save the updated vector store | |
# vector_store.save_local(FAISS_INDEX_PATH) | |
# @cl.on_chat_start | |
# async def start(): | |
# """Initialize the chat session.""" | |
# await cl.Message(content="Welcome! Processing PDFs...").send() | |
# # Process PDFs (replace with your PDF directory) | |
# process_pdfs(r"C:\Users\sumes\OneDrive\Documents\pdf_docs") | |
# await cl.Message(content="PDFs processed. You can now ask questions!").send() | |
# @cl.on_message | |
# async def main(message: cl.Message): | |
# """Handle user messages and generate responses.""" | |
# if vector_store is None: | |
# await cl.Message(content="Error: Vector store not initialized.").send() | |
# return | |
# query = message.content | |
# retriever = vector_store.as_retriever(search_kwargs={"k": 1}) | |
# llm = OpenAI(openai_api_key=openai.api_key, model="gpt-4o-mini", temperature=0.4) | |
# qa_chain = RetrievalQA.from_chain_type( | |
# llm=llm, | |
# chain_type="stuff", | |
# retriever=retriever, | |
# return_source_documents=True | |
# ) | |
# result = qa_chain(query) | |
# answer = result['result'] | |
# source_docs = result['source_documents'] | |
# await cl.Message(content=answer).send() | |
# if source_docs: | |
# sources_message = "Sources:\n" | |
# for doc in source_docs: | |
# file_name = os.path.basename(doc.metadata['source']) | |
# if file_name in pdf_files: | |
# file_path = pdf_files[file_name] | |
# elements = [ | |
# cl.Text(name=file_name, content=f"Source: {file_name}"), | |
# cl.File(name=file_name, path=file_path, display="inline") | |
# ] | |
# await cl.Message(content=f"Source: {file_name}", elements=elements).send() | |
# else: | |
# sources_message += f"- {doc.metadata['source']}\n" | |
# if sources_message != "Sources:\n": | |
# await cl.Message(content=sources_message).send() | |
# if __name__ == "__main__": | |
# cl.run() | |
import os | |
from typing import List | |
from dotenv import load_dotenv | |
import chainlit as cl | |
from langchain_community.embeddings import OpenAIEmbeddings | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain_community.vectorstores import FAISS | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.chains import RetrievalQA | |
from langchain_openai import ChatOpenAI | |
from langchain_openai import OpenAIEmbeddings | |
# --- Configuration and shared module state ---------------------------------

# Pull settings from a local .env file into the process environment.
load_dotenv()

# OpenAI credential; None when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")

# On-disk location of the persisted FAISS index and of its main index file.
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")

# Embedding model handed to FAISS when the index is loaded or built.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=openai_api_key,
)

# Shared vector store; remains None until process_pdfs() assigns it.
vector_store = None

# Maps each processed PDF's file name to its full path (filled by process_pdfs()).
pdf_files = {}
def _indexed_sources(store) -> set:
    """Return the ``source`` metadata values of the documents held in *store*."""
    sources = set()
    for doc_id in store.index_to_docstore_id.values():
        # The docstore lookup may hand back a non-document value for an
        # unknown id, so only objects carrying metadata are considered.
        metadata = getattr(store.docstore.search(doc_id), "metadata", None)
        if metadata and metadata.get("source"):
            sources.add(metadata["source"])
    return sources


def process_pdfs(directory: str) -> None:
    """Index every PDF in *directory* into the persisted FAISS vector store.

    Each PDF is loaded, split into overlapping chunks and embedded.  When a
    saved index already exists it is loaded and only chunks whose ``source``
    is not yet present in it are added, so calling this function repeatedly
    does not pile up duplicate vectors.  If the saved index cannot be loaded
    or extended, it is rebuilt from the PDFs found in *directory*.

    Side effects: updates the module-level ``vector_store`` and ``pdf_files``
    and writes the index to ``FAISS_INDEX_PATH``.

    Args:
        directory: Folder scanned (non-recursively) for ``*.pdf`` files.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    global vector_store, pdf_files

    documents = []
    for filename in sorted(os.listdir(directory)):
        # Case-insensitive match so files such as "REPORT.PDF" are included.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory, filename)
        documents.extend(PyPDFLoader(file_path).load())
        pdf_files[filename] = file_path

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = splitter.split_documents(documents)

    store = None
    if os.path.exists(FAISS_INDEX_FILE):
        try:
            # NOTE(security): this flag permits unpickling the stored index;
            # acceptable only while the index directory is written solely by
            # this application.
            store = FAISS.load_local(
                FAISS_INDEX_PATH,
                embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:  # best effort: fall back to a rebuild below
            print(f"Error loading FAISS index: {e}")

    if not texts:
        # Nothing to embed: building an index from an empty list fails, so
        # keep whatever index was loaded (if any) and stop here.
        if store is not None:
            vector_store = store
        else:
            print(f"No PDF content found in {directory}; vector store not created.")
        return

    if store is not None:
        known_sources = _indexed_sources(store)
        new_chunks = [
            chunk for chunk in texts
            if chunk.metadata.get("source") not in known_sources
        ]
        try:
            if new_chunks:
                store.add_documents(new_chunks)
        except Exception as e:
            # E.g. an index created with a different embedding model.
            print(f"Error updating FAISS index: {e}")
            store = None

    if store is None:
        store = FAISS.from_documents(texts, embeddings)

    # Persist the (possibly updated) index for the next run.
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    store.save_local(FAISS_INDEX_PATH)
    vector_store = store
@cl.on_chat_start
async def start():
    """Initialize the chat session and make sure the PDF index is available.

    Registered with Chainlit through ``@cl.on_chat_start``; without the
    decorator the function is never invoked and ``vector_store`` stays None.

    The PDFs are indexed only while ``vector_store`` is still None, i.e. once
    per server process rather than once per chat session.  The folder can be
    overridden with the ``PDF_DOCS_DIR`` environment variable; the previous
    hard-coded path remains the default.
    """
    await cl.Message(content="Welcome! Processing PDFs...").send()

    if vector_store is None:
        pdf_directory = os.getenv(
            "PDF_DOCS_DIR",
            r"C:\Users\sumes\OneDrive\Documents\pdf_docs",
        )
        try:
            # process_pdfs() is synchronous and slow (file parsing, embedding
            # requests); run it in a worker thread so the event loop stays
            # responsive.
            await cl.make_async(process_pdfs)(pdf_directory)
        except Exception as exc:  # top-level boundary: report, don't hide
            print(f"Error processing PDFs: {exc}")
            await cl.Message(content=f"Error: could not process PDFs: {exc}").send()
            return

    if vector_store is None:
        await cl.Message(content="Error: no PDFs could be indexed.").send()
        return

    await cl.Message(content="PDFs processed. You can now ask questions!").send()
# @cl.on_message | |
# async def main(message: cl.Message): | |
# """Handle user messages and generate responses.""" | |
# if vector_store is None: | |
# await cl.Message(content="Error: Vector store not initialized.").send() | |
# return | |
# query = message.content | |
# retriever = vector_store.as_retriever(search_kwargs={"k": 3}) | |
# # Initialize the OpenAI language model | |
# llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0) | |
# qa_chain = RetrievalQA.from_chain_type( | |
# llm=llm, | |
# chain_type="stuff", | |
# retriever=retriever, | |
# return_source_documents=True | |
# ) | |
# result = qa_chain(query) | |
# answer = result['result'] | |
# source_docs = result['source_documents'] | |
# await cl.Message(content=answer).send() | |
# if source_docs: | |
# sources_message = "Sources:\n" | |
# for doc in source_docs: | |
# file_name = os.path.basename(doc.metadata['source']) | |
# if file_name in pdf_files: | |
# file_path = pdf_files[file_name] | |
# elements = [ | |
# cl.Text(name=file_name, content=f"Source: {file_name}"), | |
# cl.File(name=file_name, path=file_path, display="inline") | |
# ] | |
# await cl.Message(content=f"Source: {file_name}", elements=elements).send() | |
# else: | |
# sources_message += f"- {doc.metadata['source']}\n" | |
# if sources_message != "Sources:\n": | |
# await cl.Message(content=sources_message).send() | |
def _split_sources(source_docs, known_files):
    """Partition retrieved documents into known PDF names and other sources.

    Returns a ``(pdf_names, other_sources)`` pair of lists, each de-duplicated
    and kept in first-seen order.  Documents without a ``source`` entry in
    their metadata are skipped.
    """
    pdf_names = {}
    other_sources = {}
    for doc in source_docs:
        source = doc.metadata.get("source")
        if not source:
            continue
        file_name = os.path.basename(source)
        if file_name in known_files:
            pdf_names[file_name] = None
        else:
            other_sources[source] = None
    # Dicts are used as ordered sets so the output order is deterministic.
    return list(pdf_names), list(other_sources)


@cl.on_message
async def main(message: cl.Message):
    """Answer a user message from the indexed PDFs and list the sources used.

    Registered with Chainlit through ``@cl.on_message``.  The answer is sent
    first; then one message per distinct known PDF (with the file attached),
    followed by a single message listing any other sources.

    Args:
        message: The incoming Chainlit message; its ``content`` is the query.
    """
    if vector_store is None:
        await cl.Message(content="Error: Vector store not initialized.").send()
        return

    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    # Initialize the OpenAI language model
    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    # Calling the chain object directly is deprecated and would block the
    # event loop; use the async entry point instead.
    result = await qa_chain.ainvoke({"query": message.content})
    await cl.Message(content=result["result"]).send()

    pdf_names, other_sources = _split_sources(
        result.get("source_documents") or [], pdf_files
    )

    for file_name in pdf_names:
        elements = [
            cl.Text(name=file_name, content=f"Source: {file_name}"),
            cl.File(name=file_name, path=pdf_files[file_name], display="inline"),
        ]
        await cl.Message(content=f"Source: {file_name}", elements=elements).send()

    if other_sources:
        sources_message = "Other Sources:\n" + "\n".join(
            f"- {source}" for source in other_sources
        )
        await cl.Message(content=sources_message).send()
if __name__ == "__main__":
    # Allows launching with ``python <this file>`` in addition to
    # ``chainlit run <this file>``.  The chainlit package is not known to
    # expose a ``run()`` function, so the CLI helper is used to start the
    # server for this module.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)