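"""Chainlit RAG app: indexes local PDFs into a FAISS vector store and answers
questions over them with an OpenAI chat model via a LangChain RetrievalQA chain."""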
import os

from dotenv import load_dotenv
import chainlit as cl
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load environment variables
load_dotenv()

# Initialize OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize embedding model using OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-3-small")

# Initialize vector store
vector_store = None

# Store PDF file paths
pdf_files = {}

# Define the path for the FAISS index
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")


def process_pdfs(directory: str) -> None:
    """Process all PDFs in the given directory and add them to the vector store."""
    global vector_store, pdf_files
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
            pdf_files[filename] = file_path

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    if os.path.exists(FAISS_INDEX_FILE):
        try:
            vector_store = FAISS.load_local(
                FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
            )
            vector_store.add_documents(texts)
        except Exception as e:
            print(f"Error loading FAISS index: {e}")
            vector_store = FAISS.from_documents(texts, embeddings)
    else:
        vector_store = FAISS.from_documents(texts, embeddings)

    # Save the updated vector store
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    vector_store.save_local(FAISS_INDEX_PATH)


@cl.on_chat_start
async def start():
    """Initialize the chat session."""
    await cl.Message(content="Welcome! Processing PDFs...").send()
    # Process PDFs (replace with your PDF directory)
    process_pdfs(r"C:\Users\sumes\OneDrive\Documents\pdf_docs")
    await cl.Message(content="PDFs processed. You can now ask questions!").send()
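# Quick smoke test (a sketch, not part of the Chainlit flow): once process_pdfs()
# has run, the index can be queried directly without the QA chain, e.g.
#
#   docs = vector_store.similarity_search("What topics do these PDFs cover?", k=3)
#   print(docs[0].page_content)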
@cl.on_message
async def main(message: cl.Message):
    """Handle user messages and generate responses."""
    if vector_store is None:
        await cl.Message(content="Error: Vector store not initialized.").send()
        return

    query = message.content
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    # Initialize the OpenAI language model
    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    # Use .invoke() rather than calling the chain directly (plain __call__ is deprecated)
    result = qa_chain.invoke({"query": query})
    answer = result["result"]
    source_docs = result["source_documents"]

    await cl.Message(content=answer).send()

    if source_docs:
        # Attach each referenced PDF once, even if several chunks came from it
        unique_sources = set()
        for doc in source_docs:
            file_name = os.path.basename(doc.metadata["source"])
            if file_name in pdf_files and file_name not in unique_sources:
                unique_sources.add(file_name)
                file_path = pdf_files[file_name]
                elements = [
                    cl.Text(name=file_name, content=f"Source: {file_name}"),
                    cl.File(name=file_name, path=file_path, display="inline"),
                ]
                await cl.Message(content=f"Source: {file_name}", elements=elements).send()

        # List any remaining sources that are not among the loaded PDFs
        other_sources = [
            doc.metadata["source"]
            for doc in source_docs
            if os.path.basename(doc.metadata["source"]) not in pdf_files
        ]
        unique_other_sources = set(other_sources)
        if unique_other_sources:
            sources_message = "Other Sources:\n" + "\n".join(
                f"- {source}" for source in unique_other_sources
            )
            await cl.Message(content=sources_message).send()


# Chainlit apps are started from the CLI rather than via `python app.py`
# (`cl.run()` is not part of the Chainlit API):
#
#   chainlit run app.py -w
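# Assumed local setup (a sketch): a `.env` file next to this script supplies the
# key read by load_dotenv()/os.getenv() above, e.g.
#
#   OPENAI_API_KEY=sk-...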