import os
from dotenv import load_dotenv
import chainlit as cl
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Load environment variables
load_dotenv()
# Initialize OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
# Initialize embedding model using OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-3-small")
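# text-embedding-3-small produces 1536-dimensional vectors; a saved FAISS index
# inherits that dimensionality, so switching embedding models means rebuilding it.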
# Initialize vector store
vector_store = None
# Store PDF file paths
pdf_files = {}
# Define the path for the FAISS index
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")
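# FAISS.save_local writes two files (index.faiss and index.pkl) into
# FAISS_INDEX_PATH; checking for index.faiss tells us whether a previous
# run already persisted an index.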
def process_pdfs(directory: str) -> None:
    """Process all PDFs in the given directory and add them to the vector store."""
    global vector_store, pdf_files
    documents = []
    for filename in os.listdir(directory):
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
            pdf_files[filename] = file_path
    if not documents:
        print(f"No PDFs found in {directory}; vector store not initialized.")
        return
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    if os.path.exists(FAISS_INDEX_FILE):
        try:
            vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
            vector_store.add_documents(texts)
        except Exception as e:
            print(f"Error loading FAISS index: {e}")
            vector_store = FAISS.from_documents(texts, embeddings)
    else:
        vector_store = FAISS.from_documents(texts, embeddings)
    # Save the updated vector store
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    vector_store.save_local(FAISS_INDEX_PATH)
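# Optional sanity-check helper, a minimal sketch for manual use (e.g. from a
# REPL), assuming you want to inspect what the retriever sees without paying
# for an LLM call. similarity_search queries the FAISS store directly.
def debug_search(query: str, k: int = 3):
    """Return the top-k matching chunks for `query` straight from the vector store."""
    if vector_store is None:
        raise RuntimeError("Vector store not initialized; call process_pdfs() first.")
    return vector_store.similarity_search(query, k=k)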
@cl.on_chat_start
async def start():
"""Initialize the chat session."""
await cl.Message(content="Welcome! Processing PDFs...").send()
    # Process PDFs (PDF_DIR env var is optional; falls back to the hardcoded path)
    process_pdfs(os.getenv("PDF_DIR", r"C:\Users\sumes\OneDrive\Documents\pdf_docs"))
await cl.Message(content="PDFs processed. You can now ask questions!").send()
@cl.on_message
async def main(message: cl.Message):
"""Handle user messages and generate responses."""
if vector_store is None:
await cl.Message(content="Error: Vector store not initialized.").send()
return
query = message.content
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
# Initialize the OpenAI language model
llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)
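    # "stuff" concatenates all k retrieved chunks into a single prompt; fine
    # at k=3 with 1000-character chunks, but mind the context window if k grows.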
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=retriever,
return_source_documents=True
)
    # RetrievalQA expects its input under the "query" key; .invoke() replaces
    # the deprecated qa_chain(query) call style.
    result = qa_chain.invoke({"query": query})
    answer = result['result']
    source_docs = result['source_documents']
await cl.Message(content=answer).send()
if source_docs:
unique_sources = set()
for doc in source_docs:
file_name = os.path.basename(doc.metadata['source'])
if file_name in pdf_files and file_name not in unique_sources:
unique_sources.add(file_name)
file_path = pdf_files[file_name]
elements = [
cl.Text(name=file_name, content=f"Source: {file_name}"),
cl.File(name=file_name, path=file_path, display="inline")
]
await cl.Message(content=f"Source: {file_name}", elements=elements).send()
other_sources = [doc.metadata['source'] for doc in source_docs if os.path.basename(doc.metadata['source']) not in pdf_files]
unique_other_sources = set(other_sources)
if unique_other_sources:
sources_message = "Other Sources:\n" + "\n".join(f"- {source}" for source in unique_other_sources)
await cl.Message(content=sources_message).send()
# Launch with the Chainlit CLI rather than a __main__ block:
#     chainlit run app.py