import os

from dotenv import load_dotenv
import chainlit as cl
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Load environment variables from .env
load_dotenv()

# Read the OpenAI API key from the environment. os.getenv takes the
# variable *name*; never hardcode the key itself in source.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the embedding model using OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-3-small")
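
# Optional guard (not in the original): fail fast when the key is absent,
# so the error surfaces at startup rather than on the first embedding call.
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your .env file.")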

# Vector store, populated by process_pdfs
vector_store = None

# Map PDF file names to their paths, used to attach sources to answers
pdf_files = {}

# Location of the persisted FAISS index
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")

def process_pdfs(directory: str) -> None:
    """Process all PDFs in the given directory and add them to the vector store."""
    global vector_store, pdf_files

    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
            pdf_files[filename] = file_path

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    if os.path.exists(FAISS_INDEX_FILE):
        try:
            # Reuse the persisted index and append the newly split documents
            vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
            vector_store.add_documents(texts)
        except Exception as e:
            # Fall back to rebuilding the index from the current documents
            print(f"Error loading FAISS index: {e}")
            vector_store = FAISS.from_documents(texts, embeddings)
    else:
        vector_store = FAISS.from_documents(texts, embeddings)

    # Persist the updated vector store
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    vector_store.save_local(FAISS_INDEX_PATH)
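
# Usage sketch (hypothetical directory): indexes every *.pdf found there and
# persists the FAISS index, so later runs append to it instead of rebuilding.
#   process_pdfs("./pdf_docs")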

@cl.on_chat_start
async def start():
    """Initialize the chat session by indexing the PDF directory."""
    await cl.Message(content="Welcome! Processing PDFs...").send()
    # Process PDFs (replace with your PDF directory)
    process_pdfs(r"C:\Users\sumes\OneDrive\Documents\pdf_docs")
    await cl.Message(content="PDFs processed. You can now ask questions!").send()

@cl.on_message
async def main(message: cl.Message):
    """Handle user messages and generate responses."""
    if vector_store is None:
        await cl.Message(content="Error: Vector store not initialized.").send()
        return

    query = message.content
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    # Initialize the OpenAI chat model
    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    result = qa_chain.invoke({"query": query})
    answer = result["result"]
    source_docs = result["source_documents"]

    await cl.Message(content=answer).send()

    if source_docs:
        # Attach each locally indexed PDF once, as a downloadable source
        unique_sources = set()
        for doc in source_docs:
            file_name = os.path.basename(doc.metadata["source"])
            if file_name in pdf_files and file_name not in unique_sources:
                unique_sources.add(file_name)
                file_path = pdf_files[file_name]
                elements = [
                    cl.Text(name=file_name, content=f"Source: {file_name}"),
                    cl.File(name=file_name, path=file_path, display="inline"),
                ]
                await cl.Message(content=f"Source: {file_name}", elements=elements).send()

        # List any remaining sources that are not among the indexed PDFs
        other_sources = [
            doc.metadata["source"]
            for doc in source_docs
            if os.path.basename(doc.metadata["source"]) not in pdf_files
        ]
        unique_other_sources = set(other_sources)
        if unique_other_sources:
            sources_message = "Other Sources:\n" + "\n".join(f"- {source}" for source in unique_other_sources)
            await cl.Message(content=sources_message).send()

if __name__ == "__main__":
    # Chainlit apps are normally launched from the CLI (`chainlit run app.py`);
    # run_chainlit from chainlit.cli is the programmatic equivalent.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)
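
# Quick start (assumed setup, adjust names and paths to your environment):
#   echo "OPENAI_API_KEY=sk-..." > .env
#   pip install chainlit langchain langchain-openai langchain-community \
#       langchain-text-splitters faiss-cpu pypdf python-dotenv
#   chainlit run app.py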