Spaces:

vineethn
/

qna

Sleeping

qna / app.py

vineeth N

Update app.py

030bc4f verified 9 months ago

9.8 kB

	# import os
	# from typing import List
	# from dotenv import load_dotenv
	# import chainlit as cl
	# from langchain_community.embeddings import HuggingFaceEmbeddings
	# from langchain_text_splitters import RecursiveCharacterTextSplitter
	# from langchain_community.vectorstores import FAISS
	# from langchain_community.document_loaders import PyPDFLoader
	# from langchain.chains import RetrievalQA
	# from langchain_groq import ChatGroq
	# from langchain_huggingface import HuggingFaceEmbeddings

	# # Load environment variables
	# load_dotenv()

	# # Initialize embedding model
	# # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

	# openai.api_key = os.getenv("OPENAI_API_KEY")

	# # Initialize embedding model using OpenAI
	# embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key,model="text-embedding-3-small")


	# # Initialize vector store
	# vector_store = None

	# # Store PDF file paths
	# pdf_files = {}

	# # Define the path for the FAISS index
	# FAISS_INDEX_PATH = "faiss_index"

	# def process_pdfs(directory: str) -> None:
	# """Process all PDFs in the given directory and add them to the vector store."""
	# global vector_store, pdf_files
	# documents = []

	# for filename in os.listdir(directory):
	# if filename.endswith(".pdf"):
	# file_path = os.path.join(directory, filename)
	# loader = PyPDFLoader(file_path)
	# documents.extend(loader.load())
	# pdf_files[filename] = file_path

	# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	# texts = text_splitter.split_documents(documents)

	# if os.path.exists(FAISS_INDEX_PATH):
	# vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
	# vector_store.add_documents(texts)
	# else:
	# vector_store = FAISS.from_documents(texts, embeddings)

	# # Save the updated vector store
	# vector_store.save_local(FAISS_INDEX_PATH)
	# @cl.on_chat_start
	# async def start():
	# """Initialize the chat session."""
	# await cl.Message(content="Welcome! Processing PDFs...").send()

	# # Process PDFs (replace with your PDF directory)
	# process_pdfs(r"C:\Users\sumes\OneDrive\Documents\pdf_docs")

	# await cl.Message(content="PDFs processed. You can now ask questions!").send()

	# @cl.on_message
	# async def main(message: cl.Message):
	# """Handle user messages and generate responses."""
	# if vector_store is None:
	# await cl.Message(content="Error: Vector store not initialized.").send()
	# return

	# query = message.content

	# retriever = vector_store.as_retriever(search_kwargs={"k": 1})

	# llm = OpenAI(openai_api_key=openai.api_key, model="gpt-4o-mini", temperature=0.4)

	# qa_chain = RetrievalQA.from_chain_type(
	# llm=llm,
	# chain_type="stuff",
	# retriever=retriever,
	# return_source_documents=True
	# )

	# result = qa_chain(query)
	# answer = result['result']
	# source_docs = result['source_documents']

	# await cl.Message(content=answer).send()

	# if source_docs:
	# sources_message = "Sources:\n"
	# for doc in source_docs:
	# file_name = os.path.basename(doc.metadata['source'])
	# if file_name in pdf_files:
	# file_path = pdf_files[file_name]
	# elements = [
	# cl.Text(name=file_name, content=f"Source: {file_name}"),
	# cl.File(name=file_name, path=file_path, display="inline")
	# ]
	# await cl.Message(content=f"Source: {file_name}", elements=elements).send()
	# else:
	# sources_message += f"- {doc.metadata['source']}\n"

	# if sources_message != "Sources:\n":
	# await cl.Message(content=sources_message).send()

	# if __name__ == "__main__":
	# cl.run()

	import os
	from typing import List
	from dotenv import load_dotenv
	import chainlit as cl
	from langchain_community.embeddings import OpenAIEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.chains import RetrievalQA
	from langchain_openai import ChatOpenAI
	from langchain_openai import OpenAIEmbeddings

	# ---------------------------------------------------------------------------
	# Module-level configuration and shared state
	# ---------------------------------------------------------------------------

	# Load environment variables before anything below reads them.
	load_dotenv()

	# OpenAI API key taken from the environment (None when the variable is unset).
	openai_api_key = os.environ.get("OPENAI_API_KEY")

	# Directory holding the persisted FAISS index, and the index file inside it
	# whose presence is used to decide whether an index has been saved before.
	FAISS_INDEX_PATH = "faiss_index"
	FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")

	# OpenAI embedding model shared by index creation, loading and querying.
	embeddings = OpenAIEmbeddings(
	    model="text-embedding-3-small",
	    openai_api_key=openai_api_key,
	)

	# Shared state filled in by process_pdfs():
	#   vector_store - the FAISS store, or None until PDFs have been processed
	#   pdf_files    - maps each PDF file name to its full path
	vector_store = None
	pdf_files = {}

	def _indexed_sources(store) -> set:
	    """Return the ``source`` metadata values of the documents held in *store*."""
	    sources = set()
	    for doc_id in store.index_to_docstore_id.values():
	        entry = store.docstore.search(doc_id)
	        # The docstore lookup may yield a non-Document value for an unknown
	        # id, so read the metadata defensively.
	        metadata = getattr(entry, "metadata", None) or {}
	        if "source" in metadata:
	            sources.add(metadata["source"])
	    return sources


	def process_pdfs(directory: str) -> None:
	    """Index every PDF found in *directory* into the shared FAISS vector store.

	    Each PDF is loaded, split into overlapping chunks, and embedded. When a
	    saved index already exists it is loaded and only chunks whose ``source``
	    is not yet present in it are added, so calling this function repeatedly
	    (it runs on every chat start) does not fill the index with duplicates.
	    If the saved index cannot be loaded or extended, it is rebuilt from the
	    PDFs currently in *directory*. The resulting store is written back to
	    ``FAISS_INDEX_PATH``.

	    Args:
	        directory: Folder to scan (non-recursively) for ``.pdf`` files.

	    Raises:
	        OSError: If *directory* cannot be listed.

	    Side effects:
	        Updates the module-level ``vector_store`` and ``pdf_files``. When there
	        is neither a usable saved index nor any PDF content, ``vector_store``
	        is left unchanged.
	    """
	    global vector_store, pdf_files
	    documents = []

	    for filename in os.listdir(directory):
	        # Compare the extension case-insensitively so "REPORT.PDF" is included.
	        if filename.lower().endswith(".pdf"):
	            file_path = os.path.join(directory, filename)
	            documents.extend(PyPDFLoader(file_path).load())
	            pdf_files[filename] = file_path

	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	    texts = text_splitter.split_documents(documents)

	    store = None
	    if os.path.exists(FAISS_INDEX_FILE):
	        try:
	            # SECURITY: loading a FAISS index unpickles data from disk. Only
	            # indexes written by this application should live at this path.
	            store = FAISS.load_local(
	                FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
	            )
	            known = _indexed_sources(store)
	            new_texts = [
	                chunk for chunk in texts
	                if chunk.metadata.get("source") not in known
	            ]
	            if new_texts:
	                store.add_documents(new_texts)
	        except Exception as e:
	            # Deliberate best-effort: an unreadable or incompatible index is
	            # discarded and rebuilt below rather than aborting the session.
	            print(f"Error loading FAISS index: {e}")
	            store = None

	    if store is None:
	        if not texts:
	            # An index cannot be built from zero chunks.
	            print(f"No PDF content found in {directory!r}; vector store not built.")
	            return
	        store = FAISS.from_documents(texts, embeddings)

	    vector_store = store

	    # Save the updated vector store
	    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
	    vector_store.save_local(FAISS_INDEX_PATH)

	@cl.on_chat_start
	async def start():
	    """Initialize the chat session by indexing the PDF directory.

	    The directory is read from the ``PDF_DIRECTORY`` environment variable and
	    falls back to the original developer path when the variable is unset.
	    Indexing is synchronous and potentially slow, so it is run through
	    ``cl.make_async`` to keep the event loop free while it works.
	    """
	    await cl.Message(content="Welcome! Processing PDFs...").send()

	    # Allow deployments to point at their own folder without editing the code.
	    pdf_directory = os.getenv(
	        "PDF_DIRECTORY", r"C:\Users\sumes\OneDrive\Documents\pdf_docs"
	    )

	    # Run the blocking indexing work off the event loop.
	    await cl.make_async(process_pdfs)(pdf_directory)

	    # Do not invite questions when there is no store to answer them from.
	    if vector_store is None:
	        await cl.Message(
	            content="No PDF content could be indexed, so there is nothing to query yet."
	        ).send()
	        return

	    await cl.Message(content="PDFs processed. You can now ask questions!").send()

	# @cl.on_message
	# async def main(message: cl.Message):
	# """Handle user messages and generate responses."""
	# if vector_store is None:
	# await cl.Message(content="Error: Vector store not initialized.").send()
	# return

	# query = message.content

	# retriever = vector_store.as_retriever(search_kwargs={"k": 3})

	# # Initialize the OpenAI language model
	# llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)

	# qa_chain = RetrievalQA.from_chain_type(
	# llm=llm,
	# chain_type="stuff",
	# retriever=retriever,
	# return_source_documents=True
	# )

	# result = qa_chain(query)
	# answer = result['result']
	# source_docs = result['source_documents']

	# await cl.Message(content=answer).send()

	# if source_docs:
	# sources_message = "Sources:\n"
	# for doc in source_docs:
	# file_name = os.path.basename(doc.metadata['source'])
	# if file_name in pdf_files:
	# file_path = pdf_files[file_name]
	# elements = [
	# cl.Text(name=file_name, content=f"Source: {file_name}"),
	# cl.File(name=file_name, path=file_path, display="inline")
	# ]
	# await cl.Message(content=f"Source: {file_name}", elements=elements).send()
	# else:
	# sources_message += f"- {doc.metadata['source']}\n"

	# if sources_message != "Sources:\n":
	# await cl.Message(content=sources_message).send()



	@cl.on_message
	async def main(message: cl.Message):
	    """Answer a user message with retrieval-augmented QA and list its sources.

	    The three most similar chunks are retrieved from the vector store and
	    passed to the chat model through a "stuff" RetrievalQA chain. The answer
	    is sent first. Then each distinct source PDF known to ``pdf_files`` is
	    sent once as an attachment, and any remaining sources are listed in a
	    single "Other Sources" message.

	    Args:
	        message: Incoming Chainlit message; its ``content`` is the query.
	    """
	    if vector_store is None:
	        await cl.Message(content="Error: Vector store not initialized.").send()
	        return

	    query = message.content

	    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

	    # Initialize the OpenAI language model
	    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)

	    qa_chain = RetrievalQA.from_chain_type(
	        llm=llm,
	        chain_type="stuff",
	        retriever=retriever,
	        return_source_documents=True,
	    )

	    # Use the async entry point so the event loop is not blocked while the
	    # retriever and the model are working.
	    result = await qa_chain.ainvoke({"query": query})
	    answer = result["result"]
	    source_docs = result["source_documents"]

	    await cl.Message(content=answer).send()

	    # Dicts are used as ordered sets: sources are reported once each, in the
	    # order they were first retrieved.
	    known_sources = {}
	    other_sources = {}
	    for doc in source_docs:
	        source = doc.metadata.get("source")
	        if not source:
	            # A chunk without source metadata has nothing to cite.
	            continue
	        file_name = os.path.basename(source)
	        if file_name in pdf_files:
	            known_sources.setdefault(file_name, pdf_files[file_name])
	        else:
	            other_sources.setdefault(source, None)

	    for file_name, file_path in known_sources.items():
	        elements = [
	            cl.Text(name=file_name, content=f"Source: {file_name}"),
	            cl.File(name=file_name, path=file_path, display="inline"),
	        ]
	        await cl.Message(content=f"Source: {file_name}", elements=elements).send()

	    if other_sources:
	        sources_message = "Other Sources:\n" + "\n".join(
	            f"- {source}" for source in other_sources
	        )
	        await cl.Message(content=sources_message).send()

	# Allow `python app.py` as an alternative to `chainlit run app.py`.
	if __name__ == "__main__":
	    # The chainlit package does not appear to expose a top-level run(); the
	    # CLI helper starts the server for the given script instead.
	    from chainlit.cli import run_chainlit

	    run_chainlit(__file__)