|
import os |
|
from typing import List |
|
from dotenv import load_dotenv |
|
import chainlit as cl |
|
from langchain_community.embeddings import OpenAIEmbeddings |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.chains import RetrievalQA |
|
from langchain_openai import ChatOpenAI |
|
from langchain_openai import OpenAIEmbeddings |
|
|
|
|
|
# Load variables from a local .env file (if present) into the process
# environment before any of them are read below.
load_dotenv()

# os.getenv takes the NAME of an environment variable, not its value.
# The secret itself must never be written in source: define OPENAI_API_KEY in
# the environment or in the .env file instead.
# NOTE(review): any key that was ever committed to this file should be
# considered compromised and rotated.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Embedding model shared by index creation, index loading and retrieval.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
    model="text-embedding-3-small",
)

# FAISS vector store; stays None until process_pdfs() has built or loaded it.
vector_store = None

# Maps PDF file name -> full path for every PDF seen by process_pdfs().
pdf_files = {}

# Directory that holds the persisted FAISS index, and the index file whose
# presence is used to decide whether an index has already been saved.
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")
|
|
|
def _source_key(path: str) -> str:
    """Return a canonical form of ``path`` used to compare file locations."""
    return os.path.normcase(os.path.abspath(path))


def _indexed_sources(store) -> set:
    """Collect the canonical ``source`` paths of all documents held in ``store``."""
    sources = set()
    for doc_id in store.index_to_docstore_id.values():
        doc = store.docstore.search(doc_id)
        # The docstore answers with a plain string (not a document) for an
        # unknown id, so only objects carrying metadata are considered.
        metadata = getattr(doc, "metadata", None) or {}
        source = metadata.get("source")
        if source:
            sources.add(_source_key(str(source)))
    return sources


def process_pdfs(directory: str) -> None:
    """Index the PDFs found in ``directory`` into the module-level FAISS store.

    Every ``.pdf`` file (extension matched case-insensitively) is registered in
    ``pdf_files``. Only files whose path is not already recorded as a
    ``source`` in the persisted index are parsed, split and embedded, so
    calling this function repeatedly (it runs on every chat start) no longer
    appends duplicate chunks to the index. A file that changed on disk but
    kept its path is NOT re-indexed; delete the index directory to force a
    rebuild.

    Args:
        directory: Folder that is scanned (non-recursively) for PDF files.

    Raises:
        OSError: If ``directory`` cannot be listed.

    Side effects:
        Updates the globals ``vector_store`` and ``pdf_files`` and writes the
        index to ``FAISS_INDEX_PATH`` when new chunks were added. If there is
        neither a saved index nor any extractable PDF text, ``vector_store``
        is left unchanged.
    """
    global vector_store, pdf_files

    # Load the persisted index first so we know which files it already covers.
    store = None
    if os.path.exists(FAISS_INDEX_FILE):
        try:
            # NOTE(security): load_local unpickles data from disk, which can
            # execute arbitrary code. This is acceptable only because the
            # index is written by this application; never point
            # FAISS_INDEX_PATH at files from an untrusted origin.
            store = FAISS.load_local(
                FAISS_INDEX_PATH,
                embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            # Best effort: an unreadable index is rebuilt from the PDFs below.
            print(f"Error loading FAISS index: {e}")

    already_indexed = _indexed_sources(store) if store is not None else set()

    new_documents = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory, filename)
        # Register every PDF, indexed or not, so answers can link back to it.
        pdf_files[filename] = file_path
        if _source_key(file_path) in already_indexed:
            continue
        new_documents.extend(PyPDFLoader(file_path).load())

    texts = []
    if new_documents:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(new_documents)

    if texts:
        if store is None:
            store = FAISS.from_documents(texts, embeddings)
        else:
            store.add_documents(texts)
        # Persist only when something changed.
        os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
        store.save_local(FAISS_INDEX_PATH)

    # Building an index from zero chunks is not possible, so with no saved
    # index and no text the previous value of vector_store is kept.
    if store is not None:
        vector_store = store
|
|
|
@cl.on_chat_start
async def start():
    """Initialize the chat session by indexing the configured PDF directory.

    The directory is taken from the ``PDF_DIRECTORY`` environment variable and
    falls back to the original hard-coded location when that variable is not
    set. Progress and failure conditions are reported to the user as chat
    messages.
    """
    await cl.Message(content="Welcome! Processing PDFs...").send()

    # The default is a path on one specific machine; set PDF_DIRECTORY to run
    # the app anywhere else.
    pdf_directory = os.getenv(
        "PDF_DIRECTORY",
        r"C:\Users\sumes\OneDrive\Documents\pdf_docs",
    )

    if not os.path.isdir(pdf_directory):
        await cl.Message(
            content=f"Error: PDF directory not found: {pdf_directory}"
        ).send()
        return

    # process_pdfs performs blocking file and network I/O; running it in a
    # worker thread keeps the event loop responsive for other sessions.
    await cl.make_async(process_pdfs)(pdf_directory)

    if vector_store is None:
        await cl.Message(
            content=f"Error: No PDF content could be indexed from {pdf_directory}."
        ).send()
        return

    await cl.Message(content="PDFs processed. You can now ask questions!").send()
|
|
|
@cl.on_message
async def main(message: cl.Message):
    """Answer a user message from the indexed PDFs and list the sources used.

    A retrieval QA chain is built over the module-level ``vector_store``
    (top 3 chunks, "stuff" chain type) and run on the message text. The answer
    is sent first. Then one message per distinct source PDF known to
    ``pdf_files`` is sent with the file attached, followed by a single
    "Other Sources" message for any source that is not a known PDF.

    Args:
        message: The incoming Chainlit message; its ``content`` is the query.
    """
    if vector_store is None:
        await cl.Message(content="Error: Vector store not initialized.").send()
        return

    query = message.content

    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    # Use the async entry point: calling the chain synchronously inside this
    # coroutine would block the event loop for the whole LLM round trip.
    result = await qa_chain.ainvoke({"query": query})
    answer = result["result"]
    source_docs = result["source_documents"]

    await cl.Message(content=answer).send()

    # Single pass over the retrieved chunks; dicts keep first-seen order so
    # the source listing is deterministic and free of duplicates.
    known_sources = {}  # file name -> path of a PDF registered in pdf_files
    other_sources = {}  # raw source string -> None (used as an ordered set)
    for doc in source_docs:
        source = doc.metadata.get("source")
        if not source:
            # A chunk without source metadata has nothing to cite.
            continue
        file_name = os.path.basename(source)
        if file_name in pdf_files:
            known_sources.setdefault(file_name, pdf_files[file_name])
        else:
            other_sources.setdefault(source, None)

    for file_name, file_path in known_sources.items():
        elements = [
            cl.Text(name=file_name, content=f"Source: {file_name}"),
            cl.File(name=file_name, path=file_path, display="inline"),
        ]
        await cl.Message(content=f"Source: {file_name}", elements=elements).send()

    if other_sources:
        sources_message = "Other Sources:\n" + "\n".join(
            f"- {source}" for source in other_sources
        )
        await cl.Message(content=sources_message).send()
|
|
|
if __name__ == "__main__":
    # Chainlit apps are normally served with the CLI ("chainlit run <file>").
    # run_chainlit is the programmatic equivalent, so the app can also be
    # started with "python <file>"; cl.run() is not part of the chainlit API.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)
|
|