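"""Chainlit chat app for question answering over local PDF documents.

PDFs are split into chunks with LangChain, embedded via OpenAI, and stored
in a FAISS index that persists to disk between runs.
"""
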
import os

from dotenv import load_dotenv
import chainlit as cl
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load environment variables
load_dotenv()
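# Expected .env contents (illustrative placeholder, not a real key):
#   OPENAI_API_KEY=sk-...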

# Read the OpenAI API key from the environment (never hard-code keys in source)
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the embedding model
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-3-small")

# Initialize vector store
vector_store = None

# Store PDF file paths
pdf_files = {}

# Define the path for the FAISS index
FAISS_INDEX_PATH = "faiss_index"
FAISS_INDEX_FILE = os.path.join(FAISS_INDEX_PATH, "index.faiss")

def process_pdfs(directory: str) -> None:
    """Process all PDFs in the given directory and add them to the vector store."""
    global vector_store, pdf_files
    documents = []

    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
            pdf_files[filename] = file_path

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    # Guard against an empty directory: FAISS cannot build an index from no texts
    if not texts:
        print("No PDF content found; vector store was not updated.")
        return

    if os.path.exists(FAISS_INDEX_FILE):
        try:
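            # FAISS indexes are pickled on disk, so loading requires
            # allow_dangerous_deserialization; only load indexes you created.
            # Note: re-processing on every start re-adds the same chunks, so
            # duplicates can accumulate in a persisted index.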
            vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
            vector_store.add_documents(texts)
        except Exception as e:
            print(f"Error loading FAISS index: {e}")
            vector_store = FAISS.from_documents(texts, embeddings)
    else:
        vector_store = FAISS.from_documents(texts, embeddings)

    # Save the updated vector store
    os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
    vector_store.save_local(FAISS_INDEX_PATH)

@cl.on_chat_start
async def start():
    """Initialize the chat session."""
    await cl.Message(content="Welcome! Processing PDFs...").send()

    # Process PDFs (replace with your PDF directory); this synchronous call
    # blocks the event loop, which is acceptable for a one-off startup step
    process_pdfs(r"C:\Users\sumes\OneDrive\Documents\pdf_docs")

    await cl.Message(content="PDFs processed. You can now ask questions!").send()

@cl.on_message
async def main(message: cl.Message):
    """Handle user messages and generate responses."""
    if vector_store is None:
        await cl.Message(content="Error: Vector store not initialized.").send()
        return

    query = message.content

    # Retrieve the three most similar chunks for each query
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    # Initialize the OpenAI language model
    llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini", temperature=0)

    # "stuff" concatenates all retrieved chunks into a single prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

    # Calling the chain directly is deprecated in recent LangChain; use invoke()
    result = qa_chain.invoke({"query": query})
    answer = result["result"]
    source_docs = result["source_documents"]

    await cl.Message(content=answer).send()

    if source_docs:
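        # Attach each cited PDF once, even if several retrieved chunks came from it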
        unique_sources = set()
        for doc in source_docs:
            file_name = os.path.basename(doc.metadata['source'])
            if file_name in pdf_files and file_name not in unique_sources:
                unique_sources.add(file_name)
                file_path = pdf_files[file_name]
                elements = [
                    cl.Text(name=file_name, content=f"Source: {file_name}"),
                    cl.File(name=file_name, path=file_path, display="inline")
                ]
                await cl.Message(content=f"Source: {file_name}", elements=elements).send()

        other_sources = [doc.metadata['source'] for doc in source_docs if os.path.basename(doc.metadata['source']) not in pdf_files]
        unique_other_sources = set(other_sources)
        if unique_other_sources:
            sources_message = "Other Sources:\n" + "\n".join(f"- {source}" for source in unique_other_sources)
            await cl.Message(content=sources_message).send()

# Note: Chainlit apps are launched from the CLI rather than via a main guard:
#   chainlit run app.py -w