#ref: https://www.youtube.com/watch?v=3ZDVmzlM6Nc
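# Expected dependencies, inferred from the imports below (an assumption,
# not pinned by this file):
#   pip install streamlit chromadb PyPDF2 sentence-transformers \
#       langchain langchain-huggingface langchain-chroma langchain-groq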

import os
import chromadb
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from PyPDF2 import PdfReader

# Clear ChromaDB cache to fix tenant issue
chromadb.api.client.SharedSystemClient.clear_system_cache()

# Ensure required environment variables are set
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY is not set. Please configure it in Hugging Face Spaces secrets.")
    st.stop()

# Function to process PDFs and set up the vectorstore
def process_and_store_pdfs(uploaded_files):
    texts = []
    for uploaded_file in uploaded_files:
        reader = PdfReader(uploaded_file)
        for page in reader.pages:
            # extract_text() may return None or "" for image-only pages;
            # skip those so no empty documents reach the vector store
            text = page.extract_text()
            if text and text.strip():
                texts.append(text)

    # Abort early if nothing could be extracted (e.g. scanned PDFs without OCR)
    if not texts:
        st.error("No extractable text was found in the uploaded PDFs.")
        st.stop()

    # Embed each page as one chunk into an in-memory Chroma index
    # (HuggingFaceEmbeddings falls back to its default sentence-transformers model)
    embeddings = HuggingFaceEmbeddings()
    vectorstore = Chroma.from_texts(texts, embedding=embeddings)
    return vectorstore
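
# Optional refinement (a sketch; assumes langchain-text-splitters is installed):
# split long pages into overlapping chunks before embedding, instead of
# indexing each page as a single document:
#
#   from langchain_text_splitters import RecursiveCharacterTextSplitter
#   splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#   chunks = splitter.split_text("\n".join(texts))
#   vectorstore = Chroma.from_texts(chunks, embedding=embeddings)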

# Function to set up the chat chain
def chat_chain(vectorstore):
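    # NOTE: Groq rotates its hosted model catalog; if this model id has been
    # retired, substitute one currently listed at https://console.groq.com/docs/models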
    llm = ChatGroq(model="llama-3.1-70b-versatile", 
                   temperature=0, 
                   groq_api_key=GROQ_API_KEY)
    retriever = vectorstore.as_retriever()
    # Buffer the running chat history; output_key="answer" tells the memory
    # which chain output to store, since the chain also returns source documents
    memory = ConversationBufferMemory(
        output_key="answer",
        memory_key="chat_history",
        return_messages=True
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        memory=memory,
        verbose=True,
        return_source_documents=True
    )
    return chain

# Streamlit UI configuration
st.set_page_config(
    page_title="Multi Doc Chat",
    page_icon="📚",
    layout="centered"
)

st.title("Chat with Your DocsπŸ“š")

# File uploader for PDFs
uploaded_files = st.file_uploader("Upload PDF files", accept_multiple_files=True, type=["pdf"])

# Process PDFs and build the vectorstore once per session; without the
# session_state guard, every rerun (each chat message) would re-embed the PDFs
if uploaded_files and "vectorstore" not in st.session_state:
    with st.spinner("Processing files..."):
        vectorstore = process_and_store_pdfs(uploaded_files)
        st.session_state.vectorstore = vectorstore
        st.session_state.conversational_chain = chat_chain(vectorstore)
    st.success("Files successfully processed! You can now chat with your documents.")

# Initialize chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Display chat history
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# User input
if "conversational_chain" in st.session_state:
    user_input = st.chat_input("Ask AI...")
    if user_input:
        st.session_state.chat_history.append({"role": "user", "content": user_input})

        with st.chat_message("user"):
            st.markdown(user_input)

        with st.chat_message("assistant"):
            # Generate response
            response = st.session_state.conversational_chain.invoke({"question": user_input})
            assistant_response = response["answer"]

            st.markdown(assistant_response)
            st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
else:
    st.info("Please upload PDF files to start chatting.")
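
# To run locally (assuming this file is saved as app.py, which is a guess):
#   GROQ_API_KEY=<your key> streamlit run app.py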