Spaces:

rajesh1729
/

Streamlit-RAG-Chat-with-PDF

Sleeping

File size: 5,610 Bytes

import os
import tempfile
import uuid

import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Seed session-state defaults: a key is only written when it is missing,
# so values already stored in the session are left untouched.
_SESSION_DEFAULTS = {
    "messages": [],       # chat transcript as {"role", "content"} dicts
    "chain": None,        # conversational retrieval chain, set after processing
    "vectorstore": None,  # vector store backing the chain's retriever
}
for _state_key, _default_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value

def create_sidebar():
    """Render the sidebar and return the OpenAI API key entered there.

    The sidebar shows the app title, a password-style text input for the
    API key, and a static markdown summary of the tools and usage steps.

    Returns:
        The current content of the API key input (empty until the user
        types something).
    """
    sidebar = st.sidebar

    # Static help text, spelled out line by line so the whitespace that is
    # part of the rendered string stays explicit.
    help_markdown = (
        "\n"
        "        ### Tools Used\n"
        "        - OpenAI\n"
        "        - LangChain\n"
        "        - ChromaDB\n"
        "        \n"
        "        ### Steps\n"
        "        1. Add API key\n"
        "        2. Upload PDF\n"
        "        3. Chat!\n"
        "        "
    )

    sidebar.title("PDF Chat")
    sidebar.markdown("### Quick Demo of RAG")
    entered_key = sidebar.text_input("OpenAI API Key:", type="password")
    sidebar.markdown(help_markdown)
    return entered_key

def _load_pdf_chunks(paper, text_splitter):
    """Load one uploaded PDF and return it split into text chunks.

    Args:
        paper: Uploaded file object exposing ``name`` and ``getbuffer()``.
        text_splitter: Splitter used to cut the loaded pages into chunks.

    Returns:
        The list of chunk documents produced by ``text_splitter``.
    """
    # PyPDFLoader reads from a path, so the upload is spilled to disk first.
    # A private temp file (instead of ./uploads/<client filename>) avoids
    # collisions between sessions uploading the same name and keeps the
    # client-supplied filename out of the filesystem path.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with tmp:  # close before the loader re-opens the file by name
            tmp.write(paper.getbuffer())
        documents = PyPDFLoader(tmp.name).load()
    finally:
        # Always remove the temp file, even when writing or parsing fails.
        os.remove(tmp.name)

    # The temp path is meaningless to the user; record the original name.
    for document in documents:
        document.metadata["source"] = paper.name

    return text_splitter.split_documents(documents)


def process_pdfs(papers, api_key):
    """Index the uploaded PDFs and build the retrieval chat chain.

    On success the new vector store and chain are stored in
    ``st.session_state.vectorstore`` and ``st.session_state.chain``. On
    failure the session state is left as it was and the error is shown
    in the UI.

    Args:
        papers: Uploaded PDF files (objects with ``name`` and ``getbuffer()``).
        api_key: OpenAI API key used for embeddings and the chat model.

    Returns:
        True if processing succeeded, False otherwise (including when
        ``papers`` is empty or no text could be extracted).
    """
    if not papers:
        return False

    with st.spinner("Processing PDFs..."):
        # Top-level UI boundary: any failure is reported to the user rather
        # than crashing the app.
        try:
            embeddings = OpenAIEmbeddings(openai_api_key=api_key)

            # The splitter is loop-invariant, so build it once.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
            )

            all_texts = []
            for paper in papers:
                all_texts.extend(_load_pdf_chunks(paper, text_splitter))

            # Image-only/scanned PDFs yield no text; report that clearly
            # instead of letting the vector store fail on an empty corpus.
            if not all_texts:
                st.error("No extractable text was found in the uploaded PDF(s).")
                return False

            previous_store = st.session_state.vectorstore

            # A unique collection name keeps this index separate from any
            # earlier run or other session; with the default name, stores
            # created in the same process can end up sharing one collection,
            # so old chunks would leak into new answers.
            vectorstore = Chroma.from_documents(
                documents=all_texts,
                embedding=embeddings,
                collection_name=f"pdf-chat-{uuid.uuid4().hex}",
            )

            chain = ConversationalRetrievalChain.from_llm(
                llm=ChatOpenAI(
                    temperature=0,
                    model_name="gpt-3.5-turbo",
                    openai_api_key=api_key,
                ),
                retriever=vectorstore.as_retriever(
                    search_kwargs={"k": 3}  # Retrieve top 3 most relevant chunks
                ),
                memory=ConversationBufferMemory(
                    memory_key="chat_history",
                    return_messages=True,
                    # The chain returns both "answer" and "source_documents";
                    # the memory must be told which one to store, otherwise
                    # saving the turn fails with "One output key expected".
                    output_key="answer",
                ),
                return_source_documents=True,  # Include source documents in response
            )

            # Publish both objects only after everything succeeded, so a
            # failure never leaves a new store paired with an old chain.
            st.session_state.vectorstore = vectorstore
            st.session_state.chain = chain

            # Best-effort release of the index that was just replaced.
            if previous_store is not None:
                try:
                    previous_store.delete_collection()
                except Exception as cleanup_error:
                    st.warning(f"Could not remove the previous index: {cleanup_error}")

            st.success(f"Processed {len(papers)} PDF(s) successfully!")
            return True

        except Exception as e:
            st.error(f"Error processing PDFs: {str(e)}")
            return False

def _format_sources(sources, snippet_length=200):
    """Return a markdown "Sources" section for the retrieved chunks.

    Args:
        sources: Retrieved documents (objects with ``metadata`` and
            ``page_content``), or None/empty when there are none.
        snippet_length: Maximum number of characters shown per chunk.

    Returns:
        An empty string when there are no sources; otherwise a block that
        starts with a blank line and "Sources:" followed by one numbered
        line per chunk.
    """
    if not sources:
        return ""

    lines = ["\n\nSources:"]
    for index, doc in enumerate(sources, 1):
        # Stored page numbers are zero-based; show them one-based.
        page_info = f" (Page {doc.metadata['page'] + 1})" if "page" in doc.metadata else ""
        # Collapse line breaks from PDF extraction so each source stays on
        # one line of the numbered list.
        text = " ".join(doc.page_content.split())
        snippet = text[:snippet_length]
        # Only signal truncation when something was actually cut off.
        ellipsis = "..." if len(text) > snippet_length else ""
        lines.append(f"{index}.{page_info} {snippet}{ellipsis}")
    return "\n".join(lines)


def main():
    """Run the Streamlit page: sidebar, PDF upload/processing, and chat.

    The page stops after a warning until an API key has been entered. Chat
    messages are kept in ``st.session_state.messages`` and replayed on each
    run; questions are answered by ``st.session_state.chain`` once PDFs
    have been processed.
    """
    st.set_page_config(page_title="PDF Chat")

    # Sidebar with API key input
    api_key = create_sidebar()
    if not api_key:
        st.warning("Please enter your OpenAI API key")
        return

    st.title("Chat with PDF")

    # File uploader
    papers = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)

    # The button is only offered once at least one file is uploaded.
    if papers and st.button("Process PDFs"):
        process_pdfs(papers, api_key)

    # Display chat messages from history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    if prompt := st.chat_input("Ask about your PDFs"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})

        # Display user message
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate and display assistant response
        with st.chat_message("assistant"):
            if st.session_state.chain is None:
                response = "Please upload and process a PDF first."
            else:
                with st.spinner("Thinking..."):
                    # UI boundary: a failed model/API call becomes a chat
                    # reply instead of aborting the turn with a traceback
                    # after the user message was already recorded.
                    try:
                        result = st.session_state.chain({"question": prompt})
                    except Exception as e:
                        response = f"Error generating response: {str(e)}"
                    else:
                        response = result["answer"] + _format_sources(
                            result.get("source_documents")
                        )

            st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})

# Entry point: build the page only when this file is run as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()