# app.py — RAG-based PDF query app (Streamlit UI)
import os
import time
import openai
import streamlit as st
from dotenv import load_dotenv
from extract import extract_text_from_pdfs
from generate import generate_response
from preprocess import preprocess_text
from retrieve import create_vectorizer, retrieve
# Load environment variables from .env file
load_dotenv()
# Set OpenAI API key from the environment.
# NOTE(review): the variable is read as 'api_key' (lowercase), not the
# conventional OPENAI_API_KEY — the .env file must define `api_key=...`.
openai.api_key = os.getenv('api_key')
# Initialize session state on first run so later code can append/read these
# keys unconditionally. (Indentation restored: the bodies of the `if`
# statements were flattened to column 0 in the pasted source.)
if "messages" not in st.session_state:
    st.session_state.messages = []          # chat history: {"role", "content"} dicts
if "pdf_files" not in st.session_state:
    st.session_state.pdf_files = []         # on-disk filenames of saved uploads
if "processed_texts" not in st.session_state:
    st.session_state.processed_texts = []   # preprocessed corpus used for retrieval
st.title("RAG-based PDF Query System")

# File uploader for PDF files
uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)

if uploaded_files:
    # Only (re)process when the upload set actually changed; an unchanged
    # upload falls through to the chat UI with the cached vectorizer.
    if "uploaded_files" not in st.session_state or uploaded_files != st.session_state.uploaded_files:
        st.session_state.uploaded_files = uploaded_files
        st.session_state.messages = []  # Clear previous messages
        st.session_state.pdf_files = []
        st.session_state.processed_texts = []

        # Status container shows progress while the PDFs are ingested.
        with st.status("Processing the uploaded PDFs...", state="running") as status:
            # Save uploaded files to disk so the extractor can open them by path.
            for uploaded_file in uploaded_files:
                with open(uploaded_file.name, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                st.session_state.pdf_files.append(uploaded_file.name)

            # Extract text from each PDF, reporting per-file progress.
            num_files = len(st.session_state.pdf_files)
            texts = []
            for i, pdf_file in enumerate(st.session_state.pdf_files):
                st.write(f"Extracting text from file {i + 1} of {num_files}...")
                texts.extend(extract_text_from_pdfs([pdf_file]))
                time.sleep(0.1)  # Simulate time taken for processing

            # Preprocess text
            st.write("Preprocessing text...")
            st.session_state.processed_texts = preprocess_text(texts)
            time.sleep(0.1)  # Simulate time taken for processing

            # Build the retrieval index (vectorizer + document matrix).
            st.write("Creating vectorizer and transforming texts...")
            st.session_state.vectorizer, st.session_state.X = create_vectorizer(st.session_state.processed_texts)
            time.sleep(0.1)  # Simulate time taken for processing

            # Update status to complete
            status.update(label="Processing complete!", state="complete")
else:
    # No PDFs uploaded yet: halt the script here so the chat UI below
    # never runs without an index to query.
    st.stop()
# Chat interface
st.write("### Ask a question about the uploaded PDFs")

# Replay the stored conversation on every rerun (Streamlit re-executes
# the whole script per interaction).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])

# Chat input
prompt = st.chat_input("Ask something about the uploaded PDFs")
if prompt:
    # Record the user turn before generating, so history stays consistent
    # even if generation fails mid-run.
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Retrieve the most relevant preprocessed documents for the query.
    top_indices = retrieve(prompt, st.session_state.X, st.session_state.vectorizer)
    retrieved_texts = [" ".join(st.session_state.processed_texts[i]) for i in top_indices]

    # Generate a grounded answer and store the assistant turn.
    response = generate_response(retrieved_texts, prompt)
    st.session_state.messages.append({"role": "assistant", "content": response})

    # Display the new exchange immediately (earlier turns were already replayed).
    with st.chat_message("user"):
        st.write(prompt)
    with st.chat_message("assistant"):
        st.write(response)

# Clean up the PDFs saved to disk — their text is already cached in
# session state, so the files are no longer needed after processing.
for pdf_file in st.session_state.pdf_files:
    if os.path.exists(pdf_file):
        os.remove(pdf_file)