Spaces:
Runtime error
Runtime error
File size: 6,387 Bytes
459ab69 56b0710 459ab69 85c57d3 56b0710 459ab69 56b0710 459ab69 85c57d3 56b0710 85c57d3 56b0710 459ab69 56b0710 459ab69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import os
import tempfile
import time

import streamlit as st
from dotenv import load_dotenv

from extract import extract_text_from_pdfs
from generate import generate_response
from preprocess import preprocess_text
from retrieve import create_vectorizer, retrieve
# Streamlit entry point for a retrieval-augmented PDF question-answering app.
#
# Flow of one script run:
#   1. Make sure the session-state keys used below exist.
#   2. Let the user upload PDFs; stop the run early if there are none.
#   3. When the upload set changed, extract -> preprocess -> vectorize the
#      documents and cache the results in session state.
#   4. Render the chat history and answer a newly submitted question.

# Load environment variables from .env file (if needed)
load_dotenv()

# Initialize session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "pdf_files" not in st.session_state:
    st.session_state.pdf_files = []
if "processed_texts" not in st.session_state:
    st.session_state.processed_texts = []

st.title("RAG-based PDF Query System")

# File uploader for PDF files
uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
if not uploaded_files:
    # Nothing to query yet; end this script run without rendering the chat.
    st.stop()

# Identify the upload set by (name, size) instead of comparing the upload
# objects themselves, so change detection does not depend on how those
# objects compare across reruns.
upload_signature = [(item.name, item.size) for item in uploaded_files]

if (
    "upload_signature" not in st.session_state
    or st.session_state.upload_signature != upload_signature
):
    # New upload set: discard everything derived from the previous one.
    st.session_state.messages = []
    st.session_state.pdf_files = []
    st.session_state.processed_texts = []

    with st.status("Processing the uploaded PDFs...", state="running") as status:
        texts = []
        num_files = len(uploaded_files)

        # Work inside a private temporary directory. It is removed when the
        # `with` block exits (even on error), and it keeps concurrent
        # sessions from overwriting or deleting each other's files.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for position, uploaded_file in enumerate(uploaded_files, start=1):
                # basename() drops any directory part of the client-supplied
                # name; the numeric prefix keeps same-named uploads apart.
                safe_name = os.path.basename(uploaded_file.name)
                pdf_path = os.path.join(tmp_dir, f"{position}_{safe_name}")
                with open(pdf_path, "wb") as pdf_out:
                    pdf_out.write(uploaded_file.getbuffer())
                st.session_state.pdf_files.append(uploaded_file.name)

                st.write(f"Extracting text from file {position} of {num_files}...")
                texts.extend(extract_text_from_pdfs([pdf_path]))

        if not texts:
            # Without any text there is nothing to index or to search.
            status.update(
                label="No text could be extracted from the uploaded PDFs.",
                state="error",
            )
            st.stop()

        # Preprocess text
        st.write("Preprocessing text...")
        st.session_state.processed_texts = preprocess_text(texts)

        # Create vectorizer and transform texts
        st.write("Creating vectorizer and transforming texts...")
        st.session_state.vectorizer, st.session_state.X = create_vectorizer(
            st.session_state.processed_texts
        )

        # Remember the upload set only after indexing succeeded, so a failed
        # attempt is retried on the next run instead of leaving the chat
        # without a vectorizer.
        st.session_state.upload_signature = upload_signature

        # Update status to complete
        status.update(label="Processing complete!", state="complete")

# Chat interface
st.write("### Ask a question about the uploaded PDFs")

# Display chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])

# Chat input
prompt = st.chat_input("Ask something about the uploaded PDFs")
if prompt:
    # Record and show the question right away so it is visible while the
    # answer is being generated.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)

    # Retrieve relevant texts
    top_indices = retrieve(prompt, st.session_state.X, st.session_state.vectorizer)
    # NOTE(review): the join implies each processed text is a sequence of
    # string tokens - confirm against preprocess_text.
    retrieved_texts = [" ".join(st.session_state.processed_texts[i]) for i in top_indices]

    # Generate response
    response = generate_response(retrieved_texts, prompt)
    st.session_state.messages.append({"role": "assistant", "content": response})

    # Display assistant message
    with st.chat_message("assistant"):
        st.write(response)
|