# NOTE(review): removed non-code residue at the top of the file (GitHub
# file-view copy artifacts: a byte-count line, commit hashes, and gutter
# line numbers 1-176). They were not Python and made the module unparseable.
"""Streamlit front end for a RAG-based PDF query system.

Flow: upload PDFs -> extract text -> preprocess -> vectorize -> chat loop
that retrieves relevant passages and generates an answer.
"""

import os
import time

import streamlit as st
from dotenv import load_dotenv

from extract import extract_text_from_pdfs
from generate import generate_response
from preprocess import preprocess_text
from retrieve import create_vectorizer, retrieve

# Load environment variables from .env file (if needed)
load_dotenv()

# Initialize session state so later code can append/read without key checks.
# messages: chat history as {"role": ..., "content": ...} dicts
if "messages" not in st.session_state:
    st.session_state.messages = []

# pdf_files: on-disk paths of the saved uploads (removed again at the end)
if "pdf_files" not in st.session_state:
    st.session_state.pdf_files = []

# processed_texts: tokenized/cleaned documents produced by preprocess_text
if "processed_texts" not in st.session_state:
    st.session_state.processed_texts = []

st.title("RAG-based PDF Query System")

# File uploader for PDF files
# File uploader for PDF files
uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)

if uploaded_files:
    # Detect a new upload set. UploadedFile objects compare by identity, so a
    # naive `uploaded_files != st.session_state.uploaded_files` could re-process
    # on every rerun; compare stable (name, size) keys instead.
    upload_keys = [(f.name, f.size) for f in uploaded_files]
    if st.session_state.get("upload_keys") != upload_keys:
        st.session_state.upload_keys = upload_keys
        st.session_state.uploaded_files = uploaded_files  # kept for compatibility
        # New uploads invalidate the previous conversation and corpus.
        st.session_state.messages = []
        st.session_state.pdf_files = []
        st.session_state.processed_texts = []

        # Initialize status container
        with st.status("Processing the uploaded PDFs...", state="running") as status:
            # Save uploaded files to disk. basename() guards against path
            # separators in a client-supplied filename (path traversal).
            for uploaded_file in uploaded_files:
                safe_name = os.path.basename(uploaded_file.name)
                with open(safe_name, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                st.session_state.pdf_files.append(safe_name)

            # Extract text from each PDF, reporting per-file progress.
            num_files = len(st.session_state.pdf_files)
            texts = []
            for i, pdf_file in enumerate(st.session_state.pdf_files):
                st.write(f"Extracting text from file {i + 1} of {num_files}...")
                text = extract_text_from_pdfs([pdf_file])
                texts.extend(text)
                time.sleep(0.1)  # brief pause so status updates are visible

            # Preprocess text
            st.write("Preprocessing text...")
            st.session_state.processed_texts = preprocess_text(texts)
            time.sleep(0.1)

            # Create vectorizer and transform texts; X is the document matrix
            # used by retrieve() later in the chat loop.
            st.write("Creating vectorizer and transforming texts...")
            st.session_state.vectorizer, st.session_state.X = create_vectorizer(st.session_state.processed_texts)
            time.sleep(0.1)

            # Update status to complete
            status.update(label="Processing complete!", state="complete")

else:
    # Nothing uploaded yet: halt the script here so the chat UI below
    # never runs without a corpus.
    st.stop()

# Chat interface: replay history, accept a question, retrieve and answer it.
st.write("### Ask a question about the uploaded PDFs")

# Replay the conversation so far.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.write(msg["content"])

# Handle a newly submitted question.
if prompt := st.chat_input("Ask something about the uploaded PDFs"):
    # Record the question in the session history.
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Retrieve the most relevant preprocessed passages for the question.
    top_indices = retrieve(prompt, st.session_state.X, st.session_state.vectorizer)
    context_passages = [
        " ".join(st.session_state.processed_texts[idx]) for idx in top_indices
    ]

    # Generate the answer from the retrieved context and store it.
    response = generate_response(context_passages, prompt)
    st.session_state.messages.append({"role": "assistant", "content": response})

    # Render both sides of this exchange.
    for role, text in (("user", prompt), ("assistant", response)):
        with st.chat_message(role):
            st.write(text)
# Clean up uploaded files saved to disk during processing.
# NOTE(review): the original nested a full copy-paste duplicate of the
# upload-processing and chat blocks inside this loop's if-branch, which
# wiped the chat history and re-processed every PDF each time a file was
# removed; the duplicate also ended in a `for`/`else: st.stop()` that
# always fired (the loop has no `break`). Only the cleanup remains.
for pdf_file in st.session_state.pdf_files:
    if os.path.exists(pdf_file):
        os.remove(pdf_file)