fhmsf committed · verified
Commit 921780e · Parent(s): 3d0f58b

Update app.py

Files changed (1)
  1. app.py +116 -168
app.py CHANGED
@@ -1,16 +1,19 @@
 import os
 import faiss
-import gradio as gr
 import numpy as np
 import requests
+import streamlit as st

 from pypdf import PdfReader
 from sentence_transformers import SentenceTransformer

-################################################################################
+###############################################################################
 # 1. PDF Parsing and Chunking
-################################################################################
+###############################################################################
 def extract_pdf_text(pdf_file) -> str:
+    """
+    Read and extract text from each page of an uploaded PDF file.
+    """
     reader = PdfReader(pdf_file)
     all_text = []
     for page in reader.pages:
@@ -19,6 +22,10 @@ def extract_pdf_text(pdf_file) -> str:
     return "\n".join(all_text)

 def chunk_text(text, chunk_size=300, overlap=50):
+    """
+    Splits text into overlapping chunks, each approx. 'chunk_size' tokens.
+    'overlap' is how many tokens from the previous chunk to include again.
+    """
     words = text.split()
     chunks = []
     start = 0
@@ -29,44 +36,58 @@ def chunk_text(text, chunk_size=300, overlap=50):
         start += (chunk_size - overlap)
     return chunks

-################################################################################
+###############################################################################
 # 2. Embedding Model
-################################################################################
+###############################################################################
 embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

-################################################################################
+###############################################################################
 # 3. Build FAISS Index
-################################################################################
+###############################################################################
 def build_faiss_index(chunks):
+    """
+    Creates a FAISS index from embedded chunks.
+    Returns (index, chunk_embeddings).
+    """
     chunk_embeddings = embedding_model.encode(chunks, show_progress_bar=False)
     chunk_embeddings = np.array(chunk_embeddings, dtype='float32')
+
     dimension = chunk_embeddings.shape[1]
     index = faiss.IndexFlatL2(dimension)
     index.add(chunk_embeddings)
+
     return index, chunk_embeddings

-################################################################################
+###############################################################################
 # 4. Retrieval Function
-################################################################################
+###############################################################################
 def retrieve_chunks(query, index, chunks, top_k=3):
+    """
+    Embeds 'query' and retrieves the top_k most relevant chunks from 'index'.
+    """
     query_embedding = embedding_model.encode([query], show_progress_bar=False)
     query_embedding = np.array(query_embedding, dtype='float32')
-
+
     distances, indices = index.search(query_embedding, top_k)
-    relevant_chunks = [chunks[i] for i in indices[0]]
-    return relevant_chunks

-################################################################################
-# 5. Gemini LLM Integration (Updated for "candidates" response)
-################################################################################
+    return [chunks[i] for i in indices[0]]
+
+###############################################################################
+# 5. Gemini LLM Integration
+###############################################################################
 def gemini_generate(prompt):
+    """
+    Calls Google's Gemini API with the environment variable GEMINI_API_KEY.
+    """
     gemini_api_key = os.environ.get("GEMINI_API_KEY", "")
     if not gemini_api_key:
         return "Error: No GEMINI_API_KEY found in environment variables."

-    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key={gemini_api_key}"
-
-    data = {
+    url = (
+        "https://generativelanguage.googleapis.com/"
+        "v1beta/models/gemini-1.5-flash:generateContent"
+        f"?key={gemini_api_key}"
+    )
+    payload = {
         "contents": [
             {
                 "parts": [
@@ -77,170 +98,97 @@ def gemini_generate(prompt):
     }
     headers = {"Content-Type": "application/json"}

-    response = requests.post(url, headers=headers, json=data)
-    if response.status_code != 200:
-        return f"Error {response.status_code}: {response.text}"
-
-    r_data = response.json()
     try:
-        generated_text = r_data["candidates"][0]["content"]["parts"][0]["text"]
-        return generated_text
-    except Exception:
-        return f"Parsing error or unexpected response structure: {r_data}"
-
-################################################################################
+        response = requests.post(url, headers=headers, json=payload)
+        response.raise_for_status()
+        r_data = response.json()
+        # Extract the text from the 'candidates' structure:
+        return r_data["candidates"][0]["content"]["parts"][0]["text"]
+    except requests.exceptions.RequestException as e:
+        return f"Error calling Gemini API: {e}"
+    except KeyError:
+        return f"Parsing error or unexpected response format: {response.text}"
+
+###############################################################################
 # 6. RAG QA Function
-################################################################################
+###############################################################################
 def answer_question_with_RAG(user_question, index, chunks):
+    """
+    Retrieves relevant chunks, builds an augmented prompt, and calls gemini_generate().
+    """
     relevant_chunks = retrieve_chunks(user_question, index, chunks, top_k=3)
     context = "\n\n".join(relevant_chunks)
+
     prompt = f"""
     You are an AI assistant that knows the details from the uploaded research paper.
     Answer the user's question accurately using the context below.
-    If something is not in the context, say you don't know.
+    If something is not in the context, say 'I don't know'.
+
     Context:
     {context}
+
     User's question: {user_question}
+
     Answer:
     """
     return gemini_generate(prompt)

-################################################################################
-# 7. Gradio Interface (Enhanced Styling)
-################################################################################
-def process_pdf(pdf_file):
-    if pdf_file is None:
-        return None, "Please upload a PDF file."
-
-    text = extract_pdf_text(pdf_file.name)
-    if not text:
-        return None, "No text found in PDF."
-
-    chunks = chunk_text(text, chunk_size=300, overlap=50)
-    if not chunks:
-        return None, "No valid text to chunk."
-
-    faiss_index, _ = build_faiss_index(chunks)
-    return (faiss_index, chunks), "PDF processed successfully!"
-
-def chat_with_paper(query, state):
-    if not state:
-        return "Please upload and process a PDF first."
-    faiss_index, doc_chunks = state
-    if not query or not query.strip():
-        return "Please enter a valid question."
-
-    answer = answer_question_with_RAG(query, faiss_index, doc_chunks)
-    return answer
-
-demo_theme = gr.themes.Soft(primary_hue="slate")
-
-# Custom CSS:
-# 1. Lightest blue background
-# 2. Green buttons
-# 3. Thick black border, centered content
-# 4. Large, bold, center-aligned title
-# 5. Representative icon at top, bigger font for welcome text
-css_code = """
-body {
-    background-color: #E6F7FF !important; /* Lightest blue */
-    margin: 0;
-    padding: 0;
-}
-
-/* Center the entire Gradio container and give a thick black border */
-.block > .inside {
-    margin: auto !important;
-    max-width: 900px !important; /* You can increase/decrease the max-width for your preference */
-    border: 4px solid black !important; /* Thick black border */
-    border-radius: 10px !important;
-    background-color: #FFFFFF !important; /* White container for clarity */
-    padding: 20px !important;
-}
-
-/* Title heading: bigger, bolder, centered */
-#app-title {
-    text-align: center !important;
-    font-size: 3rem !important;
-    font-weight: 900 !important;
-    margin-bottom: 0.5rem !important;
-    margin-top: 0.5rem !important;
-}
-
-/* Welcome text: slightly smaller, but still bold, centered */
-#app-welcome {
-    text-align: center !important;
-    font-size: 1.5rem !important;
-    color: #444 !important;
-    margin-bottom: 25px !important;
-    font-weight: 700 !important;
-}
-
-/* Buttons: green background, white text */
-button {
-    background-color: #3CB371 !important; /* Medium sea green */
-    color: #ffffff !important;
-    border: none !important;
-    font-weight: 600 !important;
-    cursor: pointer;
-}
-
-/* Button hover effect: darker green */
-button:hover {
-    background-color: #2E8B57 !important;
-}
-
-/* Optional: center the text in textboxes, if you like */
-textarea, input[type="text"] {
-    text-align: center !important;
-}
-
-/* Icon container styling */
-#icon-container {
-    text-align: center !important;
-    margin-top: 1rem !important;
-    margin-bottom: 1rem !important;
-}
-"""
-
-with gr.Blocks(theme=demo_theme, css=css_code) as demo:
-    # Representative icon/image at the top
-    # Replace the 'src' with any other icon URL you prefer
-    gr.Markdown("""
-    <div id="icon-container">
-        <img src="https://i.ibb.co/3Wp3yBZ/ai-icon.png" alt="AI icon" style="width:100px;">
-    </div>
-    """)
-
-    # App title (large, bold, centered)
-    gr.Markdown("<div id='app-title'>AI-Powered Personal Research Assistant</div>")
-
-    # Welcome text right under the title
-    gr.Markdown("<div id='app-welcome'>Welcome! How may I help you?</div>")
-
-    state = gr.State()
-
-    with gr.Row():
-        pdf_input = gr.File(label="Upload your research paper (PDF)", file_types=[".pdf"])
-        process_button = gr.Button("Process PDF")
-
-    status_output = gr.Textbox(label="Status", interactive=False)
-
-    process_button.click(
-        fn=process_pdf,
-        inputs=pdf_input,
-        outputs=[state, status_output]
-    )
-
-    with gr.Row():
-        user_query = gr.Textbox(label="Ask a question about your research paper:")
-        ask_button = gr.Button("Get Answer")
-        answer_output = gr.Textbox(label="Answer")
-
-    ask_button.click(
-        fn=chat_with_paper,
-        inputs=[user_query, state],
-        outputs=answer_output
+###############################################################################
+# Streamlit Application
+###############################################################################
+def main():
+    # Basic page config (optional):
+    st.set_page_config(
+        page_title="AI-Powered Personal Research Assistant",
+        layout="centered"
     )

-demo.launch()
+    # Title and Subheader
+    st.title("AI-Powered Personal Research Assistant")
+    st.write("Welcome! How may I help you?")
+
+    # Store the FAISS index + chunks in session_state to persist across reruns
+    if "faiss_index" not in st.session_state:
+        st.session_state.faiss_index = None
+    if "chunks" not in st.session_state:
+        st.session_state.chunks = None
+
+    # Step 1: Upload and Process PDF
+    uploaded_pdf = st.file_uploader("Upload your research paper (PDF)", type=["pdf"])
+    if st.button("Process PDF"):
+        if uploaded_pdf is None:
+            st.warning("Please upload a PDF file first.")
+        else:
+            # Read and chunk
+            raw_text = extract_pdf_text(uploaded_pdf)
+            if not raw_text.strip():
+                st.error("No text found in PDF.")
+                return
+            chunks = chunk_text(raw_text, chunk_size=300, overlap=50)
+            if not chunks:
+                st.error("No valid text to chunk.")
+                return
+            # Build index
+            faiss_index, _ = build_faiss_index(chunks)
+            st.session_state.faiss_index = faiss_index
+            st.session_state.chunks = chunks
+            st.success("PDF processed successfully!")
+
+    # Step 2: Ask a Question
+    user_question = st.text_input("Ask a question about your research paper:")
+    if st.button("Get Answer"):
+        if not st.session_state.faiss_index or not st.session_state.chunks:
+            st.warning("Please upload and process a PDF first.")
+        elif not user_question.strip():
+            st.warning("Please enter a valid question.")
+        else:
+            answer = answer_question_with_RAG(
+                user_question,
+                st.session_state.faiss_index,
+                st.session_state.chunks
+            )
+            st.write("### Answer:")
+            st.write(answer)
+
+if __name__ == "__main__":
+    main()
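A few notes on the rewritten app.py. First, chunking: chunk_text splits on whitespace, so chunk_size=300 and overlap=50 count words rather than true model tokens, despite the docstring's wording. Below is a minimal standalone sketch of the same sliding-window logic, with the loop body (elided from the diff) filled in as an assumption and small numbers so the overlap is visible:

def chunk_text(text, chunk_size=4, overlap=1):
    words = text.split()
    chunks = []
    start = 0
    # Assumed loop body; the diff elides the middle of the function.
    while start < len(words):
        chunks.append(" ".join(words[start:start + chunk_size]))
        start += (chunk_size - overlap)
    return chunks

print(chunk_text("0 1 2 3 4 5 6 7 8 9"))
# ['0 1 2 3', '3 4 5 6', '6 7 8 9', '9']

Each chunk re-reads the last 'overlap' words of its predecessor, so a sentence cut at one boundary still appears intact in an adjacent chunk.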
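Second, model loading: embedding_model is constructed at module level, and Streamlit re-executes the entire script on every widget interaction, so the SentenceTransformer is liable to be reloaded on each rerun. A possible mitigation, assuming a Streamlit version that provides st.cache_resource (not something this commit does):

import streamlit as st
from sentence_transformers import SentenceTransformer

@st.cache_resource  # cache the loaded model once per process, across reruns
def get_embedding_model():
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

embedding_model = get_embedding_model()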
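Third, the index: IndexFlatL2 performs exact nearest-neighbor search under Euclidean distance. Sentence-transformer embeddings are often compared by cosine similarity instead; an equally simple alternative (a sketch, not what this commit does) is to L2-normalize the vectors and use an inner-product index:

import faiss
import numpy as np

def build_cosine_index(chunk_embeddings: np.ndarray):
    embeddings = np.ascontiguousarray(chunk_embeddings, dtype="float32")
    faiss.normalize_L2(embeddings)  # in-place row normalization
    index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine on unit vectors
    index.add(embeddings)
    return index

Query vectors would need the same normalize_L2 call before index.search(). On unit-normalized vectors, L2 and cosine yield the same ranking, so this is largely a readability choice.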
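Finally, state handling: main() stashes the FAISS index and chunks in st.session_state because Streamlit reruns the whole script on every button click; plain local variables would be lost between the "Process PDF" and "Get Answer" steps. A self-contained illustration of that rerun behavior:

import streamlit as st

# Without session_state this counter would reset on every interaction,
# because Streamlit re-executes the whole script on each widget event.
if "clicks" not in st.session_state:
    st.session_state.clicks = 0

if st.button("Click me"):
    st.session_state.clicks += 1

st.write(f"Button clicked {st.session_state.clicks} times.")

Both the illustration and the app itself are launched with streamlit run app.py; the if __name__ == "__main__" guard still fires there, since Streamlit executes the script as __main__.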