NanobotzAI committed on
Commit 15f4257 · verified · 1 Parent(s): 99b90f6

Update app.py

Files changed (1): app.py +110 -173
app.py CHANGED
@@ -1,29 +1,26 @@
- from openai import OpenAI
- from os import getenv
  from flask import Flask, request, jsonify, render_template
  import fitz  # PyMuPDF for PDF text extraction
- import faiss  # FAISS for vector search
- import numpy as np
  import os
- from sentence_transformers import SentenceTransformer
- # from huggingface_hub import InferenceClient  # Not used in the current code, removed for clarity
- from typing import List, Tuple
- from io import BytesIO  # Added for BytesIO

  app = Flask(__name__, template_folder=os.getcwd())
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  # --- Configuration ---
  class ChatConfig:
-     MODEL = "google/gemma-3-27b-it:free"
      DEFAULT_MAX_TOKENS = 512
-     DEFAULT_TEMP = 0.5  # Slightly increased for more natural variance
      DEFAULT_TOP_P = 0.95

-     # --- NEW: System Prompt Options ---
-     # Option 1: Friendly & Helpful Assistant
-     SYSTEM_PROMPT_FRIENDLY = """You are Sentry, an AI assistant representing SentryLabs. Your purpose is to act as a knowledgeable, trusted advisor and cybersecurity innovator, assisting users by analyzing the content of the uploaded PDF document.
-
  **Your Core Directives:**
  1. **Strict Document Scope:** Your knowledge is **strictly limited** to the content within the provided context sections of the uploaded PDF. You **must not** use any external knowledge, make assumptions beyond the text, or invent information.
  2. **Persona Embodiment:** Consistently embody the SentryLabs voice:
@@ -36,38 +33,28 @@ class ChatConfig:
  5. **Source Attribution:** When answering, subtly reference the document content as if it's from your memory (do not use phrases like "According to the document...", "The provided text indicates...", or "Based on the analysis of the document sections...").
  6. **Audience Awareness (Implied):** While interacting with one user, frame your analysis in a way that would be valuable to decision-makers (balancing technical detail found in the document with its potential strategic relevance, *if* the document provides such context).
  7. **Focus:** Your primary goal is accurate information retrieval and synthesis *from the provided document text only*, presented through the SentryLabs persona.

  Engage directly and professionally. If this is the start of the conversation (no prior history), you can offer a brief introductory sentence. Remember, accuracy and adherence to the document are paramount.
  """

-     # Option 2: Knowledgeable Expert (More Formal)
-     SYSTEM_PROMPT_EXPERT = """You are a knowledgeable AI expert specializing in the content of the uploaded PDF document.
- You must answer user questions with precision, drawing *exclusively* from the provided context segments.
- Maintain a professional and informative tone.
- If the provided context does not contain the necessary information to answer the question, explicitly state that the information is not found within the scope of the provided text.
- Do not speculate, infer beyond the text, or utilize any external information sources.
- Clearly attribute your answers to the document, for instance, by starting with "The document indicates that..." or "Based on the provided context...".
- Provide comprehensive answers derived solely from the text.
- """

-     # --- Select the desired prompt ---
-     SELECTED_SYSTEM_PROMPT = SYSTEM_PROMPT_FRIENDLY  # Choose which personality to use

- # --- API Client & Embedding Setup ---
- OPENROUTER_API_KEY = getenv('OPENROUTER_API_KEY')
- if not OPENROUTER_API_KEY:
-     raise ValueError("OPENROUTER_API_KEY environment variable not set.")
-
- client = OpenAI(
-     base_url="https://openrouter.ai/api/v1",
-     api_key=OPENROUTER_API_KEY,
- )
-
- # Use a temporary cache directory if needed, or configure appropriately
- embed_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=getenv("SENTENCE_TRANSFORMERS_HOME", "/tmp/st_cache"))
- vector_dim = 384
- index = faiss.IndexFlatL2(vector_dim)
- documents = []  # Store original text chunks corresponding to index entries

  # --- Core Functions ---
@@ -78,167 +65,123 @@ def extract_text_from_pdf(pdf_stream: BytesIO) -> List[str]:
      pdf_stream = BytesIO(pdf_stream.read())  # Read if it's a file stream

      doc = fitz.open(stream=pdf_stream, filetype="pdf")
-     # Simple chunking by page - consider more advanced chunking (by paragraph, sentence, fixed size) for better RAG performance
      text_chunks = [page.get_text("text").strip() for page in doc if page.get_text("text").strip()]
      doc.close()
      print(f"Extracted {len(text_chunks)} non-empty text chunks from PDF.")
      return text_chunks

- # Renamed for clarity, added error handling
  def build_vector_index(text_chunks: List[str]):
-     """Embeds text chunks and builds the FAISS index."""
-     global documents, index, vector_dim

      if not text_chunks:
-         print("Warning: No text chunks provided to build the vector index.")
-         documents = []
-         index = faiss.IndexFlatL2(vector_dim)  # Reinitialize empty index
          return

-     print(f"Building vector index for {len(text_chunks)} chunks...")
-     documents = text_chunks  # Store the original text
-
-     # Reset the index
-     index = faiss.IndexFlatL2(vector_dim)

      try:
-         embeddings = embed_model.encode(text_chunks, show_progress_bar=True)
-         embeddings = np.array(embeddings, dtype=np.float32)
-
-         if embeddings.ndim == 1:
-             embeddings = embeddings.reshape(1, -1)
-
-         if embeddings.shape[1] != vector_dim:
-             raise ValueError(f"Embedding dimension mismatch: expected {vector_dim}, got {embeddings.shape[1]}")
-
-         index.add(embeddings)
-         print(f"FAISS index built successfully with {index.ntotal} vectors.")
-
      except Exception as e:
-         print(f"Error during embedding or indexing: {e}")
-         # Reset state in case of error
-         documents = []
-         index = faiss.IndexFlatL2(vector_dim)
-         raise  # Re-raise the exception to signal failure

- # Renamed for clarity, added checks
  def search_relevant_chunks(query: str, k: int = 3) -> str:
-     """Finds the most relevant text chunks for the given query using FAISS."""
-     global index, documents
-
-     if index.ntotal == 0:
-         print("Warning: Search attempted on an empty index.")
-         return ""  # Return empty string if index is not ready

      if not query:
          return ""

      try:
-         query_embedding = embed_model.encode([query])
-         query_embedding = np.array(query_embedding, dtype=np.float32)
-
-         # Perform the search
-         distances, indices = index.search(query_embedding, k=min(k, index.ntotal))  # Ensure k <= index size
-
-         # Filter out potential invalid indices (-1 can sometimes occur if k > ntotal, though min() handles it)
-         valid_indices = [idx for idx in indices[0] if idx != -1 and idx < len(documents)]
-
-         if not valid_indices:
-             print(f"No relevant chunks found for query: '{query[:50]}...'")
-             return ""
-
-         # Retrieve the actual text chunks
-         relevant_docs = [documents[i] for i in valid_indices]
-         print(f"Retrieved {len(relevant_docs)} relevant chunks.")
-         return "\n\n---\n\n".join(relevant_docs)  # Join with a clear separator

      except Exception as e:
          print(f"Error during similarity search: {e}")
-         return ""  # Return empty on error

- # --- Improved Generation Function ---
  def generate_response(
      message: str,
      history: List[Tuple[str, str]],
-     system_message: str = ChatConfig.SELECTED_SYSTEM_PROMPT,  # Use the chosen system prompt
      max_tokens: int = ChatConfig.DEFAULT_MAX_TOKENS,
      temperature: float = ChatConfig.DEFAULT_TEMP,
      top_p: float = ChatConfig.DEFAULT_TOP_P
  ) -> str:

-     if index.ntotal == 0:  # Check if index is built
          return "I need a PDF document to be uploaded and processed first before I can answer questions."

      # 1. Retrieve Context
-     context = search_relevant_chunks(message, k=3)  # Retrieve top 3 chunks
-
-     # Prepare the prompt messages list
-     messages = []
-
-     # 2. Add the System Prompt (Crucial Change)
-     messages.append({"role": "system", "content": system_message})
-
-     # 3. Add Conversation History (if any)
-     # Ensure alternating user/assistant roles, starting with user
-     for user_msg, assistant_msg in history:
-         if user_msg:  # Add user message if not empty
-             messages.append({"role": "user", "content": user_msg})
-         if assistant_msg:  # Add assistant message if not empty
-             messages.append({"role": "assistant", "content": assistant_msg})
-
-     # 4. Construct the Final User Prompt with Context
-     # We include context here, clearly marked.
-     # The system prompt already told the AI *how* to use this context.
-     if context:
-         user_prompt_content = f"Based on the following context from the document, please answer the question:\n\nCONTEXT:\n{context}\n\n---\n\nQUESTION:\n{message}"
-     else:
-         # If no context found, still ask the question but the system prompt guides the "I don't know" response.
-         # Alternatively, you could return a hardcoded message here *before* calling the LLM if desired.
-         # Forcing the LLM to respond based on the prompt is generally better for natural language.
-         user_prompt_content = f"Regarding the document, I have the following question, although I couldn't retrieve specific context for it:\n\nQUESTION:\n{message}"
-         # Or, more simply:
-         # user_prompt_content = f"QUESTION: {message}\n\n(Note: No specific context sections were retrieved for this question based on similarity search.)"
-
-     messages.append({"role": "user", "content": user_prompt_content})
-
-     # 5. Call the LLM API
      try:
-         print(f"--- Sending to {ChatConfig.MODEL} ---")
-         # print("System Prompt:", system_message)  # Optional: Debug logging
-         # print("History:", history)  # Optional: Debug logging
-         # print("User Prompt:", user_prompt_content)  # Optional: Debug logging
-
-         completion = client.chat.completions.create(
-             model=ChatConfig.MODEL,
-             messages=messages,
-             max_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p,
-             # Consider adding stop sequences if needed, e.g., stop=["\nUSER:", "\nASSISTANT:"]
-         )
-         response = completion.choices[0].message.content
-         print(f"--- Received Response ({len(response)} chars) ---")
-         return response.strip()

      except Exception as e:
-         print(f"Error generating response from LLM: {str(e)}")
-         # Provide a more user-friendly error message
          return "I'm sorry, but I encountered an issue while trying to process your request. Please check the connection or try again later."

- # --- Flask Routes (Mostly Unchanged, added checks) ---

  @app.route('/')
- def index_route():  # Renamed to avoid conflict with faiss.Index object
      """Serve the HTML page for the user interface"""
      return render_template('index.html')

  @app.route('/upload_pdf', methods=['POST'])
  def upload_pdf():
      """Handle PDF upload, extract text, and build vector index."""
-     global documents, index  # Ensure we are modifying the global state

      if 'pdf' not in request.files:
          return jsonify({"error": "No PDF file part in the request."}), 400
@@ -251,25 +194,22 @@ def upload_pdf():
      print(f"Received file: {file.filename}")

      try:
-         pdf_stream = BytesIO(file.read())  # Read file into memory

          # Extract text
          text_chunks = extract_text_from_pdf(pdf_stream)
          if not text_chunks:
              return jsonify({"error": "Could not extract any text from the PDF."}), 400

-         # Build vector database (index)
-         build_vector_index(text_chunks)  # This function now handles index creation

-         return jsonify({"message": f"PDF '{file.filename}' processed successfully. {len(documents)} chunks indexed."}), 200

      except fitz.fitz.FileDataError:
          return jsonify({"error": "Invalid or corrupted PDF file."}), 400
      except Exception as e:
          print(f"Error processing PDF upload: {str(e)}")
-         # Reset state on error
-         documents = []
-         index = faiss.IndexFlatL2(vector_dim)
          return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

  @app.route('/ask_question', methods=['POST'])
@@ -280,10 +220,10 @@ def ask_question():
          return jsonify({"error": "Missing 'message' in request body"}), 400

      message = data['message'].strip()
-     history = data.get('history', [])  # Get history, default to empty list

      if not message:
-         return jsonify({"response": "Please enter a question."})  # Basic validation

      # Ensure history format is correct (list of tuples/lists)
      validated_history = []
@@ -291,21 +231,18 @@ def ask_question():
      for item in history:
          if isinstance(item, (list, tuple)) and len(item) == 2:
              validated_history.append((str(item[0]), str(item[1])))
-         # else: log potential format error?

      try:
          response = generate_response(message, validated_history)
          return jsonify({"response": response})
      except Exception as e:
-         # Catch potential errors during generation (though generate_response has its own try-except)
          print(f"Error in /ask_question endpoint: {e}")
          return jsonify({"response": "Sorry, an error occurred while generating the response."}), 500


  if __name__ == '__main__':
-     # Make sure OPENROUTER_API_KEY is checked before starting the app
-     if not OPENROUTER_API_KEY:
-         print("ERROR: OPENROUTER_API_KEY environment variable is not set. Exiting.")
      else:
-         # Consider host='0.0.0.0' to make it accessible on your network
          app.run(debug=True, host='127.0.0.1', port=5000)
 
+ import google.generativeai as genai
+ import chromadb
+ from typing import List, Tuple
  from flask import Flask, request, jsonify, render_template
  import fitz  # PyMuPDF for PDF text extraction
  import os
+ from io import BytesIO
+ import uuid
+ from dotenv import load_dotenv

+ # Load environment variables from .env file
+ load_dotenv()
  app = Flask(__name__, template_folder=os.getcwd())
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  # --- Configuration ---
  class ChatConfig:
+     MODEL = "gemini-2.0-flash-thinking-exp-01-21"
      DEFAULT_MAX_TOKENS = 512
+     DEFAULT_TEMP = 0.5
      DEFAULT_TOP_P = 0.95

+     # System Prompt
+     SYSTEM_PROMPT = """You are Sentry, an AI assistant representing SentryLabs. Your purpose is to act as a knowledgeable, trusted advisor and cybersecurity innovator, assisting users by analyzing the content of the uploaded PDF document.

  **Your Core Directives:**
  1. **Strict Document Scope:** Your knowledge is **strictly limited** to the content within the provided context sections of the uploaded PDF. You **must not** use any external knowledge, make assumptions beyond the text, or invent information.
  2. **Persona Embodiment:** Consistently embody the SentryLabs voice:

  5. **Source Attribution:** When answering, subtly reference the document content as if it's from your memory (do not use phrases like "According to the document...", "The provided text indicates...", or "Based on the analysis of the document sections...").
  6. **Audience Awareness (Implied):** While interacting with one user, frame your analysis in a way that would be valuable to decision-makers (balancing technical detail found in the document with its potential strategic relevance, *if* the document provides such context).
  7. **Focus:** Your primary goal is accurate information retrieval and synthesis *from the provided document text only*, presented through the SentryLabs persona.

  Engage directly and professionally. If this is the start of the conversation (no prior history), you can offer a brief introductory sentence. Remember, accuracy and adherence to the document are paramount.
  """

+ # --- API Client & ChromaDB Setup ---
+ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
+ if not GOOGLE_API_KEY:
+     raise ValueError("GOOGLE_API_KEY environment variable not set.")
+
+ # Configure Google Generative AI
+ genai.configure(api_key=GOOGLE_API_KEY)
+ gemini_model = genai.GenerativeModel(ChatConfig.MODEL)
+
+ # Initialize ChromaDB
+ chroma_client = chromadb.Client()
+ collection_name = "pdf_documents"
+ # Create or get collection
+ try:
+     collection = chroma_client.get_or_create_collection(name=collection_name)
+     print(f"ChromaDB collection '{collection_name}' ready.")
+ except Exception as e:
+     print(f"Error initializing ChromaDB: {e}")
+     raise
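
Note: chromadb.Client() is an in-memory client, so the collection is lost on restart, and with no embedding_function argument the collection falls back to Chroma's default embedding model. A minimal sketch of two optional substitutions, not part of this commit: a persistent client and an explicit sentence-transformers embedding function (both standard chromadb APIs; the storage path is illustrative):

import chromadb
from chromadb.utils import embedding_functions

# Persistent client: keeps the collection on disk across restarts (path is illustrative).
client = chromadb.PersistentClient(path="/tmp/chroma_store")

# Explicit embedding function, mirroring the all-MiniLM-L6-v2 model the FAISS version used.
st_embed = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

collection = client.get_or_create_collection(name="pdf_documents", embedding_function=st_embed)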
  # --- Core Functions ---

      pdf_stream = BytesIO(pdf_stream.read())  # Read if it's a file stream

      doc = fitz.open(stream=pdf_stream, filetype="pdf")
+     # Simple chunking by page - consider more advanced chunking for better RAG performance
      text_chunks = [page.get_text("text").strip() for page in doc if page.get_text("text").strip()]
      doc.close()
      print(f"Extracted {len(text_chunks)} non-empty text chunks from PDF.")
      return text_chunks
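
The comment above suggests more advanced chunking than one chunk per page. A minimal sketch of one such alternative, fixed-size character windows with overlap (illustrative only, not part of this commit; chunk_size and overlap are arbitrary values):

from typing import List

def chunk_text(pages: List[str], chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split page text into fixed-size character windows with overlap."""
    text = "\n".join(pages)
    step = chunk_size - overlap  # advance by chunk_size minus the overlap each window
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), step)]
    return [c.strip() for c in chunks if c.strip()]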

  def build_vector_index(text_chunks: List[str]):
+     """Stores text chunks in ChromaDB."""
+     global collection

      if not text_chunks:
+         print("Warning: No text chunks provided to store in ChromaDB.")
+         # Clear the collection if it exists
+         collection.delete(where={"origin": "pdf_upload"})
          return

+     print(f"Storing {len(text_chunks)} chunks in ChromaDB...")

      try:
+         # Clear previous documents with the same origin
+         collection.delete(where={"origin": "pdf_upload"})
+
+         # Add documents to ChromaDB
+         ids = [f"chunk_{uuid.uuid4()}" for _ in range(len(text_chunks))]
+         metadatas = [{"origin": "pdf_upload", "chunk_index": i} for i in range(len(text_chunks))]
+
+         collection.add(
+             documents=text_chunks,
+             ids=ids,
+             metadatas=metadatas
+         )
+
+         print(f"Successfully stored {len(text_chunks)} documents in ChromaDB.")

      except Exception as e:
+         print(f"Error storing documents in ChromaDB: {e}")
+         # Try to clear the collection on error
+         try:
+             collection.delete(where={"origin": "pdf_upload"})
+         except Exception:
+             pass
+         raise

  def search_relevant_chunks(query: str, k: int = 3) -> str:
+     """Finds the most relevant text chunks for the given query using ChromaDB."""
+     global collection

      if not query:
          return ""

      try:
+         # Check if collection has documents
+         if collection.count() == 0:
+             print("Warning: Search attempted on an empty collection.")
+             return ""
+
+         # Query the collection
+         results = collection.query(
+             query_texts=[query],
+             n_results=k
+         )
+
+         # Extract documents
+         if results and 'documents' in results and results['documents']:
+             relevant_docs = results['documents'][0]  # First query result
+             print(f"Retrieved {len(relevant_docs)} relevant chunks.")
+             return "\n\n---\n\n".join(relevant_docs)  # Join with a clear separator
+         else:
+             print(f"No relevant chunks found for query: '{query[:50]}...'")
+             return ""

      except Exception as e:
          print(f"Error during similarity search: {e}")
+         return ""  # Return empty on error
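
For reference, collection.query returns a dict of per-query lists, which is why results['documents'][0] selects the documents for the single query above. A rough illustration of the shape (keys as in current chromadb versions; values invented):

# Shape of a chromadb query result for one query_text with n_results=2 (illustrative values):
# {
#     'ids':       [['chunk_1a2b...', 'chunk_3c4d...']],
#     'documents': [['First matching page text...', 'Second matching page text...']],
#     'metadatas': [[{'origin': 'pdf_upload', 'chunk_index': 0}, {'origin': 'pdf_upload', 'chunk_index': 4}]],
#     'distances': [[0.31, 0.47]],
# }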

  def generate_response(
      message: str,
      history: List[Tuple[str, str]],
+     system_message: str = ChatConfig.SYSTEM_PROMPT,
      max_tokens: int = ChatConfig.DEFAULT_MAX_TOKENS,
      temperature: float = ChatConfig.DEFAULT_TEMP,
      top_p: float = ChatConfig.DEFAULT_TOP_P
  ) -> str:

+     if collection.count() == 0:  # Check if collection has documents
          return "I need a PDF document to be uploaded and processed first before I can answer questions."

      # 1. Retrieve Context
+     context = search_relevant_chunks(message, k=3)  # Retrieve top 3 chunks
+
+     if not context:
+         return "I don't have enough information from the PDF document to answer this question. Please try asking something covered in the document."
+
+     # 2. Prepare the conversation
+     chat = gemini_model.start_chat(history=[])
+
+     # 3. Set the system prompt with context
+     prompt = f"{system_message}\n\nCONTEXT FROM DOCUMENT:\n{context}\n\nUSER QUESTION:\n{message}"
+
+     # 4. Generate the response
      try:
+         response = chat.send_message(prompt)
+         return response.text.strip()
      except Exception as e:
+         print(f"Error generating response from Gemini: {str(e)}")
          return "I'm sorry, but I encountered an issue while trying to process your request. Please check the connection or try again later."

+ # --- Flask Routes ---

  @app.route('/')
+ def index_route():
      """Serve the HTML page for the user interface"""
      return render_template('index.html')

  @app.route('/upload_pdf', methods=['POST'])
  def upload_pdf():
      """Handle PDF upload, extract text, and build vector index."""
      if 'pdf' not in request.files:
          return jsonify({"error": "No PDF file part in the request."}), 400

      print(f"Received file: {file.filename}")

      try:
+         pdf_stream = BytesIO(file.read())  # Read file into memory

          # Extract text
          text_chunks = extract_text_from_pdf(pdf_stream)
          if not text_chunks:
              return jsonify({"error": "Could not extract any text from the PDF."}), 400

+         # Store in ChromaDB
+         build_vector_index(text_chunks)

+         return jsonify({"message": f"PDF '{file.filename}' processed successfully. {len(text_chunks)} chunks stored in ChromaDB."}), 200

      except fitz.fitz.FileDataError:
          return jsonify({"error": "Invalid or corrupted PDF file."}), 400
      except Exception as e:
          print(f"Error processing PDF upload: {str(e)}")
          return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

  @app.route('/ask_question', methods=['POST'])

          return jsonify({"error": "Missing 'message' in request body"}), 400

      message = data['message'].strip()
+     history = data.get('history', [])  # Get history, default to empty list

      if not message:
+         return jsonify({"response": "Please enter a question."})  # Basic validation

      # Ensure history format is correct (list of tuples/lists)
      validated_history = []
      for item in history:
          if isinstance(item, (list, tuple)) and len(item) == 2:
              validated_history.append((str(item[0]), str(item[1])))

      try:
          response = generate_response(message, validated_history)
          return jsonify({"response": response})
      except Exception as e:
          print(f"Error in /ask_question endpoint: {e}")
          return jsonify({"response": "Sorry, an error occurred while generating the response."}), 500


  if __name__ == '__main__':
+     # Make sure GOOGLE_API_KEY is checked before starting the app
+     if not GOOGLE_API_KEY:
+         print("ERROR: GOOGLE_API_KEY environment variable is not set. Exiting.")
      else:
          app.run(debug=True, host='127.0.0.1', port=5000)
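
A quick way to exercise the two endpoints once the app is running, sketched with the requests library (the PDF file name is a placeholder):

import requests

BASE = "http://127.0.0.1:5000"

# Upload a PDF; the form field must be named 'pdf' to match upload_pdf above.
with open("example.pdf", "rb") as f:
    r = requests.post(f"{BASE}/upload_pdf", files={"pdf": f})
print(r.json())

# Ask a question; 'history' is an optional list of [user, assistant] pairs.
r = requests.post(f"{BASE}/ask_question", json={"message": "What is this document about?", "history": []})
print(r.json()["response"])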