Pamudu13 committed on
Commit 9b6cc92 · verified · 1 Parent(s): 11f7ceb

Update app.py

Files changed (1)
  1. app.py +237 -82
app.py CHANGED
@@ -6,151 +6,306 @@ import faiss # FAISS for vector search
  import numpy as np
  import os
  from sentence_transformers import SentenceTransformer
- from huggingface_hub import InferenceClient
  from typing import List, Tuple

  app = Flask(__name__, template_folder=os.getcwd())
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

- # Default settings
  class ChatConfig:
-     MODEL = "google/gemma-3-27b-it:free"  # Use OpenRouter's Gemma model
-     DEFAULT_SYSTEM_MSG = "You are an AI assistant answering only based on the uploaded PDF."
      DEFAULT_MAX_TOKENS = 512
-     DEFAULT_TEMP = 0.3
      DEFAULT_TOP_P = 0.95

- # Get the token from environment variable
  OPENROUTER_API_KEY = getenv('OPENROUTER_API_KEY')
  client = OpenAI(
      base_url="https://openrouter.ai/api/v1",
      api_key=OPENROUTER_API_KEY,
  )

- embed_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="/tmp")
- vector_dim = 384  # Embedding size
- index = faiss.IndexFlatL2(vector_dim)  # FAISS index

- documents = []  # Store extracted text

- def extract_text_from_pdf(pdf_stream):
      """Extracts text from PDF stream"""
      doc = fitz.open(stream=pdf_stream, filetype="pdf")
-     text_chunks = [page.get_text("text") for page in doc]
      doc.close()
      return text_chunks

- def create_vector_db(text_chunks):
-     """Embeds text chunks and adds them to FAISS index"""
-     global documents, index
-
-     # Reinitialize the FAISS index
      index = faiss.IndexFlatL2(vector_dim)
-
-     documents = text_chunks
-     embeddings = embed_model.encode(text_chunks)

-     # Convert embeddings to np.float32 for FAISS
-     embeddings = np.array(embeddings, dtype=np.float32)

-     # Ensure that embeddings have the correct shape (should be 2D, with each vector having the right dimension)
-     if embeddings.ndim == 1:  # If only one embedding, reshape it
-         embeddings = embeddings.reshape(1, -1)

-     # Add embeddings to the FAISS index
-     index.add(embeddings)

-     # Check if adding was successful (optional)
      if index.ntotal == 0:
-         print("Error: FAISS index is empty after adding embeddings.")

- def search_relevant_text(query):
-     """Finds the most relevant text chunk for the given query"""
-     query_embedding = embed_model.encode([query])
-     _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), k=3)
-     return "\n".join([documents[i] for i in closest_idx[0]])

  def generate_response(
      message: str,
      history: List[Tuple[str, str]],
-     system_message: str = ChatConfig.DEFAULT_SYSTEM_MSG,
      max_tokens: int = ChatConfig.DEFAULT_MAX_TOKENS,
      temperature: float = ChatConfig.DEFAULT_TEMP,
      top_p: float = ChatConfig.DEFAULT_TOP_P
  ) -> str:
-     if not documents:
-         return "Please upload a PDF first."

-     context = search_relevant_text(message)  # Get relevant content from PDF

-     if not context.strip():  # If no relevant content is found, refuse to answer
-         return "I can only answer based on the uploaded PDF. Your question is outside the document's content."

-     messages = [
-         {"role": "user", "content": f"{system_message}\n\nContext: {context}\nQuestion: {message}"}
-     ]

-     # Add conversation history ensuring alternating pattern (user, assistant, user, assistant...)
-     for user_msg, bot_msg in history:
-         if user_msg.strip():
-             messages.append({"role": "user", "content": user_msg})
-         if bot_msg.strip():
-             messages.append({"role": "assistant", "content": bot_msg})

      try:
-         # Use OpenRouter to get the response
          completion = client.chat.completions.create(
-             model="google/gemma-3-27b-it:free",
-             messages=messages
          )
-         return completion.choices[0].message.content
      except Exception as e:
-         print(f"Error generating response: {str(e)}")
-         return "I apologize, but I encountered an error while generating the response. Please try again."

  @app.route('/')
- def index():
      """Serve the HTML page for the user interface"""
      return render_template('index.html')

  @app.route('/upload_pdf', methods=['POST'])
  def upload_pdf():
-     """Handle PDF upload"""
      if 'pdf' not in request.files:
-         return jsonify({"error": "No file part"}), 400

      file = request.files['pdf']
      if file.filename == "":
-         return jsonify({"error": "No selected file"}), 400

      try:
-         # Read the file directly into memory instead of saving to disk
-         pdf_stream = file.read()
-
-         # Create a BytesIO object to work with the PDF in memory
-         from io import BytesIO
-         pdf_stream = BytesIO(pdf_stream)
-
-         # Use fitz to open the PDF from memory
-         doc = fitz.open(stream=pdf_stream, filetype="pdf")
-         text_chunks = [page.get_text("text") for page in doc]
-         doc.close()
-
-         # Create vector database
-         create_vector_db(text_chunks)
-
-         return jsonify({"message": "PDF uploaded and indexed successfully!"}), 200
      except Exception as e:
-         return jsonify({"error": f"Error processing file: {str(e)}"}), 500

  @app.route('/ask_question', methods=['POST'])
  def ask_question():
-     """Handle user question"""
-     message = request.json.get('message')
-     history = request.json.get('history', [])
-     response = generate_response(message, history)
-     return jsonify({"response": response})

  if __name__ == '__main__':
-     app.run(debug=True)
  import numpy as np
  import os
  from sentence_transformers import SentenceTransformer
+ # from huggingface_hub import InferenceClient  # Not used in the current code, removed for clarity
  from typing import List, Tuple
+ from io import BytesIO  # Added for BytesIO

  app = Flask(__name__, template_folder=os.getcwd())
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

+ # --- Configuration ---
  class ChatConfig:
+     MODEL = "google/gemma-3-27b-it:free"
      DEFAULT_MAX_TOKENS = 512
+     DEFAULT_TEMP = 0.5  # Slightly increased for more natural variance
      DEFAULT_TOP_P = 0.95

+     # --- NEW: System Prompt Options ---
+     # Option 1: Friendly & Helpful Assistant
+     SYSTEM_PROMPT_FRIENDLY = """You are Sentry, an AI assistant representing SentryLabs. Your purpose is to act as a knowledgeable, trusted advisor and cybersecurity innovator, assisting users by analyzing the content of the uploaded PDF document.
+
+ **Your Core Directives:**
+ 1. **Strict Document Scope:** Your knowledge is **strictly limited** to the content within the provided context sections of the uploaded PDF. You **must not** use any external knowledge, make assumptions beyond the text, or invent information.
+ 2. **Persona Embodiment:** Consistently embody the SentryLabs voice:
+     * **Authoritative but Approachable:** Be confident and expert, yet clear and supportive.
+     * **Innovative & Forward-Thinking:** Frame answers with a proactive cybersecurity mindset where the text allows.
+     * **Customer-Centric:** Focus on providing clear value and insights derived *from the document*.
+     * **Professional & Clear:** Use precise, professional language. Employ technical terms from the document accurately, but strive for accessibility. Explain complex document concepts simply if possible. Use **active voice**.
+ 3. **Tone:** Maintain a confident, informative, empathetic, and collaborative semi-formal tone. Avoid slang and overly casual language.
+ 4. **Handling Missing Information:** If the provided document context **does not** contain the information needed to answer a question, state this clearly and professionally. Indicate that the answer is outside the scope of the analyzed document sections. Do not apologize excessively; simply state the limitation based on the provided text. Example: "According to my knowledge, specific details on [topic] are not covered." or "The analyzed text does not contain information regarding [topic]."
+ 5. **Source Attribution:** When answering, present the document's content as if it were part of your own knowledge (do not use phrases like "According to the document...", "The provided text indicates...", or "Based on the analysis of the document sections...").
+ 6. **Audience Awareness (Implied):** While interacting with one user, frame your analysis in a way that would be valuable to decision-makers (balancing technical detail found in the document with its potential strategic relevance, *if* the document provides such context).
+ 7. **Focus:** Your primary goal is accurate information retrieval and synthesis *from the provided document text only*, presented through the SentryLabs persona.
+
+ Engage directly and professionally. If this is the start of the conversation (no prior history), you can offer a brief introductory sentence. Remember, accuracy and adherence to the document are paramount.
+ """
+
+     # Option 2: Knowledgeable Expert (More Formal)
+     SYSTEM_PROMPT_EXPERT = """You are a knowledgeable AI expert specializing in the content of the uploaded PDF document.
+ You must answer user questions with precision, drawing *exclusively* from the provided context segments.
+ Maintain a professional and informative tone.
+ If the provided context does not contain the necessary information to answer the question, explicitly state that the information is not found within the scope of the provided text.
+ Do not speculate, infer beyond the text, or utilize any external information sources.
+ Clearly attribute your answers to the document, for instance, by starting with "The document indicates that..." or "Based on the provided context...".
+ Provide comprehensive answers derived solely from the text.
+ """
+
+     # --- Select the desired prompt ---
+     SELECTED_SYSTEM_PROMPT = SYSTEM_PROMPT_FRIENDLY  # Choose which personality to use
+
+ # --- API Client & Embedding Setup ---
  OPENROUTER_API_KEY = getenv('OPENROUTER_API_KEY')
+ if not OPENROUTER_API_KEY:
+     raise ValueError("OPENROUTER_API_KEY environment variable not set.")
+
  client = OpenAI(
      base_url="https://openrouter.ai/api/v1",
      api_key=OPENROUTER_API_KEY,
  )

+ # Use a temporary cache directory if needed, or configure appropriately
+ embed_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=getenv("SENTENCE_TRANSFORMERS_HOME", "/tmp/st_cache"))
+ vector_dim = 384
+ index = faiss.IndexFlatL2(vector_dim)
+ documents = []  # Store original text chunks corresponding to index entries

+ # --- Core Functions ---

+ def extract_text_from_pdf(pdf_stream: BytesIO) -> List[str]:
      """Extracts text from PDF stream"""
+     # Ensure the stream is BytesIO
+     if not isinstance(pdf_stream, BytesIO):
+         pdf_stream = BytesIO(pdf_stream.read())  # Read if it's a file stream
+
      doc = fitz.open(stream=pdf_stream, filetype="pdf")
+     # Simple chunking by page - consider more advanced chunking (by paragraph, sentence, fixed size) for better RAG performance
+     text_chunks = [page.get_text("text").strip() for page in doc if page.get_text("text").strip()]
      doc.close()
+     print(f"Extracted {len(text_chunks)} non-empty text chunks from PDF.")
      return text_chunks

+ # Renamed for clarity, added error handling
+ def build_vector_index(text_chunks: List[str]):
+     """Embeds text chunks and builds the FAISS index."""
+     global documents, index, vector_dim
+
+     if not text_chunks:
+         print("Warning: No text chunks provided to build the vector index.")
+         documents = []
+         index = faiss.IndexFlatL2(vector_dim)  # Reinitialize empty index
+         return
+
+     print(f"Building vector index for {len(text_chunks)} chunks...")
+     documents = text_chunks  # Store the original text
+
+     # Reset the index
      index = faiss.IndexFlatL2(vector_dim)

+     try:
+         embeddings = embed_model.encode(text_chunks, show_progress_bar=True)
+         embeddings = np.array(embeddings, dtype=np.float32)
+
+         if embeddings.ndim == 1:
+             embeddings = embeddings.reshape(1, -1)
+
+         if embeddings.shape[1] != vector_dim:
+             raise ValueError(f"Embedding dimension mismatch: expected {vector_dim}, got {embeddings.shape[1]}")
+
+         index.add(embeddings)
+         print(f"FAISS index built successfully with {index.ntotal} vectors.")
+
+     except Exception as e:
+         print(f"Error during embedding or indexing: {e}")
+         # Reset state in case of error
+         documents = []
+         index = faiss.IndexFlatL2(vector_dim)
+         raise  # Re-raise the exception to signal failure

+ # Renamed for clarity, added checks
+ def search_relevant_chunks(query: str, k: int = 3) -> str:
+     """Finds the most relevant text chunks for the given query using FAISS."""
+     global index, documents

      if index.ntotal == 0:
+         print("Warning: Search attempted on an empty index.")
+         return ""  # Return empty string if index is not ready
+
+     if not query:
+         return ""
+
+     try:
+         query_embedding = embed_model.encode([query])
+         query_embedding = np.array(query_embedding, dtype=np.float32)
+
+         # Perform the search
+         distances, indices = index.search(query_embedding, k=min(k, index.ntotal))  # Ensure k <= index size
+
+         # Filter out potential invalid indices (-1 can sometimes occur if k > ntotal, though min() handles it)
+         valid_indices = [idx for idx in indices[0] if idx != -1 and idx < len(documents)]
+
+         if not valid_indices:
+             print(f"No relevant chunks found for query: '{query[:50]}...'")
+             return ""

+         # Retrieve the actual text chunks
+         relevant_docs = [documents[i] for i in valid_indices]
+         print(f"Retrieved {len(relevant_docs)} relevant chunks.")
+         return "\n\n---\n\n".join(relevant_docs)  # Join with a clear separator

+     except Exception as e:
+         print(f"Error during similarity search: {e}")
+         return ""  # Return empty on error
+
+ # --- Improved Generation Function ---
  def generate_response(
      message: str,
      history: List[Tuple[str, str]],
+     system_message: str = ChatConfig.SELECTED_SYSTEM_PROMPT,  # Use the chosen system prompt
      max_tokens: int = ChatConfig.DEFAULT_MAX_TOKENS,
      temperature: float = ChatConfig.DEFAULT_TEMP,
      top_p: float = ChatConfig.DEFAULT_TOP_P
  ) -> str:

+     if index.ntotal == 0:  # Check if index is built
+         return "I need a PDF document to be uploaded and processed first before I can answer questions."
+
+     # 1. Retrieve Context
+     context = search_relevant_chunks(message, k=3)  # Retrieve top 3 chunks

+     # Prepare the prompt messages list
+     messages = []

+     # 2. Add the System Prompt (Crucial Change)
+     messages.append({"role": "system", "content": system_message})

+     # 3. Add Conversation History (if any)
+     # Ensure alternating user/assistant roles, starting with user
+     for user_msg, assistant_msg in history:
+         if user_msg:  # Add user message if not empty
+             messages.append({"role": "user", "content": user_msg})
+         if assistant_msg:  # Add assistant message if not empty
+             messages.append({"role": "assistant", "content": assistant_msg})

+     # 4. Construct the Final User Prompt with Context
+     # We include context here, clearly marked.
+     # The system prompt already told the AI *how* to use this context.
+     if context:
+         user_prompt_content = f"Based on the following context from the document, please answer the question:\n\nCONTEXT:\n{context}\n\n---\n\nQUESTION:\n{message}"
+     else:
+         # If no context found, still ask the question but the system prompt guides the "I don't know" response.
+         # Alternatively, you could return a hardcoded message here *before* calling the LLM if desired.
+         # Forcing the LLM to respond based on the prompt is generally better for natural language.
+         user_prompt_content = f"Regarding the document, I have the following question, although I couldn't retrieve specific context for it:\n\nQUESTION:\n{message}"
+         # Or, more simply:
+         # user_prompt_content = f"QUESTION: {message}\n\n(Note: No specific context sections were retrieved for this question based on similarity search.)"
+
+     messages.append({"role": "user", "content": user_prompt_content})
+
+     # 5. Call the LLM API
      try:
+         print(f"--- Sending to {ChatConfig.MODEL} ---")
+         # print("System Prompt:", system_message)  # Optional: Debug logging
+         # print("History:", history)  # Optional: Debug logging
+         # print("User Prompt:", user_prompt_content)  # Optional: Debug logging
+
          completion = client.chat.completions.create(
+             model=ChatConfig.MODEL,
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             # Consider adding stop sequences if needed, e.g., stop=["\nUSER:", "\nASSISTANT:"]
          )
+         response = completion.choices[0].message.content
+         print(f"--- Received Response ({len(response)} chars) ---")
+         return response.strip()
+
      except Exception as e:
+         print(f"Error generating response from LLM: {str(e)}")
+         # Provide a more user-friendly error message
+         return "I'm sorry, but I encountered an issue while trying to process your request. Please check the connection or try again later."
+

+ # --- Flask Routes (Mostly Unchanged, added checks) ---

  @app.route('/')
+ def index_route():  # Renamed to avoid conflict with faiss.Index object
      """Serve the HTML page for the user interface"""
      return render_template('index.html')

  @app.route('/upload_pdf', methods=['POST'])
  def upload_pdf():
+     """Handle PDF upload, extract text, and build vector index."""
+     global documents, index  # Ensure we are modifying the global state
+
      if 'pdf' not in request.files:
+         return jsonify({"error": "No PDF file part in the request."}), 400

      file = request.files['pdf']
      if file.filename == "":
+         return jsonify({"error": "No file selected."}), 400
+     if not file.filename.lower().endswith('.pdf'):
+         return jsonify({"error": "Invalid file type. Please upload a PDF."}), 400
+
+     print(f"Received file: {file.filename}")

      try:
+         pdf_stream = BytesIO(file.read())  # Read file into memory
+
+         # Extract text
+         text_chunks = extract_text_from_pdf(pdf_stream)
+         if not text_chunks:
+             return jsonify({"error": "Could not extract any text from the PDF."}), 400
+
+         # Build vector database (index)
+         build_vector_index(text_chunks)  # This function now handles index creation
+
+         return jsonify({"message": f"PDF '{file.filename}' processed successfully. {len(documents)} chunks indexed."}), 200
+
+     except fitz.fitz.FileDataError:
+         return jsonify({"error": "Invalid or corrupted PDF file."}), 400
      except Exception as e:
+         print(f"Error processing PDF upload: {str(e)}")
+         # Reset state on error
+         documents = []
+         index = faiss.IndexFlatL2(vector_dim)
+         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

  @app.route('/ask_question', methods=['POST'])
  def ask_question():
+     """Handle user question, retrieve context, and generate response."""
+     data = request.get_json()
+     if not data or 'message' not in data:
+         return jsonify({"error": "Missing 'message' in request body"}), 400
+
+     message = data['message'].strip()
+     history = data.get('history', [])  # Get history, default to empty list
+
+     if not message:
+         return jsonify({"response": "Please enter a question."})  # Basic validation
+
+     # Ensure history format is correct (list of tuples/lists)
+     validated_history = []
+     if isinstance(history, list):
+         for item in history:
+             if isinstance(item, (list, tuple)) and len(item) == 2:
+                 validated_history.append((str(item[0]), str(item[1])))
+             # else: log potential format error?
+
+     try:
+         response = generate_response(message, validated_history)
+         return jsonify({"response": response})
+     except Exception as e:
+         # Catch potential errors during generation (though generate_response has its own try-except)
+         print(f"Error in /ask_question endpoint: {e}")
+         return jsonify({"response": "Sorry, an error occurred while generating the response."}), 500
+

  if __name__ == '__main__':
+     # Make sure OPENROUTER_API_KEY is checked before starting the app
+     if not OPENROUTER_API_KEY:
+         print("ERROR: OPENROUTER_API_KEY environment variable is not set. Exiting.")
+     else:
+         # Consider host='0.0.0.0' to make it accessible on your network
+         app.run(debug=True, host='127.0.0.1', port=5000)
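
For reference, a minimal client-side sketch of how the two routes touched by this commit could be exercised once the app is running. It assumes the default host and port from the new __main__ block (127.0.0.1:5000), the multipart field name "pdf" and the JSON keys "message", "history", and "response" used in the routes above; the requests dependency and the sample.pdf path are placeholders, not part of the commit.

# Hypothetical client sketch (not part of the commit) for /upload_pdf and /ask_question.
import requests

BASE_URL = "http://127.0.0.1:5000"  # matches app.run(host='127.0.0.1', port=5000)

# 1. Upload a PDF: the route reads the file from the multipart field named "pdf".
with open("sample.pdf", "rb") as f:  # placeholder path
    upload_resp = requests.post(f"{BASE_URL}/upload_pdf", files={"pdf": f})
print(upload_resp.status_code, upload_resp.json())

# 2. Ask a question: the route expects JSON with "message" and an optional
#    "history" list of (user, assistant) pairs, and returns {"response": ...}.
payload = {
    "message": "What topics does the document cover?",
    "history": [],  # e.g. [["earlier question", "earlier answer"]]
}
ask_resp = requests.post(f"{BASE_URL}/ask_question", json=payload)
print(ask_resp.json().get("response"))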