CosmickVisions committed
Commit 4c6c992 · verified · 1 Parent(s): 84a6029

Update app.py

Files changed (1)
  1. app.py +69 -17
app.py CHANGED
@@ -25,23 +25,24 @@ client = groq.Client(api_key=os.getenv("GROQ_TECH_API_KEY"))
 
 # Initialize embeddings with error handling
 try:
+    # Force CPU usage for embeddings
     embeddings = HuggingFaceInstructEmbeddings(
         model_name="hkunlp/instructor-base",
-        model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"}
+        model_kwargs={"device": "cpu"}  # Force CPU usage
     )
 except Exception as e:
     print(f"Warning: Failed to load primary embeddings model: {e}")
     try:
         embeddings = HuggingFaceInstructEmbeddings(
             model_name="all-MiniLM-L6-v2",
-            model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"}
+            model_kwargs={"device": "cpu"}  # Force CPU usage
         )
     except Exception as e:
         print(f"Warning: Failed to load fallback embeddings model: {e}")
         embeddings = None
 
-# Directory to store FAISS indexes
-FAISS_INDEX_DIR = "faiss_indexes_tech"
+# Directory to store FAISS indexes with better naming
+FAISS_INDEX_DIR = "faiss_indexes_tech_cpu"
 if not os.path.exists(FAISS_INDEX_DIR):
     os.makedirs(FAISS_INDEX_DIR)
 
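Note: to sanity-check the CPU-forcing change above in isolation, the embeddings can be loaded and queried directly. This is a minimal sketch, not part of the commit; the langchain_community import path is an assumption (app.py may import the class from a different module):

# Sketch: confirm the CPU-only embeddings load and return a vector
from langchain_community.embeddings import HuggingFaceInstructEmbeddings  # import path assumed

embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cpu"},  # same CPU forcing as the diff above
)
vector = embeddings.embed_query("def hello(): pass")
print(len(vector))  # embedding dimensionality, typically 768 for instructor-base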
@@ -204,7 +205,7 @@ def process_pdf(pdf_file):
 
 # Function to generate chatbot responses with Tech theme
 def generate_response(message, session_id, model_name, history):
-    """Generate chatbot responses"""
+    """Generate chatbot responses with FAISS context enhancement"""
     if not message:
         return history
 
@@ -212,10 +213,22 @@ def generate_response(message, session_id, model_name, history):
         context = ""
         if embeddings and session_id and session_id in user_vectorstores:
             try:
+                print(f"Performing similarity search with session: {session_id}")
                 vectorstore = user_vectorstores[session_id]
-                docs = vectorstore.similarity_search(message, k=3)
+
+                # Use a higher k value to get more relevant context
+                docs = vectorstore.similarity_search(message, k=5)
+
                 if docs:
-                    context = "\n\nRelevant code context:\n" + "\n".join(f"```\n{doc.page_content}\n```" for doc in docs)
+                    # Format the context more clearly with source information
+                    context = "\n\nRelevant code context from your files:\n\n"
+                    for i, doc in enumerate(docs, 1):
+                        source = doc.metadata.get("source", "Unknown")
+                        language = doc.metadata.get("language", "Unknown")
+                        context += f"--- Segment {i} from {source} ({language}) ---\n"
+                        context += f"```\n{doc.page_content}\n```\n\n"
+
+                    print(f"Found {len(docs)} relevant code segments for context.")
             except Exception as e:
                 print(f"Warning: Failed to perform similarity search: {e}")
 
@@ -224,7 +237,10 @@ def generate_response(message, session_id, model_name, history):
         Format code snippets with proper markdown code blocks and specify the language."""
 
         if context:
-            system_prompt += f"\nUse this context from the uploaded code when relevant:{context}"
+            system_prompt += f"\n\nUse this context from the uploaded code files to inform your answers:{context}"
+
+            # Add instruction to reference specific file parts
+            system_prompt += "\nWhen discussing code from the uploaded files, specifically reference the file name and segment number."
 
         completion = client.chat.completions.create(
             model=model_name,
@@ -237,12 +253,31 @@ def generate_response(message, session_id, model_name, history):
         )
 
         response = completion.choices[0].message.content
-        history.append({"role": "assistant", "content": response})
+
+        # For proper chat history handling
+        if isinstance(history, list) and history and isinstance(history[0], dict):
+            # History is in message format
+            history.append({"role": "user", "content": message})
+            history.append({"role": "assistant", "content": response})
+        else:
+            # Fallback for other formats
+            history.append({"role": "user", "content": message})
+            history.append({"role": "assistant", "content": response})
+
         return history
 
     except Exception as e:
         error_msg = f"Error generating response: {str(e)}"
-        history.append({"role": "assistant", "content": error_msg})
+
+        # Handle different history formats
+        if isinstance(history, list):
+            if history and isinstance(history[0], dict):
+                history.append({"role": "user", "content": message})
+                history.append({"role": "assistant", "content": error_msg})
+            else:
+                history.append({"role": "user", "content": message})
+                history.append({"role": "assistant", "content": error_msg})
+
         return history
 
 # Functions to update PDF viewer
@@ -492,20 +527,20 @@ def perform_stack_search(query, tag, sort_by):
     except Exception as e:
         return f"Error searching Stack Overflow: {str(e)}"
 
-# Modify the file input and processing section
+# Modify the process_code_file function
 def process_code_file(file_obj):
-    """Process uploaded code files"""
+    """Process uploaded code files and store in FAISS index"""
     if file_obj is None:
         return None, "No file uploaded", {}
 
     try:
         # Handle both file objects and bytes objects
         if isinstance(file_obj, bytes):
-            content = file_obj.decode('utf-8')
+            content = file_obj.decode('utf-8', errors='replace')  # Added error handling
             file_name = "uploaded_file"
             file_extension = ".txt"  # Default extension
         else:
-            content = file_obj.read().decode('utf-8')
+            content = file_obj.read().decode('utf-8', errors='replace')  # Added error handling
             file_name = getattr(file_obj, 'name', 'uploaded_file')
             file_extension = Path(file_name).suffix.lower()
 
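Note: the errors='replace' additions above keep uploads that are not valid UTF-8 from raising during decode. A small illustration with made-up bytes (not from the repo):

# A Latin-1 byte sequence that is invalid UTF-8
data = b"print('caf\xe9')"
# data.decode('utf-8')  # would raise UnicodeDecodeError
print(data.decode('utf-8', errors='replace'))  # lossy: the bad byte becomes U+FFFD, but nothing raises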
@@ -518,17 +553,34 @@ def process_code_file(file_obj):
         session_id = None
         if embeddings:
             try:
-                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-                chunks = text_splitter.create_documents([content])
+                print(f"Creating FAISS index for {file_name}...")
+                # Improved chunking for code files
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=500,  # Smaller chunks for code
+                    chunk_overlap=50,
+                    separators=["\n\n", "\n", " ", ""]
+                )
+                chunks = text_splitter.create_documents([content], metadatas=[{"filename": file_name, "language": language}])
+
+                # Add source metadata to help with retrieval
+                for i, chunk in enumerate(chunks):
+                    chunk.metadata["chunk_id"] = i
+                    chunk.metadata["source"] = file_name
+
+                # Create and store vectorstore
                 vectorstore = FAISS.from_documents(chunks, embeddings)
                 session_id = str(uuid.uuid4())
                 index_path = os.path.join(FAISS_INDEX_DIR, session_id)
                 vectorstore.save_local(index_path)
                 user_vectorstores[session_id] = vectorstore
+
+                # Add number of chunks to metrics for display
+                metrics["chunks"] = len(chunks)
+                print(f"Successfully created FAISS index with {len(chunks)} chunks.")
             except Exception as e:
                 print(f"Warning: Failed to create vectorstore: {e}")
 
-        return session_id, f"✅ Successfully analyzed {file_name}", metrics
+        return session_id, f"✅ Successfully analyzed {file_name} and stored in FAISS index", metrics
     except Exception as e:
         return None, f"Error processing file: {str(e)}", {}
 
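Note: each upload now persists its index under FAISS_INDEX_DIR via save_local, so a later process could reload it instead of re-embedding. A hypothetical sketch, not part of this commit; the langchain_community import paths and the allow_dangerous_deserialization flag are assumptions about the installed LangChain version:

# Sketch: reload a per-session index saved by process_code_file and repeat the
# kind of similarity search generate_response performs.
import os
from langchain_community.embeddings import HuggingFaceInstructEmbeddings  # import path assumed
from langchain_community.vectorstores import FAISS  # import path assumed

embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cpu"},
)
session_id = "<a session id returned by process_code_file>"
index_path = os.path.join("faiss_indexes_tech_cpu", session_id)
vectorstore = FAISS.load_local(
    index_path,
    embeddings,
    allow_dangerous_deserialization=True,  # required by recent LangChain releases; assumption
)
docs = vectorstore.similarity_search("Where is the Groq API key read?", k=5)
for doc in docs:
    print(doc.metadata.get("source", "Unknown"), doc.page_content[:80])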