Update app.py
app.py
CHANGED
@@ -25,23 +25,24 @@ client = groq.Client(api_key=os.getenv("GROQ_TECH_API_KEY"))
 
 # Initialize embeddings with error handling
 try:
+    # Force CPU usage for embeddings
     embeddings = HuggingFaceInstructEmbeddings(
         model_name="hkunlp/instructor-base",
-        model_kwargs={"device": "
+        model_kwargs={"device": "cpu"}  # Force CPU usage
     )
 except Exception as e:
     print(f"Warning: Failed to load primary embeddings model: {e}")
     try:
         embeddings = HuggingFaceInstructEmbeddings(
             model_name="all-MiniLM-L6-v2",
-            model_kwargs={"device": "
+            model_kwargs={"device": "cpu"}  # Force CPU usage
         )
     except Exception as e:
         print(f"Warning: Failed to load fallback embeddings model: {e}")
         embeddings = None
 
-# Directory to store FAISS indexes
-FAISS_INDEX_DIR = "
+# Directory to store FAISS indexes with better naming
+FAISS_INDEX_DIR = "faiss_indexes_tech_cpu"
 if not os.path.exists(FAISS_INDEX_DIR):
     os.makedirs(FAISS_INDEX_DIR)
 
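Not part of the commit: a minimal sketch of exercising the CPU-pinned embeddings, assuming the langchain_community import path (app.py's import block is outside this diff). The removed lines above are truncated in the page extraction, so the previous device value is unknown.

# Hedged sketch, not from app.py: confirm the model loads on CPU and embeds text.
from langchain_community.embeddings import HuggingFaceInstructEmbeddings  # assumed import path

embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cpu"},  # same kwargs the commit pins
)
vector = embeddings.embed_query("def add(a, b): return a + b")
print(len(vector))  # embedding dimensionality (768 for instructor-base)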
@@ -204,7 +205,7 @@ def process_pdf(pdf_file):
 
 # Function to generate chatbot responses with Tech theme
 def generate_response(message, session_id, model_name, history):
-    """Generate chatbot responses"""
+    """Generate chatbot responses with FAISS context enhancement"""
     if not message:
         return history
 
@@ -212,10 +213,22 @@ def generate_response(message, session_id, model_name, history):
         context = ""
         if embeddings and session_id and session_id in user_vectorstores:
             try:
+                print(f"Performing similarity search with session: {session_id}")
                 vectorstore = user_vectorstores[session_id]
-
+
+                # Use a higher k value to get more relevant context
+                docs = vectorstore.similarity_search(message, k=5)
+
                 if docs:
-
+                    # Format the context more clearly with source information
+                    context = "\n\nRelevant code context from your files:\n\n"
+                    for i, doc in enumerate(docs, 1):
+                        source = doc.metadata.get("source", "Unknown")
+                        language = doc.metadata.get("language", "Unknown")
+                        context += f"--- Segment {i} from {source} ({language}) ---\n"
+                        context += f"```\n{doc.page_content}\n```\n\n"
+
+                    print(f"Found {len(docs)} relevant code segments for context.")
             except Exception as e:
                 print(f"Warning: Failed to perform similarity search: {e}")
 
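To gauge whether k=5 actually surfaces relevant segments, the scores can be inspected outside the app. A hedged sketch against a vectorstore built as above; LangChain's FAISS wrapper returns L2 distances, so lower means closer:

# Hedged sketch: inspect retrieval quality for a sample query.
results = vectorstore.similarity_search_with_score("how is the uploaded file chunked?", k=5)
for doc, distance in results:
    print(f"{distance:.3f}", doc.metadata.get("source", "Unknown"))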
@@ -224,7 +237,10 @@ def generate_response(message, session_id, model_name, history):
 Format code snippets with proper markdown code blocks and specify the language."""
 
         if context:
-            system_prompt += f"\nUse this context from the uploaded code
+            system_prompt += f"\n\nUse this context from the uploaded code files to inform your answers:{context}"
+
+            # Add instruction to reference specific file parts
+            system_prompt += "\nWhen discussing code from the uploaded files, specifically reference the file name and segment number."
 
         completion = client.chat.completions.create(
             model=model_name,
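The remaining arguments to create(...) fall between this hunk and the next and are not shown in the diff. A self-contained sketch of the call shape the Groq SDK expects; the model id, prompt, and message here are placeholders, not app.py's actual values:

# Hedged sketch of an OpenAI-compatible Groq chat completion call.
import os
import groq

client = groq.Client(api_key=os.getenv("GROQ_TECH_API_KEY"))
system_prompt = "You are a helpful coding assistant."  # stand-in for the app's prompt
message = "Explain what process_code_file does."       # stand-in for the user turn

completion = client.chat.completions.create(
    model="llama3-70b-8192",  # placeholder model id
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ],
)
print(completion.choices[0].message.content)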
@@ -237,12 +253,31 @@ def generate_response(message, session_id, model_name, history):
         )
 
         response = completion.choices[0].message.content
-
+
+        # For proper chat history handling
+        if isinstance(history, list) and history and isinstance(history[0], dict):
+            # History is in message format
+            history.append({"role": "user", "content": message})
+            history.append({"role": "assistant", "content": response})
+        else:
+            # Fallback for other formats
+            history.append({"role": "user", "content": message})
+            history.append({"role": "assistant", "content": response})
+
         return history
 
     except Exception as e:
         error_msg = f"Error generating response: {str(e)}"
-
+
+        # Handle different history formats
+        if isinstance(history, list):
+            if history and isinstance(history[0], dict):
+                history.append({"role": "user", "content": message})
+                history.append({"role": "assistant", "content": error_msg})
+            else:
+                history.append({"role": "user", "content": message})
+                history.append({"role": "assistant", "content": error_msg})
+
         return history
 
 # Functions to update PDF viewer
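Note that in both new blocks the if and else branches append identical dicts, so every path leaves history in the OpenAI-style messages shape. A minimal sketch of that shape, assuming the app's Chatbot component runs in messages mode (gr.Chatbot(type="messages")):

# Hedged sketch: the history structure generate_response builds.
history = []
history.append({"role": "user", "content": "What does the uploaded module export?"})
history.append({"role": "assistant", "content": "It exports a single parse() helper."})
# Each turn appends one user dict and one assistant dict, which
# messages-mode Gradio chatbots render directly.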
@@ -492,20 +527,20 @@ def perform_stack_search(query, tag, sort_by):
     except Exception as e:
         return f"Error searching Stack Overflow: {str(e)}"
 
-# Modify the
+# Modify the process_code_file function
 def process_code_file(file_obj):
-    """Process uploaded code files"""
+    """Process uploaded code files and store in FAISS index"""
     if file_obj is None:
         return None, "No file uploaded", {}
 
     try:
         # Handle both file objects and bytes objects
         if isinstance(file_obj, bytes):
-            content = file_obj.decode('utf-8')
+            content = file_obj.decode('utf-8', errors='replace')  # Added error handling
             file_name = "uploaded_file"
             file_extension = ".txt"  # Default extension
         else:
-            content = file_obj.read().decode('utf-8')
+            content = file_obj.read().decode('utf-8', errors='replace')  # Added error handling
             file_name = getattr(file_obj, 'name', 'uploaded_file')
             file_extension = Path(file_name).suffix.lower()
 
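errors='replace' substitutes U+FFFD for undecodable bytes instead of raising UnicodeDecodeError, so a stray non-UTF-8 upload no longer aborts processing. A quick runnable illustration:

# A lone 0xE9 byte (Latin-1 "é") is invalid UTF-8.
raw = b"caf\xe9 parser"
print(raw.decode("utf-8", errors="replace"))  # -> "caf� parser"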
@@ -518,17 +553,34 @@ def process_code_file(file_obj):
         session_id = None
         if embeddings:
             try:
-
-
+                print(f"Creating FAISS index for {file_name}...")
+                # Improved chunking for code files
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=500,  # Smaller chunks for code
+                    chunk_overlap=50,
+                    separators=["\n\n", "\n", " ", ""]
+                )
+                chunks = text_splitter.create_documents([content], metadatas=[{"filename": file_name, "language": language}])
+
+                # Add source metadata to help with retrieval
+                for i, chunk in enumerate(chunks):
+                    chunk.metadata["chunk_id"] = i
+                    chunk.metadata["source"] = file_name
+
+                # Create and store vectorstore
                 vectorstore = FAISS.from_documents(chunks, embeddings)
                 session_id = str(uuid.uuid4())
                 index_path = os.path.join(FAISS_INDEX_DIR, session_id)
                 vectorstore.save_local(index_path)
                 user_vectorstores[session_id] = vectorstore
+
+                # Add number of chunks to metrics for display
+                metrics["chunks"] = len(chunks)
+                print(f"Successfully created FAISS index with {len(chunks)} chunks.")
             except Exception as e:
                 print(f"Warning: Failed to create vectorstore: {e}")
 
-        return session_id, f"✅ Successfully analyzed {file_name}", metrics
+        return session_id, f"✅ Successfully analyzed {file_name} and stored in FAISS index", metrics
     except Exception as e:
         return None, f"Error processing file: {str(e)}", {}
 
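Indexes persisted with save_local can be restored in a later session. A hedged sketch, assuming a recent langchain_community release where load_local requires opting in to pickle-based deserialization (flag and import path vary by version):

# Hedged sketch, not from app.py: restore a previously saved per-session index.
import os
from langchain_community.vectorstores import FAISS  # assumed import path

index_path = os.path.join("faiss_indexes_tech_cpu", "<session_id>")  # placeholder session id
restored = FAISS.load_local(
    index_path,
    embeddings,  # must be the same embedding model used to build the index
    allow_dangerous_deserialization=True,  # required by newer LangChain versions
)
docs = restored.similarity_search("query text", k=5)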