CamiloVega committed on
Commit 23734f7 · verified · 1 Parent(s): 1f5453e

Update app.py

Files changed (1)
  1. app.py +37 -46
app.py CHANGED
@@ -25,7 +25,7 @@ model_name = "meta-llama/Llama-2-7b-hf"

 try:
     logger.info("Starting model initialization...")
-
+
     # Check CUDA availability
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
@@ -45,28 +45,31 @@ try:
     tokenizer.pad_token = tokenizer.eos_token
     logger.info("Tokenizer loaded successfully")

-    # Load model with basic configuration
+    # Load model with optimized configuration
     logger.info("Loading model...")
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         torch_dtype=torch.float16 if device == "cuda" else torch.float32,
         trust_remote_code=True,
         token=hf_token,
-        device_map="auto"
+        device_map="auto",
+        max_memory={0: "12GiB"} if device == "cuda" else None,
+        load_in_8bit=True if device == "cuda" else False
     )
     logger.info("Model loaded successfully")

-    # Create pipeline
+    # Create pipeline with improved parameters
     logger.info("Creating generation pipeline...")
     model_gen = pipeline(
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=256,
+        max_new_tokens=512,  # Increased for more detailed responses
         do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-        repetition_penalty=1.1,
+        temperature=0.8,  # Slightly increased for more creative responses
+        top_p=0.95,  # Increased for more varied responses
+        top_k=50,  # Added top_k for better response quality
+        repetition_penalty=1.2,  # Increased to reduce repetition
         device_map="auto"
     )
     logger.info("Pipeline created successfully")
@@ -75,9 +78,15 @@ except Exception as e:
     logger.error(f"Error during initialization: {str(e)}")
     raise

-# Configure system message
+# Improved system message with better context and guidelines
+system_message = """You are AQuaBot, an AI assistant focused on providing accurate and environmentally conscious information. Your responses should be:
+1. Clear and concise yet informative
+2. Based on verified information when discussing economic and financial topics
+3. Balanced and well-reasoned
+4. Mindful of environmental impact
+5. Professional but conversational in tone

-system_message = """You are a helpful AI assistant called AQuaBot. You provide direct, clear, and detailed answers to questions while being aware of environmental impact. Keep your responses natural and informative, but concise. Always provide context and explanations with your answers. Respond directly to questions without using any special tags or markers."""
+Maintain a helpful and knowledgeable demeanor while avoiding speculation. If you're unsure about something, acknowledge it openly."""

 @spaces.GPU(duration=60)
 @torch.inference_mode()
@@ -90,64 +99,46 @@ def generate_response(user_input, chat_history):
         input_water_consumption = calculate_water_consumption(user_input, True)
         total_water_consumption += input_water_consumption

-        # Create prompt with Llama 2 chat format
+        # Create a clean conversation history without [INST] tags
         conversation_history = ""
         if chat_history:
-            for message in chat_history:
-                # Remove any [INST] tags from the history
-                user_msg = message[0].replace("[INST]", "").replace("[/INST]", "").strip()
-                assistant_msg = message[1].replace("[INST]", "").replace("[/INST]", "").strip()
-                conversation_history += f"[INST] {user_msg} [/INST] {assistant_msg} "
-
-        prompt = f"<s>[INST] {system_message}\n\n{conversation_history}[INST] {user_input} [/INST]"
+            for user_msg, assistant_msg in chat_history:
+                conversation_history += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
+
+        # Create a clean prompt format
+        prompt = f"{system_message}\n\nConversation History:\n{conversation_history}\nUser: {user_input}\nAssistant:"

         logger.info("Generating model response...")
         outputs = model_gen(
             prompt,
-            max_new_tokens=256,
+            max_new_tokens=512,
             return_full_text=False,
             pad_token_id=tokenizer.eos_token_id,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
-            repetition_penalty=1.1
         )
         logger.info("Model response generated successfully")

-        # Clean up the response by removing any [INST] tags and trimming
+        # Clean up response and remove any remaining [INST] tags
         assistant_response = outputs[0]['generated_text'].strip()
-        assistant_response = assistant_response.replace("[INST]", "").replace("[/INST]", "").strip()
+        assistant_response = assistant_response.split('User:')[0].split('Assistant:')[-1].strip()

-        # If the response is too short, try to generate a more detailed one
-        if len(assistant_response.split()) < 10:
-            prompt += "\nPlease provide a more detailed answer with context and explanation."
-            outputs = model_gen(
-                prompt,
-                max_new_tokens=256,
-                return_full_text=False,
-                pad_token_id=tokenizer.eos_token_id,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-                repetition_penalty=1.1
-            )
-            assistant_response = outputs[0]['generated_text'].strip()
-            assistant_response = assistant_response.replace("[INST]", "").replace("[/INST]", "").strip()
+        # Add fact-check disclaimer for economic/financial responses
+        if any(keyword in user_input.lower() for keyword in ['invest', 'money', 'salary', 'cost', 'wage', 'economy']):
+            assistant_response += "\n\nNote: Financial information provided should be verified with current market data and professional advisors."

         # Calculate water consumption for output
         output_water_consumption = calculate_water_consumption(assistant_response, False)
         total_water_consumption += output_water_consumption

-        # Update chat history with the cleaned messages
+        # Update chat history
         chat_history.append([user_input, assistant_response])

-        # Prepare water consumption message
+        # Prepare water consumption message with improved styling
         water_message = f"""
         <div style="position: fixed; top: 20px; right: 20px;
                     background-color: white; padding: 15px;
-                    border: 2px solid #ff0000; border-radius: 10px;
-                    box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
-            <div style="color: #ff0000; font-size: 24px; font-weight: bold;">
+                    border: 2px solid #2196F3; border-radius: 10px;
+                    box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
+            <div style="color: #2196F3; font-size: 24px; font-weight: bold;">
                 💧 {total_water_consumption:.4f} ml
             </div>
             <div style="color: #666; font-size: 14px;">
@@ -160,7 +151,7 @@ def generate_response(user_input, chat_history):

     except Exception as e:
         logger.error(f"Error in generate_response: {str(e)}")
-        error_message = f"An error occurred: {str(e)}"
+        error_message = f"I apologize, but I encountered an error. Please try rephrasing your question."
         chat_history.append([user_input, error_message])
         return chat_history, show_water
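A note on the new loading arguments: `load_in_8bit=True` and `max_memory` only take effect when the bitsandbytes and accelerate packages are installed, and recent transformers releases expect the 8-bit flag to be wrapped in a `BitsAndBytesConfig` rather than passed directly to `from_pretrained`. A minimal sketch of that newer form, assuming a single CUDA device and the same `hf_token` already read in app.py:

    # Sketch only: the commit's 8-bit load expressed with BitsAndBytesConfig
    # (newer transformers API). Assumes bitsandbytes and accelerate are installed
    # and hf_token is the token defined earlier in app.py.
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    quant_config = BitsAndBytesConfig(load_in_8bit=True)  # replaces the bare load_in_8bit flag
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        quantization_config=quant_config,
        device_map="auto",
        max_memory={0: "12GiB"},  # same per-GPU cap used in the commit
        token=hf_token,
    )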
 
 
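Because the prompt now uses a plain "User:/Assistant:" transcript instead of Llama 2 [INST] tags, the cleanup step truncates the generation at the first hallucinated "User:" turn and drops any echoed "Assistant:" label (with return_full_text=False the prompt itself is already excluded from the output). A small illustration of that expression, using an invented generation string:

    # Sketch only: behaviour of the commit's cleanup expression on a sample string.
    # The generation below is made up for illustration, not produced by the app.
    generated = "Mostly from data-center cooling towers.\nUser: thanks!"
    cleaned = generated.split('User:')[0].split('Assistant:')[-1].strip()
    print(cleaned)  # -> "Mostly from data-center cooling towers."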