Update app.py
app.py CHANGED
```diff
@@ -31,7 +31,8 @@ class ModelManager:
         self.current_model = None
         self.current_tokenizer = None
         self.current_model_name = None
-
+        # Don't initialize CUDA in __init__
+        self.device = None
 
     def load_model(self, model_name):
         """Load model and free previous model's memory"""
```
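On ZeroGPU Spaces, CUDA must not be initialized in the main process, so probing devices in `__init__` can crash the app at startup; deferring `self.device` until a `@spaces.GPU` call is the safe pattern. The start of `load_model` (lines 38 to 45) is elided from the diff; below is a minimal sketch of the "free previous model's memory" step its docstring promises, assuming the usual drop-references/gc/empty_cache sequence (the helper name is hypothetical, not the Space's actual code):

```python
import gc
import torch

def unload_current_model(manager):
    # Hypothetical helper: drop references to the old model, then reclaim
    # Python-level and CUDA-level memory before loading the next one.
    if manager.current_model is not None:
        manager.current_model = None
        manager.current_tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
```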
```diff
@@ -46,16 +47,17 @@ class ModelManager:
                 model_name,
                 load_in_4bit=False,
                 torch_dtype=torch.bfloat16,
-                device_map=
-            )
+                device_map="auto"  # Let the model decide device mapping
+            )
             self.current_model_name = model_name
-            return f"Successfully loaded model: {model_name}
+            return f"Successfully loaded model: {model_name}"
         except Exception as e:
             return f"Error loading model: {str(e)}"
 
     def generate(self, prompt):
         """Helper method for generation"""
-        inputs = self.current_tokenizer(prompt, return_tensors="pt")
+        inputs = self.current_tokenizer(prompt, return_tensors="pt")
+        # Let device mapping happen automatically
        return inputs
 
 
```
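Two substantive fixes land in this hunk: `device_map` gets a concrete value, and the success message's f-string finally gets its closing quote (the old line was a SyntaxError). With `device_map="auto"`, accelerate places the weights, and the commit relies on that same machinery to route the CPU tensors returned by the tokenizer, which is why `generate()` no longer moves `inputs` manually. A standalone sketch of the corrected load path (the model name is a placeholder, not one of the Space's actual choices):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "org/model"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # accelerate decides where each weight lives
)

# Inputs can stay on CPU; accelerate's dispatch hooks align devices during
# forward, which is what the "let device mapping happen automatically"
# comment in generate() is counting on.
inputs = tokenizer("Hello", return_tensors="pt")
```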
```diff
@@ -79,7 +81,7 @@ Each response should be formatted as follows:
 }
 Ensure EVERY response strictly follows this JSON structure."""
 
-@spaces.GPU
+@spaces.GPU  # This decorator handles the GPU allocation
 def generate_response(model_name, system_instruction, user_input):
     """Generate response with GPU support and JSON formatting"""
     if model_manager.current_model_name != model_name:
```
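`spaces.GPU` is the ZeroGPU entry point: on a ZeroGPU Space a GPU is attached only for the duration of a decorated call, which is exactly why the commit also stopped touching CUDA in `__init__`. A minimal usage sketch; the optional `duration` argument exists in the `spaces` package, but the value here is an assumption:

```python
import spaces

@spaces.GPU(duration=120)  # hold the GPU for up to ~120 s per call
def run_on_gpu(prompt: str) -> str:
    # CUDA is initialized and visible only inside this function on ZeroGPU.
    ...
```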
```diff
@@ -88,7 +90,6 @@ def generate_response(model_name, system_instruction, user_input):
     if model_manager.current_model is None:
         return json.dumps({"error": "No model loaded. Please load a model first."}, indent=2)
 
-    # Prepare the prompt with explicit JSON formatting
     prompt = f"""### Instruction:
 {system_instruction}
 Remember to ALWAYS format your response as valid JSON.
```
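The removed comment was redundant; the template itself is the documentation. It follows the Alpaca-style `### Instruction:` layout and, a few lines further down, primes the completion with an opening brace so the model starts inside a JSON object. The middle of the template (lines 95 to 98) is elided from the diff; a plausible reconstruction, where the `### Input:` and `### Response:` sections are assumptions based on the visible fragments:

```python
prompt = f"""### Instruction:
{system_instruction}
Remember to ALWAYS format your response as valid JSON.

### Input:
{user_input}

### Response:
{{"""
```

Note that `{{` inside the f-string renders as a literal `{`, so the model's completion continues an already-opened JSON object.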
```diff
@@ -98,7 +99,6 @@ Remember to ALWAYS format your response as valid JSON.
 {{"""
 
     try:
-        # Get tokenized inputs using helper method
         inputs = model_manager.generate(prompt)
 
         meta_config = {
```
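Another redundant comment goes away here. More interesting is `meta_config`, whose keys (lines 105 to 111) are not shown in the diff. A plausible shape using only documented `GenerationConfig` fields; the specific values are assumptions:

```python
from transformers import GenerationConfig

meta_config = {
    "max_new_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,
    "pad_token_id": model_manager.current_tokenizer.eos_token_id,
}
generation_config = GenerationConfig(**meta_config)
```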
```diff
@@ -112,15 +112,14 @@ Remember to ALWAYS format your response as valid JSON.
         }
         generation_config = GenerationConfig(**meta_config)
 
-        # Generate response
         with torch.no_grad():
             outputs = model_manager.current_model.generate(
                 **inputs,
                 generation_config=generation_config
-        )
+            )
 
         decoded_output = model_manager.current_tokenizer.batch_decode(
-            outputs
+            outputs,
             skip_special_tokens=True
         )[0]
 
```
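The comma after `outputs` is the substantive fix: `outputs skip_special_tokens=True` is not valid Python, so the previous revision could not even be imported. The post-processing that turns `decoded_output` into the returned JSON (lines 126 to 139) is elided; a hypothetical sketch of that step, where the marker-splitting strategy is an assumption:

```python
import json

def extract_json(decoded_output: str) -> str:
    # decoded_output holds the prompt plus the completion; keep only the
    # text after the response marker, starting at its first brace.
    tail = decoded_output.split("### Response:")[-1]
    start = tail.find("{")
    if start == -1:
        raise ValueError("no JSON object in model output")
    parsed = json.loads(tail[start:])  # raises if the model drifted from JSON
    return json.dumps(parsed, indent=2)
```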
```diff
@@ -140,8 +139,7 @@ Remember to ALWAYS format your response as valid JSON.
     except Exception as e:
         return json.dumps({
             "error": f"Error generating response: {str(e)}",
-            "details": "An unexpected error occurred during generation"
-            "device_info": f"Model device: {model_manager.device}, Input device: {inputs.input_ids.device if inputs else 'unknown'}"
+            "details": "An unexpected error occurred during generation"
         }, indent=2)
 
 
```
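The two removed lines were another SyntaxError: with no comma after the "details" string, Python saw adjacent string literals followed by a stray colon. Dropping `device_info` also sidesteps two runtime traps, since `model_manager.device` is now `None` and `inputs` is unbound whenever tokenization itself raised. If that diagnostic were worth keeping, it would need the comma plus defensive lookups; a sketch of that alternative (which is not what the commit chose):

```python
import json

def error_payload(e, model_manager, inputs=None):
    # Hypothetical variant that keeps device_info, with the missing comma
    # fixed and a None-safe lookup for the input tensors.
    return json.dumps({
        "error": f"Error generating response: {str(e)}",
        "details": "An unexpected error occurred during generation",
        "device_info": f"Model device: {model_manager.device}, "
                       f"Input device: "
                       f"{inputs.input_ids.device if inputs is not None else 'unknown'}",
    }, indent=2)
```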