Update app.py
app.py CHANGED
```diff
@@ -31,7 +31,8 @@ class ModelManager:
         self.current_model = None
         self.current_tokenizer = None
         self.current_model_name = None
-
+        # Don't initialize CUDA in __init__
+        self.device = None
 
     def load_model(self, model_name):
         """Load model and free previous model's memory"""
```
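On ZeroGPU Spaces, CUDA must not be initialized in the main process, so probing devices in `__init__` can crash the app at startup; deferring `self.device` until a `@spaces.GPU` call is the safe pattern. The start of `load_model` (lines 38 to 45) is elided from the diff; below is a minimal sketch of the "free previous model's memory" step its docstring promises, assuming the usual drop-references/gc/empty_cache sequence (the helper name is hypothetical, not the Space's actual code):

```python
import gc
import torch

def unload_current_model(manager):
    # Hypothetical helper: drop references to the old model, then reclaim
    # Python-level and CUDA-level memory before loading the next one.
    if manager.current_model is not None:
        manager.current_model = None
        manager.current_tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
```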
```diff
@@ -46,16 +47,17 @@ class ModelManager:
                 model_name,
                 load_in_4bit=False,
                 torch_dtype=torch.bfloat16,
-                device_map=
-            )
+                device_map="auto"  # Let the model decide device mapping
+            )
             self.current_model_name = model_name
-            return f"Successfully loaded model: {model_name}
+            return f"Successfully loaded model: {model_name}"
         except Exception as e:
             return f"Error loading model: {str(e)}"
 
     def generate(self, prompt):
         """Helper method for generation"""
-        inputs = self.current_tokenizer(prompt, return_tensors="pt")
+        inputs = self.current_tokenizer(prompt, return_tensors="pt")
+        # Let device mapping happen automatically
        return inputs
 
 
```
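Two substantive fixes land in this hunk: `device_map` gets a concrete value, and the success message's f-string finally gets its closing quote (the old line was a SyntaxError). With `device_map="auto"`, accelerate places the weights, and the commit relies on that same machinery to route the CPU tensors returned by the tokenizer, which is why `generate()` no longer moves `inputs` manually. A standalone sketch of the corrected load path (the model name is a placeholder, not one of the Space's actual choices):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "org/model"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # accelerate decides where each weight lives
)

# Inputs can stay on CPU; accelerate's dispatch hooks align devices during
# forward, which is what the "let device mapping happen automatically"
# comment in generate() is counting on.
inputs = tokenizer("Hello", return_tensors="pt")
```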
```diff
@@ -79,7 +81,7 @@ Each response should be formatted as follows:
 }
 Ensure EVERY response strictly follows this JSON structure."""
 
-@spaces.GPU
+@spaces.GPU  # This decorator handles the GPU allocation
 def generate_response(model_name, system_instruction, user_input):
     """Generate response with GPU support and JSON formatting"""
     if model_manager.current_model_name != model_name:
```
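`spaces.GPU` is the ZeroGPU entry point: on a ZeroGPU Space a GPU is attached only for the duration of a decorated call, which is exactly why the commit also stopped touching CUDA in `__init__`. A minimal usage sketch; the optional `duration` argument exists in the `spaces` package, but the value here is an assumption:

```python
import spaces

@spaces.GPU(duration=120)  # hold the GPU for up to ~120 s per call
def run_on_gpu(prompt: str) -> str:
    # CUDA is initialized and visible only inside this function on ZeroGPU.
    ...
```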
```diff
@@ -88,7 +90,6 @@ def generate_response(model_name, system_instruction, user_input):
     if model_manager.current_model is None:
         return json.dumps({"error": "No model loaded. Please load a model first."}, indent=2)
 
-    # Prepare the prompt with explicit JSON formatting
     prompt = f"""### Instruction:
 {system_instruction}
 Remember to ALWAYS format your response as valid JSON.
```
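The removed comment was redundant; the template itself is the documentation. It follows the Alpaca-style `### Instruction:` layout and, a few lines further down, primes the completion with an opening brace so the model starts inside a JSON object. The middle of the template (lines 95 to 98) is elided from the diff; a plausible reconstruction, where the `### Input:` and `### Response:` sections are assumptions based on the visible fragments:

```python
prompt = f"""### Instruction:
{system_instruction}
Remember to ALWAYS format your response as valid JSON.

### Input:
{user_input}

### Response:
{{"""
```

Note that `{{` inside the f-string renders as a literal `{`, so the model's completion continues an already-opened JSON object.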
```diff
@@ -98,7 +99,6 @@ Remember to ALWAYS format your response as valid JSON.
 {{"""
 
     try:
-        # Get tokenized inputs using helper method
         inputs = model_manager.generate(prompt)
 
         meta_config = {
```
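Another redundant comment goes away here. More interesting is `meta_config`, whose keys (lines 105 to 111) are not shown in the diff. A plausible shape using only documented `GenerationConfig` fields; the specific values are assumptions:

```python
from transformers import GenerationConfig

meta_config = {
    "max_new_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,
    "pad_token_id": model_manager.current_tokenizer.eos_token_id,
}
generation_config = GenerationConfig(**meta_config)
```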
```diff
@@ -112,15 +112,14 @@ Remember to ALWAYS format your response as valid JSON.
         }
         generation_config = GenerationConfig(**meta_config)
 
-        # Generate response
         with torch.no_grad():
             outputs = model_manager.current_model.generate(
                 **inputs,
                 generation_config=generation_config
-        )
+            )
 
         decoded_output = model_manager.current_tokenizer.batch_decode(
-            outputs
+            outputs,
             skip_special_tokens=True
         )[0]
 
```
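The comma after `outputs` is the substantive fix: `outputs skip_special_tokens=True` is not valid Python, so the previous revision could not even be imported. The post-processing that turns `decoded_output` into the returned JSON (lines 126 to 139) is elided; a hypothetical sketch of that step, where the marker-splitting strategy is an assumption:

```python
import json

def extract_json(decoded_output: str) -> str:
    # decoded_output holds the prompt plus the completion; keep only the
    # text after the response marker, starting at its first brace.
    tail = decoded_output.split("### Response:")[-1]
    start = tail.find("{")
    if start == -1:
        raise ValueError("no JSON object in model output")
    parsed = json.loads(tail[start:])  # raises if the model drifted from JSON
    return json.dumps(parsed, indent=2)
```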
```diff
@@ -140,8 +139,7 @@ Remember to ALWAYS format your response as valid JSON.
     except Exception as e:
         return json.dumps({
             "error": f"Error generating response: {str(e)}",
-            "details": "An unexpected error occurred during generation"
-            "device_info": f"Model device: {model_manager.device}, Input device: {inputs.input_ids.device if inputs else 'unknown'}"
+            "details": "An unexpected error occurred during generation"
         }, indent=2)
 
 
```
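The two removed lines were another SyntaxError: with no comma after the "details" string, Python saw adjacent string literals followed by a stray colon. Dropping `device_info` also sidesteps two runtime traps, since `model_manager.device` is now `None` and `inputs` is unbound whenever tokenization itself raised. If that diagnostic were worth keeping, it would need the comma plus defensive lookups; a sketch of that alternative (which is not what the commit chose):

```python
import json

def error_payload(e, model_manager, inputs=None):
    # Hypothetical variant that keeps device_info, with the missing comma
    # fixed and a None-safe lookup for the input tensors.
    return json.dumps({
        "error": f"Error generating response: {str(e)}",
        "details": "An unexpected error occurred during generation",
        "device_info": f"Model device: {model_manager.device}, "
                       f"Input device: "
                       f"{inputs.input_ids.device if inputs is not None else 'unknown'}",
    }, indent=2)
```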