nafisneehal committed
Commit 41e7fe3 · verified · 1 Parent(s): 76d6bf4

Update app.py

Files changed (1)
  1. app.py +30 -48
app.py CHANGED
@@ -17,12 +17,11 @@ except Exception as e:
 MODEL_FILE = "model_links.txt"
 
 def load_model_links():
-    # """Load model links from file"""
-    # if not os.path.exists(MODEL_FILE):
-    #     # Create default file with some example models
-    #     with open(MODEL_FILE, "w") as f:
-    #         f.write("meta-llama/Llama-2-7b-chat-hf\n")
-    #         f.write("tiiuae/falcon-7b-instruct\n")
+    """Load model links from file"""
+    if not os.path.exists(MODEL_FILE):
+        # Create default file with some example models
+        with open(MODEL_FILE, "w") as f:
+            f.write("meta-llama/Llama-2-7b-chat-hf\n")
 
     with open(MODEL_FILE, "r") as f:
         return [line.strip() for line in f.readlines() if line.strip()]
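Note: assembled from the hunk above, the resulting function reads roughly as follows (a sketch; in app.py the os import and the MODEL_FILE constant live at module level as shown):

import os

MODEL_FILE = "model_links.txt"

def load_model_links():
    """Load model links from file, creating a default file on first run."""
    if not os.path.exists(MODEL_FILE):
        # Create default file with an example model
        with open(MODEL_FILE, "w") as f:
            f.write("meta-llama/Llama-2-7b-chat-hf\n")

    with open(MODEL_FILE, "r") as f:
        return [line.strip() for line in f.readlines() if line.strip()]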
@@ -32,7 +31,6 @@ class ModelManager:
         self.current_model = None
         self.current_tokenizer = None
         self.current_model_name = None
-        #self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
     def load_model(self, model_name):
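Note: the retained device line resolves the target device once at construction time. A minimal standalone illustration, assuming only torch is installed:

import torch

# Prefer the first CUDA GPU, fall back to CPU when CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)  # -> cuda:0 on a GPU Space, cpu otherwise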
@@ -48,34 +46,17 @@ class ModelManager:
                 model_name,
                 load_in_4bit=False,
                 torch_dtype=torch.bfloat16,
-                device_map="auto"
-            )
+                device_map={"": self.device}  # Changed this line
+            ).to(self.device)  # Added explicit device movement
             self.current_model_name = model_name
-            return f"Successfully loaded model: {model_name}"
+            return f"Successfully loaded model: {model_name} on {self.device}"
         except Exception as e:
             return f"Error loading model: {str(e)}"
 
-# Initialize model manager
-model_manager = ModelManager()
-
-# Default system message for JSON output
-default_system_message = """You are a helpful AI assistant. You must ALWAYS return your response in valid JSON format.
-Each response should be formatted as follows:
-
-{
-    "response": {
-        "main_answer": "Your primary response here",
-        "additional_details": "Any additional information or context",
-        "confidence": 0.0 to 1.0,
-        "tags": ["relevant", "tags", "here"]
-    },
-    "metadata": {
-        "response_type": "type of response",
-        "source": "basis of response if applicable"
-    }
-}
-
-Ensure EVERY response strictly follows this JSON structure."""
+    def generate(self, prompt):
+        """Helper method for generation"""
+        inputs = self.current_tokenizer(prompt, return_tensors="pt").to(self.device)
+        return inputs
 
 @spaces.GPU
 def generate_response(model_name, system_instruction, user_input):
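Note: this hunk pins the whole model to one device via device_map and adds a small tokenization helper that moves inputs to the same device. A hedged standalone sketch of the pattern (the model name is an example entry from model_links.txt; AutoModelForCausalLM and AutoTokenizer are the usual transformers entry points and are assumed to match app.py's existing imports):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "meta-llama/Llama-2-7b-chat-hf"  # example entry from model_links.txt

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map={"": device},  # empty key places every module on this one device
)
# The commit also calls .to(self.device) on the result; with a single-device
# device_map that move is effectively redundant, so it is omitted in this sketch.

# Tokenize onto the same device, mirroring the new ModelManager.generate() helper.
inputs = tokenizer("Hello", return_tensors="pt").to(device)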
@@ -93,20 +74,17 @@ Remember to ALWAYS format your response as valid JSON.
 ### Input:
 {user_input}
 ### Response:
-{{"""  # Note the opening curly brace to hint JSON response
+{{"""
 
     try:
-        # Ensure inputs are on the correct device
-        inputs = model_manager.current_tokenizer([prompt], return_tensors="pt")
-        # Move input_ids and attention_mask to the same device as the model
-        inputs = {k: v.to(model_manager.device) for k, v in inputs.items()}
-
-        # Generation configuration optimized for JSON output
+        # Get tokenized inputs using helper method
+        inputs = model_manager.generate(prompt)
+
         meta_config = {
             "do_sample": False,
             "temperature": 0.0,
             "max_new_tokens": 512,
-            "repetition_penalty": 1.2,
+            "repetition_penalty": 1.1,
             "use_cache": True,
             "pad_token_id": model_manager.current_tokenizer.eos_token_id,
             "eos_token_id": model_manager.current_tokenizer.eos_token_id
@@ -116,20 +94,20 @@ Remember to ALWAYS format your response as valid JSON.
         # Generate response
         with torch.no_grad():
             outputs = model_manager.current_model.generate(
-                input_ids=inputs['input_ids'],
-                attention_mask=inputs['attention_mask'],
+                **inputs,
                 generation_config=generation_config
-            )
-        decoded_output = model_manager.current_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+            ).to(model_manager.device)  # Ensure outputs are on correct device
+
+        decoded_output = model_manager.current_tokenizer.batch_decode(
+            outputs.to(model_manager.device),
+            skip_special_tokens=True
+        )[0]
+
         assistant_response = decoded_output.split("### Response:")[-1].strip()
 
-        # Clean up and validate JSON
         try:
-            # Find the last complete JSON object
             last_brace = assistant_response.rindex('}')
             assistant_response = assistant_response[:last_brace + 1]
-
-            # Parse and re-format JSON
             json_response = json.loads(assistant_response)
             return json.dumps(json_response, indent=2)
         except (json.JSONDecodeError, ValueError):
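Note: the unchanged clean-up lines in this hunk trim the decoded text at the last closing brace before parsing. A standalone illustration with a made-up model continuation:

import json

raw = '{"response": {"main_answer": "hi"}} and some trailing chatter'
last_brace = raw.rindex('}')          # position of the last closing brace
candidate = raw[:last_brace + 1]      # drop anything the model appended after it
print(json.dumps(json.loads(candidate), indent=2))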
@@ -141,9 +119,13 @@ Remember to ALWAYS format your response as valid JSON.
     except Exception as e:
         return json.dumps({
             "error": f"Error generating response: {str(e)}",
-            "details": "An unexpected error occurred during generation"
+            "details": "An unexpected error occurred during generation",
+            "device_info": f"Model device: {model_manager.device}, Input device: {inputs.input_ids.device if inputs else 'unknown'}"
         }, indent=2)
 
+
+
+
 # Gradio interface setup
 with gr.Blocks() as demo:
     gr.Markdown("# Chat Interface with Model Selection (JSON Output)")
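Note: with the new device_info field, a failed generation returns a payload shaped roughly like this (illustrative values):

import json

error_payload = {
    "error": "Error generating response: <exception message>",
    "details": "An unexpected error occurred during generation",
    "device_info": "Model device: cuda:0, Input device: cuda:0",
}
print(json.dumps(error_payload, indent=2))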
 