pradeep6kumar2024 committed
Commit 8920961 · 1 parent: c64b3d5

fixed issues

Files changed (5)
  1. app.py +1 -0
  2. app.py.bak +248 -0
  3. app_fixed.py +249 -0
  4. requirements.txt +1 -1
  5. update_app.py +15 -0
app.py CHANGED
@@ -245,4 +245,5 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
+    # Using the modern approach without queue method
     demo.launch(max_threads=1)  # Limit the number of worker threads
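The change relies on Gradio 4's concurrency API (`concurrency_limit` on the interface plus `max_threads` at launch) rather than the removed `queue(concurrency_count=...)` call. A minimal sketch of that pattern, assuming Gradio 4.x and using a stand-in `echo` function rather than anything from this Space:

```python
# Minimal sketch (assumes Gradio 4.x): concurrency is capped with
# `concurrency_limit` instead of the old queue(concurrency_count=...) call.
import gradio as gr

def echo(text: str) -> str:
    return text

demo = gr.Interface(
    fn=echo,
    inputs="text",
    outputs="text",
    concurrency_limit=1,  # process one request at a time
)

if __name__ == "__main__":
    demo.launch(max_threads=1)  # keep the worker thread pool small on CPU
```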
app.py.bak ADDED
@@ -0,0 +1,248 @@
+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+import time
+import gc
+import os
+import psutil
+
+# Configuration
+BASE_MODEL = "microsoft/phi-2"
+ADAPTER_MODEL = "pradeep6kumar2024/phi2-qlora-assistant"
+
+# Memory monitoring
+def get_memory_usage():
+    process = psutil.Process(os.getpid())
+    return process.memory_info().rss / (1024 * 1024)  # MB
+
+class ModelWrapper:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.loaded = False
+
+    def load_model(self):
+        if not self.loaded:
+            try:
+                # Force CPU usage
+                os.environ["CUDA_VISIBLE_DEVICES"] = ""
+                device = torch.device("cpu")
+
+                # Clear memory
+                gc.collect()
+
+                print(f"Memory before loading: {get_memory_usage():.2f} MB")
+
+                print("Loading tokenizer...")
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    BASE_MODEL,
+                    trust_remote_code=True,
+                    padding_side="left"
+                )
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+                print(f"Memory after tokenizer: {get_memory_usage():.2f} MB")
+
+                print("Loading base model...")
+                base_model = AutoModelForCausalLM.from_pretrained(
+                    BASE_MODEL,
+                    torch_dtype=torch.float32,
+                    device_map="cpu",
+                    trust_remote_code=True,
+                    use_flash_attention_2=False,
+                    low_cpu_mem_usage=True,
+                    offload_folder="offload"
+                )
+
+                print(f"Memory after base model: {get_memory_usage():.2f} MB")
+
+                print("Loading LoRA adapter...")
+                self.model = PeftModel.from_pretrained(
+                    base_model,
+                    ADAPTER_MODEL,
+                    torch_dtype=torch.float32,
+                    device_map="cpu"
+                )
+
+                # Free up memory
+                del base_model
+                gc.collect()
+
+                print(f"Memory after adapter: {get_memory_usage():.2f} MB")
+
+                self.model.eval()
+                print("Model loading complete!")
+                self.loaded = True
+            except Exception as e:
+                print(f"Error during model loading: {str(e)}")
+                raise
+
+    def generate_response(self, prompt, max_length=256, temperature=0.7, top_p=0.9):
+        if not self.loaded:
+            self.load_model()
+
+        try:
+            # Use shorter prompts to save memory
+            if "function" in prompt.lower() and "python" in prompt.lower():
+                enhanced_prompt = f"""Write Python function: {prompt}"""
+            elif any(word in prompt.lower() for word in ["explain", "what is", "how does", "describe"]):
+                enhanced_prompt = f"""Explain briefly: {prompt}"""
+            else:
+                enhanced_prompt = prompt
+
+            print(f"Enhanced prompt: {enhanced_prompt}")
+
+            # Tokenize input with shorter max length
+            inputs = self.tokenizer(
+                enhanced_prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=256,  # Reduced for memory
+                padding=True
+            ).to("cpu")
+
+            # Generate with minimal parameters
+            start_time = time.time()
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=min(max_length, 256),  # Strict limit
+                    min_length=10,  # Reduced minimum
+                    temperature=min(0.5, temperature),
+                    top_p=min(0.85, top_p),
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
+                    num_return_sequences=1,
+                    early_stopping=True,
+                    num_beams=1,  # Greedy decoding to save memory
+                    length_penalty=0.6
+                )
+
+            # Decode response
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Clean up the response
+            if response.startswith(enhanced_prompt):
+                response = response[len(enhanced_prompt):].strip()
+
+            # Basic cleanup only
+            response = response.replace("Human:", "").replace("Assistant:", "")
+
+            # Ensure code examples are properly formatted
+            if "```python" not in response and "def " in response:
+                response = "```python\n" + response + "\n```"
+
+            # Simple validation
+            if len(response.strip()) < 10:
+                if "function" in prompt.lower():
+                    fallback_response = """```python
+def add_numbers(a, b):
+    return a + b
+```"""
+                else:
+                    fallback_response = "I apologize, but I couldn't generate a response. Please try with a simpler prompt."
+
+                response = fallback_response
+
+            # Clear memory after generation
+            gc.collect()
+
+            generation_time = time.time() - start_time
+            return response, generation_time
+        except Exception as e:
+            print(f"Error during generation: {str(e)}")
+            raise
+
+# Initialize model wrapper
+model_wrapper = ModelWrapper()
+
+def generate_text(prompt, max_length=256, temperature=0.5, top_p=0.85):
+    """Gradio interface function"""
+    try:
+        if not prompt.strip():
+            return "Please enter a prompt."
+
+        response, gen_time = model_wrapper.generate_response(
+            prompt,
+            max_length=max_length,
+            temperature=temperature,
+            top_p=top_p
+        )
+        return f"Generated in {gen_time:.2f} seconds:\n\n{response}"
+    except Exception as e:
+        print(f"Error in generate_text: {str(e)}")
+        return f"Error generating response: {str(e)}\nPlease try again with a shorter prompt."
+
+# Create a very lightweight Gradio interface
+demo = gr.Interface(
+    fn=generate_text,
+    inputs=[
+        gr.Textbox(
+            label="Enter your prompt",
+            placeholder="Type your prompt here...",
+            lines=3
+        ),
+        gr.Slider(
+            minimum=64,
+            maximum=256,
+            value=192,
+            step=32,
+            label="Maximum Length",
+            info="Keep this low for CPU"
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=0.7,
+            value=0.4,
+            step=0.1,
+            label="Temperature",
+            info="Lower is better for CPU"
+        ),
+        gr.Slider(
+            minimum=0.5,
+            maximum=0.9,
+            value=0.8,
+            step=0.1,
+            label="Top P",
+            info="Controls diversity"
+        ),
+    ],
+    outputs=gr.Textbox(label="Generated Response", lines=6),
+    title="Phi-2 QLoRA Assistant (CPU-Optimized)",
+    description="""This is a lightweight CPU version of the fine-tuned Phi-2 model.
+
+    Tips:
+    - Keep prompts short and specific
+    - Use lower maximum length (128-192) for faster responses
+    - Use lower temperature (0.3-0.5) for more reliable responses
+    """,
+    examples=[
+        [
+            "Write a Python function to calculate factorial",
+            192,
+            0.4,
+            0.8
+        ],
+        [
+            "Explain machine learning simply",
+            192,
+            0.4,
+            0.8
+        ],
+        [
+            "Write a short email to schedule a meeting",
+            192,
+            0.4,
+            0.8
+        ]
+    ],
+    cache_examples=False,
+    concurrency_limit=1  # Use the correct parameter for limiting concurrency
+)
+
+if __name__ == "__main__":
+    demo.launch(max_threads=1)  # Limit the number of worker threads
app_fixed.py ADDED
@@ -0,0 +1,249 @@
+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+import time
+import gc
+import os
+import psutil
+
+# Configuration
+BASE_MODEL = "microsoft/phi-2"
+ADAPTER_MODEL = "pradeep6kumar2024/phi2-qlora-assistant"
+
+# Memory monitoring
+def get_memory_usage():
+    process = psutil.Process(os.getpid())
+    return process.memory_info().rss / (1024 * 1024)  # MB
+
+class ModelWrapper:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.loaded = False
+
+    def load_model(self):
+        if not self.loaded:
+            try:
+                # Force CPU usage
+                os.environ["CUDA_VISIBLE_DEVICES"] = ""
+                device = torch.device("cpu")
+
+                # Clear memory
+                gc.collect()
+
+                print(f"Memory before loading: {get_memory_usage():.2f} MB")
+
+                print("Loading tokenizer...")
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    BASE_MODEL,
+                    trust_remote_code=True,
+                    padding_side="left"
+                )
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+                print(f"Memory after tokenizer: {get_memory_usage():.2f} MB")
+
+                print("Loading base model...")
+                base_model = AutoModelForCausalLM.from_pretrained(
+                    BASE_MODEL,
+                    torch_dtype=torch.float32,
+                    device_map="cpu",
+                    trust_remote_code=True,
+                    use_flash_attention_2=False,
+                    low_cpu_mem_usage=True,
+                    offload_folder="offload"
+                )
+
+                print(f"Memory after base model: {get_memory_usage():.2f} MB")
+
+                print("Loading LoRA adapter...")
+                self.model = PeftModel.from_pretrained(
+                    base_model,
+                    ADAPTER_MODEL,
+                    torch_dtype=torch.float32,
+                    device_map="cpu"
+                )
+
+                # Free up memory
+                del base_model
+                gc.collect()
+
+                print(f"Memory after adapter: {get_memory_usage():.2f} MB")
+
+                self.model.eval()
+                print("Model loading complete!")
+                self.loaded = True
+            except Exception as e:
+                print(f"Error during model loading: {str(e)}")
+                raise
+
+    def generate_response(self, prompt, max_length=256, temperature=0.7, top_p=0.9):
+        if not self.loaded:
+            self.load_model()
+
+        try:
+            # Use shorter prompts to save memory
+            if "function" in prompt.lower() and "python" in prompt.lower():
+                enhanced_prompt = f"""Write Python function: {prompt}"""
+            elif any(word in prompt.lower() for word in ["explain", "what is", "how does", "describe"]):
+                enhanced_prompt = f"""Explain briefly: {prompt}"""
+            else:
+                enhanced_prompt = prompt
+
+            print(f"Enhanced prompt: {enhanced_prompt}")
+
+            # Tokenize input with shorter max length
+            inputs = self.tokenizer(
+                enhanced_prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=256,  # Reduced for memory
+                padding=True
+            ).to("cpu")
+
+            # Generate with minimal parameters
+            start_time = time.time()
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=min(max_length, 256),  # Strict limit
+                    min_length=10,  # Reduced minimum
+                    temperature=min(0.5, temperature),
+                    top_p=min(0.85, top_p),
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
+                    num_return_sequences=1,
+                    early_stopping=True,
+                    num_beams=1,  # Greedy decoding to save memory
+                    length_penalty=0.6
+                )
+
+            # Decode response
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Clean up the response
+            if response.startswith(enhanced_prompt):
+                response = response[len(enhanced_prompt):].strip()
+
+            # Basic cleanup only
+            response = response.replace("Human:", "").replace("Assistant:", "")
+
+            # Ensure code examples are properly formatted
+            if "```python" not in response and "def " in response:
+                response = "```python\n" + response + "\n```"
+
+            # Simple validation
+            if len(response.strip()) < 10:
+                if "function" in prompt.lower():
+                    fallback_response = """```python
+def add_numbers(a, b):
+    return a + b
+```"""
+                else:
+                    fallback_response = "I apologize, but I couldn't generate a response. Please try with a simpler prompt."
+
+                response = fallback_response
+
+            # Clear memory after generation
+            gc.collect()
+
+            generation_time = time.time() - start_time
+            return response, generation_time
+        except Exception as e:
+            print(f"Error during generation: {str(e)}")
+            raise
+
+# Initialize model wrapper
+model_wrapper = ModelWrapper()
+
+def generate_text(prompt, max_length=256, temperature=0.5, top_p=0.85):
+    """Gradio interface function"""
+    try:
+        if not prompt.strip():
+            return "Please enter a prompt."
+
+        response, gen_time = model_wrapper.generate_response(
+            prompt,
+            max_length=max_length,
+            temperature=temperature,
+            top_p=top_p
+        )
+        return f"Generated in {gen_time:.2f} seconds:\n\n{response}"
+    except Exception as e:
+        print(f"Error in generate_text: {str(e)}")
+        return f"Error generating response: {str(e)}\nPlease try again with a shorter prompt."
+
+# Create a very lightweight Gradio interface
+demo = gr.Interface(
+    fn=generate_text,
+    inputs=[
+        gr.Textbox(
+            label="Enter your prompt",
+            placeholder="Type your prompt here...",
+            lines=3
+        ),
+        gr.Slider(
+            minimum=64,
+            maximum=256,
+            value=192,
+            step=32,
+            label="Maximum Length",
+            info="Keep this low for CPU"
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=0.7,
+            value=0.4,
+            step=0.1,
+            label="Temperature",
+            info="Lower is better for CPU"
+        ),
+        gr.Slider(
+            minimum=0.5,
+            maximum=0.9,
+            value=0.8,
+            step=0.1,
+            label="Top P",
+            info="Controls diversity"
+        ),
+    ],
+    outputs=gr.Textbox(label="Generated Response", lines=6),
+    title="Phi-2 QLoRA Assistant (CPU-Optimized)",
+    description="""This is a lightweight CPU version of the fine-tuned Phi-2 model.
+
+    Tips:
+    - Keep prompts short and specific
+    - Use lower maximum length (128-192) for faster responses
+    - Use lower temperature (0.3-0.5) for more reliable responses
+    """,
+    examples=[
+        [
+            "Write a Python function to calculate factorial",
+            192,
+            0.4,
+            0.8
+        ],
+        [
+            "Explain machine learning simply",
+            192,
+            0.4,
+            0.8
+        ],
+        [
+            "Write a short email to schedule a meeting",
+            192,
+            0.4,
+            0.8
+        ]
+    ],
+    cache_examples=False,
+    concurrency_limit=1  # Use the correct parameter for limiting concurrency
+)
+
+if __name__ == "__main__":
+    # Using the modern approach without queue method
+    demo.launch(max_threads=1)  # Limit the number of worker threads
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-gradio>=4.19.2
+gradio==4.44.1
 torch>=2.0.0
 transformers>=4.36.0
 peft>=0.7.0
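Pinning `gradio==4.44.1` (instead of the previous `>=4.19.2` floor) keeps the Space on a release where the `concurrency_limit` parameter used in `app.py` is available. A quick sanity check, shown as a sketch that assumes it runs inside the Space's environment:

```python
# Sketch: confirm the installed Gradio matches the pin in requirements.txt.
import gradio

print("gradio", gradio.__version__)
assert gradio.__version__ == "4.44.1", f"expected 4.44.1, got {gradio.__version__}"
```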
update_app.py ADDED
@@ -0,0 +1,15 @@
+import os
+import shutil
+
+# Backup the original app.py
+if os.path.exists('app.py'):
+    print("Backing up original app.py to app.py.bak")
+    shutil.copy('app.py', 'app.py.bak')
+
+# Copy the fixed version to app.py
+if os.path.exists('app_fixed.py'):
+    print("Replacing app.py with fixed version")
+    shutil.copy('app_fixed.py', 'app.py')
+    print("Done! The app.py file has been updated.")
+else:
+    print("Error: app_fixed.py not found!")
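The helper is meant to be run once from the repository root (`python update_app.py`). One optional refinement, shown only as a sketch and not part of this commit, is to avoid overwriting a backup that already exists:

```python
# Sketch (not part of the commit): same swap, but keep the first backup intact.
import os
import shutil

if os.path.exists('app.py') and not os.path.exists('app.py.bak'):
    print("Backing up original app.py to app.py.bak")
    shutil.copy('app.py', 'app.py.bak')

if os.path.exists('app_fixed.py'):
    print("Replacing app.py with fixed version")
    shutil.copy('app_fixed.py', 'app.py')
else:
    print("Error: app_fixed.py not found!")
```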