amaltese committed on
Commit f3b5025 · verified · 1 Parent(s): e198d69

Update app.py

Files changed (1)
  1. app.py +303 -11
app.py CHANGED
@@ -1,8 +1,30 @@
- # At the top of your file, add:
+ import gradio as gr
  import os
+ import torch
+ import json
+ import pandas as pd
+ from datasets import Dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TrainingArguments,
+     Trainer,
+     DataCollatorForLanguageModeling
+ )
+ from peft import (
+     LoraConfig,
+     get_peft_model,
+     prepare_model_for_kbit_training,
+     PeftModel
+ )
+ import spaces
  from huggingface_hub import login

- # Get token from environment variable
+ # Set environment variable for cache directory
+ os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
+ os.makedirs('/tmp/hf_cache', exist_ok=True)
+ 
+ # Get token from environment variable and log in
  hf_token = os.environ.get("HF_TOKEN")
  if hf_token:
      login(token=hf_token)
@@ -10,13 +32,283 @@ if hf_token:
  else:
      print("No Hugging Face token found. You may encounter access issues with gated models.")

- # Then modify your model loading code to include the token:
- tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+ def sample_from_csv(csv_file, sample_size=100):
+     """Sample from CSV file and format for training"""
+     df = pd.read_csv(csv_file)
+ 
+     # Display CSV info
+     print(f"CSV columns: {df.columns.tolist()}")
+     print(f"Total rows in CSV: {len(df)}")
+ 
+     # Try to identify teacher and student columns
+     teacher_col = None
+     student_col = None
+ 
+     for col in df.columns:
+         col_lower = col.lower()
+         if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
+             teacher_col = col
+         elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
+             student_col = col
+ 
+     # If we couldn't identify columns, use the first two
+     if teacher_col is None or student_col is None:
+         teacher_col = df.columns[0]
+         student_col = df.columns[1]
+         print(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
+     else:
+         print(f"Identified columns: {teacher_col} (teacher) and {student_col} (student)")
+ 
+     # Sample rows
+     if sample_size >= len(df):
+         sampled_df = df
+     else:
+         sampled_df = df.sample(n=sample_size, random_state=42)
+ 
+     # Format data
+     texts = []
+     for _, row in sampled_df.iterrows():
+         teacher_text = str(row[teacher_col]).strip()
+         student_text = str(row[student_col]).strip()
+ 
+         # Skip rows with empty values
+         if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
+             continue
+ 
+         # Format according to the document format:
+         # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
+         formatted_text = f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
+         texts.append(formatted_text)
+ 
+     print(f"Created {len(texts)} formatted examples")
+     return Dataset.from_dict({"text": texts})
+ 
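+ # Illustrative example (hypothetical column names and student reply): a CSV with the
+ # columns "teacher,student" and a row such as
+ #     "How was the Math exam?","It was tough, but I think I did well."
+ # is turned by sample_from_csv() into the training string
+ #     <s> [INST] Teacher ** How was the Math exam? [/INST] Student** It was tough, but I think I did well. </s>
+ 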
+ @spaces.GPU
+ def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
+     """Fine-tune the model and return results"""
+     # Check GPU
+     if torch.cuda.is_available():
+         print(f"GPU available: {torch.cuda.get_device_name(0)}")
+         device = torch.device("cuda")
+     else:
+         print("No GPU available, fine-tuning will be extremely slow!")
+         device = torch.device("cpu")
+ 
+     # Sample data
+     progress(0.1, "Sampling data from CSV...")
+     dataset = sample_from_csv(csv_file, sample_size)
+ 
+     # Split dataset
+     dataset_split = dataset.train_test_split(test_size=0.1)
+ 
+     # Load tokenizer
+     progress(0.2, "Loading tokenizer...")
+ 
+     # Try the non-gated Mistral model first
+     model_name = "mistralai/Mistral-7B-Instruct-v0.2"
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+         print(f"Successfully loaded tokenizer for {model_name}")
+     except Exception as e:
+         print(f"Error loading {model_name}: {e}")
+         print("Falling back to original Mistral model with token authentication...")
+         model_name = "mistralai/Mistral-7B-v0.1"
+         tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+ 
+     tokenizer.pad_token = tokenizer.eos_token
+ 
+     # Tokenize dataset
+     def tokenize_function(examples):
+         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
+ 
+     progress(0.3, "Tokenizing dataset...")
+     tokenized_datasets = dataset_split.map(tokenize_function, batched=True)
+ 
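+     # Note: padding="max_length" pads every example to a fixed 512 tokens, and
+     # truncation=True drops anything beyond 512 tokens.
+ 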
+     # Load model with LoRA configuration
+     progress(0.4, "Loading model...")
+     lora_config = LoraConfig(
+         r=8,
+         lora_alpha=16,
+         target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+         lora_dropout=0.05,
+         bias="none",
+         task_type="CAUSAL_LM"
+     )
+ 
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=torch.float16,
+         device_map="auto",
+         token=hf_token,
+     )
+ 
+     # Prepare model for LoRA training
+     model = prepare_model_for_kbit_training(model)
+     model = get_peft_model(model, lora_config)
+ 
+     # Print model info
+     print(f"Model loaded: {model_name}")
+     model_params = sum(p.numel() for p in model.parameters())
+     print(f"Model parameters: {model_params:,}")
+ 
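+     # Note: prepare_model_for_kbit_training() is usually paired with a 4-bit/8-bit
+     # quantized model; here the base model is loaded in fp16, and the call is kept
+     # to freeze the base weights before the LoRA adapters are attached.
+ 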
+     # Training arguments
+     output_dir = "mistral7b_finetuned"
+     training_args = TrainingArguments(
+         output_dir=output_dir,
+         num_train_epochs=num_epochs,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=4,
+         save_steps=50,
+         logging_steps=10,
+         learning_rate=2e-4,
+         weight_decay=0.001,
+         fp16=True,
+         warmup_steps=50,
+         lr_scheduler_type="cosine",
+         report_to="none",  # Disable wandb
+     )
+ 
+     # Initialize trainer
+     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_datasets["train"],
+         eval_dataset=tokenized_datasets["test"],
+         data_collator=data_collator,
+     )
+ 
+     # Train model
+     progress(0.5, "Training model...")
+     trainer.train()
+ 
+     # Save model
+     progress(0.9, "Saving model...")
+     trainer.model.save_pretrained(output_dir)
+     tokenizer.save_pretrained(output_dir)
+ 
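+     # Note: trainer.model is the PEFT-wrapped model, so save_pretrained() above
+     # writes only the LoRA adapter weights and adapter config to output_dir,
+     # not the full 7B base model.
+ 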
+     # Test with sample prompts
+     progress(0.95, "Testing model...")
+     test_prompts = [
+         "How was the Math exam?",
+         "Good morning students! How are you all?",
+         "What should you do if you get into a fight with a friend?",
+         "Did you complete your science project?",
+         "What did you learn in class today?"
+     ]
+ 
+     # Load the fine-tuned model for inference
+     fine_tuned_model = PeftModel.from_pretrained(
+         model,
+         output_dir,
+         device_map="auto",
+     )
+ 
+     # Generate responses
+     results = []
+     for prompt in test_prompts:
+         formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
+         inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
+ 
+         with torch.no_grad():
+             outputs = fine_tuned_model.generate(
+                 **inputs,
+                 max_length=200,
+                 temperature=0.7,
+                 top_p=0.95,
+                 do_sample=True,
+             )
+ 
+         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         student_part = response.split("Student**")[1].strip() if "Student**" in response else response
+ 
+         results.append({
+             "prompt": prompt,
+             "response": student_part
+         })
+ 
+     # Save results
+     with open("test_results.json", "w") as f:
+         json.dump(results, f, indent=2)
+ 
+     progress(1.0, "Completed!")
+     return results
+ 
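+ # Note on the generation loop above: max_length=200 caps prompt and completion
+ # together, so long teacher prompts leave fewer tokens for the student reply
+ # (max_new_tokens is the knob that caps only newly generated tokens).
+ 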
+ # Define Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")
+ 
+     with gr.Tab("System Check"):
+         check_btn = gr.Button("Check GPU and Authentication Status")
+         system_output = gr.Textbox(label="System Status", lines=5)
+ 
+         @spaces.GPU
+         def check_system():
+             status = []
+             # Check GPU
+             if torch.cuda.is_available():
+                 status.append(f"✅ GPU AVAILABLE: {torch.cuda.get_device_name(0)}")
+                 gpu_memory = f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
+                 status.append(gpu_memory)
+             else:
+                 status.append("❌ NO GPU DETECTED.")
+ 
+             # Check HF token
+             if os.environ.get("HF_TOKEN"):
+                 status.append("✅ Hugging Face token found")
+             else:
+                 status.append("❌ No Hugging Face token found. You may encounter access issues with gated models.")
+ 
+             # Check if we can access Mistral model
+             try:
+                 from huggingface_hub import model_info
+                 info = model_info("mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
+                 status.append(f"✅ Access to Mistral model verified: {info.modelId}")
+             except Exception as e:
+                 status.append(f"❌ Cannot access Mistral model: {str(e)}")
+ 
+             return "\n".join(status)
+ 
+         check_btn.click(check_system, inputs=[], outputs=[system_output])
+ 
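+     # Note: the model_info() check above reuses the module-level hf_token read at
+     # startup, while the token status line re-reads os.environ at click time.
+ 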
+     with gr.Tab("Fine-tune Model"):
+         with gr.Row():
+             csv_input = gr.File(label="Upload Teacher-Student CSV")
+ 
+         with gr.Row():
+             sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
+             epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
+ 
+         with gr.Row():
+             start_btn = gr.Button("Start Fine-Tuning")
+ 
+         with gr.Row():
+             output = gr.JSON(label="Results")
+ 
+         start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])
+ 
+     with gr.Tab("About"):
+         gr.Markdown("""
+         ## Fine-Tuning Mistral 7B for Student Bot
+ 
+         This app fine-tunes the Mistral 7B model to respond like a student to teacher prompts.
+ 
+         ### Requirements
+         - CSV file with teacher-student conversation pairs
+         - GPU acceleration (provided by this Space)
+         - Hugging Face authentication for accessing gated models
+ 
+         ### Process
+         1. Upload your CSV file
+         2. Set the sample size and number of epochs
+         3. Click "Start Fine-Tuning"
+         4. View test results with sample prompts
+ 
+         ### Important Notes
+         - The app first tries Mistral-7B-Instruct-v0.2, which is not gated
+         - If that fails, it falls back to the original Mistral-7B-v0.1 model (which requires authentication)
+         - Fine-tuning can take several hours, depending on your sample size and number of epochs
+         """)

- # And later:
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     torch_dtype=torch.float16,
-     device_map="auto",
-     token=hf_token
- )
+ # Launch app
+ demo.launch()