amaltese committed
Commit a4467aa · verified · 1 Parent(s): de78a7b

Update app.py

Files changed (1)
  1. app.py +205 -260
app.py CHANGED
@@ -1,307 +1,252 @@
  import gradio as gr
- import os
- import torch
- import json
  import pandas as pd
+ import torch
+ import os
  from datasets import Dataset
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TrainingArguments,
-     Trainer,
-     DataCollatorForLanguageModeling
- )
- from peft import (
-     LoraConfig,
-     get_peft_model,
-     prepare_model_for_kbit_training,
-     PeftModel
- )
- import spaces
- from huggingface_hub import login
-
- # Set environment variable for cache directory
- os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
- os.makedirs('/tmp/hf_cache', exist_ok=True)
-
- # Get token from environment variable and log in
- hf_token = os.environ.get("HF_TOKEN")
- if hf_token:
-     login(token=hf_token)
-     print("Successfully logged in to Hugging Face Hub")
- else:
-     print("No Hugging Face token found. You may encounter access issues with gated models.")
-
- def sample_from_csv(csv_file, sample_size=100):
-     """Sample from CSV file and format for training"""
-     df = pd.read_csv(csv_file)
-
-     # Display CSV info
-     print(f"CSV columns: {df.columns.tolist()}")
-     print(f"Total rows in CSV: {len(df)}")
-
-     # Try to identify teacher and student columns
-     teacher_col = None
-     student_col = None
-
-     for col in df.columns:
-         col_lower = col.lower()
-         if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
-             teacher_col = col
-         elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
-             student_col = col
-
-     # If we couldn't identify columns, use the first two
-     if teacher_col is None or student_col is None:
-         teacher_col = df.columns[0]
-         student_col = df.columns[1]
-         print(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
-     else:
-         print(f"Identified columns: {teacher_col} (teacher) and {student_col} (student)")
-
-     # Sample rows
-     if sample_size >= len(df):
-         sampled_df = df
-     else:
-         sampled_df = df.sample(n=sample_size, random_state=42)
-
-     # Format data
-     texts = []
-     for _, row in sampled_df.iterrows():
-         teacher_text = str(row[teacher_col]).strip()
-         student_text = str(row[student_col]).strip()
-
-         # Skip rows with empty values
-         if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
-             continue
-
-         # Format according to the document format:
-         # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
-         formatted_text = f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
-         texts.append(formatted_text)
-
-     print(f"Created {len(texts)} formatted examples")
-     return Dataset.from_dict({"text": texts})
-
- @spaces.GPU
- def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
-     """Fine-tune the model and return results"""
-     # Check GPU
-     if torch.cuda.is_available():
-         print(f"GPU available: {torch.cuda.get_device_name(0)}")
-         device = torch.device("cuda")
-     else:
-         print("No GPU available, fine-tuning will be extremely slow!")
-         device = torch.device("cpu")
-
-     # Sample data
-     progress(0.1, "Sampling data from CSV...")
-     dataset = sample_from_csv(csv_file, sample_size)
-
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+ import spaces  # Import the spaces library for HF Spaces integration
+
+ # Initialize logging
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Function to load and process data
+ def load_data(csv_file):
+     try:
+         df = pd.read_csv(csv_file)
+         logger.info(f"CSV columns: {df.columns.tolist()}")
+         logger.info(f"Total rows in CSV: {len(df)}")
+         return df
+     except Exception as e:
+         logger.error(f"Error loading CSV: {e}")
+         return None
+
+ # Function to prepare dataset
+ def prepare_dataset(df, teacher_col, student_col, num_samples=100):
+     # Extract and format data
+     logger.info(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
+
+     formatted_data = []
+     for i in range(min(num_samples, len(df))):
+         teacher_text = str(df.iloc[i][teacher_col])
+         student_text = str(df.iloc[i][student_col])
+
+         # Create prompt
+         formatted_text = f"### Teacher: {teacher_text}\n### Student: {student_text}"
+         formatted_data.append({"text": formatted_text})
+
+     logger.info(f"Created {len(formatted_data)} formatted examples")
+
+     # Create dataset
+     dataset = Dataset.from_list(formatted_data)
+
      # Split dataset
-     dataset_split = dataset.train_test_split(test_size=0.1)
-
-     # Load tokenizer
-     progress(0.2, "Loading tokenizer...")
-
-     # Use only the original Mistral model
-     model_name = "mistralai/Mistral-7B-v0.1"
-     print(f"Using model: {model_name}")
-
-     tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-     tokenizer.pad_token = tokenizer.eos_token
-
-     # Tokenize dataset
+     train_val_split = dataset.train_test_split(test_size=0.1, seed=42)
+
+     return train_val_split
+
+ # Function to tokenize data
+ def tokenize_data(dataset, tokenizer, max_length=512):
      def tokenize_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-
-     progress(0.3, "Tokenizing dataset...")
-     tokenized_datasets = dataset_split.map(tokenize_function, batched=True)
-
-     # Load model with LoRA configuration
-     progress(0.4, "Loading model...")
-     lora_config = LoraConfig(
-         r=8,
-         lora_alpha=16,
-         target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-         lora_dropout=0.05,
-         bias="none",
-         task_type="CAUSAL_LM"
-     )
-
+         return tokenizer(
+             examples["text"],
+             truncation=True,
+             max_length=max_length,
+             padding="max_length"
+         )
+
+     tokenized_dataset = dataset.map(tokenize_function, batched=True)
+     return tokenized_dataset
+
+ # Main fine-tuning function with memory optimizations
+ def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
+     """
+     Fine-tune a model with optimized memory settings to prevent CUDA OOM errors.
+     """
+     logger.info(f"Using model: {model_id}")
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # ============ MEMORY OPTIMIZATION 1: REDUCED BATCH SIZE ============
+     # A smaller batch size dramatically reduces memory usage during training
+     # For 7B models on limited VRAM (40GB), values between 1-8 are recommended
+     actual_batch_size = 8 if batch_size is None else min(batch_size, 8)
+     logger.info(f"Using batch size: {actual_batch_size} (reduced from original to save memory)")
+
+     # ============ MEMORY OPTIMIZATION 2: 8-bit QUANTIZATION ============
+     # Load model in 8-bit to reduce memory footprint during training
      model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         torch_dtype=torch.float16,
-         device_map="auto",
-         token=hf_token,
+         model_id,
+         load_in_8bit=True,  # Use 8-bit quantization to reduce memory usage
+         device_map="auto",  # Automatically handle model distribution
+         use_cache=False,  # Disable KV cache which uses extra memory
+         torch_dtype=torch.float16,  # Use lower precision
      )
-
-     # Prepare model for LoRA training
+
+     # Count model parameters
+     logger.info(f"Model parameters: {model.num_parameters():,}")
+
+     # Prepare model for training with quantization
      model = prepare_model_for_kbit_training(model)
-     model = get_peft_model(model, lora_config)
-
-     # Print model info
-     print(f"Model loaded: {model_name}")
-     model_params = sum(p.numel() for p in model.parameters())
-     print(f"Model parameters: {model_params:,}")
-
-     # Training arguments
-     output_dir = "mistral7b_finetuned"
+
+     # ============ MEMORY OPTIMIZATION 3: GRADIENT CHECKPOINTING ============
+     # Enable gradient checkpointing to trade compute for memory
+     # This recomputes forward activations during backward pass instead of storing them
+     model.gradient_checkpointing_enable()
+     logger.info("Gradient checkpointing enabled: trading computation for memory savings")
+
+     # ============ MEMORY OPTIMIZATION 4: OPTIMIZED LORA CONFIG ============
+     # Use lower rank and fewer modules to reduce memory requirements
+     peft_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         inference_mode=False,
+         r=4,  # REDUCED from default 8/16 to save memory
+         lora_alpha=16,  # Scaling factor
+         lora_dropout=0.1,  # Dropout probability for regularization
+         # Target specific modules instead of all linear layers to save memory
+         target_modules=["q_proj", "v_proj"],  # Only attention query and value projections
+     )
+     logger.info("Using optimized LoRA parameters with reduced rank (r=4) and targeted modules")
+
+     # Apply LoRA adapters to the model
+     model = get_peft_model(model, peft_config)
+     model.print_trainable_parameters()  # Print trainable parameters info
+
+     # Define training arguments
      training_args = TrainingArguments(
          output_dir=output_dir,
-         num_train_epochs=num_epochs,
-         per_device_train_batch_size=1,
-         gradient_accumulation_steps=4,
-         save_steps=50,
+         num_train_epochs=epochs,
+         # ============ MEMORY OPTIMIZATION 5: REDUCED BATCH SIZE IN ARGS ============
+         per_device_train_batch_size=actual_batch_size,
+         per_device_eval_batch_size=actual_batch_size,
+         # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION TRAINING ============
+         # Mixed precision significantly reduces memory usage
+         fp16=True,  # Use FP16 for mixed precision training
+         # ============ MEMORY OPTIMIZATION 7: GRADIENT ACCUMULATION ============
+         # Simulate larger batch sizes without the memory cost
+         gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps (effective batch size = 8*4=32)
+         # ============ MEMORY OPTIMIZATION 8: GRADIENT CHECKPOINTING IN ARGS ============
+         gradient_checkpointing=True,
+         # Other parameters
          logging_steps=10,
+         save_strategy="epoch",
+         evaluation_strategy="epoch",
          learning_rate=2e-4,
-         weight_decay=0.001,
-         fp16=True,
-         warmup_steps=50,
-         lr_scheduler_type="cosine",
-         report_to="none",  # Disable wandb
+         weight_decay=0.01,
+         warmup_ratio=0.03,
+         # ============ MEMORY OPTIMIZATION 9: REDUCED OPTIMIZER OVERHEAD ============
+         optim="adamw_torch_fused",  # More memory-efficient optimizer
+         # ============ MEMORY OPTIMIZATION 10: REDUCED LOGGING MEMORY ============
+         report_to="none",  # Disable extra logging to save memory
      )
-
-     # Initialize trainer
-     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+     # Initialize the Trainer
      trainer = Trainer(
          model=model,
          args=training_args,
-         train_dataset=tokenized_datasets["train"],
-         eval_dataset=tokenized_datasets["test"],
-         data_collator=data_collator,
+         train_dataset=train_data["train"],
+         eval_dataset=train_data["validation"],
+         tokenizer=tokenizer,
      )
-
-     # Train model
-     progress(0.5, "Training model...")
+
+     # ============ MEMORY OPTIMIZATION 11: MANAGE CUDA CACHE ============
+     # Clear CUDA cache before training to start with a clean memory state
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         logger.info("CUDA cache cleared before training")
+
+     # Start training
+     logger.info("Starting training...")
      trainer.train()
-
-     # Save model
-     progress(0.9, "Saving model...")
-     trainer.model.save_pretrained(output_dir)
+
+     # Save the model
+     model.save_pretrained(output_dir)
      tokenizer.save_pretrained(output_dir)
-
-     # Test with sample prompts
-     progress(0.95, "Testing model...")
-     test_prompts = [
-         "How was the Math exam?",
-         "Good morning students! How are you all?",
-         "What should you do if you get into a fight with a friend?",
-         "Did you complete your science project?",
-         "What did you learn in class today?"
-     ]
-
-     # Load the fine-tuned model for inference
-     fine_tuned_model = PeftModel.from_pretrained(
-         model,
-         output_dir,
-         device_map="auto",
-     )
-
-     # Generate responses
-     results = []
-     for prompt in test_prompts:
-         formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
-         inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
-
-         with torch.no_grad():
-             outputs = fine_tuned_model.generate(
-                 **inputs,
-                 max_length=200,
-                 temperature=0.7,
-                 top_p=0.95,
-                 do_sample=True,
-             )
-
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-         student_part = response.split("Student**")[1].strip() if "Student**" in response else response
-
-         results.append({
-             "prompt": prompt,
-             "response": student_part
-         })
-
-     # Save results
-     with open("test_results.json", "w") as f:
-         json.dump(results, f, indent=2)
-
-     progress(1.0, "Completed!")
-     return results
-
- # Define Gradio interface
+     logger.info(f"Model saved to {output_dir}")
+
+     return model, tokenizer
+
+ # Gradio interface functions
+ def process_csv(file, teacher_col, student_col, num_samples):
+     df = load_data(file.name)
+     if df is None:
+         return "Error loading CSV file"
+     return f"CSV loaded successfully with {len(df)} rows"
+
+ def start_fine_tuning(file, teacher_col, student_col, model_id, epochs, batch_size, num_samples):
+     try:
+         # Load and process data
+         df = load_data(file.name)
+         if df is None:
+             return "Error loading CSV file"
+
+         # Prepare dataset
+         dataset = prepare_dataset(df, teacher_col, student_col, num_samples=int(num_samples))
+
+         # Load tokenizer for preprocessing
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         # Tokenize dataset
+         tokenized_dataset = {
+             "train": tokenize_data(dataset["train"], tokenizer),
+             "validation": tokenize_data(dataset["test"], tokenizer),
+         }
+
+         # Create output directory
+         output_dir = "./fine_tuned_model"
+         os.makedirs(output_dir, exist_ok=True)
+
+         # Finetune model with memory optimizations
+         finetune_model(
+             model_id=model_id,
+             train_data=tokenized_dataset,
+             output_dir=output_dir,
+             epochs=int(epochs),
+             batch_size=int(batch_size),
+         )
+
+         return "Fine-tuning completed successfully!"
+
+     except Exception as e:
+         logger.error(f"Error during fine-tuning: {e}")
+         return f"Error during fine-tuning: {str(e)}"
+
+ # Create Gradio interface
  with gr.Blocks() as demo:
-     gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")
-
-     with gr.Tab("System Check"):
-         check_btn = gr.Button("Check GPU and Authentication Status")
-         system_output = gr.Textbox(label="System Status", lines=5)
-
-         @spaces.GPU
-         def check_system():
-             status = []
-             # Check GPU
-             if torch.cuda.is_available():
-                 status.append(f"✅ GPU AVAILABLE: {torch.cuda.get_device_name(0)}")
-                 gpu_memory = f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
-                 status.append(gpu_memory)
-             else:
-                 status.append("❌ NO GPU DETECTED.")
-
-             # Check HF token
-             if os.environ.get("HF_TOKEN"):
-                 status.append("✅ Hugging Face token found")
-             else:
-                 status.append("❌ No Hugging Face token found. You may encounter access issues with gated models.")
-
-             # Check if we can access Mistral model
-             try:
-                 from huggingface_hub import model_info
-                 info = model_info("mistralai/Mistral-7B-v0.1", token=hf_token)
-                 status.append(f"✅ Access to Mistral-7B-v0.1 model verified: {info.modelId}")
-             except Exception as e:
-                 status.append(f"❌ Cannot access Mistral-7B-v0.1 model: {str(e)}")
-
-             return "\n".join(status)
-
-         check_btn.click(check_system, inputs=[], outputs=[system_output])
-
-     with gr.Tab("Fine-tune Model"):
-         with gr.Row():
-             csv_input = gr.File(label="Upload Teacher-Student CSV")
-
-         with gr.Row():
-             sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
-             epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
-
+     gr.Markdown("# Teacher-Student Bot Fine-Tuning")
+
+     with gr.Tab("Upload Data"):
+         file_input = gr.File(label="Upload CSV File")
          with gr.Row():
-             start_btn = gr.Button("Start Fine-Tuning")
-
+             teacher_col = gr.Textbox(label="Teacher Column", value="Unnamed: 0")
+             student_col = gr.Textbox(label="Student Column", value="idx")
+             num_samples = gr.Slider(label="Number of Samples", minimum=10, maximum=1000, value=100, step=10)
+         upload_btn = gr.Button("Process CSV")
+         csv_output = gr.Textbox(label="CSV Processing Result")
+         upload_btn.click(process_csv, inputs=[file_input, teacher_col, student_col, num_samples], outputs=csv_output)
+
+     with gr.Tab("Fine-Tune"):
+         model_id = gr.Textbox(label="Model ID", value="mistralai/Mistral-7B-v0.1")
          with gr.Row():
-             output = gr.JSON(label="Results")
-
-         start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])
-
-     with gr.Tab("About"):
-         gr.Markdown("""
-         ## Fine-Tuning Mistral 7B for Student Bot
-
-         This app fine-tunes the original Mistral-7B-v0.1 model to respond like a student to teacher prompts.
-
-         ### Requirements
-         - CSV file with teacher-student conversation pairs
-         - GPU acceleration (provided by this Space)
-         - Hugging Face authentication for accessing Mistral-7B-v0.1 (which is a gated model)
-
-         ### Process
-         1. Upload your CSV file
-         2. Set sample size and number of epochs
-         3. Click "Start Fine-Tuning"
-         4. View test results with sample prompts
-
-         ### Important Notes
-         - Fine-tuning can take several hours depending on your sample size and epochs
-         - The model will be saved in the Space and can be downloaded for further use
-         """)
-
- # Launch app
- demo.launch()
+             batch_size = gr.Number(label="Batch Size", value=8, info="Recommended: 8 or lower for 7B models")
+             epochs = gr.Number(label="Number of Epochs", value=2)
+
+         training_btn = gr.Button("Start Fine-Tuning")
+         training_output = gr.Textbox(label="Training Progress")
+
+         training_btn.click(
+             start_fine_tuning,
+             inputs=[file_input, teacher_col, student_col, model_id, epochs, batch_size, num_samples],
+             outputs=training_output
+         )
+
+ # Launch the Space
+ spaces.zero.mount()
+ demo.queue().launch(debug=True)
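
For reference, a minimal sketch (not part of the commit) of how the new helpers could be driven outside the Gradio UI; it simply mirrors what start_fine_tuning wires together. The CSV path and its teacher/student column names are hypothetical, it assumes the functions defined in the new app.py are importable, and it assumes a GPU with enough memory for the 8-bit Mistral-7B load:

    # Hypothetical local driver -- mirrors start_fine_tuning() in the new app.py
    from transformers import AutoTokenizer
    from app import load_data, prepare_dataset, tokenize_data, finetune_model  # assumed import path

    df = load_data("dialogs.csv")  # hypothetical CSV
    splits = prepare_dataset(df, "teacher", "student", num_samples=100)  # hypothetical column names

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    train_data = {
        "train": tokenize_data(splits["train"], tokenizer),
        "validation": tokenize_data(splits["test"], tokenizer),
    }

    model, tokenizer = finetune_model(
        model_id="mistralai/Mistral-7B-v0.1",
        train_data=train_data,
        output_dir="./fine_tuned_model",
        epochs=2,
        batch_size=8,
    )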