amaltese committed
Commit e198d69 · verified · 1 Parent(s): 6df21e3

Update app.py

Files changed (1)
  1. app.py +18 -242
app.py CHANGED
@@ -1,246 +1,22 @@
-import gradio as gr
 import os
-import torch
-import json
-import pandas as pd
-from datasets import Dataset
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TrainingArguments,
-    Trainer,
-    DataCollatorForLanguageModeling
-)
-from peft import (
-    LoraConfig,
-    get_peft_model,
-    prepare_model_for_kbit_training,
-    PeftModel
-)
-import spaces
 
-# Set environment variable for cache directory
-os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
-os.makedirs('/tmp/hf_cache', exist_ok=True)
 
-def sample_from_csv(csv_file, sample_size=100):
-    """Sample from CSV file and format for training"""
-    df = pd.read_csv(csv_file)
-
-    # Display CSV info
-    print(f"CSV columns: {df.columns.tolist()}")
-    print(f"Total rows in CSV: {len(df)}")
-
-    # Try to identify teacher and student columns
-    teacher_col = None
-    student_col = None
-
-    for col in df.columns:
-        col_lower = col.lower()
-        if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
-            teacher_col = col
-        elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
-            student_col = col
-
-    # If we couldn't identify columns, use the first two
-    if teacher_col is None or student_col is None:
-        teacher_col = df.columns[0]
-        student_col = df.columns[1]
-
-    # Sample rows
-    if sample_size >= len(df):
-        sampled_df = df
-    else:
-        sampled_df = df.sample(n=sample_size, random_state=42)
-
-    # Format data
-    texts = []
-    for _, row in sampled_df.iterrows():
-        teacher_text = str(row[teacher_col]).strip()
-        student_text = str(row[student_col]).strip()
-
-        # Skip rows with empty values
-        if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
-            continue
-
-        # Format according to the document format:
-        # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
-        formatted_text = f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
-        texts.append(formatted_text)
-
-    return Dataset.from_dict({"text": texts})
 
-@spaces.GPU
-def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
-    """Fine-tune the model and return results"""
-    # Check GPU
-    if torch.cuda.is_available():
-        print(f"GPU available: {torch.cuda.get_device_name(0)}")
-        device = torch.device("cuda")
-    else:
-        print("No GPU available, fine-tuning will be extremely slow!")
-        device = torch.device("cpu")
-
-    # Sample data
-    progress(0.1, "Sampling data from CSV...")
-    dataset = sample_from_csv(csv_file, sample_size)
-
-    # Split dataset
-    dataset_split = dataset.train_test_split(test_size=0.1)
-
-    # Load tokenizer
-    progress(0.2, "Loading tokenizer...")
-    model_name = "mistralai/Mistral-7B-v0.1"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # Tokenize dataset
-    def tokenize_function(examples):
-        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-
-    progress(0.3, "Tokenizing dataset...")
-    tokenized_datasets = dataset_split.map(tokenize_function, batched=True)
-
-    # Load model with LoRA configuration
-    progress(0.4, "Loading model...")
-    lora_config = LoraConfig(
-        r=8,
-        lora_alpha=16,
-        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-        lora_dropout=0.05,
-        bias="none",
-        task_type="CAUSAL_LM"
-    )
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.float16,
-        device_map="auto",
-    )
-
-    # Prepare model for LoRA training
-    model = prepare_model_for_kbit_training(model)
-    model = get_peft_model(model, lora_config)
-
-    # Training arguments
-    output_dir = "mistral7b_finetuned"
-    training_args = TrainingArguments(
-        output_dir=output_dir,
-        num_train_epochs=num_epochs,
-        per_device_train_batch_size=1,
-        gradient_accumulation_steps=4,
-        save_steps=50,
-        logging_steps=10,
-        learning_rate=2e-4,
-        weight_decay=0.001,
-        fp16=True,
-        warmup_steps=50,
-        lr_scheduler_type="cosine",
-        report_to="none",  # Disable wandb
-    )
-
-    # Initialize trainer
-    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_datasets["train"],
-        eval_dataset=tokenized_datasets["test"],
-        data_collator=data_collator,
-    )
-
-    # Train model
-    progress(0.5, "Training model...")
-    trainer.train()
-
-    # Save model
-    progress(0.9, "Saving model...")
-    trainer.model.save_pretrained(output_dir)
-    tokenizer.save_pretrained(output_dir)
-
-    # Test with sample prompts
-    progress(0.95, "Testing model...")
-    test_prompts = [
-        "How was the Math exam?",
-        "Good morning students! How are you all?",
-        "What should you do if you get into a fight with a friend?",
-        "Did you complete your science project?",
-        "What did you learn in class today?"
-    ]
-
-    # Load the fine-tuned model for inference
-    fine_tuned_model = PeftModel.from_pretrained(
-        model,
-        output_dir,
-        device_map="auto",
-    )
-
-    # Generate responses
-    results = []
-    for prompt in test_prompts:
-        formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
-
-        with torch.no_grad():
-            outputs = fine_tuned_model.generate(
-                **inputs,
-                max_length=200,
-                temperature=0.7,
-                top_p=0.95,
-                do_sample=True,
-            )
-
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        student_part = response.split("Student**")[1].strip() if "Student**" in response else response
-
-        results.append({
-            "prompt": prompt,
-            "response": student_part
-        })
-
-    # Save results
-    with open("test_results.json", "w") as f:
-        json.dump(results, f, indent=2)
-
-    progress(1.0, "Completed!")
-    return results
-
-# Define Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")
-
-    with gr.Tab("Fine-tune Model"):
-        with gr.Row():
-            csv_input = gr.File(label="Upload Teacher-Student CSV")
-
-        with gr.Row():
-            sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
-            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
-
-        with gr.Row():
-            start_btn = gr.Button("Start Fine-Tuning")
-
-        with gr.Row():
-            output = gr.JSON(label="Results")
-
-        start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])
-
-    with gr.Tab("About"):
-        gr.Markdown("""
-        ## Fine-Tuning Mistral 7B for Student Bot
-
-        This app fine-tunes the Mistral 7B model to respond like a student to teacher prompts.
-
-        ### Requirements
-        - CSV file with teacher-student conversation pairs
-        - GPU acceleration (provided by this Space)
-
-        ### Process
-        1. Upload your CSV file
-        2. Set sample size and number of epochs
-        3. Click "Start Fine-Tuning"
-        4. View test results with sample prompts
-        """)
-
-# Launch app
-demo.launch()
+# At the top of your file, add:
 import os
+from huggingface_hub import login
 
+# Get token from environment variable
+hf_token = os.environ.get("HF_TOKEN")
+if hf_token:
+    login(token=hf_token)
+    print("Successfully logged in to Hugging Face Hub")
+else:
+    print("No Hugging Face token found. You may encounter access issues with gated models.")
 
+# Then modify your model loading code to include the token:
+tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
 
+# And later:
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    token=hf_token
+)
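
Note that the committed file, taken on its own, still references AutoTokenizer, AutoModelForCausalLM, torch, and model_name without importing or defining them, so the snippet reads as something to merge back into the previous app.py rather than a complete replacement. Below is a minimal sketch of how the token handling could sit alongside the original Mistral-7B loading code; the load_model_and_tokenizer helper is a hypothetical name introduced here for illustration and is not part of the commit.

import os

import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

# Authenticate once at startup; HF_TOKEN can be provided as a Space secret.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)


def load_model_and_tokenizer(model_name="mistralai/Mistral-7B-v0.1"):
    # Hypothetical helper: forwards the token to both from_pretrained calls
    # so gated repositories can be downloaded, mirroring the commit's changes.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        token=hf_token,
    )
    return model, tokenizer

If HF_TOKEN is unset, token=None should simply fall back to any cached credentials, so public repositories keep working; only gated models would then fail to download, as the committed snippet's warning suggests.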