Sephfox committed
Commit 177a610 • 1 Parent(s): b71e3bf

Update app.py

Files changed (1)
  1. app.py +285 -226
app.py CHANGED
@@ -5,238 +5,297 @@ import torch
  import transformers
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
  from datasets import Dataset
  import os

- # Set random seeds for reproducibility
- random.seed(42)
- np.random.seed(42)
- torch.manual_seed(42)
-
  def generate_demo_data(num_samples=60):
-     # Generate meaningful sentences on various topics
-     subjects = [
-         'Artificial intelligence', 'Climate change', 'Renewable energy',
-         'Space exploration', 'Quantum computing', 'Genetic engineering',
-         'Blockchain technology', 'Virtual reality', 'Cybersecurity',
-         'Biotechnology', 'Nanotechnology', 'Astrophysics'
-     ]
-     verbs = [
-         'is transforming', 'is influencing', 'is revolutionizing',
-         'is challenging', 'is advancing', 'is reshaping', 'is impacting',
-         'is enhancing', 'is disrupting', 'is redefining'
-     ]
-     objects = [
-         'modern science', 'global economies', 'healthcare systems',
-         'communication methods', 'educational approaches',
-         'environmental policies', 'social interactions', 'the job market',
-         'data security', 'the entertainment industry'
-     ]
-     data = []
-     for i in range(num_samples):
-         subject = random.choice(subjects)
-         verb = random.choice(verbs)
-         obj = random.choice(objects)
-         sentence = f"{subject} {verb} {obj}."
-         data.append(sentence)
-     return data
-
- def load_data(uploaded_file):
-     # Load user-uploaded text file
-     data = uploaded_file.read().decode("utf-8")
-     data = data.splitlines()
-     return data
-
- def prepare_dataset(data, tokenizer, block_size=128):
-     # Tokenize the texts
-     def tokenize_function(examples):
-         return tokenizer(examples['text'], truncation=True, max_length=block_size, padding='max_length')
-
-     raw_dataset = Dataset.from_dict({'text': data})
-     tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
-
-     # Create labels for language modeling
-     tokenized_dataset = tokenized_dataset.map(
-         lambda examples: {'labels': examples['input_ids']},
-         batched=True
-     )
-
-     # Set the format for PyTorch
-     tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
-
-     return tokenized_dataset
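Note: the explicit labels map in prepare_dataset is most likely redundant. With mlm=False, DataCollatorForLanguageModeling derives labels from input_ids at collation time (masking pad positions to -100) and, as far as I can tell, overwrites any labels column already present. A sketch of the same function without that step, relying on the file's existing imports and on the collator to supply labels:

    def prepare_dataset(data, tokenizer, block_size=128):
        # Tokenize only; DataCollatorForLanguageModeling(mlm=False) adds labels later
        def tokenize_function(examples):
            return tokenizer(examples['text'], truncation=True, max_length=block_size, padding='max_length')
        raw_dataset = Dataset.from_dict({'text': data})
        tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
        tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
        return tokenized_dataset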
  def fitness_function(individual, train_dataset, model, tokenizer):
-     # Define the training arguments
-     training_args = TrainingArguments(
-         output_dir='./results',
-         overwrite_output_dir=True,
-         num_train_epochs=individual['epochs'],
-         per_device_train_batch_size=individual['batch_size'],
-         learning_rate=individual['learning_rate'],
-         logging_steps=10,
-         save_steps=10,
-         save_total_limit=2,
-         report_to='none',  # Disable logging to Wandb or other services
-     )
-
-     data_collator = DataCollatorForLanguageModeling(
-         tokenizer=tokenizer, mlm=False
-     )
-
-     # Train the model
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         data_collator=data_collator,
-         train_dataset=train_dataset,
-         eval_dataset=None,
-     )
-
-     trainer.train()
-
-     # For simplicity, use final training loss as fitness score
-     logs = [log for log in trainer.state.log_history if 'loss' in log]
-     if logs:
-         loss = logs[-1]['loss']
-     else:
-         loss = float('inf')
-     return loss
-
- # Genetic Algorithm Functions
- def create_population(size, param_bounds):
-     population = []
-     for _ in range(size):
-         individual = {
-             'learning_rate': random.uniform(*param_bounds['learning_rate']),
-             'epochs': random.randint(*param_bounds['epochs']),
-             'batch_size': random.choice(param_bounds['batch_size']),
-         }
-         population.append(individual)
-     return population
-
- def select_mating_pool(population, fitnesses, num_parents):
-     parents = [population[i] for i in np.argsort(fitnesses)[:num_parents]]
-     return parents
-
- def crossover(parents, offspring_size):
-     offspring = []
-     for _ in range(offspring_size):
-         parent1 = random.choice(parents)
-         parent2 = random.choice(parents)
-         child = {
-             'learning_rate': random.choice([parent1['learning_rate'], parent2['learning_rate']]),
-             'epochs': random.choice([parent1['epochs'], parent2['epochs']]),
-             'batch_size': random.choice([parent1['batch_size'], parent2['batch_size']]),
-         }
-         offspring.append(child)
-     return offspring
-
- def mutation(offspring, param_bounds, mutation_rate=0.1):
-     for individual in offspring:
-         if random.random() < mutation_rate:
-             individual['learning_rate'] = random.uniform(*param_bounds['learning_rate'])
-         if random.random() < mutation_rate:
-             individual['epochs'] = random.randint(*param_bounds['epochs'])
-         if random.random() < mutation_rate:
-             individual['batch_size'] = random.choice(param_bounds['batch_size'])
-     return offspring
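Note: for readers new to the pattern, this is one full GA generation wired together from the four helpers above, a dry-run sketch with random stand-in fitness values instead of real training losses (the bounds are the same ones main() uses):

    import random

    param_bounds = {'learning_rate': (1e-5, 5e-5), 'epochs': (1, 3), 'batch_size': [2, 4, 8]}
    population = create_population(6, param_bounds)          # six random hyperparameter sets
    fitnesses = [random.random() for _ in population]        # stand-ins for training losses
    parents = select_mating_pool(population, fitnesses, 2)   # keep the two lowest-loss individuals
    offspring = mutation(crossover(parents, 4), param_bounds, mutation_rate=0.1)
    population = parents + offspring                         # next generation, same size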
-
- # Streamlit App
  def main():
-     st.title("ACO made GPT-2 Fine-Tuning with Genetic Algorithm")
-
-     option = st.sidebar.selectbox(
-         'Choose Data Source',
-         ('DEMO', 'Upload Text File')
-     )
-
-     if option == 'DEMO':
-         st.write("Using DEMO data...")
-         data = generate_demo_data()
-     else:
-         st.write("Upload a text file for fine-tuning.")
-         uploaded_file = st.file_uploader("Choose a text file", type="txt")
-         if uploaded_file is not None:
-             data = load_data(uploaded_file)
          else:
-             st.warning("Please upload a text file.")
-             st.stop()
-
-     # Load tokenizer and model
-     st.write("Loading GPT-2 tokenizer and model...")
-     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-     model = GPT2LMHeadModel.from_pretrained('gpt2')
-     model.to('cuda' if torch.cuda.is_available() else 'cpu')
-
-     # Set the pad token
-     tokenizer.pad_token = tokenizer.eos_token
-     model.config.pad_token_id = model.config.eos_token_id
-
-     # Prepare dataset
-     st.write("Preparing dataset...")
-     train_dataset = prepare_dataset(data, tokenizer)
-
-     # GA Parameters
-     st.sidebar.subheader("Genetic Algorithm Parameters")
-     population_size = st.sidebar.number_input("Population Size", 4, 20, 6)
-     num_generations = st.sidebar.number_input("Number of Generations", 1, 10, 3)
-     num_parents = st.sidebar.number_input("Number of Parents", 2, population_size, 2)
-     mutation_rate = st.sidebar.slider("Mutation Rate", 0.0, 1.0, 0.1)
-
-     # Hyperparameter bounds
-     param_bounds = {
-         'learning_rate': (1e-5, 5e-5),
-         'epochs': (1, 3),
-         'batch_size': [2, 4, 8]
-     }
-
-     if st.button("Start Training"):
-         st.write("Initializing Genetic Algorithm...")
-         population = create_population(population_size, param_bounds)
-         best_individual = None
-         best_fitness = float('inf')
-         fitness_history = []
-
-         progress_bar = st.progress(0)
-         status_text = st.empty()
-
-         total_evaluations = num_generations * len(population)
-         current_evaluation = 0
-
-         for generation in range(num_generations):
-             st.write(f"Generation {generation+1}/{num_generations}")
-             fitnesses = []
-             for idx, individual in enumerate(population):
-                 status_text.text(f"Evaluating individual {idx+1}/{len(population)} in generation {generation+1}")
-                 # Clone the model to avoid reusing the same model
-                 model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
-                 model_clone.to('cuda' if torch.cuda.is_available() else 'cpu')
-                 fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
-                 fitnesses.append(fitness)
-                 if fitness < best_fitness:
-                     best_fitness = fitness
-                     best_individual = individual
-                 current_evaluation += 1
-                 progress_bar.progress(current_evaluation / total_evaluations)
-             fitness_history.append(min(fitnesses))
-             parents = select_mating_pool(population, fitnesses, num_parents)
-             offspring_size = population_size - num_parents
-             offspring = crossover(parents, offspring_size)
-             offspring = mutation(offspring, param_bounds, mutation_rate)
-             population = parents + offspring
-
-         st.write("Training completed!")
-         st.write(f"Best Hyperparameters: {best_individual}")
-         st.write(f"Best Fitness (Loss): {best_fitness}")
-
-         # Plot fitness history
-         st.line_chart(fitness_history)
-
-         # Save the best model
-         if st.button("Save Model"):
-             model_clone.save_pretrained('./fine_tuned_model')
-             tokenizer.save_pretrained('./fine_tuned_model')
-             st.write("Model saved successfully!")

  if __name__ == "__main__":
-     main()
  import transformers
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
  from datasets import Dataset
+ from huggingface_hub import HfApi
  import os
+ import traceback
+ from contextlib import contextmanager
+
+ # Error Handling Context Manager
+ @contextmanager
+ def error_handling(operation_name):
+     try:
+         yield
+     except Exception as e:
+         error_msg = f"Error during {operation_name}: {str(e)}\n{traceback.format_exc()}"
+         st.error(error_msg)
+         with open("error_log.txt", "a") as f:
+             f.write(f"\n{error_msg}")
+
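Note: because the except branch logs but never re-raises, this context manager suppresses the exception, so execution continues after the with block and any function whose return statement sits inside the block returns None on failure. A small illustration, assuming the definition above:

    with error_handling("tokenizer loading"):
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # Reached even if loading failed; `tokenizer` may be unbound here.
    # Adding `raise` at the end of the except block would restore fail-fast behavior.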
+ # Cyberpunk Styling
+ def setup_cyberpunk_style():
+     st.markdown("""
+     <style>
+     @import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@400;500;700&display=swap');
+
+     .stApp {
+         background: linear-gradient(45deg, #000428, #004e92);
+     }
+
+     .main-title {
+         font-family: 'Orbitron', sans-serif;
+         color: #00ff9d;
+         text-align: center;
+         text-shadow: 0 0 10px #00ff9d;
+         padding: 20px;
+         font-size: 2.5em;
+         margin-bottom: 30px;
+     }
+
+     .stButton>button {
+         background: linear-gradient(45deg, #00ff9d, #00b8ff);
+         color: black;
+         font-family: 'Orbitron', sans-serif;
+         border: none;
+         padding: 10px 20px;
+         border-radius: 5px;
+         text-transform: uppercase;
+         font-weight: bold;
+         transition: all 0.3s ease;
+     }
+
+     .stButton>button:hover {
+         transform: scale(1.05);
+         box-shadow: 0 0 15px #00ff9d;
+     }
+
+     .metric-container {
+         background: rgba(0, 0, 0, 0.5);
+         border: 2px solid #00ff9d;
+         border-radius: 10px;
+         padding: 15px;
+         margin: 10px 0;
+     }
+
+     .status-text {
+         color: #00ff9d;
+         font-family: 'Orbitron', sans-serif;
+         font-size: 1.2em;
+     }
+
+     .sidebar .stSelectbox, .sidebar .stSlider {
+         background-color: rgba(0, 0, 0, 0.3);
+         border-radius: 5px;
+         padding: 10px;
+         margin: 5px 0;
+     }
+     </style>
+     """, unsafe_allow_html=True)

+ # Your existing functions with error handling
  def generate_demo_data(num_samples=60):
+     with error_handling("demo data generation"):
+         # Your existing generate_demo_data code
+         subjects = [
+             'Artificial intelligence', 'Climate change', 'Renewable energy',
+             'Space exploration', 'Quantum computing', 'Genetic engineering',
+             'Blockchain technology', 'Virtual reality', 'Cybersecurity',
+             'Biotechnology', 'Nanotechnology', 'Astrophysics'
+         ]
+         verbs = [
+             'is transforming', 'is influencing', 'is revolutionizing',
+             'is challenging', 'is advancing', 'is reshaping', 'is impacting',
+             'is enhancing', 'is disrupting', 'is redefining'
+         ]
+         objects = [
+             'modern science', 'global economies', 'healthcare systems',
+             'communication methods', 'educational approaches',
+             'environmental policies', 'social interactions', 'the job market',
+             'data security', 'the entertainment industry'
+         ]
+         data = []
+         for i in range(num_samples):
+             subject = random.choice(subjects)
+             verb = random.choice(verbs)
+             obj = random.choice(objects)
+             sentence = f"{subject} {verb} {obj}."
+             data.append(sentence)
+         return data
+
+ def upload_to_huggingface(model_path, token, repo_name):
+     with error_handling("HuggingFace upload"):
+         api = HfApi()
+         api.create_repo(repo_name, token=token, private=True)
+         api.upload_folder(
+             folder_path=model_path,
+             repo_id=repo_name,
+             token=token
+         )
+         return True
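Note: HfApi.create_repo raises if the repository already exists, so a second upload under the same repo_name fails inside error_handling and upload_to_huggingface returns None rather than True. huggingface_hub's create_repo accepts an exist_ok flag for this case; a hedged variant of the body:

    api = HfApi()
    # exist_ok=True makes repeat uploads idempotent instead of raising
    api.create_repo(repo_name, token=token, private=True, exist_ok=True)
    api.upload_folder(folder_path=model_path, repo_id=repo_name, token=token)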

  def fitness_function(individual, train_dataset, model, tokenizer):
+     with error_handling("fitness evaluation"):
+         training_args = TrainingArguments(
+             output_dir='./results',
+             overwrite_output_dir=True,
+             num_train_epochs=individual['epochs'],
+             per_device_train_batch_size=individual['batch_size'],
+             learning_rate=individual['learning_rate'],
+             logging_steps=10,
+             save_steps=10,
+             save_total_limit=2,
+             report_to='none',
+         )
+
+         data_collator = DataCollatorForLanguageModeling(
+             tokenizer=tokenizer, mlm=False
+         )
+
+         trainer = Trainer(
+             model=model,
+             args=training_args,
+             data_collator=data_collator,
+             train_dataset=train_dataset,
+             eval_dataset=None,
+         )
+
+         trainer.train()
+         logs = [log for log in trainer.state.log_history if 'loss' in log]
+         return logs[-1]['loss'] if logs else float('inf')
+
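Note: if error_handling swallows an exception inside fitness_function, the function falls through and returns None, and the later comparison fitness < best_fitness in main() raises a TypeError. A defensive guard at the call site, as a sketch:

    fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
    if fitness is None:          # training crashed and was swallowed by error_handling
        fitness = float('inf')   # score the run as a failed, worst-possible candidate
    fitnesses.append(fitness)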
  def main():
+     setup_cyberpunk_style()
+
+     st.markdown('<h1 class="main-title">Neural Evolution GPT-2 Training Hub</h1>', unsafe_allow_html=True)
+
+     # Sidebar Configuration
+     with st.sidebar:
+         st.markdown("### 🌐 Configuration")
+
+         hf_token = st.text_input("🔑 HuggingFace Token", type="password")
+         repo_name = st.text_input("📁 Repository Name", "my-gpt2-model")
+
+         data_source = st.selectbox(
+             '📊 Data Source',
+             ('DEMO', 'Upload Text File')
+         )
+
+         st.markdown("### ⚙️ Evolution Parameters")
+         population_size = st.slider("Population Size", 4, 20, 6)
+         num_generations = st.slider("Generations", 1, 10, 3)
+         num_parents = st.slider("Parents", 2, population_size, 2)
+         mutation_rate = st.slider("Mutation Rate", 0.0, 1.0, 0.1)
+
+         # Hyperparameter bounds
+         param_bounds = {
+             'learning_rate': (1e-5, 5e-5),
+             'epochs': (1, 3),
+             'batch_size': [2, 4, 8]
+         }
+
+     # Main Content Area
+     with error_handling("main application flow"):
+         if data_source == 'DEMO':
+             st.info("🤖 Using demo data...")
+             data = generate_demo_data()
          else:
+             uploaded_file = st.file_uploader("📂 Upload Training Data", type="txt")
+             if uploaded_file:
+                 data = load_data(uploaded_file)
+             else:
+                 st.warning("⚠️ Please upload a text file")
+                 st.stop()
+
+         # Model Setup
+         with st.spinner("🔧 Loading GPT-2..."):
+             tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+             model = GPT2LMHeadModel.from_pretrained('gpt2')
+             device = 'cuda' if torch.cuda.is_available() else 'cpu'
+             model.to(device)
+             tokenizer.pad_token = tokenizer.eos_token
+             model.config.pad_token_id = model.config.eos_token_id
+
+         # Dataset Preparation
+         with st.spinner("📊 Preparing dataset..."):
+             train_dataset = prepare_dataset(data, tokenizer)
+
+         if st.button("🚀 Start Training", key="start_training"):
+             progress_bar = st.progress(0)
+             status_text = st.empty()
+
+             # Metrics Display
+             col1, col2, col3 = st.columns(3)
+             with col1:
+                 metrics_loss = st.empty()
+             with col2:
+                 metrics_generation = st.empty()
+             with col3:
+                 metrics_status = st.empty()
+
+             try:
+                 # Initialize GA
+                 population = create_population(population_size, param_bounds)
+                 best_individual = None
+                 best_fitness = float('inf')
+                 fitness_history = []
+
+                 total_evaluations = num_generations * len(population)
+                 current_evaluation = 0
+
+                 for generation in range(num_generations):
+                     metrics_generation.markdown(f"""
+                         <div class="metric-container">
+                             <p class="status-text">Generation: {generation + 1}/{num_generations}</p>
+                         </div>
+                     """, unsafe_allow_html=True)
+
+                     fitnesses = []
+                     for idx, individual in enumerate(population):
+                         status_text.text(f"🧬 Evaluating individual {idx+1}/{len(population)} in generation {generation+1}")
+
+                         # Clone model for each individual
+                         model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
+                         model_clone.to(device)
+
+                         fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
+                         fitnesses.append(fitness)
+
+                         if fitness < best_fitness:
+                             best_fitness = fitness
+                             best_individual = individual.copy()
+
+                         metrics_loss.markdown(f"""
+                             <div class="metric-container">
+                                 <p class="status-text">Best Loss: {best_fitness:.4f}</p>
+                             </div>
+                         """, unsafe_allow_html=True)
+
+                         current_evaluation += 1
+                         progress_bar.progress(current_evaluation / total_evaluations)
+
+                     # Evolution steps
+                     parents = select_mating_pool(population, fitnesses, num_parents)
+                     offspring_size = population_size - num_parents
+                     offspring = crossover(parents, offspring_size)
+                     offspring = mutation(offspring, param_bounds, mutation_rate)
+                     population = parents + offspring
+                     fitness_history.append(min(fitnesses))
+
+                 # Training Complete
+                 st.success("🎉 Training completed!")
+                 st.write("Best Hyperparameters:", best_individual)
+                 st.write("Best Fitness (Loss):", best_fitness)
+
+                 # Plot fitness history
+                 st.line_chart(fitness_history)
+
+                 # Save and Upload Model
+                 if st.button("💾 Save & Upload Model"):
+                     with st.spinner("Saving model..."):
+                         model.save_pretrained('./fine_tuned_model')
+                         tokenizer.save_pretrained('./fine_tuned_model')
+
+                     if hf_token:
+                         if upload_to_huggingface('./fine_tuned_model', hf_token, repo_name):
+                             st.success(f"✅ Model uploaded to HuggingFace: {repo_name}")
+                         else:
+                             st.error("❌ Failed to upload model")
+                     else:
+                         st.warning("⚠️ No HuggingFace token provided. Model saved locally only.")
+
+             except Exception as e:
+                 st.error(f"❌ Training error: {str(e)}")
+                 with open("error_log.txt", "a") as f:
+                     f.write(f"\nTraining error: {str(e)}\n{traceback.format_exc()}")
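Note: the "💾 Save & Upload Model" button is rendered inside the "🚀 Start Training" branch. Streamlit reruns the whole script on every interaction, so clicking the second button clears the first button's True state and the save block is effectively unreachable; note also that main()'s `model` is never trained, only the per-individual `model_clone` instances are. One conventional fix is to persist results in st.session_state and render the save button at the top level; a sketch with hypothetical keys and variables:

    if st.button("🚀 Start Training", key="start_training"):
        # ... run the GA, keeping the best-scoring trained clone ...
        st.session_state["trained_model"] = best_model_clone   # hypothetical variable

    if "trained_model" in st.session_state and st.button("💾 Save & Upload Model"):
        st.session_state["trained_model"].save_pretrained('./fine_tuned_model')
        tokenizer.save_pretrained('./fine_tuned_model')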

  if __name__ == "__main__":
+     main()
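Note: as diffed, the new main() still calls load_data, prepare_dataset, create_population, select_mating_pool, crossover, and mutation, but this commit removes their definitions without re-adding them; the "# Your existing functions with error handling" comment suggests they were meant to be carried over. Unless they are defined elsewhere, the app will raise NameError at runtime. Restoring the removed definitions verbatim, for example:

    def load_data(uploaded_file):
        # Load a user-uploaded text file (unchanged from the removed version)
        data = uploaded_file.read().decode("utf-8")
        return data.splitlines()

and likewise for prepare_dataset and the GA helpers, would make the new version self-consistent.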