Sephfox committed on
Commit b71e3bf · verified · 1 Parent(s): 1f06eff

Update app.py

Files changed (1)
  1. app.py +242 -242
app.py CHANGED
@@ -1,242 +1,242 @@
- import streamlit as st
- import numpy as np
- import random
- import torch
- import transformers
- from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
- from datasets import Dataset
- import os
-
- # Set random seeds for reproducibility
- random.seed(42)
- np.random.seed(42)
- torch.manual_seed(42)
-
- def generate_demo_data(num_samples=60):
-     # Generate meaningful sentences on various topics
-     subjects = [
-         'Artificial intelligence', 'Climate change', 'Renewable energy',
-         'Space exploration', 'Quantum computing', 'Genetic engineering',
-         'Blockchain technology', 'Virtual reality', 'Cybersecurity',
-         'Biotechnology', 'Nanotechnology', 'Astrophysics'
-     ]
-     verbs = [
-         'is transforming', 'is influencing', 'is revolutionizing',
-         'is challenging', 'is advancing', 'is reshaping', 'is impacting',
-         'is enhancing', 'is disrupting', 'is redefining'
-     ]
-     objects = [
-         'modern science', 'global economies', 'healthcare systems',
-         'communication methods', 'educational approaches',
-         'environmental policies', 'social interactions', 'the job market',
-         'data security', 'the entertainment industry'
-     ]
-     data = []
-     for i in range(num_samples):
-         subject = random.choice(subjects)
-         verb = random.choice(verbs)
-         obj = random.choice(objects)
-         sentence = f"{subject} {verb} {obj}."
-         data.append(sentence)
-     return data
-
- def load_data(uploaded_file):
-     # Load user-uploaded text file
-     data = uploaded_file.read().decode("utf-8")
-     data = data.splitlines()
-     return data
-
- def prepare_dataset(data, tokenizer, block_size=128):
-     # Tokenize the texts
-     def tokenize_function(examples):
-         return tokenizer(examples['text'], truncation=True, max_length=block_size, padding='max_length')
-
-     raw_dataset = Dataset.from_dict({'text': data})
-     tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
-
-     # Create labels for language modeling
-     tokenized_dataset = tokenized_dataset.map(
-         lambda examples: {'labels': examples['input_ids']},
-         batched=True
-     )
-
-     # Set the format for PyTorch
-     tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
-
-     return tokenized_dataset
-
- def fitness_function(individual, train_dataset, model, tokenizer):
-     # Define the training arguments
-     training_args = TrainingArguments(
-         output_dir='./results',
-         overwrite_output_dir=True,
-         num_train_epochs=individual['epochs'],
-         per_device_train_batch_size=individual['batch_size'],
-         learning_rate=individual['learning_rate'],
-         logging_steps=10,
-         save_steps=10,
-         save_total_limit=2,
-         report_to='none',  # Disable logging to Wandb or other services
-     )
-
-     data_collator = DataCollatorForLanguageModeling(
-         tokenizer=tokenizer, mlm=False
-     )
-
-     # Train the model
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         data_collator=data_collator,
-         train_dataset=train_dataset,
-         eval_dataset=None,
-     )
-
-     trainer.train()
-
-     # For simplicity, use final training loss as fitness score
-     logs = [log for log in trainer.state.log_history if 'loss' in log]
-     if logs:
-         loss = logs[-1]['loss']
-     else:
-         loss = float('inf')
-     return loss
-
- # Genetic Algorithm Functions
- def create_population(size, param_bounds):
-     population = []
-     for _ in range(size):
-         individual = {
-             'learning_rate': random.uniform(*param_bounds['learning_rate']),
-             'epochs': random.randint(*param_bounds['epochs']),
-             'batch_size': random.choice(param_bounds['batch_size']),
-         }
-         population.append(individual)
-     return population
-
- def select_mating_pool(population, fitnesses, num_parents):
-     parents = [population[i] for i in np.argsort(fitnesses)[:num_parents]]
-     return parents
-
- def crossover(parents, offspring_size):
-     offspring = []
-     for _ in range(offspring_size):
-         parent1 = random.choice(parents)
-         parent2 = random.choice(parents)
-         child = {
-             'learning_rate': random.choice([parent1['learning_rate'], parent2['learning_rate']]),
-             'epochs': random.choice([parent1['epochs'], parent2['epochs']]),
-             'batch_size': random.choice([parent1['batch_size'], parent2['batch_size']]),
-         }
-         offspring.append(child)
-     return offspring
-
- def mutation(offspring, param_bounds, mutation_rate=0.1):
-     for individual in offspring:
-         if random.random() < mutation_rate:
-             individual['learning_rate'] = random.uniform(*param_bounds['learning_rate'])
-         if random.random() < mutation_rate:
-             individual['epochs'] = random.randint(*param_bounds['epochs'])
-         if random.random() < mutation_rate:
-             individual['batch_size'] = random.choice(param_bounds['batch_size'])
-     return offspring
-
- # Streamlit App
- def main():
-     st.title("GPT-2 Fine-Tuning with Genetic Algorithm")
-
-     option = st.sidebar.selectbox(
-         'Choose Data Source',
-         ('DEMO', 'Upload Text File')
-     )
-
-     if option == 'DEMO':
-         st.write("Using DEMO data...")
-         data = generate_demo_data()
-     else:
-         st.write("Upload a text file for fine-tuning.")
-         uploaded_file = st.file_uploader("Choose a text file", type="txt")
-         if uploaded_file is not None:
-             data = load_data(uploaded_file)
-         else:
-             st.warning("Please upload a text file.")
-             st.stop()
-
-     # Load tokenizer and model
-     st.write("Loading GPT-2 tokenizer and model...")
-     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-     model = GPT2LMHeadModel.from_pretrained('gpt2')
-     model.to('cuda' if torch.cuda.is_available() else 'cpu')
-
-     # Set the pad token
-     tokenizer.pad_token = tokenizer.eos_token
-     model.config.pad_token_id = model.config.eos_token_id
-
-     # Prepare dataset
-     st.write("Preparing dataset...")
-     train_dataset = prepare_dataset(data, tokenizer)
-
-     # GA Parameters
-     st.sidebar.subheader("Genetic Algorithm Parameters")
-     population_size = st.sidebar.number_input("Population Size", 4, 20, 6)
-     num_generations = st.sidebar.number_input("Number of Generations", 1, 10, 3)
-     num_parents = st.sidebar.number_input("Number of Parents", 2, population_size, 2)
-     mutation_rate = st.sidebar.slider("Mutation Rate", 0.0, 1.0, 0.1)
-
-     # Hyperparameter bounds
-     param_bounds = {
-         'learning_rate': (1e-5, 5e-5),
-         'epochs': (1, 3),
-         'batch_size': [2, 4, 8]
-     }
-
-     if st.button("Start Training"):
-         st.write("Initializing Genetic Algorithm...")
-         population = create_population(population_size, param_bounds)
-         best_individual = None
-         best_fitness = float('inf')
-         fitness_history = []
-
-         progress_bar = st.progress(0)
-         status_text = st.empty()
-
-         total_evaluations = num_generations * len(population)
-         current_evaluation = 0
-
-         for generation in range(num_generations):
-             st.write(f"Generation {generation+1}/{num_generations}")
-             fitnesses = []
-             for idx, individual in enumerate(population):
-                 status_text.text(f"Evaluating individual {idx+1}/{len(population)} in generation {generation+1}")
-                 # Clone the model to avoid reusing the same model
-                 model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
-                 model_clone.to('cuda' if torch.cuda.is_available() else 'cpu')
-                 fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
-                 fitnesses.append(fitness)
-                 if fitness < best_fitness:
-                     best_fitness = fitness
-                     best_individual = individual
-                 current_evaluation += 1
-                 progress_bar.progress(current_evaluation / total_evaluations)
-             fitness_history.append(min(fitnesses))
-             parents = select_mating_pool(population, fitnesses, num_parents)
-             offspring_size = population_size - num_parents
-             offspring = crossover(parents, offspring_size)
-             offspring = mutation(offspring, param_bounds, mutation_rate)
-             population = parents + offspring
-
-         st.write("Training completed!")
-         st.write(f"Best Hyperparameters: {best_individual}")
-         st.write(f"Best Fitness (Loss): {best_fitness}")
-
-         # Plot fitness history
-         st.line_chart(fitness_history)
-
-         # Save the best model
-         if st.button("Save Model"):
-             model_clone.save_pretrained('./fine_tuned_model')
-             tokenizer.save_pretrained('./fine_tuned_model')
-             st.write("Model saved successfully!")
-
- if __name__ == "__main__":
-     main()
 
+ import streamlit as st
+ import numpy as np
+ import random
+ import torch
+ import transformers
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+ from datasets import Dataset
+ import os
+
+ # Set random seeds for reproducibility
+ random.seed(42)
+ np.random.seed(42)
+ torch.manual_seed(42)
+
+ def generate_demo_data(num_samples=60):
+     # Generate meaningful sentences on various topics
+     subjects = [
+         'Artificial intelligence', 'Climate change', 'Renewable energy',
+         'Space exploration', 'Quantum computing', 'Genetic engineering',
+         'Blockchain technology', 'Virtual reality', 'Cybersecurity',
+         'Biotechnology', 'Nanotechnology', 'Astrophysics'
+     ]
+     verbs = [
+         'is transforming', 'is influencing', 'is revolutionizing',
+         'is challenging', 'is advancing', 'is reshaping', 'is impacting',
+         'is enhancing', 'is disrupting', 'is redefining'
+     ]
+     objects = [
+         'modern science', 'global economies', 'healthcare systems',
+         'communication methods', 'educational approaches',
+         'environmental policies', 'social interactions', 'the job market',
+         'data security', 'the entertainment industry'
+     ]
+     data = []
+     for i in range(num_samples):
+         subject = random.choice(subjects)
+         verb = random.choice(verbs)
+         obj = random.choice(objects)
+         sentence = f"{subject} {verb} {obj}."
+         data.append(sentence)
+     return data
+
+ def load_data(uploaded_file):
+     # Load user-uploaded text file
+     data = uploaded_file.read().decode("utf-8")
+     data = data.splitlines()
+     return data
+
+ def prepare_dataset(data, tokenizer, block_size=128):
+     # Tokenize the texts
+     def tokenize_function(examples):
+         return tokenizer(examples['text'], truncation=True, max_length=block_size, padding='max_length')
+
+     raw_dataset = Dataset.from_dict({'text': data})
+     tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
+
+     # Create labels for language modeling
+     tokenized_dataset = tokenized_dataset.map(
+         lambda examples: {'labels': examples['input_ids']},
+         batched=True
+     )
+
+     # Set the format for PyTorch
+     tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+     return tokenized_dataset
+
+ def fitness_function(individual, train_dataset, model, tokenizer):
+     # Define the training arguments
+     training_args = TrainingArguments(
+         output_dir='./results',
+         overwrite_output_dir=True,
+         num_train_epochs=individual['epochs'],
+         per_device_train_batch_size=individual['batch_size'],
+         learning_rate=individual['learning_rate'],
+         logging_steps=10,
+         save_steps=10,
+         save_total_limit=2,
+         report_to='none',  # Disable logging to Wandb or other services
+     )
+
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=tokenizer, mlm=False
+     )
+
+     # Train the model
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         data_collator=data_collator,
+         train_dataset=train_dataset,
+         eval_dataset=None,
+     )
+
+     trainer.train()
+
+     # For simplicity, use final training loss as fitness score
+     logs = [log for log in trainer.state.log_history if 'loss' in log]
+     if logs:
+         loss = logs[-1]['loss']
+     else:
+         loss = float('inf')
+     return loss
+
+ # Genetic Algorithm Functions
+ def create_population(size, param_bounds):
+     population = []
+     for _ in range(size):
+         individual = {
+             'learning_rate': random.uniform(*param_bounds['learning_rate']),
+             'epochs': random.randint(*param_bounds['epochs']),
+             'batch_size': random.choice(param_bounds['batch_size']),
+         }
+         population.append(individual)
+     return population
+
+ def select_mating_pool(population, fitnesses, num_parents):
+     parents = [population[i] for i in np.argsort(fitnesses)[:num_parents]]
+     return parents
+
+ def crossover(parents, offspring_size):
+     offspring = []
+     for _ in range(offspring_size):
+         parent1 = random.choice(parents)
+         parent2 = random.choice(parents)
+         child = {
+             'learning_rate': random.choice([parent1['learning_rate'], parent2['learning_rate']]),
+             'epochs': random.choice([parent1['epochs'], parent2['epochs']]),
+             'batch_size': random.choice([parent1['batch_size'], parent2['batch_size']]),
+         }
+         offspring.append(child)
+     return offspring
+
+ def mutation(offspring, param_bounds, mutation_rate=0.1):
+     for individual in offspring:
+         if random.random() < mutation_rate:
+             individual['learning_rate'] = random.uniform(*param_bounds['learning_rate'])
+         if random.random() < mutation_rate:
+             individual['epochs'] = random.randint(*param_bounds['epochs'])
+         if random.random() < mutation_rate:
+             individual['batch_size'] = random.choice(param_bounds['batch_size'])
+     return offspring
+
+ # Streamlit App
+ def main():
+     st.title("ACO made GPT-2 Fine-Tuning with Genetic Algorithm")
+
+     option = st.sidebar.selectbox(
+         'Choose Data Source',
+         ('DEMO', 'Upload Text File')
+     )
+
+     if option == 'DEMO':
+         st.write("Using DEMO data...")
+         data = generate_demo_data()
+     else:
+         st.write("Upload a text file for fine-tuning.")
+         uploaded_file = st.file_uploader("Choose a text file", type="txt")
+         if uploaded_file is not None:
+             data = load_data(uploaded_file)
+         else:
+             st.warning("Please upload a text file.")
+             st.stop()
+
+     # Load tokenizer and model
+     st.write("Loading GPT-2 tokenizer and model...")
+     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+     model = GPT2LMHeadModel.from_pretrained('gpt2')
+     model.to('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # Set the pad token
+     tokenizer.pad_token = tokenizer.eos_token
+     model.config.pad_token_id = model.config.eos_token_id
+
+     # Prepare dataset
+     st.write("Preparing dataset...")
+     train_dataset = prepare_dataset(data, tokenizer)
+
+     # GA Parameters
+     st.sidebar.subheader("Genetic Algorithm Parameters")
+     population_size = st.sidebar.number_input("Population Size", 4, 20, 6)
+     num_generations = st.sidebar.number_input("Number of Generations", 1, 10, 3)
+     num_parents = st.sidebar.number_input("Number of Parents", 2, population_size, 2)
+     mutation_rate = st.sidebar.slider("Mutation Rate", 0.0, 1.0, 0.1)
+
+     # Hyperparameter bounds
+     param_bounds = {
+         'learning_rate': (1e-5, 5e-5),
+         'epochs': (1, 3),
+         'batch_size': [2, 4, 8]
+     }
+
+     if st.button("Start Training"):
+         st.write("Initializing Genetic Algorithm...")
+         population = create_population(population_size, param_bounds)
+         best_individual = None
+         best_fitness = float('inf')
+         fitness_history = []
+
+         progress_bar = st.progress(0)
+         status_text = st.empty()
+
+         total_evaluations = num_generations * len(population)
+         current_evaluation = 0
+
+         for generation in range(num_generations):
+             st.write(f"Generation {generation+1}/{num_generations}")
+             fitnesses = []
+             for idx, individual in enumerate(population):
+                 status_text.text(f"Evaluating individual {idx+1}/{len(population)} in generation {generation+1}")
+                 # Clone the model to avoid reusing the same model
+                 model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
+                 model_clone.to('cuda' if torch.cuda.is_available() else 'cpu')
+                 fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
+                 fitnesses.append(fitness)
+                 if fitness < best_fitness:
+                     best_fitness = fitness
+                     best_individual = individual
+                 current_evaluation += 1
+                 progress_bar.progress(current_evaluation / total_evaluations)
+             fitness_history.append(min(fitnesses))
+             parents = select_mating_pool(population, fitnesses, num_parents)
+             offspring_size = population_size - num_parents
+             offspring = crossover(parents, offspring_size)
+             offspring = mutation(offspring, param_bounds, mutation_rate)
+             population = parents + offspring
+
+         st.write("Training completed!")
+         st.write(f"Best Hyperparameters: {best_individual}")
+         st.write(f"Best Fitness (Loss): {best_fitness}")
+
+         # Plot fitness history
+         st.line_chart(fitness_history)
+
+         # Save the best model
+         if st.button("Save Model"):
+             model_clone.save_pretrained('./fine_tuned_model')
+             tokenizer.save_pretrained('./fine_tuned_model')
+             st.write("Model saved successfully!")
+
+ if __name__ == "__main__":
+     main()
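
A note for anyone running this version: the app writes the fine-tuned weights with save_pretrained to ./fine_tuned_model, but the "Save Model" button sits inside the "Start Training" branch, and Streamlit reruns the whole script on every widget click, so clicking the nested button restarts the run rather than saving; at that point model_clone also holds the last individual evaluated, not necessarily the best one. Saving unconditionally after the GA loop, or loading the saved directory yourself, is more dependable. Below is a minimal sketch, not part of the commit, for loading whatever was saved and sampling from it; the prompt and generation settings are illustrative assumptions.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the weights a previous run saved with save_pretrained
# (assumes ./fine_tuned_model exists; adjust the path to your run).
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
model.eval()

# Sample a continuation in the style of the demo sentences.
prompt = "Artificial intelligence is transforming"
inputs = tokenizer(prompt, return_tensors='pt')
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_length=40,
        do_sample=True,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))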