import streamlit as st
import numpy as np
import random
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


def generate_demo_data(num_samples=60):
    # Generate simple subject-verb-object sentences on various tech topics
    subjects = [
        'Artificial intelligence', 'Climate change', 'Renewable energy',
        'Space exploration', 'Quantum computing', 'Genetic engineering',
        'Blockchain technology', 'Virtual reality', 'Cybersecurity',
        'Biotechnology', 'Nanotechnology', 'Astrophysics'
    ]
    verbs = [
        'is transforming', 'is influencing', 'is revolutionizing',
        'is challenging', 'is advancing', 'is reshaping', 'is impacting',
        'is enhancing', 'is disrupting', 'is redefining'
    ]
    objects = [
        'modern science', 'global economies', 'healthcare systems',
        'communication methods', 'educational approaches',
        'environmental policies', 'social interactions', 'the job market',
        'data security', 'the entertainment industry'
    ]
    data = []
    for _ in range(num_samples):
        subject = random.choice(subjects)
        verb = random.choice(verbs)
        obj = random.choice(objects)
        data.append(f"{subject} {verb} {obj}.")
    return data


def load_data(uploaded_file):
    # Load a user-uploaded text file, one training example per line
    text = uploaded_file.read().decode("utf-8")
    return [line for line in text.splitlines() if line.strip()]


def prepare_dataset(data, tokenizer, block_size=128):
    # Tokenize the texts into fixed-length, padded blocks
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=block_size,
            padding='max_length',
        )

    raw_dataset = Dataset.from_dict({'text': data})
    tokenized_dataset = raw_dataset.map(
        tokenize_function, batched=True, remove_columns=['text']
    )
    # No explicit labels column is needed: DataCollatorForLanguageModeling
    # with mlm=False copies input_ids to labels at batch time (masking pad
    # tokens with -100), and would overwrite any labels we created here.
    tokenized_dataset.set_format(
        type='torch', columns=['input_ids', 'attention_mask']
    )
    return tokenized_dataset


def fitness_function(individual, train_dataset, model, tokenizer):
    # Train the model with this individual's hyperparameters and return
    # the final training loss as the fitness score (lower is better)
    training_args = TrainingArguments(
        output_dir='./results',
        overwrite_output_dir=True,
        num_train_epochs=individual['epochs'],
        per_device_train_batch_size=individual['batch_size'],
        learning_rate=individual['learning_rate'],
        logging_steps=10,
        save_steps=10,
        save_total_limit=2,
        report_to='none',  # Disable logging to W&B or other services
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()

    # Use the last logged training loss as the fitness score
    logs = [log for log in trainer.state.log_history if 'loss' in log]
    return logs[-1]['loss'] if logs else float('inf')


# Genetic Algorithm functions

def create_population(size, param_bounds):
    # Each individual is a dict of hyperparameters sampled within bounds
    population = []
    for _ in range(size):
        individual = {
            'learning_rate': random.uniform(*param_bounds['learning_rate']),
            'epochs': random.randint(*param_bounds['epochs']),
            'batch_size': random.choice(param_bounds['batch_size']),
        }
        population.append(individual)
    return population


def select_mating_pool(population, fitnesses, num_parents):
    # Keep the individuals with the lowest loss as parents
    return [population[i] for i in np.argsort(fitnesses)[:num_parents]]
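# A minimal sketch of what one GA individual looks like (values are
# illustrative, not from a real run):
#
#   {'learning_rate': 3.2e-05, 'epochs': 2, 'batch_size': 4}
#
# select_mating_pool sorts by loss ascending, so with fitnesses
# [2.1, 1.4, 3.0] and num_parents=2 it keeps the individuals at
# indices 1 and 0, in that order.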
def crossover(parents, offspring_size):
    # Each child inherits each hyperparameter from a randomly chosen parent
    offspring = []
    for _ in range(offspring_size):
        parent1 = random.choice(parents)
        parent2 = random.choice(parents)
        child = {
            'learning_rate': random.choice(
                [parent1['learning_rate'], parent2['learning_rate']]
            ),
            'epochs': random.choice([parent1['epochs'], parent2['epochs']]),
            'batch_size': random.choice(
                [parent1['batch_size'], parent2['batch_size']]
            ),
        }
        offspring.append(child)
    return offspring


def mutation(offspring, param_bounds, mutation_rate=0.1):
    # Independently resample each hyperparameter with probability mutation_rate
    for individual in offspring:
        if random.random() < mutation_rate:
            individual['learning_rate'] = random.uniform(
                *param_bounds['learning_rate']
            )
        if random.random() < mutation_rate:
            individual['epochs'] = random.randint(*param_bounds['epochs'])
        if random.random() < mutation_rate:
            individual['batch_size'] = random.choice(param_bounds['batch_size'])
    return offspring


# Streamlit app

def main():
    st.title("GPT-2 Fine-Tuning with Genetic Algorithm")

    option = st.sidebar.selectbox(
        'Choose Data Source', ('DEMO', 'Upload Text File')
    )

    if option == 'DEMO':
        st.write("Using DEMO data...")
        data = generate_demo_data()
    else:
        st.write("Upload a text file for fine-tuning.")
        uploaded_file = st.file_uploader("Choose a text file", type="txt")
        if uploaded_file is not None:
            data = load_data(uploaded_file)
        else:
            st.warning("Please upload a text file.")
            st.stop()

    # Load the tokenizer once; a fresh model is loaded per GA individual below
    st.write("Loading GPT-2 tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # GPT-2 has no pad token, so reuse the end-of-sequence token
    tokenizer.pad_token = tokenizer.eos_token

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Prepare dataset
    st.write("Preparing dataset...")
    train_dataset = prepare_dataset(data, tokenizer)

    # GA parameters
    st.sidebar.subheader("Genetic Algorithm Parameters")
    population_size = st.sidebar.number_input("Population Size", 4, 20, 6)
    num_generations = st.sidebar.number_input("Number of Generations", 1, 10, 3)
    num_parents = st.sidebar.number_input("Number of Parents", 2, population_size, 2)
    mutation_rate = st.sidebar.slider("Mutation Rate", 0.0, 1.0, 0.1)

    # Hyperparameter bounds
    param_bounds = {
        'learning_rate': (1e-5, 5e-5),
        'epochs': (1, 3),
        'batch_size': [2, 4, 8],
    }

    if st.button("Start Training"):
        st.write("Initializing Genetic Algorithm...")
        population = create_population(population_size, param_bounds)
        best_individual = None
        best_fitness = float('inf')
        best_model = None
        fitness_history = []

        progress_bar = st.progress(0)
        status_text = st.empty()
        total_evaluations = num_generations * population_size
        current_evaluation = 0

        for generation in range(num_generations):
            st.write(f"Generation {generation + 1}/{num_generations}")
            fitnesses = []
            for idx, individual in enumerate(population):
                status_text.text(
                    f"Evaluating individual {idx + 1}/{len(population)} "
                    f"in generation {generation + 1}"
                )
                # Load a fresh model so each individual starts from the
                # same pretrained weights
                model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
                model_clone.config.pad_token_id = tokenizer.eos_token_id
                model_clone.to(device)
                fitness = fitness_function(
                    individual, train_dataset, model_clone, tokenizer
                )
                fitnesses.append(fitness)
                if fitness < best_fitness:
                    best_fitness = fitness
                    best_individual = individual
                    best_model = model_clone
                current_evaluation += 1
                progress_bar.progress(current_evaluation / total_evaluations)

            fitness_history.append(min(fitnesses))
            parents = select_mating_pool(population, fitnesses, num_parents)
            offspring_size = population_size - num_parents
            offspring = crossover(parents, offspring_size)
            offspring = mutation(offspring, param_bounds, mutation_rate)
            # Elitist replacement: parents survive into the next generation
            population = parents + offspring
st.write("Training completed!") st.write(f"Best Hyperparameters: {best_individual}") st.write(f"Best Fitness (Loss): {best_fitness}") # Plot fitness history st.line_chart(fitness_history) # Save the best model if st.button("Save Model"): model_clone.save_pretrained('./fine_tuned_model') tokenizer.save_pretrained('./fine_tuned_model') st.write("Model saved successfully!") if __name__ == "__main__": main()