# This script will let you train your own DistilGPT-2 checkpoint from scratch
# or fine-tune the one in checkpoint-4000.
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset
from datetime import datetime

# Data preparation
# Base dataset: https://github.com/zrebarchak/Codexchan.exe-Archive
# It includes all of codexchan's videos where they spoke. There's nothing wrong
# with the "errored" folder; you should combine it with the rest and train on both.
# Note that this dataset does not include the FAQ
# (https://etherpad.mit.edu/p/r.46c0a7842e569d53dc22b44afed6bc40) or
# https://www.onlinegdb.com/fork/IrQRJkyX0, and checkpoint-4000 was not trained
# on those either, just this base dataset. Have fun!
data_dir = r"./Codexchan.exe-Archive"  # replace with your local folder of .txt files
dataset = load_dataset("text", data_files=os.path.join(data_dir, "*.txt"))

# Model and tokenizer setup
model_name = "distilgpt2"
base_output_dir = "./distilgpt2-fine-tuned"

# Generate a unique name for this training run
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(base_output_dir, f"distilgpt2_continuous_{current_time}")

# Find the most recent model directory from a previous run
def find_most_recent_model(base_dir):
    if not os.path.exists(base_dir):
        return None
    subdirs = [os.path.join(base_dir, d) for d in os.listdir(base_dir)
               if os.path.isdir(os.path.join(base_dir, d))]
    valid_dirs = [d for d in subdirs if os.path.exists(os.path.join(d, 'config.json'))]
    return max(valid_dirs, key=os.path.getmtime) if valid_dirs else None

most_recent_dir = find_most_recent_model(base_output_dir)
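# To fine-tune the published checkpoint-4000 instead of the most recent local
# run, you can override the auto-detected directory before the loading block
# below. This is a sketch; it assumes the checkpoint directory is named
# "checkpoint-4000" and sits next to this script. Uncomment to use it:
#
# most_recent_dir = "./checkpoint-4000"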
if most_recent_dir:
    print(f"Loading most recent saved model from: {most_recent_dir}")
    try:
        model = GPT2LMHeadModel.from_pretrained(most_recent_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(most_recent_dir)
    except Exception as e:
        print(f"Error loading saved model: {e}")
        print("Starting with fresh model instead.")
        model = GPT2LMHeadModel.from_pretrained(model_name)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
else:
    print("No valid saved model found. Starting with fresh model...")
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=1000,
    save_total_limit=5,
    fp16=True,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    warmup_steps=100,
    logging_steps=10,       # Log frequently
    max_steps=-1,           # -1 means the step count is derived from num_train_epochs
    num_train_epochs=215,   # Effectively "train until you stop it"
)

# Custom callback to print progress
class ProgressCallback(TrainerCallback):
    def __init__(self, total_steps=1000000):  # A large number, but not so large it causes display issues
        self.total_steps = total_steps

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Print every 10 steps, only when a loss value is actually present
        if logs and "loss" in logs and state.global_step % 10 == 0:
            print(f"Step: {state.global_step}/{self.total_steps} - Loss: {logs['loss']:.4f}")

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    callbacks=[ProgressCallback()],
)

# Enable gradient checkpointing to reduce memory usage
model.gradient_checkpointing_enable()

# Start training
print(f"Starting long-running training. Models will be saved to {output_dir}")
print("Press Ctrl+C to stop...")

try:
    trainer.train()
except KeyboardInterrupt:
    print("\nTraining interrupted. Saving model...")

trainer.save_model()
print(f"Model saved to {output_dir}. You can resume training later by running this script again.")
print("Training completed or interrupted. Final model saved.")
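# Optional quick smoke test of the saved model (a minimal sketch; the prompt and
# sampling settings below are illustrative assumptions, not values tied to
# checkpoint-4000). It reuses the model and tokenizer already in memory.
prompt = "hello chat"  # hypothetical prompt; replace with your own
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
model.eval()
with torch.no_grad():
    generated = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(generated[0], skip_special_tokens=True))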