# Fine-tune GPT-Neo-125M on customer-support instruction/response pairs,
# with experiment tracking via MLflow.
import mlflow
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
Trainer,
TrainingArguments,
DataCollatorForLanguageModeling
)
from datasets import load_dataset
def prepare_data(tokenizer, dataset):
    """Tokenize and format the dataset.

    Each example's instruction/response pair is joined into a single
    prompt string and tokenized to a fixed length of 256 tokens.
    Original columns are dropped so only tokenizer outputs remain.
    """

    def _encode(batch):
        # Build one prompt string per instruction/response pair.
        prompts = [
            f"Instruction: {instruction}\nResponse: {response}"
            for instruction, response in zip(batch['instruction'], batch['response'])
        ]
        # Fixed-length padding/truncation keeps every sequence at 256 tokens.
        return tokenizer(
            prompts,
            truncation=True,
            max_length=256,
            padding='max_length'
        )

    return dataset.map(
        _encode,
        batched=True,
        remove_columns=dataset['train'].column_names
    )
def fine_tune_model():
    """
    Fine-tune GPT-Neo on customer support data using instructions and responses.

    Loads the CSV dataset, tokenizes it, trains GPT-Neo-125M with the
    Hugging Face Trainer, saves the model and tokenizer locally, and logs
    hyperparameters, metrics, and artifacts to MLflow.

    Returns:
        dict: evaluation metrics from the final ``trainer.evaluate()`` call.
    """
    # Define hyperparameters once so the values logged to MLflow can never
    # drift from the values actually passed to TrainingArguments (the
    # original duplicated these literals in both places).
    epochs = 3
    batch_size = 4
    learning_rate = 2e-5

    # Load dataset and hold out 20% for evaluation (seeded for reproducibility).
    dataset = load_dataset('csv', data_files='data/raw/customer_support.csv')
    dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

    # Load model and tokenizer
    model_name = "EleutherAI/gpt-neo-125M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # GPT-Neo ships without a pad token; reuse EOS so fixed-length padding works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # Prepare the dataset
    tokenized_datasets = prepare_data(tokenizer, dataset)

    # Causal LM training: the collator derives labels from input_ids,
    # so masked-language-modeling is disabled.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # FIX: use start_run() as a context manager so the MLflow run is always
    # closed, even if training raises. The original start_run()/end_run()
    # pair leaked an open run on any exception in between.
    with mlflow.start_run():
        # Log hyperparameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)

        training_args = TrainingArguments(
            output_dir="models/",
            evaluation_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            report_to="mlflow"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets['train'],
            eval_dataset=tokenized_datasets['test'],
            data_collator=data_collator,
        )
        trainer.train()

        # Save the model and tokenizer
        model_path = "models/customer_support_gpt"
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

        # Log model artifacts
        mlflow.log_artifact(model_path)

        # Log evaluation metrics
        metrics = trainer.evaluate()
        mlflow.log_metrics(metrics)

    return metrics
# FIX: removed stray trailing "|" after the call — it was scrape residue
# and made the file a syntax error.
if __name__ == "__main__":
    fine_tune_model()