File size: 3,038 Bytes
f887b2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import mlflow
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

def prepare_data(tokenizer, dataset):
    """Tokenize and format the dataset."""
    def tokenize_function(examples):
        # Combine instruction and response with a separator
        text = [f"Instruction: {instr}\nResponse: {resp}" 
               for instr, resp in zip(examples['instruction'], examples['response'])]
        
        return tokenizer(
            text,
            truncation=True,
            max_length=256,
            padding='max_length'
        )

    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset['train'].column_names
    )
    
    return tokenized_datasets

def fine_tune_model():
    """
    Fine-tune GPT-Neo on customer support data using instructions and responses.
    """
    # Load dataset
    dataset = load_dataset('csv', data_files='data/raw/customer_support.csv')
    dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

    # Load model and tokenizer
    model_name = "EleutherAI/gpt-neo-125M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # Prepare the dataset
    tokenized_datasets = prepare_data(tokenizer, dataset)

    # Create data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We're not doing masked language modeling
    )

    mlflow.start_run()

    # Log hyperparameters
    mlflow.log_param("model_name", model_name)
    mlflow.log_param("epochs", 3)
    mlflow.log_param("batch_size", 4)
    mlflow.log_param("learning_rate", 2e-5)

    training_args = TrainingArguments(
        output_dir="models/",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="mlflow"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        data_collator=data_collator,
    )

    trainer.train()

    # Save the model and tokenizer
    model_path = "models/customer_support_gpt"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    
    # Log model artifacts
    mlflow.log_artifact(model_path)

    # Log evaluation metrics
    metrics = trainer.evaluate()
    mlflow.log_metrics(metrics)

    mlflow.end_run()

if __name__ == "__main__":
    fine_tune_model()