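"""
Evaluate checkpoints of two distributed-training GPT-2 models.

For each numeric tag on each Hugging Face repo, compute the average
language-modelling loss over a small random slice of the dataset, cache the
numbers in results.json, and save one loss curve per model.
"""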
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from distributed_training.data.dataset import DataLoader
import random
from huggingface_hub import list_repo_refs
import matplotlib.pyplot as plt
import json

device = "cuda"
test_indices_length = 10

models = ["distributed/optimized-gpt2-250m", "distributed/gpt2-250m"]

# Load previously cached results (start fresh if the file does not exist yet)
try:
    with open("./results.json", "r") as file:
        results = json.load(file)
except FileNotFoundError:
    results = {}

for model_name in models:

    # Results for the optimized model are always recomputed; others reuse the cache
    if (model_name not in results) or (model_name == "distributed/optimized-gpt2-250m"):
        results[model_name] = {}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
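    # Note: the tokenizer is not used below; the dataloader already yields token ids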

    refs = list_repo_refs(model_name, repo_type="model")
    global_epoch = max([int(tag.name) for tag in refs.tags]) if refs.tags else None

    # Skip repos that have no numeric checkpoint tags yet
    if global_epoch is None:
        continue

    for epoch in range(0, global_epoch, 5):
    # for epoch in [global_epoch]:

        if str(epoch) in results[model_name].keys():
            continue

        model = AutoModelForCausalLM.from_pretrained(model_name, revision=str(epoch), trust_remote_code=True)
        model = model.to(device)
        model.eval()

        # Sample a random contiguous window of dataset rows to evaluate on
        search_start = random.choice(range(DataLoader.max_pages - test_indices_length + 1))
        group = list(range(search_start, search_start + test_indices_length))

        # Stream the sampled rows as (inputs, labels) batches of 1024-token sequences
        dataloader = DataLoader(
            batch_size=1,
            sequence_length=1024,
            rows=group,
        )

        total_loss = 0
        index = 0
        # Evaluate the checkpoint on the sampled rows (no parameter updates)
        for index, batch in enumerate(dataloader):
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            # Sanity check: inputs and labels should have matching lengths
            if len(inputs[0]) != len(labels[0]):
                breakpoint()
                
            # The two models expose their loss differently, so branch on the repo name
            if "optimized" in model_name:
                outputs = model(input_ids=inputs, labels=labels)
                loss = outputs[1]
            else:
                outputs = model(input_ids=inputs, labels=inputs)
                loss = outputs.loss

            # Accumulate the total loss
            total_loss += loss.detach().item()

            # Evaluation only: no backward pass, just discard any stale gradients
            model.zero_grad()

        average_loss = total_loss / (index+1)
        results[model_name][str(epoch)] = [average_loss]
        print(f"Epoch: {epoch}  Average Loss: {average_loss:.2f}")

    # Persist results after each model so progress is not lost between runs
    with open("./results.json", "w") as outfile:
        json.dump(results, outfile, indent=4)

for model_name in models:

    # One loss-vs-checkpoint curve per model, each saved to its own figure
    epochs = list(results[model_name].keys())
    losses = [value[0] for value in results[model_name].values()]

    plt.figure()
    plt.plot(epochs, losses)
    plt.title(f"{model_name} Convergence Over Time")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.xticks(fontsize=3.5)
    plt.savefig(f"{model_name.split('/')[1]}_results.png")