{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.503370786516854,
  "eval_steps": 3,
  "global_step": 56,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008988764044943821,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 2.3561,
      "step": 1
    },
    {
      "epoch": 0.02696629213483146,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 2.3742,
      "step": 3
    },
    {
      "epoch": 0.05393258426966292,
      "grad_norm": 1.6691291411784266,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 2.378,
      "step": 6
    },
    {
      "epoch": 0.08089887640449438,
      "grad_norm": 1.3100423824193625,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 2.3941,
      "step": 9
    },
    {
      "epoch": 0.10786516853932585,
      "grad_norm": 1.5134581208383007,
      "learning_rate": 2.942156862745098e-06,
      "loss": 2.3612,
      "step": 12
    },
    {
      "epoch": 0.1348314606741573,
      "grad_norm": 1.0141381931565556,
      "learning_rate": 2.7686274509803922e-06,
      "loss": 2.357,
      "step": 15
    },
    {
      "epoch": 0.16179775280898875,
      "grad_norm": 0.8466314668182882,
      "learning_rate": 2.652941176470588e-06,
      "loss": 2.3297,
      "step": 18
    },
    {
      "epoch": 0.18876404494382024,
      "grad_norm": 1.0826426333604904,
      "learning_rate": 2.4794117647058824e-06,
      "loss": 2.3321,
      "step": 21
    },
    {
      "epoch": 0.2157303370786517,
      "grad_norm": 1.6280064420318552,
      "learning_rate": 2.305882352941176e-06,
      "loss": 2.307,
      "step": 24
    },
    {
      "epoch": 0.24269662921348314,
      "grad_norm": 0.6741353244640793,
      "learning_rate": 2.1323529411764704e-06,
      "loss": 2.3129,
      "step": 27
    },
    {
      "epoch": 0.2696629213483146,
      "grad_norm": 0.6817491063656745,
      "learning_rate": 1.9588235294117646e-06,
      "loss": 2.3311,
      "step": 30
    },
    {
      "epoch": 0.2966292134831461,
      "grad_norm": 2.1277430036014007,
      "learning_rate": 1.7852941176470589e-06,
      "loss": 2.3254,
      "step": 33
    },
    {
      "epoch": 0.3235955056179775,
      "grad_norm": 0.638893880509862,
      "learning_rate": 1.6117647058823529e-06,
      "loss": 2.2827,
      "step": 36
    },
    {
      "epoch": 0.350561797752809,
      "grad_norm": 0.7151303076075821,
      "learning_rate": 1.4382352941176471e-06,
      "loss": 2.3282,
      "step": 39
    },
    {
      "epoch": 0.3775280898876405,
      "grad_norm": 0.7055970837174532,
      "learning_rate": 1.3225490196078432e-06,
      "loss": 2.2962,
      "step": 42
    },
    {
      "epoch": 0.4044943820224719,
      "grad_norm": 0.7744109790448125,
      "learning_rate": 1.1490196078431372e-06,
      "loss": 2.3371,
      "step": 45
    },
    {
      "epoch": 0.4314606741573034,
      "grad_norm": 0.733295577409483,
      "learning_rate": 9.754901960784315e-07,
      "loss": 2.2953,
      "step": 48
    },
    {
      "epoch": 0.4584269662921348,
      "grad_norm": 1.4483568999325545,
      "learning_rate": 8.019607843137255e-07,
      "loss": 2.2863,
      "step": 51
    },
    {
      "epoch": 0.4853932584269663,
      "grad_norm": 1.0119535871876866,
      "learning_rate": 6.284313725490195e-07,
      "loss": 2.2993,
      "step": 54
    },
    {
      "epoch": 0.503370786516854,
      "step": 56,
      "total_flos": 223005439426560.0,
      "train_loss": 2.3332885844366893,
      "train_runtime": 17499.5122,
      "train_samples_per_second": 0.407,
      "train_steps_per_second": 0.003
    }
  ],
  "logging_steps": 3,
  "max_steps": 56,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 12,
  "total_flos": 223005439426560.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}