BeardedMonster
committed on
Upload GPTJXForCausalLM

- config.json +19 -20
- generation_config.json +4 -4
- pretrained_model.py +12 -4
config.json
CHANGED
@@ -1,20 +1,19 @@
+{
+  "architectures": [
+    "GPTJXForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "pretrained_config.GPTJXConfig",
+    "AutoModelForCausalLM": "pretrained_model.GPTJXForCausalLM"
+  },
+  "bias": false,
+  "block_size": 1024,
+  "dropout": 0.0,
+  "model_type": "nanogpt-j",
+  "n_embd": 768,
+  "n_head": 12,
+  "n_layer": 12,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "vocab_size": 52050
+}
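Because the new config registers the custom classes under "auto_map", transformers can instantiate them straight from the Hub when remote code is allowed. A minimal loading sketch, assuming these files are published in a Hub repo ("BeardedMonster/<repo-name>" is a placeholder for the actual repo id):

from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code=True lets auto_map resolve the custom classes
# pretrained_config.GPTJXConfig and pretrained_model.GPTJXForCausalLM.
config = AutoConfig.from_pretrained(
    "BeardedMonster/<repo-name>",  # placeholder repo id
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "BeardedMonster/<repo-name>",  # placeholder repo id
    trust_remote_code=True,
)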
generation_config.json
CHANGED
@@ -1,4 +1,4 @@
-{
-  "_from_model_config": true,
-  "transformers_version": "4.
-}
+{
+  "_from_model_config": true,
+  "transformers_version": "4.41.2"
+}
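The "_from_model_config": true flag records that these generation defaults were derived from the model config rather than set by hand. A sketch of how such a file is typically produced (repo id is again a placeholder):

from transformers import AutoConfig, GenerationConfig

# GenerationConfig.from_model_config copies generation-relevant defaults
# from a model config; configs built this way are marked _from_model_config.
config = AutoConfig.from_pretrained(
    "BeardedMonster/<repo-name>",  # placeholder repo id
    trust_remote_code=True,
)
gen_config = GenerationConfig.from_model_config(config)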
pretrained_model.py
CHANGED
@@ -184,14 +184,22 @@ class GPTJXForCausalLM(PreTrainedModel):
         x = block(x, attn_mask=attn_mask)
         x = self.transformer.ln_f(x)
 
-        logits = self.lm_head(x) # logits over the entire sequence, shape (b, t, vocab_size)
-
+        # logits = self.lm_head(x) # logits over the entire sequence, shape (b, t, vocab_size)
         if targets is not None:
-            # If targets are provided, compute the loss
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
         else:
-            # Inference-time: return logits for each timestep
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
             loss = None
+
+        # if targets is not None:
+        #     # If targets are provided, compute the loss
+        #     loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+        # else:
+        #     # Inference-time: return logits for each timestep
+        #     loss = None
 
         return CausalLMOutputWithPast(
             loss=loss,
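The change defers the lm_head projection until after the targets check: during training the full sequence is projected, while at inference only the last position is, skipping a (b, t-1, vocab_size)-sized matmul per forward pass. A standalone sketch of the shape effect, using the sizes from config.json (illustrative code, not the model's actual modules):

import torch
import torch.nn as nn

b, t, n_embd, vocab_size = 2, 1024, 768, 52050  # block_size, n_embd, vocab_size from config.json
x = torch.randn(b, t, n_embd)                   # stand-in for hidden states after ln_f
lm_head = nn.Linear(n_embd, vocab_size, bias=False)

full = lm_head(x)              # training path: shape (2, 1024, 52050)
last = lm_head(x[:, [-1], :])  # inference path: shape (2, 1, 52050); [-1] as a list keeps the time dim
print(full.shape, last.shape)  # torch.Size([2, 1024, 52050]) torch.Size([2, 1, 52050])

In the training branch, ignore_index=-100 makes cross_entropy skip positions whose target is -100, the conventional masking value for padding and prompt tokens in transformers.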