BeardedMonster committed
Commit be3dc26 · verified · 1 Parent(s): 2e195a2

Upload GPTJXForCausalLM

Files changed (3):
  1. config.json +19 -20
  2. generation_config.json +4 -4
  3. pretrained_model.py +12 -4
config.json CHANGED
@@ -1,20 +1,19 @@
-{
-  "_name_or_path": "/pretrainedmodel",
-  "architectures": [
-    "GPTJXForCausalLM"
-  ],
-  "auto_map": {
-    "AutoConfig": "pretrained_config.GPTJXConfig",
-    "AutoModelForCausalLM": "pretrained_model.GPTJXForCausalLM"
-  },
-  "bias": false,
-  "block_size": 1024,
-  "dropout": 0.0,
-  "model_type": "nanogpt-j",
-  "n_embd": 768,
-  "n_head": 12,
-  "n_layer": 12,
-  "torch_dtype": "float32",
-  "transformers_version": "4.39.3",
-  "vocab_size": 52050
-}
+{
+  "architectures": [
+    "GPTJXForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "pretrained_config.GPTJXConfig",
+    "AutoModelForCausalLM": "pretrained_model.GPTJXForCausalLM"
+  },
+  "bias": false,
+  "block_size": 1024,
+  "dropout": 0.0,
+  "model_type": "nanogpt-j",
+  "n_embd": 768,
+  "n_head": 12,
+  "n_layer": 12,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "vocab_size": 52050
+}
 
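The `auto_map` block in config.json wires the `transformers` Auto classes to the custom code shipped in this repo (`pretrained_config.GPTJXConfig` and `pretrained_model.GPTJXForCausalLM`). A minimal loading sketch, assuming the files live in a Hugging Face model repo and that executing the repo's remote code is acceptable; the repo id below is a placeholder, not taken from this commit:

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder repo id; substitute the actual repo that hosts config.json and pretrained_model.py.
repo_id = "BeardedMonster/<model-repo>"

# trust_remote_code=True lets the Auto classes follow the auto_map entries and import
# GPTJXConfig / GPTJXForCausalLM from the repository instead of a built-in architecture.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

print(type(model).__name__)  # expected: GPTJXForCausalLM
```
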
generation_config.json CHANGED
@@ -1,4 +1,4 @@
-{
-  "_from_model_config": true,
-  "transformers_version": "4.39.3"
-}
+{
+  "_from_model_config": true,
+  "transformers_version": "4.41.2"
+}

pretrained_model.py CHANGED
@@ -184,14 +184,22 @@ class GPTJXForCausalLM(PreTrainedModel):
         x = block(x, attn_mask=attn_mask)
         x = self.transformer.ln_f(x)
 
-        logits = self.lm_head(x) # logits over the entire sequence, shape (b, t, vocab_size)
-
+        # logits = self.lm_head(x) # logits over the entire sequence, shape (b, t, vocab_size)
         if targets is not None:
-            # If targets are provided, compute the loss
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
         else:
-            # Inference-time: return logits for each timestep
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
             loss = None
+
+        # if targets is not None:
+        #     # If targets are provided, compute the loss
+        #     loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+        # else:
+        #     # Inference-time: return logits for each timestep
+        #     loss = None
 
         return CausalLMOutputWithPast(
             loss=loss,
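The new forward pass mirrors the nanoGPT pattern: during training `lm_head` is applied to every position so the loss can cover all targets, while at inference only the final position's hidden state is projected, avoiding a full (b, t, vocab_size) projection when only the next token is needed. A self-contained sketch of the shape behaviour, using a plain `nn.Linear` as a stand-in for `lm_head` (batch and sequence lengths are made-up illustration values; `n_embd` and `vocab_size` come from config.json):

```python
import torch
import torch.nn as nn

b, t, n_embd, vocab_size = 2, 16, 768, 52050  # b and t are arbitrary for illustration
lm_head = nn.Linear(n_embd, vocab_size, bias=False)
x = torch.randn(b, t, n_embd)  # stands in for the output of transformer.ln_f

# Training path: logits over the entire sequence, shape (b, t, vocab_size).
logits_full = lm_head(x)
print(logits_full.shape)  # torch.Size([2, 16, 52050])

# Inference path: project only the last position; indexing with the list [-1]
# preserves the time dimension, giving (b, 1, vocab_size) rather than (b, vocab_size).
logits_last = lm_head(x[:, [-1], :])
print(logits_last.shape)  # torch.Size([2, 1, 52050])

# The last-position logits are the same either way; only the wasted work differs.
assert torch.allclose(logits_full[:, -1, :], logits_last[:, 0, :], atol=1e-5)
```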