BeardedMonster
committed on
Upload GPTJXForCausalLM

- config.json +19 -20
- generation_config.json +4 -4
- pretrained_model.py +12 -4
config.json
CHANGED
@@ -1,20 +1,19 @@
+{
+  "architectures": [
+    "GPTJXForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "pretrained_config.GPTJXConfig",
+    "AutoModelForCausalLM": "pretrained_model.GPTJXForCausalLM"
+  },
+  "bias": false,
+  "block_size": 1024,
+  "dropout": 0.0,
+  "model_type": "nanogpt-j",
+  "n_embd": 768,
+  "n_head": 12,
+  "n_layer": 12,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "vocab_size": 52050
+}
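Because the new config registers the custom classes under "auto_map", transformers can instantiate them straight from the Hub when remote code is allowed. A minimal loading sketch, assuming these files are published in a Hub repo ("BeardedMonster/<repo-name>" is a placeholder for the actual repo id):

from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code=True lets auto_map resolve the custom classes
# pretrained_config.GPTJXConfig and pretrained_model.GPTJXForCausalLM.
config = AutoConfig.from_pretrained(
    "BeardedMonster/<repo-name>",  # placeholder repo id
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "BeardedMonster/<repo-name>",  # placeholder repo id
    trust_remote_code=True,
)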
generation_config.json
CHANGED
@@ -1,4 +1,4 @@
-{
-  "_from_model_config": true,
-  "transformers_version": "4.
-}
+{
+  "_from_model_config": true,
+  "transformers_version": "4.41.2"
+}
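The "_from_model_config": true flag records that these generation defaults were derived from the model config rather than set by hand. A sketch of how such a file is typically produced (repo id is again a placeholder):

from transformers import AutoConfig, GenerationConfig

# GenerationConfig.from_model_config copies generation-relevant defaults
# from a model config; configs built this way are marked _from_model_config.
config = AutoConfig.from_pretrained(
    "BeardedMonster/<repo-name>",  # placeholder repo id
    trust_remote_code=True,
)
gen_config = GenerationConfig.from_model_config(config)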
pretrained_model.py
CHANGED
@@ -184,14 +184,22 @@ class GPTJXForCausalLM(PreTrainedModel):
         x = block(x, attn_mask=attn_mask)
         x = self.transformer.ln_f(x)
 
-        logits = self.lm_head(x) # logits over the entire sequence, shape (b, t, vocab_size)
-
+        # logits = self.lm_head(x) # logits over the entire sequence, shape (b, t, vocab_size)
         if targets is not None:
-            # If targets are provided, compute the loss
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
         else:
-            # Inference-time: return logits for each timestep
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
             loss = None
+
+        # if targets is not None:
+        #     # If targets are provided, compute the loss
+        #     loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+        # else:
+        #     # Inference-time: return logits for each timestep
+        #     loss = None
 
         return CausalLMOutputWithPast(
             loss=loss,
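The change defers the lm_head projection until after the targets check: during training the full sequence is projected, while at inference only the last position is, skipping a (b, t-1, vocab_size)-sized matmul per forward pass. A standalone sketch of the shape effect, using the sizes from config.json (illustrative code, not the model's actual modules):

import torch
import torch.nn as nn

b, t, n_embd, vocab_size = 2, 1024, 768, 52050  # block_size, n_embd, vocab_size from config.json
x = torch.randn(b, t, n_embd)                   # stand-in for hidden states after ln_f
lm_head = nn.Linear(n_embd, vocab_size, bias=False)

full = lm_head(x)              # training path: shape (2, 1024, 52050)
last = lm_head(x[:, [-1], :])  # inference path: shape (2, 1, 52050); [-1] as a list keeps the time dim
print(full.shape, last.shape)  # torch.Size([2, 1024, 52050]) torch.Size([2, 1, 52050])

In the training branch, ignore_index=-100 makes cross_entropy skip positions whose target is -100, the conventional masking value for padding and prompt tokens in transformers.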