TheNateTCY committed
Commit 69b3b18 · 1 Parent(s): 004473e

Training in progress epoch 0

Files changed (3)
  1. README.md +6 -8
  2. config.json +32 -25
  3. tf_model.h5 +2 -2
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-license: other
+license: mit
 tags:
 - generated_from_keras_callback
 model-index:
@@ -12,11 +12,11 @@ probably proofread and complete it, then remove this comment. -->
 
 # TheNateTCY/testing_opt_causal_model
 
-This model is a fine-tuned version of [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) on an unknown dataset.
+This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Train Loss: 7.1725
-- Validation Loss: 6.7890
-- Epoch: 2
+- Train Loss: 10.6976
+- Validation Loss: 10.1180
+- Epoch: 0
 
 ## Model description
 
@@ -42,9 +42,7 @@ The following hyperparameters were used during training:
 
 | Train Loss | Validation Loss | Epoch |
 |:----------:|:---------------:|:-----:|
-| 8.4252     | 7.9847          | 0     |
-| 7.7442     | 7.2288          | 1     |
-| 7.1725     | 6.7890          | 2     |
+| 10.6976    | 10.1180         | 0     |
 
 
 ### Framework versions
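
The `generated_from_keras_callback` tag and the "Training in progress epoch 0" commit message are characteristic of the Keras `PushToHubCallback` from `transformers`, which pushes a checkpoint and regenerates the card after each epoch. Below is a minimal sketch of such a run; the toy corpus, optimizer, and batch size are illustrative assumptions (the card itself only says "unknown dataset").

```python
# Minimal sketch of a Keras fine-tuning run that produces per-epoch
# "Training in progress epoch N" commits and a generated_from_keras_callback
# model card. The corpus, optimizer, and batch size are assumptions.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from transformers.keras_callbacks import PushToHubCallback

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

# Toy stand-in corpus so the sketch is self-contained.
texts = ["training in progress", "fine-tuning gpt2 with keras"] * 16
enc = tokenizer(texts, padding="max_length", truncation=True,
                max_length=128, return_tensors="np")  # n_ctx=128, as in config.json
dataset = tf.data.Dataset.from_tensor_slices({
    "input_ids": enc["input_ids"],
    "attention_mask": enc["attention_mask"],
    "labels": enc["input_ids"],  # causal LM: labels are inputs, shifted internally
}).batch(8)

model.compile(optimizer=tf.keras.optimizers.Adam(5e-5))  # built-in LM loss is used

callback = PushToHubCallback(
    output_dir="testing_opt_causal_model",  # local dir doubling as the Hub repo name
    tokenizer=tokenizer,
)
model.fit(dataset, validation_data=dataset, epochs=1, callbacks=[callback])
```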
config.json CHANGED
@@ -1,31 +1,38 @@
 {
-  "_name_or_path": "facebook/opt-125m",
-  "_remove_final_layer_norm": false,
-  "activation_dropout": 0.0,
-  "activation_function": "relu",
+  "_name_or_path": "gpt2",
+  "activation_function": "gelu_new",
   "architectures": [
-    "OPTForCausalLM"
+    "GPT2LMHeadModel"
   ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 2,
-  "do_layer_norm_before": true,
-  "dropout": 0.1,
-  "enable_bias": true,
-  "eos_token_id": 2,
-  "ffn_dim": 3072,
-  "hidden_size": 768,
-  "init_std": 0.02,
-  "layer_norm_elementwise_affine": true,
-  "layerdrop": 0.0,
-  "max_position_embeddings": 2048,
-  "model_type": "opt",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "prefix": "</s>",
-  "torch_dtype": "float16",
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 128,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
   "transformers_version": "4.25.1",
   "use_cache": true,
-  "vocab_size": 50272,
-  "word_embed_proj_dim": 768
+  "vocab_size": 50257
 }
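
This config swap means the checkpoint now loads as a `GPT2LMHeadModel` rather than `OPTForCausalLM`, with GPT-2's 50257-token vocabulary and a trimmed `n_ctx` of 128. A quick way to confirm the fields, assuming the repo is public, is to load the config from the Hub:

```python
# Sketch (assumes the repo is public): inspect the post-commit config.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("TheNateTCY/testing_opt_causal_model")
print(type(config).__name__)  # GPT2Config after this commit (previously OPTConfig)
print(config.model_type)      # "gpt2"
print(config.n_ctx)           # 128 — the reduced context length used in this run
print(config.vocab_size)      # 50257 (GPT-2) rather than OPT's 50272
```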
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76b1e8aec1fb99487b026dc7fe605f6cd9bb2d7baacb02f9d4972a8796c1e177
-size 501169176
+oid sha256:87c75b6577329fabd0067dbdd258b95f54a39405fe7aea537171a17e4d3af6fc
+size 497935440
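
`tf_model.h5` is stored through Git LFS, so the diff shows only the pointer file: `oid` is the SHA-256 of the actual weights and `size` is their byte count (497,935,440 after this commit). A sketch of fetching the blob and verifying it against the pointer, assuming network access; the short revision hash comes from the commit header above:

```python
# Sketch: download the TF weights behind the LFS pointer and check that
# their SHA-256 matches the oid recorded in the pointer file.
# Assumes the repo is public and the short commit hash resolves on the Hub.
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="TheNateTCY/testing_opt_causal_model",
    filename="tf_model.h5",
    revision="69b3b18",  # the commit shown in this page's header
)

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

# Should print 87c75b6577329fabd0067dbdd258b95f54a39405fe7aea537171a17e4d3af6fc
print(sha.hexdigest())
```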