TheNateTCY committed
Commit 69b3b18 · 1 Parent(s): 004473e

Training in progress epoch 0

Files changed (3)
  1. README.md +6 -8
  2. config.json +32 -25
  3. tf_model.h5 +2 -2
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-license: other
+license: mit
 tags:
 - generated_from_keras_callback
 model-index:
@@ -12,11 +12,11 @@ probably proofread and complete it, then remove this comment. -->
 
 # TheNateTCY/testing_opt_causal_model
 
-This model is a fine-tuned version of [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) on an unknown dataset.
+This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Train Loss: 7.1725
-- Validation Loss: 6.7890
-- Epoch: 2
+- Train Loss: 10.6976
+- Validation Loss: 10.1180
+- Epoch: 0
 
 ## Model description
 
@@ -42,9 +42,7 @@ The following hyperparameters were used during training:
 
 | Train Loss | Validation Loss | Epoch |
 |:----------:|:---------------:|:-----:|
-| 8.4252     | 7.9847          | 0     |
-| 7.7442     | 7.2288          | 1     |
-| 7.1725     | 6.7890          | 2     |
+| 10.6976    | 10.1180         | 0     |
 
 
 ### Framework versions
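
The `generated_from_keras_callback` tag and the "Training in progress epoch 0" commit message are characteristic of the Keras `PushToHubCallback` from `transformers`, which pushes a checkpoint and regenerates the card after each epoch. Below is a minimal sketch of such a run; the toy corpus, optimizer, and batch size are illustrative assumptions (the card itself only says "unknown dataset").

```python
# Minimal sketch of a Keras fine-tuning run that produces per-epoch
# "Training in progress epoch N" commits and a generated_from_keras_callback
# model card. The corpus, optimizer, and batch size are assumptions.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from transformers.keras_callbacks import PushToHubCallback

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

# Toy stand-in corpus so the sketch is self-contained.
texts = ["training in progress", "fine-tuning gpt2 with keras"] * 16
enc = tokenizer(texts, padding="max_length", truncation=True,
                max_length=128, return_tensors="np")  # n_ctx=128, as in config.json
dataset = tf.data.Dataset.from_tensor_slices({
    "input_ids": enc["input_ids"],
    "attention_mask": enc["attention_mask"],
    "labels": enc["input_ids"],  # causal LM: labels are inputs, shifted internally
}).batch(8)

model.compile(optimizer=tf.keras.optimizers.Adam(5e-5))  # built-in LM loss is used

callback = PushToHubCallback(
    output_dir="testing_opt_causal_model",  # local dir doubling as the Hub repo name
    tokenizer=tokenizer,
)
model.fit(dataset, validation_data=dataset, epochs=1, callbacks=[callback])
```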
config.json CHANGED
@@ -1,31 +1,38 @@
 {
-  "_name_or_path": "facebook/opt-125m",
-  "_remove_final_layer_norm": false,
-  "activation_dropout": 0.0,
-  "activation_function": "relu",
+  "_name_or_path": "gpt2",
+  "activation_function": "gelu_new",
   "architectures": [
-    "OPTForCausalLM"
+    "GPT2LMHeadModel"
   ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 2,
-  "do_layer_norm_before": true,
-  "dropout": 0.1,
-  "enable_bias": true,
-  "eos_token_id": 2,
-  "ffn_dim": 3072,
-  "hidden_size": 768,
-  "init_std": 0.02,
-  "layer_norm_elementwise_affine": true,
-  "layerdrop": 0.0,
-  "max_position_embeddings": 2048,
-  "model_type": "opt",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "prefix": "</s>",
-  "torch_dtype": "float16",
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 128,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
   "transformers_version": "4.25.1",
   "use_cache": true,
-  "vocab_size": 50272,
-  "word_embed_proj_dim": 768
+  "vocab_size": 50257
 }
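
This config swap means the checkpoint now loads as a `GPT2LMHeadModel` rather than `OPTForCausalLM`, with GPT-2's 50257-token vocabulary and a trimmed `n_ctx` of 128. A quick way to confirm the fields, assuming the repo is public, is to load the config from the Hub:

```python
# Sketch (assumes the repo is public): inspect the post-commit config.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("TheNateTCY/testing_opt_causal_model")
print(type(config).__name__)  # GPT2Config after this commit (previously OPTConfig)
print(config.model_type)      # "gpt2"
print(config.n_ctx)           # 128 — the reduced context length used in this run
print(config.vocab_size)      # 50257 (GPT-2) rather than OPT's 50272
```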
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76b1e8aec1fb99487b026dc7fe605f6cd9bb2d7baacb02f9d4972a8796c1e177
-size 501169176
+oid sha256:87c75b6577329fabd0067dbdd258b95f54a39405fe7aea537171a17e4d3af6fc
+size 497935440
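
`tf_model.h5` is stored through Git LFS, so the diff shows only the pointer file: `oid` is the SHA-256 of the actual weights and `size` is their byte count (497,935,440 after this commit). A sketch of fetching the blob and verifying it against the pointer, assuming network access; the short revision hash comes from the commit header above:

```python
# Sketch: download the TF weights behind the LFS pointer and check that
# their SHA-256 matches the oid recorded in the pointer file.
# Assumes the repo is public and the short commit hash resolves on the Hub.
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="TheNateTCY/testing_opt_causal_model",
    filename="tf_model.h5",
    revision="69b3b18",  # the commit shown in this page's header
)

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

# Should print 87c75b6577329fabd0067dbdd258b95f54a39405fe7aea537171a17e4d3af6fc
print(sha.hexdigest())
```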