Uploaded new model version trained from scratch using syncdoth/RetNet commit 40fd7585 (2023-11-03)

Browse files

Files changed (6)

README.md +10 -8
config.json +24 -14
generation_config.json +6 -0
model.safetensors +2 -2
tokenizer.json +1 -6
tokenizer_config.json +116 -0

README.md CHANGED Viewed

@@ -29,6 +29,8 @@ https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/
 ## Training procedure
 Just used the single tinyshakespeare text file as both the training and validation set (split up into paragraphs). See:
 https://colab.research.google.com/drive/1wZnM7FCe4TsQpoamJ7NDAuQfA3DYiwHi?usp=sharing
@@ -51,15 +53,15 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 5.3901        | 9.93  | 370  | 4.1523          |
-| 3.8122        | 19.87 | 740  | 3.3425          |
-| 3.1609        | 29.8  | 1110 | 2.8916          |
-| 2.8352        | 39.73 | 1480 | 2.7718          |
 ### Framework versions
-- Transformers 4.31.0
-- Pytorch 2.0.1+cu118
-- Datasets 2.14.3
-- Tokenizers 0.13.3

 ## Training procedure
+Note: updated on 2023-11-10 to work with the current version of syncdoth/RetNet.
 Just used the single tinyshakespeare text file as both the training and validation set (split up into paragraphs). See:
 https://colab.research.google.com/drive/1wZnM7FCe4TsQpoamJ7NDAuQfA3DYiwHi?usp=sharing
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 3.6853        | 10.0  | 370  | 3.4459          |
+| 2.1973        | 20.0  | 740  | 2.0213          |
+| 1.3819        | 30.0  | 1110 | 1.3017          |
+| 1.1658        | 40.0  | 1480 | 1.1566          |
 ### Framework versions
+- Transformers 4.35.0
+- Pytorch 2.1.0+cu118
+- Datasets 2.14.6
+- Tokenizers 0.14.1

config.json CHANGED Viewed

@@ -1,28 +1,38 @@
 {
   "architectures": [
-    "RetNetModelWithLMHead"
   ],
-  "chunk_size": 512,
   "eos_token_id": 11,
-  "ffn_proj_size": 256,
   "forward_impl": "parallel",
-  "hidden_size": 128,
   "initializer_range": 0.02,
   "is_decoder": true,
   "model_type": "retnet",
-  "num_heads": 4,
-  "num_layers": 8,
   "output_retentions": false,
   "pad_token_id": 11,
-  "qk_dim": 128,
   "torch_dtype": "float32",
-  "transformers_version": "4.31.0",
   "unk_token_id": 11,
-  "use_bias_in_mlp": true,
-  "use_bias_in_msr": false,
-  "use_bias_in_msr_out": false,
   "use_cache": true,
-  "use_default_gamma": false,
-  "v_dim": 256,
-  "vocab_size": 65024
 }

 {
+  "activation_dropout": 0.0,
+  "activation_fn": "swish",
   "architectures": [
+    "RetNetForCausalLM"
   ],
+  "decoder_embed_dim": 128,
+  "decoder_ffn_embed_dim": 256,
+  "decoder_layers": 8,
+  "decoder_normalize_before": true,
+  "decoder_retention_heads": 4,
+  "decoder_value_embed_dim": 256,
+  "deepnorm": false,
+  "drop_path_rate": 0.0,
+  "dropout": 0.0,
   "eos_token_id": 11,
   "forward_impl": "parallel",
   "initializer_range": 0.02,
   "is_decoder": true,
+  "layernorm_embedding": true,
+  "layernorm_eps": 1e-06,
   "model_type": "retnet",
+  "no_scale_embedding": true,
   "output_retentions": false,
   "pad_token_id": 11,
+  "recurrent_chunk_size": 512,
+  "subln": true,
+  "tie_word_embeddings": false,
   "torch_dtype": "float32",
+  "transformers_version": "4.35.0",
   "unk_token_id": 11,
   "use_cache": true,
+  "use_ffn_rms_norm": false,
+  "use_glu": true,
+  "use_lm_decay": false,
+  "vocab_size": 65024,
+  "z_loss_coeff": 0.0
 }

generation_config.json ADDED Viewed

@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": 11,
+  "pad_token_id": 11,
+  "transformers_version": "4.35.0"
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:946419d6dfb43d664380c8e8647a69f8500247ab5a631c6443aba08f08ce0aa6
-size 39624432

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff8967abfad37523aff5bd998187886e89f0ed86a89067f7a05e29b2902eac9a
+size 73943144

tokenizer.json CHANGED Viewed

@@ -1,11 +1,6 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 64,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
   "padding": null,
   "added_tokens": [
     {

 {
   "version": "1.0",
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {

tokenizer_config.json CHANGED Viewed

@@ -1,7 +1,123 @@
 {
   "add_prefix_space": false,
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
   "model_max_length": 2048,
   "tokenizer_class": "PreTrainedTokenizerFast"
 }

 {
   "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": ">>TITLE<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": ">>ABSTRACT<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": ">>INTRODUCTION<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": ">>SUMMARY<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": ">>COMMENT<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": ">>ANSWER<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": ">>QUESTION<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": ">>DOMAIN<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": ">>PREFIX<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": ">>SUFFIX<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": ">>MIDDLE<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    ">>TITLE<<",
+    ">>ABSTRACT<<",
+    ">>INTRODUCTION<<",
+    ">>SUMMARY<<",
+    ">>COMMENT<<",
+    ">>ANSWER<<",
+    ">>QUESTION<<",
+    ">>DOMAIN<<",
+    ">>PREFIX<<",
+    ">>SUFFIX<<",
+    ">>MIDDLE<<"
+  ],
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
   "model_max_length": 2048,
+  "pad_token": "<|endoftext|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }