stas committed on
Commit
8f43b41
·
1 Parent(s): 1790d74
README.md CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ This is a tiny random pegasus-cnn_dailymail model used for testing
2
+
3
+ See `make-pegasus-cnn_dailymail-tiny-random.py` for how it was created.
4
+
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "activation_function": "relu",
4
+ "add_bias_logits": false,
5
+ "add_final_layer_norm": true,
6
+ "architectures": [
7
+ "PegasusForConditionalGeneration"
8
+ ],
9
+ "attention_dropout": 0.1,
10
+ "bos_token_id": 0,
11
+ "classif_dropout": 0.0,
12
+ "classifier_dropout": 0.0,
13
+ "d_model": 64,
14
+ "decoder_attention_heads": 2,
15
+ "decoder_ffn_dim": 64,
16
+ "decoder_layerdrop": 0.0,
17
+ "decoder_layers": 2,
18
+ "decoder_start_token_id": 0,
19
+ "dropout": 0.1,
20
+ "encoder_attention_heads": 16,
21
+ "encoder_ffn_dim": 64,
22
+ "encoder_layerdrop": 0.0,
23
+ "encoder_layers": 2,
24
+ "eos_token_id": 1,
25
+ "extra_pos_embeddings": 1,
26
+ "forced_eos_token_id": 1,
27
+ "gradient_checkpointing": false,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "label2id": {
36
+ "LABEL_0": 0,
37
+ "LABEL_1": 1,
38
+ "LABEL_2": 2
39
+ },
40
+ "length_penalty": 0.8,
41
+ "max_length": 128,
42
+ "max_position_embeddings": 1024,
43
+ "min_length": 32,
44
+ "model_type": "pegasus",
45
+ "normalize_before": true,
46
+ "normalize_embedding": false,
47
+ "num_beams": 8,
48
+ "num_hidden_layers": 2,
49
+ "pad_token_id": 0,
50
+ "scale_embedding": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float16",
53
+ "transformers_version": "4.9.0.dev0",
54
+ "use_cache": true,
55
+ "vocab_size": 5103
56
+ }
make-pegasus-cnn_dailymail-tiny-random.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # This script creates a smallish random model, with a few layers to test things like MP/PP, where
18
+ # tiny and tinier models are too small
19
+ #
20
+ # It will then be used as "stas/pegasus-cnn_dailymail-tiny-random"
21
+
22
+ # To build:
23
+ # 1. clone sentencepiece into this dir
24
+ # git clone https://github.com/google/sentencepiece
25
+ #
26
+ # 2. run this script
27
+
28
+ from pathlib import Path
29
+ import json
30
+ import tempfile
31
+
32
+ from transformers import PegasusTokenizer, PegasusTokenizerFast, PegasusConfig, PegasusForConditionalGeneration
33
+ #from transformers.models.t5.tokenization_t5 import VOCAB_FILES_NAMES
34
+
35
+ mname_from = "google/pegasus-cnn_dailymail"
36
+ mname_very_small = "pegasus-cnn_dailymail-tiny-random"
37
+
38
+ tokenizer = PegasusTokenizer.from_pretrained(mname_from)
39
+ config = PegasusConfig.from_pretrained(mname_from)
40
+ #tokenizer_fast = PegasusTokenizerFast.from_pretrained(mname_from)
41
+
42
+ # Shrink the vocab of orig
43
+ import sys
44
+ # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
45
+ # git clone https://github.com/google/sentencepiece
46
+ sys.path.append("./sentencepiece/python/src/sentencepiece")
47
+ import sentencepiece_model_pb2 as model
48
+
49
+ tmp_dir = "/tmp/pegasus-tiny"
50
+ tokenizer.save_pretrained(tmp_dir)
51
+ file = tmp_dir + "/spiece.model"
52
+ with open(file, 'rb') as f: data = f.read()
53
+
54
+ # adapted from https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/
55
+ m = model.ModelProto()
56
+ m.ParseFromString(data)
57
+
58
+ keep_items = 5000
59
+
60
+ print("Shrinking vocab")
61
+ print(f"original dict {len(m.pieces)}")
62
+ for i in range(len(m.pieces)-keep_items): _ = m.pieces.pop()
63
+ print(f"new dict {len(m.pieces)}")
64
+
65
+ with open(tmp_dir + "/spiece-short.model", 'wb') as f:
66
+ f.write(m.SerializeToString())
67
+
68
+ tokenizer = PegasusTokenizer(vocab_file=tmp_dir + "/spiece-short.model")
69
+
70
+ config.update(dict(
71
+ vocab_size=keep_items+12,
72
+ d_model=64,
73
+ decoder_attention_heads=2,
74
+ decoder_ffn_dim=64,
75
+ decoder_layers=2,
76
+ encoder_attention_heads=16,
77
+ encoder_ffn_dim=64,
78
+ encoder_layers=2,
79
+ num_hidden_layers=2,
80
+ ))
81
+ print("new config", config)
82
+
83
+ very_small_model = PegasusForConditionalGeneration(config)
84
+ print(f"num of params {very_small_model.num_parameters()}")
85
+ very_small_model.resize_token_embeddings(len(tokenizer))
86
+
87
+ # Test
88
+ src_texts = ["A long paragraph for summarization.", "Another paragraph for summarization."]
89
+ tgt_texts = ["Summary of the text.", "Another summary."]
90
+
91
+ batch = tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts, return_tensors="pt")
92
+ outputs = very_small_model(**batch)
93
+
94
+ print("test output:", len(outputs.logits[0]))
95
+
96
+ # Save
97
+ very_small_model.half() # makes it smaller
98
+ very_small_model.save_pretrained(mname_very_small)
99
+ config.save_pretrained(mname_very_small)
100
+ tokenizer.save_pretrained(mname_very_small)
101
+ #tokenizer_fast.save_pretrained(mname_very_small)
102
+
103
+ print(f"Generated {mname_very_small}")
104
+
105
+ # Upload
106
+ # transformers-cli repo create pegasus-cnn_dailymail-tiny-random
107
+ # clone and add files
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:170d2a8b3c7499cdc5abe22abda269ccd3020bf1e5571e1f53364b11d4bd601d
3
+ size 1227089
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask_2>", "additional_special_tokens": ["<mask_1>", "<unk_2>", "<unk_3>", "<unk_4>", "<unk_5>", "<unk_6>", "<unk_7>", "<unk_8>", "<unk_9>", "<unk_10>", "<unk_11>", "<unk_12>", "<unk_13>", "<unk_14>", "<unk_15>", "<unk_16>", "<unk_17>", "<unk_18>", "<unk_19>", "<unk_20>", "<unk_21>", "<unk_22>", "<unk_23>", "<unk_24>", "<unk_25>", "<unk_26>", "<unk_27>", "<unk_28>", "<unk_29>", "<unk_30>", "<unk_31>", "<unk_32>", "<unk_33>", "<unk_34>", "<unk_35>", "<unk_36>", "<unk_37>", "<unk_38>", "<unk_39>", "<unk_40>", "<unk_41>", "<unk_42>", "<unk_43>", "<unk_44>", "<unk_45>", "<unk_46>", "<unk_47>", "<unk_48>", "<unk_49>", "<unk_50>", "<unk_51>", "<unk_52>", "<unk_53>", "<unk_54>", "<unk_55>", "<unk_56>", "<unk_57>", "<unk_58>", "<unk_59>", "<unk_60>", "<unk_61>", "<unk_62>", "<unk_63>", "<unk_64>", "<unk_65>", "<unk_66>", "<unk_67>", "<unk_68>", "<unk_69>", "<unk_70>", "<unk_71>", "<unk_72>", "<unk_73>", "<unk_74>", "<unk_75>", "<unk_76>", "<unk_77>", "<unk_78>", "<unk_79>", "<unk_80>", "<unk_81>", "<unk_82>", "<unk_83>", "<unk_84>", "<unk_85>", "<unk_86>", "<unk_87>", "<unk_88>", "<unk_89>", "<unk_90>", "<unk_91>", "<unk_92>", "<unk_93>", "<unk_94>", "<unk_95>", "<unk_96>", "<unk_97>", "<unk_98>", "<unk_99>", "<unk_100>", "<unk_101>", "<unk_102>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:987dae5fe18ba4b1884e60e9c2e6eb0ee6b6aea7de7515983aee5aae69dde75a
3
+ size 326384
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "mask_token": "<mask_2>", "pad_token": "<pad>", "mask_token_sent": "<mask_1>", "offset": 103, "additional_special_tokens": ["<mask_1>", "<unk_2>", "<unk_3>", "<unk_4>", "<unk_5>", "<unk_6>", "<unk_7>", "<unk_8>", "<unk_9>", "<unk_10>", "<unk_11>", "<unk_12>", "<unk_13>", "<unk_14>", "<unk_15>", "<unk_16>", "<unk_17>", "<unk_18>", "<unk_19>", "<unk_20>", "<unk_21>", "<unk_22>", "<unk_23>", "<unk_24>", "<unk_25>", "<unk_26>", "<unk_27>", "<unk_28>", "<unk_29>", "<unk_30>", "<unk_31>", "<unk_32>", "<unk_33>", "<unk_34>", "<unk_35>", "<unk_36>", "<unk_37>", "<unk_38>", "<unk_39>", "<unk_40>", "<unk_41>", "<unk_42>", "<unk_43>", "<unk_44>", "<unk_45>", "<unk_46>", "<unk_47>", "<unk_48>", "<unk_49>", "<unk_50>", "<unk_51>", "<unk_52>", "<unk_53>", "<unk_54>", "<unk_55>", "<unk_56>", "<unk_57>", "<unk_58>", "<unk_59>", "<unk_60>", "<unk_61>", "<unk_62>", "<unk_63>", "<unk_64>", "<unk_65>", "<unk_66>", "<unk_67>", "<unk_68>", "<unk_69>", "<unk_70>", "<unk_71>", "<unk_72>", "<unk_73>", "<unk_74>", "<unk_75>", "<unk_76>", "<unk_77>", "<unk_78>", "<unk_79>", "<unk_80>", "<unk_81>", "<unk_82>", "<unk_83>", "<unk_84>", "<unk_85>", "<unk_86>", "<unk_87>", "<unk_88>", "<unk_89>", "<unk_90>", "<unk_91>", "<unk_92>", "<unk_93>", "<unk_94>", "<unk_95>", "<unk_96>", "<unk_97>", "<unk_98>", "<unk_99>", "<unk_100>", "<unk_101>", "<unk_102>"], "sp_model_kwargs": {}, "tokenizer_class": "PegasusTokenizer"}