TangSan003 committed
Commit 8516514 · 0 Parent(s)

Load model
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
__pycache__/inference.cpython-310.pyc ADDED
Binary file (2.59 kB)
 
__pycache__/model.cpython-310.pyc ADDED
Binary file (10.5 kB)
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.29 kB)
 
app.py ADDED
@@ -0,0 +1,41 @@
+ from flask import Flask, request, jsonify
+ from flask_cors import CORS
+
+ from inference import InferenceModel
+ import traceback
+
+
+ app = Flask(__name__)
+ CORS(app)
+
+ try:
+     model = InferenceModel(path_to_weights="save_model/model.safetensors", huggingface_model=True)
+ except Exception as e:
+     print("❌ Error while loading the model:")
+     traceback.print_exc()
+     model = None
+
+ @app.route('/pred', methods=['POST'])
+ def prediction():
+     payload = request.get_json()
+
+     # Get the fields from the request body
+     context = payload.get('context', '')
+     question = payload.get('question', '')
+
+     # # Debug output to the terminal
+     # print("\n===== New request received =====")
+     # print(f"Context: {context}")
+     # print(f"Question: {question}")
+
+     # Fail gracefully if the model could not be loaded at startup
+     if model is None:
+         return jsonify({"error": "Model is not available"}), 503
+
+     # Run the model
+     prediction = model.inference_model(question, context)
+     answer = prediction["answer"]
+
+     return jsonify({"answer": answer}), 200
+
+ if __name__ == '__main__':
+     app.run(port=5000, debug=True)
+
+
+ # Run the server
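
For reference, a minimal client sketch for the /pred endpoint above. It assumes the Flask server is running locally on the default port 5000 used in app.py; the `requests` package and the sample context/question are extra assumptions, not part of this commit.

import requests

# Hypothetical client for the /pred endpoint defined in app.py above.
payload = {
    "context": "The quick brown fox jumps over the lazy dog.",
    "question": "What does the fox jump over?",
}
response = requests.post("http://127.0.0.1:5000/pred", json=payload)
print(response.status_code)
print(response.json())  # e.g. {"answer": "the lazy dog"}
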
finetune_roberta_qa.py ADDED
@@ -0,0 +1,229 @@
+ import os
+ import logging
+ os.environ["WANDB_PROJECT"] = "RoBERTa_QA_Finetune"
+
+ import argparse
+ from datasets import load_dataset
+ from transformers import (
+     RobertaTokenizerFast,
+     DefaultDataCollator,
+     TrainingArguments,
+     Trainer,
+ )
+
+ import torch
+ from safetensors.torch import load_file
+
+ from utils import RobertaConfig, ExtractiveQAPreProcesing
+ from model import RobertaForQuestionAnswering
+
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ def parse_arguments():
+
+     parser = argparse.ArgumentParser(description="RoBERTa Extractive QA Finetuning Arguments on CoQA")
+
+     ### Experiment Logging ###
+     parser.add_argument(
+         "--experiment_name",
+         required=True,
+         type=str
+     )
+
+     parser.add_argument(
+         "--working_directory",
+         required=True,
+         type=str
+     )
+
+     parser.add_argument(
+         "--path_to_cache_dir",
+         help="Path to huggingface cache if different from default",
+         default=None,
+         type=str
+     )
+
+     parser.add_argument(
+         "--num_train_epochs",
+         help="Number of epochs you want to train for",
+         default=3,
+         type=int
+     )
+
+     parser.add_argument(
+         "--save_steps",
+         help="After how many steps do you want to log a checkpoint",
+         default=500,
+         type=int
+     )
+
+     parser.add_argument(
+         "--eval_steps",
+         help="After how many steps do you want to evaluate on eval data",
+         default=500,
+         type=int
+     )
+
+     parser.add_argument(
+         "--logging_steps",
+         help="After how many steps do you want to log to Weights and Biases (if installed)",
+         default=500,
+         type=int
+     )
+
+     parser.add_argument(
+         "--warmup_steps",
+         help="Number of learning rate warmup steps",
+         default=100,
+         type=int
+     )
+
+     ### Training Arguments ###
+
+     parser.add_argument(
+         "--per_device_batch_size",
+         help="Batch size for every gradient accumulation step",
+         default=2,
+         type=int
+     )
+
+     parser.add_argument(
+         "--gradient_accumulation_steps",
+         help="Number of gradient accumulation steps you want",
+         default=2,
+         type=int
+     )
+
+     parser.add_argument(
+         "--learning_rate",
+         help="Max learning rate that we warm up to",
+         default=2e-5,
+         type=float
+     )
+
+     parser.add_argument(
+         "--weight_decay",
+         help="Weight decay applied to model parameters during training",
+         default=0.01,
+         type=float
+     )
+
+     parser.add_argument(
+         "--save_total_limit",
+         help="Max number of checkpoints to save",
+         default=4,
+         type=int
+     )
+
+     ### Backbone Arguments ###
+     parser.add_argument(
+         "--huggingface_model_name",
+         help="Name for pretrained RoBERTa backbone and Tokenizer",
+         default="deepset/roberta-base-squad2",
+         type=str
+     )
+
+     parser.add_argument(
+         "--path_to_pretrained_backbone",
+         help="Path to model weights stored from our pretraining to initialize the backbone",
+         default=None,
+         type=str
+     )
+
+     parser.add_argument(
+         "--pretrained_backbone",
+         help="Do you want a `pretrained` backbone that we made (need to provide path_to_pretrained_backbone), \
+               a `pretrained_huggingface` backbone (then need huggingface_model_name), or a `random` initialized backbone",
+         choices=("pretrained", "pretrained_huggingface", "random"),
+         type=str
+     )
+     parser.add_argument('--resume_from_checkpoint', type=str, default=None)
+     parser.add_argument('--model_name_or_path', type=str, default="roberta-base")
+
+     args = parser.parse_args()
+
+     return args
+
+
+ ### Load Arguments ###
+ args = parse_arguments()
+
+ def load_tokenizer(model_name):
+     try:
+         return RobertaTokenizerFast.from_pretrained(model_name)
+     except Exception as e:
+         logging.error(f"Failed to load tokenizer: {e}")
+         raise
+
+ def load_model(config):
+     try:
+         return RobertaForQuestionAnswering(config)
+     except Exception as e:
+         logging.error(f"Failed to load model: {e}")
+         raise
+
+ logging.basicConfig(level=logging.INFO)
+ logging.info("----------Loading dataset and tokenizer----------")
+
+ ### Load Tokenizer ###
+ tokenizer = load_tokenizer(args.huggingface_model_name)
+
+ ### Load Dataset and Preprocess into Start/End Token Positions ###
+ dataset = load_dataset("stanfordnlp/coqa")
+ processor = ExtractiveQAPreProcesing()
+ tokenized_squad = dataset.map(processor, batched=True, remove_columns=dataset["train"].column_names)
+
+ # print(tokenized_squad.column_names)
+ ### Load Model ###
+ if args.resume_from_checkpoint is not None:
+     config = RobertaConfig(pretrained_backbone=args.pretrained_backbone,
+                            path_to_pretrained_weights=args.path_to_pretrained_backbone)
+     model = load_model(config)
+     ### Resume from the model weights stored in the checkpoint directory ###
+     model.load_state_dict(load_file(f"{args.resume_from_checkpoint}/model.safetensors"))
+ else:
+     config = RobertaConfig(pretrained_backbone=args.pretrained_backbone,
+                            path_to_pretrained_weights=args.path_to_pretrained_backbone)
+     model = load_model(config)
+
+
+ ### Load Default Collator; we already padded to max length so no extra padding is necessary ###
+ data_collator = DefaultDataCollator()
+
+ ### Define Training Arguments ###
+ training_args = TrainingArguments(
+     output_dir=os.path.join(args.working_directory, args.experiment_name),
+     per_device_train_batch_size=args.per_device_batch_size,
+     gradient_accumulation_steps=args.gradient_accumulation_steps,
+     # evaluation_strategy="steps",
+     num_train_epochs=args.num_train_epochs,
+     bf16=True,
+     save_steps=args.save_steps,
+     eval_steps=args.eval_steps,
+     logging_steps=args.logging_steps,
+     learning_rate=args.learning_rate,
+     weight_decay=args.weight_decay,
+     warmup_steps=args.warmup_steps,
+     save_total_limit=args.save_total_limit,
+     run_name=args.experiment_name,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_squad["train"],
+     eval_dataset=tokenized_squad["validation"],
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+ )
+
+ ### TRAIN MODEL !!! ###
+ # trainer.train()
+ trainer.train(resume_from_checkpoint="model/RoBERTa/finetune_qa_hf_roberta_backbone/checkpoint-54324")
+
+ ### Save Final Model ###
+ trainer.save_model("/home/tangsan/AllNlpProject/CoQAChat/model/RoBERTa/save_model")
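
Only --experiment_name and --working_directory are required by parse_arguments(); the other flags have defaults. A minimal launch sketch follows; the run name and output root are assumptions, chosen only to match the checkpoint path hardcoded in the trainer.train call above.

# Hypothetical launch of the finetuning script from Python; the argument
# values below are assumptions, not part of this commit.
import subprocess

subprocess.run(
    [
        "python", "finetune_roberta_qa.py",
        "--experiment_name", "finetune_qa_hf_roberta_backbone",  # assumed run name
        "--working_directory", "model/RoBERTa",                  # assumed output root
        "--pretrained_backbone", "pretrained_huggingface",
    ],
    check=True,
)
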
inference.py ADDED
@@ -0,0 +1,104 @@
+ import torch
+ from transformers import RobertaTokenizerFast
+ from utils import RobertaConfig
+ from model import RobertaForQuestionAnswering
+ from safetensors.torch import load_file
+ from datasets import load_dataset
+
+
+ class InferenceModel:
+     """
+     Quick inference wrapper that works with the models we have trained!
+     """
+
+     def __init__(self, path_to_weights, huggingface_model=True):
+         ### Init Config with either Huggingface Backbone or our own ###
+         self.config = RobertaConfig(pretrained_backbone="pretrained_huggingface" if huggingface_model else "random")
+
+         ### Load Tokenizer ###
+         self.tokenizer = RobertaTokenizerFast.from_pretrained(self.config.hf_model_name)
+
+         ### Load Model ###
+         self.model = RobertaForQuestionAnswering(self.config)
+
+         weights = load_file(path_to_weights)
+         self.model.load_state_dict(weights)
+
+         self.model.eval()
+
+     def inference_model(self,
+                         question,
+                         context):
+         ### Tokenize Text ###
+         inputs = self.tokenizer(text=question,
+                                 text_pair=context,
+                                 max_length=self.config.context_length,
+                                 truncation="only_second",
+                                 return_tensors="pt")
+
+         ### Pass through Model ###
+         with torch.no_grad():
+             start_token_logits, end_token_logits = self.model(**inputs)
+
+         ### Grab Start and End Token Idx ###
+         start_token_idx = start_token_logits.squeeze().argmax().item()
+         end_token_idx = end_token_logits.squeeze().argmax().item()
+
+         ### Slice Tokens and then Decode with Tokenizer (+1 because slice is not right inclusive) ###
+         tokens = inputs["input_ids"].squeeze()[start_token_idx:end_token_idx + 1]
+         answer = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
+
+         prediction = {"start_token_idx": start_token_idx,
+                       "end_token_idx": end_token_idx,
+                       "answer": answer}
+
+         return prediction
+
+
+ if __name__ == "__main__":
+
+     dataset = load_dataset("stanfordnlp/coqa")
+
+     data = dataset["validation"][2]
+     # data = dataset["train"][0]
+     # print("answer:", data["answers"])
+     ### Sample Text ###
+     context = data["story"]
+     print("context:", context)
+     question = data["questions"][4]
+
+     tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
+
+     encoded = tokenizer(
+         question,
+         context,
+         max_length=512,
+         truncation="only_second",
+         padding="max_length",
+         return_offsets_mapping=True,
+         return_tensors="pt"
+     )
+     offset_mapping = encoded["offset_mapping"][0].tolist()  # convert to a list of (start, end) pairs
+     input_ids = encoded["input_ids"][0]
+
+
+     ### Inference Model ###
+     path_to_weights = "model/RoBERTa/save_model/model.safetensors"
+     inferencer = InferenceModel(path_to_weights=path_to_weights, huggingface_model=True)
+     prediction = inferencer.inference_model(question, context)
+     print("\n----------------------------------")
+     print("results:", prediction)
+
+     start_token_idx = prediction["start_token_idx"]
+     end_token_idx = prediction["end_token_idx"]
+
+     start_char = offset_mapping[start_token_idx][0]
+     end_char = offset_mapping[end_token_idx][1]
+
+     print("Question:", question)
+     print("Recovered answer:", context[start_char:end_char])
+
+ # test model
model.py ADDED
@@ -0,0 +1,399 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from safetensors.torch import load_file
+ from transformers import RobertaModel as HFRobertaModel
+
+ from utils import RobertaConfig
+
+
+ class RobertaEmbeddings(nn.Module):
+     """
+     Converts our tokens to embedding vectors and then adds positional embeddings (and potentially token type embeddings)
+     to our data! We won't need token type embeddings until we do our QA finetuning.
+     """
+
+     def __init__(self, config):
+         super(RobertaEmbeddings, self).__init__()
+
+         ### Embeddings for Tokens ###
+         self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_dimension, padding_idx=config.pad_token)
+
+         ### Positional Embeddings ###
+         self.position_embeddings = nn.Embedding(config.context_length, config.embedding_dimension)
+
+         ### Layernorm and Dropout ###
+         self.layernorm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+         self.dropout = nn.Dropout(config.hidden_dropout_p)
+
+     def forward(self, input_ids):
+         batch_size, seq_length = input_ids.shape
+
+         ### Convert Tokens to Embeddings ###
+         x = self.word_embeddings(input_ids)
+
+         ### Add Positional Information ###
+         avail_idx = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
+         pos_embed = self.position_embeddings(avail_idx)
+         x = x + pos_embed
+
+         x = self.layernorm(x)
+         x = self.dropout(x)
+
+         return x
+
+
+ class RobertaAttention(nn.Module):
+     """
+     Regular self-attention, but in this case we utilize flash attention
+     (via F.scaled_dot_product_attention) to speed up our training.
+     """
+
+     def __init__(self, config):
+         super(RobertaAttention, self).__init__()
+
+         ### Store Config ###
+         self.config = config
+
+         ### Sanity Checks ###
+         assert config.embedding_dimension % config.num_attention_heads == 0, "Double check embedding dim divisible by number of heads"
+
+         ### Attention Head Dim ###
+         self.head_dim = config.embedding_dimension // config.num_attention_heads
+
+         ### Attention Projections ###
+         self.q_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+         self.k_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+         self.v_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+
+         ### Post Attention Projection ###
+         self.out_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+
+     def forward(self, x, attention_mask=None):
+         ### Store Shape ###
+         batch, seq_len, embed_dim = x.shape
+
+         ### Project to Queries, Keys, and Values and Split into Heads ###
+         q = self.q_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
+         k = self.k_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
+         v = self.v_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
+
+         ### Compute Attention (Attention Mask has shape Batch x Sequence Len x Sequence Len) ###
+         attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                        attn_mask=attention_mask,
+                                                        dropout_p=self.config.attention_dropout_p if self.training else 0.0)
+
+         ### Compute Output Projection ###
+         attention_out = attention_out.transpose(1, 2).flatten(2)
+         attention_out = self.out_proj(attention_out)
+
+         return attention_out
+
+
+ class RobertaFeedForward(nn.Module):
+     """
+     Regular MLP module after our attention computation.
+     """
+
+     def __init__(self, config):
+         super(RobertaFeedForward, self).__init__()
+
+         hidden_size = config.embedding_dimension * config.mlp_ratio
+         self.intermediate_dense = nn.Linear(config.embedding_dimension, hidden_size)
+         self.activation = nn.GELU()
+         self.intermediate_dropout = nn.Dropout(config.hidden_dropout_p)
+
+         self.output_dense = nn.Linear(hidden_size, config.embedding_dimension)
+         self.output_dropout = nn.Dropout(config.hidden_dropout_p)
+
+     def forward(self, x):
+         x = self.intermediate_dense(x)
+         x = self.activation(x)
+         x = self.intermediate_dropout(x)
+
+         x = self.output_dense(x)
+         x = self.output_dropout(x)
+         return x
+
+
+ class RobertaEncoderLayer(nn.Module):
+     """
+     Single transformer block stacking together Attention and our FeedForward
+     layers, with normalization and residual connections.
+     """
+
+     def __init__(self, config):
+         super(RobertaEncoderLayer, self).__init__()
+
+         self.attention = RobertaAttention(config)
+         self.dropout = nn.Dropout(config.hidden_dropout_p)
+         self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+         self.feed_forward = RobertaFeedForward(config)
+         self.final_layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+
+     def forward(self, x, attention_mask=None):
+         x = x + self.dropout(self.attention(x, attention_mask=attention_mask))
+         x = self.layer_norm(x)
+
+         x = x + self.feed_forward(x)
+         x = self.final_layer_norm(x)
+
+         return x
+
+
+ class RobertaEncoder(nn.Module):
+     """
+     This is the stack of all of our transformer blocks.
+     """
+
+     def __init__(self, config):
+         super(RobertaEncoder, self).__init__()
+
+         self.config = config
+
+         ### Transformer Layers ###
+         self.layers = nn.ModuleList(
+             [
+                 RobertaEncoderLayer(config) for _ in range(config.num_transformer_blocks)
+             ]
+         )
+
+     def forward(self,
+                 x,
+                 attention_mask=None):
+
+         batch_size, seq_len, embed_dim = x.shape
+
+         if attention_mask is not None:
+             ### Make Sure Attention Mask is a Boolean Tensor ###
+             attention_mask = attention_mask.bool()
+
+             ### Our attention mask is (Batch x Sequence Length), with 0 for tokens we don't want to attend to. ###
+             ### F.scaled_dot_product_attention expects a mask of shape (Batch x ... x Seq_len x Seq_len), where "..." is ###
+             ### any extra dimensions (such as heads of attention). Expand the mask to (Batch x 1 x Seq_len x Seq_len); ###
+             ### the 1 is a dummy head dimension to broadcast over. In each (Seq_len x Seq_len) matrix for every batch, ###
+             ### we want False for all columns corresponding to padding tokens. ###
+             attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, seq_len, 1)
+
+         for layer in self.layers:
+             x = layer(x, attention_mask=attention_mask)
+
+         return x
+
+
+ class RobertaMLMHead(nn.Module):
+     """
+     The Masked Language Model head is a stack of two linear layers with an activation in between!
+     """
+
+     def __init__(self, config):
+         super(RobertaMLMHead, self).__init__()
+
+         self.config = config
+
+         ### Projection Layer for Hidden States ###
+         self.dense = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+         self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+         self.activation = nn.GELU()
+
+         ### Mapping to Vocabulary ###
+         self.decoder = nn.Linear(config.embedding_dimension, config.vocab_size)
+
+     def forward(self, inputs):
+         ### Pass through Projection/Activation/Norm ###
+         x = self.dense(inputs)
+         x = self.activation(x)
+         x = self.layer_norm(x)
+
+         ### Prediction of Masked Tokens ###
+         x = self.decoder(x)
+
+         return x
+
+
+ class RobertaModel(nn.Module):
+     """
+     Backbone of our model; has to be pretrained via MLM on a ton of data!
+     """
+
+     def __init__(self, config):
+         super(RobertaModel, self).__init__()
+
+         self.config = config
+
+         ### Define all Parts of the Model ###
+         self.embeddings = RobertaEmbeddings(config)
+         self.encoder = RobertaEncoder(config)
+
+     def forward(self, input_ids, attention_mask=None):
+         embeddings = self.embeddings(input_ids)
+         output = self.encoder(embeddings, attention_mask)
+
+         return output
+
+
+ class RobertaForMaskedLM(nn.Module):
+     """
+     This model will perform the masked language modeling task.
+     """
+
+     def __init__(self, config):
+         super(RobertaForMaskedLM, self).__init__()
+
+         self.config = config
+
+         ### Define Model and MLM Head ###
+         self.roberta = RobertaModel(config)
+         self.mlm_head = RobertaMLMHead(config)
+
+         self.apply(_init_weights_)
+
+     def forward(self,
+                 input_ids,
+                 attention_mask=None,
+                 labels=None):
+
+         ### Pass data through model ###
+         hidden_states = self.roberta(input_ids,
+                                      attention_mask)
+
+         preds = self.mlm_head(hidden_states)
+
+         ### Compute Loss if Labels are Available ###
+         loss = None
+         if labels is not None:
+
+             ### Flatten Logits to (B*S x N) and Labels to (B*S) ###
+             preds = preds.flatten(end_dim=1)
+             labels = labels.flatten()
+
+             loss = F.cross_entropy(preds, labels)
+
+             return hidden_states, preds, loss
+
+         else:
+             return hidden_states, preds
+
+
+ class RobertaForQuestionAnswering(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+
+         self.load_backbone()
+         self.qa_head = nn.Linear(config.embedding_dimension, 2)
+
+     def load_backbone(self):
+         if self.config.pretrained_backbone == "pretrained_huggingface":
+             print("Loading Huggingface RoBERTa Model")
+             self.roberta = HFRobertaModel.from_pretrained(self.config.hf_model_name)
+         else:
+             self.roberta = RobertaModel(self.config)
+             if self.config.pretrained_backbone == "pretrained":
+                 if self.config.path_to_pretrained_weights is None:
+                     raise Exception(
+                         "Provide the argument `path_to_pretrained_weights` in the config, else we can't load them!")
+                 else:
+                     if not os.path.isfile(self.config.path_to_pretrained_weights):
+                         raise Exception(
+                             f"Provided path to safetensors weights {self.config.path_to_pretrained_weights} is invalid!")
+                     print(f"Loading RobertaModel Backbone from {self.config.path_to_pretrained_weights}")
+
+                     state_dict = load_file(self.config.path_to_pretrained_weights)
+
+                     ### Filter and rename keys so they match our backbone's parameter names ###
+                     backbone_keys = {}
+                     for key in state_dict.keys():
+                         if "roberta" in key:
+                             new_key = key.replace("roberta.", "")
+                             backbone_keys[new_key] = state_dict[key]
+                         else:
+                             continue
+
+                     self.roberta.load_state_dict(backbone_keys)
+
+     def forward(self,
+                 input_ids,
+                 attention_mask=None,
+                 start_positions=None,
+                 end_positions=None):
+
+         if self.config.pretrained_backbone == "pretrained_huggingface":
+             output = self.roberta(input_ids, attention_mask=attention_mask).last_hidden_state
+         else:
+             output = self.roberta(input_ids, attention_mask=attention_mask)
+
+         ### Project Hidden States to Start/End Logits ###
+         logit = self.qa_head(output)
+         start_logits, end_logits = logit.split(1, dim=-1)
+         start_logits = start_logits.squeeze(-1)
+         end_logits = end_logits.squeeze(-1)
+
+         if start_positions is not None and end_positions is not None:
+
+             ### Squeeze Extra Dimensions off the Targets if Present ###
+             if len(start_positions.size()) > 1:
+                 start_positions = start_positions.squeeze(-1)
+             if len(end_positions.size()) > 1:
+                 end_positions = end_positions.squeeze(-1)
+
+             ### Positions Outside the Sequence are Clamped and Ignored in the Loss ###
+             ignored_index = start_logits.size(1)
+             start_positions = start_positions.clamp(0, ignored_index)
+             end_positions = end_positions.clamp(0, ignored_index)
+
+             start_loss = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index)
+             end_loss = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index)
+
+             total_loss = (start_loss + end_loss) / 2
+
+             return total_loss, start_logits, end_logits
+
+         return start_logits, end_logits
+
+
+ def _init_weights_(module):
+     """
+     Simple weight initialization taken directly from the huggingface
+     `modeling_roberta.py` implementation!
+     """
+     if isinstance(module, nn.Linear):
+         module.weight.data.normal_(mean=0.0, std=0.02)
+         if module.bias is not None:
+             module.bias.data.zero_()
+     elif isinstance(module, nn.Embedding):
+         module.weight.data.normal_(mean=0.0, std=0.02)
+         if module.padding_idx is not None:
+             module.weight.data[module.padding_idx].zero_()
+     elif isinstance(module, nn.LayerNorm):
+         module.bias.data.zero_()
+         module.weight.data.fill_(1.0)
+
+
+ if __name__ == "__main__":
+
+     config = RobertaConfig(pretrained_backbone="pretrained",
+                            path_to_pretrained_weights="/home/tangsan/AllNlpProject/CoQAChat/model/RoBERTa/finetune_qa_hf_roberta_backbone/checkpoint-27162/model.safetensors")
+     model = RobertaForQuestionAnswering(config=config)
+
+     rand = torch.randint(0, 100, size=(4, 8))
+     start_positions = torch.tensor([1, 2, 3, 4])
+     end_positions = torch.tensor([5, 6, 7, 8])
+     model(rand, start_positions=start_positions, end_positions=end_positions)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ fastapi
+ uvicorn
+ transformers
+ torch
save_model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
save_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf41fc1b853a2a0fc06599fa407a82f96d6aa4e2a6650347d07abc09defccd30
+ size 498612792
save_model/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
save_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
save_model/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "<s>",
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "full_tokenizer_file": null,
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
save_model/training_args.bin ADDED
Binary file (5.78 kB)
 
save_model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,220 @@
+ import torch
+ import random
+ from typing import Literal
+
+ print("CUDA available:", torch.cuda.is_available())
+ print("CUDA device count:", torch.cuda.device_count())
+ if torch.cuda.is_available():
+     print("GPU name:", torch.cuda.get_device_name(0))
+
+ from transformers import RobertaTokenizerFast, PretrainedConfig
+ from dataclasses import dataclass, asdict
+
+ from datasets import load_dataset
+ from pprintpp import pprint
+
+ @dataclass
+ class RobertaConfig(PretrainedConfig):
+     ### Tokenizer Config ###
+     vocab_size: int = 50265
+     start_token: int = 0
+     end_token: int = 2
+     pad_token: int = 1  # <pad> id in the RoBERTa vocab (see save_model/tokenizer_config.json)
+     mask_token: int = 50264
+
+     ### Transformer Config ###
+     embedding_dimension: int = 768
+     num_transformer_blocks: int = 12
+     num_attention_heads: int = 12
+     mlp_ratio: int = 4
+     layer_norm_eps: float = 1e-6
+     hidden_dropout_p: float = 0.1
+     attention_dropout_p: float = 0.1
+     context_length: int = 512
+
+     ### Masking Config ###
+     masking_prob: float = 0.15
+
+     ### Huggingface Config ###
+     hf_model_name: str = "deepset/roberta-base-squad2"
+
+     ### Model Config ###
+     pretrained_backbone: Literal["pretrained", "pretrained_huggingface", "random"] = "pretrained"
+     max_position_embeddings: int = 512
+     path_to_pretrained_weights: str = None
+
+     ### to_dict() is added so this Config is compatible with the Huggingface Trainer!!! ###
+     def to_dict(self):
+         return asdict(self)
+
+
+ def random_masking_text(tokens,
+                         special_tokens_mask,
+                         vocab_size=50264,
+                         special_ids=(0, 1, 2, 3, 50264),
+                         mask_ratio=0.15,
+                         mask_token=50264):
+     """
+     Function for our random masking of tokens (excluding special tokens). This follows the logic provided
+     by BERT/RoBERTa:
+
+     - Select 15% of the tokens for masking
+     - 80% of the selected tokens are replaced with a mask token
+     - 10% of the selected tokens are replaced with another random token
+     - 10% of the selected tokens are left alone
+
+     This is almost identical to the masking function in our introductory jupyter notebook walkthrough of
+     masked language modeling, but some minor changes are made to apply masking to batches of tokens
+     rather than just one sequence at a time!
+     """
+
+     ### Create Random Uniform Sample Tensor ###
+     random_masking = torch.rand(*tokens.shape)
+
+     ### Set Value of Special Tokens to 1 so we DON'T MASK THEM ###
+     random_masking[special_tokens_mask == 1] = 1
+
+     ### Get Boolean of Words under Masking Threshold ###
+     random_masking = (random_masking < mask_ratio)
+
+     ### Create Labels ###
+     labels = torch.full(tokens.shape, -100)
+     labels[random_masking] = tokens[random_masking]
+
+     ### Get Indexes of True ###
+     random_selected_idx = random_masking.nonzero()
+
+     ### 80% of the Time Replace with the Mask Token ###
+     masking_flag = torch.rand(len(random_selected_idx))
+     masking_flag = (masking_flag < 0.8)
+     selected_idx_for_masking = random_selected_idx[masking_flag]
+
+     ### Separate out remaining indexes to be assigned ###
+     unselected_idx_for_masking = random_selected_idx[~masking_flag]
+
+     ### 10% of the time (or 50 percent of the remaining 20%) we fill with a random token ###
+     ### The remaining times, leave the text as is ###
+     masking_flag = torch.rand(len(unselected_idx_for_masking))
+     masking_flag = (masking_flag < 0.5)
+     selected_idx_for_random_filling = unselected_idx_for_masking[masking_flag]
+     selected_idx_to_be_left_alone = unselected_idx_for_masking[~masking_flag]
+
+     ### Fill Mask Tokens ###
+     if len(selected_idx_for_masking) > 0:
+         tokens[selected_idx_for_masking[:, 0], selected_idx_for_masking[:, 1]] = mask_token
+
+     ### Fill Random Tokens ###
+     if len(selected_idx_for_random_filling) > 0:
+         non_special_ids = list(set(range(vocab_size)) - set(special_ids))
+         randomly_selected_tokens = torch.tensor(random.sample(non_special_ids, len(selected_idx_for_random_filling)))
+         tokens[selected_idx_for_random_filling[:, 0], selected_idx_for_random_filling[:, 1]] = randomly_selected_tokens
+
+     return tokens, labels
+
+
+ def ExtractiveQAPreProcesing():
+
+     tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
+
+     def char2token_mapping(examples):
+         # pprint(examples)
+
+         ### CoQA stores a list of questions per story, so flatten to one (question, story) pair per example ###
+         questions = [q.strip() for sublist in examples["questions"] for q in sublist]
+         # pprint(questions)
+         stories = []
+         for idx, sublist in enumerate(examples["questions"]):
+             stories.extend([examples["story"][idx]] * len(sublist))
+
+         ### Now both questions and stories are 1D lists of the same length ###
+         inputs = tokenizer(
+             text=questions,
+             text_pair=stories,
+             max_length=512,
+             truncation="only_second",
+             return_offsets_mapping=True,
+             padding="max_length",
+         )
+         offset_mapping = inputs.pop("offset_mapping")
+         # pprint(inputs)
+         answers = examples["answers"]
+         input_text = []
+         answer_start = []
+         answer_end = []
+
+         for answer in answers:
+             input_text.extend(answer["input_text"])
+             answer_start.extend(answer["answer_start"])
+             answer_end.extend(answer["answer_end"])
+
+         starting_token_idxs = []
+         ending_token_idxs = []
+
+         for i, offset in enumerate(offset_mapping):
+
+             start_char = answer_start[i]
+             end_char = answer_end[i]
+
+             # if start_char == -1 or end_char == -1:
+             #     starting_token_idxs.append(0)
+             #     ending_token_idxs.append(0)
+             #     continue
+
+             sequence_ids = inputs.sequence_ids(i)
+
+             ### Find the token span of the context (sequence id 1) ###
+             context_start = None
+             context_end = None
+
+             for idx, id in enumerate(sequence_ids):
+                 if context_start is None and id == 1:
+                     context_start = idx
+                 elif context_start is not None and id != 1:
+                     context_end = idx - 1
+                     break
+                 elif context_start is not None and idx == len(sequence_ids) - 1:
+                     context_end = idx
+
+             context_start_char = offset[context_start][0]
+             context_end_char = offset[context_end][-1]
+
+             ### If the answer falls fully inside the (possibly truncated) context, map its characters to token indexes ###
+             if (start_char >= context_start_char) and (end_char <= context_end_char):
+                 # print(start_char, end_char)
+                 start_token_idx = None
+                 end_token_idx = None
+                 for token_idx, (offsets, seq_id) in enumerate(zip(offset, sequence_ids)):
+                     if seq_id == 1:
+                         if start_char in range(offsets[0], offsets[1] + 1):
+                             start_token_idx = token_idx
+                         if end_char in range(offsets[0], offsets[1] + 1):
+                             end_token_idx = token_idx
+
+                 starting_token_idxs.append(start_token_idx)
+                 ending_token_idxs.append(end_token_idx)
+                 # print("start_token_idx", start_token_idx, "end_token_idx", end_token_idx)
+             else:
+                 ### Otherwise, label the example as unanswerable (position 0) ###
+                 starting_token_idxs.append(0)
+                 ending_token_idxs.append(0)
+
+         inputs["start_positions"] = starting_token_idxs
+         inputs["end_positions"] = ending_token_idxs
+         return inputs
+
+     return char2token_mapping
+
+
+ if __name__ == "__main__":
+     datasets = load_dataset("stanfordnlp/coqa")
+
+     # print(datasets)
+
+     processor = ExtractiveQAPreProcesing()
+     data = datasets["train"][:1]
+     print("Raw Data:", data["answers"])
+     result = processor(data)
+     # pprint(processor(data))
+
+ # Train model
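
random_masking_text is defined in utils.py above but not exercised in its __main__ block; a minimal sketch of calling it on a tokenized batch follows. The sample sentence is arbitrary, and everything else follows the signatures shown above.

# Minimal sketch of calling random_masking_text on a tokenized batch.
from transformers import RobertaTokenizerFast
from utils import random_masking_text

tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
batch = tokenizer(
    ["The quick brown fox jumps over the lazy dog."],  # arbitrary sample text
    return_special_tokens_mask=True,
    return_tensors="pt",
)

masked_tokens, labels = random_masking_text(
    batch["input_ids"].clone(),        # clone so the original ids stay untouched
    batch["special_tokens_mask"],
)
print(masked_tokens)
print(labels)  # -100 everywhere except the positions selected for masking
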