Commit df55b07 · 1 Parent(s): 4bc33dc
committed by unknown
Files changed (36)
  1. Script/Exp_Script/ChatGPT/bleu.py +134 -0
  2. Script/Exp_Script/ChatGPT/calculate_chatgpt_completion.py +273 -0
  3. Script/Exp_Script/ChatGPT/calculate_chatgpt_gen.py +384 -0
  4. Script/Exp_Script/Code-LLaMA/bleu.py +134 -0
  5. Script/Exp_Script/Code-LLaMA/calculate_codellama_completion.py +269 -0
  6. Script/Exp_Script/Code-LLaMA/calculate_codellama_gen.py +382 -0
  7. Script/Exp_Script/ForkFlow/bleu.py +134 -0
  8. Script/Exp_Script/ForkFlow/calculate_forkflow.py +407 -0
  9. Script/Model/CodeBert/code-completion/model.py +213 -0
  10. Script/Model/CodeBert/code-completion/run_completion.py +540 -0
  11. Script/Model/CodeBert/code-generation/bleu.py +134 -0
  12. Script/Model/CodeBert/code-generation/model.py +213 -0
  13. Script/Model/CodeBert/code-generation/run_generation.py +470 -0
  14. Script/Model/CodeT5+/code-completion/run_completion.py +525 -0
  15. Script/Model/CodeT5+/code-generation/bleu.py +134 -0
  16. Script/Model/CodeT5+/code-generation/run_generation.py +478 -0
  17. Script/Model/CodeT5+/new-target-completion/run_completion.py +614 -0
  18. Script/Model/CodeT5+/new-target-generation/bleu.py +134 -0
  19. Script/Model/CodeT5+/new-target-generation/run_generation.py +546 -0
  20. Script/Model/CodeT5/code-completion/run_completion.py +543 -0
  21. Script/Model/CodeT5/code-generation/bleu.py +134 -0
  22. Script/Model/CodeT5/code-generation/model.py +213 -0
  23. Script/Model/CodeT5/code-generation/run_generation.py +478 -0
  24. Script/Model/GraphCodeBert/code-completion/model.py +213 -0
  25. Script/Model/GraphCodeBert/code-completion/run_completion.py +545 -0
  26. Script/Model/GraphCodeBert/code-generation/bleu.py +134 -0
  27. Script/Model/GraphCodeBert/code-generation/model.py +213 -0
  28. Script/Model/GraphCodeBert/code-generation/run_generation.py +474 -0
  29. Script/Model/NatGen/code-completion/run_completion.py +520 -0
  30. Script/Model/NatGen/code-generation/bleu.py +134 -0
  31. Script/Model/NatGen/code-generation/run_generation.py +477 -0
  32. Script/Model/UnixCoder/code-completion/model.py +213 -0
  33. Script/Model/UnixCoder/code-completion/run_completion.py +543 -0
  34. Script/Model/UnixCoder/code-generation/bleu.py +134 -0
  35. Script/Model/UnixCoder/code-generation/model.py +213 -0
  36. Script/Model/UnixCoder/code-generation/run_generation.py +467 -0
Script/Exp_Script/ChatGPT/bleu.py ADDED
@@ -0,0 +1,134 @@
+ # Copyright 2017 Google Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ """Python implementation of BLEU and smooth-BLEU.
+
+ This module provides a Python implementation of BLEU and smooth-BLEU.
+ Smooth BLEU is computed following the method outlined in the paper:
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+ evaluation metrics for machine translation. COLING 2004.
+ """
+
+ import collections
+ import math
+
+
+ def _get_ngrams(segment, max_order):
+   """Extracts all n-grams up to a given maximum order from an input segment.
+
+   Args:
+     segment: text segment from which n-grams will be extracted.
+     max_order: maximum length in tokens of the n-grams returned by this
+         method.
+
+   Returns:
+     The Counter containing all n-grams up to max_order in segment
+     with a count of how many times each n-gram occurred.
+   """
+   ngram_counts = collections.Counter()
+   for order in range(1, max_order + 1):
+     for i in range(0, len(segment) - order + 1):
+       ngram = tuple(segment[i:i+order])
+       ngram_counts[ngram] += 1
+   return ngram_counts
+
+
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                  smooth=False):
+   """Computes BLEU score of translated segments against one or more references.
+
+   Args:
+     reference_corpus: list of lists of references for each translation. Each
+         reference should be tokenized into a list of tokens.
+     translation_corpus: list of translations to score. Each translation
+         should be tokenized into a list of tokens.
+     max_order: Maximum n-gram order to use when computing BLEU score.
+     smooth: Whether or not to apply Lin et al. 2004 smoothing.
+
+   Returns:
+     6-tuple with the BLEU score, n-gram precisions, brevity penalty,
+     length ratio, translation length, and reference length.
+   """
+   matches_by_order = [0] * max_order
+   possible_matches_by_order = [0] * max_order
+   reference_length = 0
+   translation_length = 0
+   for (references, translation) in zip(reference_corpus,
+                                        translation_corpus):
+     reference_length += min(len(r) for r in references)
+     translation_length += len(translation)
+
+     merged_ref_ngram_counts = collections.Counter()
+     for reference in references:
+       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
+     translation_ngram_counts = _get_ngrams(translation, max_order)
+     overlap = translation_ngram_counts & merged_ref_ngram_counts
+     for ngram in overlap:
+       matches_by_order[len(ngram)-1] += overlap[ngram]
+     for order in range(1, max_order+1):
+       possible_matches = len(translation) - order + 1
+       if possible_matches > 0:
+         possible_matches_by_order[order-1] += possible_matches
+
+   precisions = [0] * max_order
+   for i in range(0, max_order):
+     if smooth:
+       precisions[i] = ((matches_by_order[i] + 1.) /
+                        (possible_matches_by_order[i] + 1.))
+     else:
+       if possible_matches_by_order[i] > 0:
+         precisions[i] = (float(matches_by_order[i]) /
+                          possible_matches_by_order[i])
+       else:
+         precisions[i] = 0.0
+
+   if min(precisions) > 0:
+     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
+     geo_mean = math.exp(p_log_sum)
+   else:
+     geo_mean = 0
+
+   ratio = float(translation_length) / reference_length
+
+   if ratio > 1.0:
+     bp = 1.
+   else:
+     bp = math.exp(1 - 1. / ratio)
+
+   bleu = geo_mean * bp
+
+   return (bleu, precisions, bp, ratio, translation_length, reference_length)
+
+
+ def _bleu(ref_file, trans_file, subword_option=None):
+   max_order = 4
+   smooth = True
+   ref_files = [ref_file]
+   reference_text = []
+   for reference_filename in ref_files:
+     with open(reference_filename) as fh:
+       reference_text.append(fh.readlines())
+   per_segment_references = []
+   for references in zip(*reference_text):
+     reference_list = []
+     for reference in references:
+       reference_list.append(reference.strip().split())
+     per_segment_references.append(reference_list)
+   translations = []
+   with open(trans_file) as fh:
+     for line in fh:
+       translations.append(line.strip().split())
+   bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
+   return round(100 * bleu_score, 2)
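For reference, a minimal sketch (not part of the commit) of how `compute_bleu` above behaves; the token lists are hypothetical. With `smooth=True` (the setting `_bleu` uses), empty n-gram buckets fall back to (matches+1)/(possible+1), so short segments still receive a nonzero score.

    # hypothetical usage of compute_bleu from the bleu.py above
    from bleu import compute_bleu

    reference = ["static", "rtx", "gen_load", "(", ")", ";"]
    candidate = ["static", "rtx", "gen_load", "(", ")", ";"]

    # one list of references per translation; smooth=True as in _bleu
    bleu, precisions, bp, ratio, t_len, r_len = compute_bleu(
        [[reference]], [candidate], max_order=4, smooth=True)
    print(round(100 * bleu, 2))  # 100.0 for an exact match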
Script/Exp_Script/ChatGPT/calculate_chatgpt_completion.py ADDED
@@ -0,0 +1,273 @@
+ import os
+ # from tree_sitter import Language, Parser
+ # # import pandas as pd
+ # import openpyxl
+ import json
+ import time
+ import csv
+ import pathlib
+ import difflib
+ import re
+ from bleu import _bleu
+ from fuzzywuzzy import fuzz
+ import random
+ import numpy as np
+ from transformers import RobertaTokenizer
+ # tokens = nltk.word_tokenize(sentence)
+ import argparse
+
+ parser = argparse.ArgumentParser(description='Test')
+ parser.add_argument("--task", default=None, type=str, required=True,
+                     help="Task Type: statement_level, next_statement")
+ args = parser.parse_args()
+
+
+ folder = str(pathlib.Path(__file__).parent.resolve())
+ isa_type_dir = folder + "/../../../Dataset"
+ src_dir = folder + f"/../../../Dataset/Code_Completion/{args.task}"
+ dst_dir = folder + "/Result"
+
+ train_lis = []
+ valid_lis = []
+ test_lis = []
+
+ target_clf = {}
+
+
+ def get_target_clf_list():
+     global target_clf
+     with open(isa_type_dir + "/comback_isa_type.csv", "r", encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for idx, l in enumerate(reader):
+             if l[1].lower() == "arc" or l[1].lower() == "riscv" or l[1].lower() == "nvptx":
+                 continue
+             if l[0] + " " + l[2] not in target_clf.keys():
+                 target_clf[l[0] + " " + l[2]] = [l[1]]
+             else:
+                 target_clf[l[0] + " " + l[2]] += [l[1]]
+
+
+ def Calculate_Completion():
+     get_target_clf_list()
+     print("############## Exp 2: Calculate ChatGPT Stmt Completion ################\n")
+
+     test_lis = ["nvptx", "arc", "riscv"]
+
+     codellama_gcc_code = {}
+     codellama_llvm_code = {}
+
+     if args.task == "next_statement":
+         dst_file = dst_dir + "/Output/chatgpt_next_output_cleaned.csv"
+     else:
+         dst_file = dst_dir + "/Output/chatgpt_stmt_output_cleaned.csv"
+
+     with open(dst_file, encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for idx, row in enumerate(reader):
+             if row[0] == "GCC":
+                 codellama_gcc_code[row[1] + " " + str(row[2])] = row[3]
+             else:
+                 codellama_llvm_code[row[1] + " " + str(row[2])] = row[3]
+     avg_accuracy = {}
+     for comp_type in ["GCC", "LLVM"]:
+         for isa_type in ["GPU", "MPU", "CPU"]:
+             test_target_dic = {}
+             cnt_idx = 0
+             if comp_type == "GCC":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/riscv.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["riscv" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_gcc_code.keys():
+                             chat_code = codellama_gcc_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "riscv", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "riscv", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "riscv"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/nvptx.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["nvptx" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_gcc_code.keys():
+                             chat_code = codellama_gcc_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "nvptx", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "nvptx", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "nvptx"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "MPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/arc.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["arc" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_gcc_code.keys():
+                             chat_code = codellama_gcc_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "arc", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "arc", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "arc"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+
+             if comp_type == "LLVM":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/RISCV.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["RISCV" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_llvm_code.keys():
+                             chat_code = codellama_llvm_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "RISCV", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "RISCV", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "RISCV"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/NVPTX.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["NVPTX" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_llvm_code.keys():
+                             chat_code = codellama_llvm_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "NVPTX", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "NVPTX", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "NVPTX"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "MPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/ARC.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["ARC" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_llvm_code.keys():
+                             chat_code = codellama_llvm_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "ARC", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "ARC", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "ARC"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+
+     return avg_accuracy
+
+
+ if __name__ == "__main__":
+     with open(dst_dir + '/result.csv', 'w', newline='') as file:
+         writer = csv.writer(file)
+         writer.writerow(["Compiler Type", "Target", "Idx", "Exact Match", "Edit Distance"])
+
+     avg_dic = Calculate_Completion()
+
+     for k in avg_dic:
+         print("########################")
+         print(k)
+         print(" ".join(["Exact Match", "Edit Distance"]))
+         print(" ".join(avg_dic[k]))
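The script's two completion metrics reduce to a whitespace-insensitive string comparison plus a fuzzywuzzy ratio. A minimal sketch (not part of the commit), with hypothetical strings:

    from fuzzywuzzy import fuzz

    src_code  = "if ( x ) return 0 ;"   # ground truth, space-tokenized
    chat_code = "if (x) return 0;"      # model output

    # spaces are stripped first, so tokenization differences do not
    # break the exact-match (EM) check
    EM = 1 if chat_code.replace(" ", "") == src_code.replace(" ", "") else 0
    edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
    print(EM, edit_dis)  # 1 100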
Script/Exp_Script/ChatGPT/calculate_chatgpt_gen.py ADDED
@@ -0,0 +1,384 @@
+ import os
+ # from tree_sitter import Language, Parser
+ # # import pandas as pd
+ # import openpyxl
+ import json
+ import time
+ import csv
+ import pathlib
+ import difflib
+ import re
+ from bleu import _bleu
+ from fuzzywuzzy import fuzz
+ import random
+ import numpy as np
+ from transformers import RobertaTokenizer
+ # tokens = nltk.word_tokenize(sentence)
+
+ folder = str(pathlib.Path(__file__).parent.resolve())
+ isa_type_dir = folder + "/../../../Dataset"
+ src_dir = folder + "/../../../Dataset/Code_Generation"
+ dst_dir = folder + "/Result"
+
+ train_lis = []
+ valid_lis = []
+ test_lis = []
+
+ target_clf = {}
+
+
+ def get_target_clf_list():
+     global target_clf
+     with open(isa_type_dir + "/comback_isa_type.csv", "r", encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for idx, l in enumerate(reader):
+             if l[1].lower() == "arc" or l[1].lower() == "riscv" or l[1].lower() == "nvptx":
+                 continue
+             if l[0] + " " + l[2] not in target_clf.keys():
+                 target_clf[l[0] + " " + l[2]] = [l[1]]
+             else:
+                 target_clf[l[0] + " " + l[2]] += [l[1]]
+
+
+ def Calculate_Statements_Ratio(Src_List, Fork_Lis, src_name, fork_name):
+     src_code = ""
+     Fork_code = ""
+     idx = 0
+     cnt_stmt = 0.0
+     while idx < len(Src_List):
+         src_code += Src_List[idx].replace(src_name, "").replace(src_name.upper(), "")
+         if Src_List[idx] in [";", ":", "{", "}"]:
+             src_code += "\n"
+             cnt_stmt += 1
+         idx += 1
+     idx = 0  # reset the cursor so the fork list is scanned from its start
+     while idx < len(Fork_Lis):
+         Fork_code += Fork_Lis[idx].replace(fork_name, "").replace(fork_name.upper(), "")
+         if Fork_Lis[idx] in [";", ":", "{", "}"]:
+             Fork_code += "\n"
+         idx += 1
+
+     code_same = 0
+     code_modi = 0
+     code_add = 0
+     diff_code = list(difflib.Differ().compare(src_code.splitlines(), Fork_code.splitlines()))
+     for idx, dv in enumerate(diff_code):
+         if dv[0] == '-':
+             if idx < len(diff_code) - 1 and diff_code[idx+1][0] == '?':
+                 code_modi += 1
+             else:
+                 code_add += 1
+         elif dv[0] == '+':
+             continue
+         elif dv[0] == '?':
+             continue
+             # vega_add -= 1
+         elif dv.strip().replace("\n", "") == '':
+             continue
+         else:
+             code_same += 1
+     return round(float(code_same) / cnt_stmt, 2)
+
+
+ def Calculate_Gen():
+     get_target_clf_list()
+     print("############## Exp 2: Calculate ChatGPT ################\n")
+
+     test_lis = ["nvptx", "arc", "riscv"]
+
+     chatgpt_gcc_code = {}
+     chatgpt_llvm_code = {}
+     avg_accuracy = {}
+
+     with open(dst_dir + "/chatgpt_gen_output.jsonl", encoding="utf-8") as f:
+         for idx, line in enumerate(f):
+             js = json.loads(line)
+             if js["Compiler_Type"] == "GCC":
+                 chatgpt_gcc_code[str(js["Target"]) + " " + js["idx"]] = js["Code"]
+             else:
+                 chatgpt_llvm_code[str(js["Target"]) + " " + js["idx"]] = js["Code"]
+
+     for comp_type in ["GCC", "LLVM"]:
+         for isa_type in ["GPU", "MPU", "CPU"]:
+             target_lis = target_clf[comp_type + " " + isa_type]
+             test_target_dic = {}
+             cnt_idx = 0
+             if comp_type == "GCC":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/riscv.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["riscv" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("riscv", "")
+                         if k in chatgpt_gcc_code.keys():
+                             chat_code = " ".join(chatgpt_gcc_code[k]).replace("riscv", "").replace("RISCV", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], chatgpt_gcc_code[k], "riscv", "riscv")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "riscv", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "riscv", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "riscv"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/nvptx.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["nvptx" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("nvptx", "")
+                         if k in chatgpt_gcc_code.keys():
+                             chat_code = " ".join(chatgpt_gcc_code[k]).replace("nvptx", "").replace("NVPTX", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], chatgpt_gcc_code[k], "nvptx", "nvptx")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "nvptx", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "nvptx", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "nvptx"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+
+                 if isa_type == "MPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/arc.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["arc" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("arc", "")
+                         if k in chatgpt_gcc_code.keys():
+                             chat_code = " ".join(chatgpt_gcc_code[k]).replace("arc", "").replace("ARC", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], chatgpt_gcc_code[k], "arc", "arc")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "arc", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "arc", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "arc"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+
+             if comp_type == "LLVM":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/RISCV.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["RISCV" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("RISCV", "")
+                         if k in chatgpt_llvm_code.keys():
+                             chat_code = " ".join(chatgpt_llvm_code[k]).replace("riscv", "").replace("RISCV", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], chatgpt_llvm_code[k], "riscv", "riscv")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "RISCV", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "RISCV", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "RISCV"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/NVPTX.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["NVPTX" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("NVPTX", "")
+                         if k in chatgpt_llvm_code.keys():
+                             chat_code = " ".join(chatgpt_llvm_code[k]).replace("nvptx", "").replace("NVPTX", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], chatgpt_llvm_code[k], "nvptx", "nvptx")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "NVPTX", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "NVPTX", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "NVPTX"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+
+                 if isa_type == "MPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/ARC.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["ARC" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("ARC", "")
+                         if k in chatgpt_llvm_code.keys():
+                             chat_code = " ".join(chatgpt_llvm_code[k]).replace("arc", "").replace("ARC", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], chatgpt_llvm_code[k], "arc", "arc")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "ARC", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "ARC", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "ARC"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+
+     return avg_accuracy
+
+
+ if __name__ == "__main__":
+     with open(dst_dir + '/result.csv', 'w', newline='') as file:
+         writer = csv.writer(file)
+         writer.writerow(["Compiler Type", "Target", "Idx", "BLEU4", "Exact Match", "Edit Distance", "Stmt_Ratio"])
+
+     avg_dic = Calculate_Gen()
+
+     for k in avg_dic:
+         print("########################")
+         print(k)
+         print(" ".join(["BLEU4", "Exact Match", "Edit Distance", "Stmt_Ratio"]))
+         print(" ".join(avg_dic[k]))
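`Calculate_Statements_Ratio` above splits both token streams into statements at `;`, `:`, `{`, `}` and then counts, via `difflib.Differ`, how many ground-truth statements survive unchanged in the generated code. A minimal sketch (not part of the commit) of that line classification, on two hypothetical single-statement-per-line strings:

    import difflib

    gold = "x = 1 ;\ny = 2 ;"      # two ground-truth statements
    pred = "x = 1 ;\ny = 3 ;"      # the second statement was modified

    same = 0
    for dv in difflib.Differ().compare(gold.splitlines(), pred.splitlines()):
        # '-', '+', '?' mark removed, added, and hint lines;
        # anything else is a line common to both sides
        if dv[0] in ('-', '+', '?') or dv.strip() == '':
            continue
        same += 1
    print(same / 2.0)  # 0.5 -- one of two statements is unchanged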
Script/Exp_Script/Code-LLaMA/bleu.py ADDED
@@ -0,0 +1,134 @@
+ # Copyright 2017 Google Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ """Python implementation of BLEU and smooth-BLEU.
+
+ This module provides a Python implementation of BLEU and smooth-BLEU.
+ Smooth BLEU is computed following the method outlined in the paper:
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+ evaluation metrics for machine translation. COLING 2004.
+ """
+
+ import collections
+ import math
+
+
+ def _get_ngrams(segment, max_order):
+   """Extracts all n-grams up to a given maximum order from an input segment.
+
+   Args:
+     segment: text segment from which n-grams will be extracted.
+     max_order: maximum length in tokens of the n-grams returned by this
+         method.
+
+   Returns:
+     The Counter containing all n-grams up to max_order in segment
+     with a count of how many times each n-gram occurred.
+   """
+   ngram_counts = collections.Counter()
+   for order in range(1, max_order + 1):
+     for i in range(0, len(segment) - order + 1):
+       ngram = tuple(segment[i:i+order])
+       ngram_counts[ngram] += 1
+   return ngram_counts
+
+
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                  smooth=False):
+   """Computes BLEU score of translated segments against one or more references.
+
+   Args:
+     reference_corpus: list of lists of references for each translation. Each
+         reference should be tokenized into a list of tokens.
+     translation_corpus: list of translations to score. Each translation
+         should be tokenized into a list of tokens.
+     max_order: Maximum n-gram order to use when computing BLEU score.
+     smooth: Whether or not to apply Lin et al. 2004 smoothing.
+
+   Returns:
+     6-tuple with the BLEU score, n-gram precisions, brevity penalty,
+     length ratio, translation length, and reference length.
+   """
+   matches_by_order = [0] * max_order
+   possible_matches_by_order = [0] * max_order
+   reference_length = 0
+   translation_length = 0
+   for (references, translation) in zip(reference_corpus,
+                                        translation_corpus):
+     reference_length += min(len(r) for r in references)
+     translation_length += len(translation)
+
+     merged_ref_ngram_counts = collections.Counter()
+     for reference in references:
+       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
+     translation_ngram_counts = _get_ngrams(translation, max_order)
+     overlap = translation_ngram_counts & merged_ref_ngram_counts
+     for ngram in overlap:
+       matches_by_order[len(ngram)-1] += overlap[ngram]
+     for order in range(1, max_order+1):
+       possible_matches = len(translation) - order + 1
+       if possible_matches > 0:
+         possible_matches_by_order[order-1] += possible_matches
+
+   precisions = [0] * max_order
+   for i in range(0, max_order):
+     if smooth:
+       precisions[i] = ((matches_by_order[i] + 1.) /
+                        (possible_matches_by_order[i] + 1.))
+     else:
+       if possible_matches_by_order[i] > 0:
+         precisions[i] = (float(matches_by_order[i]) /
+                          possible_matches_by_order[i])
+       else:
+         precisions[i] = 0.0
+
+   if min(precisions) > 0:
+     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
+     geo_mean = math.exp(p_log_sum)
+   else:
+     geo_mean = 0
+
+   ratio = float(translation_length) / reference_length
+
+   if ratio > 1.0:
+     bp = 1.
+   else:
+     bp = math.exp(1 - 1. / ratio)
+
+   bleu = geo_mean * bp
+
+   return (bleu, precisions, bp, ratio, translation_length, reference_length)
+
+
+ def _bleu(ref_file, trans_file, subword_option=None):
+   max_order = 4
+   smooth = True
+   ref_files = [ref_file]
+   reference_text = []
+   for reference_filename in ref_files:
+     with open(reference_filename) as fh:
+       reference_text.append(fh.readlines())
+   per_segment_references = []
+   for references in zip(*reference_text):
+     reference_list = []
+     for reference in references:
+       reference_list.append(reference.strip().split())
+     per_segment_references.append(reference_list)
+   translations = []
+   with open(trans_file) as fh:
+     for line in fh:
+       translations.append(line.strip().split())
+   bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
+   return round(100 * bleu_score, 2)
Script/Exp_Script/Code-LLaMA/calculate_codellama_completion.py ADDED
@@ -0,0 +1,269 @@
+ import os
+ # from tree_sitter import Language, Parser
+ # # import pandas as pd
+ # import openpyxl
+ import json
+ import time
+ import csv
+ import pathlib
+ import difflib
+ import re
+ from bleu import _bleu
+ from fuzzywuzzy import fuzz
+ import random
+ import numpy as np
+ from transformers import RobertaTokenizer
+ # tokens = nltk.word_tokenize(sentence)
+ import argparse
+
+ parser = argparse.ArgumentParser(description='Test')
+ parser.add_argument("--task", default=None, type=str, required=True,
+                     help="Task Type: statement_level, next_statement")
+ args = parser.parse_args()
+
+ folder = str(pathlib.Path(__file__).parent.resolve())
+ isa_type_dir = folder + "/../../../Dataset"
+ src_dir = folder + f"/../../../Dataset/Code_Completion/{args.task}"
+ dst_dir = folder + "/Result"
+
+ train_lis = []
+ valid_lis = []
+ test_lis = []
+
+ target_clf = {}
+
+
+ def get_target_clf_list():
+     global target_clf
+     with open(isa_type_dir + "/comback_isa_type.csv", "r", encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for idx, l in enumerate(reader):
+             if l[1].lower() == "arc" or l[1].lower() == "riscv" or l[1].lower() == "nvptx":
+                 continue
+             if l[0] + " " + l[2] not in target_clf.keys():
+                 target_clf[l[0] + " " + l[2]] = [l[1]]
+             else:
+                 target_clf[l[0] + " " + l[2]] += [l[1]]
+
+
+ def Calculate_Completion():
+     get_target_clf_list()
+     print("############## Exp 2: Calculate Code-LLaMA Stmt Completion ################\n")
+
+     test_lis = ["nvptx", "arc", "riscv"]
+
+     codellama_gcc_code = {}
+     codellama_llvm_code = {}
+
+     if args.task == "next_statement":
+         dst_file = dst_dir + "/Output/chatgpt_next_output_cleaned.csv"
+     else:
+         dst_file = dst_dir + "/Output/chatgpt_stmt_output_cleaned.csv"
+
+     with open(dst_file, encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for idx, row in enumerate(reader):
+             if row[0] == "GCC":
+                 codellama_gcc_code[row[1] + " " + str(row[2])] = row[3]
+             else:
+                 codellama_llvm_code[row[1] + " " + str(row[2])] = row[3]
+     avg_accuracy = {}
+     for comp_type in ["GCC", "LLVM"]:
+         for isa_type in ["GPU", "MPU", "CPU"]:
+             test_target_dic = {}
+             cnt_idx = 0
+             if comp_type == "GCC":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/riscv.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["riscv" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_gcc_code.keys():
+                             chat_code = codellama_gcc_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "riscv", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "riscv", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "riscv"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/nvptx.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["nvptx" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_gcc_code.keys():
+                             chat_code = codellama_gcc_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "nvptx", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "nvptx", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "nvptx"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "MPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/arc.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["arc" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_gcc_code.keys():
+                             chat_code = codellama_gcc_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "arc", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "arc", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "arc"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+
+             if comp_type == "LLVM":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/RISCV.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["RISCV" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_llvm_code.keys():
+                             chat_code = codellama_llvm_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "RISCV", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "RISCV", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "RISCV"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/NVPTX.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["NVPTX" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_llvm_code.keys():
+                             chat_code = codellama_llvm_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "NVPTX", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "NVPTX", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "NVPTX"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+                 if isa_type == "MPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/LLVM/ARC.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["ARC" + " " + str(cnt_idx)] = " ".join(dic["ground_truth"])
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         src_code = test_target_dic[k]
+                         if k in codellama_llvm_code.keys():
+                             chat_code = codellama_llvm_code[k]
+                             if chat_code.replace(" ", "") == src_code.replace(" ", ""):
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code.replace(" ", ""), src_code.replace(" ", ""))
+                             total_ED += edit_dis
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "ARC", k.split(" ")[1], str(round(EM*100, 2)), str(round(float(edit_dis), 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "ARC", "average", str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))])
+                     avg_accuracy[comp_type + " " + "ARC"] = [str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2))]
+
+     return avg_accuracy
+
+
+ if __name__ == "__main__":
+     with open(dst_dir + '/result.csv', 'w', newline='') as file:
+         writer = csv.writer(file)
+         writer.writerow(["Compiler Type", "Target", "Idx", "Exact Match", "Edit Distance"])
+
+     avg_dic = Calculate_Completion()
+
+     for k in avg_dic:
+         print("########################")
+         print(k)
+         print(" ".join(["Exact Match", "Edit Distance"]))
+         print(" ".join(avg_dic[k]))
Script/Exp_Script/Code-LLaMA/calculate_codellama_gen.py ADDED
@@ -0,0 +1,382 @@
+ import os
+ # from tree_sitter import Language, Parser
+ # # import pandas as pd
+ # import openpyxl
+ import json
+ import time
+ import csv
+ import pathlib
+ import difflib
+ import re
+ from bleu import _bleu
+ from fuzzywuzzy import fuzz
+ import random
+ import numpy as np
+ from transformers import RobertaTokenizer
+ # tokens = nltk.word_tokenize(sentence)
+
+ folder = str(pathlib.Path(__file__).parent.resolve())
+ isa_type_dir = folder + "/../../../Dataset"
+ src_dir = folder + "/../../../Dataset/Code_Generation"
+ dst_dir = folder + "/Result"
+
+ train_lis = []
+ valid_lis = []
+ test_lis = []
+
+ target_clf = {}
+
+
+ def get_target_clf_list():
+     global target_clf
+     with open(isa_type_dir + "/comback_isa_type.csv", "r", encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for idx, l in enumerate(reader):
+             if l[1].lower() == "arc" or l[1].lower() == "riscv" or l[1].lower() == "nvptx":
+                 continue
+             if l[0] + " " + l[2] not in target_clf.keys():
+                 target_clf[l[0] + " " + l[2]] = [l[1]]
+             else:
+                 target_clf[l[0] + " " + l[2]] += [l[1]]
+
+
+ def Calculate_Statements_Ratio(Src_List, Fork_Lis, src_name, fork_name):
+     src_code = ""
+     Fork_code = ""
+     idx = 0
+     cnt_stmt = 0.0
+     while idx < len(Src_List):
+         src_code += Src_List[idx].replace(src_name, "").replace(src_name.upper(), "")
+         if Src_List[idx] in [";", ":", "{", "}"]:
+             src_code += "\n"
+             cnt_stmt += 1
+         idx += 1
+     idx = 0  # reset the cursor so the fork list is scanned from its start
+     while idx < len(Fork_Lis):
+         Fork_code += Fork_Lis[idx].replace(fork_name, "").replace(fork_name.upper(), "")
+         if Fork_Lis[idx] in [";", ":", "{", "}"]:
+             Fork_code += "\n"
+         idx += 1
+
+     code_same = 0
+     code_modi = 0
+     code_add = 0
+     diff_code = list(difflib.Differ().compare(src_code.splitlines(), Fork_code.splitlines()))
+     for idx, dv in enumerate(diff_code):
+         if dv[0] == '-':
+             if idx < len(diff_code) - 1 and diff_code[idx+1][0] == '?':
+                 code_modi += 1
+             else:
+                 code_add += 1
+         elif dv[0] == '+':
+             continue
+         elif dv[0] == '?':
+             continue
+             # vega_add -= 1
+         elif dv.strip().replace("\n", "") == '':
+             continue
+         else:
+             code_same += 1
+     return round(float(code_same) / cnt_stmt, 2)
+
+
+ def Calculate_Gen():
+     get_target_clf_list()
+     print("############## Exp 2: Calculate Code-LLaMA Gen ################\n")
+
+     test_lis = ["nvptx", "arc", "riscv"]
+
+     avg_accuracy = {}
+     codellama_gcc_code = {}
+     codellama_llvm_code = {}
+
+     with open(dst_dir + "/codellama_gen_output.jsonl", encoding="utf-8") as f:
+         for idx, line in enumerate(f):
+             js = json.loads(line)
+             if js["Compiler_Type"] == "GCC":
+                 codellama_gcc_code[str(js["Target"]) + " " + js["idx"]] = js["Code"]
+             else:
+                 codellama_llvm_code[str(js["Target"]) + " " + js["idx"]] = js["Code"]
+
+     for comp_type in ["GCC", "LLVM"]:
+         for isa_type in ["GPU", "MPU", "CPU"]:
+             target_lis = target_clf[comp_type + " " + isa_type]
+             test_target_dic = {}
+             cnt_idx = 0
+             if comp_type == "GCC":
+                 if isa_type == "CPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/riscv.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["riscv" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("riscv", "")
+                         if k in codellama_gcc_code.keys():
+                             chat_code = " ".join(codellama_gcc_code[k]).replace("riscv", "").replace("RISCV", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], codellama_gcc_code[k], "riscv", "riscv")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "riscv", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "riscv", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "riscv"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
+                 if isa_type == "GPU":
+                     cnt_idx = 0
+                     for line in open(src_dir + "/GCC/nvptx.jsonl", 'r'):
+                         dic = json.loads(line)
+                         test_target_dic["nvptx" + " " + str(cnt_idx)] = dic["ground_truth"]
+                         cnt_idx += 1
+                     total_EM = 0.0
+                     total_ED = 0.0
+                     total_PoVS = 0.0
+                     total_BLEU4 = 0.0
+                     for k in test_target_dic.keys():
+                         edit_dis = 0.0
+                         EM = 0.0
+                         bleu4 = 0.0
+                         stmt_mod = 0.0
+                         src_code = " ".join(test_target_dic[k]).replace("nvptx", "")
+                         if k in codellama_gcc_code.keys():
+                             chat_code = " ".join(codellama_gcc_code[k]).replace("nvptx", "").replace("NVPTX", "")
+                             stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], codellama_gcc_code[k], "nvptx", "nvptx")
+                             with open(dst_dir + "/test.output", 'w') as f, open(dst_dir + "/test.gold", 'w') as f1:
+                                 f.write(chat_code + '\n')
+                                 f1.write(src_code + '\n')
+                             if chat_code == src_code:
+                                 EM = 1
+                             edit_dis = fuzz.ratio(chat_code, src_code)
+                             if chat_code.strip() == "":
+                                 bleu4 = 0
+                             else:
+                                 bleu4 = _bleu(dst_dir + "/test.gold", dst_dir + "/test.output")
+                             total_BLEU4 += bleu4
+                             total_ED += edit_dis
+                             total_PoVS += stmt_mod
+                             total_EM += EM
+                             with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                                 writer = csv.writer(file)
+                                 writer.writerow([comp_type, "nvptx", k.split(" ")[1], str(round(float(bleu4), 2)), str(round(EM*100, 2)), str(round(float(edit_dis), 2)), str(round(float(stmt_mod)*100, 2))])
+                         else:
+                             print(k)
+                     with open(dst_dir + '/result.csv', 'a', newline='') as file:
+                         writer = csv.writer(file)
+                         writer.writerow([comp_type, "nvptx", "average", str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))])
+                     avg_accuracy[comp_type + " " + "nvptx"] = [str(round(float(total_BLEU4 / cnt_idx), 2)), str(round((total_EM / cnt_idx)*100, 2)), str(round(float(total_ED / cnt_idx), 2)), str(round(float(total_PoVS / cnt_idx)*100, 2))]
190
+
191
+ if isa_type == "MPU":
192
+ cnt_idx = 0
193
+ for line in open(src_dir + "/GCC/arc.jsonl", 'r'):
194
+ dic = json.loads(line)
195
+ test_target_dic["arc" + " " + str(cnt_idx)] = dic["ground_truth"]
196
+ cnt_idx += 1
197
+ total_EM = 0.0
198
+ total_ED = 0.0
199
+ total_PoVS = 0.0
200
+ total_BLEU4 = 0.0
201
+ for k in test_target_dic.keys():
202
+ edit_dis = 0.0
203
+ EM = 0.0
204
+ bleu4 = 0.0
205
+ stmt_mod = 0.0
206
+ src_code = " ".join(test_target_dic[k]).replace("arc", "")
207
+ if k in codellama_gcc_code.keys():
208
+ chat_code = " ".join(codellama_gcc_code[k]).replace("arc", "").replace("ARC", "")
209
+ stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], codellama_gcc_code[k], "arc", "arc")
210
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
211
+ f.write(chat_code+'\n')
212
+ f1.write(src_code+'\n')
213
+ if chat_code==src_code:
214
+ EM = 1
215
+ edit_dis = fuzz.ratio(chat_code, src_code)
216
+ if chat_code.strip() == "":
217
+ bleu4 = 0
218
+ else:
219
+ bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
220
+ total_BLEU4 += bleu4
221
+ total_ED += edit_dis
222
+ total_PoVS += stmt_mod
223
+ total_EM += EM
224
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
225
+ writer = csv.writer(file)
226
+ writer.writerow([comp_type, "arc", k.split(" ")[1], str(round(float(bleu4),2)), str(round(EM*100,2)), str(round(float(edit_dis),2)), str(round(float(stmt_mod)*100,2))])
227
+ else:
228
+ print(k)
229
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
230
+ writer = csv.writer(file)
231
+ writer.writerow([comp_type, "arc", "average", str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))])
232
+ avg_accuracy[comp_type + " " + "arc"] = [str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))]
233
+
234
+ if comp_type == "LLVM":
235
+ if isa_type == "CPU":
236
+ cnt_idx = 0
237
+ for line in open(src_dir + "/LLVM/RISCV.jsonl", 'r'):
238
+ dic = json.loads(line)
239
+ test_target_dic["RISCV" + " " + str(cnt_idx)] = dic["ground_truth"]
240
+ cnt_idx += 1
241
+ total_EM = 0.0
242
+ total_ED = 0.0
243
+ total_PoVS = 0.0
244
+ total_BLEU4 = 0.0
245
+ for k in test_target_dic.keys():
246
+ edit_dis = 0.0
247
+ EM = 0.0
248
+ bleu4 = 0.0
249
+ stmt_mod = 0.0
250
+ src_code = " ".join(test_target_dic[k]).replace("RISCV", "")
251
+ if k in codellama_llvm_code.keys():
252
+ chat_code = " ".join(codellama_llvm_code[k]).replace("riscv", "").replace("RISCV", "")
253
+ stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], codellama_llvm_code[k], "riscv", "riscv")
254
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
255
+ f.write(chat_code+'\n')
256
+ f1.write(src_code+'\n')
257
+ if chat_code==src_code:
258
+ EM = 1
259
+ edit_dis = fuzz.ratio(chat_code, src_code)
260
+ if chat_code.strip() == "":
261
+ bleu4 = 0
262
+ else:
263
+ bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
264
+ total_BLEU4 += bleu4
265
+ total_ED += edit_dis
266
+ total_PoVS += stmt_mod
267
+ total_EM += EM
268
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
269
+ writer = csv.writer(file)
270
+ writer.writerow([comp_type, "RISCV", k.split(" ")[1], str(round(float(bleu4),2)), str(round(EM*100,2)), str(round(float(edit_dis),2)), str(round(float(stmt_mod)*100,2))])
271
+ else:
272
+ print(k)
273
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
274
+ writer = csv.writer(file)
275
+ writer.writerow([comp_type, "RISCV", "average", str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))])
276
+ avg_accuracy[comp_type + " " + "RISCV"] = [str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))]
277
+ if isa_type == "GPU":
278
+ cnt_idx = 0
279
+ for line in open(src_dir + "/LLVM/NVPTX.jsonl", 'r'):
280
+ dic = json.loads(line)
281
+ test_target_dic["NVPTX" + " " + str(cnt_idx)] = dic["ground_truth"]
282
+ cnt_idx += 1
283
+
284
+ total_EM = 0.0
285
+ total_ED = 0.0
286
+ total_PoVS = 0.0
287
+ total_BLEU4 = 0.0
288
+ for k in test_target_dic.keys():
289
+ edit_dis = 0.0
290
+ EM = 0.0
291
+ bleu4 = 0.0
292
+ stmt_mod = 0.0
293
+ src_code = " ".join(test_target_dic[k]).replace("NVPTX", "")
294
+ if k in codellama_llvm_code.keys():
295
+ chat_code = " ".join(codellama_llvm_code[k]).replace("nvptx", "").replace("NVPTX", "")
296
+ stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], codellama_llvm_code[k], "nvptx", "nvptx")
297
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
298
+ f.write(chat_code+'\n')
299
+ f1.write(src_code+'\n')
300
+ if chat_code==src_code:
301
+ EM = 1
302
+ edit_dis = fuzz.ratio(chat_code, src_code)
303
+ if chat_code.strip() == "":
304
+ bleu4 = 0
305
+ else:
306
+ bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
307
+ total_BLEU4 += bleu4
308
+ total_ED += edit_dis
309
+ total_PoVS += stmt_mod
310
+ total_EM += EM
311
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
312
+ writer = csv.writer(file)
313
+ writer.writerow([comp_type, "NVPTX", k.split(" ")[1], str(round(float(bleu4),2)), str(round(EM*100,2)), str(round(float(edit_dis),2)), str(round(float(stmt_mod)*100,2))])
314
+ else:
315
+ print(k)
316
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
317
+ writer = csv.writer(file)
318
+ writer.writerow([comp_type, "NVPTX", "average", str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))])
319
+ avg_accuracy[comp_type + " " + "NVPTX"] = [str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))]
320
+
321
+ if isa_type == "MPU":
322
+ cnt_idx = 0
323
+ for line in open(src_dir + "/LLVM/ARC.jsonl", 'r'):
324
+ dic = json.loads(line)
325
+ test_target_dic["ARC" + " " + str(cnt_idx)] = dic["ground_truth"]
326
+ cnt_idx += 1
327
+ total_EM = 0.0
328
+ total_ED = 0.0
329
+ total_PoVS = 0.0
330
+ total_BLEU4 = 0.0
331
+ for k in test_target_dic.keys():
332
+ edit_dis = 0.0
333
+ EM = 0.0
334
+ bleu4 = 0.0
335
+ stmt_mod = 0.0
336
+ src_code = " ".join(test_target_dic[k]).replace("ARC", "")
337
+ if k in codellama_llvm_code.keys():
338
+ chat_code = " ".join(codellama_llvm_code[k]).replace("arc", "").replace("ARC", "")
339
+ stmt_mod = Calculate_Statements_Ratio(test_target_dic[k], codellama_llvm_code[k], "arc", "arc")
340
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
341
+ f.write(chat_code+'\n')
342
+ f1.write(src_code+'\n')
343
+ if chat_code==src_code:
344
+ EM = 1
345
+ edit_dis = fuzz.ratio(chat_code, src_code)
346
+ if chat_code.strip() == "":
347
+ bleu4 = 0
348
+ else:
349
+ bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
350
+ total_BLEU4 += bleu4
351
+ total_ED += edit_dis
352
+ total_PoVS += stmt_mod
353
+ total_EM += EM
354
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
355
+ writer = csv.writer(file)
356
+ writer.writerow([comp_type, "ARC", k.split(" ")[1], str(round(float(bleu4),2)), str(round(EM*100,2)), str(round(float(edit_dis),2)), str(round(float(stmt_mod)*100,2))])
357
+ else:
358
+ print(k)
359
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
360
+ writer = csv.writer(file)
361
+ writer.writerow([comp_type, "ARC", "average", str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))])
362
+ avg_accuracy[comp_type + " " + "ARC"] = [str(round(float(total_BLEU4 / cnt_idx),2)), str(round((total_EM / cnt_idx)*100,2)), str(round(float(total_ED / cnt_idx),2)), str(round(float(total_PoVS / cnt_idx)*100,2))]
363
+ return avg_accuracy
364
+
365
+
366
+
367
+
368
+
369
+ if __name__ == "__main__":
370
+ with open(dst_dir + '/result.csv', 'w', newline='') as file:
371
+ writer = csv.writer(file)
372
+ writer.writerow(["Compiler Type", "Target", "Idx", "BLEU4", "Exact Match", "Edit Didtance", "Stmt_Ratio"])
373
+
374
+ avg_dic = Calculate_Gen()
375
+
376
+ for k in avg_dic:
377
+ print("########################")
378
+
379
+ print(k)
380
+ print(" ".join(["BLEU4", "Exact Match", "Edit Didtance", "Stmt_Ratio"]))
381
+ print(" ".join(avg_dic[k]))
382
+
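Note: every compiler/ISA branch in the script above repeats the same per-sample scoring. A minimal sketch of that shared logic, factored into one hypothetical helper (score_pair is illustrative and not part of the script; _bleu, fuzz and the two temp-file paths come from the code above):

def score_pair(gold_tokens, pred_tokens, target_name, gold_path="test.gold", pred_path="test.output"):
    # Join tokens and strip the target name before comparing, as the script does.
    gold = " ".join(gold_tokens).replace(target_name, "").replace(target_name.upper(), "")
    pred = " ".join(pred_tokens).replace(target_name, "").replace(target_name.upper(), "")
    # _bleu compares files, so write one-line gold/prediction files first.
    with open(pred_path, "w") as f, open(gold_path, "w") as f1:
        f.write(pred + "\n")
        f1.write(gold + "\n")
    em = 1 if pred == gold else 0                      # exact match
    edit_sim = fuzz.ratio(pred, gold)                  # fuzzywuzzy similarity, 0-100
    bleu4 = 0 if pred.strip() == "" else _bleu(gold_path, pred_path)
    return em, edit_sim, bleu4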
Script/Exp_Script/ForkFlow/bleu.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
+ def _get_ngrams(segment, max_order):
29
+ """Extracts all n-grams upto a given maximum order from an input segment.
30
+
31
+ Args:
32
+ segment: text segment from which n-grams will be extracted.
33
+ max_order: maximum length in tokens of the n-grams returned by this
34
+ method.
35
+
36
+ Returns:
37
+ The Counter containing all n-grams up to max_order in segment
38
+ with a count of how many times each n-gram occurred.
39
+ """
40
+ ngram_counts = collections.Counter()
41
+ for order in range(1, max_order + 1):
42
+ for i in range(0, len(segment) - order + 1):
43
+ ngram = tuple(segment[i:i+order])
44
+ ngram_counts[ngram] += 1
45
+ return ngram_counts
46
+
47
+
48
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
+ smooth=False):
50
+ """Computes BLEU score of translated segments against one or more references.
51
+
52
+ Args:
53
+ reference_corpus: list of lists of references for each translation. Each
54
+ reference should be tokenized into a list of tokens.
55
+ translation_corpus: list of translations to score. Each translation
56
+ should be tokenized into a list of tokens.
57
+ max_order: Maximum n-gram order to use when computing BLEU score.
58
+ smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
+
60
+ Returns:
61
+ 6-tuple with the BLEU score, the list of n-gram precisions, the brevity
62
+ penalty, the length ratio, the translation length and the reference length.
63
+ """
64
+ matches_by_order = [0] * max_order
65
+ possible_matches_by_order = [0] * max_order
66
+ reference_length = 0
67
+ translation_length = 0
68
+ for (references, translation) in zip(reference_corpus,
69
+ translation_corpus):
70
+ reference_length += min(len(r) for r in references)
71
+ translation_length += len(translation)
72
+
73
+ merged_ref_ngram_counts = collections.Counter()
74
+ for reference in references:
75
+ merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
+ translation_ngram_counts = _get_ngrams(translation, max_order)
77
+ overlap = translation_ngram_counts & merged_ref_ngram_counts
78
+ for ngram in overlap:
79
+ matches_by_order[len(ngram)-1] += overlap[ngram]
80
+ for order in range(1, max_order+1):
81
+ possible_matches = len(translation) - order + 1
82
+ if possible_matches > 0:
83
+ possible_matches_by_order[order-1] += possible_matches
84
+
85
+ precisions = [0] * max_order
86
+ for i in range(0, max_order):
87
+ if smooth:
88
+ precisions[i] = ((matches_by_order[i] + 1.) /
89
+ (possible_matches_by_order[i] + 1.))
90
+ else:
91
+ if possible_matches_by_order[i] > 0:
92
+ precisions[i] = (float(matches_by_order[i]) /
93
+ possible_matches_by_order[i])
94
+ else:
95
+ precisions[i] = 0.0
96
+
97
+ if min(precisions) > 0:
98
+ p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
+ geo_mean = math.exp(p_log_sum)
100
+ else:
101
+ geo_mean = 0
102
+
103
+ ratio = float(translation_length) / reference_length
104
+
105
+ if ratio > 1.0:
106
+ bp = 1.
107
+ else:
108
+ bp = math.exp(1 - 1. / ratio)
109
+
110
+ bleu = geo_mean * bp
111
+
112
+ return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
+ def _bleu(ref_file, trans_file, subword_option=None):
116
+ max_order = 4
117
+ smooth = True
118
+ ref_files = [ref_file]
119
+ reference_text = []
120
+ for reference_filename in ref_files:
121
+ with open(reference_filename) as fh:
122
+ reference_text.append(fh.readlines())
123
+ per_segment_references = []
124
+ for references in zip(*reference_text):
125
+ reference_list = []
126
+ for reference in references:
127
+ reference_list.append(reference.strip().split())
128
+ per_segment_references.append(reference_list)
129
+ translations = []
130
+ with open(trans_file) as fh:
131
+ for line in fh:
132
+ translations.append(line.strip().split())
133
+ bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
+ return round(100 * bleu_score,2)
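For reference, _bleu takes two file paths (one whitespace-tokenized sentence per line), scores the second file against the first with smoothed BLEU-4, and returns the result scaled to 0-100. A minimal usage sketch (file names and tokens are illustrative only):

from bleu import _bleu

with open("test.gold", "w") as ref, open("test.output", "w") as hyp:
    ref.write("return cost . memmov_cost ;\n")
    hyp.write("return cost . memmov_cost ;\n")

print(_bleu("test.gold", "test.output"))  # identical lines score 100.0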
Script/Exp_Script/ForkFlow/calculate_forkflow.py ADDED
@@ -0,0 +1,407 @@
1
+ import os
2
+ # from tree_sitter import Language, Parser
3
+ # # import pandas as pd
4
+ # import openpyxl
5
+ import json
6
+ import time
7
+ import csv
8
+ import pathlib
9
+ import difflib
10
+ import re
11
+ from bleu import _bleu
12
+ from fuzzywuzzy import fuzz
13
+ import random
14
+ import numpy as np
15
+ from transformers import RobertaTokenizer
16
+ #tokens = nltk.word_tokenize(sentence)
17
+
18
+ folder = str(pathlib.Path(__file__).parent.resolve())
19
+ isa_type_dir = folder+"/../../../Dataset"
20
+ src_dir = folder+"/../../../Dataset/Code_Generation"
21
+ dst_dir = folder+"/Result"
22
+
23
+ train_lis = []
24
+ valid_lis = []
25
+ test_lis = []
26
+
27
+ target_clf = {}
28
+ def get_target_clf_list():
29
+ global target_clf
30
+ with open(isa_type_dir+"/comback_isa_type.csv","r",encoding="utf-8") as f:
31
+ reader = csv.reader(f)
32
+ for idx, l in enumerate(reader):
33
+ if l[1].lower() == "arc" or l[1].lower() == "riscv" or l[1].lower() == "nvptx":
34
+ continue
35
+ if l[0] + " " + l[2] not in target_clf.keys():
36
+ target_clf[l[0] + " " + l[2]] = [l[1]]
37
+ else:
38
+ target_clf[l[0] + " " + l[2]] += [l[1]]
39
+
40
+
41
+ def Calculate_Statements_Ratio(Src_List, Fork_Lis, src_name, fork_name):
42
+ src_code = ""
43
+ Fork_code = ""
44
+ idx = 0
45
+ cnt_stmt = 0.0
46
+ while idx < len(Src_List):
47
+ src_code += Src_List[idx].replace(src_name, "")
48
+ if Src_List[idx] in [";", ":", "{", "}"]:
49
+ src_code += "\n"
50
+ cnt_stmt += 1
51
+ idx += 1
52
+ idx = 0  # reset: idx reached len(Src_List) in the loop above
+ while idx < len(Fork_Lis):
53
+ Fork_code += Fork_Lis[idx].replace(fork_name, "")
54
+ if Fork_Lis[idx] in [";", ":", "{", "}"]:
55
+ Fork_code += "\n"
56
+ idx += 1
57
+
58
+ code_same = 0
59
+ code_modi = 0
60
+ code_add = 0
61
+ diff_code = list(difflib.Differ().compare(src_code.splitlines(), Fork_code.splitlines()))
62
+ for idx, dv in enumerate(diff_code):
63
+ if dv[0] == '-':
64
+ if idx < len(diff_code) - 1 and diff_code[idx+1][0] == '?':
65
+ code_modi += 1
66
+ else:
67
+ code_add += 1
68
+ elif dv[0] == '+':
69
+ continue
70
+ elif dv[0] == '?':
71
+ continue
72
+ #vega_add -= 1
73
+ elif dv.strip().replace("\n", "") == '':
74
+ continue
75
+ else:
76
+ code_same += 1
77
+ return round(float(code_same) / cnt_stmt, 2)
78
+
79
+
80
+
81
+ def Calculate_Forkflow():
82
+ get_target_clf_list()
83
+ print("############## Exp 1: Calculate Fork-Flow ################\n")
84
+
85
+ test_lis = ["nvptx","arc","riscv"]
86
+ for comp_type in ["GCC", "LLVM"]:
87
+ for isa_type in ["GPU", "MPU", "CPU"]:
88
+ max_ed = 0
89
+ avg_ed = 0
90
+ max_bleu4 = 0
91
+ avg_bleu4 = 0
92
+ avg_cnt = 0
93
+ target_lis = target_clf[comp_type + " " + isa_type]
94
+ test_target_dic = {}
95
+ cnt_idx = 0
96
+ if comp_type == "GCC":
97
+ if isa_type == "CPU":
98
+ for line in open(src_dir + "/GCC/riscv.jsonl", 'r'):
99
+ dic = json.loads(line)
100
+ test_target_dic[str(cnt_idx) + " " + dic["Func"].replace("riscv", "")] = dic["ground_truth"]
101
+ cnt_idx += 1
102
+
103
+ for tar in target_lis:
104
+ edit_dis = 0.0
105
+ EM = []
106
+ bleu4 = 0.0
107
+ stmt_mod = 0.0
108
+ cnt = 0
109
+ fork_target_dic = {}
110
+ for line in open(src_dir + "/" + comp_type + "/" + tar + ".jsonl", 'r'):
111
+ dic = json.loads(line)
112
+ fork_target_dic[dic["Func"].replace(tar, "")] = dic["ground_truth"]
113
+
114
+ for k in test_target_dic.keys():
115
+ func = k.split(" ")[1]
116
+ src_code = " ".join(test_target_dic[k]).replace("riscv", "")
117
+ if func in fork_target_dic.keys():
118
+ fork_code = " ".join(fork_target_dic[func]).replace(tar, "")
119
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], fork_target_dic[func], "riscv", tar)
120
+ else:
121
+ fork_code = ""
122
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], [], "riscv", tar)
123
+
124
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
125
+ f.write(fork_code+'\n')
126
+ f1.write(src_code+'\n')
127
+ EM.append(fork_code==src_code)
128
+ edit_dis += fuzz.ratio(fork_code, src_code)
129
+ avg_ed += fuzz.ratio(fork_code, src_code)
130
+ cnt += 1
131
+ avg_cnt += 1
132
+ if fork_code.strip() == "":
133
+ bleu4 += 0
134
+ else:
135
+ tmp_bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
136
+ bleu4 += tmp_bleu4
137
+ avg_bleu4 += tmp_bleu4
138
+
139
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
140
+ writer = csv.writer(file)
141
+ writer.writerow([comp_type, isa_type, tar, str(round(float(bleu4)/cnt,2)), str(round(np.mean(EM)*100,2)), str(round(float(edit_dis)/cnt,2)), str(round(float(stmt_mod)*100/cnt,2))])
142
+ if round(float(bleu4)/cnt,2) > max_bleu4:
143
+ max_bleu4 = round(float(bleu4)/cnt,2)
144
+ if round(float(edit_dis)/cnt,2) > max_ed:
145
+ max_ed = round(float(edit_dis)/cnt,2)
146
+ if isa_type == "GPU":
147
+ for line in open(src_dir + "/GCC/nvptx.jsonl", 'r'):
148
+ dic = json.loads(line)
149
+ test_target_dic[str(cnt_idx) + " " + dic["Func"].replace("nvptx", "")] = dic["ground_truth"]
150
+ cnt_idx += 1
151
+
152
+ for tar in target_lis:
153
+ edit_dis = 0.0
154
+ EM = []
155
+ bleu4 = 0.0
156
+ stmt_mod = 0.0
157
+ cnt = 0
158
+ fork_target_dic = {}
159
+ for line in open(src_dir + "/" + comp_type + "/" + tar + ".jsonl", 'r'):
160
+ dic = json.loads(line)
161
+ fork_target_dic[dic["Func"].replace(tar, "")] = dic["ground_truth"]
162
+
163
+ for k in test_target_dic.keys():
164
+ func = k.split(" ")[1]
165
+ src_code = " ".join(test_target_dic[k]).replace("nvptx", "")
166
+ if func in fork_target_dic.keys():
167
+ fork_code = " ".join(fork_target_dic[func]).replace(tar, "")
168
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], fork_target_dic[func], "nvptx", tar)
169
+ else:
170
+ fork_code = ""
171
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], [], "nvptx", tar)
172
+
173
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
174
+ f.write(fork_code+'\n')
175
+ f1.write(src_code+'\n')
176
+ EM.append(fork_code==src_code)
177
+ edit_dis += fuzz.ratio(fork_code, src_code)
178
+ avg_ed += fuzz.ratio(fork_code, src_code)
179
+ cnt += 1
180
+ avg_cnt += 1
181
+ if fork_code.strip() == "":
182
+ bleu4 += 0
183
+ else:
184
+ tmp_bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
185
+ bleu4 += tmp_bleu4
186
+ avg_bleu4 += tmp_bleu4
187
+
188
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
189
+ writer = csv.writer(file)
190
+ writer.writerow([comp_type, isa_type, tar, str(round(float(bleu4)/cnt,2)), str(round(np.mean(EM)*100,2)), str(round(float(edit_dis)/cnt,2)), str(round(float(stmt_mod)*100/cnt,2))])
191
+ if round(float(bleu4)/cnt,2) > max_bleu4:
192
+ max_bleu4 = round(float(bleu4)/cnt,2)
193
+ if round(float(edit_dis)/cnt,2) > max_ed:
194
+ max_ed = round(float(edit_dis)/cnt,2)
195
+ if isa_type == "MPU":
196
+ for line in open(src_dir + "/GCC/arc.jsonl", 'r'):
197
+ dic = json.loads(line)
198
+ test_target_dic[str(cnt_idx) + " " + dic["Func"].replace("arc", "")] = dic["ground_truth"]
199
+ cnt_idx += 1
200
+
201
+ for tar in target_lis:
202
+ edit_dis = 0.0
203
+ EM = []
204
+ bleu4 = 0.0
205
+ stmt_mod = 0.0
206
+ cnt = 0
207
+ fork_target_dic = {}
208
+ for line in open(src_dir + "/" + comp_type + "/" + tar + ".jsonl", 'r'):
209
+ dic = json.loads(line)
210
+ fork_target_dic[dic["Func"].replace(tar, "")] = dic["ground_truth"]
211
+
212
+ for k in test_target_dic.keys():
213
+ func = k.split(" ")[1]
214
+ src_code = " ".join(test_target_dic[k]).replace("arc", "")
215
+ if func in fork_target_dic.keys():
216
+ fork_code = " ".join(fork_target_dic[func]).replace(tar, "")
217
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], fork_target_dic[func], "arc", tar)
218
+ else:
219
+ fork_code = ""
220
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], [], "arc", tar)
221
+
222
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
223
+ f.write(fork_code+'\n')
224
+ f1.write(src_code+'\n')
225
+ EM.append(fork_code==src_code)
226
+ edit_dis += fuzz.ratio(fork_code, src_code)
227
+ avg_ed += fuzz.ratio(fork_code, src_code)
228
+ cnt += 1
229
+ avg_cnt += 1
230
+ if fork_code.strip() == "":
231
+ bleu4 += 0
232
+ else:
233
+ tmp_bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
234
+ bleu4 += tmp_bleu4
235
+ avg_bleu4 += tmp_bleu4
236
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
237
+ writer = csv.writer(file)
238
+ writer.writerow([comp_type, isa_type, tar, str(round(float(bleu4)/cnt,2)), str(round(np.mean(EM)*100,2)), str(round(float(edit_dis)/cnt,2)), str(round(float(stmt_mod)*100/cnt,2))])
239
+ if round(float(bleu4)/cnt,2) > max_bleu4:
240
+ max_bleu4 = round(float(bleu4)/cnt,2)
241
+ if round(float(edit_dis)/cnt,2) > max_ed:
242
+ max_ed = round(float(edit_dis)/cnt,2)
243
+ if comp_type == "LLVM":
244
+ if isa_type == "CPU":
245
+ for line in open(src_dir + "/LLVM/RISCV.jsonl", 'r'):
246
+ dic = json.loads(line)
247
+ test_target_dic[str(cnt_idx) + " " + dic["Func"].replace("RISCV", "")] = dic["ground_truth"]
248
+ cnt_idx += 1
249
+
250
+ for tar in target_lis:
251
+ if tar == "RI5CY":
252
+ continue
253
+ edit_dis = 0.0
254
+ EM = []
255
+ bleu4 = 0.0
256
+ stmt_mod = 0.0
257
+ cnt = 0
258
+ fork_target_dic = {}
259
+ for line in open(src_dir + "/" + comp_type + "/" + tar + ".jsonl", 'r'):
260
+ dic = json.loads(line)
261
+ fork_target_dic[dic["Func"].replace(tar, "")] = dic["ground_truth"]
262
+
263
+ for k in test_target_dic.keys():
264
+ func = k.split(" ")[1]
265
+ src_code = " ".join(test_target_dic[k]).replace("RISCV", "")
266
+ if func in fork_target_dic.keys():
267
+ fork_code = " ".join(fork_target_dic[func]).replace(tar, "")
268
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], fork_target_dic[func], "RISCV", tar)
269
+ else:
270
+ fork_code = ""
271
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], [], "RISCV", tar)
272
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
273
+ f.write(fork_code+'\n')
274
+ f1.write(src_code+'\n')
275
+ EM.append(fork_code==src_code)
276
+ edit_dis += fuzz.ratio(fork_code, src_code)
277
+ avg_ed += fuzz.ratio(fork_code, src_code)
278
+ cnt += 1
279
+ avg_cnt += 1
280
+ if fork_code.strip() == "":
281
+ bleu4 += 0
282
+ else:
283
+ tmp_bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
284
+ bleu4 += tmp_bleu4
285
+ avg_bleu4 += tmp_bleu4
286
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
287
+ writer = csv.writer(file)
288
+ writer.writerow([comp_type, isa_type, tar, str(round(float(bleu4)/cnt,2)), str(round(np.mean(EM)*100,2)), str(round(float(edit_dis)/cnt,2)), str(round(float(stmt_mod)*100/cnt,2))])
289
+ if round(float(bleu4)/cnt,2) > max_bleu4:
290
+ max_bleu4 = round(float(bleu4)/cnt,2)
291
+ if round(float(edit_dis)/cnt,2) > max_ed:
292
+ max_ed = round(float(edit_dis)/cnt,2)
293
+ if isa_type == "GPU":
294
+ for line in open(src_dir + "/LLVM/NVPTX.jsonl", 'r'):
295
+ dic = json.loads(line)
296
+ test_target_dic[str(cnt_idx) + " " + dic["Func"].replace("NVPTX", "")] = dic["ground_truth"]
297
+ cnt_idx += 1
298
+
299
+ for tar in target_lis:
300
+ edit_dis = 0.0
301
+ EM = []
302
+ bleu4 = 0.0
303
+ stmt_mod = 0.0
304
+ cnt = 0
305
+ fork_target_dic = {}
306
+ for line in open(src_dir + "/" + comp_type + "/" + tar + ".jsonl", 'r'):
307
+ dic = json.loads(line)
308
+ fork_target_dic[dic["Func"].replace(tar, "")] = dic["ground_truth"]
309
+
310
+ for k in test_target_dic.keys():
311
+ func = k.split(" ")[1]
312
+ src_code = " ".join(test_target_dic[k]).replace("NVPTX", "")
313
+ if func in fork_target_dic.keys():
314
+ fork_code = " ".join(fork_target_dic[func]).replace(tar, "")
315
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], fork_target_dic[func], "NVPTX", tar)
316
+ else:
317
+ fork_code = ""
318
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], [], "NVPTX", tar)
319
+
320
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
321
+ f.write(fork_code+'\n')
322
+ f1.write(src_code+'\n')
323
+ EM.append(fork_code==src_code)
324
+ edit_dis += fuzz.ratio(fork_code, src_code)
325
+ avg_ed += fuzz.ratio(fork_code, src_code)
326
+ cnt += 1
327
+ avg_cnt += 1
328
+ if fork_code.strip() == "":
329
+ bleu4 += 0
330
+ else:
331
+ tmp_bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
332
+ bleu4 += tmp_bleu4
333
+ avg_bleu4 += tmp_bleu4
334
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
335
+ writer = csv.writer(file)
336
+ writer.writerow([comp_type, isa_type, tar, str(round(float(bleu4)/cnt,2)), str(round(np.mean(EM)*100,2)), str(round(float(edit_dis)/cnt,2)), str(round(float(stmt_mod)*100/cnt,2))])
337
+ if round(float(bleu4)/cnt,2) > max_bleu4:
338
+ max_bleu4 = round(float(bleu4)/cnt,2)
339
+ if round(float(edit_dis)/cnt,2) > max_ed:
340
+ max_ed = round(float(edit_dis)/cnt,2)
341
+ if isa_type == "MPU":
342
+ for line in open(src_dir + "/LLVM/ARC.jsonl", 'r'):
343
+ dic = json.loads(line)
344
+ test_target_dic[str(cnt_idx) + " " + dic["Func"].replace("ARC", "")] = dic["ground_truth"]
345
+ cnt_idx += 1
346
+ for tar in target_lis:
347
+ edit_dis = 0.0
348
+ EM = []
349
+ bleu4 = 0.0
350
+ stmt_mod = 0.0
351
+ cnt = 0
352
+ fork_target_dic = {}
353
+ for line in open(src_dir + "/" + comp_type + "/" + tar + ".jsonl", 'r'):
354
+ dic = json.loads(line)
355
+ fork_target_dic[dic["Func"].replace(tar, "")] = dic["ground_truth"]
356
+
357
+ for k in test_target_dic.keys():
358
+ func = k.split(" ")[1]
359
+ src_code = " ".join(test_target_dic[k]).replace("ARC", "")
360
+ if func in fork_target_dic.keys():
361
+ fork_code = " ".join(fork_target_dic[func]).replace(tar, "")
362
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], fork_target_dic[func], "ARC", tar)
363
+ else:
364
+ fork_code = ""
365
+ stmt_mod += Calculate_Statements_Ratio(test_target_dic[k], [], "ARC", tar)
366
+ with open(dst_dir+"/test.output",'w') as f, open(dst_dir+"/test.gold",'w') as f1:
367
+ f.write(fork_code+'\n')
368
+ f1.write(src_code+'\n')
369
+ EM.append(fork_code==src_code)
370
+ edit_dis += fuzz.ratio(fork_code, src_code)
371
+ avg_ed += fuzz.ratio(fork_code, src_code)
372
+ cnt += 1
373
+ avg_cnt += 1
374
+ if fork_code.strip() == "":
375
+ bleu4 += 0
376
+ else:
377
+ tmp_bleu4 = _bleu(dst_dir+"/test.gold", dst_dir+"/test.output")
378
+ bleu4 += tmp_bleu4
379
+ avg_bleu4 += tmp_bleu4
380
+
381
+
382
+ with open(dst_dir + '/result.csv', 'a', newline='') as file:
383
+ writer = csv.writer(file)
384
+ writer.writerow([comp_type, isa_type, tar, str(round(float(bleu4)/cnt,2)), str(round(np.mean(EM)*100,2)), str(round(float(edit_dis)/cnt,2)), str(round(float(stmt_mod)*100/cnt,2))])
385
+ if round(float(bleu4)/cnt,2) > max_bleu4:
386
+ max_bleu4 = round(float(bleu4)/cnt,2)
387
+ if round(float(edit_dis)/cnt,2) > max_ed:
388
+ max_ed = round(float(edit_dis)/cnt,2)
389
+ print(comp_type + " " + isa_type)
390
+ print("Avg ED: " + str(round(float(avg_ed)/avg_cnt,2)))
391
+ print("Max ED: " + str(max_ed))
392
+ print("Avg BLEU4: " + str(round(float(avg_bleu4)/avg_cnt,2)))
393
+ print("Max BLEU4: " + str(max_bleu4))
394
+ print("\n\n")
395
+
396
+
397
+
398
+
399
+
400
+ if __name__ == "__main__":
401
+ with open(dst_dir + '/result.csv', 'w', newline='') as file:
402
+ writer = csv.writer(file)
403
+ writer.writerow(["Compiler Type", "ISA Type", "Target", "BLEU4", "Exact Match", "Edit Didtance", "Stmt_Ratio"])
404
+ Calculate_Forkflow()
405
+
406
+
407
+
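The Stmt_Ratio metric above splits tokens into statements at ";", ":", "{" and "}", then diffs the statement lines of the reference target against the fork target and counts the unchanged ones. A small standalone sketch of that counting step (unchanged_ratio is an illustrative name, not part of the script, which divides by the statement count of the source instead):

import difflib

def unchanged_ratio(src_stmts, fork_stmts):
    same = 0
    for d in difflib.Differ().compare(src_stmts, fork_stmts):
        if d[0] in ("-", "+", "?"):   # removed, added, or intraline-hint lines
            continue
        if d.strip() == "":
            continue
        same += 1                     # statement appears unchanged in both
    return round(same / max(len(src_stmts), 1), 2)

print(unchanged_ratio(["a ;", "b ;", "c ;"], ["a ;", "x ;", "c ;"]))  # 0.67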
Script/Model/CodeBert/code-completion/model.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch
7
+ from torch.autograd import Variable
8
+ import copy
9
+ class Seq2Seq(nn.Module):
10
+ """
11
+ Build Sequence-to-Sequence.
12
+
13
+ Parameters:
14
+
15
+ * `encoder`- encoder of seq2seq model. e.g. roberta
16
+ * `decoder`- decoder of seq2seq model. e.g. transformer
17
+ * `config`- configuration of encoder model.
18
+ * `beam_size`- beam size for beam search.
19
+ * `max_length`- max length of target for beam search.
20
+ * `sos_id`- start of symbol ids in target for beam search.
21
+ * `eos_id`- end of symbol ids in target for beam search.
22
+ """
23
+ def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24
+ super(Seq2Seq, self).__init__()
25
+ self.encoder = encoder
26
+ self.decoder=decoder
27
+ self.config=config
28
+ self.register_buffer(
29
+ "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30
+ )
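+ # lower-triangular buffer: a causal mask so position i attends only to positions <= i (up to 1024 tokens)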
31
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33
+ self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34
+ self.lsm = nn.LogSoftmax(dim=-1)
35
+
36
+ self.beam_size = beam_size
37
+ self.max_length = max_length
38
+ self.sos_id = sos_id
39
+ self.eos_id = eos_id
40
+
41
+ def forward(self, source_ids, target_ids=None):
42
+ if target_ids is None:
43
+ return self.generate(source_ids)
44
+
45
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
46
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47
+ ids = torch.cat((source_ids,target_ids),-1)
48
+ mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
49
+ mask = mask & ids[:,None,:].ne(1)
50
+
51
+ out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52
+ lm_logits = self.lm_head(out)
53
+ # Shift so that tokens < n predict n
54
+ active_loss = target_ids[..., 1:].ne(1).view(-1)
55
+ shift_logits = lm_logits[..., :-1, :].contiguous()
56
+ shift_labels = target_ids[..., 1:].contiguous()
57
+ # Flatten the tokens
58
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60
+ shift_labels.view(-1)[active_loss])
61
+
62
+ outputs = loss,loss*active_loss.sum(),active_loss.sum()
63
+ return outputs
64
+
65
+ def generate(self, source_ids):
66
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68
+ preds = []
69
+ zero = torch.cuda.LongTensor(1).fill_(0)
70
+ source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71
+ for i in range(source_ids.shape[0]):
72
+ context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73
+ for y in encoder_output.past_key_values]
74
+ beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75
+ input_ids = beam.getCurrentState()
76
+ context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77
+ for _ in range(self.max_length):
78
+ if beam.done():
79
+ break
80
+
81
+ ids = torch.cat((context_ids,input_ids),-1)
82
+ mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83
+ mask = mask & ids[:,None,:].ne(1)
84
+ out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85
+ hidden_states = out[:,-1,:]
86
+ out = self.lsm(self.lm_head(hidden_states)).data
87
+ beam.advance(out)
88
+ input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89
+ input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90
+ hyp = beam.getHyp(beam.getFinal())
91
+ pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92
+ pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93
+ preds.append(torch.cat(pred,0).unsqueeze(0))
94
+
95
+ preds = torch.cat(preds,0)
96
+
97
+ return preds
98
+
99
+
100
+
101
+ class Beam(object):
102
+ def __init__(self, size,sos,eos):
103
+ self.size = size
104
+ self.tt = torch.cuda
105
+ # The score for each translation on the beam.
106
+ self.scores = self.tt.FloatTensor(size).zero_()
107
+ # The backpointers at each time-step.
108
+ self.prevKs = []
109
+ # The outputs at each time-step.
110
+ self.nextYs = [self.tt.LongTensor(size)
111
+ .fill_(0)]
112
+ self.nextYs[0][0] = sos
113
+ # Has EOS topped the beam yet.
114
+ self._eos = eos
115
+ self.eosTop = False
116
+ # Time and k pair for finished.
117
+ self.finished = []
118
+
119
+ def getCurrentState(self):
120
+ "Get the outputs for the current timestep."
121
+ batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122
+ return batch
123
+
124
+ def getCurrentOrigin(self):
125
+ "Get the backpointers for the current timestep."
126
+ return self.prevKs[-1]
127
+
128
+ def advance(self, wordLk):
129
+ """
130
+ Given prob over words for every last beam `wordLk` and attention
131
+ `attnOut`: Compute and update the beam search.
132
+
133
+ Parameters:
134
+
135
+ * `wordLk`- probs of advancing from the last step (K x words)
136
+ * `attnOut`- attention at the last step
137
+
138
+ Returns: True if beam search is complete.
139
+ """
140
+ numWords = wordLk.size(1)
141
+
142
+ # Sum the previous scores.
143
+ if len(self.prevKs) > 0:
144
+ beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145
+
146
+ # Don't let EOS have children.
147
+ for i in range(self.nextYs[-1].size(0)):
148
+ if self.nextYs[-1][i] == self._eos:
149
+ beamLk[i] = -1e20
150
+ else:
151
+ beamLk = wordLk[0]
152
+ flatBeamLk = beamLk.view(-1)
153
+ bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154
+
155
+ self.scores = bestScores
156
+
157
+ # bestScoresId is flattened beam x word array, so calculate which
158
+ # word and beam each score came from
159
+ prevK = bestScoresId // numWords
160
+ self.prevKs.append(prevK)
161
+ self.nextYs.append((bestScoresId - prevK * numWords))
162
+
163
+
164
+ for i in range(self.nextYs[-1].size(0)):
165
+ if self.nextYs[-1][i] == self._eos:
166
+ s = self.scores[i]
167
+ self.finished.append((s, len(self.nextYs) - 1, i))
168
+
169
+ # End condition is when top-of-beam is EOS and no global score.
170
+ if self.nextYs[-1][0] == self._eos:
171
+ self.eosTop = True
172
+
173
+ def done(self):
174
+ return self.eosTop and len(self.finished) >=self.size
175
+
176
+ def getFinal(self):
177
+ if len(self.finished) == 0:
178
+ self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179
+ self.finished.sort(key=lambda a: -a[0])
180
+ if len(self.finished) != self.size:
181
+ unfinished=[]
182
+ for i in range(self.nextYs[-1].size(0)):
183
+ if self.nextYs[-1][i] != self._eos:
184
+ s = self.scores[i]
185
+ unfinished.append((s, len(self.nextYs) - 1, i))
186
+ unfinished.sort(key=lambda a: -a[0])
187
+ self.finished+=unfinished[:self.size-len(self.finished)]
188
+ return self.finished[:self.size]
189
+
190
+ def getHyp(self, beam_res):
191
+ """
192
+ Walk back to construct the full hypothesis.
193
+ """
194
+ hyps=[]
195
+ for _,timestep, k in beam_res:
196
+ hyp = []
197
+ for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198
+ hyp.append(self.nextYs[j+1][k])
199
+ k = self.prevKs[j][k]
200
+ hyps.append(hyp[::-1])
201
+ return hyps
202
+
203
+ def buildTargetTokens(self, preds):
204
+ sentence=[]
205
+ for pred in preds:
206
+ tokens = []
207
+ for tok in pred:
208
+ if tok==self._eos:
209
+ break
210
+ tokens.append(tok)
211
+ sentence.append(tokens)
212
+ return sentence
213
+
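The Seq2Seq wrapper above is driven by run_completion.py below; a minimal instantiation sketch mirroring the arguments that script passes (the checkpoint name, beam size and max length here are illustrative):

from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
from model import Seq2Seq

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
config = RobertaConfig.from_pretrained("microsoft/codebert-base")
config.is_decoder = True  # required so the shared RobertaModel can run as a decoder
encoder = RobertaModel.from_pretrained("microsoft/codebert-base", config=config)

model = Seq2Seq(encoder=encoder, decoder=encoder, config=config,
                beam_size=10, max_length=128,
                sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],
                eos_id=tokenizer.sep_token_id)
# model(source_ids, target_ids) returns the loss; model(source_ids) runs beam search (Beam is CUDA-only).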
Script/Model/CodeBert/code-completion/run_completion.py ADDED
@@ -0,0 +1,540 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ import pickle
26
+ import torch
27
+ import json
28
+
29
+ import random
30
+ import logging
31
+ import argparse
32
+ import numpy as np
33
+ from io import open
34
+ from itertools import cycle
35
+ import torch.nn as nn
36
+ from model import Seq2Seq
37
+ from tqdm import tqdm, trange
38
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
39
+ from torch.utils.data.distributed import DistributedSampler
40
+ from tqdm import tqdm
41
+ from fuzzywuzzy import fuzz
42
+ import re
43
+ import multiprocessing
44
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
45
+ RobertaConfig, RobertaModel, RobertaTokenizer)
46
+
47
+ divide_number = 2
48
+ cpu_cont = 16
49
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
50
+ datefmt = '%m/%d/%Y %H:%M:%S',
51
+ level = logging.INFO)
52
+ logger = logging.getLogger(__name__)
53
+
54
+
55
+ class Example(object):
56
+ """A single training/test example."""
57
+ def __init__(self,
58
+ idx,
59
+ source,
60
+ target,
61
+ max_src_len,
62
+ max_tar_len
63
+ ):
64
+ self.idx = idx
65
+ self.source = source
66
+ self.target = target
67
+ self.max_src_len = max_src_len
68
+ self.max_tar_len = max_tar_len
69
+
70
+ def read_examples(filename):
71
+ """Read examples from filename."""
72
+ examples=[]
73
+
74
+ with open(filename,encoding="utf-8") as f:
75
+ max_src_len = 0
76
+ max_tar_len = 0
77
+ for idx, line in enumerate(f):
78
+ js=json.loads(line)
79
+ inputs = " ".join(js["Template_token"][1:])
80
+ max_src_len = max(max_src_len, len(js["Template_token"]))
81
+
82
+ if "ground_truth" in js:
83
+ outputs = " ".join(js["ground_truth"])
84
+ max_tar_len = max(max_tar_len, len(js["ground_truth"]))
85
+ else:
86
+ outputs = inputs
87
+ if 'Idx' in js:
88
+ idx = js['Idx']
89
+ examples.append(
90
+ Example(
91
+ idx = idx,
92
+ source = inputs,
93
+ target = outputs,
94
+ max_src_len = max_src_len,
95
+ max_tar_len = max_tar_len
96
+ )
97
+ )
98
+ return examples
99
+
100
+
101
+ class InputFeatures(object):
102
+ """A single training/test features for a example."""
103
+ def __init__(self,
104
+ example_id,
105
+ source_ids,
106
+ target_ids,
107
+ ):
108
+ self.example_id = example_id
109
+ self.source_ids = source_ids
110
+ self.target_ids = target_ids
111
+
112
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
113
+ features = []
114
+ for example_index, example in enumerate(examples):
115
+ #source
116
+ source_tokens = tokenizer.tokenize(example.source)[:args.max_source_length-5]
117
+ source_tokens =[tokenizer.cls_token,tokenizer.sep_token]+source_tokens+["<mask>", tokenizer.sep_token]
118
+ source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
119
+ padding_length = args.max_source_length - len(source_ids)
120
+ source_ids+=[tokenizer.pad_token_id]*padding_length
121
+
122
+ #target
123
+ if stage=="test":
124
+ target_tokens = tokenizer.tokenize("None")
125
+ else:
126
+ target_tokens = ["<mask>"] + tokenizer.tokenize(example.target)[:args.max_target_length-2]
127
+ target_tokens = target_tokens+[tokenizer.sep_token]
128
+ target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
129
+ padding_length = args.max_target_length - len(target_ids)
130
+ target_ids+=[tokenizer.pad_token_id]*padding_length
131
+
132
+ features.append(
133
+ InputFeatures(
134
+ example_index,
135
+ source_ids,
136
+ target_ids,
137
+ )
138
+ )
139
+ return features
140
+
141
+
142
+
143
+ def set_seed(seed=20240124):
144
+ random.seed(seed)
145
+ os.environ['PYTHONHASHSEED'] = str(seed)
146
+ np.random.seed(seed)
147
+ torch.manual_seed(seed)
148
+ torch.cuda.manual_seed(seed)
149
+ torch.backends.cudnn.deterministic = True
150
+
151
+
152
+ def main():
153
+ parser = argparse.ArgumentParser()
154
+
155
+ ## Required parameters
156
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
157
+ help="Path to pre-trained model: e.g. roberta-base" )
158
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
159
+ help="The output directory where the model predictions and checkpoints will be written.")
160
+ parser.add_argument("--load_model_path", default=None, type=str,
161
+ help="Path to trained model: Should contain the .bin files" )
162
+ ## Other parameters
163
+ parser.add_argument("--task", default=None, type=str, required=True,
164
+ help="Task Type: statement_level, next_statement" )
165
+
166
+ parser.add_argument("--train_filename", default="../../Dataset/", type=str,
167
+ help="The train filename. Should contain the .jsonl files for this task.")
168
+ parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
169
+ help="The dev filename. Should contain the .jsonl files for this task.")
170
+ parser.add_argument("--test_filename", default="../../Dataset/", type=str,
171
+ help="The test filename. Should contain the .jsonl files for this task.")
172
+
173
+ parser.add_argument("--config_name", default="", type=str,
174
+ help="Pretrained config name or path if not the same as model_name")
175
+ parser.add_argument("--tokenizer_name", default="", type=str,
176
+ help="Pretrained tokenizer name or path if not the same as model_name")
177
+ # parser.add_argument("--max_source_length", default=64, type=int,
178
+ # help="The maximum total source sequence length after tokenization. Sequences longer "
179
+ # "than this will be truncated, sequences shorter will be padded.")
180
+ # parser.add_argument("--max_target_length", default=32, type=int,
181
+ # help="The maximum total target sequence length after tokenization. Sequences longer "
182
+ # "than this will be truncated, sequences shorter will be padded.")
183
+
184
+ parser.add_argument("--do_train", action='store_true',
185
+ help="Whether to run training.")
186
+ parser.add_argument("--do_eval", action='store_true',
187
+ help="Whether to run eval on the dev set.")
188
+ parser.add_argument("--do_test", action='store_true',
189
+ help="Whether to run eval on the dev set.")
190
+ parser.add_argument("--test_org", action='store_true',
191
+ help="Whether to run eval on org model.")
192
+ parser.add_argument("--do_lower_case", action='store_true',
193
+ help="Set this flag if you are using an uncased model.")
194
+ parser.add_argument("--no_cuda", action='store_true',
195
+ help="Avoid using CUDA when available")
196
+
197
+ parser.add_argument("--train_batch_size", default=8, type=int,
198
+ help="Batch size per GPU/CPU for training.")
199
+ parser.add_argument("--eval_batch_size", default=8, type=int,
200
+ help="Batch size per GPU/CPU for evaluation.")
201
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
202
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
203
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
204
+ help="The initial learning rate for Adam.")
205
+ parser.add_argument("--beam_size", default=10, type=int,
206
+ help="beam size for beam search")
207
+ parser.add_argument("--weight_decay", default=0.0, type=float,
208
+ help="Weight deay if we apply some.")
209
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
210
+ help="Epsilon for Adam optimizer.")
211
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
212
+ help="Max gradient norm.")
213
+ parser.add_argument("--num_train_epochs", default=3, type=int,
214
+ help="Total number of training epochs to perform.")
215
+ parser.add_argument("--max_steps", default=-1, type=int,
216
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
217
+ parser.add_argument("--eval_steps", default=-1, type=int,
218
+ help="")
219
+ parser.add_argument("--max_target_length", default=128, type=int,
220
+ help="")
221
+ parser.add_argument("--max_source_length", default=384, type=int,
222
+ help="")
223
+ parser.add_argument("--train_steps", default=-1, type=int,
224
+ help="")
225
+ parser.add_argument("--warmup_steps", default=0, type=int,
226
+ help="Linear warmup over warmup_steps.")
227
+ parser.add_argument("--local_rank", type=int, default=-1,
228
+ help="For distributed training: local_rank")
229
+ parser.add_argument('--seed', type=int, default=20240124,
230
+ help="random seed for initialization")
231
+ # print arguments
232
+ args = parser.parse_args()
233
+ # set log
234
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
235
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
236
+ # set device
237
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
238
+ args.n_gpu = torch.cuda.device_count()
239
+ args.device = device
240
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
241
+
242
+ # Set seed
243
+ set_seed(args.seed)
244
+
245
+ # make dir if output_dir not exist
246
+ if os.path.exists(args.output_dir) is False:
247
+ os.makedirs(args.output_dir)
248
+
249
+ # build model
250
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
251
+ config = RobertaConfig.from_pretrained(args.model_name_or_path)
252
+ # important: you must set is_decoder to True for generation
253
+ config.is_decoder = True
254
+ encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)
255
+
256
+ model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
257
+ beam_size=args.beam_size,max_length=args.max_target_length,
258
+ sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)
259
+
260
+ logger.info("Training/evaluation parameters %s", args)
261
+
262
+ if args.load_model_path is not None:
263
+ if args.task == "statement_level":
264
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
265
+ model.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
266
+ else:
267
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
268
+ model.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
269
+
270
+ model.to(args.device)
271
+
272
+ if args.n_gpu > 1:
273
+ # multi-gpu training
274
+ model = torch.nn.DataParallel(model)
275
+
276
+ if args.do_train:
277
+ # Prepare training data loader
278
+ if args.task == "statement_level":
279
+ train_examples = read_examples(args.train_filename + "/Code_Completion/statement_level/train.jsonl")
280
+ else:
281
+ train_examples = read_examples(args.train_filename + "/Code_Completion/next_statement/train.jsonl")
282
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
283
+ all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
284
+ all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
285
+ train_data = TensorDataset(all_source_ids,all_target_ids)
286
+ train_sampler = RandomSampler(train_data)
287
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
288
+
289
+
290
+ # Prepare optimizer and schedule (linear warmup and decay)
291
+ no_decay = ['bias', 'LayerNorm.weight']
292
+ optimizer_grouped_parameters = [
293
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
294
+ 'weight_decay': args.weight_decay},
295
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
296
+ ]
297
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
298
+ scheduler = get_linear_schedule_with_warmup(optimizer,
299
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
300
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
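+ # linear schedule: warm up over the first 10% of total steps, then decay to zero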
301
+
302
+ #Start training
303
+ logger.info("***** Running training *****")
304
+ logger.info(" Num examples = %d", len(train_examples))
305
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
306
+ logger.info(" Num epoch = %d", args.num_train_epochs)
307
+
308
+
309
+ model.train()
310
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
311
+ for epoch in range(args.num_train_epochs):
312
+ for idx,batch in enumerate(train_dataloader):
313
+ batch = tuple(t.to(device) for t in batch)
314
+ source_ids,target_ids = batch
315
+ loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
316
+
317
+ if args.n_gpu > 1:
318
+ loss = loss.mean() # mean() to average on multi-gpu.
319
+ if args.gradient_accumulation_steps > 1:
320
+ loss = loss / args.gradient_accumulation_steps
321
+
322
+ losses.append(loss.item())
323
+ loss.backward()
324
+ if len(losses) % args.gradient_accumulation_steps == 0:
325
+ #Update parameters
326
+ optimizer.step()
327
+ optimizer.zero_grad()
328
+ scheduler.step()
329
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
330
+ logger.info("epoch {} step {} loss {}".format(epoch,
331
+ len(losses)//args.gradient_accumulation_steps,
332
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
333
+ if args.do_eval:
334
+ #Eval model with dev dataset
335
+
336
+ if 'dev_loss' in dev_dataset:
337
+ eval_examples,eval_data = dev_dataset['dev_loss']
338
+ else:
339
+ if args.task == "statement_level":
340
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
341
+ else:
342
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
343
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
344
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
345
+ all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
346
+ eval_data = TensorDataset(all_source_ids,all_target_ids)
347
+ dev_dataset['dev_loss'] = eval_examples, eval_data
348
+ eval_sampler = SequentialSampler(eval_data)
349
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
350
+ res_list = []
351
+ logger.info("\n***** Running evaluation *****")
352
+ logger.info(" Num examples = %d", len(eval_examples))
353
+ logger.info(" Batch size = %d", args.eval_batch_size)
354
+
355
+ # Start evaluating the model
356
+ model.eval()
357
+ eval_loss,tokens_num = 0,0
358
+ for batch in eval_dataloader:
359
+ batch = tuple(t.to(device) for t in batch)
360
+ source_ids,target_ids = batch
361
+
362
+ with torch.no_grad():
363
+ _,loss,num = model(source_ids=source_ids,target_ids=target_ids)
364
+ eval_loss += loss.sum().item()
365
+ tokens_num += num.sum().item()
366
+ # Print loss on the dev dataset
367
+ model.train()
368
+ eval_loss = eval_loss / tokens_num
369
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
370
+ for key in sorted(result.keys()):
371
+ logger.info(" %s = %s", key, str(result[key]))
372
+ logger.info(" "+"*"*20)
373
+
374
+ #Calculate bleu
375
+ if 'dev_bleu' in dev_dataset:
376
+ eval_examples,eval_data=dev_dataset['dev_bleu']
377
+ else:
378
+ if args.task == "statement_level":
379
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
380
+ else:
381
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
382
+ # eval_examples = random.sample(eval_examples, int(len(eval_examples) / divide_number))
383
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
384
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
385
+ eval_data = TensorDataset(all_source_ids)
386
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
387
+
388
+ eval_sampler = SequentialSampler(eval_data)
389
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
390
+
391
+ model.eval()
392
+ p=[]
393
+ for batch in eval_dataloader:
394
+ batch = tuple(t.to(device) for t in batch)
395
+ source_ids = batch[0]
396
+ with torch.no_grad():
397
+ preds = model(source_ids)
398
+ # convert ids to text
399
+ for pred in preds:
400
+ t = pred[0].cpu().numpy()
401
+ t = list(t)
402
+ if 0 in t:
403
+ t = t[:t.index(0)]
404
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
405
+ p.append(text)
406
+ model.train()
407
+ EM = 0.0
408
+ edit_sim = 0.0
409
+ total = len(p)
410
+ token_accuracy = 0
411
+ for ref,gold in zip(p,eval_examples):
412
+ pred = ref.strip()
413
+ gt = gold.target
414
+ edit_sim += fuzz.ratio(pred, gt)
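+ # fuzz.ratio yields a 0-100 similarity, so the "Edit Distance" logged below is
+ # really an average similarity score (higher means closer to the ground truth)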
415
+ if pred.split() == gt.split():
416
+ EM += 1
417
+ res_list.append([pred,gt])
418
+ dev_acc = round(EM/total*100, 2)
419
+ # logger.info(" %s = %s "%("loss",round(np.mean(dev_losses),4)))
420
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
421
+ logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
422
+ logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/total, 2))))
423
+ logger.info(" "+"*"*20)
424
+
425
+ if dev_acc > best_score:
426
+ best_score = dev_acc
427
+ # Save best checkpoint for best bleu
428
+ if args.task == "statement_level":
429
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
430
+ else:
431
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
432
+ if not os.path.exists(output_dir):
433
+ os.makedirs(output_dir)
434
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
435
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
436
+ torch.save(model_to_save.state_dict(), output_model_file)
437
+ patience = 0
438
+ else:
439
+ patience += 1
440
+ if patience == 3:
441
+ break
442
+ logger.info(" Best score:%s",best_score)
443
+ logger.info(" "+"*"*20)
444
+
445
+ if args.task == "statement_level":
446
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
447
+ else:
448
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
449
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
450
+ for line in res_list:
451
+ dic = {}
452
+ dic["Pred"] = line[0]
453
+ dic["GT"] = line[1]
454
+ wf.write(json.dumps(dic))
455
+ wf.write("\n")
456
+
457
+ if args.do_test:
458
+ res_list = []
459
+ output_dir2 = ""
460
+
461
+ if args.load_model_path is not None:
462
+ model_to_load = model.module if hasattr(model, 'module') else model
463
+
464
+ if args.task == "statement_level":
465
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
466
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
467
+ else:
468
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
469
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
470
+
471
+ if args.task == "statement_level":
472
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/statement_level/test.jsonl')
473
+ else:
474
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/next_statement/test.jsonl')
475
+ eval_examples = read_examples(args.test_filename)
476
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
477
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
478
+ eval_data = TensorDataset(all_source_ids)
479
+
480
+ # Calculate bleu
481
+ eval_sampler = SequentialSampler(eval_data)
482
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
483
+
484
+ model.eval()
485
+ p=[]
486
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
487
+ batch = tuple(t.to(device) for t in batch)
488
+ source_ids = batch[0]
489
+ with torch.no_grad():
490
+ preds = model(source_ids)
491
+ # convert ids to text
492
+ for pred in preds:
493
+ t = pred[0].cpu().numpy()
494
+ t = list(t)
495
+ if 0 in t:
496
+ t = t[:t.index(0)]
497
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
498
+ p.append(text)
499
+ model.train()
500
+ avg_acc = 0.0
501
+ avg_EM = 0.0
502
+ total = 0
503
+ for ref,gold in zip(p,eval_examples):
504
+ pred = ref.strip() # post_process(ref.strip()).split(" ")
505
+ gt = gold.target.strip()
506
+ if pred == gt:
507
+ avg_EM += 1
508
+ avg_acc += fuzz.ratio(pred, gt)
509
+ res_list.append([pred, gt])
510
+ total += 1
511
+ dev_acc = round(avg_acc/total, 2)
512
+ dev_em = round(avg_EM/total, 4)
513
+
514
+ logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
515
+ logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
516
+ logger.info(" "+"*"*20)
517
+ if args.test_org:
518
+ output_dir = args.output_dir
519
+ else:
520
+ if args.task == "statement_level":
521
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
522
+ else:
523
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
524
+
525
+ with open(output_dir + "/test_result.jsonl", 'w') as wf:
526
+ for line in res_list:
527
+ dic = {}
528
+ dic["Pred"] = line[0]
529
+ dic["GT"] = line[1]
530
+ wf.write(json.dumps(dic))
531
+ wf.write("\n")
532
+
533
+
534
+
535
+
536
+ if __name__ == "__main__":
537
+ main()
538
+
539
+
540
+
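For reference, a minimal sketch of invoking this completion script (the checkpoint name, dataset path, and output directory below are illustrative placeholders, not values from this commit):

    python run_completion.py \
        --model_name_or_path microsoft/codebert-base \
        --task statement_level \
        --train_filename ../../Dataset \
        --dev_filename ../../Dataset \
        --output_dir ./saved_models \
        --do_train --do_eval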
Script/Model/CodeBert/code-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
+ def _get_ngrams(segment, max_order):
29
+ """Extracts all n-grams upto a given maximum order from an input segment.
30
+
31
+ Args:
32
+ segment: text segment from which n-grams will be extracted.
33
+ max_order: maximum length in tokens of the n-grams returned by this
34
+ method.
35
+
36
+ Returns:
37
+ The Counter containing all n-grams up to max_order in segment
38
+ with a count of how many times each n-gram occurred.
39
+ """
40
+ ngram_counts = collections.Counter()
41
+ for order in range(1, max_order + 1):
42
+ for i in range(0, len(segment) - order + 1):
43
+ ngram = tuple(segment[i:i+order])
44
+ ngram_counts[ngram] += 1
45
+ return ngram_counts
46
+
47
+
48
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
+ smooth=False):
50
+ """Computes BLEU score of translated segments against one or more references.
51
+
52
+ Args:
53
+ reference_corpus: list of lists of references for each translation. Each
54
+ reference should be tokenized into a list of tokens.
55
+ translation_corpus: list of translations to score. Each translation
56
+ should be tokenized into a list of tokens.
57
+ max_order: Maximum n-gram order to use when computing BLEU score.
58
+ smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
+
60
+ Returns:
61
+ 6-tuple with the BLEU score, n-gram precisions, brevity penalty, length
62
+ ratio, translation length, and reference length.
63
+ """
64
+ matches_by_order = [0] * max_order
65
+ possible_matches_by_order = [0] * max_order
66
+ reference_length = 0
67
+ translation_length = 0
68
+ for (references, translation) in zip(reference_corpus,
69
+ translation_corpus):
70
+ reference_length += min(len(r) for r in references)
71
+ translation_length += len(translation)
72
+
73
+ merged_ref_ngram_counts = collections.Counter()
74
+ for reference in references:
75
+ merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
+ translation_ngram_counts = _get_ngrams(translation, max_order)
77
+ overlap = translation_ngram_counts & merged_ref_ngram_counts
78
+ for ngram in overlap:
79
+ matches_by_order[len(ngram)-1] += overlap[ngram]
80
+ for order in range(1, max_order+1):
81
+ possible_matches = len(translation) - order + 1
82
+ if possible_matches > 0:
83
+ possible_matches_by_order[order-1] += possible_matches
84
+
85
+ precisions = [0] * max_order
86
+ for i in range(0, max_order):
87
+ if smooth:
88
+ precisions[i] = ((matches_by_order[i] + 1.) /
89
+ (possible_matches_by_order[i] + 1.))
90
+ else:
91
+ if possible_matches_by_order[i] > 0:
92
+ precisions[i] = (float(matches_by_order[i]) /
93
+ possible_matches_by_order[i])
94
+ else:
95
+ precisions[i] = 0.0
96
+
97
+ if min(precisions) > 0:
98
+ p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
+ geo_mean = math.exp(p_log_sum)
100
+ else:
101
+ geo_mean = 0
102
+
103
+ ratio = float(translation_length) / reference_length
104
+
105
+ if ratio > 1.0:
106
+ bp = 1.
107
+ else:
108
+ bp = math.exp(1 - 1. / ratio)
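+ # e.g. a translation half as long as its reference (ratio = 0.5) gets bp = exp(-1) ~ 0.37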
109
+
110
+ bleu = geo_mean * bp
111
+
112
+ return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
+ def _bleu(ref_file, trans_file, subword_option=None):
116
+ max_order = 4
117
+ smooth = True
118
+ ref_files = [ref_file]
119
+ reference_text = []
120
+ for reference_filename in ref_files:
121
+ with open(reference_filename) as fh:
122
+ reference_text.append(fh.readlines())
123
+ per_segment_references = []
124
+ for references in zip(*reference_text):
125
+ reference_list = []
126
+ for reference in references:
127
+ reference_list.append(reference.strip().split())
128
+ per_segment_references.append(reference_list)
129
+ translations = []
130
+ with open(trans_file) as fh:
131
+ for line in fh:
132
+ translations.append(line.strip().split())
133
+ bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
+ return round(100 * bleu_score,2)
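For reference, a minimal sketch of calling these helpers directly (the token strings are invented for illustration):

    from bleu import compute_bleu, _bleu

    # each reference set is a list of tokenized references for one candidate
    references = [["def add ( a , b ) : return a + b".split()]]
    candidates = ["def add ( a , b ) : return a + b".split()]
    bleu, precisions, bp, ratio, trans_len, ref_len = compute_bleu(
        references, candidates, max_order=4, smooth=True)
    print(round(100 * bleu, 2))  # 100.0 for an exact match

_bleu(ref_file, trans_file) reads one whitespace-tokenized segment per line from each file and returns the smoothed corpus BLEU-4 as a percentage.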
Script/Model/CodeBert/code-generation/model.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import torch
5
+ import torch.nn as nn
9
+ class Seq2Seq(nn.Module):
10
+ """
11
+ Build Sequence-to-Sequence.
12
+
13
+ Parameters:
14
+
15
+ * `encoder`- encoder of seq2seq model. e.g. roberta
16
+ * `decoder`- decoder of seq2seq model. e.g. transformer
17
+ * `config`- configuration of encoder model.
18
+ * `beam_size`- beam size for beam search.
19
+ * `max_length`- max length of target for beam search.
20
+ * `sos_id`- start-of-sequence symbol id in target for beam search.
22
+ * `eos_id`- end-of-sequence symbol id in target for beam search.
22
+ """
23
+ def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24
+ super(Seq2Seq, self).__init__()
25
+ self.encoder = encoder
26
+ self.decoder=decoder
27
+ self.config=config
28
+ self.register_buffer(
29
+ "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30
+ )
31
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33
+ self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34
+ self.lsm = nn.LogSoftmax(dim=-1)
35
+
36
+ self.beam_size = beam_size
37
+ self.max_length = max_length
38
+ self.sos_id = sos_id
39
+ self.eos_id = eos_id
40
+
41
+ def forward(self, source_ids, target_ids=None):
42
+ if target_ids is None:
43
+ return self.generate(source_ids)
44
+
45
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
46
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47
+ ids = torch.cat((source_ids,target_ids),-1)
48
+ mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
49
+ mask = mask & ids[:,None,:].ne(1)
50
+
51
+ out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52
+ lm_logits = self.lm_head(out)
53
+ # Shift so that tokens < n predict n
54
+ active_loss = target_ids[..., 1:].ne(1).view(-1)
55
+ shift_logits = lm_logits[..., :-1, :].contiguous()
56
+ shift_labels = target_ids[..., 1:].contiguous()
57
+ # Flatten the tokens
58
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60
+ shift_labels.view(-1)[active_loss])
61
+
62
+ outputs = loss,loss*active_loss.sum(),active_loss.sum()
63
+ return outputs
64
+
65
+ def generate(self, source_ids):
66
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68
+ preds = []
69
+ zero = torch.zeros(1, dtype=torch.long, device=source_ids.device) # pad id 0 on the input's device
70
+ source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71
+ for i in range(source_ids.shape[0]):
72
+ context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73
+ for y in encoder_output.past_key_values]
74
+ beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75
+ input_ids = beam.getCurrentState()
76
+ context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77
+ for _ in range(self.max_length):
78
+ if beam.done():
79
+ break
80
+
81
+ ids = torch.cat((context_ids,input_ids),-1)
82
+ mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83
+ mask = mask & ids[:,None,:].ne(1)
84
+ out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85
+ hidden_states = out[:,-1,:]
86
+ out = self.lsm(self.lm_head(hidden_states)).data
87
+ beam.advance(out)
88
+ input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89
+ input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90
+ hyp = beam.getHyp(beam.getFinal())
91
+ pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92
+ pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93
+ preds.append(torch.cat(pred,0).unsqueeze(0))
94
+
95
+ preds = torch.cat(preds,0)
96
+
97
+ return preds
98
+
99
+
100
+
101
+ class Beam(object):
102
+ def __init__(self, size,sos,eos):
103
+ self.size = size
104
+ self.tt = torch.cuda
105
+ # The score for each translation on the beam.
106
+ self.scores = self.tt.FloatTensor(size).zero_()
107
+ # The backpointers at each time-step.
108
+ self.prevKs = []
109
+ # The outputs at each time-step.
110
+ self.nextYs = [self.tt.LongTensor(size)
111
+ .fill_(0)]
112
+ self.nextYs[0][0] = sos
113
+ # Has EOS topped the beam yet.
114
+ self._eos = eos
115
+ self.eosTop = False
116
+ # Time and k pair for finished.
117
+ self.finished = []
118
+
119
+ def getCurrentState(self):
120
+ "Get the outputs for the current timestep."
121
+ batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122
+ return batch
123
+
124
+ def getCurrentOrigin(self):
125
+ "Get the backpointers for the current timestep."
126
+ return self.prevKs[-1]
127
+
128
+ def advance(self, wordLk):
129
+ """
130
+ Given prob over words for every last beam `wordLk` and attention
131
+ `attnOut`: Compute and update the beam search.
132
+
133
+ Parameters:
134
+
135
+ * `wordLk`- probs of advancing from the last step (K x words)
136
+ * `attnOut`- attention at the last step
137
+
138
+ Returns: True if beam search is complete.
139
+ """
140
+ numWords = wordLk.size(1)
141
+
142
+ # Sum the previous scores.
143
+ if len(self.prevKs) > 0:
144
+ beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145
+
146
+ # Don't let EOS have children.
147
+ for i in range(self.nextYs[-1].size(0)):
148
+ if self.nextYs[-1][i] == self._eos:
149
+ beamLk[i] = -1e20
150
+ else:
151
+ beamLk = wordLk[0]
152
+ flatBeamLk = beamLk.view(-1)
153
+ bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154
+
155
+ self.scores = bestScores
156
+
157
+ # bestScoresId is flattened beam x word array, so calculate which
158
+ # word and beam each score came from
159
+ prevK = bestScoresId // numWords
160
+ self.prevKs.append(prevK)
161
+ self.nextYs.append((bestScoresId - prevK * numWords))
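+ # prevK indexes the parent hypothesis in the previous beam;
+ # bestScoresId - prevK * numWords recovers the token id chosen within it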
162
+
163
+
164
+ for i in range(self.nextYs[-1].size(0)):
165
+ if self.nextYs[-1][i] == self._eos:
166
+ s = self.scores[i]
167
+ self.finished.append((s, len(self.nextYs) - 1, i))
168
+
169
+ # End condition is when top-of-beam is EOS and no global score.
170
+ if self.nextYs[-1][0] == self._eos:
171
+ self.eosTop = True
172
+
173
+ def done(self):
174
+ return self.eosTop and len(self.finished) >=self.size
175
+
176
+ def getFinal(self):
177
+ if len(self.finished) == 0:
178
+ self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179
+ self.finished.sort(key=lambda a: -a[0])
180
+ if len(self.finished) != self.size:
181
+ unfinished=[]
182
+ for i in range(self.nextYs[-1].size(0)):
183
+ if self.nextYs[-1][i] != self._eos:
184
+ s = self.scores[i]
185
+ unfinished.append((s, len(self.nextYs) - 1, i))
186
+ unfinished.sort(key=lambda a: -a[0])
187
+ self.finished+=unfinished[:self.size-len(self.finished)]
188
+ return self.finished[:self.size]
189
+
190
+ def getHyp(self, beam_res):
191
+ """
192
+ Walk back to construct the full hypothesis.
193
+ """
194
+ hyps=[]
195
+ for _,timestep, k in beam_res:
196
+ hyp = []
197
+ for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198
+ hyp.append(self.nextYs[j+1][k])
199
+ k = self.prevKs[j][k]
200
+ hyps.append(hyp[::-1])
201
+ return hyps
202
+
203
+ def buildTargetTokens(self, preds):
204
+ sentence=[]
205
+ for pred in preds:
206
+ tokens = []
207
+ for tok in pred:
208
+ if tok==self._eos:
209
+ break
210
+ tokens.append(tok)
211
+ sentence.append(tokens)
212
+ return sentence
213
+
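A minimal sketch of wiring this class up (the checkpoint name and start/end ids are illustrative; the accompanying run scripts use the "<mask0>" token as the start symbol, and the Beam helper assumes CUDA tensors):

    import torch
    from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
    from model import Seq2Seq

    name = "microsoft/codebert-base"
    tokenizer = RobertaTokenizer.from_pretrained(name)
    config = RobertaConfig.from_pretrained(name)
    config.is_decoder = True  # the decoder pass relies on causal masking
    encoder = RobertaModel.from_pretrained(name, config=config)
    model = Seq2Seq(encoder=encoder, decoder=encoder, config=config,
                    beam_size=10, max_length=128,
                    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id).cuda()
    # with target_ids=None, forward() runs beam search and returns
    # token ids of shape [batch, beam_size, max_length]
    source_ids = torch.tensor([tokenizer.encode("int main ( )")]).cuda()
    preds = model(source_ids=source_ids)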
Script/Model/CodeBert/code-generation/run_generation.py ADDED
@@ -0,0 +1,470 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ from bleu import _bleu
26
+ import pickle
27
+ import torch
28
+ import json
29
+ import random
30
+ import logging
31
+ import argparse
32
+ import numpy as np
33
+ from io import open
34
+ from itertools import cycle
35
+ import torch.nn as nn
36
+ from model import Seq2Seq
37
+ from tqdm import tqdm, trange
38
+ from fuzzywuzzy import fuzz
39
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
40
+ from torch.utils.data.distributed import DistributedSampler
41
+
42
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
43
+ RobertaConfig, RobertaModel, RobertaTokenizer)
44
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
45
+ datefmt = '%m/%d/%Y %H:%M:%S',
46
+ level = logging.INFO)
47
+ logger = logging.getLogger(__name__)
48
+ divide_number = 3
49
+ #
50
+
51
+ class Example(object):
52
+ """A single training/test example."""
53
+ def __init__(self,
54
+ idx,
55
+ source,
56
+ ts_v,
57
+ target,
58
+ ):
59
+ self.idx = idx
60
+ self.source = source
61
+ self.ts_v = ts_v
62
+ self.target = target
63
+
64
+ def read_examples(filename):
65
+ """Read examples from filename."""
66
+ examples=[]
67
+ with open(filename,encoding="utf-8") as f:
68
+ for idx, line in enumerate(f):
69
+ line=line.strip()
70
+ js=json.loads(line)
71
+ examples.append(
72
+ Example(
73
+ idx = idx,
74
+ source=" ".join(js['natrual_language']),
75
+ ts_v = ",".join(js['TS_V_token']),
76
+ target = " ".join(js["ground_truth"][1:-1]),
77
+ )
78
+ )
79
+
80
+ return examples
81
+
82
+
83
+ class InputFeatures(object):
84
+ """A single training/test features for a example."""
85
+ def __init__(self,
86
+ example_id,
87
+ source_ids,
88
+ target_ids,
89
+ ):
90
+ self.example_id = example_id
91
+ self.source_ids = source_ids
92
+ self.target_ids = target_ids
93
+
94
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
95
+ features = []
96
+ for example_index, example in enumerate(examples):
97
+ #source
98
+ source_tokens = tokenizer.tokenize(example.source)
99
+ ts_v_tokens = tokenizer.tokenize(example.ts_v)
100
+ source_tokens =[tokenizer.cls_token]+source_tokens+[tokenizer.sep_token]+ts_v_tokens+[tokenizer.sep_token]
101
+
102
+ source_ids = tokenizer.convert_tokens_to_ids(source_tokens[:args.max_source_length-5])
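+ # truncates to max_source_length-5, a conservative margin below the source budget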
103
+ padding_length = args.max_source_length - len(source_ids)
104
+ source_ids+=[tokenizer.pad_token_id]*padding_length
105
+
106
+ #target
107
+ if stage=="test":
108
+ target_tokens = tokenizer.tokenize("None")
109
+ else:
110
+ target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
111
+ target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token]
112
+ target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
113
+ padding_length = args.max_target_length - len(target_ids)
114
+ target_ids+=[tokenizer.pad_token_id]*padding_length
115
+
116
+ features.append(
117
+ InputFeatures(
118
+ example_index,
119
+ source_ids,
120
+ target_ids,
121
+ )
122
+ )
123
+ return features
124
+
125
+
126
+
127
+ def set_seed(seed=20240124):
128
+ random.seed(seed)
129
+ os.environ['PYTHONHASHSEED'] = str(seed)
130
+ np.random.seed(seed)
131
+ torch.manual_seed(seed)
132
+ torch.cuda.manual_seed(seed)
133
+ torch.backends.cudnn.deterministic = True
134
+
135
+ def main():
136
+ parser = argparse.ArgumentParser()
137
+
138
+ ## Required parameters
139
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
140
+ help="Path to pre-trained model: e.g. roberta-base" )
141
+ parser.add_argument("--load_model_path", default=None, type=str,
142
+ help="Path to trained model" )
143
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
144
+ help="The output directory where the model predictions and checkpoints will be written.")
145
+
146
+ ## Other parameters
147
+ parser.add_argument("--train_filename", default=None, type=str,
148
+ help="The train filename. Should contain the .jsonl files for this task.")
149
+ parser.add_argument("--dev_filename", default=None, type=str,
150
+ help="The dev filename. Should contain the .jsonl files for this task.")
151
+ parser.add_argument("--test_filename", default=None, type=str,
152
+ help="The test filename. Should contain the .jsonl files for this task.")
153
+ parser.add_argument("--max_source_length", default=256, type=int,
154
+ help="The maximum total source sequence length after tokenization. Sequences longer "
155
+ "than this will be truncated, sequences shorter will be padded.")
156
+ parser.add_argument("--max_target_length", default=256, type=int,
157
+ help="The maximum total target sequence length after tokenization. Sequences longer "
158
+ "than this will be truncated, sequences shorter will be padded.")
159
+ parser.add_argument("--do_train", action='store_true',
160
+ help="Whether to run training.")
161
+ parser.add_argument("--do_eval", action='store_true',
162
+ help="Whether to run eval on the dev set.")
163
+ parser.add_argument("--do_test", action='store_true',
164
+ help="Whether to run eval on the dev set.")
165
+ parser.add_argument("--no_cuda", action='store_true',
166
+ help="Avoid using CUDA when available")
167
+
168
+ parser.add_argument("--train_batch_size", default=8, type=int,
169
+ help="Batch size per GPU/CPU for training.")
170
+ parser.add_argument("--eval_batch_size", default=8, type=int,
171
+ help="Batch size per GPU/CPU for evaluation.")
172
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
173
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
174
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
175
+ help="The initial learning rate for Adam.")
176
+ parser.add_argument("--beam_size", default=10, type=int,
177
+ help="beam size for beam search")
178
+ parser.add_argument("--weight_decay", default=0.0, type=float,
179
+ help="Weight deay if we apply some.")
180
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
181
+ help="Epsilon for Adam optimizer.")
182
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
183
+ help="Max gradient norm.")
184
+ parser.add_argument("--num_train_epochs", default=3, type=int,
185
+ help="Total number of training epochs to perform.")
186
+ parser.add_argument('--seed', type=int, default=20240124,
187
+ help="random seed for initialization")
188
+
189
+ # print arguments
190
+ args = parser.parse_args()
191
+ # set log
192
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
193
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
194
+ # set device
195
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
196
+ args.n_gpu = torch.cuda.device_count()
197
+ args.device = device
198
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
199
+
200
+ # Set seed
201
+ set_seed(args.seed)
202
+ # make dir if output_dir not exist
203
+ if os.path.exists(args.output_dir) is False:
204
+ os.makedirs(args.output_dir)
205
+
206
+ # build model
207
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
208
+ config = RobertaConfig.from_pretrained(args.model_name_or_path)
209
+ # Important: you must set is_decoder to True for generation
210
+ config.is_decoder = True
211
+ encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)
212
+
213
+ model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
214
+ beam_size=args.beam_size,max_length=args.max_target_length,
215
+ sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)
216
+
217
+ logger.info("Training/evaluation parameters %s", args)
218
+ if args.load_model_path is not None:
219
+ logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
220
+ model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
221
+ model.to(args.device)
222
+
223
+ if args.n_gpu > 1:
224
+ # multi-gpu training
225
+ model = torch.nn.DataParallel(model)
226
+
227
+ if args.do_train:
228
+ # Prepare training data loader
229
+ train_examples = read_examples(args.train_filename)
230
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
231
+ all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
232
+ all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
233
+ train_data = TensorDataset(all_source_ids,all_target_ids)
234
+ train_sampler = RandomSampler(train_data)
235
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
236
+
237
+
238
+ # Prepare optimizer and schedule (linear warmup and decay)
239
+ no_decay = ['bias', 'LayerNorm.weight']
240
+ optimizer_grouped_parameters = [
241
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
242
+ 'weight_decay': args.weight_decay},
243
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
244
+ ]
245
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
246
+ scheduler = get_linear_schedule_with_warmup(optimizer,
247
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
248
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
249
+
250
+ #Start training
251
+ logger.info("***** Running training *****")
252
+ logger.info(" Num examples = %d", len(train_examples))
253
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
254
+ logger.info(" Num epoch = %d", args.num_train_epochs)
255
+
256
+
257
+ model.train()
258
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
259
+ for epoch in range(args.num_train_epochs):
260
+ for idx,batch in enumerate(train_dataloader):
261
+ batch = tuple(t.to(device) for t in batch)
262
+ source_ids,target_ids = batch
263
+ loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
264
+
265
+ if args.n_gpu > 1:
266
+ loss = loss.mean() # mean() to average on multi-gpu.
267
+ if args.gradient_accumulation_steps > 1:
268
+ loss = loss / args.gradient_accumulation_steps
269
+
270
+ losses.append(loss.item())
271
+ loss.backward()
272
+ if len(losses) % args.gradient_accumulation_steps == 0:
273
+ #Update parameters
274
+ optimizer.step()
275
+ optimizer.zero_grad()
276
+ scheduler.step()
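+ # parameters update once every gradient_accumulation_steps micro-batches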
277
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
278
+ logger.info("epoch {} step {} loss {}".format(epoch,
279
+ len(losses)//args.gradient_accumulation_steps,
280
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
281
+ if args.do_eval:
282
+ #Eval model with dev dataset
283
+ if 'dev_loss' in dev_dataset:
284
+ eval_examples,eval_data = dev_dataset['dev_loss']
285
+ else:
286
+ eval_examples = read_examples(args.dev_filename)
287
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
288
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
289
+ all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
290
+ eval_data = TensorDataset(all_source_ids,all_target_ids)
291
+ dev_dataset['dev_loss' ]= eval_examples,eval_data
292
+ eval_sampler = SequentialSampler(eval_data)
293
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
294
+
295
+ logger.info("\n***** Running evaluation *****")
296
+ logger.info(" Num examples = %d", len(eval_examples))
297
+ logger.info(" Batch size = %d", args.eval_batch_size)
298
+
299
+ #Start Evaling model
300
+ model.eval()
301
+ eval_loss,tokens_num = 0,0
302
+ for batch in eval_dataloader:
303
+ batch = tuple(t.to(device) for t in batch)
304
+ source_ids,target_ids = batch
305
+
306
+ with torch.no_grad():
307
+ _,loss,num = model(source_ids=source_ids,target_ids=target_ids)
308
+ eval_loss += loss.sum().item()
309
+ tokens_num += num.sum().item()
310
+ #Pring loss of dev dataset
311
+ model.train()
312
+ eval_loss = eval_loss / tokens_num
313
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
314
+ for key in sorted(result.keys()):
315
+ logger.info(" %s = %s", key, str(result[key]))
316
+ logger.info(" "+"*"*20)
317
+
318
+ #Calculate bleu
319
+ if 'dev_bleu' in dev_dataset:
320
+ eval_examples,eval_data=dev_dataset['dev_bleu']
321
+ else:
322
+ eval_examples = read_examples(args.dev_filename)
323
+ # eval_examples = random.sample(eval_examples)
324
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
325
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
326
+ eval_data = TensorDataset(all_source_ids)
327
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
328
+
329
+ eval_sampler = SequentialSampler(eval_data)
330
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
331
+
332
+ model.eval()
333
+ p=[]
334
+ for batch in eval_dataloader:
335
+ batch = tuple(t.to(device) for t in batch)
336
+ source_ids = batch[0]
337
+ with torch.no_grad():
338
+ preds = model(source_ids=source_ids)
339
+ # convert ids to text
340
+ for pred in preds:
341
+ t = pred[0].cpu().numpy()
342
+ t = list(t)
343
+ if 0 in t:
344
+ t = t[:t.index(0)]
345
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
346
+ # print(text)
347
+ p.append(text)
348
+
349
+ model.train()
350
+ predictions = []
351
+ edit_dis = 0
352
+ cnt_all = 0
353
+ res_list = []
354
+ EM = []
355
+ is_gened = False
356
+ with open(args.output_dir+"/dev.output",'w') as f, open(args.output_dir+"/dev.gold",'w') as f1:
357
+ for ref,gold in zip(p,eval_examples):
358
+ predictions.append(ref)
359
+ if len(ref) > 0:
360
+ is_gened = True
361
+ f.write(ref+'\n')
362
+ f1.write(gold.target+'\n')
363
+ EM.append(ref.split()==gold.target.split())
364
+ edit_dis += fuzz.ratio(ref, gold.target)
365
+ res_list.append([ref,gold.target])
366
+ cnt_all += 1
367
+ if is_gened:
368
+ dev_bleu = _bleu(args.output_dir+"/dev.gold", args.output_dir+"/dev.output")
369
+ else:
370
+ dev_bleu = 0
371
+ avg_edit_dis = float(edit_dis)/cnt_all
372
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
373
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
374
+ logger.info(" %s = %s "%("Edit Distance",str(round(float(edit_dis)/cnt_all,2))))
375
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
376
+ logger.info(" "+"*"*20)
377
+ dev_score = (dev_bleu+avg_edit_dis) / 2.0
378
+ if dev_score>best_score:
379
+ best_score=dev_score
380
+ # Save best checkpoint for best bleu
381
+ output_dir = args.output_dir
382
+ if not os.path.exists(output_dir):
383
+ os.makedirs(output_dir)
384
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
385
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
386
+ torch.save(model_to_save.state_dict(), output_model_file)
387
+ patience =0
388
+ else:
389
+ patience +=1
390
+ if patience == 3:
391
+ break
392
+ output_dir = args.output_dir
393
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
394
+ for line in res_list:
395
+ dic = {}
396
+ dic["Pred"] = line[0]
397
+ dic["GT"] = line[1]
398
+ wf.write(json.dumps(dic))
399
+ wf.write("\n")
400
+
401
+ logger.info(" Best score:%s",best_score)
402
+ logger.info(" "+"*"*20)
403
+ if args.do_test:
404
+ res_list = []
405
+ if args.load_model_path is not None:
406
+ checkpoint_prefix = 'pytorch_model.bin'
407
+ output_dir = os.path.join(args.output_dir, checkpoint_prefix)
408
+ model_to_load = model.module if hasattr(model, 'module') else model
409
+ model_to_load.load_state_dict(torch.load(output_dir))
410
+
411
+
412
+
413
+ eval_examples = read_examples(args.test_filename)
414
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
415
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
416
+ eval_data = TensorDataset(all_source_ids)
417
+
418
+ # Calculate bleu
419
+ eval_sampler = SequentialSampler(eval_data)
420
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
421
+
422
+ model.eval()
423
+ p=[]
424
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
425
+ batch = tuple(t.to(device) for t in batch)
426
+ source_ids = batch[0]
427
+ with torch.no_grad():
428
+ preds = model(source_ids)
429
+ # convert ids to text
430
+ for pred in preds:
431
+ t = pred[0].cpu().numpy()
432
+ t = list(t)
433
+ if 0 in t:
434
+ t = t[:t.index(0)]
435
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
436
+ p.append(text)
437
+
438
+ predictions=[]
439
+ EM = []
440
+ edit_dis = 0
441
+ cnt = 0
442
+ with open(args.output_dir+"/test.output",'w') as f, open(args.output_dir+"/test.gold",'w') as f1:
443
+ for ref,gold in zip(p,eval_examples):
444
+ res_list.append([ref,gold.target])
445
+ predictions.append(ref)
446
+ f.write(ref+'\n')
447
+ f1.write(gold.target+'\n')
448
+ EM.append(ref.split()==gold.target.split())
449
+ edit_dis += fuzz.ratio(ref, gold.target)
450
+ cnt += 1
451
+
452
+ dev_bleu = _bleu(args.output_dir+"/test.gold", args.output_dir+"/test.output")
453
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
454
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
455
+ logger.info(" %s = %s "%("Edit Distance",str(round(float(edit_dis)/cnt,2))))
456
+ logger.info(" "+"*"*20)
457
+
458
+
459
+ with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
460
+ for line in res_list:
461
+ dic = {}
462
+ dic["Pred"] = line[0]
463
+ dic["GT"] = line[1]
464
+ wf.write(json.dumps(dic))
465
+ wf.write("\n")
466
+
467
+ if __name__ == "__main__":
468
+ main()
469
+
470
+
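For reference, read_examples above expects one JSON object per line; an illustrative record (field values invented, and note the dataset spells the key 'natrual_language'):

    {"natrual_language": ["set", "phy", "mode"], "TS_V_token": ["TS1", "V2"], "ground_truth": ["<s>", "set_phy_mode", "(", ")", "</s>"]}

Since the target is built from ground_truth[1:-1], the first and last tokens (the sentinels above) are dropped.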
Script/Model/CodeT5+/code-completion/run_completion.py ADDED
@@ -0,0 +1,525 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ import pickle
26
+ import torch
27
+ import json
28
+ import random
29
+ import logging
30
+ import argparse
31
+ import numpy as np
32
+ from io import open
33
+ from itertools import cycle
34
+ import torch.nn as nn
35
+ from tqdm import tqdm, trange
36
+ from torch.nn.utils.rnn import pad_sequence
37
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
38
+ from torch.utils.data.distributed import DistributedSampler
39
+ from tqdm import tqdm
40
+ from fuzzywuzzy import fuzz
41
+ import re
42
+ import multiprocessing
43
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, AutoTokenizer)
44
+
45
+ divide_number = 2
46
+ cpu_cont = 16
47
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
48
+ datefmt = '%m/%d/%Y %H:%M:%S',
49
+ level = logging.INFO)
50
+ logger = logging.getLogger(__name__)
51
+
52
+ #
53
+
54
+
55
+ class Example(object):
56
+ """A single training/test example."""
57
+ def __init__(self,
58
+ idx,
59
+ source,
60
+ target
61
+ ):
62
+ self.idx = idx
63
+ self.source = source
64
+ self.target = target
65
+
66
+ def read_examples(filename):
67
+ """Read examples from filename."""
68
+ examples=[]
69
+
70
+ with open(filename,encoding="utf-8") as f:
73
+ for idx, line in enumerate(f):
74
+ js=json.loads(line)
75
+ inputs = " ".join(js["Template_token"][1:])
76
+
77
+ # print(inputs)
78
+ if "ground_truth" in js:
79
+ outputs = " ".join(js["ground_truth"])
80
+ else:
81
+ outputs = inputs
82
+ if 'Idx' in js:
83
+ idx = js['Idx']
84
+ examples.append(
85
+ Example(
86
+ idx = idx,
87
+ source = inputs,
88
+ target = outputs
89
+ )
90
+ )
91
+ return examples
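+ # Illustrative input record (field values invented); Template_token[1:] drops the leading sentinel:
+ # {"Idx": 0, "Template_token": ["<s>", "if", "(", "x", ")"], "ground_truth": ["if", "(", "x", ")", "{"]}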
92
+
93
+
94
+ class InputFeatures(object):
95
+ """A single training/test features for a example."""
96
+ def __init__(self,
97
+ example_id,
98
+ source_ids, source_mask,
99
+ target_ids, target_mask
100
+ ):
101
+ self.example_id = example_id
102
+ self.source_ids = source_ids
103
+ self.source_mask = source_mask
104
+ self.target_ids = target_ids
105
+ self.target_mask = target_mask
106
+
107
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
108
+ features = []
109
+ for example_index, example in enumerate(examples):
110
+ #source
111
+ source_ids = torch.LongTensor(tokenizer.encode(example.source,
112
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
113
+
114
+ source_mask = torch.ones_like(source_ids)
115
+ #target
116
+ if stage=="test":
117
+ target = "None"
118
+ else:
119
+ target = example.target
120
+
121
+ target_ids = torch.LongTensor(tokenizer.encode(target,
122
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
123
+ target_mask = torch.ones_like(target_ids)
124
+
125
+
126
+ features.append(
127
+ InputFeatures(
128
+ example_index,
129
+ source_ids, source_mask,
130
+ target_ids, target_mask
131
+ )
132
+ )
133
+ return features
134
+
135
+
136
+
137
+ def set_seed(seed=20240124):
138
+ random.seed(seed)
139
+ os.environ['PYTHONHASHSEED'] = str(seed)
140
+ np.random.seed(seed)
141
+ torch.manual_seed(seed)
142
+ torch.cuda.manual_seed(seed)
143
+ torch.backends.cudnn.deterministic = True
144
+
145
+
146
+ def main():
147
+ parser = argparse.ArgumentParser()
148
+
149
+ ## Required parameters
150
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
151
+ help="Path to pre-trained model: e.g. roberta-base" )
152
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
153
+ help="The output directory where the model predictions and checkpoints will be written.")
154
+ parser.add_argument("--load_model_path", default=None, type=str,
155
+ help="Path to trained model: Should contain the .bin files" )
156
+ ## Other parameters
157
+ parser.add_argument("--task", default=None, type=str, required=True,
158
+ help="Task Type: statement_level, next_statement" )
159
+
160
+ parser.add_argument("--train_filename", default="../../Dataset/", type=str,
161
+ help="The train filename. Should contain the .jsonl files for this task.")
162
+ parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
163
+ help="The dev filename. Should contain the .jsonl files for this task.")
164
+ parser.add_argument("--test_filename", default="../../Dataset/", type=str,
165
+ help="The test filename. Should contain the .jsonl files for this task.")
166
+
167
+ parser.add_argument("--config_name", default="", type=str,
168
+ help="Pretrained config name or path if not the same as model_name")
169
+ parser.add_argument("--tokenizer_name", default="", type=str,
170
+ help="Pretrained tokenizer name or path if not the same as model_name")
171
+ # parser.add_argument("--max_source_length", default=64, type=int,
172
+ # help="The maximum total source sequence length after tokenization. Sequences longer "
173
+ # "than this will be truncated, sequences shorter will be padded.")
174
+ # parser.add_argument("--max_target_length", default=32, type=int,
175
+ # help="The maximum total target sequence length after tokenization. Sequences longer "
176
+ # "than this will be truncated, sequences shorter will be padded.")
177
+
178
+ parser.add_argument("--do_train", action='store_true',
179
+ help="Whether to run training.")
180
+ parser.add_argument("--do_eval", action='store_true',
181
+ help="Whether to run eval on the dev set.")
182
+ parser.add_argument("--do_test", action='store_true',
183
+ help="Whether to run eval on the dev set.")
184
+ parser.add_argument("--test_org", action='store_true',
185
+ help="Whether to run eval on org model.")
186
+ parser.add_argument("--do_lower_case", action='store_true',
187
+ help="Set this flag if you are using an uncased model.")
188
+ parser.add_argument("--no_cuda", action='store_true',
189
+ help="Avoid using CUDA when available")
190
+
191
+ parser.add_argument("--train_batch_size", default=8, type=int,
192
+ help="Batch size per GPU/CPU for training.")
193
+ parser.add_argument("--eval_batch_size", default=8, type=int,
194
+ help="Batch size per GPU/CPU for evaluation.")
195
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
196
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
197
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
198
+ help="The initial learning rate for Adam.")
199
+ parser.add_argument("--beam_size", default=10, type=int,
200
+ help="beam size for beam search")
201
+ parser.add_argument("--weight_decay", default=0.0, type=float,
202
+ help="Weight deay if we apply some.")
203
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
204
+ help="Epsilon for Adam optimizer.")
205
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
206
+ help="Max gradient norm.")
207
+ parser.add_argument("--num_train_epochs", default=3, type=int,
208
+ help="Total number of training epochs to perform.")
209
+ parser.add_argument("--max_steps", default=-1, type=int,
210
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
211
+ parser.add_argument("--eval_steps", default=-1, type=int,
212
+ help="")
213
+ parser.add_argument("--max_target_length", default=128, type=int,
214
+ help="")
215
+ parser.add_argument("--max_source_length", default=512, type=int,
216
+ help="")
217
+ parser.add_argument("--train_steps", default=-1, type=int,
218
+ help="")
219
+ parser.add_argument("--warmup_steps", default=0, type=int,
220
+ help="Linear warmup over warmup_steps.")
221
+ parser.add_argument("--local_rank", type=int, default=-1,
222
+ help="For distributed training: local_rank")
223
+ parser.add_argument('--seed', type=int, default=20240124,
224
+ help="random seed for initialization")
225
+ # print arguments
226
+ args = parser.parse_args()
227
+ # set log
228
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
229
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
230
+ # set device
231
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
232
+ args.n_gpu = torch.cuda.device_count()
233
+ args.device = device
234
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
235
+
236
+ # Set seed
237
+ set_seed(args.seed)
238
+
239
+ # make dir if output_dir not exist
240
+ if os.path.exists(args.output_dir) is False:
241
+ os.makedirs(args.output_dir)
242
+
243
+ # build model
244
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
245
+ is_trust = False
246
+ if "codet5p-220m" in args.model_name_or_path or "codet5p-770m" in args.model_name_or_path:
247
+ is_trust = False
248
+ else:
249
+ is_trust = True
250
+ model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path, trust_remote_code=is_trust) # wire up the is_trust flag computed above
251
+
252
+
253
+ logger.info("Training/evaluation parameters %s", args)
254
+
255
+ if args.load_model_path is not None:
256
+ if args.task == "statement_level":
257
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
258
+ model.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
259
+ else:
260
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
261
+ model.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
262
+
263
+ # model.eval()
264
+ model.to(args.device)
265
+
266
+ if args.n_gpu > 1:
267
+ # multi-gpu training
268
+ model = torch.nn.DataParallel(model)
269
+
270
+ if args.do_train:
271
+ # Prepare training data loader
272
+ if args.task == "statement_level":
273
+ train_examples = read_examples(args.train_filename + "/Code_Completion/statement_level/train.jsonl")
274
+ else:
275
+ train_examples = read_examples(args.train_filename + "/Code_Completion/next_statement/train.jsonl")
276
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
277
+ all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
278
+ all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
279
+ all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
280
+ all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
281
+ train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
282
+ train_sampler = RandomSampler(train_data)
283
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
284
+
285
+
286
+ # Prepare optimizer and schedule (linear warmup and decay)
287
+ no_decay = ['bias', 'LayerNorm.weight']
288
+ optimizer_grouped_parameters = [
289
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
290
+ 'weight_decay': args.weight_decay},
291
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
292
+ ]
293
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
294
+ scheduler = get_linear_schedule_with_warmup(optimizer,
295
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
296
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
297
+
298
+ #Start training
299
+ logger.info("***** Running training *****")
300
+ logger.info(" Num examples = %d", len(train_examples))
301
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
302
+ logger.info(" Num epoch = %d", args.num_train_epochs)
303
+
304
+
305
+ model.train()
306
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
307
+ for epoch in range(args.num_train_epochs):
308
+ for idx,batch in enumerate(train_dataloader):
309
+ batch = tuple(t.to(device) for t in batch)
310
+ source_ids,source_mask,target_ids,target_mask = batch
311
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
312
+
313
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
314
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
315
+
316
+
317
+ if args.n_gpu > 1:
318
+ loss = loss.mean() # mean() to average on multi-gpu.
319
+
320
+ if args.gradient_accumulation_steps > 1:
321
+ loss = loss / args.gradient_accumulation_steps
322
+
323
+ losses.append(loss.item())
324
+ loss.backward()
325
+ if len(losses) % args.gradient_accumulation_steps == 0:
326
+ #Update parameters
327
+ optimizer.step()
328
+ optimizer.zero_grad()
329
+ scheduler.step()
330
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
331
+ logger.info("epoch {} step {} loss {}".format(epoch,
332
+ len(losses)//args.gradient_accumulation_steps,
333
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
334
+ if args.do_eval:
335
+ #Eval model with dev dataset
336
+
337
+ if 'dev_loss' in dev_dataset:
338
+ eval_examples,eval_data = dev_dataset['dev_loss']
339
+ else:
340
+ if args.task == "statement_level":
341
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
342
+ else:
343
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
344
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
345
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
346
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
347
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
348
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
349
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
350
+ dev_dataset['dev_loss'] = eval_examples, eval_data
351
+ eval_sampler = SequentialSampler(eval_data)
352
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
353
+ res_list = []
354
+ logger.info("\n***** Running evaluation *****")
355
+ logger.info(" Num examples = %d", len(eval_examples))
356
+ logger.info(" Batch size = %d", args.eval_batch_size)
357
+
358
+ # Start evaluating model
359
+ model.eval()
360
+ p=[]
361
+ eval_loss,tokens_num = 0,0
362
+ for batch in eval_dataloader:
363
+ batch = tuple(t.to(device) for t in batch)
364
+ source_ids,source_mask,target_ids,target_mask = batch
365
+ with torch.no_grad():
366
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
367
+ labels=target_ids, decoder_attention_mask=target_mask).loss
368
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
369
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length) # unwrap DataParallel on multi GPU
370
+
371
+ # convert ids to text
372
+ for pred in preds:
373
+ # print(pred)
374
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
375
+ p.append(text)
376
+ if args.n_gpu > 1:
377
+ loss = loss.mean() # mean() to average on multi-gpu.
378
+
379
+ if args.gradient_accumulation_steps > 1:
380
+ loss = loss / args.gradient_accumulation_steps
381
+ eval_loss += loss.item()
382
+ tokens_num += 1
383
+
384
+ # Print loss of dev dataset
385
+ model.train()
386
+ eval_loss = eval_loss / tokens_num
387
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
388
+ for key in sorted(result.keys()):
389
+ logger.info(" %s = %s", key, str(result[key]))
390
+ logger.info(" "+"*"*20)
391
+
392
+
393
+ EM = 0.0
394
+ edit_sim = 0.0
395
+ total = len(p)
396
+ token_accuracy = 0
397
+ for ref,gold in zip(p,eval_examples):
398
+ pred = ref.strip()
399
+ gt = gold.target
400
+ edit_sim += fuzz.ratio(pred, gt)
401
+ if pred.split() == gt.split():
402
+ EM += 1
403
+ res_list.append([pred,gt])
404
+ dev_acc = round(EM/total*100, 2)
405
+ # logger.info(" %s = %s "%("loss",round(np.mean(dev_losses),4)))
406
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
407
+ logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
408
+ logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/total, 2))))
409
+ logger.info(" "+"*"*20)
410
+
411
+ if dev_acc > best_score:
412
+ best_score = dev_acc
413
+ # Save best checkpoint for best bleu
414
+ if args.task == "statement_level":
415
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
416
+ else:
417
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
418
+ if not os.path.exists(output_dir):
419
+ os.makedirs(output_dir)
420
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
421
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
422
+ torch.save(model_to_save.state_dict(), output_model_file)
423
+ patience = 0
424
+ else:
425
+ patience += 1
426
+ if patience == 3:
427
+ break
428
+ logger.info(" Best score:%s",best_score)
429
+ logger.info(" "+"*"*20)
430
+
431
+ if args.task == "statement_level":
432
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
433
+ else:
434
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
435
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
436
+ for line in res_list:
437
+ dic = {}
438
+ dic["Pred"] = line[0]
439
+ dic["GT"] = line[1]
440
+ wf.write(json.dumps(dic))
441
+ wf.write("\n")
442
+
443
+ if args.do_test:
444
+ res_list = []
445
+ output_dir2 = ""
446
+
447
+ if args.load_model_path is not None:
448
+ model_to_load = model.module if hasattr(model, 'module') else model
449
+
450
+ if args.task == "statement_level":
451
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
452
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
453
+ else:
454
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
455
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
456
+
457
+
458
+ if args.task == "statement_level":
459
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/statement_level/test.jsonl')
460
+ else:
461
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/next_statement/test.jsonl')
462
+ eval_examples = read_examples(args.test_filename)
463
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
464
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
465
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
466
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
467
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
468
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
469
+
470
+ # Calculate bleu
471
+ eval_sampler = SequentialSampler(eval_data)
472
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
473
+
474
+ model.eval()
475
+ p=[]
476
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
477
+ batch = tuple(t.to(device) for t in batch)
478
+ source_ids, source_mask, _, _ = batch
479
+ with torch.no_grad():
480
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
481
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length) # unwrap DataParallel on multi GPU
482
+ for pred in preds:
483
+ # print(pred)
484
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
485
+ p.append(text)
486
+ model.train()
487
+ edit_sim = 0.0
488
+ EM = 0.0
489
+ total = len(p)
490
+ for ref,gold in zip(p,eval_examples):
491
+ pred = ref.strip()
492
+ gt = gold.target
493
+ edit_sim += fuzz.ratio(pred, gt)
494
+ if pred.split() == gt.split():
495
+ EM += 1
496
+ res_list.append([pred,gt])
497
+ dev_acc = round(edit_sim/total, 2)
498
+ dev_em = round(EM/total, 4)
499
+ logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
500
+ logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
501
+ logger.info(" "+"*"*20)
502
+ if args.test_org:
503
+ output_dir = args.output_dir
504
+ else:
505
+ if args.task == "statement_level":
506
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
507
+ else:
508
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
509
+
510
+ with open(output_dir + "/test_result.jsonl", 'w') as wf:
511
+ for line in res_list:
512
+ dic = {}
513
+ dic["Pred"] = line[0]
514
+ dic["GT"] = line[1]
515
+ wf.write(json.dumps(dic))
516
+ wf.write("\n")
517
+
518
+
519
+
520
+
521
+ if __name__ == "__main__":
522
+ main()
523
+
524
+
525
+
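
The completion script above scores each prediction two ways: whitespace-insensitive exact match, and fuzzywuzzy's character-level similarity ratio (reported as "Edit Distance" in the logs). A minimal standalone sketch of that metric loop on toy pairs, assuming fuzzywuzzy is installed:

from fuzzywuzzy import fuzz

# Toy prediction/ground-truth pairs standing in for decoded beam-search output.
pairs = [
    ("return x + 1", "return x + 1"),  # same token sequence -> exact match
    ("return x+1",   "return x + 1"),  # similar characters, different tokens
]

EM, edit_sim = 0.0, 0.0
for pred, gt in pairs:
    edit_sim += fuzz.ratio(pred.strip(), gt)  # similarity score in [0, 100]
    if pred.strip().split() == gt.split():    # whitespace-insensitive exact match
        EM += 1

print("EM Acc:", round(EM / len(pairs) * 100, 2))              # 50.0
print("Avg Edit Similarity:", round(edit_sim / len(pairs), 2))
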
Script/Model/CodeT5+/code-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
+ def _get_ngrams(segment, max_order):
29
+ """Extracts all n-grams upto a given maximum order from an input segment.
30
+
31
+ Args:
32
+ segment: text segment from which n-grams will be extracted.
33
+ max_order: maximum length in tokens of the n-grams returned by this
34
+ method.
35
+
36
+ Returns:
37
+ The Counter containing all n-grams up to max_order in segment
38
+ with a count of how many times each n-gram occurred.
39
+ """
40
+ ngram_counts = collections.Counter()
41
+ for order in range(1, max_order + 1):
42
+ for i in range(0, len(segment) - order + 1):
43
+ ngram = tuple(segment[i:i+order])
44
+ ngram_counts[ngram] += 1
45
+ return ngram_counts
46
+
47
+
48
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
+ smooth=False):
50
+ """Computes BLEU score of translated segments against one or more references.
51
+
52
+ Args:
53
+ reference_corpus: list of lists of references for each translation. Each
54
+ reference should be tokenized into a list of tokens.
55
+ translation_corpus: list of translations to score. Each translation
56
+ should be tokenized into a list of tokens.
57
+ max_order: Maximum n-gram order to use when computing BLEU score.
58
+ smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
+
60
+ Returns:
61
+ 6-tuple with the BLEU score, n-gram precisions, brevity penalty, length
62
+ ratio, translation length and reference length.
63
+ """
64
+ matches_by_order = [0] * max_order
65
+ possible_matches_by_order = [0] * max_order
66
+ reference_length = 0
67
+ translation_length = 0
68
+ for (references, translation) in zip(reference_corpus,
69
+ translation_corpus):
70
+ reference_length += min(len(r) for r in references)
71
+ translation_length += len(translation)
72
+
73
+ merged_ref_ngram_counts = collections.Counter()
74
+ for reference in references:
75
+ merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
+ translation_ngram_counts = _get_ngrams(translation, max_order)
77
+ overlap = translation_ngram_counts & merged_ref_ngram_counts
78
+ for ngram in overlap:
79
+ matches_by_order[len(ngram)-1] += overlap[ngram]
80
+ for order in range(1, max_order+1):
81
+ possible_matches = len(translation) - order + 1
82
+ if possible_matches > 0:
83
+ possible_matches_by_order[order-1] += possible_matches
84
+
85
+ precisions = [0] * max_order
86
+ for i in range(0, max_order):
87
+ if smooth:
88
+ precisions[i] = ((matches_by_order[i] + 1.) /
89
+ (possible_matches_by_order[i] + 1.))
90
+ else:
91
+ if possible_matches_by_order[i] > 0:
92
+ precisions[i] = (float(matches_by_order[i]) /
93
+ possible_matches_by_order[i])
94
+ else:
95
+ precisions[i] = 0.0
96
+
97
+ if min(precisions) > 0:
98
+ p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
+ geo_mean = math.exp(p_log_sum)
100
+ else:
101
+ geo_mean = 0
102
+
103
+ ratio = float(translation_length) / reference_length
104
+
105
+ if ratio > 1.0:
106
+ bp = 1.
107
+ else:
108
+ bp = math.exp(1 - 1. / ratio)
109
+
110
+ bleu = geo_mean * bp
111
+
112
+ return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
+ def _bleu(ref_file, trans_file, subword_option=None):
116
+ max_order = 4
117
+ smooth = True
118
+ ref_files = [ref_file]
119
+ reference_text = []
120
+ for reference_filename in ref_files:
121
+ with open(reference_filename) as fh:
122
+ reference_text.append(fh.readlines())
123
+ per_segment_references = []
124
+ for references in zip(*reference_text):
125
+ reference_list = []
126
+ for reference in references:
127
+ reference_list.append(reference.strip().split())
128
+ per_segment_references.append(reference_list)
129
+ translations = []
130
+ with open(trans_file) as fh:
131
+ for line in fh:
132
+ translations.append(line.strip().split())
133
+ bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
+ return round(100 * bleu_score,2)
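
As a quick sanity check, compute_bleu above can be exercised directly on pre-tokenized toy data; a sketch assuming this file is importable as `bleu` (_bleu wraps the same computation, with smoothing, over line-aligned gold/output files):

from bleu import compute_bleu

# One list of references per translation; all segments pre-tokenized.
references = [[["the", "cat", "sat", "on", "the", "mat"]]]
translations = [["the", "cat", "sat", "on", "the", "mat"]]

bleu_score, precisions, bp, ratio, trans_len, ref_len = compute_bleu(
    references, translations, max_order=4, smooth=True)
print(round(100 * bleu_score, 2))  # 100.0 for a verbatim match
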
Script/Model/CodeT5+/code-generation/run_generation.py ADDED
@@ -0,0 +1,478 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning a CodeT5+ (T5ForConditionalGeneration) model for code generation.
18
+ The encoder consumes a natural-language description plus target-specific tokens, and the
19
+ decoder is trained with a sequence-to-sequence cross-entropy loss on the ground-truth code.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ from bleu import _bleu
26
+ import pickle
27
+ import torch
28
+ import json
29
+ import random
30
+ import logging
31
+ import argparse
32
+ import numpy as np
33
+ from io import open
34
+ from itertools import cycle
35
+ import torch.nn as nn
36
+ from tqdm import tqdm, trange
37
+ from torch.nn.utils.rnn import pad_sequence
38
+ from fuzzywuzzy import fuzz
39
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
40
+ from torch.utils.data.distributed import DistributedSampler
41
+
42
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, AutoTokenizer)
43
+
44
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
45
+ datefmt = '%m/%d/%Y %H:%M:%S',
46
+ level = logging.INFO)
47
+ logger = logging.getLogger(__name__)
48
+ divide_number = 3
49
+ #
50
+
51
+ class Example(object):
52
+ """A single training/test example."""
53
+ def __init__(self,
54
+ idx,
55
+ source,
56
+ ts_v,
57
+ target,
58
+ ):
59
+ self.idx = idx
60
+ self.source = source
61
+ self.ts_v = ts_v
62
+ self.target = target
63
+
64
+ def read_examples(filename):
65
+ """Read examples from filename."""
66
+ examples=[]
67
+ with open(filename,encoding="utf-8") as f:
68
+ for idx, line in enumerate(f):
69
+
70
+ line=line.strip()
71
+ js=json.loads(line)
72
+
73
+ examples.append(
74
+ Example(
75
+ idx = idx,
76
+ source=" ".join(js['natrual_language']),
77
+ ts_v = ",".join(js['TS_V_token']),
78
+ target = " ".join(js["ground_truth"][1:-1]),
79
+ )
80
+ )
81
+
82
+ return examples
83
+
84
+
85
+ class InputFeatures(object):
86
+ """A single training/test features for a example."""
87
+ def __init__(self,
88
+ example_id,
89
+ source_ids, source_mask,
90
+ target_ids, target_mask
91
+ ):
92
+ self.example_id = example_id
93
+ self.source_ids = source_ids
94
+ self.source_mask = source_mask
95
+ self.target_ids = target_ids
96
+ self.target_mask = target_mask
97
+
98
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
99
+ features = []
100
+ for example_index, example in enumerate(examples):
101
+ #source
102
+
103
+ source_ids = torch.LongTensor(tokenizer.encode(example.source + tokenizer.pad_token + example.ts_v,
104
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
105
+
106
+ source_mask = torch.ones_like(source_ids)
107
+ #target
108
+ if stage=="test":
109
+ target_tokens = tokenizer.tokenize("None")
110
+ else:
111
+ target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
112
+
113
+ target_ids = torch.LongTensor(tokenizer.encode(example.target,
114
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
115
+ target_mask = torch.ones_like(target_ids)
116
+
117
+
118
+
119
+ features.append(
120
+ InputFeatures(
121
+ example_index,
122
+ source_ids, source_mask,
123
+ target_ids, target_mask
124
+ )
125
+ )
126
+ return features
127
+
128
+
129
+
130
+ def set_seed(seed=20240124):
131
+ random.seed(seed)
132
+ os.environ['PYHTONHASHSEED'] = str(seed)
133
+ np.random.seed(seed)
134
+ torch.manual_seed(seed)
135
+ torch.cuda.manual_seed(seed)
136
+ torch.backends.cudnn.deterministic = True
137
+
138
+ def main():
139
+ parser = argparse.ArgumentParser()
140
+
141
+ ## Required parameters
142
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
143
+ help="Path to pre-trained model: e.g. roberta-base" )
144
+ parser.add_argument("--load_model_path", default=None, type=str,
145
+ help="Path to trained model" )
146
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
147
+ help="The output directory where the model predictions and checkpoints will be written.")
148
+
149
+ ## Other parameters
150
+ parser.add_argument("--train_filename", default=None, type=str,
151
+ help="The train filename. Should contain the .jsonl files for this task.")
152
+ parser.add_argument("--dev_filename", default=None, type=str,
153
+ help="The dev filename. Should contain the .jsonl files for this task.")
154
+ parser.add_argument("--test_filename", default=None, type=str,
155
+ help="The test filename. Should contain the .jsonl files for this task.")
156
+ parser.add_argument("--max_source_length", default=256, type=int,
157
+ help="The maximum total source sequence length after tokenization. Sequences longer "
158
+ "than this will be truncated, sequences shorter will be padded.")
159
+ parser.add_argument("--max_target_length", default=512, type=int,
160
+ help="The maximum total target sequence length after tokenization. Sequences longer "
161
+ "than this will be truncated, sequences shorter will be padded.")
162
+ parser.add_argument("--do_train", action='store_true',
163
+ help="Whether to run training.")
164
+ parser.add_argument("--do_eval", action='store_true',
165
+ help="Whether to run eval on the dev set.")
166
+ parser.add_argument("--do_test", action='store_true',
167
+ help="Whether to run eval on the dev set.")
168
+ parser.add_argument("--no_cuda", action='store_true',
169
+ help="Avoid using CUDA when available")
170
+
171
+ parser.add_argument("--train_batch_size", default=8, type=int,
172
+ help="Batch size per GPU/CPU for training.")
173
+ parser.add_argument("--eval_batch_size", default=8, type=int,
174
+ help="Batch size per GPU/CPU for evaluation.")
175
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
176
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
177
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
178
+ help="The initial learning rate for Adam.")
179
+ parser.add_argument("--beam_size", default=10, type=int,
180
+ help="beam size for beam search")
181
+ parser.add_argument("--weight_decay", default=0.0, type=float,
182
+ help="Weight deay if we apply some.")
183
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
184
+ help="Epsilon for Adam optimizer.")
185
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
186
+ help="Max gradient norm.")
187
+ parser.add_argument("--num_train_epochs", default=3, type=int,
188
+ help="Total number of training epochs to perform.")
189
+ parser.add_argument('--seed', type=int, default=20240124,
190
+ help="random seed for initialization")
191
+
192
+ # print arguments
193
+ args = parser.parse_args()
194
+ # set log
195
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
196
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
197
+ # set device
198
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
199
+ args.n_gpu = torch.cuda.device_count()
200
+ args.device = device
201
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
202
+
203
+ # Set seed
204
+ set_seed(args.seed)
205
+ # make dir if output_dir does not exist
206
+ if os.path.exists(args.output_dir) is False:
207
+ os.makedirs(args.output_dir)
208
+
209
+ # build model
210
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
211
+ is_trust = False
212
+ if "codet5p-220m" in args.model_name_or_path:
213
+ is_trust = False
214
+ else:
215
+ is_trust = True
216
+ model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
217
+ logger.info("Training/evaluation parameters %s", args)
218
+ if args.load_model_path is not None:
219
+ logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
220
+ model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
221
+ model.to(args.device)
222
+
223
+ if args.n_gpu > 1:
224
+ # multi-gpu training
225
+ model = torch.nn.DataParallel(model)
226
+
227
+ if args.do_train:
228
+ # Prepare training data loader
229
+ train_examples = read_examples(args.train_filename)
230
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
231
+ all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
232
+ all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
233
+ all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
234
+ all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
235
+ train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
236
+ train_sampler = RandomSampler(train_data)
237
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
238
+
239
+ # Prepare optimizer and schedule (linear warmup and decay)
240
+ no_decay = ['bias', 'LayerNorm.weight']
241
+ optimizer_grouped_parameters = [
242
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
243
+ 'weight_decay': args.weight_decay},
244
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
245
+ ]
246
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
247
+ scheduler = get_linear_schedule_with_warmup(optimizer,
248
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
249
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
250
+
251
+ #Start training
252
+ logger.info("***** Running training *****")
253
+ logger.info(" Num examples = %d", len(train_examples))
254
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
255
+ logger.info(" Num epoch = %d", args.num_train_epochs)
256
+
257
+
258
+ model.train()
259
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
260
+ for epoch in range(args.num_train_epochs):
261
+ for idx,batch in enumerate(train_dataloader):
262
+ batch = tuple(t.to(device) for t in batch)
263
+ source_ids,source_mask,target_ids,target_mask = batch
264
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
265
+
266
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
267
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
268
+
269
+ if args.n_gpu > 1:
270
+ loss = loss.mean() # mean() to average on multi-gpu.
271
+ if args.gradient_accumulation_steps > 1:
272
+ loss = loss / args.gradient_accumulation_steps
273
+
274
+ losses.append(loss.item())
275
+ loss.backward()
276
+ if len(losses) % args.gradient_accumulation_steps == 0:
277
+ #Update parameters
278
+ optimizer.step()
279
+ optimizer.zero_grad()
280
+ scheduler.step()
281
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
282
+ logger.info("epoch {} step {} loss {}".format(epoch,
283
+ len(losses)//args.gradient_accumulation_steps,
284
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
285
+ if args.do_eval:
286
+ #Eval model with dev dataset
287
+ if 'dev_loss' in dev_dataset:
288
+ eval_examples,eval_data = dev_dataset['dev_loss']
289
+ else:
290
+ eval_examples = read_examples(args.dev_filename)
291
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
292
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
293
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
294
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
295
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
296
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
297
+ dev_dataset['dev_loss'] = eval_examples, eval_data
298
+ eval_sampler = SequentialSampler(eval_data)
299
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
300
+
301
+ logger.info("\n***** Running evaluation *****")
302
+ logger.info(" Num examples = %d", len(eval_examples))
303
+ logger.info(" Batch size = %d", args.eval_batch_size)
304
+
305
+ # Start evaluating model
306
+ model.eval()
307
+ eval_loss,tokens_num = 0,0
308
+ for batch in eval_dataloader:
309
+ batch = tuple(t.to(device) for t in batch)
310
+ source_ids,source_mask,target_ids,target_mask = batch
311
+ with torch.no_grad():
312
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
313
+ labels=target_ids, decoder_attention_mask=target_mask).loss
314
+
315
+ if args.n_gpu > 1:
316
+ loss = loss.mean() # mean() to average on multi-gpu.
317
+
318
+ if args.gradient_accumulation_steps > 1:
319
+ loss = loss / args.gradient_accumulation_steps
320
+ eval_loss += loss.item()
321
+ tokens_num += 1
322
+ # Print loss of dev dataset
323
+ model.train()
324
+ eval_loss = eval_loss / tokens_num
325
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
326
+ for key in sorted(result.keys()):
327
+ logger.info(" %s = %s", key, str(result[key]))
328
+ logger.info(" "+"*"*20)
329
+
330
+ #Calculate bleu
331
+ if 'dev_bleu' in dev_dataset:
332
+ eval_examples,eval_data=dev_dataset['dev_bleu']
333
+ else:
334
+ eval_examples = read_examples(args.dev_filename)
335
+ # eval_examples = random.sample(eval_examples)
336
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
337
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
338
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
339
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
340
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
341
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
342
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
343
+
344
+ eval_sampler = SequentialSampler(eval_data)
345
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
346
+
347
+ model.eval()
348
+ p=[]
349
+ for batch in eval_dataloader:
350
+ batch = tuple(t.to(device) for t in batch)
351
+ source_ids,source_mask,target_ids,target_mask = batch
352
+ with torch.no_grad():
353
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
354
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
355
+
356
+ # convert ids to text
357
+ for pred in preds:
358
+ # print(pred)
359
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
360
+ p.append(text)
361
+
362
+ model.train()
363
+ predictions = []
364
+ res_list = []
365
+ EM = []
366
+ is_gened = False
367
+ with open(args.output_dir+"/dev.output",'w') as f, open(args.output_dir+"/dev.gold",'w') as f1:
368
+ for ref,gold in zip(p,eval_examples):
369
+ predictions.append(ref)
370
+ if len(ref) > 0:
371
+ is_gened = True
372
+ f.write(ref+'\n')
373
+ f1.write(gold.target+'\n')
374
+ EM.append(ref.split()==gold.target.split())
375
+ res_list.append([ref,gold.target])
376
+ if is_gened:
377
+ dev_bleu = _bleu(args.output_dir+"/dev.gold", args.output_dir+"/dev.output")
378
+ else:
379
+ dev_bleu = 0
380
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
381
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
382
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
383
+ logger.info(" "+"*"*20)
384
+ dev_score = (dev_bleu+round(np.mean(EM)*100,2))
385
+ if dev_score>best_score:
386
+ best_score=dev_score
387
+ # Save best checkpoint for best bleu
388
+ output_dir = args.output_dir
389
+ if not os.path.exists(output_dir):
390
+ os.makedirs(output_dir)
391
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
392
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
393
+ torch.save(model_to_save.state_dict(), output_model_file)
394
+ patience = 0
395
+ else:
396
+ patience += 1
397
+ if patience == 3:
398
+ break
399
+ output_dir = args.output_dir
400
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
401
+ for line in res_list:
402
+ dic = {}
403
+ dic["Pred"] = line[0]
404
+ dic["GT"] = line[1]
405
+ wf.write(json.dumps(dic))
406
+ wf.write("\n")
407
+
408
+ logger.info(" Best score:%s",best_score)
409
+ logger.info(" "+"*"*20)
410
+ if args.do_test:
411
+ res_list = []
412
+
413
+ if args.load_model_path is not None:
414
+ checkpoint_prefix = 'pytorch_model.bin'
415
+ output_dir = os.path.join(args.output_dir, checkpoint_prefix)
416
+ model_to_load = model.module if hasattr(model, 'module') else model
417
+ model_to_load.load_state_dict(torch.load(output_dir))
418
+
419
+
420
+
421
+ eval_examples = read_examples(args.test_filename)
422
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
423
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
424
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
425
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
426
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
427
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
428
+
429
+ # Calculate bleu
430
+ eval_sampler = SequentialSampler(eval_data)
431
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
432
+
433
+ model.eval()
434
+ p=[]
435
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
436
+ batch = tuple(t.to(device) for t in batch)
437
+ source_ids, source_mask, _, _ = batch
438
+ with torch.no_grad():
439
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
440
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
441
+ for pred in preds:
442
+ # print(pred)
443
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
444
+ p.append(text)
445
+
446
+ predictions=[]
447
+ EM = []
448
+ edit_dis = 0
449
+ cnt = 0
450
+ with open(args.output_dir+"/test.output",'w') as f, open(args.output_dir+"/test.gold",'w') as f1:
451
+ for ref,gold in zip(p,eval_examples):
452
+ res_list.append([ref,gold.target])
453
+ predictions.append(ref)
454
+ f.write(ref+'\n')
455
+ f1.write(gold.target+'\n')
456
+ EM.append(ref.split()==gold.target.split())
457
+ edit_dis += fuzz.ratio(ref, gold.target)
458
+ cnt += 1
459
+
460
+ dev_bleu = _bleu(args.output_dir+"/test.gold", args.output_dir+"/test.output")
461
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
462
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
463
+ logger.info(" %s = %s "%("Edit Distance",str(round(float(edit_dis)/cnt,2))))
464
+ logger.info(" "+"*"*20)
465
+
466
+
467
+ with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
468
+ for line in res_list:
469
+ dic = {}
470
+ dic["Pred"] = line[0]
471
+ dic["GT"] = line[1]
472
+ wf.write(json.dumps(dic))
473
+ wf.write("\n")
474
+
475
+ if __name__ == "__main__":
476
+ main()
477
+
478
+
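
For reference, read_examples in the generation script above consumes one JSON object per line. A hedged sketch of a synthetic record (field values are invented for illustration; note that the dataset key really is spelled 'natrual_language', and that the first and last 'ground_truth' tokens are dropped when the target string is built):

import json

record = {
    "natrual_language": ["Return", "the", "stack", "pointer", "register"],  # joined with spaces
    "TS_V_token": ["RISCV", "GPR"],                                         # joined with commas
    "ground_truth": ["<s>", "return", "RISCV::X2", ";", "</s>"],            # [1:-1] kept as target
}
print(json.dumps(record))  # one line of the train/dev/test .jsonl
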
Script/Model/CodeT5+/new-target-completion/run_completion.py ADDED
@@ -0,0 +1,614 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning a CodeT5+ (T5ForConditionalGeneration) model for statement-level and
18
+ next-statement code completion on new compiler targets, with iterative-expansion and
19
+ new-type training variants selected via command-line flags.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ import pickle
26
+ import torch
27
+ import json
28
+ import random
29
+ import logging
30
+ import argparse
31
+ import numpy as np
32
+ from io import open
33
+ from itertools import cycle
34
+ import torch.nn as nn
35
+ from tqdm import tqdm, trange
36
+ from torch.nn.utils.rnn import pad_sequence
37
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
38
+ from torch.utils.data.distributed import DistributedSampler
39
+ from tqdm import tqdm
40
+ from fuzzywuzzy import fuzz
41
+ import re
42
+ import multiprocessing
43
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, AutoTokenizer)
44
+
45
+ divide_number = 2
46
+ cpu_cont = 16
47
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
48
+ datefmt = '%m/%d/%Y %H:%M:%S',
49
+ level = logging.INFO)
50
+ logger = logging.getLogger(__name__)
51
+
52
+ #
53
+
54
+
55
+ class Example(object):
56
+ """A single training/test example."""
57
+ def __init__(self,
58
+ idx,
59
+ source,
60
+ target,
61
+ comp_type,
62
+ tar_type
63
+ ):
64
+ self.idx = idx
65
+ self.source = source
66
+ self.target = target
67
+ self.comp_type = comp_type
68
+ self.tar_type = tar_type
69
+
70
+
71
+ def read_examples(filename):
72
+ """Read examples from filename."""
73
+ examples=[]
74
+
75
+ with open(filename,encoding="utf-8") as f:
76
+ max_src_len = 0
77
+ max_tar_len = 0
78
+ for idx, line in enumerate(f):
79
+ js=json.loads(line)
80
+ inputs = " ".join(js["Template_token"][1:])
81
+
82
+ # print(inputs)
83
+ if "ground_truth" in js:
84
+ outputs = " ".join(js["ground_truth"])
85
+ else:
86
+ outputs = inputs
87
+ if 'Idx' in js:
88
+ idx = js['Idx']
89
+
90
+
91
+ comp_type = js["Compiler_Type"]
92
+ tar_type = js["Target"]
93
+ examples.append(
94
+ Example(
95
+ idx = idx,
96
+ source = inputs,
97
+ target = outputs,
98
+ comp_type = comp_type,
99
+ tar_type = tar_type
100
+ )
101
+ )
102
+ return examples
103
+
104
+
105
+ class InputFeatures(object):
106
+ """A single training/test features for a example."""
107
+ def __init__(self,
108
+ example_id,
109
+ source_ids, source_mask,
110
+ target_ids, target_mask,
111
+ comp_type, tar_type
112
+ ):
113
+ self.example_id = example_id
114
+ self.source_ids = source_ids
115
+ self.source_mask = source_mask
116
+ self.target_ids = target_ids
117
+ self.target_mask = target_mask
118
+ self.comp_type = comp_type
119
+ self.tar_type = tar_type
120
+
121
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
122
+ features = []
123
+ for example_index, example in enumerate(examples):
124
+ #source
125
+ source_ids = torch.LongTensor(tokenizer.encode(example.source,
126
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
127
+ # print(tokenizer.encode(example.source,
128
+ # add_special_tokens=True, max_length=args.max_source_length, truncation=True))
129
+ source_mask = torch.ones_like(source_ids)
130
+ #target
131
+ if stage=="test":
132
+ target = "None"
133
+ else:
134
+ target = example.target
135
+
136
+ target_ids = torch.LongTensor(tokenizer.encode(target,
137
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
138
+ target_mask = torch.ones_like(target_ids)
139
+
140
+
141
+ features.append(
142
+ InputFeatures(
143
+ example_index,
144
+ source_ids, source_mask,
145
+ target_ids, target_mask,
146
+ example.comp_type, example.tar_type
147
+ )
148
+ )
149
+ return features
150
+
151
+
152
+
153
+ def set_seed(seed=20240124):
154
+ random.seed(seed)
155
+ os.environ['PYHTONHASHSEED'] = str(seed)
156
+ np.random.seed(seed)
157
+ torch.manual_seed(seed)
158
+ torch.cuda.manual_seed(seed)
159
+ torch.backends.cudnn.deterministic = True
160
+
161
+
162
+ def main():
163
+ parser = argparse.ArgumentParser()
164
+
165
+ ## Required parameters
166
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
167
+ help="Path to pre-trained model: e.g. roberta-base" )
168
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
169
+ help="The output directory where the model predictions and checkpoints will be written.")
170
+ parser.add_argument("--load_model_path", default=None, type=str,
171
+ help="Path to trained model: Should contain the .bin files" )
172
+ ## Other parameters
173
+ parser.add_argument("--task", default=None, type=str, required=True,
174
+ help="Task Type: statement_level, next_statement" )
175
+
176
+ parser.add_argument("--train_filename", default="../../Dataset/", type=str,
177
+ help="The train filename. Should contain the .jsonl files for this task.")
178
+ parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
179
+ help="The dev filename. Should contain the .jsonl files for this task.")
180
+ parser.add_argument("--test_filename", default="../../Dataset/", type=str,
181
+ help="The test filename. Should contain the .jsonl files for this task.")
182
+
183
+ parser.add_argument("--config_name", default="", type=str,
184
+ help="Pretrained config name or path if not the same as model_name")
185
+ parser.add_argument("--tokenizer_name", default="", type=str,
186
+ help="Pretrained tokenizer name or path if not the same as model_name")
187
+ # parser.add_argument("--max_source_length", default=64, type=int,
188
+ # help="The maximum total source sequence length after tokenization. Sequences longer "
189
+ # "than this will be truncated, sequences shorter will be padded.")
190
+ # parser.add_argument("--max_target_length", default=32, type=int,
191
+ # help="The maximum total target sequence length after tokenization. Sequences longer "
192
+ # "than this will be truncated, sequences shorter will be padded.")
193
+
194
+ parser.add_argument("--do_train", action='store_true',
195
+ help="Whether to run training.")
196
+
197
+ parser.add_argument("--do_eval", action='store_true',
198
+ help="Whether to run eval on the dev set.")
199
+ parser.add_argument("--do_test", action='store_true',
200
+ help="Whether to run eval on the dev set.")
201
+ parser.add_argument("--test_org", action='store_true',
202
+ help="Whether to run eval on org model.")
203
+ parser.add_argument("--do_lower_case", action='store_true',
204
+ help="Set this flag if you are using an uncased model.")
205
+ parser.add_argument("--no_cuda", action='store_true',
206
+ help="Avoid using CUDA when available")
207
+ parser.add_argument("--do_cpuonly", action='store_true',
208
+ help="Whether CPU only training.")
209
+ parser.add_argument("--do_itr", action='store_true',
210
+ help="Whether to itr training.")
211
+ parser.add_argument("--train_batch_size", default=8, type=int,
212
+ help="Batch size per GPU/CPU for training.")
213
+ parser.add_argument("--eval_batch_size", default=8, type=int,
214
+ help="Batch size per GPU/CPU for evaluation.")
215
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
216
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
217
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
218
+ help="The initial learning rate for Adam.")
219
+ parser.add_argument("--beam_size", default=10, type=int,
220
+ help="beam size for beam search")
221
+ parser.add_argument("--weight_decay", default=0.0, type=float,
222
+ help="Weight deay if we apply some.")
223
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
224
+ help="Epsilon for Adam optimizer.")
225
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
226
+ help="Max gradient norm.")
227
+ parser.add_argument("--num_train_epochs", default=3, type=int,
228
+ help="Total number of training epochs to perform.")
229
+ parser.add_argument("--max_steps", default=-1, type=int,
230
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
231
+ parser.add_argument("--eval_steps", default=-1, type=int,
232
+ help="")
233
+ parser.add_argument("--max_target_length", default=128, type=int,
234
+ help="")
235
+ parser.add_argument("--max_source_length", default=512, type=int,
236
+ help="")
237
+ parser.add_argument("--train_steps", default=-1, type=int,
238
+ help="")
239
+ parser.add_argument("--warmup_steps", default=0, type=int,
240
+ help="Linear warmup over warmup_steps.")
241
+ parser.add_argument("--local_rank", type=int, default=-1,
242
+ help="For distributed training: local_rank")
243
+ parser.add_argument('--seed', type=int, default=20240124,
244
+ help="random seed for initialization")
245
+ # print arguments
246
+ args = parser.parse_args()
247
+ # set log
248
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
249
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
250
+ # set device
251
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
252
+ args.n_gpu = torch.cuda.device_count()
253
+ args.device = device
254
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
255
+
256
+ # Set seed
257
+ set_seed(args.seed)
258
+
259
+ # make dir if output_dir does not exist
260
+ if os.path.exists(args.output_dir) is False:
261
+ os.makedirs(args.output_dir)
262
+
263
+ # build model
264
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
265
+ is_trust = False
266
+ if "codet5p-220m" in args.model_name_or_path:
267
+ is_trust = False
268
+ else:
269
+ is_trust = True
270
+ model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
271
+
272
+
273
+ logger.info("Training/evaluation parameters %s", args)
274
+
275
+ if args.load_model_path is not None:
276
+ model_save_name = "Existing_Types/pytorch_model.bin"
277
+ if args.do_itr:
278
+ model_save_name = "pytorch_model.bin"
279
+ if args.do_cpuonly:
280
+ model_save_name = "New_Types/pytorch_model.bin"
281
+ if args.task == "statement_level":
282
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/"+model_save_name))
283
+ model.load_state_dict(torch.load(args.load_model_path + "/statement_level/"+model_save_name))
284
+ else:
285
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/"+model_save_name))
286
+ model.load_state_dict(torch.load(args.load_model_path + "/next_statement/"+model_save_name))
287
+
288
+ # model.eval()
289
+ model.to(args.device)
290
+
291
+ if args.n_gpu > 1:
292
+ # multi-gpu training
293
+ model = torch.nn.DataParallel(model)
294
+
295
+ if args.do_train:
296
+ # Prepare training data loader
297
+
298
+ file_name_pre = "New_Target_Completion"
299
+ file_name_post = "Existing_Types/train.jsonl"
300
+ if args.do_itr:
301
+ file_name_pre = "Iterative_Expansion_Completion"
302
+ file_name_post = "train.jsonl"
303
+ if args.do_cpuonly and not args.do_itr:
304
+ file_name_pre = "New_Target_Completion"
305
+ file_name_post = "New_Types/train.jsonl"
306
+ if args.task == "statement_level":
307
+ train_examples = read_examples(args.train_filename + file_name_pre +'/statement_level/'+file_name_post)
308
+ else:
309
+ train_examples = read_examples(args.train_filename + file_name_pre +'/next_statement/'+file_name_post)
310
+
311
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
312
+ all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
313
+ all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
314
+ all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
315
+ all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
316
+ train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
317
+ train_sampler = RandomSampler(train_data)
318
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
319
+
320
+
321
+ # Prepare optimizer and schedule (linear warmup and decay)
322
+ no_decay = ['bias', 'LayerNorm.weight']
323
+ optimizer_grouped_parameters = [
324
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
325
+ 'weight_decay': args.weight_decay},
326
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
327
+ ]
328
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
329
+ scheduler = get_linear_schedule_with_warmup(optimizer,
330
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
331
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
332
+
333
+ #Start training
334
+ logger.info("***** Running training *****")
335
+ logger.info(" Num examples = %d", len(train_examples))
336
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
337
+ logger.info(" Num epoch = %d", args.num_train_epochs)
338
+
339
+
340
+ model.train()
341
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
342
+ for epoch in range(args.num_train_epochs):
343
+ for idx,batch in enumerate(train_dataloader):
344
+ batch = tuple(t.to(device) for t in batch)
345
+ source_ids,source_mask,target_ids,target_mask = batch
346
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
347
+
348
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
349
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
350
+
351
+
352
+ if args.n_gpu > 1:
353
+ loss = loss.mean() # mean() to average on multi-gpu.
354
+
355
+ if args.gradient_accumulation_steps > 1:
356
+ loss = loss / args.gradient_accumulation_steps
357
+
358
+ losses.append(loss.item())
359
+ loss.backward()
360
+ if len(losses) % args.gradient_accumulation_steps == 0:
361
+ #Update parameters
362
+ optimizer.step()
363
+ optimizer.zero_grad()
364
+ scheduler.step()
365
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
366
+ logger.info("epoch {} step {} loss {}".format(epoch,
367
+ len(losses)//args.gradient_accumulation_steps,
368
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
369
+ if args.do_eval:
370
+ #Eval model with dev dataset
371
+
372
+ if 'dev_loss' in dev_dataset:
373
+ eval_examples,eval_data = dev_dataset['dev_loss']
374
+ else:
375
+ file_name_pre = "New_Target_Completion"
376
+ file_name_post = "Existing_Types/valid.jsonl"
377
+ if args.do_itr:
378
+ file_name_pre = "Iterative_Expansion_Completion"
379
+ file_name_post = "valid.jsonl"
380
+ if args.do_cpuonly and not args.do_itr:
381
+ file_name_pre = "New_Target_Completion"
382
+ file_name_post = "New_Types/valid.jsonl"
383
+ if args.task == "statement_level":
384
+ eval_examples = read_examples(args.dev_filename + file_name_pre +'/statement_level/'+file_name_post)
385
+ else:
386
+ eval_examples = read_examples(args.dev_filename + file_name_pre +'/next_statement/'+file_name_post)
387
+
388
+
389
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
390
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
391
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
392
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
393
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
394
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
395
+ dev_dataset['dev_loss'] = eval_examples, eval_data
396
+ eval_sampler = SequentialSampler(eval_data)
397
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
398
+ res_list = []
399
+ logger.info("\n***** Running evaluation *****")
400
+ logger.info(" Num examples = %d", len(eval_examples))
401
+ logger.info(" Batch size = %d", args.eval_batch_size)
402
+
403
+ # Start evaluating model
404
+ model.eval()
405
+ p=[]
406
+ eval_loss,tokens_num = 0,0
407
+ for batch in eval_dataloader:
408
+ batch = tuple(t.to(device) for t in batch)
409
+ source_ids,source_mask,target_ids,target_mask = batch
410
+ with torch.no_grad():
411
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
412
+ labels=target_ids, decoder_attention_mask=target_mask).loss
413
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
414
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length) # unwrap DataParallel on multi GPU
415
+
416
+ # convert ids to text
417
+ for pred in preds:
418
+ # print(pred)
419
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
420
+ p.append(text)
421
+ if args.n_gpu > 1:
422
+ loss = loss.mean() # mean() to average on multi-gpu.
423
+
424
+ if args.gradient_accumulation_steps > 1:
425
+ loss = loss / args.gradient_accumulation_steps
426
+ eval_loss += loss.item()
427
+ tokens_num += 1
428
+
429
+
430
+ # Print loss of dev dataset
431
+ model.train()
432
+ eval_loss = eval_loss / tokens_num
433
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
434
+ for key in sorted(result.keys()):
435
+ logger.info(" %s = %s", key, str(result[key]))
436
+ logger.info(" "+"*"*20)
437
+
438
+ EM = 0.0
439
+ edit_sim = 0.0
440
+ total = len(p)
441
+ token_accuracy = 0
442
+ for ref,gold in zip(p,eval_examples):
443
+ pred = ref.strip()
444
+ gt = gold.target
445
+ edit_sim += fuzz.ratio(pred, gt)
446
+ if pred.split() == gt.split():
447
+ EM += 1
448
+ res_list.append([pred,gt])
449
+ dev_acc = round(EM/total*100, 2)
450
+ # logger.info(" %s = %s "%("loss",round(np.mean(dev_losses),4)))
451
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
452
+ logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
453
+ logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/total, 2))))
454
+ logger.info(" "+"*"*20)
455
+
456
+ if dev_acc > best_score:
457
+ best_score = dev_acc
458
+ # Save best checkpoint for best bleu
459
+ if args.task == "statement_level":
460
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
461
+ else:
462
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
463
+ if not os.path.exists(output_dir):
464
+ os.makedirs(output_dir)
465
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
466
+ model_save_name = "Exitsing_Types/pytorch_model.bin"
467
+ if args.do_itr:
468
+ model_save_name = "pytorch_model.bin"
469
+ if args.do_cpuonly:
470
+ model_save_name = "New_Types/pytorch_model.bin"
471
+ output_model_file = os.path.join(output_dir, model_save_name)
472
+ torch.save(model_to_save.state_dict(), output_model_file)
473
+ patience = 0
474
+ else:
475
+ patience += 1
476
+ if patience == 3:
477
+ break
478
+ logger.info(" Best score:%s",best_score)
479
+ logger.info(" "+"*"*20)
480
+
481
+ if args.task == "statement_level":
482
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
483
+ else:
484
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
485
+
486
+ if args.do_test:
487
+ res_list = []
488
+ output_dir2 = ""
489
+
490
+ if args.load_model_path is not None:
491
+ model_to_load = model.module if hasattr(model, 'module') else model
492
+ # print(output_dir)
493
+ # model_to_load.load_state_dict(torch.load(output_dir))
494
+ model_save_name = "Existing_Types/pytorch_model.bin"
495
+ if args.do_itr and not args.do_cpuonly:
496
+ model_save_name = "pytorch_model.bin"
497
+ if args.do_itr and args.do_cpuonly:
498
+ args.load_model_path = "../../../../Saved_Models/CodeT5+/New_Target_Completion"
499
+ model_save_name = "New_Types/pytorch_model.bin"
500
+ if args.do_cpuonly:
501
+ model_save_name = "New_Types/pytorch_model.bin"
502
+ if args.task == "statement_level":
503
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/"+model_save_name))
504
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/"+model_save_name))
505
+ else:
506
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/"+model_save_name))
507
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/"+model_save_name))
508
+
509
+ file_name_pre = "New_Target_Completion"
510
+ file_name_post = "Existing_Types/test.jsonl"
511
+ if args.do_itr:
512
+ file_name_pre = "Iterative_Expansion_Completion"
513
+ file_name_post = "test.jsonl"
514
+ if args.do_cpuonly and not args.do_itr:
515
+ file_name_pre = "New_Target_Completion"
516
+ file_name_post = "New_Types/test.jsonl"
517
+ if args.task == "statement_level":
518
+ args.test_filename = os.path.join(args.test_filename, file_name_pre +'/statement_level/'+file_name_post)
519
+ else:
520
+ args.test_filename = os.path.join(args.test_filename, file_name_pre +'/next_statement/'+file_name_post)
521
+ eval_examples = read_examples(args.test_filename)
522
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
523
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
524
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
525
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
526
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
527
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
528
+
529
+ # Calculate bleu
530
+ eval_sampler = SequentialSampler(eval_data)
531
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
532
+
533
+ model.eval()
534
+ p=[]
535
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
536
+ batch = tuple(t.to(device) for t in batch)
537
+ source_ids, source_mask, _, _ = batch
538
+ with torch.no_grad():
539
+ gen_model = model.module if hasattr(model, 'module') else model # unwrap DataParallel; works on single- and multi-GPU
+ preds = gen_model.generate(source_ids, attention_mask=source_mask, use_cache=True,
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
541
+ for pred in preds:
542
+ # print(pred)
543
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
544
+ p.append(text)
545
+ model.train()
546
+ # edit_sim = 0.0
547
+ # EM = 0.0
548
+ # total = len(p)
549
+ gcc_dic = {"riscv":[0,0,0], "nvptx":[0,0,0], "arc":[0,0,0]}
550
+ llvm_dic = {"RISCV":[0,0,0], "NVPTX":[0,0,0], "ARC":[0,0,0],"RI5CY":[0,0,0]}
551
+ for ref,gold in zip(p,eval_examples):
552
+ pred = ref.strip()
553
+ gt = gold.target
554
+ if gold.comp_type == "GCC":
555
+ gcc_dic[gold.tar_type][1] += fuzz.ratio(pred, gt)
556
+ gcc_dic[gold.tar_type][2] += 1
557
+ if pred.split() == gt.split():
558
+ gcc_dic[gold.tar_type][0] += 1
559
+ if gold.comp_type == "LLVM":
560
+ llvm_dic[gold.tar_type][1] += fuzz.ratio(pred, gt)
561
+ llvm_dic[gold.tar_type][2] += 1
562
+ if pred.split() == gt.split():
563
+ llvm_dic[gold.tar_type][0] += 1
564
+ res_list.append([pred,gt])
565
+ # dev_acc = round(edit_sim/total, 2)
566
+ # dev_em = round(EM/total, 4)
567
+
568
+ for k in gcc_dic.keys():
569
+ if gcc_dic[k][2] > 0:
570
+ dev_acc = round(1.0*gcc_dic[k][1] / gcc_dic[k][2], 2)
571
+ dev_em = round(100.0*gcc_dic[k][0] / gcc_dic[k][2], 4)
572
+ logger.info(" "+"#"*20)
573
+ logger.info("GCC %s: %s = %s "%(k, "Edit Distance", str(dev_acc)))
574
+ logger.info("GCC %s: %s = %s "%(k, "Exact Match Rate", str(dev_em)))
575
+ logger.info(" "+"*"*20)
576
+
577
+ for k in llvm_dic.keys():
578
+ if llvm_dic[k][2] > 0:
579
+ dev_acc = round(1.0*llvm_dic[k][1] / llvm_dic[k][2], 2)
580
+ dev_em = round(100.0*llvm_dic[k][0] / llvm_dic[k][2], 4)
581
+ logger.info(" "+"#"*20)
582
+ logger.info("LLVM %s: %s = %s "%(k, "Edit Distance", str(dev_acc)))
583
+ logger.info("LLVM %s: %s = %s "%(k, "Exact Match Rate", str(dev_em)))
584
+ logger.info(" "+"*"*20)
585
+
586
+
587
+ # if args.test_org:
588
+ # output_dir = args.output_dir
589
+ # else:
590
+ # if args.task == "statement_level":
591
+ # output_dir = os.path.join(args.output_dir, 'statement_level/')
592
+ # else:
593
+ # output_dir = os.path.join(args.output_dir, 'next_statement/')
594
+ # result_file_name = "/test_result.jsonl"
595
+ # if args.do_itr:
596
+ # result_file_name = "/test_result_itr.jsonl"
597
+ # if args.do_cpuonly:
598
+ # result_file_name = "/test_result_cpu.jsonl"
599
+ # with open(output_dir + result_file_name, 'w') as wf:
600
+ # for line in res_list:
601
+ # dic = {}
602
+ # dic["Pred"] = line[0]
603
+ # dic["GT"] = line[1]
604
+ # wf.write(json.dumps(dic))
605
+ # wf.write("\n")
606
+
607
+
608
+
609
+
610
+ if __name__ == "__main__":
611
+ main()
612
+
613
+
614
+
Script/Model/CodeT5+/new-target-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
+ def _get_ngrams(segment, max_order):
29
+ """Extracts all n-grams upto a given maximum order from an input segment.
30
+
31
+ Args:
32
+ segment: text segment from which n-grams will be extracted.
33
+ max_order: maximum length in tokens of the n-grams returned by this
34
+ methods.
35
+
36
+ Returns:
37
+ The Counter containing all n-grams upto max_order in segment
38
+ with a count of how many times each n-gram occurred.
39
+ """
40
+ ngram_counts = collections.Counter()
41
+ for order in range(1, max_order + 1):
42
+ for i in range(0, len(segment) - order + 1):
43
+ ngram = tuple(segment[i:i+order])
44
+ ngram_counts[ngram] += 1
45
+ return ngram_counts
46
+
47
+
48
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
+ smooth=False):
50
+ """Computes BLEU score of translated segments against one or more references.
51
+
52
+ Args:
53
+ reference_corpus: list of lists of references for each translation. Each
54
+ reference should be tokenized into a list of tokens.
55
+ translation_corpus: list of translations to score. Each translation
56
+ should be tokenized into a list of tokens.
57
+ max_order: Maximum n-gram order to use when computing BLEU score.
58
+ smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
+
60
+ Returns:
61
+ 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
62
+ precisions and brevity penalty.
63
+ """
64
+ matches_by_order = [0] * max_order
65
+ possible_matches_by_order = [0] * max_order
66
+ reference_length = 0
67
+ translation_length = 0
68
+ for (references, translation) in zip(reference_corpus,
69
+ translation_corpus):
70
+ reference_length += min(len(r) for r in references)
71
+ translation_length += len(translation)
72
+
73
+ merged_ref_ngram_counts = collections.Counter()
74
+ for reference in references:
75
+ merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
+ translation_ngram_counts = _get_ngrams(translation, max_order)
77
+ overlap = translation_ngram_counts & merged_ref_ngram_counts
78
+ for ngram in overlap:
79
+ matches_by_order[len(ngram)-1] += overlap[ngram]
80
+ for order in range(1, max_order+1):
81
+ possible_matches = len(translation) - order + 1
82
+ if possible_matches > 0:
83
+ possible_matches_by_order[order-1] += possible_matches
84
+
85
+ precisions = [0] * max_order
86
+ for i in range(0, max_order):
87
+ if smooth:
88
+ precisions[i] = ((matches_by_order[i] + 1.) /
89
+ (possible_matches_by_order[i] + 1.))
90
+ else:
91
+ if possible_matches_by_order[i] > 0:
92
+ precisions[i] = (float(matches_by_order[i]) /
93
+ possible_matches_by_order[i])
94
+ else:
95
+ precisions[i] = 0.0
96
+
97
+ if min(precisions) > 0:
98
+ p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
+ geo_mean = math.exp(p_log_sum)
100
+ else:
101
+ geo_mean = 0
102
+
103
+ ratio = float(translation_length) / reference_length
104
+
105
+ if ratio > 1.0:
106
+ bp = 1.
107
+ else:
108
+ bp = math.exp(1 - 1. / ratio)
109
+
110
+ bleu = geo_mean * bp
111
+
112
+ return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
+ def _bleu(ref_file, trans_file, subword_option=None):
116
+ max_order = 4
117
+ smooth = True
118
+ ref_files = [ref_file]
119
+ reference_text = []
120
+ for reference_filename in ref_files:
121
+ with open(reference_filename) as fh:
122
+ reference_text.append(fh.readlines())
123
+ per_segment_references = []
124
+ for references in zip(*reference_text):
125
+ reference_list = []
126
+ for reference in references:
127
+ reference_list.append(reference.strip().split())
128
+ per_segment_references.append(reference_list)
129
+ translations = []
130
+ with open(trans_file) as fh:
131
+ for line in fh:
132
+ translations.append(line.strip().split())
133
+ bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
+ return round(100 * bleu_score,2)
Script/Model/CodeT5+/new-target-generation/run_generation.py ADDED
@@ -0,0 +1,546 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ from bleu import _bleu
26
+ import pickle
27
+ import torch
28
+ import json
29
+ import random
30
+ import logging
31
+ import argparse
32
+ import numpy as np
33
+ from io import open
34
+ from itertools import cycle
35
+ import torch.nn as nn
36
+ from tqdm import tqdm, trange
37
+ from torch.nn.utils.rnn import pad_sequence
38
+ from fuzzywuzzy import fuzz
39
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
40
+ from torch.utils.data.distributed import DistributedSampler
41
+
42
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, AutoTokenizer)
43
+
44
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
45
+ datefmt = '%m/%d/%Y %H:%M:%S',
46
+ level = logging.INFO)
47
+ logger = logging.getLogger(__name__)
48
+ divide_number = 3
49
+ #
50
+
51
+ class Example(object):
52
+ """A single training/test example."""
53
+ def __init__(self,
54
+ idx,
55
+ source,
56
+ ts_v,
57
+ target,
58
+ comp_type,
59
+ tar_type
60
+ ):
61
+ self.idx = idx
62
+ self.source = source
63
+ self.ts_v = ts_v
64
+ self.target = target
65
+ self.comp_type = comp_type
66
+ self.tar_type = tar_type
67
+
68
+ def read_examples(filename):
69
+ """Read examples from filename."""
70
+ examples=[]
71
+ with open(filename,encoding="utf-8") as f:
72
+ for idx, line in enumerate(f):
73
+
74
+ line=line.strip()
75
+ js=json.loads(line)
76
+
77
+ comp_type = js["Compiler_Type"]
78
+ tar_type = js["Target"]
79
+ examples.append(
80
+ Example(
81
+ idx = idx,
82
+ source=" ".join(js['natrual_language']),
83
+ ts_v = ",".join(js['TS_V_token']),
84
+ target = " ".join(js["ground_truth"][1:-1]),
85
+ comp_type = comp_type,
86
+ tar_type = tar_type
87
+ )
88
+ )
89
+
90
+ return examples
91
+
92
+
93
+ class InputFeatures(object):
94
+ """A single training/test features for a example."""
95
+ def __init__(self,
96
+ example_id,
97
+ source_ids, source_mask,
98
+ target_ids, target_mask,
99
+ comp_type, tar_type
100
+ ):
101
+ self.example_id = example_id
102
+ self.source_ids = source_ids
103
+ self.source_mask = source_mask
104
+ self.target_ids = target_ids
105
+ self.target_mask = target_mask
106
+ self.comp_type = comp_type
107
+ self.tar_type = tar_type
108
+
109
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
110
+ features = []
111
+ for example_index, example in enumerate(examples):
112
+ #source
113
+
114
+ source_ids = torch.LongTensor(tokenizer.encode(example.source + tokenizer.pad_token + example.ts_v,
115
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
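+ # The pad token doubles as a separator between the NL description and the TS_V token list.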
116
+
117
+ source_mask = torch.ones_like(source_ids)
118
+ #target
119
+ if stage=="test":
120
+ target_tokens = tokenizer.tokenize("None")
121
+ else:
122
+ target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
123
+
124
+ target_ids = torch.LongTensor(tokenizer.encode(example.target,
125
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
126
+ target_mask = torch.ones_like(target_ids)
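+ # NOTE: target_tokens above is never used; target_ids always encodes the (truncated) target, even at the test stage.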
127
+
128
+
129
+
130
+ features.append(
131
+ InputFeatures(
132
+ example_index,
133
+ source_ids, source_mask,
134
+ target_ids, target_mask,
135
+ example.comp_type, example.tar_type
136
+ )
137
+ )
138
+ return features
139
+
140
+
141
+
142
+ def set_seed(seed=20240124):
143
+ random.seed(seed)
144
+ os.environ['PYTHONHASHSEED'] = str(seed)
145
+ np.random.seed(seed)
146
+ torch.manual_seed(seed)
147
+ torch.cuda.manual_seed(seed)
148
+ torch.backends.cudnn.deterministic = True
149
+
150
+ def main():
151
+ parser = argparse.ArgumentParser()
152
+
153
+ ## Required parameters
154
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
155
+ help="Path to pre-trained model: e.g. roberta-base" )
156
+ parser.add_argument("--load_model_path", default=None, type=str,
157
+ help="Path to trained model" )
158
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
159
+ help="The output directory where the model predictions and checkpoints will be written.")
160
+
161
+ ## Other parameters
162
+ parser.add_argument("--train_filename", default=None, type=str,
163
+ help="The train filename. Should contain the .jsonl files for this task.")
164
+ parser.add_argument("--dev_filename", default=None, type=str,
165
+ help="The dev filename. Should contain the .jsonl files for this task.")
166
+ parser.add_argument("--test_filename", default=None, type=str,
167
+ help="The test filename. Should contain the .jsonl files for this task.")
168
+ parser.add_argument("--max_source_length", default=256, type=int,
169
+ help="The maximum total source sequence length after tokenization. Sequences longer "
170
+ "than this will be truncated, sequences shorter will be padded.")
171
+ parser.add_argument("--max_target_length", default=512, type=int,
172
+ help="The maximum total target sequence length after tokenization. Sequences longer "
173
+ "than this will be truncated, sequences shorter will be padded.")
174
+ parser.add_argument("--do_train", action='store_true',
175
+ help="Whether to run training.")
176
+ parser.add_argument("--do_eval", action='store_true',
177
+ help="Whether to run eval on the dev set.")
178
+ parser.add_argument("--do_test", action='store_true',
179
+ help="Whether to run eval on the dev set.")
180
+ parser.add_argument("--no_cuda", action='store_true',
181
+ help="Avoid using CUDA when available")
182
+ parser.add_argument("--do_cpuonly", action='store_true',
183
+ help="Whether CPU only training.")
184
+ parser.add_argument("--do_itr", action='store_true',
185
+ help="Whether to itr training.")
186
+ parser.add_argument("--train_batch_size", default=8, type=int,
187
+ help="Batch size per GPU/CPU for training.")
188
+ parser.add_argument("--eval_batch_size", default=8, type=int,
189
+ help="Batch size per GPU/CPU for evaluation.")
190
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
191
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
192
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
193
+ help="The initial learning rate for Adam.")
194
+ parser.add_argument("--beam_size", default=10, type=int,
195
+ help="beam size for beam search")
196
+ parser.add_argument("--weight_decay", default=0.0, type=float,
197
+ help="Weight deay if we apply some.")
198
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
199
+ help="Epsilon for Adam optimizer.")
200
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
201
+ help="Max gradient norm.")
202
+ parser.add_argument("--num_train_epochs", default=3, type=int,
203
+ help="Total number of training epochs to perform.")
204
+ parser.add_argument('--seed', type=int, default=20240124,
205
+ help="random seed for initialization")
206
+
207
+ # print arguments
208
+ args = parser.parse_args()
209
+ # set log
210
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
211
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
212
+ # set device
213
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
214
+ args.n_gpu = torch.cuda.device_count()
215
+ args.device = device
216
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
217
+
218
+ # Set seed
219
+ set_seed(args.seed)
220
+ # make dir if output_dir not exist
221
+ if os.path.exists(args.output_dir) is False:
222
+ os.makedirs(args.output_dir)
223
+
224
+ # build model
225
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
226
+ # codet5p-220m/770m are plain T5 checkpoints; other CodeT5+ variants ship custom modeling code,
+ # so wire the previously unused is_trust flag into from_pretrained as evidently intended.
+ is_trust = not ("codet5p-220m" in args.model_name_or_path or "codet5p-770m" in args.model_name_or_path)
+ model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path, trust_remote_code=is_trust)
232
+ logger.info("Training/evaluation parameters %s", args)
233
+ if args.load_model_path is not None:
234
+ model_save_name = "/Existing_Types/pytorch_model.bin"
235
+ if args.do_itr and not args.do_cpuonly:
236
+ model_save_name = "/pytorch_model.bin"
237
+ if args.do_itr and args.do_cpuonly:
238
+ model_save_name = "/New_Types/pytorch_model.bin"
239
+ if args.do_cpuonly :
240
+ model_save_name = "/New_Types/pytorch_model.bin"
241
+ logger.info("reload model from {}".format(args.load_model_path + model_save_name))
242
+ model.load_state_dict(torch.load(args.load_model_path + model_save_name))
243
+ model.to(args.device)
244
+
245
+ if args.n_gpu > 1:
246
+ # multi-gpu training
247
+ model = torch.nn.DataParallel(model)
248
+
249
+ if args.do_train:
250
+ # Prepare training data loader
251
+ train_examples = read_examples(args.train_filename)
252
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
253
+ all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
254
+ all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
255
+ all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
256
+ all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
257
+ train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
258
+ train_sampler = RandomSampler(train_data)
259
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
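+ # Per-step batch is train_batch_size // gradient_accumulation_steps, so the effective batch size after accumulation stays train_batch_size.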
260
+
261
+ # Prepare optimizer and schedule (linear warmup and decay)
262
+ no_decay = ['bias', 'LayerNorm.weight']
263
+ optimizer_grouped_parameters = [
264
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
265
+ 'weight_decay': args.weight_decay},
266
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
267
+ ]
268
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
269
+ scheduler = get_linear_schedule_with_warmup(optimizer,
270
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
271
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
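+ # Linear decay schedule with the first 10% of total training steps used for warmup.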
272
+
273
+ #Start training
274
+ logger.info("***** Running training *****")
275
+ logger.info(" Num examples = %d", len(train_examples))
276
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
277
+ logger.info(" Num epoch = %d", args.num_train_epochs)
278
+
279
+
280
+ model.train()
281
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
282
+ for epoch in range(args.num_train_epochs):
283
+ for idx,batch in enumerate(train_dataloader):
284
+ batch = tuple(t.to(device) for t in batch)
285
+ source_ids,source_mask,target_ids,target_mask = batch
286
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
287
+
288
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
289
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
290
+
291
+ if args.n_gpu > 1:
292
+ loss = loss.mean() # mean() to average on multi-gpu.
293
+ if args.gradient_accumulation_steps > 1:
294
+ loss = loss / args.gradient_accumulation_steps
295
+
296
+ losses.append(loss.item())
297
+ loss.backward()
298
+ if len(losses) % args.gradient_accumulation_steps == 0:
299
+ #Update parameters
300
+ optimizer.step()
301
+ optimizer.zero_grad()
302
+ scheduler.step()
303
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
304
+ logger.info("epoch {} step {} loss {}".format(epoch,
305
+ len(losses)//args.gradient_accumulation_steps,
306
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
307
+ if args.do_eval:
308
+ #Eval model with dev dataset
309
+ if 'dev_loss' in dev_dataset:
310
+ eval_examples,eval_data = dev_dataset['dev_loss']
311
+ else:
312
+ eval_examples = read_examples(args.dev_filename)
313
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
314
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
315
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
316
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
317
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
318
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
319
+ dev_dataset['dev_loss'] = eval_examples,eval_data
320
+ eval_sampler = SequentialSampler(eval_data)
321
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
322
+
323
+ logger.info("\n***** Running evaluation *****")
324
+ logger.info(" Num examples = %d", len(eval_examples))
325
+ logger.info(" Batch size = %d", args.eval_batch_size)
326
+
327
+ #Start evaluating model
328
+ model.eval()
329
+ eval_loss,tokens_num = 0,0
330
+ for batch in eval_dataloader:
331
+ batch = tuple(t.to(device) for t in batch)
332
+ source_ids,source_mask,target_ids,target_mask = batch
333
+ with torch.no_grad():
334
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
335
+ labels=target_ids, decoder_attention_mask=target_mask).loss
336
+
337
+ if args.n_gpu > 1:
338
+ loss = loss.mean() # mean() to average on multi-gpu.
339
+
340
+ if args.gradient_accumulation_steps > 1:
341
+ loss = loss / args.gradient_accumulation_steps
342
+ eval_loss += loss.item()
343
+ tokens_num += 1
344
+ #Print loss of dev dataset
345
+ model.train()
346
+ eval_loss = eval_loss / tokens_num
347
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
348
+ for key in sorted(result.keys()):
349
+ logger.info(" %s = %s", key, str(result[key]))
350
+ logger.info(" "+"*"*20)
351
+
352
+ #Calculate bleu
353
+ if 'dev_bleu' in dev_dataset:
354
+ eval_examples,eval_data=dev_dataset['dev_bleu']
355
+ else:
356
+ eval_examples = read_examples(args.dev_filename)
357
+ # eval_examples = random.sample(eval_examples)
358
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
359
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
360
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
361
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
362
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
363
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
364
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
365
+
366
+ eval_sampler = SequentialSampler(eval_data)
367
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
368
+
369
+ model.eval()
370
+ p=[]
371
+ for batch in eval_dataloader:
372
+ batch = tuple(t.to(device) for t in batch)
373
+ source_ids,source_mask,target_ids,target_mask = batch
374
+ with torch.no_grad():
375
+ gen_model = model.module if hasattr(model, 'module') else model # unwrap DataParallel
+ preds = gen_model.generate(source_ids, attention_mask=source_mask, use_cache=True,
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
377
+
378
+ # convert ids to text
379
+ for pred in preds:
380
+ # print(pred)
381
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
382
+ p.append(text)
383
+
384
+ model.train()
385
+ predictions = []
386
+ res_list = []
387
+ EM = []
388
+ is_gened = False
389
+ with open(args.output_dir+"/dev.output",'w') as f, open(args.output_dir+"/dev.gold",'w') as f1:
390
+ for ref,gold in zip(p,eval_examples):
391
+ predictions.append(ref)
392
+ if len(ref) > 0:
393
+ is_gened = True
394
+ f.write(ref+'\n')
395
+ f1.write(gold.target+'\n')
396
+ EM.append(ref.split()==gold.target.split())
397
+ res_list.append([ref,gold.target])
398
+ if is_gened:
399
+ dev_bleu = _bleu(args.output_dir+"/dev.gold", args.output_dir+"/dev.output")
400
+ else:
401
+ dev_bleu = 0
402
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
403
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
404
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
405
+ logger.info(" "+"*"*20)
406
+ dev_score = (dev_bleu+round(np.mean(EM)*100,2)) / 2.0
407
+ if dev_score>best_score:
408
+ best_score=dev_score
409
+ # Save best checkpoint for best bleu
410
+ output_dir = args.output_dir
411
+ if not os.path.exists(output_dir):
412
+ os.makedirs(output_dir)
413
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
414
+ model_save_name = "Existing_Types/pytorch_model.bin"
415
+ if args.do_itr and not args.do_cpuonly:
416
+ model_save_name = "pytorch_model.bin"
417
+ if args.do_itr and args.do_cpuonly:
418
+ model_save_name = "New_Types/pytorch_model.bin"
419
+ if args.do_cpuonly :
420
+ model_save_name = "New_Types/pytorch_model.bin"
421
+ output_model_file = os.path.join(output_dir, model_save_name)
422
+ torch.save(model_to_save.state_dict(), output_model_file)
423
+ patience = 0
424
+ else:
425
+ patience += 1
426
+ if patience == 3:
427
+ break
428
+ output_dir = args.output_dir
429
+ logger.info(" Best score:%s",best_score)
430
+ logger.info(" "+"*"*20)
431
+ if args.do_test:
432
+ res_list = []
433
+
434
+ if args.load_model_path is not None:
435
+ model_save_name = "Existing_Types/pytorch_model.bin"
436
+ if args.do_itr and not args.do_cpuonly:
437
+ model_save_name = "pytorch_model.bin"
438
+ if args.do_itr and args.do_cpuonly:
439
+ model_save_name = "New_Types/pytorch_model.bin"
440
+ if args.do_cpuonly :
441
+ model_save_name = "New_Types/pytorch_model.bin"
442
+ checkpoint_prefix = model_save_name
443
+ output_dir = os.path.join(args.output_dir, checkpoint_prefix)
444
+ model_to_load = model.module if hasattr(model, 'module') else model
445
+ model_to_load.load_state_dict(torch.load(output_dir))
446
+
447
+
448
+
449
+ eval_examples = read_examples(args.test_filename)
450
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
451
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
452
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
453
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
454
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
455
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
456
+
457
+ # Calculate bleu
458
+ eval_sampler = SequentialSampler(eval_data)
459
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
460
+
461
+ model.eval()
462
+ p=[]
463
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
464
+ batch = tuple(t.to(device) for t in batch)
465
+ source_ids, source_mask, _, _ = batch
466
+ with torch.no_grad():
467
+ gen_model = model.module if hasattr(model, 'module') else model # unwrap DataParallel
+ preds = gen_model.generate(source_ids, attention_mask=source_mask, use_cache=True,
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
469
+ for pred in preds:
470
+ # print(pred)
471
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
472
+ p.append(text)
473
+
474
+ predictions=[]
475
+ EM = []
476
+ edit_dis = 0
477
+ cnt = 0
478
+ gcc_dic = {"riscv":[0,0,0,0], "nvptx":[0,0,0,0], "arc":[0,0,0,0]}
479
+ llvm_dic = {"RISCV":[0,0,0,0], "NVPTX":[0,0,0,0], "ARC":[0,0,0,0],"RI5CY":[0,0,0,0]}
480
+
481
+
482
+ for ref,gold in zip(p,eval_examples):
483
+ res_list.append([ref,gold.target])
484
+ predictions.append(ref)
485
+ with open(args.output_dir+"/test.output",'w') as f, open(args.output_dir+"/test.gold",'w') as f1:
486
+ f.write(ref+'\n')
487
+ f1.write(gold.target+'\n')
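+ # BLEU-4 is computed per example: each pred/gold pair is written to freshly truncated files
+ # and scored on its own, then averaged per target below.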
488
+ pred = ref.strip()
489
+ gt = gold.target
490
+ if gold.comp_type == "GCC":
491
+ gcc_dic[gold.tar_type][1] += fuzz.ratio(pred, gt)
492
+ gcc_dic[gold.tar_type][2] += _bleu(args.output_dir+"/test.gold", args.output_dir+"/test.output")
493
+ gcc_dic[gold.tar_type][3] += 1
494
+ if pred.split() == gt.split():
495
+ gcc_dic[gold.tar_type][0] += 1
496
+ if gold.comp_type == "LLVM":
497
+ llvm_dic[gold.tar_type][1] += fuzz.ratio(pred, gt)
498
+ llvm_dic[gold.tar_type][2] += _bleu(args.output_dir+"/test.gold", args.output_dir+"/test.output")
499
+ llvm_dic[gold.tar_type][3] += 1
500
+ if pred.split() == gt.split():
501
+ llvm_dic[gold.tar_type][0] += 1
502
+
503
+
504
+
505
+ for k in gcc_dic.keys():
506
+ if gcc_dic[k][3] > 0:
507
+ dev_acc = round(1.0*gcc_dic[k][1] / gcc_dic[k][3], 2)
508
+ dev_em = round(100.0*gcc_dic[k][0] / gcc_dic[k][3], 4)
509
+ dev_b4 = round(1.0*gcc_dic[k][2] / gcc_dic[k][3], 2)
510
+ logger.info(" "+"#"*20)
511
+ logger.info("GCC %s: %s = %s "%(k, "Edit Distance", str(dev_acc)))
512
+ logger.info("GCC %s: %s = %s "%(k, "Exact Match Rate", str(dev_em)))
513
+ logger.info("GCC %s: %s = %s "%(k, "BLEU4", str(dev_b4)))
514
+ logger.info(" "+"*"*20)
515
+
516
+ for k in llvm_dic.keys():
517
+ if llvm_dic[k][3] > 0:
518
+ dev_acc = round(1.0*llvm_dic[k][1] / llvm_dic[k][3], 2)
519
+ dev_em = round(100.0*llvm_dic[k][0] / llvm_dic[k][3], 4)
520
+ dev_b4 = round(1.0*llvm_dic[k][2] / llvm_dic[k][3], 2)
521
+ logger.info(" "+"#"*20)
522
+ logger.info("LLVM %s: %s = %s "%(k, "Edit Distance", str(dev_acc)))
523
+ logger.info("LLVM %s: %s = %s "%(k, "Exact Match Rate", str(dev_em)))
524
+ logger.info("LLVM %s: %s = %s "%(k, "BLEU4", str(dev_b4)))
525
+ logger.info(" "+"*"*20)
526
+
527
+
528
+
529
+
530
+ # result_file_name = "/test_result.jsonl"
531
+ # if args.do_itr:
532
+ # result_file_name = "/test_result_itr.jsonl"
533
+ # if args.do_cpuonly:
534
+ # result_file_name = "/test_result_cpu.jsonl"
535
+ # with open(args.output_dir + result_file_name, 'w') as wf:
536
+ # for line in res_list:
537
+ # dic = {}
538
+ # dic["Pred"] = line[0]
539
+ # dic["GT"] = line[1]
540
+ # wf.write(json.dumps(dic))
541
+ # wf.write("\n")
542
+
543
+ if __name__ == "__main__":
544
+ main()
545
+
546
+
Script/Model/CodeT5/code-completion/run_completion.py ADDED
@@ -0,0 +1,543 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ import pickle
26
+ import torch
27
+ import json
28
+ import random
29
+ import logging
30
+ import argparse
31
+ import numpy as np
32
+ from io import open
33
+ from itertools import cycle
34
+ import torch.nn as nn
35
+ from tqdm import tqdm, trange
36
+ from torch.nn.utils.rnn import pad_sequence
37
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
38
+ from torch.utils.data.distributed import DistributedSampler
39
+ from fuzzywuzzy import fuzz
41
+ import re
42
+ import multiprocessing
43
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, RobertaTokenizer)
44
+
45
+ divide_number = 2
46
+ cpu_cont = 16
47
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
48
+ datefmt = '%m/%d/%Y %H:%M:%S',
49
+ level = logging.INFO)
50
+ logger = logging.getLogger(__name__)
51
+
52
+ class Example(object):
53
+ """A single training/test example."""
54
+ def __init__(self,
55
+ idx,
56
+ source,
57
+ target
58
+ ):
59
+ self.idx = idx
60
+ self.source = source
61
+ self.target = target
62
+
63
+ def read_examples(filename):
64
+ """Read examples from filename."""
65
+ examples=[]
66
+
67
+ with open(filename,encoding="utf-8") as f:
68
+ max_src_len = 0
69
+ max_tar_len = 0
70
+ for idx, line in enumerate(f):
71
+ js=json.loads(line)
72
+ inputs = " ".join(js["Template_token"][1:])
73
+ if "ground_truth" in js:
74
+ outputs = " ".join(js["ground_truth"])
75
+ else:
76
+ outputs = inputs
77
+ if 'Idx' in js:
78
+ idx = js['Idx']
79
+ examples.append(
80
+ Example(
81
+ idx = idx,
82
+ source = inputs,
83
+ target = outputs
84
+ )
85
+ )
86
+ return examples
87
+
88
+
89
+ class InputFeatures(object):
90
+ """A single training/test features for a example."""
91
+ def __init__(self,
92
+ example_id,
93
+ source_ids, source_mask,
94
+ target_ids, target_mask
95
+ ):
96
+ self.example_id = example_id
97
+ self.source_ids = source_ids
98
+ self.source_mask = source_mask
99
+ self.target_ids = target_ids
100
+ self.target_mask = target_mask
101
+
102
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
103
+ features = []
104
+ for example_index, example in enumerate(examples):
105
+ #source
106
+ source_ids = torch.LongTensor(tokenizer.encode(example.source,
107
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
108
+ source_mask = torch.ones_like(source_ids)
109
+ #target
110
+ if stage=="test":
111
+ target = "None"
112
+ else:
113
+ target = example.target
114
+ target_ids = torch.LongTensor(tokenizer.encode(target,
115
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
116
+ target_mask = torch.ones_like(target_ids)
117
+
118
+
119
+ features.append(
120
+ InputFeatures(
121
+ example_index,
122
+ source_ids, source_mask,
123
+ target_ids, target_mask
124
+ )
125
+ )
126
+ return features
127
+
128
+
129
+
130
+ def set_seed(seed=20240124):
131
+ random.seed(seed)
132
+ os.environ['PYTHONHASHSEED'] = str(seed)
133
+ np.random.seed(seed)
134
+ torch.manual_seed(seed)
135
+ torch.cuda.manual_seed(seed)
136
+ torch.backends.cudnn.deterministic = True
137
+
138
+
139
+ def main():
140
+ parser = argparse.ArgumentParser()
141
+
142
+ ## Required parameters
143
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
144
+ help="Path to pre-trained model: e.g. roberta-base" )
145
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
146
+ help="The output directory where the model predictions and checkpoints will be written.")
147
+ parser.add_argument("--load_model_path", default=None, type=str,
148
+ help="Path to trained model: Should contain the .bin files" )
149
+ ## Other parameters
150
+ parser.add_argument("--task", default=None, type=str, required=True,
151
+ help="Task Type: statement_level, next_statement" )
152
+
153
+ parser.add_argument("--train_filename", default="../../Dataset/", type=str,
154
+ help="The train filename. Should contain the .jsonl files for this task.")
155
+ parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
156
+ help="The dev filename. Should contain the .jsonl files for this task.")
157
+ parser.add_argument("--test_filename", default="../../Dataset/", type=str,
158
+ help="The test filename. Should contain the .jsonl files for this task.")
159
+
160
+ parser.add_argument("--config_name", default="", type=str,
161
+ help="Pretrained config name or path if not the same as model_name")
162
+ parser.add_argument("--tokenizer_name", default="", type=str,
163
+ help="Pretrained tokenizer name or path if not the same as model_name")
164
+ # parser.add_argument("--max_source_length", default=64, type=int,
165
+ # help="The maximum total source sequence length after tokenization. Sequences longer "
166
+ # "than this will be truncated, sequences shorter will be padded.")
167
+ # parser.add_argument("--max_target_length", default=32, type=int,
168
+ # help="The maximum total target sequence length after tokenization. Sequences longer "
169
+ # "than this will be truncated, sequences shorter will be padded.")
170
+
171
+ parser.add_argument("--do_train", action='store_true',
172
+ help="Whether to run training.")
173
+ parser.add_argument("--do_eval", action='store_true',
174
+ help="Whether to run eval on the dev set.")
175
+ parser.add_argument("--do_test", action='store_true',
176
+ help="Whether to run eval on the dev set.")
177
+ parser.add_argument("--test_org", action='store_true',
178
+ help="Whether to run eval on org model.")
179
+ parser.add_argument("--do_lower_case", action='store_true',
180
+ help="Set this flag if you are using an uncased model.")
181
+ parser.add_argument("--no_cuda", action='store_true',
182
+ help="Avoid using CUDA when available")
183
+
184
+ parser.add_argument("--train_batch_size", default=8, type=int,
185
+ help="Batch size per GPU/CPU for training.")
186
+ parser.add_argument("--eval_batch_size", default=8, type=int,
187
+ help="Batch size per GPU/CPU for evaluation.")
188
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
189
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
190
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
191
+ help="The initial learning rate for Adam.")
192
+ parser.add_argument("--beam_size", default=10, type=int,
193
+ help="beam size for beam search")
194
+ parser.add_argument("--weight_decay", default=0.0, type=float,
195
+ help="Weight deay if we apply some.")
196
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
197
+ help="Epsilon for Adam optimizer.")
198
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
199
+ help="Max gradient norm.")
200
+ parser.add_argument("--num_train_epochs", default=3, type=int,
201
+ help="Total number of training epochs to perform.")
202
+ parser.add_argument("--max_steps", default=-1, type=int,
203
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
204
+ parser.add_argument("--eval_steps", default=-1, type=int,
205
+ help="")
206
+ parser.add_argument("--max_target_length", default=128, type=int,
207
+ help="")
208
+ parser.add_argument("--max_source_length", default=512, type=int,
209
+ help="")
210
+ parser.add_argument("--train_steps", default=-1, type=int,
211
+ help="")
212
+ parser.add_argument("--warmup_steps", default=0, type=int,
213
+ help="Linear warmup over warmup_steps.")
214
+ parser.add_argument("--local_rank", type=int, default=-1,
215
+ help="For distributed training: local_rank")
216
+ parser.add_argument('--seed', type=int, default=20240124,
217
+ help="random seed for initialization")
218
+ # print arguments
219
+ args = parser.parse_args()
220
+ # set log
221
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
222
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
223
+ # set device
224
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
225
+ args.n_gpu = torch.cuda.device_count()
226
+ args.device = device
227
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
228
+
229
+ # Set seed
230
+ set_seed(args.seed)
231
+
232
+ # make dir if output_dir not exist
233
+ if os.path.exists(args.output_dir) is False:
234
+ os.makedirs(args.output_dir)
235
+
236
+ # build model
237
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
238
+ model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
239
+
240
+
241
+ logger.info("Training/evaluation parameters %s", args)
242
+
243
+ if args.load_model_path is not None:
244
+ if args.task == "statement_level":
245
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
246
+ model.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
247
+ else:
248
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
249
+ model.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
250
+
251
+ model.to(args.device)
252
+
253
+ if args.n_gpu > 1:
254
+ # multi-gpu training
255
+ model = torch.nn.DataParallel(model)
256
+
257
+ if args.do_train:
258
+ # Prepare training data loader
259
+ if args.task == "statement_level":
260
+ train_examples = read_examples(args.train_filename + "/Code_Completion/statement_level/train.jsonl")
261
+ else:
262
+ train_examples = read_examples(args.train_filename + "/Code_Completion/next_statement/train.jsonl")
263
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
264
+ all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
265
+ all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
266
+ all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
267
+ all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
268
+ train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
269
+ train_sampler = RandomSampler(train_data)
270
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
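+ # Each optimizer step accumulates gradient_accumulation_steps mini-batches, restoring the full train_batch_size.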
271
+
272
+
273
+ # Prepare optimizer and schedule (linear warmup and decay)
274
+ no_decay = ['bias', 'LayerNorm.weight']
275
+ optimizer_grouped_parameters = [
276
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
277
+ 'weight_decay': args.weight_decay},
278
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
279
+ ]
280
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
281
+ scheduler = get_linear_schedule_with_warmup(optimizer,
282
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
283
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
284
+
285
+ #Start training
286
+ logger.info("***** Running training *****")
287
+ logger.info(" Num examples = %d", len(train_examples))
288
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
289
+ logger.info(" Num epoch = %d", args.num_train_epochs)
290
+
291
+
292
+ model.train()
293
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
294
+ for epoch in range(args.num_train_epochs):
295
+ for idx,batch in enumerate(train_dataloader):
296
+ batch = tuple(t.to(device) for t in batch)
297
+ source_ids,source_mask,target_ids,target_mask = batch
298
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
299
+
300
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
301
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
302
+
303
+
304
+ if args.n_gpu > 1:
305
+ loss = loss.mean() # mean() to average on multi-gpu.
306
+
307
+ if args.gradient_accumulation_steps > 1:
308
+ loss = loss / args.gradient_accumulation_steps
309
+
310
+ losses.append(loss.item())
311
+ loss.backward()
312
+ if len(losses) % args.gradient_accumulation_steps == 0:
313
+ #Update parameters
314
+ optimizer.step()
315
+ optimizer.zero_grad()
316
+ scheduler.step()
317
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
318
+ logger.info("epoch {} step {} loss {}".format(epoch,
319
+ len(losses)//args.gradient_accumulation_steps,
320
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
321
+ if args.do_eval:
322
+ #Eval model with dev dataset
323
+
324
+ if 'dev_loss' in dev_dataset:
325
+ eval_examples,eval_data = dev_dataset['dev_loss']
326
+ else:
327
+ if args.task == "statement_level":
328
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
329
+ else:
330
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
331
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
332
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
333
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
334
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
335
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
336
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
337
+ dev_dataset['dev_loss'] = eval_examples,eval_data
338
+ eval_sampler = SequentialSampler(eval_data)
339
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
340
+ res_list = []
341
+ logger.info("\n***** Running evaluation *****")
342
+ logger.info(" Num examples = %d", len(eval_examples))
343
+ logger.info(" Batch size = %d", args.eval_batch_size)
344
+
345
+ #Start evaluating model
346
+ model.eval()
347
+ eval_loss,tokens_num = 0,0
348
+ for batch in eval_dataloader:
349
+ batch = tuple(t.to(device) for t in batch)
350
+ source_ids,source_mask,target_ids,target_mask = batch
351
+ with torch.no_grad():
352
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
353
+ labels=target_ids, decoder_attention_mask=target_mask).loss
354
+
355
+ if args.n_gpu > 1:
356
+ loss = loss.mean() # mean() to average on multi-gpu.
357
+
358
+ if args.gradient_accumulation_steps > 1:
359
+ loss = loss / args.gradient_accumulation_steps
360
+ eval_loss += loss.item()
361
+ tokens_num += 1
362
+
363
+
364
+ #Print loss of dev dataset
365
+ model.train()
366
+ eval_loss = eval_loss / tokens_num
367
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
368
+ for key in sorted(result.keys()):
369
+ logger.info(" %s = %s", key, str(result[key]))
370
+ logger.info(" "+"*"*20)
371
+
372
+ #Calculate bleu
373
+ if 'dev_bleu' in dev_dataset:
374
+ eval_examples,eval_data=dev_dataset['dev_bleu']
375
+ else:
376
+ if args.task == "statement_level":
377
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
378
+ else:
379
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
380
+ # eval_examples = random.sample(eval_examples, int(len(eval_examples) / divide_number))
381
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
382
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
383
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
384
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
385
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
386
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
387
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
388
+
389
+ eval_sampler = SequentialSampler(eval_data)
390
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
391
+
392
+ model.eval()
393
+ p=[]
394
+ for batch in eval_dataloader:
395
+ batch = tuple(t.to(device) for t in batch)
396
+ source_ids, source_mask, _, _ = batch
397
+ with torch.no_grad():
398
+ # Unwrap DataParallel so .generate works on both single- and multi-GPU setups.
+ gen_model = model.module if hasattr(model, 'module') else model
+ preds = gen_model.generate(source_ids, attention_mask=source_mask, use_cache=True,
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
403
+
404
+ # convert ids to text
405
+ for pred in preds:
406
+ # print(pred)
407
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
408
+ p.append(text)
409
+ model.train()
410
+ EM = 0.0
411
+ edit_sim = 0.0
412
+ total = len(p)
413
+ token_accuracy = 0
414
+ for ref,gold in zip(p,eval_examples):
415
+ pred = ref.strip()
416
+ gt = gold.target
417
+ edit_sim += fuzz.ratio(pred, gt)
418
+ if pred.split() == gt.split():
419
+ EM += 1
420
+ res_list.append([pred,gt])
421
+ dev_acc = round(EM/total*100, 2)
422
+ # logger.info(" %s = %s "%("loss",round(np.mean(dev_losses),4)))
423
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
424
+ logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
425
+ logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/total, 2))))
426
+ logger.info(" "+"*"*20)
427
+
428
+ if dev_acc > best_score:
429
+ best_score = dev_acc
430
+ # Save best checkpoint for best bleu
431
+ if args.task == "statement_level":
432
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
433
+ else:
434
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
435
+ if not os.path.exists(output_dir):
436
+ os.makedirs(output_dir)
437
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
438
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
439
+ torch.save(model_to_save.state_dict(), output_model_file)
440
+ patience = 0
441
+ else:
442
+ patience += 1
443
+ if patience == 3:
444
+ break
445
+
446
+ logger.info(" Best score:%s",best_score)
447
+ logger.info(" "+"*"*20)
448
+
449
+ if args.task == "statement_level":
450
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
451
+ else:
452
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
453
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
454
+ for line in res_list:
455
+ dic = {}
456
+ dic["Pred"] = line[0]
457
+ dic["GT"] = line[1]
458
+ wf.write(json.dumps(dic))
459
+ wf.write("\n")
460
+
461
+ if args.do_test:
462
+ res_list = []
463
+ output_dir2 = ""
464
+
465
+ if args.load_model_path is not None:
466
+ model_to_load = model.module if hasattr(model, 'module') else model
467
+
468
+ if args.task == "statement_level":
469
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
470
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
471
+ else:
472
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
473
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
474
+
475
+
476
+ if args.task == "statement_level":
477
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/statement_level/test.jsonl')
478
+ else:
479
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/next_statement/test.jsonl')
480
+ eval_examples = read_examples(args.test_filename)
481
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
482
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
483
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
484
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
485
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
486
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
487
+
488
+ # Calculate completion metrics (exact match / edit similarity) on the test set
489
+ eval_sampler = SequentialSampler(eval_data)
490
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
491
+
492
+ model.eval()
493
+ p=[]
494
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
495
+ batch = tuple(t.to(device) for t in batch)
496
+ source_ids, source_mask, _, _ = batch
497
+ with torch.no_grad():
498
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
499
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
500
+ for pred in preds:
501
+ # print(pred)
502
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
503
+ p.append(text)
504
+ model.train()
505
+ edit_sim = 0.0
506
+ EM = 0.0
507
+ total = len(p)
508
+ for ref,gold in zip(p,eval_examples):
509
+ pred = ref.strip()
510
+ gt = gold.target
511
+ edit_sim += fuzz.ratio(pred, gt)
512
+ if pred.split() == gt.split():
513
+ EM += 1
514
+ res_list.append([pred,gt])
515
+ dev_acc = round(edit_sim/total, 2)
516
+ dev_em = round(EM/total, 4)
517
+ logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
518
+ logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
519
+ logger.info(" "+"*"*20)
520
+ if args.test_org:
521
+ output_dir = args.output_dir
522
+ else:
523
+ if args.task == "statement_level":
524
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
525
+ else:
526
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
527
+
528
+ with open(output_dir + "/test_result.jsonl", 'w') as wf:
529
+ for line in res_list:
530
+ dic = {}
531
+ dic["Pred"] = line[0]
532
+ dic["GT"] = line[1]
533
+ wf.write(json.dumps(dic))
534
+ wf.write("\n")
535
+
536
+
537
+
538
+
539
+ if __name__ == "__main__":
540
+ main()
541
+
542
+
543
+
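For reference, the evaluation loops above reduce to two metrics per example: whitespace-token exact match and fuzzywuzzy's character-level edit similarity. A minimal sketch of that metric pair, assuming only the fuzzywuzzy package (the helper name below is illustrative, not part of the script):

from fuzzywuzzy import fuzz  # Levenshtein-based ratio in [0, 100]

def score_pair(pred, gt):
    # exact match on whitespace tokens, as in the evaluation loop above
    em = pred.strip().split() == gt.split()
    # character-level similarity, as accumulated via fuzz.ratio above
    sim = fuzz.ratio(pred.strip(), gt)
    return em, sim

print(score_pair("int i = 0 ;", "int i = 0 ;"))  # (True, 100)
print(score_pair("int i = 1 ;", "int i = 0 ;"))  # (False, ~91)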
Script/Model/CodeT5/code-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
+ def _get_ngrams(segment, max_order):
29
+ """Extracts all n-grams up to a given maximum order from an input segment.
30
+
31
+ Args:
32
+ segment: text segment from which n-grams will be extracted.
33
+ max_order: maximum length in tokens of the n-grams returned by this
34
+ method.
35
+
36
+ Returns:
37
+ The Counter containing all n-grams up to max_order in segment
38
+ with a count of how many times each n-gram occurred.
39
+ """
40
+ ngram_counts = collections.Counter()
41
+ for order in range(1, max_order + 1):
42
+ for i in range(0, len(segment) - order + 1):
43
+ ngram = tuple(segment[i:i+order])
44
+ ngram_counts[ngram] += 1
45
+ return ngram_counts
46
+
47
+
48
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
+ smooth=False):
50
+ """Computes BLEU score of translated segments against one or more references.
51
+
52
+ Args:
53
+ reference_corpus: list of lists of references for each translation. Each
54
+ reference should be tokenized into a list of tokens.
55
+ translation_corpus: list of translations to score. Each translation
56
+ should be tokenized into a list of tokens.
57
+ max_order: Maximum n-gram order to use when computing BLEU score.
58
+ smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
+
60
+ Returns:
61
+ 6-tuple of (BLEU score, n-gram precisions, brevity penalty, length ratio,
62
+ translation length, reference length).
63
+ """
64
+ matches_by_order = [0] * max_order
65
+ possible_matches_by_order = [0] * max_order
66
+ reference_length = 0
67
+ translation_length = 0
68
+ for (references, translation) in zip(reference_corpus,
69
+ translation_corpus):
70
+ reference_length += min(len(r) for r in references)
71
+ translation_length += len(translation)
72
+
73
+ merged_ref_ngram_counts = collections.Counter()
74
+ for reference in references:
75
+ merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
+ translation_ngram_counts = _get_ngrams(translation, max_order)
77
+ overlap = translation_ngram_counts & merged_ref_ngram_counts
78
+ for ngram in overlap:
79
+ matches_by_order[len(ngram)-1] += overlap[ngram]
80
+ for order in range(1, max_order+1):
81
+ possible_matches = len(translation) - order + 1
82
+ if possible_matches > 0:
83
+ possible_matches_by_order[order-1] += possible_matches
84
+
85
+ precisions = [0] * max_order
86
+ for i in range(0, max_order):
87
+ if smooth:
88
+ precisions[i] = ((matches_by_order[i] + 1.) /
89
+ (possible_matches_by_order[i] + 1.))
90
+ else:
91
+ if possible_matches_by_order[i] > 0:
92
+ precisions[i] = (float(matches_by_order[i]) /
93
+ possible_matches_by_order[i])
94
+ else:
95
+ precisions[i] = 0.0
96
+
97
+ if min(precisions) > 0:
98
+ p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
+ geo_mean = math.exp(p_log_sum)
100
+ else:
101
+ geo_mean = 0
102
+
103
+ ratio = float(translation_length) / reference_length
104
+
105
+ if ratio > 1.0:
106
+ bp = 1.
107
+ else:
108
+ bp = math.exp(1 - 1. / ratio)
109
+
110
+ bleu = geo_mean * bp
111
+
112
+ return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
+ def _bleu(ref_file, trans_file, subword_option=None):
116
+ max_order = 4
117
+ smooth = True
118
+ ref_files = [ref_file]
119
+ reference_text = []
120
+ for reference_filename in ref_files:
121
+ with open(reference_filename) as fh:
122
+ reference_text.append(fh.readlines())
123
+ per_segment_references = []
124
+ for references in zip(*reference_text):
125
+ reference_list = []
126
+ for reference in references:
127
+ reference_list.append(reference.strip().split())
128
+ per_segment_references.append(reference_list)
129
+ translations = []
130
+ with open(trans_file) as fh:
131
+ for line in fh:
132
+ translations.append(line.strip().split())
133
+ bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
+ return round(100 * bleu_score,2)
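As a quick sanity check of the module above: `compute_bleu` takes token lists (one list of references per hypothesis) and returns the 6-tuple described in its docstring. A minimal usage sketch, assuming the file above is saved as bleu.py:

from bleu import compute_bleu

references = [[["the", "cat", "sat"]]]   # one list of references per hypothesis
hypotheses = [["the", "cat", "sat"]]
bleu, precisions, bp, ratio, hyp_len, ref_len = compute_bleu(
    references, hypotheses, max_order=4, smooth=True)
print(round(100 * bleu, 2))  # 100.0 for an exact match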
Script/Model/CodeT5/code-generation/model.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch
7
+ from torch.autograd import Variable
8
+ import copy
9
+ class Seq2Seq(nn.Module):
10
+ """
11
+ Build a Sequence-to-Sequence model.
12
+
13
+ Parameters:
14
+
15
+ * `encoder`- encoder of seq2seq model. e.g. roberta
16
+ * `decoder`- decoder of seq2seq model. e.g. transformer
17
+ * `config`- configuration of encoder model.
18
+ * `beam_size`- beam size for beam search.
19
+ * `max_length`- max length of target for beam search.
20
+ * `sos_id`- start-of-sequence symbol id in target for beam search.
21
+ * `eos_id`- end-of-sequence symbol id in target for beam search.
22
+ """
23
+ def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24
+ super(Seq2Seq, self).__init__()
25
+ self.encoder = encoder
26
+ self.decoder=decoder
27
+ self.config=config
28
+ self.register_buffer(
29
+ "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30
+ )
31
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33
+ self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34
+ self.lsm = nn.LogSoftmax(dim=-1)
35
+
36
+ self.beam_size = beam_size
37
+ self.max_length = max_length
38
+ self.sos_id = sos_id
39
+ self.eos_id = eos_id
40
+
41
+ def forward(self, source_ids, target_ids=None):
42
+ if target_ids is None:
43
+ return self.generate(source_ids)
44
+
45
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
46
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47
+ ids = torch.cat((source_ids,target_ids),-1)
48
+ mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
49
+ mask = mask & ids[:,None,:].ne(1)
50
+
51
+ out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52
+ lm_logits = self.lm_head(out)
53
+ # Shift so that tokens < n predict n
54
+ active_loss = target_ids[..., 1:].ne(1).view(-1)
55
+ shift_logits = lm_logits[..., :-1, :].contiguous()
56
+ shift_labels = target_ids[..., 1:].contiguous()
57
+ # Flatten the tokens
58
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60
+ shift_labels.view(-1)[active_loss])
61
+
62
+ outputs = loss,loss*active_loss.sum(),active_loss.sum()
63
+ return outputs
64
+
65
+ def generate(self, source_ids):
66
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68
+ preds = []
69
+ zero = torch.cuda.LongTensor(1).fill_(0)
70
+ source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71
+ for i in range(source_ids.shape[0]):
72
+ context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73
+ for y in encoder_output.past_key_values]
74
+ beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75
+ input_ids = beam.getCurrentState()
76
+ context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77
+ for _ in range(self.max_length):
78
+ if beam.done():
79
+ break
80
+
81
+ ids = torch.cat((context_ids,input_ids),-1)
82
+ mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83
+ mask = mask & ids[:,None,:].ne(1)
84
+ out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85
+ hidden_states = out[:,-1,:]
86
+ out = self.lsm(self.lm_head(hidden_states)).data
87
+ beam.advance(out)
88
+ input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89
+ input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90
+ hyp = beam.getHyp(beam.getFinal())
91
+ pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92
+ pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93
+ preds.append(torch.cat(pred,0).unsqueeze(0))
94
+
95
+ preds = torch.cat(preds,0)
96
+
97
+ return preds
98
+
99
+
100
+
101
+ class Beam(object):
102
+ def __init__(self, size,sos,eos):
103
+ self.size = size
104
+ self.tt = torch.cuda
105
+ # The score for each translation on the beam.
106
+ self.scores = self.tt.FloatTensor(size).zero_()
107
+ # The backpointers at each time-step.
108
+ self.prevKs = []
109
+ # The outputs at each time-step.
110
+ self.nextYs = [self.tt.LongTensor(size)
111
+ .fill_(0)]
112
+ self.nextYs[0][0] = sos
113
+ # Has EOS topped the beam yet.
114
+ self._eos = eos
115
+ self.eosTop = False
116
+ # Time and k pair for finished.
117
+ self.finished = []
118
+
119
+ def getCurrentState(self):
120
+ "Get the outputs for the current timestep."
121
+ batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122
+ return batch
123
+
124
+ def getCurrentOrigin(self):
125
+ "Get the backpointers for the current timestep."
126
+ return self.prevKs[-1]
127
+
128
+ def advance(self, wordLk):
129
+ """
130
+ Given log-probs over words for every last beam `wordLk`,
131
+ compute and update the beam search state.
132
+
133
+ Parameters:
134
+
135
+ * `wordLk`- log-probs of advancing from the last step (K x words)
136
+
137
+
138
+ Updates the beam state in place; completion is checked via `done()`.
139
+ """
140
+ numWords = wordLk.size(1)
141
+
142
+ # Sum the previous scores.
143
+ if len(self.prevKs) > 0:
144
+ beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145
+
146
+ # Don't let EOS have children.
147
+ for i in range(self.nextYs[-1].size(0)):
148
+ if self.nextYs[-1][i] == self._eos:
149
+ beamLk[i] = -1e20
150
+ else:
151
+ beamLk = wordLk[0]
152
+ flatBeamLk = beamLk.view(-1)
153
+ bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154
+
155
+ self.scores = bestScores
156
+
157
+ # bestScoresId is flattened beam x word array, so calculate which
158
+ # word and beam each score came from
159
+ prevK = bestScoresId // numWords
160
+ self.prevKs.append(prevK)
161
+ self.nextYs.append((bestScoresId - prevK * numWords))
162
+
163
+
164
+ for i in range(self.nextYs[-1].size(0)):
165
+ if self.nextYs[-1][i] == self._eos:
166
+ s = self.scores[i]
167
+ self.finished.append((s, len(self.nextYs) - 1, i))
168
+
169
+ # End condition is when top-of-beam is EOS and no global score.
170
+ if self.nextYs[-1][0] == self._eos:
171
+ self.eosTop = True
172
+
173
+ def done(self):
174
+ return self.eosTop and len(self.finished) >=self.size
175
+
176
+ def getFinal(self):
177
+ if len(self.finished) == 0:
178
+ self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179
+ self.finished.sort(key=lambda a: -a[0])
180
+ if len(self.finished) != self.size:
181
+ unfinished=[]
182
+ for i in range(self.nextYs[-1].size(0)):
183
+ if self.nextYs[-1][i] != self._eos:
184
+ s = self.scores[i]
185
+ unfinished.append((s, len(self.nextYs) - 1, i))
186
+ unfinished.sort(key=lambda a: -a[0])
187
+ self.finished+=unfinished[:self.size-len(self.finished)]
188
+ return self.finished[:self.size]
189
+
190
+ def getHyp(self, beam_res):
191
+ """
192
+ Walk back to construct the full hypothesis.
193
+ """
194
+ hyps=[]
195
+ for _,timestep, k in beam_res:
196
+ hyp = []
197
+ for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198
+ hyp.append(self.nextYs[j+1][k])
199
+ k = self.prevKs[j][k]
200
+ hyps.append(hyp[::-1])
201
+ return hyps
202
+
203
+ def buildTargetTokens(self, preds):
204
+ sentence=[]
205
+ for pred in preds:
206
+ tokens = []
207
+ for tok in pred:
208
+ if tok==self._eos:
209
+ break
210
+ tokens.append(tok)
211
+ sentence.append(tokens)
212
+ return sentence
213
+
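One detail of the `Beam` class above that is easy to misread is the backpointer walk in `getHyp`: `prevKs[t][k]` stores which beam slot the token in slot `k` at step `t+1` came from. A small, self-contained illustration of that walk with made-up values (note also that the class assumes CUDA tensors via `self.tt = torch.cuda`, so generation requires a GPU):

# Illustrative values only; in the model these come from topk() during search.
nextYs = [[1, 1], [5, 7], [9, 4]]   # chosen token ids per step (row 0 holds <sos>)
prevKs = [[0, 0], [1, 0]]           # prevKs[t][k]: parent beam of slot k at step t+1

k, hyp = 0, []                      # walk back from beam slot 0 at the last step
for j in range(len(prevKs) - 1, -1, -1):
    hyp.append(nextYs[j + 1][k])
    k = prevKs[j][k]
print(hyp[::-1])                    # [7, 9]: tokens along the reconstructed path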
Script/Model/CodeT5/code-generation/run_generation.py ADDED
@@ -0,0 +1,478 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ from bleu import _bleu
26
+ import pickle
27
+ import torch
28
+ import json
29
+ import random
30
+ import logging
31
+ import argparse
32
+ import numpy as np
33
+ from io import open
34
+ from itertools import cycle
35
+ import torch.nn as nn
36
+ from model import Seq2Seq
37
+ from tqdm import tqdm, trange
38
+ from torch.nn.utils.rnn import pad_sequence
39
+ from accelerate import Accelerator
40
+ from fuzzywuzzy import fuzz
41
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
42
+ from torch.utils.data.distributed import DistributedSampler
43
+
44
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, RobertaTokenizer)
45
+
46
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
47
+ datefmt = '%m/%d/%Y %H:%M:%S',
48
+ level = logging.INFO)
49
+ logger = logging.getLogger(__name__)
50
+ divide_number = 3
51
+
52
+
53
+ class Example(object):
54
+ """A single training/test example."""
55
+ def __init__(self,
56
+ idx,
57
+ source,
58
+ ts_v,
59
+ target,
60
+ ):
61
+ self.idx = idx
62
+ self.source = source
63
+ self.ts_v = ts_v
64
+ self.target = target
65
+
66
+ def read_examples(filename):
67
+ """Read examples from filename."""
68
+ examples=[]
69
+ with open(filename,encoding="utf-8") as f:
70
+ for idx, line in enumerate(f):
71
+ line=line.strip()
72
+ js=json.loads(line)
73
+ # print(" ".join(js['natrual_language']))
74
+ # print(",".join(js['TS_V_token']))
75
+ # print(" ".join(js["ground_truth"]))
76
+ # print("###########################################")
77
+ examples.append(
78
+ Example(
79
+ idx = idx,
80
+ source=" ".join(js['natrual_language']),
81
+ ts_v = ",".join(js['TS_V_token']),
82
+ target = " ".join(js["ground_truth"][1:-1]),
83
+ )
84
+ )
85
+
86
+ return examples
87
+
88
+
89
+ class InputFeatures(object):
90
+ """A single training/test features for a example."""
91
+ def __init__(self,
92
+ example_id,
93
+ source_ids, source_mask,
94
+ target_ids, target_mask
95
+ ):
96
+ self.example_id = example_id
97
+ self.source_ids = source_ids
98
+ self.source_mask = source_mask
99
+ self.target_ids = target_ids
100
+ self.target_mask = target_mask
101
+
102
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
103
+ features = []
104
+ for example_index, example in enumerate(examples):
105
+ #source
106
+
107
+ source_ids = torch.LongTensor(tokenizer.encode(example.source + tokenizer.pad_token + example.ts_v,
108
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
109
+
110
+ source_mask = torch.ones_like(source_ids)
111
+ #target
112
+ if stage=="test":
113
+ target_tokens = tokenizer.tokenize("None")
114
+ else:
115
+ target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
116
+
117
+ target_ids = torch.LongTensor(tokenizer.encode(example.target,
118
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
119
+ target_mask = torch.ones_like(target_ids)
120
+
121
+
122
+
123
+ features.append(
124
+ InputFeatures(
125
+ example_index,
126
+ source_ids, source_mask,
127
+ target_ids, target_mask
128
+ )
129
+ )
130
+ return features
131
+
132
+
133
+
134
+ def set_seed(seed=20240124):
135
+ random.seed(seed)
136
+ os.environ['PYTHONHASHSEED'] = str(seed)
137
+ np.random.seed(seed)
138
+ torch.manual_seed(seed)
139
+ torch.cuda.manual_seed(seed)
140
+ torch.backends.cudnn.deterministic = True
141
+
142
+ def main():
143
+ parser = argparse.ArgumentParser()
144
+
145
+ ## Required parameters
146
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
147
+ help="Path to pre-trained model: e.g. roberta-base" )
148
+ parser.add_argument("--load_model_path", default=None, type=str,
149
+ help="Path to trained model" )
150
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
151
+ help="The output directory where the model predictions and checkpoints will be written.")
152
+
153
+ ## Other parameters
154
+ parser.add_argument("--train_filename", default=None, type=str,
155
+ help="The train filename. Should contain the .jsonl files for this task.")
156
+ parser.add_argument("--dev_filename", default=None, type=str,
157
+ help="The dev filename. Should contain the .jsonl files for this task.")
158
+ parser.add_argument("--test_filename", default=None, type=str,
159
+ help="The test filename. Should contain the .jsonl files for this task.")
160
+ parser.add_argument("--max_source_length", default=256, type=int,
161
+ help="The maximum total source sequence length after tokenization. Sequences longer "
162
+ "than this will be truncated, sequences shorter will be padded.")
163
+ parser.add_argument("--max_target_length", default=512, type=int,
164
+ help="The maximum total target sequence length after tokenization. Sequences longer "
165
+ "than this will be truncated, sequences shorter will be padded.")
166
+ parser.add_argument("--do_train", action='store_true',
167
+ help="Whether to run training.")
168
+ parser.add_argument("--do_eval", action='store_true',
169
+ help="Whether to run eval on the dev set.")
170
+ parser.add_argument("--do_test", action='store_true',
171
+ help="Whether to run eval on the test set.")
172
+ parser.add_argument("--no_cuda", action='store_true',
173
+ help="Avoid using CUDA when available")
174
+
175
+ parser.add_argument("--train_batch_size", default=8, type=int,
176
+ help="Batch size per GPU/CPU for training.")
177
+ parser.add_argument("--eval_batch_size", default=8, type=int,
178
+ help="Batch size per GPU/CPU for evaluation.")
179
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
180
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
181
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
182
+ help="The initial learning rate for Adam.")
183
+ parser.add_argument("--beam_size", default=10, type=int,
184
+ help="beam size for beam search")
185
+ parser.add_argument("--weight_decay", default=0.0, type=float,
186
+ help="Weight decay if we apply some.")
187
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
188
+ help="Epsilon for Adam optimizer.")
189
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
190
+ help="Max gradient norm.")
191
+ parser.add_argument("--num_train_epochs", default=3, type=int,
192
+ help="Total number of training epochs to perform.")
193
+ parser.add_argument('--seed', type=int, default=20240124,
194
+ help="random seed for initialization")
195
+
196
+ # parse arguments
197
+ args = parser.parse_args()
198
+ # set log
199
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
200
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
201
+ # set device
202
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
203
+ args.n_gpu = torch.cuda.device_count()
204
+ args.device = device
205
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
206
+
207
+ # Set seed
208
+ set_seed(args.seed)
209
+ # make dir if output_dir not exist
210
+ if not os.path.exists(args.output_dir):
211
+ os.makedirs(args.output_dir)
212
+
213
+ # build model
214
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
215
+ model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
216
+
217
+ logger.info("Training/evaluation parameters %s", args)
218
+ if args.load_model_path is not None:
219
+ logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
220
+ model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
221
+ model.to(args.device)
222
+
223
+ if args.n_gpu > 1:
224
+ # multi-gpu training
225
+ model = torch.nn.DataParallel(model)
226
+
227
+ if args.do_train:
228
+ # Prepare training data loader
229
+ train_examples = read_examples(args.train_filename)
230
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
231
+ all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
232
+ all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
233
+ all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
234
+ all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
235
+ train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
236
+ train_sampler = RandomSampler(train_data)
237
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
238
+
239
+ # Prepare optimizer and schedule (linear warmup and decay)
240
+ no_decay = ['bias', 'LayerNorm.weight']
241
+ optimizer_grouped_parameters = [
242
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
243
+ 'weight_decay': args.weight_decay},
244
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
245
+ ]
246
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
247
+ scheduler = get_linear_schedule_with_warmup(optimizer,
248
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
249
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
250
+
251
+ #Start training
252
+ logger.info("***** Running training *****")
253
+ logger.info(" Num examples = %d", len(train_examples))
254
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
255
+ logger.info(" Num epoch = %d", args.num_train_epochs)
256
+
257
+
258
+ model.train()
259
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
260
+ for epoch in range(args.num_train_epochs):
261
+ for idx,batch in enumerate(train_dataloader):
262
+ batch = tuple(t.to(device) for t in batch)
263
+ source_ids,source_mask,target_ids,target_mask = batch
264
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
265
+
266
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
267
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
268
+
269
+ if args.n_gpu > 1:
270
+ loss = loss.mean() # mean() to average on multi-gpu.
271
+ if args.gradient_accumulation_steps > 1:
272
+ loss = loss / args.gradient_accumulation_steps
273
+
274
+ losses.append(loss.item())
275
+ loss.backward()
276
+ if len(losses) % args.gradient_accumulation_steps == 0:
277
+ #Update parameters
278
+ optimizer.step()
279
+ optimizer.zero_grad()
280
+ scheduler.step()
281
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
282
+ logger.info("epoch {} step {} loss {}".format(epoch,
283
+ len(losses)//args.gradient_accumulation_steps,
284
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
285
+ if args.do_eval:
286
+ #Eval model with dev dataset
287
+ if 'dev_loss' in dev_dataset:
288
+ eval_examples,eval_data = dev_dataset['dev_loss']
289
+ else:
290
+ eval_examples = read_examples(args.dev_filename)
291
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
292
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
293
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
294
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
295
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
296
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
297
+ dev_dataset['dev_loss'] = eval_examples, eval_data
298
+ eval_sampler = SequentialSampler(eval_data)
299
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
300
+
301
+ logger.info("\n***** Running evaluation *****")
302
+ logger.info(" Num examples = %d", len(eval_examples))
303
+ logger.info(" Batch size = %d", args.eval_batch_size)
304
+
305
+ # Start evaluating the model
306
+ model.eval()
307
+ eval_loss,tokens_num = 0,0
308
+ for batch in eval_dataloader:
309
+ batch = tuple(t.to(device) for t in batch)
310
+ source_ids,source_mask,target_ids,target_mask = batch
311
+ with torch.no_grad():
312
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
313
+ labels=target_ids, decoder_attention_mask=target_mask).loss
314
+
315
+ if args.n_gpu > 1:
316
+ loss = loss.mean() # mean() to average on multi-gpu.
317
+
318
+ if args.gradient_accumulation_steps > 1:
319
+ loss = loss / args.gradient_accumulation_steps
320
+ eval_loss += loss.item()
321
+ tokens_num += 1
322
+ # Print loss of dev dataset
323
+ model.train()
324
+ eval_loss = eval_loss / tokens_num
325
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
326
+ for key in sorted(result.keys()):
327
+ logger.info(" %s = %s", key, str(result[key]))
328
+ logger.info(" "+"*"*20)
329
+
330
+ #Calculate bleu
331
+ if 'dev_bleu' in dev_dataset:
332
+ eval_examples,eval_data=dev_dataset['dev_bleu']
333
+ else:
334
+ eval_examples = read_examples(args.dev_filename)
335
+ # eval_examples = random.sample(eval_examples)
336
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
337
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
338
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
339
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
340
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
341
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
342
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
343
+
344
+ eval_sampler = SequentialSampler(eval_data)
345
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
346
+
347
+ model.eval()
348
+ p=[]
349
+ for batch in eval_dataloader:
350
+ batch = tuple(t.to(device) for t in batch)
351
+ source_ids,source_mask,target_ids,target_mask = batch
352
+ with torch.no_grad():
353
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
354
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
355
+
356
+ # convert ids to text
357
+ for pred in preds:
358
+ # print(pred)
359
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
360
+ p.append(text)
361
+
362
+ model.train()
363
+ predictions = []
364
+ res_list = []
365
+ EM = []
366
+ is_gened = False
367
+ with open(args.output_dir+"/dev.output",'w') as f, open(args.output_dir+"/dev.gold",'w') as f1:
368
+ for ref,gold in zip(p,eval_examples):
369
+ predictions.append(ref)
370
+ if len(ref) > 0:
371
+ is_gened = True
372
+ f.write(ref+'\n')
373
+ f1.write(gold.target+'\n')
374
+ EM.append(ref.split()==gold.target.split())
375
+ res_list.append([ref,gold.target])
376
+ if is_gened:
377
+ dev_bleu = _bleu(args.output_dir+"/dev.gold", args.output_dir+"/dev.output")
378
+ else:
379
+ dev_bleu = 0
380
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
381
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
382
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
383
+ logger.info(" "+"*"*20)
384
+ dev_score = (dev_bleu+round(np.mean(EM)*100,2))
385
+ if dev_score>best_score:
386
+ best_score=dev_score
387
+ # Save best checkpoint for best bleu
388
+ output_dir = args.output_dir
389
+ if not os.path.exists(output_dir):
390
+ os.makedirs(output_dir)
391
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
392
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
393
+ torch.save(model_to_save.state_dict(), output_model_file)
394
+ patience = 0
395
+ else:
396
+ patience += 1
397
+ if patience == 3:
398
+ break
399
+ output_dir = args.output_dir
400
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
401
+ for line in res_list:
402
+ dic = {}
403
+ dic["Pred"] = line[0]
404
+ dic["GT"] = line[1]
405
+ wf.write(json.dumps(dic))
406
+ wf.write("\n")
407
+
408
+ logger.info(" Best score:%s",best_score)
409
+ logger.info(" "+"*"*20)
410
+ if args.do_test:
411
+ res_list = []
412
+ if args.load_model_path is not None:
413
+ checkpoint_prefix = 'pytorch_model.bin'
414
+ output_dir = os.path.join(args.output_dir, checkpoint_prefix)
415
+ model_to_load = model.module if hasattr(model, 'module') else model
416
+ model_to_load.load_state_dict(torch.load(output_dir))
417
+
418
+
419
+
420
+
421
+ eval_examples = read_examples(args.test_filename)
422
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
423
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
424
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
425
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
426
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
427
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
428
+
429
+ # Calculate bleu
430
+ eval_sampler = SequentialSampler(eval_data)
431
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
432
+
433
+ model.eval()
434
+ p=[]
435
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
436
+ batch = tuple(t.to(device) for t in batch)
437
+ source_ids, source_mask, _, _ = batch
438
+ with torch.no_grad():
439
+ preds = (model.module if hasattr(model, 'module') else model).generate(source_ids, attention_mask=source_mask, use_cache=True,
440
+ num_beams=args.beam_size, max_new_tokens=args.max_target_length)
441
+ for pred in preds:
442
+ # print(pred)
443
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
444
+ p.append(text)
445
+
446
+ predictions=[]
447
+ EM = []
448
+ edit_dis = 0
449
+ cnt = 0
450
+ with open(args.output_dir+"/test.output",'w') as f, open(args.output_dir+"/test.gold",'w') as f1:
451
+ for ref,gold in zip(p,eval_examples):
452
+ res_list.append([ref,gold.target])
453
+ predictions.append(ref)
454
+ f.write(ref+'\n')
455
+ f1.write(gold.target+'\n')
456
+ EM.append(ref.split()==gold.target.split())
457
+ edit_dis += fuzz.ratio(ref, gold.target)
458
+ cnt += 1
459
+
460
+ dev_bleu = _bleu(args.output_dir+"/test.gold", args.output_dir+"/test.output")
461
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
462
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
463
+ logger.info(" %s = %s "%("Edit Distance",str(round(float(edit_dis)/cnt,2))))
464
+ logger.info(" "+"*"*20)
465
+
466
+
467
+ with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
468
+ for line in res_list:
469
+ dic = {}
470
+ dic["Pred"] = line[0]
471
+ dic["GT"] = line[1]
472
+ wf.write(json.dumps(dic))
473
+ wf.write("\n")
474
+
475
+ if __name__ == "__main__":
476
+ main()
477
+
478
+
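Throughout the script above, variable-length example tensors are batched with `pad_sequence` and wrapped in a `TensorDataset`. A minimal sketch of that pattern with toy values (`pad_id` stands in for `tokenizer.pad_token_id`):

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset

pad_id = 0  # placeholder for tokenizer.pad_token_id
source_ids = [torch.LongTensor([5, 6, 7]), torch.LongTensor([8, 9])]
source_mask = [torch.ones_like(t) for t in source_ids]

all_ids = pad_sequence(source_ids, batch_first=True, padding_value=pad_id)
all_mask = pad_sequence(source_mask, batch_first=True, padding_value=0)
data = TensorDataset(all_ids, all_mask)

print(all_ids.tolist())   # [[5, 6, 7], [8, 9, 0]] -- shorter rows are padded
print(all_mask.tolist())  # [[1, 1, 1], [1, 1, 0]] -- mask marks real tokens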
Script/Model/GraphCodeBert/code-completion/model.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch
7
+ from torch.autograd import Variable
8
+ import copy
9
+ class Seq2Seq(nn.Module):
10
+ """
11
+ Build a Sequence-to-Sequence model.
12
+
13
+ Parameters:
14
+
15
+ * `encoder`- encoder of seq2seq model. e.g. roberta
16
+ * `decoder`- decoder of seq2seq model. e.g. transformer
17
+ * `config`- configuration of encoder model.
18
+ * `beam_size`- beam size for beam search.
19
+ * `max_length`- max length of target for beam search.
20
+ * `sos_id`- start-of-sequence symbol id in target for beam search.
21
+ * `eos_id`- end-of-sequence symbol id in target for beam search.
22
+ """
23
+ def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24
+ super(Seq2Seq, self).__init__()
25
+ self.encoder = encoder
26
+ self.decoder=decoder
27
+ self.config=config
28
+ self.register_buffer(
29
+ "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30
+ )
31
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33
+ self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34
+ self.lsm = nn.LogSoftmax(dim=-1)
35
+
36
+ self.beam_size = beam_size
37
+ self.max_length = max_length
38
+ self.sos_id = sos_id
39
+ self.eos_id = eos_id
40
+
41
+ def forward(self, source_ids, target_ids=None):
42
+ if target_ids is None:
43
+ return self.generate(source_ids)
44
+
45
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
46
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47
+ ids = torch.cat((source_ids,target_ids),-1)
48
+ mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
49
+ mask = mask & ids[:,None,:].ne(1)
50
+
51
+ out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52
+ lm_logits = self.lm_head(out)
53
+ # Shift so that tokens < n predict n
54
+ active_loss = target_ids[..., 1:].ne(1).view(-1)
55
+ shift_logits = lm_logits[..., :-1, :].contiguous()
56
+ shift_labels = target_ids[..., 1:].contiguous()
57
+ # Flatten the tokens
58
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60
+ shift_labels.view(-1)[active_loss])
61
+
62
+ outputs = loss,loss*active_loss.sum(),active_loss.sum()
63
+ return outputs
64
+
65
+ def generate(self, source_ids):
66
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68
+ preds = []
69
+ zero = torch.cuda.LongTensor(1).fill_(0)
70
+ source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71
+ for i in range(source_ids.shape[0]):
72
+ context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73
+ for y in encoder_output.past_key_values]
74
+ beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75
+ input_ids = beam.getCurrentState()
76
+ context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77
+ for _ in range(self.max_length):
78
+ if beam.done():
79
+ break
80
+
81
+ ids = torch.cat((context_ids,input_ids),-1)
82
+ mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83
+ mask = mask & ids[:,None,:].ne(1)
84
+ out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85
+ hidden_states = out[:,-1,:]
86
+ out = self.lsm(self.lm_head(hidden_states)).data
87
+ beam.advance(out)
88
+ input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89
+ input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90
+ hyp = beam.getHyp(beam.getFinal())
91
+ pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92
+ pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93
+ preds.append(torch.cat(pred,0).unsqueeze(0))
94
+
95
+ preds = torch.cat(preds,0)
96
+
97
+ return preds
98
+
99
+
100
+
101
+ class Beam(object):
102
+ def __init__(self, size,sos,eos):
103
+ self.size = size
104
+ self.tt = torch.cuda
105
+ # The score for each translation on the beam.
106
+ self.scores = self.tt.FloatTensor(size).zero_()
107
+ # The backpointers at each time-step.
108
+ self.prevKs = []
109
+ # The outputs at each time-step.
110
+ self.nextYs = [self.tt.LongTensor(size)
111
+ .fill_(0)]
112
+ self.nextYs[0][0] = sos
113
+ # Has EOS topped the beam yet.
114
+ self._eos = eos
115
+ self.eosTop = False
116
+ # Time and k pair for finished.
117
+ self.finished = []
118
+
119
+ def getCurrentState(self):
120
+ "Get the outputs for the current timestep."
121
+ batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122
+ return batch
123
+
124
+ def getCurrentOrigin(self):
125
+ "Get the backpointers for the current timestep."
126
+ return self.prevKs[-1]
127
+
128
+ def advance(self, wordLk):
129
+ """
130
+ Given log-probs over words for every last beam `wordLk`,
131
+ compute and update the beam search state.
132
+
133
+ Parameters:
134
+
135
+ * `wordLk`- log-probs of advancing from the last step (K x words)
136
+
137
+
138
+ Updates the beam state in place; completion is checked via `done()`.
139
+ """
140
+ numWords = wordLk.size(1)
141
+
142
+ # Sum the previous scores.
143
+ if len(self.prevKs) > 0:
144
+ beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145
+
146
+ # Don't let EOS have children.
147
+ for i in range(self.nextYs[-1].size(0)):
148
+ if self.nextYs[-1][i] == self._eos:
149
+ beamLk[i] = -1e20
150
+ else:
151
+ beamLk = wordLk[0]
152
+ flatBeamLk = beamLk.view(-1)
153
+ bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154
+
155
+ self.scores = bestScores
156
+
157
+ # bestScoresId is flattened beam x word array, so calculate which
158
+ # word and beam each score came from
159
+ prevK = bestScoresId // numWords
160
+ self.prevKs.append(prevK)
161
+ self.nextYs.append((bestScoresId - prevK * numWords))
162
+
163
+
164
+ for i in range(self.nextYs[-1].size(0)):
165
+ if self.nextYs[-1][i] == self._eos:
166
+ s = self.scores[i]
167
+ self.finished.append((s, len(self.nextYs) - 1, i))
168
+
169
+ # End condition is when top-of-beam is EOS and no global score.
170
+ if self.nextYs[-1][0] == self._eos:
171
+ self.eosTop = True
172
+
173
+ def done(self):
174
+ return self.eosTop and len(self.finished) >=self.size
175
+
176
+ def getFinal(self):
177
+ if len(self.finished) == 0:
178
+ self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179
+ self.finished.sort(key=lambda a: -a[0])
180
+ if len(self.finished) != self.size:
181
+ unfinished=[]
182
+ for i in range(self.nextYs[-1].size(0)):
183
+ if self.nextYs[-1][i] != self._eos:
184
+ s = self.scores[i]
185
+ unfinished.append((s, len(self.nextYs) - 1, i))
186
+ unfinished.sort(key=lambda a: -a[0])
187
+ self.finished+=unfinished[:self.size-len(self.finished)]
188
+ return self.finished[:self.size]
189
+
190
+ def getHyp(self, beam_res):
191
+ """
192
+ Walk back to construct the full hypothesis.
193
+ """
194
+ hyps=[]
195
+ for _,timestep, k in beam_res:
196
+ hyp = []
197
+ for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198
+ hyp.append(self.nextYs[j+1][k])
199
+ k = self.prevKs[j][k]
200
+ hyps.append(hyp[::-1])
201
+ return hyps
202
+
203
+ def buildTargetTokens(self, preds):
204
+ sentence=[]
205
+ for pred in preds:
206
+ tokens = []
207
+ for tok in pred:
208
+ if tok==self._eos:
209
+ break
210
+ tokens.append(tok)
211
+ sentence.append(tokens)
212
+ return sentence
213
+
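The `bias` buffer registered in `Seq2Seq.__init__` above is a lower-triangular causal mask; both `forward` and `generate` slice it so each target position can attend to the full source plus earlier target tokens. A toy-sized sketch of that slicing:

import torch

bias = torch.tril(torch.ones((8, 8), dtype=torch.uint8)).view(1, 8, 8)
src_len, total_len = 3, 5  # e.g. 3 source tokens followed by 2 target tokens
mask = bias[:, src_len:total_len, :total_len].bool()
print(mask.int().tolist())
# [[[1, 1, 1, 1, 0],     first target token: sees the source plus itself
#   [1, 1, 1, 1, 1]]]    second target token: sees everything before it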
Script/Model/GraphCodeBert/code-completion/run_completion.py ADDED
@@ -0,0 +1,545 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ import pickle
26
+ import torch
27
+ import json
28
+ import random
29
+ import logging
30
+ import argparse
31
+ import numpy as np
32
+ from io import open
33
+ from itertools import cycle
34
+ import torch.nn as nn
35
+ from model import Seq2Seq
36
+ from tqdm import tqdm, trange
37
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
38
+ from torch.utils.data.distributed import DistributedSampler
39
+ from tqdm import tqdm
40
+ from fuzzywuzzy import fuzz
41
+ import re
42
+ import multiprocessing
43
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
44
+ RobertaConfig, RobertaModel, RobertaTokenizer)
45
+
46
+ divide_number = 2
47
+ cpu_cont = 16
48
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
49
+ datefmt = '%m/%d/%Y %H:%M:%S',
50
+ level = logging.INFO)
51
+ logger = logging.getLogger(__name__)
52
+
53
+
54
+
55
+
56
+ class Example(object):
57
+ """A single training/test example."""
58
+ def __init__(self,
59
+ idx,
60
+ source,
61
+ target,
62
+ max_src_len,
63
+ max_tar_len
64
+ ):
65
+ self.idx = idx
66
+ self.source = source
67
+ self.target = target
68
+ self.max_src_len = max_src_len
69
+ self.max_tar_len = max_tar_len
70
+
71
+ def read_examples(filename):
72
+ """Read examples from filename."""
73
+ examples=[]
74
+
75
+ with open(filename,encoding="utf-8") as f:
76
+ max_src_len = 0
77
+ max_tar_len = 0
78
+ for idx, line in enumerate(f):
79
+
80
+ js=json.loads(line)
81
+ inputs = " ".join(js["Template_token"][1:])
82
+ max_src_len = max(max_src_len, len(js["Template_token"]))
83
+
84
+ # print(inputs)
85
+ if "ground_truth" in js:
86
+ outputs = " ".join(js["ground_truth"])
87
+ max_tar_len = max(max_tar_len, len(js["ground_truth"]))
88
+ else:
89
+ outputs = inputs
90
+ if 'Idx' in js:
91
+ idx = js['Idx']
92
+ examples.append(
93
+ Example(
94
+ idx = idx,
95
+ source = inputs,
96
+ target = outputs,
97
+ max_src_len = max_src_len,
98
+ max_tar_len = max_tar_len
99
+ )
100
+ )
101
+ return examples
102
+
103
+
104
+ class InputFeatures(object):
105
+ """A single training/test features for a example."""
106
+ def __init__(self,
107
+ example_id,
108
+ source_ids,
109
+ target_ids,
110
+ ):
111
+ self.example_id = example_id
112
+ self.source_ids = source_ids
113
+ self.target_ids = target_ids
114
+
115
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
116
+ features = []
117
+ for example_index, example in enumerate(examples):
118
+ #source
119
+ source_tokens = tokenizer.tokenize(example.source)[:args.max_source_length-5]
120
+ source_tokens =[tokenizer.cls_token,tokenizer.sep_token]+source_tokens+["<mask>", tokenizer.sep_token]
121
+ source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
122
+ padding_length = args.max_source_length - len(source_ids)
123
+ source_ids+=[tokenizer.pad_token_id]*padding_length
124
+
125
+ #target
126
+ if stage=="test":
127
+ target_tokens = tokenizer.tokenize("None")
128
+ else:
129
+ target_tokens = ["<mask>"] + tokenizer.tokenize(example.target)[:args.max_target_length-2]
130
+ target_tokens = target_tokens+[tokenizer.sep_token]
131
+ target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
132
+ padding_length = args.max_target_length - len(target_ids)
133
+ target_ids+=[tokenizer.pad_token_id]*padding_length
134
+
135
+
136
+
137
+ features.append(
138
+ InputFeatures(
139
+ example_index,
140
+ source_ids,
141
+ target_ids,
142
+ )
143
+ )
144
+ return features
145
+
146
+
147
+
148
+ def set_seed(seed=20240124):
149
+ random.seed(seed)
150
+ os.environ['PYTHONHASHSEED'] = str(seed)
151
+ np.random.seed(seed)
152
+ torch.manual_seed(seed)
153
+ torch.cuda.manual_seed(seed)
154
+ torch.backends.cudnn.deterministic = True
155
+
156
+
157
+ def main():
158
+ parser = argparse.ArgumentParser()
159
+
160
+ ## Required parameters
161
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
162
+ help="Path to pre-trained model: e.g. roberta-base" )
163
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
164
+ help="The output directory where the model predictions and checkpoints will be written.")
165
+ parser.add_argument("--load_model_path", default=None, type=str,
166
+ help="Path to trained model: Should contain the .bin files" )
167
+ ## Other parameters
168
+ parser.add_argument("--task", default=None, type=str, required=True,
169
+ help="Task Type: statement_level, next_statement" )
170
+
171
+ parser.add_argument("--train_filename", default="../../Dataset/", type=str,
172
+ help="The train filename. Should contain the .jsonl files for this task.")
173
+ parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
174
+ help="The dev filename. Should contain the .jsonl files for this task.")
175
+ parser.add_argument("--test_filename", default="../../Dataset/", type=str,
176
+ help="The test filename. Should contain the .jsonl files for this task.")
177
+
178
+ parser.add_argument("--config_name", default="", type=str,
179
+ help="Pretrained config name or path if not the same as model_name")
180
+ parser.add_argument("--tokenizer_name", default="", type=str,
181
+ help="Pretrained tokenizer name or path if not the same as model_name")
182
+ # parser.add_argument("--max_source_length", default=64, type=int,
183
+ # help="The maximum total source sequence length after tokenization. Sequences longer "
184
+ # "than this will be truncated, sequences shorter will be padded.")
185
+ # parser.add_argument("--max_target_length", default=32, type=int,
186
+ # help="The maximum total target sequence length after tokenization. Sequences longer "
187
+ # "than this will be truncated, sequences shorter will be padded.")
188
+
189
+ parser.add_argument("--do_train", action='store_true',
190
+ help="Whether to run training.")
191
+ parser.add_argument("--do_eval", action='store_true',
192
+ help="Whether to run eval on the dev set.")
193
+ parser.add_argument("--do_test", action='store_true',
194
+ help="Whether to run eval on the dev set.")
195
+ parser.add_argument("--test_org", action='store_true',
196
+ help="Whether to run eval on org model.")
197
+ parser.add_argument("--do_lower_case", action='store_true',
198
+ help="Set this flag if you are using an uncased model.")
199
+ parser.add_argument("--no_cuda", action='store_true',
200
+ help="Avoid using CUDA when available")
201
+
202
+ parser.add_argument("--train_batch_size", default=8, type=int,
203
+ help="Batch size per GPU/CPU for training.")
204
+ parser.add_argument("--eval_batch_size", default=8, type=int,
205
+ help="Batch size per GPU/CPU for evaluation.")
206
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
207
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
208
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
209
+ help="The initial learning rate for Adam.")
210
+ parser.add_argument("--beam_size", default=10, type=int,
211
+ help="beam size for beam search")
212
+ parser.add_argument("--weight_decay", default=0.0, type=float,
213
+ help="Weight deay if we apply some.")
214
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
215
+ help="Epsilon for Adam optimizer.")
216
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
217
+ help="Max gradient norm.")
218
+ parser.add_argument("--num_train_epochs", default=3, type=int,
219
+ help="Total number of training epochs to perform.")
220
+ parser.add_argument("--max_steps", default=-1, type=int,
221
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
222
+ parser.add_argument("--eval_steps", default=-1, type=int,
223
+ help="")
224
+ parser.add_argument("--max_target_length", default=128, type=int,
225
+ help="")
226
+ parser.add_argument("--max_source_length", default=256, type=int,
227
+ help="")
228
+ parser.add_argument("--train_steps", default=-1, type=int,
229
+ help="")
230
+ parser.add_argument("--warmup_steps", default=0, type=int,
231
+ help="Linear warmup over warmup_steps.")
232
+ parser.add_argument("--local_rank", type=int, default=-1,
233
+ help="For distributed training: local_rank")
234
+ parser.add_argument('--seed', type=int, default=20240124,
235
+ help="random seed for initialization")
236
+ # print arguments
237
+ args = parser.parse_args()
238
+ # set log
239
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
240
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
241
+ # set device
242
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
243
+ args.n_gpu = torch.cuda.device_count()
244
+ args.device = device
245
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
246
+
247
+ # Set seed
248
+ set_seed(args.seed)
249
+
250
+ # make dir if output_dir not exist
251
+ if os.path.exists(args.output_dir) is False:
252
+ os.makedirs(args.output_dir)
253
+
254
+ # build model
255
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
256
+ config = RobertaConfig.from_pretrained(args.model_name_or_path)
257
+ # import!!!you must set is_decoder as True for generation
258
+ config.is_decoder = True
259
+ encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)
260
+
261
+ model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
262
+ beam_size=args.beam_size,max_length=args.max_target_length,
263
+ sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)
264
+
265
+ logger.info("Training/evaluation parameters %s", args)
266
+
267
+ if args.load_model_path is not None:
268
+ if args.task == "statement_level":
269
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
270
+ model.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
271
+ else:
272
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
273
+ model.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
274
+
275
+ model.to(args.device)
276
+
277
+ if args.n_gpu > 1:
278
+ # multi-gpu training
279
+ model = torch.nn.DataParallel(model)
280
+
281
+ if args.do_train:
282
+ # Prepare training data loader
283
+ if args.task == "statement_level":
284
+ train_examples = read_examples(args.train_filename + "/Code_Completion/statement_level/train.jsonl")
285
+ else:
286
+ train_examples = read_examples(args.train_filename + "/Code_Completion/next_statement/train.jsonl")
287
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
288
+ all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
289
+ all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
290
+ train_data = TensorDataset(all_source_ids,all_target_ids)
291
+ train_sampler = RandomSampler(train_data)
292
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
293
+
294
+
295
+ # Prepare optimizer and schedule (linear warmup and decay)
296
+ no_decay = ['bias', 'LayerNorm.weight']
297
+ optimizer_grouped_parameters = [
298
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
299
+ 'weight_decay': args.weight_decay},
300
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
301
+ ]
302
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
303
+ scheduler = get_linear_schedule_with_warmup(optimizer,
304
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
305
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
306
+
307
+ #Start training
308
+ logger.info("***** Running training *****")
309
+ logger.info(" Num examples = %d", len(train_examples))
310
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
311
+ logger.info(" Num epoch = %d", args.num_train_epochs)
312
+
313
+
314
+ model.train()
315
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
316
+ for epoch in range(args.num_train_epochs):
317
+ for idx,batch in enumerate(train_dataloader):
318
+ batch = tuple(t.to(device) for t in batch)
319
+ source_ids,target_ids = batch
320
+ loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
321
+
322
+ if args.n_gpu > 1:
323
+ loss = loss.mean() # mean() to average on multi-gpu.
324
+ if args.gradient_accumulation_steps > 1:
325
+ loss = loss / args.gradient_accumulation_steps
326
+
327
+ losses.append(loss.item())
328
+ loss.backward()
329
+ if len(losses) % args.gradient_accumulation_steps == 0:
330
+ #Update parameters
331
+ optimizer.step()
332
+ optimizer.zero_grad()
333
+ scheduler.step()
334
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
335
+ logger.info("epoch {} step {} loss {}".format(epoch,
336
+ len(losses)//args.gradient_accumulation_steps,
337
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
338
+ if args.do_eval:
339
+ #Eval model with dev dataset
340
+
341
+ if 'dev_loss' in dev_dataset:
342
+ eval_examples,eval_data = dev_dataset['dev_loss']
343
+ else:
344
+ if args.task == "statement_level":
345
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
346
+ else:
347
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
348
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
349
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
350
+ all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
351
+ eval_data = TensorDataset(all_source_ids,all_target_ids)
352
+ dev_dataset['dev_loss' ]= eval_examples,eval_data
353
+ eval_sampler = SequentialSampler(eval_data)
354
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
355
+ res_list = []
356
+ logger.info("\n***** Running evaluation *****")
357
+ logger.info(" Num examples = %d", len(eval_examples))
358
+ logger.info(" Batch size = %d", args.eval_batch_size)
359
+
360
+ #Start Evaling model
361
+ model.eval()
362
+ eval_loss,tokens_num = 0,0
363
+ for batch in eval_dataloader:
364
+ batch = tuple(t.to(device) for t in batch)
365
+ source_ids,target_ids = batch
366
+
367
+ with torch.no_grad():
368
+ _,loss,num = model(source_ids=source_ids,target_ids=target_ids)
369
+ eval_loss += loss.sum().item()
370
+ tokens_num += num.sum().item()
371
+ #Pring loss of dev dataset
372
+ model.train()
373
+ eval_loss = eval_loss / tokens_num
374
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
375
+ for key in sorted(result.keys()):
376
+ logger.info(" %s = %s", key, str(result[key]))
377
+ logger.info(" "+"*"*20)
378
+
379
+ #Calculate bleu
380
+ if 'dev_bleu' in dev_dataset:
381
+ eval_examples,eval_data=dev_dataset['dev_bleu']
382
+ else:
383
+ if args.task == "statement_level":
384
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
385
+ else:
386
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
387
+ # eval_examples = random.sample(eval_examples, int(len(eval_examples) / divide_number))
388
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
389
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
390
+ eval_data = TensorDataset(all_source_ids)
391
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
392
+
393
+ eval_sampler = SequentialSampler(eval_data)
394
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
395
+
396
+ model.eval()
397
+ p=[]
398
+ for batch in eval_dataloader:
399
+ batch = tuple(t.to(device) for t in batch)
400
+ source_ids = batch[0]
401
+ with torch.no_grad():
402
+ preds = model(source_ids)
403
+ # convert ids to text
404
+ for pred in preds:
405
+ t = pred[0].cpu().numpy()
406
+ t = list(t)
407
+ if 0 in t:
408
+ t = t[:t.index(0)]
409
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
410
+ p.append(text)
411
+ model.train()
412
+ EM = 0.0
413
+ edit_sim = 0.0
414
+ total = len(p)
415
+ token_accuracy = 0
416
+ for ref,gold in zip(p,eval_examples):
417
+ pred = ref.strip()
418
+ gt = gold.target
419
+ edit_sim += fuzz.ratio(pred, gt)
420
+ if pred.split() == gt.split():
421
+ EM += 1
422
+ res_list.append([pred,gt])
423
+ dev_acc = round(EM/total*100, 2)
424
+ # logger.info(" %s = %s "%("loss",round(np.mean(dev_losses),4)))
425
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
426
+ logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
427
+ logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/total, 2))))
428
+ logger.info(" "+"*"*20)
429
+
430
+ if dev_acc > best_score:
431
+ best_score = dev_acc
432
+ # Save best checkpoint for best bleu
433
+ if args.task == "statement_level":
434
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
435
+ else:
436
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
437
+ if not os.path.exists(output_dir):
438
+ os.makedirs(output_dir)
439
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
440
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
441
+ torch.save(model_to_save.state_dict(), output_model_file)
442
+ patience = 0
443
+ else:
444
+ patience += 1
445
+ if patience == 3:
446
+ break
447
+ logger.info(" Best score:%s",best_score)
448
+ logger.info(" "+"*"*20)
449
+
450
+ if args.task == "statement_level":
451
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
452
+ else:
453
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
454
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
455
+ for line in res_list:
456
+ dic = {}
457
+ dic["Pred"] = line[0]
458
+ dic["GT"] = line[1]
459
+ wf.write(json.dumps(dic))
460
+ wf.write("\n")
461
+
462
+ if args.do_test:
463
+ res_list = []
464
+ output_dir2 = ""
465
+
466
+ if args.load_model_path is not None:
467
+ model_to_load = model.module if hasattr(model, 'module') else model
468
+
469
+ if args.task == "statement_level":
470
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
471
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
472
+ else:
473
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
474
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
475
+
476
+
477
+ if args.task == "statement_level":
478
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/statement_level/test.jsonl')
479
+ else:
480
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/next_statement/test.jsonl')
481
+ eval_examples = read_examples(args.test_filename)
482
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
483
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
484
+ eval_data = TensorDataset(all_source_ids)
485
+
486
+ # Calculate bleu
487
+ eval_sampler = SequentialSampler(eval_data)
488
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
489
+
490
+ model.eval()
491
+ p=[]
492
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
493
+ batch = tuple(t.to(device) for t in batch)
494
+ source_ids = batch[0]
495
+ with torch.no_grad():
496
+ preds = model(source_ids)
497
+ # convert ids to text
498
+ for pred in preds:
499
+ t = pred[0].cpu().numpy()
500
+ t = list(t)
501
+ if 0 in t:
502
+ t = t[:t.index(0)]
503
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
504
+ p.append(text)
505
+ model.train()
506
+ avg_acc = 0.0
507
+ avg_EM = 0.0
508
+ total = 0
509
+ for ref,gold in zip(p,eval_examples):
510
+ pred = ref.strip() # post_process(ref.strip()).split(" ")
511
+ gt = gold.target.strip()
512
+ if pred == gt:
513
+ avg_EM += 1
514
+ avg_acc += fuzz.ratio(pred, gt)
515
+ res_list.append([pred, gt])
516
+ total += 1
517
+ dev_acc = round(avg_acc/total, 2)
518
+ dev_em = round(avg_EM/total, 4)
519
+ logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
520
+ logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
521
+ logger.info(" "+"*"*20)
522
+ if args.test_org:
523
+ output_dir = args.output_dir
524
+ else:
525
+ if args.task == "statement_level":
526
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
527
+ else:
528
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
529
+
530
+ with open(output_dir + "/test_result.jsonl", 'w') as wf:
531
+ for line in res_list:
532
+ dic = {}
533
+ dic["Pred"] = line[0]
534
+ dic["GT"] = line[1]
535
+ wf.write(json.dumps(dic))
536
+ wf.write("\n")
537
+
538
+
539
+
540
+
541
+ if __name__ == "__main__":
542
+ main()
543
+
544
+
545
+
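Reviewer note: the completion script above scores predictions with whitespace-token exact match plus fuzzywuzzy's edit ratio. A minimal, self-contained sketch of that metric pair (the helper name `completion_metrics` and the toy strings are ours, not part of the commit):

from fuzzywuzzy import fuzz

def completion_metrics(preds, golds):
    # Mirrors the script: EM compares whitespace-split tokens,
    # edit similarity is fuzz.ratio on the raw strings (0-100).
    em, edit_sim = 0.0, 0.0
    for pred, gt in zip(preds, golds):
        pred, gt = pred.strip(), gt.strip()
        if pred.split() == gt.split():
            em += 1
        edit_sim += fuzz.ratio(pred, gt)
    total = len(preds)
    return round(em / total * 100, 2), round(edit_sim / total, 2)

print(completion_metrics(["int x = 0 ;"], ["int x = 0 ;"]))  # (100.0, 100.0)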
Script/Model/GraphCodeBert/code-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
+ # Copyright 2017 Google Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ """Python implementation of BLEU and smooth-BLEU.
+
+ This module provides a Python implementation of BLEU and smooth-BLEU.
+ Smooth BLEU is computed following the method outlined in the paper:
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+ evaluation metrics for machine translation. COLING 2004.
+ """
+
+ import collections
+ import math
+
+
+ def _get_ngrams(segment, max_order):
+     """Extracts all n-grams up to a given maximum order from an input segment.
+
+     Args:
+         segment: text segment from which n-grams will be extracted.
+         max_order: maximum length in tokens of the n-grams returned by this
+             method.
+
+     Returns:
+         The Counter containing all n-grams up to max_order in segment
+         with a count of how many times each n-gram occurred.
+     """
+     ngram_counts = collections.Counter()
+     for order in range(1, max_order + 1):
+         for i in range(0, len(segment) - order + 1):
+             ngram = tuple(segment[i:i + order])
+             ngram_counts[ngram] += 1
+     return ngram_counts
+
+
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                  smooth=False):
+     """Computes BLEU score of translated segments against one or more references.
+
+     Args:
+         reference_corpus: list of lists of references for each translation. Each
+             reference should be tokenized into a list of tokens.
+         translation_corpus: list of translations to score. Each translation
+             should be tokenized into a list of tokens.
+         max_order: Maximum n-gram order to use when computing BLEU score.
+         smooth: Whether or not to apply Lin et al. 2004 smoothing.
+
+     Returns:
+         3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
+         precisions and brevity penalty.
+     """
+     matches_by_order = [0] * max_order
+     possible_matches_by_order = [0] * max_order
+     reference_length = 0
+     translation_length = 0
+     for (references, translation) in zip(reference_corpus,
+                                          translation_corpus):
+         reference_length += min(len(r) for r in references)
+         translation_length += len(translation)
+
+         merged_ref_ngram_counts = collections.Counter()
+         for reference in references:
+             merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
+         translation_ngram_counts = _get_ngrams(translation, max_order)
+         overlap = translation_ngram_counts & merged_ref_ngram_counts
+         for ngram in overlap:
+             matches_by_order[len(ngram) - 1] += overlap[ngram]
+         for order in range(1, max_order + 1):
+             possible_matches = len(translation) - order + 1
+             if possible_matches > 0:
+                 possible_matches_by_order[order - 1] += possible_matches
+
+     precisions = [0] * max_order
+     for i in range(0, max_order):
+         if smooth:
+             precisions[i] = ((matches_by_order[i] + 1.) /
+                              (possible_matches_by_order[i] + 1.))
+         else:
+             if possible_matches_by_order[i] > 0:
+                 precisions[i] = (float(matches_by_order[i]) /
+                                  possible_matches_by_order[i])
+             else:
+                 precisions[i] = 0.0
+
+     if min(precisions) > 0:
+         p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
+         geo_mean = math.exp(p_log_sum)
+     else:
+         geo_mean = 0
+
+     ratio = float(translation_length) / reference_length
+
+     if ratio > 1.0:
+         bp = 1.
+     else:
+         bp = math.exp(1 - 1. / ratio)
+
+     bleu = geo_mean * bp
+
+     return (bleu, precisions, bp, ratio, translation_length, reference_length)
+
+
+ def _bleu(ref_file, trans_file, subword_option=None):
+     max_order = 4
+     smooth = True
+     ref_files = [ref_file]
+     reference_text = []
+     for reference_filename in ref_files:
+         with open(reference_filename) as fh:
+             reference_text.append(fh.readlines())
+     per_segment_references = []
+     for references in zip(*reference_text):
+         reference_list = []
+         for reference in references:
+             reference_list.append(reference.strip().split())
+         per_segment_references.append(reference_list)
+     translations = []
+     with open(trans_file) as fh:
+         for line in fh:
+             translations.append(line.strip().split())
+     bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
+     return round(100 * bleu_score, 2)
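Reviewer note: a short usage sketch for the module above; `compute_bleu` and `_bleu` are the real entry points, while the token lists and file name are invented for illustration. `compute_bleu` expects pre-tokenized corpora, with one list of references per translation:

from bleu import compute_bleu, _bleu

references = [[["the", "cat", "sat"]]]          # one reference list per segment
translations = [["the", "cat", "sat"]]
bleu, precisions, bp, ratio, trans_len, ref_len = compute_bleu(
    references, translations, max_order=4, smooth=True)
print(round(100 * bleu, 2))                     # 100.0 for an exact match

# _bleu wraps the same computation over files containing one
# whitespace-tokenized sentence per line, as the run scripts do:
# score = _bleu("test.gold", "test.output")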
Script/Model/GraphCodeBert/code-generation/model.py ADDED
@@ -0,0 +1,213 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT license.
+
+ import torch
+ import torch.nn as nn
+ from torch.autograd import Variable
+ import copy
+
+ class Seq2Seq(nn.Module):
+     """
+     Build Sequence-to-Sequence.
+
+     Parameters:
+
+     * `encoder`- encoder of seq2seq model. e.g. roberta
+     * `decoder`- decoder of seq2seq model. e.g. transformer
+     * `config`- configuration of encoder model.
+     * `beam_size`- beam size for beam search.
+     * `max_length`- max length of target for beam search.
+     * `sos_id`- start-of-sequence symbol id in target for beam search.
+     * `eos_id`- end-of-sequence symbol id in target for beam search.
+     """
+     def __init__(self, encoder, decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
+         super(Seq2Seq, self).__init__()
+         self.encoder = encoder
+         self.decoder = decoder
+         self.config = config
+         self.register_buffer(
+             "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1, 1024, 1024)
+         )
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
+         self.lsm = nn.LogSoftmax(dim=-1)
+
+         self.beam_size = beam_size
+         self.max_length = max_length
+         self.sos_id = sos_id
+         self.eos_id = eos_id
+
+     def forward(self, source_ids, target_ids=None):
+         if target_ids is None:
+             return self.generate(source_ids)
+
+         mask = source_ids.ne(1)[:, None, :] * source_ids.ne(1)[:, :, None]
+         encoder_output = self.encoder(source_ids, attention_mask=mask, use_cache=True)
+         ids = torch.cat((source_ids, target_ids), -1)
+         mask = self.bias[:, source_ids.size(-1):ids.size(-1), :ids.size(-1)].bool()
+         mask = mask & ids[:, None, :].ne(1)
+
+         out = self.decoder(target_ids, attention_mask=mask, past_key_values=encoder_output.past_key_values).last_hidden_state
+         lm_logits = self.lm_head(out)
+         # Shift so that tokens < n predict n
+         active_loss = target_ids[..., 1:].ne(1).view(-1)
+         shift_logits = lm_logits[..., :-1, :].contiguous()
+         shift_labels = target_ids[..., 1:].contiguous()
+         # Flatten the tokens
+         loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
+         loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
+                         shift_labels.view(-1)[active_loss])
+
+         outputs = loss, loss * active_loss.sum(), active_loss.sum()
+         return outputs
+
+     def generate(self, source_ids):
+         mask = source_ids.ne(1)[:, None, :] * source_ids.ne(1)[:, :, None]
+         encoder_output = self.encoder(source_ids, attention_mask=mask, use_cache=True)
+         preds = []
+         zero = torch.cuda.LongTensor(1).fill_(0)
+         source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
+         for i in range(source_ids.shape[0]):
+             context = [[x[i:i + 1, :, :source_len[i]].repeat(self.beam_size, 1, 1, 1) for x in y]
+                        for y in encoder_output.past_key_values]
+             beam = Beam(self.beam_size, self.sos_id, self.eos_id)
+             input_ids = beam.getCurrentState()
+             context_ids = source_ids[i:i + 1, :source_len[i]].repeat(self.beam_size, 1)
+             for _ in range(self.max_length):
+                 if beam.done():
+                     break
+
+                 ids = torch.cat((context_ids, input_ids), -1)
+                 mask = self.bias[:, context_ids.size(-1):ids.size(-1), :ids.size(-1)].bool()
+                 mask = mask & ids[:, None, :].ne(1)
+                 out = self.decoder(input_ids, attention_mask=mask, past_key_values=context).last_hidden_state
+                 hidden_states = out[:, -1, :]
+                 out = self.lsm(self.lm_head(hidden_states)).data
+                 beam.advance(out)
+                 input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
+                 input_ids = torch.cat((input_ids, beam.getCurrentState()), -1)
+             hyp = beam.getHyp(beam.getFinal())
+             pred = beam.buildTargetTokens(hyp)[:self.beam_size]
+             pred = [torch.cat([x.view(-1) for x in p] + [zero] * (self.max_length - len(p))).view(1, -1) for p in pred]
+             preds.append(torch.cat(pred, 0).unsqueeze(0))
+
+         preds = torch.cat(preds, 0)
+
+         return preds
+
+
+ class Beam(object):
+     def __init__(self, size, sos, eos):
+         self.size = size
+         self.tt = torch.cuda
+         # The score for each translation on the beam.
+         self.scores = self.tt.FloatTensor(size).zero_()
+         # The backpointers at each time-step.
+         self.prevKs = []
+         # The outputs at each time-step.
+         self.nextYs = [self.tt.LongTensor(size)
+                        .fill_(0)]
+         self.nextYs[0][0] = sos
+         # Has EOS topped the beam yet.
+         self._eos = eos
+         self.eosTop = False
+         # Time and k pair for finished.
+         self.finished = []
+
+     def getCurrentState(self):
+         "Get the outputs for the current timestep."
+         batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
+         return batch
+
+     def getCurrentOrigin(self):
+         "Get the backpointers for the current timestep."
+         return self.prevKs[-1]
+
+     def advance(self, wordLk):
+         """
+         Given prob over words for every last beam `wordLk` and attention
+         `attnOut`: Compute and update the beam search.
+
+         Parameters:
+
+         * `wordLk`- probs of advancing from the last step (K x words)
+         * `attnOut`- attention at the last step
+
+         Returns: True if beam search is complete.
+         """
+         numWords = wordLk.size(1)
+
+         # Sum the previous scores.
+         if len(self.prevKs) > 0:
+             beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
+
+             # Don't let EOS have children.
+             for i in range(self.nextYs[-1].size(0)):
+                 if self.nextYs[-1][i] == self._eos:
+                     beamLk[i] = -1e20
+         else:
+             beamLk = wordLk[0]
+         flatBeamLk = beamLk.view(-1)
+         bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
+
+         self.scores = bestScores
+
+         # bestScoresId is flattened beam x word array, so calculate which
+         # word and beam each score came from
+         prevK = bestScoresId // numWords
+         self.prevKs.append(prevK)
+         self.nextYs.append((bestScoresId - prevK * numWords))
+
+         for i in range(self.nextYs[-1].size(0)):
+             if self.nextYs[-1][i] == self._eos:
+                 s = self.scores[i]
+                 self.finished.append((s, len(self.nextYs) - 1, i))
+
+         # End condition is when top-of-beam is EOS and no global score.
+         if self.nextYs[-1][0] == self._eos:
+             self.eosTop = True
+
+     def done(self):
+         return self.eosTop and len(self.finished) >= self.size
+
+     def getFinal(self):
+         if len(self.finished) == 0:
+             self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
+         self.finished.sort(key=lambda a: -a[0])
+         if len(self.finished) != self.size:
+             unfinished = []
+             for i in range(self.nextYs[-1].size(0)):
+                 if self.nextYs[-1][i] != self._eos:
+                     s = self.scores[i]
+                     unfinished.append((s, len(self.nextYs) - 1, i))
+             unfinished.sort(key=lambda a: -a[0])
+             self.finished += unfinished[:self.size - len(self.finished)]
+         return self.finished[:self.size]
+
+     def getHyp(self, beam_res):
+         """
+         Walk back to construct the full hypothesis.
+         """
+         hyps = []
+         for _, timestep, k in beam_res:
+             hyp = []
+             for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
+                 hyp.append(self.nextYs[j + 1][k])
+                 k = self.prevKs[j][k]
+             hyps.append(hyp[::-1])
+         return hyps
+
+     def buildTargetTokens(self, preds):
+         sentence = []
+         for pred in preds:
+             tokens = []
+             for tok in pred:
+                 if tok == self._eos:
+                     break
+                 tokens.append(tok)
+             sentence.append(tokens)
+         return sentence
+
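Reviewer note: the least obvious part of model.py is how forward() builds its attention masks, so here is a CPU-only sketch of just that logic (the toy ids are ours; pad_token_id is 1 for RoBERTa, and the tril buffer mirrors the `bias` registered above):

import torch

source_ids = torch.tensor([[0, 5, 6, 2, 1, 1]])   # toy batch, pad_token_id = 1
target_ids = torch.tensor([[0, 7, 2, 1]])

# Encoder side: bidirectional attention among non-pad source tokens.
src_mask = source_ids.ne(1)[:, None, :] * source_ids.ne(1)[:, :, None]
print(src_mask.shape)                             # torch.Size([1, 6, 6])

# Decoder side: causal (lower-triangular) mask over source+target,
# sliced to the target rows, then intersected with the pad mask.
bias = torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1, 1024, 1024)
ids = torch.cat((source_ids, target_ids), -1)
tgt_mask = bias[:, source_ids.size(-1):ids.size(-1), :ids.size(-1)].bool()
tgt_mask = tgt_mask & ids[:, None, :].ne(1)
print(tgt_mask.shape)                             # torch.Size([1, 4, 10])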
Script/Model/GraphCodeBert/code-generation/run_generation.py ADDED
@@ -0,0 +1,474 @@
+ # coding=utf-8
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+ using a masked language modeling (MLM) loss.
+ """
+
+ from __future__ import absolute_import
+ import os
+ import sys
+ from bleu import _bleu
+ import pickle
+ import torch
+ import json
+ import random
+ import logging
+ import argparse
+ import numpy as np
+ from io import open
+ from itertools import cycle
+ import torch.nn as nn
+ from model import Seq2Seq
+ from tqdm import tqdm, trange
+ from fuzzywuzzy import fuzz
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
+ from torch.utils.data.distributed import DistributedSampler
+
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
+                           RobertaConfig, RobertaModel, RobertaTokenizer)
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                     datefmt = '%m/%d/%Y %H:%M:%S',
+                     level = logging.INFO)
+ logger = logging.getLogger(__name__)
+ divide_number = 3
+
+
+ class Example(object):
+     """A single training/test example."""
+     def __init__(self,
+                  idx,
+                  source,
+                  ts_v,
+                  target,
+                  ):
+         self.idx = idx
+         self.source = source
+         self.ts_v = ts_v
+         self.target = target
+
+ def read_examples(filename):
+     """Read examples from filename."""
+     examples = []
+     with open(filename, encoding="utf-8") as f:
+         for idx, line in enumerate(f):
+
+             line = line.strip()
+             js = json.loads(line)
+
+             examples.append(
+                 Example(
+                     idx = idx,
+                     source = " ".join(js['natrual_language']),
+                     ts_v = ",".join(js['TS_V_token']),
+                     target = " ".join(js["ground_truth"][1:-1]),
+                 )
+             )
+
+     return examples
+
+
+ class InputFeatures(object):
+     """A single training/test features for an example."""
+     def __init__(self,
+                  example_id,
+                  source_ids,
+                  target_ids,
+                  ):
+         self.example_id = example_id
+         self.source_ids = source_ids
+         self.target_ids = target_ids
+
+ def convert_examples_to_features(examples, tokenizer, args, stage=None):
+     features = []
+     for example_index, example in enumerate(examples):
+         # source
+         source_tokens = tokenizer.tokenize(example.source)
+         ts_v_tokens = tokenizer.tokenize(example.ts_v)
+         source_tokens = [tokenizer.cls_token] + source_tokens + [tokenizer.sep_token] + ts_v_tokens + [tokenizer.sep_token]
+
+         source_ids = tokenizer.convert_tokens_to_ids(source_tokens[:args.max_source_length - 5])
+         padding_length = args.max_source_length - len(source_ids)
+         source_ids += [tokenizer.pad_token_id] * padding_length
+
+         # target
+         if stage == "test":
+             target_tokens = tokenizer.tokenize("None")
+         else:
+             target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length - 2]
+         target_tokens = [tokenizer.cls_token] + target_tokens + [tokenizer.sep_token]
+         target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
+         padding_length = args.max_target_length - len(target_ids)
+         target_ids += [tokenizer.pad_token_id] * padding_length
+
+         features.append(
+             InputFeatures(
+                 example_index,
+                 source_ids,
+                 target_ids,
+             )
+         )
+     return features
+
+
+ def set_seed(seed=20240124):
+     random.seed(seed)
+     os.environ['PYTHONHASHSEED'] = str(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.backends.cudnn.deterministic = True
+
+ def main():
+     parser = argparse.ArgumentParser()
+
+     ## Required parameters
+     parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                         help="Path to pre-trained model: e.g. roberta-base")
+     parser.add_argument("--load_model_path", default=None, type=str,
+                         help="Path to trained model")
+     parser.add_argument("--output_dir", default=None, type=str, required=True,
+                         help="The output directory where the model predictions and checkpoints will be written.")
+
+     ## Other parameters
+     parser.add_argument("--train_filename", default=None, type=str,
+                         help="The train filename. Should contain the .jsonl files for this task.")
+     parser.add_argument("--dev_filename", default=None, type=str,
+                         help="The dev filename. Should contain the .jsonl files for this task.")
+     parser.add_argument("--test_filename", default=None, type=str,
+                         help="The test filename. Should contain the .jsonl files for this task.")
+     parser.add_argument("--max_source_length", default=256, type=int,
+                         help="The maximum total source sequence length after tokenization. Sequences longer "
+                              "than this will be truncated, sequences shorter will be padded.")
+     parser.add_argument("--max_target_length", default=256, type=int,
+                         help="The maximum total target sequence length after tokenization. Sequences longer "
+                              "than this will be truncated, sequences shorter will be padded.")
+     parser.add_argument("--do_train", action='store_true',
+                         help="Whether to run training.")
+     parser.add_argument("--do_eval", action='store_true',
+                         help="Whether to run eval on the dev set.")
+     parser.add_argument("--do_test", action='store_true',
+                         help="Whether to run eval on the test set.")
+     parser.add_argument("--no_cuda", action='store_true',
+                         help="Avoid using CUDA when available")
+
+     parser.add_argument("--train_batch_size", default=8, type=int,
+                         help="Batch size per GPU/CPU for training.")
+     parser.add_argument("--eval_batch_size", default=8, type=int,
+                         help="Batch size per GPU/CPU for evaluation.")
+     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                         help="Number of updates steps to accumulate before performing a backward/update pass.")
+     parser.add_argument("--learning_rate", default=5e-5, type=float,
+                         help="The initial learning rate for Adam.")
+     parser.add_argument("--beam_size", default=10, type=int,
+                         help="beam size for beam search")
+     parser.add_argument("--weight_decay", default=0.0, type=float,
+                         help="Weight decay if we apply some.")
+     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                         help="Epsilon for Adam optimizer.")
+     parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                         help="Max gradient norm.")
+     parser.add_argument("--num_train_epochs", default=3, type=int,
+                         help="Total number of training epochs to perform.")
+     parser.add_argument('--seed', type=int, default=20240124,
+                         help="random seed for initialization")
+
+     # print arguments
+     args = parser.parse_args()
+     # set log
+     logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                         datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
+     # set device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     args.n_gpu = torch.cuda.device_count()
+     args.device = device
+     logger.info("device: %s, n_gpu: %s", device, args.n_gpu)
+
+     # Set seed
+     set_seed(args.seed)
+     # make dir if output_dir does not exist
+     if os.path.exists(args.output_dir) is False:
+         os.makedirs(args.output_dir)
+
+     # build model
+     tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
+     config = RobertaConfig.from_pretrained(args.model_name_or_path)
+     # important: you must set is_decoder to True for generation
+     config.is_decoder = True
+     encoder = RobertaModel.from_pretrained(args.model_name_or_path, config=config)
+
+     model = Seq2Seq(encoder=encoder, decoder=encoder, config=config,
+                     beam_size=args.beam_size, max_length=args.max_target_length,
+                     sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0], eos_id=tokenizer.sep_token_id)
+
+     logger.info("Training/evaluation parameters %s", args)
+     if args.load_model_path is not None:
+         logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
+         model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
+     model.to(args.device)
+
+     if args.n_gpu > 1:
+         # multi-gpu training
+         model = torch.nn.DataParallel(model)
+
+     if args.do_train:
+         # Prepare training data loader
+         train_examples = read_examples(args.train_filename)
+         train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
+         all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
+         all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
+         train_data = TensorDataset(all_source_ids, all_target_ids)
+         train_sampler = RandomSampler(train_data)
+         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
+
+         # Prepare optimizer and schedule (linear warmup and decay)
+         no_decay = ['bias', 'LayerNorm.weight']
+         optimizer_grouped_parameters = [
+             {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+              'weight_decay': args.weight_decay},
+             {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+         ]
+         optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+         scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                     num_warmup_steps=int(len(train_dataloader) * args.num_train_epochs * 0.1),
+                                                     num_training_steps=len(train_dataloader) * args.num_train_epochs)
+
+         # Start training
+         logger.info("***** Running training *****")
+         logger.info("  Num examples = %d", len(train_examples))
+         logger.info("  Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
+         logger.info("  Num epoch = %d", args.num_train_epochs)
+
+         model.train()
+         patience, best_score, losses, dev_dataset = 0, 0, [], {}
+         for epoch in range(args.num_train_epochs):
+             for idx, batch in enumerate(train_dataloader):
+                 batch = tuple(t.to(device) for t in batch)
+                 source_ids, target_ids = batch
+                 loss, _, _ = model(source_ids=source_ids, target_ids=target_ids)
+
+                 if args.n_gpu > 1:
+                     loss = loss.mean()  # mean() to average on multi-gpu.
+                 if args.gradient_accumulation_steps > 1:
+                     loss = loss / args.gradient_accumulation_steps
+
+                 losses.append(loss.item())
+                 loss.backward()
+                 if len(losses) % args.gradient_accumulation_steps == 0:
+                     # Update parameters
+                     optimizer.step()
+                     optimizer.zero_grad()
+                     scheduler.step()
+                     if len(losses) // args.gradient_accumulation_steps % 100 == 0:
+                         logger.info("epoch {} step {} loss {}".format(epoch,
+                                                                       len(losses) // args.gradient_accumulation_steps,
+                                                                       round(np.mean(losses[-100 * args.gradient_accumulation_steps:]), 4)))
+             if args.do_eval:
+                 # Eval model with dev dataset
+                 if 'dev_loss' in dev_dataset:
+                     eval_examples, eval_data = dev_dataset['dev_loss']
+                 else:
+                     eval_examples = read_examples(args.dev_filename)
+                     eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
+                     all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
+                     all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
+                     eval_data = TensorDataset(all_source_ids, all_target_ids)
+                     dev_dataset['dev_loss'] = eval_examples, eval_data
+                 eval_sampler = SequentialSampler(eval_data)
+                 eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+                 logger.info("\n***** Running evaluation *****")
+                 logger.info("  Num examples = %d", len(eval_examples))
+                 logger.info("  Batch size = %d", args.eval_batch_size)
+
+                 # Start evaluating the model
+                 model.eval()
+                 eval_loss, tokens_num = 0, 0
+                 for batch in eval_dataloader:
+                     batch = tuple(t.to(device) for t in batch)
+                     source_ids, target_ids = batch
+
+                     with torch.no_grad():
+                         _, loss, num = model(source_ids=source_ids, target_ids=target_ids)
+                     eval_loss += loss.sum().item()
+                     tokens_num += num.sum().item()
+                 # Print loss of dev dataset
+                 model.train()
+                 eval_loss = eval_loss / tokens_num
+                 result = {'eval_ppl': round(np.exp(eval_loss), 5)}
+                 for key in sorted(result.keys()):
+                     logger.info("  %s = %s", key, str(result[key]))
+                 logger.info("  " + "*" * 20)
+
+                 # Calculate bleu
+                 if 'dev_bleu' in dev_dataset:
+                     eval_examples, eval_data = dev_dataset['dev_bleu']
+                 else:
+                     eval_examples = read_examples(args.dev_filename)
+                     # eval_examples = random.sample(eval_examples)
+                     eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
+                     all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
+                     eval_data = TensorDataset(all_source_ids)
+                     dev_dataset['dev_bleu'] = eval_examples, eval_data
+
+                 eval_sampler = SequentialSampler(eval_data)
+                 eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+                 model.eval()
+                 p = []
+                 for batch in eval_dataloader:
+                     batch = tuple(t.to(device) for t in batch)
+                     source_ids = batch[0]
+                     with torch.no_grad():
+                         preds = model(source_ids=source_ids)
+                         # convert ids to text
+                         for pred in preds:
+                             t = pred[0].cpu().numpy()
+                             t = list(t)
+                             if 0 in t:
+                                 t = t[:t.index(0)]
+                             text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
+                             # print(text)
+                             p.append(text)
+
+                 model.train()
+                 predictions = []
+                 edit_dis = 0
+                 cnt_all = 0
+                 res_list = []
+                 EM = []
+                 is_gened = False
+                 with open(args.output_dir + "/dev.output", 'w') as f, open(args.output_dir + "/dev.gold", 'w') as f1:
+                     for ref, gold in zip(p, eval_examples):
+                         predictions.append(ref)
+                         if len(ref) > 0:
+                             is_gened = True
+                         f.write(ref + '\n')
+                         f1.write(gold.target + '\n')
+                         EM.append(ref.split() == gold.target.split())
+                         edit_dis += fuzz.ratio(ref, gold.target)
+                         res_list.append([ref, gold.target])
+                         cnt_all += 1
+
+                 if is_gened:
+                     dev_bleu = _bleu(args.output_dir + "/dev.gold", args.output_dir + "/dev.output")
+                 else:
+                     dev_bleu = 0
+                 avg_edit_dis = float(edit_dis) / cnt_all
+                 logger.info("  %s = %s " % ("Epoch", str(epoch)))
+                 logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
+                 logger.info("  %s = %s " % ("Edit Distance", str(round(float(edit_dis) / cnt_all, 2))))
+                 logger.info("  %s = %s " % ("EM", str(round(np.mean(EM) * 100, 2))))
+                 logger.info("  " + "*" * 20)
+                 dev_score = (dev_bleu + avg_edit_dis) / 2.0
+                 if dev_score > best_score:
+                     best_score = dev_score
+                     # Save best checkpoint for best bleu
+                     output_dir = args.output_dir
+                     if not os.path.exists(output_dir):
+                         os.makedirs(output_dir)
+                     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
+                     output_model_file = os.path.join(output_dir, "pytorch_model.bin")
+                     torch.save(model_to_save.state_dict(), output_model_file)
+                     patience = 0
+                 else:
+                     patience += 1
+                     if patience == 3:
+                         break
+         output_dir = args.output_dir
+         with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
+             for line in res_list:
+                 dic = {}
+                 dic["Pred"] = line[0]
+                 dic["GT"] = line[1]
+                 wf.write(json.dumps(dic))
+                 wf.write("\n")
+         logger.info("  Best score:%s", best_score)
+         logger.info("  " + "*" * 20)
+     if args.do_test:
+         res_list = []
+         if args.load_model_path is not None:
+             checkpoint_prefix = 'pytorch_model.bin'
+             output_dir = os.path.join(args.output_dir, checkpoint_prefix)
+             model_to_load = model.module if hasattr(model, 'module') else model
+             model_to_load.load_state_dict(torch.load(output_dir))
+
+         eval_examples = read_examples(args.test_filename)
+         eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
+         all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
+         eval_data = TensorDataset(all_source_ids)
+
+         # Calculate bleu
+         eval_sampler = SequentialSampler(eval_data)
+         eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+         model.eval()
+         p = []
+         for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
+             batch = tuple(t.to(device) for t in batch)
+             source_ids = batch[0]
+             with torch.no_grad():
+                 preds = model(source_ids)
+                 # convert ids to text
+                 for pred in preds:
+                     t = pred[0].cpu().numpy()
+                     t = list(t)
+                     if 0 in t:
+                         t = t[:t.index(0)]
+                     text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
+                     p.append(text)
+
+         predictions = []
+         EM = []
+         edit_dis = 0
+         cnt = 0
+         with open(args.output_dir + "/test.output", 'w') as f, open(args.output_dir + "/test.gold", 'w') as f1:
+             for ref, gold in zip(p, eval_examples):
+                 res_list.append([ref, gold.target])
+                 predictions.append(ref)
+                 f.write(ref + '\n')
+                 f1.write(gold.target + '\n')
+                 EM.append(ref.split() == gold.target.split())
+                 edit_dis += fuzz.ratio(ref, gold.target)
+                 cnt += 1
+
+         dev_bleu = _bleu(args.output_dir + "/test.gold", args.output_dir + "/test.output")
+         logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
+         logger.info("  %s = %s " % ("EM", str(round(np.mean(EM) * 100, 2))))
+         logger.info("  %s = %s " % ("Edit Distance", str(round(float(edit_dis) / cnt, 2))))
+         logger.info("  " + "*" * 20)
+
+         with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
+             for line in res_list:
+                 dic = {}
+                 dic["Pred"] = line[0]
+                 dic["GT"] = line[1]
+                 wf.write(json.dumps(dic))
+                 wf.write("\n")
+
+ if __name__ == "__main__":
+     main()
+
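Reviewer note: read_examples() above expects one JSON object per line with the keys 'natrual_language' (spelled that way in the dataset), 'TS_V_token', and 'ground_truth', each a token list. A sketch of a plausible record (the field values are invented; the script joins ground_truth[1:-1], which suggests the first and last tokens are boundary markers):

import json

record = {
    "natrual_language": ["Set", "the", "comment", "string"],   # NL description tokens
    "TS_V_token": ["GCC", "4.9.0"],                            # target/version tokens (assumed meaning)
    "ground_truth": ["<s>", "comment_string", "=", "\"#\"", "</s>"],
}
with open("toy.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")

# from run_generation import read_examples
# ex = read_examples("toy.jsonl")[0]
# With the record above, ex.target == 'comment_string = "#"' because the
# first and last ground-truth tokens are dropped before joining.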
Script/Model/NatGen/code-completion/run_completion.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ import pickle
26
+ import torch
27
+ import json
28
+ import random
29
+ import logging
30
+ import argparse
31
+ import numpy as np
32
+ from io import open
33
+ from itertools import cycle
34
+ import torch.nn as nn
35
+ from tqdm import tqdm, trange
36
+ from torch.nn.utils.rnn import pad_sequence
37
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
38
+ from torch.utils.data.distributed import DistributedSampler
39
+ from tqdm import tqdm
40
+ from fuzzywuzzy import fuzz
41
+ import re
42
+ import multiprocessing
43
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, T5ForConditionalGeneration, AutoTokenizer)
44
+
45
+ divide_number = 2
46
+ cpu_cont = 16
47
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
48
+ datefmt = '%m/%d/%Y %H:%M:%S',
49
+ level = logging.INFO)
50
+ logger = logging.getLogger(__name__)
51
+
52
+ #
53
+
54
+
55
+ class Example(object):
56
+ """A single training/test example."""
57
+ def __init__(self,
58
+ idx,
59
+ source,
60
+ target
61
+ ):
62
+ self.idx = idx
63
+ self.source = source
64
+ self.target = target
65
+
66
+ def read_examples(filename):
67
+ """Read examples from filename."""
68
+ examples=[]
69
+
70
+ with open(filename,encoding="utf-8") as f:
71
+ max_src_len = 0
72
+ max_tar_len = 0
73
+ for idx, line in enumerate(f):
74
+
75
+ js=json.loads(line)
76
+ inputs = " ".join(js["Template_token"][1:])
77
+
78
+ # print(inputs)
79
+ if "ground_truth" in js:
80
+ outputs = " ".join(js["ground_truth"])
81
+ else:
82
+ outputs = inputs
83
+ if 'Idx' in js:
84
+ idx = js['Idx']
85
+ examples.append(
86
+ Example(
87
+ idx = idx,
88
+ source = inputs,
89
+ target = outputs
90
+ )
91
+ )
92
+ return examples
93
+
94
+
95
+ class InputFeatures(object):
96
+ """A single training/test features for a example."""
97
+ def __init__(self,
98
+ example_id,
99
+ source_ids, source_mask,
100
+ target_ids, target_mask
101
+ ):
102
+ self.example_id = example_id
103
+ self.source_ids = source_ids
104
+ self.source_mask = source_mask
105
+ self.target_ids = target_ids
106
+ self.target_mask = target_mask
107
+
108
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
109
+ features = []
110
+ for example_index, example in enumerate(examples):
111
+ #source
112
+ source_ids = torch.LongTensor(tokenizer.encode(example.source,
113
+ add_special_tokens=True, max_length=args.max_source_length, truncation=True))
114
+
115
+ source_mask = torch.ones_like(source_ids)
116
+ #target
117
+ if stage=="test":
118
+ target = "None"
119
+ else:
120
+ target = example.target
121
+
122
+ target_ids = torch.LongTensor(tokenizer.encode(target,
123
+ add_special_tokens=True, max_length=args.max_target_length, truncation=True))
124
+ target_mask = torch.ones_like(target_ids)
125
+
126
+
127
+ features.append(
128
+ InputFeatures(
129
+ example_index,
130
+ source_ids, source_mask,
131
+ target_ids, target_mask
132
+ )
133
+ )
134
+ return features
135
+
136
+
137
+
138
+ def set_seed(seed=20240124):
139
+ random.seed(seed)
140
+ os.environ['PYHTONHASHSEED'] = str(seed)
141
+ np.random.seed(seed)
142
+ torch.manual_seed(seed)
143
+ torch.cuda.manual_seed(seed)
144
+ torch.backends.cudnn.deterministic = True
145
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model: e.g. roberta-base")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument("--load_model_path", default=None, type=str,
+                        help="Path to trained model: should contain the .bin files")
+    ## Other parameters
+    parser.add_argument("--task", default=None, type=str, required=True,
+                        help="Task type: statement_level, next_statement")
+
+    parser.add_argument("--train_filename", default="../../Dataset/", type=str,
+                        help="The train filename. Should contain the .jsonl files for this task.")
+    parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
+                        help="The dev filename. Should contain the .jsonl files for this task.")
+    parser.add_argument("--test_filename", default="../../Dataset/", type=str,
+                        help="The test filename. Should contain the .jsonl files for this task.")
+
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_test", action='store_true',
+                        help="Whether to run eval on the test set.")
+    parser.add_argument("--test_org", action='store_true',
+                        help="Whether to run eval on the original (not fine-tuned) model.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+
+    parser.add_argument("--train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of update steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--beam_size", default=10, type=int,
+                        help="Beam size for beam search.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3, type=int,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
+    parser.add_argument("--eval_steps", default=-1, type=int,
+                        help="Number of steps between evaluations.")
+    parser.add_argument("--max_target_length", default=128, type=int,
+                        help="The maximum total target sequence length after tokenization.")
+    parser.add_argument("--max_source_length", default=512, type=int,
+                        help="The maximum total source sequence length after tokenization.")
+    parser.add_argument("--train_steps", default=-1, type=int,
+                        help="Total number of training steps.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--seed', type=int, default=20240124,
+                        help="Random seed for initialization.")
+    args = parser.parse_args()
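
As a sketch of how these flags compose (the checkpoint path is a placeholder, not a real location):

    # illustrative only: equivalent to passing these flags on the command line
    args = parser.parse_args([
        "--model_name_or_path", "<path-to-natgen-checkpoint>",
        "--task", "statement_level",
        "--do_train", "--do_eval",
        "--output_dir", "./saved_models",
    ])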
+    # set log
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
+    # set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+    args.device = device
+    logger.info("device: %s, n_gpu: %s", device, args.n_gpu)
+
+    # Set seed
+    set_seed(args.seed)
+
+    # make dir if output_dir does not exist
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    # build model
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    if args.load_model_path is not None:
+        if args.task == "statement_level":
+            logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
+            model.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
+        else:
+            logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
+            model.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
+
+    model.to(args.device)
+
+    if args.n_gpu > 1:
+        # multi-gpu training
+        model = torch.nn.DataParallel(model)
+
+    if args.do_train:
+        # Prepare training data loader
+        if args.task == "statement_level":
+            train_examples = read_examples(args.train_filename + "/Code_Completion/statement_level/train.jsonl")
+        else:
+            train_examples = read_examples(args.train_filename + "/Code_Completion/next_statement/train.jsonl")
+        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
+        all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
+        all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
+        train_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+        train_sampler = RandomSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler,
+                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)
+
+        # Prepare optimizer and schedule (linear warmup and decay)
+        no_decay = ['bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+             'weight_decay': args.weight_decay},
+            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+             'weight_decay': 0.0}
+        ]
+        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+        scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                    num_warmup_steps=int(len(train_dataloader) * args.num_train_epochs * 0.1),
+                                                    num_training_steps=len(train_dataloader) * args.num_train_epochs)
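
The schedule built above warms up over the first 10% of all updates and then decays linearly to zero. A sketch of the multiplier that get_linear_schedule_with_warmup applies to learning_rate at update step t:

    def lr_scale(t, total_steps, warmup_frac=0.1):
        w = int(total_steps * warmup_frac)
        if t < w:                                            # linear warmup
            return t / max(1, w)
        return (total_steps - t) / max(1, total_steps - w)   # linear decay to 0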
+
+        # Start training
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
+        logger.info("  Num epoch = %d", args.num_train_epochs)
+
+        model.train()
+        patience, best_score, losses, dev_dataset = 0, 0, [], {}
+        res_list = []
+        for epoch in range(args.num_train_epochs):
+            for idx, batch in enumerate(train_dataloader):
+                batch = tuple(t.to(device) for t in batch)
+                source_ids, source_mask, target_ids, target_mask = batch
+                loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
+                             labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
+
+                if args.n_gpu > 1:
+                    loss = loss.mean()  # average on multi-gpu
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                losses.append(loss.item())
+                loss.backward()
+                if len(losses) % args.gradient_accumulation_steps == 0:
+                    # Update parameters
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    scheduler.step()
+                    if len(losses) // args.gradient_accumulation_steps % 100 == 0:
+                        logger.info("epoch {} step {} loss {}".format(
+                            epoch,
+                            len(losses) // args.gradient_accumulation_steps,
+                            round(np.mean(losses[-100 * args.gradient_accumulation_steps:]), 4)))
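
Note the batching convention: the DataLoader batch is train_batch_size // gradient_accumulation_steps and the optimizer steps once every gradient_accumulation_steps batches, so the number of examples per parameter update stays train_batch_size. A quick sanity check:

    train_batch_size, accum = 8, 4
    per_forward = train_batch_size // accum   # 2 examples per forward/backward
    per_update = per_forward * accum          # 8 examples per optimizer.step()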
+            if args.do_eval:
+                # Eval model with dev dataset
+                if 'dev_loss' in dev_dataset:
+                    eval_examples, eval_data = dev_dataset['dev_loss']
+                else:
+                    if args.task == "statement_level":
+                        eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
+                    else:
+                        eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
+                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
+                    all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+                    all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
+                    all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+                    all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
+                    eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+                    dev_dataset['dev_loss'] = eval_examples, eval_data
+                eval_sampler = SequentialSampler(eval_data)
+                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+                res_list = []
+                logger.info("\n***** Running evaluation *****")
+                logger.info("  Num examples = %d", len(eval_examples))
+                logger.info("  Batch size = %d", args.eval_batch_size)
+                p = []
+                # Start evaluating the model
+                model.eval()
+                eval_loss, batch_num = 0, 0
+                for batch in eval_dataloader:
+                    batch = tuple(t.to(device) for t in batch)
+                    source_ids, source_mask, target_ids, target_mask = batch
+                    with torch.no_grad():
+                        loss = model(input_ids=source_ids, attention_mask=source_mask,
+                                     labels=target_ids, decoder_attention_mask=target_mask).loss
+                        # unwrap DataParallel before generate() so this also works on a single GPU/CPU
+                        generator = model.module if hasattr(model, 'module') else model
+                        preds = generator.generate(source_ids, attention_mask=source_mask, use_cache=True,
+                                                   num_beams=args.beam_size, max_new_tokens=args.max_target_length)
+                        # convert ids to text
+                        for pred in preds:
+                            text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+                            p.append(text)
+                    if args.n_gpu > 1:
+                        loss = loss.mean()  # average on multi-gpu
+                    if args.gradient_accumulation_steps > 1:
+                        loss = loss / args.gradient_accumulation_steps
+                    eval_loss += loss.item()
+                    batch_num += 1
+
+                # Print loss of dev dataset
+                model.train()
+                eval_loss = eval_loss / batch_num
+                result = {'eval_ppl': round(np.exp(eval_loss), 5)}
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                logger.info("  " + "*" * 20)
+
+                EM = 0.0
+                edit_sim = 0.0
+                total = len(p)
+                for ref, gold in zip(p, eval_examples):
+                    pred = ref.strip()
+                    gt = gold.target
+                    edit_sim += fuzz.ratio(pred, gt)
+                    if pred.split() == gt.split():
+                        EM += 1
+                    res_list.append([pred, gt])
+                dev_acc = round(EM / total * 100, 2)
+                logger.info("  %s = %s " % ("Epoch", str(epoch)))
+                logger.info("  %s = %s " % ("EM Acc", str(dev_acc)))
+                logger.info("  %s = %s " % ("Edit Distance", str(round(edit_sim / total, 2))))
+                logger.info("  " + "*" * 20)
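
Two metrics are tracked on the dev set: exact match over whitespace tokens, and fuzzywuzzy's fuzz.ratio, a 0-100 Levenshtein-style similarity. For intuition (values approximate):

    from fuzzywuzzy import fuzz
    fuzz.ratio("x += 1 ;", "x += 2 ;")           # ~88: one character apart
    "x += 1 ;".split() == "x  +=  1 ;".split()   # True: EM ignores extra whitespace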
+
+                if dev_acc > best_score:
+                    best_score = dev_acc
+                    # Save best checkpoint for best EM accuracy
+                    if args.task == "statement_level":
+                        output_dir = os.path.join(args.output_dir, 'statement_level/')
+                    else:
+                        output_dir = os.path.join(args.output_dir, 'next_statement/')
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
+                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
+                    torch.save(model_to_save.state_dict(), output_model_file)
+                    patience = 0
+                else:
+                    patience += 1
+                    if patience == 3:
+                        break
+        logger.info("  Best score:%s", best_score)
+        logger.info("  " + "*" * 20)
+
+        if args.task == "statement_level":
+            output_dir = os.path.join(args.output_dir, 'statement_level/')
+        else:
+            output_dir = os.path.join(args.output_dir, 'next_statement/')
+        with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
+            for line in res_list:
+                dic = {}
+                dic["Pred"] = line[0]
+                dic["GT"] = line[1]
+                wf.write(json.dumps(dic))
+                wf.write("\n")
+
+    if args.do_test:
+        res_list = []
+
+        if args.load_model_path is not None:
+            model_to_load = model.module if hasattr(model, 'module') else model
+            if args.task == "statement_level":
+                logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
+                model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
+            else:
+                logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
+                model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
+
+        if args.task == "statement_level":
+            args.test_filename = os.path.join(args.test_filename, 'Code_Completion/statement_level/test.jsonl')
+        else:
+            args.test_filename = os.path.join(args.test_filename, 'Code_Completion/next_statement/test.jsonl')
+        eval_examples = read_examples(args.test_filename)
+        eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
+        all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
+        all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
+        eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+
+        # Run inference on the test set
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        model.eval()
+        p = []
+        for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
+            batch = tuple(t.to(device) for t in batch)
+            source_ids, source_mask, _, _ = batch
+            with torch.no_grad():
+                generator = model.module if hasattr(model, 'module') else model  # unwrap DataParallel
+                preds = generator.generate(source_ids, attention_mask=source_mask, use_cache=True,
+                                           num_beams=args.beam_size, max_new_tokens=args.max_target_length)
+                for pred in preds:
+                    text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+                    p.append(text)
+        model.train()
+        edit_sim = 0.0
+        EM = 0.0
+        total = len(p)
+        for ref, gold in zip(p, eval_examples):
+            pred = ref.strip()
+            gt = gold.target
+            edit_sim += fuzz.ratio(pred, gt)
+            if pred.split() == gt.split():
+                EM += 1
+            res_list.append([pred, gt])
+        dev_acc = round(edit_sim / total, 2)
+        dev_em = round(EM / total, 4)
+        logger.info("  %s = %s " % ("Test Token Avg Edit Distance", str(dev_acc)))
+        logger.info("  %s = %s " % ("Test Token Avg Exact Match Rate", str(dev_em)))
+        logger.info("  " + "*" * 20)
+        if args.test_org:
+            output_dir = args.output_dir
+        else:
+            if args.task == "statement_level":
+                output_dir = os.path.join(args.output_dir, 'statement_level/')
+            else:
+                output_dir = os.path.join(args.output_dir, 'next_statement/')
+
+        with open(output_dir + "/test_result.jsonl", 'w') as wf:
+            for line in res_list:
+                dic = {}
+                dic["Pred"] = line[0]
+                dic["GT"] = line[1]
+                wf.write(json.dumps(dic))
+                wf.write("\n")
+
+
+if __name__ == "__main__":
+    main()
+
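
Each line of the dumped test_result.jsonl / last_training_result.jsonl is one small JSON object, so downstream analysis can consume it directly; for example (path and values illustrative):

    import json
    with open("saved_models/statement_level/test_result.jsonl") as f:
        rows = [json.loads(line) for line in f]
    # rows[0] -> {"Pred": "x += 1 ;", "GT": "x += 1 ;"}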
Script/Model/NatGen/code-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Python implementation of BLEU and smooth-BLEU.
+
+This module provides a Python implementation of BLEU and smooth-BLEU.
+Smooth BLEU is computed following the method outlined in the paper:
+Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+evaluation metrics for machine translation. COLING 2004.
+"""
+
+import collections
+import math
+
+
+def _get_ngrams(segment, max_order):
+  """Extracts all n-grams up to a given maximum order from an input segment.
+
+  Args:
+    segment: text segment from which n-grams will be extracted.
+    max_order: maximum length in tokens of the n-grams returned by this
+        method.
+
+  Returns:
+    The Counter containing all n-grams up to max_order in segment
+    with a count of how many times each n-gram occurred.
+  """
+  ngram_counts = collections.Counter()
+  for order in range(1, max_order + 1):
+    for i in range(0, len(segment) - order + 1):
+      ngram = tuple(segment[i:i + order])
+      ngram_counts[ngram] += 1
+  return ngram_counts
+
+
+def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                 smooth=False):
+  """Computes BLEU score of translated segments against one or more references.
+
+  Args:
+    reference_corpus: list of lists of references for each translation. Each
+        reference should be tokenized into a list of tokens.
+    translation_corpus: list of translations to score. Each translation
+        should be tokenized into a list of tokens.
+    max_order: Maximum n-gram order to use when computing BLEU score.
+    smooth: Whether or not to apply Lin et al. 2004 smoothing.
+
+  Returns:
+    6-tuple with the BLEU score, n-gram precisions, brevity penalty,
+    length ratio, translation length and reference length.
+  """
+  matches_by_order = [0] * max_order
+  possible_matches_by_order = [0] * max_order
+  reference_length = 0
+  translation_length = 0
+  for (references, translation) in zip(reference_corpus,
+                                       translation_corpus):
+    reference_length += min(len(r) for r in references)
+    translation_length += len(translation)
+
+    merged_ref_ngram_counts = collections.Counter()
+    for reference in references:
+      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
+    translation_ngram_counts = _get_ngrams(translation, max_order)
+    overlap = translation_ngram_counts & merged_ref_ngram_counts
+    for ngram in overlap:
+      matches_by_order[len(ngram) - 1] += overlap[ngram]
+    for order in range(1, max_order + 1):
+      possible_matches = len(translation) - order + 1
+      if possible_matches > 0:
+        possible_matches_by_order[order - 1] += possible_matches
+
+  precisions = [0] * max_order
+  for i in range(0, max_order):
+    if smooth:
+      precisions[i] = ((matches_by_order[i] + 1.) /
+                       (possible_matches_by_order[i] + 1.))
+    else:
+      if possible_matches_by_order[i] > 0:
+        precisions[i] = (float(matches_by_order[i]) /
+                         possible_matches_by_order[i])
+      else:
+        precisions[i] = 0.0
+
+  if min(precisions) > 0:
+    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
+    geo_mean = math.exp(p_log_sum)
+  else:
+    geo_mean = 0
+
+  ratio = float(translation_length) / reference_length
+
+  if ratio > 1.0:
+    bp = 1.
+  else:
+    bp = math.exp(1 - 1. / ratio)
+
+  bleu = geo_mean * bp
+
+  return (bleu, precisions, bp, ratio, translation_length, reference_length)
+
+
+def _bleu(ref_file, trans_file, subword_option=None):
+  max_order = 4
+  smooth = True
+  ref_files = [ref_file]
+  reference_text = []
+  for reference_filename in ref_files:
+    with open(reference_filename) as fh:
+      reference_text.append(fh.readlines())
+  per_segment_references = []
+  for references in zip(*reference_text):
+    reference_list = []
+    for reference in references:
+      reference_list.append(reference.strip().split())
+    per_segment_references.append(reference_list)
+  translations = []
+  with open(trans_file) as fh:
+    for line in fh:
+      translations.append(line.strip().split())
+  bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
+  return round(100 * bleu_score, 2)
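
For a quick sanity check of compute_bleu on in-memory token lists (no files needed), using the signature defined above:

    refs = [[["the", "cat", "sat"]], [["a", "b", "c", "d"]]]   # one reference per hypothesis
    hyps = [["the", "cat", "sat"], ["a", "b", "c"]]
    bleu, precisions, bp, ratio, _, _ = compute_bleu(refs, hyps, max_order=4, smooth=True)
    # bleu is in [0, 1]; _bleu() above reports the same score scaled to 0-100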
Script/Model/NatGen/code-generation/run_generation.py ADDED
@@ -0,0 +1,477 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
+
+from __future__ import absolute_import
+import os
+import sys
+from bleu import _bleu
+import pickle
+import torch
+import json
+import random
+import logging
+import argparse
+import numpy as np
+from io import open
+from itertools import cycle
+import torch.nn as nn
+from tqdm import tqdm, trange
+from torch.nn.utils.rnn import pad_sequence
+from fuzzywuzzy import fuzz
+from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+
+from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
+                          T5ForConditionalGeneration, AutoTokenizer)
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
+divide_number = 3
+
+class Example(object):
+    """A single training/test example."""
+    def __init__(self,
+                 idx,
+                 source,
+                 ts_v,
+                 target,
+                 ):
+        self.idx = idx
+        self.source = source
+        self.ts_v = ts_v
+        self.target = target
+
+
+def read_examples(filename):
+    """Read examples from filename."""
+    examples = []
+    with open(filename, encoding="utf-8") as f:
+        for idx, line in enumerate(f):
+            line = line.strip()
+            js = json.loads(line)
+            examples.append(
+                Example(
+                    idx=idx,
+                    source=" ".join(js['natrual_language']),
+                    ts_v=",".join(js['TS_V_token']),
+                    target=" ".join(js["ground_truth"][1:-1]),
+                )
+            )
+    return examples
+
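
A minimal sketch of one record in the generation .jsonl, with only the fields the reader above touches (values are illustrative; the misspelled key 'natrual_language' is the dataset's actual field name and is kept verbatim; the [1:-1] slice strips the ground truth's boundary tokens):

    # one line of the generation dataset, as consumed by read_examples()
    {"natrual_language": ["set", "the", "port", "speed"],
     "TS_V_token": ["port", "speed"],
     "ground_truth": ["<s>", "set_speed", "(", "port", ")", ";", "</s>"]}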
+
+class InputFeatures(object):
+    """A single training/test features for an example."""
+    def __init__(self,
+                 example_id,
+                 source_ids, source_mask,
+                 target_ids, target_mask
+                 ):
+        self.example_id = example_id
+        self.source_ids = source_ids
+        self.source_mask = source_mask
+        self.target_ids = target_ids
+        self.target_mask = target_mask
+
+
+def convert_examples_to_features(examples, tokenizer, args, stage=None):
+    features = []
+    for example_index, example in enumerate(examples):
+        # source: natural-language description and target-state values, joined with the pad token as separator
+        source_ids = torch.LongTensor(tokenizer.encode(example.source + tokenizer.pad_token + example.ts_v,
+                                                       add_special_tokens=True, max_length=args.max_source_length, truncation=True))
+        source_mask = torch.ones_like(source_ids)
+        # target ("None" placeholder at test time, when no ground truth is available)
+        if stage == "test":
+            target = "None"
+        else:
+            target = example.target
+        target_ids = torch.LongTensor(tokenizer.encode(target,
+                                                       add_special_tokens=True, max_length=args.max_target_length, truncation=True))
+        target_mask = torch.ones_like(target_ids)
+
+        features.append(
+            InputFeatures(
+                example_index,
+                source_ids, source_mask,
+                target_ids, target_mask
+            )
+        )
+    return features
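
A sketch of the concatenated source string handed to tokenizer.encode() above, assuming a T5-style tokenizer whose pad_token is "<pad>" (the pad token acts purely as a separator between the description and the TS_V values):

    "set the port speed<pad>port,speed"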
+
+
+def set_seed(seed=20240124):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model: e.g. roberta-base")
+    parser.add_argument("--load_model_path", default=None, type=str,
+                        help="Path to trained model")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--train_filename", default=None, type=str,
+                        help="The train filename. Should contain the .jsonl files for this task.")
+    parser.add_argument("--dev_filename", default=None, type=str,
+                        help="The dev filename. Should contain the .jsonl files for this task.")
+    parser.add_argument("--test_filename", default=None, type=str,
+                        help="The test filename. Should contain the .jsonl files for this task.")
+    parser.add_argument("--max_source_length", default=256, type=int,
+                        help="The maximum total source sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--max_target_length", default=512, type=int,
+                        help="The maximum total target sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_test", action='store_true',
+                        help="Whether to run eval on the test set.")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+
+    parser.add_argument("--train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of update steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--beam_size", default=10, type=int,
+                        help="Beam size for beam search.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3, type=int,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument('--seed', type=int, default=20240124,
+                        help="Random seed for initialization.")
+
+    args = parser.parse_args()
+    # set log
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
+    # set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+    args.device = device
+    logger.info("device: %s, n_gpu: %s", device, args.n_gpu)
+
+    # Set seed
+    set_seed(args.seed)
+    # make dir if output_dir does not exist
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    # build model
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
+
+    logger.info("Training/evaluation parameters %s", args)
+    if args.load_model_path is not None:
+        logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
+        model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
+    model.to(args.device)
+
+    if args.n_gpu > 1:
+        # multi-gpu training
+        model = torch.nn.DataParallel(model)
+
+    if args.do_train:
+        # Prepare training data loader
+        train_examples = read_examples(args.train_filename)
+        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
+        all_source_ids = pad_sequence([f.source_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_source_mask = pad_sequence([f.source_mask for f in train_features], batch_first=True, padding_value=0)
+        all_target_ids = pad_sequence([f.target_ids for f in train_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_target_mask = pad_sequence([f.target_mask for f in train_features], batch_first=True, padding_value=0)
+        train_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+        train_sampler = RandomSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler,
+                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)
+
+        # Prepare optimizer and schedule (linear warmup and decay)
+        no_decay = ['bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+             'weight_decay': args.weight_decay},
+            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+             'weight_decay': 0.0}
+        ]
+        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+        scheduler = get_linear_schedule_with_warmup(optimizer,
+                                                    num_warmup_steps=int(len(train_dataloader) * args.num_train_epochs * 0.1),
+                                                    num_training_steps=len(train_dataloader) * args.num_train_epochs)
+
+
247
+ #Start training
248
+ logger.info("***** Running training *****")
249
+ logger.info(" Num examples = %d", len(train_examples))
250
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
251
+ logger.info(" Num epoch = %d", args.num_train_epochs)
252
+
253
+
254
+ model.train()
255
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
256
+ for epoch in range(args.num_train_epochs):
257
+ for idx,batch in enumerate(train_dataloader):
258
+ batch = tuple(t.to(device) for t in batch)
259
+ source_ids,source_mask,target_ids,target_mask = batch
260
+ # loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
261
+
262
+ loss = model(input_ids=source_ids, attention_mask=source_mask.gt(0),
263
+ labels=target_ids, decoder_attention_mask=target_mask.gt(0)).loss
264
+
265
+ if args.n_gpu > 1:
266
+ loss = loss.mean() # mean() to average on multi-gpu.
267
+ if args.gradient_accumulation_steps > 1:
268
+ loss = loss / args.gradient_accumulation_steps
269
+
270
+ losses.append(loss.item())
271
+ loss.backward()
272
+ if len(losses) % args.gradient_accumulation_steps == 0:
273
+ #Update parameters
274
+ optimizer.step()
275
+ optimizer.zero_grad()
276
+ scheduler.step()
277
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
278
+ logger.info("epoch {} step {} loss {}".format(epoch,
279
+ len(losses)//args.gradient_accumulation_steps,
280
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
281
+ if args.do_eval:
282
+ #Eval model with dev dataset
283
+ if 'dev_loss' in dev_dataset:
284
+ eval_examples,eval_data = dev_dataset['dev_loss']
285
+ else:
286
+ eval_examples = read_examples(args.dev_filename)
287
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
288
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
289
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
290
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
291
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
292
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
293
+ dev_dataset['dev_loss' ]= eval_examples,eval_data
294
+ eval_sampler = SequentialSampler(eval_data)
295
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
296
+
297
+ logger.info("\n***** Running evaluation *****")
298
+
299
+ logger.info(" Num examples = %d", len(eval_examples))
300
+ logger.info(" Batch size = %d", args.eval_batch_size)
301
+
302
+ #Start Evaling model
303
+ model.eval()
304
+ eval_loss,tokens_num = 0,0
305
+ for batch in eval_dataloader:
306
+ batch = tuple(t.to(device) for t in batch)
307
+ source_ids,source_mask,target_ids,target_mask = batch
308
+ with torch.no_grad():
309
+ loss = model(input_ids=source_ids, attention_mask=source_mask,
310
+ labels=target_ids, decoder_attention_mask=target_mask).loss
311
+
312
+ if args.n_gpu > 1:
313
+ loss = loss.mean() # mean() to average on multi-gpu.
314
+
315
+ if args.gradient_accumulation_steps > 1:
316
+ loss = loss / args.gradient_accumulation_steps
317
+ eval_loss += loss.item()
318
+ tokens_num += 1
319
+ #Pring loss of dev dataset
320
+ model.train()
321
+ eval_loss = eval_loss / tokens_num
322
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
323
+ for key in sorted(result.keys()):
324
+ logger.info(" %s = %s", key, str(result[key]))
325
+ logger.info(" "+"*"*20)
326
+
327
+ #Calculate bleu
328
+ if 'dev_bleu' in dev_dataset:
329
+ eval_examples,eval_data=dev_dataset['dev_bleu']
330
+ else:
331
+ eval_examples = read_examples(args.dev_filename)
332
+ # eval_examples = random.sample(eval_examples)
333
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
334
+ all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
335
+ all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
336
+ all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
337
+ all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
338
+ eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
339
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
340
+
341
+ eval_sampler = SequentialSampler(eval_data)
342
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
343
+
344
+ model.eval()
345
+ p=[]
346
+ for batch in eval_dataloader:
347
+ batch = tuple(t.to(device) for t in batch)
348
+ source_ids,source_mask,target_ids,target_mask = batch
349
+ with torch.no_grad():
350
+ preds = model.module.generate(source_ids, attention_mask=source_mask, use_cache=True,
351
+ num_beams=args.beam_size, max_new_tokens =args.max_target_length)
352
+
353
+ # convert ids to text
354
+ for pred in preds:
355
+ # print(pred)
356
+ text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
357
+ p.append(text)
358
+
359
+ model.train()
360
+ predictions = []
361
+ res_list = []
362
+ EM = []
363
+ is_gened = False
364
+ with open(args.output_dir+"/dev.output",'w') as f, open(args.output_dir+"/dev.gold",'w') as f1:
365
+ for ref,gold in zip(p,eval_examples):
366
+ predictions.append(ref)
367
+ if len(ref) > 0:
368
+ is_gened = True
369
+ f.write(ref+'\n')
370
+ f1.write(gold.target+'\n')
371
+ EM.append(ref.split()==gold.target.split())
372
+ res_list.append([ref,gold.target])
373
+ if is_gened:
374
+ dev_bleu = _bleu(args.output_dir+"/dev.gold", args.output_dir+"/dev.output")
375
+ else:
376
+ dev_bleu = 0
377
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
378
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
379
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
380
+ logger.info(" "+"*"*20)
381
+                dev_score = dev_bleu + round(np.mean(EM) * 100, 2)
+                if dev_score > best_score:
+                    best_score = dev_score
+                    # Save best checkpoint for best bleu
+                    output_dir = args.output_dir
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
+                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
+                    torch.save(model_to_save.state_dict(), output_model_file)
+                    patience = 0
+                else:
+                    patience += 1
+                    if patience == 3:
+                        break
+        output_dir = args.output_dir
+        with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
+            for line in res_list:
+                dic = {}
+                dic["Pred"] = line[0]
+                dic["GT"] = line[1]
+                wf.write(json.dumps(dic))
+                wf.write("\n")
+        logger.info("  Best score:%s", best_score)
+        logger.info("  " + "*" * 20)
+
+    if args.do_test:
+        res_list = []
+        if args.load_model_path is not None:
+            checkpoint_prefix = 'pytorch_model.bin'
+            output_dir = os.path.join(args.output_dir, checkpoint_prefix)
+            model_to_load = model.module if hasattr(model, 'module') else model
+            model_to_load.load_state_dict(torch.load(output_dir))
+
+        eval_examples = read_examples(args.test_filename)
+        eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
+        all_source_ids = pad_sequence([f.source_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_source_mask = pad_sequence([f.source_mask for f in eval_features], batch_first=True, padding_value=0)
+        all_target_ids = pad_sequence([f.target_ids for f in eval_features], batch_first=True, padding_value=tokenizer.pad_token_id)
+        all_target_mask = pad_sequence([f.target_mask for f in eval_features], batch_first=True, padding_value=0)
+        eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
+
+        # Calculate bleu
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        model.eval()
+        p = []
+        for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
+            batch = tuple(t.to(device) for t in batch)
+            source_ids, source_mask, _, _ = batch
+            with torch.no_grad():
+                generator = model.module if hasattr(model, 'module') else model  # unwrap DataParallel
+                preds = generator.generate(source_ids, attention_mask=source_mask, use_cache=True,
+                                           num_beams=args.beam_size, max_new_tokens=args.max_target_length)
+                for pred in preds:
+                    text = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+                    p.append(text)
+
+        predictions = []
+        EM = []
+        edit_dis = 0
+        cnt = 0
+        with open(args.output_dir + "/test.output", 'w') as f, open(args.output_dir + "/test.gold", 'w') as f1:
+            for ref, gold in zip(p, eval_examples):
+                res_list.append([ref, gold.target])
+                predictions.append(ref)
+                f.write(ref + '\n')
+                f1.write(gold.target + '\n')
+                EM.append(ref.split() == gold.target.split())
+                edit_dis += fuzz.ratio(ref, gold.target)
+                cnt += 1
+
+        dev_bleu = _bleu(args.output_dir + "/test.gold", args.output_dir + "/test.output")
+        logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
+        logger.info("  %s = %s " % ("EM", str(round(np.mean(EM) * 100, 2))))
+        logger.info("  %s = %s " % ("Edit Distance", str(round(float(edit_dis) / cnt, 2))))
+        logger.info("  " + "*" * 20)
+
+        with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
+            for line in res_list:
+                dic = {}
+                dic["Pred"] = line[0]
+                dic["GT"] = line[1]
+                wf.write(json.dumps(dic))
+                wf.write("\n")
+
+
+if __name__ == "__main__":
+    main()
+
Script/Model/UnixCoder/code-completion/model.py ADDED
@@ -0,0 +1,213 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import torch
+import torch.nn as nn
+
+
+class Seq2Seq(nn.Module):
+    """
+    Build a Sequence-to-Sequence model.
+
+    Parameters:
+
+    * `encoder`- encoder of seq2seq model. e.g. roberta
+    * `decoder`- decoder of seq2seq model. e.g. transformer
+    * `config`- configuration of encoder model.
+    * `beam_size`- beam size for beam search.
+    * `max_length`- max length of target for beam search.
+    * `sos_id`- start of symbol ids in target for beam search.
+    * `eos_id`- end of symbol ids in target for beam search.
+    """
+    def __init__(self, encoder, decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
+        super(Seq2Seq, self).__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.config = config
+        # lower-triangular causal mask, precomputed for up to 1024 positions
+        self.register_buffer(
+            "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1, 1024, 1024)
+        )
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight  # tie LM head to input embeddings
+        self.lsm = nn.LogSoftmax(dim=-1)
+
+        self.beam_size = beam_size
+        self.max_length = max_length
+        self.sos_id = sos_id
+        self.eos_id = eos_id
+
+    def forward(self, source_ids, target_ids=None):
+        if target_ids is None:
+            return self.generate(source_ids)
+
+        # token id 1 is the RoBERTa pad id, so ne(1) marks real tokens
+        mask = source_ids.ne(1)[:, None, :] * source_ids.ne(1)[:, :, None]
+        encoder_output = self.encoder(source_ids, attention_mask=mask, use_cache=True)
+        ids = torch.cat((source_ids, target_ids), -1)
+        mask = self.bias[:, source_ids.size(-1):ids.size(-1), :ids.size(-1)].bool()
+        mask = mask & ids[:, None, :].ne(1)
+
+        out = self.decoder(target_ids, attention_mask=mask, past_key_values=encoder_output.past_key_values).last_hidden_state
+        lm_logits = self.lm_head(out)
+        # Shift so that tokens < n predict n
+        active_loss = target_ids[..., 1:].ne(1).view(-1)
+        shift_logits = lm_logits[..., :-1, :].contiguous()
+        shift_labels = target_ids[..., 1:].contiguous()
+        # Flatten the tokens
+        loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
+                        shift_labels.view(-1)[active_loss])
+
+        outputs = loss, loss * active_loss.sum(), active_loss.sum()
+        return outputs
+
+    def generate(self, source_ids):
+        mask = source_ids.ne(1)[:, None, :] * source_ids.ne(1)[:, :, None]
+        encoder_output = self.encoder(source_ids, attention_mask=mask, use_cache=True)
+        preds = []
+        zero = torch.cuda.LongTensor(1).fill_(0)  # beam containers assume CUDA tensors
+        source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
+        for i in range(source_ids.shape[0]):
+            context = [[x[i:i + 1, :, :source_len[i]].repeat(self.beam_size, 1, 1, 1) for x in y]
+                       for y in encoder_output.past_key_values]
+            beam = Beam(self.beam_size, self.sos_id, self.eos_id)
+            input_ids = beam.getCurrentState()
+            context_ids = source_ids[i:i + 1, :source_len[i]].repeat(self.beam_size, 1)
+            for _ in range(self.max_length):
+                if beam.done():
+                    break
+
+                ids = torch.cat((context_ids, input_ids), -1)
+                mask = self.bias[:, context_ids.size(-1):ids.size(-1), :ids.size(-1)].bool()
+                mask = mask & ids[:, None, :].ne(1)
+                out = self.decoder(input_ids, attention_mask=mask, past_key_values=context).last_hidden_state
+                hidden_states = out[:, -1, :]
+                out = self.lsm(self.lm_head(hidden_states)).data
+                beam.advance(out)
+                input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
+                input_ids = torch.cat((input_ids, beam.getCurrentState()), -1)
+            hyp = beam.getHyp(beam.getFinal())
+            pred = beam.buildTargetTokens(hyp)[:self.beam_size]
+            pred = [torch.cat([x.view(-1) for x in p] + [zero] * (self.max_length - len(p))).view(1, -1) for p in pred]
+            preds.append(torch.cat(pred, 0).unsqueeze(0))
+
+        preds = torch.cat(preds, 0)
+
+        return preds
+
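
A quick illustration of the `bias` buffer registered above: torch.tril yields a lower-triangular matrix, so target position t may attend to positions <= t only (source positions are handled separately via the slice taken in forward/generate):

    import torch
    torch.tril(torch.ones((4, 4), dtype=torch.uint8))
    # tensor([[1, 0, 0, 0],
    #         [1, 1, 0, 0],
    #         [1, 1, 1, 0],
    #         [1, 1, 1, 1]], dtype=torch.uint8)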
+class Beam(object):
+    def __init__(self, size, sos, eos):
+        self.size = size
+        self.tt = torch.cuda
+        # The score for each translation on the beam.
+        self.scores = self.tt.FloatTensor(size).zero_()
+        # The backpointers at each time-step.
+        self.prevKs = []
+        # The outputs at each time-step.
+        self.nextYs = [self.tt.LongTensor(size).fill_(0)]
+        self.nextYs[0][0] = sos
+        # Has EOS topped the beam yet.
+        self._eos = eos
+        self.eosTop = False
+        # Time and k pair for finished.
+        self.finished = []
+
+    def getCurrentState(self):
+        "Get the outputs for the current timestep."
+        batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
+        return batch
+
+    def getCurrentOrigin(self):
+        "Get the backpointers for the current timestep."
+        return self.prevKs[-1]
+
+    def advance(self, wordLk):
+        """
+        Given log-probs over words for every last beam `wordLk`,
+        compute and update the beam search.
+
+        Parameters:
+
+        * `wordLk`- probs of advancing from the last step (K x words)
+
+        Returns: True if beam search is complete.
+        """
+        numWords = wordLk.size(1)
+
+        # Sum the previous scores.
+        if len(self.prevKs) > 0:
+            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
+
+            # Don't let EOS have children.
+            for i in range(self.nextYs[-1].size(0)):
+                if self.nextYs[-1][i] == self._eos:
+                    beamLk[i] = -1e20
+        else:
+            beamLk = wordLk[0]
+        flatBeamLk = beamLk.view(-1)
+        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
+
+        self.scores = bestScores
+
+        # bestScoresId is a flattened beam x word array, so calculate which
+        # word and beam each score came from
+        prevK = bestScoresId // numWords
+        self.prevKs.append(prevK)
+        self.nextYs.append((bestScoresId - prevK * numWords))
+
+        for i in range(self.nextYs[-1].size(0)):
+            if self.nextYs[-1][i] == self._eos:
+                s = self.scores[i]
+                self.finished.append((s, len(self.nextYs) - 1, i))
+
+        # End condition is when top-of-beam is EOS and no global score.
+        if self.nextYs[-1][0] == self._eos:
+            self.eosTop = True
+
+    def done(self):
+        return self.eosTop and len(self.finished) >= self.size
+
+    def getFinal(self):
+        if len(self.finished) == 0:
+            self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
+        self.finished.sort(key=lambda a: -a[0])
+        if len(self.finished) != self.size:
+            unfinished = []
+            for i in range(self.nextYs[-1].size(0)):
+                if self.nextYs[-1][i] != self._eos:
+                    s = self.scores[i]
+                    unfinished.append((s, len(self.nextYs) - 1, i))
+            unfinished.sort(key=lambda a: -a[0])
+            self.finished += unfinished[:self.size - len(self.finished)]
+        return self.finished[:self.size]
+
+    def getHyp(self, beam_res):
+        """
+        Walk back to construct the full hypothesis.
+        """
+        hyps = []
+        for _, timestep, k in beam_res:
+            hyp = []
+            for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
+                hyp.append(self.nextYs[j + 1][k])
+                k = self.prevKs[j][k]
+            hyps.append(hyp[::-1])
+        return hyps
+
+    def buildTargetTokens(self, preds):
+        sentence = []
+        for pred in preds:
+            tokens = []
+            for tok in pred:
+                if tok == self._eos:
+                    break
+                tokens.append(tok)
+            sentence.append(tokens)
+        return sentence
+
Script/Model/UnixCoder/code-completion/run_completion.py ADDED
@@ -0,0 +1,543 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
+
+from __future__ import absolute_import
+import os
+import sys
+import pickle
+import torch
+import json
+import random
+import logging
+import argparse
+import numpy as np
+from io import open
+from itertools import cycle
+import torch.nn as nn
+from model import Seq2Seq
+from tqdm import tqdm, trange
+from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from fuzzywuzzy import fuzz
+import re
+import multiprocessing
+from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
+                          RobertaConfig, RobertaModel, RobertaTokenizer)
+
+divide_number = 2
+cpu_cont = 16
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class Example(object):
+    """A single training/test example."""
+    def __init__(self,
+                 idx,
+                 source,
+                 target,
+                 max_src_len,
+                 max_tar_len
+                 ):
+        self.idx = idx
+        self.source = source
+        self.target = target
+        self.max_src_len = max_src_len
+        self.max_tar_len = max_tar_len
+
+
+def read_examples(filename):
+    """Read examples from filename."""
+    examples = []
+    with open(filename, encoding="utf-8") as f:
+        max_src_len = 0
+        max_tar_len = 0
+        for idx, line in enumerate(f):
+            js = json.loads(line)
+            inputs = " ".join(js["Template_token"][1:])
+            max_src_len = max(max_src_len, len(js["Template_token"]))
+
+            if "ground_truth" in js:
+                outputs = " ".join(js["ground_truth"])
+                max_tar_len = max(max_tar_len, len(js["ground_truth"]))
+            else:
+                outputs = inputs
+            if 'Idx' in js:
+                idx = js['Idx']
+            examples.append(
+                Example(
+                    idx=idx,
+                    source=inputs,
+                    target=outputs,
+                    max_src_len=max_src_len,
+                    max_tar_len=max_tar_len
+                )
+            )
+    return examples
+
+class InputFeatures(object):
+    """A single training/test features for an example."""
+    def __init__(self,
+                 example_id,
+                 source_ids,
+                 target_ids,
+                 ):
+        self.example_id = example_id
+        self.source_ids = source_ids
+        self.target_ids = target_ids
+
+
+def convert_examples_to_features(examples, tokenizer, args, stage=None):
+    features = []
+    for example_index, example in enumerate(examples):
+        # source: [CLS] <encoder-decoder> [SEP] <source tokens> <mask0> [SEP], padded to max_source_length
+        source_tokens = tokenizer.tokenize(example.source)[:args.max_source_length - 5]
+        source_tokens = [tokenizer.cls_token, "<encoder-decoder>", tokenizer.sep_token] + source_tokens + ["<mask0>", tokenizer.sep_token]
+        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
+        padding_length = args.max_source_length - len(source_ids)
+        source_ids += [tokenizer.pad_token_id] * padding_length
+
+        # target: <mask0> <target tokens> [SEP] ("None" placeholder at test time)
+        if stage == "test":
+            target_tokens = tokenizer.tokenize("None")
+        else:
+            target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length - 2]
+        target_tokens = ["<mask0>"] + target_tokens + [tokenizer.sep_token]
+        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
+        padding_length = args.max_target_length - len(target_ids)
+        target_ids += [tokenizer.pad_token_id] * padding_length
+
+        features.append(
+            InputFeatures(
+                example_index,
+                source_ids,
+                target_ids,
+            )
+        )
+    return features
+
147
+
148
+ def set_seed(seed=20240124):
149
+ random.seed(seed)
150
+ os.environ['PYTHONHASHSEED'] = str(seed)
151
+ np.random.seed(seed)
152
+ torch.manual_seed(seed)
153
+ torch.cuda.manual_seed(seed)
154
+ torch.backends.cudnn.deterministic = True
155
+
156
+
157
+ def main():
158
+ parser = argparse.ArgumentParser()
159
+
160
+ ## Required parameters
161
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
162
+ help="Path to pre-trained model: e.g. roberta-base" )
163
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
164
+ help="The output directory where the model predictions and checkpoints will be written.")
165
+ parser.add_argument("--load_model_path", default=None, type=str,
166
+ help="Path to trained model: Should contain the .bin files" )
167
+ ## Other parameters
168
+ parser.add_argument("--task", default=None, type=str, required=True,
169
+ help="Task Type: statement_level, next_statement" )
170
+
171
+ parser.add_argument("--train_filename", default="../../Dataset/", type=str,
172
+ help="The train filename. Should contain the .jsonl files for this task.")
173
+ parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
174
+ help="The dev filename. Should contain the .jsonl files for this task.")
175
+ parser.add_argument("--test_filename", default="../../Dataset/", type=str,
176
+ help="The test filename. Should contain the .jsonl files for this task.")
177
+
178
+ parser.add_argument("--config_name", default="", type=str,
179
+ help="Pretrained config name or path if not the same as model_name")
180
+ parser.add_argument("--tokenizer_name", default="", type=str,
181
+ help="Pretrained tokenizer name or path if not the same as model_name")
182
+ # parser.add_argument("--max_source_length", default=64, type=int,
183
+ # help="The maximum total source sequence length after tokenization. Sequences longer "
184
+ # "than this will be truncated, sequences shorter will be padded.")
185
+ # parser.add_argument("--max_target_length", default=32, type=int,
186
+ # help="The maximum total target sequence length after tokenization. Sequences longer "
187
+ # "than this will be truncated, sequences shorter will be padded.")
188
+
189
+ parser.add_argument("--do_train", action='store_true',
190
+ help="Whether to run training.")
191
+ parser.add_argument("--do_eval", action='store_true',
192
+ help="Whether to run eval on the dev set.")
193
+ parser.add_argument("--do_test", action='store_true',
194
+ help="Whether to run eval on the test set.")
195
+ parser.add_argument("--test_org", action='store_true',
196
+ help="Whether to run eval on the original (non-fine-tuned) model.")
197
+ parser.add_argument("--do_lower_case", action='store_true',
198
+ help="Set this flag if you are using an uncased model.")
199
+ parser.add_argument("--no_cuda", action='store_true',
200
+ help="Avoid using CUDA when available")
201
+
202
+ parser.add_argument("--train_batch_size", default=8, type=int,
203
+ help="Batch size per GPU/CPU for training.")
204
+ parser.add_argument("--eval_batch_size", default=8, type=int,
205
+ help="Batch size per GPU/CPU for evaluation.")
206
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
207
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
208
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
209
+ help="The initial learning rate for Adam.")
210
+ parser.add_argument("--beam_size", default=10, type=int,
211
+ help="beam size for beam search")
212
+ parser.add_argument("--weight_decay", default=0.0, type=float,
213
+ help="Weight decay if we apply some.")
214
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
215
+ help="Epsilon for Adam optimizer.")
216
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
217
+ help="Max gradient norm.")
218
+ parser.add_argument("--num_train_epochs", default=3, type=int,
219
+ help="Total number of training epochs to perform.")
220
+ parser.add_argument("--max_steps", default=-1, type=int,
221
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
222
+ parser.add_argument("--eval_steps", default=-1, type=int,
223
+ help="")
224
+ parser.add_argument("--max_target_length", default=128, type=int,
225
+ help="The maximum total target sequence length after tokenization.")
226
+ parser.add_argument("--max_source_length", default=512, type=int,
227
+ help="The maximum total source sequence length after tokenization.")
228
+ parser.add_argument("--train_steps", default=-1, type=int,
229
+ help="")
230
+ parser.add_argument("--warmup_steps", default=0, type=int,
231
+ help="Linear warmup over warmup_steps.")
232
+ parser.add_argument("--local_rank", type=int, default=-1,
233
+ help="For distributed training: local_rank")
234
+ parser.add_argument('--seed', type=int, default=20240124,
235
+ help="random seed for initialization")
236
+ # print arguments
237
+ args = parser.parse_args()
238
+ # set log
239
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
240
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
241
+ # set device
242
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
243
+ args.n_gpu = torch.cuda.device_count()
244
+ args.device = device
245
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
246
+
247
+ # Set seed
248
+ set_seed(args.seed)
249
+
250
+ # make dir if output_dir not exist
251
+ if os.path.exists(args.output_dir) is False:
252
+ os.makedirs(args.output_dir)
253
+
254
+ # build model
255
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
256
+ config = RobertaConfig.from_pretrained(args.model_name_or_path)
257
+ # important: is_decoder must be set to True for generation
258
+ config.is_decoder = True
259
+ encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)
260
+
261
+ model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
262
+ beam_size=args.beam_size,max_length=args.max_target_length,
263
+ sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)
264
+
265
+ logger.info("Training/evaluation parameters %s", args)
266
+
267
+ if args.load_model_path is not None:
268
+ if args.task == "statement_level":
269
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
270
+ model.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
271
+ else:
272
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
273
+ model.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
274
+
275
+ model.to(args.device)
276
+
277
+ if args.n_gpu > 1:
278
+ # multi-gpu training
279
+ model = torch.nn.DataParallel(model)
280
+
281
+ if args.do_train:
282
+ # Prepare training data loader
283
+ if args.task == "statement_level":
284
+ train_examples = read_examples(args.train_filename + "/Code_Completion/statement_level/train.jsonl")
285
+ else:
286
+ train_examples = read_examples(args.train_filename + "/Code_Completion/next_statement/train.jsonl")
287
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
288
+ all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
289
+ all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
290
+ train_data = TensorDataset(all_source_ids,all_target_ids)
291
+ train_sampler = RandomSampler(train_data)
292
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
293
+
294
+
295
+ # Prepare optimizer and schedule (linear warmup and decay)
296
+ no_decay = ['bias', 'LayerNorm.weight']
297
+ optimizer_grouped_parameters = [
298
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
299
+ 'weight_decay': args.weight_decay},
300
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
301
+ ]
302
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
303
+ scheduler = get_linear_schedule_with_warmup(optimizer,
304
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
305
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
306
+
307
+ #Start training
308
+ logger.info("***** Running training *****")
309
+ logger.info(" Num examples = %d", len(train_examples))
310
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
311
+ logger.info(" Num epoch = %d", args.num_train_epochs)
312
+
313
+
314
+ model.train()
315
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
316
+ for epoch in range(args.num_train_epochs):
317
+ for idx,batch in enumerate(train_dataloader):
318
+ batch = tuple(t.to(device) for t in batch)
319
+ source_ids,target_ids = batch
320
+ loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
321
+
322
+ if args.n_gpu > 1:
323
+ loss = loss.mean() # mean() to average on multi-gpu.
324
+ if args.gradient_accumulation_steps > 1:
325
+ loss = loss / args.gradient_accumulation_steps
326
+
327
+ losses.append(loss.item())
328
+ loss.backward()
329
+ if len(losses) % args.gradient_accumulation_steps == 0:
330
+ #Update parameters
331
+ optimizer.step()
332
+ optimizer.zero_grad()
333
+ scheduler.step()
334
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
335
+ logger.info("epoch {} step {} loss {}".format(epoch,
336
+ len(losses)//args.gradient_accumulation_steps,
337
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
338
+ if args.do_eval:
339
+ #Eval model with dev dataset
340
+
341
+ if 'dev_loss' in dev_dataset:
342
+ eval_examples,eval_data = dev_dataset['dev_loss']
343
+ else:
344
+ if args.task == "statement_level":
345
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
346
+ else:
347
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
348
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
349
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
350
+ all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
351
+ eval_data = TensorDataset(all_source_ids,all_target_ids)
352
+ dev_dataset['dev_loss' ]= eval_examples,eval_data
353
+ eval_sampler = SequentialSampler(eval_data)
354
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
355
+ res_list = []
356
+ logger.info("\n***** Running evaluation *****")
357
+ logger.info(" Num examples = %d", len(eval_examples))
358
+ logger.info(" Batch size = %d", args.eval_batch_size)
359
+
360
+ # Start evaluating model
361
+ model.eval()
362
+ eval_loss,tokens_num = 0,0
363
+ for batch in eval_dataloader:
364
+ batch = tuple(t.to(device) for t in batch)
365
+ source_ids,target_ids = batch
366
+
367
+ with torch.no_grad():
368
+ _,loss,num = model(source_ids=source_ids,target_ids=target_ids)
369
+ eval_loss += loss.sum().item()
370
+ tokens_num += num.sum().item()
371
+ # Print loss of dev dataset
372
+ model.train()
373
+ eval_loss = eval_loss / tokens_num
374
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
375
+ for key in sorted(result.keys()):
376
+ logger.info(" %s = %s", key, str(result[key]))
377
+ logger.info(" "+"*"*20)
378
+
379
+ #Calculate bleu
380
+ if 'dev_bleu' in dev_dataset:
381
+ eval_examples,eval_data=dev_dataset['dev_bleu']
382
+ else:
383
+ if args.task == "statement_level":
384
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/statement_level/valid.jsonl")
385
+ else:
386
+ eval_examples = read_examples(args.dev_filename + "/Code_Completion/next_statement/valid.jsonl")
387
+ # eval_examples = random.sample(eval_examples, int(len(eval_examples) / divide_number))
388
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
389
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
390
+ eval_data = TensorDataset(all_source_ids)
391
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
392
+
393
+ eval_sampler = SequentialSampler(eval_data)
394
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
395
+
396
+ model.eval()
397
+ p=[]
398
+ for batch in eval_dataloader:
399
+ batch = tuple(t.to(device) for t in batch)
400
+ source_ids = batch[0]
401
+ with torch.no_grad():
402
+ preds = model(source_ids)
403
+ # convert ids to text
404
+ for pred in preds:
405
+ t = pred[0].cpu().numpy()
406
+ t = list(t)
407
+ if 0 in t:
408
+ t = t[:t.index(0)]
409
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
410
+ p.append(text)
411
+ model.train()
412
+ EM = 0.0
413
+ edit_sim = 0.0
414
+ total = len(p)
415
+ token_accuracy = 0
416
+ for ref,gold in zip(p,eval_examples):
417
+ pred = ref.strip()
418
+ gt = gold.target
419
+ edit_sim += fuzz.ratio(pred, gt)
420
+ if pred.split() == gt.split():
421
+ EM += 1
422
+ res_list.append([pred,gt])
423
+ dev_acc = round(EM/total*100, 2)
424
+ # logger.info(" %s = %s "%("loss",round(np.mean(dev_losses),4)))
425
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
426
+ logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
427
+ logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/total, 2))))
428
+ logger.info(" "+"*"*20)
429
+
430
+ if dev_acc > best_score:
431
+ best_score = dev_acc
432
+ # Save best checkpoint for best bleu
433
+ if args.task == "statement_level":
434
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
435
+ else:
436
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
437
+ if not os.path.exists(output_dir):
438
+ os.makedirs(output_dir)
439
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
440
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
441
+ torch.save(model_to_save.state_dict(), output_model_file)
442
+ patience = 0
443
+ else:
444
+ patience += 1
445
+ if patience == 3:
446
+ break
447
+ logger.info(" Best score:%s",best_score)
448
+ logger.info(" "+"*"*20)
449
+
450
+ if args.task == "statement_level":
451
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
452
+ else:
453
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
454
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
455
+ for line in res_list:
456
+ dic = {}
457
+ dic["Pred"] = line[0]
458
+ dic["GT"] = line[1]
459
+ wf.write(json.dumps(dic))
460
+ wf.write("\n")
461
+
462
+ if args.do_test:
463
+ res_list = []
464
+ output_dir2 = ""
465
+
466
+ if args.load_model_path is not None:
467
+ model_to_load = model.module if hasattr(model, 'module') else model
468
+
469
+ if args.task == "statement_level":
470
+ logger.info("reload model from {}".format(args.load_model_path + "/statement_level/pytorch_model.bin"))
471
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/statement_level/pytorch_model.bin"))
472
+ else:
473
+ logger.info("reload model from {}".format(args.load_model_path + "/next_statement/pytorch_model.bin"))
474
+ model_to_load.load_state_dict(torch.load(args.load_model_path + "/next_statement/pytorch_model.bin"))
475
+
476
+
477
+ if args.task == "statement_level":
478
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/statement_level/test.jsonl')
479
+ else:
480
+ args.test_filename = os.path.join(args.test_filename, 'Code_Completion/next_statement/test.jsonl')
481
+ eval_examples = read_examples(args.test_filename)
482
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
483
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
484
+ eval_data = TensorDataset(all_source_ids)
485
+
486
+ # Calculate bleu
487
+ eval_sampler = SequentialSampler(eval_data)
488
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
489
+
490
+ model.eval()
491
+ p=[]
492
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
493
+ batch = tuple(t.to(device) for t in batch)
494
+ source_ids = batch[0]
495
+ with torch.no_grad():
496
+ preds = model(source_ids)
497
+ # convert ids to text
498
+ for pred in preds:
499
+ t = pred[0].cpu().numpy()
500
+ t = list(t)
501
+ if 0 in t:
502
+ t = t[:t.index(0)]
503
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
504
+ p.append(text)
505
+ model.train()
506
+ avg_acc = 0.0
507
+ avg_EM = 0.0
508
+ total = 0
509
+ for ref,gold in zip(p,eval_examples):
510
+ pred = ref.strip() # post_process(ref.strip()).split(" ")
511
+ gt = gold.target.strip()
512
+ if pred == gt:
513
+ avg_EM += 1
514
+ avg_acc += fuzz.ratio(pred, gt)
515
+ res_list.append([pred, gt])
516
+ total += 1
517
+ dev_acc = round(avg_acc/total, 2)
518
+ dev_em = round(avg_EM/total, 6)
519
+ logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
520
+ logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
521
+ logger.info(" "+"*"*20)
522
+ if args.test_org:
523
+ output_dir = args.output_dir
524
+ else:
525
+ if args.task == "statement_level":
526
+ output_dir = os.path.join(args.output_dir, 'statement_level/')
527
+ else:
528
+ output_dir = os.path.join(args.output_dir, 'next_statement/')
529
+
530
+ with open(output_dir + "/test_result.jsonl", 'w') as wf:
531
+ for line in res_list:
532
+ dic = {}
533
+ dic["Pred"] = line[0]
534
+ dic["GT"] = line[1]
535
+ wf.write(json.dumps(dic))
536
+ wf.write("\n")
537
+
538
+
539
+ if __name__ == "__main__":
540
+ main()
541
+
542
+
543
+
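
During `do_test`, the completion script above logs the average edit similarity and exact-match rate and dumps every prediction/ground-truth pair to `test_result.jsonl`. A minimal sketch for recomputing those two metrics offline from that file; the path is an assumption, point it at the task subfolder under your `--output_dir`:

import json
from fuzzywuzzy import fuzz

def rescore(path="statement_level/test_result.jsonl"):  # hypothetical path under --output_dir
    em, edit_sim, total = 0, 0.0, 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            pred, gt = rec["Pred"].strip(), rec["GT"].strip()
            em += int(pred == gt)             # same exact-match test as do_test above
            edit_sim += fuzz.ratio(pred, gt)  # same edit-similarity metric as do_test above
            total += 1
    return round(em / total, 6), round(edit_sim / total, 2)

print(rescore())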
Script/Model/UnixCoder/code-generation/bleu.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
+ def _get_ngrams(segment, max_order):
29
+ """Extracts all n-grams up to a given maximum order from an input segment.
30
+
31
+ Args:
32
+ segment: text segment from which n-grams will be extracted.
33
+ max_order: maximum length in tokens of the n-grams returned by this
34
+ method.
35
+
36
+ Returns:
37
+ The Counter containing all n-grams up to max_order in segment
38
+ with a count of how many times each n-gram occurred.
39
+ """
40
+ ngram_counts = collections.Counter()
41
+ for order in range(1, max_order + 1):
42
+ for i in range(0, len(segment) - order + 1):
43
+ ngram = tuple(segment[i:i+order])
44
+ ngram_counts[ngram] += 1
45
+ return ngram_counts
46
+
47
+
48
+ def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
+ smooth=False):
50
+ """Computes BLEU score of translated segments against one or more references.
51
+
52
+ Args:
53
+ reference_corpus: list of lists of references for each translation. Each
54
+ reference should be tokenized into a list of tokens.
55
+ translation_corpus: list of translations to score. Each translation
56
+ should be tokenized into a list of tokens.
57
+ max_order: Maximum n-gram order to use when computing BLEU score.
58
+ smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
+
60
+ Returns:
61
+ 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
62
+ precisions and brevity penalty.
63
+ """
64
+ matches_by_order = [0] * max_order
65
+ possible_matches_by_order = [0] * max_order
66
+ reference_length = 0
67
+ translation_length = 0
68
+ for (references, translation) in zip(reference_corpus,
69
+ translation_corpus):
70
+ reference_length += min(len(r) for r in references)
71
+ translation_length += len(translation)
72
+
73
+ merged_ref_ngram_counts = collections.Counter()
74
+ for reference in references:
75
+ merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
+ translation_ngram_counts = _get_ngrams(translation, max_order)
77
+ overlap = translation_ngram_counts & merged_ref_ngram_counts
78
+ for ngram in overlap:
79
+ matches_by_order[len(ngram)-1] += overlap[ngram]
80
+ for order in range(1, max_order+1):
81
+ possible_matches = len(translation) - order + 1
82
+ if possible_matches > 0:
83
+ possible_matches_by_order[order-1] += possible_matches
84
+
85
+ precisions = [0] * max_order
86
+ for i in range(0, max_order):
87
+ if smooth:
88
+ precisions[i] = ((matches_by_order[i] + 1.) /
89
+ (possible_matches_by_order[i] + 1.))
90
+ else:
91
+ if possible_matches_by_order[i] > 0:
92
+ precisions[i] = (float(matches_by_order[i]) /
93
+ possible_matches_by_order[i])
94
+ else:
95
+ precisions[i] = 0.0
96
+
97
+ if min(precisions) > 0:
98
+ p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
+ geo_mean = math.exp(p_log_sum)
100
+ else:
101
+ geo_mean = 0
102
+
103
+ ratio = float(translation_length) / reference_length
104
+
105
+ if ratio > 1.0:
106
+ bp = 1.
107
+ else:
108
+ bp = math.exp(1 - 1. / ratio)
109
+
110
+ bleu = geo_mean * bp
111
+
112
+ return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
+ def _bleu(ref_file, trans_file, subword_option=None):
116
+ max_order = 4
117
+ smooth = True
118
+ ref_files = [ref_file]
119
+ reference_text = []
120
+ for reference_filename in ref_files:
121
+ with open(reference_filename) as fh:
122
+ reference_text.append(fh.readlines())
123
+ per_segment_references = []
124
+ for references in zip(*reference_text):
125
+ reference_list = []
126
+ for reference in references:
127
+ reference_list.append(reference.strip().split())
128
+ per_segment_references.append(reference_list)
129
+ translations = []
130
+ with open(trans_file) as fh:
131
+ for line in fh:
132
+ translations.append(line.strip().split())
133
+ bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
+ return round(100 * bleu_score,2)
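
For reference, `compute_bleu` above operates on pre-tokenized corpora: one list of reference token lists per hypothesis. A minimal usage sketch with illustrative values, assuming bleu.py is importable from the working directory:

from bleu import compute_bleu

references = [[["the", "cat", "sat"]]]  # one list of references per translation
hypotheses = [["the", "cat", "sat"]]
bleu, precisions, bp, ratio, trans_len, ref_len = compute_bleu(
    references, hypotheses, max_order=4, smooth=True)
print(round(100 * bleu, 2))  # 100.0 for an exact match

`_bleu` wraps the same computation, with smoothing enabled, for the line-aligned *.gold/*.output files the training scripts write.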
Script/Model/UnixCoder/code-generation/model.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch
7
+ from torch.autograd import Variable
8
+ import copy
9
+ class Seq2Seq(nn.Module):
10
+ """
11
+ Build Sequence-to-Sequence.
12
+
13
+ Parameters:
14
+
15
+ * `encoder`- encoder of seq2seq model. e.g. roberta
16
+ * `decoder`- decoder of seq2seq model. e.g. transformer
17
+ * `config`- configuration of encoder model.
18
+ * `beam_size`- beam size for beam search.
19
+ * `max_length`- max length of target for beam search.
20
+ * `sos_id`- start-of-sequence symbol id in target for beam search.
21
+ * `eos_id`- end-of-sequence symbol id in target for beam search.
22
+ """
23
+ def __init__(self, encoder,decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
24
+ super(Seq2Seq, self).__init__()
25
+ self.encoder = encoder
26
+ self.decoder=decoder
27
+ self.config=config
28
+ self.register_buffer(
29
+ "bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)
30
+ )
31
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
32
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
33
+ self.lm_head.weight = self.encoder.embeddings.word_embeddings.weight
34
+ self.lsm = nn.LogSoftmax(dim=-1)
35
+
36
+ self.beam_size = beam_size
37
+ self.max_length = max_length
38
+ self.sos_id = sos_id
39
+ self.eos_id = eos_id
40
+
41
+ def forward(self, source_ids, target_ids=None):
42
+ if target_ids is None:
43
+ return self.generate(source_ids)
44
+
45
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
46
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
47
+ ids = torch.cat((source_ids,target_ids),-1)
48
+ mask = self.bias[:,source_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
49
+ mask = mask & ids[:,None,:].ne(1)
50
+
51
+ out = self.decoder(target_ids,attention_mask=mask,past_key_values=encoder_output.past_key_values).last_hidden_state
52
+ lm_logits = self.lm_head(out)
53
+ # Shift so that tokens < n predict n
54
+ active_loss = target_ids[..., 1:].ne(1).view(-1)
55
+ shift_logits = lm_logits[..., :-1, :].contiguous()
56
+ shift_labels = target_ids[..., 1:].contiguous()
57
+ # Flatten the tokens
58
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
59
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
60
+ shift_labels.view(-1)[active_loss])
61
+
62
+ outputs = loss,loss*active_loss.sum(),active_loss.sum()
63
+ return outputs
64
+
65
+ def generate(self, source_ids):
66
+ mask = source_ids.ne(1)[:,None,:]*source_ids.ne(1)[:,:,None]
67
+ encoder_output = self.encoder(source_ids,attention_mask=mask,use_cache=True)
68
+ preds = []
69
+ zero = torch.cuda.LongTensor(1).fill_(0)
70
+ source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
71
+ for i in range(source_ids.shape[0]):
72
+ context = [[x[i:i+1,:,:source_len[i]].repeat(self.beam_size,1,1,1) for x in y]
73
+ for y in encoder_output.past_key_values]
74
+ beam = Beam(self.beam_size,self.sos_id,self.eos_id)
75
+ input_ids = beam.getCurrentState()
76
+ context_ids = source_ids[i:i+1,:source_len[i]].repeat(self.beam_size,1)
77
+ for _ in range(self.max_length):
78
+ if beam.done():
79
+ break
80
+
81
+ ids = torch.cat((context_ids,input_ids),-1)
82
+ mask = self.bias[:,context_ids.size(-1):ids.size(-1),:ids.size(-1)].bool()
83
+ mask = mask & ids[:,None,:].ne(1)
84
+ out = self.decoder(input_ids,attention_mask=mask,past_key_values=context).last_hidden_state
85
+ hidden_states = out[:,-1,:]
86
+ out = self.lsm(self.lm_head(hidden_states)).data
87
+ beam.advance(out)
88
+ input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
89
+ input_ids = torch.cat((input_ids,beam.getCurrentState()),-1)
90
+ hyp = beam.getHyp(beam.getFinal())
91
+ pred = beam.buildTargetTokens(hyp)[:self.beam_size]
92
+ pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
93
+ preds.append(torch.cat(pred,0).unsqueeze(0))
94
+
95
+ preds = torch.cat(preds,0)
96
+
97
+ return preds
98
+
99
+
100
+
101
+ class Beam(object):
102
+ def __init__(self, size,sos,eos):
103
+ self.size = size
104
+ self.tt = torch.cuda
105
+ # The score for each translation on the beam.
106
+ self.scores = self.tt.FloatTensor(size).zero_()
107
+ # The backpointers at each time-step.
108
+ self.prevKs = []
109
+ # The outputs at each time-step.
110
+ self.nextYs = [self.tt.LongTensor(size)
111
+ .fill_(0)]
112
+ self.nextYs[0][0] = sos
113
+ # Has EOS topped the beam yet.
114
+ self._eos = eos
115
+ self.eosTop = False
116
+ # Time and k pair for finished.
117
+ self.finished = []
118
+
119
+ def getCurrentState(self):
120
+ "Get the outputs for the current timestep."
121
+ batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
122
+ return batch
123
+
124
+ def getCurrentOrigin(self):
125
+ "Get the backpointers for the current timestep."
126
+ return self.prevKs[-1]
127
+
128
+ def advance(self, wordLk):
129
+ """
130
+ Given prob over words for every last beam `wordLk` and attention
131
+ `attnOut`: Compute and update the beam search.
132
+
133
+ Parameters:
134
+
135
+ * `wordLk`- probs of advancing from the last step (K x words)
136
+ * `attnOut`- attention at the last step
137
+
138
+ Returns: True if beam search is complete.
139
+ """
140
+ numWords = wordLk.size(1)
141
+
142
+ # Sum the previous scores.
143
+ if len(self.prevKs) > 0:
144
+ beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
145
+
146
+ # Don't let EOS have children.
147
+ for i in range(self.nextYs[-1].size(0)):
148
+ if self.nextYs[-1][i] == self._eos:
149
+ beamLk[i] = -1e20
150
+ else:
151
+ beamLk = wordLk[0]
152
+ flatBeamLk = beamLk.view(-1)
153
+ bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
154
+
155
+ self.scores = bestScores
156
+
157
+ # bestScoresId is flattened beam x word array, so calculate which
158
+ # word and beam each score came from
159
+ prevK = bestScoresId // numWords
160
+ self.prevKs.append(prevK)
161
+ self.nextYs.append((bestScoresId - prevK * numWords))
162
+
163
+
164
+ for i in range(self.nextYs[-1].size(0)):
165
+ if self.nextYs[-1][i] == self._eos:
166
+ s = self.scores[i]
167
+ self.finished.append((s, len(self.nextYs) - 1, i))
168
+
169
+ # End condition is when top-of-beam is EOS and no global score.
170
+ if self.nextYs[-1][0] == self._eos:
171
+ self.eosTop = True
172
+
173
+ def done(self):
174
+ return self.eosTop and len(self.finished) >=self.size
175
+
176
+ def getFinal(self):
177
+ if len(self.finished) == 0:
178
+ self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
179
+ self.finished.sort(key=lambda a: -a[0])
180
+ if len(self.finished) != self.size:
181
+ unfinished=[]
182
+ for i in range(self.nextYs[-1].size(0)):
183
+ if self.nextYs[-1][i] != self._eos:
184
+ s = self.scores[i]
185
+ unfinished.append((s, len(self.nextYs) - 1, i))
186
+ unfinished.sort(key=lambda a: -a[0])
187
+ self.finished+=unfinished[:self.size-len(self.finished)]
188
+ return self.finished[:self.size]
189
+
190
+ def getHyp(self, beam_res):
191
+ """
192
+ Walk back to construct the full hypothesis.
193
+ """
194
+ hyps=[]
195
+ for _,timestep, k in beam_res:
196
+ hyp = []
197
+ for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
198
+ hyp.append(self.nextYs[j+1][k])
199
+ k = self.prevKs[j][k]
200
+ hyps.append(hyp[::-1])
201
+ return hyps
202
+
203
+ def buildTargetTokens(self, preds):
204
+ sentence=[]
205
+ for pred in preds:
206
+ tokens = []
207
+ for tok in pred:
208
+ if tok==self._eos:
209
+ break
210
+ tokens.append(tok)
211
+ sentence.append(tokens)
212
+ return sentence
213
+
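
The `bias` buffer registered in `Seq2Seq.__init__` is a 1024x1024 lower-triangular matrix; the slicing done in `forward` turns it into a causal mask that lets each target position attend to the whole source plus the already-generated target prefix. A toy-scale sketch of that slicing (the sizes are illustrative only):

import torch

source_len, target_len = 3, 2
bias = torch.tril(torch.ones((8, 8), dtype=torch.uint8)).view(1, 8, 8)
ids_len = source_len + target_len
# rows: the target positions; columns: every position they may attend to
mask = bias[:, source_len:ids_len, :ids_len].bool()
print(mask[0].int())
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]], dtype=torch.int32)

In the real model this slice is further intersected with the padding mask (`ids[:,None,:].ne(1)`), so padded positions are never attended to.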
Script/Model/UnixCoder/code-generation/run_generation.py ADDED
@@ -0,0 +1,467 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+ from __future__ import absolute_import
23
+ import os
24
+ import sys
25
+ from bleu import _bleu
26
+ import pickle
27
+ import torch
28
+ import json
29
+ import random
30
+ import logging
31
+ import argparse
32
+ import numpy as np
33
+ from io import open
34
+ from itertools import cycle
35
+ import torch.nn as nn
36
+ from model import Seq2Seq
37
+ from tqdm import tqdm, trange
38
+ from fuzzywuzzy import fuzz
39
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
40
+ from torch.utils.data.distributed import DistributedSampler
41
+
42
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
43
+ RobertaConfig, RobertaModel, RobertaTokenizer)
44
+ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
45
+ datefmt = '%m/%d/%Y %H:%M:%S',
46
+ level = logging.INFO)
47
+ logger = logging.getLogger(__name__)
48
+ divide_number = 3
49
+
50
+
51
+ class Example(object):
52
+ """A single training/test example."""
53
+ def __init__(self,
54
+ idx,
55
+ source,
56
+ ts_v,
57
+ target,
58
+ ):
59
+ self.idx = idx
60
+ self.source = source
61
+ self.ts_v = ts_v
62
+ self.target = target
63
+
64
+ def read_examples(filename):
65
+ """Read examples from filename."""
66
+ examples=[]
67
+ with open(filename,encoding="utf-8") as f:
68
+ for idx, line in enumerate(f):
69
+ line=line.strip()
70
+ js=json.loads(line)
71
+
72
+ examples.append(
73
+ Example(
74
+ idx = idx,
75
+ source=" ".join(js['natrual_language']),
76
+ ts_v = ",".join(js['TS_V_token']),
77
+ target = " ".join(js["ground_truth"][1:-1]),
78
+ )
79
+ )
80
+
81
+ return examples
82
+
83
+
84
+ class InputFeatures(object):
85
+ """A single set of training/test features for an example."""
86
+ def __init__(self,
87
+ example_id,
88
+ source_ids,
89
+ target_ids,
90
+ ):
91
+ self.example_id = example_id
92
+ self.source_ids = source_ids
93
+ self.target_ids = target_ids
94
+
95
+ def convert_examples_to_features(examples, tokenizer, args,stage=None):
96
+ features = []
97
+ for example_index, example in enumerate(examples):
98
+ #source
99
+ source_tokens = tokenizer.tokenize(example.source)
100
+ ts_v_tokens = tokenizer.tokenize(example.ts_v)
101
+ source_tokens =[tokenizer.cls_token,"<encoder-decoder>",tokenizer.sep_token]+source_tokens+[tokenizer.sep_token]+ts_v_tokens+["<mask0>",tokenizer.sep_token]
102
+
103
+ source_ids = tokenizer.convert_tokens_to_ids(source_tokens[:args.max_source_length-5])
104
+ padding_length = args.max_source_length - len(source_ids)
105
+ source_ids+=[tokenizer.pad_token_id]*padding_length
106
+
107
+ #target
108
+ if stage=="test":
109
+ target_tokens = tokenizer.tokenize("None")
110
+ else:
111
+ target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
112
+ target_tokens = ["<mask0>"]+target_tokens+[tokenizer.sep_token]
113
+ target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
114
+ padding_length = args.max_target_length - len(target_ids)
115
+ target_ids+=[tokenizer.pad_token_id]*padding_length
116
+
117
+
118
+
119
+ features.append(
120
+ InputFeatures(
121
+ example_index,
122
+ source_ids,
123
+ target_ids,
124
+ )
125
+ )
126
+ return features
127
+
128
+
129
+
130
+ def set_seed(seed=20240124):
131
+ random.seed(seed)
132
+ os.environ['PYTHONHASHSEED'] = str(seed)
133
+ np.random.seed(seed)
134
+ torch.manual_seed(seed)
135
+ torch.cuda.manual_seed(seed)
136
+ torch.backends.cudnn.deterministic = True
137
+
138
+ def main():
139
+ parser = argparse.ArgumentParser()
140
+
141
+ ## Required parameters
142
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
143
+ help="Path to pre-trained model: e.g. roberta-base" )
144
+ parser.add_argument("--load_model_path", default=None, type=str,
145
+ help="Path to trained model" )
146
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
147
+ help="The output directory where the model predictions and checkpoints will be written.")
148
+
149
+ ## Other parameters
150
+ parser.add_argument("--train_filename", default=None, type=str,
151
+ help="The train filename. Should contain the .jsonl files for this task.")
152
+ parser.add_argument("--dev_filename", default=None, type=str,
153
+ help="The dev filename. Should contain the .jsonl files for this task.")
154
+ parser.add_argument("--test_filename", default=None, type=str,
155
+ help="The test filename. Should contain the .jsonl files for this task.")
156
+ parser.add_argument("--max_source_length", default=256, type=int,
157
+ help="The maximum total source sequence length after tokenization. Sequences longer "
158
+ "than this will be truncated, sequences shorter will be padded.")
159
+ parser.add_argument("--max_target_length", default=512, type=int,
160
+ help="The maximum total target sequence length after tokenization. Sequences longer "
161
+ "than this will be truncated, sequences shorter will be padded.")
162
+ parser.add_argument("--do_train", action='store_true',
163
+ help="Whether to run training.")
164
+ parser.add_argument("--do_eval", action='store_true',
165
+ help="Whether to run eval on the dev set.")
166
+ parser.add_argument("--do_test", action='store_true',
167
+ help="Whether to run eval on the test set.")
168
+ parser.add_argument("--no_cuda", action='store_true',
169
+ help="Avoid using CUDA when available")
170
+
171
+ parser.add_argument("--train_batch_size", default=8, type=int,
172
+ help="Batch size per GPU/CPU for training.")
173
+ parser.add_argument("--eval_batch_size", default=8, type=int,
174
+ help="Batch size per GPU/CPU for evaluation.")
175
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
176
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
177
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
178
+ help="The initial learning rate for Adam.")
179
+ parser.add_argument("--beam_size", default=10, type=int,
180
+ help="beam size for beam search")
181
+ parser.add_argument("--weight_decay", default=0.0, type=float,
182
+ help="Weight decay if we apply some.")
183
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
184
+ help="Epsilon for Adam optimizer.")
185
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
186
+ help="Max gradient norm.")
187
+ parser.add_argument("--num_train_epochs", default=3, type=int,
188
+ help="Total number of training epochs to perform.")
189
+ parser.add_argument('--seed', type=int, default=20240124,
190
+ help="random seed for initialization")
191
+
192
+ # print arguments
193
+ args = parser.parse_args()
194
+ # set log
195
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
196
+ datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
197
+ # set device
198
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
199
+ args.n_gpu = torch.cuda.device_count()
200
+ args.device = device
201
+ logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
202
+
203
+ # Set seed
204
+ set_seed(args.seed)
205
+ # make dir if output_dir not exist
206
+ if os.path.exists(args.output_dir) is False:
207
+ os.makedirs(args.output_dir)
208
+
209
+ # build model
210
+ tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
211
+ config = RobertaConfig.from_pretrained(args.model_name_or_path)
212
+ # important: is_decoder must be set to True for generation
213
+ config.is_decoder = True
214
+ encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)
215
+
216
+ model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
217
+ beam_size=args.beam_size,max_length=args.max_target_length,
218
+ sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)
219
+
220
+ logger.info("Training/evaluation parameters %s", args)
221
+ if args.load_model_path is not None:
222
+ logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
223
+ model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
224
+ model.to(args.device)
225
+
226
+ if args.n_gpu > 1:
227
+ # multi-gpu training
228
+ model = torch.nn.DataParallel(model)
229
+
230
+ if args.do_train:
231
+ # Prepare training data loader
232
+ train_examples = read_examples(args.train_filename)
233
+ train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
234
+ all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
235
+ all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
236
+ train_data = TensorDataset(all_source_ids,all_target_ids)
237
+ train_sampler = RandomSampler(train_data)
238
+ train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
239
+
240
+
241
+ # Prepare optimizer and schedule (linear warmup and decay)
242
+ no_decay = ['bias', 'LayerNorm.weight']
243
+ optimizer_grouped_parameters = [
244
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
245
+ 'weight_decay': args.weight_decay},
246
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
247
+ ]
248
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
249
+ scheduler = get_linear_schedule_with_warmup(optimizer,
250
+ num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.1),
251
+ num_training_steps=len(train_dataloader)*args.num_train_epochs)
252
+
253
+ #Start training
254
+ logger.info("***** Running training *****")
255
+ logger.info(" Num examples = %d", len(train_examples))
256
+ logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
257
+ logger.info(" Num epoch = %d", args.num_train_epochs)
258
+
259
+
260
+ model.train()
261
+ patience, best_score, losses, dev_dataset = 0, 0, [], {}
262
+ for epoch in range(args.num_train_epochs):
263
+ for idx,batch in enumerate(train_dataloader):
264
+ batch = tuple(t.to(device) for t in batch)
265
+ source_ids,target_ids = batch
266
+ loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)
267
+
268
+ if args.n_gpu > 1:
269
+ loss = loss.mean() # mean() to average on multi-gpu.
270
+ if args.gradient_accumulation_steps > 1:
271
+ loss = loss / args.gradient_accumulation_steps
272
+
273
+ losses.append(loss.item())
274
+ loss.backward()
275
+ if len(losses) % args.gradient_accumulation_steps == 0:
276
+ #Update parameters
277
+ optimizer.step()
278
+ optimizer.zero_grad()
279
+ scheduler.step()
280
+ if len(losses) // args.gradient_accumulation_steps % 100 == 0:
281
+ logger.info("epoch {} step {} loss {}".format(epoch,
282
+ len(losses)//args.gradient_accumulation_steps,
283
+ round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))
284
+ if args.do_eval:
285
+ #Eval model with dev dataset
286
+ if 'dev_loss' in dev_dataset:
287
+ eval_examples,eval_data = dev_dataset['dev_loss']
288
+ else:
289
+ eval_examples = read_examples(args.dev_filename)
290
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
291
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
292
+ all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
293
+ eval_data = TensorDataset(all_source_ids,all_target_ids)
294
+ dev_dataset['dev_loss' ]= eval_examples,eval_data
295
+ eval_sampler = SequentialSampler(eval_data)
296
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
297
+
298
+ logger.info("\n***** Running evaluation *****")
299
+ logger.info(" Num examples = %d", len(eval_examples))
300
+ logger.info(" Batch size = %d", args.eval_batch_size)
301
+
302
+ #Start Evaling model
303
+ model.eval()
304
+ eval_loss,tokens_num = 0,0
305
+ for batch in eval_dataloader:
306
+ batch = tuple(t.to(device) for t in batch)
307
+ source_ids,target_ids = batch
308
+
309
+ with torch.no_grad():
310
+ _,loss,num = model(source_ids=source_ids,target_ids=target_ids)
311
+ eval_loss += loss.sum().item()
312
+ tokens_num += num.sum().item()
313
+ #Pring loss of dev dataset
314
+ model.train()
315
+ eval_loss = eval_loss / tokens_num
316
+ result = {'eval_ppl': round(np.exp(eval_loss),5)}
317
+ for key in sorted(result.keys()):
318
+ logger.info(" %s = %s", key, str(result[key]))
319
+ logger.info(" "+"*"*20)
320
+
321
+ #Calculate bleu
322
+ if 'dev_bleu' in dev_dataset:
323
+ eval_examples,eval_data=dev_dataset['dev_bleu']
324
+ else:
325
+ eval_examples = read_examples(args.dev_filename)
326
+ # eval_examples = random.sample(eval_examples)
327
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
328
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
329
+ eval_data = TensorDataset(all_source_ids)
330
+ dev_dataset['dev_bleu'] = eval_examples,eval_data
331
+
332
+ eval_sampler = SequentialSampler(eval_data)
333
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
334
+
335
+ model.eval()
336
+ p=[]
337
+ for batch in eval_dataloader:
338
+ batch = tuple(t.to(device) for t in batch)
339
+ source_ids = batch[0]
340
+ with torch.no_grad():
341
+ preds = model(source_ids)
342
+ # convert ids to text
343
+ for pred in preds:
344
+ t = pred[0].cpu().numpy()
345
+ t = list(t)
346
+ if 0 in t:
347
+ t = t[:t.index(0)]
348
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
349
+ # print(text)
350
+ p.append(text)
351
+
352
+ model.train()
353
+ predictions = []
354
+ res_list = []
355
+ EM = []
356
+ is_gened = False
357
+ with open(args.output_dir+"/dev.output",'w') as f, open(args.output_dir+"/dev.gold",'w') as f1:
358
+ for ref,gold in zip(p,eval_examples):
359
+ predictions.append(ref)
360
+ if len(ref) > 0:
361
+ is_gened = True
362
+ f.write(ref+'\n')
363
+ f1.write(gold.target+'\n')
364
+ EM.append(ref.split()==gold.target.split())
365
+ res_list.append([ref,gold.target])
366
+ if is_gened:
367
+ dev_bleu = _bleu(args.output_dir+"/dev.gold", args.output_dir+"/dev.output")
368
+ else:
369
+ dev_bleu = 0
370
+ logger.info(" %s = %s "%("Epoch",str(epoch)))
371
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
372
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,2))))
373
+ logger.info(" "+"*"*20)
374
+ dev_score = (dev_bleu+round(np.mean(EM)*100,2)) / 2.0
375
+ if dev_score>best_score:
376
+ best_score=dev_score
377
+ # Save best checkpoint for best bleu
378
+ output_dir = args.output_dir
379
+ if not os.path.exists(output_dir):
380
+ os.makedirs(output_dir)
381
+ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
382
+ output_model_file = os.path.join(output_dir, "pytorch_model.bin")
383
+ torch.save(model_to_save.state_dict(), output_model_file)
384
+ patience = 0
385
+ else:
386
+ patience += 1
387
+ if patience == 3:
388
+ break
389
+ output_dir = args.output_dir
390
+ with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
391
+ for line in res_list:
392
+ dic = {}
393
+ dic["Pred"] = line[0]
394
+ dic["GT"] = line[1]
395
+ wf.write(json.dumps(dic))
396
+ wf.write("\n")
397
+
398
+ logger.info(" Best score:%s",best_score)
399
+ logger.info(" "+"*"*20)
400
+ if args.do_test:
401
+ res_list = []
402
+ if args.load_model_path is not None:
403
+ checkpoint_prefix = 'pytorch_model.bin'
404
+ output_dir = os.path.join(args.output_dir, checkpoint_prefix)
405
+ model_to_load = model.module if hasattr(model, 'module') else model
406
+ model_to_load.load_state_dict(torch.load(output_dir))
407
+
408
+
409
+
410
+ eval_examples = read_examples(args.test_filename)
411
+ eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
412
+ all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
413
+ eval_data = TensorDataset(all_source_ids)
414
+
415
+ # Calculate bleu
416
+ eval_sampler = SequentialSampler(eval_data)
417
+ eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
418
+
419
+ model.eval()
420
+ p=[]
421
+ for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
422
+ batch = tuple(t.to(device) for t in batch)
423
+ source_ids = batch[0]
424
+ with torch.no_grad():
425
+ preds = model(source_ids)
426
+ # convert ids to text
427
+ for pred in preds:
428
+ t = pred[0].cpu().numpy()
429
+ t = list(t)
430
+ if 0 in t:
431
+ t = t[:t.index(0)]
432
+ text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
433
+ p.append(text)
434
+
435
+ predictions=[]
436
+ EM = []
437
+ edit_dis = 0
438
+ cnt = 0
439
+ with open(args.output_dir+"/test.output",'w') as f, open(args.output_dir+"/test.gold",'w') as f1:
440
+ for ref,gold in zip(p,eval_examples):
441
+ res_list.append([ref,gold.target])
442
+ predictions.append(ref)
443
+ f.write(ref+'\n')
444
+ f1.write(gold.target+'\n')
445
+ EM.append(ref.split()==gold.target.split())
446
+ edit_dis += fuzz.ratio(ref, gold.target)
447
+ cnt += 1
448
+
449
+ dev_bleu = _bleu(args.output_dir+"/test.gold", args.output_dir+"/test.output")
450
+ logger.info(" %s = %s "%("bleu-4",str(dev_bleu)))
451
+ logger.info(" %s = %s "%("EM",str(round(np.mean(EM)*100,4))))
452
+ logger.info(" %s = %s "%("Edit Distance",str(round(float(edit_dis)/cnt,4))))
453
+ logger.info(" "+"*"*20)
454
+
455
+
456
+ with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
457
+ for line in res_list:
458
+ dic = {}
459
+ dic["Pred"] = line[0]
460
+ dic["GT"] = line[1]
461
+ wf.write(json.dumps(dic))
462
+ wf.write("\n")
463
+
464
+ if __name__ == "__main__":
465
+ main()
466
+
467
+
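
As a sanity check, the metrics logged by `do_test` above can be reproduced from the line-aligned test.output/test.gold files the script writes. A minimal sketch, assuming it is run from the script's `--output_dir` with this commit's bleu.py on the import path:

import numpy as np
from fuzzywuzzy import fuzz
from bleu import _bleu  # the bleu.py added in this commit

preds = [l.rstrip("\n") for l in open("test.output")]
golds = [l.rstrip("\n") for l in open("test.gold")]
em = round(np.mean([p.split() == g.split() for p, g in zip(preds, golds)]) * 100, 4)
edit = round(np.mean([fuzz.ratio(p, g) for p, g in zip(preds, golds)]), 4)
print("bleu-4:", _bleu("test.gold", "test.output"))
print("EM:", em, "Edit Distance:", edit)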