Add pipeline for clone detection
Browse files- clone_detection_pipeline.py +180 -0
- config.json +19 -1
- tokenizer_config.json +1 -1
clone_detection_pipeline.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Original work:
|
3 |
+
https://github.com/sangHa0411/CloneDetection/blob/main/utils/preprocessor.py
|
4 |
+
|
5 |
+
Copyright (c) 2022 Sangha Park(sangha110495), Young Jin Ahn(snoop2head)
|
6 |
+
|
7 |
+
All credits to the original authors.
|
8 |
+
"""
|
9 |
+
import re
|
10 |
+
import torch
|
11 |
+
from transformers import Pipeline
|
12 |
+
|
13 |
+
|
14 |
+
class FunctionPreprocessor:
    """Strip top-level functions that are defined but never referenced again.

    Everything here works on raw source text with regular expressions; the
    code is never parsed, so all results are heuristic.
    """

    # A definition is only recognised when "def name(" directly follows a newline.
    _DEF_PATTERN = re.compile(r"\ndef ([a-zA-Z0-9_]+)\(")
    # A newline directly followed by an ASCII letter marks the end of a body.
    _BODY_END_PATTERN = re.compile(r"\n(?=[a-zA-Z])")

    def get_function(self, code):
        """Return the names of all functions defined at the start of a line.

        Args:
            code: Source text. A definition on the very first line is only
                found when the text starts with a newline (``preprocess``
                prepends one for that reason).

        Returns:
            List of function names in order of appearance.
        """
        return self._DEF_PATTERN.findall(code)

    def determine_function(self, code, function_name):
        """Return True when ``function_name`` occurs more than once in ``code``.

        An occurrence needs a non-letter character on each side. The
        definition itself is one occurrence, so a count of one (or zero)
        means the function is never referenced.
        """
        # NOTE(review): digits and underscores count as boundaries, so ``foo``
        # is also counted inside ``foo_bar`` or ``foo2``. Kept unchanged so the
        # used/unused decision stays the same as before.
        pattern = "[^a-zA-Z]" + re.escape(function_name) + "[^a-zA-Z]"
        return len(re.findall(pattern, code)) > 1

    def delete_function(self, code, name):
        """Remove the definition of ``name`` from ``code``.

        The removed span runs from the ``def`` keyword up to (not including)
        the next newline that is directly followed by an ASCII letter.

        Returns:
            The shortened text, or ``code`` unchanged when no definition of
            ``name`` exists or when no such newline follows the definition
            (for example when the function is the last thing in the text).
        """
        # The lookahead keeps ``foo`` from matching ``def foobar``; the
        # unanchored search used before could delete the wrong function.
        definition = re.search(
            "def " + re.escape(name) + r"(?![a-zA-Z0-9_])", code
        )
        if definition is None:
            return code

        start = definition.start()
        body_end = self._BODY_END_PATTERN.search(code, start)
        if body_end is None:
            return code

        return code[:start] + code[body_end.start():]

    def preprocess(self, code):
        """Return ``code`` prefixed with a newline and without unused functions.

        The leading newline is always added, including when nothing is removed.
        """
        code = "\n" + code
        for fn in self.get_function(code):
            if not self.determine_function(code, fn):
                code = self.delete_function(code, fn)
        return code
|
55 |
+
|
56 |
+
|
57 |
+
class AnnotationPreprocessor:
    """Drop docstring blocks, ``#`` comments and import lines, then squeeze whitespace.

    All steps are line-based text heuristics; the code is never parsed.
    """

    _WHITESPACE = re.compile(r"\s+")

    def search(self, sen_list, string):
        """Return the index of the first item containing ``string``, or -1."""
        return next(
            (i for i, sen in enumerate(sen_list) if string in sen),
            -1,
        )

    def delete_annotation_block(self, code, string):
        """Remove the first block delimited by lines containing ``string``.

        The block spans from the first line containing ``string`` through the
        next later line containing it, both inclusive. If there is no later
        line, only the first delimiter line is removed.

        Returns:
            The remaining lines joined with newlines, or ``code`` unchanged
            when no line contains ``string``.
        """
        lines = code.split("\n")

        start_id = self.search(lines, string)
        if start_id == -1:
            # Without this guard the slices below would duplicate the text.
            return code

        # NOTE(review): the closing delimiter is only looked for on later
        # lines, so a one-line docstring is treated as an opening line.
        offset = self.search(lines[start_id + 1:], string)
        end_id = start_id if offset == -1 else start_id + 1 + offset

        return "\n".join(lines[:start_id] + lines[end_id + 1:])

    def delete_block(self, code, string):
        """Remove delimited blocks repeatedly until ``string`` no longer occurs."""
        while string in code:
            stripped = self.delete_annotation_block(code, string)
            if stripped == code:
                # No progress is possible (e.g. an empty delimiter or one that
                # spans lines); stop instead of looping forever.
                break
            code = stripped
        return code

    def delete_annotation(self, code):
        """Cut every line at its first ``#``.

        NOTE(review): a ``#`` inside a string literal is cut as well.
        """
        return "\n".join(line.split("#", 1)[0] for line in code.split("\n"))

    def delete_import(self, code):
        """Drop every line that contains the substring ``import``.

        NOTE(review): this is a plain substring test, so lines such as
        ``important = 1`` are dropped too.
        """
        return "\n".join(
            line for line in code.split("\n") if "import" not in line
        )

    def preprocess(self, code):
        """Return ``code`` with annotations removed and whitespace collapsed.

        Order: triple-double-quoted blocks, triple-single-quoted blocks,
        ``#`` comments, import lines. Finally every whitespace run becomes
        one space and the result is stripped.
        """
        code = self.delete_block(code, '"""')
        code = self.delete_block(code, "'''")
        code = self.delete_annotation(code)
        code = self.delete_import(code)
        return self._WHITESPACE.sub(" ", code).strip()
|
112 |
+
|
113 |
+
|
114 |
+
def preprocessor(code, instance):
    """Apply ``instance.preprocess`` to ``code``, keeping the input as a fallback.

    Returns the preprocessed text, unless it is empty or whitespace-only, in
    which case the original ``code`` is returned instead.
    """
    cleaned = instance.preprocess(code)
    if cleaned.strip():
        return cleaned
    return code
|
117 |
+
|
118 |
+
|
119 |
+
def token_to_inputs(feature):
    """Convert each value of ``feature`` to a tensor with a new leading axis.

    Args:
        feature: Mapping whose values ``torch.tensor`` accepts.

    Returns:
        A dict with the same keys; each value is ``torch.tensor(value)``
        followed by ``unsqueeze(0)``.
    """
    return {
        key: torch.tensor(value).unsqueeze(0)
        for key, value in feature.items()
    }
|
125 |
+
|
126 |
+
|
127 |
+
class CloneDetectionPipeline(Pipeline):
    """Pipeline that scores whether two code snippets are clones.

    The input is indexed as ``inputs[0]`` and ``inputs[1]`` (the two code
    strings). The result is a dict mapping ``False`` and ``True`` to
    probabilities.
    """

    # Shared by all instances.
    fn_preprocessor = FunctionPreprocessor()
    an_preprocessor = AnnotationPreprocessor()

    def _sanitize_parameters(self, **kwargs):
        """Return three empty kwargs dicts; all keyword arguments are ignored."""
        return {}, {}, {}

    def preprocess(self, inputs):
        """Clean and tokenize the pair in both orders.

        Returns:
            ``{"skip": True, "output": ...}`` when either snippet is blank
            after stripping (probability 1.0 for ``True`` only if both strip
            to the same text), otherwise ``{"inputs1": ..., "inputs2": ...}``
            holding the (first, second) and (second, first) encodings.
        """
        first, second = inputs[0], inputs[1]

        if first.strip() == "" or second.strip() == "":
            true_prob = float(first.strip() == second.strip())
            return {
                "skip": True,
                "output": {False: 1 - true_prob, True: true_prob},
            }

        # Unused functions are removed first, then annotations and imports.
        first, second = (
            preprocessor(
                preprocessor(snippet, self.fn_preprocessor),
                self.an_preprocessor,
            )
            for snippet in (first, second)
        )

        tokenizer_options = {
            "max_length": 512,
            "return_token_type_ids": False,
            "truncation": True,
        }
        forward_feature = self.tokenizer(first, second, **tokenizer_options)
        reverse_feature = self.tokenizer(second, first, **tokenizer_options)

        return {
            "inputs1": token_to_inputs(forward_feature),
            "inputs2": token_to_inputs(reverse_feature),
        }

    def _forward(self, model_inputs):
        """Run the model on both orderings and average the two logit vectors.

        Skip-marked inputs are passed through untouched.
        """
        if model_inputs.get("skip", False):
            return model_inputs

        encodings = (model_inputs["inputs1"], model_inputs["inputs2"])
        forward_logits, reverse_logits = (
            self.model(**encoding).logits[0] for encoding in encodings
        )

        return {"logits": (forward_logits + reverse_logits) / 2}

    def postprocess(self, model_outputs):
        """Turn logits into ``{False: p0, True: p1}`` via softmax over the last axis.

        Skip-marked outputs return their precomputed ``"output"`` dict.
        """
        if model_outputs.get("skip", False):
            return model_outputs["output"]

        probabilities = model_outputs["logits"].softmax(-1).tolist()
        return {False: probabilities[0], True: probabilities[1]}
|
config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"architectures": [
|
4 |
"CloneDetectionModel"
|
5 |
],
|
@@ -9,6 +9,24 @@
|
|
9 |
},
|
10 |
"bos_token_id": 0,
|
11 |
"classifier_dropout": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
"dropout_rate": 0.1,
|
13 |
"eos_token_id": 2,
|
14 |
"gradient_checkpointing": false,
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "Lazyhope/python-clone-detection",
|
3 |
"architectures": [
|
4 |
"CloneDetectionModel"
|
5 |
],
|
|
|
9 |
},
|
10 |
"bos_token_id": 0,
|
11 |
"classifier_dropout": null,
|
12 |
+
"custom_pipelines": {
|
13 |
+
"python-clone-detection": {
|
14 |
+
"default": {
|
15 |
+
"model": {
|
16 |
+
"pt": [
|
17 |
+
"Lazyhope/python-clone-detection",
|
18 |
+
"main"
|
19 |
+
]
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"impl": "clone_detection_pipeline.CloneDetectionPipeline",
|
23 |
+
"pt": [
|
24 |
+
"AutoModel"
|
25 |
+
],
|
26 |
+
"tf": [],
|
27 |
+
"type": "text"
|
28 |
+
}
|
29 |
+
},
|
30 |
"dropout_rate": 0.1,
|
31 |
"eos_token_id": 2,
|
32 |
"gradient_checkpointing": false,
|
tokenizer_config.json
CHANGED
@@ -34,7 +34,7 @@
|
|
34 |
"single_word": false
|
35 |
},
|
36 |
"model_max_length": 512,
|
37 |
-
"name_or_path": "
|
38 |
"pad_token": {
|
39 |
"__type": "AddedToken",
|
40 |
"content": "<pad>",
|
|
|
34 |
"single_word": false
|
35 |
},
|
36 |
"model_max_length": 512,
|
37 |
+
"name_or_path": "Lazyhope/python-clone-detection",
|
38 |
"pad_token": {
|
39 |
"__type": "AddedToken",
|
40 |
"content": "<pad>",
|