Add pipeline for clone detection

Browse files

Files changed (3) hide show

clone_detection_pipeline.py +180 -0
config.json +19 -1
tokenizer_config.json +1 -1

clone_detection_pipeline.py ADDED Viewed

	@@ -0,0 +1,180 @@

+"""
+Original work:
+https://github.com/sangHa0411/CloneDetection/blob/main/utils/preprocessor.py
+Copyright (c) 2022 Sangha Park(sangha110495), Young Jin Ahn(snoop2head)
+All credits to the original authors.
+"""
+import re
+import torch
+from transformers import Pipeline
+class FunctionPreprocessor:
+    def get_function(self, code):
+        results = []
+        fn_list = re.findall("\ndef [a-zA-Z0-9_]+\(", code)
+        for fn in fn_list:
+            results.append(fn[4:-1].strip())
+        return results
+    def determine_function(self, code, function_name):
+        num = len(re.findall("[^a-zA-Z]" + function_name + "[^a-zA-Z]", code))
+        return False if num <= 1 else True
+    def delete_function(self, code, name):
+        start_id, _ = re.search("def " + name, code).span()
+        ptr = start_id
+        while ptr < len(code) - 1:
+            if code[ptr] == "\n" and re.search("[a-zA-Z]", code[ptr + 1]) is not None:
+                break
+            ptr += 1
+        if ptr != len(code) - 1:
+            end_id = ptr
+            code = code[:start_id] + code[end_id:]
+        return code
+    def preprocess(self, code):
+        code = "\n" + code
+        fn_list = self.get_function(code)
+        if len(fn_list) == 0:
+            return code
+        for fn in fn_list:
+            flag = self.determine_function(code, fn)
+            if flag == False:
+                code = self.delete_function(code, fn)
+        return code
+class AnnotationPreprocessor:
+    def search(self, sen_list, string):
+        for i, sen in enumerate(sen_list):
+            if string in sen:
+                return i
+        return -1
+    def delete_annotation_block(self, code, string):
+        sens = [sen for sen in code.split("\n")]
+        start_id = self.search(sens, string)
+        end_id = self.search(sens[start_id + 1 :], string)
+        if end_id != -1:
+            end_id += start_id + 1
+            code = sens[:start_id] + sens[end_id + 1 :]
+        else:
+            code = sens[:start_id] + sens[start_id + 1 :]
+        code = "\n".join(code)
+        return code
+    def delete_block(self, code, string):
+        while string in code:
+            code = self.delete_annotation_block(code, string)
+        return code
+    def delete_annotation(self, code):
+        sens = code.split("\n")
+        sens_processed = []
+        for sen in sens:
+            if "#" in sen:
+                index = sen.index("#")
+                sen = sen[:index]
+            sens_processed.append(sen)
+        return "\n".join(sens_processed)
+    def delete_import(self, code):
+        sens = code.split("\n")
+        sens_processed = []
+        for sen in sens:
+            if "import" not in sen:
+                sens_processed.append(sen)
+        return "\n".join(sens_processed)
+    def preprocess(self, code):
+        code = self.delete_block(code, '"""')
+        code = self.delete_block(code, "'''")
+        code = self.delete_annotation(code)
+        code = self.delete_import(code)
+        code = re.sub("\s+", " ", code).strip()
+        return code
+def preprocessor(code, instance):
+    processed_code = instance.preprocess(code)
+    return processed_code if processed_code.strip() else code
+def token_to_inputs(feature):
+    inputs = {}
+    for k, v in feature.items():
+        inputs[k] = torch.tensor(v).unsqueeze(0)
+    return inputs
+class CloneDetectionPipeline(Pipeline):
+    fn_preprocessor = FunctionPreprocessor()
+    an_preprocessor = AnnotationPreprocessor()
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        return preprocess_kwargs, {}, {}
+    def preprocess(self, inputs):
+        code1 = inputs[0]
+        code2 = inputs[1]
+        if code1.strip() == "" or code2.strip() == "":
+            ture_prob = float(code1.strip() == code2.strip())
+            return {"skip": True, "output": {False: 1 - ture_prob, True: ture_prob}}
+        code1 = preprocessor(
+            preprocessor(code1, self.fn_preprocessor), self.an_preprocessor
+        )
+        code2 = preprocessor(
+            preprocessor(code2, self.fn_preprocessor), self.an_preprocessor
+        )
+        feature1 = self.tokenizer(
+            code1, code2, max_length=512, return_token_type_ids=False, truncation=True
+        )
+        feature2 = self.tokenizer(
+            code2, code1, max_length=512, return_token_type_ids=False, truncation=True
+        )
+        return {
+            "inputs1": token_to_inputs(feature1),
+            "inputs2": token_to_inputs(feature2),
+        }
+    def _forward(self, model_inputs):
+        if model_inputs.get("skip", False):
+            return model_inputs
+        inputs1 = model_inputs["inputs1"]
+        inputs2 = model_inputs["inputs2"]
+        logits1 = self.model(**inputs1).logits[0]
+        logits2 = self.model(**inputs2).logits[0]
+        logits = (logits1 + logits2) / 2
+        return {"logits": logits}
+    def postprocess(self, model_outputs):
+        if model_outputs.get("skip", False):
+            return model_outputs["output"]
+        probs = model_outputs["logits"].softmax(-1).tolist()
+        return {False: probs[0], True: probs[1]}

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "microsoft/graphcodebert-base",
   "architectures": [
     "CloneDetectionModel"
   ],
@@ -9,6 +9,24 @@
   },
   "bos_token_id": 0,
   "classifier_dropout": null,
   "dropout_rate": 0.1,
   "eos_token_id": 2,
   "gradient_checkpointing": false,

 {
+  "_name_or_path": "Lazyhope/python-clone-detection",
   "architectures": [
     "CloneDetectionModel"
   ],
   },
   "bos_token_id": 0,
   "classifier_dropout": null,
+  "custom_pipelines": {
+    "python-clone-detection": {
+      "default": {
+        "model": {
+          "pt": [
+            "Lazyhope/python-clone-detection",
+            "main"
+          ]
+        }
+      },
+      "impl": "clone_detection_pipeline.CloneDetectionPipeline",
+      "pt": [
+        "AutoModel"
+      ],
+      "tf": [],
+      "type": "text"
+    }
+  },
   "dropout_rate": 0.1,
   "eos_token_id": 2,
   "gradient_checkpointing": false,

tokenizer_config.json CHANGED Viewed

@@ -34,7 +34,7 @@
     "single_word": false
   },
   "model_max_length": 512,
-  "name_or_path": "microsoft/graphcodebert-base",
   "pad_token": {
     "__type": "AddedToken",
     "content": "<pad>",

     "single_word": false
   },
   "model_max_length": 512,
+  "name_or_path": "Lazyhope/python-clone-detection",
   "pad_token": {
     "__type": "AddedToken",
     "content": "<pad>",