import re

import torch
import torch.nn as nn

from transformers import AutoTokenizer, AutoModel

# Select the GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# device = torch.device('cpu')  # uncomment to force CPU-only inference


def remove_java_comments(code):
    # Remove single-line comments (//). This is a heuristic regex and will also
    # strip "//" that appears inside string literals (e.g. URLs).
    code = re.sub(r'//.*', '', code)

    # Remove multi-line comments (/* ... */)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

    return code


def remove_python_comments(code):
    # Remove single-line comments (#)
    code = re.sub(r'#.*', '', code)

    # Remove triple-quoted strings (""" ... """ or ''' ... '''), which covers
    # docstrings but also strips any other triple-quoted string literals.
    code = re.sub(r'""".*?"""', '', code, flags=re.DOTALL)
    code = re.sub(r"'''.*?'''", '', code, flags=re.DOTALL)

    return code
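
# Quick illustration of the two helpers above:
#   remove_java_comments("int x = 1; // counter")  ->  "int x = 1; "
#   remove_python_comments("x = 1  # counter")     ->  "x = 1  "
# (trailing whitespace on the stripped lines is left in place)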


# Model with Binary Classifier
class CodeBERTBinaryClassifier(nn.Module):
    def __init__(self, encoder_model, hidden_size=256, num_layers=2):
        # hidden_size and num_layers are accepted for compatibility but unused;
        # the classification head below uses a fixed 128-unit hidden layer.
        super().__init__()
        self.encoder = encoder_model

        self.classifier = nn.Sequential(
            nn.Dropout(0.3),  # Dropout with 30%
            nn.Linear(self.encoder.config.hidden_size, 128),  # Hidden layer with 128 units
            nn.BatchNorm1d(128),  # Batch normalization for the hidden layer
            nn.ReLU(),  # ReLU activation
            nn.Dropout(0.3),  # Dropout with 30%
            nn.Linear(128, 1)  # Output layer producing a single binary logit
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        # detach() keeps classifier gradients from flowing back into the encoder;
        # squeeze(-1) turns the (batch, 1) output into a (batch,) binary logit.
        logits = self.classifier(cls_output.detach()).squeeze(-1)
        return logits, cls_output



def infer_single_sample(code_text, model, tokenizer, language='java'):
    """Classify a single code snippet and return its probability, label and interpretation."""
    # Ensure model is in evaluation mode
    model.eval()
    
    # Remove comments from the code (assuming the same preprocessing as during training)
    if language == 'python':
        code_text = remove_python_comments(code_text)

    else:
        code_text = remove_java_comments(code_text)

    # print(code_text)
    
    # Tokenize the input
    inputs = tokenizer.encode_plus(
        code_text, 
        padding='max_length', 
        max_length=512, 
        truncation=True, 
        return_tensors='pt'
    )
    
    # Move inputs to the specified device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Disable gradient computation for inference
    with torch.no_grad():
        # Get model prediction
        logits, _ = model(input_ids, attention_mask)
        
        # Apply sigmoid to get probability
        probability = torch.sigmoid(logits).cpu().item()
        
        # Classify based on 0.5 threshold
        predicted_label = 1 if probability > 0.5 else 0
    
    return {
        'probability': probability,
        'predicted_label': predicted_label,
        'interpretation': 'GPT-generated' if predicted_label == 0 else 'Human-written'
    }



def load_model_and_tokenizer(model_architecture, model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_architecture)
    base_model = AutoModel.from_pretrained(model_architecture)

    model = CodeBERTBinaryClassifier(base_model)

    # Load the fine-tuned weights, mapping them onto whichever device is available.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)

    return model, tokenizer
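

# Example usage: a minimal sketch of loading the classifier and scoring one snippet.
# The encoder name and checkpoint path below are placeholders/assumptions; substitute
# the architecture and fine-tuned weights that were actually used for training.
if __name__ == '__main__':
    model, tokenizer = load_model_and_tokenizer(
        'microsoft/codebert-base',          # assumed CodeBERT encoder architecture
        'codebert_binary_classifier.pt'     # assumed path to the fine-tuned state_dict
    )

    sample_code = '''
    public static int add(int a, int b) {
        // return the sum of two integers
        return a + b;
    }
    '''

    result = infer_single_sample(sample_code, model, tokenizer, language='java')
    print(result)  # {'probability': ..., 'predicted_label': ..., 'interpretation': ...}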