rahideer committed on
Commit 23e2ec8 · verified
1 Parent(s): 17df11e

Create app.py

Files changed (1)
  1. app.py +148 -0
app.py ADDED
@@ -0,0 +1,148 @@
+ import streamlit as st
+ import torch
+ import torch.nn as nn
+ import javalang
+ import re
+ from transformers import AutoModel, AutoTokenizer
+
+ # Configuration
+ MAX_FILE_SIZE = 5000   # max characters per uploaded file
+ MAX_AST_DEPTH = 50     # depth cutoff for AST path extraction
+ EMBEDDING_DIM = 128    # width of the AST path embeddings
+ CODEBERT_DIM = 768     # hidden size of microsoft/codebert-base
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ # Code Normalization
+ def normalize_code(code):
+     code = re.sub(r'//.*', '', code)                        # strip line comments
+     code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # strip block comments
+     code = re.sub(r'"[^"]*"', '"STRING"', code)             # mask string literals
+     code = re.sub(r'\s+', ' ', code).strip()                # collapse whitespace
+     return code
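+ # Example (illustrative): normalize_code('int x = 1; // counter')
+ # returns 'int x = 1;' with the comment stripped and whitespace collapsed.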
+
+ # AST Extraction
+ def parse_java(code):
+     try:
+         tokens = javalang.tokenizer.tokenize(code)
+         parser = javalang.parser.Parser(tokens)
+         return parser.parse()   # a javalang.tree.CompilationUnit
+     except Exception:           # tokenizer/parser errors on malformed input
+         return None
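+ # Illustrative behaviour (assumes javalang is installed):
+ #     parse_java('class A { void f() {} }')  -> CompilationUnit
+ #     parse_java('not java at all')          -> None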
+
+ # AST Processor
+ class ASTProcessor:
+     def __init__(self):
+         self.node_types = set()
+
+     def extract_paths(self, node, max_depth=MAX_AST_DEPTH):
+         paths = []
+         self._dfs(node, [], paths, 0, max_depth)
+         return paths
+
+     def _dfs(self, node, current_path, paths, depth, max_depth):
+         if depth > max_depth:
+             return
+         node_type = type(node).__name__
+         current_path.append(node_type)
+
+         # Leaf node or depth cutoff: record the root-to-here path of node types
+         if not hasattr(node, 'children') or depth == max_depth:
+             paths.append(current_path.copy())
+             current_path.pop()
+             return
+
+         # javalang children may be Nodes or lists/tuples of Nodes
+         for child in node.children:
+             if isinstance(child, (javalang.ast.Node, list, tuple)):
+                 if isinstance(child, (list, tuple)):
+                     for c in child:
+                         if isinstance(c, javalang.ast.Node):
+                             self._dfs(c, current_path, paths, depth + 1, max_depth)
+                 else:
+                     self._dfs(child, current_path, paths, depth + 1, max_depth)
+
+         current_path.pop()
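+ # Illustrative usage, assuming `tree` is a parsed CompilationUnit:
+ #     paths = ASTProcessor().extract_paths(tree)
+ #     # e.g. [['CompilationUnit', 'ClassDeclaration', 'MethodDeclaration', ...], ...]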
+
+ # Model
+ class ASTEncoder(nn.Module):
+     def __init__(self, vocab_size, embedding_dim):
+         super().__init__()
+         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+         self.lstm = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
+
+     def forward(self, paths):
+         # paths: LongTensor of node-type ids, shape (batch, seq_len)
+         embedded = self.embedding(paths)
+         _, (hidden, _) = self.lstm(embedded)
+         return hidden[-1]   # final hidden state, shape (batch, embedding_dim)
+
+ class CodeBERTEncoder(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.bert = AutoModel.from_pretrained('microsoft/codebert-base')
+         self.tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
+
+     def forward(self, code):
+         inputs = self.tokenizer(code, return_tensors='pt', truncation=True, padding=True)
+         outputs = self.bert(**inputs)
+         # Mean-pool the token embeddings into one vector per input
+         return outputs.last_hidden_state.mean(dim=1)   # (batch, CODEBERT_DIM)
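+ # Illustrative: CodeBERTEncoder()(["int add(int a, int b) { return a + b; }"])
+ # yields a (1, 768) tensor of mean-pooled token embeddings.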
+
+ class HybridCloneDetector(nn.Module):
+     def __init__(self, ast_vocab_size):
+         super().__init__()
+         self.ast_encoder = ASTEncoder(ast_vocab_size, EMBEDDING_DIM)
+         self.code_encoder = CodeBERTEncoder()
+         # The encoders emit different widths (EMBEDDING_DIM vs. CODEBERT_DIM),
+         # so the classifier input is their sum, not EMBEDDING_DIM * 2
+         self.classifier = nn.Sequential(
+             nn.Linear(EMBEDDING_DIM + CODEBERT_DIM, EMBEDDING_DIM),
+             nn.ReLU(),
+             nn.Linear(EMBEDDING_DIM, 2)
+         )
+
+     def forward(self, ast1, code1, ast2, code2):
+         ast_emb1 = self.ast_encoder(ast1)
+         ast_emb2 = self.ast_encoder(ast2)
+         code_emb1 = self.code_encoder(code1)
+         code_emb2 = self.code_encoder(code2)
+
+         # Element-wise absolute differences capture pairwise similarity
+         diff_ast = torch.abs(ast_emb1 - ast_emb2)
+         diff_code = torch.abs(code_emb1 - code_emb2)
+
+         combined = torch.cat([diff_ast, diff_code], dim=1)
+         return self.classifier(combined)   # logits, shape (batch, 2)
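+ # Training sketch (illustrative, not part of this commit): the two-way logits
+ # pair with nn.CrossEntropyLoss against 0/1 clone labels, e.g.
+ #     loss = nn.CrossEntropyLoss()(model(ast1, code1, ast2, code2), labels)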
+
+ # Streamlit UI
+ st.title("Java Code Clone Detector")
+
+ uploaded_file1 = st.file_uploader("Upload Java File 1", type=["java"])
+ uploaded_file2 = st.file_uploader("Upload Java File 2", type=["java"])
+
+ if uploaded_file1 and uploaded_file2:
+     code1 = uploaded_file1.read().decode('utf-8')
+     code2 = uploaded_file2.read().decode('utf-8')
+
+     # Normalize code
+     norm_code1 = normalize_code(code1)
+     norm_code2 = normalize_code(code2)
+
+     # Parse AST
+     ast1 = parse_java(norm_code1)
+     ast2 = parse_java(norm_code2)
+
+     if ast1 is None or ast2 is None:
+         st.error("Failed to parse one of the files. Please upload valid Java code.")
+     else:
+         st.success("Files parsed successfully.")
+
+         # Inference (placeholder)
+         st.write("🔧 **Model loading...** (currently using placeholder)")
+
+         # A trained HybridCloneDetector would be loaded here in a full app
+         st.warning("Model inference not available yet in this simple demo.")
+
+         st.write("✅ Code normalization done.")
+         st.code(norm_code1[:500], language='java')
+         st.code(norm_code2[:500], language='java')
+
+         st.info("Clone detection: [Placeholder] Results will appear here after training integration.")
+ else:
+     st.info("Upload two Java files to start clone detection.")