Habiba A. Elbehairy committed on
Commit
1306f0a
·
1 Parent(s): b4fe073
Files changed (4) hide show
  1. Dockerfile +18 -0
  2. app.py +250 -0
  3. model_definition.py +33 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
17
+
18
+
app.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import logging
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel
9
+ from transformers import AutoTokenizer, AutoConfig
10
+ from model_definition import MultitaskCodeSimilarityModel
11
+ from typing import List
12
+ import uvicorn
13
+ from datetime import datetime
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # System information - Updated with the provided values
20
+ DEPLOYMENT_DATE = "2025-06-10 15:11:04" # Updated timestamp
21
+ DEPLOYED_BY = "Fastest"
22
+
23
+ # Get device
24
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
+ logger.info(f"Using device: {device}")
26
+
27
+ # Your Hugging Face model repository
28
+ REPO_ID = "FastestAI/Redundant_Model"
29
+
30
# Initialize FastAPI app.
# docs_url="/" serves the interactive Swagger UI at the site root instead of
# the default "/docs".
app = FastAPI(
    title="Test Similarity Analyzer API",
    description="API for analyzing similarity between test cases. Deployed by " + DEPLOYED_BY,
    version="1.0.0",
    docs_url="/",
)

# Add CORS middleware to allow cross-origin requests from any origin.
# Credentials (cookies / Authorization headers) are intentionally NOT allowed:
# a wildcard origin must not be combined with allow_credentials=True, and this
# API does not use cookie- or session-based authentication.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
+
47
+ # Define label to class mapping with CORRECT NUMBERING (1, 2, 3 instead of 0, 1, 2)
48
+ label_to_class = {1: "Duplicate", 2: "Redundant", 3: "Distinct"}
49
+
50
+ # Model output to API label mapping (if your model outputs 0, 1, 2 but we want 1, 2, 3)
51
+ model_to_api_label = {0: 1, 1: 2, 2: 3}
52
+
53
+ # Define input models for API
54
# Request schema: the production class that both test cases exercise.
class SourceCode(BaseModel):
    # Name of the class under test.
    class_name: str
    # Full source text of that class; inserted verbatim into the model prompt.
    code: str
57
+
58
# Request schema: one test case of the pair being compared.
class TestCase(BaseModel):
    # Caller-chosen identifier of the test case.
    id: str
    # Fixture / suite the test belongs to.
    test_fixture: str
    # Test name; echoed back in the /predict response.
    name: str
    # Test body; inserted verbatim into the model prompt.
    code: str
    # Class the test targets; compared between the two tests by /predict.
    target_class: str
    # Methods the test targets; /predict checks the two lists for overlap.
    target_method: List[str]
65
+
66
# Request schema: the full /predict payload (one source class, two tests).
class SimilarityInput(BaseModel):
    # Caller-chosen identifier of the pair; echoed back in the response.
    pair_id: str
    # Production code both tests relate to.
    source_code: SourceCode
    # First test of the pair.
    test_case_1: TestCase
    # Second test of the pair.
    test_case_2: TestCase
71
+
72
+ # Global variables for model and tokenizer
73
+ model = None
74
+ tokenizer = None
75
+
76
+ # Load model and tokenizer on startup
77
@app.on_event("startup")
async def startup_event():
    """Load the tokenizer, config and model weights when the app starts.

    On success the module-level ``model`` and ``tokenizer`` globals are set.
    On any failure the error is logged together with its traceback and both
    globals are left as ``None``; the app still starts, and the endpoints
    report the missing model instead of crashing at import time.
    """
    global model, tokenizer
    try:
        logger.info(f"Loading model and tokenizer from {REPO_ID}...")

        # Load tokenizer and config directly from Hugging Face
        loaded_tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
        config = AutoConfig.from_pretrained(REPO_ID)

        # Build into a local first so the globals never point at a model
        # whose weights have not been loaded yet.
        loaded_model = MultitaskCodeSimilarityModel(config, loaded_tokenizer)

        # NOTE(review): a .bin checkpoint is a pickle file; loading it executes
        # whatever the remote repository contains. Only point REPO_ID at a
        # repository you control or trust.
        state_dict = torch.hub.load_state_dict_from_url(
            f"https://huggingface.co/{REPO_ID}/resolve/main/pytorch_model.bin",
            map_location=device,
            check_hash=False,
        )
        loaded_model.load_state_dict(state_dict)

        # Move model to device and set to evaluation mode
        loaded_model.to(device)
        loaded_model.eval()
    except Exception as e:
        # Deliberate best-effort: keep the app up and surface the failure
        # through the endpoints. logger.exception records the traceback.
        logger.exception("Error loading model: %s", e)
        model = None
        tokenizer = None
    else:
        model, tokenizer = loaded_model, loaded_tokenizer
        logger.info("Model and tokenizer loaded successfully!")
111
+
112
@app.get("/health", tags=["Health"])
async def health_check():
    """Health check endpoint that also returns deployment information"""
    # Deployment metadata is part of both the error and the ok payload.
    deployment_info = {
        "deployment_date": DEPLOYMENT_DATE,
        "deployed_by": DEPLOYED_BY,
    }

    # Both artifacts are needed before the service counts as healthy.
    ready = model is not None and tokenizer is not None
    if not ready:
        return {
            "status": "error",
            "message": "Model or tokenizer not loaded",
            **deployment_info,
        }

    now_utc = datetime.utcnow()
    return {
        "status": "ok",
        "model": REPO_ID,
        "device": str(device),
        **deployment_info,
        "current_time": now_utc.strftime("%Y-%m-%d %H:%M:%S"),
    }
131
+
132
def _heuristic_distinct_reason(test_case_1, test_case_2):
    """Return why two tests are trivially Distinct, or None if the model must decide.

    The tests are Distinct without consulting the model when both name a
    target class and the names differ, or when both list target methods and
    the two lists share no method.
    """
    class_1, class_2 = test_case_1.target_class, test_case_2.target_class
    if class_1 and class_2 and class_1 != class_2:
        return "Different target classes"

    methods_1, methods_2 = test_case_1.target_method, test_case_2.target_method
    if methods_1 and methods_2 and set(methods_1).isdisjoint(methods_2):
        return "Different target methods"

    return None


@app.post("/predict")
async def predict(data: SimilarityInput):
    """
    Predict similarity class between two test cases for a given source class.

    Input schema follows the specified format with source_code, test_case_1, and test_case_2.
    Uses heuristics to detect class and method differences before using the model.
    """
    # Same readiness condition as /health: the tokenizer is needed as much as
    # the model, so check both before doing any work.
    if model is None or tokenizer is None:
        raise HTTPException(status_code=500, detail="Model not loaded correctly")

    try:
        # Check if we can determine similarity without using the model
        reason = _heuristic_distinct_reason(data.test_case_1, data.test_case_2)
        if reason is not None:
            logger.info("Heuristic detection: %s - Distinct", reason)
            api_prediction = 3  # Distinct
            probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
        else:
            # No clear heuristic match, use the model.
            # Format input to match training format
            combined_input = (
                f"SOURCE CODE: {data.source_code.code}\n"
                f"TEST 1: {data.test_case_1.code}\n"
                f"TEST 2: {data.test_case_2.code}"
            )

            # Tokenize input; anything beyond 512 tokens is truncated
            inputs = tokenizer(
                combined_input,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)

            # Inference only: no gradients needed. The model also returns
            # explanation logits, which this endpoint ignores.
            with torch.no_grad():
                logits, _ = model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                )

            # Process results (single-item batch, hence index 0 / .item())
            probs = F.softmax(logits, dim=-1)[0].cpu().tolist()
            model_prediction = torch.argmax(logits, dim=-1).item()

            # Convert model prediction (0,1,2) to API prediction (1,2,3)
            api_prediction = model_to_api_label[model_prediction]
            logger.info("Model prediction: %s", label_to_class[api_prediction])

        # Map prediction to class name
        classification = label_to_class.get(api_prediction, "Unknown")

        return {
            "pair_id": data.pair_id,
            "test_case_1_name": data.test_case_1.name,
            "test_case_2_name": data.test_case_2.name,
            "similarity": {
                "score": api_prediction,
                "classification": classification,
            },
            "probabilities": probs,
        }

    except Exception as e:
        # logger.exception logs the message and the traceback in one record.
        logger.exception("Prediction error: %s", e)
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}") from e
207
+
208
+ # Example endpoint
209
@app.get("/example", response_model=SimilarityInput, tags=["Examples"])
async def get_example():
    """Get an example input to test the API"""
    # Class under test shared by both example test cases.
    calculator_source = SourceCode(
        class_name="Calculator",
        code="class Calculator {\n public int add(int a, int b) {\n return a + b;\n }\n}",
    )

    # Two tests that differ only in name and make the same assertion on add().
    first_test = TestCase(
        id="test-1",
        test_fixture="CalculatorTest",
        name="testAddsTwoPositiveNumbers",
        code="TEST(CalculatorTest, AddsTwoPositiveNumbers) {\n Calculator calc;\n EXPECT_EQ(5, calc.add(2, 3));\n}",
        target_class="Calculator",
        target_method=["add"],
    )
    second_test = TestCase(
        id="test-2",
        test_fixture="CalculatorTest",
        name="testAddsTwoPositiveIntegers",
        code="TEST(CalculatorTest, AddsTwoPositiveIntegers) {\n Calculator calc;\n EXPECT_EQ(5, calc.add(2, 3));\n}",
        target_class="Calculator",
        target_method=["add"],
    )

    example = SimilarityInput(
        pair_id="example-1",
        source_code=calculator_source,
        test_case_1=first_test,
        test_case_2=second_test,
    )
    return example
235
+
236
# The app is created with docs_url="/", so the Swagger UI route is registered
# at "/" first and a second handler on "/" would never be reached. Service
# information is therefore exposed on its own path.
@app.get("/info", tags=["Root"])
async def root():
    """
    Return basic service and deployment information.
    The interactive API documentation is served at the site root ("/").
    """
    return {
        "message": "Test Similarity Analyzer API",
        # The docs live at "/" (docs_url); "/docs" is not served by this app.
        "documentation": "/",
        "deployment_date": DEPLOYMENT_DATE,
        "deployed_by": DEPLOYED_BY,
    }
248
+
249
+ if __name__ == "__main__":
250
+ uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
model_definition.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import AutoModel
4
+
5
class MultitaskCodeSimilarityModel(nn.Module):
    """Encoder with a classification head and an optional GRU explanation head.

    Attribute names (``encoder``, ``classifier``, ``decoder_embedding``,
    ``decoder``, ``explanation_head``) are the keys of the module's state
    dict and must not be renamed.
    """

    def __init__(self, config, tokenizer):
        """Build all sub-modules from ``config``; ``len(tokenizer)`` sizes the explanation head."""
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer

        width = config.hidden_size

        # Encoder is built from the config only; no weights are fetched here.
        self.encoder = AutoModel.from_config(config)
        self.classifier = nn.Linear(width, config.num_labels)

        # For explanation generation
        self.decoder_embedding = nn.Linear(width, width)
        self.decoder = nn.GRU(input_size=width, hidden_size=width, batch_first=True)
        self.explanation_head = nn.Linear(width, len(tokenizer))

    def forward(self, input_ids, attention_mask, explanation_ids=None, explanation_mask=None):
        """Return ``(logits, explanation_logits)``.

        ``explanation_logits`` is ``None`` unless ``explanation_ids`` is given.
        Only the second dimension of ``explanation_ids`` is read (as the number
        of decoder steps); ``explanation_mask`` is accepted but not used.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0]
        logits = self.classifier(first_token)

        if explanation_ids is None:
            return logits, None

        # Feed the same projected pooled vector to the GRU at every step.
        steps = explanation_ids.size(1)
        seed = self.decoder_embedding(first_token)
        decoder_input = seed.unsqueeze(1).expand(-1, steps, -1)
        decoder_outputs, _ = self.decoder(decoder_input)
        return logits, self.explanation_head(decoder_outputs)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=1.10.0
2
+ transformers>=4.18.0
3
+ fastapi>=0.68.0
4
+ uvicorn>=0.15.0
5
+ pydantic>=1.8.0
6
+ numpy>=1.20.0