Spaces:

FastestAI
/

CodeBert_Redundant_Detection_Task

Running

App Files Files Community

Habiba A. Elbehairy commited on 15 days ago

Commit

1306f0a

1 Parent(s): b4fe073

App

Browse files

Files changed (4) hide show

Dockerfile +18 -0
app.py +250 -0
model_definition.py +33 -0
requirements.txt +6 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,18 @@

+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,250 @@

+import os
+import time
+import logging
+import torch
+import torch.nn.functional as F
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoConfig
+from model_definition import MultitaskCodeSimilarityModel
+from typing import List
+import uvicorn
+from datetime import datetime
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# System information - Updated with the provided values
+DEPLOYMENT_DATE = "2025-06-10 15:11:04"  # Updated timestamp
+DEPLOYED_BY = "Fastest"
+# Get device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+logger.info(f"Using device: {device}")
+# Your Hugging Face model repository
+REPO_ID = "FastestAI/Redundant_Model"
+# Initialize FastAPI app
+app = FastAPI(
+    title="Test Similarity Analyzer API",
+    description="API for analyzing similarity between test cases. Deployed by " + DEPLOYED_BY,
+    version="1.0.0",
+    docs_url="/",
+)
+# Add CORS middleware to allow cross-origin requests
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Define label to class mapping with CORRECT NUMBERING (1, 2, 3 instead of 0, 1, 2)
+label_to_class = {1: "Duplicate", 2: "Redundant", 3: "Distinct"}
+# Model output to API label mapping (if your model outputs 0, 1, 2 but we want 1, 2, 3)
+model_to_api_label = {0: 1, 1: 2, 2: 3}
+# Define input models for API
+class SourceCode(BaseModel):
+    class_name: str
+    code: str
+class TestCase(BaseModel):
+    id: str
+    test_fixture: str
+    name: str
+    code: str
+    target_class: str
+    target_method: List[str]
+class SimilarityInput(BaseModel):
+    pair_id: str
+    source_code: SourceCode
+    test_case_1: TestCase
+    test_case_2: TestCase
+# Global variables for model and tokenizer
+model = None
+tokenizer = None
+# Load model and tokenizer on startup
+@app.on_event("startup")
+async def startup_event():
+    global model, tokenizer
+    try:
+        logger.info(f"Loading model and tokenizer from {REPO_ID}...")
+        # Load tokenizer directly from Hugging Face
+        tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
+        # Load config from Hugging Face
+        config = AutoConfig.from_pretrained(REPO_ID)
+        # Create model instance using imported MultitaskCodeSimilarityModel class
+        model = MultitaskCodeSimilarityModel(config, tokenizer)
+        # Load weights directly from Hugging Face
+        state_dict = torch.hub.load_state_dict_from_url(
+            f"https://huggingface.co/{REPO_ID}/resolve/main/pytorch_model.bin",
+            map_location=device,
+            check_hash=False
+        )
+        model.load_state_dict(state_dict)
+        # Move model to device and set to evaluation mode
+        model.to(device)
+        model.eval()
+        logger.info("Model and tokenizer loaded successfully!")
+    except Exception as e:
+        logger.error(f"Error loading model: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
+        model = None
+        tokenizer = None
+@app.get("/health", tags=["Health"])
+async def health_check():
+    """Health check endpoint that also returns deployment information"""
+    if model is None or tokenizer is None:
+        return {
+            "status": "error",
+            "message": "Model or tokenizer not loaded",
+            "deployment_date": DEPLOYMENT_DATE,
+            "deployed_by": DEPLOYED_BY
+        }
+    return {
+        "status": "ok",
+        "model": REPO_ID,
+        "device": str(device),
+        "deployment_date": DEPLOYMENT_DATE,
+        "deployed_by": DEPLOYED_BY,
+        "current_time": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
+    }
+@app.post("/predict")
+async def predict(data: SimilarityInput):
+    """
+    Predict similarity class between two test cases for a given source class.
+    Input schema follows the specified format with source_code, test_case_1, and test_case_2.
+    Uses heuristics to detect class and method differences before using the model.
+    """
+    if model is None:
+        raise HTTPException(status_code=500, detail="Model not loaded correctly")
+    try:
+        # Apply heuristics for method and class differences
+        class_1 = data.test_case_1.target_class
+        class_2 = data.test_case_2.target_class
+        method_1 = data.test_case_1.target_method
+        method_2 = data.test_case_2.target_method
+        # Check if we can determine similarity without using the model
+        if class_1 and class_2 and class_1 != class_2:
+            logger.info(f"Heuristic detection: Different target classes - Distinct")
+            api_prediction = 3  # Distinct
+            probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
+        elif method_1 and method_2 and not set(method_1).intersection(set(method_2)):
+            logger.info(f"Heuristic detection: Different target methods - Distinct")
+            api_prediction = 3  # Distinct
+            probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
+        else:
+            # No clear heuristic match, use the model
+            # Format input to match training format
+            combined_input = (
+                f"SOURCE CODE: {data.source_code.code}\n"
+                f"TEST 1: {data.test_case_1.code}\n"
+                f"TEST 2: {data.test_case_2.code}"
+            )
+            # Tokenize input
+            inputs = tokenizer(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+            # THIS IS WHERE THE MODEL IS CALLED
+            with torch.no_grad():
+                # Our custom model
+                logits, _ = model(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"]
+                )
+            # Process results
+            probs = F.softmax(logits, dim=-1)[0].cpu().tolist()
+            model_prediction = torch.argmax(logits, dim=-1).item()
+            # Convert model prediction (0,1,2) to API prediction (1,2,3)
+            api_prediction = model_to_api_label[model_prediction]
+            logger.info(f"Model prediction: {label_to_class[api_prediction]}")
+        # Map prediction to class name
+        classification = label_to_class.get(api_prediction, "Unknown")
+        return {
+            "pair_id": data.pair_id,
+            "test_case_1_name": data.test_case_1.name,
+            "test_case_2_name": data.test_case_2.name,
+            "similarity": {
+                "score": api_prediction,
+                "classification": classification,
+            },
+            "probabilities": probs
+        }
+    except Exception as e:
+        import traceback
+        error_trace = traceback.format_exc()
+        logger.error(f"Prediction error: {str(e)}")
+        logger.error(error_trace)
+        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
+# Example endpoint
+@app.get("/example", response_model=SimilarityInput, tags=["Examples"])
+async def get_example():
+    """Get an example input to test the API"""
+    return SimilarityInput(
+        pair_id="example-1",
+        source_code=SourceCode(
+            class_name="Calculator",
+            code="class Calculator {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}"
+        ),
+        test_case_1=TestCase(
+            id="test-1",
+            test_fixture="CalculatorTest",
+            name="testAddsTwoPositiveNumbers",
+            code="TEST(CalculatorTest, AddsTwoPositiveNumbers) {\n    Calculator calc;\n    EXPECT_EQ(5, calc.add(2, 3));\n}",
+            target_class="Calculator",
+            target_method=["add"]
+        ),
+        test_case_2=TestCase(
+            id="test-2",
+            test_fixture="CalculatorTest",
+            name="testAddsTwoPositiveIntegers",
+            code="TEST(CalculatorTest, AddsTwoPositiveIntegers) {\n    Calculator calc;\n    EXPECT_EQ(5, calc.add(2, 3));\n}",
+            target_class="Calculator",
+            target_method=["add"]
+        )
+    )
+@app.get("/", tags=["Root"])
+async def root():
+    """
+    Redirect to the API documentation.
+    This is a convenience endpoint that redirects to the auto-generated docs.
+    """
+    return {
+        "message": "Test Similarity Analyzer API",
+        "documentation": "/docs",
+        "deployment_date": DEPLOYMENT_DATE,
+        "deployed_by": DEPLOYED_BY
+    }
+if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)

model_definition.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import torch
+import torch.nn as nn
+from transformers import AutoModel
+class MultitaskCodeSimilarityModel(nn.Module):
+    def __init__(self, config, tokenizer):
+        super().__init__()
+        self.config = config
+        self.tokenizer = tokenizer
+        self.encoder = AutoModel.from_config(config)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # For explanation generation
+        self.decoder_embedding = nn.Linear(config.hidden_size, config.hidden_size)
+        self.decoder = nn.GRU(
+            input_size=config.hidden_size,
+            hidden_size=config.hidden_size,
+            batch_first=True
+        )
+        self.explanation_head = nn.Linear(config.hidden_size, len(tokenizer))
+    def forward(self, input_ids, attention_mask, explanation_ids=None, explanation_mask=None):
+        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        pooled = outputs.last_hidden_state[:, 0]
+        logits = self.classifier(pooled)
+        explanation_logits = None
+        if explanation_ids is not None:
+            decoder_input = self.decoder_embedding(pooled).unsqueeze(1).expand(-1, explanation_ids.size(1), -1)
+            decoder_outputs, _ = self.decoder(decoder_input)
+            explanation_logits = self.explanation_head(decoder_outputs)
+        return logits, explanation_logits

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch>=1.10.0
+transformers>=4.18.0
+fastapi>=0.68.0
+uvicorn>=0.15.0
+pydantic>=1.8.0
+numpy>=1.20.0