Spaces:

FastestAI
/

CodeBert_Redundant_Detection_Task

Running

Habiba A. Elbehairy committed 1 day ago

Commit

a5cd505

1 Parent(s): 1306f0a

Refactor Code Similarity Classifier and update Dockerfile, README, and requirements

- Updated Dockerfile to copy application files and set CMD for uvicorn.
- Revised README title and emoji for clarity.
- Enhanced app.py with a new CodeSimilarityClassifier model and feature extraction logic.
- Improved model loading and error handling in app.py.
- Added health check and prediction endpoints with detailed logging.
- Refactored model_definition.py to define CodeSimilarityClassifier with a more powerful classification head.
- Introduced feature extraction function for better similarity detection.
- Updated requirements.txt to include necessary packages.
- Added config.json for model architecture and parameters.

Files changed (6) hide show

Dockerfile +0 -2
README.md +4 -4
app.py +221 -77
config.json +29 -0
model_definition.py +72 -27
requirements.txt +5 -6

Dockerfile CHANGED Viewed

@@ -14,5 +14,3 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]


14
15	COPY --chown=user . /app
16	CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: CodeBert Redundant Detection Task
-emoji: 🔥
-colorFrom: blue
-colorTo: purple
 sdk: docker
 pinned: false
 ---

 ---
+title: Code Similarity Classifier
+emoji: 🐨
+colorFrom: purple
+colorTo: blue
 sdk: docker
 pinned: false
 ---

app.py CHANGED Viewed

@@ -1,31 +1,37 @@
 import os
-import time
 import logging
 import torch
 import torch.nn.functional as F
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoConfig
-from model_definition import MultitaskCodeSimilarityModel
 from typing import List
 import uvicorn
 from datetime import datetime
 # Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# System information - Updated with the provided values
-DEPLOYMENT_DATE = "2025-06-10 15:11:04"  # Updated timestamp
-DEPLOYED_BY = "Fastest"
 # Get device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
-# Your Hugging Face model repository
 REPO_ID = "FastestAI/Redundant_Model"
 # Initialize FastAPI app
 app = FastAPI(
@@ -35,7 +41,7 @@ app = FastAPI(
     docs_url="/",
 )
-# Add CORS middleware to allow cross-origin requests
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -44,11 +50,8 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Define label to class mapping with CORRECT NUMBERING (1, 2, 3 instead of 0, 1, 2)
-label_to_class = {1: "Duplicate", 2: "Redundant", 3: "Distinct"}
-# Model output to API label mapping (if your model outputs 0, 1, 2 but we want 1, 2, 3)
-model_to_api_label = {0: 1, 1: 2, 2: 3}
 # Define input models for API
 class SourceCode(BaseModel):
@@ -69,59 +72,198 @@ class SimilarityInput(BaseModel):
     test_case_1: TestCase
     test_case_2: TestCase
 # Global variables for model and tokenizer
-model = None
 tokenizer = None
 # Load model and tokenizer on startup
 @app.on_event("startup")
 async def startup_event():
-    global model, tokenizer
     try:
-        logger.info(f"Loading model and tokenizer from {REPO_ID}...")
-        # Load tokenizer directly from Hugging Face
-        tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
-        # Load config from Hugging Face
-        config = AutoConfig.from_pretrained(REPO_ID)
-        # Create model instance using imported MultitaskCodeSimilarityModel class
-        model = MultitaskCodeSimilarityModel(config, tokenizer)
-        # Load weights directly from Hugging Face
-        state_dict = torch.hub.load_state_dict_from_url(
-            f"https://huggingface.co/{REPO_ID}/resolve/main/pytorch_model.bin",
-            map_location=device,
-            check_hash=False
-        )
-        model.load_state_dict(state_dict)
         # Move model to device and set to evaluation mode
         model.to(device)
         model.eval()
-        logger.info("Model and tokenizer loaded successfully!")
     except Exception as e:
-        logger.error(f"Error loading model: {e}")
         import traceback
         logger.error(traceback.format_exc())
         model = None
         tokenizer = None
-@app.get("/health", tags=["Health"])
 async def health_check():
     """Health check endpoint that also returns deployment information"""
-    if model is None or tokenizer is None:
-        return {
-            "status": "error",
-            "message": "Model or tokenizer not loaded",
-            "deployment_date": DEPLOYMENT_DATE,
-            "deployed_by": DEPLOYED_BY
-        }
     return {
-        "status": "ok",
         "model": REPO_ID,
         "device": str(device),
         "deployment_date": DEPLOYMENT_DATE,
@@ -133,11 +275,8 @@ async def health_check():
 async def predict(data: SimilarityInput):
     """
     Predict similarity class between two test cases for a given source class.
-    Input schema follows the specified format with source_code, test_case_1, and test_case_2.
-    Uses heuristics to detect class and method differences before using the model.
     """
-    if model is None:
         raise HTTPException(status_code=500, detail="Model not loaded correctly")
     try:
@@ -150,28 +289,37 @@ async def predict(data: SimilarityInput):
         # Check if we can determine similarity without using the model
         if class_1 and class_2 and class_1 != class_2:
             logger.info(f"Heuristic detection: Different target classes - Distinct")
-            api_prediction = 3  # Distinct
             probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
         elif method_1 and method_2 and not set(method_1).intersection(set(method_2)):
             logger.info(f"Heuristic detection: Different target methods - Distinct")
-            api_prediction = 3  # Distinct
             probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
         else:
             # No clear heuristic match, use the model
-            # Format input to match training format
-            combined_input = (
-                f"SOURCE CODE: {data.source_code.code}\n"
-                f"TEST 1: {data.test_case_1.code}\n"
-                f"TEST 2: {data.test_case_2.code}"
             )
             # Tokenize input
-            inputs = tokenizer(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
-            # THIS IS WHERE THE MODEL IS CALLED
             with torch.no_grad():
-                # Our custom model
-                logits, _ = model(
                     input_ids=inputs["input_ids"],
                     attention_mask=inputs["attention_mask"]
                 )
@@ -179,20 +327,20 @@ async def predict(data: SimilarityInput):
             # Process results
             probs = F.softmax(logits, dim=-1)[0].cpu().tolist()
             model_prediction = torch.argmax(logits, dim=-1).item()
-            # Convert model prediction (0,1,2) to API prediction (1,2,3)
-            api_prediction = model_to_api_label[model_prediction]
-            logger.info(f"Model prediction: {label_to_class[api_prediction]}")
         # Map prediction to class name
-        classification = label_to_class.get(api_prediction, "Unknown")
         return {
             "pair_id": data.pair_id,
             "test_case_1_name": data.test_case_1.name,
             "test_case_2_name": data.test_case_2.name,
             "similarity": {
-                "score": api_prediction,
                 "classification": classification,
             },
             "probabilities": probs
@@ -205,8 +353,17 @@ async def predict(data: SimilarityInput):
         logger.error(error_trace)
         raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
-# Example endpoint
-@app.get("/example", response_model=SimilarityInput, tags=["Examples"])
 async def get_example():
     """Get an example input to test the API"""
     return SimilarityInput(
@@ -233,18 +390,5 @@ async def get_example():
         )
     )
-@app.get("/", tags=["Root"])
-async def root():
-    """
-    Redirect to the API documentation.
-    This is a convenience endpoint that redirects to the auto-generated docs.
-    """
-    return {
-        "message": "Test Similarity Analyzer API",
-        "documentation": "/docs",
-        "deployment_date": DEPLOYMENT_DATE,
-        "deployed_by": DEPLOYED_BY
-    }
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)

 import os
 import logging
 import torch
 import torch.nn.functional as F
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import List
 import uvicorn
 from datetime import datetime
+from transformers import AutoTokenizer, AutoModel
+import requests
+import re
+import tempfile
 # Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler()]
+)
 logger = logging.getLogger(__name__)
+# System information - with your current values
+DEPLOYMENT_DATE = "2025-06-22 22:15:13"
+DEPLOYED_BY = "FASTESTAI"
 # Get device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
+# HuggingFace model repository path just for weights file
 REPO_ID = "FastestAI/Redundant_Model"
+MODEL_WEIGHTS_URL = f"https://huggingface.co/{REPO_ID}/resolve/main/pytorch_model.bin"
 # Initialize FastAPI app
 app = FastAPI(
     docs_url="/",
 )
+# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Define label to class mapping
+label_to_class = {0: "Duplicate", 1: "Redundant", 2: "Distinct"}
 # Define input models for API
 class SourceCode(BaseModel):
     test_case_1: TestCase
     test_case_2: TestCase
+# Define the model class
class CodeSimilarityClassifier(torch.nn.Module):
    """Pretrained transformer encoder followed by an MLP classification head.

    The head maps the encoder's pooled output through two
    Linear -> LayerNorm -> GELU -> Dropout stages and a final Linear layer
    that produces one logit per label.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes (size of the logits' last dim).
        dropout: Dropout probability used inside the head. The default
            reproduces the previously hard-coded value.
        head_hidden_size: Width of the second head stage. The default
            reproduces the previously hard-coded value.
    """

    def __init__(self, model_name="microsoft/codebert-base", num_labels=3,
                 dropout=0.1, head_hidden_size=512):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # NOTE(review): this layer is never applied in forward(). It owns no
        # parameters, so it does not affect state-dict loading; it is kept so
        # the module's attributes stay unchanged.
        self.dropout = torch.nn.Dropout(dropout)
        hidden_size = self.encoder.config.hidden_size
        # The layer order fixes the state-dict keys (classifier.0, classifier.1,
        # ...), so it must not change if existing checkpoints are to load.
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.LayerNorm(hidden_size),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, head_hidden_size),
            torch.nn.LayerNorm(head_hidden_size),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(head_hidden_size, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a tokenized batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.

        Returns:
            The output of the classification head applied to the encoder's
            ``pooler_output``.
        """
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        return self.classifier(outputs.pooler_output)
# Compiled once at import time rather than looked up on every call.
_FIXTURE_RE = re.compile(r'TEST(?:_F)?\s*\(\s*(\w+)')
_TEST_NAME_RE = re.compile(r'TEST(?:_F)?\s*\(\s*\w+\s*,\s*(\w+)')
_ASSERTION_RE = re.compile(r'(EXPECT_|ASSERT_)(\w+)')
_CALL_RE = re.compile(r'(\w+)\s*\(')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ""."""
    match = pattern.search(text)
    return match.group(1) if match else ""


def extract_features(source_code: str, test_code_1: str, test_code_2: str) -> str:
    """Build the one-line metadata header that is prepended to the model input.

    The header compares two test cases written with ``TEST``/``TEST_F``
    macros: fixture names, test names, the number of distinct
    ``EXPECT_*``/``ASSERT_*`` macros both tests use, the number of distinct
    called identifiers both tests share, and an assertion ratio.

    Args:
        source_code: Code under test. Not inspected; accepted so the
            signature stays stable for callers.
        test_code_1: Source text of the first test case.
        test_code_2: Source text of the second test case.

    Returns:
        A ``" | "``-separated feature string starting with ``METADATA:``.

    Note:
        The exact output format is part of the model's input, so it is
        deliberately left untouched, including two quirks: two tests with no
        detectable fixture are reported as ``SAME_FIXTURE``, and the ratio
        divides the count of *distinct* common assertions by the *total*
        number of assertion occurrences in both tests.
    """
    fixture1 = _first_group(_FIXTURE_RE, test_code_1)
    fixture2 = _first_group(_FIXTURE_RE, test_code_2)
    name1 = _first_group(_TEST_NAME_RE, test_code_1)
    name2 = _first_group(_TEST_NAME_RE, test_code_2)

    # findall yields (prefix, suffix) pairs; join them back into macro names.
    assertions1 = ["".join(parts) for parts in _ASSERTION_RE.findall(test_code_1)]
    assertions2 = ["".join(parts) for parts in _ASSERTION_RE.findall(test_code_2)]
    common_assertions = set(assertions1) & set(assertions2)
    common_calls = set(_CALL_RE.findall(test_code_1)) & set(_CALL_RE.findall(test_code_2))

    same_fixture = "SAME_FIXTURE" if fixture1 == fixture2 else "DIFFERENT_FIXTURE"

    # Both lists being non-empty already guarantees a non-zero denominator.
    # The fallback stays the int 0 so it renders as "0", not "0.0".
    if assertions1 and assertions2:
        assertion_ratio = len(common_assertions) / (len(assertions1) + len(assertions2))
    else:
        assertion_ratio = 0

    return (
        f"METADATA: {same_fixture} | "
        f"FIXTURE1: {fixture1} | FIXTURE2: {fixture2} | "
        f"NAME1: {name1} | NAME2: {name2} | "
        f"COMMON_ASSERTIONS: {len(common_assertions)} | "
        f"COMMON_CALLS: {len(common_calls)} | "
        f"ASSERTION_RATIO: {assertion_ratio}"
    )
 # Global variables for model and tokenizer
 tokenizer = None
+model = None
def download_model_weights(url, save_path):
    """Download model weights from *url* and store them at *save_path*.

    The payload is streamed into a sibling ``<save_path>.part`` file and only
    renamed onto ``save_path`` once the transfer has finished. An interrupted
    download therefore never leaves a truncated file at ``save_path``, which
    matters because the startup code treats an existing file as a valid cache.

    Args:
        url: HTTP(S) location of the weights file.
        save_path: Destination path on the local filesystem.

    Returns:
        True if the file was downloaded completely, False otherwise.
        Failures are logged and never raised.
    """
    partial_path = f"{save_path}.part"
    try:
        logger.info(f"Downloading model weights from {url}...")
        # (connect, read) timeouts: without them a stalled server would block
        # application startup indefinitely. The context manager returns the
        # connection to the pool even on early exit.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            if response.status_code != 200:
                logger.error(f"Failed to download: HTTP {response.status_code}")
                return False
            with open(partial_path, 'wb') as f:
                # 1 MiB chunks keep the per-chunk Python overhead low for a
                # multi-hundred-megabyte file.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Same-directory rename: the complete file appears in one step.
        os.replace(partial_path, save_path)
        logger.info(f"Successfully downloaded model weights to {save_path}")
        return True
    except Exception as e:
        # Deliberate best-effort boundary: the caller decides how to react.
        logger.error(f"Error downloading model weights: {e}")
        return False
    finally:
        # Only present when the transfer was interrupted; never keep it around.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial download {partial_path}: {cleanup_error}")
 # Load model and tokenizer on startup
 @app.on_event("startup")
 async def startup_event():
+    global tokenizer, model
     try:
+        logger.info("=== Starting model loading process ===")
+        # Step 1: Load the tokenizer from the base model
+        logger.info(f"Loading tokenizer from microsoft/codebert-base...")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
+            logger.info("✅ Base tokenizer loaded successfully")
+        except Exception as e:
+            logger.error(f"❌ Failed to load tokenizer: {str(e)}")
+            raise
+        # Step 2: Create model with base architecture
+        logger.info("Creating model architecture...")
+        try:
+            # Initialize with base CodeBERT
+            model = CodeSimilarityClassifier(model_name="microsoft/codebert-base")
+            logger.info("✅ Model architecture created successfully")
+        except Exception as e:
+            logger.error(f"❌ Failed to create model architecture: {str(e)}")
+            raise
+        # Step 3: Download and load weights
+        model_path = "pytorch_model.bin"
+        # First check if the file already exists
+        if not os.path.exists(model_path):
+            # Try downloading
+            if not download_model_weights(MODEL_WEIGHTS_URL, model_path):
+                logger.error("❌ Failed to download model weights")
+                raise RuntimeError("Failed to download model weights")
+        # Try to load the model weights
+        try:
+            # Check if the weights are a state dict or the whole model
+            logger.info(f"Loading weights from {model_path}...")
+            checkpoint = torch.load(model_path, map_location=device)
+            if isinstance(checkpoint, dict):
+                # If it's a state dict directly
+                if "state_dict" in checkpoint:
+                    logger.info("Loading from checkpoint['state_dict']")
+                    model.load_state_dict(checkpoint["state_dict"])
+                elif "model_state_dict" in checkpoint:
+                    logger.info("Loading from checkpoint['model_state_dict']")
+                    model.load_state_dict(checkpoint["model_state_dict"])
+                else:
+                    logger.info("Loading from checkpoint directly")
+                    model.load_state_dict(checkpoint)
+            else:
+                logger.error("❌ Unsupported model format")
+                raise RuntimeError("Unsupported model format")
+            logger.info("✅ Model weights loaded successfully")
+        except Exception as e:
+            logger.error(f"❌ Error loading model weights: {str(e)}")
+            raise
         # Move model to device and set to evaluation mode
         model.to(device)
         model.eval()
+        logger.info(f"✅ Model moved to {device} and set to evaluation mode")
+        logger.info("=== Model loading process complete ===")
     except Exception as e:
+        logger.error(f"❌ CRITICAL ERROR in startup: {str(e)}")
         import traceback
         logger.error(traceback.format_exc())
         model = None
         tokenizer = None
+@app.get("/health")
 async def health_check():
     """Health check endpoint that also returns deployment information"""
+    model_status = model is not None
+    tokenizer_status = tokenizer is not None
+    status = "ok" if (model_status and tokenizer_status) else "error"
     return {
+        "status": status,
+        "model_loaded": model_status,
+        "tokenizer_loaded": tokenizer_status,
         "model": REPO_ID,
         "device": str(device),
         "deployment_date": DEPLOYMENT_DATE,
 async def predict(data: SimilarityInput):
     """
     Predict similarity class between two test cases for a given source class.
     """
+    if model is None or tokenizer is None:
         raise HTTPException(status_code=500, detail="Model not loaded correctly")
     try:
         # Check if we can determine similarity without using the model
         if class_1 and class_2 and class_1 != class_2:
             logger.info(f"Heuristic detection: Different target classes - Distinct")
+            model_prediction = 2  # Distinct
             probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
         elif method_1 and method_2 and not set(method_1).intersection(set(method_2)):
             logger.info(f"Heuristic detection: Different target methods - Distinct")
+            model_prediction = 2  # Distinct
             probs = [0.0, 0.0, 1.0]  # 100% confidence in Distinct
         else:
             # No clear heuristic match, use the model
+            # Extract features to help with classification
+            features = extract_features(data.source_code.code, data.test_case_1.code, data.test_case_2.code)
+            # Format the input text with clear section markers as done during training
+            formatted_text = (
+                f"{features}\n\n"
+                f"SOURCE CODE:\n{data.source_code.code.strip()}\n\n"
+                f"TEST CASE 1:\n{data.test_case_1.code.strip()}\n\n"
+                f"TEST CASE 2:\n{data.test_case_2.code.strip()}"
             )
             # Tokenize input
+            inputs = tokenizer(
+                formatted_text,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=512
+            ).to(device)
+            # Model inference
             with torch.no_grad():
+                logits = model(
                     input_ids=inputs["input_ids"],
                     attention_mask=inputs["attention_mask"]
                 )
             # Process results
             probs = F.softmax(logits, dim=-1)[0].cpu().tolist()
             model_prediction = torch.argmax(logits, dim=-1).item()
+            logger.info(f"Model prediction: {label_to_class[model_prediction]}")
         # Map prediction to class name
+        classification = label_to_class.get(model_prediction, "Unknown")
+        # For API compatibility, map the model outputs (0,1,2) to API scores (1,2,3)
+        api_score = model_prediction + 1
         return {
             "pair_id": data.pair_id,
             "test_case_1_name": data.test_case_1.name,
             "test_case_2_name": data.test_case_2.name,
             "similarity": {
+                "score": api_score,
                 "classification": classification,
             },
             "probabilities": probs
         logger.error(error_trace)
         raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
+# Root and example endpoints
@app.get("/")
async def root():
    """Return basic service metadata: API name, docs location, deployment info."""
    # The app is created with docs_url="/", so the interactive documentation
    # lives at "/" and "/docs" is not served; advertise the real location.
    # NOTE(review): because the docs page is also mounted at "/", this handler
    # is presumably shadowed by it. Confirm, and consider moving one of them.
    return {
        "message": "Test Similarity Analyzer API",
        "documentation": "/",
        "deployment_date": DEPLOYMENT_DATE,
        "deployed_by": DEPLOYED_BY,
    }
+@app.get("/example", response_model=SimilarityInput)
 async def get_example():
     """Get an example input to test the API"""
     return SimilarityInput(
         )
     )
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": ["CodeSimilarityClassifier"],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 514,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 1,
+  "vocab_size": 50265,
+  "layer_norm_eps": 1e-5,
+  "pad_token_id": 1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "model_type": "codebert",
+  "problem_type": "single_label_classification",
+  "num_labels": 3,
+  "classifier_dropout": 0.1,
+  "classifier_hidden_size": 512,
+  "classifier_layers": 2,
+  "classifier_activation": "gelu",
+  "base_model_name": "microsoft/codebert-base",
+  "feature_extraction": true,
+  "deployment_date": "2025-06-22 22:17:05",
+  "deployed_by": "habibaelbehairy"
+}

model_definition.py CHANGED Viewed

@@ -1,33 +1,78 @@
 import torch
 import torch.nn as nn
 from transformers import AutoModel
-class MultitaskCodeSimilarityModel(nn.Module):
-    def __init__(self, config, tokenizer):
         super().__init__()
-        self.config = config
-        self.tokenizer = tokenizer
-        self.encoder = AutoModel.from_config(config)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        # For explanation generation
-        self.decoder_embedding = nn.Linear(config.hidden_size, config.hidden_size)
-        self.decoder = nn.GRU(
-            input_size=config.hidden_size,
-            hidden_size=config.hidden_size,
-            batch_first=True
         )
-        self.explanation_head = nn.Linear(config.hidden_size, len(tokenizer))
-    def forward(self, input_ids, attention_mask, explanation_ids=None, explanation_mask=None):
-        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
-        pooled = outputs.last_hidden_state[:, 0]
-        logits = self.classifier(pooled)
-        explanation_logits = None
-        if explanation_ids is not None:
-            decoder_input = self.decoder_embedding(pooled).unsqueeze(1).expand(-1, explanation_ids.size(1), -1)
-            decoder_outputs, _ = self.decoder(decoder_input)
-            explanation_logits = self.explanation_head(decoder_outputs)
-        return logits, explanation_logits

 import torch
 import torch.nn as nn
 from transformers import AutoModel
+import re
class CodeSimilarityClassifier(nn.Module):
    """Pretrained transformer encoder followed by an MLP classification head.

    The head maps the encoder's pooled output through two
    Linear -> LayerNorm -> GELU -> Dropout stages and a final Linear layer
    that produces one logit per label.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes (size of the logits' last dim).
        dropout: Dropout probability used inside the head. The default
            reproduces the previously hard-coded value.
        head_hidden_size: Width of the second head stage. The default
            reproduces the previously hard-coded value.
    """

    def __init__(self, model_name="microsoft/codebert-base", num_labels=3,
                 dropout=0.1, head_hidden_size=512):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # NOTE(review): this layer is never applied in forward(). It owns no
        # parameters, so it does not affect state-dict loading; it is kept so
        # the module's attributes stay unchanged.
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.encoder.config.hidden_size
        # The layer order fixes the state-dict keys (classifier.0, classifier.1,
        # ...), so it must not change if existing checkpoints are to load.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, head_hidden_size),
            nn.LayerNorm(head_hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_size, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a tokenized batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.

        Returns:
            The output of the classification head applied to the encoder's
            ``pooler_output``.
        """
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        return self.classifier(outputs.pooler_output)
# Compiled once at import time rather than looked up on every call.
_FIXTURE_RE = re.compile(r'TEST(?:_F)?\s*\(\s*(\w+)')
_TEST_NAME_RE = re.compile(r'TEST(?:_F)?\s*\(\s*\w+\s*,\s*(\w+)')
_ASSERTION_RE = re.compile(r'(EXPECT_|ASSERT_)(\w+)')
_CALL_RE = re.compile(r'(\w+)\s*\(')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ""."""
    match = pattern.search(text)
    return match.group(1) if match else ""


def extract_features(source_code: str, test_code_1: str, test_code_2: str) -> str:
    """Build the one-line metadata header describing a pair of test cases.

    The header compares two test cases written with ``TEST``/``TEST_F``
    macros: fixture names, test names, the number of distinct
    ``EXPECT_*``/``ASSERT_*`` macros both tests use, the number of distinct
    called identifiers both tests share, and an assertion ratio.

    Args:
        source_code: Code under test. Not inspected; accepted so the
            signature stays stable for callers.
        test_code_1: Source text of the first test case.
        test_code_2: Source text of the second test case.

    Returns:
        A ``" | "``-separated feature string starting with ``METADATA:``.

    Note:
        The output format is kept exactly as before, including two quirks:
        two tests with no detectable fixture are reported as
        ``SAME_FIXTURE``, and the ratio divides the count of *distinct*
        common assertions by the *total* number of assertion occurrences.
        NOTE(review): presumably the format must match what the model saw
        during training; confirm before changing either quirk.
    """
    fixture1 = _first_group(_FIXTURE_RE, test_code_1)
    fixture2 = _first_group(_FIXTURE_RE, test_code_2)
    name1 = _first_group(_TEST_NAME_RE, test_code_1)
    name2 = _first_group(_TEST_NAME_RE, test_code_2)

    # findall yields (prefix, suffix) pairs; join them back into macro names.
    assertions1 = ["".join(parts) for parts in _ASSERTION_RE.findall(test_code_1)]
    assertions2 = ["".join(parts) for parts in _ASSERTION_RE.findall(test_code_2)]
    common_assertions = set(assertions1) & set(assertions2)
    common_calls = set(_CALL_RE.findall(test_code_1)) & set(_CALL_RE.findall(test_code_2))

    same_fixture = "SAME_FIXTURE" if fixture1 == fixture2 else "DIFFERENT_FIXTURE"

    # The fallback stays the int 0 so it renders as "0", not "0.0".
    if assertions1 and assertions2:
        assertion_ratio = len(common_assertions) / (len(assertions1) + len(assertions2))
    else:
        assertion_ratio = 0

    return (
        f"METADATA: {same_fixture} | "
        f"FIXTURE1: {fixture1} | FIXTURE2: {fixture2} | "
        f"NAME1: {name1} | NAME2: {name2} | "
        f"COMMON_ASSERTIONS: {len(common_assertions)} | "
        f"COMMON_CALLS: {len(common_calls)} | "
        f"ASSERTION_RATIO: {assertion_ratio}"
    )

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
-torch>=1.10.0
-transformers>=4.18.0
-fastapi>=0.68.0
-uvicorn>=0.15.0
-pydantic>=1.8.0
-numpy>=1.20.0

+fastapi
+torch
+transformers
+uvicorn
+requests