Merge branch 'main' into deneme
Changed files:
- src/deepeval/base_task.py +1 -1
- src/deepeval/commonsense_reasoning_task.py +86 -0
- src/deepeval/complex_reasoning.py +63 -0
- src/deepeval/deepeval_task_manager.py +139 -111
- src/deepeval/nli.py +77 -0
- src/deepeval/reading_comp_mc.py +79 -0
- src/deepeval/sentiment_analysis_task.py +1 -0
- src/deepeval/turkish_general_knowledge_task.py +1 -1
- svc/router.py +4 -1
src/deepeval/base_task.py
CHANGED
@@ -178,7 +178,7 @@ class BaseTask(ABC):
         ]
         allowed_special_tokens = self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
         return allowed_special_tokens
-
+
     @abstractmethod
     def load_dataset_from_hf(self):
         """
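For context on the unchanged lines in this hunk: a minimal, hedged sketch of what apply_chat_template(..., tokenize=True) returns, assuming self.tokenizer is a Hugging Face AutoTokenizer. The contents of allowed_token_chat are not visible in this diff, so the placeholder turns below are hypothetical.

import os
from transformers import AutoTokenizer

# Checkpoint named elsewhere in this commit; it is gated, so access may be required.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", token=os.getenv("HF_TOKEN"))

allowed_token_chat = [
    {"role": "user", "content": ""},        # hypothetical placeholder turn
    {"role": "assistant", "content": "A"},  # hypothetical answer-letter turn
]

# With tokenize=True the chat template is rendered and encoded, so the result is
# a list of token IDs (including the template's special tokens) rather than a string.
allowed_special_tokens = tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
print(allowed_special_tokens)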
src/deepeval/commonsense_reasoning_task.py
ADDED
@@ -0,0 +1,86 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any


class CommonsenseReasoningTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/commonsense", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            label = row["label"]
            choices=[row["choice1"], row["choice2"]]
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["difficulty"]
            answer = row["answer"]
            text = row["text"]
            context = row["context"]

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Type of answer:", type(answer))

            # Get answer index (starting from 0)
            if type(answer) == int:
                answer_index = answer - 1  # 1 or 2
            else:
                answer_index = int(answer) - 1
            correct_answer_letter = chr(65 + answer_index)

            # Get question based on label
            if label == "effect":
                question = "Seçeneklerden hangisi verilen önermenin bir sonucu veya etkisi olabilir?"
            elif label == "cause":
                question = "Seçeneklerden hangisi verilen önermenin bir neden veya sebebi olabilir?"
            else:
                question = "Seçeneklerden hangisi uygun?"  # Alternatif
            # Construct the prompt/message
            instruction = ""
            prompt = f"Bağlam:\n{text}\nÖnerme:\n{context}\nSoru:{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
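The new tasks all import accuracy and accuracy_standard_error from src/deepeval/utils.py, whose bodies are not part of this diff. A hedged sketch of a conventional implementation, assuming the usual sample proportion and its binomial standard error:

import math

def accuracy(correct: int, total: int) -> float:
    # Fraction of correctly answered items (assumed behaviour, not from this diff).
    return correct / total if total > 0 else 0.0

def accuracy_standard_error(acc: float, total: int) -> float:
    # Standard error of a proportion: sqrt(p * (1 - p) / n) (assumed behaviour).
    return math.sqrt(acc * (1.0 - acc) / total) if total > 0 else 0.0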
src/deepeval/complex_reasoning.py
ADDED
@@ -0,0 +1,63 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class ComplexReasoningTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/complex-ales", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        correct_answers = []

        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            narrative = row["narrative"]
            question = row["question"]
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            correct_answer_letter = row["answer_choice"]
            correct_answers.append(correct_answer_letter)

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))


            # Construct the prompt/message
            instruction = ""
            prompt = f"Soru:\n{narrative}\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

            if correct_answer_letter == model_answer_cleaned:
                true += 1
            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

        print("Answers:", correct_answers)
        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
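A quick illustration of the ast.literal_eval call used above, with a made-up cell value: the dataset stores the choices column as a string that looks like a Python list, and literal_eval safely parses it back into a real list without executing arbitrary code (unlike eval).

import ast

row_choices = '["3", "5", "7", "9", "11"]'   # hypothetical cell value
choices = ast.literal_eval(row_choices)
print(choices[0], type(choices))              # 3 <class 'list'>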
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -1,112 +1,140 @@
(The 111 lines removed by this commit are truncated in the page extract and are not reproduced; the new file contents follow.)

+import os
+from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
+from dotenv import load_dotenv
+from enum import Enum
+from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
+from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
+from src.deepeval.summarization_task import SummarizationTask
+from src.deepeval.faithfulness_task import FaithfulnessTask
+from src.deepeval.toxicity_task import ToxicityTask
+from src.deepeval.bias_task import BiasTask
+from src.deepeval.instruction_following_task import InstructionFollowingTask
+from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
+from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
+from src.deepeval.complex_reasoning import ComplexReasoningTask
+from src.deepeval.truthfulness_task import TruthfulnessTask
+from src.deepeval.nli import NLITask
+from typing import List
+load_dotenv()
+
+openai_configs = {
+    'OPENAI_API_KEY': 'OPENAI_KEY'
+}
+os.environ['OPENAI_API_KEY'] = openai_configs['OPENAI_API_KEY']
+
+HF_TOKEN=os.getenv("HF_TOKEN")
+
+class Task(Enum):
+    # SUMMARIZATION = "summarization"
+    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
+    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
+    SUMMARIZATION = "summarization_tr"
+    FAITHFULNESS = "sosyoloji_faithfulness"
+    TOXICITY = "sosyoloji_toxicity"
+    BIAS = "sosyoloji_bias"
+    INSTRUCTION_FOLLOWING = "instruction_following_tr"
+    READING_COMPREHENSION = "reading_comprehension_mc"
+    READING_COMPREHENSION_OE = "reading_comp_oe"
+    COMMONSENSE_REASONING = "commonsense_reasoning"
+    COMPLEX_REASONING = "complex_reasoning"
+    TRUTHFULNESS = "sosyoloji_truthfulness"
+    NLI = "nli"
+
+
+class DeepEvalTaskManager:
+    def __init__(self, model_name, tasks: List[str]):
+        self.model_name = model_name
+        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
+        self.tasks_to_run = self.validate_tasks(tasks)
+
+    def validate_tasks(self, user_tasks):
+        """Validate user tasks and store method references."""
+        print(self.available_tasks.keys())
+        print(user_tasks)
+        if not set(user_tasks).issubset(self.available_tasks.keys()):
+            invalid_tasks = set(user_tasks) - self.available_tasks.keys()
+            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
+
+        # Store actual method references instead of strings
+        return {task : self.available_tasks[task] for task in user_tasks}
+
+    def run_tasks(self):
+        """Execute validated tasks in order."""
+        results = {}
+        for task_name, task_method in self.tasks_to_run.items():
+            print("Running task: ", task_name)
+            task_enum = getattr(Task, task_name)
+            task_value = task_enum.value
+            results[task_value] = task_method()  # Call the stored method reference
+
+        return results
+
+    def sentiment_analysis_tr(self):
+        st_task = SentimentAnalysisTask(self.model_name)
+        res = st_task.evaluate()
+        return res
+
+    def turkish_general_knowledge(self):
+        turkish_general_knowledge_task = TurkishGeneralKnowledgeTask(self.model_name)
+        res = turkish_general_knowledge_task.evaluate()
+        return res
+
+    def summarization_tr(self):
+        summarization_task = SummarizationTask(self.model_name)
+        res = summarization_task.evaluate()
+        return res
+
+    def sosyoloji_faithfulness(self):
+        faithfulness_task = FaithfulnessTask(self.model_name)
+        res = faithfulness_task.evaluate()
+        return res
+
+    def sosyoloji_toxicity(self):
+        toxicity_task = ToxicityTask(self.model_name)
+        res = toxicity_task.evaluate()
+        return res
+
+    def sosyoloji_bias(self):
+        bias_task = BiasTask(self.model_name)
+        res = bias_task.evaluate()
+        return res
+
+    def instruction_following_tr(self):
+        instruction_following_task = InstructionFollowingTask(self.model_name)
+        res = instruction_following_task.evaluate()
+        return res
+
+    def reading_comprehension_mc(self):
+        reading_comprehension_mc_task = ReadingComprehensionMCTask(self.model_name)
+        res = reading_comprehension_mc_task.evaluate()
+        return res
+
+    def reading_comp_oe(self):
+        reading_comprehension_task = ReadingComprehensionTask(self.model_name)
+        res = reading_comprehension_task.evaluate()
+        return res
+
+    def commonsense_reasoning(self):
+        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
+        res = commonsense_reasoning_task.evaluate()
+        return res
+
+    def complex_reasoning(self):
+        complex_reasoning_task = ComplexReasoningTask(self.model_name)
+        res = complex_reasoning_task.evaluate()
+        return res
+
+    def sosyoloji_truthfulness(self):
+        truthfulness_task = TruthfulnessTask(self.model_name)
+        res = truthfulness_task.evaluate()
+        return res
+
+    def nli(self):
+        nli_task = NLITask(self.model_name)
+        res = nli_task.evaluate()
+        return res
+
+if __name__ == "__main__":
+    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["COMPLEX_REASONING","NLI"])
+    res = des.run_tasks()
     print(res)
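A small illustration of the dispatch pattern used by DeepEvalTaskManager: the enum *name* (e.g. "COMPLEX_REASONING") is what callers pass in, while the enum *value* (e.g. "complex_reasoning") must match a method name on the manager, because available_tasks is built with getattr(self, task.value). The two stub methods below are stand-ins, not the real tasks.

from enum import Enum

class Task(Enum):
    COMPLEX_REASONING = "complex_reasoning"
    NLI = "nli"

class Manager:
    def __init__(self):
        # Maps enum names to bound methods named after the enum values.
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}

    def complex_reasoning(self):
        return {"acc": 0.5}   # stand-in result

    def nli(self):
        return {"acc": 0.5}   # stand-in result

m = Manager()
print(m.available_tasks["NLI"]())   # calls Manager.nli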
src/deepeval/nli.py
ADDED
@@ -0,0 +1,77 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any


class NLITask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/nli_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            text = row["text"]
            premise = row["premise"]
            hypothesis = row["hypothesis"]
            label = row["label"].lower().replace(' ','')
            choices=["entailment","contradiction","neutral"]
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["difficulty"]
            correct_answer_letter = "A" if label == "entailment" else \
                                    "B" if label == "contradiction" else \
                                    "C" if label == "neutral" else None


            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Label:", label)

            # Construct the prompt/message
            instruction = ""
            question = "Yukarıdaki cümleler arasındaki ilişki “entailment” (bir cümle diğerini ima eder), “neutral (cümleler birbirini ima etmez ve çelişmez) veya “contradiction (cümleler birbirleriyle çelişir) olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
            context = f"Bağlam:\n{text}\n"  # can add to prompt if needed
            prompt = f"Cümle1:\n{premise}\nCümle2:{hypothesis}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
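Not part of the commit: an equivalent way to express the label-to-letter mapping used in NLITask.evaluate, shown only to make the intended mapping explicit (A=entailment, B=contradiction, C=neutral).

LABEL_TO_LETTER = {"entailment": "A", "contradiction": "B", "neutral": "C"}

def letter_for(label: str):
    # Mirrors the chained conditional above: unknown labels map to None.
    return LABEL_TO_LETTER.get(label.lower().replace(' ', ''))

print(letter_for("Entailment"))   # "A"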
src/deepeval/reading_comp_mc.py
ADDED
@@ -0,0 +1,79 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class ReadingComprehensionMCTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/reading_comp_mc", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["difficulty"].lower().replace(' ','')
            answer = row["answer"]
            text = row["text"]
            question_about_the_text = row["question_about_the_text"]

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Type of answer:", type(answer))

            # Get answer index (starting from 0)
            if type(answer) == int:
                answer_index = answer
            else:
                answer_index = int(answer)
            correct_answer_letter = chr(65 + answer_index)


            # Construct the prompt/message
            instruction = ""
            prompt = f"Paragraf:\n{text}\nSoru:{question_about_the_text}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
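A quick illustration (with a made-up model output) of the answer-cleaning chain shared by these tasks: whitespace, newlines and colons are stripped and the remainder upper-cased before comparison with the answer letter.

model_answer = " b:\n"                               # hypothetical raw generation
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
print(model_answer_cleaned)                          # "B"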
src/deepeval/sentiment_analysis_task.py
CHANGED
@@ -7,6 +7,7 @@ class SentimentAnalysisTask(BaseTask):
         super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
 
     def load_dataset_from_hf(self):
+        print("Loading the dataset")
         dataset = super().load_dataset_from_hf()
         return dataset.select(range(min(10, len(dataset))))
 
src/deepeval/turkish_general_knowledge_task.py
CHANGED
@@ -9,7 +9,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(
+        return dataset.select(range(min(1, len(dataset))))
 
     def evaluate(self):
         responses = []
svc/router.py
CHANGED
@@ -43,6 +43,10 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
 async def protected_route(username: str = Depends(get_current_user)):
     return {"message": f"Hello, {username}! This is a protected resource."}
 
+@router.get("/deepeval/status")
+async def deep_eval_status():
+    #Return running with 200 status code
+    return {"status": "running"}
 
 @router.post("/chat", response_model=TaskResponse)
 def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):

@@ -78,7 +82,6 @@ def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
     return TaskResponse(results=dumped)
 
 
-
 @router.post("/deepeval/eval", response_model=TaskResponse)
 async def deep_eval_suite(request: DeepEvalSuiteRequest):
     des = DeepEvalTaskManager(request.model_name, request.tasks)