model-eval-be / src /deepeval /deepeval_task_manager.py
Ahmet Kaan Sever
Distinguished bias mc and bias oe
9c25ebd
import os
from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
from dotenv import load_dotenv
from enum import Enum
from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
from src.deepeval.summarization_task import SummarizationTask
from src.deepeval.faithfulness_task import FaithfulnessTask
from src.deepeval.toxicity_task import ToxicityTask
from src.deepeval.bias_task import BiasTaskOE
from src.deepeval.instruction_following_task import InstructionFollowingTask
from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
from src.deepeval.complex_reasoning import ComplexReasoningTask
from src.deepeval.truthfulness_task import TruthfulnessTask
from src.deepeval.nli import NLITask
from src.deepeval.math import MathTask
from src.deepeval.turkish_vocabulary import TurkishVocabularyTask
from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask
from src.deepeval.topic_detection import TopicDetectionTask
from src.deepeval.sts import STSTask
from src.deepeval.mmlu import MMLUTask
from src.deepeval.bias import BiasTask
from typing import List
from datetime import datetime
# Run python-dotenv's loader before any settings are read from the environment.
load_dotenv()

# Value of the HF_TOKEN environment variable, or None when it is not set
# (presumably a Hugging Face access token -- confirm against the task modules).
HF_TOKEN = os.environ.get("HF_TOKEN")
class Task(Enum):
    """Benchmark tasks that can be requested from ``DeepEvalTaskManager``.

    Callers select a task by its member *name* (e.g. ``"TOXICITY"``). Each
    member's *value* is the name of the ``DeepEvalTaskManager`` method that
    runs the task (the manager resolves it with ``getattr``) and is also the
    key under which ``run_tasks`` stores that task's result.
    """

    # Disabled: SUMMARIZATION = "summarization"
    # (the active SUMMARIZATION member below uses "summarization_tr").
    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
    SUMMARIZATION = "summarization_tr"
    FAITHFULNESS = "sosyoloji_faithfulness"
    TOXICITY = "sosyoloji_toxicity"
    # Dispatches to the method that runs BiasTaskOE; compare BIAS_MC below.
    BIAS = "sosyoloji_bias"
    INSTRUCTION_FOLLOWING = "instruction_following_tr"
    # Dispatches to the method that runs ReadingComprehensionMCTask.
    READING_COMPREHENSION = "reading_comprehension_mc"
    # Dispatches to the method that runs ReadingComprehensionTask.
    READING_COMPREHENSION_OE = "reading_comp_oe"
    COMMONSENSE_REASONING = "commonsense_reasoning"
    COMPLEX_REASONING = "complex_reasoning"
    TRUTHFULNESS = "sosyoloji_truthfulness"
    NLI = "nli"
    MATH = "math"
    TURKISH_VOCABULARY = "turkish_vocabulary"
    METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
    TOPIC_DETECTION = "topic_detection"
    STS = "sts"
    MMLU = "mmlu"
    # Dispatches to the method that runs BiasTask
    # ("MC" presumably means multiple-choice -- TODO confirm).
    BIAS_MC = "bias"
class DeepEvalTaskManager:
    """Validate a list of requested benchmark tasks and run them for one model.

    Every ``Task`` member is bound to the method of this class whose name
    equals the member's value. A task is requested by its ``Task`` member
    name (e.g. ``"TOXICITY"``) and its result is reported under the member's
    value (e.g. ``"sosyoloji_toxicity"``).
    """

    def __init__(self, model_name, tasks: List[str]):
        """Store the model identifier and resolve the requested tasks.

        Args:
            model_name: Identifier passed unchanged to every task class.
            tasks: ``Task`` member names to run, in execution order.

        Raises:
            ValueError: If ``tasks`` contains a name that is not a ``Task``
                member.
            AttributeError: If a ``Task`` value has no matching method on
                this class.
        """
        self.model_name = model_name
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Map each requested task name to the bound method that runs it.

        Args:
            user_tasks: Iterable of ``Task`` member names.

        Returns:
            Dict of task name -> bound method in request order; a name that
            is requested more than once appears once.

        Raises:
            ValueError: If any requested name is not an available task. The
                message lists every invalid name.
        """
        print(self.available_tasks.keys())
        print(user_tasks)
        # Materialize once so a one-shot iterable is not exhausted by the
        # membership check before the mapping below is built.
        requested = list(user_tasks)
        invalid_tasks = [name for name in requested if name not in self.available_tasks]
        if invalid_tasks:
            # Fail here with the full list instead of printing and then
            # hitting a bare KeyError on the first unknown name.
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
        # Store actual method references instead of strings.
        return {name: self.available_tasks[name] for name in requested}

    def run_tasks(self):
        """Run every validated task in order and collect the results.

        A task that raises is reported on stdout and skipped, so one failing
        benchmark does not abort the remaining ones.

        Returns:
            Dict of ``Task`` value -> whatever that task's method returned.
            Tasks that raised have no entry.
        """
        results = {}
        total_start_time = datetime.now()
        for task_name, task_method in self.tasks_to_run.items():
            try:
                start_time = datetime.now()
                print("Running task: ", task_name)
                result_key = Task[task_name].value
                results[result_key] = task_method()  # Call the stored method reference
                elapsed = datetime.now() - start_time
                # total_seconds() covers the whole duration; .seconds would
                # silently drop whole days from a very long run.
                print(f"Task {task_name} completed in {int(elapsed.total_seconds())} seconds.")
            except Exception as e:
                # Deliberate best effort: report the failure and move on.
                print(f"Error At Task: {task_name} - {e}")
        total_elapsed = datetime.now() - total_start_time
        print(f"All tasks completed in {int(total_elapsed.total_seconds())} seconds.")
        print("All tasks completed.")
        return results

    def _evaluate(self, task_cls):
        """Build ``task_cls`` for this manager's model and return ``evaluate()``'s result."""
        return task_cls(self.model_name).evaluate()

    def sentiment_analysis_tr(self):
        """Run ``SentimentAnalysisTask`` and return its evaluation result."""
        return self._evaluate(SentimentAnalysisTask)

    def turkish_general_knowledge(self):
        """Run ``TurkishGeneralKnowledgeTask`` and return its evaluation result."""
        return self._evaluate(TurkishGeneralKnowledgeTask)

    def summarization_tr(self):
        """Run ``SummarizationTask`` and return its evaluation result."""
        return self._evaluate(SummarizationTask)

    def sosyoloji_faithfulness(self):
        """Run ``FaithfulnessTask`` and return its evaluation result."""
        return self._evaluate(FaithfulnessTask)

    def sosyoloji_toxicity(self):
        """Run ``ToxicityTask`` and return its evaluation result."""
        return self._evaluate(ToxicityTask)

    def sosyoloji_bias(self):
        """Run ``BiasTaskOE`` and return its evaluation result."""
        return self._evaluate(BiasTaskOE)

    def instruction_following_tr(self):
        """Run ``InstructionFollowingTask`` and return its evaluation result."""
        return self._evaluate(InstructionFollowingTask)

    def reading_comprehension_mc(self):
        """Run ``ReadingComprehensionMCTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionMCTask)

    def reading_comp_oe(self):
        """Run ``ReadingComprehensionTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionTask)

    def commonsense_reasoning(self):
        """Run ``CommonsenseReasoningTask`` and return its evaluation result."""
        return self._evaluate(CommonsenseReasoningTask)

    def complex_reasoning(self):
        """Run ``ComplexReasoningTask`` and return its evaluation result."""
        return self._evaluate(ComplexReasoningTask)

    def sosyoloji_truthfulness(self):
        """Run ``TruthfulnessTask`` and return its evaluation result."""
        return self._evaluate(TruthfulnessTask)

    def nli(self):
        """Run ``NLITask`` and return its evaluation result."""
        return self._evaluate(NLITask)

    def math(self):
        """Run ``MathTask`` and return its evaluation result."""
        return self._evaluate(MathTask)

    def turkish_vocabulary(self):
        """Run ``TurkishVocabularyTask`` and return its evaluation result."""
        return self._evaluate(TurkishVocabularyTask)

    def metaphors_and_idioms(self):
        """Run ``MetaphorsAndIdiomsTask`` and return its evaluation result."""
        return self._evaluate(MetaphorsAndIdiomsTask)

    def topic_detection(self):
        """Run ``TopicDetectionTask`` and return its evaluation result."""
        return self._evaluate(TopicDetectionTask)

    def sts(self):
        """Run ``STSTask`` and return its evaluation result."""
        return self._evaluate(STSTask)

    def mmlu(self):
        """Run ``MMLUTask`` and return its evaluation result."""
        return self._evaluate(MMLUTask)

    def bias(self):
        """Run ``BiasTask`` and return its evaluation result."""
        return self._evaluate(BiasTask)
if __name__ == "__main__":
    # Manual run: evaluate the TOXICITY and BIAS tasks for one fixed model
    # and print the collected results.
    manager = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    print(manager.run_tasks())