import os
from datetime import datetime
from enum import Enum
from typing import List

from dotenv import load_dotenv

from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
from src.deepeval.summarization_task import SummarizationTask
from src.deepeval.faithfulness_task import FaithfulnessTask
from src.deepeval.toxicity_task import ToxicityTask
from src.deepeval.bias_task import BiasTaskOE
from src.deepeval.instruction_following_task import InstructionFollowingTask
from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
from src.deepeval.complex_reasoning import ComplexReasoningTask
from src.deepeval.truthfulness_task import TruthfulnessTask
from src.deepeval.nli import NLITask
from src.deepeval.math import MathTask
from src.deepeval.turkish_vocabulary import TurkishVocabularyTask
from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask
from src.deepeval.topic_detection import TopicDetectionTask
from src.deepeval.sts import STSTask
from src.deepeval.mmlu import MMLUTask
from src.deepeval.bias import BiasTask

# Load variables from a .env file (if any) before reading the environment.
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")


class Task(Enum):
    """Tasks that :class:`DeepEvalTaskManager` can run.

    The member *name* is the identifier callers pass to the manager. The
    member *value* is both the key under which the task's result is reported
    and the name of the manager method that runs the task.
    """

    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
    SUMMARIZATION = "summarization_tr"
    FAITHFULNESS = "sosyoloji_faithfulness"
    TOXICITY = "sosyoloji_toxicity"
    BIAS = "sosyoloji_bias"
    INSTRUCTION_FOLLOWING = "instruction_following_tr"
    READING_COMPREHENSION = "reading_comprehension_mc"
    READING_COMPREHENSION_OE = "reading_comp_oe"
    COMMONSENSE_REASONING = "commonsense_reasoning"
    COMPLEX_REASONING = "complex_reasoning"
    TRUTHFULNESS = "sosyoloji_truthfulness"
    NLI = "nli"
    MATH = "math"
    TURKISH_VOCABULARY = "turkish_vocabulary"
    METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
    TOPIC_DETECTION = "topic_detection"
    STS = "sts"
    MMLU = "mmlu"
    BIAS_MC = "bias"


class DeepEvalTaskManager:
    """Validate a list of requested task names and run them for one model."""

    def __init__(self, model_name, tasks: List[str]):
        """Store the model name and resolve the requested tasks.

        Args:
            model_name: Passed unchanged to every task class constructor.
            tasks: ``Task`` member names (e.g. ``"TOXICITY"``) to run.

        Raises:
            ValueError: If any entry of ``tasks`` is not a ``Task`` member name.
        """
        self.model_name = model_name
        # Map each Task member name to the bound method named by its value.
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Validate user tasks and return their method references.

        Args:
            user_tasks: Iterable of ``Task`` member names.

        Returns:
            Dict mapping each requested task name to the bound method that
            runs it, in the order the names were given.

        Raises:
            ValueError: If one or more names are not available tasks.
        """
        print(self.available_tasks.keys())
        print(user_tasks)

        # Raise directly: wrapping this in a broad try/except would swallow
        # the ValueError and let the lookup below fail with a bare KeyError.
        invalid_tasks = set(user_tasks) - self.available_tasks.keys()
        if invalid_tasks:
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")

        # Store actual method references instead of strings.
        return {task: self.available_tasks[task] for task in user_tasks}

    def run_tasks(self):
        """Execute validated tasks in order.

        A task that raises is reported on stdout and skipped so the
        remaining tasks still run.

        Returns:
            Dict mapping each completed task's ``Task`` value to the result
            returned by its method. Failed tasks have no entry.
        """
        results = {}
        total_start_time = datetime.now()

        for task_name, task_method in self.tasks_to_run.items():
            try:
                start_time = datetime.now()
                print("Running task: ", task_name)
                task_value = Task[task_name].value
                results[task_value] = task_method()  # Call the stored method reference
                # total_seconds() includes whole days; .seconds would drop them.
                elapsed = int((datetime.now() - start_time).total_seconds())
                print(f"Task {task_name} completed in {elapsed} seconds.")
            except Exception as e:
                # Deliberate best-effort: one failing task must not abort the rest.
                print(f"Error At Task: {task_name} - {e}")
                continue

        total_elapsed = int((datetime.now() - total_start_time).total_seconds())
        print(f"All tasks completed in {total_elapsed} seconds.")
        print("All tasks completed.")
        return results

    def _evaluate(self, task_cls):
        """Build ``task_cls`` with the configured model and return ``evaluate()``'s result."""
        return task_cls(self.model_name).evaluate()

    def sentiment_analysis_tr(self):
        """Run ``SentimentAnalysisTask`` and return its evaluation result."""
        return self._evaluate(SentimentAnalysisTask)

    def turkish_general_knowledge(self):
        """Run ``TurkishGeneralKnowledgeTask`` and return its evaluation result."""
        return self._evaluate(TurkishGeneralKnowledgeTask)

    def summarization_tr(self):
        """Run ``SummarizationTask`` and return its evaluation result."""
        return self._evaluate(SummarizationTask)

    def sosyoloji_faithfulness(self):
        """Run ``FaithfulnessTask`` and return its evaluation result."""
        return self._evaluate(FaithfulnessTask)

    def sosyoloji_toxicity(self):
        """Run ``ToxicityTask`` and return its evaluation result."""
        return self._evaluate(ToxicityTask)

    def sosyoloji_bias(self):
        """Run ``BiasTaskOE`` and return its evaluation result."""
        return self._evaluate(BiasTaskOE)

    def instruction_following_tr(self):
        """Run ``InstructionFollowingTask`` and return its evaluation result."""
        return self._evaluate(InstructionFollowingTask)

    def reading_comprehension_mc(self):
        """Run ``ReadingComprehensionMCTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionMCTask)

    def reading_comp_oe(self):
        """Run ``ReadingComprehensionTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionTask)

    def commonsense_reasoning(self):
        """Run ``CommonsenseReasoningTask`` and return its evaluation result."""
        return self._evaluate(CommonsenseReasoningTask)

    def complex_reasoning(self):
        """Run ``ComplexReasoningTask`` and return its evaluation result."""
        return self._evaluate(ComplexReasoningTask)

    def sosyoloji_truthfulness(self):
        """Run ``TruthfulnessTask`` and return its evaluation result."""
        return self._evaluate(TruthfulnessTask)

    def nli(self):
        """Run ``NLITask`` and return its evaluation result."""
        return self._evaluate(NLITask)

    def math(self):
        """Run ``MathTask`` and return its evaluation result."""
        return self._evaluate(MathTask)

    def turkish_vocabulary(self):
        """Run ``TurkishVocabularyTask`` and return its evaluation result."""
        return self._evaluate(TurkishVocabularyTask)

    def metaphors_and_idioms(self):
        """Run ``MetaphorsAndIdiomsTask`` and return its evaluation result."""
        return self._evaluate(MetaphorsAndIdiomsTask)

    def topic_detection(self):
        """Run ``TopicDetectionTask`` and return its evaluation result."""
        return self._evaluate(TopicDetectionTask)

    def sts(self):
        """Run ``STSTask`` and return its evaluation result."""
        return self._evaluate(STSTask)

    def mmlu(self):
        """Run ``MMLUTask`` and return its evaluation result."""
        return self._evaluate(MMLUTask)

    def bias(self):
        """Run ``BiasTask`` and return its evaluation result."""
        return self._evaluate(BiasTask)


if __name__ == "__main__":
    des = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    res = des.run_tasks()
    print(res)