# Spaces: Running on L4
import os | |
from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask | |
from dotenv import load_dotenv | |
from enum import Enum | |
from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask | |
from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask | |
from src.deepeval.summarization_task import SummarizationTask | |
from src.deepeval.faithfulness_task import FaithfulnessTask | |
from src.deepeval.toxicity_task import ToxicityTask | |
from src.deepeval.bias_task import BiasTaskOE | |
from src.deepeval.instruction_following_task import InstructionFollowingTask | |
from src.deepeval.reading_comprehension_task import ReadingComprehensionTask | |
from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask | |
from src.deepeval.complex_reasoning import ComplexReasoningTask | |
from src.deepeval.truthfulness_task import TruthfulnessTask | |
from src.deepeval.nli import NLITask | |
from src.deepeval.math import MathTask | |
from src.deepeval.turkish_vocabulary import TurkishVocabularyTask | |
from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask | |
from src.deepeval.topic_detection import TopicDetectionTask | |
from src.deepeval.sts import STSTask | |
from src.deepeval.mmlu import MMLUTask | |
from src.deepeval.bias import BiasTask | |
from typing import List | |
from datetime import datetime | |
# Call load_dotenv() first so the environment lookup below happens after it.
load_dotenv()

# Value of the HF_TOKEN environment variable, or None when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
class Task(Enum):
    """Benchmark tasks that can be requested by name.

    Each member's *name* is the identifier callers pass in, and each member's
    *value* is the name of the ``DeepEvalTaskManager`` method that runs it
    (it is also used as the key in the results dictionary).
    """

    # SUMMARIZATION = "summarization"  (disabled; "summarization_tr" is used below)
    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
    SUMMARIZATION = "summarization_tr"

    # "sosyoloji_*" tasks and instruction following.
    FAITHFULNESS = "sosyoloji_faithfulness"
    TOXICITY = "sosyoloji_toxicity"
    BIAS = "sosyoloji_bias"
    INSTRUCTION_FOLLOWING = "instruction_following_tr"

    # Reading comprehension: multiple-choice ("mc") and "oe" variants.
    READING_COMPREHENSION = "reading_comprehension_mc"
    READING_COMPREHENSION_OE = "reading_comp_oe"

    # Reasoning, truthfulness and inference.
    COMMONSENSE_REASONING = "commonsense_reasoning"
    COMPLEX_REASONING = "complex_reasoning"
    TRUTHFULNESS = "sosyoloji_truthfulness"
    NLI = "nli"
    MATH = "math"

    # Language-specific and remaining tasks.
    TURKISH_VOCABULARY = "turkish_vocabulary"
    METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
    TOPIC_DETECTION = "topic_detection"
    STS = "sts"
    MMLU = "mmlu"
    BIAS_MC = "bias"
class DeepEvalTaskManager:
    """Resolve requested task names to runner methods and execute them.

    Callers pass ``Task`` member *names* (e.g. ``"TOXICITY"``). Each member's
    *value* must be the name of a method on this class; that method builds the
    corresponding task object with ``model_name`` and returns ``evaluate()``.
    """

    def __init__(self, model_name, tasks: List[str]):
        """Build the task registry and validate the requested tasks.

        Args:
            model_name: Passed unchanged to every task class constructor.
            tasks: ``Task`` member names to run, in the order they should run.

        Raises:
            ValueError: If ``tasks`` contains a name that is not a ``Task`` member.
            AttributeError: If a ``Task`` value has no matching method here.
        """
        self.model_name = model_name
        # Keyed by enum member name; values are bound methods found via the enum value.
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Validate user tasks and return their method references.

        Args:
            user_tasks: Iterable of task names; each must be a key of
                ``self.available_tasks``.

        Returns:
            Dict mapping each requested name to its bound runner method, in
            the order given by ``user_tasks``.

        Raises:
            ValueError: If any requested name is unknown. The error is raised
                to the caller rather than printed, so an invalid request fails
                with a message naming the offending task(s) instead of a bare
                ``KeyError`` from the lookup below.
        """
        print(self.available_tasks.keys())
        print(user_tasks)

        invalid_tasks = set(user_tasks) - self.available_tasks.keys()
        if invalid_tasks:
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")

        # Store actual method references instead of strings.
        return {task: self.available_tasks[task] for task in user_tasks}

    def run_tasks(self):
        """Execute validated tasks in order.

        A task that raises is reported on stdout and skipped, so one failing
        task does not prevent the remaining ones from running.

        Returns:
            Dict mapping each successfully completed task's ``Task`` value to
            whatever its runner method returned.
        """
        results = {}
        total_start_time = datetime.now()

        for task_name, task_method in self.tasks_to_run.items():
            try:
                start_time = datetime.now()
                print("Running task: ", task_name)
                task_value = Task[task_name].value
                results[task_value] = task_method()  # Call the stored method reference
                end_time = datetime.now()
                # total_seconds() includes whole days; timedelta.seconds does not.
                elapsed = int((end_time - start_time).total_seconds())
                print(f"Task {task_name} completed in {elapsed} seconds.")
            except Exception as e:
                # Deliberate best-effort: log the failure and move on to the next task.
                print(f"Error At Task: {task_name} - {e}")
                continue

        total_end_time = datetime.now()
        total_elapsed = int((total_end_time - total_start_time).total_seconds())
        print(f"All tasks completed in {total_elapsed} seconds.")
        print("All tasks completed.")
        return results

    def _evaluate(self, task_cls):
        """Instantiate ``task_cls`` with ``self.model_name`` and return its ``evaluate()`` result."""
        return task_cls(self.model_name).evaluate()

    def sentiment_analysis_tr(self):
        """Run ``SentimentAnalysisTask`` and return its evaluation result."""
        return self._evaluate(SentimentAnalysisTask)

    def turkish_general_knowledge(self):
        """Run ``TurkishGeneralKnowledgeTask`` and return its evaluation result."""
        return self._evaluate(TurkishGeneralKnowledgeTask)

    def summarization_tr(self):
        """Run ``SummarizationTask`` and return its evaluation result."""
        return self._evaluate(SummarizationTask)

    def sosyoloji_faithfulness(self):
        """Run ``FaithfulnessTask`` and return its evaluation result."""
        return self._evaluate(FaithfulnessTask)

    def sosyoloji_toxicity(self):
        """Run ``ToxicityTask`` and return its evaluation result."""
        return self._evaluate(ToxicityTask)

    def sosyoloji_bias(self):
        """Run ``BiasTaskOE`` and return its evaluation result."""
        return self._evaluate(BiasTaskOE)

    def instruction_following_tr(self):
        """Run ``InstructionFollowingTask`` and return its evaluation result."""
        return self._evaluate(InstructionFollowingTask)

    def reading_comprehension_mc(self):
        """Run ``ReadingComprehensionMCTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionMCTask)

    def reading_comp_oe(self):
        """Run ``ReadingComprehensionTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionTask)

    def commonsense_reasoning(self):
        """Run ``CommonsenseReasoningTask`` and return its evaluation result."""
        return self._evaluate(CommonsenseReasoningTask)

    def complex_reasoning(self):
        """Run ``ComplexReasoningTask`` and return its evaluation result."""
        return self._evaluate(ComplexReasoningTask)

    def sosyoloji_truthfulness(self):
        """Run ``TruthfulnessTask`` and return its evaluation result."""
        return self._evaluate(TruthfulnessTask)

    def nli(self):
        """Run ``NLITask`` and return its evaluation result."""
        return self._evaluate(NLITask)

    def math(self):
        """Run ``MathTask`` and return its evaluation result."""
        return self._evaluate(MathTask)

    def turkish_vocabulary(self):
        """Run ``TurkishVocabularyTask`` and return its evaluation result."""
        return self._evaluate(TurkishVocabularyTask)

    def metaphors_and_idioms(self):
        """Run ``MetaphorsAndIdiomsTask`` and return its evaluation result."""
        return self._evaluate(MetaphorsAndIdiomsTask)

    def topic_detection(self):
        """Run ``TopicDetectionTask`` and return its evaluation result."""
        return self._evaluate(TopicDetectionTask)

    def sts(self):
        """Run ``STSTask`` and return its evaluation result."""
        return self._evaluate(STSTask)

    def mmlu(self):
        """Run ``MMLUTask`` and return its evaluation result."""
        return self._evaluate(MMLUTask)

    def bias(self):
        """Run ``BiasTask`` and return its evaluation result."""
        return self._evaluate(BiasTask)
if __name__ == "__main__":
    # Manual smoke run: evaluate two tasks on a fixed model and print the results dict.
    manager = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    print(manager.run_tasks())