Spaces:
Paused
Paused
File size: 7,356 Bytes
51ae401 c5ce1aa 51ae401 7b3d3a5 c5ce1aa 9c25ebd c5ce1aa 33d2454 1a88171 33d2454 57be006 51ae401 66a11b3 51ae401 c88093d 76d5f6d c5ce1aa a18754a 57be006 51ae401 597b990 51ae401 76d5f6d ea93485 51ae401 66a11b3 51ae401 8930e56 66a11b3 8930e56 66a11b3 8a3d32e 8930e56 66a11b3 8a3d32e 8930e56 51ae401 c88093d 51ae401 76d5f6d 51ae401 c5ce1aa 1a88171 c5ce1aa 1a88171 c5ce1aa 1a88171 c5ce1aa 1a88171 9c25ebd 1a88171 c5ce1aa 1a88171 33d2454 1a88171 33d2454 1a88171 33d2454 1a88171 33d2454 f869c66 57be006 51ae401 1657c25 51ae401 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
import os
from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
from dotenv import load_dotenv
from enum import Enum
from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
from src.deepeval.summarization_task import SummarizationTask
from src.deepeval.faithfulness_task import FaithfulnessTask
from src.deepeval.toxicity_task import ToxicityTask
from src.deepeval.bias_task import BiasTaskOE
from src.deepeval.instruction_following_task import InstructionFollowingTask
from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
from src.deepeval.complex_reasoning import ComplexReasoningTask
from src.deepeval.truthfulness_task import TruthfulnessTask
from src.deepeval.nli import NLITask
from src.deepeval.math import MathTask
from src.deepeval.turkish_vocabulary import TurkishVocabularyTask
from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask
from src.deepeval.topic_detection import TopicDetectionTask
from src.deepeval.sts import STSTask
from src.deepeval.mmlu import MMLUTask
from src.deepeval.bias import BiasTask
from typing import List
from datetime import datetime
# Populate the process environment via python-dotenv before reading the token.
load_dotenv()

# Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
class Task(Enum):
    """Benchmark tasks that callers request by member name.

    Each member's value is the name of the ``DeepEvalTaskManager`` method that
    runs the task; the manager also uses that value as the key under which the
    task's result is reported.
    """

    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
    SUMMARIZATION = "summarization_tr"
    FAITHFULNESS = "sosyoloji_faithfulness"
    TOXICITY = "sosyoloji_toxicity"
    BIAS = "sosyoloji_bias"
    INSTRUCTION_FOLLOWING = "instruction_following_tr"
    READING_COMPREHENSION = "reading_comprehension_mc"
    READING_COMPREHENSION_OE = "reading_comp_oe"
    COMMONSENSE_REASONING = "commonsense_reasoning"
    COMPLEX_REASONING = "complex_reasoning"
    TRUTHFULNESS = "sosyoloji_truthfulness"
    NLI = "nli"
    MATH = "math"
    TURKISH_VOCABULARY = "turkish_vocabulary"
    METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
    TOPIC_DETECTION = "topic_detection"
    STS = "sts"
    MMLU = "mmlu"
    BIAS_MC = "bias"
class DeepEvalTaskManager:
    """Select and run DeepEval benchmark tasks for a single model.

    Tasks are requested by ``Task`` member name (for example ``"TOXICITY"``).
    Each member's value names the method on this class that runs the task and
    is also the key under which that task's result is returned.
    """

    def __init__(self, model_name, tasks: List[str]):
        """Bind every known task to its runner and validate the request.

        Args:
            model_name: Model identifier passed unchanged to each task class.
            tasks: ``Task`` member names to run, in the desired order.

        Raises:
            ValueError: If ``tasks`` contains a name that is not a ``Task``
                member.
        """
        self.model_name = model_name
        # Task member name -> bound method named after the member's value.
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Check the requested task names and map them to their runners.

        Args:
            user_tasks: Iterable of task names to validate.

        Returns:
            Dict mapping each requested name to its bound runner method, in
            request order (duplicate names collapse to one entry).

        Raises:
            ValueError: If any requested name is not in ``available_tasks``.
        """
        print(self.available_tasks.keys())
        print(user_tasks)

        # Fail fast with a descriptive error. Previously this error was raised
        # and immediately swallowed, so unknown names surfaced later as a bare
        # KeyError from the lookup below.
        invalid_tasks = set(user_tasks) - self.available_tasks.keys()
        if invalid_tasks:
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")

        # Store actual method references instead of strings.
        return {task: self.available_tasks[task] for task in user_tasks}

    def run_tasks(self):
        """Execute the validated tasks in order and collect their results.

        A task that raises is reported on stdout and skipped; the remaining
        tasks still run.

        Returns:
            Dict mapping each completed task's ``Task`` value to whatever its
            runner returned. Failed tasks have no entry.
        """
        results = {}
        total_start_time = datetime.now()

        for task_name, task_method in self.tasks_to_run.items():
            try:
                start_time = datetime.now()
                print("Running task: ", task_name)
                result_key = Task[task_name].value
                results[result_key] = task_method()  # Call the stored method reference
                # total_seconds() keeps whole days; timedelta.seconds would
                # wrap around for runs of 24 hours or more.
                elapsed = int((datetime.now() - start_time).total_seconds())
                print(f"Task {task_name} completed in {elapsed} seconds.")
            except Exception as e:
                # Best effort: one failing benchmark must not abort the rest.
                print(f"Error At Task: {task_name} - {e}")
                continue

        total_elapsed = int((datetime.now() - total_start_time).total_seconds())
        print(f"All tasks completed in {total_elapsed} seconds.")
        print("All tasks completed.")
        return results

    def _evaluate(self, task_cls):
        """Build ``task_cls`` for this manager's model and return ``evaluate()``'s result."""
        return task_cls(self.model_name).evaluate()

    def sentiment_analysis_tr(self):
        """Run ``SentimentAnalysisTask`` and return its evaluation result."""
        return self._evaluate(SentimentAnalysisTask)

    def turkish_general_knowledge(self):
        """Run ``TurkishGeneralKnowledgeTask`` and return its evaluation result."""
        return self._evaluate(TurkishGeneralKnowledgeTask)

    def summarization_tr(self):
        """Run ``SummarizationTask`` and return its evaluation result."""
        return self._evaluate(SummarizationTask)

    def sosyoloji_faithfulness(self):
        """Run ``FaithfulnessTask`` and return its evaluation result."""
        return self._evaluate(FaithfulnessTask)

    def sosyoloji_toxicity(self):
        """Run ``ToxicityTask`` and return its evaluation result."""
        return self._evaluate(ToxicityTask)

    def sosyoloji_bias(self):
        """Run ``BiasTaskOE`` and return its evaluation result."""
        return self._evaluate(BiasTaskOE)

    def instruction_following_tr(self):
        """Run ``InstructionFollowingTask`` and return its evaluation result."""
        return self._evaluate(InstructionFollowingTask)

    def reading_comprehension_mc(self):
        """Run ``ReadingComprehensionMCTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionMCTask)

    def reading_comp_oe(self):
        """Run ``ReadingComprehensionTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionTask)

    def commonsense_reasoning(self):
        """Run ``CommonsenseReasoningTask`` and return its evaluation result."""
        return self._evaluate(CommonsenseReasoningTask)

    def complex_reasoning(self):
        """Run ``ComplexReasoningTask`` and return its evaluation result."""
        return self._evaluate(ComplexReasoningTask)

    def sosyoloji_truthfulness(self):
        """Run ``TruthfulnessTask`` and return its evaluation result."""
        return self._evaluate(TruthfulnessTask)

    def nli(self):
        """Run ``NLITask`` and return its evaluation result."""
        return self._evaluate(NLITask)

    def math(self):
        """Run ``MathTask`` and return its evaluation result."""
        return self._evaluate(MathTask)

    def turkish_vocabulary(self):
        """Run ``TurkishVocabularyTask`` and return its evaluation result."""
        return self._evaluate(TurkishVocabularyTask)

    def metaphors_and_idioms(self):
        """Run ``MetaphorsAndIdiomsTask`` and return its evaluation result."""
        return self._evaluate(MetaphorsAndIdiomsTask)

    def topic_detection(self):
        """Run ``TopicDetectionTask`` and return its evaluation result."""
        return self._evaluate(TopicDetectionTask)

    def sts(self):
        """Run ``STSTask`` and return its evaluation result."""
        return self._evaluate(STSTask)

    def mmlu(self):
        """Run ``MMLUTask`` and return its evaluation result."""
        return self._evaluate(MMLUTask)

    def bias(self):
        """Run ``BiasTask`` and return its evaluation result."""
        return self._evaluate(BiasTask)
if __name__ == "__main__":
    # Manual smoke run: evaluate two tasks for one model and dump the results.
    manager = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    print(manager.run_tasks())