File size: 7,356 Bytes
51ae401
c5ce1aa
51ae401
 
 
7b3d3a5
c5ce1aa
 
 
9c25ebd
c5ce1aa
 
33d2454
 
1a88171
33d2454
57be006
 
 
 
 
 
 
51ae401
66a11b3
51ae401
 
 
 
 
c88093d
76d5f6d
c5ce1aa
a18754a
 
 
 
 
 
 
 
 
 
57be006
 
 
 
 
 
 
 
51ae401
 
 
 
 
597b990
51ae401
 
 
 
 
76d5f6d
ea93485
 
 
 
 
 
 
51ae401
 
 
 
 
 
 
66a11b3
51ae401
8930e56
66a11b3
8930e56
 
 
 
66a11b3
8a3d32e
8930e56
 
 
66a11b3
8a3d32e
8930e56
51ae401
 
c88093d
51ae401
 
 
76d5f6d
 
 
 
 
51ae401
c5ce1aa
1a88171
 
 
c5ce1aa
1a88171
 
 
 
c5ce1aa
1a88171
 
 
 
c5ce1aa
1a88171
9c25ebd
1a88171
 
c5ce1aa
 
1a88171
 
33d2454
1a88171
33d2454
 
 
 
 
1a88171
 
 
 
 
 
 
 
 
 
33d2454
 
 
 
1a88171
 
 
 
 
33d2454
 
 
 
 
 
f869c66
57be006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51ae401
1657c25
51ae401
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
from dotenv import load_dotenv
from enum import Enum
from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
from src.deepeval.summarization_task import SummarizationTask
from src.deepeval.faithfulness_task import FaithfulnessTask
from src.deepeval.toxicity_task import ToxicityTask
from src.deepeval.bias_task import BiasTaskOE
from src.deepeval.instruction_following_task import InstructionFollowingTask
from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
from src.deepeval.complex_reasoning import ComplexReasoningTask
from src.deepeval.truthfulness_task import TruthfulnessTask
from src.deepeval.nli import NLITask
from src.deepeval.math import MathTask
from src.deepeval.turkish_vocabulary import TurkishVocabularyTask
from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask
from src.deepeval.topic_detection import TopicDetectionTask
from src.deepeval.sts import STSTask
from src.deepeval.mmlu import MMLUTask
from src.deepeval.bias import BiasTask
from typing import List
from datetime import datetime
# Let python-dotenv populate the process environment first, then look up
# HF_TOKEN (presumably a Hugging Face access token -- confirm with callers).
# The lookup yields None when the variable is not set.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

class Task(Enum):
    """Evaluation tasks that can be requested by member name.

    Each member's value is the name of the ``DeepEvalTaskManager`` method that
    runs the task (the manager resolves it with ``getattr``), and it is also
    the key under which ``DeepEvalTaskManager.run_tasks`` stores the result.
    """

    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
    SUMMARIZATION = "summarization_tr"
    FAITHFULNESS = "sosyoloji_faithfulness"
    TOXICITY = "sosyoloji_toxicity"
    # BIAS dispatches to the BiasTaskOE-backed method; BIAS_MC (below) to BiasTask.
    BIAS = "sosyoloji_bias"
    INSTRUCTION_FOLLOWING = "instruction_following_tr"
    # READING_COMPREHENSION dispatches to the ReadingComprehensionMCTask-backed
    # method; READING_COMPREHENSION_OE to the ReadingComprehensionTask-backed one.
    READING_COMPREHENSION = "reading_comprehension_mc"
    READING_COMPREHENSION_OE = "reading_comp_oe"
    COMMONSENSE_REASONING = "commonsense_reasoning"
    COMPLEX_REASONING = "complex_reasoning"
    TRUTHFULNESS = "sosyoloji_truthfulness"
    NLI = "nli"
    MATH = "math"
    TURKISH_VOCABULARY = "turkish_vocabulary"
    METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
    TOPIC_DETECTION = "topic_detection"
    STS = "sts"
    MMLU = "mmlu"
    BIAS_MC = "bias"
    


class DeepEvalTaskManager:
    """Resolve requested task names to evaluation methods and run them.

    Every ``Task`` member is mapped, by member name, to the bound method of
    this class whose name equals the member's value. Requested names are
    validated against that mapping at construction time, and ``run_tasks``
    then executes the selected methods one after another.
    """

    def __init__(self, model_name, tasks: List[str]):
        """Store the model name and resolve ``tasks`` to bound methods.

        Args:
            model_name: Passed unchanged to every task class constructor.
            tasks: ``Task`` member names (e.g. ``"TOXICITY"``) to run.

        Raises:
            ValueError: If any entry of ``tasks`` is not a ``Task`` member name.
            AttributeError: If a ``Task`` value has no matching method here.
        """
        self.model_name = model_name
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Check the requested names and map each one to its bound method.

        Args:
            user_tasks: Iterable of requested task names.

        Returns:
            Dict of task name -> bound method, in the order requested
            (duplicate names collapse into a single entry).

        Raises:
            ValueError: If one or more names are not in ``available_tasks``.
        """
        print(self.available_tasks.keys())
        print(user_tasks)

        # Materialize once so one-shot iterables survive both passes below.
        requested = list(user_tasks)

        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # makes the error message deterministic.
        invalid_tasks = list(
            dict.fromkeys(
                task for task in requested if task not in self.available_tasks
            )
        )
        if invalid_tasks:
            # Fail here with a clear message rather than letting the lookup
            # below die with a bare KeyError for the first unknown name.
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")

        # Store actual method references instead of strings
        return {task: self.available_tasks[task] for task in requested}

    def run_tasks(self):
        """Execute the validated tasks in order and collect their results.

        A task that raises is reported on stdout and skipped, so one failure
        does not abort the remaining tasks; it simply has no entry in the
        returned dict.

        Returns:
            Dict of ``Task`` value -> whatever the task method returned.
        """
        results = {}
        total_start_time = datetime.now()
        for task_name, task_method in self.tasks_to_run.items():
            start_time = datetime.now()
            print("Running task: ", task_name)
            try:
                # Results are keyed by the enum value, not the member name.
                results[Task[task_name].value] = task_method()
            except Exception as e:
                # Deliberate best-effort boundary: report and move on.
                print(f"Error At Task: {task_name} - {e}")
                continue
            # total_seconds() covers the whole duration; .seconds would drop
            # any full days from a long-running task.
            elapsed = int((datetime.now() - start_time).total_seconds())
            print(f"Task {task_name} completed in {elapsed} seconds.")
        total_elapsed = int((datetime.now() - total_start_time).total_seconds())
        print(f"All tasks completed in {total_elapsed} seconds.")
        print("All tasks completed.")
        return results

    def _evaluate(self, task_cls):
        """Build ``task_cls`` with the model name and return its ``evaluate()`` result."""
        return task_cls(self.model_name).evaluate()

    def sentiment_analysis_tr(self):
        """Run ``SentimentAnalysisTask`` and return its evaluation result."""
        return self._evaluate(SentimentAnalysisTask)

    def turkish_general_knowledge(self):
        """Run ``TurkishGeneralKnowledgeTask`` and return its evaluation result."""
        return self._evaluate(TurkishGeneralKnowledgeTask)

    def summarization_tr(self):
        """Run ``SummarizationTask`` and return its evaluation result."""
        return self._evaluate(SummarizationTask)

    def sosyoloji_faithfulness(self):
        """Run ``FaithfulnessTask`` and return its evaluation result."""
        return self._evaluate(FaithfulnessTask)

    def sosyoloji_toxicity(self):
        """Run ``ToxicityTask`` and return its evaluation result."""
        return self._evaluate(ToxicityTask)

    def sosyoloji_bias(self):
        """Run ``BiasTaskOE`` and return its evaluation result."""
        return self._evaluate(BiasTaskOE)

    def instruction_following_tr(self):
        """Run ``InstructionFollowingTask`` and return its evaluation result."""
        return self._evaluate(InstructionFollowingTask)

    def reading_comprehension_mc(self):
        """Run ``ReadingComprehensionMCTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionMCTask)

    def reading_comp_oe(self):
        """Run ``ReadingComprehensionTask`` and return its evaluation result."""
        return self._evaluate(ReadingComprehensionTask)

    def commonsense_reasoning(self):
        """Run ``CommonsenseReasoningTask`` and return its evaluation result."""
        return self._evaluate(CommonsenseReasoningTask)

    def complex_reasoning(self):
        """Run ``ComplexReasoningTask`` and return its evaluation result."""
        return self._evaluate(ComplexReasoningTask)

    def sosyoloji_truthfulness(self):
        """Run ``TruthfulnessTask`` and return its evaluation result."""
        return self._evaluate(TruthfulnessTask)

    def nli(self):
        """Run ``NLITask`` and return its evaluation result."""
        return self._evaluate(NLITask)

    def math(self):
        """Run ``MathTask`` and return its evaluation result."""
        return self._evaluate(MathTask)

    def turkish_vocabulary(self):
        """Run ``TurkishVocabularyTask`` and return its evaluation result."""
        return self._evaluate(TurkishVocabularyTask)

    def metaphors_and_idioms(self):
        """Run ``MetaphorsAndIdiomsTask`` and return its evaluation result."""
        return self._evaluate(MetaphorsAndIdiomsTask)

    def topic_detection(self):
        """Run ``TopicDetectionTask`` and return its evaluation result."""
        return self._evaluate(TopicDetectionTask)

    def sts(self):
        """Run ``STSTask`` and return its evaluation result."""
        return self._evaluate(STSTask)

    def mmlu(self):
        """Run ``MMLUTask`` and return its evaluation result."""
        return self._evaluate(MMLUTask)

    def bias(self):
        """Run ``BiasTask`` and return its evaluation result."""
        return self._evaluate(BiasTask)


if __name__ == "__main__":
    # Ad-hoc run: evaluate the TOXICITY and BIAS tasks for one model and
    # print the resulting dict.
    manager = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    outcome = manager.run_tasks()
    print(outcome)