Ahmet Kaan Sever committed
Commit: 9828c0e
Parent: b30c279
Post merge fix
Files changed:
- src/deepeval/base_task.py            +1 -1
- src/deepeval/bias.py                 +4 -2
- src/deepeval/math.py                 +6 -3
- src/deepeval/metaphors_and_idioms.py +4 -2
- src/deepeval/mmlu.py                 +4 -2
- src/deepeval/ner.py                  +6 -3
- src/deepeval/pos.py                  +6 -3
- src/deepeval/sts.py                  +9 -4
- src/deepeval/topic_detection.py      +4 -2
- src/deepeval/turkish_vocabulary.py   +4 -2
- svc/schemas.py                       +0 -1
src/deepeval/base_task.py  CHANGED
@@ -206,7 +206,7 @@ class BaseTask(ABC):
         start_time = datetime.now()
         dataset= load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
         print("Dataset loaded.")
-
+
         # Load 50 from each dataset
         if len(dataset) > 50:
             dataset = dataset.shuffle(seed=42).select(range(50))
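The BaseTask hunk touches only whitespace, but the surrounding block is the piece every task inherits: each dataset's train split is capped at 50 rows with a fixed shuffle seed. A minimal standalone sketch of that pattern (the function name and cap argument are illustrative, not from the repo):

    from datasets import load_dataset

    def load_capped_split(dataset_repo: str, token: str, cap: int = 50):
        # Load the train split, then take a reproducible random sample
        # of at most `cap` rows, mirroring the logic in BaseTask.
        dataset = load_dataset(dataset_repo, token=token, split="train")
        if len(dataset) > cap:
            dataset = dataset.shuffle(seed=42).select(range(cap))
        return dataset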
src/deepeval/bias.py  CHANGED
@@ -87,8 +87,10 @@ class BiasTask(BaseTask):
 
         # Print results categorized by difficulty
         for category, stats in difficulty_results.items():
-
-
+            correct = stats['correct']
+            total = stats['total']
+            calculatedAccuracy = correct / total if total > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
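The same four added lines recur in most of the tasks below: read the per-difficulty tallies, guard against division by zero, and print a formatted accuracy. A self-contained sketch with a made-up difficulty_results dict (category names and counts are illustrative only):

    # Illustrative shape of difficulty_results as these tasks appear to use it.
    difficulty_results = {
        "easy":   {"correct": 18, "total": 20},
        "medium": {"correct": 11, "total": 17},
        "hard":   {"correct": 0,  "total": 0},   # the `total > 0` guard matters here
    }

    for category, stats in difficulty_results.items():
        correct = stats["correct"]
        total = stats["total"]
        calculatedAccuracy = correct / total if total > 0 else 0
        print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")

With those numbers this prints, for example, "Easy Accuracy: 90.00% (18/20)" and "Hard Accuracy: 0.00% (0/0)" instead of raising ZeroDivisionError.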
src/deepeval/math.py  CHANGED
@@ -65,6 +65,7 @@ class MathTask(BaseTask):
             # Get values from row
             category = str(row["difficulty"])
             answer = row["final_answer"]
+            question = row["question"]
 
             # Prints for debugging
             print(f"Answer: {answer}")
@@ -94,7 +95,7 @@ Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir ku
 
 Nihai cevap:
 """
-            prompt = f"{instruction}\n\nSoru:\n{
+            prompt = f"{instruction}\n\nSoru:\n{question}\n"
             message = prompt
 
             # Get/format answer of the model
@@ -117,8 +118,10 @@ Nihai cevap:
 
         # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
-
-
+            correct = stats['correct']
+            total = stats['total']
+            calculatedAccuracy = correct / total if total > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
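The math fix completes a prompt f-string that had been left dangling at an opening brace: the row's question is now read up front and interpolated after the instruction. A sketch of the completed construction, with a shortened stand-in for the task's longer Turkish instruction and a made-up helper name:

    # Stand-in instruction; the real one ends with "Nihai cevap:" as shown above.
    instruction = "Problemi çözün ve yalnızca nihai cevabı yazın.\n\nNihai cevap:"

    def build_math_prompt(row: dict) -> str:
        # Post-merge fix: pull the question out of the row and append it
        # to the instruction, terminating the prompt with a newline.
        question = row["question"]
        return f"{instruction}\n\nSoru:\n{question}\n"

    print(build_math_prompt({"question": "2 + 2 kaçtır?"}))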
src/deepeval/metaphors_and_idioms.py  CHANGED
@@ -76,8 +76,10 @@ class MetaphorsAndIdiomsTask(BaseTask):
         for subset in difficulty_results.keys():
             subset_results = difficulty_results[subset]
             for category, stats in subset_results.items():
-
-
+                correct = stats['correct']
+                total = stats['total']
+                calculatedAccuracy = correct / total if total > 0 else 0
+                print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
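Metaphors/idioms (and turkish_vocabulary further down) report one level deeper: results are grouped first by subset and then by difficulty. A sketch with an invented nested dict; the subset names are placeholders, not necessarily the repo's actual subsets:

    difficulty_results = {
        "metaphors": {"easy": {"correct": 9, "total": 10}},
        "idioms":    {"easy": {"correct": 7, "total": 10}},
    }

    for subset in difficulty_results.keys():
        subset_results = difficulty_results[subset]
        for category, stats in subset_results.items():
            correct = stats["correct"]
            total = stats["total"]
            calculatedAccuracy = correct / total if total > 0 else 0
            print(f"{subset.capitalize()} {category.capitalize()} Accuracy: "
                  f"{calculatedAccuracy:.2%} ({correct}/{total})")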
src/deepeval/mmlu.py  CHANGED
@@ -76,8 +76,10 @@ class MMLUTask(BaseTask):
 
         # Print results categorized by subset
         for category, stats in difficulty_results.items():
-
-
+            correct = stats['correct']
+            total = stats['total']
+            calculatedAccuracy = correct / total if total > 0 else 0
+            print(f"{subset.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
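One caveat with this hunk: the loop binds `category`, while the added print interpolates `subset`, which nothing in the visible diff defines; unless `subset` is assigned elsewhere in the method, the report would raise NameError. Since the comment says results are categorized by subset, a hedged sketch of the presumed intent:

    # Presumed intent: the dict keys are subsets (per the comment above),
    # so the loop variable should match the name used in the print.
    for subset, stats in difficulty_results.items():
        correct = stats["correct"]
        total = stats["total"]
        calculatedAccuracy = correct / total if total > 0 else 0
        print(f"{subset.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")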
src/deepeval/ner.py  CHANGED
@@ -66,6 +66,7 @@ class NERTask(BaseTask):
             # Get values from row
             category = str(row["difficulty"])
             answer = row["final_answer"]
+            question = row["question"]
 
             # Prints for debugging
             print(f"Answer: {answer}")
@@ -132,7 +133,7 @@ class NERTask(BaseTask):
                 ""
                 "Verilen cümlelerdeki her varlığı csv formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıdaki her satırı aşağıdaki gibi oluşturun: "
                 "<Varlık metni>,<Varlık etiketi>"),
-            prompt = f"{instruction}\n\nSoru:\n{
+            prompt = f"{instruction}\n\nSoru:\n{question}\n"
             message = prompt
 
             # Get/format answer of the model
@@ -155,8 +156,10 @@ class NERTask(BaseTask):
 
         # Print results categorized by difficulty
         for category, stats in difficulty_results.items():
-
-
+            correct = stats['correct']
+            total = stats['total']
+            calculatedAccuracy = correct / total if total > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
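The NER instruction asks the model to emit one "<entity text>,<entity label>" line in CSV form per entity. A small parsing sketch for output in that shape; this helper is illustrative only and not part of the commit:

    def parse_entity_lines(model_output: str) -> list[tuple[str, str]]:
        # Expect each non-empty line as "<entity text>,<entity label>".
        pairs = []
        for line in model_output.strip().splitlines():
            if "," not in line:
                continue  # ignore lines that do not follow the format
            text, label = line.rsplit(",", 1)  # split on the last comma only
            pairs.append((text.strip(), label.strip()))
        return pairs

    print(parse_entity_lines("Ankara,LOC\nMustafa Kemal,PER"))
    # [('Ankara', 'LOC'), ('Mustafa Kemal', 'PER')]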
src/deepeval/pos.py  CHANGED
@@ -96,6 +96,7 @@ class POSTask(BaseTask):
             # Get values from row
             category = str(row["difficulty"])
             answer = row["final_answer"]
+            question = row["question"]
 
             # Prints for debugging
             print(f"Answer: {answer}")
@@ -125,7 +126,7 @@ Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir ku
 
 Nihai cevap:
 """
-            prompt = f"{instruction}\n\nSoru:\n{
+            prompt = f"{instruction}\n\nSoru:\n{question}\n"
             message = prompt
 
             # Get/format answer of the model
@@ -148,8 +149,10 @@ Nihai cevap:
 
         # Print results categorized by difficulty
         for category, stats in difficulty_results.items():
-
-
+            correct = stats['correct']
+            total = stats['total']
+            calculatedAccuracy = correct / total if total > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
src/deepeval/sts.py  CHANGED
@@ -90,6 +90,8 @@ class STSTask(BaseTask):
             # Get values from row
             answer = row["score"]
             choices = ["0","1","2","3","4","5"]
+            sentence_1 = row["sentence_1"]
+            sentence_2 = row["sentence_2"]
 
             # Prints for debugging
             print(f"Answer: {answer}")
@@ -97,7 +99,7 @@ class STSTask(BaseTask):
 
             # Construct the prompt/message
             instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
-            prompt = f"""{instruction}\nCümle 1: {
+            prompt = f"""{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
             message = prompt
 
             # Get/format answer of the model
@@ -119,9 +121,12 @@ class STSTask(BaseTask):
             difficulty_results['total'] += 1
 
         # Print results
-
-
-
+        stats = difficulty_results
+        correct = stats['correct']
+        total = stats['total']
+
+        calculatedAccuracy = correct / total if total > 0 else 0
+        print(f"Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
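The STS fix fills both sentence slots into the prompt and pins the expected output to a single integer from 0 to 5; note that the reporting hunk here treats difficulty_results as one flat {'correct', 'total'} counter, which is why no per-category loop is needed. A sketch of the completed prompt construction; the helper name is made up and the Turkish strings are taken from the diff:

    def build_sts_prompt(row: dict) -> str:
        # Instruction: rate the semantic similarity of the two sentences with
        # an integer from 0 to 5; the closing sentence asks for the integer only.
        instruction = ("Aşağıda verilen iki cümlenin birbirlerine olan anlamsal "
                       "benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin.")
        sentence_1 = row["sentence_1"]
        sentence_2 = row["sentence_2"]
        return (f"{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\n"
                "Sadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın.")

    print(build_sts_prompt({"sentence_1": "Kedi uyuyor.", "sentence_2": "Kedi uyumakta."}))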
src/deepeval/topic_detection.py  CHANGED
@@ -68,8 +68,10 @@ class TopicDetectionTask(BaseTask):
 
         # Print results categorized by difficulty
         for category, stats in difficulty_results.items():
-
-
+            correct = stats['correct']
+            total = stats['total']
+            calculatedAccuracy = correct / total if total > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
src/deepeval/turkish_vocabulary.py  CHANGED
@@ -89,8 +89,10 @@ class TurkishVocabularyTask(BaseTask):
         for subset in self.subsets:
             subset_results = difficulty_results[subset]
             for category, stats in subset_results.items():
-
-
+                correct = stats['correct']
+                total = stats['total']
+                calculatedAccuracy = correct / total if total > 0 else 0
+                print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
 
         print("Results:", responses)
         print("Overall Accuracy:", true / total_count)
svc/schemas.py  CHANGED
@@ -35,7 +35,6 @@ class DeepEvalSuiteRequest(BaseModel):
     tasks: Optional[List[str]] = None
 
 
-
 class TaskResponse(BaseModel):
     results: Json # dict[Any,Any]
 
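The schemas change only drops a blank line, but the context shows the request and response models involved. For orientation, a minimal runnable sketch of a Pydantic model with a Json field like the one above (assuming standard Pydantic Json-field behavior; nothing beyond the visible field names comes from the repo):

    from typing import List, Optional
    from pydantic import BaseModel, Json

    class DeepEvalSuiteRequest(BaseModel):
        tasks: Optional[List[str]] = None

    class TaskResponse(BaseModel):
        results: Json  # accepts a JSON-encoded string and parses it

    # A Json field validates and parses the JSON string on model creation.
    resp = TaskResponse(results='{"mmlu": {"accuracy": 0.85}}')
    print(resp.results["mmlu"]["accuracy"])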
|