Ahmet Kaan Sever committed on
Commit
9828c0e
·
1 Parent(s): b30c279

Post merge fix

Browse files
src/deepeval/base_task.py CHANGED
@@ -206,7 +206,7 @@ class BaseTask(ABC):
206
  start_time = datetime.now()
207
  dataset= load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
208
  print("Dataset loaded.")
209
-
210
  # Load 50 from each dataset
211
  if len(dataset) > 50:
212
  dataset = dataset.shuffle(seed=42).select(range(50))
 
206
  start_time = datetime.now()
207
  dataset= load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
208
  print("Dataset loaded.")
209
+
210
  # Load 50 from each dataset
211
  if len(dataset) > 50:
212
  dataset = dataset.shuffle(seed=42).select(range(50))
src/deepeval/bias.py CHANGED
@@ -87,8 +87,10 @@ class BiasTask(BaseTask):
87
 
88
  # Print results categorized by difficulty
89
  for category, stats in difficulty_results.items():
90
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
91
- print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
92
 
93
  print("Results:", responses)
94
  print("Overall Accuracy:", true / total_count)
 
87
 
88
  # Print results categorized by difficulty
89
  for category, stats in difficulty_results.items():
90
+ correct = stats['correct']
91
+ total = stats['total']
92
+ calculatedAccuracy = correct / total if total > 0 else 0
93
+ print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
94
 
95
  print("Results:", responses)
96
  print("Overall Accuracy:", true / total_count)
src/deepeval/math.py CHANGED
@@ -65,6 +65,7 @@ class MathTask(BaseTask):
65
  # Get values from row
66
  category = str(row["difficulty"])
67
  answer = row["final_answer"]
 
68
 
69
  # Prints for debugging
70
  print(f"Answer: {answer}")
@@ -94,7 +95,7 @@ Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir ku
94
 
95
  Nihai cevap:
96
  """
97
- prompt = f"{instruction}\n\nSoru:\n{row["question"]}\n"
98
  message = prompt
99
 
100
  # Get/format answer of the model
@@ -117,8 +118,10 @@ Nihai cevap:
117
 
118
  # Print results categorized by difficulty
119
  for category, stats in difficulty_results.items():
120
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
121
- print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
122
 
123
  print("Results:", responses)
124
  print("Overall Accuracy:", true / total_count)
 
65
  # Get values from row
66
  category = str(row["difficulty"])
67
  answer = row["final_answer"]
68
+ question = row["question"]
69
 
70
  # Prints for debugging
71
  print(f"Answer: {answer}")
 
95
 
96
  Nihai cevap:
97
  """
98
+ prompt = f"{instruction}\n\nSoru:\n{question}\n"
99
  message = prompt
100
 
101
  # Get/format answer of the model
 
118
 
119
  # Print results categorized by difficulty
120
  for category, stats in difficulty_results.items():
121
+ correct = stats['correct']
122
+ total = stats['total']
123
+ calculatedAccuracy = correct / total if total > 0 else 0
124
+ print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
125
 
126
  print("Results:", responses)
127
  print("Overall Accuracy:", true / total_count)
src/deepeval/metaphors_and_idioms.py CHANGED
@@ -76,8 +76,10 @@ class MetaphorsAndIdiomsTask(BaseTask):
76
  for subset in difficulty_results.keys():
77
  subset_results = difficulty_results[subset]
78
  for category, stats in subset_results.items():
79
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
80
- print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
81
 
82
  print("Results:", responses)
83
  print("Overall Accuracy:", true / total_count)
 
76
  for subset in difficulty_results.keys():
77
  subset_results = difficulty_results[subset]
78
  for category, stats in subset_results.items():
79
+ correct = stats['correct']
80
+ total = stats['total']
81
+ calculatedAccuracy = correct / total if total > 0 else 0
82
+ print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
83
 
84
  print("Results:", responses)
85
  print("Overall Accuracy:", true / total_count)
src/deepeval/mmlu.py CHANGED
@@ -76,8 +76,10 @@ class MMLUTask(BaseTask):
76
 
77
  # Print results categorized by subset
78
  for category, stats in difficulty_results.items():
79
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
80
- print(f"{subset.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
81
 
82
  print("Results:", responses)
83
  print("Overall Accuracy:", true / total_count)
 
76
 
77
  # Print results categorized by subset
78
  for category, stats in difficulty_results.items():
79
+ correct = stats['correct']
80
+ total = stats['total']
81
+ calculatedAccuracy = correct / total if total > 0 else 0
82
+ print(f"{subset.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
83
 
84
  print("Results:", responses)
85
  print("Overall Accuracy:", true / total_count)
src/deepeval/ner.py CHANGED
@@ -66,6 +66,7 @@ class NERTask(BaseTask):
66
  # Get values from row
67
  category = str(row["difficulty"])
68
  answer = row["final_answer"]
 
69
 
70
  # Prints for debugging
71
  print(f"Answer: {answer}")
@@ -132,7 +133,7 @@ class NERTask(BaseTask):
132
  ""
133
  "Verilen cümlelerdeki her varlığı csv formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıdaki her satırı aşağıdaki gibi oluşturun: "
134
  "<Varlık metni>,<Varlık etiketi>"),
135
- prompt = f"{instruction}\n\nSoru:\n{row["question"]}\n"
136
  message = prompt
137
 
138
  # Get/format answer of the model
@@ -155,8 +156,10 @@ class NERTask(BaseTask):
155
 
156
  # Print results categorized by difficulty
157
  for category, stats in difficulty_results.items():
158
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
159
- print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
160
 
161
  print("Results:", responses)
162
  print("Overall Accuracy:", true / total_count)
 
66
  # Get values from row
67
  category = str(row["difficulty"])
68
  answer = row["final_answer"]
69
+ question = row["question"]
70
 
71
  # Prints for debugging
72
  print(f"Answer: {answer}")
 
133
  ""
134
  "Verilen cümlelerdeki her varlığı csv formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıdaki her satırı aşağıdaki gibi oluşturun: "
135
  "<Varlık metni>,<Varlık etiketi>"),
136
+ prompt = f"{instruction}\n\nSoru:\n{question}\n"
137
  message = prompt
138
 
139
  # Get/format answer of the model
 
156
 
157
  # Print results categorized by difficulty
158
  for category, stats in difficulty_results.items():
159
+ correct = stats['correct']
160
+ total = stats['total']
161
+ calculatedAccuracy = correct / total if total > 0 else 0
162
+ print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
163
 
164
  print("Results:", responses)
165
  print("Overall Accuracy:", true / total_count)
src/deepeval/pos.py CHANGED
@@ -96,6 +96,7 @@ class POSTask(BaseTask):
96
  # Get values from row
97
  category = str(row["difficulty"])
98
  answer = row["final_answer"]
 
99
 
100
  # Prints for debugging
101
  print(f"Answer: {answer}")
@@ -125,7 +126,7 @@ Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir ku
125
 
126
  Nihai cevap:
127
  """
128
- prompt = f"{instruction}\n\nSoru:\n{row["question"]}\n"
129
  message = prompt
130
 
131
  # Get/format answer of the model
@@ -148,8 +149,10 @@ Nihai cevap:
148
 
149
  # Print results categorized by difficulty
150
  for category, stats in difficulty_results.items():
151
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
152
- print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
153
 
154
  print("Results:", responses)
155
  print("Overall Accuracy:", true / total_count)
 
96
  # Get values from row
97
  category = str(row["difficulty"])
98
  answer = row["final_answer"]
99
+ question = row["question"]
100
 
101
  # Prints for debugging
102
  print(f"Answer: {answer}")
 
126
 
127
  Nihai cevap:
128
  """
129
+ prompt = f"{instruction}\n\nSoru:\n{question}\n"
130
  message = prompt
131
 
132
  # Get/format answer of the model
 
149
 
150
  # Print results categorized by difficulty
151
  for category, stats in difficulty_results.items():
152
+ correct = stats['correct']
153
+ total = stats['total']
154
+ calculatedAccuracy = correct / total if total > 0 else 0
155
+ print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
156
 
157
  print("Results:", responses)
158
  print("Overall Accuracy:", true / total_count)
src/deepeval/sts.py CHANGED
@@ -90,6 +90,8 @@ class STSTask(BaseTask):
90
  # Get values from row
91
  answer = row["score"]
92
  choices = ["0","1","2","3","4","5"]
 
 
93
 
94
  # Prints for debugging
95
  print(f"Answer: {answer}")
@@ -97,7 +99,7 @@ class STSTask(BaseTask):
97
 
98
  # Construct the prompt/message
99
  instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
100
- prompt = f"""{instruction}\nCümle 1: {row["sentence_1"]}\nCümle 2: {row["sentence_2"]}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
101
  message = prompt
102
 
103
  # Get/format answer of the model
@@ -119,9 +121,12 @@ class STSTask(BaseTask):
119
  difficulty_results['total'] += 1
120
 
121
  # Print results
122
- stats = difficulty_results
123
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
124
- print(f"Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
 
125
 
126
  print("Results:", responses)
127
  print("Overall Accuracy:", true / total_count)
 
90
  # Get values from row
91
  answer = row["score"]
92
  choices = ["0","1","2","3","4","5"]
93
+ sentence_1 = row["sentence_1"]
94
+ sentence_2 = row["sentence_2"]
95
 
96
  # Prints for debugging
97
  print(f"Answer: {answer}")
 
99
 
100
  # Construct the prompt/message
101
  instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
102
+ prompt = f"""{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
103
  message = prompt
104
 
105
  # Get/format answer of the model
 
121
  difficulty_results['total'] += 1
122
 
123
  # Print results
124
+ stats = difficulty_results
125
+ correct = stats['correct']
126
+ total = stats['total']
127
+
128
+ calculatedAccuracy = correct / total if total > 0 else 0
129
+ print(f"Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
130
 
131
  print("Results:", responses)
132
  print("Overall Accuracy:", true / total_count)
src/deepeval/topic_detection.py CHANGED
@@ -68,8 +68,10 @@ class TopicDetectionTask(BaseTask):
68
 
69
  # Print results categorized by difficulty
70
  for category, stats in difficulty_results.items():
71
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
72
- print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
73
 
74
  print("Results:", responses)
75
  print("Overall Accuracy:", true / total_count)
 
68
 
69
  # Print results categorized by difficulty
70
  for category, stats in difficulty_results.items():
71
+ correct = stats['correct']
72
+ total = stats['total']
73
+ calculatedAccuracy = correct / total if total > 0 else 0
74
+ print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
75
 
76
  print("Results:", responses)
77
  print("Overall Accuracy:", true / total_count)
src/deepeval/turkish_vocabulary.py CHANGED
@@ -89,8 +89,10 @@ class TurkishVocabularyTask(BaseTask):
89
  for subset in self.subsets:
90
  subset_results = difficulty_results[subset]
91
  for category, stats in subset_results.items():
92
- calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
93
- print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
 
 
94
 
95
  print("Results:", responses)
96
  print("Overall Accuracy:", true / total_count)
 
89
  for subset in self.subsets:
90
  subset_results = difficulty_results[subset]
91
  for category, stats in subset_results.items():
92
+ correct = stats['correct']
93
+ total = stats['total']
94
+ calculatedAccuracy = correct / total if total > 0 else 0
95
+ print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
96
 
97
  print("Results:", responses)
98
  print("Overall Accuracy:", true / total_count)
svc/schemas.py CHANGED
@@ -35,7 +35,6 @@ class DeepEvalSuiteRequest(BaseModel):
35
  tasks: Optional[List[str]] = None
36
 
37
 
38
-
39
  class TaskResponse(BaseModel):
40
  results: Json # dict[Any,Any]
41
 
 
35
  tasks: Optional[List[str]] = None
36
 
37
 
 
38
  class TaskResponse(BaseModel):
39
  results: Json # dict[Any,Any]
40