Ahmet Kaan Sever committed on
Commit
cd8917c
·
1 Parent(s): 52b6367

Removed logging from new tasks

Browse files
src/deepeval/bias.py CHANGED
@@ -43,21 +43,13 @@ class BiasTask(BaseTask):
43
  disambiguated_correct_answer_letter = chr(64 + disambiguated_answer) # 65 - 1 since we need the indexing to start from 0
44
  disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
45
 
46
- # Prints for debugging
47
- print(f"Choices: {choices}")
48
- print("Type of choices:", type(choices))
49
-
50
  # STAGE 1
51
  instruction = ""
52
  prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
53
  message = prompt
54
  ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
55
  ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
56
- ## Print answers
57
- print(f"Correct Answer: {ambiguous_correct_answer_letter}")
58
- print(f"Model Answer: {ambiguous_model_answer}")
59
- print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
60
- print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
61
  ## Check if correct based on metric
62
  if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
63
  true += 1
@@ -71,11 +63,6 @@ class BiasTask(BaseTask):
71
  message = prompt
72
  disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
73
  disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ','').upper().replace(':', '')
74
- ## Print answers
75
- print(f"Correct Answer: {disambiguated_correct_answer_letter}")
76
- print(f"Model Answer: {disambiguated_model_answer}")
77
- print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
78
- print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
79
  responses.append((ambiguous_model_answer_cleaned,disambiguated_model_answer_cleaned))
80
 
81
  ## Check if correct based on metric
 
43
  disambiguated_correct_answer_letter = chr(64 + disambiguated_answer) # 65 - 1 since we need the indexing to start from 0
44
  disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
45
 
 
 
 
 
46
  # STAGE 1
47
  instruction = ""
48
  prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
49
  message = prompt
50
  ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
51
  ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
52
+
 
 
 
 
53
  ## Check if correct based on metric
54
  if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
55
  true += 1
 
63
  message = prompt
64
  disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
65
  disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ','').upper().replace(':', '')
 
 
 
 
 
66
  responses.append((ambiguous_model_answer_cleaned,disambiguated_model_answer_cleaned))
67
 
68
  ## Check if correct based on metric
src/deepeval/math.py CHANGED
@@ -29,7 +29,6 @@ class MathTask(BaseTask):
29
  {"role": "user", "content": f"{msg}"},
30
  ]
31
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
32
- print(formatted_chat)
33
 
34
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
35
  input_ids = inputs.input_ids.to(self.model.device)
@@ -67,10 +66,6 @@ class MathTask(BaseTask):
67
  answer = row["final_answer"]
68
  question = row["question"]
69
 
70
- # Prints for debugging
71
- print(f"Answer: {answer}")
72
- print("Type of answer:", type(answer))
73
-
74
  # Construct the prompt/message
75
  instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
76
 
@@ -103,12 +98,6 @@ Nihai cevap:
103
  responses.append(model_answer)
104
  model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
105
 
106
- # Print answers
107
- print(f"Correct Answer: {answer}")
108
- print(f"Model Answer: {model_answer}")
109
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
110
- print(f"Result: {answer == model_answer_cleaned}")
111
-
112
  # Check if correct based on metric
113
  if answer == model_answer_cleaned:
114
  true += 1
 
29
  {"role": "user", "content": f"{msg}"},
30
  ]
31
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
 
32
 
33
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
34
  input_ids = inputs.input_ids.to(self.model.device)
 
66
  answer = row["final_answer"]
67
  question = row["question"]
68
 
 
 
 
 
69
  # Construct the prompt/message
70
  instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
71
 
 
98
  responses.append(model_answer)
99
  model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
100
 
 
 
 
 
 
 
101
  # Check if correct based on metric
102
  if answer == model_answer_cleaned:
103
  true += 1
src/deepeval/metaphors_and_idioms.py CHANGED
@@ -43,12 +43,6 @@ class MetaphorsAndIdiomsTask(BaseTask):
43
  else:
44
  question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
45
 
46
- # Prints for debugging
47
- print(f"Difficulty: {category}")
48
- print("Type of difficulty:", type(category))
49
- print(f"Answer: {correct_answer_letter}")
50
- print("Type of answer:", type(answer_index))
51
-
52
  # Construct the prompt/message
53
  instruction = ""
54
  prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
@@ -59,12 +53,6 @@ class MetaphorsAndIdiomsTask(BaseTask):
59
  responses.append(model_answer)
60
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
61
 
62
- # Print answers
63
- print(f"Correct Answer: {correct_answer_letter}")
64
- print(f"Model Answer: {model_answer}")
65
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
66
- print(f"Result: {correct_answer_letter == model_answer_cleaned}")
67
-
68
  # Check if correct based on metric
69
  if correct_answer_letter == model_answer_cleaned:
70
  true += 1
 
43
  else:
44
  question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
45
 
 
 
 
 
 
 
46
  # Construct the prompt/message
47
  instruction = ""
48
  prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
 
53
  responses.append(model_answer)
54
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
55
 
 
 
 
 
 
 
56
  # Check if correct based on metric
57
  if correct_answer_letter == model_answer_cleaned:
58
  true += 1
src/deepeval/mmlu.py CHANGED
@@ -16,12 +16,10 @@ class MMLUTask(BaseTask):
16
 
17
  def load_dataset_from_hf(self):
18
  evaluate_count = 50
19
- print("Loading dataset from Hugging Face.")
20
  dataset_dict = {}
21
  for subset in self.subsets:
22
  subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
23
  dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
24
- print("Dataset loaded.")
25
  return dataset_dict
26
 
27
 
@@ -48,10 +46,6 @@ class MMLUTask(BaseTask):
48
  formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
49
 
50
 
51
- # Prints for debugging
52
- print(f"Answer: {correct_answer_letter}")
53
- print("Type of answer:", type(answer_index))
54
-
55
  # Construct the prompt/message
56
  instruction = f"Aşağıda {subject} konusunda çoktan seçmeli bir soru verilmiştir."
57
  prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
@@ -62,12 +56,6 @@ class MMLUTask(BaseTask):
62
  responses.append(model_answer)
63
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
64
 
65
- # Print answers
66
- print(f"Correct Answer: {correct_answer_letter}")
67
- print(f"Model Answer: {model_answer}")
68
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
69
- print(f"Result: {correct_answer_letter == model_answer_cleaned}")
70
-
71
  # Check if correct based on metric
72
  if correct_answer_letter == model_answer_cleaned:
73
  true += 1
 
16
 
17
  def load_dataset_from_hf(self):
18
  evaluate_count = 50
 
19
  dataset_dict = {}
20
  for subset in self.subsets:
21
  subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
22
  dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
 
23
  return dataset_dict
24
 
25
 
 
46
  formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
47
 
48
 
 
 
 
 
49
  # Construct the prompt/message
50
  instruction = f"Aşağıda {subject} konusunda çoktan seçmeli bir soru verilmiştir."
51
  prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
 
56
  responses.append(model_answer)
57
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
58
 
 
 
 
 
 
 
59
  # Check if correct based on metric
60
  if correct_answer_letter == model_answer_cleaned:
61
  true += 1
src/deepeval/ner.py CHANGED
@@ -29,7 +29,6 @@ class NERTask(BaseTask):
29
  {"role": "user", "content": f"{msg}"},
30
  ]
31
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
32
- print(formatted_chat)
33
 
34
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
35
  input_ids = inputs.input_ids.to(self.model.device)
@@ -68,10 +67,6 @@ class NERTask(BaseTask):
68
  answer = row["final_answer"]
69
  question = row["question"]
70
 
71
- # Prints for debugging
72
- print(f"Answer: {answer}")
73
- print("Type of answer:", type(answer))
74
-
75
  # Construct the prompt/message
76
  instruction = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
77
  "Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
@@ -141,12 +136,6 @@ class NERTask(BaseTask):
141
  responses.append(model_answer)
142
  model_answer_cleaned = model_answer
143
 
144
- # Print answers
145
- print(f"Correct Answer: {answer}")
146
- print(f"Model Answer: {model_answer}")
147
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
148
- print(f"Result: {answer == model_answer_cleaned}")
149
-
150
  # Check if correct based on metric
151
  if answer == model_answer_cleaned:
152
  true += 1
 
29
  {"role": "user", "content": f"{msg}"},
30
  ]
31
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
 
32
 
33
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
34
  input_ids = inputs.input_ids.to(self.model.device)
 
67
  answer = row["final_answer"]
68
  question = row["question"]
69
 
 
 
 
 
70
  # Construct the prompt/message
71
  instruction = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
72
  "Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
 
136
  responses.append(model_answer)
137
  model_answer_cleaned = model_answer
138
 
 
 
 
 
 
 
139
  # Check if correct based on metric
140
  if answer == model_answer_cleaned:
141
  true += 1
src/deepeval/pos.py CHANGED
@@ -29,7 +29,6 @@ class POSTask(BaseTask):
29
  {"role": "user", "content": f"{msg}"},
30
  ]
31
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
32
- print(formatted_chat)
33
 
34
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
35
  input_ids = inputs.input_ids.to(self.model.device)
@@ -98,10 +97,6 @@ class POSTask(BaseTask):
98
  answer = row["final_answer"]
99
  question = row["question"]
100
 
101
- # Prints for debugging
102
- print(f"Answer: {answer}")
103
- print("Type of answer:", type(answer))
104
-
105
  # Construct the prompt/message
106
  instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
107
 
@@ -134,12 +129,6 @@ Nihai cevap:
134
  responses.append(model_answer)
135
  model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
136
 
137
- # Print answers
138
- print(f"Correct Answer: {answer}")
139
- print(f"Model Answer: {model_answer}")
140
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
141
- print(f"Result: {answer == model_answer_cleaned}")
142
-
143
  # Check if correct based on metric
144
  if answer == model_answer_cleaned:
145
  true += 1
 
29
  {"role": "user", "content": f"{msg}"},
30
  ]
31
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
 
32
 
33
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
34
  input_ids = inputs.input_ids.to(self.model.device)
 
97
  answer = row["final_answer"]
98
  question = row["question"]
99
 
 
 
 
 
100
  # Construct the prompt/message
101
  instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
102
 
 
129
  responses.append(model_answer)
130
  model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
131
 
 
 
 
 
 
 
132
  # Check if correct based on metric
133
  if answer == model_answer_cleaned:
134
  true += 1
src/deepeval/sts.py CHANGED
@@ -37,7 +37,6 @@ class STSTask(BaseTask):
37
  {"role": "user", "content": f"{msg}"},
38
  ]
39
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
40
- print(formatted_chat)
41
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
42
  input_ids = inputs.input_ids.to(self.model.device)
43
  attention_mask = inputs.attention_mask.to(self.model.device)
@@ -46,7 +45,6 @@ class STSTask(BaseTask):
46
  letters = ["0","1","2","3","4","5"]
47
  encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
48
  flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist] # Flatten the list
49
- print(flattened_encoded_choices)
50
 
51
  allowed_tokens = flattened_encoded_choices
52
  allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
@@ -93,10 +91,6 @@ class STSTask(BaseTask):
93
  sentence_1 = row["sentence_1"]
94
  sentence_2 = row["sentence_2"]
95
 
96
- # Prints for debugging
97
- print(f"Answer: {answer}")
98
- print("Type of answer:", type(answer))
99
-
100
  # Construct the prompt/message
101
  instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
102
  prompt = f"""{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
@@ -107,12 +101,6 @@ class STSTask(BaseTask):
107
  responses.append(model_answer)
108
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
109
 
110
- # Print answers
111
- print(f"Correct Answer: {answer}")
112
- print(f"Model Answer: {model_answer}")
113
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
114
- print(f"Result: {answer == model_answer_cleaned}")
115
-
116
  # Check if correct based on metric
117
  if answer == model_answer_cleaned:
118
  true += 1
 
37
  {"role": "user", "content": f"{msg}"},
38
  ]
39
  formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
 
40
  inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
41
  input_ids = inputs.input_ids.to(self.model.device)
42
  attention_mask = inputs.attention_mask.to(self.model.device)
 
45
  letters = ["0","1","2","3","4","5"]
46
  encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
47
  flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist] # Flatten the list
 
48
 
49
  allowed_tokens = flattened_encoded_choices
50
  allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
 
91
  sentence_1 = row["sentence_1"]
92
  sentence_2 = row["sentence_2"]
93
 
 
 
 
 
94
  # Construct the prompt/message
95
  instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
96
  prompt = f"""{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
 
101
  responses.append(model_answer)
102
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
103
 
 
 
 
 
 
 
104
  # Check if correct based on metric
105
  if answer == model_answer_cleaned:
106
  true += 1
src/deepeval/topic_detection.py CHANGED
@@ -30,11 +30,6 @@ class TopicDetectionTask(BaseTask):
30
  answer = row["answer"]
31
  text = row["text"]
32
 
33
- # Prints for debugging
34
- print(f"Choices: {choices}")
35
- print("Type of choices:", type(choices))
36
- print("Type of answer:", type(answer))
37
-
38
  # Get answer index (starting from 0)
39
  if type(answer) == int:
40
  answer_index = answer
@@ -53,12 +48,6 @@ class TopicDetectionTask(BaseTask):
53
  responses.append(model_answer)
54
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
55
 
56
- # Print answers
57
- print(f"Correct Answer: {correct_answer_letter}")
58
- print(f"Model Answer: {model_answer}")
59
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
60
- print(f"Result: {correct_answer_letter == model_answer_cleaned}")
61
-
62
  # Check if correct based on metric
63
  if correct_answer_letter == model_answer_cleaned:
64
  true += 1
 
30
  answer = row["answer"]
31
  text = row["text"]
32
 
 
 
 
 
 
33
  # Get answer index (starting from 0)
34
  if type(answer) == int:
35
  answer_index = answer
 
48
  responses.append(model_answer)
49
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
50
 
 
 
 
 
 
 
51
  # Check if correct based on metric
52
  if correct_answer_letter == model_answer_cleaned:
53
  true += 1
src/deepeval/turkish_vocabulary.py CHANGED
@@ -15,12 +15,10 @@ class TurkishVocabularyTask(BaseTask):
15
 
16
  def load_dataset_from_hf(self):
17
  evaluate_count = 50
18
- print("Loading dataset from Hugging Face.")
19
  dataset_dict = {}
20
  for subset in self.subsets:
21
  subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
22
  dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
23
- print("Dataset loaded.")
24
  return dataset_dict
25
 
26
 
@@ -54,14 +52,6 @@ class TurkishVocabularyTask(BaseTask):
54
  choices = ast.literal_eval(row["choices"]) # Convert string to list
55
  formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
56
 
57
-
58
-
59
- # Prints for debugging
60
- print(f"Difficulty: {category}")
61
- print("Type of difficulty:", type(category))
62
- print(f"Answer: {correct_answer_letter}")
63
- print("Type of answer:", type(answer_index))
64
-
65
  # Construct the prompt/message
66
  instruction = ""
67
  prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
@@ -72,12 +62,6 @@ class TurkishVocabularyTask(BaseTask):
72
  responses.append(model_answer)
73
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
74
 
75
- # Print answers
76
- print(f"Correct Answer: {correct_answer_letter}")
77
- print(f"Model Answer: {model_answer}")
78
- print(f"Model Answer Cleaned: {model_answer_cleaned}")
79
- print(f"Result: {correct_answer_letter == model_answer_cleaned}")
80
-
81
  # Check if correct based on metric
82
  if correct_answer_letter == model_answer_cleaned:
83
  true += 1
 
15
 
16
  def load_dataset_from_hf(self):
17
  evaluate_count = 50
 
18
  dataset_dict = {}
19
  for subset in self.subsets:
20
  subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
21
  dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
 
22
  return dataset_dict
23
 
24
 
 
52
  choices = ast.literal_eval(row["choices"]) # Convert string to list
53
  formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
54
 
 
 
 
 
 
 
 
 
55
  # Construct the prompt/message
56
  instruction = ""
57
  prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
 
62
  responses.append(model_answer)
63
  model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
64
 
 
 
 
 
 
 
65
  # Check if correct based on metric
66
  if correct_answer_letter == model_answer_cleaned:
67
  true += 1