Spaces:
Paused
Paused
Ahmet Kaan Sever
committed on
Commit
·
cd8917c
1
Parent(s):
52b6367
Removed logging from new tasks
Browse files
- src/deepeval/bias.py +1 -14
- src/deepeval/math.py +0 -11
- src/deepeval/metaphors_and_idioms.py +0 -12
- src/deepeval/mmlu.py +0 -12
- src/deepeval/ner.py +0 -11
- src/deepeval/pos.py +0 -11
- src/deepeval/sts.py +0 -12
- src/deepeval/topic_detection.py +0 -11
- src/deepeval/turkish_vocabulary.py +0 -16
src/deepeval/bias.py
CHANGED
@@ -43,21 +43,13 @@ class BiasTask(BaseTask):
|
|
43 |
disambiguated_correct_answer_letter = chr(64 + disambiguated_answer) # 65 - 1 since we need the indexing to start from 0
|
44 |
disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
|
45 |
|
46 |
-
# Prints for debugging
|
47 |
-
print(f"Choices: {choices}")
|
48 |
-
print("Type of choices:", type(choices))
|
49 |
-
|
50 |
# STAGE 1
|
51 |
instruction = ""
|
52 |
prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
|
53 |
message = prompt
|
54 |
ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
|
55 |
ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
56 |
-
|
57 |
-
print(f"Correct Answer: {ambiguous_correct_answer_letter}")
|
58 |
-
print(f"Model Answer: {ambiguous_model_answer}")
|
59 |
-
print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
|
60 |
-
print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
|
61 |
## Check if correct based on metric
|
62 |
if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
|
63 |
true += 1
|
@@ -71,11 +63,6 @@ class BiasTask(BaseTask):
|
|
71 |
message = prompt
|
72 |
disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
|
73 |
disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ','').upper().replace(':', '')
|
74 |
-
## Print answers
|
75 |
-
print(f"Correct Answer: {disambiguated_correct_answer_letter}")
|
76 |
-
print(f"Model Answer: {disambiguated_model_answer}")
|
77 |
-
print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
|
78 |
-
print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
|
79 |
responses.append((ambiguous_model_answer_cleaned,disambiguated_model_answer_cleaned))
|
80 |
|
81 |
## Check if correct based on metric
|
|
|
43 |
disambiguated_correct_answer_letter = chr(64 + disambiguated_answer) # 65 - 1 since we need the indexing to start from 0
|
44 |
disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
|
45 |
|
|
|
|
|
|
|
|
|
46 |
# STAGE 1
|
47 |
instruction = ""
|
48 |
prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
|
49 |
message = prompt
|
50 |
ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
|
51 |
ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
52 |
+
|
|
|
|
|
|
|
|
|
53 |
## Check if correct based on metric
|
54 |
if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
|
55 |
true += 1
|
|
|
63 |
message = prompt
|
64 |
disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
|
65 |
disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ','').upper().replace(':', '')
|
|
|
|
|
|
|
|
|
|
|
66 |
responses.append((ambiguous_model_answer_cleaned,disambiguated_model_answer_cleaned))
|
67 |
|
68 |
## Check if correct based on metric
|
src/deepeval/math.py
CHANGED
@@ -29,7 +29,6 @@ class MathTask(BaseTask):
|
|
29 |
{"role": "user", "content": f"{msg}"},
|
30 |
]
|
31 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
32 |
-
print(formatted_chat)
|
33 |
|
34 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
35 |
input_ids = inputs.input_ids.to(self.model.device)
|
@@ -67,10 +66,6 @@ class MathTask(BaseTask):
|
|
67 |
answer = row["final_answer"]
|
68 |
question = row["question"]
|
69 |
|
70 |
-
# Prints for debugging
|
71 |
-
print(f"Answer: {answer}")
|
72 |
-
print("Type of answer:", type(answer))
|
73 |
-
|
74 |
# Construct the prompt/message
|
75 |
instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
|
76 |
|
@@ -103,12 +98,6 @@ Nihai cevap:
|
|
103 |
responses.append(model_answer)
|
104 |
model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
|
105 |
|
106 |
-
# Print answers
|
107 |
-
print(f"Correct Answer: {answer}")
|
108 |
-
print(f"Model Answer: {model_answer}")
|
109 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
110 |
-
print(f"Result: {answer == model_answer_cleaned}")
|
111 |
-
|
112 |
# Check if correct based on metric
|
113 |
if answer == model_answer_cleaned:
|
114 |
true += 1
|
|
|
29 |
{"role": "user", "content": f"{msg}"},
|
30 |
]
|
31 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
|
|
32 |
|
33 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
34 |
input_ids = inputs.input_ids.to(self.model.device)
|
|
|
66 |
answer = row["final_answer"]
|
67 |
question = row["question"]
|
68 |
|
|
|
|
|
|
|
|
|
69 |
# Construct the prompt/message
|
70 |
instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
|
71 |
|
|
|
98 |
responses.append(model_answer)
|
99 |
model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
# Check if correct based on metric
|
102 |
if answer == model_answer_cleaned:
|
103 |
true += 1
|
src/deepeval/metaphors_and_idioms.py
CHANGED
@@ -43,12 +43,6 @@ class MetaphorsAndIdiomsTask(BaseTask):
|
|
43 |
else:
|
44 |
question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
|
45 |
|
46 |
-
# Prints for debugging
|
47 |
-
print(f"Difficulty: {category}")
|
48 |
-
print("Type of difficulty:", type(category))
|
49 |
-
print(f"Answer: {correct_answer_letter}")
|
50 |
-
print("Type of answer:", type(answer_index))
|
51 |
-
|
52 |
# Construct the prompt/message
|
53 |
instruction = ""
|
54 |
prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
|
@@ -59,12 +53,6 @@ class MetaphorsAndIdiomsTask(BaseTask):
|
|
59 |
responses.append(model_answer)
|
60 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
61 |
|
62 |
-
# Print answers
|
63 |
-
print(f"Correct Answer: {correct_answer_letter}")
|
64 |
-
print(f"Model Answer: {model_answer}")
|
65 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
66 |
-
print(f"Result: {correct_answer_letter == model_answer_cleaned}")
|
67 |
-
|
68 |
# Check if correct based on metric
|
69 |
if correct_answer_letter == model_answer_cleaned:
|
70 |
true += 1
|
|
|
43 |
else:
|
44 |
question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
# Construct the prompt/message
|
47 |
instruction = ""
|
48 |
prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
|
|
|
53 |
responses.append(model_answer)
|
54 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
# Check if correct based on metric
|
57 |
if correct_answer_letter == model_answer_cleaned:
|
58 |
true += 1
|
src/deepeval/mmlu.py
CHANGED
@@ -16,12 +16,10 @@ class MMLUTask(BaseTask):
|
|
16 |
|
17 |
def load_dataset_from_hf(self):
|
18 |
evaluate_count = 50
|
19 |
-
print("Loading dataset from Hugging Face.")
|
20 |
dataset_dict = {}
|
21 |
for subset in self.subsets:
|
22 |
subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
|
23 |
dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
|
24 |
-
print("Dataset loaded.")
|
25 |
return dataset_dict
|
26 |
|
27 |
|
@@ -48,10 +46,6 @@ class MMLUTask(BaseTask):
|
|
48 |
formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
|
49 |
|
50 |
|
51 |
-
# Prints for debugging
|
52 |
-
print(f"Answer: {correct_answer_letter}")
|
53 |
-
print("Type of answer:", type(answer_index))
|
54 |
-
|
55 |
# Construct the prompt/message
|
56 |
instruction = f"Aşağıda {subject} konusunda çoktan seçmeli bir soru verilmiştir."
|
57 |
prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
|
@@ -62,12 +56,6 @@ class MMLUTask(BaseTask):
|
|
62 |
responses.append(model_answer)
|
63 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
64 |
|
65 |
-
# Print answers
|
66 |
-
print(f"Correct Answer: {correct_answer_letter}")
|
67 |
-
print(f"Model Answer: {model_answer}")
|
68 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
69 |
-
print(f"Result: {correct_answer_letter == model_answer_cleaned}")
|
70 |
-
|
71 |
# Check if correct based on metric
|
72 |
if correct_answer_letter == model_answer_cleaned:
|
73 |
true += 1
|
|
|
16 |
|
17 |
def load_dataset_from_hf(self):
|
18 |
evaluate_count = 50
|
|
|
19 |
dataset_dict = {}
|
20 |
for subset in self.subsets:
|
21 |
subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
|
22 |
dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
|
|
|
23 |
return dataset_dict
|
24 |
|
25 |
|
|
|
46 |
formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
|
47 |
|
48 |
|
|
|
|
|
|
|
|
|
49 |
# Construct the prompt/message
|
50 |
instruction = f"Aşağıda {subject} konusunda çoktan seçmeli bir soru verilmiştir."
|
51 |
prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
|
|
|
56 |
responses.append(model_answer)
|
57 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
# Check if correct based on metric
|
60 |
if correct_answer_letter == model_answer_cleaned:
|
61 |
true += 1
|
src/deepeval/ner.py
CHANGED
@@ -29,7 +29,6 @@ class NERTask(BaseTask):
|
|
29 |
{"role": "user", "content": f"{msg}"},
|
30 |
]
|
31 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
32 |
-
print(formatted_chat)
|
33 |
|
34 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
35 |
input_ids = inputs.input_ids.to(self.model.device)
|
@@ -68,10 +67,6 @@ class NERTask(BaseTask):
|
|
68 |
answer = row["final_answer"]
|
69 |
question = row["question"]
|
70 |
|
71 |
-
# Prints for debugging
|
72 |
-
print(f"Answer: {answer}")
|
73 |
-
print("Type of answer:", type(answer))
|
74 |
-
|
75 |
# Construct the prompt/message
|
76 |
instruction = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
|
77 |
"Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
|
@@ -141,12 +136,6 @@ class NERTask(BaseTask):
|
|
141 |
responses.append(model_answer)
|
142 |
model_answer_cleaned = model_answer
|
143 |
|
144 |
-
# Print answers
|
145 |
-
print(f"Correct Answer: {answer}")
|
146 |
-
print(f"Model Answer: {model_answer}")
|
147 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
148 |
-
print(f"Result: {answer == model_answer_cleaned}")
|
149 |
-
|
150 |
# Check if correct based on metric
|
151 |
if answer == model_answer_cleaned:
|
152 |
true += 1
|
|
|
29 |
{"role": "user", "content": f"{msg}"},
|
30 |
]
|
31 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
|
|
32 |
|
33 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
34 |
input_ids = inputs.input_ids.to(self.model.device)
|
|
|
67 |
answer = row["final_answer"]
|
68 |
question = row["question"]
|
69 |
|
|
|
|
|
|
|
|
|
70 |
# Construct the prompt/message
|
71 |
instruction = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
|
72 |
"Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
|
|
|
136 |
responses.append(model_answer)
|
137 |
model_answer_cleaned = model_answer
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
# Check if correct based on metric
|
140 |
if answer == model_answer_cleaned:
|
141 |
true += 1
|
src/deepeval/pos.py
CHANGED
@@ -29,7 +29,6 @@ class POSTask(BaseTask):
|
|
29 |
{"role": "user", "content": f"{msg}"},
|
30 |
]
|
31 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
32 |
-
print(formatted_chat)
|
33 |
|
34 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
35 |
input_ids = inputs.input_ids.to(self.model.device)
|
@@ -98,10 +97,6 @@ class POSTask(BaseTask):
|
|
98 |
answer = row["final_answer"]
|
99 |
question = row["question"]
|
100 |
|
101 |
-
# Prints for debugging
|
102 |
-
print(f"Answer: {answer}")
|
103 |
-
print("Type of answer:", type(answer))
|
104 |
-
|
105 |
# Construct the prompt/message
|
106 |
instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
|
107 |
|
@@ -134,12 +129,6 @@ Nihai cevap:
|
|
134 |
responses.append(model_answer)
|
135 |
model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
|
136 |
|
137 |
-
# Print answers
|
138 |
-
print(f"Correct Answer: {answer}")
|
139 |
-
print(f"Model Answer: {model_answer}")
|
140 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
141 |
-
print(f"Result: {answer == model_answer_cleaned}")
|
142 |
-
|
143 |
# Check if correct based on metric
|
144 |
if answer == model_answer_cleaned:
|
145 |
true += 1
|
|
|
29 |
{"role": "user", "content": f"{msg}"},
|
30 |
]
|
31 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
|
|
32 |
|
33 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
34 |
input_ids = inputs.input_ids.to(self.model.device)
|
|
|
97 |
answer = row["final_answer"]
|
98 |
question = row["question"]
|
99 |
|
|
|
|
|
|
|
|
|
100 |
# Construct the prompt/message
|
101 |
instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
|
102 |
|
|
|
129 |
responses.append(model_answer)
|
130 |
model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
# Check if correct based on metric
|
133 |
if answer == model_answer_cleaned:
|
134 |
true += 1
|
src/deepeval/sts.py
CHANGED
@@ -37,7 +37,6 @@ class STSTask(BaseTask):
|
|
37 |
{"role": "user", "content": f"{msg}"},
|
38 |
]
|
39 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
40 |
-
print(formatted_chat)
|
41 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
42 |
input_ids = inputs.input_ids.to(self.model.device)
|
43 |
attention_mask = inputs.attention_mask.to(self.model.device)
|
@@ -46,7 +45,6 @@ class STSTask(BaseTask):
|
|
46 |
letters = ["0","1","2","3","4","5"]
|
47 |
encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
|
48 |
flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist] # Flatten the list
|
49 |
-
print(flattened_encoded_choices)
|
50 |
|
51 |
allowed_tokens = flattened_encoded_choices
|
52 |
allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
|
@@ -93,10 +91,6 @@ class STSTask(BaseTask):
|
|
93 |
sentence_1 = row["sentence_1"]
|
94 |
sentence_2 = row["sentence_2"]
|
95 |
|
96 |
-
# Prints for debugging
|
97 |
-
print(f"Answer: {answer}")
|
98 |
-
print("Type of answer:", type(answer))
|
99 |
-
|
100 |
# Construct the prompt/message
|
101 |
instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
|
102 |
prompt = f"""{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
|
@@ -107,12 +101,6 @@ class STSTask(BaseTask):
|
|
107 |
responses.append(model_answer)
|
108 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
109 |
|
110 |
-
# Print answers
|
111 |
-
print(f"Correct Answer: {answer}")
|
112 |
-
print(f"Model Answer: {model_answer}")
|
113 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
114 |
-
print(f"Result: {answer == model_answer_cleaned}")
|
115 |
-
|
116 |
# Check if correct based on metric
|
117 |
if answer == model_answer_cleaned:
|
118 |
true += 1
|
|
|
37 |
{"role": "user", "content": f"{msg}"},
|
38 |
]
|
39 |
formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
|
|
40 |
inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
|
41 |
input_ids = inputs.input_ids.to(self.model.device)
|
42 |
attention_mask = inputs.attention_mask.to(self.model.device)
|
|
|
45 |
letters = ["0","1","2","3","4","5"]
|
46 |
encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
|
47 |
flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist] # Flatten the list
|
|
|
48 |
|
49 |
allowed_tokens = flattened_encoded_choices
|
50 |
allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
|
|
|
91 |
sentence_1 = row["sentence_1"]
|
92 |
sentence_2 = row["sentence_2"]
|
93 |
|
|
|
|
|
|
|
|
|
94 |
# Construct the prompt/message
|
95 |
instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
|
96 |
prompt = f"""{instruction}\nCümle 1: {sentence_1}\nCümle 2: {sentence_2}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
|
|
|
101 |
responses.append(model_answer)
|
102 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
# Check if correct based on metric
|
105 |
if answer == model_answer_cleaned:
|
106 |
true += 1
|
src/deepeval/topic_detection.py
CHANGED
@@ -30,11 +30,6 @@ class TopicDetectionTask(BaseTask):
|
|
30 |
answer = row["answer"]
|
31 |
text = row["text"]
|
32 |
|
33 |
-
# Prints for debugging
|
34 |
-
print(f"Choices: {choices}")
|
35 |
-
print("Type of choices:", type(choices))
|
36 |
-
print("Type of answer:", type(answer))
|
37 |
-
|
38 |
# Get answer index (starting from 0)
|
39 |
if type(answer) == int:
|
40 |
answer_index = answer
|
@@ -53,12 +48,6 @@ class TopicDetectionTask(BaseTask):
|
|
53 |
responses.append(model_answer)
|
54 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
55 |
|
56 |
-
# Print answers
|
57 |
-
print(f"Correct Answer: {correct_answer_letter}")
|
58 |
-
print(f"Model Answer: {model_answer}")
|
59 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
60 |
-
print(f"Result: {correct_answer_letter == model_answer_cleaned}")
|
61 |
-
|
62 |
# Check if correct based on metric
|
63 |
if correct_answer_letter == model_answer_cleaned:
|
64 |
true += 1
|
|
|
30 |
answer = row["answer"]
|
31 |
text = row["text"]
|
32 |
|
|
|
|
|
|
|
|
|
|
|
33 |
# Get answer index (starting from 0)
|
34 |
if type(answer) == int:
|
35 |
answer_index = answer
|
|
|
48 |
responses.append(model_answer)
|
49 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
# Check if correct based on metric
|
52 |
if correct_answer_letter == model_answer_cleaned:
|
53 |
true += 1
|
src/deepeval/turkish_vocabulary.py
CHANGED
@@ -15,12 +15,10 @@ class TurkishVocabularyTask(BaseTask):
|
|
15 |
|
16 |
def load_dataset_from_hf(self):
|
17 |
evaluate_count = 50
|
18 |
-
print("Loading dataset from Hugging Face.")
|
19 |
dataset_dict = {}
|
20 |
for subset in self.subsets:
|
21 |
subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
|
22 |
dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
|
23 |
-
print("Dataset loaded.")
|
24 |
return dataset_dict
|
25 |
|
26 |
|
@@ -54,14 +52,6 @@ class TurkishVocabularyTask(BaseTask):
|
|
54 |
choices = ast.literal_eval(row["choices"]) # Convert string to list
|
55 |
formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
# Prints for debugging
|
60 |
-
print(f"Difficulty: {category}")
|
61 |
-
print("Type of difficulty:", type(category))
|
62 |
-
print(f"Answer: {correct_answer_letter}")
|
63 |
-
print("Type of answer:", type(answer_index))
|
64 |
-
|
65 |
# Construct the prompt/message
|
66 |
instruction = ""
|
67 |
prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
|
@@ -72,12 +62,6 @@ class TurkishVocabularyTask(BaseTask):
|
|
72 |
responses.append(model_answer)
|
73 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
74 |
|
75 |
-
# Print answers
|
76 |
-
print(f"Correct Answer: {correct_answer_letter}")
|
77 |
-
print(f"Model Answer: {model_answer}")
|
78 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
79 |
-
print(f"Result: {correct_answer_letter == model_answer_cleaned}")
|
80 |
-
|
81 |
# Check if correct based on metric
|
82 |
if correct_answer_letter == model_answer_cleaned:
|
83 |
true += 1
|
|
|
15 |
|
16 |
def load_dataset_from_hf(self):
|
17 |
evaluate_count = 50
|
|
|
18 |
dataset_dict = {}
|
19 |
for subset in self.subsets:
|
20 |
subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
|
21 |
dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
|
|
|
22 |
return dataset_dict
|
23 |
|
24 |
|
|
|
52 |
choices = ast.literal_eval(row["choices"]) # Convert string to list
|
53 |
formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
# Construct the prompt/message
|
56 |
instruction = ""
|
57 |
prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
|
|
|
62 |
responses.append(model_answer)
|
63 |
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
# Check if correct based on metric
|
66 |
if correct_answer_letter == model_answer_cleaned:
|
67 |
true += 1
|