Ahmet Kaan Sever committed
Commit 9e6ede8 · 1 Parent(s): 76d5f6d

Fixed TRGenKnowledge task and mcqa generation function


generate_response_mcqa_multi_token now works correctly for all kinds of choices:
the model answers with option letters (A, B, C, ...).
Also added support for Gemma models.
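
For reference, a minimal, self-contained sketch of the letter-restricted decoding approach this commit uses (the model name and prompt below are illustrative placeholders, not part of the diff):

# Sketch of constrained MCQA decoding, mirroring generate_response_mcqa_multi_token.
# Any HF causal LM should work; "meta-llama/Llama-3.2-1B-Instruct" is only an example.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor, LogitsProcessorList

model_name = "meta-llama/Llama-3.2-1B-Instruct"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

choices = ["positive", "negative", "neutral"]
letters = [chr(ord("A") + i) for i in range(len(choices))]
allowed_ids = {tid for letter in letters for tid in tokenizer.encode(letter, add_special_tokens=False)}

class RestrictToLettersProcessor(LogitsProcessor):
    def __call__(self, input_ids, scores):
        mask = torch.full_like(scores, float("-inf"))  # block everything ...
        idx = list(allowed_ids)
        mask[:, idx] = scores[:, idx]                  # ... except the option-letter tokens
        return mask

prompt = "Verilen metin hangi duyguyu ifade ediyor? ...\nA: positive\nB: negative\nC: neutral"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(
    **inputs,
    max_new_tokens=2,
    logits_processor=LogitsProcessorList([RestrictToLettersProcessor()]),
)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))  # e.g. "A"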

requirements.txt CHANGED
@@ -2,6 +2,7 @@ fastapi
 uvicorn[standard]
 # lm_eval==0.4.3
 git+https://github.com/ecemumutlu/lm-evaluation-harness.git
+git+https://github.com/huggingface/[email protected]
 python-jose
 python-multipart
 deepeval
src/deepeval/base_task.py CHANGED
@@ -3,7 +3,7 @@ import itertools
 from datasets import load_dataset
 import os
 from dotenv import load_dotenv
-from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
+from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor, Gemma3ForCausalLM
 import torch
 from typing import List
 load_dotenv()
@@ -29,12 +29,20 @@ class BaseTask(ABC):
     @staticmethod
     def load_model(model_name: str, device):
         """Loads model and tokenizer once and caches it."""
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16,
-            device_map=device,
-            token=HF_TOKEN,  # Replace with actual token
-        )
+        if "gemma" in model_name:
+            model = Gemma3ForCausalLM.from_pretrained(
+                model_name,
+                # device_map=device,  # raises "Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device."
+                # torch_dtype=torch.float16,  # raises "probability tensor contains either `inf`, `nan` or element < 0" assertion during sampling
+                token=HF_TOKEN,  # Replace with actual token
+            ).to(device)
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map=device,
+                token=HF_TOKEN,  # Replace with actual token
+            )
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         return model, tokenizer

@@ -77,48 +85,66 @@ class BaseTask(ABC):
         """
         Handles multiple-choice questions where answers might have multiple tokens.
         """
-        # Ensure the tokenizer has a padding token
+        # Ensure the tokenizer has the required special tokens set
         if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token  # Use EOS token as PAD token
+            self.tokenizer.pad_token = self.tokenizer.eos_token

-        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
+        if self.model.config.pad_token_id is None:
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+        chat = [
+            {"role": "user", "content": "You are a multiple choice question-answering chatbot. Do not give an answer that is not included in the choices. Only answer with letters like A, B, C, D..."},
+            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+            {"role": "user", "content": f"{msg}"},
+        ]
+        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+        print(formatted_chat)
+        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
         input_ids = inputs.input_ids.to(self.model.device)
         attention_mask = inputs.attention_mask.to(self.model.device)

-        if self.model.config.pad_token_id is None:
-            self.model.config.pad_token_id = self.tokenizer.eos_token_id
-
-        # Tokenize multi-token choices (do not flatten)
-        valid_token_ids = [self.tokenizer.encode(ans, add_special_tokens=False) for ans in choices]
-        print("Valid token IDs:", valid_token_ids)
+        # Build the option letters starting from 'A'
+        letters = [chr(ord('A') + i) for i in range(len(choices))]  # A, B, C, D, E, ...
+        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
+        flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
+        print(flattened_encoded_choices)

-        class MultipleChoiceLogitsProcessor:
-            def __init__(self, valid_token_ids):
-                self.valid_token_ids = valid_token_ids  # List of tokenized choices
+        allowed_tokens = flattened_encoded_choices
+        allowed_tokens += self.get_chat_template_tokens()  # Also allow the chat-template special tokens
+        allowed_token_ids = set(allowed_tokens)  # Ensure uniqueness

+        # Custom LogitsProcessor that restricts generation to the allowed tokens
+        class RestrictToABCDLogitsProcessor(LogitsProcessor):
             def __call__(self, input_ids, scores):
-                mask = torch.full_like(scores, float("-inf"))  # Mask everything by default
-
-                # Allow the tokens in choices
-                allowed_tokens = {token for tokens in self.valid_token_ids for token in tokens}
-                mask[:, list(allowed_tokens)] = scores[:, list(allowed_tokens)]  # Allow only these tokens
-
+                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
+                mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]  # Allow only the option-letter tokens
                 return mask
-
-        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor(valid_token_ids)])
+        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])

+        # Generate response
         output = self.model.generate(
             input_ids,
+            do_sample=True,
             attention_mask=attention_mask,
             max_new_tokens=max_new_tokens,
-            logits_processor=logits_processor
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            temperature=0.4,
+            logits_processor=logits_processor,
         )
-
-        # Decode and compare with choices to find the best match
-        generated_text = self.tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
-        best_match = max(choices, key=lambda choice: generated_text.startswith(choice))  # Pick closest match
-
-        return best_match
+        generated_ids = output[0]  # The generated sequence, including the prompt
+        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the prompt tokens
+        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return generated_text
+
+    def get_chat_template_tokens(self):
+        allowed_token_chat = [
+            {"role": "user", "content": ""},
+            {"role": "assistant", "content": ""}
+        ]
+        allowed_special_tokens = self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
+        return allowed_special_tokens
+

     @abstractmethod
     def load_dataset_from_hf(self):
src/deepeval/deepeval_task_manager.py CHANGED
@@ -53,6 +53,6 @@ class DeepEvalTaskManager:


 if __name__ == "__main__":
-    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["TURKISH_GENERAL_KNOWLEDGE"])
+    des = DeepEvalTaskManager("google/gemma-3-4b-it", ["TURKISH_GENERAL_KNOWLEDGE"])
     res = des.run_tasks()
     print(res)
src/deepeval/sentiment_analysis_task.py CHANGED
@@ -7,7 +7,8 @@ class SentimentAnalysisTask(BaseTask):
         super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)

     def load_dataset_from_hf(self):
-        return super().load_dataset_from_hf()
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))


     def evaluate(self) -> dict[str, Any]:
@@ -16,11 +17,16 @@ class SentimentAnalysisTask(BaseTask):
         n_correct = 0
         for row in self.dataset:
             sentence = row["sentence"]
-            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}"
+            choices = ["positive", "negative", "neutral"]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"
             messages = prompt
-            answer = self.generate_response_mcqa(messages, choices=["positive", "negative", "neutral"])
+            answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
+            print("Answer:", answer)
             responses.append(answer)
-            if row["sentiment"] == answer:
+            correct_answer_letter = "A" if row["sentiment"] == "positive" else "B" if row["sentiment"] == "negative" else "C" if row["sentiment"] == "neutral" else None
+            model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
+            if correct_answer_letter == model_answer_cleaned:
                 n_correct += 1

         acc = accuracy(n_correct, total_count)
src/deepeval/turkish_general_knowledge_task.py CHANGED
@@ -34,16 +34,26 @@ class TurkishGeneralKnowledgeTask(BaseTask):
                 category = 'hard'

             # Create a multiple-choice prompt to encourage letter output
-            formatted_choices = "\n".join([f"{i}: {choice}" for i, choice in enumerate(choices)])
-            prompt = f"Soru: {question}\nSeçenekler:\n{formatted_choices}\nSorunun doğru cevabı hangisidir?"
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])

-            print(f"Prompt: {prompt}")
-            model_answer = self.generate_response_mcqa_multi_token(prompt, choices=choices, max_new_tokens=30)
+            instruction = ""
+            message = f"{question}\nChoices:\n{formatted_choices}\n{instruction}\n"
+
+            # """Wrap the result between final_answer tags. For example: <final_answer/> letter <final_answer>.
+            # """
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=30)
             responses.append(model_answer)
             print(f"Correct Answer: {choices[answer_index]}")
             print(f"Model Answer: {model_answer}")
+
+            # TODO: Move the answer cleaning into the mcqa function
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
+
             # Check if the answer is correct
-            if choices[answer_index] == model_answer:
+            correct_answer_letter = chr(65 + answer_index)
+            print("Correct Answer Letter:", correct_answer_letter)
+
+            if correct_answer_letter == model_answer_cleaned:
                 true += 1
                 difficulty_results[category]['correct'] += 1

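
The TODO in the hunk above notes that the answer cleaning should eventually live in the mcqa function itself. A possible shape for such a shared helper, as a sketch only (the name clean_mcqa_answer is hypothetical and not part of this commit):

def clean_mcqa_answer(raw_answer: str, n_choices: int):
    """Hypothetical helper: reduce a model response to a single option letter, or None."""
    cleaned = raw_answer.strip().replace("\n", "").replace(" ", "").upper()
    valid_letters = {chr(ord("A") + i) for i in range(n_choices)}
    # Accept answers like "B", "B:", "B)..." by keeping only the leading letter if it is a valid option.
    if cleaned and cleaned[0] in valid_letters:
        return cleaned[0]
    return None

# Usage in the tasks above would then reduce to:
#   if clean_mcqa_answer(model_answer, len(choices)) == chr(65 + answer_index):
#       true += 1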