Spaces:

metunlp
/

model-eval-be

Sleeping

Ahmet Kaan Sever committed 22 days ago

Commit

9e6ede8

1 Parent(s): 76d5f6d

Fixed TRGenKnowledge task and mcqa generation function

generate_response_mcqa_multi_token now works correctly for all kinds of choices.
The model is restricted to generating option letters (A, B, C, D, ...).
Also added support for Gemma models.

Files changed (5) hide show

requirements.txt +1 -0
src/deepeval/base_task.py +60 -34
src/deepeval/deepeval_task_manager.py +1 -1
src/deepeval/sentiment_analysis_task.py +10 -4
src/deepeval/turkish_general_knowledge_task.py +15 -5

requirements.txt CHANGED Viewed

@@ -2,6 +2,7 @@ fastapi
 uvicorn[standard]
 # lm_eval==0.4.3
 git+https://github.com/ecemumutlu/lm-evaluation-harness.git
 python-jose
 python-multipart
 deepeval

 uvicorn[standard]
 # lm_eval==0.4.3
 git+https://github.com/ecemumutlu/lm-evaluation-harness.git
+git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
 python-jose
 python-multipart
 deepeval

src/deepeval/base_task.py CHANGED Viewed

@@ -3,7 +3,7 @@ import itertools
 from datasets import load_dataset
 import os
 from dotenv import load_dotenv
-from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
 import torch
 from typing import List
 load_dotenv()
@@ -29,12 +29,20 @@ class BaseTask(ABC):
     @staticmethod
     def load_model(model_name: str, device):
         """Loads model and tokenizer once and caches it."""
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16,
-            device_map=device,
-            token=HF_TOKEN,  # Replace with actual token
-        )
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         return model, tokenizer
@@ -77,48 +85,66 @@ class BaseTask(ABC):
         """
         Handles multiple-choice questions where answers might have multiple tokens.
         """
-        # Ensure the tokenizer has a padding token
         if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token  # Use EOS token as PAD token
-        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
         input_ids = inputs.input_ids.to(self.model.device)
         attention_mask = inputs.attention_mask.to(self.model.device)
-        if self.model.config.pad_token_id is None:
-            self.model.config.pad_token_id = self.tokenizer.eos_token_id
-        # Tokenize multi-token choices (do not flatten)
-        valid_token_ids = [self.tokenizer.encode(ans, add_special_tokens=False) for ans in choices]
-        print("Valid token IDs:", valid_token_ids)
-        class MultipleChoiceLogitsProcessor:
-            def __init__(self, valid_token_ids):
-                self.valid_token_ids = valid_token_ids  # List of tokenized choices
             def __call__(self, input_ids, scores):
-                mask = torch.full_like(scores, float("-inf"))  # Mask everything by default
-                # Allow the tokens in choices
-                allowed_tokens = {token for tokens in self.valid_token_ids for token in tokens}
-                mask[:, list(allowed_tokens)] = scores[:, list(allowed_tokens)]  # Allow only these tokens
                 return mask
-        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor(valid_token_ids)])
         output = self.model.generate(
             input_ids,
             attention_mask=attention_mask,
             max_new_tokens=max_new_tokens,
-            logits_processor=logits_processor
         )
-        # Decode and compare with choices to find the best match
-        generated_text = self.tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
-        best_match = max(choices, key=lambda choice: generated_text.startswith(choice))  # Pick closest match
-        return best_match
     @abstractmethod
     def load_dataset_from_hf(self):

 from datasets import load_dataset
 import os
 from dotenv import load_dotenv
+from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor, Gemma3ForCausalLM
 import torch
 from typing import List
 load_dotenv()
     @staticmethod
     def load_model(model_name: str, device):
         """Loads model and tokenizer once and caches it."""
+        if "gemma" in model_name:
+            model = Gemma3ForCausalLM.from_pretrained(
+                model_name,
+                #device_map=device, #Gives Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device. error
+                #torch_dtype=torch.float16, ##Gives Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed error.
+                token=HF_TOKEN,  # Replace with actual token
+        ).to(device)
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map=device,
+                token=HF_TOKEN,  # Replace with actual token
+            )
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         return model, tokenizer
         """
         Handles multiple-choice questions where answers might have multiple tokens.
         """
+        # Ensure tokenizer has proper special tokens set
         if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if self.model.config.pad_token_id is None:
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+        chat = [
+                {"role": "user", "content": "You are a multiple choice question-answering chatbot. Do not give an answer that is not included in the choices. Only answer with letters like A, B, C, D..."},
+                {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+                {"role": "user", "content": f"{msg}"},
+            ]
+        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+        print(formatted_chat)
+        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
         input_ids = inputs.input_ids.to(self.model.device)
         attention_mask = inputs.attention_mask.to(self.model.device)
+        # Generate the sequence of letters starting from 'A'
+        letters = [chr(ord('A') + i) for i in range(len(choices))]  # Create option letters A, B, C, D, E, ...
+        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
+        flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
+        print(flattened_encoded_choices)
+        allowed_tokens = flattened_encoded_choices
+        allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
+        allowed_token_ids = set(allowed_tokens)  # Ensure uniqueness
+        # Custom LogitsProcessor to restrict generation
+        class RestrictToABCDLogitsProcessor(LogitsProcessor):
             def __call__(self, input_ids, scores):
+                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
+                mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]  # Allow only A, B, C, D tokens
                 return mask
+        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
+        # Generate response
         output = self.model.generate(
             input_ids,
+            do_sample=True,
             attention_mask=attention_mask,
             max_new_tokens=max_new_tokens,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            temperature=0.4,
+            logits_processor=logits_processor,
         )
+        generated_ids = output[0]  # The generated sequence including the prompt
+        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return generated_text
+    def get_chat_template_tokens(self):
+        allowed_token_chat = [
+            {"role": "user", "content": ""},
+            {"role": "assistant", "content": ""}
+        ]
+        allowed_special_tokens = self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
+        return allowed_special_tokens
     @abstractmethod
     def load_dataset_from_hf(self):

src/deepeval/deepeval_task_manager.py CHANGED Viewed

@@ -53,6 +53,6 @@ class DeepEvalTaskManager:
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["TURKISH_GENERAL_KNOWLEDGE"])
     res = des.run_tasks()
     print(res)

 if __name__ == "__main__":
+    des = DeepEvalTaskManager("google/gemma-3-4b-it", ["TURKISH_GENERAL_KNOWLEDGE"])
     res = des.run_tasks()
     print(res)

src/deepeval/sentiment_analysis_task.py CHANGED Viewed

@@ -7,7 +7,8 @@ class SentimentAnalysisTask(BaseTask):
         super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
     def load_dataset_from_hf(self):
-        return super().load_dataset_from_hf()
     def evaluate(self) -> dict[str, Any]:
@@ -16,11 +17,16 @@ class SentimentAnalysisTask(BaseTask):
         n_correct = 0
         for row in self.dataset:
             sentence = row["sentence"]
-            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}"
             messages = prompt
-            answer = self.generate_response_mcqa(messages, choices=["positive", "negative", "neutral"])
             responses.append(answer)
-            if row["sentiment"] == answer:
                 n_correct += 1
         acc = accuracy(n_correct, total_count)

         super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
     def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
     def evaluate(self) -> dict[str, Any]:
         n_correct = 0
         for row in self.dataset:
             sentence = row["sentence"]
+            choices=["positive", "negative", "neutral"]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"
             messages = prompt
+            answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
+            print("Answer:", answer)
             responses.append(answer)
+            correct_answer_letter = "A" if row["sentiment"] == "positive" else "B" if row["sentiment"] == "negative" else "C" if row["sentiment"] == "neutral" else None
+            model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
+            if correct_answer_letter == model_answer_cleaned:
                 n_correct += 1
         acc = accuracy(n_correct, total_count)

src/deepeval/turkish_general_knowledge_task.py CHANGED Viewed

@@ -34,16 +34,26 @@ class TurkishGeneralKnowledgeTask(BaseTask):
                 category = 'hard'
             # Create a multiple-choice prompt to encourage index output
-            formatted_choices = "\n".join([f"{i}: {choice}" for i, choice in enumerate(choices)])
-            prompt = f"Soru: {question}\nSeçenekler:\n{formatted_choices}\nSorunun doğru cevabı hangisidir?"
-            print(f"Prompt: {prompt}")
-            model_answer = self.generate_response_mcqa_multi_token(prompt, choices=choices, max_new_tokens=30)
             responses.append(model_answer)
             print(f"Correct Answer: {choices[answer_index]}")
             print(f"Model Answer: {model_answer}")
             # Check if the answer is correct
-            if choices[answer_index] == model_answer:
                 true += 1
                 difficulty_results[category]['correct'] += 1

                 category = 'hard'
             # Create a multiple-choice prompt to encourage index output
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            instruction = ""
+            message = f"{question}\nChoices:\n{formatted_choices}\n{instruction}\n"
+            #"""Wrap the result between final_answer tags. For example: <final_answer/> letter <final_answer>.
+            #"""
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=30)
             responses.append(model_answer)
             print(f"Correct Answer: {choices[answer_index]}")
             print(f"Model Answer: {model_answer}")
+            #TODO: Make the cleaning in the mcqa function
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
             # Check if the answer is correct
+            correct_answer_letter = chr(65 + answer_index)
+            print("Correct Answer Letter:", correct_answer_letter)
+            if correct_answer_letter == model_answer_cleaned:
                 true += 1
                 difficulty_results[category]['correct'] += 1