Spaces:

metunlp
/

gpu-backend

Paused

Ahmet Kaan Sever committed on Mar 15

Commit

76d5f6d

1 Parent(s): 597b990

Added Turkish General Knowledge task.

Created turkish_general_knowledge_task.py.
Added generate_response_mcqa_multi_token,
because the original function was not built to handle choices that span multiple tokens.
Also created a .gitignore.

Files changed (4) hide show

.gitignore +10 -0
src/deepeval/base_task.py +48 -0
src/deepeval/deepeval_task_manager.py +9 -1
src/deepeval/turkish_general_knowledge_task.py +59 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.DS_Store
+.env
+.vscode/
+.idea/
+*.log
+node_modules/

src/deepeval/base_task.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
 from datasets import load_dataset
 import os
 from dotenv import load_dotenv
@@ -71,6 +72,53 @@ class BaseTask(ABC):
         answer = self.tokenizer.decode(output[0][-1])
         return answer
     @abstractmethod
     def load_dataset_from_hf(self):

 from abc import ABC, abstractmethod
+import itertools
 from datasets import load_dataset
 import os
 from dotenv import load_dotenv
         answer = self.tokenizer.decode(output[0][-1])
         return answer
+    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=5, choices: list = []):
+        """
+        Handles multiple-choice questions where answers might have multiple tokens.
+        """
+        # Ensure the tokenizer has a padding token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token  # Use EOS token as PAD token
+        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
+        input_ids = inputs.input_ids.to(self.model.device)
+        attention_mask = inputs.attention_mask.to(self.model.device)
+        if self.model.config.pad_token_id is None:
+            self.model.config.pad_token_id = self.tokenizer.eos_token_id
+        # Tokenize multi-token choices (do not flatten)
+        valid_token_ids = [self.tokenizer.encode(ans, add_special_tokens=False) for ans in choices]
+        print("Valid token IDs:", valid_token_ids)
+        class MultipleChoiceLogitsProcessor:
+            def __init__(self, valid_token_ids):
+                self.valid_token_ids = valid_token_ids  # List of tokenized choices
+            def __call__(self, input_ids, scores):
+                mask = torch.full_like(scores, float("-inf"))  # Mask everything by default
+                # Allow the tokens in choices
+                allowed_tokens = {token for tokens in self.valid_token_ids for token in tokens}
+                mask[:, list(allowed_tokens)] = scores[:, list(allowed_tokens)]  # Allow only these tokens
+                return mask
+        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor(valid_token_ids)])
+        output = self.model.generate(
+            input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=max_new_tokens,
+            logits_processor=logits_processor
+        )
+        # Decode and compare with choices to find the best match
+        generated_text = self.tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
+        best_match = max(choices, key=lambda choice: generated_text.startswith(choice))  # Pick closest match
+        return best_match
     @abstractmethod
     def load_dataset_from_hf(self):

src/deepeval/deepeval_task_manager.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 from dotenv import load_dotenv
 from enum import Enum
 from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
 from typing import List
 load_dotenv()
@@ -10,6 +11,7 @@ HF_TOKEN=os.getenv("HF_TOKEN")
 class Task(Enum):
     # SUMMARIZATION = "summarization"
     SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
 class DeepEvalTaskManager:
@@ -21,6 +23,7 @@ class DeepEvalTaskManager:
     def validate_tasks(self, user_tasks):
         """Validate user tasks and store method references."""
         print(self.available_tasks.keys())
         if not set(user_tasks).issubset(self.available_tasks.keys()):
             invalid_tasks = set(user_tasks) - self.available_tasks.keys()
             raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
@@ -42,9 +45,14 @@ class DeepEvalTaskManager:
         st_task = SentimentAnalysisTask(self.model_name)
         res = st_task.evaluate()
         return res
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["SENTIMENT_ANALYSIS"])
     res = des.run_tasks()
     print(res)

 import os
 from dotenv import load_dotenv
 from enum import Enum
+from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
 from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
 from typing import List
 load_dotenv()
 class Task(Enum):
+    # Evaluation tasks declared for the task manager. The script entry point in this
+    # file requests a task by member name (e.g. "TURKISH_GENERAL_KNOWLEDGE").
+    # NOTE(review): how the string values are consumed is not visible here; confirm against callers.
+    # SUMMARIZATION is currently disabled (left commented out).
     # SUMMARIZATION = "summarization"
     SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
+    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
 class DeepEvalTaskManager:
     def validate_tasks(self, user_tasks):
         """Validate user tasks and store method references."""
         print(self.available_tasks.keys())
+        print(user_tasks)
         if not set(user_tasks).issubset(self.available_tasks.keys()):
             invalid_tasks = set(user_tasks) - self.available_tasks.keys()
             raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
         st_task = SentimentAnalysisTask(self.model_name)
         res = st_task.evaluate()
         return res
+    def turkish_general_knowledge(self):
+        turkish_general_knowledge_task = TurkishGeneralKnowledgeTask(self.model_name)
+        res = turkish_general_knowledge_task.evaluate()
+        return res
 if __name__ == "__main__":
+    # Manual smoke run: build a manager for Llama-3.2-1B-Instruct that requests only the
+    # Turkish General Knowledge task, then print whatever run_tasks() returns.
+    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["TURKISH_GENERAL_KNOWLEDGE"])
     res = des.run_tasks()
     print(res)

src/deepeval/turkish_general_knowledge_task.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+import ast
+class TurkishGeneralKnowledgeTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/turkish_general_knowledge", model_name=model_name)
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+    def evaluate(self):
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+        for row in self.dataset:
+            total_count += 1
+            question = row["question"]
+            choices = ast.literal_eval(row["choices"]) # Convert string to list
+            answer_index = row["answer"]  # Assuming it's zero-based index
+            difficulty = row["difficulty"]
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            # Categorize difficulty
+            if difficulty <= 3:
+                category = 'easy'
+            elif 3 < difficulty <= 6:
+                category = 'medium'
+            else:
+                category = 'hard'
+            # Create a multiple-choice prompt to encourage index output
+            formatted_choices = "\n".join([f"{i}: {choice}" for i, choice in enumerate(choices)])
+            prompt = f"Soru: {question}\nSeçenekler:\n{formatted_choices}\nSorunun doğru cevabı hangisidir?"
+            print(f"Prompt: {prompt}")
+            model_answer = self.generate_response_mcqa_multi_token(prompt, choices=choices, max_new_tokens=30)
+            responses.append(model_answer)
+            print(f"Correct Answer: {choices[answer_index]}")
+            print(f"Model Answer: {model_answer}")
+            # Check if the answer is correct
+            if choices[answer_index] == model_answer:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+            difficulty_results[category]['total'] += 1
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            accuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {accuracy:.2%} ({stats['correct']}/{stats['total']})")
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        return true / total_count