ecemumutlu committed on
Commit
51ae401
·
1 Parent(s): 3a6903d

Create deep eval suite

Browse files
src/deepeval/__init__.py ADDED
File without changes
src/deepeval/base_task.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
+ from abc import ABC, abstractmethod
+ from typing import List, Optional
+
+ import torch
+ from datasets import load_dataset
+ from dotenv import load_dotenv
+ from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
8
load_dotenv()  # read HF_TOKEN (and any other overrides) from a local .env file
HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face Hub access token; may be None
10
+
11
class BaseTask(ABC):
    """Abstract base for HF-dataset evaluation tasks sharing one cached model."""

    _model_cache = {}  # Class-level cache so all subclasses share model/tokenizer pairs

    def __init__(self, dataset_repo, model_name):
        self.dataset_repo = dataset_repo
        self.dataset = self.load_dataset_from_hf()
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.get_cached_model(model_name, self.device)

    @classmethod
    def get_cached_model(cls, model_name, device):
        """Return (model, tokenizer) for model_name, loading at most once per process."""
        if model_name not in cls._model_cache:
            cls._model_cache[model_name] = cls.load_model(model_name, device)
        return cls._model_cache[model_name]

    @staticmethod
    def load_model(model_name: str, device):
        """Load model and tokenizer from the Hugging Face Hub.

        :param model_name: Hub repo id of a causal LM
        :param device: device string/map passed to `device_map`
        :return: (model, tokenizer) tuple
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
            token=HF_TOKEN,
        )
        # Fix: pass the token here as well — gated repos (e.g. Llama) otherwise
        # fail at tokenizer load even though the model load succeeded.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
        return model, tokenizer

    def generate_response_mcqa(self, msg, max_new_tokens=1, choices: Optional[List[str]] = None):
        """Generate a constrained answer: only tokens from `choices` may be emitted.

        :param msg: prompt string
        :param max_new_tokens: number of tokens to generate (1 for single-token answers)
        :param choices: allowed answer strings. NOTE(review): each choice must map to a
            single vocabulary token — `convert_tokens_to_ids` on a multi-token word
            returns the unk id; verify the choices tokenize to single tokens.
        :return: decoded final generated token
        """
        if choices is None:  # avoid the mutable-default-argument pitfall
            choices = []

        # Ensure the tokenizer has a padding token (fall back to EOS).
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

        # Token ids of the allowed answers; every other logit gets masked to -inf.
        valid_token_ids = [self.tokenizer.convert_tokens_to_ids(ans) for ans in choices]

        class MultipleChoiceLogitsProcessor:
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))
                mask[:, valid_token_ids] = scores[:, valid_token_ids]  # allow only valid tokens
                return mask

        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor()])

        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,  # pass attention_mask to avoid a warning
            max_new_tokens=max_new_tokens,
            logits_processor=logits_processor,
        )
        # Decode only the last generated token — the constrained answer.
        answer = self.tokenizer.decode(output[0][-1])
        return answer

    @abstractmethod
    def load_dataset_from_hf(self):
        """
        Define your own loading method if needed.
        :return: Dataset
        """
        return load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")

    @abstractmethod
    def evaluate(self):
        pass
src/deepeval/deepeval_task_manager.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from enum import Enum
4
+ from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
5
+ from typing import List
6
load_dotenv()  # load environment overrides from a local .env file

HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face Hub token; may be None
9
+
10
class Task(Enum):
    """Enumeration of the evaluation tasks the manager knows how to dispatch."""

    # SUMMARIZATION = "summarization"  # not implemented yet
    SENTIMENT_ANALYSIS = "sentiment_analysis"
13
+
14
+
15
class DeepEvalTaskManager:
    """Resolves requested task names to handler methods and runs them in order."""

    def __init__(self, model_name, tasks: List[str]):
        self.model_name = model_name
        # Map every Task member name to its handler method on this instance
        # (e.g. "SENTIMENT_ANALYSIS" -> self.sentiment_analysis).
        self.available_tasks = {task.name: getattr(self, task.name.lower()) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Validate user tasks and return {name: bound method} for each.

        :param user_tasks: iterable of Task member names (e.g. "SENTIMENT_ANALYSIS")
        :raises ValueError: if any requested name is not an available task
        """
        # Fix: removed leftover debug print of available task keys.
        if not set(user_tasks).issubset(self.available_tasks.keys()):
            invalid_tasks = set(user_tasks) - self.available_tasks.keys()
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
        # Store actual method references instead of strings.
        return {task: self.available_tasks[task] for task in user_tasks}

    def run_tasks(self):
        """Execute the validated tasks in order; return {task_name: result}."""
        results = {}
        for task_name, task_method in self.tasks_to_run.items():
            results[task_name] = task_method()  # call the stored method reference
        return results

    def sentiment_analysis(self):
        """Run the Turkish sentiment-analysis benchmark; return its score."""
        st_task = SentimentAnalysisTask(self.model_name)
        res = st_task.evaluate()
        return res
43
+
44
+
45
if __name__ == "__main__":
    # Smoke test: run sentiment analysis with a small instruct model.
    manager = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["SENTIMENT_ANALYSIS"])
    print(manager.run_tasks())
src/deepeval/sentiment_analysis_task.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.deepeval.base_task import BaseTask
2
+
3
class SentimentAnalysisTask(BaseTask):
    """Turkish sentiment-analysis benchmark over metunlp/sentiment_analysis_tr."""

    def __init__(self, model_name):
        super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        # The default loading behavior (train split) is sufficient for this task.
        return super().load_dataset_from_hf()

    def evaluate(self):
        """Score the model on every dataset row.

        :return: accuracy in [0, 1] — the fraction of rows where the constrained
            generation matches the gold `sentiment` label (0.0 on an empty dataset).
        """
        responses = []
        total_count = len(self.dataset)
        correct = 0
        for row in self.dataset:
            sentence = row["sentence"]
            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}"
            answer = self.generate_response_mcqa(
                prompt, choices=["positive", "negative", "neutral"]
            )
            responses.append(answer)
            # Fix: compare the model's answer to the gold label. The original
            # counted rows whose *gold* label was "positive", so the returned
            # value was just the positive-class prevalence, not accuracy.
            if answer == row["sentiment"]:
                correct += 1

        print(responses)
        # Guard against division by zero on an empty dataset.
        return correct / total_count if total_count else 0.0
26
+