Spaces:
Running
on
L4
Running
on
L4
Commit
·
51ae401
1
Parent(s):
3a6903d
Create deep eval suite
Browse files
src/deepeval/__init__.py
ADDED
File without changes
|
src/deepeval/base_task.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from abc import ABC, abstractmethod
from typing import List

import torch
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList

# Pull HF_TOKEN from a local .env file so gated/private HF repos can be fetched.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
class BaseTask(ABC):
    """Abstract base for evaluation tasks.

    Loads a HF dataset on construction and shares one (model, tokenizer)
    pair per model name across all task instances via a class-level cache.
    Subclasses implement `evaluate` and may override `load_dataset_from_hf`.
    """

    _model_cache = {}  # Class-level cache for models and tokenizers

    def __init__(self, dataset_repo, model_name):
        """
        :param dataset_repo: HF dataset repo id passed to `load_dataset`.
        :param model_name: HF model id loaded (and cached) for generation.
        """
        self.dataset_repo = dataset_repo
        self.dataset = self.load_dataset_from_hf()
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.get_cached_model(model_name, self.device)

    @classmethod
    def get_cached_model(cls, model_name, device):
        """Ensures the same model and tokenizer are used for every instance of subclasses."""
        if model_name not in cls._model_cache:
            cls._model_cache[model_name] = cls.load_model(model_name, device)
        return cls._model_cache[model_name]

    @staticmethod
    def load_model(model_name: str, device):
        """Loads model (fp16, mapped onto `device`) and tokenizer once."""
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
            token=HF_TOKEN,  # Auth token read from .env at module import.
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer

    def generate_response_mcqa(self, msg, max_new_tokens=1, choices: List[str] = None):
        """Generate an answer whose tokens are restricted to `choices`.

        :param msg: Prompt string fed to the tokenizer as-is.
        :param max_new_tokens: Number of tokens to generate (1 => single-token answer).
        :param choices: Candidate answer strings; logits of all other vocab
            entries are masked to -inf. Defaults to an empty list.
        :return: Decoded text of the last generated token.
        """
        # FIX: the original used a mutable default argument (choices=[]),
        # which is shared across calls; use None as the sentinel instead.
        if choices is None:
            choices = []

        # Ensure the tokenizer has a padding token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token  # Use EOS token as PAD token

        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

        # Get token IDs for answer choices.
        # NOTE(review): convert_tokens_to_ids maps each *whole string* to one
        # vocab entry; a choice that is not a single token in this vocabulary
        # maps to UNK. Confirm every choice is a single token for the tokenizer.
        valid_token_ids = [self.tokenizer.convert_tokens_to_ids(ans) for ans in choices]

        class MultipleChoiceLogitsProcessor:
            # Masks every logit except the allowed answer-token ids.
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))
                mask[:, valid_token_ids] = scores[:, valid_token_ids]  # Allow only valid tokens
                return mask

        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor()])

        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,  # Pass attention_mask to avoid warning
            max_new_tokens=max_new_tokens,
            logits_processor=logits_processor,
        )
        # Decode only the final token of the sequence — the generated answer.
        answer = self.tokenizer.decode(output[0][-1])

        return answer

    @abstractmethod
    def load_dataset_from_hf(self):
        """
        Default loader: train split of `self.dataset_repo`.
        Subclasses may call this via super() or define their own loading.

        :return: Dataset
        """
        return load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")

    @abstractmethod
    def evaluate(self):
        """Run the task and return its metric(s)."""
        pass
src/deepeval/deepeval_task_manager.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from enum import Enum
from typing import List

from dotenv import load_dotenv

from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask

# Load HF_TOKEN (and friends) from a local .env file.
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
class Task(Enum):
    """Registry of evaluation tasks; member names map to manager methods."""

    # SUMMARIZATION = "summarization"
    SENTIMENT_ANALYSIS = "sentiment_analysis"
class DeepEvalTaskManager:
    """Validates requested evaluation tasks and dispatches them to handlers.

    Each `Task` enum member name (e.g. "SENTIMENT_ANALYSIS") maps, via its
    lower-cased name, to a method on this class (e.g. `sentiment_analysis`).
    """

    def __init__(self, model_name, tasks: List[str]):
        """
        :param model_name: HF model id forwarded to every task.
        :param tasks: Task names to run (enum member names).
        :raises ValueError: if any requested task is unknown.
        """
        self.model_name = model_name
        # Map enum member name -> bound handler method on this instance.
        self.available_tasks = {task.name: getattr(self, task.name.lower()) for task in Task}
        self.tasks_to_run = self.validate_tasks(tasks)

    def validate_tasks(self, user_tasks):
        """Validate user tasks and store method references."""
        # FIX: removed leftover debug print of self.available_tasks.keys().
        if not set(user_tasks).issubset(self.available_tasks.keys()):
            invalid_tasks = set(user_tasks) - self.available_tasks.keys()
            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")

        # Store actual method references instead of strings
        return {task: self.available_tasks[task] for task in user_tasks}

    def run_tasks(self):
        """Execute validated tasks in order and collect their results."""
        results = {}
        for task_name, task_method in self.tasks_to_run.items():
            results[task_name] = task_method()  # Call the stored method reference

        return results

    def sentiment_analysis(self):
        """Run the sentiment-analysis task and return its score."""
        st_task = SentimentAnalysisTask(self.model_name)
        res = st_task.evaluate()
        return res
if __name__ == "__main__":
    # Smoke run: evaluate sentiment analysis with a small Llama instruct model.
    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["SENTIMENT_ANALYSIS"])
    print(des.run_tasks())
src/deepeval/sentiment_analysis_task.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.deepeval.base_task import BaseTask
|
2 |
+
|
class SentimentAnalysisTask(BaseTask):
    """Turkish sentiment-analysis evaluation over metunlp/sentiment_analysis_tr."""

    def __init__(self, model_name):
        super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        # Default loader (train split of the task repo) is sufficient here.
        return super().load_dataset_from_hf()

    def evaluate(self):
        """Score the model's single-token MCQA answers against gold labels.

        :return: Accuracy in [0, 1] over the whole dataset.
        """
        responses = []
        total_count = len(self.dataset)
        true = 0
        for row in self.dataset:
            sentence = row["sentence"]
            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}"
            messages = prompt
            answer = self.generate_response_mcqa(messages, choices=["positive", "negative", "neutral"])
            responses.append(answer)
            # FIX: the original incremented on `row["sentiment"] == "positive"`,
            # which counts gold-positive rows (label distribution), never the
            # model's answer. Compare the generated answer to the gold label.
            # NOTE(review): `answer` is one decoded token; strip() guards
            # against a leading space from the tokenizer — confirm the decoded
            # choices match the label strings exactly.
            if answer.strip() == row["sentiment"]:
                true += 1

        print(responses)
        return true / total_count