# model-eval-be / src/deepeval/base_task.py
# Author: Ahmet Kaan Sever
# Commit: Fixed lm judge abstraction (f74f2a9)
import os
from abc import ABC, abstractmethod
from datetime import datetime
from typing import List, Optional

import openai
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor
load_dotenv()
HF_TOKEN=os.getenv("HF_TOKEN")
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
class BaseTask(ABC):
    """Abstract base class for LM evaluation tasks.

    Loads a Hugging Face dataset and a causal LM (cached class-wide so every
    task instance shares one model/tokenizer pair), and provides generation
    helpers: logits-restricted multiple-choice answering and free-form
    chat-template generation. Subclasses implement ``load_dataset_from_hf``
    and ``evaluate``.
    """

    _model_cache = {}  # Class-level cache: model_name -> (model, tokenizer)

    def __init__(self, dataset_repo, model_name):
        """
        :param dataset_repo: Hugging Face dataset repo id for load_dataset.
        :param model_name: Hugging Face model id; loaded once and cached.
        """
        self.dataset_repo = dataset_repo
        self.dataset = self.load_dataset_from_hf()

        # Choose a device_map string: "auto" shards across multiple GPUs
        # (accelerate handles placement), "cuda" for a single GPU.
        device_count = torch.cuda.device_count()
        if device_count > 1:
            self.device = "auto"
            print(f"Using {device_count} GPUs with auto config.")
        elif device_count == 1:
            self.device = "cuda"
            print(f"Using {device_count} GPU with cuda config.")
        else:
            self.device = "cpu"
            print("No GPU found. Using CPU.")

        self.model, self.tokenizer = self.get_cached_model(model_name, self.device)
        openai.api_key = OPENAI_KEY  # Needed for OpenAI-backed (LM-judge) calls.

    @classmethod
    def get_cached_model(cls, model_name, device):
        """Ensures the same model and tokenizer are used for every instance of subclasses."""
        if model_name not in cls._model_cache:
            cls._model_cache[model_name] = cls.load_model(model_name, device)
        return cls._model_cache[model_name]

    @staticmethod
    def load_model(model_name: str, device):
        """Load a fp16 causal LM and its tokenizer from the Hugging Face Hub.

        :param model_name: Hub model id.
        :param device: device_map value ("auto", "cuda", or "cpu").
        :return: (model, tokenizer) tuple.
        """
        print(f"Loading model: {model_name}")
        start_time = datetime.now()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
            token=HF_TOKEN,
        )
        end_time = datetime.now()
        print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
        print("Model loaded.")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer

    def _ensure_pad_token(self):
        """Guarantee tokenizer and model config both have a pad token (EOS fallback)."""
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

    def _prepare_inputs(self, text: str):
        """Tokenize `text` and move tensors to the model's device.

        With device_map="auto" the tensors are left on CPU — accelerate
        dispatches them to the correct shard itself.
        :return: (input_ids, attention_mask) tensors.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        if self.device == "auto":
            return inputs.input_ids, inputs.attention_mask
        return (
            inputs.input_ids.to(self.model.device),
            inputs.attention_mask.to(self.model.device),
        )

    def generate_response_mcqa(self, msg, max_new_tokens=1, choices: Optional[List[str]] = None):
        """Answer a multiple-choice question by restricting logits to the choice tokens.

        :param msg: prompt text (no chat template applied).
        :param max_new_tokens: tokens to generate (default 1 — a single answer letter).
        :param choices: answer strings; each is assumed to map to a single
            tokenizer token (TODO confirm for multi-char choices).
        :return: decoded final generated token.
        """
        # None default instead of a mutable [] default argument.
        choices = choices if choices is not None else []
        self._ensure_pad_token()
        # Fix: move inputs to the model device, as the other generate methods do.
        input_ids, attention_mask = self._prepare_inputs(msg)

        valid_token_ids = [self.tokenizer.convert_tokens_to_ids(ans) for ans in choices]

        class MultipleChoiceLogitsProcessor(LogitsProcessor):
            def __call__(self, input_ids, scores):
                # Block everything except the valid answer tokens.
                mask = torch.full_like(scores, float("-inf"))
                mask[:, valid_token_ids] = scores[:, valid_token_ids]
                return mask

        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor()])
        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            logits_processor=logits_processor,
        )
        # The last token of the output sequence is the generated answer.
        return self.tokenizer.decode(output[0][-1])

    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=2, choices: Optional[list] = None):
        """
        Handles multiple-choice questions where answers might have multiple tokens.

        Wraps `msg` in a chat template and restricts sampling to the option
        letters (A, B, C, ...) plus the chat-template special tokens.
        :return: decoded generated text (special tokens stripped).
        """
        choices = choices if choices is not None else []
        self._ensure_pad_token()

        chat = [
            {"role": "user", "content": "You are a multiple choice question-answering chatbot. Do not give an answer that is not included in the choices. Only answer with letters like A, B, C, D..."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        input_ids, attention_mask = self._prepare_inputs(formatted_chat)

        # Option letters A, B, C, ... — one per choice.
        letters = [chr(ord('A') + i) for i in range(len(choices))]
        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
        # Flatten, then add the chat-template special tokens; dedupe via set.
        allowed_token_ids = {tok for ids in encoded_choices for tok in ids}
        allowed_token_ids.update(self.get_chat_template_tokens())
        allowed = list(allowed_token_ids)

        class RestrictToABCDLogitsProcessor(LogitsProcessor):
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
                mask[:, allowed] = scores[:, allowed]  # Allow only option-letter tokens
                return mask

        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            logits_processor=logits_processor,
        )
        # Strip the prompt portion, keep only newly generated tokens.
        generated_tokens = output[0][input_ids.shape[1]:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    def generate_response(self, prompt: str, max_new_tokens: int = 100) -> str:
        """Free-form generation: wrap `prompt` in a chat template and sample.

        :param prompt: user message.
        :param max_new_tokens: generation budget.
        :return: decoded completion (prompt excluded, special tokens stripped).
        """
        self._ensure_pad_token()
        chat = [
            {"role": "user", "content": "You are a helpful AI assistant."},
            {"role": "assistant", "content": "I am here to help you with any questions you may have."},
            {"role": "user", "content": prompt},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True,
        )
        input_ids, attention_mask = self._prepare_inputs(formatted_chat)
        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        generated_tokens = output[0][input_ids.shape[1]:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    def get_chat_template_tokens(self):
        """Return the token ids an empty chat template produces (special tokens only)."""
        allowed_token_chat = [
            {"role": "user", "content": ""},
            {"role": "assistant", "content": ""}
        ]
        return self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)

    def _load_capped_split(self, cap: int):
        """Load the 'train' split and subsample to at most `cap` rows (seed 42 shuffle)."""
        print("Loading dataset from Hugging Face.")
        start_time = datetime.now()
        dataset = load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
        print("Dataset loaded.")
        if len(dataset) > cap:
            dataset = dataset.shuffle(seed=42).select(range(cap))
        end_time = datetime.now()
        print(f"Dataset loaded in {(end_time - start_time).seconds} seconds.")
        return dataset

    @abstractmethod
    def load_dataset_from_hf(self):
        """Default dataset loader (subclasses may invoke via super()): caps at 50 rows.

        :return: Dataset
        """
        return self._load_capped_split(50)

    def load_dataset_lmjudge_from_hf(self):
        """Dataset loader for LM-judge tasks: caps at 10 rows (judge calls are costly).

        :return: Dataset
        """
        return self._load_capped_split(10)

    @abstractmethod
    def evaluate(self):
        """Run this task's evaluation; implemented by each subclass."""
        pass