# Spaces: running on an L4 GPU
from abc import ABC, abstractmethod | |
from datasets import load_dataset | |
import os | |
from dotenv import load_dotenv | |
import openai | |
from peft import PeftModel | |
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor | |
import torch | |
from typing import List | |
from datetime import datetime | |
load_dotenv() | |
HF_TOKEN=os.getenv("HF_TOKEN") | |
OPENAI_KEY = os.getenv("OPENAI_API_KEY") | |
class BaseTask(ABC):
    """Base class for Hugging Face dataset-backed evaluation tasks run against a causal LM.

    Subclasses supply a dataset repo id and a model id; this class handles
    device selection, model/tokenizer caching, dataset sampling, and several
    generation helpers (free-form and constrained multiple-choice).
    """

    # Class-level cache mapping model_name -> (model, tokenizer) so every
    # task instance shares a single loaded copy of each model.
    _model_cache = {}

    def __init__(self, dataset_repo, model_name):
        """
        :param dataset_repo: Hugging Face dataset repo id to load (train split).
        :param model_name: Hugging Face model id; loaded once and cached
            across all task instances.
        """
        self.dataset_repo = dataset_repo
        self.dataset = self.load_dataset_from_hf()

        # device_map policy: "auto" shards across multiple GPUs, "cuda" uses
        # the single GPU, otherwise fall back to CPU.
        device_count = torch.cuda.device_count()
        if device_count > 1:
            self.device = "auto"
            print(f"Using {device_count} GPUs with auto config.")
        elif device_count == 1:
            self.device = "cuda"
            print(f"Using {device_count} GPU with cuda config.")
        else:
            self.device = "cpu"
            print("No GPU found. Using CPU.")

        self.model, self.tokenizer = self.get_cached_model(model_name, self.device)
        openai.api_key = OPENAI_KEY

    @classmethod
    def get_cached_model(cls, model_name, device):
        """Return a cached (model, tokenizer) pair, loading it on first use.

        Fix: decorated with @classmethod. Without the decorator, calling this
        through an instance bound ``cls`` to the instance, and the inner
        ``cls.load_model(...)`` call then received an extra implicit argument,
        raising TypeError on the very first load.
        """
        if model_name not in cls._model_cache:
            cls._model_cache[model_name] = cls.load_model(model_name, device)
        return cls._model_cache[model_name]

    @staticmethod
    def load_model(model_name: str, device):
        """Load the model (fp16) and its tokenizer from the Hub.

        Fix: decorated with @staticmethod so get_cached_model can call it
        without an implicit first argument. Also passes the HF token to the
        tokenizer so gated repos load consistently with the model.

        :param model_name: Hugging Face model id.
        :param device: device_map value ("auto", "cuda", or "cpu").
        :return: (model, tokenizer) tuple.
        """
        print(f"Loading model: {model_name}")
        start_time = datetime.now()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
            token=HF_TOKEN,
        )
        end_time = datetime.now()
        print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
        print("Model loaded.")
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
        return model, tokenizer

    def generate_response_mcqa(self, msg, max_new_tokens=1, choices: List[str] = None):
        """Answer a multiple-choice question with a single constrained token.

        Generation is restricted to the token ids of the given answer strings
        (each assumed to map to a single token).

        :param msg: raw prompt text (no chat template applied).
        :param max_new_tokens: tokens to generate (default 1).
        :param choices: allowed answer strings. Fix: default changed from a
            shared mutable ``[]`` to None.
        :return: decoded final token of the generated sequence.
        """
        if choices is None:
            choices = []
        # Ensure the tokenizer has a padding token (fall back to EOS).
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
        # Fix: move tensors onto the model device (matches the two sibling
        # generate helpers); previously CPU tensors were fed to a GPU model.
        if self.device == "auto":
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask
        else:
            input_ids = inputs.input_ids.to(self.model.device)
            attention_mask = inputs.attention_mask.to(self.model.device)
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

        # Token ids of the allowed answer strings.
        valid_token_ids = [self.tokenizer.convert_tokens_to_ids(ans) for ans in choices]

        class MultipleChoiceLogitsProcessor(LogitsProcessor):
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))
                mask[:, valid_token_ids] = scores[:, valid_token_ids]  # allow only valid tokens
                return mask

        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor()])
        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            logits_processor=logits_processor,
        )
        # Decode only the final (answer) token.
        return self.tokenizer.decode(output[0][-1])

    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=2, choices: list = None):
        """
        Handles multiple-choice questions where answers might have multiple tokens.

        The question is wrapped in the model's chat template and generation is
        restricted to the option letters (A, B, C, ...) plus the tokenizer's
        chat-template special tokens.

        :param msg: question text (options included by the caller).
        :param max_new_tokens: tokens to generate (default 2).
        :param choices: option strings; only their COUNT is used, to derive
            the letters. Fix: default changed from a shared mutable ``[]``
            to None.
        :return: decoded generated text (letter answer).
        """
        if choices is None:
            choices = []
        # Ensure tokenizer has proper special tokens set.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": "You are a multiple choice question-answering chatbot. Do not give an answer that is not included in the choices. Only answer with letters like A, B, C, D..."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        # With device_map="auto" the model is sharded; let generate handle
        # placement. Otherwise move inputs to the model device.
        if self.device == "auto":
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask
        else:
            input_ids = inputs.input_ids.to(self.model.device)
            attention_mask = inputs.attention_mask.to(self.model.device)

        # Option letters A, B, C, ... — one per choice.
        letters = [chr(ord('A') + i) for i in range(len(choices))]
        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
        flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]

        allowed_tokens = flattened_encoded_choices
        allowed_tokens += self.get_chat_template_tokens()  # chat special tokens
        allowed_token_ids = set(allowed_tokens)  # ensure uniqueness

        # Custom LogitsProcessor restricting generation to the allowed ids.
        class RestrictToABCDLogitsProcessor(LogitsProcessor):
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))  # block all tokens
                mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]
                return mask

        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])

        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            logits_processor=logits_processor,
        )
        generated_ids = output[0]  # generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # strip the prompt
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    def generate_response(self, prompt: str, max_new_tokens: int = 100) -> str:
        """Generate a free-form chat response to ``prompt``.

        :param prompt: user message; wrapped in the model's chat template.
        :param max_new_tokens: generation budget (default 100).
        :return: decoded completion (prompt stripped, special tokens skipped).
        """
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

        chat = [
            {"role": "user", "content": "You are a helpful AI assistant."},
            {"role": "assistant", "content": "I am here to help you with any questions you may have."},
            {"role": "user", "content": prompt},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        if self.device == "auto":
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask
        else:
            input_ids = inputs.input_ids.to(self.model.device)
            attention_mask = inputs.attention_mask.to(self.model.device)

        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        generated_ids = output[0]
        prompt_len = input_ids.shape[1]
        generated_tokens = generated_ids[prompt_len:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    def get_chat_template_tokens(self):
        """Return the token ids produced by an empty user/assistant exchange.

        Used to whitelist the chat template's special tokens during
        constrained (multiple-choice) generation.
        """
        allowed_token_chat = [
            {"role": "user", "content": ""},
            {"role": "assistant", "content": ""}
        ]
        return self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)

    def _load_sampled_dataset(self, sample_size):
        """Load the train split of ``self.dataset_repo`` and down-sample it.

        Shared implementation for the two public loaders below (they were
        copy-paste duplicates differing only in the sample size).

        :param sample_size: maximum number of rows to keep (seeded shuffle).
        :return: Dataset
        """
        print("Loading dataset from Hugging Face.")
        start_time = datetime.now()
        dataset = load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
        print("Dataset loaded.")
        # Deterministic sub-sample so runs are comparable.
        if len(dataset) > sample_size:
            dataset = dataset.shuffle(seed=42).select(range(sample_size))
        end_time = datetime.now()
        print(f"Dataset loaded in {(end_time - start_time).seconds} seconds.")
        return dataset

    def load_dataset_from_hf(self):
        """
        Define your own loading method if needed.
        :return: Dataset (at most 50 examples)
        """
        return self._load_sampled_dataset(50)

    def load_dataset_lmjudge_from_hf(self):
        """
        Define your own loading method if needed.
        :return: Dataset (at most 10 examples, for LM-as-judge evaluation)
        """
        return self._load_sampled_dataset(10)

    def evaluate(self):
        """Run the task's evaluation; subclasses are expected to override."""
        pass