from abc import ABC, abstractmethod
from datetime import datetime
import os
from typing import List, Optional, Tuple

import openai
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from peft import PeftModel  # NOTE(review): unused here, but kept — subclasses may rely on it
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessor,
    LogitsProcessorList,
)

load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_KEY = os.getenv("OPENAI_API_KEY")


class BaseTask(ABC):
    """Base class for evaluation tasks backed by a Hugging Face dataset and a causal LM.

    Subclasses must implement ``load_dataset_from_hf`` (a default implementation is
    provided and may be reused via ``super()``) and ``evaluate``.
    """

    # Class-level cache: every instance of every subclass shares one
    # (model, tokenizer) pair per model name, so weights load only once.
    _model_cache = {}

    def __init__(self, dataset_repo: str, model_name: str):
        self.dataset_repo = dataset_repo
        self.dataset = self.load_dataset_from_hf()

        # Choose a device map: HF "auto" sharding across multiple GPUs,
        # plain "cuda" for a single GPU, otherwise CPU.
        device_count = torch.cuda.device_count()
        if device_count > 1:
            self.device = "auto"
            print(f"Using {device_count} GPUs with auto config.")
        elif device_count == 1:
            self.device = "cuda"
            print(f"Using {device_count} GPU with cuda config.")
        else:
            self.device = "cpu"
            print("No GPU found. Using CPU.")

        self.model, self.tokenizer = self.get_cached_model(model_name, self.device)
        openai.api_key = OPENAI_KEY

    @classmethod
    def get_cached_model(cls, model_name: str, device):
        """Return the cached (model, tokenizer) pair for ``model_name``, loading it on first use.

        Ensures the same model and tokenizer are shared by every instance of every subclass.
        """
        if model_name not in cls._model_cache:
            cls._model_cache[model_name] = cls.load_model(model_name, device)
        return cls._model_cache[model_name]

    @staticmethod
    def load_model(model_name: str, device) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
        """Load model and tokenizer once; callers should go through ``get_cached_model``."""
        print(f"Loading model: {model_name}")
        start_time = datetime.now()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
            token=HF_TOKEN,
        )
        end_time = datetime.now()
        print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
        print("Model loaded.")
        # Pass the token here too: gated model repos gate the tokenizer as well.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
        return model, tokenizer

    def generate_response_mcqa(self, msg: str, max_new_tokens: int = 1,
                               choices: Optional[List[str]] = None) -> str:
        """Answer a multiple-choice question whose answers are single tokens.

        Generation is constrained so only the token ids of ``choices`` can be
        produced; the last generated token is decoded and returned.
        """
        choices = [] if choices is None else choices  # avoid shared mutable default

        # Ensure the tokenizer has a padding token (reuse EOS as PAD).
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
        # Move tensors to the model's device (consistent with the sibling
        # generate methods; "auto" device maps dispatch internally).
        if self.device == "auto":
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask
        else:
            input_ids = inputs.input_ids.to(self.model.device)
            attention_mask = inputs.attention_mask.to(self.model.device)

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

        # Token ids of the allowed answers; generation is masked to these only.
        valid_token_ids = [self.tokenizer.convert_tokens_to_ids(ans) for ans in choices]

        class MultipleChoiceLogitsProcessor(LogitsProcessor):
            """Block every token except the valid answer tokens."""

            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))
                mask[:, valid_token_ids] = scores[:, valid_token_ids]
                return mask

        logits_processor = LogitsProcessorList([MultipleChoiceLogitsProcessor()])

        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            logits_processor=logits_processor,
        )
        # Only the final generated token is the answer (max_new_tokens defaults to 1).
        answer = self.tokenizer.decode(output[0][-1])
        return answer

    def generate_response_mcqa_multi_token(self, msg: str, max_new_tokens: int = 2,
                                           choices: Optional[list] = None) -> str:
        """
        Handles multiple-choice questions where answers might have multiple tokens.

        The prompt is wrapped in the tokenizer's chat template and generation is
        restricted to the option letters (A, B, C, ...) plus chat-template tokens.
        """
        choices = [] if choices is None else choices  # avoid shared mutable default

        # Ensure tokenizer has proper special tokens set.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": "You are a multiple choice question-answering chatbot. Do not give an answer that is not included in the choices. Only answer with letters like A, B, C, D..."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)

        if self.device == "auto":
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask
        else:
            input_ids = inputs.input_ids.to(self.model.device)
            attention_mask = inputs.attention_mask.to(self.model.device)

        # Option letters A, B, C, ... — one per choice.
        letters = [chr(ord('A') + i) for i in range(len(choices))]
        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
        flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]

        allowed_tokens = flattened_encoded_choices
        allowed_tokens += self.get_chat_template_tokens()  # allow chat special tokens too
        allowed_token_ids = set(allowed_tokens)  # ensure uniqueness

        class RestrictToABCDLogitsProcessor(LogitsProcessor):
            """Block every token except the allowed option-letter / chat tokens."""

            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))
                mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]
                return mask

        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])

        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            logits_processor=logits_processor,
        )
        generated_ids = output[0]  # generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # strip the prompt part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return generated_text

    def generate_response(self, prompt: str, max_new_tokens: int = 100) -> str:
        """Generate a free-form chat response to ``prompt`` (sampled, temperature 0.7)."""
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.eos_token_id

        chat = [
            {"role": "user", "content": "You are a helpful AI assistant."},
            {"role": "assistant", "content": "I am here to help you with any questions you may have."},
            {"role": "user", "content": prompt},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(formatted_chat, return_tensors="pt",
                                padding=True, truncation=True)

        if self.device == "auto":
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask
        else:
            input_ids = inputs.input_ids.to(self.model.device)
            attention_mask = inputs.attention_mask.to(self.model.device)

        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        generated_ids = output[0]
        prompt_len = input_ids.shape[1]
        generated_tokens = generated_ids[prompt_len:]
        result = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return result

    def get_chat_template_tokens(self) -> List[int]:
        """Return the token ids an empty user/assistant exchange produces.

        These are the chat-template special tokens that must stay allowed when
        masking generation down to option letters.
        """
        allowed_token_chat = [
            {"role": "user", "content": ""},
            {"role": "assistant", "content": ""},
        ]
        allowed_special_tokens = self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
        return allowed_special_tokens

    def _load_sampled_dataset(self, sample_size: int):
        """Load the task dataset (train split) and subsample it to ``sample_size`` rows.

        Shared by ``load_dataset_from_hf`` and ``load_dataset_lmjudge_from_hf``,
        which previously duplicated this logic verbatim.
        """
        print("Loading dataset from Hugging Face.")
        start_time = datetime.now()
        dataset = load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
        print("Dataset loaded.")
        # Deterministic subsample (seed=42) so runs are comparable.
        if len(dataset) > sample_size:
            dataset = dataset.shuffle(seed=42).select(range(sample_size))
        end_time = datetime.now()
        print(f"Dataset loaded in {(end_time - start_time).seconds} seconds.")
        return dataset

    @abstractmethod
    def load_dataset_from_hf(self):
        """
        Define your own loading method if needed.
        Default implementation: load the train split and keep at most 50 rows.
        :return: Dataset
        """
        return self._load_sampled_dataset(50)

    def load_dataset_lmjudge_from_hf(self):
        """
        Define your own loading method if needed.
        Smaller sample (10 rows) for LLM-as-judge evaluation.
        :return: Dataset
        """
        return self._load_sampled_dataset(10)

    @abstractmethod
    def evaluate(self):
        """Run the task's evaluation; implemented by each subclass."""
        pass