File size: 1,554 Bytes
b404f80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import os
import torch
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPTNeoForCausalLM

# Debugging aids: show where the script is running from and which
# chat_with_tars module Python actually resolves (guards against a
# stale copy shadowing the intended file).
cwd = os.getcwd()
print(f"Current Working Directory: {cwd}")

import chat_with_tars

module_path = chat_with_tars.__file__
print(f"chat_with_tars file path: {module_path}")

def patch_pad_token(model_name, tokenizer_class, model_class):
    """Ensure *model_name*'s tokenizer has a padding token and save the pair.

    Loads the tokenizer/model, adds a ``[PAD]`` token (resizing the model's
    embedding matrix to match) only when the tokenizer lacks one, records the
    pad id on the model config, and saves both to a local directory named
    after ``model_name``.

    Args:
        model_name: Hugging Face model id (also used as the local save path).
        tokenizer_class: Tokenizer class exposing ``from_pretrained``.
        model_class: Model class exposing ``from_pretrained``.
    """
    print(f"🔄 Loading tokenizer and model: {model_name}...")
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)

    # Debugging: Print tokenizer and model configurations
    print(f"Tokenizer Configuration: {tokenizer}")
    print(f"Model Configuration: {model.config}")

    # Only patch when a pad token is actually missing (e.g. GPT-Neo).
    # BERT already ships with [PAD]; unconditionally adding and resizing
    # would needlessly rewrite an unchanged checkpoint.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

        # Debugging: Print the new vocabulary size
        print(f"New Vocabulary Size: {len(tokenizer)}")
        print("✅ Padding token added and model resized.")
    else:
        print(f"ℹ️ Tokenizer already has a pad token: {tokenizer.pad_token}")

    # Propagate the pad id so downstream padding/generation uses it.
    model.config.pad_token_id = tokenizer.pad_token_id

    # Save the model with the new padding token.
    # NOTE(review): saving under the hub id creates a nested local
    # directory (e.g. 'EleutherAI/gpt-neo-125M') — confirm this is the
    # intended destination.
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

    print("✅ Model saved with padding token patched.")

if __name__ == "__main__":
    # Models whose tokenizers need a padding token patched in.
    targets = [
        # Patch GPT-Neo
        ('EleutherAI/gpt-neo-125M', GPT2Tokenizer, GPTNeoForCausalLM),
        # Patch BERT
        ('bert-base-uncased', BertTokenizer, BertModel),
    ]
    for hub_id, tokenizer_cls, model_cls in targets:
        patch_pad_token(hub_id, tokenizer_cls, model_cls)