File size: 1,554 Bytes
b404f80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import os
import torch
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPTNeoForCausalLM

# Debugging aids: show where the script is running from and which
# chat_with_tars module Python actually resolves (guards against a
# stale copy shadowing the intended file).
cwd = os.getcwd()
print(f"Current Working Directory: {cwd}")

import chat_with_tars

module_path = chat_with_tars.__file__
print(f"chat_with_tars file path: {module_path}")

def patch_pad_token(model_name, tokenizer_class, model_class):
    """Ensure *model_name*'s tokenizer has a padding token and save the pair.

    Loads the tokenizer/model, adds a ``[PAD]`` token (resizing the model's
    embedding matrix to match) only when the tokenizer lacks one, records the
    pad id on the model config, and saves both to a local directory named
    after ``model_name``.

    Args:
        model_name: Hugging Face model id (also used as the local save path).
        tokenizer_class: Tokenizer class exposing ``from_pretrained``.
        model_class: Model class exposing ``from_pretrained``.
    """
    print(f"🔄 Loading tokenizer and model: {model_name}...")
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)

    # Debugging: Print tokenizer and model configurations
    print(f"Tokenizer Configuration: {tokenizer}")
    print(f"Model Configuration: {model.config}")

    # Only patch when a pad token is actually missing (e.g. GPT-Neo).
    # BERT already ships with [PAD]; unconditionally adding and resizing
    # would needlessly rewrite an unchanged checkpoint.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

        # Debugging: Print the new vocabulary size
        print(f"New Vocabulary Size: {len(tokenizer)}")
        print("✅ Padding token added and model resized.")
    else:
        print(f"ℹ️ Tokenizer already has a pad token: {tokenizer.pad_token}")

    # Propagate the pad id so downstream padding/generation uses it.
    model.config.pad_token_id = tokenizer.pad_token_id

    # Save the model with the new padding token.
    # NOTE(review): saving under the hub id creates a nested local
    # directory (e.g. 'EleutherAI/gpt-neo-125M') — confirm this is the
    # intended destination.
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

    print("✅ Model saved with padding token patched.")

if __name__ == "__main__":
    # Models whose tokenizers need a padding token patched in.
    targets = [
        # Patch GPT-Neo
        ('EleutherAI/gpt-neo-125M', GPT2Tokenizer, GPTNeoForCausalLM),
        # Patch BERT
        ('bert-base-uncased', BertTokenizer, BertModel),
    ]
    for hub_id, tokenizer_cls, model_cls in targets:
        patch_pad_token(hub_id, tokenizer_cls, model_cls)