File size: 4,464 Bytes
94011a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# TODO notes:
# 1. Choose a more sensible unit for perplexity: per document or per sentence?
# 2. Reconsider how perplexity itself is computed.
# 3. Results vary by chance, so perplexity should be computed several times and averaged.

import sys
sys.path.append("..")

from utils_qwen import CHECKPOINT_READ_PATH, PERTURBATIONS, BABYLM_DATA_PATH, PAREN_MODELS 
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
import torch
import argparse
import os
from glob import glob
from tqdm import tqdm
from numpy.random import default_rng

# Constants
FILE_SAMPLE_SIZE = 1500 ## 考虑用3000更稳定一些
BATCH_SIZE = 8
device = "cuda"

MODEL_NAME = "Qwen/Qwen2.5-0.5B"
MODEL_NAME_SAVE = "Qwen2.5-0.5B"


class CustomDataset(torch.utils.data.Dataset):
    """Minimal map-style dataset for causal-LM evaluation.

    Serves each stored token-id sequence as both ``input_ids`` and
    ``labels`` — the standard next-token language-modeling setup that
    the HF Trainer expects when computing the evaluation loss.
    """

    def __init__(self, input_ids):
        # Any indexable collection of sequences (list or tensor) works.
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        seq = self.input_ids[idx]
        return {"input_ids": seq, "labels": seq}

def get_perplexities(model, eval_dataset):
    """Run a Trainer evaluation pass and return the model's perplexity.

    The Trainer reports the mean cross-entropy loss over
    ``eval_dataset``; perplexity is its exponential: exp(eval_loss).
    """
    eval_metrics = Trainer(model=model).evaluate(eval_dataset)
    mean_loss = eval_metrics["eval_loss"]
    # exp of the mean per-token loss gives the corpus perplexity.
    return torch.exp(torch.tensor(mean_loss)).item()

if __name__ == "__main__":
    # CLI: perturbation type, checkpoint run name, and RNG seed (all optional).
    parser = argparse.ArgumentParser(prog='Edge probing', description='Edge probing experiments')
    parser.add_argument('test_perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform test BabyLM dataset')
    parser.add_argument('checkpoint_path',
                    type=str,
                    nargs='?',
                    default='default-checkpoint',
                    help='Train checkpoint')
    parser.add_argument('random_seed',
                        type=int,
                        nargs='?',
                        default=0,
                        help='Random seed')

    args = parser.parse_args()

    # Affected test split for the requested perturbation; each file holds one
    # whitespace-separated token-id sequence per line.
    test_files = sorted(glob(
        f"../data/babylm_data_perturbed_qwen/babylm_{args.test_perturbation_type}/babylm_test_affected/*"))

    rng = default_rng(args.random_seed)

    # checkpoint_path = 'checkpoint-1000'
    checkpoint_dir = f'../train/checkpoints/babylm/babylm_{args.test_perturbation_type}_10M_seed0/runs/{args.checkpoint_path}'

    print("Sampling BabyLM affected test files to extract surprisals...")
    token_sequences = []
    print("test_files:", test_files)
    for test_file in test_files:
        print(test_file)
        with open(test_file, 'r') as f:
            file_token_sequences = [
                [int(s) for s in line.split()] for line in f]
        # Sample without replacement; clamp so files shorter than
        # FILE_SAMPLE_SIZE don't make rng.choice raise.
        sample_size = min(FILE_SAMPLE_SIZE, len(file_token_sequences))
        sample_indices = rng.choice(
            len(file_token_sequences), size=sample_size, replace=False)
        token_sequences.extend(file_token_sequences[i] for i in sample_indices)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

    print("Loading the Qwen model...")
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir).to(device)

    # Decode the sampled token-id sequences back to text, then re-tokenize
    # them as one padded/truncated batch for the Trainer.
    test_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in token_sequences]
    tokenized_sequences = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=1024)

    # Wrap the batch for Trainer-based evaluation.
    dataset = CustomDataset(tokenized_sequences['input_ids'])

    # Corpus-level perplexity over the sampled sequences.
    perplexity = get_perplexities(model, dataset)

    # Persist the single-row result so runs can be aggregated later.
    ppl_df = pd.DataFrame({"Perplexity": [perplexity]})

    directory = f"perplexity_results/{MODEL_NAME_SAVE}/{args.test_perturbation_type}"
    os.makedirs(directory, exist_ok=True)  # race-free, replaces exists() check
    file = f"{directory}/{MODEL_NAME_SAVE}_seed{args.random_seed}_test_{args.test_perturbation_type}_{args.checkpoint_path}.csv"
    print(f"Writing results to CSV: {file}")
    ppl_df.to_csv(file, index=False)

    print(f"Calculated Perplexity: {perplexity}")