File size: 1,347 Bytes
7a0ff7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import warnings

import torch

class PerplexityEvaluator:
    """Score a piece of text with a language model, optionally given a context.

    The evaluator relies on the following interface (as used in the code
    below):

    * ``model.device`` -- device the inputs are moved to;
    * ``model(input_ids=..., labels=...)`` -- returns an object with a
      ``.loss`` tensor;
    * ``tokenizer(text, return_tensors='pt')`` -- returns a mapping with an
      ``'input_ids'`` tensor that can be concatenated along dimension 1.
    """

    def __init__(self, model, tokenizer, ignore_index=-1):
        """
        :param model: language model used to compute the loss
        :param tokenizer: callable turning a string into ``'input_ids'``
        :param ignore_index: label value written at the context positions so
            that the model's loss can skip them
        """
        self.model = model
        self.tokenizer = tokenizer
        # NOTE(review): this value only has an effect if the model's loss is
        # configured to ignore it. Recent Hugging Face Transformers models
        # ignore -100, not -1 -- confirm the default matches the model in use.
        self.ignore_index = ignore_index

    def __call__(self, text, context=None):
        """Shorthand for :meth:`log_perplexity`."""
        return self.log_perplexity(text, context)

    def log_perplexity(self, text, context=None):
        """
        Evaluate log perplexity of text with respect to the language model
        based on the context

        The value returned is whatever the model reports as ``.loss`` for the
        given inputs and labels; how that loss is reduced over tokens is
        decided by the model, not by this method.

        :param text: text to score
        :param context: optional text placed before ``text``. Its tokens are
            fed to the model, but their labels are set to ``ignore_index``.
            ``None`` or an empty value means "no context".
        :return: the model loss as a NumPy array (moved to CPU and detached)
        """
        device = self.model.device
        text_ids = self.tokenizer(text, return_tensors='pt')['input_ids']

        if context:
            # NOTE(review): context and text are tokenized separately, so a
            # tokenizer that adds special tokens will insert them in the
            # middle of the concatenated sequence -- verify for the tokenizer
            # in use.
            context_ids = self.tokenizer(context, return_tensors='pt')['input_ids']
            input_ids = torch.cat([context_ids, text_ids], dim=1)
            # Mask out the context so that only `text` positions carry labels.
            masked_context = torch.full_like(context_ids, self.ignore_index)
            labels = torch.cat([masked_context, text_ids], dim=1)
            warnings.warn(
                "Warning, need to remove context length when reporting lppx",
                stacklevel=2,
            )
        else:
            input_ids = text_ids
            labels = input_ids

        # Pure evaluation: the result is detached below, so there is no point
        # in recording an autograd graph for the forward pass.
        with torch.no_grad():
            loss = self.model(input_ids=input_ids.to(device),
                              labels=labels.to(device)).loss
        return loss.cpu().detach().numpy()