AhmedSSabir committed on
Commit 40180ec
1 Parent(s): ba84ae7

Update app.py

Files changed (1)
  1. app.py +92 -51
app.py CHANGED
@@ -6,6 +6,19 @@ import re
 import os
 import gradio as gr
 import requests
+from doctest import OutputChecker
+import sys
+import torch
+import re
+import os
+import gradio as gr
+import requests
+import torch
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from torch.nn.functional import softmax
+import numpy as np
+
+
 
 #url = "https://github.com/simonepri/lm-scorer/tree/master/lm_scorer/models"
 #resp = requests.get(url)
@@ -30,16 +43,16 @@ from transformers import GPT2Tokenizer, GPT2LMHeadModel
 import numpy as np
 import re
 
-def Sort_Tuple(tup):
+# def Sort_Tuple(tup):
 
-    # (Sorts in descending order)
-    tup.sort(key = lambda x: x[1])
-    return tup[::-1]
+#     # (Sorts in descending order)
+#     tup.sort(key = lambda x: x[1])
+#     return tup[::-1]
 
 
-def softmax(x):
-    exps = np.exp(x)
-    return np.divide(exps, np.sum(exps))
+# def softmax(x):
+#     exps = np.exp(x)
+#     return np.divide(exps, np.sum(exps))
 
 
 def get_sim(x):
@@ -49,56 +62,84 @@ def get_sim(x):
 
 # Load pre-trained model
 
-model = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states = True, output_attentions = True)
-
-#model = gr.Interface.load('huggingface/distilgpt2', output_hidden_states = True, output_attentions = True)
-
-#model.eval()
-#tokenizer = gr.Interface.load('huggingface/distilgpt2')
-
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-#tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
-
-
-def cloze_prob(text):
-
-    whole_text_encoding = tokenizer.encode(text)
-    # Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
-    text_list = text.split()
-    stem = ' '.join(text_list[:-1])
-    stem_encoding = tokenizer.encode(stem)
-    # cw_encoding is just the difference between whole_text_encoding and stem_encoding
-    # note: this might not correspond exactly to the word itself
-    cw_encoding = whole_text_encoding[len(stem_encoding):]
-    # Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
-    # Put the whole text encoding into a tensor, and get the model's comprehensive output
-    tokens_tensor = torch.tensor([whole_text_encoding])
-
-    with torch.no_grad():
-        outputs = model(tokens_tensor)
-        predictions = outputs[0]
-
-    logprobs = []
-    # start at the stem and get downstream probabilities incrementally from the model (see above)
-    start = -1-len(cw_encoding)
-    for j in range(start,-1,1):
-        raw_output = []
-        for i in predictions[-1][j]:
-            raw_output.append(i.item())
-
-        logprobs.append(np.log(softmax(raw_output)))
-
-    # if the critical word is three tokens long, the raw_probabilities should look something like this:
-    # [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
-    # Then for the i'th token we want to find its associated probability
-    # this is just: raw_probabilities[i][token_index]
-    conditional_probs = []
-    for cw,prob in zip(cw_encoding,logprobs):
-        conditional_probs.append(prob[cw])
-    # now that you have all the relevant probabilities, return their product.
-    # This is the probability of the critical word given the context before it.
-
-    return np.exp(np.sum(conditional_probs))
+# model = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states = True, output_attentions = True)
+
+# #model = gr.Interface.load('huggingface/distilgpt2', output_hidden_states = True, output_attentions = True)
+
+# #model.eval()
+# #tokenizer = gr.Interface.load('huggingface/distilgpt2')
+
+# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+# #tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
+
+
+# def cloze_prob(text):
+
+#     whole_text_encoding = tokenizer.encode(text)
+#     # Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
+#     text_list = text.split()
+#     stem = ' '.join(text_list[:-1])
+#     stem_encoding = tokenizer.encode(stem)
+#     # cw_encoding is just the difference between whole_text_encoding and stem_encoding
+#     # note: this might not correspond exactly to the word itself
+#     cw_encoding = whole_text_encoding[len(stem_encoding):]
+#     # Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
+#     # Put the whole text encoding into a tensor, and get the model's comprehensive output
+#     tokens_tensor = torch.tensor([whole_text_encoding])
+
+#     with torch.no_grad():
+#         outputs = model(tokens_tensor)
+#         predictions = outputs[0]
+
+#     logprobs = []
+#     # start at the stem and get downstream probabilities incrementally from the model (see above)
+#     start = -1-len(cw_encoding)
+#     for j in range(start,-1,1):
+#         raw_output = []
+#         for i in predictions[-1][j]:
+#             raw_output.append(i.item())
+
+#         logprobs.append(np.log(softmax(raw_output)))
+
+#     # if the critical word is three tokens long, the raw_probabilities should look something like this:
+#     # [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
+#     # Then for the i'th token we want to find its associated probability
+#     # this is just: raw_probabilities[i][token_index]
+#     conditional_probs = []
+#     for cw,prob in zip(cw_encoding,logprobs):
+#         conditional_probs.append(prob[cw])
+#     # now that you have all the relevant probabilities, return their product.
+#     # This is the probability of the critical word given the context before it.
+
+#     return np.exp(np.sum(conditional_probs))
+
+
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+def sentence_prob_mean(text):
+    # Tokenize the input text and add special tokens
+    input_ids = tokenizer.encode(text, return_tensors='pt')
+
+    # Obtain model outputs
+    with torch.no_grad():
+        outputs = model(input_ids, labels=input_ids)
+        logits = outputs.logits  # logits are the model outputs before applying softmax
+
+    # Shift logits and labels so that tokens are aligned:
+    shift_logits = logits[..., :-1, :].contiguous()
+    shift_labels = input_ids[..., 1:].contiguous()
+
+    # Calculate the softmax probabilities
+    probs = softmax(shift_logits, dim=-1)
+
+    # Gather the probabilities of the actual token IDs
+    gathered_probs = torch.gather(probs, 2, shift_labels.unsqueeze(-1)).squeeze(-1)
+
+    # Compute the mean probability across the tokens
+    mean_prob = torch.mean(gathered_probs).item()
+
+    return mean_prob
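
The function added in this commit, sentence_prob_mean, scores a sentence by the mean probability GPT-2 assigns to each of its tokens. A minimal usage sketch follows; the gr.Interface wiring, function name score_pair, and the input/output labels are assumptions for illustration only, since the rest of app.py is outside this diff.

# Sketch only: assumed Gradio wiring around the scoring function introduced above.
import gradio as gr

def score_pair(sentence_1, sentence_2):
    # Score both candidate sentences with sentence_prob_mean from this commit.
    return sentence_prob_mean(sentence_1), sentence_prob_mean(sentence_2)

demo = gr.Interface(
    fn=score_pair,
    inputs=[gr.Textbox(label="sentence 1"), gr.Textbox(label="sentence 2")],
    outputs=[gr.Number(label="mean token probability 1"), gr.Number(label="mean token probability 2")],
)

if __name__ == "__main__":
    demo.launch()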