import torch
import numpy as np
import bitarray
from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
# Patch GPT2Tokenizer so that decoding is a plain id -> string mapping and
# unknown tokens fall back to id 0 instead of raising.
def decode(self, token_ids, **kwargs):
    filtered_tokens = self.convert_ids_to_tokens(token_ids)
    text = self.convert_tokens_to_string(filtered_tokens)
    return text
GPT2Tokenizer.decode = decode

def _convert_token_to_id(self, token):
    return self.encoder.get(token, 0)
GPT2Tokenizer._convert_token_to_id = _convert_token_to_id
def limit_past(past):
    # Truncate the cached keys/values to the most recent 1022 positions so the
    # context plus the next token never exceeds GPT-2's 1024-token window.
    past = list(past)
    for i in range(len(past)):
        past[i] = past[i][:, :, :, -1022:]
    return past
def kl(q, logq, logp):
    # KL divergence between q and p; dividing by ln 2 (~0.69315) converts nats to bits
    res = q*(logq-logp)/0.69315
    res[q==0] = 0
    return res.sum().item() # in bits

def entropy(q, logq):
    res = q*logq/0.69315
    res[q==0] = 0
    return -res.sum().item() # in bits
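# Illustrative check (added; not in the original file): a uniform distribution
# over two outcomes carries one bit of entropy.
_q_uniform = torch.tensor([0.5, 0.5])
assert abs(entropy(_q_uniform, _q_uniform.log()) - 1.0) < 1e-3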
# Bit lists are least-significant-bit first,
# e.g. [0, 1, 1, 1] looks like 1110=14
def bits2int(bits):
    res = 0
    for i, bit in enumerate(bits):
        res += bit*(2**i)
    return res

def int2bits(inp, num_bits):
    if num_bits == 0:
        return []
    strlist = ('{0:0%db}'%num_bits).format(inp)
    return [int(strval) for strval in reversed(strlist)]
def is_sent_finish(token_idx, enc):
    token = enc.decoder[token_idx]
    return '.' in token or '!' in token or '?' in token

def num_same_from_beg(bits1, bits2):
    # Number of leading bits the two (equal-length) bit strings share;
    # assumes they differ somewhere.
    assert len(bits1) == len(bits2)
    for i in range(len(bits1)):
        if bits1[i] != bits2[i]:
            break
    return i
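# Quick sanity checks for the bit helpers (added for illustration; the
# original file ships no tests).
assert int2bits(14, 4) == [0, 1, 1, 1]
assert bits2int(int2bits(14, 4)) == 14
# The shared prefix of an interval's endpoints is what arithmetic coding can emit:
assert num_same_from_beg([0, 1, 1, 0], [0, 1, 0, 1]) == 2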
def encode_context(raw_text, enc):
    context_tokens = [enc.encoder['<|endoftext|>']] + enc.encode(raw_text)
    return context_tokens
# Use gpt2-medium for 345M param model
# Use gpt2-large for 774M param model
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cpu")

    enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    #model.double()

    return enc, model
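# Added helper showing typical usage of get_model; it is defined here but never
# called, since constructing the model downloads the GPT-2 weights.
def load_default_model():
    enc, model = get_model(seed=1234, model_name='gpt2')
    return enc, model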
# 32-symbol alphabet for the secret message: each character costs 5 bits,
# and '\0' marks the end of the message when decoding.
enc32_itoc = ['\0', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', ',', "'", '!', ' ']
enc32_ctoi = {k: v for v, k in enumerate(enc32_itoc)}

def enc32(text):
    bits = []
    for c in text:
        bits.extend(int2bits(enc32_ctoi[c], 5))
    return bits

def dec32(bits):
    text = ''
    for i in range(0, len(bits), 5):
        c = enc32_itoc[bits2int(bits[i:i+5])]
        if c == '\0':
            break
        text += c
    return text
# message should be bit string
# encoded should be text string
def expansion_ratio(message, encoded):
    message_bits = len(message)
    encoded_ba = bitarray.bitarray()
    encoded_ba.frombytes(encoded.encode('utf-8'))
    encoded_bits = len(encoded_ba.tolist())
    return encoded_bits/message_bits
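# Round-trip check for the 5-bit codec and the expansion ratio it implies
# (added for illustration; the message and cover text here are arbitrary).
_msg_bits = enc32('hello world')
assert dec32(_msg_bits) == 'hello world'
_ratio = expansion_ratio(_msg_bits, 'The quick brown fox jumps over the lazy dog.')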
import torch
import math
import random
def bin_sort(l, token_indices, total, entropy, device):
    #compute entropy for upper bound on the number of bins we need

    bucket_size = total
    num_bins = 2**int(entropy+1)
    bucket_size = total / num_bins

    bins = [torch.empty(0, dtype=torch.long, device=device)] * num_bins
    value_in_bins = [0] * num_bins
    space_left_after = [total - i*bucket_size for i in range(0,num_bins)]

    token_bins = [torch.empty(0, dtype=torch.long, device=device)] * num_bins

    # Figuring out what the search order should be
    step_size = num_bins/4
    search_order = []
    priorities = [0]*num_bins
    priority = 0

    search_order.append(int(num_bins/2))
    search_order.append(0)
    priorities[int(num_bins/2)] = 0
    priorities[0] = 0

    while(step_size>=1):
        priority += 1
        for x in range(num_bins-int(step_size), -1, -int(step_size*2)):
            search_order.append(x)
            priorities[x] = priority
        step_size = step_size/2

    # Adding the actual elements
    for (item, token_index) in zip(l.tolist(), token_indices.tolist()):

        found_single_bucket_fit = False
        single_bucket_index = -1
        single_bucket_value = bucket_size

        found_multi_bucket_bumpless_fit = False
        multi_bucket_bumpless_index = -1
        multi_bucket_bumpless_value = total

        found_multi_bucket_bumping_fit = False
        multi_bucket_bumping_index = -1
        multi_bucket_bumping_value = total

        for i in search_order:  # for index in search_order
            if(item > space_left_after[i]):
                continue
            if(value_in_bins[i] >= bucket_size):
                continue

            # Priority of choices
            # 1. Can I place this item in an empty bucket all on its own?
            # 2. Can I place this somewhere where it doesn't have to bump anything else around?
            #    2a. Minimize the wasted space, i.e. use the smallest space (of equal priority) that accomplishes this goal.
            # 3. If not (1) and (2), then put it in the space that bumps stuff the least.

            if(value_in_bins[i] + item > bucket_size): # Would overflow.

                space_before_next_block = bucket_size - value_in_bins[i]
                for j in range(i+1, len(bins)):
                    if(value_in_bins[j] > 0): # We have found a bucket with something in it. This is how much space we have here.
                        space_before_next_block = space_before_next_block + (bucket_size - value_in_bins[i])
                        break
                    else: # This was an empty bucket
                        space_before_next_block = space_before_next_block + bucket_size

                if((not found_multi_bucket_bumpless_fit) or (found_multi_bucket_bumpless_fit and priorities[i] <= priorities[multi_bucket_bumpless_index])): # This could potentially be a match

                    # If this is a valid space to put this without bumping and it is a better fit than previous spaces
                    if(space_before_next_block > item and space_before_next_block < multi_bucket_bumpless_value):
                        # set this to be the pointer! we can fit stuff here
                        found_multi_bucket_bumpless_fit = True
                        multi_bucket_bumpless_index = i
                        multi_bucket_bumpless_value = space_before_next_block

                    # Find the overflow that will bump the least
                    if ( item - space_before_next_block < multi_bucket_bumping_value):
                        found_multi_bucket_bumping_fit = True
                        multi_bucket_bumping_index = i
                        multi_bucket_bumping_value = item - space_before_next_block

            if(value_in_bins[i] + item <= bucket_size): # Would fit
                if(single_bucket_value > value_in_bins[i]):
                    found_single_bucket_fit = True
                    single_bucket_value = value_in_bins[i]
                    single_bucket_index = i

        if (single_bucket_index == multi_bucket_bumpless_index == multi_bucket_bumping_index == -1):
            bins[0] = torch.cat( (torch.tensor([item], device=device), bins[0]), 0)
            token_bins[0] = torch.cat( (torch.tensor([token_index], device=device), token_bins[0]), 0)
            continue

        if found_single_bucket_fit:
            # We found somewhere we can actually fit!
            bins[single_bucket_index] = torch.cat( (bins[single_bucket_index], torch.tensor([item], device=device)), 0)
            token_bins[single_bucket_index] = torch.cat( (token_bins[single_bucket_index], torch.tensor([token_index], device=device)), 0)
            value_in_bins[single_bucket_index] += item
            for i in range(0, single_bucket_index+1):
                space_left_after[i] -= item

        elif found_multi_bucket_bumpless_fit:
            # Found somewhere we can put this without upsetting the force
            part_in_bucket = bucket_size - value_in_bins[multi_bucket_bumpless_index]
            part_overflow = item - part_in_bucket
            bins[multi_bucket_bumpless_index] = torch.cat( (bins[multi_bucket_bumpless_index], torch.tensor([item], device=device)), 0)
            token_bins[multi_bucket_bumpless_index] = torch.cat( (token_bins[multi_bucket_bumpless_index], torch.tensor([token_index], device=device)), 0)
            value_in_bins[multi_bucket_bumpless_index] = bucket_size

            # Fill this bucket and continue overflowing
            j = multi_bucket_bumpless_index + 1
            for i in range(0, j):
                space_left_after[i] -= item

            while(part_overflow > 0):
                new_part_overflow = (value_in_bins[j] + part_overflow) - bucket_size
                value_in_bins[j] = min(bucket_size, part_overflow+value_in_bins[j]) # mark the bucket as filled
                space_left_after[j] -= part_overflow
                part_overflow = new_part_overflow
                j+=1

        else:
            part_in_bucket = bucket_size - value_in_bins[multi_bucket_bumping_index]
            part_overflow = item - part_in_bucket
            bins[multi_bucket_bumping_index] = torch.cat( (bins[multi_bucket_bumping_index], torch.tensor([item], device=device)), 0)
            token_bins[multi_bucket_bumping_index] = torch.cat( (token_bins[multi_bucket_bumping_index], torch.tensor([token_index], device=device)), 0)
            value_in_bins[multi_bucket_bumping_index] = bucket_size

            # Fill this bucket and continue overflowing
            j = multi_bucket_bumping_index + 1
            for i in range(0, j):
                space_left_after[i] -= item
            while(part_overflow > 0):
                new_part_overflow = (value_in_bins[j] + part_overflow) - bucket_size
                value_in_bins[j] = min(bucket_size, part_overflow+value_in_bins[j]) # mark the bucket as filled
                space_left_after[j] -= part_overflow
                part_overflow = new_part_overflow
                j+=1

    sorted_tensor = torch.cat(bins, 0)
    sorted_tokens = torch.cat(token_bins, 0)

    return sorted_tensor, sorted_tokens
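# Tiny worked example of bin_sort (added for illustration; the inputs are made
# up): four tokens whose integer probability masses sum to 16 are distributed
# over 2**int(entropy+1) = 4 bins. Defined here but never called.
def _demo_bin_sort():
    device = torch.device('cpu')
    masses = torch.tensor([8, 4, 2, 2], dtype=torch.long)        # sums to 16
    token_ids = torch.tensor([11, 22, 33, 44], dtype=torch.long)
    ent = 1.75  # entropy in bits of [0.5, 0.25, 0.125, 0.125]
    return bin_sort(masses, token_ids, 16, ent, device)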
def compute_ev(t, precision):
    expected_bits = []
    cum_probs = t.cumsum(0)

    for selection in range(0, len(cum_probs)):

        # Calculate new range as ints
        new_int_bottom = cum_probs[selection-1] if selection > 0 else 0
        new_int_top = cum_probs[selection]

        # Convert range to bits
        new_int_bottom_bits_inc = list(reversed(int2bits(new_int_bottom, precision)))
        new_int_top_bits_inc = list(reversed(int2bits(new_int_top-1, precision))) # -1 here because upper bound is exclusive

        # Consume most significant bits which are now fixed and update interval
        num_bits_encoded = num_same_from_beg(new_int_bottom_bits_inc, new_int_top_bits_inc)
        expected_bits.append(t[selection] * num_bits_encoded)

    return(float(sum(expected_bits).item())/(2**precision))
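# Added illustration: expected number of message bits consumed per sampled
# token under arithmetic coding, for a toy distribution at 4 bits of precision.
# Defined here but never called.
def _demo_compute_ev():
    masses = torch.tensor([8, 4, 2, 2])   # integer masses summing to 2**4
    return compute_ev(masses, 4)          # ~1.75 bits for this distribution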
def visualize_bins(values_in_bins, bucket_size):
    out_str = "["
    for b in values_in_bins:
        out_str = out_str + " " + str(round(100*b/bucket_size,2)) + " |"
    out_str = out_str + "]"
    print(out_str)

def visualize_distribution(l):
    total = sum(l)
    out_str = "["
    for b in l:
        out_str = out_str + " " + str(round(100*b/total,2)) + " |"
    out_str = out_str + "]"
    print(out_str)
def compute_entropy(lists):
    total = sum(lists)
    entropy = -1*sum([ (x/total) * math.log2(x/total) for x in lists])
    return entropy