Spaces:
Sleeping
Sleeping
File size: 11,968 Bytes
85f9580 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 |
from __future__ import absolute_import, division, print_function
import numpy as np
import torch
from tqdm import tqdm
import ot
from math import log
from collections import defaultdict, Counter
from transformers import AutoModelForMaskedLM, AutoTokenizer
class BaryScoreMetric:
    """
    BaryScore metric.

    For every (hypothesis, reference) pair, the normalised token embeddings of the
    selected hidden layers are aggregated into one free-support Wasserstein
    barycenter per sentence; the two barycenters are then compared with an exact
    optimal-transport cost and with several entropic (Sinkhorn) costs.
    """

    # Entropic regularisation strengths for which a Sinkhorn cost is reported.
    # Each value appears once: a repeated value would only recompute the same key.
    SINKHORN_REGS = (10, 1, 5, 0.1, 0.5, 0.01, 0.001)

    def __init__(self, model_name="bert-base-uncased", last_layers=5, use_idfs=True, sinkhorn_ref=0.01):
        """
        :param model_name: model name or path from the HuggingFace library
        :param last_layers: number of final hidden-state layers to use
        :param use_idfs: if true use idf weights, else use uniform weights
        :param sinkhorn_ref: weight of the KL in the SD (stored on the instance;
            not read by any method of this class)
        """
        self.model_name = model_name
        self.load_tokenizer_and_model()
        n = self.model.config.num_hidden_layers + 1
        # An empty or out-of-range layer selection would only fail later, when stacking.
        assert 0 < last_layers < n, "last_layers must be in [1, {}]".format(n - 1)
        self.layers_to_consider = range(n - last_layers, n)
        self.use_idfs = use_idfs
        self.sinkhorn_ref = sinkhorn_ref
        self.idfs = []
        # Filled by prepare_idfs() as (reference idf table, hypothesis idf table).
        self.model_ids = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def prepare_idfs(self, hyps, refs):
        """
        Compute corpus-level idf tables and remember them for evaluate_batch.

        :param hyps: hypothesis list of string sentences (whole corpus)
        :param refs: reference list of string sentences (whole corpus)
        :return: (hypothesis idf table, reference idf table)
        """
        t_hyps = self.tokenizer(hyps)['input_ids']
        t_refs = self.tokenizer(refs)['input_ids']
        idf_dict_ref = self.ref_list_to_idf(t_refs)
        idf_dict_hyp = self.ref_list_to_idf(t_hyps)
        # NOTE: stored reference-first, whereas the return value is hypothesis-first.
        self.model_ids = (idf_dict_ref, idf_dict_hyp)
        return idf_dict_hyp, idf_dict_ref

    def ref_list_to_idf(self, input_refs):
        """
        Build a smoothed idf table from tokenised documents.

        :param input_refs: list of documents, each a list of token ids
        :return: defaultdict mapping token id -> log((N + 1) / (df + 1)), where N is
            the number of documents and df the number of documents containing the
            token; unseen tokens get log(N + 1)
        """
        num_docs = len(input_refs)
        # Count each token once per document (document frequency).
        doc_freq = Counter(tok for doc in input_refs for tok in set(doc))
        default_idf = log((num_docs + 1) / 1)
        idf_dict = defaultdict(lambda: default_idf)
        idf_dict.update({tok: log((num_docs + 1) / (count + 1)) for tok, count in doc_freq.items()})
        return idf_dict

    def load_tokenizer_and_model(self):
        """
        Load the chosen tokenizer and masked-LM model, enable hidden-state output
        and switch the model to eval mode.
        """
        tokenizer = AutoTokenizer.from_pretrained('{}'.format(self.model_name))
        model = AutoModelForMaskedLM.from_pretrained('{}'.format(self.model_name))
        model.config.output_hidden_states = True
        model.eval()
        self.tokenizer = tokenizer
        self.model = model

    def _resolve_idfs(self, idf_hyps, idf_ref):
        """
        Return (hypothesis idf table, reference idf table), filling any missing one
        from the tables stored by prepare_idfs.

        :raises ValueError: if a table is missing and prepare_idfs was never called
        """
        if idf_hyps is not None and idf_ref is not None:
            return idf_hyps, idf_ref
        stored = getattr(self, 'model_ids', None)
        if stored is None:
            raise ValueError(
                "IDF weights are required: call prepare_idfs(hyps, refs) first "
                "or pass both idf_hyps and idf_ref.")
        # prepare_idfs stores the tables reference-first.
        stored_ref, stored_hyp = stored
        return (stored_hyp if idf_hyps is None else idf_hyps,
                stored_ref if idf_ref is None else idf_ref)

    def _embed(self, model, encoded):
        """
        Run the model and return the selected layers stacked on a new leading axis,
        with every token vector scaled to unit L2 norm.
        """
        # Last element of the model output; load_tokenizer_and_model enables
        # output_hidden_states, so this is expected to be the per-layer hidden states.
        # NOTE(review): confirm this still holds if attentions are also requested.
        hidden_states = model(**encoded)[-1]
        stacked = torch.stack([hidden_states[i] for i in self.layers_to_consider])
        stacked.div_(torch.norm(stacked, dim=-1).unsqueeze(-1))
        return stacked

    def _count_non_pad(self, token_ids):
        """Return how many tokens of one encoded sentence are not the pad token."""
        tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
        pad_token = self.tokenizer.pad_token
        return sum(1 for tok in tokens if tok != pad_token)

    @staticmethod
    def _layer_locations(layer_embeddings, index_sentence, n_tokens):
        """
        Return one float64 array per layer holding the first n_tokens token
        vectors of sentence index_sentence.
        """
        per_layer = layer_embeddings[:, index_sentence, :n_tokens, :].cpu().numpy()
        return list(np.ascontiguousarray(per_layer, dtype=np.float64))

    @staticmethod
    def _normalize_weights(weights):
        """
        Return the weights as a float64 array summing to one.

        A vector whose total is not positive (e.g. every token occurs in every
        document, so every idf is 0) falls back to uniform weights instead of
        producing NaNs.
        """
        values = np.array([float(w) for w in weights], dtype=np.float64)
        if values.size == 0:
            return values
        total = values.sum()
        if total <= 0:
            return np.full(values.shape, 1.0 / values.size, dtype=np.float64)
        return values / total

    def evaluate_batch(self, batch_hyps, batch_refs, idf_hyps=None, idf_ref=None):
        """
        Score each hypothesis against the reference at the same position.

        :param batch_hyps: hypothesis string or list of string sentences
        :param batch_refs: reference string or list of string sentences
        :param idf_hyps: idfs of hypothesis computed at corpus level
        :param idf_ref: idfs of references computed at corpus level
        :return: dictionary mapping 'baryscore_<name>' to the list of per-sentence
            scores (empty dictionary for an empty batch)
        :raises ValueError: if idf weights are enabled but no idf table is available
        """
        if isinstance(batch_hyps, str):
            batch_hyps = [batch_hyps]
        if isinstance(batch_refs, str):
            batch_refs = [batch_refs]
        assert len(batch_hyps) == len(batch_refs), "batch_hyps and batch_refs must have the same length"
        nb_sentences = len(batch_refs)
        if nb_sentences == 0:
            return {}
        # The idf tables are only needed when idf weighting is enabled.
        if self.use_idfs:
            idf_hyps, idf_ref = self._resolve_idfs(idf_hyps, idf_ref)

        ###############################################
        ## Extract Embeddings From Pretrained Models ##
        ###############################################
        model = self.model.to(self.device)
        with torch.no_grad():
            refs_encoded = self.tokenizer(batch_refs, return_tensors='pt', padding=True,
                                          truncation=True).to(self.device)
            batch_refs_embeddings = self._embed(model, refs_encoded)
            hyps_encoded = self.tokenizer(batch_hyps, return_tensors='pt', padding=True,
                                          truncation=True).to(self.device)
            batch_hyps_embeddings = self._embed(model, hyps_encoded)
        ref_tokens_id = refs_encoded['input_ids'].cpu().tolist()
        hyp_tokens_id = hyps_encoded['input_ids'].cpu().tolist()

        ####################################
        ## Unbatched BaryScore Prediction ##
        ####################################
        baryscores = []
        for index_sentence in tqdm(range(nb_sentences), 'BaryScore Progress'):
            # The first n positions are kept, n being the number of non-pad tokens
            # (assumes the tokenizer pads on the right -- TODO confirm).
            n_ref = self._count_non_pad(ref_tokens_id[index_sentence])
            n_hyp = self._count_non_pad(hyp_tokens_id[index_sentence])
            measures_locations_ref = self._layer_locations(batch_refs_embeddings, index_sentence, n_ref)
            measures_locations_hyps = self._layer_locations(batch_hyps_embeddings, index_sentence, n_hyp)
            if self.use_idfs:
                #########################
                ## Use TF-IDF weights ##
                #########################
                # Look tokens up by plain int id: tensor elements hash by identity
                # and would never match the int keys of the idf tables.
                ref_idf_i = [idf_ref[tok] for tok in ref_tokens_id[index_sentence][:n_ref]]
                hyp_idf_i = [idf_hyps[tok] for tok in hyp_tokens_id[index_sentence][:n_hyp]]
                baryscore = self.baryscore(measures_locations_ref, measures_locations_hyps, ref_idf_i,
                                           hyp_idf_i)
            else:
                #####################
                ## Uniform Weights ##
                #####################
                baryscore = self.baryscore(measures_locations_ref, measures_locations_hyps, None, None)
            baryscores.append({'baryscore_{}'.format(key): value for key, value in baryscore.items()})
        # Turn the list of per-sentence dicts into one dict of per-sentence lists.
        return {key: [score[key] for score in baryscores] for key in baryscores[-1]}

    def baryscore(self, measures_locations_ref, measures_locations_hyps, weights_refs, weights_hyps):
        """
        :param measures_locations_ref: reference locations, one (n_tokens, dim) array per layer
        :param measures_locations_hyps: hypothesis locations, one (n_tokens, dim) array per layer
        :param weights_refs: reference token weights, or None for uniform weights
        :param weights_hyps: hypothesis token weights, or None for uniform weights
        :return: dictionary with the exact transport cost under "W" and one
            Sinkhorn cost per regularisation strength under "SD_<reg>"
        """
        if weights_hyps is not None or weights_refs is not None:
            # Weights must be given for both sides or for neither.
            assert weights_refs is not None
            assert weights_hyps is not None
            weights_hyps = self._normalize_weights(weights_hyps)
            weights_refs = self._normalize_weights(weights_refs)
        # Kept as attributes for code that reads them after a call.
        self.n_layers = len(measures_locations_ref)
        self.d_bert = measures_locations_ref[0].shape[1]
        ####################################
        ## Compute Wasserstein Barycenter ##
        ####################################
        bary_ref = self.w_barycenter(measures_locations_ref, weights_refs)
        bary_hyp = self.w_barycenter(measures_locations_hyps, weights_hyps)
        #################################################
        ## Compute Wasserstein and Sinkhorn Divergence ##
        #################################################
        C = ot.dist(bary_ref, bary_hyp)
        weights_first_barycenter = np.zeros((C.shape[0])) + 1 / C.shape[0]
        weights_second_barycenter = np.zeros((C.shape[1])) + 1 / C.shape[1]
        wasserstein_distance = ot.emd2(weights_first_barycenter, weights_second_barycenter, C,
                                       log=True)[0]
        dic_results = {
            "W": wasserstein_distance,
        }
        for reg in self.SINKHORN_REGS:
            wasserstein_sinkhorn = ot.bregman.sinkhorn2(weights_first_barycenter, weights_second_barycenter, C,
                                                        reg=reg, numItermax=10000)
            # Accept both a scalar and a one-element array (as returned by POT==0.7.0).
            dic_results['SD_{}'.format(reg)] = float(np.asarray(wasserstein_sinkhorn).ravel()[0])
        return dic_results

    def w_barycenter(self, measures_locations, weights):
        """
        :param measures_locations: locations of the discrete input measures, one
            (n_tokens, dim) array per layer
        :param weights: token weights shared by all input measures, or None for uniform
        :return: support of the barycentric distribution, as returned by
            ot.lp.free_support_barycenter
        """
        # Sizes are taken from the input itself so the method does not depend on
        # attributes set by a previous baryscore() call.
        n_layers = len(measures_locations)
        n_points, dim = measures_locations[0].shape
        X_init = np.zeros((n_points, dim), dtype=np.float64)
        if weights is None:
            measures_weights = [np.full(n_points, 1 / n_points, dtype=np.float64)] * n_layers
        else:
            weights = np.asarray(weights, dtype=np.float64)
            measures_weights = [weights / weights.sum()] * n_layers
        # The barycenter itself always carries uniform mass on its support points.
        b = np.full(n_points, 1 / n_points, dtype=np.float64)
        mesure_bary = ot.lp.free_support_barycenter(measures_locations, measures_weights, X_init,
                                                    b=b, numItermax=1000, verbose=False)
        return mesure_bary

    @property
    def supports_multi_ref(self):
        """
        :return: False, BaryScore does not support multi ref
        """
        return False
if __name__ == '__main__':
    # Example showing how to use the BaryScore metric on a two-sentence toy corpus.
    metric_call = BaryScoreMetric(use_idfs=False)
    ref = [
        'I like my cakes very much',
        'I hate these cakes!']
    hypothesis = ['I like my cakes very much',
                  'I like my cakes very much']
    # Both methods take the hypotheses first and the references second.
    metric_call.prepare_idfs(hypothesis, ref)
    final_preds = metric_call.evaluate_batch(hypothesis, ref)
    print(final_preds)