sango07 committed
Commit ba1cc7c · verified · 1 Parent(s): 271d2c1

Create traditional_metrics.py

Files changed (1)
traditional_metrics.py +190 -0
traditional_metrics.py ADDED
@@ -0,0 +1,190 @@
+ import torch
+ from sacrebleu import corpus_bleu
+ from rouge_score import rouge_scorer
+ from bert_score import score
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
+ import nltk
+ from nltk.util import ngrams
+ import pandas as pd
+
+
+ def RAGEvaluator(df, selected_metrics):
+     """Compute the selected traditional metrics for each row of df and add them as new columns."""
+     # Load models and pipelines
+     gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
+     gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+     bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
+     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+
+     # Metric evaluation helpers
+     def evaluate_bleu_rouge(candidates, references):
+         # Corpus-level BLEU plus the mean ROUGE-1 F-measure over candidate/reference pairs
+         bleu_score = corpus_bleu(candidates, [references]).score
+         rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
+         rouge1 = sum(rs['rouge1'].fmeasure for rs in rouge_scores) / len(rouge_scores)
+         return bleu_score, rouge1
+
+     def evaluate_bert_score(candidates, references):
+         # Mean BERTScore precision, recall, and F1
+         P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
+         return P.mean().item(), R.mean().item(), F1.mean().item()
+
+     def evaluate_perplexity(text):
+         # Sliding-window GPT-2 perplexity over the text
+         encodings = gpt2_tokenizer(text, return_tensors='pt')
+         max_length = gpt2_model.config.n_positions
+         stride = 512
+         lls = []
+         for i in range(0, encodings.input_ids.size(1), stride):
+             begin_loc = max(i + stride - max_length, 0)
+             end_loc = min(i + stride, encodings.input_ids.size(1))
+             trg_len = end_loc - i
+             input_ids = encodings.input_ids[:, begin_loc:end_loc]
+             target_ids = input_ids.clone()
+             target_ids[:, :-trg_len] = -100
+             with torch.no_grad():
+                 outputs = gpt2_model(input_ids, labels=target_ids)
+                 log_likelihood = outputs[0] * trg_len
+             lls.append(log_likelihood)
+         ppl = torch.exp(torch.stack(lls).sum() / end_loc)
+         return ppl.item()
+
+     def evaluate_diversity(texts):
+         # Distinct-2 style diversity: unique bigrams divided by total token count
+         all_tokens = [tok for text in texts for tok in text.split()]
+         unique_bigrams = set(ngrams(all_tokens, 2))
+         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
+         return diversity_score
+
+     def evaluate_racial_bias(text):
+         # Probability the classifier assigns to the "hate speech" label
+         results = bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
+         bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
+         return bias_score
+
+     # Process each row and add the selected metric results to the DataFrame
+     for idx, row in df.iterrows():
+         question, answer, contexts = row['question'], row['answer'], row['contexts']
+         candidates = [answer]
+         references = [contexts]
+
+         # Calculate metrics as per the selected metrics list and add them as columns in the DataFrame
+         if "BLEU" in selected_metrics or "ROUGE-1" in selected_metrics:
+             bleu, rouge1 = evaluate_bleu_rouge(candidates, references)
+             if "BLEU" in selected_metrics:
+                 df.at[idx, "BLEU"] = bleu
+             if "ROUGE-1" in selected_metrics:
+                 df.at[idx, "ROUGE-1"] = rouge1
+         if "BERT Precision" in selected_metrics or "BERT Recall" in selected_metrics or "BERT F1" in selected_metrics:
+             bert_p, bert_r, bert_f1 = evaluate_bert_score(candidates, references)
+             if "BERT Precision" in selected_metrics:
+                 df.at[idx, "BERT Precision"] = bert_p
+             if "BERT Recall" in selected_metrics:
+                 df.at[idx, "BERT Recall"] = bert_r
+             if "BERT F1" in selected_metrics:
+                 df.at[idx, "BERT F1"] = bert_f1
+         if "Perplexity" in selected_metrics:
+             df.at[idx, "Perplexity"] = evaluate_perplexity(answer)
+         if "Diversity" in selected_metrics:
+             df.at[idx, "Diversity"] = evaluate_diversity(candidates)
+         if "Racial Bias" in selected_metrics:
+             df.at[idx, "Racial Bias"] = evaluate_racial_bias(answer)
+
+     return df
+
+
+ # Earlier version kept for reference: collects the metrics in a dict and concatenates them onto df.
+ # def RAGEvaluator(df, selected_metrics):
+ #     # Load models and pipelines
+ #     gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
+ #     gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ #     bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
+ #     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+
+ #     # Function definitions for evaluations
+ #     def evaluate_bleu_rouge(candidates, references):
+ #         bleu_score = corpus_bleu(candidates, [references]).score
+ #         rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
+ #         rouge1 = sum(rs['rouge1'].fmeasure for rs in rouge_scores) / len(rouge_scores)
+ #         return bleu_score, rouge1
+
+ #     def evaluate_bert_score(candidates, references):
+ #         P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
+ #         return P.mean().item(), R.mean().item(), F1.mean().item()
+
+ #     def evaluate_perplexity(text):
+ #         encodings = gpt2_tokenizer(text, return_tensors='pt')
+ #         max_length = gpt2_model.config.n_positions
+ #         stride = 512
+ #         lls = []
+ #         for i in range(0, encodings.input_ids.size(1), stride):
+ #             begin_loc = max(i + stride - max_length, 0)
+ #             end_loc = min(i + stride, encodings.input_ids.size(1))
+ #             trg_len = end_loc - i
+ #             input_ids = encodings.input_ids[:, begin_loc:end_loc]
+ #             target_ids = input_ids.clone()
+ #             target_ids[:, :-trg_len] = -100
+ #             with torch.no_grad():
+ #                 outputs = gpt2_model(input_ids, labels=target_ids)
+ #                 log_likelihood = outputs[0] * trg_len
+ #             lls.append(log_likelihood)
+ #         ppl = torch.exp(torch.stack(lls).sum() / end_loc)
+ #         return ppl.item()
+
+ #     def evaluate_diversity(texts):
+ #         all_tokens = [tok for text in texts for tok in text.split()]
+ #         unique_bigrams = set(ngrams(all_tokens, 2))
+ #         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
+ #         return diversity_score
+
+ #     def evaluate_racial_bias(text):
+ #         results = bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
+ #         bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
+ #         return bias_score
+
+ #     # Dictionary to store results for each metric per row
+ #     metrics_data = {metric: [] for metric in selected_metrics}
+
+ #     # Evaluate each row in the DataFrame
+ #     for idx, row in df.iterrows():
+ #         question, answer, contexts = row['question'], row['answer'], row['contexts']
+ #         candidates = [answer]
+ #         references = [contexts]
+
+ #         # Collect metrics conditionally based on selected_metrics
+ #         if 'BLEU' in selected_metrics or 'ROUGE-1' in selected_metrics:
+ #             bleu, rouge1 = evaluate_bleu_rouge(candidates, references)
+ #             if 'BLEU' in selected_metrics:
+ #                 metrics_data['BLEU'].append(bleu)
+ #             if 'ROUGE-1' in selected_metrics:
+ #                 metrics_data['ROUGE-1'].append(rouge1)
+
+ #         if 'BERT Precision' in selected_metrics or 'BERT Recall' in selected_metrics or 'BERT F1' in selected_metrics:
+ #             bert_p, bert_r, bert_f1 = evaluate_bert_score(candidates, references)
+ #             if 'BERT Precision' in selected_metrics:
+ #                 metrics_data['BERT Precision'].append(bert_p)
+ #             if 'BERT Recall' in selected_metrics:
+ #                 metrics_data['BERT Recall'].append(bert_r)
+ #             if 'BERT F1' in selected_metrics:
+ #                 metrics_data['BERT F1'].append(bert_f1)
+
+ #         if 'Perplexity' in selected_metrics:
+ #             perplexity = evaluate_perplexity(answer)
+ #             metrics_data['Perplexity'].append(perplexity)
+
+ #         if 'Diversity' in selected_metrics:
+ #             diversity = evaluate_diversity(candidates)
+ #             metrics_data['Diversity'].append(diversity)
+
+ #         if 'Racial Bias' in selected_metrics:
+ #             racial_bias = evaluate_racial_bias(answer)
+ #             metrics_data['Racial Bias'].append(racial_bias)
+
+ #     # Convert metrics_data dictionary to a DataFrame
+ #     metrics_df = pd.DataFrame(metrics_data)
+
+ #     # Concatenate original DataFrame with metrics DataFrame
+ #     result_df = pd.concat([df.reset_index(drop=True), metrics_df], axis=1)
+
+ #     return result_df
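
A minimal usage sketch, not part of the commit: it assumes traditional_metrics.py is importable from the working directory and that the evaluation DataFrame has string-valued 'question', 'answer', and 'contexts' columns, which is what RAGEvaluator reads per row. The sample row and metric selection are hypothetical.

import pandas as pd
from traditional_metrics import RAGEvaluator

# Hypothetical single-row evaluation set; 'contexts' is a plain string here,
# matching how the function passes it on to sacrebleu and rouge_scorer.
df = pd.DataFrame([{
    "question": "What is the capital of France?",
    "answer": "The capital of France is Paris.",
    "contexts": "Paris is the capital and largest city of France.",
}])

# Only the requested metrics are computed and written back as columns.
scored = RAGEvaluator(df, selected_metrics=["BLEU", "ROUGE-1", "Diversity"])
print(scored[["BLEU", "ROUGE-1", "Diversity"]])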