# Spaces: Running  (page-status header captured along with the script; not part of the code)
import pandas as pd | |
# from evaluate import load | |
from scipy import stats | |
from nltk.translate.bleu_score import sentence_bleu | |
import string | |
# Read the three CSV inputs in order: two prediction files and the file that
# supplies the reference column (the same path as the first prediction file).
df1, df2, df_reference = (
    pd.read_csv(path)
    for path in (
        'MT0_xxl_ape/result_mr',
        'MT0_xxl_ape/result_mr_50p',
        'MT0_xxl_ape/result_mr',
    )
)
# bleu = load("sacrebleu")
# Hypotheses come from the `pred_label` column, references from `ref`.
sentences1, sentences2 = df1['pred_label'], df2['pred_label']
reference_sentences = df_reference['ref']
def process_sentence(sentence):
    """Normalize a sentence before BLEU scoring.

    Keeps only the first line of the input, lowercases it, removes every
    ASCII punctuation character (``string.punctuation``), drops a single
    trailing danda ('।') and trims surrounding whitespace.

    Args:
        sentence: Raw cell value. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as missing.

    Returns:
        The normalized sentence, or ``""`` for non-string or empty input.
    """
    if not isinstance(sentence, str):
        return ""
    # Only the first line is scored; text after a newline is discarded.
    sentence = sentence.split('\n')[0].strip().lower()
    # Strip ASCII punctuation in one pass instead of one replace() per symbol.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation))
    sentence = sentence.strip()
    # string.punctuation is ASCII-only, so the danda needs separate handling.
    if sentence.endswith('।'):
        # Re-trim: a space before the danda would otherwise be left dangling.
        sentence = sentence[:-1].rstrip()
    return sentence
# Calculate BLEU scores
def calculate_bleu(sentences, reference):
    """Return one sentence-level BLEU score per hypothesis.

    Args:
        sentences: Sequence of hypothesis strings.
        reference: Sequence of reference strings, indexed in step with
            ``sentences``; it must be at least as long as ``sentences``.

    Returns:
        A list of BLEU scores, one per entry of ``sentences``.

    Raises:
        IndexError: If ``reference`` is shorter than ``sentences``.
    """
    # sentence_bleu works on token sequences. Given raw strings it iterates
    # character by character and scores character n-grams, so both sides are
    # split on whitespace to obtain word-level BLEU.
    return [
        sentence_bleu([reference[i].split()], hypothesis.split())
        for i, hypothesis in enumerate(sentences)
    ]
# Normalize every hypothesis and reference with process_sentence.
sentences1, sentences2, reference_sentences = (
    list(map(process_sentence, column))
    for column in (sentences1, sentences2, reference_sentences)
)
# Score each hypothesis list against the same normalized references.
bleu_scores1, bleu_scores2 = (
    calculate_bleu(hypotheses, reference_sentences)
    for hypotheses in (sentences1, sentences2)
)
# Check for normality
def check_normality(data, alpha=0.05):
    """Report whether ``data`` passes the Shapiro-Wilk normality test.

    Args:
        data: Sample values passed to ``scipy.stats.shapiro``.
        alpha: Significance level. Defaults to 0.05, the threshold this
            check has always used.

    Returns:
        ``True`` when the test's p-value is greater than ``alpha`` (normality
        is not rejected), otherwise ``False``.
    """
    _, p = stats.shapiro(data)
    # bool() keeps the return a plain Python bool rather than a NumPy scalar.
    return bool(p > alpha)
# Run the normality check on each system's BLEU scores, in order.
is_normal1, is_normal2 = (
    check_normality(scores) for scores in (bleu_scores1, bleu_scores2)
)
# Check for equal variances
def check_variance(data1, data2, alpha=0.05):
    """Report whether two samples pass Levene's test for equal variances.

    Args:
        data1: First sample passed to ``scipy.stats.levene``.
        data2: Second sample passed to ``scipy.stats.levene``.
        alpha: Significance level. Defaults to 0.05, the threshold this
            check has always used.

    Returns:
        ``True`` when the test's p-value is greater than ``alpha`` (equal
        variances are not rejected), otherwise ``False``.
    """
    _, p = stats.levene(data1, data2)
    # bool() keeps the return a plain Python bool rather than a NumPy scalar.
    return bool(p > alpha)
# Compare the spread of the two BLEU score lists.
is_equal_var = check_variance(data1=bleu_scores1, data2=bleu_scores2)
# Decide and perform the significance test
def perform_significance_test():
    """Choose and run a two-sample test on the module-level BLEU scores.

    Reads the globals ``is_normal1``, ``is_normal2``, ``is_equal_var``,
    ``bleu_scores1`` and ``bleu_scores2``. If either sample failed the
    normality check, a Mann-Whitney U test is used. Otherwise a t-test is
    used, in Welch's form when the equal-variance check failed.

    Returns:
        A ``(test_name, p_value)`` tuple.
    """
    # NOTE(review): all three tests treat the samples as independent. If both
    # score lists are per-sentence scores for the same references, a paired
    # test may be more appropriate -- confirm.
    if not (is_normal1 and is_normal2):
        _, p = stats.mannwhitneyu(bleu_scores1, bleu_scores2)
        return "Mann-Whitney U test", p
    if not is_equal_var:
        _, p = stats.ttest_ind(bleu_scores1, bleu_scores2, equal_var=False)
        return "Welch's T-test", p
    _, p = stats.ttest_ind(bleu_scores1, bleu_scores2)
    return "T-test", p
test_name, p_value = perform_significance_test()
# Report which test ran, its p-value, and the verdict at the 0.05 level.
print(f"Test used: {test_name}")
print(f"P-value: {p_value}")
print(
    "The difference in BLEU scores is statistically significant."
    if p_value < 0.05
    else "The difference in BLEU scores is not statistically significant."
)