import logging

from sklearn.metrics import roc_auc_score, root_mean_squared_error

from generator.generate_metrics import generate_metrics


def compute_rmse_auc_roc_metrics(llm, dataset, vector_store, num_question):
    """Score generated RAG metrics against the dataset's ground-truth labels.

    Computes a per-question RMSE for the relevance, utilization, and adherence
    scores, plus an overall AUC-ROC for each metric across the evaluated questions.
    """
    # Binarized ground-truth labels and predicted scores, accumulated for AUC-ROC.
    all_ground_truth_relevance = []
    all_predicted_relevance = []

    all_ground_truth_utilization = []
    all_predicted_utilization = []

    all_ground_truth_adherence = []
    all_predicted_adherence = []

    # Per-question RMSE values for each metric.
    relevance_scores = []
    utilization_scores = []
    adherence_scores = []

    for i, document in enumerate(dataset):
        # Ground-truth labels provided with the dataset.
        ground_truth_relevance = document['relevance_score']
        ground_truth_utilization = document['utilization_score']
        ground_truth_adherence = document['gpt3_adherence']

        query = document['question']
        logging.info(f'Query number: {i + 1}')

        # Metrics predicted by the LLM pipeline for this query.
        metrics = generate_metrics(llm, vector_store, query)
        predicted_relevance = metrics['Context Relevance']
        predicted_utilization = metrics['Context Utilization']
        predicted_adherence = metrics['Adherence']

        # Per-question RMSE between the ground-truth score and the prediction.
        relevance_rmse = root_mean_squared_error([ground_truth_relevance], [predicted_relevance])
        utilization_rmse = root_mean_squared_error([ground_truth_utilization], [predicted_utilization])
        adherence_rmse = root_mean_squared_error([ground_truth_adherence], [predicted_adherence])

        # Binarize the continuous ground-truth scores (threshold 0.5) for AUC-ROC.
        binary_ground_truth_relevance = 1 if ground_truth_relevance > 0.5 else 0
        binary_ground_truth_utilization = 1 if ground_truth_utilization > 0.5 else 0

        all_ground_truth_relevance.append(binary_ground_truth_relevance)
        all_predicted_relevance.append(predicted_relevance)

        all_ground_truth_utilization.append(binary_ground_truth_utilization)
        all_predicted_utilization.append(predicted_utilization)

        # Adherence ground truth is used as-is (assumed already binary).
        all_ground_truth_adherence.append(ground_truth_adherence)
        all_predicted_adherence.append(predicted_adherence)

        relevance_scores.append(relevance_rmse)
        utilization_scores.append(utilization_rmse)
        adherence_scores.append(adherence_rmse)

        # Stop once the requested number of questions has been evaluated.
        if i + 1 >= num_question:
            break

    # roc_auc_score raises ValueError when only one class is present in the labels.
    try:
        relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)
    except ValueError:
        relevance_auc = None

    try:
        utilization_auc = roc_auc_score(all_ground_truth_utilization, all_predicted_utilization)
    except ValueError:
        utilization_auc = None

    try:
        adherence_auc = roc_auc_score(all_ground_truth_adherence, all_predicted_adherence)
    except ValueError:
        adherence_auc = None

    print(f"Relevance RMSE (per question): {relevance_scores}")
    print(f"Utilization RMSE (per question): {utilization_scores}")
    print(f"Adherence RMSE (per question): {adherence_scores}")

    print(f"\nOverall Relevance AUC-ROC: {relevance_auc}")
    print(f"Overall Utilization AUC-ROC: {utilization_auc}")
    print(f"Overall Adherence AUC-ROC: {adherence_auc}")
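
# --- Usage sketch (not part of the module) ----------------------------------
# A minimal illustration of how this evaluator might be called. The helpers
# build_llm, build_vector_store, and load_eval_dataset are hypothetical
# placeholders; only the signature of compute_rmse_auc_roc_metrics and the
# dataset keys it reads ('question', 'relevance_score', 'utilization_score',
# 'gpt3_adherence') come from the code above.
#
# if __name__ == "__main__":
#     llm = build_llm()                    # hypothetical: however the project builds its LLM client
#     vector_store = build_vector_store()  # hypothetical: retrieval index consumed by generate_metrics
#     dataset = load_eval_dataset()        # hypothetical: iterable of dicts with the keys listed above
#     compute_rmse_auc_roc_metrics(llm, dataset, vector_store, num_question=10)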