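"""Evaluation utilities: per-question RMSE and overall AUC-ROC for RAG metric predictions."""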
from sklearn.metrics import roc_auc_score, root_mean_squared_error
from generator.generate_metrics import generate_metrics
import logging

def compute_rmse_auc_roc_metrics(llm, dataset, vector_store, num_question):
    """Compare predicted RAG metrics against ground truth.

    For up to num_question questions, the relevance, utilization, and adherence
    scores produced by generate_metrics(llm, vector_store, query) are compared
    with the dataset's ground-truth annotations, reporting per-question RMSE
    and overall AUC-ROC.
    """
    # Lists to accumulate ground truths and predictions for AUC-ROC computation
    all_ground_truth_relevance = []
    all_predicted_relevance = []

    all_ground_truth_utilization = []
    all_predicted_utilization = []

    all_ground_truth_adherence = []
    all_predicted_adherence = []

    # To store RMSE scores for each question
    relevance_scores = []
    utilization_scores = []
    adherence_scores = []

    # For each question in the dataset, compute the metrics
    for i, document in enumerate(dataset):
        # Extract ground-truth metrics from the dataset record
        ground_truth_relevance = document['relevance_score']
        ground_truth_utilization = document['utilization_score']
        ground_truth_adherence = document['gpt3_adherence']
        
        query = document['question']
        logging.info(f'Query number: {i + 1}')
        # Call the generate_metrics for each query
        metrics = generate_metrics(llm, vector_store, query)
        
        # Extract predicted metrics (ensure these are continuous if possible)
        predicted_relevance = metrics['Context Relevance']
        predicted_utilization = metrics['Context Utilization']
        predicted_adherence = metrics['Adherence']
        
        # === Per-question RMSE on the continuous scores ===
        # (With a single ground-truth/prediction pair, RMSE reduces to the absolute error.)
        relevance_rmse = root_mean_squared_error([ground_truth_relevance], [predicted_relevance])
        utilization_rmse = root_mean_squared_error([ground_truth_utilization], [predicted_utilization])
        adherence_rmse = root_mean_squared_error([ground_truth_adherence], [predicted_adherence])
        
        # === Binarize ground truth for AUC-ROC ===
        # Predictions are deliberately kept continuous so roc_auc_score can rank them.
        # Ground-truth adherence (gpt3_adherence) is used as-is, on the assumption
        # that it is already a binary label.
        binary_ground_truth_relevance = 1 if ground_truth_relevance > 0.5 else 0
        binary_ground_truth_utilization = 1 if ground_truth_utilization > 0.5 else 0
        
        # === Accumulate data for overall AUC-ROC computation ===
        all_ground_truth_relevance.append(binary_ground_truth_relevance)
        all_predicted_relevance.append(predicted_relevance)  # Use probability-based predictions

        all_ground_truth_utilization.append(binary_ground_truth_utilization)
        all_predicted_utilization.append(predicted_utilization)

        all_ground_truth_adherence.append(ground_truth_adherence)
        all_predicted_adherence.append(predicted_adherence)

        # Store RMSE scores for each question
        relevance_scores.append(relevance_rmse)
        utilization_scores.append(utilization_rmse)
        adherence_scores.append(adherence_rmse)
        # Stop once the requested number of questions has been evaluated
        if i + 1 >= num_question:
            break
    
    # === Compute AUC-ROC over the whole evaluated set ===
    # roc_auc_score raises ValueError when the ground truth contains only one
    # class, so fall back to None in that case.
    try:
        relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)
    except ValueError:
        relevance_auc = None

    try:
        utilization_auc = roc_auc_score(all_ground_truth_utilization, all_predicted_utilization)
    except ValueError:
        utilization_auc = None

    try:
        adherence_auc = roc_auc_score(all_ground_truth_adherence, all_predicted_adherence)
    except ValueError:
        adherence_auc = None

    print(f"Relevance RMSE (per question): {relevance_scores}")
    print(f"Utilization RMSE (per question): {utilization_scores}")
    print(f"Adherence RMSE (per question): {adherence_scores}")
    print(f"\nOverall Relevance AUC-ROC: {relevance_auc}")
    print(f"Overall Utilization AUC-ROC: {utilization_auc}")
    print(f"Overall Adherence AUC-ROC: {adherence_auc}")