Gourisankar Padihary committed
Commit: e384879
Parent(s): f7c2fa3
Code refactored

Files changed:
- generator/compute_metrics.py +3 -2
- generator/compute_rmse_auc_roc_metrics.py +33 -35
- generator/extract_attributes.py +3 -3
- generator/generate_metrics.py +29 -0
- main.py +12 -32
generator/compute_metrics.py
CHANGED
@@ -20,7 +20,8 @@ def compute_metrics(attributes, total_sentences):
     completeness_score = len(Ri & Ui) / len(Ri) if len(Ri) else 0
 
     # Compute Adherence
-    adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
+    #adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
+    adherence = 1 if all(info.get("fully_supported", False) for info in sentence_support_information) else 0
 
     return {
         "Context Relevance": context_relevance,
@@ -39,7 +40,7 @@ def get_metrics(attributes, total_sentences):
 
     try:
         result_json = json.loads(json_str)
-        print(json.dumps(result_json, indent=2))
+        #print(json.dumps(result_json, indent=2))
 
         # Compute metrics using the extracted attributes
         metrics = compute_metrics(result_json, total_sentences)
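Note on the adherence change above: the old code stored a Python bool, while the new line stores an explicit 0/1 integer so the value can later be compared against numeric ground-truth adherence scores (RMSE, AUC-ROC). A minimal sketch of the conversion, assuming sentence_support_information is a list of dicts with a "fully_supported" flag, as iterated in compute_metrics():

# Hypothetical per-sentence support info, shaped like the structure compute_metrics() iterates over.
sentence_support_information = [
    {"sentence": "Paris is the capital of France.", "fully_supported": True},
    {"sentence": "Its population is 90 million.", "fully_supported": False},
]

# Old behaviour: a bool (True only if every sentence is fully supported)
adherence_bool = all(info.get("fully_supported", False) for info in sentence_support_information)

# New behaviour: the same condition expressed as 0/1, convenient for numeric metrics
adherence = 1 if adherence_bool else 0

print(adherence_bool, adherence)  # False 0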
generator/compute_rmse_auc_roc_metrics.py
CHANGED
@@ -1,11 +1,9 @@
 
 from sklearn.metrics import roc_auc_score, root_mean_squared_error
-from generator.
-
-from generator.generate_response import generate_response
-from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.generate_metrics import generate_metrics
+import logging
 
-def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
+def compute_rmse_auc_roc_metrics(llm, dataset, vector_store, num_question):
 
     # Lists to accumulate ground truths and predictions for AUC-ROC computation
     all_ground_truth_relevance = []
@@ -22,43 +20,34 @@ def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
     utilization_scores = []
     adherence_scores = []
 
-
-
-        sample_question = sample['question']
-
+    # For each question in dataset get the metrics
+    for i, document in enumerate(dataset):
         # Extract ground truth metrics from dataset
         ground_truth_relevance = dataset[i]['relevance_score']
         ground_truth_utilization = dataset[i]['utilization_score']
-
+        ground_truth_adherence = dataset[i]['gpt3_adherence']
 
-
-
-
-
-        response, source_docs = generate_response(llm, vector_store, sample_question, relevant_docs)
-
-        # Step 3: Extract attributes
-        attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
-
-        # Call the process_attributes method in the main block
-        metrics = get_metrics(attributes, total_sentences)
+        query = document['question']
+        logging.info(f'Query number: {i + 1}')
+        # Call the generate_metrics for each query
+        metrics = generate_metrics(llm, vector_store, query)
 
         # Extract predicted metrics (ensure these are continuous if possible)
        predicted_relevance = metrics['Context Relevance']
        predicted_utilization = metrics['Context Utilization']
+        predicted_adherence = metrics['Adherence']
-
 
         # === Handle Continuous Inputs for RMSE ===
         relevance_rmse = root_mean_squared_error([ground_truth_relevance], [predicted_relevance])
         utilization_rmse = root_mean_squared_error([ground_truth_utilization], [predicted_utilization])
+        adherence_rmse = root_mean_squared_error([ground_truth_adherence], [predicted_adherence])
-
 
         # === Handle Binary Conversion for AUC-ROC ===
         binary_ground_truth_relevance = 1 if ground_truth_relevance > 0.5 else 0
-        binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
+        #binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
 
         binary_ground_truth_utilization = 1 if ground_truth_utilization > 0.5 else 0
-        binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
+        #binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
 
         #binary_ground_truth_adherence = 1 if ground_truth_adherence > 0.5 else 0
         #binary_predicted_adherence = 1 if predicted_adherence > 0.5 else 0
@@ -70,32 +59,41 @@ def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
         all_ground_truth_utilization.append(binary_ground_truth_utilization)
         all_predicted_utilization.append(predicted_utilization)
 
-
-
+        all_ground_truth_adherence.append(ground_truth_adherence)
+        all_predicted_adherence.append(predicted_adherence)
 
         # Store RMSE scores for each question
         relevance_scores.append(relevance_rmse)
         utilization_scores.append(utilization_rmse)
-
-        if i ==
+        adherence_scores.append(adherence_rmse)
+        if i == num_question:
             break
+
     # === Compute AUC-ROC for the Entire Dataset ===
     try:
-        print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
-        print(f"All Predicted Relevance: {all_predicted_relevance}")
+        #print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
+        #print(f"All Predicted Relevance: {all_predicted_relevance}")
         relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)
     except ValueError:
         relevance_auc = None
 
     try:
-        print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
-        print(f"All Predicted Utilization: {all_predicted_utilization}")
+        #print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
+        #print(f"All Predicted Utilization: {all_predicted_utilization}")
         utilization_auc = roc_auc_score(all_ground_truth_utilization, all_predicted_utilization)
     except ValueError:
         utilization_auc = None
-
+
+    try:
+        #print(f"All Ground Truth Adherence: {all_ground_truth_utilization}")
+        #print(f"All Predicted Utilization: {all_predicted_utilization}")
+        adherence_auc = roc_auc_score(all_ground_truth_adherence, all_predicted_adherence)
+    except ValueError:
+        adherence_auc = None
+
     print(f"Relevance RMSE (per question): {relevance_scores}")
     print(f"Utilization RMSE (per question): {utilization_scores}")
+    print(f"Adherence RMSE (per question): {adherence_scores}")
     print(f"\nOverall Relevance AUC-ROC: {relevance_auc}")
     print(f"Overall Utilization AUC-ROC: {utilization_auc}")
+    print(f"Overall Adherence AUC-ROC: {adherence_auc}")
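For orientation, a self-contained toy version of the two scoring patterns used in this file: per-question RMSE on single-element lists, and dataset-level AUC-ROC on the accumulated lists. root_mean_squared_error requires scikit-learn 1.4+ (older versions use mean_squared_error(..., squared=False)), and roc_auc_score raises ValueError when only one class is present, which is why the calls above sit in try/except. The numbers below are made up:

from sklearn.metrics import roc_auc_score, root_mean_squared_error

# Per-question RMSE: with one ground-truth value and one prediction this is just |y - y_hat|.
relevance_rmse = root_mean_squared_error([0.8], [0.65])  # ~0.15

# Dataset-level AUC-ROC: binarised ground truth vs. continuous predictions.
all_ground_truth_relevance = [1, 0, 1, 0]        # e.g. 1 if the ground-truth score > 0.5
all_predicted_relevance = [0.9, 0.2, 0.6, 0.4]   # predicted scores kept continuous
relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)  # 1.0

print(relevance_rmse, relevance_auc)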
generator/extract_attributes.py
CHANGED
@@ -14,13 +14,13 @@ def extract_attributes(question, relevant_docs, response):
 
     #print(f"Formatted documents : {formatted_documents}")
     # Print the number of sentences in each document
-    for i, doc in enumerate(formatted_documents):
+    '''for i, doc in enumerate(formatted_documents):
         num_sentences = len(doc)
-        print(f"Document {i} has {num_sentences} sentences.")
+        print(f"Document {i} has {num_sentences} sentences.")'''
 
     # Calculate the total number of sentences from formatted_documents
     total_sentences = sum(len(doc) for doc in formatted_documents)
-    print(f"Total number of sentences {total_sentences}")
+    #print(f"Total number of sentences {total_sentences}")
 
     attribute_prompt = create_prompt(formatted_documents, question, formatted_responses)
 
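The per-document sentence counts above are simply commented out. An alternative in line with the rest of the refactor, which favours the logging module over print, would be to keep them at DEBUG level; a small sketch, assuming formatted_documents is a list of per-document sentence lists as used in this function:

import logging

# Hypothetical stand-in for the parsed documents: one list of sentences per document.
formatted_documents = [
    ["First sentence.", "Second sentence."],
    ["Only sentence."],
]

for i, doc in enumerate(formatted_documents):
    logging.debug("Document %d has %d sentences.", i, len(doc))

total_sentences = sum(len(doc) for doc in formatted_documents)
logging.debug("Total number of sentences: %d", total_sentences)  # 3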
generator/generate_metrics.py
ADDED
@@ -0,0 +1,29 @@
+import logging
+from generator.generate_response import generate_response
+from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.compute_metrics import get_metrics
+from generator.extract_attributes import extract_attributes
+
+def generate_metrics(llm, vector_store, query):
+    logging.info(f'Query: {query}')
+
+    # Step 1: Retrieve relevant documents for given query
+    relevant_docs = retrieve_top_k_documents(vector_store, query, top_k=5)
+    logging.info(f"Relevant documents retrieved :{len(relevant_docs)}")
+
+    # Log each retrieved document individually
+    #for i, doc in enumerate(relevant_docs):
+        #logging.info(f"Relevant document {i+1}: {doc} \n")
+
+    # Step 2: Generate a response using LLM
+    response, source_docs = generate_response(llm, vector_store, query, relevant_docs)
+
+    logging.info(f"Response from LLM: {response}")
+
+    # Step 3: Extract attributes and total sentences for each query
+    attributes, total_sentences = extract_attributes(query, source_docs, response)
+
+    # Call the get_metrics
+    metrics = get_metrics(attributes, total_sentences)
+
+    return metrics
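The new helper collapses the retrieve → generate → extract → score pipeline into one call, so callers only deal with the returned metrics dict. A usage sketch mirroring main.py; documents is assumed to come from chunk_documents(), the example query is arbitrary, and the dict keys follow compute_metrics() (e.g. "Context Relevance", "Context Utilization", "Adherence"):

from retriever.embed_documents import embed_documents
from generator.initialize_llm import initialize_llm
from generator.generate_metrics import generate_metrics

vector_store = embed_documents(documents)  # documents: chunked corpus from chunk_documents()
llm = initialize_llm()

metrics = generate_metrics(llm, vector_store, "What is the capital of France?")
print(metrics["Context Relevance"], metrics["Context Utilization"], metrics["Adherence"])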
main.py
CHANGED
@@ -1,13 +1,10 @@
 import logging
 from data.load_dataset import load_data
-from generator import compute_rmse_auc_roc_metrics
+from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
 from retriever.chunk_documents import chunk_documents
 from retriever.embed_documents import embed_documents
-from
+from generator.generate_metrics import generate_metrics
 from generator.initialize_llm import initialize_llm
-from generator.generate_response import generate_response
-from generator.extract_attributes import extract_attributes
-from generator.compute_metrics import get_metrics
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,38 +24,21 @@ def main():
     vector_store = embed_documents(documents)
     logging.info("Documents embedded")
 
-
-    row_num = 1
-    sample_question = dataset[row_num]['question']
-    logging.info(f"Sample question: {sample_question}")
-
-    # Retrieve relevant documents
-    relevant_docs = retrieve_top_k_documents(vector_store, sample_question, top_k=5)
-    logging.info(f"Relevant documents retrieved :{len(relevant_docs)}")
-    # Log each retrieved document individually
-    #for i, doc in enumerate(relevant_docs):
-        #logging.info(f"Relevant document {i+1}: {doc} \n")
-
-    # Initialize the LLM
+    # Initialize the Generation LLM
     llm = initialize_llm()
     logging.info("LLM initialized")
 
-    #
-
-
-
-    # Print the response
-    logging.info(f"Response from LLM: {response}")
-    #print(f"Source Documents: {source_docs}")
+    # Sample question
+    row_num = 43
+    sample_question = dataset[row_num]['question']
 
-    #
-
-
-    # Call the process_attributes method in the main block
-    metrics = get_metrics(attributes, total_sentences)
+    # Call generate_metrics for above sample question
+    generate_metrics(llm, vector_store, sample_question)
 
     #Compute RMSE and AUC-ROC for entire dataset
-
-
+    compute_rmse_auc_roc_metrics(llm, dataset, vector_store, dataset.num_rows)
+
+    logging.info("Finished!!!")
+
 if __name__ == "__main__":
     main()
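Since compute_rmse_auc_roc_metrics() now takes a num_question argument and breaks out of its loop once i reaches it, main() can trade the full run (dataset.num_rows, where the break never triggers) for a quick smoke test over a handful of questions; a short sketch of both calls:

# Full evaluation, as in main(): i never reaches dataset.num_rows, so every question is scored.
compute_rmse_auc_roc_metrics(llm, dataset, vector_store, dataset.num_rows)

# Quick smoke test: the loop stops once i == 10 (roughly the first ten questions).
compute_rmse_auc_roc_metrics(llm, dataset, vector_store, 10)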