Gourisankar Padihary committed
Commit e384879 · Parent: f7c2fa3

Code refactored
generator/compute_metrics.py CHANGED
@@ -20,7 +20,8 @@ def compute_metrics(attributes, total_sentences):
     completeness_score = len(Ri & Ui) / len(Ri) if len(Ri) else 0
 
     # Compute Adherence
-    adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
+    #adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
+    adherence = 1 if all(info.get("fully_supported", False) for info in sentence_support_information) else 0
 
     return {
         "Context Relevance": context_relevance,
@@ -39,7 +40,7 @@ def get_metrics(attributes, total_sentences):
 
     try:
         result_json = json.loads(json_str)
-        print(json.dumps(result_json, indent=2))
+        #print(json.dumps(result_json, indent=2))
 
         # Compute metrics using the extracted attributes
         metrics = compute_metrics(result_json, total_sentences)
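The adherence change above replaces a bare boolean with an explicit 0/1 value, which the downstream RMSE/AUC-ROC code treats as a numeric prediction. A minimal sketch of the difference, using invented support flags rather than real model output:

from sklearn.metrics import root_mean_squared_error

# Hypothetical per-sentence support flags, shaped like sentence_support_information
sentence_support_information = [
    {"fully_supported": True},
    {"fully_supported": False},
]

# Old form: a Python bool
adherence_bool = all(info.get("fully_supported", False) for info in sentence_support_information)

# New form: an explicit 0/1 integer that reads naturally as a metric value
adherence = 1 if adherence_bool else 0

# Invented ground-truth adherence of 1.0; the 0/1 prediction plugs straight into RMSE
print(root_mean_squared_error([1.0], [adherence]))  # 1.0, since adherence is 0 here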
generator/compute_rmse_auc_roc_metrics.py CHANGED
@@ -1,11 +1,9 @@
 
 from sklearn.metrics import roc_auc_score, root_mean_squared_error
-from generator.compute_metrics import get_metrics
-from generator.extract_attributes import extract_attributes
-from generator.generate_response import generate_response
-from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.generate_metrics import generate_metrics
+import logging
 
-def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
+def compute_rmse_auc_roc_metrics(llm, dataset, vector_store, num_question):
 
     # Lists to accumulate ground truths and predictions for AUC-ROC computation
     all_ground_truth_relevance = []
@@ -22,43 +20,34 @@ def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
     utilization_scores = []
     adherence_scores = []
 
-    for i, sample in enumerate(dataset):
-        print(sample)
-        sample_question = sample['question']
-
+    # For each question in dataset get the metrics
+    for i, document in enumerate(dataset):
         # Extract ground truth metrics from dataset
         ground_truth_relevance = dataset[i]['relevance_score']
         ground_truth_utilization = dataset[i]['utilization_score']
-        ground_truth_completeness = dataset[i]['completeness_score']
+        ground_truth_adherence = dataset[i]['gpt3_adherence']
 
-        # Step 1: Retrieve relevant documents
-        relevant_docs = retrieve_top_k_documents(vector_store, sample_question, top_k=5)
-
-        # Step 2: Generate a response using LLM
-        response, source_docs = generate_response(llm, vector_store, sample_question, relevant_docs)
-
-        # Step 3: Extract attributes
-        attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
-
-        # Call the process_attributes method in the main block
-        metrics = get_metrics(attributes, total_sentences)
+        query = document['question']
+        logging.info(f'Query number: {i + 1}')
+        # Call the generate_metrics for each query
+        metrics = generate_metrics(llm, vector_store, query)
 
         # Extract predicted metrics (ensure these are continuous if possible)
         predicted_relevance = metrics['Context Relevance']
         predicted_utilization = metrics['Context Utilization']
-        predicted_completeness = metrics['Completeness Score']
+        predicted_adherence = metrics['Adherence']
 
         # === Handle Continuous Inputs for RMSE ===
         relevance_rmse = root_mean_squared_error([ground_truth_relevance], [predicted_relevance])
         utilization_rmse = root_mean_squared_error([ground_truth_utilization], [predicted_utilization])
-        #adherence_rmse = mean_squared_error([ground_truth_adherence], [predicted_adherence], squared=False)
+        adherence_rmse = root_mean_squared_error([ground_truth_adherence], [predicted_adherence])
 
         # === Handle Binary Conversion for AUC-ROC ===
         binary_ground_truth_relevance = 1 if ground_truth_relevance > 0.5 else 0
-        binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
+        #binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
 
         binary_ground_truth_utilization = 1 if ground_truth_utilization > 0.5 else 0
-        binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
+        #binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
 
         #binary_ground_truth_adherence = 1 if ground_truth_adherence > 0.5 else 0
         #binary_predicted_adherence = 1 if predicted_adherence > 0.5 else 0
@@ -70,32 +59,41 @@ def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
         all_ground_truth_utilization.append(binary_ground_truth_utilization)
         all_predicted_utilization.append(predicted_utilization)
 
-        #all_ground_truth_adherence.append(binary_ground_truth_adherence)
-        #all_predicted_adherence.append(predicted_adherence)
+        all_ground_truth_adherence.append(ground_truth_adherence)
+        all_predicted_adherence.append(predicted_adherence)
 
         # Store RMSE scores for each question
         relevance_scores.append(relevance_rmse)
         utilization_scores.append(utilization_rmse)
-        #adherence_scores.append(adherence_rmse)
-        if i == 9: # Stop after processing the first 10 rows
+        adherence_scores.append(adherence_rmse)
+        if i == num_question:
             break
+
     # === Compute AUC-ROC for the Entire Dataset ===
     try:
-        print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
-        print(f"All Predicted Relevance: {all_predicted_relevance}")
+        #print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
+        #print(f"All Predicted Relevance: {all_predicted_relevance}")
         relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)
     except ValueError:
         relevance_auc = None
 
     try:
-        print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
-        print(f"All Predicted Utilization: {all_predicted_utilization}")
+        #print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
+        #print(f"All Predicted Utilization: {all_predicted_utilization}")
         utilization_auc = roc_auc_score(all_ground_truth_utilization, all_predicted_utilization)
     except ValueError:
         utilization_auc = None
 
+    try:
+        #print(f"All Ground Truth Adherence: {all_ground_truth_utilization}")
+        #print(f"All Predicted Utilization: {all_predicted_utilization}")
+        adherence_auc = roc_auc_score(all_ground_truth_adherence, all_predicted_adherence)
+    except ValueError:
+        adherence_auc = None
+
     print(f"Relevance RMSE (per question): {relevance_scores}")
     print(f"Utilization RMSE (per question): {utilization_scores}")
-    #print(f"Adherence RMSE (per question): {adherence_scores}")
+    print(f"Adherence RMSE (per question): {adherence_scores}")
     print(f"\nOverall Relevance AUC-ROC: {relevance_auc}")
     print(f"Overall Utilization AUC-ROC: {utilization_auc}")
+    print(f"Overall Adherence AUC-ROC: {adherence_auc}")
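As background for the metric calls this file depends on, the sketch below uses invented scores (not dataset values) to show the two patterns in play: root_mean_squared_error on single-element lists reduces to the absolute error for one question, and roc_auc_score takes binary ground-truth labels together with continuous prediction scores.

from sklearn.metrics import roc_auc_score, root_mean_squared_error

# Invented ground-truth and predicted relevance scores for four questions
ground_truth = [0.9, 0.2, 0.7, 0.1]
predicted = [0.8, 0.3, 0.6, 0.4]

# Per-question RMSE on single-element lists is just |truth - prediction|
per_question_rmse = [root_mean_squared_error([gt], [p]) for gt, p in zip(ground_truth, predicted)]
print(per_question_rmse)  # roughly [0.1, 0.1, 0.1, 0.3]

# AUC-ROC: binarize the ground truth at 0.5, keep the predictions continuous
binary_truth = [1 if gt > 0.5 else 0 for gt in ground_truth]
print(roc_auc_score(binary_truth, predicted))  # 1.0: every positive outranks every negative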
generator/extract_attributes.py CHANGED
@@ -14,13 +14,13 @@ def extract_attributes(question, relevant_docs, response):
 
     #print(f"Formatted documents : {formatted_documents}")
     # Print the number of sentences in each document
-    for i, doc in enumerate(formatted_documents):
+    '''for i, doc in enumerate(formatted_documents):
         num_sentences = len(doc)
-        print(f"Document {i} has {num_sentences} sentences.")
+        print(f"Document {i} has {num_sentences} sentences.")'''
 
     # Calculate the total number of sentences from formatted_documents
     total_sentences = sum(len(doc) for doc in formatted_documents)
-    print(f"Total number of sentences {total_sentences}")
+    #print(f"Total number of sentences {total_sentences}")
 
     attribute_prompt = create_prompt(formatted_documents, question, formatted_responses)
 
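The surviving total_sentences line assumes formatted_documents is a list of per-document sentence lists; a tiny illustration with invented sentences:

# Hypothetical output of the document formatting step: one sentence list per retrieved document
formatted_documents = [
    ["The Eiffel Tower is in Paris.", "It was completed in 1889."],
    ["Paris is the capital of France."],
]

# Same aggregation as extract_attributes performs
total_sentences = sum(len(doc) for doc in formatted_documents)
print(total_sentences)  # 3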
 
generator/generate_metrics.py ADDED
@@ -0,0 +1,29 @@
+import logging
+from generator.generate_response import generate_response
+from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.compute_metrics import get_metrics
+from generator.extract_attributes import extract_attributes
+
+def generate_metrics(llm, vector_store, query):
+    logging.info(f'Query: {query}')
+
+    # Step 1: Retrieve relevant documents for given query
+    relevant_docs = retrieve_top_k_documents(vector_store, query, top_k=5)
+    logging.info(f"Relevant documents retrieved :{len(relevant_docs)}")
+
+    # Log each retrieved document individually
+    #for i, doc in enumerate(relevant_docs):
+        #logging.info(f"Relevant document {i+1}: {doc} \n")
+
+    # Step 2: Generate a response using LLM
+    response, source_docs = generate_response(llm, vector_store, query, relevant_docs)
+
+    logging.info(f"Response from LLM: {response}")
+
+    # Step 3: Extract attributes and total sentences for each query
+    attributes, total_sentences = extract_attributes(query, source_docs, response)
+
+    # Call the get_metrics
+    metrics = get_metrics(attributes, total_sentences)
+
+    return metrics
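A hedged usage sketch of the new module: the wrapper function and query handling below are illustrative only, and the llm and vector_store arguments are assumed to be prepared the same way main.py prepares them.

import logging

from generator.generate_metrics import generate_metrics

def score_single_query(llm, vector_store, query):
    """Illustrative wrapper: run retrieve -> generate -> extract -> score once for one query."""
    metrics = generate_metrics(llm, vector_store, query)
    # get_metrics returns a dict keyed by metric name, e.g. 'Context Relevance',
    # 'Context Utilization', 'Completeness Score', 'Adherence'
    logging.info(f"Metrics for '{query}': {metrics}")
    return metrics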
main.py CHANGED
@@ -1,13 +1,10 @@
 import logging
 from data.load_dataset import load_data
-from generator import compute_rmse_auc_roc_metrics
+from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
 from retriever.chunk_documents import chunk_documents
 from retriever.embed_documents import embed_documents
-from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.generate_metrics import generate_metrics
 from generator.initialize_llm import initialize_llm
-from generator.generate_response import generate_response
-from generator.extract_attributes import extract_attributes
-from generator.compute_metrics import get_metrics
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,38 +24,21 @@ def main():
     vector_store = embed_documents(documents)
     logging.info("Documents embedded")
 
-    # Sample question
-    row_num = 1
-    sample_question = dataset[row_num]['question']
-    logging.info(f"Sample question: {sample_question}")
-
-    # Retrieve relevant documents
-    relevant_docs = retrieve_top_k_documents(vector_store, sample_question, top_k=5)
-    logging.info(f"Relevant documents retrieved :{len(relevant_docs)}")
-    # Log each retrieved document individually
-    #for i, doc in enumerate(relevant_docs):
-        #logging.info(f"Relevant document {i+1}: {doc} \n")
-
-    # Initialize the LLM
+    # Initialize the Generation LLM
     llm = initialize_llm()
     logging.info("LLM initialized")
 
-    # Generate a response using the relevant documents
-    response, source_docs = generate_response(llm, vector_store, sample_question, relevant_docs)
-    logging.info("Response generated")
-
-    # Print the response
-    logging.info(f"Response from LLM: {response}")
-    #print(f"Source Documents: {source_docs}")
+    # Sample question
+    row_num = 43
+    sample_question = dataset[row_num]['question']
 
-    # Valuations : Extract attributes from the response and source documents
-    attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
-
-    # Call the process_attributes method in the main block
-    metrics = get_metrics(attributes, total_sentences)
+    # Call generate_metrics for above sample question
+    generate_metrics(llm, vector_store, sample_question)
 
     #Compute RMSE and AUC-ROC for entire dataset
-    #compute_rmse_auc_roc_metrics(llm, dataset, vector_store)
-
+    compute_rmse_auc_roc_metrics(llm, dataset, vector_store, dataset.num_rows)
+
+    logging.info("Finished!!!")
+
 if __name__ == "__main__":
     main()
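One behaviour of the new num_question parameter worth keeping in mind when reusing it: the loop breaks when i == num_question, so passing N scores N + 1 questions (indices 0 through N), while passing dataset.num_rows never triggers the break and the whole dataset is scored. A standalone illustration of that break condition:

def count_processed(total_rows, num_question):
    """Mimics the break condition in compute_rmse_auc_roc_metrics."""
    processed = 0
    for i in range(total_rows):
        processed += 1
        if i == num_question:
            break
    return processed

print(count_processed(total_rows=100, num_question=10))   # 11 questions scored
print(count_processed(total_rows=100, num_question=100))  # 100: the break never fires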