Gourisankar Padihary committed
Commit: e384879
Parent(s): f7c2fa3
Code refactored

Files changed:
- generator/compute_metrics.py +3 -2
- generator/compute_rmse_auc_roc_metrics.py +33 -35
- generator/extract_attributes.py +3 -3
- generator/generate_metrics.py +29 -0
- main.py +12 -32
generator/compute_metrics.py
CHANGED
@@ -20,7 +20,8 @@ def compute_metrics(attributes, total_sentences):
     completeness_score = len(Ri & Ui) / len(Ri) if len(Ri) else 0
 
     # Compute Adherence
-    adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
+    #adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
+    adherence = 1 if all(info.get("fully_supported", False) for info in sentence_support_information) else 0
 
     return {
         "Context Relevance": context_relevance,
@@ -39,7 +40,7 @@ def get_metrics(attributes, total_sentences):
 
     try:
         result_json = json.loads(json_str)
-        print(json.dumps(result_json, indent=2))
+        #print(json.dumps(result_json, indent=2))
 
         # Compute metrics using the extracted attributes
         metrics = compute_metrics(result_json, total_sentences)
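Note on the adherence change above: the old code stored a Python bool, while the new line stores an explicit 0/1 integer so the value can later be compared against numeric ground-truth adherence scores (RMSE, AUC-ROC). A minimal sketch of the conversion, assuming sentence_support_information is a list of dicts with a "fully_supported" flag, as iterated in compute_metrics():

# Hypothetical per-sentence support info, shaped like the structure compute_metrics() iterates over.
sentence_support_information = [
    {"sentence": "Paris is the capital of France.", "fully_supported": True},
    {"sentence": "Its population is 90 million.", "fully_supported": False},
]

# Old behaviour: a bool (True only if every sentence is fully supported)
adherence_bool = all(info.get("fully_supported", False) for info in sentence_support_information)

# New behaviour: the same condition expressed as 0/1, convenient for numeric metrics
adherence = 1 if adherence_bool else 0

print(adherence_bool, adherence)  # False 0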
generator/compute_rmse_auc_roc_metrics.py
CHANGED
@@ -1,11 +1,9 @@
 
 from sklearn.metrics import roc_auc_score, root_mean_squared_error
-from generator.
-
-from generator.generate_response import generate_response
-from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.generate_metrics import generate_metrics
+import logging
 
-def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
+def compute_rmse_auc_roc_metrics(llm, dataset, vector_store, num_question):
 
     # Lists to accumulate ground truths and predictions for AUC-ROC computation
     all_ground_truth_relevance = []
@@ -22,43 +20,34 @@ def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
     utilization_scores = []
     adherence_scores = []
 
-
-
-        sample_question = sample['question']
-
+    # For each question in dataset get the metrics
+    for i, document in enumerate(dataset):
         # Extract ground truth metrics from dataset
         ground_truth_relevance = dataset[i]['relevance_score']
         ground_truth_utilization = dataset[i]['utilization_score']
-
+        ground_truth_adherence = dataset[i]['gpt3_adherence']
 
-
-
-
-
-        response, source_docs = generate_response(llm, vector_store, sample_question, relevant_docs)
-
-        # Step 3: Extract attributes
-        attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
-
-        # Call the process_attributes method in the main block
-        metrics = get_metrics(attributes, total_sentences)
+        query = document['question']
+        logging.info(f'Query number: {i + 1}')
+        # Call the generate_metrics for each query
+        metrics = generate_metrics(llm, vector_store, query)
 
         # Extract predicted metrics (ensure these are continuous if possible)
        predicted_relevance = metrics['Context Relevance']
        predicted_utilization = metrics['Context Utilization']
+        predicted_adherence = metrics['Adherence']
-
 
         # === Handle Continuous Inputs for RMSE ===
         relevance_rmse = root_mean_squared_error([ground_truth_relevance], [predicted_relevance])
         utilization_rmse = root_mean_squared_error([ground_truth_utilization], [predicted_utilization])
+        adherence_rmse = root_mean_squared_error([ground_truth_adherence], [predicted_adherence])
-
 
         # === Handle Binary Conversion for AUC-ROC ===
         binary_ground_truth_relevance = 1 if ground_truth_relevance > 0.5 else 0
-        binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
+        #binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
 
         binary_ground_truth_utilization = 1 if ground_truth_utilization > 0.5 else 0
-        binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
+        #binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
 
         #binary_ground_truth_adherence = 1 if ground_truth_adherence > 0.5 else 0
         #binary_predicted_adherence = 1 if predicted_adherence > 0.5 else 0
@@ -70,32 +59,41 @@ def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
         all_ground_truth_utilization.append(binary_ground_truth_utilization)
         all_predicted_utilization.append(predicted_utilization)
 
-
-
+        all_ground_truth_adherence.append(ground_truth_adherence)
+        all_predicted_adherence.append(predicted_adherence)
 
         # Store RMSE scores for each question
         relevance_scores.append(relevance_rmse)
         utilization_scores.append(utilization_rmse)
-
-        if i ==
+        adherence_scores.append(adherence_rmse)
+        if i == num_question:
             break
+
     # === Compute AUC-ROC for the Entire Dataset ===
     try:
-        print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
-        print(f"All Predicted Relevance: {all_predicted_relevance}")
+        #print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
+        #print(f"All Predicted Relevance: {all_predicted_relevance}")
         relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)
     except ValueError:
         relevance_auc = None
 
     try:
-        print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
-        print(f"All Predicted Utilization: {all_predicted_utilization}")
+        #print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
+        #print(f"All Predicted Utilization: {all_predicted_utilization}")
         utilization_auc = roc_auc_score(all_ground_truth_utilization, all_predicted_utilization)
     except ValueError:
         utilization_auc = None
-
+
+    try:
+        #print(f"All Ground Truth Adherence: {all_ground_truth_utilization}")
+        #print(f"All Predicted Utilization: {all_predicted_utilization}")
+        adherence_auc = roc_auc_score(all_ground_truth_adherence, all_predicted_adherence)
+    except ValueError:
+        adherence_auc = None
+
     print(f"Relevance RMSE (per question): {relevance_scores}")
     print(f"Utilization RMSE (per question): {utilization_scores}")
+    print(f"Adherence RMSE (per question): {adherence_scores}")
     print(f"\nOverall Relevance AUC-ROC: {relevance_auc}")
     print(f"Overall Utilization AUC-ROC: {utilization_auc}")
+    print(f"Overall Adherence AUC-ROC: {adherence_auc}")
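For orientation, a self-contained toy version of the two scoring patterns used in this file: per-question RMSE on single-element lists, and dataset-level AUC-ROC on the accumulated lists. root_mean_squared_error requires scikit-learn 1.4+ (older versions use mean_squared_error(..., squared=False)), and roc_auc_score raises ValueError when only one class is present, which is why the calls above sit in try/except. The numbers below are made up:

from sklearn.metrics import roc_auc_score, root_mean_squared_error

# Per-question RMSE: with one ground-truth value and one prediction this is just |y - y_hat|.
relevance_rmse = root_mean_squared_error([0.8], [0.65])  # ~0.15

# Dataset-level AUC-ROC: binarised ground truth vs. continuous predictions.
all_ground_truth_relevance = [1, 0, 1, 0]        # e.g. 1 if the ground-truth score > 0.5
all_predicted_relevance = [0.9, 0.2, 0.6, 0.4]   # predicted scores kept continuous
relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)  # 1.0

print(relevance_rmse, relevance_auc)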
generator/extract_attributes.py
CHANGED
@@ -14,13 +14,13 @@ def extract_attributes(question, relevant_docs, response):
 
     #print(f"Formatted documents : {formatted_documents}")
     # Print the number of sentences in each document
-    for i, doc in enumerate(formatted_documents):
+    '''for i, doc in enumerate(formatted_documents):
         num_sentences = len(doc)
-        print(f"Document {i} has {num_sentences} sentences.")
+        print(f"Document {i} has {num_sentences} sentences.")'''
 
     # Calculate the total number of sentences from formatted_documents
     total_sentences = sum(len(doc) for doc in formatted_documents)
-    print(f"Total number of sentences {total_sentences}")
+    #print(f"Total number of sentences {total_sentences}")
 
     attribute_prompt = create_prompt(formatted_documents, question, formatted_responses)
 
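The per-document sentence counts above are simply commented out. An alternative in line with the rest of the refactor, which favours the logging module over print, would be to keep them at DEBUG level; a small sketch, assuming formatted_documents is a list of per-document sentence lists as used in this function:

import logging

# Hypothetical stand-in for the parsed documents: one list of sentences per document.
formatted_documents = [
    ["First sentence.", "Second sentence."],
    ["Only sentence."],
]

for i, doc in enumerate(formatted_documents):
    logging.debug("Document %d has %d sentences.", i, len(doc))

total_sentences = sum(len(doc) for doc in formatted_documents)
logging.debug("Total number of sentences: %d", total_sentences)  # 3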
generator/generate_metrics.py
ADDED
@@ -0,0 +1,29 @@
+import logging
+from generator.generate_response import generate_response
+from retriever.retrieve_documents import retrieve_top_k_documents
+from generator.compute_metrics import get_metrics
+from generator.extract_attributes import extract_attributes
+
+def generate_metrics(llm, vector_store, query):
+    logging.info(f'Query: {query}')
+
+    # Step 1: Retrieve relevant documents for given query
+    relevant_docs = retrieve_top_k_documents(vector_store, query, top_k=5)
+    logging.info(f"Relevant documents retrieved :{len(relevant_docs)}")
+
+    # Log each retrieved document individually
+    #for i, doc in enumerate(relevant_docs):
+        #logging.info(f"Relevant document {i+1}: {doc} \n")
+
+    # Step 2: Generate a response using LLM
+    response, source_docs = generate_response(llm, vector_store, query, relevant_docs)
+
+    logging.info(f"Response from LLM: {response}")
+
+    # Step 3: Extract attributes and total sentences for each query
+    attributes, total_sentences = extract_attributes(query, source_docs, response)
+
+    # Call the get_metrics
+    metrics = get_metrics(attributes, total_sentences)
+
+    return metrics
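The new helper collapses the retrieve → generate → extract → score pipeline into one call, so callers only deal with the returned metrics dict. A usage sketch mirroring main.py; documents is assumed to come from chunk_documents(), the example query is arbitrary, and the dict keys follow compute_metrics() (e.g. "Context Relevance", "Context Utilization", "Adherence"):

from retriever.embed_documents import embed_documents
from generator.initialize_llm import initialize_llm
from generator.generate_metrics import generate_metrics

vector_store = embed_documents(documents)  # documents: chunked corpus from chunk_documents()
llm = initialize_llm()

metrics = generate_metrics(llm, vector_store, "What is the capital of France?")
print(metrics["Context Relevance"], metrics["Context Utilization"], metrics["Adherence"])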
main.py
CHANGED
@@ -1,13 +1,10 @@
 import logging
 from data.load_dataset import load_data
-from generator import compute_rmse_auc_roc_metrics
+from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
 from retriever.chunk_documents import chunk_documents
 from retriever.embed_documents import embed_documents
-from
+from generator.generate_metrics import generate_metrics
 from generator.initialize_llm import initialize_llm
-from generator.generate_response import generate_response
-from generator.extract_attributes import extract_attributes
-from generator.compute_metrics import get_metrics
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,38 +24,21 @@ def main():
     vector_store = embed_documents(documents)
     logging.info("Documents embedded")
 
-
-    row_num = 1
-    sample_question = dataset[row_num]['question']
-    logging.info(f"Sample question: {sample_question}")
-
-    # Retrieve relevant documents
-    relevant_docs = retrieve_top_k_documents(vector_store, sample_question, top_k=5)
-    logging.info(f"Relevant documents retrieved :{len(relevant_docs)}")
-    # Log each retrieved document individually
-    #for i, doc in enumerate(relevant_docs):
-        #logging.info(f"Relevant document {i+1}: {doc} \n")
-
-    # Initialize the LLM
+    # Initialize the Generation LLM
     llm = initialize_llm()
     logging.info("LLM initialized")
 
-    #
-
-
-
-    # Print the response
-    logging.info(f"Response from LLM: {response}")
-    #print(f"Source Documents: {source_docs}")
+    # Sample question
+    row_num = 43
+    sample_question = dataset[row_num]['question']
 
-    #
-
-
-    # Call the process_attributes method in the main block
-    metrics = get_metrics(attributes, total_sentences)
+    # Call generate_metrics for above sample question
+    generate_metrics(llm, vector_store, sample_question)
 
     #Compute RMSE and AUC-ROC for entire dataset
-
-
+    compute_rmse_auc_roc_metrics(llm, dataset, vector_store, dataset.num_rows)
+
+    logging.info("Finished!!!")
+
 if __name__ == "__main__":
     main()
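Since compute_rmse_auc_roc_metrics() now takes a num_question argument and breaks out of its loop once i reaches it, main() can trade the full run (dataset.num_rows, where the break never triggers) for a quick smoke test over a handful of questions; a short sketch of both calls:

# Full evaluation, as in main(): i never reaches dataset.num_rows, so every question is scored.
compute_rmse_auc_roc_metrics(llm, dataset, vector_store, dataset.num_rows)

# Quick smoke test: the loop stops once i == 10 (roughly the first ten questions).
compute_rmse_auc_roc_metrics(llm, dataset, vector_store, 10)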