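"""Streamlit page for running the RAG evaluation workflow and displaying results against ground-truth data."""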
import streamlit as st

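# st.set_page_config must be the first Streamlit command executed on the page,
# which is why it runs before the remaining imports below.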
st.set_page_config(
    page_title="04_Evaluation",  # Use this format for ordering
    page_icon="πŸ“Š",
    layout="wide"
)

import pandas as pd
from database import DatabaseHandler
from data_processor import DataProcessor
from rag import RAGSystem
from evaluation import EvaluationSystem
from generate_ground_truth import get_evaluation_display_data
import logging
import sys

# Configure logging for stdout only
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger(__name__)

# Define evaluation prompt template
EVALUATION_PROMPT_TEMPLATE = """
You are an expert evaluator for a YouTube transcript assistant.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in the following JSON format:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "Your explanation for the relevance classification"
}}

Requirements:
1. Relevance must be one of the three exact values
2. Provide clear reasoning in the explanation
3. Consider accuracy and completeness of the answer
4. Return valid JSON only
""".strip()

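# Cache the heavy components (database handler, data processor, RAG and evaluation systems)
# so they are constructed once and reused across Streamlit reruns.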
@st.cache_resource
def init_components():
    db_handler = DatabaseHandler()
    data_processor = DataProcessor()
    rag_system = RAGSystem(data_processor)
    evaluation_system = EvaluationSystem(data_processor, db_handler)
    return db_handler, data_processor, rag_system, evaluation_system

def main():
    st.title("RAG Evaluation πŸ“Š")
    
    db_handler, data_processor, rag_system, evaluation_system = init_components()
    
    try:
        # Check for ground truth data
        ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
        ground_truth_available = True
        
        # Display existing evaluations
        existing_evaluations = get_evaluation_display_data()
        if not existing_evaluations.empty:
            st.subheader("Existing Evaluation Results")
            st.dataframe(existing_evaluations)
            
            # Download button for evaluation results
            csv = existing_evaluations.to_csv(index=False)
            st.download_button(
                label="Download Evaluation Results",
                data=csv,
                file_name="evaluation_results.csv",
                mime="text/csv",
            )
        
        # Run evaluation
        if ground_truth_available:
            if st.button("Run Full Evaluation"):
                with st.spinner("Running evaluation..."):
                    try:
                        evaluation_results = evaluation_system.run_full_evaluation(
                            rag_system,
                            'data/ground-truth-retrieval.csv',
                            EVALUATION_PROMPT_TEMPLATE
                        )
                        
                        if evaluation_results:
                            # Display RAG evaluations
                            st.subheader("RAG Evaluations")
                            rag_eval_df = pd.DataFrame(evaluation_results["rag_evaluations"])
                            st.dataframe(rag_eval_df)
                            
                            # Display search performance
                            st.subheader("Search Performance")
                            search_perf_df = pd.DataFrame([evaluation_results["search_performance"]])
                            st.dataframe(search_perf_df)
                            
                            # Display optimized parameters
                            st.subheader("Optimized Search Parameters")
                            params_df = pd.DataFrame([{
                                'parameter': k,
                                'value': v,
                                'score': evaluation_results['best_score']
                            } for k, v in evaluation_results['best_params'].items()])
                            st.dataframe(params_df)
                            
                            # Save results
                            for video_id in rag_eval_df['video_id'].unique():
                                db_handler.save_search_performance(
                                    video_id,
                                    evaluation_results["search_performance"]['hit_rate'],
                                    evaluation_results["search_performance"]['mrr']
                                )
                                db_handler.save_search_parameters(
                                    video_id,
                                    evaluation_results['best_params'],
                                    evaluation_results['best_score']
                                )
                            
                            st.success("Evaluation complete. Results saved to database and CSV.")
                    except Exception as e:
                        st.error(f"Error during evaluation: {str(e)}")
                        logger.error(f"Error in evaluation: {str(e)}")
        
    except FileNotFoundError:
        st.warning("No ground truth data available. Please generate ground truth data in the Ground Truth Generation page first.")
        if st.button("Go to Ground Truth Generation"):
            st.switch_page("pages/3_Ground_Truth.py")

if __name__ == "__main__":
    main()