sango07 committed on
Commit
38a48ce
verified
1 Parent(s): 40d8e7e

Create app.py

Files changed (1)
  1. app.py +284 -0
app.py ADDED
@@ -0,0 +1,284 @@
+ import streamlit as st
+ import pandas as pd
+ import os
+ from langchain_core.prompts import PromptTemplate
+ import json
+ from langchain_openai import ChatOpenAI
+ import evaluate
+ from typing import List, Dict
+ from prompts_v1 import *
+ import tempfile
+ from langchain_groq import ChatGroq
+
+ # API keys must be supplied via environment variables (or Streamlit secrets), never hardcoded
+ # in source. Expected variables: OPENAI_API_KEY and, if the Groq model is used, GROQ_API_KEY.
+
+
+ # Configure page settings
+ st.set_page_config(
+     page_title="RAG Evaluator",
+     page_icon="📊",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Custom CSS for better UI
+ st.markdown("""
+     <style>
+     .stApp {
+         max-width: 1200px;
+         margin: 0 auto;
+     }
+     .metric-box {
+         background-color: #f0f2f6;
+         border-radius: 10px;
+         padding: 20px;
+         margin: 10px 0;
+     }
+     </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state
+ if 'evaluation_results' not in st.session_state:
+     st.session_state.evaluation_results = None
+
+ class RAGEvaluator:
+     def __init__(self):
+         # self.llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.2)
+         self.llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
+         self.eval_prompts = {
+             "diversity_metrics": diversity_metrics,
+             "creativity_metric": creativity_metric,
+             "groundedness_metric": groundedness_metric,
+             "coherence_metric": coherence_metric,
+             "pointwise_metric": pointwise_metric,
+             # "pairwise_metric": pairwise_metric
+         }
+
+     def evaluate_custom_metrics(self, df: pd.DataFrame, selected_metrics: List[str]) -> pd.DataFrame:
+         for metric in selected_metrics:
+             prompt = self.eval_prompts.get(metric)
+             if not prompt:
+                 continue
+
+             review_template = PromptTemplate.from_template(prompt)
+             eval_score = []
+             explanation = []
+
+             progress_bar = st.progress(0)
+             for idx in range(len(df)):
+                 progress = (idx + 1) / len(df)
+                 progress_bar.progress(progress)
+
+                 question = df["question"][idx]
+                 answer = df["answer"][idx]
+                 context = df["context"][idx]
+
+                 final_prompt = review_template.format(
+                     question=question,
+                     answer=answer,
+                     context=context
+                 )
+
+                 response = self.llm.invoke(final_prompt).content
+                 data_dict = json.loads(response)
+
+                 eval_score.append(data_dict["eval_score"])
+                 explanation.append(data_dict["explanation"])
+
+             df[f"{metric}_score"] = eval_score
+             df[f"{metric}_explanation"] = explanation
+             progress_bar.empty()
+
+         return df
+
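+     # Editor's illustration (not in the original commit): the metric templates imported from
+     # prompts_v1 are not shown in this diff; they are assumed to instruct the model to reply
+     # with JSON containing exactly the two keys parsed above, e.g.
+     # {"eval_score": 4, "explanation": "..."}. A minimal sketch of such a template
+     # (JSON braces doubled so PromptTemplate keeps them literal):
+     #
+     #   coherence_metric = """
+     #   Rate the coherence of the answer to the question, given the context, on a 1-5 scale.
+     #   Question: {question}
+     #   Context: {context}
+     #   Answer: {answer}
+     #   Return only JSON: {{"eval_score": <1-5>, "explanation": "<one-sentence rationale>"}}
+     #   """
+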
+     def evaluate_traditional_metrics(self, df: pd.DataFrame, selected_metrics: List[str]) -> pd.DataFrame:
+         if "BLEU" in selected_metrics:
+             bleu = evaluate.load('bleu')
+             scores = []
+             for _, row in df.iterrows():
+                 score = bleu.compute(
+                     predictions=[row['answer']],
+                     references=[row['context']],
+                     max_order=2
+                 )
+                 scores.append(score['bleu'])
+             df['bleu_score'] = scores
+
+         if "ROUGE" in selected_metrics:
+             rouge = evaluate.load("rouge")
+             rouge1_scores = []
+             rouge2_scores = []
+             rougeL_scores = []
+
+             for _, row in df.iterrows():
+                 scores = rouge.compute(
+                     predictions=[row['answer']],
+                     references=[row['context']],
+                     rouge_types=['rouge1', 'rouge2', 'rougeL']
+                 )
+                 rouge1_scores.append(scores['rouge1'])
+                 rouge2_scores.append(scores['rouge2'])
+                 rougeL_scores.append(scores['rougeL'])
+
+             df['rouge1_score'] = rouge1_scores
+             df['rouge2_score'] = rouge2_scores
+             df['rougeL_score'] = rougeL_scores
+
+         if "Perplexity" in selected_metrics:
+             try:
+                 perplexity = evaluate.load("perplexity", module_type="metric")
+                 scores = []
+                 for _, row in df.iterrows():
+                     try:
+                         score = perplexity.compute(
+                             model_id="gpt2",
+                             add_start_token=False,
+                             predictions=[row['answer']]
+                         )
+                         scores.append(score['mean_perplexity'])
+                     except KeyError:
+                         # If mean_perplexity is not available, fall back to the perplexity key
+                         scores.append(score.get('perplexity', 0))
+                     except Exception as e:
+                         st.warning(f"Skipping perplexity calculation for one row due to: {str(e)}")
+                         scores.append(0)
+                 df['perplexity_score'] = scores
+             except Exception as e:
+                 st.error(f"Error calculating perplexity: {str(e)}")
+                 df['perplexity_score'] = [0] * len(df)
+
+         return df
+
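+ # Editor's note (illustrative, not in the original commit): BLEU and ROUGE above treat each
+ # row's retrieved context as the reference for the generated answer, and the Hugging Face
+ # `evaluate` metrics typically pull in extra packages (rouge_score for ROUGE; torch and
+ # transformers for the GPT-2 perplexity model). A minimal sketch of using the class outside
+ # the Streamlit flow, assuming prompts_v1 is importable and OPENAI_API_KEY is set:
+ #
+ #   df = pd.DataFrame({"question": ["What is RAG?"],
+ #                      "answer": ["RAG combines retrieval with generation."],
+ #                      "context": ["RAG (Retrieval-Augmented Generation)..."]})
+ #   evaluator = RAGEvaluator()
+ #   df = evaluator.evaluate_traditional_metrics(df, ["BLEU", "ROUGE"])
+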
+ def main():
+     st.title("🎯 RAG Evaluator")
+     st.write("Upload your data and select evaluation metrics to analyze your RAG system's performance.")
+
+     # Sidebar configuration
+     st.sidebar.header("Configuration")
+
+     # File upload
+     uploaded_file = st.sidebar.file_uploader(
+         "Upload your evaluation data (CSV/Excel)",
+         type=['csv', 'xlsx']
+     )
+
+     # Metric selection
+     st.sidebar.subheader("Select Evaluation Metrics")
+
+     custom_metrics = st.sidebar.expander("Custom Metrics", expanded=True)
+     selected_custom_metrics = custom_metrics.multiselect(
+         "Choose custom metrics:",
+         ["diversity_metrics", "creativity_metric", "groundedness_metric", "coherence_metric", "pointwise_metric"],
+         default=["coherence_metric"]
+     )
+
+     traditional_metrics = st.sidebar.expander("Traditional Metrics", expanded=True)
+     selected_traditional_metrics = traditional_metrics.multiselect(
+         "Choose traditional metrics:",
+         ["BLEU", "ROUGE", "Perplexity"],
+         default=["BLEU"]
+     )
+
+     if uploaded_file is not None:
+         try:
+             # Read the uploaded file
+             if uploaded_file.name.endswith('.csv'):
+                 df = pd.read_csv(uploaded_file)
+             else:
+                 df = pd.read_excel(uploaded_file)
+
+             # Display data preview
+             st.subheader("📊 Data Preview")
+             st.dataframe(df.head(), use_container_width=True)
+
+             # Initialize evaluator
+             evaluator = RAGEvaluator()
+
+             # Evaluation button
+             if st.button("🚀 Start Evaluation", type="primary"):
+                 with st.spinner("Evaluating..."):
+                     # Perform evaluations
+                     if selected_custom_metrics:
+                         df = evaluator.evaluate_custom_metrics(df, selected_custom_metrics)
+
+                     if selected_traditional_metrics:
+                         df = evaluator.evaluate_traditional_metrics(df, selected_traditional_metrics)
+
+                     st.session_state.evaluation_results = df
+
+                     # Save results and offer them for download
+                     with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
+                         df.to_excel(tmp.name, index=False)
+                     with open(tmp.name, 'rb') as results_file:
+                         st.download_button(
+                             label="📥 Download Results",
+                             data=results_file.read(),
+                             file_name="rag_evaluation_results.xlsx",
+                             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                         )
+
+             # Display results if available
+             if st.session_state.evaluation_results is not None:
+                 st.subheader("📈 Evaluation Results")
+
+                 # Create tabs for different result views
+                 tab1, tab2 = st.tabs(["📊 Metrics Overview", "📝 Detailed Results"])
+
+                 with tab1:
+                     # Display metric summaries
+                     cols = st.columns(len(selected_custom_metrics) + len(selected_traditional_metrics))
+
+                     metric_idx = 0
+                     for metric in selected_custom_metrics:
+                         with cols[metric_idx]:
+                             avg_score = st.session_state.evaluation_results[f"{metric}_score"].mean()
+                             st.metric(
+                                 label=metric.replace('_', ' ').title(),
+                                 value=f"{avg_score:.2f}"
+                             )
+                         metric_idx += 1
+
+                     if "BLEU" in selected_traditional_metrics:
+                         with cols[metric_idx]:
+                             avg_bleu = st.session_state.evaluation_results['bleu_score'].mean()
+                             st.metric(label="BLEU Score", value=f"{avg_bleu:.2f}")
+                         metric_idx += 1
+
+                     if "ROUGE" in selected_traditional_metrics:
+                         with cols[metric_idx]:
+                             avg_rouge = st.session_state.evaluation_results['rouge1_score'].mean()
+                             st.metric(label="ROUGE-1 Score", value=f"{avg_rouge:.2f}")
+                         metric_idx += 1
+
+                     if "Perplexity" in selected_traditional_metrics:
+                         with cols[metric_idx]:
+                             avg_perplexity = st.session_state.evaluation_results['perplexity_score'].mean()
+                             st.metric(label="Perplexity Score", value=f"{avg_perplexity:.2f}")
+                         metric_idx += 1
+
+                 with tab2:
+                     st.dataframe(
+                         st.session_state.evaluation_results,
+                         use_container_width=True,
+                         height=400
+                     )
+
+         except Exception as e:
+             st.error(f"An error occurred: {str(e)}")
+
+     else:
+         # Display welcome message and instructions
+         st.info("👈 Please upload your evaluation data file (CSV/Excel) from the sidebar to begin.")
+
+         # Display sample format
+         st.subheader("📋 Expected Data Format")
+         sample_data = pd.DataFrame({
+             'question': ['What is RAG?', 'How does RAG work?'],
+             'answer': ['RAG is...', 'RAG works by...'],
+             'context': ['RAG (Retrieval-Augmented Generation)...', 'The RAG process involves...']
+         })
+         st.dataframe(sample_data, use_container_width=True)
+
+ if __name__ == "__main__":
+     main()
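+
+ # Editor's usage note (illustrative, not in the original commit): launch the app with
+ #   streamlit run app.py
+ # after setting OPENAI_API_KEY in the environment (and GROQ_API_KEY if the Groq model is
+ # re-enabled). The uploaded CSV/Excel file must contain 'question', 'answer', and 'context'
+ # columns, matching the sample shown on the welcome screen above.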