Synced repo using 'sync_with_huggingface' Github Action
app.py CHANGED
@@ -66,20 +66,20 @@ CLARITY_PROMPT = """
 - Specific suggestions for improvement, including restructuring ideas or refining language for ultimate clarity
 """
 
-
-Evaluate the response on
+HELPFULNESS_PROMPT = """
+Evaluate the response on Helpfulness: Does the response provide practical, actionable value that directly addresses the user's needs and empowers them to achieve their goals?
 
 Scoring Rubric:
-Score 1: The response
-Score 2: The response
-Score 3: The response
-Score 4: The response
-Score 5: The response demonstrates
+Score 1: The response is unhelpful, misinterprets the user's needs, or provides information that cannot be practically applied.
+Score 2: The response partially addresses the user's needs but leaves significant gaps or provides advice that is difficult to implement.
+Score 3: The response is generally helpful, addressing the main aspects of the query with usable guidance.
+Score 4: The response is very helpful, providing comprehensive solutions tailored to the specific context with clear implementation paths.
+Score 5: The response demonstrates exceptional helpfulness, anticipating unstated needs, removing obstacles, and transforming the user's ability to succeed with minimal friction.
 
 Provide:
 - A numeric score (1-5, where 5 is near impossible to achieve)
-- A detailed critique justifying the score, analyzing the
-- Specific suggestions for improvement, including additional
+- A detailed critique justifying the score, analyzing how well the response addresses the user's explicit and implicit needs
+- Specific suggestions for improvement, including additional guidance, clarifications, alternative approaches, or resources that would enhance the practical value to the user
 """
 
 # Initialize API keys from environment variables or Streamlit secrets
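Note: evaluate_dimension is not touched by this commit, so its body does not appear in the diff. Below is a minimal sketch of what such a per-dimension helper could look like, assuming it pairs a rubric constant like HELPFULNESS_PROMPT with the question and response and returns a (score, critique) tuple; the actual Selene call is stubbed out here, since the real client code lives elsewhere in app.py.

from typing import Tuple

async def evaluate_dimension(question: str, response: str, prompt: str) -> Tuple[float, str]:
    # Hypothetical stand-in for the helper used in the hunks below; not the repo's implementation.
    # Combine the rubric prompt with the question/response pair into a single judge input.
    judge_input = f"{prompt}\n\nQuestion:\n{question}\n\nResponse:\n{response}"
    # A real version would send judge_input to the Selene judge and parse its verdict.
    score, critique = 3.0, f"Placeholder critique for an input of {len(judge_input)} characters"
    return score, critique

if __name__ == "__main__":
    import asyncio
    # Illustrative call only; the prompt text here is a stand-in, not the app's constant.
    print(asyncio.run(evaluate_dimension("What is 2+2?", "4", "Evaluate the response on Accuracy: ...")))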
@@ -158,28 +158,28 @@ async def evaluate_with_atla_async(inputs: dict[str, str]) -> Tuple[float, Dict[
     accuracy_task = evaluate_dimension(inputs["question"], inputs["response"], ACCURACY_PROMPT)
     relevance_task = evaluate_dimension(inputs["question"], inputs["response"], RELEVANCE_PROMPT)
     clarity_task = evaluate_dimension(inputs["question"], inputs["response"], CLARITY_PROMPT)
-
+    helpfulness_task = evaluate_dimension(inputs["question"], inputs["response"], HELPFULNESS_PROMPT)
 
     # Run all evaluations concurrently using asyncio.gather
-    accuracy_result, relevance_result, clarity_result,
-    accuracy_task, relevance_task, clarity_task,
+    accuracy_result, relevance_result, clarity_result, helpfulness_result = await asyncio.gather(
+        accuracy_task, relevance_task, clarity_task, helpfulness_task
     )
 
     # Unpack results
     accuracy_score, accuracy_critique = accuracy_result
     relevance_score, relevance_critique = relevance_result
     clarity_score, clarity_critique = clarity_result
-
+    helpfulness_score, helpfulness_critique = helpfulness_result
 
     # Calculate average score
-    avg_score = (accuracy_score + relevance_score + clarity_score +
+    avg_score = (accuracy_score + relevance_score + clarity_score + helpfulness_score) / 4
 
     # Compile detailed results
     detailed_results = {
         "accuracy": {"score": accuracy_score, "critique": accuracy_critique},
         "relevance": {"score": relevance_score, "critique": relevance_critique},
         "clarity": {"score": clarity_score, "critique": clarity_critique},
-        "
+        "helpfulness": {"score": helpfulness_score, "critique": helpfulness_critique}
     }
 
     # Compile overall critique
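The gather-and-average pattern in this hunk is the core of the change. A self-contained sketch of the same pattern with stub scorers (all names here are illustrative, not from the repo) shows why adding a fourth dimension adds essentially no wall-clock time:

import asyncio

async def fake_judge(dimension: str) -> tuple[float, str]:
    # Stub standing in for evaluate_dimension; the sleep mimics a network round trip to the judge.
    await asyncio.sleep(0.1)
    return 4.0, f"{dimension}: reads well"

async def demo() -> None:
    dimensions = ["accuracy", "relevance", "clarity", "helpfulness"]
    # All four judge calls run concurrently, so total latency is roughly that of one call.
    results = await asyncio.gather(*(fake_judge(d) for d in dimensions))
    avg_score = sum(score for score, _ in results) / len(results)
    detailed = {d: {"score": s, "critique": c} for d, (s, c) in zip(dimensions, results)}
    print(f"Overall Score: {avg_score:.2f}/5", detailed)

asyncio.run(demo())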
@@ -190,7 +190,7 @@ async def evaluate_with_atla_async(inputs: dict[str, str]) -> Tuple[float, Dict[
 
 Clarity ({clarity_score}/5): {clarity_critique}
 
-
+Helpfulness ({helpfulness_score}/5): {helpfulness_critique}
 
 **Overall Score: {avg_score:.2f}/5**
 """
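The surrounding code builds the overall critique as an inline f-string over these per-dimension lines. A small helper-style sketch of the same formatting over the detailed_results dict (a sketch only; the app itself inlines the f-string shown above) is:

def build_overall_critique(detailed_results: dict[str, dict], avg_score: float) -> str:
    # Render one "Dimension (score/5): critique" line per entry, matching the layout in the hunk above.
    lines = [
        f"{name.capitalize()} ({entry['score']}/5): {entry['critique']}"
        for name, entry in detailed_results.items()
    ]
    return "\n\n".join(lines) + f"\n\n**Overall Score: {avg_score:.2f}/5**"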
@@ -507,7 +507,7 @@ def main():
     st.markdown(
         """
         This app uses multiple LLMs (GPT-4o, Claude 3.7, and DeepSeekV3.0) to answer your questions.
-        The world's best LLM-as-a-Judge, [Selene](https://www.atla-ai.com/api), evaluates each response, and the best one is selected and refined if needed.
+        The world's best LLM-as-a-Judge, [Selene](https://www.atla-ai.com/api), evaluates each response on accuracy, relevance, clarity, and depth, and the best one is selected and refined if needed (< 4.0 score).
         """
     )
 
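The updated description mentions refining the winning answer when its judge score falls below 4.0. The selection code sits outside these hunks, but a rough sketch of that kind of gate (the threshold constant and field names are assumptions, not from the diff) could look like:

REFINE_THRESHOLD = 4.0  # assumed from the "< 4.0 score" wording in the description above

def pick_best(candidates: list[dict]) -> dict:
    # Each candidate is assumed to look like {"model": ..., "response": ..., "avg_score": ...}.
    best = max(candidates, key=lambda c: c["avg_score"])
    # Flag the winner for a refinement pass when the judge's average falls below the threshold.
    best["needs_refinement"] = best["avg_score"] < REFINE_THRESHOLD
    return best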