kaikaidai committed
Commit 113e657 · verified · 1 Parent(s): 16c7d6b

Synced repo using 'sync_with_huggingface' Github Action

Files changed (1): app.py (+17 -17)
app.py CHANGED
@@ -66,20 +66,20 @@ CLARITY_PROMPT = """
  - Specific suggestions for improvement, including restructuring ideas or refining language for ultimate clarity
  """

- DEPTH_PROMPT = """
- Evaluate the response on Depth: Does the response provide extraordinarily comprehensive coverage, offering cutting-edge insights and exploring the topic to its fullest extent?
+ HELPFULNESS_PROMPT = """
+ Evaluate the response on Helpfulness: Does the response provide practical, actionable value that directly addresses the user's needs and empowers them to achieve their goals?

  Scoring Rubric:
- Score 1: The response lacks depth, misses key concepts, or fails to go beyond surface-level information.
- Score 2: The response provides good coverage but doesn't delve into advanced concepts or implications.
- Score 3: The response offers solid depth with some advanced concepts, but doesn't push the boundaries of the topic.
- Score 4: The response provides excellent depth, touching on cutting-edge ideas, but falls short of exhaustive coverage.
- Score 5: The response demonstrates unparalleled depth, offering groundbreaking insights, and exhaustively covering all aspects including future implications.
+ Score 1: The response is unhelpful, misinterprets the user's needs, or provides information that cannot be practically applied.
+ Score 2: The response partially addresses the user's needs but leaves significant gaps or provides advice that is difficult to implement.
+ Score 3: The response is generally helpful, addressing the main aspects of the query with usable guidance.
+ Score 4: The response is very helpful, providing comprehensive solutions tailored to the specific context with clear implementation paths.
+ Score 5: The response demonstrates exceptional helpfulness, anticipating unstated needs, removing obstacles, and transforming the user's ability to succeed with minimal friction.

  Provide:
  - A numeric score (1-5, where 5 is near impossible to achieve)
- - A detailed critique justifying the score, analyzing the breadth and depth of concepts covered
- - Specific suggestions for improvement, including additional advanced topics, interdisciplinary connections, or futuristic implications that could have been explored
+ - A detailed critique justifying the score, analyzing how well the response addresses the user's explicit and implicit needs
+ - Specific suggestions for improvement, including additional guidance, clarifications, alternative approaches, or resources that would enhance the practical value to the user
  """

  # Initialize API keys from environment variables or Streamlit secrets
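The new HELPFULNESS_PROMPT is consumed by evaluate_dimension, whose implementation is not part of this commit. The sketch below (outside the diff) shows one plausible shape for such an evaluator under that assumption: it sends one rubric plus a (question, response) pair to a judge and returns a (score, critique) tuple. call_judge, the canned result, and the example inputs are placeholders for illustration, not the app's actual API.

```python
import asyncio
from typing import Tuple

# Abridged copy of the rubric added above; the full text lives in app.py.
HELPFULNESS_PROMPT = "Evaluate the response on Helpfulness: ..."

async def call_judge(prompt: str, question: str, response: str) -> dict:
    # Placeholder for the real Selene judge call; returns a canned result so the sketch runs.
    await asyncio.sleep(0)
    return {"score": 3.0, "critique": "Generally helpful, but leaves one step vague."}

async def evaluate_dimension(question: str, response: str, prompt: str) -> Tuple[float, str]:
    # One rubric and one (question, response) pair in, one (score, critique) tuple out,
    # matching how evaluate_with_atla_async consumes the result.
    result = await call_judge(prompt, question, response)
    return float(result["score"]), result["critique"]

if __name__ == "__main__":
    score, critique = asyncio.run(
        evaluate_dimension("How do I cache API calls?", "Use functools.lru_cache ...", HELPFULNESS_PROMPT)
    )
    print(f"helpfulness: {score}/5 - {critique}")
```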
@@ -158,28 +158,28 @@ async def evaluate_with_atla_async(inputs: dict[str, str]) -> Tuple[float, Dict[
      accuracy_task = evaluate_dimension(inputs["question"], inputs["response"], ACCURACY_PROMPT)
      relevance_task = evaluate_dimension(inputs["question"], inputs["response"], RELEVANCE_PROMPT)
      clarity_task = evaluate_dimension(inputs["question"], inputs["response"], CLARITY_PROMPT)
-     depth_task = evaluate_dimension(inputs["question"], inputs["response"], DEPTH_PROMPT)
+     helpfulness_task = evaluate_dimension(inputs["question"], inputs["response"], HELPFULNESS_PROMPT)

      # Run all evaluations concurrently using asyncio.gather
-     accuracy_result, relevance_result, clarity_result, depth_result = await asyncio.gather(
-         accuracy_task, relevance_task, clarity_task, depth_task
+     accuracy_result, relevance_result, clarity_result, helpfulness_result = await asyncio.gather(
+         accuracy_task, relevance_task, clarity_task, helpfulness_task
      )

      # Unpack results
      accuracy_score, accuracy_critique = accuracy_result
      relevance_score, relevance_critique = relevance_result
      clarity_score, clarity_critique = clarity_result
-     depth_score, depth_critique = depth_result
+     helpfulness_score, helpfulness_critique = helpfulness_result

      # Calculate average score
-     avg_score = (accuracy_score + relevance_score + clarity_score + depth_score) / 4
+     avg_score = (accuracy_score + relevance_score + clarity_score + helpfulness_score) / 4

      # Compile detailed results
      detailed_results = {
          "accuracy": {"score": accuracy_score, "critique": accuracy_critique},
          "relevance": {"score": relevance_score, "critique": relevance_critique},
          "clarity": {"score": clarity_score, "critique": clarity_critique},
-         "depth": {"score": depth_score, "critique": depth_critique}
+         "helpfulness": {"score": helpfulness_score, "critique": helpfulness_critique}
      }

      # Compile overall critique
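The hunk above swaps the fourth dimension from depth to helpfulness while keeping the pattern of running all four evaluations concurrently and averaging their scores. A minimal, self-contained sketch of that gather-and-average pattern follows; evaluate_dimension is stubbed (in the app it calls the Selene judge), and PROMPTS and evaluate_all are illustrative names, not identifiers from app.py.

```python
import asyncio
from typing import Dict, Tuple

# Illustrative rubric table; in app.py these are the four module-level *_PROMPT constants.
PROMPTS = {"accuracy": "...", "relevance": "...", "clarity": "...", "helpfulness": "..."}

async def evaluate_dimension(question: str, response: str, prompt: str) -> Tuple[float, str]:
    # Stub standing in for the judge call made by the real evaluate_dimension.
    await asyncio.sleep(0)
    return 4.0, "placeholder critique"

async def evaluate_all(question: str, response: str) -> Tuple[float, Dict[str, dict]]:
    # One task per dimension, run concurrently with asyncio.gather, then averaged.
    names = list(PROMPTS)
    results = await asyncio.gather(
        *(evaluate_dimension(question, response, PROMPTS[name]) for name in names)
    )
    detailed = {name: {"score": s, "critique": c} for name, (s, c) in zip(names, results)}
    avg_score = sum(d["score"] for d in detailed.values()) / len(detailed)
    return avg_score, detailed

if __name__ == "__main__":
    avg, detailed = asyncio.run(evaluate_all("What is X?", "X is ..."))
    print(f"Overall: {avg:.2f}/5")
```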
@@ -190,7 +190,7 @@ async def evaluate_with_atla_async(inputs: dict[str, str]) -> Tuple[float, Dict[

      Clarity ({clarity_score}/5): {clarity_critique}

-     Depth ({depth_score}/5): {depth_critique}
+     Helpfulness ({helpfulness_score}/5): {helpfulness_critique}

      **Overall Score: {avg_score:.2f}/5**
      """
@@ -507,7 +507,7 @@ def main():
      st.markdown(
          """
          This app uses multiple LLMs (GPT-4o, Claude 3.7, and DeepSeekV3.0) to answer your questions.
-         The world's best LLM-as-a-Judge, [Selene](https://www.atla-ai.com/api), evaluates each response, and the best one is selected and refined if needed.
+         The world's best LLM-as-a-Judge, [Selene](https://www.atla-ai.com/api), evaluates each response on accuracy, relevance, clarity, and depth, and the best one is selected and refined if needed (< 4.0 score).
          """
      )

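The updated description mentions refining a response when it scores below 4.0. The selection and refinement code itself is not part of this commit, so the sketch below only illustrates how such a threshold check could look; refine_response, select_and_refine, and the candidate dictionaries are assumptions for illustration.

```python
from typing import Dict, List

# Threshold taken from the "(< 4.0 score)" note in the app description.
REFINE_THRESHOLD = 4.0

def refine_response(response: str, critique: str) -> str:
    # Placeholder: in the app this would ask an LLM to revise the answer using the judge's critique.
    return response + "\n\n[revised per critique]"

def select_and_refine(candidates: List[Dict]) -> Dict:
    # candidates: [{"model": ..., "response": ..., "avg_score": ..., "critique": ...}, ...]
    best = max(candidates, key=lambda c: c["avg_score"])
    if best["avg_score"] < REFINE_THRESHOLD:
        best = {**best, "response": refine_response(best["response"], best["critique"])}
    return best

if __name__ == "__main__":
    picked = select_and_refine([
        {"model": "gpt-4o", "response": "Answer A", "avg_score": 3.6, "critique": "Missing detail."},
        {"model": "claude-3.7", "response": "Answer B", "avg_score": 4.2, "critique": "Good."},
    ])
    print(picked["model"], picked["response"])
```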