# Parse judgments with structured output prompting, one response model, one judge model at a time.
from judging_dataclasses import Criteria
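# NOTE: `Criteria` is not defined in this file. Based on how it is used below, it is
# assumed to be a dataclass roughly of this shape (illustrative, not the actual definition):
#
#     @dataclass
#     class Criteria:
#         name: str
#         description: str
#         min_score: int
#         max_score: int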
PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
The judge was asked to give a rating for each of the following criteria, along with an explanation:
{criteria_list}
The possible options for each criterion are as follows:
{options}
Here is the response from the judge:
{judging_response}
Please provide a JSON object with the scores for each of the criteria, along with any explanation the judge provided.
"""
DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
Responses from other LLMs:
{responses_from_other_llms}
Consider how you would combine the best aspects of the responses above into a single response.
Directly provide your response to the user's query as if you were the original LLM. Do not mention that you are synthesizing the responses from other LLMs.
"""
DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
The response is as follows:
[RESPONSE START]
{response}
[RESPONSE END]
Please evaluate the quality of the response based on the following criteria:
{criteria_list}
Options:
{options}
For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]
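# Illustrative sketch (assumption, not original code): how the default criteria above could be
# rendered into the `{criteria_list}` and `{options}` placeholders of DEFAULT_DIRECT_ASSESSMENT_PROMPT.
def render_criteria_list(criteria: list[Criteria]) -> str:
    """Render each criterion as '- name: description' on its own line."""
    return "\n".join(f"- {c.name}: {c.description}" for c in criteria)


def build_direct_assessment_prompt(user_prompt: str, response: str, options: list[str]) -> str:
    """Render the direct assessment prompt with the default criteria and the given rating options."""
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=render_criteria_list(DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST),
        options=", ".join(options),
    )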
# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]
# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]
# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]
# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]
# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]
# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]
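# Illustrative sketch (assumption, not original code): a parsed Likert label can be mapped back
# to a numeric score in [min_score, max_score] by its position in the chosen options list, which
# is consistent with the min_score=1 / max_score=7 defaults on the criteria above.
def option_to_score(option: str, options: list[str], min_score: int = 1) -> int:
    """Convert a Likert option label to its numeric score based on its position in the scale."""
    return min_score + options.index(option)


# Example: option_to_score("Agree", SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS) == 6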
DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
[USER PROMPT START]
{prompt}
[USER PROMPT END]
[RESPONSE A START]
{first_completion}
[RESPONSE A END]
[RESPONSE B START]
{second_completion}
[RESPONSE B END]
Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""
DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]
# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
    ("B>A", "Response B is better than Response A"),
    ("A=B", "Both responses are equally good"),
] | |
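

# Illustrative sketch (assumption, not original code): how the pairwise comparison prompt might
# be assembled from the default themes and verdict options defined above. The rendering format
# for themes and options is only an example.
def build_pairwise_comparison_prompt(
    prompt: str, first_completion: str, second_completion: str
) -> str:
    """Render the pairwise comparison prompt with the default themes and verdict options."""
    themes = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    options = "\n".join(
        f'"{label}": {description}'
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes,
        pairwise_comparison_options=options,
    )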