from judging_dataclasses import Criteria
PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
The judge was asked to give a rating for each of the following criteria, along with an explanation:
{criteria_list}
The possible options for each criterion are as follows:
{options}
Here is the response from the judge:
{judging_response}
Please provide a JSON object with the scores for each of the criteria, along with any explanation the judge provided.
"""
DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
Responses from other LLMs:
{responses_from_other_llms}
Consider how you would combine the best aspects of the responses above into a single response.
Directly provide your response to the user's query as if you were the original LLM. Do not mention that you are synthesizing the responses from other LLMs.
"""
DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
The response is as follows:
[RESPONSE START]
{response}
[RESPONSE END]
Please evaluate the quality of the response based on the following criteria:
{criteria_list}
Options:
{options}
For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]
# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]
# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]
# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]
# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]
# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]
# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]
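

# Hedged usage sketch (not part of the original module): rendering the direct
# assessment prompt with the default criteria and the 7-point Likert options,
# whose length matches the criteria's min_score=1 / max_score=7 range. The
# numbered-option formatting below is an illustrative choice.
def _example_direct_assessment_prompt(user_prompt: str, response: str) -> str:
    criteria_text = "\n".join(
        f"- {c.name}: {c.description}"
        for c in DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
    )
    options_text = "\n".join(
        f"{i + 1}: {label}"
        for i, label in enumerate(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)
    )
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=criteria_text,
        options=options_text,
    )
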
DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
[USER PROMPT START]
{prompt}
[USER PROMPT END]
[RESPONSE A START]
{first_completion}
[RESPONSE A END]
[RESPONSE B START]
{second_completion}
[RESPONSE B END]
Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""
DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]
# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
("B<A", "Response B is better than Response A"),
("A=B", "Both responses are equally good"),
]
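

# Hedged usage sketch (not part of the original module): rendering the pairwise
# comparison prompt with the default themes and verdict options. The way themes
# and options are flattened into text is an illustrative choice.
def _example_pairwise_comparison_prompt(
    user_prompt: str, first_completion: str, second_completion: str
) -> str:
    themes_text = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    options_text = "\n".join(
        f'"{label}": {description}'
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=user_prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes_text,
        pairwise_comparison_options=options_text,
    )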