from judging_dataclasses import Criteria
PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
The judge was asked to give a rating for each of the following criteria, along with an explanation:
{criteria_list}
The possible options for each criterion are as follows:
{options}
Here is the response from the judge:
{judging_response}
Please provide a JSON object containing the scores for each criterion, along with any explanation the judge provided.
"""
DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
Responses from other LLMs:
{responses_from_other_llms}
Consider how you would combine the best aspects of the responses above into a single response.
Directly provide your response to the user's query as if you were the original LLM. Do not mention that you are synthesizing the responses from other LLMs.
"""
DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
The response is as follows:
[RESPONSE START]
{response}
[RESPONSE END]
Please evaluate the quality of the response based on the following criteria:
{criteria_list}
Options:
{options}
For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]
# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]
# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]
# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]
# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]
# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]
# 2-point (binary) Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]
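# Illustrative sketch (assumption, not original code): one way the direct-assessment
# placeholders might be filled in, using the default criteria and the 7-point scale.
# The helper name and the bullet/comma formatting are assumptions.
def render_direct_assessment_prompt(user_prompt: str, response: str) -> str:
    criteria_list = "\n".join(
        f"- {criterion.name}: {criterion.description}"
        for criterion in DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
    )
    options = ", ".join(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=criteria_list,
        options=options,
    )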
DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
[USER PROMPT START]
{prompt}
[USER PROMPT END]
[RESPONSE A START]
{first_completion}
[RESPONSE A END]
[RESPONSE B START]
{second_completion}
[RESPONSE B END]
Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""
DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]
# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
    ("B>A", "Response B is better than Response A"),
    ("A=B", "Both responses are equally good"),
]
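# Illustrative sketch (assumption, not original code): filling the pairwise template with
# the default themes and verdict options. The helper name and the joining format are
# assumptions.
def render_pairwise_comparison_prompt(prompt: str, first_completion: str, second_completion: str) -> str:
    themes = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    options = "\n".join(
        f'"{label}": {description}'
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes,
        pairwise_comparison_options=options,
    )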