from judging_dataclasses import Criteria


PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.

The judge was asked to give a rating for each of the following criteria, along with an explanation:

{criteria_list}

The possible options for each criterion are as follows:

{options}

Here is the response from the judge:

{judging_response}

Please provide a JSON object containing the scores for each of the criteria, along with any explanation the judge provided.
"""


DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.

[USER PROMPT START]
{user_prompt}
[USER PROMPT END]

Responses from other LLMs:
{responses_from_other_llms}

Consider how you would combine the best aspects of the responses above into a single response.

Directly provide your response to the user's query as if you were the original LLM. Do not mention that you are synthesizing the responses from other LLMs.
"""


DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.

[USER PROMPT START]
{user_prompt}
[USER PROMPT END]

The response is as follows:

[RESPONSE START]
{response}
[RESPONSE END]

Please evaluate the quality of the response based on the following criteria:

{criteria_list}

Options:
{options}

For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""

DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]
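
# Illustrative usage sketch (assumption): formatting the direct assessment prompt.
# Rendering each criterion as "name (min-max): description" is a hypothetical
# choice; the template only requires the four placeholders shown below.
def build_direct_assessment_prompt(
    user_prompt: str,
    response: str,
    options: list[str],
    criteria: list[Criteria] = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST,
) -> str:
    """Format the direct assessment prompt with criteria descriptions and rating options."""
    criteria_list = "\n".join(
        f"- {c.name} ({c.min_score}-{c.max_score}): {c.description}" for c in criteria
    )
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=criteria_list,
        options="\n".join(options),
    )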

# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]

# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]

# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]

# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]
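
# Illustrative helper sketch (assumption): the option lists above are ordered from
# most negative to most positive, so a label's 1-based position can serve as its
# numeric score (e.g. "Strongly Agree" -> 7 on the 7-point scale).
def likert_label_to_score(label: str, options: list[str]) -> int:
    """Return the 1-based position of `label` within `options`.

    Raises ValueError if the label is not one of the given options.
    """
    return options.index(label) + 1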


DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.

[USER PROMPT START]
{prompt}
[USER PROMPT END]

[RESPONSE A START]
{first_completion}
[RESPONSE A END]

[RESPONSE B START]
{second_completion}
[RESPONSE B END]

Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.

After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""

DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]

# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
    ("B<A", "Response B is better than Response A"),
    ("A=B", "Both responses are equally good"),
]
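
# Illustrative usage sketch (assumption): formatting the pairwise comparison prompt.
# Rendering the themes as "name (description)" and the verdict options as
# "label: meaning" lines is a hypothetical choice; the template only needs the
# five placeholders shown below.
def build_pairwise_comparison_prompt(
    prompt: str,
    first_completion: str,
    second_completion: str,
    themes: list[tuple[str, str]] = DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER,
    options: list[tuple[str, str]] = DEFAULT_PAIRWISE_COMPARISON_OPTIONS,
) -> str:
    """Format the pairwise comparison prompt with themes and verdict options."""
    themes_str = "; ".join(f"{name} ({description})" for name, description in themes)
    options_str = "\n".join(f"{label}: {meaning}" for label, meaning in options)
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes_str,
        pairwise_comparison_options=options_str,
    )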