playing_with_the_source_code #2
by XinGuan2000 · opened

- pages/1_Single_Evaluation.py +1 -1
- pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py} +0 -0
- pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py} +0 -0
- pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py} +0 -0
- pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py} +1 -1
- util/evaluator.py +60 -171
pages/1_Single_Evaluation.py
CHANGED

@@ -57,7 +57,7 @@ if not st.session_state.get('password_correct', False):
     check_password()
 else:
     st.sidebar.success("Password Verified. Proceed with the demo.")
-    model_name = st.selectbox('Select a model:', ['gpt35-1106'])
+    model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
 
     # User choice between predefined examples or their own input
     input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
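The same one-line edit appears again in 5_Conversation_Evaluation.py further below: 'gpt4-1106' is added ahead of 'gpt35-1106' in the model picker. Because st.selectbox preselects the first entry of its options list, the pages now default to GPT-4. A minimal standalone sketch of that behavior (shown here outside the page's password gate):

import streamlit as st

# st.selectbox returns the chosen option as a string; with no explicit
# index it preselects the first entry, so listing 'gpt4-1106' first
# makes GPT-4 the default model for the page.
model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
st.write(f'Selected model: {model_name}')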
pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py}
RENAMED
File without changes

pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py}
RENAMED
File without changes

pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py}
RENAMED
File without changes
pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py}
RENAMED

@@ -66,7 +66,7 @@ if not st.session_state.get('password_correct', False):
     check_password()
 else:
     st.sidebar.success("Password Verified. Proceed with the demo.")
-    model_name = st.selectbox('Select a model:', ['gpt35-1106'])
+    model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
 
     # User choice between predefined examples or their own input
     input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
util/evaluator.py
CHANGED

@@ -9,34 +9,20 @@ class evaluator:
 
     def validate_scores(self, scores):
         required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
-
         for key in required_keys:
-            if key not in scores:
-                return {
-
-            score_data = scores[key]
-
-            if not isinstance(score_data, dict):
-                return {k: {"Score": -1, "Justification": "Invalid input format"} for k in required_keys}
-
-            if "Score" not in score_data or not isinstance(score_data["Score"], (int, float)) or not (
-                    0 <= score_data["Score"] <= 10):
-                return {k: {"Score": -1, "Justification": "Invalid score value"} for k in required_keys}
-
-            if "Justification" not in score_data or not isinstance(score_data["Justification"], str) or not score_data[
-                "Justification"].strip():
-                return {k: {"Score": -1, "Justification": "Invalid or missing justification"} for k in required_keys}
+            if key not in scores or not isinstance(scores[key], (int, float)) or not (-1 <= scores[key] <= 1):
+                return {"Factually Correct": -1,"Useful": -1,"Context Specific": -1,"User Specific":-1,"Provides Pluralism":-1}
 
         return scores
 
     def evaluate_single(self, question,explanation):
 
-        evaluation_prompt = f"""You are provided with a user's
-        an
-        should be scored on a scale from 0 to
-        and
+        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
+        an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
+        should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
+        and 1 indicates that the principle is fully satisfied.
 
-
+        Question:
         {question}
 
         Provided Explanation:

@@ -46,55 +32,35 @@ class evaluator:
 
         Factually Correct:
         Definition: The explanation must be accurate and relevant to the question and the subject matter.
-        Score: (0-
+        Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
 
         Useful:
         Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
-        Score: (0-
+        Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
 
         Context Specific:
         Definition: The explanation should be relevant to the specific context or scenario implied by the question.
-        Score: (0-
+        Score: (0-1) How well does the explanation address the specific context or scenario of the question?
 
         User Specific:
         Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
-        Score: (0-
+        Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
 
         Provides Pluralism:
         Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
-        Score: (0-
+        Score: (0-1) How well does the explanation provide or support multiple perspectives?
 
-        After evaluating the provided question and explanation based on the five principles, please format your scores
+        After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
 
         Example JSON format:
-
-        "Factually Correct":
-
-
-
-        "Useful": {{
-        "Justification": "xxx",
-        "Score": 8.5
-        }},
-        "Context Specific": {{
-        "Justification": "xxx",
-        "Score": 8
-        }},
-        "User Specific": {{
-        "Justification": "xxx",
-        "Score": 7.5
-        }},
-        "Provides Pluralism": {{
-        "Justification": "xxx",
-        "Score": 7
-        }}
-        }}
-
-        Answer:
-        """
-
-        response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=500).strip()
+
+        Answer:{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
+
+        Answer:
+        """
 
+        response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
+        #response = """{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}"""
         print(response)
         try:
             scores = json.loads(response)

@@ -119,70 +85,48 @@ class evaluator:
     def evaluate_conversation(self, conversation, context):
         formatted_conversation = self.format_conversation(conversation)
         evaluation_prompt = f"""
-        …
-        "Factually Correct": {{
-        "Justification": "xxx",
-        "Score": 9
-        }},
-        "Useful": {{
-        "Justification": "xxx",
-        "Score": 8.5
-        }},
-        "Context Specific": {{
-        "Justification": "xxx",
-        "Score": 8
-        }},
-        "User Specific": {{
-        "Justification": "xxx",
-        "Score": 7.5
-        }},
-        "Provides Pluralism": {{
-        "Justification": "xxx",
-        "Score": 7
-        }}
-        }}
-
+        You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.
+
+        Conversation:
+        {formatted_conversation}
+
+        Context:
+        {context}
+
+        Evaluation Criteria:
+
+        Factually Correct:
+        Definition: The explanation must be accurate and relevant to the question and the subject matter.
+        Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
+
+        Useful:
+        Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
+        Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
+
+        Context Specific:
+        Definition: The explanation should be relevant to the specific context or scenario implied by the question.
+        Score: (0-1) How well does the explanation address the specific context or scenario of the question?
+
+        User Specific:
+        Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
+        Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
+
+        Provides Pluralism:
+        Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
+        Score: (0-1) How well does the explanation provide or support multiple perspectives?
+
+        After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
+
+        Example JSON format:
+
+        Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}
+
         Answer:
         """
 
         print(evaluation_prompt)
 
-        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=
+        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
         try:
             scores = json.loads(response)
         except json.JSONDecodeError:

@@ -195,19 +139,12 @@ class evaluator:
 
         return self.validate_scores(scores)
 
-
 def write_evaluation_commentary(scores):
     evaluation_details = []
-
-    for principle, details in scores.items():
-        print(details)
-        score = details.get('Score', -1)
-        justification = details.get('Justification', '')
+    for principle, score in scores.items():
 
         if score == -1:
-            evaluation_details.append(
-                {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
-                 'Justification': justification})
+            evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
             continue
 
         if principle == "Factually Correct":

@@ -246,56 +183,8 @@ def write_evaluation_commentary(scores):
         else:
             comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
 
-        evaluation_details.append(
-            {'Principle': principle, 'Score': score, 'Justification': justification,'Commentary': comment})
-
+        evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
     return evaluation_details
-
-# def write_evaluation_commentary(scores):
-#     evaluation_details = []
-#     for principle, score in scores.items():
-#
-#         if score == -1:
-#             evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
-#             continue
-#
-#         if principle == "Factually Correct":
-#             if score >= 0.8:
-#                 comment = "Excellent accuracy! The information is precise and directly relevant to the question."
-#             elif score >= 0.5:
-#                 comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
-#             else:
-#                 comment = "The explanation contains significant inaccuracies or irrelevant information."
-#         elif principle == "Useful":
-#             if score >= 0.8:
-#                 comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
-#             elif score >= 0.5:
-#                 comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
-#             else:
-#                 comment = "The explanation does little to help understand or apply the information provided."
-#         elif principle == "Context Specific":
-#             if score >= 0.8:
-#                 comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
-#             elif score >= 0.5:
-#                 comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
-#             else:
-#                 comment = "Fails to address the context of the question, lacking relevance or specificity."
-#         elif principle == "User Specific":
-#             if score >= 0.8:
-#                 comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
-#             elif score >= 0.5:
-#                 comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
-#             else:
-#                 comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
-#         elif principle == "Provides Pluralism":
-#             if score >= 0.8:
-#                 comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
-#             elif score >= 0.5:
-#                 comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
-#             else:
-#                 comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
-#
-#         evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
-#     return evaluation_details
 
 if __name__ == '__main__':
 
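The net effect of the evaluator.py changes is a simpler contract with the model: instead of a nested dict of {"Score", "Justification"} pairs on a 0-10 scale, the model is now asked for a flat JSON dict of 0-1 scores, and validate_scores collapses anything malformed to -1 sentinels that write_evaluation_commentary reports as a failed evaluation. A self-contained sketch of that round trip, with the validation condensed to a comprehension and an illustrative (not real) model response; the JSONDecodeError fallback shown here is an assumption, since the body of the diff's except branch is not visible:

import json

REQUIRED_KEYS = ["Factually Correct", "Useful", "Context Specific",
                 "User Specific", "Provides Pluralism"]

def validate_scores(scores):
    # Same rule as the new util/evaluator.py: every principle must be
    # present and numeric in [-1, 1]; any violation collapses all five
    # scores to the -1 "failed to evaluate" sentinel.
    for key in REQUIRED_KEYS:
        if key not in scores or not isinstance(scores[key], (int, float)) \
                or not (-1 <= scores[key] <= 1):
            return {k: -1 for k in REQUIRED_KEYS}
    return scores

# Illustrative model reply in the flat format the new prompt requests.
response = '{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}'

try:
    scores = json.loads(response)
except json.JSONDecodeError:
    scores = {k: -1 for k in REQUIRED_KEYS}  # assumed fallback; not shown in the diff

print(validate_scores(scores))
# {'Factually Correct': 0.9, 'Useful': 0.85, 'Context Specific': 0.8, ...}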