Zekun Wu
committed on
Commit
•
ea070cc
1
Parent(s):
0eb1a66
update
Browse files- app.py +13 -40
- evaluator.py +40 -0
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
-
from evaluator import evaluator
|
3 |
import os
|
4 |
|
5 |
# Predefined examples
|
@@ -14,45 +15,7 @@ examples = {
|
|
14 |
}
|
15 |
}
|
16 |
|
17 |
-
def write_evaluation_commentary(scores):
    """Write one commentary line per scored principle to the Streamlit page.

    Each principle's score is bucketed into one of three tiers
    (``>= 0.8``, ``>= 0.5``, otherwise) and the matching canned comment is
    emitted through ``st.write`` as ``"<principle> (<score>): <comment>"``.

    Args:
        scores: Mapping of principle name to a numeric score. Items are
            processed in the mapping's iteration order.

    Returns:
        None. The output is written with ``st.write``.
    """
    # Per principle: (comment for >= 0.8, comment for >= 0.5, comment otherwise).
    commentary_by_principle = {
        "Factually Correct": (
            "Excellent accuracy! The information is precise and directly relevant to the question.",
            "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
            "The explanation contains significant inaccuracies or irrelevant information.",
        ),
        "Useful": (
            "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
            "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
            "The explanation does little to help understand or apply the information provided.",
        ),
        "Context Specific": (
            "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
            "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
            "Fails to address the context of the question, lacking relevance or specificity.",
        ),
        "User Specific": (
            "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
            "Moderately considerate of the user's knowledge level, but could be more tailored.",
            "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
        ),
        "Provides Pluralism": (
            "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
            "Offers some alternative perspectives, but more could be provided to enrich understanding.",
            "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
        ),
    }
    # Used when a principle has no entry above. Without it, `comment` would be
    # unbound on the first iteration or carry over the previous principle's
    # text on later ones.
    fallback_comment = "No commentary is available for this principle."

    for principle, score in scores.items():
        tiers = commentary_by_principle.get(principle)
        if tiers is None:
            comment = fallback_comment
        elif score >= 0.8:
            comment = tiers[0]
        elif score >= 0.5:
            comment = tiers[1]
        else:
            comment = tiers[2]

        st.write(f"{principle} ({score}): {comment}")
|
56 |
|
57 |
# Function to check password
|
58 |
def check_password():
|
@@ -101,6 +64,16 @@ else:
|
|
101 |
eval = evaluator(model_name)
|
102 |
scores = eval(question, explanation)
|
103 |
st.write('### Scores')
|
104 |
-
write_evaluation_commentary(scores)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
else:
|
106 |
st.error('Please enter both a question and an explanation to evaluate.')
|
|
|
1 |
+
import pandas as pd
|
2 |
import streamlit as st
|
3 |
+
from evaluator import evaluator,write_evaluation_commentary
|
4 |
import os
|
5 |
|
6 |
# Predefined examples
|
|
|
15 |
}
|
16 |
}
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
|
|
19 |
|
20 |
# Function to check password
|
21 |
def check_password():
|
|
|
64 |
eval = evaluator(model_name)
|
65 |
scores = eval(question, explanation)
|
66 |
st.write('### Scores')
|
67 |
+
details = write_evaluation_commentary(scores)
|
68 |
+
df = pd.DataFrame(details)
|
69 |
+
st.write(df)
|
70 |
+
|
71 |
+
csv = df.to_csv(index=False)
|
72 |
+
st.download_button(
|
73 |
+
label="Download evaluation as CSV",
|
74 |
+
data=csv,
|
75 |
+
file_name='evaluation.csv',
|
76 |
+
mime='text/csv',
|
77 |
+
)
|
78 |
else:
|
79 |
st.error('Please enter both a question and an explanation to evaluate.')
|
evaluator.py
CHANGED
@@ -75,7 +75,47 @@ class evaluator:
|
|
75 |
|
76 |
return self.validate_scores(scores)
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
|
|
|
|
79 |
|
80 |
if __name__ == '__main__':
|
81 |
eval = evaluator()
|
|
|
75 |
|
76 |
return self.validate_scores(scores)
|
77 |
|
78 |
+
def write_evaluation_commentary(scores):
    """Build a commentary row for every scored principle.

    Each principle's score is bucketed into one of three tiers
    (``>= 0.8``, ``>= 0.5``, otherwise) and paired with the matching canned
    comment.

    Args:
        scores: Mapping of principle name to a numeric score. Items are
            processed in the mapping's iteration order.

    Returns:
        A list with one dict per entry of ``scores``, each holding the keys
        ``'Principle'``, ``'Score'`` and ``'Commentary'``. An empty mapping
        yields an empty list.
    """
    # Per principle: (comment for >= 0.8, comment for >= 0.5, comment otherwise).
    commentary_by_principle = {
        "Factually Correct": (
            "Excellent accuracy! The information is precise and directly relevant to the question.",
            "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
            "The explanation contains significant inaccuracies or irrelevant information.",
        ),
        "Useful": (
            "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
            "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
            "The explanation does little to help understand or apply the information provided.",
        ),
        "Context Specific": (
            "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
            "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
            "Fails to address the context of the question, lacking relevance or specificity.",
        ),
        "User Specific": (
            "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
            "Moderately considerate of the user's knowledge level, but could be more tailored.",
            "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
        ),
        "Provides Pluralism": (
            "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
            "Offers some alternative perspectives, but more could be provided to enrich understanding.",
            "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
        ),
    }
    # Used when a principle has no entry above. Without it, `comment` would be
    # unbound on the first iteration or carry over the previous principle's
    # text on later ones.
    fallback_comment = "No commentary is available for this principle."

    evaluation_details = []
    for principle, score in scores.items():
        tiers = commentary_by_principle.get(principle)
        if tiers is None:
            comment = fallback_comment
        elif score >= 0.8:
            comment = tiers[0]
        elif score >= 0.5:
            comment = tiers[1]
        else:
            comment = tiers[2]

        evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
    return evaluation_details
|
119 |
|
120 |
if __name__ == '__main__':
|
121 |
eval = evaluator()
|