justinxzhao committed
Commit: eb4ec23
Parent(s): 1afb9ca
Parse judgments with structured output prompting, one response model, one judge model at a time.
Files changed:
- app.py +124 -66
- judging_dataclasses.py +3 -3
- prompts.py +8 -5
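The heart of the change is the parsing step: each judge's free-text assessment is now converted to structured scores one (response model, judging model) pair at a time, by passing a single Pydantic schema (`DirectAssessmentCriteriaScores`) as the `response_format` of the parsing call. Below is a minimal sketch of that pattern, assuming the OpenAI Python SDK's structured-output `parse` helper; the model name, system message, and field types are illustrative and not taken from this commit (only the `messages` and `response_format` portion of the app's call appears in the diff).

```python
from typing import List

from openai import OpenAI
from pydantic import BaseModel


# Shape mirrors judging_dataclasses.py after this commit; field names are
# inferred from how app.py reads them (criterion, score, explanation).
class DirectAssessmentCriterionScore(BaseModel):
    criterion: str
    score: int
    explanation: str


class DirectAssessmentCriteriaScores(BaseModel):
    criteria_scores: List[DirectAssessmentCriterionScore]


client = OpenAI()  # Assumes OPENAI_API_KEY is set in the environment.


def parse_one_judgment(prompt: str) -> DirectAssessmentCriteriaScores:
    """Parse a single judge's free-text assessment into per-criterion scores."""
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",  # Illustrative choice; the app's parsing model may differ.
        messages=[
            {"role": "system", "content": "Extract per-criterion scores from the judging response."},
            {"role": "user", "content": prompt},
        ],
        # Same pattern as the response_format change in app.py below.
        response_format=DirectAssessmentCriteriaScores,
    )
    return completion.choices[0].message.parsed
```

Because the schema now covers a single judge at a time, the multi-judge wrapper model that this commit comments out is no longer needed.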
app.py
CHANGED
@@ -17,7 +17,7 @@ from constants import (
 )
 from prompts import *
 from judging_dataclasses import (
-    DirectAssessmentJudgingResponse,
+    # DirectAssessmentJudgingResponse,
     DirectAssessmentCriterionScore,
     DirectAssessmentCriteriaScores,
 )

@@ -191,24 +191,24 @@ def get_llm_response_stream(model_identifier, prompt):


 def create_dataframe_for_direct_assessment_judging_response(
-    response:
-):
+    response: DirectAssessmentCriteriaScores, judging_model: str
+) -> pd.DataFrame:
     # Initialize empty list to collect data
     data = []

     # Loop through models
-    for judging_model in response.judging_models:
-        model_name = judging_model.model
-
-
-
-
-
-
-
-
-
+    # for judging_model in response.judging_models:
+    #     model_name = judging_model.model
+    # Loop through criteria_scores
+    for criteria_score in response.criteria_scores:
+        data.append(
+            {
+                "judging_model": judging_model,  # Gets passed in.
+                "criteria": criteria_score.criterion,
+                "score": criteria_score.score,
+                "explanation": criteria_score.explanation,
+            }
+        )

     # Create DataFrame
     return pd.DataFrame(data)

@@ -295,26 +295,29 @@ def get_default_aggregator_prompt(user_prompt, llms):


 def get_parse_judging_response_for_direct_assessment_prompt(
-
+    judging_response: str,
     criteria_list,
     options,
-):
-    formatted_judging_responses = "\n\n".join(
-
-
-
-
+) -> str:
+    # formatted_judging_responses = "\n\n\n".join(
+    #     [
+    #         f"----- {get_ui_friendly_name(model)} START -----\n\n\n{judging_responses[model]}\n\n\n-----{get_ui_friendly_name(model)} END-----\n\n\n"
+    #         for model in judging_responses.keys()
+    #     ]
+    # )
+    formatted_judging_response = (
+        f"----- START -----\n\n\n{judging_response}\n\n\n----- END -----\n\n\n"
     )
     return PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
-
+        judging_response=formatted_judging_response,
         criteria_list=format_criteria_list(criteria_list),
         options=format_likert_comparison_options(options),
     )


-def
-    prompt: str,
-) ->
+def get_parsed_judging_response_obj_using_llm(
+    prompt: str,
+) -> DirectAssessmentCriteriaScores:
     # if os.getenv("DEBUG_MODE") == "True":
     #     return DirectAssessmentJudgingResponse(
     #         judging_models=[

@@ -358,7 +361,7 @@ def parse_judging_responses(
             },
             {"role": "user", "content": prompt},
         ],
-        response_format=
+        response_format=DirectAssessmentCriteriaScores,
     )
     # Track token usage.
     st.session_state["input_token_usage"][

@@ -443,7 +446,7 @@ def plot_overall_scores(overall_scores_df):
         y="mean_score",
         hue="ui_friendly_name",
         data=summary,
-        palette="
+        palette="rainbow",
         capsize=0.1,
         legend=False,
     )

@@ -663,29 +666,76 @@ def st_direct_assessment_results(user_prompt, direct_assessment_prompt, criteria
                     judging_stream
                 )

-
-
-
+                # Parse the judging response. If parsing results are already cached, then
+                # skip.
+                # Use Structured Output to parse the judging response.
+                parse_judging_response_prompt = get_parse_judging_response_for_direct_assessment_prompt(
+                    judging_response=st.session_state.direct_assessment_judging_responses[
+                        response_model
+                    ][
+                        judging_model
+                    ],
+                    criteria_list=criteria_list,
+                    options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                )
+
+                st.write("Parse judging response prompt:")
+                st.write(parse_judging_response_prompt)
+
+                if (
+                    response_model
+                    not in st.session_state.direct_assessment_judging_by_response_and_judging_model_df
+                    or judging_model
+                    not in st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                        response_model
+                    ]
+                ):
+                    parsed_judging_response_obj = (
+                        get_parsed_judging_response_obj_using_llm(
+                            parse_judging_response_prompt
+                        )
+                    )
+                    st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                        response_model
+                    ][
+                        judging_model
+                    ] = create_dataframe_for_direct_assessment_judging_response(
+                        parsed_judging_response_obj, judging_model
+                    )
+
+                # with st.expander("Structured output parsing response"):
+                st.write("Structured output parsing response:")
+                st.write(
+                    st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                        response_model
+                    ][
+                        judging_model
+                    ]
+                )
+
+            # Combined the dataframes for each judging model into a single dataframe for each
+            # response model.
             if response_model not in st.session_state.direct_assessment_judging_df:
-
-
-
-
-
-
-
-
+                # Combine the dataframes for each judging model into a single dataframe.
+                combined_judging_df = pd.DataFrame()
+                for judging_model in st.session_state.selected_models:
+                    combined_judging_df = pd.concat(
+                        [
+                            combined_judging_df,
+                            st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                                response_model
+                            ][
+                                judging_model
+                            ],
+                        ]
                     )
-                )
-                parsed_judging_responses = parse_judging_responses(
-                    parse_judging_response_prompt, judging_responses
-                )
                 st.session_state.direct_assessment_judging_df[response_model] = (
-
-                        parsed_judging_responses
-                    )
+                    combined_judging_df
                 )

+            with st.expander("Judging results from all judges"):
+                st.write(st.session_state.direct_assessment_judging_df[response_model])
+
             # Uses the session state to plot the criteria scores and graphs for a given response
             # model.
             plot_criteria_scores(

@@ -706,13 +756,11 @@ def st_direct_assessment_results(user_prompt, direct_assessment_prompt, criteria

         # Save the overall scores to the session state if it's not already there.
         for record in grouped.to_dict(orient="records"):
-
-                response_model
-
-
-
-                record["judging_model"]
-            ] = record["overall_score"]
+            st.session_state.direct_assessment_overall_scores[
+                get_ui_friendly_name(response_model)
+            ][get_ui_friendly_name(record["judging_model"])] = record[
+                "overall_score"
+            ]

         overall_score = grouped["overall_score"].mean()
         controversy = grouped["overall_score"].std()

@@ -796,7 +844,14 @@ def main():
     if "direct_assessment_overall_score" not in st.session_state:
         st.session_state.direct_assessment_overall_score = {}
     if "direct_assessment_judging_df" not in st.session_state:
-        st.session_state.direct_assessment_judging_df =
+        st.session_state.direct_assessment_judging_df = {}
+    if (
+        "direct_assessment_judging_by_response_and_judging_model_df"
+        not in st.session_state
+    ):
+        st.session_state.direct_assessment_judging_by_response_and_judging_model_df = defaultdict(
+            dict
+        )
     if "direct_assessment_judging_responses" not in st.session_state:
         st.session_state.direct_assessment_judging_responses = defaultdict(dict)
     if "direct_assessment_overall_scores" not in st.session_state:

@@ -940,19 +995,22 @@ def main():
             overall_scores_df["response_model"] = overall_scores_df[
                 "response_model"
             ].apply(get_ui_friendly_name)
-            overall_scores_df["judging_model"] = overall_scores_df[
-                "judging_model"
-            ].apply(get_ui_friendly_name)
+            # overall_scores_df["judging_model"] = overall_scores_df[
+            #     "judging_model"
+            # ].apply(get_ui_friendly_name)

             with st.expander("Overall scores from all judges"):
+                st.write(st.session_state.direct_assessment_overall_scores)
+                st.dataframe(overall_scores_df_raw)
                 st.dataframe(overall_scores_df)

             # All criteria scores.
             with right_column:
                 all_scores_df = pd.DataFrame()
-                for
-
-
+                for (
+                    response_model,
+                    score_df,
+                ) in st.session_state.direct_assessment_judging_df.items():
                     score_df["response_model"] = response_model
                     all_scores_df = pd.concat([all_scores_df, score_df])
                 all_scores_df = all_scores_df.reset_index()

@@ -968,12 +1026,12 @@ def main():
                         "explanation",
                     ]
                 ]
-                all_scores_df["response_model"] = all_scores_df[
-                    "response_model"
-                ].apply(get_ui_friendly_name)
-                all_scores_df["judging_model"] = all_scores_df[
-                    "judging_model"
-                ].apply(get_ui_friendly_name)
+                # all_scores_df["response_model"] = all_scores_df[
+                #     "response_model"
+                # ].apply(get_ui_friendly_name)
+                # all_scores_df["judging_model"] = all_scores_df[
+                #     "judging_model"
+                # ].apply(get_ui_friendly_name)

                 with st.expander(
                     "Criteria-specific scores and explanations from all judges"
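A notable pattern in the hunks above is the cache check: parsed results are stored in `st.session_state.direct_assessment_judging_by_response_and_judging_model_df`, keyed first by response model and then by judging model, so Streamlit reruns do not re-invoke the parsing LLM for pairs that were already parsed. Here is a minimal sketch of the same idea outside Streamlit, using a plain `defaultdict`; `parse_one_judgment` is the hypothetical helper from the earlier sketch, not a function in this repo.

```python
from collections import defaultdict
from typing import Dict

import pandas as pd

# Cache keyed by response model, then judging model, mirroring the session-state layout.
parsed_cache: Dict[str, Dict[str, pd.DataFrame]] = defaultdict(dict)


def get_parsed_judgment_df(response_model: str, judging_model: str, prompt: str) -> pd.DataFrame:
    """Parse once per (response model, judging model) pair and reuse the result."""
    if judging_model not in parsed_cache[response_model]:
        parsed = parse_one_judgment(prompt)  # Hypothetical helper; see the earlier sketch.
        parsed_cache[response_model][judging_model] = pd.DataFrame(
            [
                {
                    "judging_model": judging_model,
                    "criteria": cs.criterion,
                    "score": cs.score,
                    "explanation": cs.explanation,
                }
                for cs in parsed.criteria_scores
            ]
        )
    return parsed_cache[response_model][judging_model]
```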
judging_dataclasses.py
CHANGED
@@ -35,9 +35,9 @@ class DirectAssessmentCriterionScore(BaseModel):


 class DirectAssessmentCriteriaScores(BaseModel):
-    model: str
+    # model: str
     criteria_scores: List[DirectAssessmentCriterionScore]


-class DirectAssessmentJudgingResponse(BaseModel):
-    judging_models: List[DirectAssessmentCriteriaScores]
+# class DirectAssessmentJudgingResponse(BaseModel):
+#     judging_models: List[DirectAssessmentCriteriaScores]
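With the per-judge `model` field and the multi-judge wrapper gone, the schema the parser validates against is just `DirectAssessmentCriteriaScores`. A small sketch of validating a JSON payload against it, assuming the dataclasses are Pydantic v2 models and that `DirectAssessmentCriterionScore` exposes `criterion`, `score`, and `explanation` (as app.py reads them); the payload values are illustrative.

```python
from judging_dataclasses import DirectAssessmentCriteriaScores

# Example payload in the shape the structured-output parser is expected to return.
raw_json = """
{
  "criteria_scores": [
    {"criterion": "Helpfulness", "score": 6, "explanation": "Addresses the question directly."},
    {"criterion": "Conciseness", "score": 4, "explanation": "Somewhat verbose in the middle."}
  ]
}
"""

scores = DirectAssessmentCriteriaScores.model_validate_json(raw_json)  # Pydantic v2 API.
for cs in scores.criteria_scores:
    print(cs.criterion, cs.score, cs.explanation)
```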
prompts.py
CHANGED
@@ -1,18 +1,21 @@
 from judging_dataclasses import Criteria


-PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the
+PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
+
+The judge was asked to give a rating for each of the following criteria, along with an explanation:

-Each judge was asked to give a rating for each of the following criteria, along with an explanation:
 {criteria_list}

 The possible options for each criterion are as follows:
+
 {options}

-
-
+Here is the response from the judge:
+
+{judging_response}

-Please provide a JSON object
+Please provide a JSON object the scores for each of the criteria, along with any explanation the judge provided.
 """

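The reworked template takes a single `{judging_response}` block rather than a concatenation of every judge's output, so filling it is a plain `str.format` call, as `get_parse_judging_response_for_direct_assessment_prompt` in app.py now does. A small usage sketch follows; the criteria and options strings are illustrative stand-ins for the output of `format_criteria_list` and `format_likert_comparison_options`.

```python
from prompts import PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT

judging_response = "Helpfulness: 6/7 - covers the question well.\nConciseness: 4/7 - a bit long."

prompt = PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
    criteria_list="- Helpfulness\n- Conciseness",   # Stand-in for format_criteria_list(...)
    options="1 (very poor) through 7 (excellent)",  # Stand-in for format_likert_comparison_options(...)
    judging_response=f"----- START -----\n\n\n{judging_response}\n\n\n----- END -----\n\n\n",
)
print(prompt)
```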