justinxzhao committed
Commit eb4ec23 · 1 Parent(s): 1afb9ca

Parse judgments with structured output prompting, one response model, one judge model at a time.

Files changed (3):
  1. app.py +124 -66
  2. judging_dataclasses.py +3 -3
  3. prompts.py +8 -5
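
At the heart of this change, each judge's free-text assessment is now parsed on its own with structured output prompting, yielding a DirectAssessmentCriteriaScores object instead of a multi-judge wrapper. Below is a minimal sketch of that call pattern, assuming the app uses the OpenAI Python SDK's structured-output parse helper with Pydantic models; the model name, system message, and field types are placeholders rather than values taken from app.py.

from typing import List

from openai import OpenAI
from pydantic import BaseModel


class DirectAssessmentCriterionScore(BaseModel):
    criterion: str
    score: int  # Numeric type assumed; the diff only shows the field being read.
    explanation: str


class DirectAssessmentCriteriaScores(BaseModel):
    criteria_scores: List[DirectAssessmentCriterionScore]


def parse_one_judgment(prompt: str) -> DirectAssessmentCriteriaScores:
    """Parse the judgment for a single (response model, judge model) pair."""
    client = OpenAI()  # Reads OPENAI_API_KEY from the environment.
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",  # Placeholder parser model.
        messages=[
            {"role": "system", "content": "Extract the scores exactly as reported."},
            {"role": "user", "content": prompt},
        ],
        response_format=DirectAssessmentCriteriaScores,  # Structured output schema.
    )
    return completion.choices[0].message.parsed

The parsed object is then flattened into a per-criterion DataFrame with the judge's name attached from the caller, which is what the reworked create_dataframe_for_direct_assessment_judging_response does in the diff below.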
app.py CHANGED
@@ -17,7 +17,7 @@ from constants import (
 )
 from prompts import *
 from judging_dataclasses import (
-    DirectAssessmentJudgingResponse,
+    # DirectAssessmentJudgingResponse,
     DirectAssessmentCriterionScore,
     DirectAssessmentCriteriaScores,
 )
@@ -191,24 +191,24 @@ def get_llm_response_stream(model_identifier, prompt):
 
 
 def create_dataframe_for_direct_assessment_judging_response(
-    response: DirectAssessmentJudgingResponse,
-):
+    response: DirectAssessmentCriteriaScores, judging_model: str
+) -> pd.DataFrame:
     # Initialize empty list to collect data
     data = []
 
     # Loop through models
-    for judging_model in response.judging_models:
-        model_name = judging_model.model
-        # Loop through criteria_scores
-        for criteria_score in judging_model.criteria_scores:
-            data.append(
-                {
-                    "judging_model": model_name,
-                    "criteria": criteria_score.criterion,
-                    "score": criteria_score.score,
-                    "explanation": criteria_score.explanation,
-                }
-            )
+    # for judging_model in response.judging_models:
+    #     model_name = judging_model.model
+    # Loop through criteria_scores
+    for criteria_score in response.criteria_scores:
+        data.append(
+            {
+                "judging_model": judging_model,  # Gets passed in.
+                "criteria": criteria_score.criterion,
+                "score": criteria_score.score,
+                "explanation": criteria_score.explanation,
+            }
+        )
 
     # Create DataFrame
     return pd.DataFrame(data)
@@ -295,26 +295,29 @@ def get_default_aggregator_prompt(user_prompt, llms):
 
 
 def get_parse_judging_response_for_direct_assessment_prompt(
-    judging_responses: dict[str, str],
+    judging_response: str,
     criteria_list,
     options,
-):
-    formatted_judging_responses = "\n\n".join(
-        [
-            f"{get_ui_friendly_name(model)} START\n{judging_responses[model]}\n\n{get_ui_friendly_name(model)} END\n\n\n"
-            for model in judging_responses.keys()
-        ]
+) -> str:
+    # formatted_judging_responses = "\n\n\n".join(
+    #     [
+    #         f"----- {get_ui_friendly_name(model)} START -----\n\n\n{judging_responses[model]}\n\n\n-----{get_ui_friendly_name(model)} END-----\n\n\n"
+    #         for model in judging_responses.keys()
+    #     ]
+    # )
+    formatted_judging_response = (
+        f"----- START -----\n\n\n{judging_response}\n\n\n----- END -----\n\n\n"
     )
     return PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
-        judging_responses=formatted_judging_responses,
+        judging_response=formatted_judging_response,
        criteria_list=format_criteria_list(criteria_list),
        options=format_likert_comparison_options(options),
    )
 
 
-def parse_judging_responses(
-    prompt: str, judging_responses: dict[str, str]
-) -> DirectAssessmentJudgingResponse:
+def get_parsed_judging_response_obj_using_llm(
+    prompt: str,
+) -> DirectAssessmentCriteriaScores:
     # if os.getenv("DEBUG_MODE") == "True":
     #     return DirectAssessmentJudgingResponse(
     #         judging_models=[
@@ -358,7 +361,7 @@ def parse_judging_responses(
             },
             {"role": "user", "content": prompt},
         ],
-        response_format=DirectAssessmentJudgingResponse,
+        response_format=DirectAssessmentCriteriaScores,
     )
     # Track token usage.
     st.session_state["input_token_usage"][
@@ -443,7 +446,7 @@ def plot_overall_scores(overall_scores_df):
         y="mean_score",
         hue="ui_friendly_name",
         data=summary,
-        palette="prism",
+        palette="rainbow",
         capsize=0.1,
         legend=False,
     )
@@ -663,29 +666,76 @@ def st_direct_assessment_results(user_prompt, direct_assessment_prompt, criteria
             judging_stream
         )
 
-        # Extract actual scores from open-ended responses using structured outputs.
-        # Since we're extracting structured data for the first time, we can save the dataframe
-        # to the session state so that it's cached.
+        # Parse the judging response. If parsing results are already cached, then
+        # skip.
+        # Use Structured Output to parse the judging response.
+        parse_judging_response_prompt = get_parse_judging_response_for_direct_assessment_prompt(
+            judging_response=st.session_state.direct_assessment_judging_responses[
+                response_model
+            ][
+                judging_model
+            ],
+            criteria_list=criteria_list,
+            options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+        )
+
+        st.write("Parse judging response prompt:")
+        st.write(parse_judging_response_prompt)
+
+        if (
+            response_model
+            not in st.session_state.direct_assessment_judging_by_response_and_judging_model_df
+            or judging_model
+            not in st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                response_model
+            ]
+        ):
+            parsed_judging_response_obj = (
+                get_parsed_judging_response_obj_using_llm(
+                    parse_judging_response_prompt
+                )
+            )
+            st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                response_model
+            ][
+                judging_model
+            ] = create_dataframe_for_direct_assessment_judging_response(
+                parsed_judging_response_obj, judging_model
+            )
+
+        # with st.expander("Structured output parsing response"):
+        st.write("Structured output parsing response:")
+        st.write(
+            st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                response_model
+            ][
+                judging_model
+            ]
+        )
+
+        # Combined the dataframes for each judging model into a single dataframe for each
+        # response model.
         if response_model not in st.session_state.direct_assessment_judging_df:
-            judging_responses = (
-                st.session_state.direct_assessment_judging_responses[response_model]
-            )
-            parse_judging_response_prompt = (
-                get_parse_judging_response_for_direct_assessment_prompt(
-                    judging_responses,
-                    criteria_list,
-                    SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+            # Combine the dataframes for each judging model into a single dataframe.
+            combined_judging_df = pd.DataFrame()
+            for judging_model in st.session_state.selected_models:
+                combined_judging_df = pd.concat(
+                    [
+                        combined_judging_df,
+                        st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                            response_model
+                        ][
+                            judging_model
+                        ],
+                    ]
                )
-            )
-            parsed_judging_responses = parse_judging_responses(
-                parse_judging_response_prompt, judging_responses
-            )
            st.session_state.direct_assessment_judging_df[response_model] = (
-                create_dataframe_for_direct_assessment_judging_response(
-                    parsed_judging_responses
-                )
+                combined_judging_df
            )
 
+        with st.expander("Judging results from all judges"):
+            st.write(st.session_state.direct_assessment_judging_df[response_model])
+
        # Uses the session state to plot the criteria scores and graphs for a given response
        # model.
        plot_criteria_scores(
@@ -706,13 +756,11 @@ def st_direct_assessment_results(user_prompt, direct_assessment_prompt, criteria
 
        # Save the overall scores to the session state if it's not already there.
        for record in grouped.to_dict(orient="records"):
-            if (
-                response_model
-                not in st.session_state.direct_assessment_overall_scores
-            ):
-                st.session_state.direct_assessment_overall_scores[response_model][
-                    record["judging_model"]
-                ] = record["overall_score"]
+            st.session_state.direct_assessment_overall_scores[
+                get_ui_friendly_name(response_model)
+            ][get_ui_friendly_name(record["judging_model"])] = record[
+                "overall_score"
+            ]
 
        overall_score = grouped["overall_score"].mean()
        controversy = grouped["overall_score"].std()
@@ -796,7 +844,14 @@ def main():
    if "direct_assessment_overall_score" not in st.session_state:
        st.session_state.direct_assessment_overall_score = {}
    if "direct_assessment_judging_df" not in st.session_state:
-        st.session_state.direct_assessment_judging_df = defaultdict(dict)
+        st.session_state.direct_assessment_judging_df = {}
+    if (
+        "direct_assessment_judging_by_response_and_judging_model_df"
+        not in st.session_state
+    ):
+        st.session_state.direct_assessment_judging_by_response_and_judging_model_df = defaultdict(
+            dict
+        )
    if "direct_assessment_judging_responses" not in st.session_state:
        st.session_state.direct_assessment_judging_responses = defaultdict(dict)
    if "direct_assessment_overall_scores" not in st.session_state:
@@ -940,19 +995,22 @@ def main():
                overall_scores_df["response_model"] = overall_scores_df[
                    "response_model"
                ].apply(get_ui_friendly_name)
-                overall_scores_df["judging_model"] = overall_scores_df[
-                    "judging_model"
-                ].apply(get_ui_friendly_name)
+                # overall_scores_df["judging_model"] = overall_scores_df[
+                #     "judging_model"
+                # ].apply(get_ui_friendly_name)
 
                with st.expander("Overall scores from all judges"):
+                    st.write(st.session_state.direct_assessment_overall_scores)
+                    st.dataframe(overall_scores_df_raw)
                    st.dataframe(overall_scores_df)
 
            # All criteria scores.
            with right_column:
                all_scores_df = pd.DataFrame()
-                for response_model, score_df in st.session_state[
-                    "direct_assessment_judging_df"
-                ].items():
+                for (
+                    response_model,
+                    score_df,
+                ) in st.session_state.direct_assessment_judging_df.items():
                    score_df["response_model"] = response_model
                    all_scores_df = pd.concat([all_scores_df, score_df])
                all_scores_df = all_scores_df.reset_index()
@@ -968,12 +1026,12 @@ def main():
                        "explanation",
                    ]
                ]
-                all_scores_df["response_model"] = all_scores_df[
-                    "response_model"
-                ].apply(get_ui_friendly_name)
-                all_scores_df["judging_model"] = all_scores_df[
-                    "judging_model"
-                ].apply(get_ui_friendly_name)
+                # all_scores_df["response_model"] = all_scores_df[
+                #     "response_model"
+                # ].apply(get_ui_friendly_name)
+                # all_scores_df["judging_model"] = all_scores_df[
+                #     "judging_model"
+                # ].apply(get_ui_friendly_name)
 
                with st.expander(
                    "Criteria-specific scores and explanations from all judges"
judging_dataclasses.py CHANGED
@@ -35,9 +35,9 @@ class DirectAssessmentCriterionScore(BaseModel):
 
 
 class DirectAssessmentCriteriaScores(BaseModel):
-    model: str
+    # model: str
     criteria_scores: List[DirectAssessmentCriterionScore]
 
 
-class DirectAssessmentJudgingResponse(BaseModel):
-    judging_models: List[DirectAssessmentCriteriaScores]
+# class DirectAssessmentJudgingResponse(BaseModel):
+#     judging_models: List[DirectAssessmentCriteriaScores]
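
After this change the parsed schema covers exactly one judge: the model field is gone and the DirectAssessmentJudgingResponse wrapper is retired. A brief sketch of validating a single judge's payload against the simplified models; the field names come from how app.py reads them, while the concrete types are assumptions.

from typing import List

from pydantic import BaseModel


class DirectAssessmentCriterionScore(BaseModel):
    criterion: str
    score: int  # Numeric type assumed.
    explanation: str


class DirectAssessmentCriteriaScores(BaseModel):
    # The judge's identity is no longer part of the schema; app.py now passes
    # it alongside the parsed object instead.
    criteria_scores: List[DirectAssessmentCriterionScore]


parsed = DirectAssessmentCriteriaScores(
    criteria_scores=[
        {"criterion": "relevance", "score": 7, "explanation": "Directly on topic."}
    ]
)
print(parsed.criteria_scores[0].score)  # 7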
prompts.py CHANGED
@@ -1,18 +1,21 @@
 from judging_dataclasses import Criteria
 
 
-PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the responses from the judges for a direct assessment.
+PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
+
+The judge was asked to give a rating for each of the following criteria, along with an explanation:
 
-Each judge was asked to give a rating for each of the following criteria, along with an explanation:
 {criteria_list}
 
 The possible options for each criterion are as follows:
+
 {options}
 
-The responses from the judges are as follows:
-{judging_responses}
+Here is the response from the judge:
+
+{judging_response}
 
-Please provide a JSON object with the following structure that includes the model name and the scores for each of the criteria, along with the explanation.
+Please provide a JSON object the scores for each of the criteria, along with any explanation the judge provided.
 """
 
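
For completeness, this is roughly how the reworked single-judge template gets filled by get_parse_judging_response_for_direct_assessment_prompt in app.py: the one judging response is wrapped in START/END markers and dropped into {judging_response}. The template below is a truncated stand-in, and the criteria and options strings are placeholders rather than output of the app's real format_criteria_list and format_likert_comparison_options helpers.

# Truncated stand-in for PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.
TEMPLATE = (
    "We are trying to parse the response from a judge for a direct assessment.\n\n"
    "The judge was asked to give a rating for each of the following criteria, "
    "along with an explanation:\n\n{criteria_list}\n\n"
    "The possible options for each criterion are as follows:\n\n{options}\n\n"
    "Here is the response from the judge:\n\n{judging_response}\n"
)

# app.py wraps the single judging response like this before formatting.
judging_response = "----- START -----\n\n\nConciseness: 6 - Clear and brief.\n\n\n----- END -----\n\n\n"

prompt = TEMPLATE.format(
    criteria_list="- Conciseness: Is the response brief but complete?",  # Placeholder.
    options="1: Strongly disagree ... 7: Strongly agree",  # Placeholder.
    judging_response=judging_response,
)
print(prompt)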