Corey Morris committed on
Commit
fb25b1e
1 Parent(s): 7b77065

Moved radar chart to after analysis

Browse files
Files changed (1) hide show
  1. app.py +34 -33
app.py CHANGED
@@ -270,6 +270,37 @@ if selected_x_column != selected_y_column: # Avoid creating a plot with the s
270
  else:
271
  st.write("Please select different columns for the x and y axes.")
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  # Section to select a model and display radar and line charts
274
  st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
275
  st.write("""
@@ -289,17 +320,15 @@ closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selec
289
  closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()
290
 
291
 
292
- print(closest_models)
293
-
294
  # Find the top 10 tasks with the largest differences and convert to a DataFrame
295
  top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)
296
 
297
  # Display the DataFrame for the closest models and the top differences tasks
298
  st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])
299
 
300
- # Display the table in the Streamlit app
301
- st.markdown("## Top Differences")
302
- st.dataframe(top_differences_table)
303
 
304
  # Create a radar chart for the tasks with the largest differences
305
  fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)
@@ -307,34 +336,6 @@ fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_m
307
  # Display the radar chart
308
  st.plotly_chart(fig_radar_top_differences)
309
 
310
-
311
- # end of custom scatter plots
312
- st.markdown("## Notable findings and plots")
313
-
314
- st.markdown('### Abstract Algebra Performance')
315
- st.write("Small models showed surprisingly strong performance on the abstract algebra task. A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10.")
316
- plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)
317
-
318
- fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
319
- st.plotly_chart(fig)
320
-
321
- # Moral scenarios plots
322
- st.markdown("### Moral Scenarios Performance")
323
- st.write("""
324
- While smaller models can perform well at many tasks, the model size threshold for decent performance on moral scenarios is much higher.
325
- There are no models with less than 13 billion parameters with performance much better than random chance. Further investigation into other capabilities that emerge at 13 billion parameters could help
326
- identify capabilities that are important for moral reasoning.
327
- """)
328
-
329
- fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios', title="Impact of Parameter Count on Accuracy for Moral Scenarios")
330
- st.plotly_chart(fig)
331
- st.write()
332
-
333
-
334
-
335
- fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
336
- st.plotly_chart(fig)
337
-
338
  st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
339
 
340
  st.markdown("""
 
270
  else:
271
  st.write("Please select different columns for the x and y axes.")
272
 
273
+
274
+
275
+
276
+ # end of custom scatter plots
277
+ st.markdown("## Notable findings and plots")
278
+
279
+ st.markdown('### Abstract Algebra Performance')
280
+ st.write("Small models showed surprisingly strong performance on the abstract algebra task. A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10.")
281
+ plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)
282
+
283
+ fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
284
+ st.plotly_chart(fig)
285
+
286
+ # Moral scenarios plots
287
+ st.markdown("### Moral Scenarios Performance")
288
+ st.write("""
289
+ While smaller models can perform well at many tasks, the model size threshold for decent performance on moral scenarios is much higher.
290
+ There are no models with less than 13 billion parameters with performance much better than random chance. Further investigation into other capabilities that emerge at 13 billion parameters could help
291
+ identify capabilities that are important for moral reasoning.
292
+ """)
293
+
294
+ fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios', title="Impact of Parameter Count on Accuracy for Moral Scenarios")
295
+ st.plotly_chart(fig)
296
+ st.write()
297
+
298
+
299
+
300
+ fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
301
+ st.plotly_chart(fig)
302
+
303
+
304
  # Section to select a model and display radar and line charts
305
  st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
306
  st.write("""
 
320
  closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()
321
 
322
 
 
 
323
  # Find the top 10 tasks with the largest differences and convert to a DataFrame
324
  top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)
325
 
326
  # Display the DataFrame for the closest models and the top differences tasks
327
  st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])
328
 
329
+ # # Display the table in the Streamlit app
330
+ # st.markdown("## Top Differences")
331
+ # st.dataframe(top_differences_table)
332
 
333
  # Create a radar chart for the tasks with the largest differences
334
  fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)
 
336
  # Display the radar chart
337
  st.plotly_chart(fig_radar_top_differences)
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
340
 
341
  st.markdown("""