Spaces:
Runtime error
Runtime error
Display the original and adjusted EM per subject per model
Browse files- app.py +11 -0
- data/concat_all_subjects_EM_per_error.csv +94 -0
app.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit app that shows a table of Exact Match (EM) scores per subject and model."""

from pathlib import Path

import pandas as pd
import streamlit as st

# Resolve the CSV relative to this script rather than the current working
# directory, so the app loads its data no matter where it is launched from.
file_path = Path(__file__).resolve().parent / "data" / "concat_all_subjects_EM_per_error.csv"

# Some rows in the CSV hold "-" instead of a score (presumably subjects with no
# "Not OK" questions -- confirm with the data producer). Without na_values a
# single "-" forces the whole model column to be parsed as strings; mapping it
# to NaN keeps every score column numeric so sorting in the table works.
df = pd.read_csv(file_path, na_values=["-"])

# Streamlit app
st.title("Exact Match (EM) for each model on all MMLU-Redux subjects combined")

st.dataframe(df)
|
data/concat_all_subjects_EM_per_error.csv
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Subject,Condition,Claude 3 Opus (20240229),GPT-4o (2024-05-13),GPT-4 (0613),Gemini 1.5 Pro (0409 preview),GPT-4 Turbo (1106 preview),Llama 3 (70B),Palmyra X V3 (72B),PaLM-2 (Unicorn),Mixtral (8x22B),Gemini 1.5 Flash (0514 preview)
|
2 |
+
All,Overall,0.7876666666666666,0.781,0.7556666666666667,0.7666666666666667,0.7133333333333333,0.7273333333333334,0.7243333333333334,0.7243333333333334,0.7093333333333334,0.7266666666666667
|
3 |
+
All,OK,0.8188119179482939,0.8143757592018139,0.7918679649580322,0.796233989023108,0.7481844316480185,0.7608898291200237,0.7549373327929648,0.7549373327929648,0.741889427285375,0.7556001104880412
|
4 |
+
All,Not OK,0.5850130408902339,0.5015649977053486,0.4581050117307427,0.467992747992748,0.4455619169654257,0.4657074493624201,0.5061395481278522,0.5061395481278522,0.45554439279585474,0.5376109876694672
|
5 |
+
Anatomy,Overall,0.79,0.91,0.81,0.77,0.8,0.78,0.72,0.72,0.72,0.83
|
6 |
+
Anatomy,OK,0.797979797979798,0.9191919191919192,0.8181818181818182,0.7777777777777778,0.8080808080808081,0.7878787878787878,0.7272727272727273,0.7272727272727273,0.7272727272727273,0.8383838383838383
|
7 |
+
Anatomy,Not OK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
8 |
+
Astronomy,Overall,0.98,0.95,0.93,0.91,0.96,0.91,0.87,0.87,0.87,0.86
|
9 |
+
Astronomy,OK,0.989010989010989,0.978021978021978,0.967032967032967,0.9340659340659341,0.978021978021978,0.9230769230769231,0.9010989010989011,0.9010989010989011,0.8901098901098901,0.8791208791208791
|
10 |
+
Astronomy,Not OK,0.8888888888888888,0.6666666666666666,0.5555555555555556,0.6666666666666666,0.7777777777777778,0.7777777777777778,0.5555555555555556,0.5555555555555556,0.6666666666666666,0.6666666666666666
|
11 |
+
Business ethics,Overall,0.86,0.85,0.79,0.8,0.78,0.83,0.83,0.83,0.74,0.82
|
12 |
+
Business ethics,OK,0.9529411764705882,0.9647058823529412,0.9294117647058824,0.8823529411764706,0.9058823529411765,0.9294117647058824,0.9058823529411765,0.9058823529411765,0.8352941176470589,0.9058823529411765
|
13 |
+
Business ethics,Not OK,0.3333333333333333,0.2,0.0,0.3333333333333333,0.06666666666666667,0.26666666666666666,0.4,0.4,0.2,0.3333333333333333
|
14 |
+
Clinical knowledge,Overall,0.84,0.89,0.86,0.85,0.87,0.86,0.83,0.83,0.8,0.84
|
15 |
+
Clinical knowledge,OK,0.8383838383838383,0.8888888888888888,0.8585858585858586,0.8585858585858586,0.8686868686868687,0.8585858585858586,0.8282828282828283,0.8282828282828283,0.797979797979798,0.8383838383838383
|
16 |
+
Clinical knowledge,Not OK,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
|
17 |
+
College chemistry,Overall,0.6,0.61,0.55,0.58,0.47,0.56,0.59,0.59,0.57,0.6
|
18 |
+
College chemistry,OK,0.72,0.7066666666666667,0.68,0.6933333333333334,0.6,0.6666666666666666,0.7066666666666667,0.7066666666666667,0.64,0.7066666666666667
|
19 |
+
College chemistry,Not OK,0.24,0.32,0.16,0.24,0.08,0.24,0.24,0.24,0.36,0.28
|
20 |
+
College computer_science,Overall,0.81,0.77,0.76,0.78,0.69,0.7,0.65,0.65,0.7,0.64
|
21 |
+
College computer_science,OK,0.8041237113402062,0.7628865979381443,0.7525773195876289,0.7731958762886598,0.6804123711340206,0.7010309278350515,0.6494845360824743,0.6494845360824743,0.6907216494845361,0.6288659793814433
|
22 |
+
College computer_science,Not OK,1.0,1.0,1.0,1.0,1.0,0.6666666666666666,0.6666666666666666,0.6666666666666666,1.0,1.0
|
23 |
+
College mathematics,Overall,0.55,0.47,0.54,0.59,0.4,0.56,0.51,0.51,0.48,0.53
|
24 |
+
College mathematics,OK,0.5454545454545454,0.46464646464646464,0.5353535353535354,0.5858585858585859,0.3939393939393939,0.5555555555555556,0.5050505050505051,0.5050505050505051,0.47474747474747475,0.5252525252525253
|
25 |
+
College mathematics,Not OK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
|
26 |
+
College medicine,Overall,0.84,0.82,0.78,0.81,0.78,0.82,0.77,0.77,0.78,0.75
|
27 |
+
College medicine,OK,0.8735632183908046,0.8620689655172413,0.8160919540229885,0.8275862068965517,0.8045977011494253,0.8735632183908046,0.8160919540229885,0.8160919540229885,0.7931034482758621,0.7701149425287356
|
28 |
+
College medicine,Not OK,0.6153846153846154,0.5384615384615384,0.5384615384615384,0.6923076923076923,0.6153846153846154,0.46153846153846156,0.46153846153846156,0.46153846153846156,0.6923076923076923,0.6153846153846154
|
29 |
+
College physics,Overall,0.72,0.69,0.64,0.8,0.41,0.53,0.55,0.55,0.58,0.68
|
30 |
+
College physics,OK,0.72,0.69,0.64,0.8,0.41,0.53,0.55,0.55,0.58,0.68
|
31 |
+
College physics,Not OK,-,-,-,-,-,-,-,-,-,-
|
32 |
+
Conceptual physics,Overall,0.85,0.88,0.85,0.88,0.87,0.82,0.79,0.79,0.79,0.82
|
33 |
+
Conceptual physics,OK,0.8804347826086957,0.9021739130434783,0.8695652173913043,0.8913043478260869,0.8913043478260869,0.8478260869565217,0.8152173913043478,0.8152173913043478,0.8152173913043478,0.8478260869565217
|
34 |
+
Conceptual physics,Not OK,0.5,0.625,0.625,0.75,0.625,0.5,0.5,0.5,0.5,0.5
|
35 |
+
Econometrics,Overall,0.8,0.69,0.68,0.76,0.68,0.69,0.66,0.66,0.69,0.66
|
36 |
+
Econometrics,OK,0.7938144329896907,0.711340206185567,0.7010309278350515,0.7628865979381443,0.7010309278350515,0.7010309278350515,0.6804123711340206,0.6804123711340206,0.711340206185567,0.6597938144329897
|
37 |
+
Econometrics,Not OK,1.0,0.0,0.0,0.6666666666666666,0.0,0.3333333333333333,0.0,0.0,0.0,0.6666666666666666
|
38 |
+
Electrical engineering,Overall,0.82,0.81,0.78,0.76,0.77,0.74,0.76,0.76,0.75,0.8
|
39 |
+
Electrical engineering,OK,0.826530612244898,0.8163265306122449,0.7857142857142857,0.7755102040816326,0.7755102040816326,0.7448979591836735,0.7653061224489796,0.7653061224489796,0.7551020408163265,0.8061224489795918
|
40 |
+
Electrical engineering,Not OK,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5
|
41 |
+
Formal logic,Overall,0.72,0.72,0.65,0.63,0.66,0.65,0.66,0.66,0.63,0.6
|
42 |
+
Formal logic,OK,0.7126436781609196,0.7241379310344828,0.6666666666666666,0.632183908045977,0.6781609195402298,0.6206896551724138,0.6551724137931034,0.6551724137931034,0.6436781609195402,0.6091954022988506
|
43 |
+
Formal logic,Not OK,0.7692307692307693,0.6923076923076923,0.5384615384615384,0.6153846153846154,0.5384615384615384,0.8461538461538461,0.6923076923076923,0.6923076923076923,0.5384615384615384,0.5384615384615384
|
44 |
+
Global facts,Overall,0.66,0.64,0.62,0.66,0.58,0.49,0.53,0.53,0.56,0.55
|
45 |
+
Global facts,OK,0.7045454545454546,0.6704545454545454,0.6590909090909091,0.6931818181818182,0.5909090909090909,0.5113636363636364,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.5568181818181818
|
46 |
+
Global facts,Not OK,0.3333333333333333,0.4166666666666667,0.3333333333333333,0.4166666666666667,0.5,0.3333333333333333,0.4166666666666667,0.4166666666666667,0.6666666666666666,0.5
|
47 |
+
High school chemistry,Overall,0.77,0.74,0.67,0.8,0.65,0.72,0.73,0.73,0.7,0.75
|
48 |
+
High school chemistry,OK,0.7777777777777778,0.7474747474747475,0.6767676767676768,0.8080808080808081,0.6565656565656566,0.7272727272727273,0.7373737373737373,0.7373737373737373,0.7070707070707071,0.7575757575757576
|
49 |
+
High school chemistry,Not OK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
50 |
+
High school geography,Overall,0.94,0.94,0.94,0.9,0.91,0.93,0.91,0.91,0.94,0.92
|
51 |
+
High school geography,OK,0.94,0.94,0.94,0.9,0.91,0.93,0.91,0.91,0.94,0.92
|
52 |
+
High school geography,Not OK,-,-,-,-,-,-,-,-,-,-
|
53 |
+
High school macroeconomics,Overall,0.83,0.89,0.88,0.83,0.88,0.79,0.78,0.78,0.76,0.82
|
54 |
+
High school macroeconomics,OK,0.8409090909090909,0.8977272727272727,0.8863636363636364,0.8409090909090909,0.8863636363636364,0.8181818181818182,0.7954545454545454,0.7954545454545454,0.7840909090909091,0.8295454545454546
|
55 |
+
High school macroeconomics,Not OK,0.75,0.8333333333333334,0.8333333333333334,0.75,0.8333333333333334,0.5833333333333334,0.6666666666666666,0.6666666666666666,0.5833333333333334,0.75
|
56 |
+
High school mathematics,Overall,0.58,0.46,0.55,0.64,0.11,0.51,0.52,0.52,0.49,0.55
|
57 |
+
High school mathematics,OK,0.5757575757575758,0.46464646464646464,0.5555555555555556,0.6464646464646465,0.1111111111111111,0.5151515151515151,0.5151515151515151,0.5151515151515151,0.494949494949495,0.5454545454545454
|
58 |
+
High school mathematics,Not OK,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
|
59 |
+
High school physics,Overall,0.74,0.72,0.59,0.69,0.57,0.61,0.55,0.55,0.55,0.64
|
60 |
+
High school physics,OK,0.7525773195876289,0.7319587628865979,0.5979381443298969,0.7010309278350515,0.5876288659793815,0.6185567010309279,0.5567010309278351,0.5567010309278351,0.5567010309278351,0.6494845360824743
|
61 |
+
High school physics,Not OK,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.0,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333
|
62 |
+
High school statistics,Overall,0.79,0.8,0.82,0.84,0.68,0.73,0.74,0.74,0.69,0.77
|
63 |
+
High school statistics,OK,0.8061224489795918,0.8163265306122449,0.8367346938775511,0.8571428571428571,0.6938775510204082,0.7448979591836735,0.7551020408163265,0.7551020408163265,0.7040816326530612,0.7857142857142857
|
64 |
+
High school statistics,Not OK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
65 |
+
High school US history,Overall,0.94,0.95,0.93,0.92,0.91,0.95,0.94,0.94,0.92,0.91
|
66 |
+
High school US history,OK,0.94,0.95,0.93,0.92,0.91,0.95,0.94,0.94,0.92,0.91
|
67 |
+
High school US history,Not OK,-,-,-,-,-,-,-,-,-,-
|
68 |
+
Human aging,Overall,0.82,0.84,0.8,0.8,0.83,0.78,0.79,0.79,0.73,0.77
|
69 |
+
Human aging,OK,0.8735632183908046,0.8850574712643678,0.8390804597701149,0.8505747126436781,0.8850574712643678,0.8275862068965517,0.8505747126436781,0.8505747126436781,0.7816091954022989,0.8160919540229885
|
70 |
+
Human aging,Not OK,0.46153846153846156,0.5384615384615384,0.5384615384615384,0.46153846153846156,0.46153846153846156,0.46153846153846156,0.38461538461538464,0.38461538461538464,0.38461538461538464,0.46153846153846156
|
71 |
+
Logical fallacies,Overall,0.9,0.89,0.89,0.88,0.86,0.85,0.9,0.9,0.9,0.86
|
72 |
+
Logical fallacies,OK,0.9594594594594594,0.9864864864864865,0.9594594594594594,0.9594594594594594,0.9324324324324325,0.9594594594594594,0.9459459459459459,0.9459459459459459,0.9864864864864865,0.918918918918919
|
73 |
+
Logical fallacies,Not OK,0.7307692307692307,0.6153846153846154,0.6923076923076923,0.6538461538461539,0.6538461538461539,0.5384615384615384,0.7692307692307693,0.7692307692307693,0.6538461538461539,0.6923076923076923
|
74 |
+
Machine learning,Overall,0.74,0.78,0.76,0.69,0.72,0.71,0.63,0.63,0.67,0.56
|
75 |
+
Machine learning,OK,0.7640449438202247,0.797752808988764,0.7865168539325843,0.7078651685393258,0.7415730337078652,0.7303370786516854,0.651685393258427,0.651685393258427,0.6966292134831461,0.5842696629213483
|
76 |
+
Machine learning,Not OK,0.5454545454545454,0.6363636363636364,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.45454545454545453,0.45454545454545453,0.45454545454545453,0.36363636363636365
|
77 |
+
Miscellaneous,Overall,0.96,0.97,0.97,0.94,0.96,0.94,0.93,0.93,0.93,0.91
|
78 |
+
Miscellaneous,OK,0.9777777777777777,0.9777777777777777,0.9888888888888889,0.9666666666666667,0.9888888888888889,0.9666666666666667,0.9555555555555556,0.9555555555555556,0.9555555555555556,0.9333333333333333
|
79 |
+
Miscellaneous,Not OK,0.8,0.9,0.8,0.7,0.7,0.7,0.7,0.7,0.7,0.7
|
80 |
+
Philosophy,Overall,0.9,0.9,0.88,0.85,0.85,0.83,0.84,0.84,0.8,0.81
|
81 |
+
Philosophy,OK,0.9213483146067416,0.9325842696629213,0.9213483146067416,0.8651685393258427,0.8764044943820225,0.8539325842696629,0.8539325842696629,0.8539325842696629,0.8314606741573034,0.8426966292134831
|
82 |
+
Philosophy,Not OK,0.7272727272727273,0.6363636363636364,0.5454545454545454,0.7272727272727273,0.6363636363636364,0.6363636363636364,0.7272727272727273,0.7272727272727273,0.5454545454545454,0.5454545454545454
|
83 |
+
Professional accounting,Overall,0.82,0.76,0.72,0.65,0.66,0.67,0.71,0.71,0.63,0.64
|
84 |
+
Professional accounting,OK,0.8152173913043478,0.7717391304347826,0.7282608695652174,0.6413043478260869,0.7065217391304348,0.6739130434782609,0.7065217391304348,0.7065217391304348,0.6413043478260869,0.6304347826086957
|
85 |
+
Professional accounting,Not OK,0.875,0.625,0.625,0.75,0.125,0.625,0.75,0.75,0.5,0.75
|
86 |
+
Professional law,Overall,0.69,0.7,0.71,0.67,0.71,0.55,0.68,0.68,0.59,0.58
|
87 |
+
Professional law,OK,0.7195121951219512,0.6951219512195121,0.7439024390243902,0.6951219512195121,0.7317073170731707,0.5609756097560976,0.6463414634146342,0.6463414634146342,0.6585365853658537,0.573170731707317
|
88 |
+
Professional law,Not OK,0.5555555555555556,0.7222222222222222,0.5555555555555556,0.5555555555555556,0.6111111111111112,0.5,0.8333333333333334,0.8333333333333334,0.2777777777777778,0.6111111111111112
|
89 |
+
Public relations,Overall,0.83,0.83,0.76,0.77,0.82,0.76,0.8,0.8,0.76,0.8
|
90 |
+
Public relations,OK,0.8571428571428571,0.8681318681318682,0.8021978021978022,0.8021978021978022,0.8571428571428571,0.7912087912087912,0.8461538461538461,0.8461538461538461,0.7912087912087912,0.8351648351648352
|
91 |
+
Public relations,Not OK,0.5555555555555556,0.4444444444444444,0.3333333333333333,0.4444444444444444,0.4444444444444444,0.4444444444444444,0.3333333333333333,0.3333333333333333,0.4444444444444444,0.4444444444444444
|
92 |
+
Virology,Overall,0.54,0.56,0.56,0.55,0.56,0.55,0.56,0.56,0.56,0.53
|
93 |
+
Virology,OK,0.8837209302325582,0.9069767441860465,0.8837209302325582,0.8372093023255814,0.8837209302325582,0.9069767441860465,0.9302325581395349,0.9302325581395349,0.9069767441860465,0.8837209302325582
|
94 |
+
Virology,Not OK,0.2807017543859649,0.2982456140350877,0.3157894736842105,0.3333333333333333,0.3157894736842105,0.2807017543859649,0.2807017543859649,0.2807017543859649,0.2982456140350877,0.2631578947368421
|