aryopg committed on
Commit
1fac449
1 Parent(s): fc5e798

Display the original and adjusted EM per subject per model

Browse files
Files changed (2) hide show
  1. app.py +11 -0
  2. data/concat_all_subjects_EM_per_error.csv +94 -0
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ # Load the per-subject EM table (one row per subject and condition, one column per model)
5
+ file_path = "data/concat_all_subjects_EM_per_error.csv"
6
+ df = pd.read_csv(file_path)
7
+
8
+ # Render the page title followed by the full results table
9
+ st.title("Exact Match (EM) for each model on all MMLU-Redux subjects combined")
10
+
11
+ st.dataframe(df)
data/concat_all_subjects_EM_per_error.csv ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Subject,Condition,Claude 3 Opus (20240229),GPT-4o (2024-05-13),GPT-4 (0613),Gemini 1.5 Pro (0409 preview),GPT-4 Turbo (1106 preview),Llama 3 (70B),Palmyra X V3 (72B),PaLM-2 (Unicorn),Mixtral (8x22B),Gemini 1.5 Flash (0514 preview)
2
+ All,Overall,0.7876666666666666,0.781,0.7556666666666667,0.7666666666666667,0.7133333333333333,0.7273333333333334,0.7243333333333334,0.7243333333333334,0.7093333333333334,0.7266666666666667
3
+ All,OK,0.8188119179482939,0.8143757592018139,0.7918679649580322,0.796233989023108,0.7481844316480185,0.7608898291200237,0.7549373327929648,0.7549373327929648,0.741889427285375,0.7556001104880412
4
+ All,Not OK,0.5850130408902339,0.5015649977053486,0.4581050117307427,0.467992747992748,0.4455619169654257,0.4657074493624201,0.5061395481278522,0.5061395481278522,0.45554439279585474,0.5376109876694672
5
+ Anatomy,Overall,0.79,0.91,0.81,0.77,0.8,0.78,0.72,0.72,0.72,0.83
6
+ Anatomy,OK,0.797979797979798,0.9191919191919192,0.8181818181818182,0.7777777777777778,0.8080808080808081,0.7878787878787878,0.7272727272727273,0.7272727272727273,0.7272727272727273,0.8383838383838383
7
+ Anatomy,Not OK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8
+ Astronomy,Overall,0.98,0.95,0.93,0.91,0.96,0.91,0.87,0.87,0.87,0.86
9
+ Astronomy,OK,0.989010989010989,0.978021978021978,0.967032967032967,0.9340659340659341,0.978021978021978,0.9230769230769231,0.9010989010989011,0.9010989010989011,0.8901098901098901,0.8791208791208791
10
+ Astronomy,Not OK,0.8888888888888888,0.6666666666666666,0.5555555555555556,0.6666666666666666,0.7777777777777778,0.7777777777777778,0.5555555555555556,0.5555555555555556,0.6666666666666666,0.6666666666666666
11
+ Business ethics,Overall,0.86,0.85,0.79,0.8,0.78,0.83,0.83,0.83,0.74,0.82
12
+ Business ethics,OK,0.9529411764705882,0.9647058823529412,0.9294117647058824,0.8823529411764706,0.9058823529411765,0.9294117647058824,0.9058823529411765,0.9058823529411765,0.8352941176470589,0.9058823529411765
13
+ Business ethics,Not OK,0.3333333333333333,0.2,0.0,0.3333333333333333,0.06666666666666667,0.26666666666666666,0.4,0.4,0.2,0.3333333333333333
14
+ Clinical knowledge,Overall,0.84,0.89,0.86,0.85,0.87,0.86,0.83,0.83,0.8,0.84
15
+ Clinical knowledge,OK,0.8383838383838383,0.8888888888888888,0.8585858585858586,0.8585858585858586,0.8686868686868687,0.8585858585858586,0.8282828282828283,0.8282828282828283,0.797979797979798,0.8383838383838383
16
+ Clinical knowledge,Not OK,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
17
+ College chemistry,Overall,0.6,0.61,0.55,0.58,0.47,0.56,0.59,0.59,0.57,0.6
18
+ College chemistry,OK,0.72,0.7066666666666667,0.68,0.6933333333333334,0.6,0.6666666666666666,0.7066666666666667,0.7066666666666667,0.64,0.7066666666666667
19
+ College chemistry,Not OK,0.24,0.32,0.16,0.24,0.08,0.24,0.24,0.24,0.36,0.28
20
+ College computer science,Overall,0.81,0.77,0.76,0.78,0.69,0.7,0.65,0.65,0.7,0.64
21
+ College computer science,OK,0.8041237113402062,0.7628865979381443,0.7525773195876289,0.7731958762886598,0.6804123711340206,0.7010309278350515,0.6494845360824743,0.6494845360824743,0.6907216494845361,0.6288659793814433
22
+ College computer science,Not OK,1.0,1.0,1.0,1.0,1.0,0.6666666666666666,0.6666666666666666,0.6666666666666666,1.0,1.0
23
+ College mathematics,Overall,0.55,0.47,0.54,0.59,0.4,0.56,0.51,0.51,0.48,0.53
24
+ College mathematics,OK,0.5454545454545454,0.46464646464646464,0.5353535353535354,0.5858585858585859,0.3939393939393939,0.5555555555555556,0.5050505050505051,0.5050505050505051,0.47474747474747475,0.5252525252525253
25
+ College mathematics,Not OK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
26
+ College medicine,Overall,0.84,0.82,0.78,0.81,0.78,0.82,0.77,0.77,0.78,0.75
27
+ College medicine,OK,0.8735632183908046,0.8620689655172413,0.8160919540229885,0.8275862068965517,0.8045977011494253,0.8735632183908046,0.8160919540229885,0.8160919540229885,0.7931034482758621,0.7701149425287356
28
+ College medicine,Not OK,0.6153846153846154,0.5384615384615384,0.5384615384615384,0.6923076923076923,0.6153846153846154,0.46153846153846156,0.46153846153846156,0.46153846153846156,0.6923076923076923,0.6153846153846154
29
+ College physics,Overall,0.72,0.69,0.64,0.8,0.41,0.53,0.55,0.55,0.58,0.68
30
+ College physics,OK,0.72,0.69,0.64,0.8,0.41,0.53,0.55,0.55,0.58,0.68
31
+ College physics,Not OK,-,-,-,-,-,-,-,-,-,-
32
+ Conceptual physics,Overall,0.85,0.88,0.85,0.88,0.87,0.82,0.79,0.79,0.79,0.82
33
+ Conceptual physics,OK,0.8804347826086957,0.9021739130434783,0.8695652173913043,0.8913043478260869,0.8913043478260869,0.8478260869565217,0.8152173913043478,0.8152173913043478,0.8152173913043478,0.8478260869565217
34
+ Conceptual physics,Not OK,0.5,0.625,0.625,0.75,0.625,0.5,0.5,0.5,0.5,0.5
35
+ Econometrics,Overall,0.8,0.69,0.68,0.76,0.68,0.69,0.66,0.66,0.69,0.66
36
+ Econometrics,OK,0.7938144329896907,0.711340206185567,0.7010309278350515,0.7628865979381443,0.7010309278350515,0.7010309278350515,0.6804123711340206,0.6804123711340206,0.711340206185567,0.6597938144329897
37
+ Econometrics,Not OK,1.0,0.0,0.0,0.6666666666666666,0.0,0.3333333333333333,0.0,0.0,0.0,0.6666666666666666
38
+ Electrical engineering,Overall,0.82,0.81,0.78,0.76,0.77,0.74,0.76,0.76,0.75,0.8
39
+ Electrical engineering,OK,0.826530612244898,0.8163265306122449,0.7857142857142857,0.7755102040816326,0.7755102040816326,0.7448979591836735,0.7653061224489796,0.7653061224489796,0.7551020408163265,0.8061224489795918
40
+ Electrical engineering,Not OK,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5
41
+ Formal logic,Overall,0.72,0.72,0.65,0.63,0.66,0.65,0.66,0.66,0.63,0.6
42
+ Formal logic,OK,0.7126436781609196,0.7241379310344828,0.6666666666666666,0.632183908045977,0.6781609195402298,0.6206896551724138,0.6551724137931034,0.6551724137931034,0.6436781609195402,0.6091954022988506
43
+ Formal logic,Not OK,0.7692307692307693,0.6923076923076923,0.5384615384615384,0.6153846153846154,0.5384615384615384,0.8461538461538461,0.6923076923076923,0.6923076923076923,0.5384615384615384,0.5384615384615384
44
+ Global facts,Overall,0.66,0.64,0.62,0.66,0.58,0.49,0.53,0.53,0.56,0.55
45
+ Global facts,OK,0.7045454545454546,0.6704545454545454,0.6590909090909091,0.6931818181818182,0.5909090909090909,0.5113636363636364,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.5568181818181818
46
+ Global facts,Not OK,0.3333333333333333,0.4166666666666667,0.3333333333333333,0.4166666666666667,0.5,0.3333333333333333,0.4166666666666667,0.4166666666666667,0.6666666666666666,0.5
47
+ High school chemistry,Overall,0.77,0.74,0.67,0.8,0.65,0.72,0.73,0.73,0.7,0.75
48
+ High school chemistry,OK,0.7777777777777778,0.7474747474747475,0.6767676767676768,0.8080808080808081,0.6565656565656566,0.7272727272727273,0.7373737373737373,0.7373737373737373,0.7070707070707071,0.7575757575757576
49
+ High school chemistry,Not OK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50
+ High school geography,Overall,0.94,0.94,0.94,0.9,0.91,0.93,0.91,0.91,0.94,0.92
51
+ High school geography,OK,0.94,0.94,0.94,0.9,0.91,0.93,0.91,0.91,0.94,0.92
52
+ High school geography,Not OK,-,-,-,-,-,-,-,-,-,-
53
+ High school macroeconomics,Overall,0.83,0.89,0.88,0.83,0.88,0.79,0.78,0.78,0.76,0.82
54
+ High school macroeconomics,OK,0.8409090909090909,0.8977272727272727,0.8863636363636364,0.8409090909090909,0.8863636363636364,0.8181818181818182,0.7954545454545454,0.7954545454545454,0.7840909090909091,0.8295454545454546
55
+ High school macroeconomics,Not OK,0.75,0.8333333333333334,0.8333333333333334,0.75,0.8333333333333334,0.5833333333333334,0.6666666666666666,0.6666666666666666,0.5833333333333334,0.75
56
+ High school mathematics,Overall,0.58,0.46,0.55,0.64,0.11,0.51,0.52,0.52,0.49,0.55
57
+ High school mathematics,OK,0.5757575757575758,0.46464646464646464,0.5555555555555556,0.6464646464646465,0.1111111111111111,0.5151515151515151,0.5151515151515151,0.5151515151515151,0.494949494949495,0.5454545454545454
58
+ High school mathematics,Not OK,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
59
+ High school physics,Overall,0.74,0.72,0.59,0.69,0.57,0.61,0.55,0.55,0.55,0.64
60
+ High school physics,OK,0.7525773195876289,0.7319587628865979,0.5979381443298969,0.7010309278350515,0.5876288659793815,0.6185567010309279,0.5567010309278351,0.5567010309278351,0.5567010309278351,0.6494845360824743
61
+ High school physics,Not OK,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.0,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333
62
+ High school statistics,Overall,0.79,0.8,0.82,0.84,0.68,0.73,0.74,0.74,0.69,0.77
63
+ High school statistics,OK,0.8061224489795918,0.8163265306122449,0.8367346938775511,0.8571428571428571,0.6938775510204082,0.7448979591836735,0.7551020408163265,0.7551020408163265,0.7040816326530612,0.7857142857142857
64
+ High school statistics,Not OK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65
+ High school US history,Overall,0.94,0.95,0.93,0.92,0.91,0.95,0.94,0.94,0.92,0.91
66
+ High school US history,OK,0.94,0.95,0.93,0.92,0.91,0.95,0.94,0.94,0.92,0.91
67
+ High school US history,Not OK,-,-,-,-,-,-,-,-,-,-
68
+ Human aging,Overall,0.82,0.84,0.8,0.8,0.83,0.78,0.79,0.79,0.73,0.77
69
+ Human aging,OK,0.8735632183908046,0.8850574712643678,0.8390804597701149,0.8505747126436781,0.8850574712643678,0.8275862068965517,0.8505747126436781,0.8505747126436781,0.7816091954022989,0.8160919540229885
70
+ Human aging,Not OK,0.46153846153846156,0.5384615384615384,0.5384615384615384,0.46153846153846156,0.46153846153846156,0.46153846153846156,0.38461538461538464,0.38461538461538464,0.38461538461538464,0.46153846153846156
71
+ Logical fallacies,Overall,0.9,0.89,0.89,0.88,0.86,0.85,0.9,0.9,0.9,0.86
72
+ Logical fallacies,OK,0.9594594594594594,0.9864864864864865,0.9594594594594594,0.9594594594594594,0.9324324324324325,0.9594594594594594,0.9459459459459459,0.9459459459459459,0.9864864864864865,0.918918918918919
73
+ Logical fallacies,Not OK,0.7307692307692307,0.6153846153846154,0.6923076923076923,0.6538461538461539,0.6538461538461539,0.5384615384615384,0.7692307692307693,0.7692307692307693,0.6538461538461539,0.6923076923076923
74
+ Machine learning,Overall,0.74,0.78,0.76,0.69,0.72,0.71,0.63,0.63,0.67,0.56
75
+ Machine learning,OK,0.7640449438202247,0.797752808988764,0.7865168539325843,0.7078651685393258,0.7415730337078652,0.7303370786516854,0.651685393258427,0.651685393258427,0.6966292134831461,0.5842696629213483
76
+ Machine learning,Not OK,0.5454545454545454,0.6363636363636364,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.45454545454545453,0.45454545454545453,0.45454545454545453,0.36363636363636365
77
+ Miscellaneous,Overall,0.96,0.97,0.97,0.94,0.96,0.94,0.93,0.93,0.93,0.91
78
+ Miscellaneous,OK,0.9777777777777777,0.9777777777777777,0.9888888888888889,0.9666666666666667,0.9888888888888889,0.9666666666666667,0.9555555555555556,0.9555555555555556,0.9555555555555556,0.9333333333333333
79
+ Miscellaneous,Not OK,0.8,0.9,0.8,0.7,0.7,0.7,0.7,0.7,0.7,0.7
80
+ Philosophy,Overall,0.9,0.9,0.88,0.85,0.85,0.83,0.84,0.84,0.8,0.81
81
+ Philosophy,OK,0.9213483146067416,0.9325842696629213,0.9213483146067416,0.8651685393258427,0.8764044943820225,0.8539325842696629,0.8539325842696629,0.8539325842696629,0.8314606741573034,0.8426966292134831
82
+ Philosophy,Not OK,0.7272727272727273,0.6363636363636364,0.5454545454545454,0.7272727272727273,0.6363636363636364,0.6363636363636364,0.7272727272727273,0.7272727272727273,0.5454545454545454,0.5454545454545454
83
+ Professional accounting,Overall,0.82,0.76,0.72,0.65,0.66,0.67,0.71,0.71,0.63,0.64
84
+ Professional accounting,OK,0.8152173913043478,0.7717391304347826,0.7282608695652174,0.6413043478260869,0.7065217391304348,0.6739130434782609,0.7065217391304348,0.7065217391304348,0.6413043478260869,0.6304347826086957
85
+ Professional accounting,Not OK,0.875,0.625,0.625,0.75,0.125,0.625,0.75,0.75,0.5,0.75
86
+ Professional law,Overall,0.69,0.7,0.71,0.67,0.71,0.55,0.68,0.68,0.59,0.58
87
+ Professional law,OK,0.7195121951219512,0.6951219512195121,0.7439024390243902,0.6951219512195121,0.7317073170731707,0.5609756097560976,0.6463414634146342,0.6463414634146342,0.6585365853658537,0.573170731707317
88
+ Professional law,Not OK,0.5555555555555556,0.7222222222222222,0.5555555555555556,0.5555555555555556,0.6111111111111112,0.5,0.8333333333333334,0.8333333333333334,0.2777777777777778,0.6111111111111112
89
+ Public relations,Overall,0.83,0.83,0.76,0.77,0.82,0.76,0.8,0.8,0.76,0.8
90
+ Public relations,OK,0.8571428571428571,0.8681318681318682,0.8021978021978022,0.8021978021978022,0.8571428571428571,0.7912087912087912,0.8461538461538461,0.8461538461538461,0.7912087912087912,0.8351648351648352
91
+ Public relations,Not OK,0.5555555555555556,0.4444444444444444,0.3333333333333333,0.4444444444444444,0.4444444444444444,0.4444444444444444,0.3333333333333333,0.3333333333333333,0.4444444444444444,0.4444444444444444
92
+ Virology,Overall,0.54,0.56,0.56,0.55,0.56,0.55,0.56,0.56,0.56,0.53
93
+ Virology,OK,0.8837209302325582,0.9069767441860465,0.8837209302325582,0.8372093023255814,0.8837209302325582,0.9069767441860465,0.9302325581395349,0.9302325581395349,0.9069767441860465,0.8837209302325582
94
+ Virology,Not OK,0.2807017543859649,0.2982456140350877,0.3157894736842105,0.3333333333333333,0.3157894736842105,0.2807017543859649,0.2807017543859649,0.2807017543859649,0.2982456140350877,0.2631578947368421