alielfilali01 committed on
Commit
6141c4a
·
verified ·
1 Parent(s): 0243d7b

Create assets/results/results.json

Browse files
Files changed (1) hide show
  1. assets/results/results.json +295 -0
assets/results/results.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "claude-3.5-sonnet Scores": {
4
+ "3C3H Scores": {
5
+ "Correctness": 0.2239,
6
+ "Completeness": 0.2188,
7
+ "Conciseness": 0.047,
8
+ "Helpfulness": 0.1739,
9
+ "Honesty": 0.2098,
10
+ "Harmlessness": 0.1962,
11
+ "3C3H Score": 0.1783
12
+ },
13
+ "Tasks Scores": {
14
+ "Biology": 0.1725,
15
+ "Chemistry": 0.2331,
16
+ "Cybersecurity": 0.1333
17
+ }
18
+ },
19
+ "Meta": {
20
+ "Model Name": "meta-llama/Llama-3.3-70B-Instruct",
21
+ "License": "llama3.3",
22
+ "Revision": "main",
23
+ "Precision": "bfloat16",
24
+ "Params": 70.0,
25
+ "Total Entries": 600,
26
+ "Successful Entries": 585,
27
+ "Failed Entries": 15,
28
+ "Success Ratio": 0.975
29
+ }
30
+ },
31
+ {
32
+ "claude-3.5-sonnet Scores": {
33
+ "3C3H Scores": {
34
+ "Correctness": 0.1724,
35
+ "Completeness": 0.1672,
36
+ "Conciseness": 0.0526,
37
+ "Helpfulness": 0.1418,
38
+ "Honesty": 0.1608,
39
+ "Harmlessness": 0.1724,
40
+ "3C3H Score": 0.1445
41
+ },
42
+ "Tasks Scores": {
43
+ "Biology": 0.1896,
44
+ "Chemistry": 0.0856,
45
+ "Cybersecurity": 0.1528
46
+ }
47
+ },
48
+ "Meta": {
49
+ "Model Name": "claude-3-5-sonnet-20241022",
50
+ "License": "Proprietary",
51
+ "Revision": "UNK",
52
+ "Precision": "UNK",
53
+ "Params": "UNK",
54
+ "Total Entries": 600,
55
+ "Successful Entries": 580,
56
+ "Failed Entries": 20,
57
+ "Success Ratio": 0.9667
58
+ }
59
+ },
60
+ {
61
+ "claude-3.5-sonnet Scores": {
62
+ "3C3H Scores": {
63
+ "Correctness": 0.1588,
64
+ "Completeness": 0.1571,
65
+ "Conciseness": 0.0351,
66
+ "Helpfulness": 0.1296,
67
+ "Honesty": 0.152,
68
+ "Harmlessness": 0.1512,
69
+ "3C3H Score": 0.1306
70
+ },
71
+ "Tasks Scores": {
72
+ "Biology": 0.1644,
73
+ "Chemistry": 0.1278,
74
+ "Cybersecurity": 0.0996
75
+ }
76
+ },
77
+ "Meta": {
78
+ "Model Name": "claude-3-haiku-20240307",
79
+ "License": "Proprietary",
80
+ "Revision": "UNK",
81
+ "Precision": "UNK",
82
+ "Params": "UNK",
83
+ "Total Entries": 600,
84
+ "Successful Entries": 592,
85
+ "Failed Entries": 8,
86
+ "Success Ratio": 0.9867
87
+ }
88
+ },
89
+ {
90
+ "claude-3.5-sonnet Scores": {
91
+ "3C3H Scores": {
92
+ "Correctness": 0.1949,
93
+ "Completeness": 0.1915,
94
+ "Conciseness": 0.0427,
95
+ "Helpfulness": 0.1427,
96
+ "Honesty": 0.1863,
97
+ "Harmlessness": 0.1726,
98
+ "3C3H Score": 0.1551
99
+ },
100
+ "Tasks Scores": {
101
+ "Biology": 0.156,
102
+ "Chemistry": 0.196,
103
+ "Cybersecurity": 0.1162
104
+ }
105
+ },
106
+ "Meta": {
107
+ "Model Name": "gpt-4o-mini-2024-07-18",
108
+ "License": "Proprietary",
109
+ "Revision": "UNK",
110
+ "Precision": "UNK",
111
+ "Params": "UNK",
112
+ "Total Entries": 600,
113
+ "Successful Entries": 585,
114
+ "Failed Entries": 15,
115
+ "Success Ratio": 0.975
116
+ }
117
+ },
118
+ {
119
+ "claude-3.5-sonnet Scores": {
120
+ "3C3H Scores": {
121
+ "Correctness": 0.2338,
122
+ "Completeness": 0.2338,
123
+ "Conciseness": 0.0563,
124
+ "Helpfulness": 0.1762,
125
+ "Honesty": 0.2253,
126
+ "Harmlessness": 0.2099,
127
+ "3C3H Score": 0.1892
128
+ },
129
+ "Tasks Scores": {
130
+ "Biology": 0.1946,
131
+ "Chemistry": 0.2157,
132
+ "Cybersecurity": 0.1592
133
+ }
134
+ },
135
+ "Meta": {
136
+ "Model Name": "gpt-4o-2024-08-06",
137
+ "License": "Proprietary",
138
+ "Revision": "UNK",
139
+ "Precision": "UNK",
140
+ "Params": "UNK",
141
+ "Total Entries": 600,
142
+ "Successful Entries": 586,
143
+ "Failed Entries": 14,
144
+ "Success Ratio": 0.9767
145
+ }
146
+ },
147
+ {
148
+ "claude-3.5-sonnet Scores": {
149
+ "3C3H Scores": {
150
+ "Correctness": 0.1525,
151
+ "Completeness": 0.1508,
152
+ "Conciseness": 0.0055,
153
+ "Helpfulness": 0.086,
154
+ "Honesty": 0.1335,
155
+ "Harmlessness": 0.1335,
156
+ "3C3H Score": 0.1103
157
+ },
158
+ "Tasks Scores": {
159
+ "Biology": 0.1162,
160
+ "Chemistry": 0.1217,
161
+ "Cybersecurity": 0.0935
162
+ }
163
+ },
164
+ "Meta": {
165
+ "Model Name": "Qwen/QwQ-32B-Preview",
166
+ "License": "apache-2.0",
167
+ "Revision": "main",
168
+ "Precision": "bfloat16",
169
+ "Params": 32.0,
170
+ "Total Entries": 600,
171
+ "Successful Entries": 590,
172
+ "Failed Entries": 10,
173
+ "Success Ratio": 0.9833
174
+ }
175
+ },
176
+ {
177
+ "claude-3.5-sonnet Scores": {
178
+ "3C3H Scores": {
179
+ "Correctness": 0.2122,
180
+ "Completeness": 0.2105,
181
+ "Conciseness": 0.0297,
182
+ "Helpfulness": 0.1609,
183
+ "Honesty": 0.2016,
184
+ "Harmlessness": 0.1808,
185
+ "3C3H Score": 0.166
186
+ },
187
+ "Tasks Scores": {
188
+ "Biology": 0.155,
189
+ "Chemistry": 0.2092,
190
+ "Cybersecurity": 0.1357
191
+ }
192
+ },
193
+ "Meta": {
194
+ "Model Name": "Qwen/Qwen2.5-72B-Instruct",
195
+ "License": "qwen",
196
+ "Revision": "main",
197
+ "Precision": "bfloat16",
198
+ "Params": 72.0,
199
+ "Total Entries": 600,
200
+ "Successful Entries": 589,
201
+ "Failed Entries": 11,
202
+ "Success Ratio": 0.9817
203
+ }
204
+ },
205
+ {
206
+ "claude-3.5-sonnet Scores": {
207
+ "3C3H Scores": {
208
+ "Correctness": 0.2612,
209
+ "Completeness": 0.2595,
210
+ "Conciseness": 0.0391,
211
+ "Helpfulness": 0.2053,
212
+ "Honesty": 0.2457,
213
+ "Harmlessness": 0.2358,
214
+ "3C3H Score": 0.2078
215
+ },
216
+ "Tasks Scores": {
217
+ "Biology": 0.2092,
218
+ "Chemistry": 0.215,
219
+ "Cybersecurity": 0.1998
220
+ }
221
+ },
222
+ "Meta": {
223
+ "Model Name": "o1-mini-2024-09-12",
224
+ "License": "Proprietary",
225
+ "Revision": "UNK",
226
+ "Precision": "UNK",
227
+ "Params": "UNK",
228
+ "Total Entries": 600,
229
+ "Successful Entries": 582,
230
+ "Failed Entries": 18,
231
+ "Success Ratio": 0.97
232
+ }
233
+ },
234
+ {
235
+ "claude-3.5-sonnet Scores": {
236
+ "3C3H Scores": {
237
+ "Correctness": 0.3415,
238
+ "Completeness": 0.3398,
239
+ "Conciseness": 0.1033,
240
+ "Helpfulness": 0.2982,
241
+ "Honesty": 0.3371,
242
+ "Harmlessness": 0.3257,
243
+ "3C3H Score": 0.2909
244
+ },
245
+ "Tasks Scores": {
246
+ "Biology": 0.3188,
247
+ "Chemistry": 0.2204,
248
+ "Cybersecurity": 0.3244
249
+ }
250
+ },
251
+ "Meta": {
252
+ "Model Name": "o1-2024-12-17",
253
+ "License": "Proprietary",
254
+ "Revision": "UNK",
255
+ "Precision": "UNK",
256
+ "Params": "UNK",
257
+ "Total Entries": 600,
258
+ "Successful Entries": 571,
259
+ "Failed Entries": 29,
260
+ "Success Ratio": 0.9517
261
+ }
262
+ },
263
+ {
264
+ "claude-3.5-sonnet Scores": {
265
+ "3C3H Scores": {
266
+ "Correctness": 0.3281,
267
+ "Completeness": 0.3246,
268
+ "Conciseness": 0.0881,
269
+ "Helpfulness": 0.284,
270
+ "Honesty": 0.3229,
271
+ "Harmlessness": 0.3159,
272
+ "3C3H Score": 0.2773
273
+ },
274
+ "Tasks Scores": {
275
+ "Biology": 0.3141,
276
+ "Chemistry": 0.2055,
277
+ "Cybersecurity": 0.3031
278
+ }
279
+ },
280
+ "Meta": {
281
+ "Model Name": "o3-mini-2025-01-31",
282
+ "License": "Proprietary",
283
+ "Revision": "UNK",
284
+ "Precision": "UNK",
285
+ "Params": "UNK",
286
+ "Total Entries": 600,
287
+ "Successful Entries": 573,
288
+ "Failed Entries": 27,
289
+ "Success Ratio": 0.955
290
+ }
291
+ },
292
+ {
293
+ "_last_sync_timestamp": "2025-02-01T16:16:28.531194"
294
+ }
295
+ ]