陈俊杰 committed on
Commit b781bf5 · 1 Parent(s): f51ed55
Files changed (1)
  1. app.py +76 -34
app.py CHANGED
@@ -265,10 +265,13 @@ elif page == "LeaderBoard":
         "Spearman (Non-Factoid QA)": [],
     }
 
+    TeamId = ["baseline1", "baseline2", "baseline3", "baseline4"]
+    Methods = ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o"]
+
     # teamId: unique team identifier
     DG = {
-        "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-        "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o"],
+        "TeamId": TeamId,
+        "Methods": Methods,
         "Accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
         "Kendall's Tau": [0.3243, 0.1739, 0.3042, 0.4167],
         "Spearman": [0.3505, 0.1857, 0.3264, 0.4512]
@@ -276,8 +279,8 @@ elif page == "LeaderBoard":
     df1 = pd.DataFrame(DG)
 
     TE = {
-        "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-        "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o"],
+        "TeamId": TeamId,
+        "Methods": Methods,
         "Accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
         "Kendall's Tau": [0.1281, 0.0635, 0.2716, 0.3864],
         "Spearman": [0.1352, 0.0667, 0.2867, 0.4157]
@@ -285,8 +288,8 @@ elif page == "LeaderBoard":
     df2 = pd.DataFrame(TE)
 
     SG = {
-        "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-        "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o"],
+        "TeamId": TeamId,
+        "Methods": Methods,
         "Accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
         "Kendall's Tau": [0.3957, 0.2688, 0.5092, 0.5001],
         "Spearman": [0.4188, 0.2817, 0.5403, 0.5405],
@@ -294,18 +297,42 @@ elif page == "LeaderBoard":
     df3 = pd.DataFrame(SG)
 
     NFQA = {
-        "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-        "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o"],
+        "TeamId": TeamId,
+        "Methods": Methods,
         "Accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
         "Kendall's Tau": [0.2332, 0.2389, 0.4440, 0.4235],
         "Spearman": [0.2443, 0.2492, 0.4630, 0.4511]
     }
     df4 = pd.DataFrame(NFQA)
 
-    # df = [df1, df2, df3, df4]
-    # for d in df:
-    #     for col in d.select_dtypes(include=['float64', 'int64']).columns:
-    #         d[col] = d[col].apply(lambda x: f"{x:.4f}")
+    OverAll = {
+        "TeamId": TeamId,
+        "Methods": Methods,
+        "Accuracy": [],
+        "Kendall's Tau": [],
+        "Spearman": []
+    }
+
+    data = [DG, NFQA, SG, TE]
+    task = ["Dialogue Generation", "Non-Factoid QA", "Summary Generation", "Text Expansion"]
+    metric = ["Accuracy", "Kendall's Tau", "Spearman"]
+
+    for m in metric:
+        # for each metric
+        metric_score = [0] * len(TeamId)
+        for j in range(len(TeamId)):
+            # for each team
+            for d in data:
+                metric_score[j] += d[m][j]
+        metric_score = [k / len(task) for k in metric_score]
+        OverAll[m] = metric_score
+
+    dfo = pd.DataFrame(OverAll)
+
+    df = [df1, df2, df3, df4, dfo]
+    for d in df:
+        for col in d.select_dtypes(include=['float64', 'int64']).columns:
+            d[col] = d[col].apply(lambda x: f"{x:.4f}")
 
     # # Create tabs
     # tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
@@ -325,30 +352,45 @@ elif page == "LeaderBoard":
     # with tab4:
     #     st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
     #     st.dataframe(df4, use_container_width=True)
-
-    data = [DG, NFQA, SG, TE]
-    task = ["Dialogue Generation", "Non-Factoid QA", "Summary Generation", "Text Expansion"]
-    metric = ["Accuracy", "Kendall's Tau", "Spearman"]
 
-    overall_total = [0] * len(df["TeamId"])
-    for i, d in enumerate(data):  # for each dataset
-        total = [0] * len(df["TeamId"])  # length = number of methods
-        for j in range(len(metric)):  # for each metric
-            index = f"{metric[j]} ({task[i]})"
-            df[index] = d[metric[j]]
-            for k in range(len(df["TeamId"])):
-                total[k] += d[metric[j]][k]
-        average_index = f"Average ({task[i]})"
-        df[average_index] = [k / len(metric) for k in total]
-        for k in range(len(df["TeamId"])):
-            overall_total[k] += df[average_index][k]
-
-    df["Average (all 4 datasets)"] = [k / len(task) for k in overall_total]
+    st.markdown("""<p class='main-text'>Overall</p>""", unsafe_allow_html=True)
+    st.dataframe(dfo, use_container_width=True)
+
+    st.markdown("""<p class='main-text'>Task: Dialogue Generation; Dataset: DailyDialog</p>""", unsafe_allow_html=True)
+    st.dataframe(df1, use_container_width=True)
+
+    st.markdown("""<p class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</p>""", unsafe_allow_html=True)
+    st.dataframe(df2, use_container_width=True)
+
+    st.markdown("""<p class='main-text'>Task: Summary Generation; Dataset: Xsum</p>""", unsafe_allow_html=True)
+    st.dataframe(df3, use_container_width=True)
+
+    st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
+    st.dataframe(df4, use_container_width=True)
+
+    # data = [DG, NFQA, SG, TE]
+    # task = ["Dialogue Generation", "Non-Factoid QA", "Summary Generation", "Text Expansion"]
+    # metric = ["Accuracy", "Kendall's Tau", "Spearman"]
+
+    # overall_total = [0] * len(df["TeamId"])
+    # for i, d in enumerate(data):  # for each dataset
+    #     total = [0] * len(df["TeamId"])  # length = number of methods
+    #     for j in range(len(metric)):  # for each metric
+    #         index = f"{metric[j]} ({task[i]})"
+    #         df[index] = d[metric[j]]
+    #         for k in range(len(df["TeamId"])):
+    #             total[k] += d[metric[j]][k]
+    #     average_index = f"Average ({task[i]})"
+    #     df[average_index] = [k / len(metric) for k in total]
+    #     for k in range(len(df["TeamId"])):
+    #         overall_total[k] += df[average_index][k]
+
+    # df["Average (all 4 datasets)"] = [k / len(task) for k in overall_total]
 
-    df = pd.DataFrame(df)
-    for col in df.select_dtypes(include=['float64', 'int64']).columns:
-        df[col] = df[col].apply(lambda x: f"{x:.4f}")
-    st.dataframe(df, use_container_width=True)
+    # df = pd.DataFrame(df)
+    # for col in df.select_dtypes(include=['float64', 'int64']).columns:
+    #     df[col] = df[col].apply(lambda x: f"{x:.4f}")
+    # st.dataframe(df, use_container_width=True)
 
     st.markdown("""
     🔗 To register for AEOLLM task, you can visit the following link and choose our AEOLLM task: [https://research.nii.ac.jp/ntcir/ntcir-18/howto.html](https://research.nii.ac.jp/ntcir/ntcir-18/howto.html).
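
Note on the new OverAll block: for every metric, each team's score is summed over the four task datasets (DG, NFQA, SG, TE) and divided by the number of tasks, so the overall leaderboard is a plain per-metric mean across tasks. A minimal standalone sketch of that computation, using the Accuracy rows copied from the diff above (the print loop is illustrative only and is not part of app.py):

    # Per-team mean of one metric over the four datasets, mirroring the
    # "for m in metric" loop added in this commit (Accuracy only, for brevity).
    TeamId = ["baseline1", "baseline2", "baseline3", "baseline4"]
    DG = {"Accuracy": [0.5806, 0.5483, 0.6001, 0.6472]}
    TE = {"Accuracy": [0.5107, 0.5050, 0.5461, 0.5581]}
    SG = {"Accuracy": [0.6504, 0.6014, 0.7162, 0.7441]}
    NFQA = {"Accuracy": [0.5935, 0.5817, 0.7000, 0.7203]}
    data = [DG, NFQA, SG, TE]

    overall = [sum(d["Accuracy"][j] for d in data) / len(data)
               for j in range(len(TeamId))]
    for team, score in zip(TeamId, overall):
        print(f"{team}: {score:.4f}")
    # e.g. baseline4 (gpt-4o): (0.6472 + 0.7203 + 0.7441 + 0.5581) / 4 ≈ 0.6674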
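
One design note on the formatting pass d[col] = d[col].apply(lambda x: f"{x:.4f}"): it rewrites every float column as strings, which pins the display to four decimals but means st.dataframe will sort those columns lexicographically rather than numerically. A possible alternative, shown here only as a sketch and not what this commit does, keeps the dtypes numeric and formats at display time via a pandas Styler (which st.dataframe accepts):

    import pandas as pd

    df1 = pd.DataFrame({"Accuracy": [0.5806, 0.5483, 0.6001, 0.6472]})
    styled = df1.style.format("{:.4f}")  # display-only; the column stays float64
    # st.dataframe(styled, use_container_width=True)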