File size: 1,731 Bytes
2290cee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b45db7b
2290cee
b45db7b
2290cee
b45db7b
2290cee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from glob import glob
from sklearn.metrics import accuracy_score
import os
import pandas as pd 

def get_merged_df(results_path, skip_samples = False):
  """Load every ``*.json`` (JSON Lines) file in a directory into one DataFrame.

  Files that contain any missing value are reported on stdout and left out
  of the merged result.

  Args:
    results_path: Directory that is searched (non-recursively) for
      ``*.json`` files.
    skip_samples: When true, files whose *file name* contains
      ``'sample_result'`` are ignored.

  Returns:
    The concatenation of all accepted files, in sorted path order. The
    original per-file row indices are kept, as ``pd.concat`` does by default.

  Raises:
    ValueError: If no file in ``results_path`` could be used.
  """
  frames = []
  # Sort so the row order of the merged frame does not depend on the
  # arbitrary order in which the file system lists the directory.
  for path in sorted(glob(os.path.join(results_path, "*.json"))):
    # Test the file name only: matching against the full path would skip
    # every file when the directory name itself contains 'sample_result'.
    if skip_samples and 'sample_result' in os.path.basename(path):
      continue
    df = pd.read_json(path, lines = True)
    if df.isna().values.any():
      print(f"Missing values in {path}")
      continue
    frames.append(df)

  if not frames:
    # pd.concat([]) would raise ValueError("No objects to concatenate");
    # keep the exception type but say which directory was empty.
    raise ValueError(f"No usable result files found in {results_path!r}")
  return pd.concat(frames)

def map_df(full_df):
  """Add numeric ``label`` and ``pred`` columns to ``full_df`` in place.

  ``label`` is derived from the ``ground_truth`` column and ``pred`` from the
  ``type`` column. Both kinds of fake map to 0 and ``'real'`` maps to 1.
  Values missing from a lookup table become NaN, as with ``Series.map``.

  Args:
    full_df: DataFrame that has ``ground_truth`` and ``type`` columns.

  Returns:
    The same DataFrame object, now carrying the two extra columns.
  """
  # (new column, source column, lookup table)
  encodings = (
    ('label', 'ground_truth', {'full fake': 0, 'half fake': 0, 'real': 1}),
    ('pred', 'type', {'fake': 0, 'real': 1}),
  )
  for target, source, table in encodings:
    full_df[target] = full_df[source].map(table)
  return full_df

def get_scores(df):
  """Build the accuracy table for the fake samples, split by duration.

  Only rows with ``label == 0`` are scored, so each accuracy is the share of
  those rows whose ``pred`` is also 0. Rows are grouped into duration
  buckets (the row labels suggest seconds; the unit is not checked here)
  plus an ``'Overall'`` row covering every ``label == 0`` row.

  Args:
    df: DataFrame with ``label``, ``pred`` and ``duration`` columns.

  Returns:
    DataFrame with the columns ``"Sample"`` (bucket name), ``"Num Samples"``
    (rows in the bucket) and ``"Accuracy"`` (rounded to 3 decimals; NaN for
    a bucket with no rows).
  """
  # (row label, inclusive lower bound, exclusive upper bound); None means
  # unbounded. Keeping labels and bounds in one table prevents the label
  # strings and the conditions from drifting apart.
  buckets = (
    ('Under 25s', None, 26),
    ('26s - 55s', 26, 56),
    ('56s - 125s', 56, 126),
    ('Overall', None, None),
  )

  is_fake = df['label'] == 0
  columns = []
  samples_tested = []
  acc_scores = []

  for name, low, high in buckets:
    mask = is_fake
    if low is not None:
      mask = mask & (df['duration'] >= low)
    if high is not None:
      mask = mask & (df['duration'] < high)
    sel_df = df[mask]

    columns.append(name)
    samples_tested.append(len(sel_df))
    if sel_df.empty:
      # No rows in this bucket: accuracy is undefined rather than an error.
      acc_scores.append(float('nan'))
    else:
      # Fraction of exact matches; a NaN prediction never equals the label
      # and is therefore counted as a miss.
      hits = sel_df['label'].to_numpy() == sel_df['pred'].to_numpy()
      acc_scores.append(round(float(hits.mean()), 3))

  lb = pd.DataFrame({"Sample": columns, "Num Samples": samples_tested, "Accuracy": acc_scores})
  return lb

def build_leaderboard(results_path = 'results'):
  """Run the full pipeline: load results, encode labels, score them.

  Args:
    results_path: Directory passed to ``get_merged_df``.

  Returns:
    Whatever ``get_scores`` returns for the merged and mapped results.
  """
  # load -> map -> score, written as one composed call
  return get_scores(map_df(get_merged_df(results_path)))