MINGYISU committed on
Commit
d94b3a2
·
1 Parent(s): 6ae8aba

updated scores algorithm

Browse files
Files changed (1) hide show
  1. utils_v2.py +39 -32
utils_v2.py CHANGED
@@ -66,17 +66,9 @@ def load_data(base_dir=SCORE_BASE_DIR):
66
  all_data.append(data)
67
  return all_data
68
 
69
- def calculate_score(raw_scores=None):
70
- """This function calculates the overall average scores for all datasets as well as avg scores for each modality and sub-task based on the raw scores.
71
- Algorithm summary:
72
- """
73
- def get_avg(sum_score, leng):
74
- avg = sum_score / leng if leng > 0 else 0.0
75
- avg = round(avg, 2) # Round to 2 decimal places
76
- return avg
77
-
78
- avg_scores = {}
79
- overall_scores_summary = {} # Stores the scores sum and length for each modality and all datasets
80
  for modality, datasets_list in DATASETS.items(): # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
81
  overall_scores_summary[modality] = (0.0, 0) # Initialize the sum and count for each modality
82
  for sub_task, datasets in datasets_list.items(): # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
@@ -87,26 +79,42 @@ def calculate_score(raw_scores=None):
87
  metric = SPECIAL_METRICS.get(dataset, 'hit@1')
88
  if isinstance(score, dict):
89
  score = score.get(metric, 0.0)
90
- sub_task_sum_score += score
91
-
92
- sub_task_overall = get_avg(sub_task_sum_score, sub_task_datasets_len)
93
- avg_scores[sub_task] = sub_task_overall
94
-
95
- # Accumulate the scores sum and length for the each modality
96
- modality_sum_score, modality_datasets_len = overall_scores_summary[modality]
97
- modality_sum_score += sub_task_sum_score
98
- modality_datasets_len += sub_task_datasets_len
99
- overall_scores_summary[modality] = (modality_sum_score, modality_datasets_len)
100
-
101
- all_datasets_sum_score, all_datasets_len = 0.0, 0
102
- for modality, (modality_sum_score, modality_datasets_len) in overall_scores_summary.items():
103
- name = f"{modality.capitalize()}-Overall"
104
- avg_scores[name] = get_avg(modality_sum_score, modality_datasets_len)
105
- # Accumulate the scores sum and length for all datasets
106
- all_datasets_sum_score += modality_sum_score
107
- all_datasets_len += modality_datasets_len
108
- avg_scores['Overall'] = get_avg(all_datasets_sum_score, all_datasets_len)
109
- return avg_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def generate_model_row(data):
112
  metadata = data['metadata']
@@ -127,7 +135,6 @@ def get_df():
127
  df = df.sort_values(by='Overall', ascending=False).reset_index(drop=True)
128
  df['Rank'] = range(1, len(df) + 1)
129
  df = create_hyperlinked_names(df)
130
-
131
  return df
132
 
133
  def refresh_data():
 
66
  all_data.append(data)
67
  return all_data
68
 
69
+ def load_scores(raw_scores=None):
70
+ """This function loads the raw scores from the user provided scores summary and flattens them into a single dictionary."""
71
+ all_scores = {}
 
 
 
 
 
 
 
 
72
  for modality, datasets_list in DATASETS.items(): # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
73
  overall_scores_summary[modality] = (0.0, 0) # Initialize the sum and count for each modality
74
  for sub_task, datasets in datasets_list.items(): # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
 
79
  metric = SPECIAL_METRICS.get(dataset, 'hit@1')
80
  if isinstance(score, dict):
81
  score = score.get(metric, 0.0)
82
+ single_dataset_score = {'dataset': dataset, 'score': score}
83
+ all_scores.update(single_dataset_score)
84
+ return all_scores
85
+
86
def calculate_score(raw_scores=None):
    """Compute average scores from the flattened per-dataset raw scores.

    Adds three kinds of aggregate entries on top of the per-dataset scores
    returned by load_scores():
      - 'Overall': average across all known datasets,
      - '<Modality>-Overall': average per modality (e.g. 'Image-Overall'),
      - one entry per sub-task (e.g. 'I-CLS'): average over that sub-task's datasets.

    Args:
        raw_scores: user-provided scores summary, forwarded to load_scores().

    Returns:
        dict: the per-dataset scores merged with the computed averages.
    """
    def get_avg(sum_score, leng):
        # Round to 2 decimal places; 0.0 guards against an empty dataset list.
        avg = sum_score / leng if leng > 0 else 0.0
        return round(avg, 2)

    all_scores = load_scores(raw_scores)
    avg_scores = {}

    # Overall average across the known dataset list.
    # NOTE: filtering through ALL_DATASETS (instead of summing every value in
    # all_scores) keeps the numerator consistent with the len(ALL_DATASETS)
    # denominator even if raw_scores contains extra, unknown dataset entries;
    # missing datasets count as 0.0, matching the per-modality handling below.
    avg_scores['Overall'] = get_avg(
        sum(all_scores.get(dataset, 0.0) for dataset in ALL_DATASETS),
        len(ALL_DATASETS))

    # Average per modality (e.g. 'Image-Overall').
    for modality in MODALITIES:
        datasets_for_each_modality = ALL_DATASETS_SPLITS.get(modality, [])
        avg_scores[f"{modality.capitalize()}-Overall"] = get_avg(
            sum(all_scores.get(dataset, 0.0) for dataset in datasets_for_each_modality),
            len(datasets_for_each_modality)
        )

    # Average per sub-task; the modality key is not needed here, only the
    # sub-task -> dataset-list mapping.
    for datasets_list in DATASETS.values():
        for sub_task, datasets in datasets_list.items():
            sub_task_score = sum(all_scores.get(dataset, 0.0) for dataset in datasets)
            avg_scores[sub_task] = get_avg(sub_task_score, len(datasets))

    all_scores.update(avg_scores)
    return all_scores
118
 
119
  def generate_model_row(data):
120
  metadata = data['metadata']
 
135
  df = df.sort_values(by='Overall', ascending=False).reset_index(drop=True)
136
  df['Rank'] = range(1, len(df) + 1)
137
  df = create_hyperlinked_names(df)
 
138
  return df
139
 
140
  def refresh_data():