import gc

import numpy as np
import pandas as pd
import pandas.api.types
import sklearn.metrics

class ParticipantVisibleError(Exception):
    pass

def apk(actual, predicted, k=20):
    """
    Compute the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
        A list of elements that are to be predicted (order doesn't matter)
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The average precision at k over the input lists
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count each relevant item once, at the rank of its first occurrence.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)
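# Worked example (illustrative values, not competition data): for
# apk([1, 2], [1, 3, 2]), the hit at rank 1 contributes 1/1 and the hit at
# rank 3 contributes 2/3, so the result is (1 + 2/3) / 2 = 5/6 ≈ 0.833.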


def mapk(actual, predicted, k=20):
    """
    Compute the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
        A list of lists of elements that are to be predicted
        (order doesn't matter in the lists)
    predicted : list
        A list of lists of predicted elements
        (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The mean average precision at k over the input lists
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])
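# Worked example (illustrative values): with actual = [[1, 2], [3]] and
# predicted = [[1, 3, 2], [3]], the per-row apk scores are 5/6 and 1.0, so
# mapk returns their mean, (5/6 + 1) / 2 = 11/12 ≈ 0.917.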


def prepare(df, out_of_sample_column_name):
    # Split the space-delimited category string into a list and coerce the
    # out-of-sample indicator to float so solution and submission align.
    df['categories'] = df['categories'].str.split(' ')
    df[out_of_sample_column_name] = df[out_of_sample_column_name].astype(float)
    return df
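# For instance (illustrative row), a 'categories' value of '1 10 4' becomes
# the list ['1', '10', '4'], and an osd value of 0 or '0' becomes 0.0, so
# both files end up with comparable types after this call.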


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, out_of_sample_column_name: str = 'osd', k: int = 20) -> float:
    """Metric for the FathomNet 2023 FGVC competition (46149).

    Computes the average of a MAP@k and a normalized AUC on an "out-of-sample" indicator.

    Parameters
    ----------
    solution : DataFrame with columns having for each instance:
        - categories: a space-delimited string of integer categories
        - osd: a binary out-of-sample indicator
    submission : DataFrame with the same columns, where categories are listed
        most-confident first and osd is a float out-of-sample score
    row_id_column_name : str, the name of the column used to align rows
    out_of_sample_column_name : str, the name of the out-of-sample indicator
    k : int, the maximum number of predicted categories scored per instance

    Returns
    -------
    float, 0.5 * (normalized AUC + MAP@k)
    """
    if row_id_column_name not in submission.columns:
        raise ParticipantVisibleError('Submission file missing expected column ' + row_id_column_name)
    if row_id_column_name not in solution.columns:
        raise ParticipantVisibleError('Solution file missing expected column ' + row_id_column_name)
    # Align the two frames on the row ID column, then drop it.
    solution = solution.sort_values(by=[row_id_column_name])
    submission = submission.sort_values(by=[row_id_column_name])
    if not (solution[row_id_column_name].values == submission[row_id_column_name].values).all():
        raise ParticipantVisibleError('The solution and submission row IDs are not identical')
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    gc.collect()

    if out_of_sample_column_name is None:
        raise ParticipantVisibleError('out_of_sample_column_name cannot be None')
    missing_cols = solution.columns.difference(submission.columns)
    if len(missing_cols) > 0:
        raise ParticipantVisibleError('Submission file missing expected columns ' + ', '.join(missing_cols))

    solution, submission = prepare(solution, out_of_sample_column_name), prepare(submission, out_of_sample_column_name)

    # AUC on the out-of-sample indicator, rescaled from [0.5, 1] to [0, 1]
    # so that a random ranking contributes 0 rather than 0.5.
    oos_true = solution.pop(out_of_sample_column_name).to_numpy()
    oos_pred = submission.pop(out_of_sample_column_name).to_numpy()
    oos_score = sklearn.metrics.roc_auc_score(oos_true, oos_pred)
    normalized_oos_score = 2 * (oos_score - 0.5)

    # MAP@k on the remaining categories column.
    solution = solution.squeeze().to_list()
    submission = submission.squeeze().to_list()
    cat_score = mapk(solution, submission, k=k)
    results = 0.5 * (normalized_oos_score + cat_score)
    return results
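

if __name__ == '__main__':
    # A minimal smoke test of the full metric (a sketch, not part of the
    # competition harness): 'id' and every row value below are made up for
    # illustration, and the column names follow the defaults above.
    demo_solution = pd.DataFrame({
        'id': [0, 1, 2, 3],
        'categories': ['1 2', '3', '4 5', '6'],
        'osd': [0, 0, 1, 1],
    })
    demo_submission = pd.DataFrame({
        'id': [0, 1, 2, 3],
        'categories': ['1 2', '3 7', '8', '6'],
        'osd': [0.1, 0.2, 0.9, 0.8],
    })
    # The AUC is 1.0 (normalized to 1.0) and MAP@20 is 0.75, so this prints
    # 0.5 * (1.0 + 0.75) = 0.875.
    print(score(demo_solution, demo_submission, row_id_column_name='id'))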