# Theme and configuration settings for the Model Capability Leaderboard application

# Import task mapping
from src.utils.task_mapping import task_display_names

# Theme colors - using dark mode by default
dark_theme = {
    'bg_color': '#1a202c',
    'text_color': '#e2e8f0',
    'card_bg': '#2d3748',
    'primary': '#818cf8',
    'secondary': '#a78bfa',
    'border': '#4a5568',
    'hover': '#4a5568',
    'table_header': '#2d3748',
    'table_border': '#4a5568',
    'heading_color': '#e2e8f0',
    'gradient': 'linear-gradient(135deg, #818cf8 0%, #a78bfa 100%)',
    'warning_bg': '#7c2d12',
    'warning_border': '#f97316',
    'info_bg': '#1e3a8a',
    'info_border': '#3b82f6',
    'footer_color': '#a0aec0',
    'title_color': 'white',
    'subtitle_color': 'rgba(255, 255, 255, 0.9)',
    'footer_border': '#4a5568',
    'task_title': '#a5b4fc',
    'task_border': '#818cf8',
    # Table-specific colors for the custom table
    'table_bg': '#0a0a0a',
    'table_border_color': '#333',
    'table_header_bg': '#191919',
    'table_subheader_bg': '#141414',
    'table_average_column_bg': '#202020',
    'table_row_odd': '#0a0a0a',
    'table_row_even': '#111111',
    'table_hover_bg': '#1a1a1a',
    'positive_value_color': '#4ade80',
    'negative_value_color': '#f87171'
}
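
# Illustrative only: a theme dict like this is typically interpolated into a
# CSS string and injected into the page. Assuming a Streamlit front end
# (not imported by this module), that might look like:
#
#   import streamlit as st
#   st.markdown(
#       f"<style>body {{ background: {dark_theme['bg_color']}; "
#       f"color: {dark_theme['text_color']}; }}</style>",
#       unsafe_allow_html=True,
#   )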

# Application settings
app_config = {
    'title': 'MLRC-Bench Leaderboard',
    'description': 'Machine Learning Research Challenges Benchmark for AI Agents',
    'layout': 'wide',
    'initial_sidebar_state': 'collapsed'
}
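
# Illustrative only: these keys mirror the parameters of Streamlit's
# st.set_page_config(), which suggests a consumer (assumed, not part of
# this module) along the lines of:
#
#   import streamlit as st
#   st.set_page_config(
#       page_title=app_config['title'],
#       layout=app_config['layout'],
#       initial_sidebar_state=app_config['initial_sidebar_state'],
#   )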

# Metrics configuration
metrics_config = {
    "Relative Improvement to Human": {
        "file": "src/data/metrics/relative_improvement_to_human.json",
        "description": "Measures how much of the performance gap between baseline and human the agent has closed. Calculated as: (Agent performance - Baseline) / (Human - Baseline) × 100%.",
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 50,    # Approximate, adjust as needed
        "color_map": "RdYlGn"
    },
    "Absolute Improvement to Baseline": {
        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
        "description": "Measures the percentage improvement over the baseline performance. Calculated as: (Agent performance - Baseline) / Baseline × 100%.",
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 100,   # Approximate, adjust as needed
        "color_map": "RdYlGn"
    }
    # Future metrics can be added here
    # "Another Metric": {
    #     "file": "src/data/metrics/another_metric.json",
    #     "description": "Description of another metric",
    #     "min_value": 0,
    #     "max_value": 100,
    #     "color_map": "viridis"
    # }
}
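
# Minimal sketch of the two formulas described above. The helper names are
# hypothetical; nothing in this module calls them.
def relative_improvement_to_human(agent: float, baseline: float, human: float) -> float:
    """Percentage of the baseline-to-human gap closed by the agent."""
    return (agent - baseline) / (human - baseline) * 100


def absolute_improvement_to_baseline(agent: float, baseline: float) -> float:
    """Percentage improvement of the agent over the baseline."""
    return (agent - baseline) / baseline * 100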

# Model type categories
model_categories = {
    "MLAB (claude-3-5-sonnet-v2)": "Closed Source",
    "MLAB (gemini-exp-1206)": "Closed Source",
    "MLAB (o3-mini)": "Closed Source",
    "MLAB (gpt-4o)": "Closed Source",
    "MLAB (llama3-1-405b-instruct)": "Open Weights",
    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
    "Human": "Human",
    "Top Human in Competition": "Human"
    # More models would be added here as needed
}

# Task descriptions
tasks_info = {
    task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"): 
        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
    task_display_names.get("Llm Merging", "LLM Merging"): 
        "Assessing the capability to effectively merge knowledge from multiple language models.",
    task_display_names.get("Meta Learning", "Meta Learning"): 
        "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
    task_display_names.get("Product Recommendation", "Next Product Recommendation"): 
        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
    task_display_names.get("Machine Unlearning", "Machine Unlearning"): 
        "Evaluating how well models can 'unlearn' specific information when required.",
    task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"): 
        "Testing resilience against backdoor attacks and ability to recover from triggered behaviors."
}
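
# Illustrative only (hypothetical usage, Streamlit assumed): the blurbs above
# could be rendered with something like:
#
#   for task, blurb in tasks_info.items():
#       st.subheader(task)
#       st.write(blurb)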