Hannah committed on
Commit
cce0ed9
·
1 Parent(s): 30adea1
Files changed (7) hide show
  1. README copy.md +75 -0
  2. api.py +250 -0
  3. hf.svg +10 -0
  4. requirements.txt +30 -0
  5. run.py +280 -0
  6. run_all.py +68 -0
  7. style.css +44 -0
README copy.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LLM Leaderboard Demo
2
+
3
+ This demo showcases a modern LLM leaderboard application built with Gradio and FastAPI.
4
+
5
+ ## Features
6
+
7
+ - Displays a sortable, filterable table of LLM models and their performance
8
+ - Shows scores across multiple benchmarks
9
+ - External links to model documentation for both open and closed models
10
+ - Dynamic filtering by model performance categories
11
+ - Real-time data refresh functionality
12
+ - Last-updated timestamp display
13
+ - Automatic startup of both API and frontend
14
+
15
+ ## Installation
16
+
17
+ First, install the required dependencies:
18
+
19
+ ```bash
20
+ pip install -r requirements.txt
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ There are two ways to run the application:
26
+
27
+ ### Option 1: Using run.py (all-in-one)
28
+
29
+ ```bash
30
+ python run.py
31
+ ```
32
+
33
+ This starts the FastAPI backend as a child process and then launches the Gradio interface, all from a single command.
34
+
35
+ ### Option 2: Using run_all.py (separate processes)
36
+
37
+ ```bash
38
+ python run_all.py
39
+ ```
40
+
41
+ This runs the API server and Gradio interface in separate processes, making it easier to debug each component.
42
+
43
+ With either option, the application will be available at:
44
+ - Frontend: http://localhost:7860
45
+ - API: http://localhost:8000
46
+
47
+ ## Architecture
48
+
49
+ The application consists of two main components:
50
+
51
+ 1. **FastAPI Backend** (`api.py`):
52
+ - Provides API endpoints for leaderboard data
53
+ - Caches data to improve performance
54
+ - Handles data processing and filtering
55
+
56
+ 2. **Gradio Frontend** (`run.py`):
57
+ - Creates an interactive web interface
58
+ - Fetches data from the API
59
+ - Provides filtering and search capabilities
60
+
61
+ ## API Endpoints
62
+
63
+ - `GET /api/leaderboard` - Get the full leaderboard data
64
+ - `GET /api/leaderboard?refresh=true` - Force refresh data from source
65
+ - `GET /api/models` - Get a list of all model names
66
+ - `GET /api/model/{model_name}` - Get details for a specific model
67
+ - `GET /api/filters` - Get counts for different filter categories
68
+
69
+ ## Customization
70
+
71
+ You can modify the appearance by editing `style.css`, which `run.py` loads and passes to Gradio as custom CSS.
72
+
73
+ ## License
74
+
75
+ This project is licensed under the MIT License.
api.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Dict, List, Optional
4
+
5
+ import numpy as np
6
+ import requests
7
+ from fastapi import FastAPI, HTTPException, Query
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from pydantic import BaseModel
10
+
11
+
12
class LeaderboardModel(BaseModel):
    """Schema for a single leaderboard entry returned by the API."""

    # Model identifier; the sample data uses "org/name" style names.
    model_name: str
    # Category label; the sample data uses "open" and "closed".
    type: str
    # URL of the model's page, when one is known.
    model_link: Optional[str] = None
    # Mapping of benchmark key -> numeric score.
    scores: Dict[str, float]
    # CO2 cost figure, when available (units are not stated in this file).
    co2_cost: Optional[float] = None
18
+
19
class LeaderboardData(BaseModel):
    """Schema for the full leaderboard payload."""

    # Leaderboard entries, in the order they should be displayed.
    models: List[LeaderboardModel]
    # Timestamp string; the sample generator fills it with an ISO-8601 value.
    updated_at: str
22
+
23
app = FastAPI(
    title="LLM Leaderboard API",
    description="API for serving Open LLM Leaderboard data",
    version="1.0.0",
)

# Add CORS middleware to allow requests from your Gradio app.
# Credentials are disabled: a wildcard origin must not be combined with
# allow_credentials=True, and this API uses neither cookies nor auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For production, specify your exact frontend URL
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory copy of the most recently loaded leaderboard payload.
cached_data = None
# On-disk copy of the payload, relative to the process working directory.
cache_file = Path("leaderboard_cache.json")
41
+
42
def fetch_external_leaderboard_data(refresh: bool = False) -> Optional[Dict]:
    """
    Fetch leaderboard data from external sources like HuggingFace.

    Unless ``refresh`` is true, the in-memory cache is returned when it is
    non-empty, and otherwise the on-disk cache file is used when it exists
    and parses. If neither applies, each endpoint is tried in order and the
    first successful JSON response is cached in memory and on disk.

    Returns the decoded payload, or ``None`` when no endpoint succeeds.
    """
    global cached_data

    if not refresh:
        if cached_data:
            return cached_data

        if cache_file.exists():
            try:
                with open(cache_file, encoding="utf-8") as f:
                    cached_data = json.load(f)
                return cached_data
            except (OSError, ValueError) as e:
                # Unreadable or corrupt cache: fall back to fetching.
                print(f"Error reading leaderboard cache: {e}")

    # Try different endpoints that might contain leaderboard data
    endpoints = [
        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/raw/main/leaderboard_data.json",
        "https://huggingface.co/api/spaces/HuggingFaceH4/open_llm_leaderboard/api/get_results",
    ]

    for url in endpoints:
        # Handle failures per endpoint so one bad URL does not prevent the
        # next one from being tried.
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                continue
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Error fetching external leaderboard data: {e}")
            continue

        cached_data = data
        try:
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(data, f)
        except OSError as e:
            # The fetched data is still usable even if it cannot be persisted.
            print(f"Error writing leaderboard cache: {e}")
        return data

    # If all endpoints fail, return None
    return None
81
+
82
def generate_sample_data() -> Dict:
    """
    Generate sample leaderboard data when external data can't be fetched.

    Scores are pseudo-random but reproducible: a private generator seeded
    with 42 is used, so NumPy's global random state is left untouched.

    Returns a dict with ``"models"`` (entries sorted by descending average
    score) and ``"updated_at"`` (current local time in ISO format).
    """
    from datetime import datetime

    models = [
        {"model_name": "meta-llama/llama-3-70b-instruct", "type": "open"},
        {"model_name": "mistralai/Mistral-7B-Instruct-v0.3", "type": "open"},
        {"model_name": "google/gemma-7b-it", "type": "open"},
        {"model_name": "Qwen/Qwen2-7B-Instruct", "type": "open"},
        {"model_name": "anthropic/claude-3-opus", "type": "closed", "external_link": "https://www.anthropic.com/claude"},
        {"model_name": "OpenAI/gpt-4o", "type": "closed", "external_link": "https://openai.com/gpt-4"},
        {"model_name": "01-ai/Yi-1.5-34B-Chat", "type": "open"},
        {"model_name": "google/gemma-2b", "type": "open"},
        {"model_name": "microsoft/phi-3-mini-4k-instruct", "type": "open"},
        {"model_name": "microsoft/phi-3-mini-128k-instruct", "type": "open"},
        {"model_name": "stabilityai/stable-beluga-7b", "type": "open"},
        {"model_name": "togethercomputer/RedPajama-INCITE-7B-Instruct", "type": "open"},
        {"model_name": "databricks/dbrx-instruct", "type": "closed", "external_link": "https://www.databricks.com/product/machine-learning/large-language-models"},
        {"model_name": "mosaicml/mpt-7b-instruct", "type": "open"},
        {"model_name": "01-ai/Yi-1.5-9B-Chat", "type": "open"},
        {"model_name": "anthropic/claude-3-sonnet", "type": "closed", "external_link": "https://www.anthropic.com/claude"},
        {"model_name": "cohere/command-r-plus", "type": "closed", "external_link": "https://cohere.com/models/command-r-plus"},
        {"model_name": "meta-llama/llama-3-8b-instruct", "type": "open"},
    ]

    # Local legacy generator: same stream as seeding the global RNG with 42,
    # without the process-wide side effect.
    rng = np.random.RandomState(42)

    # (score key, low, high), listed in the order the values are drawn.
    score_ranges = [
        ("average", 40, 90),
        ("ifeval", 30, 90),
        ("bbhi", 40, 85),
        ("math", 20, 80),
        ("gpqa", 10, 70),
        ("mujb", 10, 70),
        ("mmlu", 40, 85),
    ]

    model_data = []
    for model_info in models:
        model_name = model_info["model_name"]
        external_link = model_info.get("external_link")

        # Generate random scores as plain Python floats.
        scores = {
            key: round(float(rng.uniform(low, high)), 2)
            for key, low, high in score_ranges
        }
        co2_cost = round(float(rng.uniform(1, 100)), 2)

        # Prefer an explicit external link; otherwise point at Hugging Face.
        if external_link:
            model_link = external_link
        elif "/" in model_name:
            model_link = f"https://huggingface.co/{model_name}"
        else:
            model_link = f"https://huggingface.co/models?search={model_name}"

        model_data.append({
            "model_name": model_name,
            "type": model_info["type"],
            "model_link": model_link,
            "scores": scores,
            "co2_cost": co2_cost,
        })

    # Sort by average score
    model_data.sort(key=lambda x: x["scores"]["average"], reverse=True)

    return {
        "models": model_data,
        "updated_at": datetime.now().isoformat(),
    }
161
+
162
@app.get("/")
def read_root():
    """Return a static welcome payload for the API root."""
    welcome = {"message": "Welcome to the LLM Leaderboard API"}
    return welcome
165
+
166
@app.get("/api/leaderboard", response_model=LeaderboardData)
def get_leaderboard(refresh: bool = Query(False, description="Force refresh data from source")):
    """
    Get the full leaderboard data.
    If refresh is True, force fetch from source instead of using cache.

    External data is validated against ``LeaderboardData`` before it is
    returned; if it does not fit the schema, generated sample data is
    served instead.
    """
    external_data = fetch_external_leaderboard_data(refresh=refresh)

    if external_data:
        # Validate here, inside the try. Returning the raw payload would
        # defer validation to FastAPI after this function has returned,
        # where the fallback below could never run.
        # This is a simplified example - a real transformation of the
        # external structure would go here.
        try:
            return LeaderboardData(**external_data)
        except (TypeError, ValueError) as e:
            # TypeError: payload is not a mapping; ValueError: schema mismatch.
            print(f"Error processing external data: {e}")

    # Fall back to sample data if external data can't be processed
    return generate_sample_data()
185
+
186
@app.get("/api/models", response_model=List[str])
def get_models():
    """List the name of every model on the leaderboard, in leaderboard order."""
    leaderboard = fetch_external_leaderboard_data()
    if not leaderboard:
        # Nothing external available: serve the generated sample set.
        leaderboard = generate_sample_data()

    names = []
    for entry in leaderboard["models"]:
        names.append(entry["model_name"])
    return names
191
+
192
# The ":path" convertor lets the parameter span "/" so that names such as
# "google/gemma-7b-it" reach this handler instead of producing a routing 404.
@app.get("/api/model/{model_name:path}", response_model=LeaderboardModel)
def get_model_details(model_name: str):
    """
    Get detailed information about a specific model.

    Raises HTTPException (404) when no entry has exactly this name.
    """
    data = fetch_external_leaderboard_data() or generate_sample_data()

    for model in data["models"]:
        if model["model_name"] == model_name:
            return model

    raise HTTPException(status_code=404, detail=f"Model {model_name} not found")
202
+
203
@app.get("/api/filters")
def get_filter_counts():
    """
    Get counts for different filter categories to display in the UI.
    This matches what's shown in the 'Quick Filters' section of the leaderboard.

    Score tiers are based on the "average" score: below 45, 45 to 55,
    55 to 65, and 65 or above. Entries without a "scores" field are not
    counted in any tier.
    """
    data = fetch_external_leaderboard_data() or generate_sample_data()

    # Same prefixes as the frontend's "Official Providers" filter, so the
    # count in the label equals the number of rows the filter selects.
    # This is just placeholder logic - adapt to your actual criteria.
    official_prefixes = ("meta/", "google/", "openai/", "microsoft/")

    edge_count = 0
    consumer_count = 0
    midrange_count = 0
    gpu_rich_count = 0
    official_count = 0

    for model in data["models"]:
        if "scores" in model:
            # A missing or null average is treated as 0.
            average = (model["scores"] or {}).get("average") or 0
            if average < 45:
                edge_count += 1  # Edge devices (typically small models)
            elif average < 55:
                consumer_count += 1  # Consumer (moderate size/performance)
            elif average < 65:
                midrange_count += 1  # Mid-range
            else:
                gpu_rich_count += 1  # GPU-rich (high-end models)

        model_name = model.get("model_name", "")
        if "/" not in model_name or model_name.startswith(official_prefixes):
            official_count += 1

    return {
        "edge_devices": edge_count,
        "consumers": consumer_count,
        "midrange": midrange_count,
        "gpu_rich": gpu_rich_count,
        "official_providers": official_count,
    }
247
+
248
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on every interface, port 8000.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)
hf.svg ADDED
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles>=22.0,<24.0
2
+ anyio>=3.0,<5.0
3
+ audioop-lts<1.0; python_version >= "3.13" #it provides support for 'audioop' module removed in latest python version used by pydub
4
+ fastapi>=0.95.0
5
+ ffmpy
6
+ groovy~=0.1
7
+ gradio>=5.0.0
8
+ httpx>=0.24.1
9
+ huggingface_hub>=0.28.1
10
+ Jinja2<4.0
11
+ markupsafe>=2.0,<4.0
12
+ numpy>=1.20.0
13
+ orjson~=3.0
14
+ packaging
15
+ pandas>=1.3.0
16
+ pillow>=8.0,<12.0
17
+ pydantic>=1.10.0
18
+ python-multipart>=0.0.18 # required for fastapi forms.
19
+ pydub
20
+ pyyaml>=5.0,<7.0
21
+ ruff>=0.9.3; sys.platform != 'emscripten' # needed here for custom component docs generation
22
+ safehttpx>=0.1.6,<0.2.0
23
+ semantic_version~=2.0
24
+ starlette>=0.40.0,<1.0; sys.platform != 'emscripten'
25
+ tomlkit>=0.12.0,<0.14.0
26
+ typer>=0.12,<1.0; sys.platform != 'emscripten'
27
+ typing_extensions~=4.0
28
+ urllib3~=2.0; sys.platform == 'emscripten' # urllib3 is used for Lite support. Version spec can be omitted because urllib3==2.1.0 is prebuilt for Pyodide and urllib>=2.2.0 supports Pyodide as well.
29
+ uvicorn>=0.21.0
30
+ requests>=2.28.0
run.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import requests
3
+ from urllib.parse import quote
4
+ import subprocess
5
+ import os
6
+ import time
7
+ import sys
8
+ from datetime import datetime
9
+
10
+ import gradio as gr
11
+
12
def style_dataframe(df):
    """
    Return a pandas Styler for the leaderboard frame.

    Score columns get a green horizontal gradient whose fill width is the
    cell's numeric value, clamped to 0-100 percent. The "Model" column is
    coloured as a link. Columns missing from ``df`` are skipped. An empty
    frame is returned unchanged (as a DataFrame, not a Styler).
    """
    if len(df) == 0:
        return df

    # Define the columns to highlight based on the screenshot
    highlight_cols = ["Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO"]
    present_cols = [col for col in highlight_cols if col in df.columns]

    fallback_css = 'background-color: #121212; color: white;'

    def highlight_green(val):
        # Extract numeric value from string (remove % / kg suffix if present)
        try:
            val_float = float(str(val).replace('%', '').replace(' kg', ''))
        except ValueError:
            return fallback_css
        if val_float != val_float:  # NaN has no meaningful fill width
            return fallback_css
        # Keep the gradient stop inside the cell.
        val_float = min(max(val_float, 0.0), 100.0)
        return f'background: linear-gradient(90deg, rgba(46, 125, 50, 0.5) {val_float}%, rgba(46, 125, 50, 0.1) {val_float}%); color: white;'

    styled = df.style

    # Styler.applymap was renamed to Styler.map in pandas 2.1; prefer the new
    # name when it exists and fall back for older pandas.
    elementwise = getattr(styled, "map", None) or styled.applymap
    if present_cols:
        styled = elementwise(highlight_green, subset=present_cols)

    if "Model" in df.columns:
        styled = styled.set_properties(
            subset=["Model"],
            **{'color': '#4da6ff'}
        )

    return styled
44
+
45
def increment_counter(counter):
    """Return ``counter`` advanced by one."""
    bumped = counter + 1
    return bumped
47
+
48
def handle_select(evt: gr.SelectData, counter):
    """Return the bumped counter plus the selection's index and value as text."""
    next_count = counter + 1
    return next_count, "{}".format(evt.index), "{}".format(evt.value)
50
+
51
def start_api_server():
    """
    Launch ``api.py`` as a child process and wait until it accepts connections.

    Instead of sleeping a fixed time, port 8000 (the port api.py binds) is
    polled for up to 15 seconds. The function stops waiting early if the
    child exits. The process handle is returned in every case; the caller
    is responsible for terminating it.
    """
    import socket

    api_process = subprocess.Popen(
        [sys.executable, "api.py"],
        cwd=os.path.dirname(os.path.abspath(__file__))
    )

    deadline = time.monotonic() + 15
    while time.monotonic() < deadline:
        if api_process.poll() is not None:
            print(f"API server exited early with code {api_process.returncode}")
            break
        try:
            with socket.create_connection(("127.0.0.1", 8000), timeout=0.5):
                break  # Something is listening: the server is ready.
        except OSError:
            time.sleep(0.2)
    else:
        print("API server did not become ready within 15 seconds")

    return api_process
59
+
60
def apply_filters(filters, models_data):
    """
    Return the models matched by at least one selected quick filter.

    ``filters`` holds the selected checkbox labels. Matching is by substring,
    so labels may carry a count suffix. Score tiers use the model's
    "average" score, with a missing or null score treated as 0. When no
    filter is selected, ``models_data`` is returned unchanged.
    """
    if not filters:
        # No filters selected, return all data
        return models_data

    official_prefixes = ("meta/", "google/", "openai/", "microsoft/")

    def matches(filter_option, average_score, model_name):
        # One rule per quick filter; the first hit wins.
        if "Edge Devices" in filter_option and average_score < 45:
            return True
        if "Consumers" in filter_option and 45 <= average_score < 55:
            return True
        if "Mid-range" in filter_option and 55 <= average_score < 65:
            return True
        if "GPU-rich" in filter_option and average_score >= 65:
            return True
        if "Official Providers" in filter_option:
            return "/" not in model_name or model_name.startswith(official_prefixes)
        return False

    filtered_models = []
    for model in models_data:
        scores = model.get("scores") or {}
        average_score = scores.get("average") or 0
        model_name = model.get("model_name", "")

        if any(matches(option, average_score, model_name) for option in filters):
            filtered_models.append(model)

    return filtered_models
87
+
88
def format_timestamp(timestamp_str):
    """
    Format an ISO-8601 timestamp for display, e.g. "January 15, 2024 at 02:30 PM".

    A trailing "Z" is accepted as UTC. If the value is not a string or
    cannot be parsed, the input is returned unchanged.
    """
    if not isinstance(timestamp_str, str):
        return timestamp_str

    text = timestamp_str.strip()
    # datetime.fromisoformat only understands the "Z" suffix from Python 3.11.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"

    try:
        dt = datetime.fromisoformat(text)
    except ValueError:
        # Fallback for fractional seconds that fromisoformat rejects on
        # older Python versions.
        try:
            dt = datetime.strptime(text, "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError as e:
            print(f"Error formatting timestamp: {e}")
            return timestamp_str

    # Format nicely for display
    return dt.strftime("%B %d, %Y at %I:%M %p")
104
+
105
def create_leaderboard_data(selected_filters=None):
    """
    Fetch the leaderboard from the local API and build the display table.

    Args:
        selected_filters: optional list of quick-filter labels; when given,
            only matching models are shown, and ranks are positions within
            the filtered list.

    Returns:
        ``(table, updated_text)``. ``table`` is a styled frame on success,
        or an empty DataFrame with the expected columns when the API call
        fails. ``updated_text`` is the formatted timestamp, or "Unknown".
    """
    columns = ["Rank", "Type", "Model", "Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO", "CO_Cost"]
    score_keys = ["average", "ifeval", "bbhi", "math", "gpqa", "mujb", "mmlu"]

    try:
        # Generous timeout: the API may itself be fetching from upstream.
        response = requests.get("http://localhost:8000/api/leaderboard", timeout=30)
        if response.status_code != 200:
            return pd.DataFrame(columns=columns), "Unknown"

        data = response.json()
        models_data = data.get("models", [])
        formatted_time = format_timestamp(data.get("updated_at", "Unknown"))

        # Apply filters if any are selected
        if selected_filters:
            models_data = apply_filters(selected_filters, models_data)

        rows = []
        for rank, model in enumerate(models_data, 1):
            model_name = model["model_name"]
            model_type = model["type"]
            scores = model["scores"]
            co2_cost = model.get("co2_cost")

            # Only use green for open and red for closed
            emoji = "🟢" if model_type.lower() == "open" else "🔴"
            type_with_emoji = f"{emoji} {model_type.upper()}"

            # Use model_link from API if available, otherwise create one
            if model.get("model_link"):
                model_link = f"[{model_name}]({model['model_link']})"
            elif "/" in model_name:
                model_link = f"[{model_name}](https://huggingface.co/{quote(model_name)})"
            else:
                model_link = f"[{model_name}](https://huggingface.co/models?search={quote(model_name)})"

            # A missing or null score is shown as 0.00 rather than aborting
            # the whole table.
            score_cells = [f"{(scores.get(key) or 0):.2f}" for key in score_keys]

            rows.append([
                rank,
                type_with_emoji,
                model_link,
                *score_cells,
                "N/A" if co2_cost is None else f"{co2_cost}",
            ])

        df = pd.DataFrame(rows, columns=columns)
        return style_dataframe(df), formatted_time
    except Exception as e:
        # UI boundary: any failure degrades to an empty table instead of
        # breaking the page.
        print(f"Error fetching leaderboard data: {e}")
        return pd.DataFrame(columns=columns), "Unknown"
165
+
166
def load_svg(file_path="hf.svg"):
    """
    Return the text content of an SVG file.

    A relative ``file_path`` is looked up next to this module first, as
    ``load_css`` does, and then relative to the current working directory.
    An absolute path is used as given.

    Raises OSError if the file cannot be opened.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    candidate = os.path.join(module_dir, file_path)
    svg_path = candidate if os.path.exists(candidate) else file_path
    with open(svg_path, encoding="utf-8") as f:
        return f.read()
170
+
171
def get_filter_data():
    """
    Build the quick-filter checkbox labels, each suffixed with its count.

    Counts come from the local API's ``/api/filters`` endpoint. If the call
    fails or returns something unexpected, every count is shown as 0.
    """
    # (label shown in the UI, key in the API response)
    filter_labels = [
        ("For Edge Devices", "edge_devices"),
        ("For Consumers", "consumers"),
        ("Mid-range", "midrange"),
        ("For the GPU-rich", "gpu_rich"),
        ("Only Official Providers", "official_providers"),
    ]

    counts = {}
    try:
        response = requests.get("http://localhost:8000/api/filters", timeout=30)
        if response.status_code == 200:
            payload = response.json()
            if isinstance(payload, dict):
                counts = payload
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching filter data: {e}")

    return [f"{label} · {counts.get(key, 0)}" for label, key in filter_labels]
200
+
201
def refresh_leaderboard(selected_filters=None):
    """
    Force the API to refresh its data, then rebuild table, filters and status.

    Returns three values for the table, the filter checkbox group and the
    status text. The checkbox group gets new *choices* (labels with
    refreshed counts). Selections are carried over by label name, because
    the count suffix may change. If the refresh request fails, the table and
    filters are left untouched.
    """
    try:
        # Request a refresh from the API
        requests.get("http://localhost:8000/api/leaderboard?refresh=true", timeout=60)
    except requests.RequestException as e:
        print(f"Error refreshing data: {e}")
        return gr.update(), gr.update(), "Error refreshing data"

    def label_of(choice):
        # "For Consumers · 3" -> "For Consumers"
        return choice.split(" · ")[0]

    filter_choices = get_filter_data()
    selected_labels = {label_of(choice) for choice in (selected_filters or [])}
    still_selected = [c for c in filter_choices if label_of(c) in selected_labels]

    # Get updated data
    df, timestamp = create_leaderboard_data(still_selected)
    return (
        df,
        gr.update(choices=filter_choices, value=still_selected),
        f"Last updated: {timestamp}",
    )
212
+
213
def update_table(filters):
    """Rebuild the table for the selected filters and return it with the status line."""
    table, stamp = create_leaderboard_data(filters)
    status = f"Last updated: {stamp}"
    return table, status
216
+
217
def load_css(file_path="style.css"):
    """
    Return the stylesheet text used for the Gradio app.

    ``file_path`` is resolved relative to this module's directory; an
    absolute path is used as given. If the file cannot be read, a minimal
    built-in stylesheet is returned instead.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    css_path = os.path.join(current_dir, file_path)
    try:
        with open(css_path, encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error loading CSS file: {e}")
        # Return a basic CSS if file not found
        return """
    .dataframe-container {
        border-radius: 8px;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    }
    """
233
+
234
+ # Builds the Gradio UI at import time: header, quick filters, table, refresh button.
+ # NOTE(review): create_leaderboard_data() and get_filter_data() below run when this
+ # module is imported, but `python run.py` only starts the API in the __main__ guard
+ # afterwards - the initial table/counts are presumably empty in that mode; confirm.
+ with gr.Blocks(css=load_css()) as demo:
235
+ df, timestamp = create_leaderboard_data()
236
+
237
+ with gr.Row():
238
+ svg_content = load_svg()
239
+ gr.HTML(svg_content)
240
+ gr.HTML("""
241
+ <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
242
+ <div class="leaderboard-title">Open LLM Leaderboard</div>
243
+ </div>
244
+ <div class="leaderboard-subtitle">Comparing Large Language Models in an open and reproducible way</div>
245
+ """)
246
+ status_text = gr.HTML(f"""<div style="text-align: center; margin-bottom: 10px;">Last updated: {timestamp}</div>""")
247
+
248
+ with gr.Row(elem_classes="filters-container"):
249
+ filter_choices = get_filter_data()
250
+ filters = gr.CheckboxGroup(
251
+ label="Quick Filters",
252
+ choices=filter_choices,
253
+ )
254
+
255
+ # Create and display the dataframe
256
+
257
+ leaderboard_table = gr.Dataframe(
258
+ value=df,
259
+ headers=["Rank", "Type", "Model", "Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO", "CO_Cost"],
260
+ datatype=["number", "str", "markdown", "str", "str", "str", "str", "str", "str", "str", "str"],
261
+ elem_id="leaderboard-table",
262
+ elem_classes="dataframe-container",
263
+ interactive=False,
264
+ max_height=600,
265
+ show_search="search",
266
+ show_copy_button=True,
267
+ show_fullscreen_button=True,
268
+ pinned_columns=2,
269
+ column_widths=["5%", "10%", "35%", "7%", "7%", "7%", "7%", "7%", "7%", "7%", "6%"]
270
+ )
271
+
272
+ # Event wiring: the button refreshes table, filters and status; changing the
+ # filter selection rebuilds the table and status.
+ refresh_btn = gr.Button("Refresh Data", elem_classes="refresh-btn")
273
+ refresh_btn.click(refresh_leaderboard, inputs=[filters], outputs=[leaderboard_table, filters, status_text])
274
+
275
+ filters.change(update_table, inputs=[filters], outputs=[leaderboard_table, status_text])
276
+
277
if __name__ == "__main__":
    # Start the API as a child process, run the UI, and always stop the
    # child afterwards - even when launch() raises or is interrupted.
    api_process = start_api_server()
    try:
        demo.launch()
    finally:
        api_process.terminate()
        try:
            api_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # The child ignored the terminate request; force it down.
            api_process.kill()
run_all.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import time
6
+ import signal
7
+ import atexit
8
+
9
def start_api_server():
    """Spawn ``api.py`` with the current interpreter and return the process handle."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    command = [sys.executable, "api.py"]
    api_process = subprocess.Popen(command, cwd=script_dir)
    print("API server started (PID:", api_process.pid, ")")
    return api_process
16
+
17
def start_gradio_app():
    """Spawn a Python child that imports ``run`` and launches its Gradio demo."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    launch_code = "import gradio as gr; import run; run.demo.launch()"
    gradio_process = subprocess.Popen(
        [sys.executable, "-c", launch_code],
        cwd=script_dir,
    )
    print("Gradio interface started (PID:", gradio_process.pid, ")")
    return gradio_process
24
+
25
def cleanup_processes(api_process, gradio_process):
    """
    Stop both child processes and wait until they have actually exited.

    Each running process is asked to terminate and given five seconds to
    exit, after which it is killed. Arguments that are ``None`` or have
    already exited are skipped, so calling this more than once is harmless.
    """
    print("\nShutting down services...")

    services = (
        (api_process, "API server"),
        (gradio_process, "Gradio interface"),
    )
    for process, label in services:
        if process is None or process.poll() is not None:
            continue

        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
        print(f"{label} terminated")
35
+
36
def main():
    """
    Run the API server and the Gradio app as child processes until stopped.

    Returns when either child exits; SIGINT or SIGTERM exits the process.
    In both cases the children are shut down once, by the registered
    ``atexit`` handler.
    """
    api_process = start_api_server()
    # Give the API server a moment to start
    time.sleep(2)

    try:
        gradio_process = start_gradio_app()
    except Exception:
        # Do not leave the API running if the frontend cannot be started.
        cleanup_processes(api_process, None)
        raise

    # Register cleanup function to be called on exit
    atexit.register(cleanup_processes, api_process, gradio_process)

    # Handle termination signals: exiting triggers the atexit cleanup, so
    # the handler does not call cleanup itself (that would run it twice).
    def signal_handler(sig, frame):
        print("\nReceived termination signal")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    print("\nLeaderboard application started!")
    print("- API server running at http://localhost:8000")
    print("- Gradio interface running at http://localhost:7860")
    print("\nPress Ctrl+C to stop all services")

    # Keep the main process running while both services are alive
    try:
        while api_process.poll() is None and gradio_process.poll() is None:
            time.sleep(1)
        print("\nA service exited; stopping the application")
    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    main()
style.css ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Centre the content of HTML blocks across the full width. */
.html-container {
    display: flex;
    justify-content: center;
    width: 100%;
    text-align: center;
}

/* Vertical spacing around the leaderboard table. */
.dataframe-container {
    margin-bottom: 0.5rem;
    margin-top: 0.5rem;
}

/* Main heading. */
.leaderboard-title {
    color: #f0f0f0;
    font-size: 1.5rem;
    font-weight: bold;
    margin-bottom: 0.25rem;
}

/* Tagline under the heading. */
.leaderboard-subtitle {
    color: #a0a0a0;
    font-size: 0.9rem;
    margin-bottom: 1rem;
}

/* Lay the quick-filter checkboxes out in one centred row. */
.filters-container fieldset {
    display: flex;
    flex-direction: row;
    align-items: center;
    justify-content: center;
    gap: 0.5rem;
}

/* Space above the refresh button. */
.refresh-btn {
    margin-top: 0.5rem;
}

/* Small, right-aligned status text. */
.status-container {
    display: flex;
    justify-content: flex-end;
    color: #a0a0a0;
    font-size: 0.75rem;
}
44
+