Spaces:
Running
Running
Hannah
committed on
Commit
·
cce0ed9
1
Parent(s):
30adea1
initial
Browse files- README copy.md +75 -0
- api.py +250 -0
- hf.svg +10 -0
- requirements.txt +30 -0
- run.py +280 -0
- run_all.py +68 -0
- style.css +44 -0
README copy.md
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LLM Leaderboard Demo

This demo showcases a modern LLM leaderboard application built with Gradio and FastAPI.

## Features

- Displays a sortable, filterable table of LLM models and their performance
- Shows scores across multiple benchmarks
- External links to model documentation for both open and closed models
- Dynamic filtering by model performance categories
- Real-time data refresh functionality
- Last-updated timestamp display
- Automatic startup of both API and frontend

## Installation

First, install the required dependencies:

```bash
pip install -r requirements.txt
```

## Usage

There are two ways to run the application:

### Option 1: Using run.py (all-in-one)

```bash
python run.py
```

This will start both the FastAPI backend and Gradio interface in a single process.

### Option 2: Using run_all.py (separate processes)

```bash
python run_all.py
```

This runs the API server and Gradio interface in separate processes, making it easier to debug each component.

With either option, the application will be available at:
- Frontend: http://localhost:7860
- API: http://localhost:8000

## Architecture

The application consists of two main components:

1. **FastAPI Backend** (`api.py`):
   - Provides API endpoints for leaderboard data
   - Caches data to improve performance
   - Handles data processing and filtering

2. **Gradio Frontend** (`run.py`):
   - Creates an interactive web interface
   - Fetches data from the API
   - Provides filtering and search capabilities

## API Endpoints

- `GET /api/leaderboard` - Get the full leaderboard data
- `GET /api/leaderboard?refresh=true` - Force refresh data from source
- `GET /api/models` - Get a list of all model names
- `GET /api/model/{model_name}` - Get details for a specific model
- `GET /api/filters` - Get counts for different filter categories

## Customization

You can modify the appearance by editing the CSS in `run.py`. The application uses Gradio's theme system for styling.

## License

This project is licensed under the MIT License.
api.py
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Dict, List, Optional
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import requests
|
7 |
+
from fastapi import FastAPI, HTTPException, Query
|
8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
9 |
+
from pydantic import BaseModel
|
10 |
+
|
11 |
+
|
12 |
+
class LeaderboardModel(BaseModel):
    """One leaderboard entry: a model identifier plus its benchmark scores."""

    model_name: str  # e.g. "meta-llama/llama-3-70b-instruct" (org/name form in sample data)
    type: str  # "open" or "closed" in the sample data — TODO confirm for external sources
    model_link: Optional[str] = None  # URL to the model card or vendor page
    scores: Dict[str, float]  # benchmark key -> score, e.g. "average", "mmlu", "gpqa"
    co2_cost: Optional[float] = None  # CO2 cost figure; units not specified in this file
|
18 |
+
|
19 |
+
class LeaderboardData(BaseModel):
    """Response payload for /api/leaderboard: all models plus a refresh timestamp."""

    models: List[LeaderboardModel]  # entries sorted by descending average score (see generate_sample_data)
    updated_at: str  # ISO-8601 timestamp string of when the data was produced
|
22 |
+
|
23 |
+
# FastAPI application exposing the leaderboard endpoints defined below.
app = FastAPI(
    title="LLM Leaderboard API",
    description="API for serving Open LLM Leaderboard data",
    version="1.0.0"
)

# Add CORS middleware to allow requests from your Gradio app
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For production, specify your exact frontend URL
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Cache for leaderboard data: `cached_data` is the in-process copy,
# `cache_file` is the on-disk JSON fallback written after a successful fetch.
cached_data = None
cache_file = Path("leaderboard_cache.json")
|
41 |
+
|
42 |
+
def fetch_external_leaderboard_data(refresh: bool = False) -> Optional[Dict]:
    """
    Fetch leaderboard data from external sources like HuggingFace.

    Resolution order: in-memory cache, then the on-disk JSON cache, then the
    remote endpoints. Both caches are skipped when ``refresh`` is True.

    Args:
        refresh: When True, bypass both caches and hit the network.

    Returns:
        The parsed JSON payload, or None when every source fails.
    """
    global cached_data

    # 1) In-memory cache (fastest path).
    if not refresh and cached_data:
        return cached_data

    # 2) On-disk cache left by a previous successful fetch.
    if not refresh and cache_file.exists():
        try:
            with open(cache_file) as f:
                cached_data = json.load(f)
            return cached_data
        except (OSError, json.JSONDecodeError):
            pass  # Fall back to fetching if cache read fails

    # 3) Try different endpoints that might contain leaderboard data.
    endpoints = [
        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/raw/main/leaderboard_data.json",
        "https://huggingface.co/api/spaces/HuggingFaceH4/open_llm_leaderboard/api/get_results",
    ]
    for url in endpoints:
        try:
            # Timeout keeps a dead endpoint from hanging the request forever.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                data = response.json()
                cached_data = data
                with open(cache_file, "w") as f:
                    json.dump(data, f)
                return data
        except Exception as e:
            # One failing endpoint must not prevent trying the next one.
            print(f"Error fetching external leaderboard data: {e}")

    # If all endpoints fail, return None
    return None
|
81 |
+
|
82 |
+
def generate_sample_data() -> Dict:
    """
    Generate sample leaderboard data when external data can't be fetched.

    Returns a dict with "models" (sorted by descending average score) and an
    ISO-8601 "updated_at" timestamp. Scores are random but reproducible
    (fixed seed) — they are placeholders, not real benchmark results.
    """
    from datetime import datetime  # hoisted from mid-function to the top of the body

    # Closed models carry an explicit external_link; open models get a HF link below.
    models = [
        {"model_name": "meta-llama/llama-3-70b-instruct", "type": "open"},
        {"model_name": "mistralai/Mistral-7B-Instruct-v0.3", "type": "open"},
        {"model_name": "google/gemma-7b-it", "type": "open"},
        {"model_name": "Qwen/Qwen2-7B-Instruct", "type": "open"},
        {"model_name": "anthropic/claude-3-opus", "type": "closed", "external_link": "https://www.anthropic.com/claude"},
        {"model_name": "OpenAI/gpt-4o", "type": "closed", "external_link": "https://openai.com/gpt-4"},
        {"model_name": "01-ai/Yi-1.5-34B-Chat", "type": "open"},
        {"model_name": "google/gemma-2b", "type": "open"},
        {"model_name": "microsoft/phi-3-mini-4k-instruct", "type": "open"},
        {"model_name": "microsoft/phi-3-mini-128k-instruct", "type": "open"},
        {"model_name": "stabilityai/stable-beluga-7b", "type": "open"},
        {"model_name": "togethercomputer/RedPajama-INCITE-7B-Instruct", "type": "open"},
        {"model_name": "databricks/dbrx-instruct", "type": "closed", "external_link": "https://www.databricks.com/product/machine-learning/large-language-models"},
        {"model_name": "mosaicml/mpt-7b-instruct", "type": "open"},
        {"model_name": "01-ai/Yi-1.5-9B-Chat", "type": "open"},
        {"model_name": "anthropic/claude-3-sonnet", "type": "closed", "external_link": "https://www.anthropic.com/claude"},
        {"model_name": "cohere/command-r-plus", "type": "closed", "external_link": "https://cohere.com/models/command-r-plus"},
        {"model_name": "meta-llama/llama-3-8b-instruct", "type": "open"}
    ]

    np.random.seed(42)  # For reproducibility (NOTE: mutates the global NumPy RNG state)

    model_data = []
    for model_info in models:
        model_name = model_info["model_name"]
        model_type = model_info["type"]
        external_link = model_info.get("external_link", None)

        # Generate random scores
        average = round(np.random.uniform(40, 90), 2)
        ifeval = round(np.random.uniform(30, 90), 2)
        bbhi = round(np.random.uniform(40, 85), 2)
        math = round(np.random.uniform(20, 80), 2)
        gpqa = round(np.random.uniform(10, 70), 2)
        mujb = round(np.random.uniform(10, 70), 2)
        mmlu = round(np.random.uniform(40, 85), 2)
        co2_cost = round(np.random.uniform(1, 100), 2)

        # If it's an open model, it should have a link to Hugging Face
        model_link = None
        if external_link:
            model_link = external_link
        elif "/" in model_name:
            model_link = f"https://huggingface.co/{model_name}"
        else:
            model_link = f"https://huggingface.co/models?search={model_name}"

        model_data.append({
            "model_name": model_name,
            "type": model_type,
            "model_link": model_link,
            "scores": {
                "average": average,
                "ifeval": ifeval,
                "bbhi": bbhi,
                "math": math,
                "gpqa": gpqa,
                "mujb": mujb,
                "mmlu": mmlu
            },
            "co2_cost": co2_cost
        })

    # Sort by average score
    model_data.sort(key=lambda x: x["scores"]["average"], reverse=True)

    # Create the final data structure
    leaderboard_data = {
        "models": model_data,
        "updated_at": datetime.now().isoformat()
    }

    return leaderboard_data
|
161 |
+
|
162 |
+
@app.get("/")
def read_root():
    """Landing endpoint confirming the API is up."""
    welcome = {"message": "Welcome to the LLM Leaderboard API"}
    return welcome
|
165 |
+
|
166 |
+
@app.get("/api/leaderboard", response_model=LeaderboardData)
def get_leaderboard(refresh: bool = Query(False, description="Force refresh data from source")):
    """
    Return the full leaderboard.

    When ``refresh`` is True, the external source is re-fetched instead of
    using the cache. Falls back to deterministic sample data whenever the
    external payload is unavailable or cannot be processed.
    """
    external_data = fetch_external_leaderboard_data(refresh=refresh)

    if not external_data:
        # Nothing external available — serve the generated sample data.
        return generate_sample_data()

    try:
        # Placeholder: transform the external payload into the
        # LeaderboardData shape here before returning it.
        return external_data
    except Exception as e:
        print(f"Error processing external data: {e}")

    # Fall back to sample data if external data can't be processed.
    return generate_sample_data()
|
185 |
+
|
186 |
+
@app.get("/api/models", response_model=List[str])
def get_models():
    """Return the names of every model currently on the leaderboard."""
    data = fetch_external_leaderboard_data() or generate_sample_data()
    names = []
    for entry in data["models"]:
        names.append(entry["model_name"])
    return names
|
191 |
+
|
192 |
+
@app.get("/api/model/{model_name}", response_model=LeaderboardModel)
def get_model_details(model_name: str):
    """Return the leaderboard entry for one model, or 404 if it is unknown."""
    data = fetch_external_leaderboard_data() or generate_sample_data()

    entry = next(
        (m for m in data["models"] if m["model_name"] == model_name),
        None,
    )
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Model {model_name} not found")
    return entry
|
202 |
+
|
203 |
+
@app.get("/api/filters")
def get_filter_counts():
    """
    Get counts for different filter categories to display in the UI.
    This matches what's shown in the 'Quick Filters' section of the leaderboard.
    """
    data = fetch_external_leaderboard_data() or generate_sample_data()

    # Average score of every model that actually carries a "scores" dict;
    # models without one fall into no average-based bucket, as before.
    averages = [
        m["scores"].get("average", 0)
        for m in data["models"]
        if "scores" in m
    ]

    def _is_official(name):
        # Placeholder heuristic — adapt to your actual criteria.
        return "/" not in name or name.startswith(("meta/", "google/"))

    return {
        "edge_devices": sum(1 for a in averages if a < 45),
        "consumers": sum(1 for a in averages if 45 <= a < 55),
        "midrange": sum(1 for a in averages if 55 <= a < 65),
        "gpu_rich": sum(1 for a in averages if a >= 65),
        "official_providers": sum(
            1 for m in data["models"] if _is_official(m["model_name"])
        ),
    }
|
247 |
+
|
248 |
+
if __name__ == "__main__":
    # Standalone entry point; run.py and run_all.py start this same module
    # as a subprocess, so the port here must match their hard-coded URLs.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
hf.svg
ADDED
|
requirements.txt
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles>=22.0,<24.0
|
2 |
+
anyio>=3.0,<5.0
|
3 |
+
audioop-lts<1.0; python_version >= "3.13" #it provides support for 'audioop' module removed in latest python version used by pydub
|
4 |
+
fastapi>=0.95.0
|
5 |
+
ffmpy
|
6 |
+
groovy~=0.1
|
7 |
+
gradio>=5.0.0
|
8 |
+
httpx>=0.24.1
|
9 |
+
huggingface_hub>=0.28.1
|
10 |
+
Jinja2<4.0
|
11 |
+
markupsafe>=2.0,<4.0
|
12 |
+
numpy>=1.20.0
|
13 |
+
orjson~=3.0
|
14 |
+
packaging
|
15 |
+
pandas>=1.3.0
|
16 |
+
pillow>=8.0,<12.0
|
17 |
+
pydantic>=1.10.0
|
18 |
+
python-multipart>=0.0.18 # required for fastapi forms.
|
19 |
+
pydub
|
20 |
+
pyyaml>=5.0,<7.0
|
21 |
+
ruff>=0.9.3; sys.platform != 'emscripten' # needed here for custom component docs generation
|
22 |
+
safehttpx>=0.1.6,<0.2.0
|
23 |
+
semantic_version~=2.0
|
24 |
+
starlette>=0.40.0,<1.0; sys.platform != 'emscripten'
|
25 |
+
tomlkit>=0.12.0,<0.14.0
|
26 |
+
typer>=0.12,<1.0; sys.platform != 'emscripten'
|
27 |
+
typing_extensions~=4.0
|
28 |
+
urllib3~=2.0; sys.platform == 'emscripten' # urllib3 is used for Lite support. Version spec can be omitted because urllib3==2.1.0 is prebuilt for Pyodide and urllib>=2.2.0 supports Pyodide as well.
|
29 |
+
uvicorn>=0.21.0
|
30 |
+
requests>=2.28.0
|
run.py
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import requests
|
3 |
+
from urllib.parse import quote
|
4 |
+
import subprocess
|
5 |
+
import os
|
6 |
+
import time
|
7 |
+
import sys
|
8 |
+
from datetime import datetime
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
|
12 |
+
def style_dataframe(df):
|
13 |
+
if len(df) == 0:
|
14 |
+
return df
|
15 |
+
|
16 |
+
# Define the columns to highlight based on the screenshot
|
17 |
+
highlight_cols = ["Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO"]
|
18 |
+
|
19 |
+
# Initialize the styler
|
20 |
+
styled = df.style
|
21 |
+
|
22 |
+
# Function to create gradient background based on value
|
23 |
+
def highlight_green(val):
|
24 |
+
try:
|
25 |
+
# Extract numeric value from string (remove % if present)
|
26 |
+
val_float = float(str(val).replace('%', '').replace(' kg', ''))
|
27 |
+
|
28 |
+
# Create gradient background filling based on the value percentage
|
29 |
+
# Use the exact colors from the example
|
30 |
+
return f'background: linear-gradient(90deg, rgba(46, 125, 50, 0.5) {val_float}%, rgba(46, 125, 50, 0.1) {val_float}%); color: white;'
|
31 |
+
except:
|
32 |
+
return 'background-color: #121212; color: white;'
|
33 |
+
|
34 |
+
# Apply the highlighting to performance metric columns
|
35 |
+
for col in highlight_cols:
|
36 |
+
styled = styled.applymap(highlight_green, subset=[col])
|
37 |
+
|
38 |
+
styled = styled.set_properties(
|
39 |
+
subset=["Model"],
|
40 |
+
**{'color': '#4da6ff'}
|
41 |
+
)
|
42 |
+
|
43 |
+
return styled
|
44 |
+
|
45 |
+
def increment_counter(counter):
    """Return the given counter advanced by one."""
    return 1 + counter
|
47 |
+
|
48 |
+
def handle_select(evt: gr.SelectData, counter):
    """Bump the selection counter and report the selected cell's index and value."""
    bumped = counter + 1
    index_text = f"{evt.index}"
    value_text = f"{evt.value}"
    return bumped, index_text, value_text
|
50 |
+
|
51 |
+
def start_api_server():
    """Spawn api.py with the current interpreter and give it time to boot."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    api_process = subprocess.Popen(
        [sys.executable, "api.py"],
        cwd=script_dir,
    )
    # Give the API server a moment to start before the UI queries it.
    time.sleep(2)
    return api_process
|
59 |
+
|
60 |
+
def apply_filters(filters, models_data):
    """
    Return the models matching at least one selected quick filter.

    Each filter option is a UI label such as "For Edge Devices · 3"; matching
    is by substring on the label. Average-score buckets: Edge < 45,
    Consumers 45–55, Mid-range 55–65, GPU-rich >= 65. "Official Providers"
    uses a name heuristic (no org prefix, or a known org prefix).

    Args:
        filters: list of selected filter-option labels (may be empty/None).
        models_data: list of model dicts with "model_name" and "scores".

    Returns:
        models_data unchanged when no filters are selected, otherwise a new
        list with only the matching models.
    """
    if not filters:
        # No filters selected, return all data unchanged.
        return models_data

    def _matches(model, filter_option):
        """True if `model` satisfies the single filter option's criterion."""
        average_score = model.get("scores", {}).get("average", 0)
        model_name = model.get("model_name", "")
        is_official = "/" not in model_name or model_name.startswith(
            ("meta/", "google/", "openai/", "microsoft/")
        )
        return (
            ("Edge Devices" in filter_option and average_score < 45)
            or ("Consumers" in filter_option and 45 <= average_score < 55)
            or ("Mid-range" in filter_option and 55 <= average_score < 65)
            or ("GPU-rich" in filter_option and average_score >= 65)
            or ("Official Providers" in filter_option and is_official)
        )

    return [
        model
        for model in models_data
        if any(_matches(model, option) for option in filters)
    ]
|
87 |
+
|
88 |
+
def format_timestamp(timestamp_str):
    """
    Format an ISO-8601 timestamp string for display.

    Returns e.g. "January 15, 2024 at 02:30 PM"; on any parse failure the
    original string is returned unchanged (best-effort display helper).
    """
    try:
        # Try to parse ISO format (with or without timezone) first.
        try:
            dt = datetime.fromisoformat(timestamp_str)
        except ValueError:
            # Fallback for microsecond-suffixed timestamps fromisoformat
            # can't handle on older Pythons.
            dt = datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%f")

        # Format nicely for display.
        return dt.strftime("%B %d, %Y at %I:%M %p")
    except Exception as e:
        print(f"Error formatting timestamp: {e}")
        return timestamp_str
|
104 |
+
|
105 |
+
def create_leaderboard_data(selected_filters=None):
    """
    Fetch leaderboard data from the local API and build the display table.

    Args:
        selected_filters: optional list of quick-filter labels (see
            apply_filters); None/empty shows everything.

    Returns:
        (styled dataframe or empty DataFrame, formatted "last updated" string).
        Any API/network failure yields an empty table and "Unknown".
    """
    columns = ["Rank", "Type", "Model", "Average", "IFEval", "BBHI", "MATH",
               "GPQA", "MUJB", "MMLU-PRO", "CO_Cost"]
    try:
        # Timeout so a hung API process cannot freeze the UI thread.
        response = requests.get("http://localhost:8000/api/leaderboard", timeout=10)
        if response.status_code == 200:
            data = response.json()
            models_data = data.get("models", [])
            updated_at = data.get("updated_at", "Unknown")
            formatted_time = format_timestamp(updated_at)

            # Apply filters if any are selected.
            if selected_filters:
                models_data = apply_filters(selected_filters, models_data)

            rows = []
            for i, model in enumerate(models_data, 1):
                model_name = model["model_name"]
                model_type = model["type"]
                scores = model["scores"]
                co2_cost = model.get("co2_cost", "N/A")

                # Only use green for open and red for closed.
                emoji = "🟢" if model_type.lower() == "open" else "🔴"
                type_with_emoji = f"{emoji} {model_type.upper()}"

                # Render the model name as a markdown link: prefer the API's
                # model_link, else derive a Hugging Face URL from the name.
                if "model_link" in model and model["model_link"]:
                    model_link = f"[{model_name}]({model['model_link']})"
                elif "/" in model_name:
                    model_link = f"[{model_name}](https://huggingface.co/{quote(model_name)})"
                else:
                    model_link = f"[{model_name}](https://huggingface.co/models?search={quote(model_name)})"

                rows.append([
                    i,  # Rank
                    type_with_emoji,
                    model_link,
                    f"{scores.get('average', 0):.2f}",
                    f"{scores.get('ifeval', 0):.2f}",
                    f"{scores.get('bbhi', 0):.2f}",
                    f"{scores.get('math', 0):.2f}",
                    f"{scores.get('gpqa', 0):.2f}",
                    f"{scores.get('mujb', 0):.2f}",
                    f"{scores.get('mmlu', 0):.2f}",
                    f"{co2_cost}" if isinstance(co2_cost, (int, float)) else co2_cost
                ])

            df = pd.DataFrame(rows, columns=columns)
            styled_df = style_dataframe(df)
            return styled_df, formatted_time
        else:
            # Non-200 from the API: empty table with proper columns.
            return pd.DataFrame(columns=columns), "Unknown"
    except Exception as e:
        print(f"Error fetching leaderboard data: {e}")
        # Network/parse failure: empty table with proper columns.
        return pd.DataFrame(columns=columns), "Unknown"
|
165 |
+
|
166 |
+
def load_svg(file_path="hf.svg"):
    """Read an SVG file and return its contents as a string."""
    with open(file_path) as svg_file:
        return svg_file.read()
|
170 |
+
|
171 |
+
def get_filter_data():
    """
    Fetch per-category model counts from the API and build the quick-filter
    labels shown in the UI.

    Returns five labels of the form "<category> · <count>"; on any failure
    (non-200, network error) the counts fall back to 0.
    """
    # Single fallback list instead of the same literal repeated three times.
    defaults = [
        "For Edge Devices · 0",
        "For Consumers · 0",
        "Mid-range · 0",
        "For the GPU-rich · 0",
        "Only Official Providers · 0"
    ]
    try:
        # Timeout so a hung API process cannot freeze the UI thread.
        response = requests.get("http://localhost:8000/api/filters", timeout=10)
        if response.status_code != 200:
            return defaults
        filter_data = response.json()
        return [
            f"For Edge Devices · {filter_data.get('edge_devices', 0)}",
            f"For Consumers · {filter_data.get('consumers', 0)}",
            f"Mid-range · {filter_data.get('midrange', 0)}",
            f"For the GPU-rich · {filter_data.get('gpu_rich', 0)}",
            f"Only Official Providers · {filter_data.get('official_providers', 0)}"
        ]
    except Exception as e:
        print(f"Error fetching filter data: {e}")
        return defaults
|
200 |
+
|
201 |
+
def refresh_leaderboard(selected_filters=None):
    """
    Force the API to re-fetch its source data, then rebuild the table and
    the quick-filter labels.

    Returns (table, filter labels, status message); on failure returns
    (None, None, "Error refreshing data") so the Gradio outputs stay valid.
    """
    try:
        # Request a refresh from the API; timeout so a hung API can't
        # freeze the UI indefinitely (the refresh itself may be slow).
        requests.get("http://localhost:8000/api/leaderboard?refresh=true", timeout=30)
        # Get updated data.
        df, timestamp = create_leaderboard_data(selected_filters)
        filter_choices = get_filter_data()
        return df, filter_choices, f"Last updated: {timestamp}"
    except Exception as e:
        print(f"Error refreshing data: {e}")
        return None, None, "Error refreshing data"
|
212 |
+
|
213 |
+
def update_table(filters):
    """Rebuild the leaderboard table for the current quick-filter selection."""
    table, refreshed_at = create_leaderboard_data(filters)
    status = f"Last updated: {refreshed_at}"
    return table, status
|
216 |
+
|
217 |
+
def load_css(file_path="style.css"):
    """
    Read a CSS file located next to this script and return its contents.

    Falls back to a minimal inline stylesheet when the file can't be read.
    """
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(script_dir, file_path)) as css_file:
            return css_file.read()
    except Exception as e:
        print(f"Error loading CSS file: {e}")
        # Return a basic CSS if file not found
        return """
        .dataframe-container {
            border-radius: 8px;
            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
        }
        """
|
233 |
+
|
234 |
+
# Build the Gradio UI at import time so run_all.py can do
# `import run; run.demo.launch()`.
with gr.Blocks(css=load_css()) as demo:
    # Initial table contents, fetched once at startup.
    df, timestamp = create_leaderboard_data()

    with gr.Row():
        # Hugging Face logo plus the page heading/subtitle.
        svg_content = load_svg()
        gr.HTML(svg_content)
        gr.HTML("""
        <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
            <div class="leaderboard-title">Open LLM Leaderboard</div>
        </div>
        <div class="leaderboard-subtitle">Comparing Large Language Models in an open and reproducible way</div>
        """)
    # Last-refresh indicator; updated by the refresh button and filter changes.
    # NOTE(review): original indentation was lost in extraction — this may have
    # lived inside the Row above; placement is visual-only either way.
    status_text = gr.HTML(f"""<div style="text-align: center; margin-bottom: 10px;">Last updated: {timestamp}</div>""")

    with gr.Row(elem_classes="filters-container"):
        # Quick-filter checkboxes, labels come from the /api/filters endpoint.
        filter_choices = get_filter_data()
        filters = gr.CheckboxGroup(
            label="Quick Filters",
            choices=filter_choices,
        )

    # Create and display the dataframe
    leaderboard_table = gr.Dataframe(
        value=df,
        headers=["Rank", "Type", "Model", "Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO", "CO_Cost"],
        datatype=["number", "str", "markdown", "str", "str", "str", "str", "str", "str", "str", "str"],
        elem_id="leaderboard-table",
        elem_classes="dataframe-container",
        interactive=False,
        max_height=600,
        show_search="search",
        show_copy_button=True,
        show_fullscreen_button=True,
        pinned_columns=2,
        column_widths=["5%", "10%", "35%", "7%", "7%", "7%", "7%", "7%", "7%", "7%", "6%"]
    )

    refresh_btn = gr.Button("Refresh Data", elem_classes="refresh-btn")
    refresh_btn.click(refresh_leaderboard, inputs=[filters], outputs=[leaderboard_table, filters, status_text])

    filters.change(update_table, inputs=[filters], outputs=[leaderboard_table, status_text])

if __name__ == "__main__":
    # All-in-one mode: start the API in a subprocess, serve the UI
    # (blocking), then shut the API down when the UI exits.
    api_process = start_api_server()
    demo.launch()
    api_process.terminate()
|
run_all.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
import subprocess
|
3 |
+
import sys
|
4 |
+
import os
|
5 |
+
import time
|
6 |
+
import signal
|
7 |
+
import atexit
|
8 |
+
|
9 |
+
def start_api_server():
    """Spawn api.py with the current interpreter; returns the Popen handle."""
    here = os.path.dirname(os.path.abspath(__file__))
    api_process = subprocess.Popen([sys.executable, "api.py"], cwd=here)
    print("API server started (PID:", api_process.pid, ")")
    return api_process
|
16 |
+
|
17 |
+
def start_gradio_app():
    """Launch the Gradio frontend (run.py's demo) in its own process."""
    here = os.path.dirname(os.path.abspath(__file__))
    gradio_process = subprocess.Popen(
        [sys.executable, "-c", "import gradio as gr; import run; run.demo.launch()"],
        cwd=here,
    )
    print("Gradio interface started (PID:", gradio_process.pid, ")")
    return gradio_process
|
24 |
+
|
25 |
+
def cleanup_processes(api_process, gradio_process):
    """Terminate the API and Gradio child processes if they are still alive."""
    print("\nShutting down services...")

    for proc, label in ((api_process, "API server"), (gradio_process, "Gradio interface")):
        # poll() is None while the child is still running.
        if proc and proc.poll() is None:
            proc.terminate()
            print(f"{label} terminated")
|
35 |
+
|
36 |
+
def main():
    """Start API + Gradio as subprocesses and block until interrupted."""
    api_process = start_api_server()
    # Give the API server a moment to start before the frontend queries it.
    time.sleep(2)

    gradio_process = start_gradio_app()

    # Register cleanup function to be called on normal interpreter exit.
    atexit.register(cleanup_processes, api_process, gradio_process)

    # Handle termination signals (Ctrl+C, kill) by tearing children down first.
    def signal_handler(sig, frame):
        print("\nReceived termination signal")
        cleanup_processes(api_process, gradio_process)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    print("\nLeaderboard application started!")
    print("- API server running at http://localhost:8000")
    print("- Gradio interface running at http://localhost:7860")
    print("\nPress Ctrl+C to stop all services")

    # Keep the main process alive; SIGINT is normally routed to
    # signal_handler, but swallow a raw KeyboardInterrupt too.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
|
66 |
+
|
67 |
+
if __name__ == "__main__":
    # Entry point: orchestrate both services and wait for Ctrl+C.
    main()
|
style.css
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.html-container {
|
2 |
+
text-align: center;
|
3 |
+
display: flex;
|
4 |
+
justify-content: center;
|
5 |
+
width: 100%;
|
6 |
+
}
|
7 |
+
|
8 |
+
.dataframe-container {
|
9 |
+
margin-top: 0.5rem;
|
10 |
+
margin-bottom: 0.5rem;
|
11 |
+
}
|
12 |
+
|
13 |
+
.leaderboard-title {
|
14 |
+
font-size: 1.5rem;
|
15 |
+
font-weight: bold;
|
16 |
+
margin-bottom: 0.25rem;
|
17 |
+
color: #f0f0f0;
|
18 |
+
}
|
19 |
+
|
20 |
+
.leaderboard-subtitle {
|
21 |
+
font-size: 0.9rem;
|
22 |
+
margin-bottom: 1rem;
|
23 |
+
color: #a0a0a0;
|
24 |
+
}
|
25 |
+
|
26 |
+
.filters-container fieldset {
|
27 |
+
display: flex;
|
28 |
+
flex-direction: row;
|
29 |
+
justify-content: center;
|
30 |
+
align-items: center;
|
31 |
+
gap: 0.5rem;
|
32 |
+
}
|
33 |
+
|
34 |
+
.refresh-btn {
|
35 |
+
margin-top: 0.5rem;
|
36 |
+
}
|
37 |
+
|
38 |
+
.status-container {
|
39 |
+
display: flex;
|
40 |
+
justify-content: flex-end;
|
41 |
+
font-size: 0.75rem;
|
42 |
+
color: #a0a0a0;
|
43 |
+
}
|
44 |
+
|