Spaces:
Running
Running
Commit
·
95ba712
1
Parent(s):
66491be
add
Browse files- requirements.txt +1 -0
- src/envs.py +4 -0
- src/leaderboard/github_data.py +131 -0
- src/leaderboard/read_evals.py +10 -0
requirements.txt
CHANGED
@@ -10,6 +10,7 @@ matplotlib
|
|
10 |
numpy
|
11 |
pandas
|
12 |
python-dateutil
|
|
|
13 |
tqdm
|
14 |
transformers
|
15 |
tokenizers>=0.15.0
|
|
|
10 |
numpy
|
11 |
pandas
|
12 |
python-dateutil
|
13 |
+
requests
|
14 |
tqdm
|
15 |
transformers
|
16 |
tokenizers>=0.15.0
|
src/envs.py
CHANGED
@@ -11,6 +11,10 @@ LOCAL_MODE = True
|
|
11 |
# Get token from environment or use None in local mode
|
12 |
TOKEN = os.environ.get("HF_TOKEN") if not LOCAL_MODE else None
|
13 |
|
|
|
|
|
|
|
|
|
14 |
OWNER = "libvulnwatch" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
15 |
# ----------------------------------
|
16 |
|
|
|
11 |
# Get token from environment or use None in local mode
|
12 |
TOKEN = os.environ.get("HF_TOKEN") if not LOCAL_MODE else None
|
13 |
|
14 |
+
# GitHub API token for fetching repo metadata
|
15 |
+
# This increases rate limits from 60 to 5000 requests per hour
|
16 |
+
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
|
17 |
+
|
18 |
OWNER = "libvulnwatch" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
19 |
# ----------------------------------
|
20 |
|
src/leaderboard/github_data.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utilities for fetching GitHub repository data"""
|
2 |
+
|
3 |
+
import os
|
4 |
+
import requests
|
5 |
+
import time
|
6 |
+
from functools import lru_cache
|
7 |
+
from urllib.parse import urlparse
|
8 |
+
|
9 |
+
# Import GitHub token from envs
|
10 |
+
from src.envs import GITHUB_TOKEN
|
11 |
+
|
12 |
+
|
13 |
+
def extract_repo_path(repo_url):
    """Extract the "org/repo" path from a GitHub URL or shorthand.

    Args:
        repo_url: GitHub repository URL (``http://`` or ``https://``) or a
            string that is already in "org/repo" form.

    Returns:
        The repository path in "org/repo" form with any trailing ``.git``
        removed, or ``None`` when ``repo_url`` is empty/``None``. For URLs,
        only the first two path segments are kept, so links to sub-pages such
        as ``.../org/repo/tree/main`` still resolve to "org/repo".
    """
    if not repo_url:
        return None

    if repo_url.startswith(("http://", "https://")):
        # urlparse drops the query string and fragment for us; keep just the
        # owner and repository segments and ignore anything deeper.
        segments = [part for part in urlparse(repo_url).path.split("/") if part]
        path = "/".join(segments[:2])
    else:
        # Already in org/repo form; only tidy stray slashes.
        path = repo_url.strip("/")

    # Clone-style names ("org/repo.git") are not valid API paths.
    if path.endswith(".git"):
        path = path[:-4]

    return path
|
38 |
+
|
39 |
+
|
40 |
+
@lru_cache(maxsize=128)
def get_github_data(repo_path, use_token=True):
    """Fetch repository data from the GitHub API.

    Results are memoized by ``lru_cache`` per ``(repo_path, use_token)`` pair.
    Fallback results produced after a failed request are memoized as well, and
    the returned dict is the cached object itself, so callers should treat it
    as read-only.

    Args:
        repo_path: Repository path in format "org/repo".
        use_token: Whether to send ``GITHUB_TOKEN`` (when it is non-empty) in
            the ``Authorization`` header.

    Returns:
        On success, a dict with the keys ``github_stars``, ``license``,
        ``full_name``, ``created_at``, ``updated_at``, ``language``,
        ``forks_count`` and ``default_branch``. On any failure (empty path,
        non-200 response, network or decoding error) a dict containing only
        ``{"github_stars": 0, "license": "Unknown"}``.
    """
    if not repo_path:
        return {"github_stars": 0, "license": "Unknown"}

    api_url = f"https://api.github.com/repos/{repo_path}"
    headers = {"Accept": "application/vnd.github.v3+json"}

    # Add token for higher rate limits if available
    if use_token and GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    try:
        # Without a timeout, requests waits forever on a stalled connection.
        response = requests.get(api_url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"GitHub API error for {repo_path}: {response.status_code} - {response.text}")
            return {"github_stars": 0, "license": "Unknown"}

        data = response.json()

        # The API sends `"license": null` for repositories without a detected
        # license, so the key is present but its value is None; `or {}` keeps
        # the lookup from raising and discarding the rest of the payload.
        license_info = data.get("license") or {}
        spdx_id = license_info.get("spdx_id")
        if not spdx_id or spdx_id == "NOASSERTION":
            spdx_id = "Unknown"

        return {
            "github_stars": data.get("stargazers_count", 0),
            "license": spdx_id,
            "full_name": data.get("full_name", repo_path),
            "created_at": data.get("created_at", ""),
            "updated_at": data.get("updated_at", ""),
            "language": data.get("language", ""),
            "forks_count": data.get("forks_count", 0),
            "default_branch": data.get("default_branch", "main"),
        }

    except Exception as e:
        # Best-effort boundary: metadata enrichment must never break the caller.
        print(f"Error fetching GitHub data for {repo_path}: {e}")
        return {"github_stars": 0, "license": "Unknown"}
|
91 |
+
|
92 |
+
|
93 |
+
def update_assessment_with_github_data(assessment, force_update=False):
    """Fill in an assessment's stars and license from GitHub.

    The repository path is resolved, in order, from ``library_name`` (when it
    contains a "/"), from ``repository_url`` (when the attribute exists and is
    non-empty), and finally from ``org`` and ``repo``.

    Args:
        assessment: AssessmentResult object; mutated in place.
        force_update: Whether to refresh the values even if they already
            exist.

    Returns:
        The same ``assessment`` object, updated when a repository path could
        be resolved.
    """
    # Nothing to do when both fields are already populated and no refresh
    # was requested.
    if not force_update and assessment.stars > 0 and assessment.license != "?":
        return assessment

    # Try getting repo path from library_name first. Normalise it so that a
    # full URL or a ".git" suffix does not end up verbatim in the API path.
    repo_path = None
    library_name = assessment.library_name
    if library_name and "/" in library_name:
        repo_path = extract_repo_path(library_name)

    # Fall back to repository_url if available
    if not repo_path and getattr(assessment, "repository_url", None):
        repo_path = extract_repo_path(assessment.repository_url)

    # If we still don't have a path, reconstruct from org/repo
    if not repo_path and assessment.org and assessment.repo:
        repo_path = f"{assessment.org}/{assessment.repo}"

    if not repo_path:
        return assessment

    github_data = get_github_data(repo_path)

    # get_github_data only includes "full_name" in a successful response; its
    # failure fallback carries placeholder values (0 / "Unknown"). A forced
    # refresh must not replace existing values with those placeholders.
    fetched = "full_name" in github_data

    if assessment.stars == 0 or (force_update and fetched):
        assessment.stars = github_data["github_stars"]

    if assessment.license == "?" or (force_update and fetched):
        assessment.license = github_data["license"]

    return assessment
|
src/leaderboard/read_evals.py
CHANGED
@@ -9,6 +9,7 @@ import numpy as np
|
|
9 |
|
10 |
from src.display.formatting import make_clickable_library, make_clickable_report
|
11 |
from src.display.utils import AutoEvalColumn, LibraryType, Tasks, Language, AssessmentStatus
|
|
|
12 |
|
13 |
|
14 |
@dataclass
|
@@ -30,6 +31,7 @@ class AssessmentResult:
|
|
30 |
availability: bool = True
|
31 |
verified: bool = False
|
32 |
report_url: str = "" # URL to detailed assessment report
|
|
|
33 |
|
34 |
@classmethod
|
35 |
def init_from_json_file(self, json_filepath):
|
@@ -88,6 +90,7 @@ class AssessmentResult:
|
|
88 |
verified=assessment.get("independently_verified", False),
|
89 |
last_update=last_update,
|
90 |
report_url=assessment.get("report_url", ""),
|
|
|
91 |
)
|
92 |
|
93 |
def update_with_request_file(self, requests_path):
|
@@ -99,8 +102,15 @@ class AssessmentResult:
|
|
99 |
request = json.load(f)
|
100 |
self.library_type = LibraryType.from_str(request.get("library_type", ""))
|
101 |
self.stars = request.get("stars", 0)
|
|
|
|
|
|
|
102 |
except Exception:
|
103 |
print(f"Could not find request file for {self.library_name} version {self.version}")
|
|
|
|
|
|
|
|
|
104 |
|
105 |
def to_dict(self):
|
106 |
"""Converts the Assessment Result to a dict compatible with our dataframe display"""
|
|
|
9 |
|
10 |
from src.display.formatting import make_clickable_library, make_clickable_report
|
11 |
from src.display.utils import AutoEvalColumn, LibraryType, Tasks, Language, AssessmentStatus
|
12 |
+
from src.leaderboard.github_data import update_assessment_with_github_data
|
13 |
|
14 |
|
15 |
@dataclass
|
|
|
31 |
availability: bool = True
|
32 |
verified: bool = False
|
33 |
report_url: str = "" # URL to detailed assessment report
|
34 |
+
repository_url: str = "" # GitHub repository URL
|
35 |
|
36 |
@classmethod
|
37 |
def init_from_json_file(self, json_filepath):
|
|
|
90 |
verified=assessment.get("independently_verified", False),
|
91 |
last_update=last_update,
|
92 |
report_url=assessment.get("report_url", ""),
|
93 |
+
repository_url=assessment.get("repository_url", ""),
|
94 |
)
|
95 |
|
96 |
def update_with_request_file(self, requests_path):
|
|
|
102 |
request = json.load(f)
|
103 |
self.library_type = LibraryType.from_str(request.get("library_type", ""))
|
104 |
self.stars = request.get("stars", 0)
|
105 |
+
# Add repository URL if not already set
|
106 |
+
if not self.repository_url and "repository_url" in request:
|
107 |
+
self.repository_url = request.get("repository_url", "")
|
108 |
except Exception:
|
109 |
print(f"Could not find request file for {self.library_name} version {self.version}")
|
110 |
+
|
111 |
+
# Try to get GitHub stars and license if missing
|
112 |
+
if self.stars == 0 or self.license == "?":
|
113 |
+
update_assessment_with_github_data(self)
|
114 |
|
115 |
def to_dict(self):
|
116 |
"""Converts the Assessment Result to a dict compatible with our dataframe display"""
|