wu981526092 committed on
Commit
95ba712
·
1 Parent(s): 66491be
requirements.txt CHANGED
@@ -10,6 +10,7 @@ matplotlib
10
  numpy
11
  pandas
12
  python-dateutil
 
13
  tqdm
14
  transformers
15
  tokenizers>=0.15.0
 
10
  numpy
11
  pandas
12
  python-dateutil
13
+ requests
14
  tqdm
15
  transformers
16
  tokenizers>=0.15.0
src/envs.py CHANGED
@@ -11,6 +11,10 @@ LOCAL_MODE = True
11
  # Get token from environment or use None in local mode
12
  TOKEN = os.environ.get("HF_TOKEN") if not LOCAL_MODE else None
13
 
 
 
 
 
14
  OWNER = "libvulnwatch" # Change to your org - don't forget to create a results and request dataset, with the correct format!
15
  # ----------------------------------
16
 
 
11
  # Get token from environment or use None in local mode
12
  TOKEN = os.environ.get("HF_TOKEN") if not LOCAL_MODE else None
13
 
14
+ # GitHub API token for fetching repo metadata
15
+ # This increases rate limits from 60 to 5000 requests per hour
16
+ GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
17
+
18
  OWNER = "libvulnwatch" # Change to your org - don't forget to create a results and request dataset, with the correct format!
19
  # ----------------------------------
20
 
src/leaderboard/github_data.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utilities for fetching GitHub repository data"""
2
+
3
+ import os
4
+ import requests
5
+ import time
6
+ from functools import lru_cache
7
+ from urllib.parse import urlparse
8
+
9
+ # Import GitHub token from envs
10
+ from src.envs import GITHUB_TOKEN
11
+
12
+
13
def extract_repo_path(repo_url):
    """Extract the "org/repo" path from a GitHub repository reference.

    Accepted inputs:
        * full URLs, with or without extra segments, query or fragment
          (``https://github.com/org/repo/tree/main``)
        * scheme-less URLs (``github.com/org/repo``)
        * SSH remotes (``git@github.com:org/repo.git``)
        * an already-normalised ``org/repo`` string

    A trailing ``.git`` suffix, surrounding whitespace and leading/trailing
    slashes are removed. Only the first two path segments are kept, because
    those are the only ones that identify a repository.

    Args:
        repo_url: GitHub repository URL or "org/repo" string.

    Returns:
        Repository path in format "org/repo", or None when the input is
        empty or does not contain both an owner and a repository name.
    """
    if not repo_url:
        return None

    candidate = repo_url.strip()

    if candidate.startswith("git@") and ":" in candidate:
        # SSH remote: everything after the first ':' is the repository path.
        candidate = candidate.split(":", 1)[1]
    elif "://" in candidate:
        # urlparse drops scheme, host, query string and fragment for us.
        candidate = urlparse(candidate).path
    else:
        # Scheme-less host prefix, e.g. "github.com/org/repo".
        lowered = candidate.lower()
        for host_prefix in ("www.github.com/", "github.com/"):
            if lowered.startswith(host_prefix):
                candidate = candidate[len(host_prefix):]
                break

    segments = [segment for segment in candidate.split("/") if segment]
    if len(segments) < 2:
        return None

    owner, repo = segments[0], segments[1]

    # Remove .git suffix if present
    if repo.endswith(".git"):
        repo = repo[:-4]

    if not repo:
        return None

    return f"{owner}/{repo}"
38
+
39
+
40
@lru_cache(maxsize=128)
def get_github_data(repo_path, use_token=True):
    """Fetch repository data from the GitHub REST API.

    Results are memoised by ``lru_cache`` per ``(repo_path, use_token)``
    pair. This includes the fallback dictionary returned after a failed
    request, so call ``get_github_data.cache_clear()`` to force a refetch.
    The cached dictionary object is shared between callers; treat it as
    read-only.

    Args:
        repo_path: Repository path in format "org/repo".
        use_token: Whether to send ``GITHUB_TOKEN`` (when it is non-empty)
            in the Authorization header.

    Returns:
        Dictionary that always contains "github_stars" and "license".
        After a successful request it additionally contains "full_name",
        "created_at", "updated_at", "language", "forks_count" and
        "default_branch". On any failure the result is
        ``{"github_stars": 0, "license": "Unknown"}``.
    """
    if not repo_path:
        return {"github_stars": 0, "license": "Unknown"}

    api_url = f"https://api.github.com/repos/{repo_path}"
    headers = {"Accept": "application/vnd.github.v3+json"}

    # Add token for higher rate limits if available
    if use_token and GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    try:
        # Without a timeout a stalled connection would block the caller
        # indefinitely; requests never times out by default.
        response = requests.get(api_url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"GitHub API error for {repo_path}: {response.status_code} - {response.text}")
            return {"github_stars": 0, "license": "Unknown"}

        data = response.json()

        # The API reports "license": null for repositories without a detected
        # license. dict.get's default is not used for a present-but-null key,
        # so fall back to an empty mapping explicitly.
        license_info = data.get("license") or {}
        spdx_id = license_info.get("spdx_id")

        # If license is missing or "NOASSERTION", use "Unknown"
        if not spdx_id or spdx_id == "NOASSERTION":
            spdx_id = "Unknown"

        return {
            "github_stars": data.get("stargazers_count") or 0,
            "license": spdx_id,
            "full_name": data.get("full_name", repo_path),
            "created_at": data.get("created_at", ""),
            "updated_at": data.get("updated_at", ""),
            "language": data.get("language", ""),
            "forks_count": data.get("forks_count", 0),
            "default_branch": data.get("default_branch", "main"),
        }

    except Exception as e:
        # Deliberately broad: this lookup is best-effort enrichment, so a
        # network or decoding failure must not propagate to the caller.
        print(f"Error fetching GitHub data for {repo_path}: {e}")
        return {"github_stars": 0, "license": "Unknown"}
91
+
92
+
93
def update_assessment_with_github_data(assessment, force_update=False):
    """Update an assessment's star count and license with data from GitHub.

    The repository is located by trying, in order:
        1. ``assessment.library_name`` when it contains a "/"
        2. ``assessment.repository_url`` when the attribute exists and is set
        3. ``assessment.org`` and ``assessment.repo`` joined with "/"

    The assessment is mutated in place and also returned.

    Args:
        assessment: AssessmentResult object
        force_update: When True, overwrite ``stars`` and ``license`` even if
            they already hold values. When False, only a ``stars`` value of 0
            and a ``license`` value of "?" are replaced.

    Returns:
        The same AssessmentResult object that was passed in.
    """
    # Nothing to do when both fields are already populated, unless forced.
    if not force_update and assessment.stars > 0 and assessment.license != "?":
        return assessment

    # Try getting repo path from library_name first. It goes through
    # extract_repo_path so that a library_name holding a full URL (which
    # also contains "/") is reduced to a path instead of being sent to the
    # API verbatim.
    repo_path = None
    if assessment.library_name and "/" in assessment.library_name:
        repo_path = extract_repo_path(assessment.library_name)

    # Fall back to repository_url if available
    repository_url = getattr(assessment, "repository_url", "")
    if not repo_path and repository_url:
        repo_path = extract_repo_path(repository_url)

    # If we still don't have a path, reconstruct from org/repo
    if not repo_path and assessment.org and assessment.repo:
        repo_path = f"{assessment.org}/{assessment.repo}"

    # Without a usable path there is nothing to fetch.
    if not repo_path:
        return assessment

    github_data = get_github_data(repo_path)

    # Update if data is missing or force_update is True
    if force_update or assessment.stars == 0:
        assessment.stars = github_data["github_stars"]

    if force_update or assessment.license == "?":
        assessment.license = github_data["license"]

    return assessment
src/leaderboard/read_evals.py CHANGED
@@ -9,6 +9,7 @@ import numpy as np
9
 
10
  from src.display.formatting import make_clickable_library, make_clickable_report
11
  from src.display.utils import AutoEvalColumn, LibraryType, Tasks, Language, AssessmentStatus
 
12
 
13
 
14
  @dataclass
@@ -30,6 +31,7 @@ class AssessmentResult:
30
  availability: bool = True
31
  verified: bool = False
32
  report_url: str = "" # URL to detailed assessment report
 
33
 
34
  @classmethod
35
  def init_from_json_file(self, json_filepath):
@@ -88,6 +90,7 @@ class AssessmentResult:
88
  verified=assessment.get("independently_verified", False),
89
  last_update=last_update,
90
  report_url=assessment.get("report_url", ""),
 
91
  )
92
 
93
  def update_with_request_file(self, requests_path):
@@ -99,8 +102,15 @@ class AssessmentResult:
99
  request = json.load(f)
100
  self.library_type = LibraryType.from_str(request.get("library_type", ""))
101
  self.stars = request.get("stars", 0)
 
 
 
102
  except Exception:
103
  print(f"Could not find request file for {self.library_name} version {self.version}")
 
 
 
 
104
 
105
  def to_dict(self):
106
  """Converts the Assessment Result to a dict compatible with our dataframe display"""
 
9
 
10
  from src.display.formatting import make_clickable_library, make_clickable_report
11
  from src.display.utils import AutoEvalColumn, LibraryType, Tasks, Language, AssessmentStatus
12
+ from src.leaderboard.github_data import update_assessment_with_github_data
13
 
14
 
15
  @dataclass
 
31
  availability: bool = True
32
  verified: bool = False
33
  report_url: str = "" # URL to detailed assessment report
34
+ repository_url: str = "" # GitHub repository URL
35
 
36
  @classmethod
37
  def init_from_json_file(self, json_filepath):
 
90
  verified=assessment.get("independently_verified", False),
91
  last_update=last_update,
92
  report_url=assessment.get("report_url", ""),
93
+ repository_url=assessment.get("repository_url", ""),
94
  )
95
 
96
  def update_with_request_file(self, requests_path):
 
102
  request = json.load(f)
103
  self.library_type = LibraryType.from_str(request.get("library_type", ""))
104
  self.stars = request.get("stars", 0)
105
+ # Add repository URL if not already set
106
+ if not self.repository_url and "repository_url" in request:
107
+ self.repository_url = request.get("repository_url", "")
108
  except Exception:
109
  print(f"Could not find request file for {self.library_name} version {self.version}")
110
+
111
+ # Try to get GitHub stars and license if missing
112
+ if self.stars == 0 or self.license == "?":
113
+ update_assessment_with_github_data(self)
114
 
115
  def to_dict(self):
116
  """Converts the Assessment Result to a dict compatible with our dataframe display"""