# LibVulnWatch — src/submission/check_validity.py
# Validity checks for submitted GitHub repositories and duplicate-submission tracking.
import json
import os
import re
import requests
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Dict, Tuple, Any, List, Set
def is_repository_valid(repo_name: str, repo_url: str) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Checks if a GitHub repository is valid and accessible.

    Args:
        repo_name: The name of the repository (org/repo format)
        repo_url: URL to the repository

    Returns:
        Tuple of (is_valid, error_message, library_info). On failure the
        error message is non-empty and library_info is an empty dict.
    """
    # Basic format validation
    if not repo_name or "/" not in repo_name:
        return False, "Repository name must be in the format 'organization/repository'", {}
    # If a GitHub URL was supplied, cross-check it against repo_name
    if repo_url and "github.com" in repo_url:
        try:
            parts = repo_url.split("github.com/")[1].split("/")
            org = parts[0]
            # Strip only a trailing ".git" suffix; repo names may legitimately
            # contain dots (e.g. "socket.io"), so a blanket split(".") is wrong.
            repo = parts[1][:-4] if parts[1].endswith(".git") else parts[1]
            url_repo_name = f"{org}/{repo}"
            if url_repo_name != repo_name:
                return False, f"Repository name ({repo_name}) doesn't match the URL ({url_repo_name})", {}
        except IndexError:
            pass  # Malformed URL; fall back to using repo_name
    # Get repository information from GitHub API
    org, repo = repo_name.split("/")
    api_url = f"https://api.github.com/repos/{org}/{repo}"
    try:
        # Timeout prevents the submission pipeline hanging on a stalled request
        response = requests.get(api_url, timeout=30)
        if response.status_code != 200:
            # Error bodies are usually JSON, but guard against non-JSON responses
            try:
                message = response.json().get("message", "Unknown error")
            except ValueError:
                message = "Unknown error"
            return False, f"Repository not found or not accessible: {message}", {}
        # Parse repository data
        repo_data = response.json()
        library_info = get_library_info(repo_data)
        return True, "", library_info
    except Exception as e:
        return False, f"Error accessing repository: {str(e)}", {}
def get_library_info(repo_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extracts relevant information from GitHub repository data.

    Args:
        repo_data: GitHub API response (JSON dict) for a repository

    Returns:
        Dictionary with library metadata (name, stars, license, dates, etc.)
    """
    # The API returns "license": null for unlicensed repos, so get() with a
    # dict default is not enough — coalesce None to {} before the inner get().
    license_data = repo_data.get("license") or {}
    library_info = {
        "name": repo_data.get("name", ""),
        "full_name": repo_data.get("full_name", ""),
        "description": repo_data.get("description", ""),
        "stars": repo_data.get("stargazers_count", 0),
        "forks": repo_data.get("forks_count", 0),
        "license": license_data.get("name", "Unknown"),
        "created_at": repo_data.get("created_at", ""),
        "updated_at": repo_data.get("updated_at", ""),
        "open_issues": repo_data.get("open_issues_count", 0),
        "default_branch": repo_data.get("default_branch", "main"),
        "is_archived": repo_data.get("archived", False),
    }
    return library_info
def already_submitted_libraries(requested_libraries_dir: str) -> Tuple[Set[str], Dict[str, List[str]]]:
    """
    Gathers a list of already submitted libraries to avoid duplicates.

    Args:
        requested_libraries_dir: Directory with library assessment requests;
            request JSON files are expected one level deep (org subdirectories).

    Returns:
        Tuple of (set of "library_version" identifiers,
        dict mapping organisations to their submission timestamps)
    """
    library_ids = []
    orgs_to_submission_dates = defaultdict(list)
    base_depth = requested_libraries_dir.count(os.sep)
    for root, _, files in os.walk(requested_libraries_dir):
        # Only scan directories exactly one level below the base directory
        if root.count(os.sep) - base_depth != 1:
            continue
        for file in files:
            if not file.endswith(".json"):
                continue
            # Request files are JSON and therefore UTF-8; be explicit so the
            # read does not depend on the platform's default encoding.
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            library_ids.append(f"{info['library']}_{info['version']}")
            # Track per-organisation submission times (org/repo entries only)
            if "/" not in info["library"] or "submitted_time" not in info:
                continue
            organisation, _ = info["library"].split("/")
            orgs_to_submission_dates[organisation].append(info["submitted_time"])
    return set(library_ids), orgs_to_submission_dates