# LibVulnWatch — src/submission/check_validity.py
# Validity checks for submitted GitHub repositories and duplicate-submission tracking.
import json
import os
import re
import requests
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Dict, Tuple, Any, List, Set
def is_repository_valid(repo_name: str, repo_url: str) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Checks if a GitHub repository is valid and accessible.

    Args:
        repo_name: The name of the repository (org/repo format)
        repo_url: URL to the repository

    Returns:
        Tuple of (is_valid, error_message, library_info). On failure the
        error message is non-empty and library_info is an empty dict.
    """
    # Basic format validation
    if not repo_name or "/" not in repo_name:
        return False, "Repository name must be in the format 'organization/repository'", {}
    # If a GitHub URL was supplied, cross-check it against repo_name
    if repo_url and "github.com" in repo_url:
        try:
            parts = repo_url.split("github.com/")[1].split("/")
            org = parts[0]
            # Strip only a trailing ".git" suffix; repo names may legitimately
            # contain dots (e.g. "socket.io"), so a blanket split(".") is wrong.
            repo = parts[1][:-4] if parts[1].endswith(".git") else parts[1]
            url_repo_name = f"{org}/{repo}"
            if url_repo_name != repo_name:
                return False, f"Repository name ({repo_name}) doesn't match the URL ({url_repo_name})", {}
        except IndexError:
            pass  # Malformed URL; fall back to using repo_name
    # Get repository information from GitHub API
    org, repo = repo_name.split("/")
    api_url = f"https://api.github.com/repos/{org}/{repo}"
    try:
        # Timeout prevents the submission pipeline hanging on a stalled request
        response = requests.get(api_url, timeout=30)
        if response.status_code != 200:
            # Error bodies are usually JSON, but guard against non-JSON responses
            try:
                message = response.json().get("message", "Unknown error")
            except ValueError:
                message = "Unknown error"
            return False, f"Repository not found or not accessible: {message}", {}
        # Parse repository data
        repo_data = response.json()
        library_info = get_library_info(repo_data)
        return True, "", library_info
    except Exception as e:
        return False, f"Error accessing repository: {str(e)}", {}
def get_library_info(repo_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extracts relevant information from GitHub repository data.

    Args:
        repo_data: GitHub API response (JSON dict) for a repository

    Returns:
        Dictionary with library metadata (name, stars, license, dates, etc.)
    """
    # The API returns "license": null for unlicensed repos, so get() with a
    # dict default is not enough — coalesce None to {} before the inner get().
    license_data = repo_data.get("license") or {}
    library_info = {
        "name": repo_data.get("name", ""),
        "full_name": repo_data.get("full_name", ""),
        "description": repo_data.get("description", ""),
        "stars": repo_data.get("stargazers_count", 0),
        "forks": repo_data.get("forks_count", 0),
        "license": license_data.get("name", "Unknown"),
        "created_at": repo_data.get("created_at", ""),
        "updated_at": repo_data.get("updated_at", ""),
        "open_issues": repo_data.get("open_issues_count", 0),
        "default_branch": repo_data.get("default_branch", "main"),
        "is_archived": repo_data.get("archived", False),
    }
    return library_info
def already_submitted_libraries(requested_libraries_dir: str) -> Tuple[Set[str], Dict[str, List[str]]]:
    """
    Gathers a list of already submitted libraries to avoid duplicates.

    Args:
        requested_libraries_dir: Directory with library assessment requests;
            request JSON files are expected one level deep (org subdirectories).

    Returns:
        Tuple of (set of "library_version" identifiers,
        dict mapping organisations to their submission timestamps)
    """
    library_ids = []
    orgs_to_submission_dates = defaultdict(list)
    base_depth = requested_libraries_dir.count(os.sep)
    for root, _, files in os.walk(requested_libraries_dir):
        # Only scan directories exactly one level below the base directory
        if root.count(os.sep) - base_depth != 1:
            continue
        for file in files:
            if not file.endswith(".json"):
                continue
            # Request files are JSON and therefore UTF-8; be explicit so the
            # read does not depend on the platform's default encoding.
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            library_ids.append(f"{info['library']}_{info['version']}")
            # Track per-organisation submission times (org/repo entries only)
            if "/" not in info["library"] or "submitted_time" not in info:
                continue
            organisation, _ = info["library"].split("/")
            orgs_to_submission_dates[organisation].append(info["submitted_time"])
    return set(library_ids), orgs_to_submission_dates