Spaces:

holistic-ai
/

LibVulnWatch

Running

File size: 4,265 Bytes

9ab539a
 
 
bccaf50
9ab539a
 
bccaf50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ab539a
bccaf50
 
 
 
 
 
 
 
 
 
9ab539a
bccaf50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ab539a
bccaf50
 
9ab539a
bccaf50
 
9ab539a
 
 
 
 
 
bccaf50
9ab539a
 
bccaf50
9ab539a
bccaf50
 
9ab539a
bccaf50

import json
import os
import re
import requests
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Dict, Tuple, Any, List, Set

def is_repository_valid(
    repo_name: str, repo_url: str, timeout: float = 10.0
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Checks if a GitHub repository is valid and accessible.

    The name must be exactly ``organization/repository``. When ``repo_url``
    contains a ``github.com/<org>/<repo>`` path, that path must name the same
    repository. The repository is then looked up through the GitHub REST API.

    Args:
        repo_name: The name of the repository (org/repo format)
        repo_url: URL to the repository
        timeout: Seconds to wait for the GitHub API before giving up

    Returns:
        Tuple of (is_valid, error_message, library_info). On failure
        ``is_valid`` is False, ``error_message`` explains why and
        ``library_info`` is an empty dict. This function reports problems
        through the tuple instead of raising.
    """
    # Basic format validation: exactly one "/" separating two non-empty parts.
    # Validating here keeps a name such as "a/b/c" from blowing up on unpacking.
    name_parts = repo_name.split("/") if repo_name else []
    if len(name_parts) != 2 or not all(name_parts):
        return False, "Repository name must be in the format 'organization/repository'", {}
    org, repo = name_parts

    # When a GitHub URL is supplied, make sure it points at the same repository.
    if repo_url and "github.com/" in repo_url:
        url_path = repo_url.split("github.com/", 1)[1]
        # Ignore any query string or fragment that follows the path.
        url_path = url_path.split("?", 1)[0].split("#", 1)[0]
        segments = [segment for segment in url_path.split("/") if segment]

        # A URL without an org/repo path cannot be compared; in that case we
        # fall back to trusting repo_name.
        if len(segments) >= 2:
            url_org, url_repo = segments[0], segments[1]
            # Strip only a clone-style ".git" suffix so that repository names
            # containing dots (e.g. "socket.io") are compared in full.
            if url_repo.endswith(".git"):
                url_repo = url_repo[: -len(".git")]
            url_repo_name = f"{url_org}/{url_repo}"

            if url_repo_name != repo_name:
                return False, f"Repository name ({repo_name}) doesn't match the URL ({url_repo_name})", {}

    # Get repository information from GitHub API
    api_url = f"https://api.github.com/repos/{org}/{repo}"

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(api_url, timeout=timeout)
        if response.status_code != 200:
            # Error bodies are normally JSON with a "message" field, but a proxy
            # or outage page may not be; keep the "not found" wording either way.
            try:
                reason = response.json().get("message", "Unknown error")
            except (ValueError, AttributeError):
                reason = "Unknown error"
            return False, f"Repository not found or not accessible: {reason}", {}

        # Parse repository data
        repo_data = response.json()
        library_info = get_library_info(repo_data)

        return True, "", library_info

    except Exception as e:
        # Boundary handler: network and parsing failures become an error tuple.
        return False, f"Error accessing repository: {str(e)}", {}

def get_library_info(repo_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extracts relevant information from GitHub repository data.

    Missing fields fall back to neutral defaults (empty string, 0, False,
    "Unknown" for the license name and "main" for the default branch).

    Args:
        repo_data: GitHub API response for a repository

    Returns:
        Dictionary with library metadata
    """
    # The API reports "license": null for repositories without a detected
    # license, so the value may be None even though the key is present.
    license_data = repo_data.get("license")
    license_name = license_data.get("name") if isinstance(license_data, dict) else None

    library_info = {
        "name": repo_data.get("name", ""),
        "full_name": repo_data.get("full_name", ""),
        # "description" can likewise be null; normalise it to an empty string.
        "description": repo_data.get("description") or "",
        "stars": repo_data.get("stargazers_count", 0),
        "forks": repo_data.get("forks_count", 0),
        "license": license_name or "Unknown",
        "created_at": repo_data.get("created_at", ""),
        "updated_at": repo_data.get("updated_at", ""),
        "open_issues": repo_data.get("open_issues_count", 0),
        "default_branch": repo_data.get("default_branch", "main"),
        "is_archived": repo_data.get("archived", False),
    }

    return library_info

def already_submitted_libraries(requested_libraries_dir: str) -> Tuple[Set[str], Dict[str, List[str]]]:
    """
    Gathers a list of already submitted libraries to avoid duplicates.

    Only ``.json`` files located exactly one directory level below
    ``requested_libraries_dir`` are read. Each file must contain the keys
    ``library`` and ``version``; a missing key or invalid JSON propagates as
    ``KeyError`` / ``json.JSONDecodeError``.

    Args:
        requested_libraries_dir: Directory with library assessment requests

    Returns:
        Tuple of (set of library identifiers, dict mapping orgs to submission dates).
        Identifiers have the form ``"<library>_<version>"``. The organisation is
        the part of ``library`` before the first "/"; entries without a "/" or
        without ``submitted_time`` contribute an identifier but no date.
    """
    depth = 1
    library_ids: Set[str] = set()
    orgs_to_submission_dates: Dict[str, List[str]] = defaultdict(list)

    for root, dirs, files in os.walk(requested_libraries_dir):
        # Measure depth from the path relative to the base directory, so a
        # trailing separator on requested_libraries_dir cannot shift the count.
        relative_root = os.path.relpath(root, requested_libraries_dir)
        current_depth = 0 if relative_root == os.curdir else relative_root.count(os.sep) + 1
        if current_depth != depth:
            continue

        # Nothing below the target depth is ever read; stop descending.
        dirs[:] = []

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            library_ids.add(f"{info['library']}_{info['version']}")

            # Select organisation
            if "/" not in info["library"] or "submitted_time" not in info:
                continue
            # Take only the first segment so identifiers with extra slashes
            # do not raise on unpacking.
            organisation = info["library"].split("/", 1)[0]
            orgs_to_submission_dates[organisation].append(info["submitted_time"])

    return library_ids, orgs_to_submission_dates