Spaces:

aetheris-ai
/

aibom-generator

Running

File size: 7,870 Bytes

"""
Utility functions for the AIBOM Generator.
"""

import json
import logging
import os
import re
import uuid
from typing import Dict, List, Optional, Any, Union

logger = logging.getLogger(__name__)


def setup_logging(level=logging.INFO):
    """
    Configure logging through ``logging.basicConfig``.

    Args:
        level: Logging threshold passed to ``basicConfig`` (defaults to INFO)
    """
    # Record layout: timestamp - logger name - severity - message
    record_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(level=level, format=record_format, datefmt=timestamp_format)


def ensure_directory(directory_path):
    """
    Ensure that a directory exists, creating it (and any missing parents) if necessary.

    Args:
        directory_path: Path of the directory that must exist

    Returns:
        The ``directory_path`` that was passed in

    Raises:
        FileExistsError: If the path already exists but is not a directory
    """
    # exist_ok=True makes this a single race-free call: a separate
    # "exists?" check followed by makedirs() can fail if another process
    # creates the directory in between. It also surfaces the case where the
    # path is an existing regular file instead of silently returning it.
    os.makedirs(directory_path, exist_ok=True)
    return directory_path


def generate_uuid():
    """Return a new random (version 4) UUID as a string, for use as the AIBOM serialNumber."""
    serial = uuid.uuid4()
    return str(serial)


def normalize_license_id(license_text):
    """
    Normalize a license string to a SPDX license identifier if possible.

    Matching is done in two steps on a cleaned, lower-cased copy of the input:

    1. An exact match against the known aliases.
    2. A phrase match, where an alias must appear as one or more whole
       whitespace-separated words (e.g. "apache" in "Apache License 2.0").
       Longer aliases are tried first so that a specific alias always wins
       over a generic family name.

    Identifiers that merely *contain* an alias (e.g. "agpl-3.0", "lgpl-2.1",
    "cc-by-nc-sa-4.0", or the word "permitted") are deliberately left
    untouched rather than being mapped to a different license.

    Args:
        license_text: The license text to normalize

    Returns:
        SPDX license identifier, the original text if no match, or None if
        the input is empty
    """
    # Common license mappings
    license_mappings = {
        "mit": "MIT",
        "apache": "Apache-2.0",
        "apache 2": "Apache-2.0",
        "apache 2.0": "Apache-2.0",
        "apache-2": "Apache-2.0",
        "apache-2.0": "Apache-2.0",
        "gpl": "GPL-3.0-only",
        "gpl-3": "GPL-3.0-only",
        "gpl-3.0": "GPL-3.0-only",
        "gpl3": "GPL-3.0-only",
        "gpl v3": "GPL-3.0-only",
        "gpl-2": "GPL-2.0-only",
        "gpl-2.0": "GPL-2.0-only",
        "gpl2": "GPL-2.0-only",
        "gpl v2": "GPL-2.0-only",
        "lgpl": "LGPL-3.0-only",
        "lgpl-3": "LGPL-3.0-only",
        "lgpl-3.0": "LGPL-3.0-only",
        "bsd": "BSD-3-Clause",
        "bsd-3": "BSD-3-Clause",
        "bsd-3-clause": "BSD-3-Clause",
        "bsd-2": "BSD-2-Clause",
        "bsd-2-clause": "BSD-2-Clause",
        "cc": "CC-BY-4.0",
        "cc-by": "CC-BY-4.0",
        "cc-by-4.0": "CC-BY-4.0",
        "cc-by-sa": "CC-BY-SA-4.0",
        "cc-by-sa-4.0": "CC-BY-SA-4.0",
        "cc-by-nc": "CC-BY-NC-4.0",
        "cc-by-nc-4.0": "CC-BY-NC-4.0",
        "cc0": "CC0-1.0",
        "cc0-1.0": "CC0-1.0",
        "public domain": "CC0-1.0",
        "unlicense": "Unlicense",
        "proprietary": "NONE",
        "commercial": "NONE",
    }

    if not license_text:
        return None

    # Lowercase and drop punctuation, but keep "." and "-" because they are
    # part of versioned aliases such as "gpl-2.0"; stripping the dot would
    # make those aliases unreachable.
    cleaned = re.sub(r"[^\w\s.-]", "", license_text.lower())
    # Trim stray leading/trailing dots and hyphens per word ("MIT." -> "mit")
    # and collapse runs of whitespace.
    words = (word.strip(".-") for word in cleaned.split())
    normalized = " ".join(word for word in words if word)

    # Check for direct matches
    if normalized in license_mappings:
        return license_mappings[normalized]

    # Check for whole-word phrase matches, most specific alias first. Padding
    # with spaces restricts matches to word boundaries so that "mit" does not
    # match "permitted" and "gpl" does not match "agpl-3.0".
    padded = f" {normalized} "
    for key in sorted(license_mappings, key=len, reverse=True):
        if f" {key} " in padded:
            return license_mappings[key]

    # Return original if no match
    return license_text


def _score_fields(
    container: Any,
    fields: List[str],
    weight: int,
    prefix: str,
    checklist: Dict[str, str],
) -> int:
    """Record each field's presence in *checklist* and return the section's rounded points."""
    # A missing section, None, or a value of the wrong type counts as empty
    # instead of raising.
    if not isinstance(container, dict):
        container = {}
    found = 0
    for field in fields:
        is_present = field in container
        found += is_present
        checklist[f"{prefix}{field}"] = "✔" if is_present else "✘"
    return round((found / len(fields)) * weight)


def calculate_completeness_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
    """
    Calculate a completeness score for the AIBOM.

    Each section earns a share of its weight proportional to how many of its
    key fields are present; the section result is rounded to an integer and
    the total is the sum of the rounded section scores. Only the first entry
    of ``components`` is inspected. Missing, empty, or malformed sections
    score zero and have all their fields marked absent.

    Args:
        aibom: The AIBOM dictionary

    Returns:
        Dictionary containing:
            - total_score: overall completeness score (0-100)
            - section_scores: points earned per section
            - field_checklist: dictionary showing presence (✔) or absence (✘) of key fields
    """
    # Scoring weights for the different sections (they add up to 100)
    weights = {
        "required_fields": 20,
        "metadata": 20,
        "component_basic": 20,
        "component_model_card": 30,
        "external_references": 10,
    }
    section_scores = {}
    field_checklist = {}

    # Required Fields
    section_scores["required_fields"] = _score_fields(
        aibom,
        ["bomFormat", "specVersion", "serialNumber", "version"],
        weights["required_fields"],
        "",
        field_checklist,
    )

    # Metadata Fields
    section_scores["metadata"] = _score_fields(
        aibom.get("metadata"),
        ["timestamp", "tools", "authors", "component"],
        weights["metadata"],
        "metadata.",
        field_checklist,
    )

    # Component Basic Info: use the first component, tolerating an absent,
    # None, or empty "components" list.
    components = aibom.get("components")
    has_components = isinstance(components, (list, tuple)) and len(components) > 0
    component = components[0] if has_components else {}
    if not isinstance(component, dict):
        component = {}
    section_scores["component_basic"] = _score_fields(
        component,
        ["type", "name", "bom-ref", "purl", "description", "licenses"],
        weights["component_basic"],
        "component.",
        field_checklist,
    )

    # Model Card Section
    section_scores["component_model_card"] = _score_fields(
        component.get("modelCard"),
        ["modelParameters", "quantitativeAnalysis", "considerations"],
        weights["component_model_card"],
        "modelCard.",
        field_checklist,
    )

    # External References: all-or-nothing, any non-empty value earns the weight
    has_external_refs = bool(aibom.get("externalReferences"))
    section_scores["external_references"] = (
        weights["external_references"] if has_external_refs else 0
    )
    field_checklist["externalReferences"] = "✔" if has_external_refs else "✘"

    # Final total score
    total_score = round(sum(section_scores.values()))

    return {
        "total_score": total_score,
        "section_scores": section_scores,
        "field_checklist": field_checklist,
    }

def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
    """
    Combine two metadata dictionaries, with values from ``primary`` taking precedence.

    The result starts as a shallow copy of ``secondary``. Every non-None
    value in ``primary`` then overrides the corresponding entry, except when
    both sides hold a dictionary under the same key, in which case the two
    are merged recursively. None values in ``primary`` never override.

    Args:
        primary: Primary metadata dictionary
        secondary: Secondary metadata dictionary

    Returns:
        Merged metadata dictionary
    """
    merged = secondary.copy()

    for name, preferred in primary.items():
        # A None in the primary dictionary means "no opinion": keep the fallback
        if preferred is None:
            continue

        fallback = merged.get(name)
        both_are_dicts = isinstance(preferred, dict) and isinstance(fallback, dict)
        merged[name] = merge_metadata(preferred, fallback) if both_are_dicts else preferred

    return merged


def extract_model_id_parts(model_id: str) -> Dict[str, str]:
    """
    Split a Hugging Face model ID into its owner and name.

    Everything before the first "/" is the owner and everything after it is
    the name. An ID without a "/" has no owner.

    Args:
        model_id: Hugging Face model ID (e.g., "google/bert-base-uncased")

    Returns:
        Dictionary with parts (owner, name); owner is None when the ID
        contains no "/"
    """
    owner, separator, remainder = model_id.partition("/")

    # No separator at all: the whole ID is the model name
    if not separator:
        return {"owner": None, "name": model_id}

    return {"owner": owner, "name": remainder}


def create_purl(model_id: str) -> str:
    """
    Build a Package URL (purl) of type ``huggingface`` for a model.

    Args:
        model_id: Hugging Face model ID

    Returns:
        Package URL string: "pkg:huggingface/<owner>/<name>", or
        "pkg:huggingface/<name>" when the ID has no (or an empty) owner
    """
    id_parts = extract_model_id_parts(model_id)
    namespace = id_parts["owner"]

    # IDs without an owner produce a purl with no namespace segment
    if not namespace:
        return f"pkg:huggingface/{id_parts['name']}"

    return f"pkg:huggingface/{namespace}/{id_parts['name']}"