""" | |
Utility functions for the AIBOM Generator. | |
""" | |
import json | |
import logging | |
import os | |
import re | |
import uuid | |
from typing import Dict, List, Optional, Any, Union | |
logger = logging.getLogger(__name__) | |


def setup_logging(level=logging.INFO):
    """Set up logging configuration."""
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


def ensure_directory(directory_path):
    """Ensure that a directory exists, creating it if necessary."""
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    return directory_path


def generate_uuid():
    """Generate a UUID for the AIBOM serialNumber."""
    return str(uuid.uuid4())
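
# Illustrative usage of the helpers above (a minimal sketch; the directory name
# and log level are placeholders, not values required by the generator):
#
#     setup_logging(logging.DEBUG)
#     output_dir = ensure_directory("output")      # created if missing
#     serial = f"urn:uuid:{generate_uuid()}"       # e.g. for the AIBOM serialNumber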


def normalize_license_id(license_text):
    """
    Normalize a license string to a SPDX license identifier if possible.

    Args:
        license_text: The license text to normalize

    Returns:
        SPDX license identifier or the original text if no match
    """
    # Common license mappings
    license_mappings = {
        "mit": "MIT",
        "apache": "Apache-2.0",
        "apache 2": "Apache-2.0",
        "apache 2.0": "Apache-2.0",
        "apache-2": "Apache-2.0",
        "apache-2.0": "Apache-2.0",
        "gpl": "GPL-3.0-only",
        "gpl-3": "GPL-3.0-only",
        "gpl-3.0": "GPL-3.0-only",
        "gpl3": "GPL-3.0-only",
        "gpl v3": "GPL-3.0-only",
        "gpl-2": "GPL-2.0-only",
        "gpl-2.0": "GPL-2.0-only",
        "gpl2": "GPL-2.0-only",
        "gpl v2": "GPL-2.0-only",
        "lgpl": "LGPL-3.0-only",
        "lgpl-3": "LGPL-3.0-only",
        "lgpl-3.0": "LGPL-3.0-only",
        "bsd": "BSD-3-Clause",
        "bsd-3": "BSD-3-Clause",
        "bsd-3-clause": "BSD-3-Clause",
        "bsd-2": "BSD-2-Clause",
        "bsd-2-clause": "BSD-2-Clause",
        "cc": "CC-BY-4.0",
        "cc-by": "CC-BY-4.0",
        "cc-by-4.0": "CC-BY-4.0",
        "cc-by-sa": "CC-BY-SA-4.0",
        "cc-by-sa-4.0": "CC-BY-SA-4.0",
        "cc-by-nc": "CC-BY-NC-4.0",
        "cc-by-nc-4.0": "CC-BY-NC-4.0",
        "cc0": "CC0-1.0",
        "cc0-1.0": "CC0-1.0",
        "public domain": "CC0-1.0",
        "unlicense": "Unlicense",
        "proprietary": "NONE",
        "commercial": "NONE",
    }

    if not license_text:
        return None

    # Normalize to lowercase and remove punctuation
    normalized = re.sub(r'[^\w\s-]', '', license_text.lower())

    # Check for direct matches
    if normalized in license_mappings:
        return license_mappings[normalized]

    # Check for partial matches
    for key, value in license_mappings.items():
        if key in normalized:
            return value

    # Return original if no match
    return license_text
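
# Illustrative behaviour (a sketch; results follow the mappings and the
# substring fallback defined above):
#
#     normalize_license_id("mit")                 # -> "MIT"        (direct match)
#     normalize_license_id("Apache License 2.0")  # -> "Apache-2.0" (substring match on "apache")
#     normalize_license_id("Some Custom Terms")   # -> "Some Custom Terms" (no match, returned as-is)
#     normalize_license_id("")                    # -> None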


def calculate_completeness_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
    """
    Calculate a completeness score for the AIBOM.

    Args:
        aibom: The AIBOM dictionary

    Returns:
        Dictionary containing:
            - total_score: overall completeness score (0-100)
            - section_scores: points earned per section
            - field_checklist: dictionary showing presence ("✔") or absence ("✘") of key fields
    """
    score = 0
    max_score = 100
    section_scores = {}
    field_checklist = {}

    # Define scoring weights for different sections
    weights = {
        "required_fields": 20,
        "metadata": 20,
        "component_basic": 20,
        "component_model_card": 30,
        "external_references": 10,
    }

    # Required Fields
    required_fields = ["bomFormat", "specVersion", "serialNumber", "version"]
    required_present = [field for field in required_fields if field in aibom]
    required_score = (len(required_present) / len(required_fields)) * weights["required_fields"]
    section_scores["required_fields"] = round(required_score)
    for field in required_fields:
        field_checklist[field] = "✔" if field in required_present else "✘"

    # Metadata Fields
    metadata_score = 0
    if "metadata" in aibom:
        metadata_fields = ["timestamp", "tools", "authors", "component"]
        present = [field for field in metadata_fields if field in aibom["metadata"]]
        metadata_score = (len(present) / len(metadata_fields)) * weights["metadata"]
        for field in metadata_fields:
            field_checklist[f"metadata.{field}"] = "✔" if field in present else "✘"
    section_scores["metadata"] = round(metadata_score)

    # Component Basic Info
    component_score = 0
    # Fall back to an empty component if the components list is missing or empty
    components = aibom.get("components") or [{}]
    component = components[0]
    component_fields = ["type", "name", "bom-ref", "purl", "description", "licenses"]
    present = [field for field in component_fields if field in component]
    component_score = (len(present) / len(component_fields)) * weights["component_basic"]
    section_scores["component_basic"] = round(component_score)
    for field in component_fields:
        field_checklist[f"component.{field}"] = "✔" if field in present else "✘"

    # Model Card Section
    model_card_score = 0
    model_card_fields = ["modelParameters", "quantitativeAnalysis", "considerations"]
    if "modelCard" in component:
        model_card = component["modelCard"]
        present = [field for field in model_card_fields if field in model_card]
        model_card_score = (len(present) / len(model_card_fields)) * weights["component_model_card"]
        for field in model_card_fields:
            field_checklist[f"modelCard.{field}"] = "✔" if field in present else "✘"
    else:
        for field in model_card_fields:
            field_checklist[f"modelCard.{field}"] = "✘"
    section_scores["component_model_card"] = round(model_card_score)

    # External References
    ext_score = weights["external_references"] if aibom.get("externalReferences") else 0
    section_scores["external_references"] = round(ext_score)
    field_checklist["externalReferences"] = "✔" if ext_score else "✘"

    # Final total score
    total_score = round(sum(section_scores.values()))

    return {
        "total_score": total_score,
        "section_scores": section_scores,
        "field_checklist": field_checklist
    }
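
# Illustrative result shape (a sketch; the exact numbers depend on which fields
# the supplied AIBOM actually contains):
#
#     {
#         "total_score": 57,
#         "section_scores": {"required_fields": 20, "metadata": 10, "component_basic": 7,
#                            "component_model_card": 10, "external_references": 10},
#         "field_checklist": {"bomFormat": "✔", "metadata.authors": "✘", ...},
#     }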


def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two metadata dictionaries, giving priority to the primary dictionary.

    Args:
        primary: Primary metadata dictionary
        secondary: Secondary metadata dictionary

    Returns:
        Merged metadata dictionary
    """
    result = secondary.copy()
    for key, value in primary.items():
        if value is not None:
            if key in result and isinstance(value, dict) and isinstance(result[key], dict):
                result[key] = merge_metadata(value, result[key])
            else:
                result[key] = value
    return result
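
# Illustrative behaviour (a sketch; nested dicts are merged recursively, the
# primary value wins on conflicts, and None values in primary are ignored):
#
#     merge_metadata({"license": "MIT", "tags": None, "card": {"lang": "en"}},
#                    {"license": "Apache-2.0", "tags": ["nlp"], "card": {"lang": "fr", "lib": "pt"}})
#     # -> {"license": "MIT", "tags": ["nlp"], "card": {"lang": "en", "lib": "pt"}}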


def extract_model_id_parts(model_id: str) -> Dict[str, str]:
    """
    Extract parts from a Hugging Face model ID.

    Args:
        model_id: Hugging Face model ID (e.g., "google/bert-base-uncased")

    Returns:
        Dictionary with parts (owner, name)
    """
    parts = model_id.split("/")
    if len(parts) == 1:
        return {
            "owner": None,
            "name": parts[0],
        }
    else:
        return {
            "owner": parts[0],
            "name": "/".join(parts[1:]),
        }
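
# Illustrative behaviour (a sketch):
#
#     extract_model_id_parts("google/bert-base-uncased")  # -> {"owner": "google", "name": "bert-base-uncased"}
#     extract_model_id_parts("gpt2")                       # -> {"owner": None, "name": "gpt2"}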


def create_purl(model_id: str) -> str:
    """
    Create a Package URL (purl) for a Hugging Face model.

    Args:
        model_id: Hugging Face model ID

    Returns:
        Package URL string
    """
    parts = extract_model_id_parts(model_id)
    if parts["owner"]:
        return f"pkg:huggingface/{parts['owner']}/{parts['name']}"
    else:
        return f"pkg:huggingface/{parts['name']}"