Spaces:
Running
Running
import json | |
import uuid | |
import datetime | |
from typing import Dict, Optional, Any | |
from huggingface_hub import HfApi, ModelCard | |
from utils import calculate_completeness_score | |
class AIBOMGenerator:
    """Generate a CycloneDX 1.6 AI bill of materials (AIBOM) for a Hugging Face model.

    The generator pulls repository info through ``HfApi`` and the model card
    through ``ModelCard.load``, flattens both into a metadata dict, and renders
    that dict into the ``metadata`` / ``components`` / ``dependencies`` sections
    of a CycloneDX document.
    """

    # Keys rendered as dedicated fields, so they are not repeated as properties.
    _DEDICATED_KEYS = ("name", "author", "license")
    # Keys copied into ``modelCard.modelParameters``.
    _MODEL_PARAMETER_KEYS = ("base_model", "library_name", "pipeline_tag")
    # Keys copied into ``modelCard.considerations``.
    _CONSIDERATION_KEYS = ("limitations", "ethical_considerations", "bias", "risks")

    def __init__(
        self,
        hf_token: Optional[str] = None,
        inference_model_url: Optional[str] = None,
        use_inference: bool = True,
        cache_dir: Optional[str] = None,
    ):
        """Store configuration and create the Hub API client.

        Args:
            hf_token: Token passed to ``HfApi``; ``None`` means anonymous access.
            inference_model_url: Endpoint gating unstructured extraction; when
                falsy, unstructured extraction is skipped.
            use_inference: Default for ``generate_aibom(include_inference=None)``.
            cache_dir: Stored on the instance; not read by any method in this class.
        """
        self.hf_api = HfApi(token=hf_token)
        self.inference_model_url = inference_model_url
        self.use_inference = use_inference
        self.cache_dir = cache_dir

    def generate_aibom(
        self,
        model_id: str,
        output_file: Optional[str] = None,
        include_inference: Optional[bool] = None,
    ) -> Dict[str, Any]:
        """Build the AIBOM for ``model_id`` and optionally write it as JSON.

        Args:
            model_id: Hub repository id, e.g. ``"org/model"``.
            output_file: When truthy, the document is written there as
                indented JSON.
            include_inference: Per-call override of ``self.use_inference``;
                ``None`` keeps the instance default.

        Returns:
            The AIBOM document as a dict.
        """
        use_inference = self.use_inference if include_inference is None else include_inference
        model_info = self._fetch_model_info(model_id)
        model_card = self._fetch_model_card(model_id)
        aibom = self._create_aibom_structure(model_id, model_info, model_card, use_inference)
        if output_file:
            # Explicit encoding keeps the output independent of the platform locale.
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(aibom, f, indent=2)
        return aibom

    def _fetch_model_info(self, model_id: str) -> Any:
        """Return ``hf_api.model_info(model_id)``, or ``{}`` if the call fails.

        Best-effort by design: the error is printed and generation continues
        with whatever other metadata is available.
        """
        try:
            return self.hf_api.model_info(model_id)
        except Exception as e:
            print(f"Error fetching model info for {model_id}: {e}")
            return {}

    def _fetch_model_card(self, model_id: str) -> Optional["ModelCard"]:
        """Return the loaded model card, or ``None`` if loading fails (error is printed)."""
        try:
            return ModelCard.load(model_id)
        except Exception as e:
            print(f"Error fetching model card for {model_id}: {e}")
            return None

    @staticmethod
    def _model_purl(model_id: str) -> str:
        """Return the generic package URL used as bom-ref/purl for ``model_id``."""
        return f"pkg:generic/{model_id.replace('/', '%2F')}"

    @staticmethod
    def _first_attr(obj: Any, *names: str) -> Any:
        """Return the first non-``None`` attribute of ``obj`` among ``names``, else ``None``."""
        for name in names:
            value = getattr(obj, name, None)
            if value is not None:
                return value
        return None

    @staticmethod
    def _to_properties(metadata: Dict[str, Any], skip) -> list:
        """Render metadata entries as CycloneDX ``{"name", "value"}`` properties.

        Keys in ``skip`` and ``None`` values are omitted. Lists and dicts are
        JSON-encoded; ``default=str`` keeps encoding from raising on members
        that are not JSON types (e.g. eval-result objects or datetimes).
        """
        properties = []
        for key, value in metadata.items():
            if key in skip or value is None:
                continue
            if isinstance(value, (list, dict)):
                value = json.dumps(value, default=str)
            properties.append({"name": key, "value": str(value)})
        return properties

    @staticmethod
    def _format_performance_metrics(eval_results: Any) -> Any:
        """Convert eval results into JSON-serializable metric dicts.

        Dict entries pass through unchanged. Objects exposing ``metric_type`` /
        ``metric_value`` become ``{"type": ..., "value": ...}`` with string
        values; anything else is stringified so the final document can always
        be dumped to JSON. Non-list input is returned untouched.
        """
        if not isinstance(eval_results, (list, tuple)):
            return eval_results
        metrics = []
        for result in eval_results:
            if isinstance(result, dict):
                metrics.append(result)
                continue
            entry = {}
            metric_type = getattr(result, "metric_type", None)
            metric_value = getattr(result, "metric_value", None)
            if metric_type is not None:
                entry["type"] = str(metric_type)
            if metric_value is not None:
                entry["value"] = str(metric_value)
            metrics.append(entry or {"value": str(result)})
        return metrics

    def _create_aibom_structure(
        self,
        model_id: str,
        model_info: Any,
        model_card: Optional["ModelCard"],
        use_inference: bool,
    ) -> Dict[str, Any]:
        """Assemble the top-level CycloneDX document.

        Unstructured metadata is only extracted when inference is enabled, a
        model card is available and an inference URL is configured; structured
        values win on key collisions.
        """
        metadata = self._extract_structured_metadata(model_id, model_info, model_card)
        if use_inference and model_card and self.inference_model_url:
            unstructured_metadata = self._extract_unstructured_metadata(model_card)
            metadata = {**unstructured_metadata, **metadata}
        return {
            "bomFormat": "CycloneDX",
            "specVersion": "1.6",
            "serialNumber": f"urn:uuid:{str(uuid.uuid4())}",
            "version": 1,
            "metadata": self._create_metadata_section(model_id, metadata),
            "components": [self._create_component_section(model_id, metadata)],
            "dependencies": [
                {
                    "ref": self._model_purl(model_id),
                    # NOTE(review): this literal looks like an e-mail-obfuscation
                    # artifact of a "name@version" purl; restore the intended
                    # package/version from the upstream source.
                    "dependsOn": ["pkg:pypi/[email protected]"],
                }
            ],
        }

    def _extract_structured_metadata(
        self,
        model_id: str,
        model_info: Any,
        model_card: Optional["ModelCard"],
    ) -> Dict[str, Any]:
        """Flatten Hub model info and model-card data into one metadata dict.

        Args:
            model_id: Repository id, used as fallback for the name and for the
                commit URL.
            model_info: Object returned by ``_fetch_model_info`` (falsy when the
                fetch failed).
            model_card: Loaded model card or ``None``.

        Returns:
            Metadata with every ``None`` value removed. Always contains
            ``ai:type``, ``ai:task`` and ``ai:framework``.
        """
        metadata: Dict[str, Any] = {}
        if model_info:
            # Accept both the legacy camelCase and the snake_case attribute names.
            repo_id = self._first_attr(model_info, "modelId", "id") or model_id
            sha = getattr(model_info, "sha", None)
            metadata.update({
                "name": repo_id.split("/")[-1],
                "author": getattr(model_info, "author", None),
                "tags": getattr(model_info, "tags", []),
                "pipeline_tag": getattr(model_info, "pipeline_tag", None),
                "downloads": getattr(model_info, "downloads", 0),
                "last_modified": self._first_attr(model_info, "lastModified", "last_modified"),
                "commit": sha[:7] if sha else None,
                "commit_url": f"https://huggingface.co/{model_id}/commit/{sha}" if sha else None,
            })
        if model_card and model_card.data:
            card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
            metadata.update({
                "language": card_data.get("language"),
                "license": card_data.get("license"),
                "library_name": card_data.get("library_name"),
                "base_model": card_data.get("base_model"),
                "datasets": card_data.get("datasets"),
                "model_name": card_data.get("model_name"),
                "tags": card_data.get("tags", metadata.get("tags", [])),
            })
            if hasattr(model_card.data, "eval_results") and model_card.data.eval_results:
                metadata["eval_results"] = model_card.data.eval_results
        metadata["ai:type"] = "Transformer"
        # The keys above may be present with a None value, so ``.get(key, default)``
        # would return None; ``or`` applies the default in that case as well.
        metadata["ai:task"] = metadata.get("pipeline_tag") or "Text Generation"
        library_name = metadata.get("library_name") or ""
        metadata["ai:framework"] = "PyTorch" if "transformers" in library_name else "Unknown"
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_unstructured_metadata(self, model_card: "ModelCard") -> Dict[str, Any]:
        """Placeholder for inference-based extraction; currently always returns ``{}``."""
        return {}

    def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the BOM-level ``metadata`` section, including quality-score properties.

        The completeness score is computed by ``calculate_completeness_score``
        on a stub document that contains only the extracted metadata.
        """
        aibom_stub = {"metadata": metadata}  # Build stub for scoring
        score_report = calculate_completeness_score(aibom_stub)
        # Timezone-aware replacement for the deprecated ``utcnow()``; the
        # "+00:00" offset is rewritten to the "Z" suffix used previously.
        timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")
        tools = [{
            "vendor": "Aetheris AI",
            "name": "aibom-generator",
            "version": "0.1.0",
        }]
        authors = []
        if metadata.get("author"):
            authors.append({
                "name": metadata["author"],
                "url": f"https://huggingface.co/{metadata['author']}",
            })
        component = {
            "type": "machine-learning-model",
            "name": metadata.get("name", model_id.split("/")[-1]),
            "bom-ref": self._model_purl(model_id),
        }
        properties = self._to_properties(metadata, self._DEDICATED_KEYS)
        # Add quality scoring results
        properties.append({"name": "aibom:quality-score", "value": str(score_report["total_score"])})
        properties.append({"name": "aibom:quality-breakdown", "value": json.dumps(score_report["section_scores"])})
        properties.append({"name": "aibom:field-checklist", "value": json.dumps(score_report["field_checklist"])})
        metadata_section = {
            "timestamp": timestamp,
            "tools": tools,
            "component": component,
        }
        if authors:
            metadata_section["authors"] = authors
        if properties:
            metadata_section["properties"] = properties
        return metadata_section

    def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the ``machine-learning-model`` component entry for the model.

        Optional fields (description, version, licenses, vcs reference) are
        only emitted when the corresponding metadata key is present.
        """
        purl = self._model_purl(model_id)
        component = {
            "type": "machine-learning-model",
            "bom-ref": purl,
            "name": metadata.get("name", model_id.split("/")[-1]),
            "purl": purl,
        }
        if "description" in metadata:
            component["description"] = metadata["description"]
        if "commit" in metadata:
            component["version"] = metadata["commit"]
        if "license" in metadata:
            component["licenses"] = [{"license": {"id": metadata["license"]}}]
        external_refs = [{
            "type": "website",
            "url": f"https://huggingface.co/{model_id}",
        }]
        if "commit_url" in metadata:
            external_refs.append({
                "type": "vcs",
                "url": metadata["commit_url"],
            })
        component["externalReferences"] = external_refs
        component["modelCard"] = self._create_model_card_section(metadata)
        return component

    def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the component's ``modelCard`` object from the metadata dict.

        Sub-sections are only added when they would be non-empty. Everything
        not placed in a dedicated sub-section is emitted as a property.
        """
        model_card_section: Dict[str, Any] = {}
        model_parameters = {k: metadata[k] for k in self._MODEL_PARAMETER_KEYS if k in metadata}
        if model_parameters:
            model_card_section["modelParameters"] = model_parameters
        if "eval_results" in metadata:
            # Raw eval-result objects are not JSON-serializable; convert them so
            # the finished document can be dumped.
            model_card_section["quantitativeAnalysis"] = {
                "performanceMetrics": self._format_performance_metrics(metadata["eval_results"])
            }
        considerations = {k: metadata[k] for k in self._CONSIDERATION_KEYS if k in metadata}
        if considerations:
            model_card_section["considerations"] = considerations
        handled = (
            self._DEDICATED_KEYS
            + self._MODEL_PARAMETER_KEYS
            + ("eval_results",)
            + self._CONSIDERATION_KEYS
        )
        properties = self._to_properties(metadata, handled)
        if properties:
            model_card_section["properties"] = properties
        return model_card_section