# Source: Hugging Face Space "aetheris-ai/aibom-generator" (status: Running)
# File size: 9,407 Bytes

import json
import uuid
import datetime
from typing import Dict, Optional, Any

from huggingface_hub import HfApi, ModelCard
from utils import calculate_completeness_score


class AIBOMGenerator:
    """Generate a CycloneDX-style AI bill of materials (AIBOM) for a Hub model.

    The generator fetches the model info and the model card for a model id,
    flattens what it finds into a metadata dict, and renders that dict into a
    document with ``metadata``, ``components`` and ``dependencies`` sections.
    """

    # Metadata keys that are rendered elsewhere in the metadata section and
    # therefore not repeated as generic properties.
    _METADATA_PROPERTY_EXCLUSIONS = frozenset({"name", "author", "license"})

    # Metadata keys copied into ``modelCard.modelParameters``.
    _MODEL_PARAMETER_KEYS = ("base_model", "library_name", "pipeline_tag")

    # Metadata keys copied into ``modelCard.considerations``.
    _CONSIDERATION_KEYS = ("limitations", "ethical_considerations", "bias", "risks")

    def __init__(
        self,
        hf_token: Optional[str] = None,
        inference_model_url: Optional[str] = None,
        use_inference: bool = True,
        cache_dir: Optional[str] = None,
    ):
        """Store configuration and create the Hub API client.

        Args:
            hf_token: Optional access token passed to ``HfApi``.
            inference_model_url: Optional URL; unstructured extraction only
                runs when this is set.
            use_inference: Default for whether unstructured extraction is
                attempted; can be overridden per call in ``generate_aibom``.
            cache_dir: Stored on the instance; not used by this class itself.
        """
        self.hf_api = HfApi(token=hf_token)
        self.inference_model_url = inference_model_url
        self.use_inference = use_inference
        self.cache_dir = cache_dir

    def generate_aibom(
        self,
        model_id: str,
        output_file: Optional[str] = None,
        include_inference: Optional[bool] = None,
    ) -> Dict[str, Any]:
        """Build the AIBOM for ``model_id`` and optionally write it to disk.

        Args:
            model_id: Hub repository id, e.g. ``"org/model"``.
            output_file: If given, the AIBOM is written there as indented JSON.
            include_inference: Per-call override of ``self.use_inference``;
                ``None`` means "use the instance default".

        Returns:
            The AIBOM as a dict.

        Raises:
            OSError: If ``output_file`` cannot be opened for writing.
        """
        use_inference = self.use_inference if include_inference is None else include_inference
        model_info = self._fetch_model_info(model_id)
        model_card = self._fetch_model_card(model_id)
        aibom = self._create_aibom_structure(model_id, model_info, model_card, use_inference)

        if output_file:
            # Explicit encoding so the result does not depend on the locale.
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(aibom, f, indent=2)

        return aibom

    def _fetch_model_info(self, model_id: str) -> Any:
        """Return the Hub model-info object, or ``{}`` if it cannot be fetched.

        Failures are reported on stdout and swallowed on purpose so that a
        partial AIBOM can still be produced.
        """
        try:
            return self.hf_api.model_info(model_id)
        except Exception as e:
            print(f"Error fetching model info for {model_id}: {e}")
            return {}

    def _fetch_model_card(self, model_id: str) -> Optional["ModelCard"]:
        """Return the model card, or ``None`` if it cannot be loaded.

        The token configured on ``self.hf_api`` is forwarded so that the card
        is fetched with the same credentials as the model info.
        """
        try:
            return ModelCard.load(model_id, token=getattr(self.hf_api, "token", None))
        except Exception as e:
            print(f"Error fetching model card for {model_id}: {e}")
            return None

    @staticmethod
    def _model_purl(model_id: str) -> str:
        """Return the generic package URL used as purl and bom-ref for a model."""
        return f"pkg:generic/{model_id.replace('/', '%2F')}"

    @classmethod
    def _to_json_safe(cls, value: Any) -> Any:
        """Recursively convert ``value`` into something ``json`` can serialize.

        Dicts, sequences and sets are converted element-wise, dates become
        ISO-8601 strings, objects exposing ``__dict__`` become dicts of their
        public attributes, and anything else falls back to ``str(value)``.
        """
        if value is None or isinstance(value, (str, int, float, bool)):
            return value
        if isinstance(value, dict):
            return {str(k): cls._to_json_safe(v) for k, v in value.items()}
        if isinstance(value, (list, tuple, set, frozenset)):
            return [cls._to_json_safe(v) for v in value]
        if isinstance(value, (datetime.datetime, datetime.date)):
            return value.isoformat()
        if hasattr(value, "__dict__"):
            return {
                k: cls._to_json_safe(v)
                for k, v in vars(value).items()
                if not k.startswith("_")
            }
        return str(value)

    def _build_properties(self, metadata: Dict[str, Any], excluded_keys) -> list:
        """Render metadata entries as ``{"name", "value"}`` string properties.

        Keys in ``excluded_keys`` and ``None`` values are skipped; lists and
        dicts are JSON-encoded, everything else goes through ``str``.
        """
        properties = []
        for key, value in metadata.items():
            if key in excluded_keys or value is None:
                continue
            if isinstance(value, (list, dict)):
                value = json.dumps(self._to_json_safe(value))
            properties.append({"name": key, "value": str(value)})
        return properties

    def _create_aibom_structure(
        self,
        model_id: str,
        model_info: Any,
        model_card: Optional["ModelCard"],
        use_inference: bool,
    ) -> Dict[str, Any]:
        """Assemble the top-level AIBOM document from the fetched inputs."""
        metadata = self._extract_structured_metadata(model_id, model_info, model_card)

        if use_inference and model_card and self.inference_model_url:
            unstructured_metadata = self._extract_unstructured_metadata(model_card)
            # Structured values win over unstructured ones on key collisions.
            metadata = {**unstructured_metadata, **metadata}

        aibom = {
            "bomFormat": "CycloneDX",
            "specVersion": "1.6",
            "serialNumber": f"urn:uuid:{uuid.uuid4()}",
            "version": 1,
            "metadata": self._create_metadata_section(model_id, metadata),
            "components": [self._create_component_section(model_id, metadata)],
            "dependencies": [
                {
                    "ref": self._model_purl(model_id),
                    # NOTE(review): this literal looks like it was mangled by
                    # e-mail obfuscation when the file was extracted (a purl
                    # normally reads "pkg:pypi/<name>@<version>"). Restore the
                    # real value from the upstream source.
                    "dependsOn": ["pkg:pypi/[email protected]"]
                }
            ]
        }

        return aibom

    def _extract_structured_metadata(
        self,
        model_id: str,
        model_info: Any,
        model_card: Optional["ModelCard"],
    ) -> Dict[str, Any]:
        """Flatten model info and model-card data into one metadata dict.

        Args:
            model_id: Hub repository id, used as a fallback for the name and
                to build the commit URL.
            model_info: Object whose attributes (``modelId``, ``author``,
                ``tags``, ...) are read if present; a falsy value is skipped.
            model_card: Card whose ``data`` is read if present; may be ``None``.

        Returns:
            Metadata dict with every ``None`` value removed.
        """
        metadata: Dict[str, Any] = {}

        if model_info:
            sha = getattr(model_info, "sha", None)
            repo_name = getattr(model_info, "modelId", None) or model_id
            # Accept both attribute spellings of the modification time.
            last_modified = (
                getattr(model_info, "last_modified", None)
                or getattr(model_info, "lastModified", None)
            )
            metadata.update({
                "name": repo_name.split("/")[-1],
                "author": getattr(model_info, "author", None),
                "tags": getattr(model_info, "tags", None) or [],
                "pipeline_tag": getattr(model_info, "pipeline_tag", None),
                "downloads": getattr(model_info, "downloads", 0),
                "last_modified": last_modified,
                "commit": sha[:7] if sha else None,
                "commit_url": f"https://huggingface.co/{model_id}/commit/{sha}" if sha else None,
            })

        if model_card and model_card.data:
            card = model_card.data
            card_data = card.to_dict() if hasattr(card, "to_dict") else {}
            metadata.update({
                "language": card_data.get("language"),
                "license": card_data.get("license"),
                "library_name": card_data.get("library_name"),
                "base_model": card_data.get("base_model"),
                "datasets": card_data.get("datasets"),
                "model_name": card_data.get("model_name"),
                "tags": card_data.get("tags", metadata.get("tags", [])),
            })
            eval_results = getattr(card, "eval_results", None)
            if eval_results:
                # Stored in JSON-safe form: the raw result objects would make
                # json.dump / json.dumps fail further down the pipeline.
                metadata["eval_results"] = self._to_json_safe(eval_results)

        metadata["ai:type"] = "Transformer"
        # ``or`` rather than a .get() default: the keys may be present with a
        # None value, in which case the fallback must still apply.
        metadata["ai:task"] = metadata.get("pipeline_tag") or "Text Generation"
        library_name = metadata.get("library_name") or ""
        metadata["ai:framework"] = "PyTorch" if "transformers" in library_name else "Unknown"

        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_unstructured_metadata(self, model_card: "ModelCard") -> Dict[str, Any]:
        """Placeholder for inference-based extraction; currently returns ``{}``."""
        return {}

    def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the BOM ``metadata`` section, including completeness scores.

        The score report returned by ``calculate_completeness_score`` is read
        for its ``total_score``, ``section_scores`` and ``field_checklist``
        entries, which are attached as ``aibom:*`` properties.
        """
        aibom_stub = {"metadata": metadata}  # Build stub for scoring
        score_report = calculate_completeness_score(aibom_stub)

        # Timezone-aware replacement for the deprecated utcnow(); tzinfo is
        # stripped again so the rendered text keeps the "<iso>Z" shape.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        timestamp = now_utc.replace(tzinfo=None).isoformat() + "Z"

        tools = [{
            "vendor": "Aetheris AI",
            "name": "aibom-generator",
            "version": "0.1.0"
        }]

        authors = []
        author = metadata.get("author")
        if author:
            authors.append({
                "name": author,
                "url": f"https://huggingface.co/{author}"
            })

        component = {
            "type": "machine-learning-model",
            "name": metadata.get("name", model_id.split("/")[-1]),
            "bom-ref": self._model_purl(model_id)
        }

        properties = self._build_properties(metadata, self._METADATA_PROPERTY_EXCLUSIONS)

        # Add quality scoring results
        properties.append({"name": "aibom:quality-score", "value": str(score_report["total_score"])})
        properties.append({"name": "aibom:quality-breakdown", "value": json.dumps(score_report["section_scores"])})
        properties.append({"name": "aibom:field-checklist", "value": json.dumps(score_report["field_checklist"])})

        metadata_section = {
            "timestamp": timestamp,
            "tools": tools,
            "component": component
        }

        if authors:
            metadata_section["authors"] = authors
        if properties:
            metadata_section["properties"] = properties

        return metadata_section

    def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the single ``components`` entry describing the model."""
        purl = self._model_purl(model_id)
        component = {
            "type": "machine-learning-model",
            "bom-ref": purl,
            "name": metadata.get("name", model_id.split("/")[-1]),
            "purl": purl
        }

        if "description" in metadata:
            component["description"] = metadata["description"]

        if "commit" in metadata:
            component["version"] = metadata["commit"]

        license_value = metadata.get("license")
        if license_value:
            # A card may carry one license or several; emit one entry each.
            if isinstance(license_value, (list, tuple)):
                license_ids = list(license_value)
            else:
                license_ids = [license_value]
            component["licenses"] = [{"license": {"id": lic}} for lic in license_ids]

        external_refs = [{
            "type": "website",
            "url": f"https://huggingface.co/{model_id}"
        }]
        if "commit_url" in metadata:
            external_refs.append({
                "type": "vcs",
                "url": metadata["commit_url"]
            })
        component["externalReferences"] = external_refs

        component["modelCard"] = self._create_model_card_section(metadata)

        return component

    def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the component's ``modelCard`` object from the metadata dict.

        Known keys are routed to ``modelParameters``, ``quantitativeAnalysis``
        and ``considerations``; all remaining keys become string properties.
        """
        model_card_section: Dict[str, Any] = {}

        model_parameters = {k: metadata[k] for k in self._MODEL_PARAMETER_KEYS if k in metadata}
        if model_parameters:
            model_card_section["modelParameters"] = model_parameters

        if "eval_results" in metadata:
            model_card_section["quantitativeAnalysis"] = {
                "performanceMetrics": self._to_json_safe(metadata["eval_results"])
            }

        considerations = {k: metadata[k] for k in self._CONSIDERATION_KEYS if k in metadata}
        if considerations:
            model_card_section["considerations"] = considerations

        already_rendered = (
            set(self._METADATA_PROPERTY_EXCLUSIONS)
            | set(self._MODEL_PARAMETER_KEYS)
            | set(self._CONSIDERATION_KEYS)
            | {"eval_results"}
        )
        properties = self._build_properties(metadata, already_rendered)
        if properties:
            model_card_section["properties"] = properties

        return model_card_section