import json import uuid import datetime from typing import Dict, Optional, Any, List from huggingface_hub import HfApi, ModelCard from urllib.parse import urlparse from .utils import calculate_completeness_score class AIBOMGenerator: def __init__( self, hf_token: Optional[str] = None, inference_model_url: Optional[str] = None, use_inference: bool = True, cache_dir: Optional[str] = None, use_best_practices: bool = True, # Added parameter for industry-neutral scoring ): self.hf_api = HfApi(token=hf_token) self.inference_model_url = inference_model_url self.use_inference = use_inference self.cache_dir = cache_dir self.enhancement_report = None # Store enhancement report as instance variable self.use_best_practices = use_best_practices # Store best practices flag def generate_aibom( self, model_id: str, output_file: Optional[str] = None, include_inference: Optional[bool] = None, use_best_practices: Optional[bool] = None, # Added parameter for industry-neutral scoring ) -> Dict[str, Any]: try: model_id = self._normalise_model_id(model_id) use_inference = include_inference if include_inference is not None else self.use_inference # Use method parameter if provided, otherwise use instance variable use_best_practices = use_best_practices if use_best_practices is not None else self.use_best_practices model_info = self._fetch_model_info(model_id) model_card = self._fetch_model_card(model_id) # Store original metadata before any AI enhancement original_metadata = self._extract_structured_metadata(model_id, model_info, model_card) # Create initial AIBOM with original metadata original_aibom = self._create_aibom_structure(model_id, original_metadata) # Calculate initial score with industry-neutral approach if enabled original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices) # Final metadata starts with original metadata final_metadata = original_metadata.copy() if original_metadata else {} # Apply AI enhancement if requested ai_enhanced = False ai_model_name = None if use_inference and self.inference_model_url: try: # Extract additional metadata using AI enhanced_metadata = self._extract_unstructured_metadata(model_card, model_id) # If we got enhanced metadata, merge it with original if enhanced_metadata: ai_enhanced = True ai_model_name = "BERT-base-uncased" # Will be replaced with actual model name # Merge enhanced metadata with original (enhanced takes precedence) for key, value in enhanced_metadata.items(): if value is not None and (key not in final_metadata or not final_metadata[key]): final_metadata[key] = value except Exception as e: print(f"Error during AI enhancement: {e}") # Continue with original metadata if enhancement fails # Create final AIBOM with potentially enhanced metadata aibom = self._create_aibom_structure(model_id, final_metadata) # Calculate final score with industry-neutral approach if enabled final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices) # Ensure metadata.properties exists if "metadata" in aibom and "properties" not in aibom["metadata"]: aibom["metadata"]["properties"] = [] # Note: Quality score information is no longer added to the AIBOM metadata # This was removed as requested by the user if output_file: with open(output_file, 'w') as f: json.dump(aibom, f, indent=2) # Create enhancement report for UI display and store as instance variable self.enhancement_report = { "ai_enhanced": ai_enhanced, "ai_model": ai_model_name if ai_enhanced else None, "original_score": original_score, "final_score": final_score, "improvement": round(final_score["total_score"] - original_score["total_score"], 2) if ai_enhanced else 0 } # Return only the AIBOM to maintain compatibility with existing code return aibom except Exception as e: print(f"Error generating AIBOM: {e}") # Return a minimal valid AIBOM structure in case of error return self._create_minimal_aibom(model_id) def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]: """Create a minimal valid AIBOM structure in case of errors""" return { "bomFormat": "CycloneDX", "specVersion": "1.6", "serialNumber": f"urn:uuid:{str(uuid.uuid4())}", "version": 1, "metadata": { "timestamp": datetime.datetime.utcnow().isoformat() + "Z", "tools": { "components": [{ "bom-ref": "pkg:generic/@cybeats/aetheris-aibom-generator@0.1.0", "type": "application", "name": "aetheris-aibom-generator", "version": "0.1.0", "manufacturer": { "name": "Aetheris AI" } }] }, "component": { "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}@1.0", "type": "application", "name": model_id.split("/")[-1], "description": f"AI model {model_id}", "version": "1.0", "purl": f"pkg:generic/{model_id.replace('/', '%2F')}@1.0", "copyright": "NOASSERTION" } }, "components": [{ "bom-ref": f"pkg:huggingface/{model_id.replace('/', '/')}@1.0", "type": "machine-learning-model", "name": model_id.split("/")[-1], "version": "1.0", "purl": f"pkg:huggingface/{model_id.replace('/', '/')}@1.0" }], "dependencies": [{ "ref": f"pkg:generic/{model_id.replace('/', '%2F')}@1.0", "dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@1.0"] }] } def get_enhancement_report(self): """Return the enhancement report from the last generate_aibom call""" return self.enhancement_report def _fetch_model_info(self, model_id: str) -> Dict[str, Any]: try: return self.hf_api.model_info(model_id) except Exception as e: print(f"Error fetching model info for {model_id}: {e}") return {} # ---- new helper --------------------------------------------------------- @staticmethod def _normalise_model_id(raw_id: str) -> str: """ Accept either 'owner/model' or a full URL like 'https://huggingface.co/owner/model'. Return 'owner/model'. """ if raw_id.startswith(("http://", "https://")): path = urlparse(raw_id).path.lstrip("/") # path can contain extra segments (e.g. /commit/...), keep first two parts = path.split("/") if len(parts) >= 2: return "/".join(parts[:2]) return path return raw_id # ------------------------------------------------------------------------- def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]: try: return ModelCard.load(model_id) except Exception as e: print(f"Error fetching model card for {model_id}: {e}") return None def _create_aibom_structure( self, model_id: str, metadata: Dict[str, Any], ) -> Dict[str, Any]: # Extract owner and model name from model_id parts = model_id.split("/") group = parts[0] if len(parts) > 1 else "" name = parts[1] if len(parts) > 1 else parts[0] # Get version from metadata or use default version = metadata.get("commit", "1.0") aibom = { "bomFormat": "CycloneDX", "specVersion": "1.6", "serialNumber": f"urn:uuid:{str(uuid.uuid4())}", "version": 1, "metadata": self._create_metadata_section(model_id, metadata), "components": [self._create_component_section(model_id, metadata)], "dependencies": [ { "ref": f"pkg:generic/{model_id.replace('/', '%2F')}@{version}", "dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"] } ] } # Add downloadLocation if available if metadata and "commit_url" in metadata: # Add external reference for downloadLocation if "externalReferences" not in aibom: aibom["externalReferences"] = [] aibom["externalReferences"].append({ "type": "distribution", "url": f"https://huggingface.co/{model_id}" }) return aibom def _extract_structured_metadata( self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard], ) -> Dict[str, Any]: metadata = {} if model_info: try: metadata.update({ "name": model_info.modelId.split("/")[-1] if hasattr(model_info, "modelId") else model_id.split("/")[-1], "author": model_info.author if hasattr(model_info, "author") else None, "tags": model_info.tags if hasattr(model_info, "tags") else [], "pipeline_tag": model_info.pipeline_tag if hasattr(model_info, "pipeline_tag") else None, "downloads": model_info.downloads if hasattr(model_info, "downloads") else 0, "last_modified": model_info.lastModified if hasattr(model_info, "lastModified") else None, "commit": model_info.sha[:7] if hasattr(model_info, "sha") and model_info.sha else None, "commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if hasattr(model_info, "sha") and model_info.sha else None, }) except Exception as e: print(f"Error extracting model info metadata: {e}") if model_card and hasattr(model_card, "data") and model_card.data: try: card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {} metadata.update({ "language": card_data.get("language"), "license": card_data.get("license"), "library_name": card_data.get("library_name"), "base_model": card_data.get("base_model"), "datasets": card_data.get("datasets"), "model_name": card_data.get("model_name"), "tags": card_data.get("tags", metadata.get("tags", [])), "description": card_data.get("model_summary", None) }) if hasattr(model_card.data, "eval_results") and model_card.data.eval_results: metadata["eval_results"] = model_card.data.eval_results except Exception as e: print(f"Error extracting model card metadata: {e}") metadata["ai:type"] = "Transformer" metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation") metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown" # Add fields for industry-neutral scoring (silently aligned with SPDX) metadata["primaryPurpose"] = metadata.get("ai:task", "Text Generation") metadata["suppliedBy"] = metadata.get("author", "Unknown") # Add typeOfModel field metadata["typeOfModel"] = metadata.get("ai:type", "Transformer") return {k: v for k, v in metadata.items() if v is not None} def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]: """ Extract additional metadata from model card using BERT model. This is a placeholder implementation that would be replaced with actual BERT inference. In a real implementation, this would: 1. Extract text from model card 2. Use BERT to identify key information 3. Structure the extracted information For now, we'll simulate this with some basic extraction logic. """ enhanced_metadata = {} # In a real implementation, we would use a BERT model here # Since we can't install the required libraries due to space constraints, # we'll simulate the enhancement with a placeholder implementation if model_card and hasattr(model_card, "text") and model_card.text: try: card_text = model_card.text # Simulate BERT extraction with basic text analysis # In reality, this would be done with NLP models # Extract description if missing if card_text and "description" not in enhanced_metadata: # Take first paragraph that's longer than 20 chars as description paragraphs = [p.strip() for p in card_text.split('\n\n')] for p in paragraphs: if len(p) > 20 and not p.startswith('#'): enhanced_metadata["description"] = p break # Extract limitations if present if "limitations" not in enhanced_metadata: if "## Limitations" in card_text: limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip() if limitations_section: enhanced_metadata["limitations"] = limitations_section # Extract ethical considerations if present if "ethical_considerations" not in enhanced_metadata: for heading in ["## Ethical Considerations", "## Ethics", "## Bias"]: if heading in card_text: section = card_text.split(heading)[1].split("##")[0].strip() if section: enhanced_metadata["ethical_considerations"] = section break # Extract risks if present if "risks" not in enhanced_metadata: if "## Risks" in card_text: risks_section = card_text.split("## Risks")[1].split("##")[0].strip() if risks_section: enhanced_metadata["risks"] = risks_section # Extract datasets if present if "datasets" not in enhanced_metadata: datasets = [] if "## Dataset" in card_text or "## Datasets" in card_text: dataset_section = "" if "## Dataset" in card_text: dataset_section = card_text.split("## Dataset")[1].split("##")[0].strip() elif "## Datasets" in card_text: dataset_section = card_text.split("## Datasets")[1].split("##")[0].strip() if dataset_section: # Simple parsing to extract dataset names lines = dataset_section.split("\n") for line in lines: if line.strip() and not line.startswith("#"): datasets.append({ "type": "dataset", "name": line.strip().split()[0] if line.strip().split() else "Unknown", "description": line.strip() }) if datasets: enhanced_metadata["datasets"] = datasets except Exception as e: print(f"Error extracting unstructured metadata: {e}") return enhanced_metadata def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]: timestamp = datetime.datetime.utcnow().isoformat() + "Z" # Get version from metadata or use default version = metadata.get("commit", "1.0") # Create tools section with components array tools = { "components": [{ "bom-ref": "pkg:generic/@cybeats/aetheris-aibom-generator@0.1.0", "type": "application", "name": "aetheris-aibom-generator", "version": "1.0", "manufacturer": { "name": "Aetheris AI" } }] } # Create authors array authors = [] if "author" in metadata and metadata["author"]: authors.append({ "name": metadata["author"] }) # Create component section for metadata component = { "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}@{version}", "type": "application", "name": metadata.get("name", model_id.split("/")[-1]), "description": metadata.get("description", f"AI model {model_id}"), "version": version, "purl": f"pkg:generic/{model_id.replace('/', '%2F')}@{version}" } # Add authors to component if available if authors: component["authors"] = authors # Add publisher and supplier if author is available if "author" in metadata and metadata["author"]: component["publisher"] = metadata["author"] component["supplier"] = { "name": metadata["author"] } component["manufacturer"] = { "name": metadata["author"] } # Add copyright component["copyright"] = "NOASSERTION" # Create properties array for additional metadata properties = [] for key, value in metadata.items(): if key not in ["name", "author", "license", "description", "commit"] and value is not None: if isinstance(value, (list, dict)): if not isinstance(value, str): value = json.dumps(value) properties.append({"name": key, "value": str(value)}) # Assemble metadata section metadata_section = { "timestamp": timestamp, "tools": tools, "component": component } if properties: metadata_section["properties"] = properties return metadata_section def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]: # Extract owner and model name from model_id parts = model_id.split("/") group = parts[0] if len(parts) > 1 else "" name = parts[1] if len(parts) > 1 else parts[0] # Get version from metadata or use default version = metadata.get("commit", "1.0") # Create PURL with version information if commit is available purl = f"pkg:huggingface/{model_id.replace('/', '/')}" if "commit" in metadata: purl = f"{purl}@{metadata['commit']}" else: purl = f"{purl}@{version}" component = { "bom-ref": f"pkg:huggingface/{model_id.replace('/', '/')}@{version}", "type": "machine-learning-model", "group": group, "name": name, "version": version, "purl": purl } # Add licenses if available if "license" in metadata: component["licenses"] = [{ "license": { "id": metadata["license"], "url": self._get_license_url(metadata["license"]) } }] # Add description if available if "description" in metadata: component["description"] = metadata["description"] # Add external references external_refs = [{ "type": "website", "url": f"https://huggingface.co/{model_id}" }] if "commit_url" in metadata: external_refs.append({ "type": "vcs", "url": metadata["commit_url"] }) component["externalReferences"] = external_refs # Add authors, publisher, supplier, manufacturer if "author" in metadata and metadata["author"]: component["authors"] = [{"name": metadata["author"]}] component["publisher"] = metadata["author"] component["supplier"] = { "name": metadata["author"], "url": [f"https://huggingface.co/{metadata['author']}"] } component["manufacturer"] = { "name": metadata["author"], "url": [f"https://huggingface.co/{metadata['author']}"] } # Add copyright component["copyright"] = "NOASSERTION" # Add model card section component["modelCard"] = self._create_model_card_section(metadata) return component def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]: model_card_section = {} # Add quantitative analysis section if "eval_results" in metadata: model_card_section["quantitativeAnalysis"] = { "performanceMetrics": metadata["eval_results"], "graphics": {} # Empty graphics object as in the example } else: model_card_section["quantitativeAnalysis"] = {"graphics": {}} # Add properties section properties = [] for key, value in metadata.items(): if key in ["author", "library_name", "license", "downloads", "likes", "tags", "created_at", "last_modified"]: properties.append({"name": key, "value": str(value)}) if properties: model_card_section["properties"] = properties # Create model parameters section model_parameters = {} # Add outputs array model_parameters["outputs"] = [{"format": "generated-text"}] # Add task model_parameters["task"] = metadata.get("pipeline_tag", "text-generation") # Add architecture information model_parameters["architectureFamily"] = "llama" if "llama" in metadata.get("name", "").lower() else "transformer" model_parameters["modelArchitecture"] = f"{metadata.get('name', 'Unknown')}ForCausalLM" # Add datasets array with proper structure if "datasets" in metadata: datasets = [] if isinstance(metadata["datasets"], list): for dataset in metadata["datasets"]: if isinstance(dataset, str): datasets.append({ "type": "dataset", "name": dataset, "description": f"Dataset used for training {metadata.get('name', 'the model')}" }) elif isinstance(dataset, dict) and "name" in dataset: # Ensure dataset has the required structure dataset_entry = { "type": dataset.get("type", "dataset"), "name": dataset["name"], "description": dataset.get("description", f"Dataset: {dataset['name']}") } datasets.append(dataset_entry) elif isinstance(metadata["datasets"], str): datasets.append({ "type": "dataset", "name": metadata["datasets"], "description": f"Dataset used for training {metadata.get('name', 'the model')}" }) if datasets: model_parameters["datasets"] = datasets # Add inputs array model_parameters["inputs"] = [{"format": "text"}] # Add model parameters to model card section model_card_section["modelParameters"] = model_parameters # Add considerations section considerations = {} for k in ["limitations", "ethical_considerations", "bias", "risks"]: if k in metadata: considerations[k] = metadata[k] if considerations: model_card_section["considerations"] = considerations return model_card_section def _get_license_url(self, license_id: str) -> str: """Get the URL for a license based on its SPDX ID.""" license_urls = { "Apache-2.0": "https://www.apache.org/licenses/LICENSE-2.0", "MIT": "https://opensource.org/licenses/MIT", "BSD-3-Clause": "https://opensource.org/licenses/BSD-3-Clause", "GPL-3.0": "https://www.gnu.org/licenses/gpl-3.0.en.html", "CC-BY-4.0": "https://creativecommons.org/licenses/by/4.0/", "CC-BY-SA-4.0": "https://creativecommons.org/licenses/by-sa/4.0/", "CC-BY-NC-4.0": "https://creativecommons.org/licenses/by-nc/4.0/", "CC-BY-ND-4.0": "https://creativecommons.org/licenses/by-nd/4.0/", "CC-BY-NC-SA-4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/", "CC-BY-NC-ND-4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", "LGPL-3.0": "https://www.gnu.org/licenses/lgpl-3.0.en.html", "MPL-2.0": "https://www.mozilla.org/en-US/MPL/2.0/", } return license_urls.get(license_id, "https://spdx.org/licenses/")