Spaces:
Running
Running
Update src/aibom-generator/generator.py
Browse files- src/aibom-generator/generator.py +115 -145
src/aibom-generator/generator.py
CHANGED
@@ -81,12 +81,6 @@ class AIBOMGenerator:
|
|
81 |
# Calculate final score with industry-neutral approach if enabled
|
82 |
final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
|
83 |
|
84 |
-
# Ensure metadata.properties exists
|
85 |
-
if "metadata" in aibom and "properties" not in aibom["metadata"]:
|
86 |
-
aibom["metadata"]["properties"] = []
|
87 |
-
|
88 |
-
# Note: Quality score information is no longer added to the AIBOM metadata
|
89 |
-
# This was removed as requested by the user
|
90 |
|
91 |
if output_file:
|
92 |
with open(output_file, 'w') as f:
|
@@ -214,16 +208,17 @@ class AIBOMGenerator:
|
|
214 |
]
|
215 |
}
|
216 |
|
217 |
-
#
|
|
|
|
|
|
|
|
|
|
|
218 |
if metadata and "commit_url" in metadata:
|
219 |
-
# Add external reference for downloadLocation
|
220 |
-
if "externalReferences" not in aibom:
|
221 |
-
aibom["externalReferences"] = []
|
222 |
-
|
223 |
aibom["externalReferences"].append({
|
224 |
-
"type": "
|
225 |
-
"url":
|
226 |
-
})
|
227 |
|
228 |
return aibom
|
229 |
|
@@ -234,22 +229,30 @@ class AIBOMGenerator:
|
|
234 |
model_card: Optional[ModelCard],
|
235 |
) -> Dict[str, Any]:
|
236 |
metadata = {}
|
237 |
-
|
238 |
if model_info:
|
239 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
metadata.update({
|
241 |
-
"name":
|
242 |
-
"author":
|
243 |
-
"tags":
|
244 |
-
"pipeline_tag":
|
245 |
-
"downloads":
|
246 |
-
"last_modified":
|
247 |
-
"commit": model_info
|
248 |
-
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if
|
249 |
})
|
250 |
except Exception as e:
|
251 |
print(f"Error extracting model info metadata: {e}")
|
252 |
-
|
253 |
if model_card and hasattr(model_card, "data") and model_card.data:
|
254 |
try:
|
255 |
card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
|
@@ -267,104 +270,35 @@ class AIBOMGenerator:
|
|
267 |
metadata["eval_results"] = model_card.data.eval_results
|
268 |
except Exception as e:
|
269 |
print(f"Error extracting model card metadata: {e}")
|
270 |
-
|
271 |
metadata["ai:type"] = "Transformer"
|
272 |
metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
|
273 |
metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
|
|
|
|
|
|
280 |
metadata["typeOfModel"] = metadata.get("ai:type", "Transformer")
|
281 |
-
|
|
|
|
|
|
|
|
|
282 |
return {k: v for k, v in metadata.items() if v is not None}
|
|
|
283 |
|
284 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
285 |
"""
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
In a real implementation, this would:
|
290 |
-
1. Extract text from model card
|
291 |
-
2. Use BERT to identify key information
|
292 |
-
3. Structure the extracted information
|
293 |
-
|
294 |
-
For now, we'll simulate this with some basic extraction logic.
|
295 |
"""
|
296 |
-
|
297 |
|
298 |
-
# In a real implementation, we would use a BERT model here
|
299 |
-
# Since we can't install the required libraries due to space constraints,
|
300 |
-
# we'll simulate the enhancement with a placeholder implementation
|
301 |
-
|
302 |
-
if model_card and hasattr(model_card, "text") and model_card.text:
|
303 |
-
try:
|
304 |
-
card_text = model_card.text
|
305 |
-
|
306 |
-
# Simulate BERT extraction with basic text analysis
|
307 |
-
# In reality, this would be done with NLP models
|
308 |
-
|
309 |
-
# Extract description if missing
|
310 |
-
if card_text and "description" not in enhanced_metadata:
|
311 |
-
# Take first paragraph that's longer than 20 chars as description
|
312 |
-
paragraphs = [p.strip() for p in card_text.split('\n\n')]
|
313 |
-
for p in paragraphs:
|
314 |
-
if len(p) > 20 and not p.startswith('#'):
|
315 |
-
enhanced_metadata["description"] = p
|
316 |
-
break
|
317 |
-
|
318 |
-
# Extract limitations if present
|
319 |
-
if "limitations" not in enhanced_metadata:
|
320 |
-
if "## Limitations" in card_text:
|
321 |
-
limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
|
322 |
-
if limitations_section:
|
323 |
-
enhanced_metadata["limitations"] = limitations_section
|
324 |
-
|
325 |
-
# Extract ethical considerations if present
|
326 |
-
if "ethical_considerations" not in enhanced_metadata:
|
327 |
-
for heading in ["## Ethical Considerations", "## Ethics", "## Bias"]:
|
328 |
-
if heading in card_text:
|
329 |
-
section = card_text.split(heading)[1].split("##")[0].strip()
|
330 |
-
if section:
|
331 |
-
enhanced_metadata["ethical_considerations"] = section
|
332 |
-
break
|
333 |
-
|
334 |
-
# Extract risks if present
|
335 |
-
if "risks" not in enhanced_metadata:
|
336 |
-
if "## Risks" in card_text:
|
337 |
-
risks_section = card_text.split("## Risks")[1].split("##")[0].strip()
|
338 |
-
if risks_section:
|
339 |
-
enhanced_metadata["risks"] = risks_section
|
340 |
-
|
341 |
-
# Extract datasets if present
|
342 |
-
if "datasets" not in enhanced_metadata:
|
343 |
-
datasets = []
|
344 |
-
if "## Dataset" in card_text or "## Datasets" in card_text:
|
345 |
-
dataset_section = ""
|
346 |
-
if "## Dataset" in card_text:
|
347 |
-
dataset_section = card_text.split("## Dataset")[1].split("##")[0].strip()
|
348 |
-
elif "## Datasets" in card_text:
|
349 |
-
dataset_section = card_text.split("## Datasets")[1].split("##")[0].strip()
|
350 |
-
|
351 |
-
if dataset_section:
|
352 |
-
# Simple parsing to extract dataset names
|
353 |
-
lines = dataset_section.split("\n")
|
354 |
-
for line in lines:
|
355 |
-
if line.strip() and not line.startswith("#"):
|
356 |
-
datasets.append({
|
357 |
-
"type": "dataset",
|
358 |
-
"name": line.strip().split()[0] if line.strip().split() else "Unknown",
|
359 |
-
"description": line.strip()
|
360 |
-
})
|
361 |
-
|
362 |
-
if datasets:
|
363 |
-
enhanced_metadata["datasets"] = datasets
|
364 |
-
except Exception as e:
|
365 |
-
print(f"Error extracting unstructured metadata: {e}")
|
366 |
-
|
367 |
-
return enhanced_metadata
|
368 |
|
369 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
370 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
@@ -419,10 +353,25 @@ class AIBOMGenerator:
|
|
419 |
# Add copyright
|
420 |
component["copyright"] = "NOASSERTION"
|
421 |
|
422 |
-
# Create properties array for additional metadata
|
423 |
properties = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
424 |
for key, value in metadata.items():
|
425 |
-
if key not in
|
426 |
if isinstance(value, (list, dict)):
|
427 |
if not isinstance(value, str):
|
428 |
value = json.dumps(value)
|
@@ -432,12 +381,10 @@ class AIBOMGenerator:
|
|
432 |
metadata_section = {
|
433 |
"timestamp": timestamp,
|
434 |
"tools": tools,
|
435 |
-
"component": component
|
|
|
436 |
}
|
437 |
|
438 |
-
if properties:
|
439 |
-
metadata_section["properties"] = properties
|
440 |
-
|
441 |
return metadata_section
|
442 |
|
443 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
@@ -465,18 +412,29 @@ class AIBOMGenerator:
|
|
465 |
"purl": purl
|
466 |
}
|
467 |
|
468 |
-
#
|
469 |
-
if "license" in metadata:
|
470 |
component["licenses"] = [{
|
471 |
"license": {
|
472 |
"id": metadata["license"],
|
473 |
"url": self._get_license_url(metadata["license"])
|
474 |
}
|
475 |
}]
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
480 |
|
481 |
# Add external references
|
482 |
external_refs = [{
|
@@ -490,17 +448,18 @@ class AIBOMGenerator:
|
|
490 |
})
|
491 |
component["externalReferences"] = external_refs
|
492 |
|
493 |
-
#
|
494 |
-
|
495 |
-
|
496 |
-
component["
|
|
|
497 |
component["supplier"] = {
|
498 |
-
"name":
|
499 |
-
"url": [f"https://huggingface.co/{
|
500 |
}
|
501 |
component["manufacturer"] = {
|
502 |
-
"name":
|
503 |
-
"url": [f"https://huggingface.co/{
|
504 |
}
|
505 |
|
506 |
# Add copyright
|
@@ -593,19 +552,30 @@ class AIBOMGenerator:
|
|
593 |
def _get_license_url(self, license_id: str) -> str:
|
594 |
"""Get the URL for a license based on its SPDX ID."""
|
595 |
license_urls = {
|
596 |
-
"
|
597 |
-
"
|
598 |
-
"
|
599 |
-
"
|
600 |
-
"
|
601 |
-
"
|
602 |
-
"
|
603 |
-
"
|
604 |
-
"
|
605 |
-
"
|
606 |
-
"
|
607 |
-
"
|
608 |
}
|
609 |
|
610 |
-
return license_urls.get(license_id, "https://spdx.org/licenses/")
|
611 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
# Calculate final score with industry-neutral approach if enabled
|
82 |
final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
if output_file:
|
86 |
with open(output_file, 'w') as f:
|
|
|
208 |
]
|
209 |
}
|
210 |
|
211 |
+
# ALWAYS add root-level external references
|
212 |
+
aibom["externalReferences"] = [{
|
213 |
+
"type": "distribution",
|
214 |
+
"url": f"https://huggingface.co/{model_id}"
|
215 |
+
}]
|
216 |
+
|
217 |
if metadata and "commit_url" in metadata:
|
|
|
|
|
|
|
|
|
218 |
aibom["externalReferences"].append({
|
219 |
+
"type": "vcs",
|
220 |
+
"url": metadata["commit_url"]
|
221 |
+
} )
|
222 |
|
223 |
return aibom
|
224 |
|
|
|
229 |
model_card: Optional[ModelCard],
|
230 |
) -> Dict[str, Any]:
|
231 |
metadata = {}
|
232 |
+
|
233 |
if model_info:
|
234 |
try:
|
235 |
+
author = getattr(model_info, "author", None)
|
236 |
+
if not author or author.strip() == "":
|
237 |
+
parts = model_id.split("/")
|
238 |
+
author = parts[0] if len(parts) > 1 else "unknown"
|
239 |
+
print(f"DEBUG: Fallback author used: {author}")
|
240 |
+
else:
|
241 |
+
print(f"DEBUG: Author from model_info: {author}")
|
242 |
+
|
243 |
metadata.update({
|
244 |
+
"name": getattr(model_info, "modelId", model_id).split("/")[-1],
|
245 |
+
"author": author,
|
246 |
+
"tags": getattr(model_info, "tags", []),
|
247 |
+
"pipeline_tag": getattr(model_info, "pipeline_tag", None),
|
248 |
+
"downloads": getattr(model_info, "downloads", 0),
|
249 |
+
"last_modified": getattr(model_info, "lastModified", None),
|
250 |
+
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
251 |
+
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None) else None,
|
252 |
})
|
253 |
except Exception as e:
|
254 |
print(f"Error extracting model info metadata: {e}")
|
255 |
+
|
256 |
if model_card and hasattr(model_card, "data") and model_card.data:
|
257 |
try:
|
258 |
card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
|
|
|
270 |
metadata["eval_results"] = model_card.data.eval_results
|
271 |
except Exception as e:
|
272 |
print(f"Error extracting model card metadata: {e}")
|
273 |
+
|
274 |
metadata["ai:type"] = "Transformer"
|
275 |
metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
|
276 |
metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
|
277 |
+
|
278 |
+
metadata["primaryPurpose"] = metadata.get("ai:task", "text-generation")
|
279 |
+
|
280 |
+
# Use model owner as fallback for suppliedBy if no author
|
281 |
+
if not metadata.get("author"):
|
282 |
+
parts = model_id.split("/")
|
283 |
+
metadata["author"] = parts[0] if len(parts) > 1 else "unknown"
|
284 |
+
|
285 |
+
metadata["suppliedBy"] = metadata.get("author", "unknown")
|
286 |
metadata["typeOfModel"] = metadata.get("ai:type", "Transformer")
|
287 |
+
|
288 |
+
print(f"DEBUG: Final metadata['author'] = {metadata.get('author')}")
|
289 |
+
print(f"DEBUG: Adding primaryPurpose = {metadata.get('ai:task', 'Text Generation')}")
|
290 |
+
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
291 |
+
|
292 |
return {k: v for k, v in metadata.items() if v is not None}
|
293 |
+
|
294 |
|
295 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
    """
    Stub reserved for AI-assisted metadata enhancement.

    No enhancement is implemented yet: both arguments are ignored and an
    empty dictionary is returned on every call.
    """
    enhanced_metadata: Dict[str, Any] = {}
    return enhanced_metadata
|
301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
|
303 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
304 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
|
|
353 |
# Add copyright
|
354 |
component["copyright"] = "NOASSERTION"
|
355 |
|
356 |
+
# Create properties array for additional metadata (ALWAYS include critical fields)
|
357 |
properties = []
|
358 |
+
|
359 |
+
# ALWAYS add critical fields for scoring
|
360 |
+
critical_fields = {
|
361 |
+
"primaryPurpose": metadata.get("primaryPurpose", metadata.get("ai:task", "text-generation")),
|
362 |
+
"suppliedBy": metadata.get("suppliedBy", metadata.get("author", "unknown")),
|
363 |
+
"typeOfModel": metadata.get("ai:type", "transformer")
|
364 |
+
}
|
365 |
+
|
366 |
+
# Add critical fields first
|
367 |
+
for key, value in critical_fields.items():
|
368 |
+
if value and value != "unknown":
|
369 |
+
properties.append({"name": key, "value": str(value)})
|
370 |
+
|
371 |
+
# Add other metadata fields (excluding basic component fields)
|
372 |
+
excluded_fields = ["name", "author", "license", "description", "commit", "primaryPurpose", "suppliedBy", "typeOfModel"]
|
373 |
for key, value in metadata.items():
|
374 |
+
if key not in excluded_fields and value is not None:
|
375 |
if isinstance(value, (list, dict)):
|
376 |
if not isinstance(value, str):
|
377 |
value = json.dumps(value)
|
|
|
381 |
metadata_section = {
|
382 |
"timestamp": timestamp,
|
383 |
"tools": tools,
|
384 |
+
"component": component,
|
385 |
+
"properties": properties # ALWAYS include properties
|
386 |
}
|
387 |
|
|
|
|
|
|
|
388 |
return metadata_section
|
389 |
|
390 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
412 |
"purl": purl
|
413 |
}
|
414 |
|
415 |
+
# ALWAYS add licenses (use default if not available)
|
416 |
+
if metadata and "license" in metadata and metadata["license"]:
|
417 |
component["licenses"] = [{
|
418 |
"license": {
|
419 |
"id": metadata["license"],
|
420 |
"url": self._get_license_url(metadata["license"])
|
421 |
}
|
422 |
}]
|
423 |
+
else:
|
424 |
+
# Add default license structure for consistency
|
425 |
+
component["licenses"] = [{
|
426 |
+
"license": {
|
427 |
+
"id": "unknown",
|
428 |
+
"url": "https://spdx.org/licenses/"
|
429 |
+
}
|
430 |
+
}]
|
431 |
+
# Debug
|
432 |
+
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
433 |
+
if "license" in metadata:
|
434 |
+
print(f"DEBUG: Adding licenses = {metadata['license']}")
|
435 |
+
|
436 |
+
# ALWAYS add description
|
437 |
+
component["description"] = metadata.get("description", f"AI model {model_id}")
|
438 |
|
439 |
# Add external references
|
440 |
external_refs = [{
|
|
|
448 |
})
|
449 |
component["externalReferences"] = external_refs
|
450 |
|
451 |
+
# ALWAYS add author information (use model owner if not available )
|
452 |
+
author_name = metadata.get("author", group if group else "unknown")
|
453 |
+
if author_name and author_name != "unknown":
|
454 |
+
component["authors"] = [{"name": author_name}]
|
455 |
+
component["publisher"] = author_name
|
456 |
component["supplier"] = {
|
457 |
+
"name": author_name,
|
458 |
+
"url": [f"https://huggingface.co/{author_name}"]
|
459 |
}
|
460 |
component["manufacturer"] = {
|
461 |
+
"name": author_name,
|
462 |
+
"url": [f"https://huggingface.co/{author_name}"]
|
463 |
}
|
464 |
|
465 |
# Add copyright
|
|
|
552 |
def _get_license_url(self, license_id: str) -> str:
|
553 |
"""Get the URL for a license based on its SPDX ID."""
|
554 |
license_urls = {
|
555 |
+
"apache-2.0": "https://www.apache.org/licenses/LICENSE-2.0",
|
556 |
+
"mit": "https://opensource.org/licenses/MIT",
|
557 |
+
"bsd-3-clause": "https://opensource.org/licenses/BSD-3-Clause",
|
558 |
+
"gpl-3.0": "https://www.gnu.org/licenses/gpl-3.0.en.html",
|
559 |
+
"cc-by-4.0": "https://creativecommons.org/licenses/by/4.0/",
|
560 |
+
"cc-by-sa-4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
|
561 |
+
"cc-by-nc-4.0": "https://creativecommons.org/licenses/by-nc/4.0/",
|
562 |
+
"cc-by-nd-4.0": "https://creativecommons.org/licenses/by-nd/4.0/",
|
563 |
+
"cc-by-nc-sa-4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
|
564 |
+
"cc-by-nc-nd-4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
|
565 |
+
"lgpl-3.0": "https://www.gnu.org/licenses/lgpl-3.0.en.html",
|
566 |
+
"mpl-2.0": "https://www.mozilla.org/en-US/MPL/2.0/",
|
567 |
}
|
568 |
|
569 |
+
return license_urls.get(license_id.lower(), "https://spdx.org/licenses/" )
|
570 |
|
571 |
+
def _fetch_with_retry(self, fetch_func, *args, max_retries=3, **kwargs):
|
572 |
+
"""Fetch data with retry logic for network failures."""
|
573 |
+
for attempt in range(max_retries):
|
574 |
+
try:
|
575 |
+
return fetch_func(*args, **kwargs)
|
576 |
+
except Exception as e:
|
577 |
+
if attempt == max_retries - 1:
|
578 |
+
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
579 |
+
return None
|
580 |
+
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
581 |
+
return None
|