Spaces:
Running
Running
Update src/aibom_generator/generator.py
Browse files- src/aibom_generator/generator.py +181 -72
src/aibom_generator/generator.py
CHANGED
@@ -14,21 +14,27 @@ class AIBOMGenerator:
|
|
14 |
inference_model_url: Optional[str] = None,
|
15 |
use_inference: bool = True,
|
16 |
cache_dir: Optional[str] = None,
|
|
|
17 |
):
|
18 |
self.hf_api = HfApi(token=hf_token)
|
19 |
self.inference_model_url = inference_model_url
|
20 |
self.use_inference = use_inference
|
21 |
self.cache_dir = cache_dir
|
22 |
self.enhancement_report = None # Store enhancement report as instance variable
|
|
|
23 |
|
24 |
def generate_aibom(
|
25 |
self,
|
26 |
model_id: str,
|
27 |
output_file: Optional[str] = None,
|
28 |
include_inference: Optional[bool] = None,
|
|
|
29 |
) -> Dict[str, Any]:
|
30 |
try:
|
31 |
use_inference = include_inference if include_inference is not None else self.use_inference
|
|
|
|
|
|
|
32 |
model_info = self._fetch_model_info(model_id)
|
33 |
model_card = self._fetch_model_card(model_id)
|
34 |
|
@@ -38,8 +44,8 @@ class AIBOMGenerator:
|
|
38 |
# Create initial AIBOM with original metadata
|
39 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
40 |
|
41 |
-
# Calculate initial score
|
42 |
-
original_score = calculate_completeness_score(original_aibom)
|
43 |
|
44 |
# Final metadata starts with original metadata
|
45 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
@@ -69,8 +75,8 @@ class AIBOMGenerator:
|
|
69 |
# Create final AIBOM with potentially enhanced metadata
|
70 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
71 |
|
72 |
-
# Calculate final score
|
73 |
-
final_score = calculate_completeness_score(aibom)
|
74 |
|
75 |
# Add score and enhancement info to metadata properties
|
76 |
if "metadata" in aibom and "properties" not in aibom["metadata"]:
|
@@ -83,6 +89,17 @@ class AIBOMGenerator:
|
|
83 |
aibom["metadata"]["properties"].append({"name": "aibom:quality-breakdown", "value": json.dumps(final_score["section_scores"])})
|
84 |
aibom["metadata"]["properties"].append({"name": "aibom:max-scores", "value": json.dumps(final_score["max_scores"])})
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
# Add AI enhancement information
|
87 |
if ai_enhanced:
|
88 |
aibom["metadata"]["properties"].append({"name": "aibom:ai-enhanced", "value": "true"})
|
@@ -181,6 +198,23 @@ class AIBOMGenerator:
|
|
181 |
]
|
182 |
}
|
183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
return aibom
|
185 |
|
186 |
def _extract_structured_metadata(
|
@@ -227,6 +261,13 @@ class AIBOMGenerator:
|
|
227 |
metadata["ai:type"] = "Transformer"
|
228 |
metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
|
229 |
metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
return {k: v for k, v in metadata.items() if v is not None}
|
232 |
|
@@ -270,6 +311,8 @@ class AIBOMGenerator:
|
|
270 |
limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
|
271 |
if limitations_section:
|
272 |
enhanced_metadata["limitations"] = limitations_section
|
|
|
|
|
273 |
|
274 |
# Extract ethical considerations if present
|
275 |
if "ethical_considerations" not in enhanced_metadata:
|
@@ -278,6 +321,8 @@ class AIBOMGenerator:
|
|
278 |
section = card_text.split(heading)[1].split("##")[0].strip()
|
279 |
if section:
|
280 |
enhanced_metadata["ethical_considerations"] = section
|
|
|
|
|
281 |
break
|
282 |
|
283 |
# Extract risks if present
|
@@ -286,6 +331,24 @@ class AIBOMGenerator:
|
|
286 |
risks_section = card_text.split("## Risks")[1].split("##")[0].strip()
|
287 |
if risks_section:
|
288 |
enhanced_metadata["risks"] = risks_section
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
except Exception as e:
|
290 |
print(f"Error extracting unstructured metadata: {e}")
|
291 |
|
@@ -345,81 +408,127 @@ class AIBOMGenerator:
|
|
345 |
|
346 |
component = {
|
347 |
"type": "machine-learning-model",
|
348 |
-
"bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
|
349 |
"name": metadata.get("name", model_id.split("/")[-1]) if metadata else model_id.split("/")[-1],
|
|
|
350 |
"purl": purl
|
351 |
}
|
352 |
-
|
353 |
-
if metadata and "description" in metadata:
|
354 |
-
component["description"] = metadata["description"]
|
355 |
-
|
356 |
-
if metadata and "commit" in metadata:
|
357 |
-
component["version"] = metadata["commit"]
|
358 |
-
|
359 |
-
if metadata and "license" in metadata:
|
360 |
-
component["licenses"] = [{"license": {"id": metadata["license"]}}]
|
361 |
-
|
362 |
-
external_refs = [{
|
363 |
-
"type": "website",
|
364 |
-
"url": f"https://huggingface.co/{model_id}"
|
365 |
-
}]
|
366 |
-
if metadata and "commit_url" in metadata:
|
367 |
-
external_refs.append({
|
368 |
-
"type": "vcs",
|
369 |
-
"url": metadata["commit_url"]
|
370 |
-
})
|
371 |
-
component["externalReferences"] = external_refs
|
372 |
-
|
373 |
-
component["modelCard"] = self._create_model_card_section(metadata)
|
374 |
-
|
375 |
-
return component
|
376 |
-
|
377 |
-
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
378 |
-
model_card_section = {}
|
379 |
|
380 |
-
if
|
381 |
-
|
|
|
382 |
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
"
|
388 |
-
|
|
|
|
|
389 |
|
390 |
-
|
391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
|
|
|
|
|
|
403 |
})
|
404 |
-
|
405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
|
407 |
-
|
408 |
-
|
409 |
-
model_card_section["modelParameters"] = model_parameters
|
410 |
-
|
411 |
-
# Safely extract evaluation results
|
412 |
-
if "eval_results" in metadata:
|
413 |
-
model_card_section["quantitativeAnalysis"] = {"performanceMetrics": metadata["eval_results"]}
|
414 |
-
|
415 |
-
# Safely extract considerations
|
416 |
-
considerations = {}
|
417 |
-
for k in ["limitations", "ethical_considerations", "bias", "risks"]:
|
418 |
-
if k in metadata:
|
419 |
-
considerations[k] = metadata[k]
|
420 |
-
if considerations:
|
421 |
-
model_card_section["considerations"] = considerations
|
422 |
-
except Exception as e:
|
423 |
-
print(f"Error creating model card section: {e}")
|
424 |
|
425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
inference_model_url: Optional[str] = None,
|
15 |
use_inference: bool = True,
|
16 |
cache_dir: Optional[str] = None,
|
17 |
+
use_best_practices: bool = True, # Added parameter for industry-neutral scoring
|
18 |
):
|
19 |
self.hf_api = HfApi(token=hf_token)
|
20 |
self.inference_model_url = inference_model_url
|
21 |
self.use_inference = use_inference
|
22 |
self.cache_dir = cache_dir
|
23 |
self.enhancement_report = None # Store enhancement report as instance variable
|
24 |
+
self.use_best_practices = use_best_practices # Store best practices flag
|
25 |
|
26 |
def generate_aibom(
|
27 |
self,
|
28 |
model_id: str,
|
29 |
output_file: Optional[str] = None,
|
30 |
include_inference: Optional[bool] = None,
|
31 |
+
use_best_practices: Optional[bool] = None, # Added parameter for industry-neutral scoring
|
32 |
) -> Dict[str, Any]:
|
33 |
try:
|
34 |
use_inference = include_inference if include_inference is not None else self.use_inference
|
35 |
+
# Use method parameter if provided, otherwise use instance variable
|
36 |
+
use_best_practices = use_best_practices if use_best_practices is not None else self.use_best_practices
|
37 |
+
|
38 |
model_info = self._fetch_model_info(model_id)
|
39 |
model_card = self._fetch_model_card(model_id)
|
40 |
|
|
|
44 |
# Create initial AIBOM with original metadata
|
45 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
46 |
|
47 |
+
# Calculate initial score with industry-neutral approach if enabled
|
48 |
+
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices)
|
49 |
|
50 |
# Final metadata starts with original metadata
|
51 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
|
|
75 |
# Create final AIBOM with potentially enhanced metadata
|
76 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
77 |
|
78 |
+
# Calculate final score with industry-neutral approach if enabled
|
79 |
+
final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
|
80 |
|
81 |
# Add score and enhancement info to metadata properties
|
82 |
if "metadata" in aibom and "properties" not in aibom["metadata"]:
|
|
|
89 |
aibom["metadata"]["properties"].append({"name": "aibom:quality-breakdown", "value": json.dumps(final_score["section_scores"])})
|
90 |
aibom["metadata"]["properties"].append({"name": "aibom:max-scores", "value": json.dumps(final_score["max_scores"])})
|
91 |
|
92 |
+
# Add completeness profile information if available (from industry-neutral approach)
|
93 |
+
if use_best_practices and "completeness_profile" in final_score:
|
94 |
+
aibom["metadata"]["properties"].append({
|
95 |
+
"name": "aibom:completeness-profile",
|
96 |
+
"value": final_score["completeness_profile"]["name"]
|
97 |
+
})
|
98 |
+
aibom["metadata"]["properties"].append({
|
99 |
+
"name": "aibom:completeness-description",
|
100 |
+
"value": final_score["completeness_profile"]["description"]
|
101 |
+
})
|
102 |
+
|
103 |
# Add AI enhancement information
|
104 |
if ai_enhanced:
|
105 |
aibom["metadata"]["properties"].append({"name": "aibom:ai-enhanced", "value": "true"})
|
|
|
198 |
]
|
199 |
}
|
200 |
|
201 |
+
# Add critical fields for industry-neutral scoring (silently aligned with SPDX)
|
202 |
+
if metadata and "last_modified" in metadata:
|
203 |
+
aibom["buildTime"] = metadata["last_modified"]
|
204 |
+
# Add releaseTime as well if available, otherwise use last_modified
|
205 |
+
aibom["releaseTime"] = metadata.get("release_date", metadata["last_modified"])
|
206 |
+
|
207 |
+
# Add downloadLocation if available
|
208 |
+
if metadata and "commit_url" in metadata:
|
209 |
+
# Add external reference for downloadLocation
|
210 |
+
if "externalReferences" not in aibom:
|
211 |
+
aibom["externalReferences"] = []
|
212 |
+
|
213 |
+
aibom["externalReferences"].append({
|
214 |
+
"type": "distribution",
|
215 |
+
"url": f"https://huggingface.co/{model_id}"
|
216 |
+
})
|
217 |
+
|
218 |
return aibom
|
219 |
|
220 |
def _extract_structured_metadata(
|
|
|
261 |
metadata["ai:type"] = "Transformer"
|
262 |
metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
|
263 |
metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
|
264 |
+
|
265 |
+
# Add fields for industry-neutral scoring (silently aligned with SPDX)
|
266 |
+
metadata["primaryPurpose"] = metadata.get("ai:task", "Text Generation")
|
267 |
+
metadata["suppliedBy"] = metadata.get("author", "Unknown")
|
268 |
+
|
269 |
+
# Add typeOfModel field
|
270 |
+
metadata["typeOfModel"] = metadata.get("ai:type", "Transformer")
|
271 |
|
272 |
return {k: v for k, v in metadata.items() if v is not None}
|
273 |
|
|
|
311 |
limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
|
312 |
if limitations_section:
|
313 |
enhanced_metadata["limitations"] = limitations_section
|
314 |
+
# Map to industry-neutral field (silently aligned with SPDX)
|
315 |
+
enhanced_metadata["limitation"] = limitations_section
|
316 |
|
317 |
# Extract ethical considerations if present
|
318 |
if "ethical_considerations" not in enhanced_metadata:
|
|
|
321 |
section = card_text.split(heading)[1].split("##")[0].strip()
|
322 |
if section:
|
323 |
enhanced_metadata["ethical_considerations"] = section
|
324 |
+
# Map to industry-neutral field (silently aligned with SPDX)
|
325 |
+
enhanced_metadata["safetyRiskAssessment"] = section
|
326 |
break
|
327 |
|
328 |
# Extract risks if present
|
|
|
331 |
risks_section = card_text.split("## Risks")[1].split("##")[0].strip()
|
332 |
if risks_section:
|
333 |
enhanced_metadata["risks"] = risks_section
|
334 |
+
|
335 |
+
# Extract energy consumption if present (for industry-neutral scoring)
|
336 |
+
if "energy" not in enhanced_metadata:
|
337 |
+
for heading in ["## Energy", "## Energy Consumption", "## Environmental Impact"]:
|
338 |
+
if heading in card_text:
|
339 |
+
section = card_text.split(heading)[1].split("##")[0].strip()
|
340 |
+
if section:
|
341 |
+
enhanced_metadata["energyConsumption"] = section
|
342 |
+
break
|
343 |
+
|
344 |
+
# Extract hyperparameters if present (for industry-neutral scoring)
|
345 |
+
if "hyperparameters" not in enhanced_metadata:
|
346 |
+
for heading in ["## Hyperparameters", "## Training Hyperparameters", "## Model Hyperparameters"]:
|
347 |
+
if heading in card_text:
|
348 |
+
section = card_text.split(heading)[1].split("##")[0].strip()
|
349 |
+
if section:
|
350 |
+
enhanced_metadata["hyperparameter"] = section
|
351 |
+
break
|
352 |
except Exception as e:
|
353 |
print(f"Error extracting unstructured metadata: {e}")
|
354 |
|
|
|
408 |
|
409 |
component = {
|
410 |
"type": "machine-learning-model",
|
|
|
411 |
"name": metadata.get("name", model_id.split("/")[-1]) if metadata else model_id.split("/")[-1],
|
412 |
+
"bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
|
413 |
"purl": purl
|
414 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
415 |
|
416 |
+
# Add description if available
|
417 |
+
if metadata and "description" in metadata and metadata["description"]:
|
418 |
+
component["description"] = metadata["description"]
|
419 |
|
420 |
+
# Add license if available
|
421 |
+
if metadata and "license" in metadata and metadata["license"]:
|
422 |
+
license_id = metadata["license"]
|
423 |
+
component["licenses"] = [{
|
424 |
+
"license": {
|
425 |
+
"id": license_id
|
426 |
+
}
|
427 |
+
}]
|
428 |
|
429 |
+
# Add model card if available
|
430 |
+
model_card = {}
|
431 |
+
|
432 |
+
# Add model parameters
|
433 |
+
model_parameters = {}
|
434 |
+
if metadata:
|
435 |
+
for key in ["ai:type", "ai:task", "ai:framework", "base_model", "library_name"]:
|
436 |
+
if key in metadata and metadata[key]:
|
437 |
+
if "properties" not in model_parameters:
|
438 |
+
model_parameters["properties"] = []
|
439 |
+
model_parameters["properties"].append({
|
440 |
+
"name": key,
|
441 |
+
"value": metadata[key]
|
442 |
+
})
|
443 |
+
|
444 |
+
# Add datasets if available
|
445 |
+
if "datasets" in metadata and metadata["datasets"]:
|
446 |
+
model_parameters["datasets"] = []
|
447 |
+
try:
|
448 |
+
if isinstance(metadata["datasets"], list):
|
449 |
+
for dataset in metadata["datasets"]:
|
450 |
+
model_parameters["datasets"].append({
|
451 |
+
"name": dataset
|
452 |
+
})
|
453 |
+
elif isinstance(metadata["datasets"], str):
|
454 |
+
model_parameters["datasets"].append({
|
455 |
+
"name": metadata["datasets"]
|
456 |
+
})
|
457 |
+
except Exception as e:
|
458 |
+
print(f"Error processing datasets: {e}")
|
459 |
+
|
460 |
+
if model_parameters:
|
461 |
+
model_card["modelParameters"] = model_parameters
|
462 |
|
463 |
+
# Add quantitative analysis if available
|
464 |
+
if metadata and "eval_results" in metadata and metadata["eval_results"]:
|
465 |
+
try:
|
466 |
+
quantitative_analysis = {
|
467 |
+
"performanceMetrics": []
|
468 |
+
}
|
469 |
+
|
470 |
+
eval_results = metadata["eval_results"]
|
471 |
+
if isinstance(eval_results, dict):
|
472 |
+
for metric, value in eval_results.items():
|
473 |
+
quantitative_analysis["performanceMetrics"].append({
|
474 |
+
"type": metric,
|
475 |
+
"value": str(value)
|
476 |
})
|
477 |
+
elif isinstance(eval_results, list):
|
478 |
+
for result in eval_results:
|
479 |
+
if isinstance(result, dict) and "metric" in result and "value" in result:
|
480 |
+
quantitative_analysis["performanceMetrics"].append({
|
481 |
+
"type": result["metric"],
|
482 |
+
"value": str(result["value"])
|
483 |
+
})
|
484 |
+
|
485 |
+
if quantitative_analysis["performanceMetrics"]:
|
486 |
+
model_card["quantitativeAnalysis"] = quantitative_analysis
|
487 |
+
except Exception as e:
|
488 |
+
print(f"Error processing evaluation results: {e}")
|
489 |
+
|
490 |
+
# Add considerations if available
|
491 |
+
considerations = {}
|
492 |
+
if metadata:
|
493 |
+
# Technical limitations
|
494 |
+
if "limitations" in metadata and metadata["limitations"]:
|
495 |
+
considerations["technicalLimitations"] = metadata["limitations"]
|
496 |
+
|
497 |
+
# Ethical considerations
|
498 |
+
if "ethical_considerations" in metadata and metadata["ethical_considerations"]:
|
499 |
+
considerations["ethicalConsiderations"] = metadata["ethical_considerations"]
|
500 |
+
|
501 |
+
# Risks
|
502 |
+
if "risks" in metadata and metadata["risks"]:
|
503 |
+
considerations["risks"] = metadata["risks"]
|
504 |
+
|
505 |
+
# Environmental considerations (for industry-neutral scoring)
|
506 |
+
if "energyConsumption" in metadata and metadata["energyConsumption"]:
|
507 |
+
considerations["environmentalConsiderations"] = metadata["energyConsumption"]
|
508 |
+
|
509 |
+
if considerations:
|
510 |
+
model_card["considerations"] = considerations
|
511 |
|
512 |
+
if model_card:
|
513 |
+
component["modelCard"] = model_card
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
514 |
|
515 |
+
# Add external references if available
|
516 |
+
external_references = []
|
517 |
+
|
518 |
+
# Add model card URL
|
519 |
+
external_references.append({
|
520 |
+
"type": "documentation",
|
521 |
+
"url": f"https://huggingface.co/{model_id}"
|
522 |
+
})
|
523 |
+
|
524 |
+
# Add commit URL if available
|
525 |
+
if metadata and "commit_url" in metadata and metadata["commit_url"]:
|
526 |
+
external_references.append({
|
527 |
+
"type": "vcs",
|
528 |
+
"url": metadata["commit_url"]
|
529 |
+
})
|
530 |
+
|
531 |
+
if external_references:
|
532 |
+
component["externalReferences"] = external_references
|
533 |
+
|
534 |
+
return component
|