a1c00l committed on
Commit
b697afb
·
verified ·
1 Parent(s): 56cac31

Update src/aibom_generator/generator.py

Browse files
Files changed (1) hide show
  1. src/aibom_generator/generator.py +181 -72
src/aibom_generator/generator.py CHANGED
@@ -14,21 +14,27 @@ class AIBOMGenerator:
14
  inference_model_url: Optional[str] = None,
15
  use_inference: bool = True,
16
  cache_dir: Optional[str] = None,
 
17
  ):
18
  self.hf_api = HfApi(token=hf_token)
19
  self.inference_model_url = inference_model_url
20
  self.use_inference = use_inference
21
  self.cache_dir = cache_dir
22
  self.enhancement_report = None # Store enhancement report as instance variable
 
23
 
24
  def generate_aibom(
25
  self,
26
  model_id: str,
27
  output_file: Optional[str] = None,
28
  include_inference: Optional[bool] = None,
 
29
  ) -> Dict[str, Any]:
30
  try:
31
  use_inference = include_inference if include_inference is not None else self.use_inference
 
 
 
32
  model_info = self._fetch_model_info(model_id)
33
  model_card = self._fetch_model_card(model_id)
34
 
@@ -38,8 +44,8 @@ class AIBOMGenerator:
38
  # Create initial AIBOM with original metadata
39
  original_aibom = self._create_aibom_structure(model_id, original_metadata)
40
 
41
- # Calculate initial score
42
- original_score = calculate_completeness_score(original_aibom)
43
 
44
  # Final metadata starts with original metadata
45
  final_metadata = original_metadata.copy() if original_metadata else {}
@@ -69,8 +75,8 @@ class AIBOMGenerator:
69
  # Create final AIBOM with potentially enhanced metadata
70
  aibom = self._create_aibom_structure(model_id, final_metadata)
71
 
72
- # Calculate final score
73
- final_score = calculate_completeness_score(aibom)
74
 
75
  # Add score and enhancement info to metadata properties
76
  if "metadata" in aibom and "properties" not in aibom["metadata"]:
@@ -83,6 +89,17 @@ class AIBOMGenerator:
83
  aibom["metadata"]["properties"].append({"name": "aibom:quality-breakdown", "value": json.dumps(final_score["section_scores"])})
84
  aibom["metadata"]["properties"].append({"name": "aibom:max-scores", "value": json.dumps(final_score["max_scores"])})
85
 
 
 
 
 
 
 
 
 
 
 
 
86
  # Add AI enhancement information
87
  if ai_enhanced:
88
  aibom["metadata"]["properties"].append({"name": "aibom:ai-enhanced", "value": "true"})
@@ -181,6 +198,23 @@ class AIBOMGenerator:
181
  ]
182
  }
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  return aibom
185
 
186
  def _extract_structured_metadata(
@@ -227,6 +261,13 @@ class AIBOMGenerator:
227
  metadata["ai:type"] = "Transformer"
228
  metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
229
  metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
 
 
 
 
 
 
 
230
 
231
  return {k: v for k, v in metadata.items() if v is not None}
232
 
@@ -270,6 +311,8 @@ class AIBOMGenerator:
270
  limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
271
  if limitations_section:
272
  enhanced_metadata["limitations"] = limitations_section
 
 
273
 
274
  # Extract ethical considerations if present
275
  if "ethical_considerations" not in enhanced_metadata:
@@ -278,6 +321,8 @@ class AIBOMGenerator:
278
  section = card_text.split(heading)[1].split("##")[0].strip()
279
  if section:
280
  enhanced_metadata["ethical_considerations"] = section
 
 
281
  break
282
 
283
  # Extract risks if present
@@ -286,6 +331,24 @@ class AIBOMGenerator:
286
  risks_section = card_text.split("## Risks")[1].split("##")[0].strip()
287
  if risks_section:
288
  enhanced_metadata["risks"] = risks_section
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  except Exception as e:
290
  print(f"Error extracting unstructured metadata: {e}")
291
 
@@ -345,81 +408,127 @@ class AIBOMGenerator:
345
 
346
  component = {
347
  "type": "machine-learning-model",
348
- "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
349
  "name": metadata.get("name", model_id.split("/")[-1]) if metadata else model_id.split("/")[-1],
 
350
  "purl": purl
351
  }
352
-
353
- if metadata and "description" in metadata:
354
- component["description"] = metadata["description"]
355
-
356
- if metadata and "commit" in metadata:
357
- component["version"] = metadata["commit"]
358
-
359
- if metadata and "license" in metadata:
360
- component["licenses"] = [{"license": {"id": metadata["license"]}}]
361
-
362
- external_refs = [{
363
- "type": "website",
364
- "url": f"https://huggingface.co/{model_id}"
365
- }]
366
- if metadata and "commit_url" in metadata:
367
- external_refs.append({
368
- "type": "vcs",
369
- "url": metadata["commit_url"]
370
- })
371
- component["externalReferences"] = external_refs
372
-
373
- component["modelCard"] = self._create_model_card_section(metadata)
374
-
375
- return component
376
-
377
- def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
378
- model_card_section = {}
379
 
380
- if not metadata:
381
- return model_card_section
 
382
 
383
- try:
384
- # Create a CycloneDX-compliant modelParameters structure
385
- # Instead of adding custom properties directly, use a standardized structure
386
- model_parameters = {
387
- "description": "Model parameters and configuration"
388
- }
 
 
389
 
390
- # Add properties array for custom parameters to ensure schema compliance
391
- parameter_properties = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
- for k in ["base_model", "library_name", "pipeline_tag"]:
394
- if k in metadata and metadata[k] is not None:
395
- try:
396
- if isinstance(metadata[k], (list, dict)):
397
- value = json.dumps(metadata[k])
398
- else:
399
- value = str(metadata[k])
400
- parameter_properties.append({
401
- "name": k,
402
- "value": value
 
 
 
403
  })
404
- except Exception as e:
405
- print(f"Error processing model parameter {k}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
- if parameter_properties:
408
- model_parameters["properties"] = parameter_properties
409
- model_card_section["modelParameters"] = model_parameters
410
-
411
- # Safely extract evaluation results
412
- if "eval_results" in metadata:
413
- model_card_section["quantitativeAnalysis"] = {"performanceMetrics": metadata["eval_results"]}
414
-
415
- # Safely extract considerations
416
- considerations = {}
417
- for k in ["limitations", "ethical_considerations", "bias", "risks"]:
418
- if k in metadata:
419
- considerations[k] = metadata[k]
420
- if considerations:
421
- model_card_section["considerations"] = considerations
422
- except Exception as e:
423
- print(f"Error creating model card section: {e}")
424
 
425
- return model_card_section
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  inference_model_url: Optional[str] = None,
15
  use_inference: bool = True,
16
  cache_dir: Optional[str] = None,
17
+ use_best_practices: bool = True, # Added parameter for industry-neutral scoring
18
  ):
19
  self.hf_api = HfApi(token=hf_token)
20
  self.inference_model_url = inference_model_url
21
  self.use_inference = use_inference
22
  self.cache_dir = cache_dir
23
  self.enhancement_report = None # Store enhancement report as instance variable
24
+ self.use_best_practices = use_best_practices # Store best practices flag
25
 
26
  def generate_aibom(
27
  self,
28
  model_id: str,
29
  output_file: Optional[str] = None,
30
  include_inference: Optional[bool] = None,
31
+ use_best_practices: Optional[bool] = None, # Added parameter for industry-neutral scoring
32
  ) -> Dict[str, Any]:
33
  try:
34
  use_inference = include_inference if include_inference is not None else self.use_inference
35
+ # Use method parameter if provided, otherwise use instance variable
36
+ use_best_practices = use_best_practices if use_best_practices is not None else self.use_best_practices
37
+
38
  model_info = self._fetch_model_info(model_id)
39
  model_card = self._fetch_model_card(model_id)
40
 
 
44
  # Create initial AIBOM with original metadata
45
  original_aibom = self._create_aibom_structure(model_id, original_metadata)
46
 
47
+ # Calculate initial score with industry-neutral approach if enabled
48
+ original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices)
49
 
50
  # Final metadata starts with original metadata
51
  final_metadata = original_metadata.copy() if original_metadata else {}
 
75
  # Create final AIBOM with potentially enhanced metadata
76
  aibom = self._create_aibom_structure(model_id, final_metadata)
77
 
78
+ # Calculate final score with industry-neutral approach if enabled
79
+ final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
80
 
81
  # Add score and enhancement info to metadata properties
82
  if "metadata" in aibom and "properties" not in aibom["metadata"]:
 
89
  aibom["metadata"]["properties"].append({"name": "aibom:quality-breakdown", "value": json.dumps(final_score["section_scores"])})
90
  aibom["metadata"]["properties"].append({"name": "aibom:max-scores", "value": json.dumps(final_score["max_scores"])})
91
 
92
+ # Add completeness profile information if available (from industry-neutral approach)
93
+ if use_best_practices and "completeness_profile" in final_score:
94
+ aibom["metadata"]["properties"].append({
95
+ "name": "aibom:completeness-profile",
96
+ "value": final_score["completeness_profile"]["name"]
97
+ })
98
+ aibom["metadata"]["properties"].append({
99
+ "name": "aibom:completeness-description",
100
+ "value": final_score["completeness_profile"]["description"]
101
+ })
102
+
103
  # Add AI enhancement information
104
  if ai_enhanced:
105
  aibom["metadata"]["properties"].append({"name": "aibom:ai-enhanced", "value": "true"})
 
198
  ]
199
  }
200
 
201
+ # Add critical fields for industry-neutral scoring (silently aligned with SPDX)
202
+ if metadata and "last_modified" in metadata:
203
+ aibom["buildTime"] = metadata["last_modified"]
204
+ # Add releaseTime as well if available, otherwise use last_modified
205
+ aibom["releaseTime"] = metadata.get("release_date", metadata["last_modified"])
206
+
207
+ # Add downloadLocation if available
208
+ if metadata and "commit_url" in metadata:
209
+ # Add external reference for downloadLocation
210
+ if "externalReferences" not in aibom:
211
+ aibom["externalReferences"] = []
212
+
213
+ aibom["externalReferences"].append({
214
+ "type": "distribution",
215
+ "url": f"https://huggingface.co/{model_id}"
216
+ })
217
+
218
  return aibom
219
 
220
  def _extract_structured_metadata(
 
261
  metadata["ai:type"] = "Transformer"
262
  metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
263
  metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
264
+
265
+ # Add fields for industry-neutral scoring (silently aligned with SPDX)
266
+ metadata["primaryPurpose"] = metadata.get("ai:task", "Text Generation")
267
+ metadata["suppliedBy"] = metadata.get("author", "Unknown")
268
+
269
+ # Add typeOfModel field
270
+ metadata["typeOfModel"] = metadata.get("ai:type", "Transformer")
271
 
272
  return {k: v for k, v in metadata.items() if v is not None}
273
 
 
311
  limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
312
  if limitations_section:
313
  enhanced_metadata["limitations"] = limitations_section
314
+ # Map to industry-neutral field (silently aligned with SPDX)
315
+ enhanced_metadata["limitation"] = limitations_section
316
 
317
  # Extract ethical considerations if present
318
  if "ethical_considerations" not in enhanced_metadata:
 
321
  section = card_text.split(heading)[1].split("##")[0].strip()
322
  if section:
323
  enhanced_metadata["ethical_considerations"] = section
324
+ # Map to industry-neutral field (silently aligned with SPDX)
325
+ enhanced_metadata["safetyRiskAssessment"] = section
326
  break
327
 
328
  # Extract risks if present
 
331
  risks_section = card_text.split("## Risks")[1].split("##")[0].strip()
332
  if risks_section:
333
  enhanced_metadata["risks"] = risks_section
334
+
335
+ # Extract energy consumption if present (for industry-neutral scoring)
336
+ if "energy" not in enhanced_metadata:
337
+ for heading in ["## Energy", "## Energy Consumption", "## Environmental Impact"]:
338
+ if heading in card_text:
339
+ section = card_text.split(heading)[1].split("##")[0].strip()
340
+ if section:
341
+ enhanced_metadata["energyConsumption"] = section
342
+ break
343
+
344
+ # Extract hyperparameters if present (for industry-neutral scoring)
345
+ if "hyperparameters" not in enhanced_metadata:
346
+ for heading in ["## Hyperparameters", "## Training Hyperparameters", "## Model Hyperparameters"]:
347
+ if heading in card_text:
348
+ section = card_text.split(heading)[1].split("##")[0].strip()
349
+ if section:
350
+ enhanced_metadata["hyperparameter"] = section
351
+ break
352
  except Exception as e:
353
  print(f"Error extracting unstructured metadata: {e}")
354
 
 
408
 
409
  component = {
410
  "type": "machine-learning-model",
 
411
  "name": metadata.get("name", model_id.split("/")[-1]) if metadata else model_id.split("/")[-1],
412
+ "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
413
  "purl": purl
414
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
+ # Add description if available
417
+ if metadata and "description" in metadata and metadata["description"]:
418
+ component["description"] = metadata["description"]
419
 
420
+ # Add license if available
421
+ if metadata and "license" in metadata and metadata["license"]:
422
+ license_id = metadata["license"]
423
+ component["licenses"] = [{
424
+ "license": {
425
+ "id": license_id
426
+ }
427
+ }]
428
 
429
+ # Add model card if available
430
+ model_card = {}
431
+
432
+ # Add model parameters
433
+ model_parameters = {}
434
+ if metadata:
435
+ for key in ["ai:type", "ai:task", "ai:framework", "base_model", "library_name"]:
436
+ if key in metadata and metadata[key]:
437
+ if "properties" not in model_parameters:
438
+ model_parameters["properties"] = []
439
+ model_parameters["properties"].append({
440
+ "name": key,
441
+ "value": metadata[key]
442
+ })
443
+
444
+ # Add datasets if available
445
+ if "datasets" in metadata and metadata["datasets"]:
446
+ model_parameters["datasets"] = []
447
+ try:
448
+ if isinstance(metadata["datasets"], list):
449
+ for dataset in metadata["datasets"]:
450
+ model_parameters["datasets"].append({
451
+ "name": dataset
452
+ })
453
+ elif isinstance(metadata["datasets"], str):
454
+ model_parameters["datasets"].append({
455
+ "name": metadata["datasets"]
456
+ })
457
+ except Exception as e:
458
+ print(f"Error processing datasets: {e}")
459
+
460
+ if model_parameters:
461
+ model_card["modelParameters"] = model_parameters
462
 
463
+ # Add quantitative analysis if available
464
+ if metadata and "eval_results" in metadata and metadata["eval_results"]:
465
+ try:
466
+ quantitative_analysis = {
467
+ "performanceMetrics": []
468
+ }
469
+
470
+ eval_results = metadata["eval_results"]
471
+ if isinstance(eval_results, dict):
472
+ for metric, value in eval_results.items():
473
+ quantitative_analysis["performanceMetrics"].append({
474
+ "type": metric,
475
+ "value": str(value)
476
  })
477
+ elif isinstance(eval_results, list):
478
+ for result in eval_results:
479
+ if isinstance(result, dict) and "metric" in result and "value" in result:
480
+ quantitative_analysis["performanceMetrics"].append({
481
+ "type": result["metric"],
482
+ "value": str(result["value"])
483
+ })
484
+
485
+ if quantitative_analysis["performanceMetrics"]:
486
+ model_card["quantitativeAnalysis"] = quantitative_analysis
487
+ except Exception as e:
488
+ print(f"Error processing evaluation results: {e}")
489
+
490
+ # Add considerations if available
491
+ considerations = {}
492
+ if metadata:
493
+ # Technical limitations
494
+ if "limitations" in metadata and metadata["limitations"]:
495
+ considerations["technicalLimitations"] = metadata["limitations"]
496
+
497
+ # Ethical considerations
498
+ if "ethical_considerations" in metadata and metadata["ethical_considerations"]:
499
+ considerations["ethicalConsiderations"] = metadata["ethical_considerations"]
500
+
501
+ # Risks
502
+ if "risks" in metadata and metadata["risks"]:
503
+ considerations["risks"] = metadata["risks"]
504
+
505
+ # Environmental considerations (for industry-neutral scoring)
506
+ if "energyConsumption" in metadata and metadata["energyConsumption"]:
507
+ considerations["environmentalConsiderations"] = metadata["energyConsumption"]
508
+
509
+ if considerations:
510
+ model_card["considerations"] = considerations
511
 
512
+ if model_card:
513
+ component["modelCard"] = model_card
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
+ # Add external references if available
516
+ external_references = []
517
+
518
+ # Add model card URL
519
+ external_references.append({
520
+ "type": "documentation",
521
+ "url": f"https://huggingface.co/{model_id}"
522
+ })
523
+
524
+ # Add commit URL if available
525
+ if metadata and "commit_url" in metadata and metadata["commit_url"]:
526
+ external_references.append({
527
+ "type": "vcs",
528
+ "url": metadata["commit_url"]
529
+ })
530
+
531
+ if external_references:
532
+ component["externalReferences"] = external_references
533
+
534
+ return component