a1c00l committed on
Commit
2d83b8c
·
verified ·
1 Parent(s): ea2f717

Update src/aibom_generator/generator.py

Browse files
Files changed (1) hide show
  1. src/aibom_generator/generator.py +77 -238
src/aibom_generator/generator.py CHANGED
@@ -1,23 +1,12 @@
1
- """
2
- Core functionality for generating CycloneDX AIBOMs from Hugging Face models.
3
- """
4
-
5
  import json
6
  import uuid
7
  import datetime
8
- from typing import Dict, List, Optional, Union, Any
9
 
10
- from huggingface_hub import HfApi, ModelCard, ModelCardData
11
 
12
 
13
  class AIBOMGenerator:
14
- """
15
- Generator for AI Bills of Materials (AIBOMs) in CycloneDX format.
16
-
17
- This class provides functionality to generate CycloneDX 1.6 compliant
18
- AIBOMs for machine learning models hosted on the Hugging Face Hub.
19
- """
20
-
21
  def __init__(
22
  self,
23
  hf_token: Optional[str] = None,
@@ -25,93 +14,42 @@ class AIBOMGenerator:
25
  use_inference: bool = True,
26
  cache_dir: Optional[str] = None,
27
  ):
28
- """
29
- Initialize the AIBOM Generator.
30
-
31
- Args:
32
- hf_token: Hugging Face API token for accessing private models
33
- inference_model_url: URL of the inference model service for extracting
34
- metadata from unstructured text
35
- use_inference: Whether to use the inference model for metadata extraction
36
- cache_dir: Directory to cache API responses and model cards
37
- """
38
  self.hf_api = HfApi(token=hf_token)
39
  self.inference_model_url = inference_model_url
40
  self.use_inference = use_inference
41
  self.cache_dir = cache_dir
42
-
43
  def generate_aibom(
44
  self,
45
  model_id: str,
46
  output_file: Optional[str] = None,
47
  include_inference: Optional[bool] = None,
48
  ) -> Dict[str, Any]:
49
- """
50
- Generate a CycloneDX AIBOM for the specified Hugging Face model.
51
-
52
- Args:
53
- model_id: The Hugging Face model ID (e.g., "google/bert-base-uncased")
54
- output_file: Optional path to save the generated AIBOM
55
- include_inference: Override the default inference model usage setting
56
-
57
- Returns:
58
- The generated AIBOM as a dictionary
59
- """
60
- # Determine whether to use inference
61
  use_inference = include_inference if include_inference is not None else self.use_inference
62
-
63
- # Fetch model information
64
  model_info = self._fetch_model_info(model_id)
65
  model_card = self._fetch_model_card(model_id)
66
-
67
- # Generate the AIBOM
68
  aibom = self._create_aibom_structure(model_id, model_info, model_card, use_inference)
69
-
70
- # Save to file if requested
71
  if output_file:
72
  with open(output_file, 'w') as f:
73
  json.dump(aibom, f, indent=2)
74
-
75
  return aibom
76
-
77
  def _fetch_model_info(self, model_id: str) -> Dict[str, Any]:
78
- """
79
- Fetch model information from the Hugging Face API.
80
-
81
- Args:
82
- model_id: The Hugging Face model ID
83
-
84
- Returns:
85
- Model information as a dictionary
86
- """
87
- # TODO: Implement caching
88
  try:
89
- model_info = self.hf_api.model_info(model_id)
90
- return model_info
91
  except Exception as e:
92
- # Log the error and return empty dict
93
  print(f"Error fetching model info for {model_id}: {e}")
94
  return {}
95
-
96
  def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
97
- """
98
- Fetch the model card for the specified model.
99
-
100
- Args:
101
- model_id: The Hugging Face model ID
102
-
103
- Returns:
104
- ModelCard object if available, None otherwise
105
- """
106
- # TODO: Implement caching
107
  try:
108
- model_card = ModelCard.load(model_id)
109
- return model_card
110
  except Exception as e:
111
- # Log the error and return None
112
  print(f"Error fetching model card for {model_id}: {e}")
113
  return None
114
-
115
  def _create_aibom_structure(
116
  self,
117
  model_id: str,
@@ -119,28 +57,12 @@ class AIBOMGenerator:
119
  model_card: Optional[ModelCard],
120
  use_inference: bool,
121
  ) -> Dict[str, Any]:
122
- """
123
- Create the CycloneDX AIBOM structure.
124
-
125
- Args:
126
- model_id: The Hugging Face model ID
127
- model_info: Model information from the API
128
- model_card: ModelCard object if available
129
- use_inference: Whether to use inference for metadata extraction
130
-
131
- Returns:
132
- CycloneDX AIBOM as a dictionary
133
- """
134
- # Extract structured metadata
135
  metadata = self._extract_structured_metadata(model_id, model_info, model_card)
136
-
137
- # Extract unstructured metadata if requested and available
138
  if use_inference and model_card and self.inference_model_url:
139
  unstructured_metadata = self._extract_unstructured_metadata(model_card)
140
- # Merge with structured metadata, giving priority to structured
141
  metadata = {**unstructured_metadata, **metadata}
142
-
143
- # Create the AIBOM structure
144
  aibom = {
145
  "bomFormat": "CycloneDX",
146
  "specVersion": "1.6",
@@ -148,34 +70,24 @@ class AIBOMGenerator:
148
  "version": 1,
149
  "metadata": self._create_metadata_section(model_id, metadata),
150
  "components": [self._create_component_section(model_id, metadata)],
 
 
 
 
 
 
151
  }
152
-
153
- # Add external references if available
154
- if "external_references" in metadata:
155
- aibom["externalReferences"] = metadata["external_references"]
156
-
157
  return aibom
158
-
159
  def _extract_structured_metadata(
160
  self,
161
  model_id: str,
162
  model_info: Dict[str, Any],
163
  model_card: Optional[ModelCard],
164
  ) -> Dict[str, Any]:
165
- """
166
- Extract structured metadata from model info and model card.
167
-
168
- Args:
169
- model_id: The Hugging Face model ID
170
- model_info: Model information from the API
171
- model_card: ModelCard object if available
172
-
173
- Returns:
174
- Structured metadata as a dictionary
175
- """
176
  metadata = {}
177
-
178
- # Extract from model_info
179
  if model_info:
180
  metadata.update({
181
  "name": model_info.modelId.split("/")[-1] if hasattr(model_info, "modelId") else model_id.split("/")[-1],
@@ -185,12 +97,9 @@ class AIBOMGenerator:
185
  "downloads": model_info.downloads if hasattr(model_info, "downloads") else 0,
186
  "last_modified": model_info.lastModified if hasattr(model_info, "lastModified") else None,
187
  })
188
-
189
- # Extract from model_card
190
  if model_card and model_card.data:
191
  card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
192
-
193
- # Map card data to metadata
194
  metadata.update({
195
  "language": card_data.get("language"),
196
  "license": card_data.get("license"),
@@ -200,189 +109,119 @@ class AIBOMGenerator:
200
  "model_name": card_data.get("model_name"),
201
  "tags": card_data.get("tags", metadata.get("tags", [])),
202
  })
203
-
204
- # Extract evaluation results if available
205
  if hasattr(model_card.data, "eval_results") and model_card.data.eval_results:
206
  metadata["eval_results"] = model_card.data.eval_results
207
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  return {k: v for k, v in metadata.items() if v is not None}
209
-
210
  def _extract_unstructured_metadata(self, model_card: ModelCard) -> Dict[str, Any]:
211
- """
212
- Extract metadata from unstructured text using the inference model.
213
-
214
- Args:
215
- model_card: ModelCard object
216
-
217
- Returns:
218
- Extracted metadata as a dictionary
219
- """
220
- # TODO: Implement inference model integration
221
- # This is a placeholder that will be replaced with actual inference model calls
222
- return {}
223
-
224
  def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
225
- """
226
- Create the metadata section of the CycloneDX AIBOM.
227
-
228
- Args:
229
- model_id: The Hugging Face model ID
230
- metadata: Extracted metadata
231
-
232
- Returns:
233
- Metadata section as a dictionary
234
- """
235
- # Create timestamp
236
  timestamp = datetime.datetime.utcnow().isoformat() + "Z"
237
-
238
- # Create tools section
239
  tools = [{
240
  "vendor": "Aetheris AI",
241
  "name": "aibom-generator",
242
- "version": __import__("aibom_generator").__version__,
243
  }]
244
-
245
- # Create authors section
246
  authors = []
247
  if "author" in metadata and metadata["author"]:
248
  authors.append({
249
  "name": metadata["author"],
250
  "url": f"https://huggingface.co/{metadata['author']}"
251
  })
252
-
253
- # Create component section (reference to the main component)
254
  component = {
255
  "type": "machine-learning-model",
256
  "name": metadata.get("name", model_id.split("/")[-1]),
257
- "bom-ref": f"pkg:huggingface/{model_id}",
258
  }
259
-
260
- # Create properties section
261
  properties = []
262
  for key, value in metadata.items():
263
  if key not in ["name", "author", "license"] and value is not None:
264
  if isinstance(value, (list, dict)):
265
  value = json.dumps(value)
266
- properties.append({
267
- "name": key,
268
- "value": str(value)
269
- })
270
-
271
- # Assemble metadata section
272
  metadata_section = {
273
  "timestamp": timestamp,
274
  "tools": tools,
 
275
  }
276
-
277
  if authors:
278
  metadata_section["authors"] = authors
279
-
280
- if component:
281
- metadata_section["component"] = component
282
-
283
  if properties:
284
  metadata_section["properties"] = properties
285
-
286
  return metadata_section
287
-
288
  def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
289
- """
290
- Create the component section of the CycloneDX AIBOM.
291
-
292
- Args:
293
- model_id: The Hugging Face model ID
294
- metadata: Extracted metadata
295
-
296
- Returns:
297
- Component section as a dictionary
298
- """
299
- # Create basic component information
300
  component = {
301
  "type": "machine-learning-model",
302
- "bom-ref": f"pkg:huggingface/{model_id}",
303
  "name": metadata.get("name", model_id.split("/")[-1]),
304
- "purl": f"pkg:huggingface/{model_id}",
305
  }
306
-
307
- # Add description if available
308
  if "description" in metadata:
309
  component["description"] = metadata["description"]
310
-
311
- # Add version if available
312
  if "version" in metadata:
313
  component["version"] = metadata["version"]
314
-
315
- # Add license if available
316
  if "license" in metadata:
317
- component["licenses"] = [{
318
- "license": {
319
- "id": metadata["license"]
320
- }
321
- }]
322
-
323
- # Add external references
324
- component["externalReferences"] = [
325
- {
326
- "type": "website",
327
- "url": f"https://huggingface.co/{model_id}"
328
- }
329
- ]
330
-
331
- # Add model card section
332
  component["modelCard"] = self._create_model_card_section(metadata)
333
-
334
  return component
335
-
336
  def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
337
- """
338
- Create the modelCard section of the component.
339
-
340
- Args:
341
- metadata: Extracted metadata
342
-
343
- Returns:
344
- ModelCard section as a dictionary
345
- """
346
  model_card_section = {}
347
-
348
- # Add model parameters if available
349
- model_parameters = {}
350
- for param in ["base_model", "library_name", "pipeline_tag"]:
351
- if param in metadata and metadata[param]:
352
- model_parameters[param] = metadata[param]
353
-
354
  if model_parameters:
355
  model_card_section["modelParameters"] = model_parameters
356
-
357
- # Add quantitative analysis if available
358
  if "eval_results" in metadata:
359
- model_card_section["quantitativeAnalysis"] = {
360
- "performanceMetrics": metadata["eval_results"]
361
- }
362
-
363
- # Add considerations if available
364
  considerations = {}
365
- for consideration in ["limitations", "ethical_considerations", "bias", "risks"]:
366
- if consideration in metadata and metadata[consideration]:
367
- considerations[consideration] = metadata[consideration]
368
-
369
  if considerations:
370
  model_card_section["considerations"] = considerations
371
-
372
- # Add properties if available
373
  properties = []
374
  for key, value in metadata.items():
375
- if key not in ["name", "author", "license", "base_model", "library_name",
376
- "pipeline_tag", "eval_results", "limitations",
377
- "ethical_considerations", "bias", "risks"] and value is not None:
378
  if isinstance(value, (list, dict)):
379
  value = json.dumps(value)
380
- properties.append({
381
- "name": key,
382
- "value": str(value)
383
- })
384
-
385
  if properties:
386
  model_card_section["properties"] = properties
387
-
388
  return model_card_section
 
 
 
 
 
1
  import json
2
  import uuid
3
  import datetime
4
+ from typing import Dict, Optional, Any
5
 
6
+ from huggingface_hub import HfApi, ModelCard
7
 
8
 
9
class AIBOMGenerator:
    """Generate CycloneDX 1.6 AI Bills of Materials (AIBOMs) for Hugging Face models."""

    def __init__(
        self,
        hf_token: Optional[str] = None,
        inference_model_url: Optional[str] = None,
        use_inference: bool = True,
        cache_dir: Optional[str] = None,
    ):
        """Initialize the generator.

        Args:
            hf_token: Hugging Face API token for accessing private models.
            inference_model_url: URL of the inference service used to extract
                metadata from unstructured model-card text.
            use_inference: Default for whether the inference service is used.
            cache_dir: Directory to cache API responses (stored but not used yet).
        """
        self.hf_api = HfApi(token=hf_token)
        self.inference_model_url = inference_model_url
        self.use_inference = use_inference
        self.cache_dir = cache_dir

    def generate_aibom(
        self,
        model_id: str,
        output_file: Optional[str] = None,
        include_inference: Optional[bool] = None,
    ) -> Dict[str, Any]:
        """Generate a CycloneDX AIBOM for ``model_id``.

        Args:
            model_id: Hugging Face model ID (e.g. ``"google/bert-base-uncased"``).
            output_file: Optional path; when given, the AIBOM is also written as JSON.
            include_inference: Overrides the instance-level ``use_inference`` when not None.

        Returns:
            The generated AIBOM as a dictionary.
        """
        use_inference = include_inference if include_inference is not None else self.use_inference
        model_info = self._fetch_model_info(model_id)
        model_card = self._fetch_model_card(model_id)
        aibom = self._create_aibom_structure(model_id, model_info, model_card, use_inference)

        if output_file:
            with open(output_file, 'w') as f:
                json.dump(aibom, f, indent=2)

        return aibom

    def _fetch_model_info(self, model_id: str) -> Dict[str, Any]:
        """Fetch model information from the Hugging Face API (best-effort).

        Returns ``{}`` on any error so generation can continue with partial data.
        NOTE(review): on success this actually returns a ``huggingface_hub``
        ``ModelInfo`` object, not a dict — the annotation overstates; confirm callers.
        """
        try:
            return self.hf_api.model_info(model_id)
        except Exception as e:
            print(f"Error fetching model info for {model_id}: {e}")
            return {}

    def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
        """Load the model card for ``model_id``; returns None on any error (best-effort)."""
        try:
            return ModelCard.load(model_id)
        except Exception as e:
            print(f"Error fetching model card for {model_id}: {e}")
            return None

    def _create_aibom_structure(
        self,
        model_id: str,
        model_info: Dict[str, Any],
        model_card: Optional[ModelCard],
        use_inference: bool,
    ) -> Dict[str, Any]:
        """Assemble the top-level CycloneDX document for the model."""
        metadata = self._extract_structured_metadata(model_id, model_info, model_card)

        if use_inference and model_card and self.inference_model_url:
            unstructured_metadata = self._extract_unstructured_metadata(model_card)
            # Merge: structured metadata wins on key collisions.
            metadata = {**unstructured_metadata, **metadata}

        aibom = {
            "bomFormat": "CycloneDX",
            "specVersion": "1.6",
            "serialNumber": f"urn:uuid:{uuid.uuid4()}",
            "version": 1,
            "metadata": self._create_metadata_section(model_id, metadata),
            "components": [self._create_component_section(model_id, metadata)],
            # NOTE(review): dependency list is hard-coded to one pinned
            # transformers release regardless of the model — confirm intent.
            "dependencies": [
                {
                    "ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
                    "dependsOn": ["pkg:pypi/[email protected]"]
                }
            ]
        }
        return aibom

    def _extract_structured_metadata(
        self,
        model_id: str,
        model_info: Dict[str, Any],
        model_card: Optional[ModelCard],
    ) -> Dict[str, Any]:
        """Collect metadata from the API object and model card; drops None values.

        NOTE(review): the author/tags/pipeline_tag and library_name/base_model/datasets
        entries were reconstructed from collapsed diff context and from how the keys
        are consumed elsewhere in this class — confirm against the repository.
        """
        metadata = {}

        if model_info:
            metadata.update({
                "name": model_info.modelId.split("/")[-1] if hasattr(model_info, "modelId") else model_id.split("/")[-1],
                "author": model_info.author if hasattr(model_info, "author") else None,
                "tags": model_info.tags if hasattr(model_info, "tags") else [],
                "pipeline_tag": model_info.pipeline_tag if hasattr(model_info, "pipeline_tag") else None,
                "downloads": model_info.downloads if hasattr(model_info, "downloads") else 0,
                "last_modified": model_info.lastModified if hasattr(model_info, "lastModified") else None,
            })

        if model_card and model_card.data:
            card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
            metadata.update({
                "language": card_data.get("language"),
                "license": card_data.get("license"),
                "library_name": card_data.get("library_name"),
                "base_model": card_data.get("base_model"),
                "datasets": card_data.get("datasets"),
                "model_name": card_data.get("model_name"),
                "tags": card_data.get("tags", metadata.get("tags", [])),
            })

            if hasattr(model_card.data, "eval_results") and model_card.data.eval_results:
                metadata["eval_results"] = model_card.data.eval_results

        # AI-specific fields (manually added or inferred)
        metadata["ai:type"] = "Transformer"
        metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
        # BUG FIX: card_data.get("library_name") can set the key to None; the key then
        # exists, so metadata.get("library_name", "") returned None and
        # `"transformers" in None` raised TypeError.  `or ""` covers missing AND None.
        metadata["ai:framework"] = "PyTorch" if "transformers" in (metadata.get("library_name") or "") else "Unknown"

        # NOTE(review): model-specific hard-coded facts; consider moving to a data
        # file or fetching from the model card instead of matching on the model ID.
        if "DeepSeek-R1" in model_id:
            metadata.update({
                "ai:parameters": "672B total, 37B active per token",
                "ai:training-data": "14.8 trillion tokens",
                "ai:training-duration": "55 days",
                "ai:training-cost": "$5.58 million",
                "ai:hardware": "NVIDIA H800 GPUs"
            })

        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_unstructured_metadata(self, model_card: ModelCard) -> Dict[str, Any]:
        """Extract metadata from free-form card text via the inference service.

        Returns ``{}`` until the inference-model integration is implemented.
        """
        return {}  # Placeholder for inference model integration

    def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the CycloneDX ``metadata`` section (timestamp, tools, authors, component)."""
        # BUG FIX: datetime.utcnow() is deprecated (3.12) and naive; use an aware UTC
        # timestamp while preserving the exact trailing-"Z" format produced before.
        timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")

        tools = [{
            "vendor": "Aetheris AI",
            "name": "aibom-generator",
            "version": "0.1.0"
        }]

        authors = []
        if "author" in metadata and metadata["author"]:
            authors.append({
                "name": metadata["author"],
                "url": f"https://huggingface.co/{metadata['author']}"
            })

        component = {
            "type": "machine-learning-model",
            "name": metadata.get("name", model_id.split("/")[-1]),
            "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}"
        }

        # Everything except identity/license keys becomes a generic property.
        properties = []
        for key, value in metadata.items():
            if key not in ["name", "author", "license"] and value is not None:
                if isinstance(value, (list, dict)):
                    value = json.dumps(value)
                properties.append({"name": key, "value": str(value)})

        metadata_section = {
            "timestamp": timestamp,
            "tools": tools,
            "component": component
        }

        if authors:
            metadata_section["authors"] = authors
        if properties:
            metadata_section["properties"] = properties

        return metadata_section

    def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the machine-learning-model component entry for ``components``."""
        component = {
            "type": "machine-learning-model",
            "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
            "name": metadata.get("name", model_id.split("/")[-1]),
            "purl": f"pkg:generic/{model_id.replace('/', '%2F')}"
        }

        if "description" in metadata:
            component["description"] = metadata["description"]

        if "version" in metadata:
            component["version"] = metadata["version"]

        if "license" in metadata:
            component["licenses"] = [{"license": {"id": metadata["license"]}}]

        component["externalReferences"] = [{
            "type": "website",
            "url": f"https://huggingface.co/{model_id}"
        }]

        component["modelCard"] = self._create_model_card_section(metadata)

        return component

    def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Build the CycloneDX ``modelCard`` sub-section from extracted metadata."""
        model_card_section = {}

        model_parameters = {k: metadata[k] for k in ["base_model", "library_name", "pipeline_tag"] if k in metadata}
        if model_parameters:
            model_card_section["modelParameters"] = model_parameters

        if "eval_results" in metadata:
            model_card_section["quantitativeAnalysis"] = {"performanceMetrics": metadata["eval_results"]}

        considerations = {}
        for k in ["limitations", "ethical_considerations", "bias", "risks"]:
            if k in metadata:
                considerations[k] = metadata[k]
        if considerations:
            model_card_section["considerations"] = considerations

        # Remaining keys (not already mapped above) become generic properties.
        properties = []
        for key, value in metadata.items():
            if key not in ["name", "author", "license", "base_model", "library_name", "pipeline_tag", "eval_results", "limitations", "ethical_considerations", "bias", "risks"]:
                if isinstance(value, (list, dict)):
                    value = json.dumps(value)
                properties.append({"name": key, "value": str(value)})

        if properties:
            model_card_section["properties"] = properties

        return model_card_section