a1c00l committed on
Commit
09c3179
·
verified ·
1 Parent(s): 829e58b

Update src/aibom_generator/utils.py

Browse files
Files changed (1) hide show
  1. src/aibom_generator/utils.py +90 -139
src/aibom_generator/utils.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Utility functions for the AIBOM Generator.
3
  """
4
 
5
  import json
@@ -13,7 +13,6 @@ logger = logging.getLogger(__name__)
13
 
14
 
15
  def setup_logging(level=logging.INFO):
16
- """Set up logging configuration."""
17
  logging.basicConfig(
18
  level=level,
19
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -22,28 +21,16 @@ def setup_logging(level=logging.INFO):
22
 
23
 
24
  def ensure_directory(directory_path):
25
- """Ensure that a directory exists, creating it if necessary."""
26
  if not os.path.exists(directory_path):
27
  os.makedirs(directory_path)
28
  return directory_path
29
 
30
 
31
  def generate_uuid():
32
- """Generate a UUID for the AIBOM serialNumber."""
33
  return str(uuid.uuid4())
34
 
35
 
36
  def normalize_license_id(license_text):
37
- """
38
- Normalize a license string to a SPDX license identifier if possible.
39
-
40
- Args:
41
- license_text: The license text to normalize
42
-
43
- Returns:
44
- SPDX license identifier or the original text if no match
45
- """
46
- # Common license mappings
47
  license_mappings = {
48
  "mit": "MIT",
49
  "apache": "Apache-2.0",
@@ -82,169 +69,133 @@ def normalize_license_id(license_text):
82
  "proprietary": "NONE",
83
  "commercial": "NONE",
84
  }
85
-
86
  if not license_text:
87
  return None
88
-
89
- # Normalize to lowercase and remove punctuation
90
  normalized = re.sub(r'[^\w\s-]', '', license_text.lower())
91
-
92
- # Check for direct matches
93
  if normalized in license_mappings:
94
  return license_mappings[normalized]
95
-
96
- # Check for partial matches
97
  for key, value in license_mappings.items():
98
  if key in normalized:
99
  return value
100
-
101
- # Return original if no match
102
  return license_text
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
105
  def calculate_completeness_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
106
- """
107
- Calculate a completeness score for the AIBOM.
108
-
109
- Args:
110
- aibom: The AIBOM dictionary
111
-
112
- Returns:
113
- Dictionary containing:
114
- - total_score: overall completeness score (0-100)
115
- - section_scores: points earned per section
116
- - field_checklist: dictionary showing presence (βœ”) or absence (✘) of key fields
117
- """
118
- score = 0
119
- max_score = 100
120
- section_scores = {}
121
  field_checklist = {}
122
 
123
- # Define scoring weights for different sections
124
- weights = {
125
- "required_fields": 20,
126
- "metadata": 20,
127
- "component_basic": 20,
128
- "component_model_card": 30,
129
- "external_references": 10,
130
- }
131
-
132
- # Required Fields
133
  required_fields = ["bomFormat", "specVersion", "serialNumber", "version"]
134
- required_present = [field for field in required_fields if field in aibom]
135
- required_score = (len(required_present) / len(required_fields)) * weights["required_fields"]
136
- section_scores["required_fields"] = round(required_score)
137
  for field in required_fields:
138
- field_checklist[field] = "βœ”" if field in required_present else "✘"
139
-
140
- # Metadata Fields
141
- metadata_score = 0
142
- if "metadata" in aibom:
143
- metadata_fields = ["timestamp", "tools", "authors", "component"]
144
- present = [field for field in metadata_fields if field in aibom["metadata"]]
145
- metadata_score = (len(present) / len(metadata_fields)) * weights["metadata"]
146
- for field in metadata_fields:
147
- field_checklist[f"metadata.{field}"] = "βœ”" if field in present else "✘"
148
- section_scores["metadata"] = round(metadata_score)
149
-
150
- # Component Basic Info
151
- component_score = 0
152
- component = aibom.get("components", [{}])[0]
153
- component_fields = ["type", "name", "bom-ref", "purl", "description", "licenses"]
154
- present = [field for field in component_fields if field in component]
155
- component_score = (len(present) / len(component_fields)) * weights["component_basic"]
156
- section_scores["component_basic"] = round(component_score)
157
- for field in component_fields:
158
- field_checklist[f"component.{field}"] = "βœ”" if field in present else "✘"
159
-
160
- # Model Card Section
161
- model_card_score = 0
162
- model_card_fields = ["modelParameters", "quantitativeAnalysis", "considerations"]
163
- if "modelCard" in component:
164
- model_card = component["modelCard"]
165
- present = [field for field in model_card_fields if field in model_card]
166
- model_card_score = (len(present) / len(model_card_fields)) * weights["component_model_card"]
167
- for field in model_card_fields:
168
- field_checklist[f"modelCard.{field}"] = "βœ”" if field in present else "✘"
169
- else:
170
- for field in model_card_fields:
171
- field_checklist[f"modelCard.{field}"] = "✘"
172
- section_scores["component_model_card"] = round(model_card_score)
173
-
174
- # External References
175
- ext_score = weights["external_references"] if aibom.get("externalReferences") else 0
176
- section_scores["external_references"] = round(ext_score)
177
- field_checklist["externalReferences"] = "βœ”" if ext_score else "✘"
178
-
179
- # Final total score
180
- total_score = round(sum(section_scores.values()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  return {
183
- "total_score": total_score,
184
- "section_scores": section_scores,
 
 
 
 
 
 
185
  "field_checklist": field_checklist
186
  }
187
 
 
188
  def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
189
- """
190
- Merge two metadata dictionaries, giving priority to the primary dictionary.
191
-
192
- Args:
193
- primary: Primary metadata dictionary
194
- secondary: Secondary metadata dictionary
195
-
196
- Returns:
197
- Merged metadata dictionary
198
- """
199
  result = secondary.copy()
200
-
201
  for key, value in primary.items():
202
  if value is not None:
203
  if key in result and isinstance(value, dict) and isinstance(result[key], dict):
204
  result[key] = merge_metadata(value, result[key])
205
  else:
206
  result[key] = value
207
-
208
  return result
209
 
210
 
211
  def extract_model_id_parts(model_id: str) -> Dict[str, str]:
212
- """
213
- Extract parts from a Hugging Face model ID.
214
-
215
- Args:
216
- model_id: Hugging Face model ID (e.g., "google/bert-base-uncased")
217
-
218
- Returns:
219
- Dictionary with parts (owner, name)
220
- """
221
  parts = model_id.split("/")
222
-
223
  if len(parts) == 1:
224
- return {
225
- "owner": None,
226
- "name": parts[0],
227
- }
228
- else:
229
- return {
230
- "owner": parts[0],
231
- "name": "/".join(parts[1:]),
232
- }
233
 
234
 
235
  def create_purl(model_id: str) -> str:
236
- """
237
- Create a Package URL (purl) for a Hugging Face model.
238
-
239
- Args:
240
- model_id: Hugging Face model ID
241
-
242
- Returns:
243
- Package URL string
244
- """
245
  parts = extract_model_id_parts(model_id)
246
-
247
  if parts["owner"]:
248
  return f"pkg:huggingface/{parts['owner']}/{parts['name']}"
249
- else:
250
- return f"pkg:huggingface/{parts['name']}"
 
1
  """
2
+ Utility functions for the AIBOM Generator with restored field_checklist support.
3
  """
4
 
5
  import json
 
13
 
14
 
15
  def setup_logging(level=logging.INFO):
 
16
  logging.basicConfig(
17
  level=level,
18
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 
21
 
22
 
def ensure_directory(directory_path):
    """Create ``directory_path`` (and any missing parents) if it does not exist.

    Args:
        directory_path: Path of the directory to create.

    Returns:
        The ``directory_path`` argument, unchanged, so the call can be chained.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between check and create.
    os.makedirs(directory_path, exist_ok=True)
    return directory_path
27
 
28
 
def generate_uuid():
    """Return a newly generated random (version 4) UUID in string form."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
31
 
32
 
33
  def normalize_license_id(license_text):
 
 
 
 
 
 
 
 
 
 
34
  license_mappings = {
35
  "mit": "MIT",
36
  "apache": "Apache-2.0",
 
69
  "proprietary": "NONE",
70
  "commercial": "NONE",
71
  }
72
+
73
  if not license_text:
74
  return None
75
+
 
76
  normalized = re.sub(r'[^\w\s-]', '', license_text.lower())
77
+
 
78
  if normalized in license_mappings:
79
  return license_mappings[normalized]
80
+
 
81
  for key, value in license_mappings.items():
82
  if key in normalized:
83
  return value
84
+
 
85
  return license_text
86
 
87
 
# License identifiers accepted by validate_spdx(). "NONE" is included because
# normalize_license_id() maps proprietary/commercial licenses to it.
_SPDX_LICENSES = frozenset({
    "MIT", "Apache-2.0", "GPL-3.0-only", "GPL-2.0-only", "LGPL-3.0-only",
    "BSD-3-Clause", "BSD-2-Clause", "CC-BY-4.0", "CC-BY-SA-4.0", "CC0-1.0",
    "Unlicense", "NONE",
})


def _license_id(entry):
    """Return the license identifier string carried by ``entry``, or None."""
    if isinstance(entry, str):
        return entry
    if isinstance(entry, dict):
        # CycloneDX shape: {"license": {"id": "MIT"}}
        nested = entry.get("license")
        if isinstance(nested, dict) and isinstance(nested.get("id"), str):
            return nested["id"]
        # Flat shapes: {"id": "MIT"} or {"expression": "MIT"}. Compound
        # expressions ("MIT OR Apache-2.0") are not parsed and will not match.
        for key in ("id", "expression"):
            if isinstance(entry.get(key), str):
                return entry[key]
    return None


def validate_spdx(license_entry):
    """Check whether a license declaration uses only recognised identifiers.

    Args:
        license_entry: A license identifier string, a license object
            (``{"license": {"id": ...}}``, ``{"id": ...}`` or
            ``{"expression": ...}``), or a list/tuple of either.

    Returns:
        True if every declared license is in the accepted identifier set.
        False for unknown identifiers, unsupported value types, and for an
        empty list (declaring no license is not a valid declaration).
    """
    if isinstance(license_entry, (list, tuple)):
        # bool(...) guards against all([]) being vacuously True.
        return bool(license_entry) and all(
            _license_id(item) in _SPDX_LICENSES for item in license_entry
        )
    return _license_id(license_entry) in _SPDX_LICENSES
97
+
98
+
def _mark(present):
    """Return the checklist glyph: check mark if ``present`` is truthy, else cross."""
    return "✔" if present else "✘"


def _longer_than(value, minimum):
    """Return True if ``value`` is sized and its length exceeds ``minimum``."""
    return hasattr(value, "__len__") and len(value) > minimum


def _score_component(comp):
    """Score the basic identity fields of one component (0-20 points)."""
    purl = comp.get("purl")
    licenses = comp.get("licenses")
    purl_ok = isinstance(purl, str) and re.match(r'^pkg:huggingface/.+', purl)
    return sum([
        2 if comp.get("type") else 0,
        4 if comp.get("name") else 0,
        2 if comp.get("bom-ref") else 0,
        4 if purl_ok else 0,
        4 if _longer_than(comp.get("description"), 20) else 0,
        4 if licenses and validate_spdx(licenses) else 0,
    ])


def _score_model_card(card):
    """Score the model card of one component (0-30 points)."""
    return sum([
        10 if card.get("modelParameters") else 0,
        10 if card.get("quantitativeAnalysis") else 0,
        # NOTE(review): the length threshold treats considerations as sized
        # content (string/collection) - confirm against the producer's schema.
        10 if _longer_than(card.get("considerations"), 50) else 0,
    ])


def _score_external_references(ext_refs):
    """Score external references by URL kind, capped at 10 points."""
    score = 0
    for ref in ext_refs:
        if not isinstance(ref, dict):
            continue
        # A reference may carry "url": None; coerce so .lower() cannot fail.
        url = str(ref.get("url") or "").lower()
        if "modelcard" in url:
            score += 4
        elif "huggingface.co" in url or "github.com" in url:
            score += 3
        elif "dataset" in url:
            score += 3
    return min(score, 10)


def calculate_completeness_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
    """Calculate a completeness score for an AIBOM document.

    Each section contributes already-weighted points: required fields (20),
    metadata (20), component basics (20), model card (30) and external
    references (10). Component and model-card points are averaged over all
    components. The total is the plain sum of the sections, i.e. 0-100.

    Args:
        aibom: The AIBOM dictionary. Missing or null sections score 0.

    Returns:
        Dictionary containing:
            - total_score: overall completeness score (0-100), 2 decimals
            - section_scores: points earned per section
            - field_checklist: presence (✔) or absence (✘) of key fields;
              every key is always emitted, and a component/modelCard field
              is ✔ when at least one component provides a non-empty value
    """
    field_checklist = {}

    required_fields = ["bomFormat", "specVersion", "serialNumber", "version"]
    required_score = sum(5 for field in required_fields if aibom.get(field))
    for field in required_fields:
        field_checklist[field] = _mark(aibom.get(field))

    metadata = aibom.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}
    metadata_fields = ["timestamp", "tools", "authors", "component"]
    metadata_score = sum(5 for field in metadata_fields if metadata.get(field))
    for field in metadata_fields:
        field_checklist[f"metadata.{field}"] = _mark(metadata.get(field))

    comp_fields = ["type", "name", "bom-ref", "purl", "description", "licenses"]
    card_fields = ["modelParameters", "quantitativeAnalysis", "considerations"]
    # Seed every entry as missing so the checklist has a stable key set even
    # when the BOM lists no components.
    for field in comp_fields:
        field_checklist[f"component.{field}"] = _mark(False)
    for field in card_fields:
        field_checklist[f"modelCard.{field}"] = _mark(False)

    components = [
        comp for comp in (aibom.get("components") or [])
        if isinstance(comp, dict)
    ]
    component_scores = []
    model_card_scores = []
    for comp in components:
        card = comp.get("modelCard")
        if not isinstance(card, dict):
            card = {}
        component_scores.append(_score_component(comp))
        model_card_scores.append(_score_model_card(card))
        # Only ever upgrade to a check mark, so a later, sparser component
        # cannot overwrite what an earlier one provided.
        for field in comp_fields:
            if comp.get(field):
                field_checklist[f"component.{field}"] = _mark(True)
        for field in card_fields:
            if card.get(field):
                field_checklist[f"modelCard.{field}"] = _mark(True)

    avg_comp_score = (
        sum(component_scores) / len(component_scores) if component_scores else 0
    )
    avg_model_card_score = (
        sum(model_card_scores) / len(model_card_scores) if model_card_scores else 0
    )

    ext_refs = aibom.get("externalReferences") or []
    ext_score = _score_external_references(ext_refs)
    field_checklist["externalReferences"] = _mark(ext_refs)

    # Section scores are already expressed in weighted points; multiplying
    # them by the section weights again would cap the total at 22.
    total_score = (
        required_score
        + metadata_score
        + avg_comp_score
        + avg_model_card_score
        + ext_score
    )

    return {
        "total_score": round(total_score, 2),
        "section_scores": {
            "required_fields": required_score,
            "metadata": metadata_score,
            "component_basic": avg_comp_score,
            "component_model_card": avg_model_card_score,
            "external_references": ext_score,
        },
        "field_checklist": field_checklist,
    }
177
 
178
+
def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
    """Merge two metadata dictionaries, letting ``primary`` win on conflicts.

    Args:
        primary: Dictionary whose non-None values take precedence.
        secondary: Dictionary supplying fallback values.

    Returns:
        A new dictionary starting from a shallow copy of ``secondary``.
        Keys whose values are dicts on both sides are merged recursively;
        ``None`` values in ``primary`` never override ``secondary``.
    """
    merged = secondary.copy()
    for name, preferred in primary.items():
        if preferred is None:
            # Keep whatever the secondary side had (or leave the key absent).
            continue
        fallback = merged.get(name)
        if isinstance(preferred, dict) and isinstance(fallback, dict):
            merged[name] = merge_metadata(preferred, fallback)
            continue
        merged[name] = preferred
    return merged
188
 
189
 
def extract_model_id_parts(model_id: str) -> Dict[str, str]:
    """Split a model ID such as ``"google/bert-base-uncased"`` into parts.

    Args:
        model_id: Model ID, optionally prefixed with ``owner/``.

    Returns:
        Dictionary with keys ``owner`` and ``name``. ``owner`` is ``None``
        when the ID contains no ``/``; otherwise it is the text before the
        first ``/`` and ``name`` is everything after it.
    """
    owner, slash, remainder = model_id.partition("/")
    if not slash:
        # No separator at all: the whole ID is the model name.
        return {"owner": None, "name": owner}
    return {"owner": owner, "name": remainder}
 
 
 
 
 
 
 
195
 
196
 
def create_purl(model_id: str) -> str:
    """Build a ``pkg:huggingface/...`` Package URL for a model ID.

    Args:
        model_id: Model ID, optionally prefixed with ``owner/``.

    Returns:
        ``pkg:huggingface/<owner>/<name>`` when the ID has a non-empty
        owner, otherwise ``pkg:huggingface/<name>``.
    """
    id_parts = extract_model_id_parts(model_id)
    owner, name = id_parts["owner"], id_parts["name"]
    path = f"{owner}/{name}" if owner else name
    return f"pkg:huggingface/{path}"