File size: 4,422 Bytes
8819832
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Integration with the main generator class to use the inference model.
"""

import logging
from typing import Dict, List, Optional, Any

from huggingface_hub import ModelCard

from aibom_generator.inference import MetadataExtractor
from aibom_generator.utils import merge_metadata

# Module-level logger, named after this module via ``__name__``; used for the
# warning/info messages emitted by InferenceModelIntegration below.
logger = logging.getLogger(__name__)


class InferenceModelIntegration:
    """
    Bridge between the generator and the inference-based metadata extractor.

    Wraps a ``MetadataExtractor`` and uses it to fill in metadata fields that
    are absent or empty in the structured metadata, based on the free text of
    a model card.
    """

    # Fields inspected by ``_identify_missing_fields``. Any of these that is
    # absent or empty in the structured metadata is requested from the
    # extractor. The order here is the order of the returned list.
    _IMPORTANT_FIELDS = (
        "description",
        "license",
        "model_parameters",
        "datasets",
        "evaluation_results",
        "limitations",
        "ethical_considerations",
    )

    def __init__(
        self,
        inference_url: Optional[str] = None,
        use_inference: bool = True,
    ):
        """
        Initialize the inference model integration.

        Both arguments are forwarded unchanged to ``MetadataExtractor``.

        Args:
            inference_url: URL of the inference model service
            use_inference: Whether to use the inference model
        """
        self.extractor = MetadataExtractor(inference_url, use_inference)

    def extract_metadata_from_model_card(
        self,
        model_card: "ModelCard",
        structured_metadata: Optional[Dict[str, Any]] = None,
        fields: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Extract metadata from a model card using the inference model.

        Args:
            model_card: The ModelCard object; its ``text`` attribute is the
                input handed to the extractor
            structured_metadata: Optional structured metadata to provide context
            fields: Optional list of specific fields to extract

        Returns:
            Whatever ``self.extractor.extract_metadata`` returns, or an empty
            dictionary when there is no model card or it has no text.
        """
        if not model_card:
            logger.warning("No model card provided for inference extraction")
            return {}

        # An object without a ``text`` attribute is treated like an empty card.
        model_card_text = getattr(model_card, "text", "")
        if not model_card_text:
            logger.warning("Model card has no text content for inference extraction")
            return {}

        return self.extractor.extract_metadata(
            model_card_text, structured_metadata, fields
        )

    def enhance_metadata(
        self,
        structured_metadata: Optional[Dict[str, Any]],
        model_card: "ModelCard",
    ) -> Dict[str, Any]:
        """
        Enhance structured metadata with information extracted from the model card.

        Args:
            structured_metadata: Structured metadata from API; ``None`` is
                treated as an empty dictionary
            model_card: The ModelCard object

        Returns:
            The structured metadata itself when none of the important fields
            is missing, otherwise the result of ``merge_metadata``.
        """
        # Tolerate a missing metadata dict instead of failing with a TypeError
        # in the membership checks below.
        if structured_metadata is None:
            structured_metadata = {}

        missing_fields = self._identify_missing_fields(structured_metadata)
        if not missing_fields:
            logger.info("No missing fields to extract from unstructured text")
            return structured_metadata

        # Only ask the extractor for the fields that are actually missing.
        extracted_metadata = self.extract_metadata_from_model_card(
            model_card, structured_metadata, missing_fields
        )

        # NOTE(review): merge_metadata is defined in aibom_generator.utils;
        # structured metadata is expected to take precedence - confirm there.
        return merge_metadata(structured_metadata, extracted_metadata)

    @staticmethod
    def _is_blank(value: Any) -> bool:
        """Return True for a falsy value or a dict whose values are all falsy."""
        if not value:
            # Covers None, "", 0, and empty containers (lists, dicts, ...).
            return True
        return isinstance(value, dict) and not any(value.values())

    def _identify_missing_fields(
        self, metadata: Optional[Dict[str, Any]]
    ) -> List[str]:
        """
        Identify fields that are missing or incomplete in the metadata.

        A field counts as missing when its key is absent, its value is falsy,
        or its value is a dict containing only falsy values.

        Args:
            metadata: The metadata to check; ``None`` or an empty dict means
                every important field is missing

        Returns:
            List of missing field names, in ``_IMPORTANT_FIELDS`` order
        """
        if not metadata:
            return list(self._IMPORTANT_FIELDS)

        return [
            name
            for name in self._IMPORTANT_FIELDS
            if self._is_blank(metadata.get(name))
        ]