# Space: aetheris-ai/aibom-generator (status: Running)
# File size: 12,587 bytes
"""
Inference model integration for extracting metadata from unstructured text.
"""

import json
import logging
import re
import requests
from typing import Dict, List, Optional, Any, Union

logger = logging.getLogger(__name__)


class InferenceModelClient:
    """
    Client for interacting with the inference model service to extract
    metadata from unstructured text in model cards.

    The service is called with an HTTP POST carrying a JSON payload; the
    extracted fields are read from the "metadata" entry of the JSON reply.
    """

    def __init__(
        self,
        inference_url: str,
        timeout: int = 30,
        max_retries: int = 3,
    ):
        """
        Initialize the inference model client.

        Args:
            inference_url: URL of the inference model service
            timeout: Request timeout in seconds
            max_retries: Maximum number of attempts made for a single request
        """
        self.inference_url = inference_url
        self.timeout = timeout
        self.max_retries = max_retries

    def extract_metadata(
        self,
        model_card_text: str,
        structured_metadata: Optional[Dict[str, Any]] = None,
        fields: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Extract metadata from unstructured text using the inference model.

        This method is best-effort: it never raises. Any failure while
        talking to the service is logged and reported as an empty result.

        Args:
            model_card_text: The text content of the model card
            structured_metadata: Optional structured metadata to provide context
            fields: Optional list of specific fields to extract

        Returns:
            Extracted metadata as a dictionary. An empty dictionary is returned
            when no URL is configured, when the request fails, or when the
            reply's "metadata" entry is missing, null, or not a JSON object.
        """
        if not self.inference_url:
            logger.warning("No inference model URL provided, skipping extraction")
            return {}

        # Prepare the request payload; None arguments are sent as empty containers
        payload = {
            "text": model_card_text,
            "structured_metadata": structured_metadata or {},
            "fields": fields or [],
        }

        # Make the request to the inference model
        try:
            response = self._make_request(payload)
            # A non-dict reply (e.g. a JSON list) has no .get and is handled
            # by the except branch below, like any other failure.
            metadata = response.get("metadata", {})
        except Exception as e:
            logger.error(f"Error extracting metadata with inference model: {e}")
            return {}

        # Honour the declared return type: a reply such as {"metadata": null}
        # must not leak None (or a list/string) to callers that merge the result.
        return metadata if isinstance(metadata, dict) else {}

    def _make_request(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """
        Make a request to the inference model service.

        The POST is attempted up to ``max_retries`` times; attempts are made
        back to back, with no delay in between.

        Args:
            payload: Request payload, sent as the JSON body

        Returns:
            The decoded JSON body of the first successful response

        Raises:
            requests.exceptions.RequestException: The error of the last attempt,
                once every attempt has failed
            Exception: If ``max_retries`` is zero or negative, so that no
                attempt was made at all
        """
        headers = {"Content-Type": "application/json"}

        for attempt in range(self.max_retries):
            try:
                response = requests.post(
                    self.inference_url,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout,
                )
                # Turn 4xx/5xx status codes into exceptions so they are retried too
                response.raise_for_status()
                return response.json()
            except requests.exceptions.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt+1}/{self.max_retries}): {e}")
                # Out of attempts: let the caller see the last error
                if attempt == self.max_retries - 1:
                    raise

        # Reached only when the loop body never ran, i.e. max_retries <= 0
        raise Exception("Failed to make request to inference model")


class FallbackExtractor:
    """
    Fallback extractor for extracting metadata using regex and heuristics
    when the inference model is not available or fails.

    All matching is case-insensitive and purely textual, so results are
    best-effort guesses rather than validated values.
    """

    # Phrases that introduce the model architecture; the first pattern that matches wins.
    _ARCHITECTURE_PATTERNS = (
        r"(?:model|architecture)(?:\s+type)?(?:\s*:\s*|\s+is\s+)([A-Za-z0-9\-]+)",
        r"based\s+on\s+(?:the\s+)?([A-Za-z0-9\-]+)(?:\s+architecture)?",
    )

    # Phrases that state the parameter count; only the numeric part is captured.
    _PARAMETER_COUNT_PATTERNS = (
        r"(\d+(?:\.\d+)?)\s*(?:B|M|K)?\s*(?:billion|million|thousand)?\s*parameters",
        r"parameters\s*:\s*(\d+(?:\.\d+)?)\s*(?:B|M|K)?",
    )

    # Phrases that mention a dataset; every match of every pattern is collected.
    _DATASET_PATTERNS = (
        r"trained\s+on\s+(?:the\s+)?([A-Za-z0-9\-\s]+)(?:\s+dataset)?",
        r"dataset(?:\s*:\s*|\s+is\s+)([A-Za-z0-9\-\s]+)",
        r"using\s+(?:the\s+)?([A-Za-z0-9\-\s]+)(?:\s+dataset)",
    )

    # Captures that are pronouns rather than dataset names.
    _DATASET_STOPWORDS = ("this", "these", "those")

    # (result key, pattern) pairs for the supported evaluation metrics.
    _METRIC_PATTERNS = (
        ("accuracy", r"accuracy(?:\s*:\s*|\s+of\s+|\s+is\s+)(\d+(?:\.\d+)?)\s*%?"),
        ("f1", r"f1(?:\s*[\-_]?score)?(?:\s*:\s*|\s+of\s+|\s+is\s+)(\d+(?:\.\d+)?)"),
        ("precision", r"precision(?:\s*:\s*|\s+of\s+|\s+is\s+)(\d+(?:\.\d+)?)"),
        ("recall", r"recall(?:\s*:\s*|\s+of\s+|\s+is\s+)(\d+(?:\.\d+)?)"),
    )

    # A markdown ATX heading at the start of a line; group 1 holds the '#' run.
    _ATX_HEADING = re.compile(r"^(#{1,6})[ \t]+\S", re.MULTILINE)

    def extract_metadata(
        self,
        model_card_text: str,
        structured_metadata: Optional[Dict[str, Any]] = None,
        fields: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Extract metadata using regex and heuristics.

        Args:
            model_card_text: The text content of the model card
            structured_metadata: Accepted for interface parity with the
                inference client; not used by the heuristics
            fields: Accepted for interface parity with the inference client;
                not used, every heuristic is always run

        Returns:
            Extracted metadata as a dictionary. Possible top-level keys are
            "model_parameters", "considerations", "datasets" and
            "evaluation_results"; a key is present only if something was found.
        """
        # Nothing to scan (also protects the regex calls from a None argument)
        if not model_card_text:
            return {}

        metadata = {}

        # Extract model parameters
        metadata.update(self._extract_model_parameters(model_card_text))

        # Extract limitations and ethical considerations
        metadata.update(self._extract_considerations(model_card_text))

        # Extract datasets
        metadata.update(self._extract_datasets(model_card_text))

        # Extract evaluation results
        metadata.update(self._extract_evaluation_results(model_card_text))

        return metadata

    def _extract_model_parameters(self, text: str) -> Dict[str, Any]:
        """
        Extract the architecture name and parameter count from text.

        Returns:
            {"model_parameters": {...}} with optional "architecture" and
            "parameters" string entries, or {} when neither was found.
        """
        params = {}

        # Extract model type/architecture
        for pattern in self._ARCHITECTURE_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                params["architecture"] = match.group(1).strip()
                break

        # Extract number of parameters
        for pattern in self._PARAMETER_COUNT_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Only the digits are kept; the B/M/K or billion/million/thousand
                # suffix is matched but discarded.
                # TODO: Normalize to a standard unit
                params["parameters"] = match.group(1).strip()
                break

        return {"model_parameters": params} if params else {}

    def _extract_considerations(self, text: str) -> Dict[str, Any]:
        """
        Extract limitations and ethical considerations from text.

        Returns:
            {"considerations": {...}} with optional "limitations" and
            "ethical_considerations" entries, or {} when neither section exists.
        """
        considerations = {}

        # Extract limitations
        limitations_section = self._extract_section(text, ["limitations", "limits", "shortcomings"])
        if limitations_section:
            considerations["limitations"] = limitations_section

        # Extract ethical considerations
        ethics_section = self._extract_section(
            text, ["ethical considerations", "ethics", "bias", "fairness", "risks"]
        )
        if ethics_section:
            considerations["ethical_considerations"] = ethics_section

        return {"considerations": considerations} if considerations else {}

    def _extract_datasets(self, text: str) -> Dict[str, Any]:
        """
        Extract dataset information from text.

        Returns:
            {"datasets": [...]} listing each distinct mention once, in order of
            first appearance, or {} when no dataset was mentioned.
        """
        mentions = []

        # Extract dataset mentions
        for pattern in self._DATASET_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                dataset = match.group(1).strip()
                if dataset and dataset.lower() not in self._DATASET_STOPWORDS:
                    mentions.append(dataset)

        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # output is reproducible (a set would order names by their hash).
        datasets = list(dict.fromkeys(mentions))

        return {"datasets": datasets} if datasets else {}

    def _extract_evaluation_results(self, text: str) -> Dict[str, Any]:
        """
        Extract evaluation results from text.

        Returns:
            {"evaluation_results": {...}} mapping each metric that was found
            ("accuracy", "f1", "precision", "recall") to its value as a float,
            or {} when no metric was found. Values are taken as written: a
            percent sign after an accuracy figure is matched but not applied.
        """
        results = {}

        for metric, pattern in self._METRIC_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                results[metric] = float(match.group(1))

        return {"evaluation_results": results} if results else {}

    def _extract_section(self, text: str, section_names: List[str]) -> Optional[str]:
        """
        Extract a section from the text based on section names.

        A header is a line consisting of one of the names, optionally followed
        by a colon, and introduced by markdown '#' marks, a number and a dot,
        or a "LABEL:" prefix. The section runs from that header to whichever
        comes first: the next header carrying one of the same names, the next
        markdown heading of the same or a higher level (any markdown heading
        when the header itself is not one), or the end of the text.

        Args:
            text: The text to extract from
            section_names: Possible names for the section

        Returns:
            The stripped text of the first non-empty matching section, or None
            if not found
        """
        if not text or not section_names:
            return None

        # Names are escaped so that regex metacharacters in them match literally
        names = "|".join(re.escape(name) for name in section_names)

        # Create pattern to match section headers; group 1 is the '#' run, if any
        header_pattern = (
            r"(?:^|\n)(?:(#+)\s*|[0-9]+\.\s*|[A-Z\s]+:\s*)(?:{})(?:\s*:)?(?:\s*\n|\s*$)".format(names)
        )

        # Find all section headers
        headers = list(re.finditer(header_pattern, text, re.IGNORECASE))

        for i, match in enumerate(headers):
            start = match.end()

            # Upper bound: the next header with one of the requested names
            if i < len(headers) - 1:
                end = headers[i + 1].start()
            else:
                end = len(text)

            # Tighter bound: the next markdown heading that is not a sub-heading
            # of this section. Without it the section would swallow every later
            # section of the document (License, Citation, ...).
            hashes = match.group(1)
            own_level = len(hashes) if hashes else None
            for heading in self._ATX_HEADING.finditer(text, start, end):
                if own_level is None or len(heading.group(1)) <= own_level:
                    end = heading.start()
                    break

            # Extract the section content
            section = text[start:end].strip()

            if section:
                return section

        return None


class MetadataExtractor:
    """
    Two-stage metadata extractor: the inference model is consulted first and
    the regex-based fallback fills in whatever is still missing.
    """

    def __init__(
        self,
        inference_url: Optional[str] = None,
        use_inference: bool = True,
    ):
        """
        Set up the inference client (when enabled) and the fallback extractor.

        Args:
            inference_url: URL of the inference model service
            use_inference: Whether to use the inference model; it is only
                honoured when an inference_url is also given
        """
        inference_enabled = use_inference and inference_url is not None
        self.use_inference = inference_enabled
        if inference_enabled:
            self.inference_client = InferenceModelClient(inference_url)
        else:
            self.inference_client = None
        self.fallback_extractor = FallbackExtractor()

    def extract_metadata(
        self,
        model_card_text: str,
        structured_metadata: Optional[Dict[str, Any]] = None,
        fields: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Extract metadata from model card text.

        Args:
            model_card_text: The text content of the model card
            structured_metadata: Optional structured metadata to provide context
            fields: Optional list of specific fields to extract

        Returns:
            Extracted metadata as a dictionary. Values from the inference
            model take precedence; fallback values are used only for keys
            that are absent or falsy.
        """
        metadata = {}

        # Stage 1: inference model, when configured. A failure here is logged
        # and simply leaves the result empty for stage 2.
        if self.use_inference and self.inference_client:
            try:
                metadata.update(
                    self.inference_client.extract_metadata(
                        model_card_text, structured_metadata, fields
                    )
                )
            except Exception as e:
                logger.error(f"Inference model extraction failed: {e}")

        # Stage 1 was sufficient if it produced something and either no
        # specific fields were requested or all of them are present.
        if metadata:
            if not fields:
                return metadata
            if all(field in metadata for field in fields):
                return metadata

        # Stage 2: ask the fallback for what is still missing (None when no
        # specific fields were requested).
        if fields:
            missing_fields = [name for name in fields if name not in metadata]
        else:
            missing_fields = None

        fallback_metadata = self.fallback_extractor.extract_metadata(
            model_card_text, structured_metadata, missing_fields
        )

        # Fallback values never overwrite a non-empty inference value.
        for key, value in fallback_metadata.items():
            if not metadata.get(key):
                metadata[key] = value

        return metadata