import base64
import datetime
import io
import json
import logging

import numpy as np
from datasets import Dataset, Features, Sequence, Value, load_dataset
from PIL import Image

logger = logging.getLogger(__name__)

HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"


# Custom JSON encoder so numpy scalars and arrays survive json.dumps
# (mirrors the encoder in app_mcp.py).
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


def _pil_to_base64(image: Image.Image) -> str:
    """Converts a PIL Image to a base64-encoded JPEG string."""
    # Explicitly check that the input is a PIL Image.
    if not isinstance(image, Image.Image):
        raise TypeError(f"Expected a PIL Image, but received type: {type(image)}")
    buffered = io.BytesIO()
    # JPEG cannot encode alpha channels or palette images, so normalize to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    image.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def initialize_dataset() -> Dataset:
    """Loads the Hugging Face dataset, or creates an empty one if it does not exist."""
    try:
        # Try to load the existing dataset first.
        dataset = load_dataset(HF_DATASET_NAME, split="train")
        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
    except Exception:
        # If the dataset does not exist, create a new one with an empty structure.
        logger.info(f"Creating new Hugging Face dataset: {HF_DATASET_NAME}")
        # Define the schema explicitly so appended rows always match it.
        features = Features({
            "timestamp": Value("string"),
            "image": Value("string"),                       # base64-encoded JPEG
            "inference_request": Value("string"),           # JSON string
            "model_predictions": Value("string"),           # JSON string
            "ensemble_output": Value("string"),             # JSON string
            "forensic_outputs": Sequence(Value("string")),  # list of base64 image strings
            "agent_monitoring_data": Value("string"),       # JSON string
            "human_feedback": Value("string"),              # JSON string
        })
        dataset = Dataset.from_dict(
            {name: [] for name in features},
            features=features,  # pass the features explicitly
        )
    return dataset


def log_inference_data(
    original_image: Image.Image,
    inference_params: dict,
    model_predictions: list[dict],
    ensemble_output: dict,
    forensic_images: list[Image.Image],
    agent_monitoring_data: dict,
    human_feedback: dict = None,
):
    """Logs a single inference event to the Hugging Face dataset."""
    try:
        dataset = initialize_dataset()

        # Convert PIL Images to base64 strings for storage.
        original_image_b64 = _pil_to_base64(original_image)

        forensic_images_b64 = []
        for img_item in forensic_images:
            if img_item is None:
                continue
            if not isinstance(img_item, Image.Image):
                try:
                    img_item = Image.fromarray(img_item)
                except Exception as e:
                    logger.error(f"Error converting forensic image to PIL for base64 encoding: {e}")
                    continue  # Skip this image if conversion fails.
            # img_item is now guaranteed to be a PIL Image.
            forensic_images_b64.append(_pil_to_base64(img_item))

        new_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "image": original_image_b64,
            "inference_request": json.dumps(inference_params, cls=NumpyEncoder),
            "model_predictions": json.dumps(model_predictions, cls=NumpyEncoder),
            "ensemble_output": json.dumps(ensemble_output, cls=NumpyEncoder),
            "forensic_outputs": forensic_images_b64,  # already a list of strings
            "agent_monitoring_data": json.dumps(agent_monitoring_data, cls=NumpyEncoder),
            "human_feedback": json.dumps(human_feedback if human_feedback is not None else {}, cls=NumpyEncoder),
        }

        # Rebuild the dataset with the new row appended, keeping the original schema.
        # Note this is O(n) per logged event; fine for small datasets.
        features = dataset.features
        dataset_list = dataset.to_list()
        dataset_list.append(new_entry)
        updated_dataset = Dataset.from_list(dataset_list, features=features)

        # Save a local copy, then push to the Hugging Face Hub. Pushing requires
        # being logged in with write access to HF_DATASET_NAME.
        updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache")
        logger.info("Inference data logged successfully to local cache.")
        updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
        logger.info("Inference data pushed to Hugging Face Hub.")
    except Exception as e:
        logger.error(f"Failed to log inference data to Hugging Face dataset: {e}")
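

# --- Usage sketch ---
# A minimal, hypothetical example of how log_inference_data might be called
# after an inference run. The image, parameters, and prediction payloads below
# are placeholders, not outputs of a real pipeline, and pushing to the Hub
# additionally requires `huggingface-cli login` with write access to
# HF_DATASET_NAME.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    dummy_image = Image.new("RGB", (64, 64), color="gray")  # placeholder input image
    log_inference_data(
        original_image=dummy_image,
        inference_params={"confidence_threshold": np.float32(0.75)},  # exercises NumpyEncoder
        model_predictions=[{"model": "detector-v1", "score": 0.91}],  # hypothetical model output
        ensemble_output={"label": "likely_generated", "score": 0.88},
        forensic_images=[dummy_image],  # e.g., an ELA or noise-residual map
        agent_monitoring_data={"latency_ms": 142},
        human_feedback=None,  # defaults to an empty dict in the logged row
    )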