import base64
import datetime
import io
import json
import logging

import numpy as np
from datasets import Dataset, Features, Sequence, Value, load_dataset
from PIL import Image

logger = logging.getLogger(__name__)

HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"

# Custom JSON encoder to handle numpy types (kept in sync with the copy in app_mcp.py).
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):  # covers np.float32, np.float64, ...
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
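
# Example (illustrative): plain json.dumps raises "Object of type float32 is not
# JSON serializable" on numpy scalars; with the encoder they serialize cleanly:
#   json.dumps({"score": np.float32(0.5), "logits": np.array([0.1, 0.9])}, cls=NumpyEncoder)
#   -> '{"score": 0.5, "logits": [0.1, 0.9]}'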

def _pil_to_base64(image: Image.Image) -> str:
    """Converts a PIL Image to a base64-encoded JPEG string."""
    # Explicitly check that the input is a PIL Image.
    if not isinstance(image, Image.Image):
        raise TypeError(f"Expected a PIL Image, but received type: {type(image)}")
    buffered = io.BytesIO()
    # Ensure the image is in RGB mode before saving as JPEG (JPEG has no alpha).
    if image.mode != "RGB":
        image = image.convert("RGB")
    image.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
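
# Round-trip note (illustrative; this module does not currently need a decoder):
# a stored string can be recovered as a PIL image with the inverse transform:
#   Image.open(io.BytesIO(base64.b64decode(b64_string)))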

def initialize_dataset():
    """Initializes or loads the Hugging Face dataset."""
    try:
        # Try to load the existing dataset from the Hub.
        dataset = load_dataset(HF_DATASET_NAME, split="train")
        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
    except Exception:
        # If the dataset cannot be loaded (e.g. it does not exist yet),
        # create a new, empty one with an explicit schema.
        logger.info(f"Creating new Hugging Face dataset: {HF_DATASET_NAME}")
        features = Features({
            "timestamp": Value("string"),
            "image": Value("string"),  # base64-encoded JPEG string
            "inference_request": Value("string"),  # JSON string
            "model_predictions": Value("string"),  # JSON string
            "ensemble_output": Value("string"),  # JSON string
            "forensic_outputs": Sequence(Value("string")),  # list of base64 image strings
            "agent_monitoring_data": Value("string"),  # JSON string
            "human_feedback": Value("string"),  # JSON string
        })
        dataset = Dataset.from_dict({
            "timestamp": [],
            "image": [],
            "inference_request": [],
            "model_predictions": [],
            "ensemble_output": [],
            "forensic_outputs": [],
            "agent_monitoring_data": [],
            "human_feedback": [],
        }, features=features)
    return dataset
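
# Reading a row back (illustrative sketch; field names follow the schema above):
#   row = initialize_dataset()[0]
#   predictions = json.loads(row["model_predictions"])
#   image = Image.open(io.BytesIO(base64.b64decode(row["image"])))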

def log_inference_data(
    original_image: Image.Image,
    inference_params: dict,
    model_predictions: list[dict],
    ensemble_output: dict,
    forensic_images: list[Image.Image],
    agent_monitoring_data: dict,
    human_feedback: dict | None = None,
):
    """Logs a single inference event to the Hugging Face dataset."""
    try:
        dataset = initialize_dataset()

        # Convert PIL Images to base64 strings for storage.
        original_image_b64 = _pil_to_base64(original_image)

        forensic_images_b64 = []
        for img_item in forensic_images:
            if img_item is None:
                continue
            if not isinstance(img_item, Image.Image):
                try:
                    img_item = Image.fromarray(img_item)
                except Exception as e:
                    logger.error(f"Error converting forensic image to PIL for base64 encoding: {e}")
                    continue  # Skip this image if conversion fails.
            # img_item is now a PIL Image, safe to pass to _pil_to_base64.
            forensic_images_b64.append(_pil_to_base64(img_item))

        new_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "image": original_image_b64,
            "inference_request": json.dumps(inference_params, cls=NumpyEncoder),
            "model_predictions": json.dumps(model_predictions, cls=NumpyEncoder),
            "ensemble_output": json.dumps(ensemble_output, cls=NumpyEncoder),
            "forensic_outputs": forensic_images_b64,  # Already a list of strings.
            "agent_monitoring_data": json.dumps(agent_monitoring_data, cls=NumpyEncoder),
            "human_feedback": json.dumps(human_feedback if human_feedback is not None else {}, cls=NumpyEncoder),
        }

        # Append the new entry by rebuilding the dataset from a list of rows.
        # Note: this materializes the whole dataset in memory on every call,
        # so each log is O(n) in the dataset size.
        features = dataset.features
        dataset_list = dataset.to_list()
        dataset_list.append(new_entry)
        updated_dataset = Dataset.from_list(dataset_list, features=features)

        # Keep a local copy as a cache/fallback.
        updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache")
        logger.info("Inference data logged successfully to local cache.")

        # Push to the Hub. This requires that you are authenticated (e.g. via
        # `huggingface-cli login` or HF_TOKEN) with write access to HF_DATASET_NAME.
        updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
        logger.info("Inference data pushed to Hugging Face Hub.")
    except Exception as e:
        logger.error(f"Failed to log inference data to Hugging Face dataset: {e}")