import base64
import json
import io
import datetime
import logging

import numpy as np
from PIL import Image
from datasets import Dataset, load_dataset, Features, Value, Sequence

logger = logging.getLogger(__name__)

HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"  # TODO: Replace with your actual HF username and dataset name
# Custom JSON encoder to handle numpy types (copy from app_mcp.py if it's identical)
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        # numpy scalars (e.g. np.float32, np.int64) are not JSON-serializable by default
        if isinstance(obj, (np.integer, np.floating)):
            return obj.item()
        return json.JSONEncoder.default(self, obj)
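# Example (hypothetical values): json.dumps({"score": np.float32(0.97)}, cls=NumpyEncoder)
# raises TypeError with the stock json encoder but serializes to a plain JSON float here.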
def _pil_to_base64(image: Image.Image) -> str:
    """Converts a PIL Image to a base64-encoded JPEG string."""
    # Explicitly check that the input is a PIL Image
    if not isinstance(image, Image.Image):
        raise TypeError(f"Expected a PIL Image, but received type: {type(image)}")
    buffered = io.BytesIO()
    # JPEG cannot store alpha/palette modes, so ensure RGB before saving
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
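
# Sketch (not part of the original module): a minimal inverse of _pil_to_base64,
# assuming consumers of the dataset need to turn stored strings back into images.
# The helper name _base64_to_pil is hypothetical.
def _base64_to_pil(data: str) -> Image.Image:
    """Decodes a base64 string produced by _pil_to_base64 back into a PIL Image."""
    return Image.open(io.BytesIO(base64.b64decode(data)))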
def initialize_dataset():
    """Initializes or loads the Hugging Face dataset."""
    try:
        # Try to load the existing dataset from the Hub
        dataset = load_dataset(HF_DATASET_NAME, split="train")
        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
    except Exception:
        # If the dataset does not exist, create a new one with an empty structure
        logger.info(f"Creating new Hugging Face dataset: {HF_DATASET_NAME}")
        # Define the features explicitly so the empty columns keep their types
        features = Features({
            "timestamp": Value('string'),
            "image": Value('string'),                       # base64-encoded JPEG
            "inference_request": Value('string'),           # JSON string
            "model_predictions": Value('string'),           # JSON string
            "ensemble_output": Value('string'),             # JSON string
            "forensic_outputs": Sequence(Value('string')),  # list of base64 image strings
            "agent_monitoring_data": Value('string'),       # JSON string
            "human_feedback": Value('string')               # JSON string
        })
        dataset = Dataset.from_dict({
            "timestamp": [],
            "image": [],
            "inference_request": [],
            "model_predictions": [],
            "ensemble_output": [],
            "forensic_outputs": [],
            "agent_monitoring_data": [],
            "human_feedback": []
        }, features=features)
    return dataset
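
# Sketch (not in the original module): how a consumer might decode one dataset
# row written by log_inference_data below. Field names follow the Features
# schema above; the helper name _decode_entry is hypothetical.
def _decode_entry(entry: dict) -> dict:
    """Parses the JSON columns and base64 images of a single logged row."""
    return {
        "timestamp": entry["timestamp"],
        "image": _base64_to_pil(entry["image"]),
        "inference_request": json.loads(entry["inference_request"]),
        "model_predictions": json.loads(entry["model_predictions"]),
        "ensemble_output": json.loads(entry["ensemble_output"]),
        "forensic_outputs": [_base64_to_pil(s) for s in entry["forensic_outputs"]],
        "agent_monitoring_data": json.loads(entry["agent_monitoring_data"]),
        "human_feedback": json.loads(entry["human_feedback"]),
    }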
def log_inference_data(
    original_image: Image.Image,
    inference_params: dict,
    model_predictions: list[dict],
    ensemble_output: dict,
    forensic_images: list[Image.Image],
    agent_monitoring_data: dict,
    human_feedback: dict | None = None
):
"""Logs a single inference event to the Hugging Face dataset."""
try:
dataset = initialize_dataset()
# Convert PIL Images to base64 strings for storage
original_image_b64 = _pil_to_base64(original_image)
forensic_images_b64 = []
for img_item in forensic_images:
if img_item is not None:
if not isinstance(img_item, Image.Image):
try:
img_item = Image.fromarray(img_item)
except Exception as e:
logger.error(f"Error converting forensic image to PIL for base64 encoding: {e}")
continue # Skip this image if conversion fails
# Now img_item should be a PIL Image, safe to pass to _pil_to_base64
forensic_images_b64.append(_pil_to_base64(img_item))
new_entry = {
"timestamp": datetime.datetime.now().isoformat(),
"image": original_image_b64,
"inference_request": json.dumps(inference_params, cls=NumpyEncoder),
"model_predictions": json.dumps(model_predictions, cls=NumpyEncoder),
"ensemble_output": json.dumps(ensemble_output, cls=NumpyEncoder),
"forensic_outputs": forensic_images_b64, # This is already a list of strings
"agent_monitoring_data": json.dumps(agent_monitoring_data, cls=NumpyEncoder),
"human_feedback": json.dumps(human_feedback if human_feedback is not None else {}, cls=NumpyEncoder)
}
# Get current dataset features
features = dataset.features
# Convert existing dataset to a list of dictionaries
dataset_list = dataset.to_list()
# Append the new entry to the list
dataset_list.append(new_entry)
# Create a new dataset from the updated list
updated_dataset = Dataset.from_list(dataset_list, features=features)
# This will push to the Hugging Face Hub if you are logged in and dataset is configured
# Or save locally if not.
updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache") # Save locally for now
logger.info("Inference data logged successfully to local cache.")
# To push to hub, uncomment the line below and ensure HF_DATASET_NAME is set correctly and you are logged in
# updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
# logger.info("Inference data pushed to Hugging Face Hub.")
except Exception as e:
logger.error(f"Failed to log inference data to Hugging Face dataset: {e}") |