import base64
import datetime
import io
import logging

from PIL import Image
from datasets import Dataset, load_dataset

logger = logging.getLogger(__name__)

HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0" # TODO: Replace with your actual HF username and dataset name

def _pil_to_base64(image: Image.Image) -> str:
    """Converts a PIL Image to a base64 string."""
    buffered = io.BytesIO()
    # Ensure image is in RGB mode before saving as JPEG
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
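
# The inverse decode step, handy when reading logged images back out of the
# dataset. A minimal added sketch; _base64_to_pil is not part of the
# original module.
def _base64_to_pil(data: str) -> Image.Image:
    """Decodes a base64 string back into a PIL Image."""
    return Image.open(io.BytesIO(base64.b64decode(data)))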

def initialize_dataset():
    """Initializes or loads the Hugging Face dataset."""
    try:
        # Try to load existing dataset
        dataset = load_dataset(HF_DATASET_NAME, split="train")
        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
    except Exception as e:
        # If the dataset does not exist (or cannot be loaded), create a new one
        logger.info(f"Could not load {HF_DATASET_NAME} ({e}); creating a new dataset")
        dataset = Dataset.from_dict({
            "timestamp": [],
            "image": [], # Storing base64 string for simplicity, or path/bytes if preferred
            "inference_request": [],
            "model_predictions": [],
            "ensemble_output": [],
            "forensic_outputs": [], # List of base64 image strings
            "agent_monitoring_data": [],
            "human_feedback": []
        })
    return dataset
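
# Note: Dataset.from_dict over empty lists infers a null feature type for
# every column, which can make later add_item/push_to_hub calls brittle.
# A hedged alternative sketch (the string-typed schema is an assumption:
# structured fields would be json.dumps-ed before logging rather than
# stored as nested dicts):
#
#     from datasets import Features, Sequence, Value
#
#     FEATURES = Features({
#         "timestamp": Value("string"),
#         "image": Value("string"),                       # base64 JPEG
#         "inference_request": Value("string"),           # json.dumps payload
#         "model_predictions": Value("string"),
#         "ensemble_output": Value("string"),
#         "forensic_outputs": Sequence(Value("string")),  # base64 JPEGs
#         "agent_monitoring_data": Value("string"),
#         "human_feedback": Value("string"),
#     })
#     dataset = Dataset.from_dict({name: [] for name in FEATURES}, features=FEATURES)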

def log_inference_data(
    original_image: Image.Image,
    inference_params: dict,
    model_predictions: list[dict],
    ensemble_output: dict,
    forensic_images: list[Image.Image],
    agent_monitoring_data: dict,
    human_feedback: dict | None = None
):
    """Logs a single inference event to the Hugging Face dataset."""
    try:
        dataset = initialize_dataset()
        
        # Convert PIL Images to base64 strings for storage
        original_image_b64 = _pil_to_base64(original_image)
        forensic_images_b64 = [_pil_to_base64(img) for img in forensic_images if img is not None]

        new_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "image": original_image_b64,
            "inference_request": inference_params,
            "model_predictions": model_predictions,
            "ensemble_output": ensemble_output,
            "forensic_outputs": forensic_images_b64,
            "agent_monitoring_data": agent_monitoring_data,
            "human_feedback": human_feedback if human_feedback is not None else {}
        }
        
        # Append the new entry
        # Note: Directly appending might not be efficient for large datasets or frequent logging
        # For a production system, consider batched writes or more robust data pipelines.
        updated_dataset = dataset.add_item(new_entry)
        
        # save_to_disk only writes a local copy; it never pushes to the Hub.
        # Pushing requires push_to_hub (see below).
        updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache")  # Save locally for now
        logger.info("Inference data logged successfully to local cache.")
        
        # To push to the Hub, uncomment the lines below; make sure HF_DATASET_NAME
        # is correct and you are authenticated (e.g. via `huggingface-cli login`).
        # updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
        # logger.info("Inference data pushed to Hugging Face Hub.")

    except Exception:
        # logger.exception records the message together with the full traceback
        logger.exception("Failed to log inference data to Hugging Face dataset")
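
# Example usage. A minimal sketch for local testing: the dummy image and the
# parameter/prediction payloads below are illustrative assumptions, not
# values produced by this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo_image = Image.new("RGB", (64, 64), color="gray")  # stand-in input image
    log_inference_data(
        original_image=demo_image,
        inference_params={"model": "demo-detector", "confidence_threshold": 0.5},
        model_predictions=[{"label": "real", "score": 0.91}],
        ensemble_output={"label": "real", "score": 0.91},
        forensic_images=[demo_image],  # e.g. ELA or noise maps in real use
        agent_monitoring_data={"latency_ms": 123},
    )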