File size: 5,544 Bytes
58f23d5
 
 
 
 
 
 
febce11
 
0f427f3
58f23d5
 
 
 
 
0f427f3
 
 
 
 
 
 
58f23d5
 
d20c076
 
 
 
58f23d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
febce11
 
 
 
 
 
 
 
 
 
 
 
 
58f23d5
 
febce11
58f23d5
 
 
febce11
58f23d5
 
febce11
58f23d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d20c076
 
 
 
 
 
 
 
 
 
 
 
 
58f23d5
 
 
 
0f427f3
 
 
febce11
0f427f3
 
58f23d5
 
febce11
 
 
 
 
 
 
 
679e047
febce11
 
58f23d5
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import base64
import json
import io
import datetime
from PIL import Image
import logging
from datasets import Dataset, load_dataset, Features, Value, Sequence
import copy
import numpy as np

# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)

# Hub repo id targeted by initialize_dataset() / log_inference_data().
HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0" # TODO: Replace with your actual HF username and dataset name

# Custom JSON Encoder to handle numpy types (copy from app_mcp.py if it's identical)
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy scalar and array types.

    The original version only handled ``np.float32``; any ``np.float64``,
    numpy integer, numpy bool, or ``np.ndarray`` inside a logged payload
    would still raise ``TypeError`` from ``json.dumps``. Handling the
    abstract numpy bases covers every concrete width (float32/float64,
    int8..int64, uint*, ...), and remains backward-compatible.
    """

    def default(self, obj):
        # np.floating is the abstract base of all numpy float scalars
        # (including np.float32, which is NOT a subclass of Python float).
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Defer everything else to the stock encoder (which raises TypeError).
        return json.JSONEncoder.default(self, obj)

def _pil_to_base64(image: Image.Image) -> str:
    """Encode a PIL Image as a base64 string of its JPEG bytes."""
    # Fail loudly on anything that is not already a PIL Image; callers are
    # expected to have converted numpy arrays etc. beforehand.
    if not isinstance(image, Image.Image):
        raise TypeError(f"Expected a PIL Image, but received type: {type(image)}")

    # JPEG cannot represent alpha/palette modes, so normalise to RGB first.
    rgb_image = image if image.mode == 'RGB' else image.convert('RGB')

    buffer = io.BytesIO()
    rgb_image.save(buffer, format="JPEG", quality=85)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')

def initialize_dataset():
    """Initializes or loads the Hugging Face dataset."""
    try:
        # Reuse the published dataset whenever it can be fetched.
        hf_dataset = load_dataset(HF_DATASET_NAME, split="train")
        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
        return hf_dataset
    except Exception:
        # Any load failure (missing repo, no auth, offline, ...) falls back
        # to a fresh, empty dataset with an explicit schema.
        logger.info(f"Creating new Hugging Face dataset: {HF_DATASET_NAME}")

    # Every column is a string payload (JSON text or base64 image data),
    # except forensic_outputs, which holds a list of base64 image strings.
    schema = Features({
        "timestamp": Value('string'),
        "image": Value('string'),
        "inference_request": Value('string'),
        "model_predictions": Value('string'),
        "ensemble_output": Value('string'),
        "forensic_outputs": Sequence(Value('string')),
        "agent_monitoring_data": Value('string'),
        "human_feedback": Value('string'),
    })

    # Build an empty column for each schema entry (Features is dict-like,
    # so iteration yields the column names in declaration order).
    empty_columns = {column: [] for column in schema}
    return Dataset.from_dict(empty_columns, features=schema)

def log_inference_data(
    original_image: Image.Image,
    inference_params: dict,
    model_predictions: list[dict],
    ensemble_output: dict,
    forensic_images: list[Image.Image],
    agent_monitoring_data: dict,
    human_feedback: dict = None
):
    """Logs a single inference event to the Hugging Face dataset.

    Best-effort: any failure is caught and logged so that dataset logging
    can never break the inference path that calls this.

    Args:
        original_image: Image that was analyzed; stored as base64 JPEG.
        inference_params: Request parameters; JSON-serialized for storage.
        model_predictions: Per-model prediction dicts; JSON-serialized.
        ensemble_output: Combined ensemble result; JSON-serialized.
        forensic_images: Forensic visualizations. Non-PIL entries are
            converted via Image.fromarray (presumably numpy arrays — items
            that fail conversion are skipped with an error log).
        agent_monitoring_data: Agent telemetry; JSON-serialized.
        human_feedback: Optional feedback dict; stored as "{}" when None.
    """
    try:
        dataset = initialize_dataset()

        original_image_b64 = _pil_to_base64(original_image)

        # Convert forensic outputs to base64, skipping None entries and
        # anything that cannot be coerced into a PIL Image.
        forensic_images_b64 = []
        for img_item in forensic_images:
            if img_item is None:
                continue
            if not isinstance(img_item, Image.Image):
                try:
                    img_item = Image.fromarray(img_item)
                except Exception as e:
                    logger.error(f"Error converting forensic image to PIL for base64 encoding: {e}")
                    continue  # Skip this image if conversion fails
            forensic_images_b64.append(_pil_to_base64(img_item))

        new_entry = {
            # Timezone-aware UTC timestamp: unambiguous regardless of the
            # host machine's local timezone (the old naive local time was not).
            "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
            "image": original_image_b64,
            "inference_request": json.dumps(inference_params, cls=NumpyEncoder),
            "model_predictions": json.dumps(model_predictions, cls=NumpyEncoder),
            "ensemble_output": json.dumps(ensemble_output, cls=NumpyEncoder),
            "forensic_outputs": forensic_images_b64,  # already a list of strings
            "agent_monitoring_data": json.dumps(agent_monitoring_data, cls=NumpyEncoder),
            "human_feedback": json.dumps(human_feedback if human_feedback is not None else {}, cls=NumpyEncoder)
        }

        # Append via Dataset.add_item rather than round-tripping the entire
        # dataset through Python lists (to_list/from_list), which converted
        # every stored row per logged event and degraded as the dataset grew.
        updated_dataset = dataset.add_item(new_entry)

        # This will push to the Hugging Face Hub if you are logged in and dataset is configured
        # Or save locally if not.
        updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache") # Save locally for now
        logger.info("Inference data logged successfully to local cache.")

        # To push to hub, uncomment the line below and ensure HF_DATASET_NAME is set correctly and you are logged in
        # updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
        # logger.info("Inference data pushed to Hugging Face Hub.")

    except Exception:
        # logger.exception records the full traceback, which the previous
        # f-string-only error message discarded.
        logger.exception("Failed to log inference data to Hugging Face dataset")