LPX committed
Commit 5967d4d · Parent(s): 87fc2bb

feat: implement CommitScheduler for automated logging to Hugging Face dataset and refactor dataset initialization process

Files changed (2)
  1. app_mcp.py +14 -2
  2. utils/hf_logger.py +34 -68
app_mcp.py CHANGED
@@ -28,13 +28,15 @@ from forensics.registry import register_model, MODEL_REGISTRY, ModelEntry
 from agents.weight_management import ModelWeightManager
 from dotenv import load_dotenv
 import json
+from huggingface_hub import CommitScheduler
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 os.environ['HF_HUB_CACHE'] = './models'
 
-
+LOCAL_LOG_DIR = "./hf_inference_logs"
+HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"
 load_dotenv()
 # print(os.getenv("HF_HUB_CACHE"))
 
@@ -586,4 +588,14 @@ with gr.Blocks(css="#post-gallery { overflow: hidden !important;} .grid-wrap{ ov
 
 # --- MCP-Ready Launch ---
 if __name__ == "__main__":
-    demo.launch(share=True, mcp_server=True)
+    # Initialize CommitScheduler.
+    # The scheduler monitors LOCAL_LOG_DIR and pushes changes to HF_DATASET_NAME.
+    with CommitScheduler(
+        repo_id=HF_DATASET_NAME,    # Hugging Face dataset repository ID
+        repo_type="dataset",
+        folder_path=LOCAL_LOG_DIR,
+        every=5,                    # commit every 5 minutes
+        private=True,               # keep the dataset private
+        # token=os.getenv("HF_TOKEN")  # uncomment if a token is not saved globally
+    ) as scheduler:
+        demo.launch(share=True, mcp_server=True)
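
Note: CommitScheduler runs a background job that snapshots folder_path on the chosen cadence and commits any new or changed files to the Hub, so the app itself never calls push_to_hub. Below is a minimal, self-contained sketch of that flow (it assumes a logged-in huggingface_hub session); the write_log helper is illustrative, not part of this commit, and the use of scheduler.lock follows the huggingface_hub guide's recommendation to guard writes so a background commit never uploads a half-written file:

    import json
    import uuid
    from pathlib import Path

    from huggingface_hub import CommitScheduler

    LOG_DIR = Path("./hf_inference_logs")  # the folder the scheduler watches
    LOG_DIR.mkdir(parents=True, exist_ok=True)

    scheduler = CommitScheduler(
        repo_id="aiwithoutborders-xyz/degentic_rd0",  # dataset repo from this commit
        repo_type="dataset",
        folder_path=LOG_DIR,
        every=5,        # minutes between background commits
        private=True,
    )

    def write_log(record: dict) -> None:
        # Hold the scheduler's lock so a partially written file is never committed.
        with scheduler.lock:
            path = LOG_DIR / f"log_{uuid.uuid4().hex}.json"
            with path.open("w", encoding="utf-8") as f:
                json.dump(record, f)

    write_log({"event": "example"})  # picked up by the next scheduled commit

Since each log file is written once and never modified, skipping the lock (as log_inference_data in the utils/hf_logger.py diff below does) will usually work, but the lock is the documented way to rule out a commit racing a partial write.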
utils/hf_logger.py CHANGED
@@ -5,15 +5,15 @@ import io
 import datetime
 from PIL import Image
 import logging
-from datasets import Dataset, load_dataset, Features, Value, Sequence
-import copy
+from huggingface_hub import HfApi  # keep HfApi for repo creation; CommitScheduler now handles uploads
 import numpy as np
 
 logger = logging.getLogger(__name__)
 
-HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"
+HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"
+LOCAL_LOG_DIR = "./hf_inference_logs"  # local directory to store logs
 
-# Custom JSON Encoder to handle numpy types (copy from app_mcp.py if it's identical)
+# Custom JSON Encoder to handle numpy types
 class NumpyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.float32):
@@ -33,39 +33,18 @@ def _pil_to_base64(image: Image.Image) -> str:
     image.save(buffered, format="JPEG", quality=85)
     return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
-def initialize_dataset():
-    """Initializes or loads the Hugging Face dataset."""
+# initialize_dataset is simplified: entries are no longer appended to an
+# in-memory datasets.Dataset, so we only need to ensure the repo exists.
+def initialize_dataset_repo():
+    """Initializes or ensures the Hugging Face dataset repository exists."""
+    api = HfApi()
     try:
-        # Try to load existing dataset
-        dataset = load_dataset(HF_DATASET_NAME, split="train")
-        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
+        api.repo_info(repo_id=HF_DATASET_NAME, repo_type="dataset")
+        logger.info(f"Hugging Face dataset repository already exists: {HF_DATASET_NAME}")
     except Exception:
-        # If dataset does not exist, create a new one with an empty structure
-        logger.info(f"Creating new Hugging Face dataset: {HF_DATASET_NAME}")
-
-        # Define the features explicitly
-        features = Features({
-            "timestamp": Value('string'),
-            "image": Value('string'),  # base64 string
-            "inference_request": Value('string'),  # JSON string
-            "model_predictions": Value('string'),  # JSON string
-            "ensemble_output": Value('string'),  # JSON string
-            "forensic_outputs": Sequence(Value('string')),  # list of base64 image strings
-            "agent_monitoring_data": Value('string'),  # JSON string
-            "human_feedback": Value('string')  # JSON string
-        })
-
-        dataset = Dataset.from_dict({
-            "timestamp": [],
-            "image": [],
-            "inference_request": [],
-            "model_predictions": [],
-            "ensemble_output": [],
-            "forensic_outputs": [],
-            "agent_monitoring_data": [],
-            "human_feedback": []
-        }, features=features)
-    return dataset
+        logger.info(f"Creating new Hugging Face dataset repository: {HF_DATASET_NAME}")
+        api.create_repo(repo_id=HF_DATASET_NAME, repo_type="dataset", private=True)
+    return api  # return the API object for subsequent operations
@@ -76,13 +55,12 @@ def log_inference_data(
     agent_monitoring_data: dict,
     human_feedback: dict = None
 ):
-    """Logs a single inference event to the Hugging Face dataset."""
+    """Logs a single inference event by uploading a JSON file to the Hugging Face dataset repository."""
     try:
-        dataset = initialize_dataset()
-
-        # Convert PIL Images to base64 strings for storage
+        api = initialize_dataset_repo()  # get or create the repository
+
         original_image_b64 = _pil_to_base64(original_image)
-
+
         forensic_images_b64 = []
         for img_item in forensic_images:
            if img_item is not None:
@@ -91,42 +69,30 @@
                    img_item = Image.fromarray(img_item)
                except Exception as e:
                    logger.error(f"Error converting forensic image to PIL for base64 encoding: {e}")
-                   continue  # Skip this image if conversion fails
-
-               # Now img_item should be a PIL Image, safe to pass to _pil_to_base64
+                   continue
               forensic_images_b64.append(_pil_to_base64(img_item))
 
        new_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "image": original_image_b64,
-           "inference_request": json.dumps(inference_params, cls=NumpyEncoder),
-           "model_predictions": json.dumps(model_predictions, cls=NumpyEncoder),
-           "ensemble_output": json.dumps(ensemble_output, cls=NumpyEncoder),
-           "forensic_outputs": forensic_images_b64,  # This is already a list of strings
-           "agent_monitoring_data": json.dumps(agent_monitoring_data, cls=NumpyEncoder),
-           "human_feedback": json.dumps(human_feedback if human_feedback is not None else {}, cls=NumpyEncoder)
+           "inference_request": inference_params,
+           "model_predictions": model_predictions,
+           "ensemble_output": ensemble_output,
+           "forensic_outputs": forensic_images_b64,
+           "agent_monitoring_data": agent_monitoring_data,
+           "human_feedback": human_feedback if human_feedback is not None else {}
        }
 
-       # Get current dataset features
-       features = dataset.features
-
-       # Convert existing dataset to a list of dictionaries
-       dataset_list = dataset.to_list()
-
-       # Append the new entry to the list
-       dataset_list.append(new_entry)
-
-       # Create a new dataset from the updated list
-       updated_dataset = Dataset.from_list(dataset_list, features=features)
+       # Define a unique path for the new log file within the local directory
+       os.makedirs(LOCAL_LOG_DIR, exist_ok=True)  # ensure the local directory exists
+       timestamp_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
+       log_file_path = os.path.join(LOCAL_LOG_DIR, f"log_{timestamp_str}.json")
 
-       # This will push to the Hugging Face Hub if you are logged in and dataset is configured
-       # Or save locally if not.
-       updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache")  # Save locally for now
-       logger.info("Inference data logged successfully to local cache.")
+       # Serialize the new entry to a JSON file using the custom encoder
+       with open(log_file_path, 'w', encoding='utf-8') as f:
+           json.dump(new_entry, f, cls=NumpyEncoder, indent=2)
 
-       # To push to hub, uncomment the line below and ensure HF_DATASET_NAME is set correctly and you are logged in
-       updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
-       logger.info("Inference data pushed to Hugging Face Hub.")
+       logger.info(f"Inference data logged successfully to local file: {log_file_path}")
 
    except Exception as e:
-       logger.error(f"Failed to log inference data to Hugging Face dataset: {e}")
+       logger.error(f"Failed to log inference data to local file: {e}")
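
For reference, a hypothetical call to the refactored logger; every value below is fabricated for illustration, and keyword arguments are used because the hunks above elide part of the parameter list:

    from PIL import Image

    from utils.hf_logger import log_inference_data

    img = Image.new("RGB", (64, 64), color="gray")  # stand-in for a real image

    log_inference_data(
        original_image=img,
        inference_params={"confidence_threshold": 0.75},
        model_predictions={"model_a": "fake", "model_b": "real"},
        ensemble_output={"label": "fake", "score": 0.62},
        forensic_images=[img],              # PIL images or numpy arrays
        agent_monitoring_data={"latency_ms": 412},
        human_feedback=None,                # stored as {} in the log entry
    )

Each call produces one timestamped JSON file in ./hf_inference_logs; the CommitScheduler in app_mcp.py then pushes it to the dataset repository on its next cycle.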