LPX committed
Commit 5967d4d · Parent(s): 87fc2bb

feat: implement CommitScheduler for automated logging to Hugging Face dataset and refactor dataset initialization process

Files changed (2)
  1. app_mcp.py +14 -2
  2. utils/hf_logger.py +34 -68
app_mcp.py CHANGED
@@ -28,13 +28,15 @@ from forensics.registry import register_model, MODEL_REGISTRY, ModelEntry
 from agents.weight_management import ModelWeightManager
 from dotenv import load_dotenv
 import json
+from huggingface_hub import CommitScheduler
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 os.environ['HF_HUB_CACHE'] = './models'
 
-
+LOCAL_LOG_DIR = "./hf_inference_logs"
+HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"
 load_dotenv()
 # print(os.getenv("HF_HUB_CACHE"))
 
@@ -586,4 +588,14 @@ with gr.Blocks(css="#post-gallery { overflow: hidden !important;} .grid-wrap{ ov
 
 # --- MCP-Ready Launch ---
 if __name__ == "__main__":
-    demo.launch(share=True, mcp_server=True)
+    # Initialize CommitScheduler.
+    # The scheduler monitors LOCAL_LOG_DIR and pushes changes to HF_DATASET_NAME.
+    with CommitScheduler(
+        repo_id=HF_DATASET_NAME,    # Hugging Face dataset repository ID
+        repo_type="dataset",
+        folder_path=LOCAL_LOG_DIR,
+        every=5,                    # commit every 5 minutes
+        private=True,               # keep the dataset private
+        # token=os.getenv("HF_TOKEN")  # uncomment if a token is not saved globally
+    ) as scheduler:
+        demo.launch(share=True, mcp_server=True)
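
Note: CommitScheduler runs a background job that snapshots folder_path on the chosen cadence and commits any new or changed files to the Hub, so the app itself never calls push_to_hub. Below is a minimal, self-contained sketch of that flow (it assumes a logged-in huggingface_hub session); the write_log helper is illustrative, not part of this commit, and the use of scheduler.lock follows the huggingface_hub guide's recommendation to guard writes so a background commit never uploads a half-written file:

    import json
    import uuid
    from pathlib import Path

    from huggingface_hub import CommitScheduler

    LOG_DIR = Path("./hf_inference_logs")  # the folder the scheduler watches
    LOG_DIR.mkdir(parents=True, exist_ok=True)

    scheduler = CommitScheduler(
        repo_id="aiwithoutborders-xyz/degentic_rd0",  # dataset repo from this commit
        repo_type="dataset",
        folder_path=LOG_DIR,
        every=5,        # minutes between background commits
        private=True,
    )

    def write_log(record: dict) -> None:
        # Hold the scheduler's lock so a partially written file is never committed.
        with scheduler.lock:
            path = LOG_DIR / f"log_{uuid.uuid4().hex}.json"
            with path.open("w", encoding="utf-8") as f:
                json.dump(record, f)

    write_log({"event": "example"})  # picked up by the next scheduled commit

Since each log file is written once and never modified, skipping the lock (as log_inference_data in the utils/hf_logger.py diff below does) will usually work, but the lock is the documented way to rule out a commit racing a partial write.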
utils/hf_logger.py CHANGED
@@ -5,15 +5,15 @@ import io
 import datetime
 from PIL import Image
 import logging
-from datasets import Dataset, load_dataset, Features, Value, Sequence
-import copy
+from huggingface_hub import HfApi  # keep HfApi for repo creation; CommitScheduler now handles uploads
 import numpy as np
 
 logger = logging.getLogger(__name__)
 
-HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"
+HF_DATASET_NAME = "aiwithoutborders-xyz/degentic_rd0"
+LOCAL_LOG_DIR = "./hf_inference_logs"  # local directory to store logs
 
-# Custom JSON Encoder to handle numpy types (copy from app_mcp.py if it's identical)
+# Custom JSON Encoder to handle numpy types
 class NumpyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.float32):
@@ -33,39 +33,18 @@ def _pil_to_base64(image: Image.Image) -> str:
     image.save(buffered, format="JPEG", quality=85)
     return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
-def initialize_dataset():
-    """Initializes or loads the Hugging Face dataset."""
+# initialize_dataset is simplified: entries are no longer appended to an
+# in-memory datasets.Dataset, so we only need to ensure the repo exists.
+def initialize_dataset_repo():
+    """Initializes or ensures the Hugging Face dataset repository exists."""
+    api = HfApi()
     try:
-        # Try to load existing dataset
-        dataset = load_dataset(HF_DATASET_NAME, split="train")
-        logger.info(f"Loaded existing Hugging Face dataset: {HF_DATASET_NAME}")
+        api.repo_info(repo_id=HF_DATASET_NAME, repo_type="dataset")
+        logger.info(f"Hugging Face dataset repository already exists: {HF_DATASET_NAME}")
     except Exception:
-        # If dataset does not exist, create a new one with an empty structure
-        logger.info(f"Creating new Hugging Face dataset: {HF_DATASET_NAME}")
-
-        # Define the features explicitly
-        features = Features({
-            "timestamp": Value('string'),
-            "image": Value('string'),  # base64 string
-            "inference_request": Value('string'),  # JSON string
-            "model_predictions": Value('string'),  # JSON string
-            "ensemble_output": Value('string'),  # JSON string
-            "forensic_outputs": Sequence(Value('string')),  # list of base64 image strings
-            "agent_monitoring_data": Value('string'),  # JSON string
-            "human_feedback": Value('string')  # JSON string
-        })
-
-        dataset = Dataset.from_dict({
-            "timestamp": [],
-            "image": [],
-            "inference_request": [],
-            "model_predictions": [],
-            "ensemble_output": [],
-            "forensic_outputs": [],
-            "agent_monitoring_data": [],
-            "human_feedback": []
-        }, features=features)
-    return dataset
+        logger.info(f"Creating new Hugging Face dataset repository: {HF_DATASET_NAME}")
+        api.create_repo(repo_id=HF_DATASET_NAME, repo_type="dataset", private=True)
+    return api  # return the API object for subsequent operations
@@ -76,13 +55,12 @@ def log_inference_data(
     agent_monitoring_data: dict,
     human_feedback: dict = None
 ):
-    """Logs a single inference event to the Hugging Face dataset."""
+    """Logs a single inference event by uploading a JSON file to the Hugging Face dataset repository."""
     try:
-        dataset = initialize_dataset()
-
-        # Convert PIL Images to base64 strings for storage
+        api = initialize_dataset_repo()  # get or create the repository
+
         original_image_b64 = _pil_to_base64(original_image)
-
+
         forensic_images_b64 = []
         for img_item in forensic_images:
            if img_item is not None:
@@ -91,42 +69,30 @@
                    img_item = Image.fromarray(img_item)
                except Exception as e:
                    logger.error(f"Error converting forensic image to PIL for base64 encoding: {e}")
-                   continue  # Skip this image if conversion fails
-
-               # Now img_item should be a PIL Image, safe to pass to _pil_to_base64
+                   continue
               forensic_images_b64.append(_pil_to_base64(img_item))
 
        new_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "image": original_image_b64,
-           "inference_request": json.dumps(inference_params, cls=NumpyEncoder),
-           "model_predictions": json.dumps(model_predictions, cls=NumpyEncoder),
-           "ensemble_output": json.dumps(ensemble_output, cls=NumpyEncoder),
-           "forensic_outputs": forensic_images_b64,  # This is already a list of strings
-           "agent_monitoring_data": json.dumps(agent_monitoring_data, cls=NumpyEncoder),
-           "human_feedback": json.dumps(human_feedback if human_feedback is not None else {}, cls=NumpyEncoder)
+           "inference_request": inference_params,
+           "model_predictions": model_predictions,
+           "ensemble_output": ensemble_output,
+           "forensic_outputs": forensic_images_b64,
+           "agent_monitoring_data": agent_monitoring_data,
+           "human_feedback": human_feedback if human_feedback is not None else {}
        }
 
-       # Get current dataset features
-       features = dataset.features
-
-       # Convert existing dataset to a list of dictionaries
-       dataset_list = dataset.to_list()
-
-       # Append the new entry to the list
-       dataset_list.append(new_entry)
-
-       # Create a new dataset from the updated list
-       updated_dataset = Dataset.from_list(dataset_list, features=features)
+       # Define a unique path for the new log file within the local directory
+       os.makedirs(LOCAL_LOG_DIR, exist_ok=True)  # ensure the local directory exists
+       timestamp_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
+       log_file_path = os.path.join(LOCAL_LOG_DIR, f"log_{timestamp_str}.json")
 
-       # This will push to the Hugging Face Hub if you are logged in and dataset is configured
-       # Or save locally if not.
-       updated_dataset.save_to_disk("sherloq-forensics/hf_dataset_cache")  # Save locally for now
-       logger.info("Inference data logged successfully to local cache.")
+       # Serialize the new entry to a JSON file using the custom encoder
+       with open(log_file_path, 'w', encoding='utf-8') as f:
+           json.dump(new_entry, f, cls=NumpyEncoder, indent=2)
 
-       # To push to hub, uncomment the line below and ensure HF_DATASET_NAME is set correctly and you are logged in
-       updated_dataset.push_to_hub(HF_DATASET_NAME, private=True)
-       logger.info("Inference data pushed to Hugging Face Hub.")
+       logger.info(f"Inference data logged successfully to local file: {log_file_path}")
 
    except Exception as e:
-       logger.error(f"Failed to log inference data to Hugging Face dataset: {e}")
+       logger.error(f"Failed to log inference data to local file: {e}")
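
For reference, a hypothetical call to the refactored logger; every value below is fabricated for illustration, and keyword arguments are used because the hunks above elide part of the parameter list:

    from PIL import Image

    from utils.hf_logger import log_inference_data

    img = Image.new("RGB", (64, 64), color="gray")  # stand-in for a real image

    log_inference_data(
        original_image=img,
        inference_params={"confidence_threshold": 0.75},
        model_predictions={"model_a": "fake", "model_b": "real"},
        ensemble_output={"label": "fake", "score": 0.62},
        forensic_images=[img],              # PIL images or numpy arrays
        agent_monitoring_data={"latency_ms": 412},
        human_feedback=None,                # stored as {} in the log entry
    )

Each call produces one timestamped JSON file in ./hf_inference_logs; the CommitScheduler in app_mcp.py then pushes it to the dataset repository on its next cycle.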