submission-frugal-ai-challenge

Sleeping

App Files Files Community

soury committed on Jan 28

Commit

8d30d08

verified ·

1 Parent(s): 998e8ac

pr/1 (#1)

Browse files

- setup processing & initial classifier (d1276d6fedbf2879914e20609918aae06c884daf)

Files changed (3) hide show

.gitignore +3 -1
models/audio_classification_baseline.pkl +3 -0
tasks/audio.py +46 -22

.gitignore CHANGED Viewed

@@ -6,7 +6,9 @@ __pycache__/
 .env
 .ipynb_checkpoints
 .vscode/
 eval-queue/
 eval-results/
 eval-queue-bk/

 .env
 .ipynb_checkpoints
 .vscode/
+notebooks
+Pipfile
+Pipfile.lock
 eval-queue/
 eval-results/
 eval-queue-bk/

models/audio_classification_baseline.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a27a9a671a920660995bc08b255e17449427f018402ceec81710a0ae93cb612
+size 36073945

tasks/audio.py CHANGED Viewed

@@ -4,9 +4,12 @@ from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import random
 import os
-from .utils.evaluation import AudioEvaluationRequest
-from .utils.emissions import tracker, clean_emissions_data, get_space_info
 from dotenv import load_dotenv
 load_dotenv()
@@ -17,13 +20,12 @@ DESCRIPTION = "Random Baseline"
 ROUTE = "/audio"
 @router.post(ROUTE, tags=["Audio Task"],
              description=DESCRIPTION)
 async def evaluate_audio(request: AudioEvaluationRequest):
     """
     Evaluate audio classification for rainforest sound detection.
     Current Model: Random Baseline
     - Makes random predictions from the label space (0-1)
     - Used as a baseline for comparison
@@ -38,35 +40,58 @@ async def evaluate_audio(request: AudioEvaluationRequest):
     }
     # Load and prepare the dataset
     # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
-    dataset = load_dataset(request.dataset_name,token=os.getenv("HF_TOKEN"))
     # Split dataset
-    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
     test_dataset = train_test["test"]
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
-    #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    #--------------------------------------------------------------------------------------------
-    # Make random predictions (placeholder for actual model inference)
     true_labels = test_dataset["label"]
-    predictions = [random.randint(0, 1) for _ in range(len(true_labels))]
-    #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
-    #--------------------------------------------------------------------------------------------
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
     # Calculate accuracy
     accuracy = accuracy_score(true_labels, predictions)
     # Prepare results dictionary
     results = {
         "username": username,
@@ -84,5 +109,4 @@ async def evaluate_audio(request: AudioEvaluationRequest):
             "test_seed": request.test_seed
         }
     }
-    return results

 from sklearn.metrics import accuracy_score
 import random
 import os
+import joblib
+import librosa
+import numpy as np
+from utils.evaluation import AudioEvaluationRequest
+from utils.emissions import tracker, clean_emissions_data, get_space_info
 from dotenv import load_dotenv
 load_dotenv()
 ROUTE = "/audio"
 @router.post(ROUTE, tags=["Audio Task"],
              description=DESCRIPTION)
 async def evaluate_audio(request: AudioEvaluationRequest):
     """
     Evaluate audio classification for rainforest sound detection.
     Current Model: Random Baseline
     - Makes random predictions from the label space (0-1)
     - Used as a baseline for comparison
     }
     # Load and prepare the dataset
     # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
+    dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
     # Split dataset
+    train_test = dataset["train"].train_test_split(
+        test_size=request.test_size, seed=request.test_seed)
     test_dataset = train_test["test"]
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
+    # --------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
+    # --------------------------------------------------------------------------------------------
+    # data formatting
+    def preprocess(dataset):
+        features = []
+        for row in dataset:
+            # Load the audio file and resample it
+            target_sr = 25000
+            audio = row['audio']['array']
+            audio = librosa.resample(audio, orig_sr=12000, target_sr=target_sr)
+            # Extract MFCC features
+            mfccs = librosa.feature.mfcc(y=audio, sr=target_sr, n_mfcc=40)
+            mfccs_scaled = np.mean(mfccs.T, axis=0)
+            # Append features and labels
+            features.append(mfccs_scaled)
+        return np.array(features)
+    X_test = preprocess(test_dataset)
+    classification_model = joblib.load(
+        "../models/audio_classification_baseline.pkl")
+    predictions = classification_model.predict(X_test)
     true_labels = test_dataset["label"]
+    # --------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
+    # --------------------------------------------------------------------------------------------
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
     # Calculate accuracy
     accuracy = accuracy_score(true_labels, predictions)
     # Prepare results dictionary
     results = {
         "username": username,
             "test_seed": request.test_seed
         }
     }
+    return results