Spaces:

Pavithiran
/

SAGAN

Sleeping

App Files Files Community

Pavithiran committed on May 20

Commit

e8e15b2

verified ·

1 Parent(s): 36b8fdb

Update sagan_inference.py

Browse files

Files changed (1) hide show

sagan_inference.py +32 -41

sagan_inference.py CHANGED Viewed

@@ -1,21 +1,11 @@
-import torch
-import numpy as np
-import librosa
-from huggingface_hub import hf_hub_download
-from sagan_model import SAGANModel  # your model definition
-### 1) Download & load your SAGAN weights from your HF repo ###
-SAGAN_WEIGHTS_PATH = hf_hub_download(
-    repo_id="Pavithiran/SAGAN",   # ← replace with your HF namespace
-    filename="sagan_weights.pth"
-)
-model = SAGANModel()
-state_dict = torch.load(SAGAN_WEIGHTS_PATH, map_location="cpu")
-model.load_state_dict(state_dict)
-model.eval()
 ### 2) Age-group Z-score stats (proxy values from literature) ###
-import math
 STATS = {
     "kindergarten": {
         "pitch":  {"mu":  30.0, "sigma": 29.0},  # Wise & Sloboda (2008)
@@ -34,32 +24,33 @@ STATS = {
     },
 }
-def sigmoid(z: float) -> float:
-    return 1 / (1 + math.exp(-z))
-def z_score_standardize(raw_metrics: dict, age_group: str) -> dict:
-    if age_group not in STATS:
-        raise ValueError(f"Unknown age_group '{age_group}'")
-    stats = STATS[age_group]
-    out = {}
-    for key, raw in raw_metrics.items():
-        μ, σ = stats[key]["mu"], stats[key]["sigma"]
-        z = (raw - μ) / σ
-        out[key] = round(sigmoid(z), 3)
-    return out
-def run_sagan(wav_path: str) -> dict:
-    """
-    1) Load audio
-    2) Run SAGANModel.evaluate → returns {'pitch_accuracy', 'rhythm_consistency', 'timbre_score'}
-    3) Return raw dict
-    """
-    y, sr = librosa.load(wav_path, sr=16000, mono=True)
     with torch.no_grad():
-        metrics = model.evaluate(y, sr)
-    # Ensure keys:
-    return {
-        "pitch": float(metrics.get("pitch_accuracy", metrics[0])),
-        "rhythm": float(metrics.get("rhythm_consistency", metrics[1])),
-        "timbre": float(metrics.get("timbre_score", metrics[2])),
-    }

+# sagan_inference.py
+import torch
+import torchaudio
+import math
+from sagan_model import SAGANModel
 ### 2) Age-group Z-score stats (proxy values from literature) ###
 STATS = {
     "kindergarten": {
         "pitch":  {"mu":  30.0, "sigma": 29.0},  # Wise & Sloboda (2008)
     },
 }
+def z_score_standardize(waveform: torch.Tensor, age_group: str) -> torch.Tensor:
+    stats = STATS.get(age_group, STATS["adult"])
+    mu, sigma = stats["pitch"]["mu"], stats["pitch"]["sigma"]
+    # example for pitch; repeat for rhythm/timbre as needed
+    return (waveform - mu) / (sigma + 1e-9)
+def run_sagan(audio_path: str, checkpoint_path: str, device='cpu'):
+    # 1) Load audio
+    waveform, sr = torchaudio.load(audio_path)
+    waveform = z_score_standardize(waveform).to(device)
+    # 2) Instantiate model & load weights
+    model = SAGANModel(z_dim=128).to(device)
+    ckpt = torch.load(checkpoint_path, map_location=device)
+    model.load_state_dict(ckpt['model_state_dict'])
+    model.eval()
+    # 3) Prepare latent vector from audio (example: mean-pool + linear proj)
+    #    _Here you’ll replace `encode_to_z` with your custom feature extractor_
+    z = encode_to_z(waveform).unsqueeze(-1).unsqueeze(-1)  # -> (1, 128, 1, 1)
+    # 4) Generate
     with torch.no_grad():
+        fake_img = model(z)  # -> (1, 3, 64, 64) for a 64×64 SAGAN
+    return fake_img
+# Placeholder: your own mapping from waveform → z
+def encode_to_z(wf):
+    # e.g., a small CNN or an MLP extracting 128-d features from audio
+    return wf.mean(dim=-1).mean(dim=-1).unsqueeze(0).repeat(1,128)