Update README.md
README.md CHANGED
@@ -56,24 +56,25 @@ f0 = f0.to(device, dtype) # feature extracted during processing
 f0_mean = f0.mean() # mean only used as theta in freqs calculation
 theta = f0_mean + self.theta
 ## This can be just f0_mean or even perhaps f0 (per frame) and probably should for voice audio.
-## In text, theta=10,000 sets the base frequency for positional encoding, ensuring a wide range of periodicities for long sequences.
-##
+## In text, theta=10,000 sets the base frequency for positional encoding, ensuring a wide range of periodicities for long sequences. I'm not convinced by that argument even for text.
+## But for audio, especially speech, the relevant periodicities are determined by the pitch (f0), so using f0_mean (or even better, the local f0 per frame) might be more meaningful.
 
 freqs = (theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.dim // 2) / 2595) - 1) / 1000
 ## This seems to give superior results compared to the standard freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)).
-## I thought a mel-scale version might be more perceptually meaningful for audio.. Hovering around 220.0 seems to be a sweet spot but I imagine this depends on dataset specifics
+## I thought a mel-scale version might be more perceptually meaningful for audio. Hovering around 220.0 seems to be a sweet spot, but I imagine this depends on dataset specifics. Whale speech might be different.
 
 freqs = t[:, None] * freqs[None, :] # don't repeat or use some other method here
 
-
-
-
-
-
-
-radius = radius
-
-
+if self.radii and f0 is not None:
+    radius = f0.to(device, dtype) # we want to avoid using the mean of f0 (or any stat or interpolation)
+    if radius.shape[0] != x.shape[0]: # encoder outputs will already be the correct length
+        F = radius.shape[0] / x.shape[0]
+        idx = torch.arange(x.shape[0], device=f0.device)
+        idx = (idx * F).long().clamp(0, radius.shape[0] - 1)
+        radius = radius[idx]
+    freqs = torch.polar(radius.unsqueeze(-1).expand_as(freqs), freqs)
+else:
+    freqs = torch.polar(torch.ones_like(freqs), freqs)
 
 ```
 
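For readers who want to poke at this outside the repo, here is a minimal, self-contained sketch that consolidates the snippet above into one runnable function. The function name `pitch_rotary_freqs`, the keyword arguments, and the tensor shapes (`x` as `(ctx, dim)` features, `t` as `(ctx,)` positions, `f0` as a per-frame pitch track in Hz) are illustrative assumptions, not the repo's exact interface; `dim`, `theta`, and `radii` stand in for the `self.dim` / `self.theta` / `self.radii` attributes referenced in the diff.

```python
# Hedged sketch: consolidates the diff's logic into one runnable function.
# Names and shapes are assumptions for illustration, not the repo's API.
import math
import torch

def pitch_rotary_freqs(x, t, f0=None, dim=64, theta=10000.0, radii=True):
    device, dtype = x.device, torch.float32
    if f0 is not None:
        f0 = f0.to(device, dtype)
        theta = f0.mean() + theta  # pitch-conditioned base, as in the diff

    # mel-spaced ladder: dim//2 points evenly spaced in mel from 0 to
    # mel(8000 Hz), mapped back to Hz, then scaled by theta/220 and /1000
    mel_max = 2595 * math.log10(1 + 8000 / 700)
    freqs = (theta / 220.0) * 700 * (
        torch.pow(10, torch.linspace(0, mel_max, dim // 2, device=device) / 2595) - 1
    ) / 1000

    freqs = t.to(device, dtype)[:, None] * freqs[None, :]  # (ctx, dim//2) angles

    if radii and f0 is not None:
        radius = f0  # per-frame f0 as the rotation magnitude
        if radius.shape[0] != x.shape[0]:
            # nearest-neighbour gather to resample f0 to the sequence length
            F = radius.shape[0] / x.shape[0]
            idx = torch.arange(x.shape[0], device=device)
            idx = (idx * F).long().clamp(0, radius.shape[0] - 1)
            radius = radius[idx]
        return torch.polar(radius.unsqueeze(-1).expand_as(freqs), freqs)
    return torch.polar(torch.ones_like(freqs), freqs)  # unit-circle case

x = torch.randn(100, 64)             # 100 frames of 64-dim features
t = torch.arange(100)                # frame positions
f0 = 120 + 20 * torch.rand(400)      # pitch track at a 4x finer hop
rot = pitch_rotary_freqs(x, t, f0)   # complex tensor, shape (100, 32)
print(rot.shape, torch.allclose(rot.abs()[:, 0], f0[::4]))
```

Note that the length mismatch is handled by a plain nearest-neighbour index gather, consistent with the comment about avoiding means, stats, or interpolation of f0.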
@@ -134,7 +135,7 @@ Narrow bands: More focus on nearby positions (good for local patterns)
 
 #### Diagnostic test run where 1 epoch = 1000 steps = 1000 samples:
 
-<img width="480" alt="
+<img width="480" alt="1epoch" src="https://github.com/user-attachments/assets/b46a3118-f8d2-44e5-8b33-c66b843f7e85" />
 
 ----
 
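One step the diff doesn't show is how the complex `freqs` tensor is consumed. If it feeds the usual complex-multiply RoPE formulation, adjacent feature pairs are viewed as complex numbers and multiplied by `freqs`, so a non-unit radius scales each pair's magnitude by the local f0 in addition to rotating it. A sketch under that assumption; `apply_rotary` is an illustrative name, not a function from this repo:

```python
# Hedged sketch of the standard complex-multiply RoPE application.
import torch

def apply_rotary(q, rot):
    # q: (ctx, dim) real; rot: (ctx, dim//2) complex, e.g. from torch.polar
    q_c = torch.view_as_complex(q.float().reshape(q.shape[0], -1, 2))
    return torch.view_as_real(q_c * rot).flatten(1).type_as(q)

q = torch.randn(100, 64)
rot = torch.polar(torch.ones(100, 32), torch.randn(100, 32))  # unit radius
print(apply_rotary(q, rot).shape)  # (100, 64); norms preserved when |rot| == 1
```

With `radii` disabled the magnitudes are all 1 and this reduces to standard RoPE.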