Update README.md
README.md CHANGED
@@ -56,24 +56,25 @@ f0 = f0.to(device, dtype) # feature extracted during processing
 f0_mean = f0.mean() # mean only used as theta in freqs calculation
 theta = f0_mean + self.theta
 ## This can be just f0_mean or even perhaps f0 (per frame) and probably should for voice audio.
-## In text, theta=10,000 sets the base frequency for positional encoding, ensuring a wide range of periodicities for long sequences.
-##
+## In text, theta=10,000 sets the base frequency for positional encoding, ensuring a wide range of periodicities for long sequences. I'm not convinced by that argument even for text.
+## But for audio, especially speech, the relevant periodicities are determined by the pitch (f0), so using f0_mean (or even better, the local f0 per frame) might be more meaningful.
 
 freqs = (theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.dim // 2) / 2595) - 1) / 1000
 ## This seems to give superior results compared to the standard freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)).
-## I thought a mel-scale version might be more perceptually meaningful for audio.. Hovering around 220.0 seems to be a sweet spot but I imagine this depends on dataset specifics
+## I thought a mel-scale version might be more perceptually meaningful for audio. Hovering around 220.0 seems to be a sweet spot, but I imagine this depends on dataset specifics. Whale speech might be different.
 
 freqs = t[:, None] * freqs[None, :] # don't repeat or use some other method here
 
-
-
-
-
-
-
-radius = radius
-
-
+if self.radii and f0 is not None:
+    radius = f0.to(device, dtype) # we want to avoid using the mean of f0 (or any stat or interpolation)
+    if radius.shape[0] != x.shape[0]: # encoder outputs will already be the correct length
+        F = radius.shape[0] / x.shape[0]
+        idx = torch.arange(x.shape[0], device=f0.device)
+        idx = (idx * F).long().clamp(0, radius.shape[0] - 1)
+        radius = radius[idx]
+    freqs = torch.polar(radius.unsqueeze(-1).expand_as(freqs), freqs)
+else:
+    freqs = torch.polar(torch.ones_like(freqs), freqs)
 
 ```
 
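For readers who want to poke at this outside the repo, here is a minimal, self-contained sketch that consolidates the snippet above into one runnable function. The function name `pitch_rotary_freqs`, the keyword arguments, and the tensor shapes (`x` as `(ctx, dim)` features, `t` as `(ctx,)` positions, `f0` as a per-frame pitch track in Hz) are illustrative assumptions, not the repo's exact interface; `dim`, `theta`, and `radii` stand in for the `self.dim` / `self.theta` / `self.radii` attributes referenced in the diff.

```python
# Hedged sketch: consolidates the diff's logic into one runnable function.
# Names and shapes are assumptions for illustration, not the repo's API.
import math
import torch

def pitch_rotary_freqs(x, t, f0=None, dim=64, theta=10000.0, radii=True):
    device, dtype = x.device, torch.float32
    if f0 is not None:
        f0 = f0.to(device, dtype)
        theta = f0.mean() + theta  # pitch-conditioned base, as in the diff

    # mel-spaced ladder: dim//2 points evenly spaced in mel from 0 to
    # mel(8000 Hz), mapped back to Hz, then scaled by theta/220 and /1000
    mel_max = 2595 * math.log10(1 + 8000 / 700)
    freqs = (theta / 220.0) * 700 * (
        torch.pow(10, torch.linspace(0, mel_max, dim // 2, device=device) / 2595) - 1
    ) / 1000

    freqs = t.to(device, dtype)[:, None] * freqs[None, :]  # (ctx, dim//2) angles

    if radii and f0 is not None:
        radius = f0  # per-frame f0 as the rotation magnitude
        if radius.shape[0] != x.shape[0]:
            # nearest-neighbour gather to resample f0 to the sequence length
            F = radius.shape[0] / x.shape[0]
            idx = torch.arange(x.shape[0], device=device)
            idx = (idx * F).long().clamp(0, radius.shape[0] - 1)
            radius = radius[idx]
        return torch.polar(radius.unsqueeze(-1).expand_as(freqs), freqs)
    return torch.polar(torch.ones_like(freqs), freqs)  # unit-circle case

x = torch.randn(100, 64)             # 100 frames of 64-dim features
t = torch.arange(100)                # frame positions
f0 = 120 + 20 * torch.rand(400)      # pitch track at a 4x finer hop
rot = pitch_rotary_freqs(x, t, f0)   # complex tensor, shape (100, 32)
print(rot.shape, torch.allclose(rot.abs()[:, 0], f0[::4]))
```

Note that the length mismatch is handled by a plain nearest-neighbour index gather, consistent with the comment about avoiding means, stats, or interpolation of f0.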
@@ -134,7 +135,7 @@ Narrow bands: More focus on nearby positions (good for local patterns)
 
 #### Diagnostic test run where 1 epoch = 1000 steps = 1000 samples:
 
-<img width="480" alt="
+<img width="480" alt="1epoch" src="https://github.com/user-attachments/assets/b46a3118-f8d2-44e5-8b33-c66b843f7e85" />
 
 ----
 
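One step the diff doesn't show is how the complex `freqs` tensor is consumed. If it feeds the usual complex-multiply RoPE formulation, adjacent feature pairs are viewed as complex numbers and multiplied by `freqs`, so a non-unit radius scales each pair's magnitude by the local f0 in addition to rotating it. A sketch under that assumption; `apply_rotary` is an illustrative name, not a function from this repo:

```python
# Hedged sketch of the standard complex-multiply RoPE application.
import torch

def apply_rotary(q, rot):
    # q: (ctx, dim) real; rot: (ctx, dim//2) complex, e.g. from torch.polar
    q_c = torch.view_as_complex(q.float().reshape(q.shape[0], -1, 2))
    return torch.view_as_real(q_c * rot).flatten(1).type_as(q)

q = torch.randn(100, 64)
rot = torch.polar(torch.ones(100, 32), torch.randn(100, 32))  # unit radius
print(apply_rotary(q, rot).shape)  # (100, 64); norms preserved when |rot| == 1
```

With `radii` disabled the magnitudes are all 1 and this reduces to standard RoPE.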