Sin2pi
/

asr-model

@@ -21,7 +21,7 @@ tags:
 ---
-ASR model
 <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
@@ -66,28 +66,25 @@ Reference: [PyTorch Documentation - torch.polar](https://pytorch.org/docs/stable/generated/torch.polar.html)
-<img width="349" height="577" alt="standard" src="https://github.com/user-attachments/assets/450f814f-5e9c-4599-8f85-9c5620c42394" />
-<img width="400" height="500" alt="standardl" src="https://github.com/user-attachments/assets/6197a6a4-c778-443c-9a04-62f99d01fdac" />
 ```python
 # Modified freq calculation:
-   pos = torch.arange(ctx, device=device, dtype=dtype)
-   freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
-   freqs = pos[:, None] * freqs
 # Standard RoPE freq calculation:
-        # pos = torch.arange(ctx, dtype=torch.float32, device=device).unsqueeze(1)
-        # dim = torch.arange(0, self.head_dim, 2, dtype=torch.float32, device=device)
-        # freqs = pos / (self.theta ** (dim / self.head_dim))
-        # dim = torch.arange(0, self.head_dim, 2, dtype=torch.float32, device=device)
 ```
       # 200Hz - 4000Hz (covers 95% of speech content)
       freqs = (self.theta / 220.0) * 200 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
@@ -101,6 +98,7 @@ Reference: [PyTorch Documentation - torch.polar](https://pytorch.org/docs/stable/generated/torch.polar.html)
       # original
       freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
 Standard RoPE: 1, 0.1, 0.01, 0.001, ... (arbitrary geometric progression)
 This RoPE: 80 Hz, 100 Hz, 140 Hz, ... (perceptually meaningful spacing)
@@ -252,5 +250,3 @@ The Complex Frequency Result:

 ---
+ASR model with pitch-aware relative positional embeddings.
 <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
+<img width="1370" height="576" alt="123123" src="https://github.com/user-attachments/assets/17031084-48aa-46db-8b12-c025417f3074" />
 ```python
 # Modified freq calculation:
+      pos = torch.arange(ctx, device=device, dtype=dtype)
+      freqs = (self.theta / 220.0) * 200 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
+      freqs = pos[:, None] * freqs
 # Standard RoPE freq calculation:
+     pos = torch.arange(ctx, dtype=torch.float32, device=device)
+     freqs = 1.0 / (self.theta ** (torch.arange(0, self.head_dim, 2, device=device, dtype=dtype) / (self.head_dim // 2)))
+     freqs = pos[:, None] * freqs
 ```
       # 200Hz - 4000Hz (covers 95% of speech content)
       freqs = (self.theta / 220.0) * 200 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
       # original
       freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
 Standard RoPE: 1, 0.1, 0.01, 0.001, ... (arbitrary geometric progression)
 This RoPE: 80 Hz, 100 Hz, 140 Hz, ... (perceptually meaningful spacing)