Sin2pi committed on
Commit
11c9723
·
verified ·
1 Parent(s): edcaa5d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +10 -14
README.md CHANGED
@@ -21,7 +21,7 @@ tags:
21
 
22
  ---
23
 
24
- ASR model
25
 
26
  <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
27
 
@@ -66,28 +66,25 @@ Reference: [PyTorch Documentation - torch.polar](https://pytorch.org/docs/stable/generated/torch.polar.html)
66
 
67
 
68
 
69
- <img width="349" height="577" alt="standard" src="https://github.com/user-attachments/assets/450f814f-5e9c-4599-8f85-9c5620c42394" />
70
 
71
 
72
 
73
- <img width="400" height="500" alt="standardl" src="https://github.com/user-attachments/assets/6197a6a4-c778-443c-9a04-62f99d01fdac" />
74
-
75
 
76
  ```python
77
 
78
 
79
-
80
  # Modified freq calculation:
81
 
82
- pos = torch.arange(ctx, device=device, dtype=dtype)
83
- freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
84
- freqs = pos[:, None] * freqs
85
 
86
  # standard
87
- # pos = torch.arange(ctx, dtype=torch.float32, device=device).unsqueeze(1)
88
- # dim = torch.arange(0, self.head_dim, 2, dtype=torch.float32, device=device)
89
- # freqs = pos / (self.theta ** (dim / self.head_dim))
90
- # dim = torch.arange(0, self.head_dim, 2, dtype=torch.float32, device=device)
91
  ```
92
  # 200Hz - 4000Hz (covers 95% of speech content)
93
  freqs = (self.theta / 220.0) * 200 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
@@ -101,6 +98,7 @@ Reference: [PyTorch Documentation - torch.polar](https://pytorch.org/docs/stable/generated/torch.polar.html)
101
  # original
102
  freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
103
 
 
104
  Standard RoPE: 1, 0.1, 0.01, 0.001... (arbitrary geometric)
105
  This RoPE: 80Hz, 100Hz, 140Hz... (perceptually meaningful)
106
 
@@ -252,5 +250,3 @@ The Complex Frequency Result:
252
 
253
 
254
 
255
-
256
-
 
21
 
22
  ---
23
 
24
+ ASR model + pitch-aware relative positional embeddings.
25
 
26
  <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
27
 
 
66
 
67
 
68
 
69
+ <img width="1370" height="576" alt="123123" src="https://github.com/user-attachments/assets/17031084-48aa-46db-8b12-c025417f3074" />
70
 
71
 
72
 
 
 
73
 
74
  ```python
75
 
76
 
 
77
  # Modified freq calculation:
78
 
79
+ pos = torch.arange(ctx, device=device, dtype=dtype)
80
+ freqs = (self.theta / 220.0) * 200 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
81
+ freqs = pos[:, None] * freqs
82
 
83
  # standard
84
+ pos = torch.arange(ctx, dtype=torch.float32, device=device)
85
+ freqs = 1.0 / (self.theta ** (torch.arange(0, self.head_dim, 2, device=device, dtype=dtype) / (self.head_dim // 2)))
86
+ freqs = pos[:, None] * freqs
87
+
88
  ```
89
  # 200Hz - 4000Hz (covers 95% of speech content)
90
  freqs = (self.theta / 220.0) * 200 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
 
98
  # original
99
  freqs = (self.theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
100
 
101
+
102
  Standard RoPE: 1, 0.1, 0.01, 0.001... (arbitrary geometric)
103
  This RoPE: 80Hz, 100Hz, 140Hz... (perceptually meaningful)
104
 
 
250
 
251
 
252