akshansh36 committed
Commit
2540ac0
1 Parent(s): 9d22957

Upload 7 files

Files changed (8)
  1. .gitattributes +1 -0
  2. AKSHAY KUMAR.wav +3 -0
  3. hubert_base.pt +3 -0
  4. infer.py +190 -0
  5. metadata.json +152 -0
  6. model.index +3 -0
  7. model.pth +3 -0
  8. rmvpe.pt +3 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ model.index filter=lfs diff=lfs merge=lfs -text
  models/Female.index filter=lfs diff=lfs merge=lfs -text
  models/added_IVF714_Flat_nprobe_1_timcook_v2.index filter=lfs diff=lfs merge=lfs -text
  models/Male.index filter=lfs diff=lfs merge=lfs -text
+ AKSHAY[[:space:]]KUMAR.wav filter=lfs diff=lfs merge=lfs -text
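Note: the [[:space:]] in the new pattern is gitattributes escaping for the literal space in the filename, since attribute patterns cannot contain unescaped spaces; it is what `git lfs track "AKSHAY KUMAR.wav"` would typically write.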
AKSHAY KUMAR.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d10c5a01bbaf92e0e42d1a3888c68e510cdd568f60404eadc4f64c67c2297295
+ size 4567182
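These three lines are a Git LFS pointer, not audio data; the actual ~4.6 MB WAV lives in LFS storage and is fetched at checkout. The same applies to the hubert_base.pt, model.index, model.pth, and rmvpe.pt entries below. As a minimal sketch (the pointer format is defined by the spec URL on the first line; parse_lfs_pointer and verify_blob are hypothetical helpers, not part of this repo), a downloaded blob can be checked against its pointer's oid and size:

import hashlib

def parse_lfs_pointer(path):
    # Hypothetical helper: parse the "key value" lines of a Git LFS pointer file.
    with open(path) as f:
        fields = dict(line.strip().split(" ", 1) for line in f if line.strip())
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

def verify_blob(pointer_path, blob_path):
    # Compare the blob's sha256 digest and byte length against the pointer.
    expected_oid, expected_size = parse_lfs_pointer(pointer_path)
    with open(blob_path, "rb") as f:
        data = f.read()
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == expected_size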
hubert_base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
+ size 189507909
infer.py ADDED
@@ -0,0 +1,190 @@
+ import torch
+ import numpy as np
+ import time
+ import sounddevice as sd
+ import torchaudio
+ import soundfile as sf
+ from infer_rvc_python import BaseLoader
+ import datetime
+ import pyaudio
+
+ # Timestamp string used to build a unique conversion tag for this session
+ now = datetime.datetime.now()
+ timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
+
+ converter = BaseLoader(only_cpu=True, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
+ random_tag = "USER_" + str(timestamp)
+ converter.apply_conf(
+     tag=random_tag,
+     file_model="./model.pth",
+     pitch_algo="rmvpe+",
+     pitch_lvl=0,
+     file_index="./model.index",
+     index_influence=0.80,
+     respiration_median_filtering=3,
+     envelope_ratio=0.25,
+     consonant_breath_protection=0.5,
+     resample_sr=0,
+ )
+ time.sleep(0.5)
+
+ chunk_sec = 0.1                  # length of each processed chunk in seconds
+ sr = 16000                       # input sample rate
+ chunk_len = int(sr * chunk_sec)  # samples per chunk
+ L = 16                           # lookahead/context length in samples
+
+ # One warm-up conversion on the reference file so streaming chunks hit a hot cache
+ b, a = converter.generate_from_cache(
+     audio_data="./AKSHAY KUMAR.wav",
+     tag=random_tag,
+ )
+ sf.write(
+     file="output_file.wav",
+     samplerate=a,
+     data=b,
+ )
+
+ stop_recording = False
+
+ def infer_stream(sr, max_duration):
+     global start_time
+     global first_output_latency
+     global audio_buffer
+
+     previous_chunk = torch.zeros(L * 2, dtype=torch.float32)
+
+     outputs = []
+     times = []
+     elapsed_time = 0
+     total_time_processing = 0
+
+     with torch.inference_mode():
+         while True:
+             if len(audio_buffer) < chunk_len:
+                 print('Buffer too small')
+                 time.sleep(0.1)
+                 continue  # Wait for enough data
+
+             # Get the current chunk
+             buffer_chunk = audio_buffer[:chunk_len]
+             audio_buffer = audio_buffer[chunk_len:]
+
+             # Add lookahead context from the previous chunk
+             input_chunk = torch.cat([previous_chunk, buffer_chunk])
+             start = time.time()
+             # Scale float audio in [-1, 1] up to int16 PCM; a bare
+             # astype(np.int16) would truncate every sample to zero
+             data = ((input_chunk.numpy() * 32767).astype(np.int16), sr)
+             result_array, sample_rate = converter.generate_from_cache(
+                 audio_data=data,
+                 tag=random_tag,
+             )
+
+             if first_output_latency < 1:
+                 first_output_latency = time.time() - start_time
+                 print(f'first_output_latency {first_output_latency}')
+             # Convert the NumPy result to a PyTorch tensor
+             output = torch.tensor(result_array, dtype=torch.float32)
+             outputs.append(output)
+             times.append(time.time() - start)
+
+             # Keep the tail of the current chunk as context for the next one
+             previous_chunk = buffer_chunk[-L * 2:]
+
+             # Stop once the maximum duration has passed and the buffer is drained
+             elapsed_time = time.time() - start_time
+             if elapsed_time > max_duration / 1.2 and len(audio_buffer) < chunk_len:
+                 break
+             else:
+                 print(f'Audio Buffer At Processing: {len(audio_buffer)} elapsed_time {elapsed_time}/{max_duration}')
+
+     # Concatenate outputs and calculate metrics
+     if outputs:
+         outputs = torch.cat(outputs, dim=2)
+         avg_time = np.mean(times)
+         total_time_processing = np.sum(times)
+         # Ratio of audio duration to processing time (>1 means faster than real time)
+         rtf = (chunk_len / sr) / avg_time
+         e2e_latency = ((2 * L + chunk_len) / sr + avg_time) * 1000
+         outputs = outputs.squeeze(0)
+     else:
+         outputs = None  # nothing was processed, so signal "no audio" to the caller
+         rtf = e2e_latency = None
+
+     return outputs, rtf, e2e_latency, total_time_processing
+
+ def save_audio(audio, audio_path, sample_rate):
+     torchaudio.save(audio_path, audio, sample_rate)
+
+ max_duration = 2            # Maximum duration to process in seconds
+ silence_threshold = 0.01    # Amplitude threshold to detect silence
+ max_silence_duration = 0.5  # Maximum duration of silence to keep in seconds
+
+ # Tracks accumulated silence duration across callbacks
+ accumulated_silence_duration = 0.0
+
+ # Callback that feeds microphone audio into the global buffer
+ def callback(indata, frames, time_info, status):
+     global audio_buffer, accumulated_silence_duration
+     global stop_recording
+     global stop_pro
+     if stop_recording:
+         if stop_pro < 10:
+             stop_pro += 1
+             print(f'Audio Buffer Stopped Recording: {len(audio_buffer)}')
+         return
+     audio_data = indata[:, 0]  # Use first channel if stereo
+     audio_data = torch.tensor(audio_data, dtype=torch.float32)
+
+     # Convert audio data to numpy for silence detection
+     audio_np = audio_data.numpy()
+
+     # Detect silence (samples below the threshold)
+     silence_indices = np.where(np.abs(audio_np) < silence_threshold)[0]
+     # Duration of the current chunk in seconds
+     chunk_duration = len(audio_np) / sample_rate
+
+     if len(silence_indices) == len(audio_np):
+         # The whole chunk is silent
+         accumulated_silence_duration += chunk_duration
+         if accumulated_silence_duration <= max_silence_duration:
+             audio_buffer = torch.cat((audio_buffer, audio_data))
+     else:
+         # Non-silence detected, reset accumulated silence duration
+         accumulated_silence_duration = 0.0
+         audio_buffer = torch.cat((audio_buffer, audio_data))
+
+     if time.time() - start_time > max_duration:
+         stop_recording = True
+     print(f'Audio Buffer At Insert: {len(audio_buffer)}')
+
+ def list_audio_devices():
+     audio = pyaudio.PyAudio()
+     device_count = audio.get_device_count()
+
+     print("Available audio devices:")
+     for i in range(device_count):
+         device_info = audio.get_device_info_by_index(i)
+         print(f"Index: {i}, Name: {device_info['name']}, Input Channels: {device_info['maxInputChannels']}, Output Channels: {device_info['maxOutputChannels']}")
+
+ # Main script
+ if __name__ == "__main__":
+     list_audio_devices()
+     chunk_size = 2024      # Size of each audio block read from the microphone
+     sample_rate = sr       # Sample rate from the model config
+     stop_pro = 0
+
+     # Initialize global audio buffer
+     audio_buffer = torch.zeros(0, dtype=torch.float32)
+
+     # Set up the microphone stream
+     input_device_index = 2  # Replace with your actual input device index
+
+     print("Recording...")
+     start_time = time.time()
+     first_output_latency = 0
+     final_output_latency = 0
+     try:
+         with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback, blocksize=chunk_size, device=input_device_index):
+             output_waveform, rtf, e2e_latency, total_processing_time = infer_stream(sr=sample_rate, max_duration=max_duration)
+             if output_waveform is not None:
+                 # Save output to file
+                 final_output_latency = (time.time() - start_time) - (len(output_waveform[0]) / sample_rate)
+                 save_audio(output_waveform, f'output_audio_stream_buff-{now}.wav', sample_rate)
+                 print(f"Processed audio saved to output_audio_stream_buff-{now}.wav")
+                 print(f'first_output_latency: {first_output_latency} || final_output_latency {final_output_latency} || total_processing_time {total_processing_time}')
+                 if rtf is not None and e2e_latency is not None:
+                     print(f"RTF: {rtf}, E2E Latency: {e2e_latency} ms")
+     except KeyboardInterrupt:
+         print("Recording stopped.")
metadata.json ADDED
@@ -0,0 +1,152 @@
+ {
+   "title": "US Ascent",
+   "author": {
+     "name": "mayank dubey",
+     "discordUserId": null
+   },
+   "md5": "b0a77398fc88806fda285f4ecd6a5839",
+   "uploadedAt": "2024-07-05T07:38:18.500Z",
+   "weightsLink": "https://www.weights.gg/models/cly8dvwn8000cagr4ohpzo4q2",
+   "id": "cly8dvwn8000cagr4ohpzo4q2",
+   "type": "v2",
+   "tags": [],
+   "description": "US Ascent",
+   "samples": [],
+   "files": [
+     {
+       "name": "model.index",
+       "size": 101587779,
+       "md5": "61a545d9b5bb380bed408a51708b210e"
+     },
+     {
+       "name": "model.pth",
+       "size": 57577722,
+       "md5": "b0a77398fc88806fda285f4ecd6a5839"
+     }
+   ],
+   "torchMetadata": {
+     "config": {
+       "spec_channels": 1025,
+       "segment_size": 32,
+       "inter_channels": 192,
+       "hidden_channels": 192,
+       "filter_channels": 768,
+       "n_heads": 2,
+       "n_layers": 6,
+       "kernel_size": 3,
+       "p_dropout": 0,
+       "resblock": "1",
+       "resblock_kernel_sizes": [
+         3,
+         7,
+         11
+       ],
+       "resblock_dilation_sizes": [
+         [
+           1,
+           3,
+           5
+         ],
+         [
+           1,
+           3,
+           5
+         ],
+         [
+           1,
+           3,
+           5
+         ]
+       ],
+       "upsample_rates": [
+         12,
+         10,
+         2,
+         2
+       ],
+       "upsample_initial_channel": 512,
+       "upsample_kernel_sizes": [
+         24,
+         20,
+         4,
+         4
+       ],
+       "emb_channels": null,
+       "spk_embed_dim": 109,
+       "gin_channels": 256,
+       "sr": 48000
+     },
+     "f0": 1,
+     "version": "v2",
+     "extra_info": {
+       "config": [
+         1025,
+         32,
+         192,
+         192,
+         768,
+         2,
+         6,
+         3,
+         0,
+         "1",
+         [
+           3,
+           7,
+           11
+         ],
+         [
+           [
+             1,
+             3,
+             5
+           ],
+           [
+             1,
+             3,
+             5
+           ],
+           [
+             1,
+             3,
+             5
+           ]
+         ],
+         [
+           12,
+           10,
+           2,
+           2
+         ],
+         512,
+         [
+           24,
+           20,
+           4,
+           4
+         ],
+         109,
+         256,
+         48000
+       ],
+       "epoch": 233,
+       "step": 6291,
+       "sr": 48000,
+       "f0": 1,
+       "version": "v2",
+       "creation_date": "2024-07-05T07:01:09.035229",
+       "model_hash": "7c335d1650be63dea6409d741859c137dc6827d9945d6afba08867ab4281e056"
+     },
+     "epochs": 233,
+     "step": 6291,
+     "creation_date": "2024-07-05T07:01:09.035229",
+     "model_hash": "7c335d1650be63dea6409d741859c137dc6827d9945d6afba08867ab4281e056"
+   },
+   "url": "https://models.weights.gg/cly79hr6d1211hlpr4obj48ab.zip",
+   "urls": [],
+   "epochs": 233,
+   "originalFileList": [
+     "model.index",
+     "model.pth"
+   ]
+ }
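A note on torchMetadata.config: in HiFi-GAN-style decoders such as RVC v2's, the product of upsample_rates gives the hop size, so this model should emit 12 * 10 * 2 * 2 = 480 samples per feature frame, i.e. 10 ms at the declared sr of 48000. A quick sketch to check that from the file:

import json
import math

# Read the generator config embedded in the model metadata
with open("metadata.json") as f:
    cfg = json.load(f)["torchMetadata"]["config"]

hop = math.prod(cfg["upsample_rates"])    # 12 * 10 * 2 * 2 = 480
print(hop, 1000 * hop / cfg["sr"], "ms")  # 480 samples, 10.0 ms per frame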
model.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9cc11461add817f1964dfac11c37033a20037d28fe2935038d884196f556590
+ size 101587779
model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:595287a521b83cbd8cf8372e1a8c3200081e88ce8c0b7866ebc9db7e66be9512
+ size 57577722
rmvpe.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5ed4719f59085d1affc5d81354c70828c740584f2d24e782523345a6a278962
+ size 181189687