akshansh36 committed
Commit 2540ac0 • 1 Parent(s): 9d22957

Upload 7 files

Browse files
- .gitattributes +1 -0
- AKSHAY KUMAR.wav +3 -0
- hubert_base.pt +3 -0
- infer.py +190 -0
- metadata.json +152 -0
- model.index +3 -0
- model.pth +3 -0
- rmvpe.pt +3 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ model.index filter=lfs diff=lfs merge=lfs -text
 models/Female.index filter=lfs diff=lfs merge=lfs -text
 models/added_IVF714_Flat_nprobe_1_timcook_v2.index filter=lfs diff=lfs merge=lfs -text
 models/Male.index filter=lfs diff=lfs merge=lfs -text
+AKSHAY[[:space:]]KUMAR.wav filter=lfs diff=lfs merge=lfs -text
AKSHAY KUMAR.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d10c5a01bbaf92e0e42d1a3888c68e510cdd568f60404eadc4f64c67c2297295
+size 4567182
hubert_base.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
+size 189507909
infer.py ADDED
@@ -0,0 +1,190 @@
+import torch
+import numpy as np
+import time
+import sounddevice as sd
+import torchaudio
+import soundfile as sf
+import json
+import datetime
+import pyaudio
+from infer_rvc_python import BaseLoader
+
+# Get the current date and time, formatted for unique tags and filenames
+now = datetime.datetime.now()
+timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
+
+converter = BaseLoader(only_cpu=True, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
+random_tag = "USER_" + str(timestamp)
+converter.apply_conf(
+    tag=random_tag,
+    file_model="./model.pth",
+    pitch_algo="rmvpe+",
+    pitch_lvl=0,
+    file_index="./model.index",
+    index_influence=0.80,
+    respiration_median_filtering=3,
+    envelope_ratio=0.25,
+    consonant_breath_protection=0.5,
+    resample_sr=0,
+)
+time.sleep(0.5)
+
+chunk_sec = 0.1
+sr = 16000
+chunk_len = int(sr * chunk_sec)
+L = 16  # Lookahead context length in samples (2 * L samples are carried over between chunks)
+
+# Warm up the converter and write a reference conversion of the sample file
+b, a = converter.generate_from_cache(
+    audio_data="./AKSHAY KUMAR.wav",
+    tag=random_tag,
+)
+sf.write(
+    file="output_file.wav",
+    samplerate=a,
+    data=b,
+)
+
+stop_recording = False
+
+def infer_stream(sr, max_duration):
+    global start_time
+    global first_output_latency
+    global audio_buffer
+
+    previous_chunk = torch.zeros(L * 2, dtype=torch.float32)
+
+    outputs = []
+    times = []
+    elapsed_time = 0
+
+    with torch.inference_mode():
+        while True:
+            if len(audio_buffer) < chunk_len:
+                if time.time() - start_time > max_duration / 1.2:
+                    break  # Recording window elapsed and no full chunk remains
+                print('Buffer too small')
+                time.sleep(0.1)
+                continue  # Wait for enough data
+
+            # Get the current chunk
+            buffer_chunk = audio_buffer[:chunk_len]
+            audio_buffer = audio_buffer[chunk_len:]
+
+            # Add lookahead context
+            input_chunk = torch.cat([previous_chunk, buffer_chunk])
+            start = time.time()
+            # Scale float audio in [-1, 1] to the int16 range before conversion
+            data = ((input_chunk.numpy() * 32768.0).astype(np.int16), sr)
+            print(data)
+            result_array, sample_rate = converter.generate_from_cache(
+                audio_data=data,
+                tag=random_tag,
+            )
+
+            if first_output_latency < 1:
+                first_output_latency = time.time() - start_time
+                print(f'first_output_latency {first_output_latency}')
+
+            # Convert the NumPy array (result_array) to a PyTorch tensor
+            output = torch.tensor(result_array, dtype=torch.float32)
+            outputs.append(output)
+            times.append(time.time() - start)
+
+            # Update the previous chunk with the last part of the current buffer_chunk
+            previous_chunk = buffer_chunk[-L * 2:]
+
+            # Check if the maximum duration has been reached
+            elapsed_time = time.time() - start_time
+            if elapsed_time > max_duration / 1.2 and len(audio_buffer) < chunk_len:
+                break
+            else:
+                print(f'Audio Buffer At Processing: {len(audio_buffer)} elapsed_time {elapsed_time}/{max_duration}')
+
+    # Concatenate outputs and calculate metrics
+    if outputs:
+        outputs = torch.cat(outputs, dim=2)
+        avg_time = np.mean(times)
+        total_time_processing = np.sum(times)
+        rtf = (chunk_len / sr) / avg_time
+        e2e_latency = ((2 * L + chunk_len) / sr + avg_time) * 1000
+        outputs = outputs.squeeze(0)
+    else:
+        outputs = None
+        rtf = e2e_latency = total_time_processing = None
+
+    return outputs, rtf, e2e_latency, total_time_processing
+
+def save_audio(audio, audio_path, sample_rate):
+    torchaudio.save(audio_path, audio, sample_rate)
+
+max_duration = 2  # Maximum duration to process in seconds
+silence_threshold = 0.01  # Threshold to detect silence
+max_silence_duration = 0.5  # Maximum duration of silence to keep in seconds
+
+# Variable to track accumulated silence duration
+accumulated_silence_duration = 0.0
+
+# Callback function to process audio from the microphone
+def callback(indata, frames, time_info, status):
+    global audio_buffer, accumulated_silence_duration
+    global stop_recording
+    global stop_pro
+    if stop_recording:
+        if stop_pro < 10:
+            stop_pro += 1
+            print(f'Audio Buffer Stopped Recording: {len(audio_buffer)}')
+        return
+    audio_data = indata[:, 0]  # Use first channel if stereo
+    audio_data = torch.tensor(audio_data, dtype=torch.float32)
+
+    # Convert audio data to numpy for silence detection
+    audio_np = audio_data.numpy()
+
+    # Detect silence (audio below the threshold)
+    silence_indices = np.where(np.abs(audio_np) < silence_threshold)[0]
+    # Calculate the duration of the current chunk in seconds
+    chunk_duration = len(audio_np) / sample_rate
+
+    if len(silence_indices) == len(audio_np):
+        # All data is silent; keep at most max_silence_duration of it
+        accumulated_silence_duration += chunk_duration
+        if accumulated_silence_duration <= max_silence_duration:
+            audio_buffer = torch.cat((audio_buffer, audio_data))
+    else:
+        # Non-silence detected, reset accumulated silence duration
+        accumulated_silence_duration = 0.0
+        audio_buffer = torch.cat((audio_buffer, audio_data))
+
+    if time.time() - start_time > max_duration:
+        stop_recording = True
+    print(f'Audio Buffer At Insert: {len(audio_buffer)}')
+
+def list_audio_devices():
+    audio = pyaudio.PyAudio()
+    device_count = audio.get_device_count()
+
+    print("Available audio devices:")
+    for i in range(device_count):
+        device_info = audio.get_device_info_by_index(i)
+        print(f"Index: {i}, Name: {device_info['name']}, Input Channels: {device_info['maxInputChannels']}, Output Channels: {device_info['maxOutputChannels']}")
+
+# Main script
+if __name__ == "__main__":
+    list_audio_devices()
+    chunk_size = 2024  # Number of frames per callback block
+    sample_rate = sr  # Sample rate from the model config
+    stop_pro = 0
+
+    # Initialize global audio buffer
+    audio_buffer = torch.zeros(0, dtype=torch.float32)
+
+    # Set up the microphone stream
+    input_device_index = 2  # Replace with your actual input device index
+
+    print("Recording...")
+    start_time = time.time()
+    first_output_latency = 0
+    final_output_latency = 0
+    try:
+        with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback, blocksize=chunk_size, device=input_device_index):
+            output_waveform, rtf, e2e_latency, total_processing_time = infer_stream(sr=sample_rate, max_duration=max_duration)
+            if output_waveform is not None:
+                # Save output to file
+                final_output_latency = (time.time() - start_time) - (len(output_waveform[0]) / sample_rate)
+                save_audio(output_waveform, f'output_audio_stream_buff-{now}.wav', sample_rate)
+                print(f"Processed audio saved to output_audio_stream_buff-{now}.wav")
+                print(f'first_output_latency: {first_output_latency} || final_output_latency {final_output_latency} || total_processing_time {total_processing_time}')
+                if rtf is not None and e2e_latency is not None:
+                    print(f"RTF: {rtf}, E2E Latency: {e2e_latency} ms")
+    except KeyboardInterrupt:
+        print("Recording stopped.")
metadata.json ADDED
@@ -0,0 +1,152 @@
+{
+  "title": "US Ascent",
+  "author": {
+    "name": "mayank dubey",
+    "discordUserId": null
+  },
+  "md5": "b0a77398fc88806fda285f4ecd6a5839",
+  "uploadedAt": "2024-07-05T07:38:18.500Z",
+  "weightsLink": "https://www.weights.gg/models/cly8dvwn8000cagr4ohpzo4q2",
+  "id": "cly8dvwn8000cagr4ohpzo4q2",
+  "type": "v2",
+  "tags": [],
+  "description": "US Ascent",
+  "samples": [],
+  "files": [
+    { "name": "model.index", "size": 101587779, "md5": "61a545d9b5bb380bed408a51708b210e" },
+    { "name": "model.pth", "size": 57577722, "md5": "b0a77398fc88806fda285f4ecd6a5839" }
+  ],
+  "torchMetadata": {
+    "config": {
+      "spec_channels": 1025,
+      "segment_size": 32,
+      "inter_channels": 192,
+      "hidden_channels": 192,
+      "filter_channels": 768,
+      "n_heads": 2,
+      "n_layers": 6,
+      "kernel_size": 3,
+      "p_dropout": 0,
+      "resblock": "1",
+      "resblock_kernel_sizes": [3, 7, 11],
+      "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+      "upsample_rates": [12, 10, 2, 2],
+      "upsample_initial_channel": 512,
+      "upsample_kernel_sizes": [24, 20, 4, 4],
+      "emb_channels": null,
+      "spk_embed_dim": 109,
+      "gin_channels": 256,
+      "sr": 48000
+    },
+    "f0": 1,
+    "version": "v2",
+    "extra_info": {
+      "config": [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [12, 10, 2, 2], 512, [24, 20, 4, 4], 109, 256, 48000],
+      "epoch": 233,
+      "step": 6291,
+      "sr": 48000,
+      "f0": 1,
+      "version": "v2",
+      "creation_date": "2024-07-05T07:01:09.035229",
+      "model_hash": "7c335d1650be63dea6409d741859c137dc6827d9945d6afba08867ab4281e056"
+    },
+    "epochs": 233,
+    "step": 6291,
+    "creation_date": "2024-07-05T07:01:09.035229",
+    "model_hash": "7c335d1650be63dea6409d741859c137dc6827d9945d6afba08867ab4281e056"
+  },
+  "url": "https://models.weights.gg/cly79hr6d1211hlpr4obj48ab.zip",
+  "urls": [],
+  "epochs": 233,
+  "originalFileList": [
+    "model.index",
+    "model.pth"
+  ]
+}
model.index ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc11461add817f1964dfac11c37033a20037d28fe2935038d884196f556590
+size 101587779
model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:595287a521b83cbd8cf8372e1a8c3200081e88ce8c0b7866ebc9db7e66be9512
+size 57577722
rmvpe.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5ed4719f59085d1affc5d81354c70828c740584f2d24e782523345a6a278962
+size 181189687