akshansh36 committed
Commit
2540ac0
1 Parent(s): 9d22957

Upload 7 files

Files changed (8)
  1. .gitattributes +1 -0
  2. AKSHAY KUMAR.wav +3 -0
  3. hubert_base.pt +3 -0
  4. infer.py +190 -0
  5. metadata.json +152 -0
  6. model.index +3 -0
  7. model.pth +3 -0
  8. rmvpe.pt +3 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ model.index filter=lfs diff=lfs merge=lfs -text
  models/Female.index filter=lfs diff=lfs merge=lfs -text
  models/added_IVF714_Flat_nprobe_1_timcook_v2.index filter=lfs diff=lfs merge=lfs -text
  models/Male.index filter=lfs diff=lfs merge=lfs -text
+ AKSHAY[[:space:]]KUMAR.wav filter=lfs diff=lfs merge=lfs -text
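Note: the [[:space:]] in the new pattern is gitattributes escaping for the literal space in the filename, since attribute patterns cannot contain unescaped spaces; it is what `git lfs track "AKSHAY KUMAR.wav"` would typically write.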
AKSHAY KUMAR.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d10c5a01bbaf92e0e42d1a3888c68e510cdd568f60404eadc4f64c67c2297295
+ size 4567182
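These three lines are a Git LFS pointer, not audio data; the actual ~4.6 MB WAV lives in LFS storage and is fetched at checkout. The same applies to the hubert_base.pt, model.index, model.pth, and rmvpe.pt entries below. As a minimal sketch (the pointer format is defined by the spec URL on the first line; parse_lfs_pointer and verify_blob are hypothetical helpers, not part of this repo), a downloaded blob can be checked against its pointer's oid and size:

import hashlib

def parse_lfs_pointer(path):
    # Hypothetical helper: parse the "key value" lines of a Git LFS pointer file.
    with open(path) as f:
        fields = dict(line.strip().split(" ", 1) for line in f if line.strip())
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

def verify_blob(pointer_path, blob_path):
    # Compare the blob's sha256 digest and byte length against the pointer.
    expected_oid, expected_size = parse_lfs_pointer(pointer_path)
    with open(blob_path, "rb") as f:
        data = f.read()
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == expected_size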
hubert_base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
+ size 189507909
infer.py ADDED
@@ -0,0 +1,190 @@
+ import torch
+ import numpy as np
+ import time
+ import sounddevice as sd
+ import torchaudio
+ import soundfile as sf
+ from infer_rvc_python import BaseLoader
+ import datetime
+ import pyaudio
+
+ # Timestamp string used to build a unique conversion tag for this session
+ now = datetime.datetime.now()
+ timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
+
+ converter = BaseLoader(only_cpu=True, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
+ random_tag = "USER_" + str(timestamp)
+ converter.apply_conf(
+     tag=random_tag,
+     file_model="./model.pth",
+     pitch_algo="rmvpe+",
+     pitch_lvl=0,
+     file_index="./model.index",
+     index_influence=0.80,
+     respiration_median_filtering=3,
+     envelope_ratio=0.25,
+     consonant_breath_protection=0.5,
+     resample_sr=0,
+ )
+ time.sleep(0.5)
+
+ chunk_sec = 0.1                  # length of each processed chunk in seconds
+ sr = 16000                       # input sample rate
+ chunk_len = int(sr * chunk_sec)  # samples per chunk
+ L = 16                           # lookahead/context length in samples
+
+ # One warm-up conversion on the reference file so streaming chunks hit a hot cache
+ b, a = converter.generate_from_cache(
+     audio_data="./AKSHAY KUMAR.wav",
+     tag=random_tag,
+ )
+ sf.write(
+     file="output_file.wav",
+     samplerate=a,
+     data=b,
+ )
+
+ stop_recording = False
+
+ def infer_stream(sr, max_duration):
+     global start_time
+     global first_output_latency
+     global audio_buffer
+
+     previous_chunk = torch.zeros(L * 2, dtype=torch.float32)
+
+     outputs = []
+     times = []
+     elapsed_time = 0
+     total_time_processing = 0
+
+     with torch.inference_mode():
+         while True:
+             if len(audio_buffer) < chunk_len:
+                 print('Buffer too small')
+                 time.sleep(0.1)
+                 continue  # Wait for enough data
+
+             # Get the current chunk
+             buffer_chunk = audio_buffer[:chunk_len]
+             audio_buffer = audio_buffer[chunk_len:]
+
+             # Add lookahead context from the previous chunk
+             input_chunk = torch.cat([previous_chunk, buffer_chunk])
+             start = time.time()
+             # Scale float audio in [-1, 1] up to int16 PCM; a bare
+             # astype(np.int16) would truncate every sample to zero
+             data = ((input_chunk.numpy() * 32767).astype(np.int16), sr)
+             result_array, sample_rate = converter.generate_from_cache(
+                 audio_data=data,
+                 tag=random_tag,
+             )
+
+             if first_output_latency < 1:
+                 first_output_latency = time.time() - start_time
+                 print(f'first_output_latency {first_output_latency}')
+             # Convert the NumPy result to a PyTorch tensor
+             output = torch.tensor(result_array, dtype=torch.float32)
+             outputs.append(output)
+             times.append(time.time() - start)
+
+             # Keep the tail of the current chunk as context for the next one
+             previous_chunk = buffer_chunk[-L * 2:]
+
+             # Stop once the maximum duration has passed and the buffer is drained
+             elapsed_time = time.time() - start_time
+             if elapsed_time > max_duration / 1.2 and len(audio_buffer) < chunk_len:
+                 break
+             else:
+                 print(f'Audio Buffer At Processing: {len(audio_buffer)} elapsed_time {elapsed_time}/{max_duration}')
+
+     # Concatenate outputs and calculate metrics
+     if outputs:
+         outputs = torch.cat(outputs, dim=2)
+         avg_time = np.mean(times)
+         total_time_processing = np.sum(times)
+         # Ratio of audio duration to processing time (>1 means faster than real time)
+         rtf = (chunk_len / sr) / avg_time
+         e2e_latency = ((2 * L + chunk_len) / sr + avg_time) * 1000
+         outputs = outputs.squeeze(0)
+     else:
+         outputs = None  # nothing was processed, so signal "no audio" to the caller
+         rtf = e2e_latency = None
+
+     return outputs, rtf, e2e_latency, total_time_processing
+
+ def save_audio(audio, audio_path, sample_rate):
+     torchaudio.save(audio_path, audio, sample_rate)
+
+ max_duration = 2            # Maximum duration to process in seconds
+ silence_threshold = 0.01    # Amplitude threshold to detect silence
+ max_silence_duration = 0.5  # Maximum duration of silence to keep in seconds
+
+ # Tracks accumulated silence duration across callbacks
+ accumulated_silence_duration = 0.0
+
+ # Callback that feeds microphone audio into the global buffer
+ def callback(indata, frames, time_info, status):
+     global audio_buffer, accumulated_silence_duration
+     global stop_recording
+     global stop_pro
+     if stop_recording:
+         if stop_pro < 10:
+             stop_pro += 1
+             print(f'Audio Buffer Stopped Recording: {len(audio_buffer)}')
+         return
+     audio_data = indata[:, 0]  # Use first channel if stereo
+     audio_data = torch.tensor(audio_data, dtype=torch.float32)
+
+     # Convert audio data to numpy for silence detection
+     audio_np = audio_data.numpy()
+
+     # Detect silence (samples below the threshold)
+     silence_indices = np.where(np.abs(audio_np) < silence_threshold)[0]
+     # Duration of the current chunk in seconds
+     chunk_duration = len(audio_np) / sample_rate
+
+     if len(silence_indices) == len(audio_np):
+         # The whole chunk is silent
+         accumulated_silence_duration += chunk_duration
+         if accumulated_silence_duration <= max_silence_duration:
+             audio_buffer = torch.cat((audio_buffer, audio_data))
+     else:
+         # Non-silence detected, reset accumulated silence duration
+         accumulated_silence_duration = 0.0
+         audio_buffer = torch.cat((audio_buffer, audio_data))
+
+     if time.time() - start_time > max_duration:
+         stop_recording = True
+     print(f'Audio Buffer At Insert: {len(audio_buffer)}')
+
+ def list_audio_devices():
+     audio = pyaudio.PyAudio()
+     device_count = audio.get_device_count()
+
+     print("Available audio devices:")
+     for i in range(device_count):
+         device_info = audio.get_device_info_by_index(i)
+         print(f"Index: {i}, Name: {device_info['name']}, Input Channels: {device_info['maxInputChannels']}, Output Channels: {device_info['maxOutputChannels']}")
+
+ # Main script
+ if __name__ == "__main__":
+     list_audio_devices()
+     chunk_size = 2024      # Size of each audio block read from the microphone
+     sample_rate = sr       # Sample rate from the model config
+     stop_pro = 0
+
+     # Initialize global audio buffer
+     audio_buffer = torch.zeros(0, dtype=torch.float32)
+
+     # Set up the microphone stream
+     input_device_index = 2  # Replace with your actual input device index
+
+     print("Recording...")
+     start_time = time.time()
+     first_output_latency = 0
+     final_output_latency = 0
+     try:
+         with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback, blocksize=chunk_size, device=input_device_index):
+             output_waveform, rtf, e2e_latency, total_processing_time = infer_stream(sr=sample_rate, max_duration=max_duration)
+             if output_waveform is not None:
+                 # Save output to file
+                 final_output_latency = (time.time() - start_time) - (len(output_waveform[0]) / sample_rate)
+                 save_audio(output_waveform, f'output_audio_stream_buff-{now}.wav', sample_rate)
+                 print(f"Processed audio saved to output_audio_stream_buff-{now}.wav")
+                 print(f'first_output_latency: {first_output_latency} || final_output_latency {final_output_latency} || total_processing_time {total_processing_time}')
+                 if rtf is not None and e2e_latency is not None:
+                     print(f"RTF: {rtf}, E2E Latency: {e2e_latency} ms")
+     except KeyboardInterrupt:
+         print("Recording stopped.")
metadata.json ADDED
@@ -0,0 +1,152 @@
+ {
+   "title": "US Ascent",
+   "author": {
+     "name": "mayank dubey",
+     "discordUserId": null
+   },
+   "md5": "b0a77398fc88806fda285f4ecd6a5839",
+   "uploadedAt": "2024-07-05T07:38:18.500Z",
+   "weightsLink": "https://www.weights.gg/models/cly8dvwn8000cagr4ohpzo4q2",
+   "id": "cly8dvwn8000cagr4ohpzo4q2",
+   "type": "v2",
+   "tags": [],
+   "description": "US Ascent",
+   "samples": [],
+   "files": [
+     {
+       "name": "model.index",
+       "size": 101587779,
+       "md5": "61a545d9b5bb380bed408a51708b210e"
+     },
+     {
+       "name": "model.pth",
+       "size": 57577722,
+       "md5": "b0a77398fc88806fda285f4ecd6a5839"
+     }
+   ],
+   "torchMetadata": {
+     "config": {
+       "spec_channels": 1025,
+       "segment_size": 32,
+       "inter_channels": 192,
+       "hidden_channels": 192,
+       "filter_channels": 768,
+       "n_heads": 2,
+       "n_layers": 6,
+       "kernel_size": 3,
+       "p_dropout": 0,
+       "resblock": "1",
+       "resblock_kernel_sizes": [
+         3,
+         7,
+         11
+       ],
+       "resblock_dilation_sizes": [
+         [
+           1,
+           3,
+           5
+         ],
+         [
+           1,
+           3,
+           5
+         ],
+         [
+           1,
+           3,
+           5
+         ]
+       ],
+       "upsample_rates": [
+         12,
+         10,
+         2,
+         2
+       ],
+       "upsample_initial_channel": 512,
+       "upsample_kernel_sizes": [
+         24,
+         20,
+         4,
+         4
+       ],
+       "emb_channels": null,
+       "spk_embed_dim": 109,
+       "gin_channels": 256,
+       "sr": 48000
+     },
+     "f0": 1,
+     "version": "v2",
+     "extra_info": {
+       "config": [
+         1025,
+         32,
+         192,
+         192,
+         768,
+         2,
+         6,
+         3,
+         0,
+         "1",
+         [
+           3,
+           7,
+           11
+         ],
+         [
+           [
+             1,
+             3,
+             5
+           ],
+           [
+             1,
+             3,
+             5
+           ],
+           [
+             1,
+             3,
+             5
+           ]
+         ],
+         [
+           12,
+           10,
+           2,
+           2
+         ],
+         512,
+         [
+           24,
+           20,
+           4,
+           4
+         ],
+         109,
+         256,
+         48000
+       ],
+       "epoch": 233,
+       "step": 6291,
+       "sr": 48000,
+       "f0": 1,
+       "version": "v2",
+       "creation_date": "2024-07-05T07:01:09.035229",
+       "model_hash": "7c335d1650be63dea6409d741859c137dc6827d9945d6afba08867ab4281e056"
+     },
+     "epochs": 233,
+     "step": 6291,
+     "creation_date": "2024-07-05T07:01:09.035229",
+     "model_hash": "7c335d1650be63dea6409d741859c137dc6827d9945d6afba08867ab4281e056"
+   },
+   "url": "https://models.weights.gg/cly79hr6d1211hlpr4obj48ab.zip",
+   "urls": [],
+   "epochs": 233,
+   "originalFileList": [
+     "model.index",
+     "model.pth"
+   ]
+ }
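A note on torchMetadata.config: in HiFi-GAN-style decoders such as RVC v2's, the product of upsample_rates gives the hop size, so this model should emit 12 * 10 * 2 * 2 = 480 samples per feature frame, i.e. 10 ms at the declared sr of 48000. A quick sketch to check that from the file:

import json
import math

# Read the generator config embedded in the model metadata
with open("metadata.json") as f:
    cfg = json.load(f)["torchMetadata"]["config"]

hop = math.prod(cfg["upsample_rates"])    # 12 * 10 * 2 * 2 = 480
print(hop, 1000 * hop / cfg["sr"], "ms")  # 480 samples, 10.0 ms per frame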
model.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9cc11461add817f1964dfac11c37033a20037d28fe2935038d884196f556590
+ size 101587779
model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:595287a521b83cbd8cf8372e1a8c3200081e88ce8c0b7866ebc9db7e66be9512
+ size 57577722
rmvpe.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5ed4719f59085d1affc5d81354c70828c740584f2d24e782523345a6a278962
+ size 181189687