akshansh36 committed · Commit 03d15d8 · Parent(s): 2540ac0
Update infer.py
infer.py CHANGED
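The change replaces the offline, file-driven inference loop with a Gradio app that streams microphone audio through the same infer_rvc_python converter in 0.1-second chunks.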
@@ -1,156 +1,78 @@
+import gradio as gr
import torch
import numpy as np
import time
-import
-import torchaudio
-import json
-from infer_rvc_python import BaseLoader
+import soundfile as sf
import datetime
-import
-
+from infer_rvc_python import BaseLoader
+
+# Initialize converter and other global variables
+converter = BaseLoader(only_cpu=True, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
now = datetime.datetime.now()
-# Format the date and time as a string
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
+random_tag = "USER_" + str(timestamp)

-converter = BaseLoader(only_cpu=True, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
-random_tag = "USER_"+str(timestamp)
converter.apply_conf(
-
-
-
-
-
-
-
-
-
-
-
-
+    tag=random_tag,
+    file_model="./model.pth",
+    pitch_algo="rmvpe+",
+    pitch_lvl=0,
+    file_index="./model.index",
+    index_influence=0.80,
+    respiration_median_filtering=3,
+    envelope_ratio=0.25,
+    consonant_breath_protection=0.5,
+    resample_sr=0,
+)
+
+# Constants and initializations
chunk_sec = 0.1
sr = 16000
chunk_len = int(sr * chunk_sec)
L = 16
-b, a = converter.generate_from_cache(
-    audio_data="./AKSHAY KUMAR.wav",
-    tag=random_tag,
-)
-import soundfile as sf

-
-
-
-    data=b
-)
-stop_recording = False
-def infer_stream(sr, max_duration):
-    global start_time
-    global first_output_latency
-    global audio_buffer
+# Define the streaming function for Gradio
+def process_audio_stream(audio, instream):
+    global audio_buffer, start_time, first_output_latency, stop_recording

-
-
-    outputs = []
-    times = []
-    elapsed_time = 0
-
-    with torch.inference_mode():
-        while True:
-            if len(audio_buffer) < chunk_len:
-                print(f'Buffer too small')
-                time.sleep(0.1)
-                continue  # Wait for enough data
-
-            # Get the current chunk
-            buffer_chunk = audio_buffer[:chunk_len]
-            audio_buffer = audio_buffer[chunk_len:]
-
-            # Add lookahead context
-            input_chunk = torch.cat([previous_chunk, buffer_chunk])
-            start = time.time()
-            # todo:
-            data = (input_chunk.numpy().astype(np.int16), sr)
-            print(data)
-            result_array, sample_rate = converter.generate_from_cache(
-                audio_data=data,
-                tag=random_tag,
-            )
-
-            if first_output_latency < 1:
-                first_output_latency = time.time() - start_time
-                print(f'first_output_latency {first_output_latency}')
-            # Convert the NumPy array (result_array) to a PyTorch tensor
-            output = torch.tensor(result_array, dtype=torch.float32)
-            outputs.append(output)
-            times.append(time.time() - start)
-
-            # Update the previous chunk with the last part of the current buffer_chunk
-            previous_chunk = buffer_chunk[-L * 2:]
-
-            # Check if the maximum duration has been reached
-            elapsed_time = time.time() - start_time
-            if elapsed_time > max_duration/1.2 and len(audio_buffer) < chunk_len:
-                break
-            else:
-                print(f'Audio Buffer At Processing: {len(audio_buffer)} elapsed_time {elapsed_time}/{max_duration}')
-
-    # Concatenate outputs and calculate metrics
-    if outputs:
-        outputs = torch.cat(outputs, dim=2)
-        avg_time = np.mean(times)
-        total_time_processing = np.sum(times)
-        rtf = (chunk_len / sr) / avg_time
-        e2e_latency = ((2 * L + chunk_len) / sr + avg_time) * 1000
-        outputs = outputs.squeeze(0)
-    else:
-        rtf = e2e_latency = None
+    if audio is None:
+        return gr.update(), instream

-
+    if instream is None:
+        instream = torch.zeros(0, dtype=torch.float32)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    audio_np = audio_data.numpy()
-
-    # Detect silence (audio below the threshold)
-    silence_indices = np.where(np.abs(audio_np) < silence_threshold)[0]
-    # Calculate the duration of the current chunk in seconds
-    chunk_duration = len(audio_np) / sample_rate
-
-    if len(silence_indices) == len(audio_np):
-        # All data is silent
-        accumulated_silence_duration += chunk_duration
-
-        if accumulated_silence_duration <= max_silence_duration:
-            audio_buffer = torch.cat((audio_buffer, audio_data))
+    # Assuming 'audio' is received as numpy array, convert to torch tensor
+    audio_data = torch.tensor(audio[1], dtype=torch.float32)
+
+    # Append new data to audio buffer
+    audio_buffer = torch.cat((audio_buffer, audio_data))
+
+    if len(audio_buffer) >= chunk_len:
+        # Get the current chunk
+        buffer_chunk = audio_buffer[:chunk_len]
+        audio_buffer = audio_buffer[chunk_len:]
+
+        # Process the audio data (as per your existing logic)
+        input_chunk = torch.cat([instream[-L*2:], buffer_chunk])
+        data = (input_chunk.numpy().astype(np.int16), sr)
+
+        result_array, _ = converter.generate_from_cache(audio_data=data, tag=random_tag)
+        output = torch.tensor(result_array, dtype=torch.float32)
+
+        # Append the processed output to instream for continuous processing
+        instream = torch.cat((instream, output))
+
+        return instream.numpy(), instream.numpy()
    else:
-
-
-
-
-
-
-
+        return gr.update(), instream
+
+# Function to save audio to file
+def save_audio(audio, audio_path, sample_rate):
+    torchaudio.save(audio_path, torch.tensor(audio, dtype=torch.float32), sample_rate)
+
+# Function to list audio devices (for debugging or selecting specific devices)
def list_audio_devices():
+    import pyaudio
    audio = pyaudio.PyAudio()
    device_count = audio.get_device_count()

@@ -158,33 +80,28 @@ def list_audio_devices():
    for i in range(device_count):
        device_info = audio.get_device_info_by_index(i)
        print(f"Index: {i}, Name: {device_info['name']}, Input Channels: {device_info['maxInputChannels']}, Output Channels: {device_info['maxOutputChannels']}")
-# Main script
-if __name__ == "__main__":
-    list_audio_devices()
-    chunk_size = 2024  # Size of each audio chunk
-    sample_rate = sr  # Sample rate from the model config
-    stop_pro = 0

-
-
+# Define Gradio interface
+with gr.Blocks() as demo:
+    inp = gr.Audio(sources="microphone", streaming=True)
+    out = gr.Audio(streaming=True)
+    stream = gr.State()

-
-
+    inp.stream(process_audio_stream, [inp, stream], [out, stream])
+
+    # Button to clear/reset the stream
+    clear = gr.Button("Clear")
+    clear.click(lambda: [None, torch.zeros(0, dtype=torch.float32)], None, [inp, out, stream])

-
+if __name__ == "__main__":
+    # Initialize global audio buffer
+    audio_buffer = torch.zeros(0, dtype=torch.float32)
    start_time = time.time()
    first_output_latency = 0
-
-
-
-
-
-
-
-    save_audio(output_waveform, f'output_audio_stream_buff-{now}.wav', sample_rate)
-    print(f"Processed audio saved to output_audio_stream_buff.wav")
-    print(f'first_output_latency: {first_output_latency} || final_output_latency {final_output_latency} || total_processing_time {total_processing_time}')
-    if rtf is not None and e2e_latency is not None:
-        print(f"RTF: {rtf}, E2E Latency: {e2e_latency} ms")
-except KeyboardInterrupt:
-    print("Recording stopped.")
+    stop_recording = False
+
+    # Optionally list audio devices (can be commented out if not needed)
+    # list_audio_devices()
+
+    # Launch Gradio interface
+    demo.launch()
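
The removed loop tracked per-chunk timing and reported a real-time factor (RTF) and end-to-end latency before it was replaced by the Gradio callback. A minimal sketch of that bookkeeping, reusing the constants kept in the diff; avg_time is a hypothetical measured per-chunk processing time, not a value from the commit:

sr = 16000
chunk_sec = 0.1
chunk_len = int(sr * chunk_sec)  # 1600 samples, i.e. 100 ms of audio per chunk
L = 16                           # lookahead samples carried between chunks

avg_time = 0.05  # hypothetical: 50 ms of processing per 100 ms chunk

# Formulas as in the removed code:
rtf = (chunk_len / sr) / avg_time                           # 0.1 / 0.05 = 2.0
e2e_latency = ((2 * L + chunk_len) / sr + avg_time) * 1000  # (1632 / 16000 + 0.05) * 1000 = 152 ms

An RTF above 1.0 means each chunk is converted faster than it plays back, which is the same budget the new streaming callback has to stay inside.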
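Two loose ends in the new version are worth noting: save_audio still calls torchaudio.save even though this commit drops the torchaudio import (soundfile is imported instead but never used), and the Clear button's lambda returns two values while three output components are listed. A hedged sketch of one way to reconcile the button wiring, assuming the intent is to reset the input, the output, and the stream state together (reset_stream is a hypothetical helper, not part of the commit):

import torch
import gradio as gr

with gr.Blocks() as demo:
    inp = gr.Audio(sources="microphone", streaming=True)
    out = gr.Audio(streaming=True)
    stream = gr.State()

    def reset_stream():
        # One return value per output component: input, output, stream state.
        return None, None, torch.zeros(0, dtype=torch.float32)

    clear = gr.Button("Clear")
    clear.click(reset_stream, None, [inp, out, stream])

For save_audio, either restoring import torchaudio at the top of infer.py or rewriting the helper around the soundfile import the commit already adds would make the function callable.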