# Vodex-AI / app.py
import gradio as gr
import torch
import numpy as np
import time
import soundfile as sf
import datetime
from infer_rvc_python import BaseLoader
# Initialize converter and other global variables
converter = BaseLoader(only_cpu=False, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
random_tag = "USER_" + str(timestamp)
converter.apply_conf(
    tag=random_tag,
    file_model="./model.pth",
    pitch_algo="rmvpe+",
    pitch_lvl=0,
    file_index="./model.index",
    index_influence=0.80,
    respiration_median_filtering=3,
    envelope_ratio=0.25,
    consonant_breath_protection=0.5,
    resample_sr=0,
)
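# Optional offline sanity check (a minimal sketch, not part of the streaming
# app): push one file through the same generate_from_cache call the streaming
# handler uses. "./sample.wav" is a hypothetical test file, and the output is
# assumed to stay at the input rate (resample_sr=0 applies no resampling).
def sanity_check(path="./sample.wav"):
    wav, wav_sr = sf.read(path, dtype="int16")
    result_array, _ = converter.generate_from_cache(audio_data=(wav, wav_sr), tag=random_tag)
    sf.write("./sample_converted.wav", result_array, wav_sr)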
# Constants and initializations
chunk_sec = 0.1                  # length of each processing chunk, in seconds
sr = 16000                       # sample rate assumed for the microphone stream
chunk_len = int(sr * chunk_sec)  # 1600 samples per 100 ms chunk
L = 16                           # context length; 2*L trailing output samples are prepended to each chunk
# Define the streaming function for Gradio
def process_audio_stream(audio, instream):
    global audio_buffer, start_time, first_output_latency, stop_recording
    if audio is None:
        return gr.update(), instream
    if instream is None:
        instream = torch.zeros(0, dtype=torch.float32)
    # 'audio' arrives from Gradio as (sample_rate, numpy array); convert to a torch tensor
    audio_data = torch.tensor(audio[1], dtype=torch.float32)
    # Append the new samples to the rolling input buffer
    audio_buffer = torch.cat((audio_buffer, audio_data))
    if len(audio_buffer) >= chunk_len:
        # Take the oldest full chunk off the buffer
        buffer_chunk = audio_buffer[:chunk_len]
        audio_buffer = audio_buffer[chunk_len:]
        # Prepend the last 2*L samples of previous output as context,
        # then convert back to int16 PCM for the converter
        input_chunk = torch.cat([instream[-L * 2:], buffer_chunk])
        data = (input_chunk.numpy().astype(np.int16), sr)
        result_array, _ = converter.generate_from_cache(audio_data=data, tag=random_tag)
        output = torch.tensor(result_array, dtype=torch.float32)
        # Accumulate the converted output; the tensor is kept as state so the
        # next call can reuse its tail as context
        instream = torch.cat((instream, output))
        # Emit only the newly converted samples as (sample_rate, array);
        # the output is assumed to be at sr since resample_sr=0
        return (sr, output.numpy()), instream
    else:
        return gr.update(), instream
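# A standalone sketch of the chunking scheme above (hypothetical helper, not
# called by the app): it splits a signal into chunk_len pieces, each prefixed
# with 2*L samples of context. Here the previous raw chunk stands in for the
# converted output that the real handler uses as context.
def _chunk_with_context(signal, context=torch.zeros(0, dtype=torch.float32)):
    chunks = []
    buffer = signal
    while len(buffer) >= chunk_len:
        piece, buffer = buffer[:chunk_len], buffer[chunk_len:]
        chunks.append(torch.cat([context[-L * 2:], piece]))
        context = piece
    return chunks, buffer  # full chunks plus the unconsumed remainder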
# Function to save audio to a file via soundfile
def save_audio(audio, audio_path, sample_rate):
    sf.write(audio_path, np.asarray(audio, dtype=np.float32), sample_rate)
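# Example usage (hypothetical names; 'instream' would be the accumulated
# output tensor from a finished session):
#   save_audio(instream.numpy(), "./output.wav", sr)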
# Function to list audio devices (for debugging or selecting specific devices)
def list_audio_devices():
    import pyaudio
    audio = pyaudio.PyAudio()
    device_count = audio.get_device_count()
    print("Available audio devices:")
    for i in range(device_count):
        device_info = audio.get_device_info_by_index(i)
        print(
            f"Index: {i}, Name: {device_info['name']}, "
            f"Input Channels: {device_info['maxInputChannels']}, "
            f"Output Channels: {device_info['maxOutputChannels']}"
        )
    audio.terminate()  # release PortAudio resources
# Define Gradio interface
with gr.Blocks() as demo:
    inp = gr.Audio(sources="microphone", streaming=True)
    out = gr.Audio(streaming=True)
    stream = gr.State()
    inp.stream(process_audio_stream, [inp, stream], [out, stream])
    # Button to clear/reset the stream: one return value per output component
    clear = gr.Button("Clear")
    clear.click(
        lambda: [None, None, torch.zeros(0, dtype=torch.float32)],
        None,
        [inp, out, stream],
    )
if __name__ == "__main__":
    # Initialize the global audio buffer and timing/latency trackers
    audio_buffer = torch.zeros(0, dtype=torch.float32)
    start_time = time.time()
    first_output_latency = 0
    stop_recording = False
    # Optionally list audio devices (can be commented out if not needed)
    # list_audio_devices()
    # Launch Gradio interface
    demo.launch()