# HuggingFace Spaces page residue (status banner "Spaces: Sleeping");
# kept as a comment so this file remains valid Python.
import gradio as gr
import torch
import numpy as np
import time
import soundfile as sf
import datetime
from infer_rvc_python import BaseLoader

# --- Voice-conversion engine setup -------------------------------------------
# BaseLoader wraps the RVC inference pipeline; the HuBERT and RMVPE weights are
# expected next to this script.  only_cpu=False lets it use a GPU if available.
converter = BaseLoader(only_cpu=False, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')

# Unique per-run tag so generate_from_cache() keeps cache state for this session.
now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
random_tag = "USER_" + str(timestamp)

converter.apply_conf(
    tag=random_tag,
    file_model="./model.pth",
    pitch_algo="rmvpe+",            # pitch-extraction algorithm
    pitch_lvl=0,                    # no pitch shift
    file_index="./model.index",
    index_influence=0.80,           # weight of the retrieval index
    respiration_median_filtering=3,
    envelope_ratio=0.25,
    consonant_breath_protection=0.5,
    resample_sr=0,                  # 0 = no resampling of the output
)

# --- Streaming constants ------------------------------------------------------
chunk_sec = 0.1                  # seconds of audio converted per chunk
sr = 16000                       # sample rate assumed for the mic stream
chunk_len = int(sr * chunk_sec)  # samples per chunk
L = 16                           # tail length unit; 2*L samples of context are
                                 # prepended to each chunk in the callback

# Globals consumed/updated by process_audio_stream().  Initialized here as well
# as under __main__ so importing this module does not raise NameError on the
# first streaming callback.
audio_buffer = torch.zeros(0, dtype=torch.float32)
start_time = time.time()
first_output_latency = 0
stop_recording = False
# Streaming callback wired to gr.Audio(streaming=True) in the UI below.
def process_audio_stream(audio, instream):
    """Buffer incoming microphone audio and voice-convert it chunk by chunk.

    Parameters
    ----------
    audio : tuple | None
        (sample_rate, samples) as delivered by a streaming gr.Audio input;
        only the samples (``audio[1]``) are used.
    instream : torch.Tensor | None
        Accumulated converted audio from previous callbacks (gr.State).

    Returns
    -------
    tuple
        (value for the output audio component, new state).  While less than a
        full chunk has accumulated, ``gr.update()`` leaves the output as-is.
    """
    global audio_buffer, start_time, first_output_latency, stop_recording
    if audio is None:
        return gr.update(), instream
    if instream is None:
        instream = torch.zeros(0, dtype=torch.float32)
    # Keep only the samples; gr.Audio delivers (sample_rate, np.ndarray).
    audio_data = torch.tensor(audio[1], dtype=torch.float32)
    # Accumulate until at least one full chunk is available.
    audio_buffer = torch.cat((audio_buffer, audio_data))
    if len(audio_buffer) < chunk_len:
        return gr.update(), instream
    # Pop one chunk off the front of the buffer.
    buffer_chunk = audio_buffer[:chunk_len]
    audio_buffer = audio_buffer[chunk_len:]
    # Prepend the last 2*L samples of already-converted audio as context for
    # the converter.  NOTE(review): casting float samples straight to int16
    # assumes they are already in int16 range — confirm the format gr.Audio
    # actually delivers (float in [-1, 1] would need scaling by 32767).
    input_chunk = torch.cat([instream[-L * 2:], buffer_chunk])
    data = (input_chunk.numpy().astype(np.int16), sr)
    result_array, _ = converter.generate_from_cache(audio_data=data, tag=random_tag)
    output = torch.tensor(result_array, dtype=torch.float32)
    # Append the converted chunk and replay the whole stream so far.
    instream = torch.cat((instream, output))
    return instream.numpy(), instream.numpy()
# Function to save audio to file
def save_audio(audio, audio_path, sample_rate):
    """Write *audio* (1-D array of samples) to *audio_path* at *sample_rate*.

    Bug fix: the original called ``torchaudio.save``, but ``torchaudio`` is
    never imported anywhere in this file, so the call raised NameError.
    ``soundfile`` is already imported as ``sf``; use it instead.
    """
    sf.write(audio_path, np.asarray(audio, dtype=np.float32), sample_rate)
# Function to list audio devices (for debugging or selecting specific devices)
def list_audio_devices():
    """Print index, name, and channel counts of every PyAudio device.

    ``pyaudio`` is imported lazily so the rest of the app runs without it.
    """
    import pyaudio
    pa = pyaudio.PyAudio()
    print("Available audio devices:")
    for i in range(pa.get_device_count()):
        device_info = pa.get_device_info_by_index(i)
        print(f"Index: {i}, Name: {device_info['name']}, Input Channels: {device_info['maxInputChannels']}, Output Channels: {device_info['maxOutputChannels']}")
# Define Gradio interface
with gr.Blocks() as demo:
    # Mic input streams chunks into process_audio_stream; `stream` (gr.State)
    # carries the accumulated converted audio between callbacks.
    inp = gr.Audio(sources="microphone", streaming=True)
    out = gr.Audio(streaming=True)
    stream = gr.State()
    inp.stream(process_audio_stream, [inp, stream], [out, stream])
    # Button to clear/reset the stream.
    # Bug fix: the original lambda returned only 2 values for the 3 output
    # components [inp, out, stream]; return exactly one value per component.
    clear = gr.Button("Clear")
    clear.click(
        lambda: [None, None, torch.zeros(0, dtype=torch.float32)],
        None,
        [inp, out, stream],
    )
if __name__ == "__main__":
    # Initialize the globals that process_audio_stream() reads and writes;
    # they must exist before the first streaming callback fires.
    audio_buffer = torch.zeros(0, dtype=torch.float32)
    start_time = time.time()
    first_output_latency = 0
    stop_recording = False
    # Optionally list audio devices (can be commented out if not needed)
    # list_audio_devices()
    # Launch Gradio interface
    demo.launch()