"""Real-time voice conversion demo: streams microphone audio through an RVC
model (via infer_rvc_python) inside a Gradio interface."""

import datetime
import time

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from infer_rvc_python import BaseLoader

# --- Model / converter setup -------------------------------------------------

# Initialize the RVC converter (uses GPU when available).
converter = BaseLoader(
    only_cpu=False,
    hubert_path="./hubert_base.pt",
    rmvpe_path="./rmvpe.pt",
)

# Unique per-session tag used to key the converter's cached configuration.
now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
random_tag = "USER_" + str(timestamp)

converter.apply_conf(
    tag=random_tag,
    file_model="./model.pth",
    pitch_algo="rmvpe+",
    pitch_lvl=0,
    file_index="./model.index",
    index_influence=0.80,
    respiration_median_filtering=3,
    envelope_ratio=0.25,
    consonant_breath_protection=0.5,
    resample_sr=0,
)

# --- Streaming constants ------------------------------------------------------

chunk_sec = 0.1                   # seconds of audio processed per step
sr = 16000                        # sample rate fed to the converter
chunk_len = int(sr * chunk_sec)   # samples per processing chunk
L = 16                            # context window factor: L*2 trailing output
                                  # samples are prepended to each chunk

# Global streaming state. BUG FIX: these were originally created only inside the
# `__main__` guard, so importing this module (instead of running it as a script)
# made process_audio_stream raise NameError on its `global` names. Initialize
# them at module level instead.
audio_buffer = torch.zeros(0, dtype=torch.float32)
start_time = time.time()
first_output_latency = 0
stop_recording = False


def process_audio_stream(audio, instream):
    """Gradio streaming callback.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Latest microphone fragment as ``(sample_rate, samples)``; ``None``
        when no new data is available.
    instream : torch.Tensor | None
        Accumulated converted output carried across calls via ``gr.State``.

    Returns
    -------
    tuple
        ``(playback_audio, new_state)``; ``gr.update()`` leaves the output
        component unchanged while more samples are being buffered.
    """
    global audio_buffer, start_time, first_output_latency, stop_recording

    if audio is None:
        return gr.update(), instream

    if instream is None:
        instream = torch.zeros(0, dtype=torch.float32)

    # 'audio' arrives as (sample_rate, numpy_samples); keep only the samples.
    audio_data = torch.tensor(audio[1], dtype=torch.float32)

    # Accumulate incoming samples until a full chunk is available.
    audio_buffer = torch.cat((audio_buffer, audio_data))

    if len(audio_buffer) < chunk_len:
        # Not enough buffered audio yet; leave the output untouched.
        return gr.update(), instream

    # Pop one chunk off the front of the buffer.
    buffer_chunk = audio_buffer[:chunk_len]
    audio_buffer = audio_buffer[chunk_len:]

    # Prepend a short tail of the previous output as context for the model.
    input_chunk = torch.cat([instream[-L * 2:], buffer_chunk])
    # NOTE(review): the float samples are cast (not rescaled) to int16 here —
    # this assumes the microphone already delivers int16-range values; confirm.
    data = (input_chunk.numpy().astype(np.int16), sr)
    result_array, _ = converter.generate_from_cache(audio_data=data, tag=random_tag)
    output = torch.tensor(result_array, dtype=torch.float32)

    # Append the converted chunk to the running output stream.
    instream = torch.cat((instream, output))
    return instream.numpy(), instream.numpy()


def save_audio(audio, audio_path, sample_rate):
    """Write ``audio`` samples to ``audio_path`` at ``sample_rate``.

    BUG FIX: the original called ``torchaudio.save`` but ``torchaudio`` was
    never imported (NameError at call time). ``soundfile`` is already imported
    at the top of this file, so use it instead.
    """
    sf.write(audio_path, np.asarray(audio, dtype=np.float32), sample_rate)


def list_audio_devices():
    """Print every PyAudio device with its index and channel counts.

    Useful for debugging or selecting a specific input/output device.
    """
    import pyaudio  # imported lazily: only needed for this diagnostic helper

    audio = pyaudio.PyAudio()
    device_count = audio.get_device_count()
    print("Available audio devices:")
    for i in range(device_count):
        device_info = audio.get_device_info_by_index(i)
        print(
            f"Index: {i}, Name: {device_info['name']}, "
            f"Input Channels: {device_info['maxInputChannels']}, "
            f"Output Channels: {device_info['maxOutputChannels']}"
        )


# --- Gradio interface ---------------------------------------------------------

with gr.Blocks() as demo:
    inp = gr.Audio(sources="microphone", streaming=True)
    out = gr.Audio(streaming=True)
    stream = gr.State()
    inp.stream(process_audio_stream, [inp, stream], [out, stream])

    # Button to clear/reset the stream.
    clear = gr.Button("Clear")
    # BUG FIX: the original lambda returned 2 values for 3 output components
    # ([inp, out, stream]); Gradio requires one value per output.
    clear.click(
        lambda: [None, None, torch.zeros(0, dtype=torch.float32)],
        None,
        [inp, out, stream],
    )


if __name__ == "__main__":
    # Optionally list audio devices (can be commented out if not needed)
    # list_audio_devices()

    # Launch Gradio interface
    demo.launch()