akshansh36 committed · verified
Commit 32051f1 · 1 Parent(s): beab452

Update app.py

Files changed (1)
  1. app.py +68 -55
app.py CHANGED
@@ -1,70 +1,83 @@
  import gradio as gr
- import os
- import numpy as np
  import torch
  from infer_rvc_python import BaseLoader

- # Initialize the audio transformation model
- converter = BaseLoader(only_cpu=False, hubert_path=None, rmvpe_path=None)

- # Path to the models directory
- model_dir = "./models"

- # Function to configure the model based on user selection
- def configure_model(file_model, file_index):
-     model_path = os.path.join(model_dir, file_model)
-     index_path = os.path.join(model_dir, file_index) if file_index else None
-
-     converter.apply_conf(
-         tag="live_transform",
-         file_model=model_path,
-         pitch_algo="rmvpe+",
-         pitch_lvl=0,
-         file_index=index_path,
-         index_influence=0.75,
-         respiration_median_filtering=3,
-         envelope_ratio=0.25,
-         consonant_breath_protection=0.5,
-         resample_sr=44100
-     )
-     return "Model configured successfully."

- # Function to process each audio chunk
- def transform_audio_chunk(audio):
      if audio is None:
-         return None
-
-     audio_data = torch.tensor(audio[1], dtype=torch.float32).unsqueeze(0)  # Prepare audio for processing
-     with torch.no_grad():
-         # Ensure that source_sr and base_sr are scalar values
-         transformed_audio, _ = converter.generate_from_cache(
-             audio_data=(audio[0], audio_data.numpy()),
-             tag="live_transform",
          )
-
-     return audio[0], transformed_audio.squeeze(0).numpy()

  # Gradio interface setup
  with gr.Blocks() as demo:
-     # Get the list of available model and index files
-     model_files = [f for f in os.listdir(model_dir) if f.endswith(".pth")]
-     index_files = [f for f in os.listdir(model_dir) if f.endswith(".index")]

-     # Dropdowns for model and index file selection
-     model_file = gr.Dropdown(choices=model_files, label="Select Model File")
-     index_file = gr.Dropdown(choices=index_files, label="Select Index File")
-     configure_button = gr.Button("Configure Model")
-
-     # Audio input component with streaming enabled
-     inp = gr.Audio(sources="microphone", streaming=True, type="numpy")
-     # Audio output component to play back the transformed audio
-     out = gr.Audio()
-
-     # Link the input to the processing function and output
-     inp.stream(transform_audio_chunk, inputs=[inp], outputs=[out])
-
-     # Link the model configuration button to the configure_model function
-     configure_button.click(configure_model, inputs=[model_file, index_file], outputs=[])

- if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
  import torch
+ import numpy as np
+ import datetime
+ import time  # needed for the time.time() latency timestamp below
  from infer_rvc_python import BaseLoader
+ import torchaudio

+ # Initialize the model
+ now = datetime.datetime.now()
+ timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
+ random_tag = "USER_" + str(timestamp)

+ converter = BaseLoader(only_cpu=False, hubert_path='./hubert_base.pt', rmvpe_path='./rmvpe.pt')
+ converter.apply_conf(
+     tag=random_tag,
+     file_model="./model.pth",
+     pitch_algo="rmvpe+",
+     pitch_lvl=0,
+     file_index="./model.index",
+     index_influence=0.80,
+     respiration_median_filtering=3,
+     envelope_ratio=0.25,
+     consonant_breath_protection=0.5,
+     resample_sr=0,
+ )

+ chunk_sec = 0.1
+ sr = 16000
+ chunk_len = int(sr * chunk_sec)
+ L = 16
+ stop_recording = False
+ first_output_latency = 0
+
+ # Initialize global audio buffer
+ audio_buffer = torch.zeros(0, dtype=torch.float32)

+ # Function to process audio chunks
+ def process_audio_chunk(audio, buffer_state):
+     global first_output_latency
+
+     if buffer_state is None:
+         buffer_state = torch.zeros(0, dtype=torch.float32)
+
      if audio is None:
+         return None, buffer_state
+
+     # Convert input audio to tensor
+     audio_data = torch.tensor(audio[1], dtype=torch.float32)
+     buffer_state = torch.cat((buffer_state, audio_data))
+
+     if len(buffer_state) < chunk_len:
+         return None, buffer_state

+     # Process the chunk
+     previous_chunk = buffer_state[:chunk_len]
+     buffer_state = buffer_state[chunk_len:]
+
+     input_chunk = torch.cat([torch.zeros(L * 2, dtype=torch.float32), previous_chunk])
+
+     with torch.inference_mode():
+         data = (input_chunk.numpy().astype(np.int16), sr)
+         result_array, sample_rate = converter.generate_from_cache(
+             audio_data=data,
+             tag=random_tag,
          )
+
+     if first_output_latency == 0:
+         first_output_latency = time.time()
+
+     output = torch.tensor(result_array, dtype=torch.float32)
+     output = output.squeeze(0).numpy()
+
+     return (audio[0], output), buffer_state

  # Gradio interface setup
  with gr.Blocks() as demo:
+     audio_input = gr.Audio(sources="microphone", streaming=True, type="numpy", label="Input Audio")
+     audio_output = gr.Audio(label="Output Audio")
+     buffer_state = gr.State()

+     audio_input.stream(process_audio_chunk, inputs=[audio_input, buffer_state], outputs=[audio_output, buffer_state])

+ demo.launch()
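
The substantive change in this commit is the switch from converting every raw microphone event directly to accumulating samples in a gr.State buffer and only invoking the converter once a full 0.1 s chunk at 16 kHz (chunk_len = 1600 samples) is available, carrying any remainder over to the next streaming event. The following is a minimal, self-contained sketch of that buffering pattern in isolation; buffer_and_chunk and fake_stream are hypothetical names used only for illustration, and a print call stands in for converter.generate_from_cache.

import numpy as np
import torch

CHUNK_SEC = 0.1
SR = 16000
CHUNK_LEN = int(SR * CHUNK_SEC)  # 1600 samples per emitted chunk


def buffer_and_chunk(samples, buffer_state):
    # Append incoming samples to the buffer; emit one fixed-size chunk when enough has accumulated.
    if buffer_state is None:
        buffer_state = torch.zeros(0, dtype=torch.float32)
    buffer_state = torch.cat((buffer_state, torch.tensor(samples, dtype=torch.float32)))
    if len(buffer_state) < CHUNK_LEN:
        return None, buffer_state  # not enough audio yet, keep buffering
    chunk = buffer_state[:CHUNK_LEN]
    buffer_state = buffer_state[CHUNK_LEN:]
    return chunk, buffer_state


def fake_stream():
    # Hypothetical stand-in for Gradio microphone events of varying length.
    rng = np.random.default_rng(0)
    for _ in range(10):
        yield rng.integers(-2000, 2000, size=700, dtype=np.int16)


state = None
for event in fake_stream():
    chunk, state = buffer_and_chunk(event, state)
    if chunk is not None:
        # In app.py this is where converter.generate_from_cache would run.
        print(f"emitting a {chunk.shape[0]}-sample chunk, {len(state)} samples left in buffer")

The (chunk, remainder) split mirrors what the new process_audio_chunk returns through buffer_state, so leftover samples persist between streaming callbacks instead of being dropped.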