le quy don committed
Update ban goc.py

ban goc.py CHANGED (+52 -9)
@@ -54,7 +54,7 @@ def reset_model():
         print(f"Failed to reinitialize model: {e}")
         return False
 
-def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
+def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor):
     if not inp_audio or not inp_text:
         gr.Warning("Please provide both reference audio and text to generate.")
         return None
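A side note on the new signature: because the click wiring later in this diff is updated in the same commit, adding speed_factor without a default is safe inside the Space. If the function is ever called from anywhere else, giving the argument a default that matches the slider's 1.0 would keep the old five-argument calls working. A suggestion only, not part of the commit:

def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor=1.0):
    ...  # body unchanged; 1.0 means no speed adjustment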
@@ -82,6 +82,11 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
         with torch.no_grad(): # Use no_grad for inference
             resource_context = infer_pipe.preprocess(file_content)
             wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
+
+        # Apply speed adjustment if needed
+        if speed_factor != 1.0:
+            wav_bytes = adjust_speed(wav_bytes, speed_factor)
+
         # Clean up memory after successful generation
         cleanup_memory()
         return wav_bytes
@@ -101,6 +106,43 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
         cleanup_memory()
         return None
 
+def adjust_speed(wav_bytes, speed_factor):
+    """Adjust the speed of the audio without changing pitch"""
+    try:
+        # Create temp file
+        temp_input = "temp_input.wav"
+        temp_output = "temp_output.wav"
+
+        with open(temp_input, "wb") as f:
+            f.write(wav_bytes)
+
+        # Load audio
+        audio = AudioSegment.from_file(temp_input)
+
+        # Apply speed change
+        if speed_factor != 1.0:
+            # Manually adjust frame rate to change speed without pitch alteration
+            new_frame_rate = int(audio.frame_rate * speed_factor)
+            audio = audio._spawn(audio.raw_data, overrides={
+                "frame_rate": new_frame_rate
+            }).set_frame_rate(audio.frame_rate)
+
+        # Export result
+        audio.export(temp_output, format="wav")
+
+        # Read and return
+        with open(temp_output, "rb") as f:
+            result = f.read()
+
+        # Clean up temp files
+        os.remove(temp_input)
+        os.remove(temp_output)
+
+        return result
+    except Exception as e:
+        print(f"Speed adjustment failed: {e}")
+        return wav_bytes # Return original if adjustment fails
+
 def cleanup_memory():
     """Clean up system memory."""
     gc.collect()
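Two observations on the new helper. It uses AudioSegment (pydub) and os without the diff adding imports, so those are presumably already imported in ban goc.py. Also, the frame-rate trick (_spawn with a scaled frame_rate, then set_frame_rate back to the original) changes tempo by resampling, which shifts pitch along with speed, so the docstring's "without changing pitch" does not strictly hold; and the fixed temp_input.wav / temp_output.wav names can collide if two generations run at once. A pitch-preserving, collision-safe variant could time-stretch with ffmpeg's atempo filter instead. The sketch below is an alternative under assumptions, not part of the commit: the name adjust_speed_atempo is hypothetical and it presumes ffmpeg is installed in the Space.

import os
import subprocess
import tempfile

def adjust_speed_atempo(wav_bytes, speed_factor):
    """Time-stretch WAV bytes with ffmpeg's atempo filter (tempo changes, pitch does not)."""
    # Unique temp files avoid clashes between concurrent requests
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fin, \
         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fout:
        fin.write(wav_bytes)
        in_path, out_path = fin.name, fout.name
    try:
        # The slider's 0.5-2.0 range sits inside atempo's supported range
        subprocess.run(
            ["ffmpeg", "-y", "-i", in_path, "-filter:a", f"atempo={speed_factor}", out_path],
            check=True, capture_output=True,
        )
        with open(out_path, "rb") as f:
            return f.read()
    except Exception as e:
        print(f"Speed adjustment failed: {e}")
        return wav_bytes  # fall back to the unmodified audio
    finally:
        for path in (in_path, out_path):
            if os.path.exists(path):
                os.remove(path)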
@@ -157,13 +199,6 @@ def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
         raise ValueError(f"Failed to process audio: {str(e)}")
 
 with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
-    gr.Markdown("# MegaTTS 3 Voice Cloning")
-    gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
-    gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
-    gr.Markdown("**Please use this Space responsibly and do not abuse it!**")
-    gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
-    gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
-
     with gr.Row():
         with gr.Column():
             reference_audio = gr.Audio(
@@ -199,6 +234,14 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
                 maximum=10.0,
                 step=0.1
             )
+            speed_factor = gr.Slider(
+                label="Speed Adjustment",
+                value=1.0,
+                minimum=0.5,
+                maximum=2.0,
+                step=0.1,
+                info="1.0 = normal speed, <1.0 = slower, >1.0 = faster"
+            )
 
             generate_btn = gr.Button("Generate Speech", variant="primary")
 
@@ -207,7 +250,7 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
 
     generate_btn.click(
         fn=generate_speech,
-        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w],
+        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w, speed_factor],
         outputs=[output_audio]
     )
 
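Gradio maps the inputs list onto the function parameters by position, so appending speed_factor in both the signature and the click call keeps the wiring consistent, and the slider's default of 1.0 means adjust_speed is skipped unless the user moves it. The speed path itself can be exercised without loading the TTS model; a quick check that could be run next to the new function (reference.wav is a placeholder for any local WAV clip, and pydub is assumed importable in the Space):

import io
from pydub import AudioSegment

with open("reference.wav", "rb") as f:
    original_bytes = f.read()

faster_bytes = adjust_speed(original_bytes, 2.0)

original = AudioSegment.from_file(io.BytesIO(original_bytes), format="wav")
faster = AudioSegment.from_file(io.BytesIO(faster_bytes), format="wav")
# Lengths are in milliseconds; at speed_factor=2.0 the output should be roughly half as long
print(len(original), len(faster))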