le quy don committed on
Commit
cbe9884
·
verified ·
1 Parent(s): dd1c07c

Update ban goc.py

Browse files
Files changed (1) hide show
  1. ban goc.py +52 -9
ban goc.py CHANGED
@@ -54,7 +54,7 @@ def reset_model():
54
  print(f"Failed to reinitialize model: {e}")
55
  return False
56
 
57
- def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
58
  if not inp_audio or not inp_text:
59
  gr.Warning("Please provide both reference audio and text to generate.")
60
  return None
@@ -82,6 +82,11 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
82
  with torch.no_grad(): # Use no_grad for inference
83
  resource_context = infer_pipe.preprocess(file_content)
84
  wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
 
 
 
 
 
85
  # Clean up memory after successful generation
86
  cleanup_memory()
87
  return wav_bytes
@@ -101,6 +106,43 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
101
  cleanup_memory()
102
  return None
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  def cleanup_memory():
105
  """Clean up system memory."""
106
  gc.collect()
@@ -157,13 +199,6 @@ def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
157
  raise ValueError(f"Failed to process audio: {str(e)}")
158
 
159
  with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
160
- gr.Markdown("# MegaTTS 3 Voice Cloning")
161
- gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
162
- gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
163
- gr.Markdown("**Please use this Space responsibly and do not abuse it!**")
164
- gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
165
- gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
166
-
167
  with gr.Row():
168
  with gr.Column():
169
  reference_audio = gr.Audio(
@@ -199,6 +234,14 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
199
  maximum=10.0,
200
  step=0.1
201
  )
 
 
 
 
 
 
 
 
202
 
203
  generate_btn = gr.Button("Generate Speech", variant="primary")
204
 
@@ -207,7 +250,7 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
207
 
208
  generate_btn.click(
209
  fn=generate_speech,
210
- inputs=[reference_audio, text_input, infer_timestep, p_w, t_w],
211
  outputs=[output_audio]
212
  )
213
 
 
54
  print(f"Failed to reinitialize model: {e}")
55
  return False
56
 
57
+ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor):
58
  if not inp_audio or not inp_text:
59
  gr.Warning("Please provide both reference audio and text to generate.")
60
  return None
 
82
  with torch.no_grad(): # Use no_grad for inference
83
  resource_context = infer_pipe.preprocess(file_content)
84
  wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
85
+
86
+ # Apply speed adjustment if needed
87
+ if speed_factor != 1.0:
88
+ wav_bytes = adjust_speed(wav_bytes, speed_factor)
89
+
90
  # Clean up memory after successful generation
91
  cleanup_memory()
92
  return wav_bytes
 
106
  cleanup_memory()
107
  return None
108
 
109
def adjust_speed(wav_bytes, speed_factor):
    """Change the playback speed of encoded audio by resampling.

    The audio is reinterpreted at ``frame_rate * speed_factor`` and then
    converted back to its original frame rate. NOTE: this technique shifts
    the pitch together with the speed (faster playback sounds higher,
    slower playback sounds lower); it is not a pitch-preserving time stretch.

    Args:
        wav_bytes: Encoded audio as a bytes object (written to a ``.wav``
            file before being decoded).
        speed_factor: Playback-rate multiplier. ``1.0`` leaves the audio
            untouched, values below ``1.0`` slow it down, values above
            ``1.0`` speed it up. Non-positive values are rejected.

    Returns:
        The re-encoded WAV bytes, or the original ``wav_bytes`` unchanged when
        no adjustment is needed, the factor is invalid, or processing fails.
    """
    import tempfile

    try:
        # 1.0 is a no-op, and a factor <= 0 would yield an invalid frame rate.
        if speed_factor == 1.0 or speed_factor <= 0:
            return wav_bytes

        # A private scratch directory per call keeps concurrent requests from
        # overwriting each other's files, and the context manager removes the
        # files even when decoding or exporting raises.
        with tempfile.TemporaryDirectory() as work_dir:
            input_path = os.path.join(work_dir, "temp_input.wav")
            output_path = os.path.join(work_dir, "temp_output.wav")

            with open(input_path, "wb") as f:
                f.write(wav_bytes)

            audio = AudioSegment.from_file(input_path)

            # Relabel the raw samples with a scaled frame rate (this is what
            # changes the speed), then resample back to the original rate so
            # downstream consumers see the sample rate they expect.
            new_frame_rate = int(audio.frame_rate * speed_factor)
            audio = audio._spawn(
                audio.raw_data,
                overrides={"frame_rate": new_frame_rate},
            ).set_frame_rate(audio.frame_rate)

            audio.export(output_path, format="wav")

            with open(output_path, "rb") as f:
                return f.read()
    except Exception as e:
        # Best effort: speed adjustment is optional, so fall back to the
        # unmodified audio rather than failing the whole generation.
        print(f"Speed adjustment failed: {e}")
        return wav_bytes
145
+
146
  def cleanup_memory():
147
  """Clean up system memory."""
148
  gc.collect()
 
199
  raise ValueError(f"Failed to process audio: {str(e)}")
200
 
201
  with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
 
 
 
 
 
 
 
202
  with gr.Row():
203
  with gr.Column():
204
  reference_audio = gr.Audio(
 
234
  maximum=10.0,
235
  step=0.1
236
  )
237
+ speed_factor = gr.Slider(
238
+ label="Speed Adjustment",
239
+ value=1.0,
240
+ minimum=0.5,
241
+ maximum=2.0,
242
+ step=0.1,
243
+ info="1.0 = normal speed, <1.0 = slower, >1.0 = faster"
244
+ )
245
 
246
  generate_btn = gr.Button("Generate Speech", variant="primary")
247
 
 
250
 
251
  generate_btn.click(
252
  fn=generate_speech,
253
+ inputs=[reference_audio, text_input, infer_timestep, p_w, t_w, speed_factor],
254
  outputs=[output_audio]
255
  )
256