le quy don committed
Commit c609950 · verified · 1 Parent(s): f35a58c

Update app.py

Files changed (1)
  1. app.py +52 -2
app.py CHANGED
@@ -54,7 +54,7 @@ def reset_model():
         print(f"Failed to reinitialize model: {e}")
         return False
 
-def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
+def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor):
     if not inp_audio or not inp_text:
         gr.Warning("Please provide both reference audio and text to generate.")
         return None
@@ -82,6 +82,11 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
         with torch.no_grad():  # Use no_grad for inference
             resource_context = infer_pipe.preprocess(file_content)
             wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
+
+        # Apply speed adjustment if needed
+        if speed_factor != 1.0:
+            wav_bytes = adjust_speed(wav_bytes, speed_factor)
+
         # Clean up memory after successful generation
         cleanup_memory()
         return wav_bytes
@@ -101,6 +106,43 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
         cleanup_memory()
         return None
 
+def adjust_speed(wav_bytes, speed_factor):
+    """Adjust the speed of the audio without changing pitch"""
+    try:
+        # Create temp file
+        temp_input = "temp_input.wav"
+        temp_output = "temp_output.wav"
+
+        with open(temp_input, "wb") as f:
+            f.write(wav_bytes)
+
+        # Load audio
+        audio = AudioSegment.from_file(temp_input)
+
+        # Apply speed change
+        if speed_factor != 1.0:
+            # Manually adjust frame rate to change speed without pitch alteration
+            new_frame_rate = int(audio.frame_rate * speed_factor)
+            audio = audio._spawn(audio.raw_data, overrides={
+                "frame_rate": new_frame_rate
+            }).set_frame_rate(audio.frame_rate)
+
+        # Export result
+        audio.export(temp_output, format="wav")
+
+        # Read and return
+        with open(temp_output, "rb") as f:
+            result = f.read()
+
+        # Clean up temp files
+        os.remove(temp_input)
+        os.remove(temp_output)
+
+        return result
+    except Exception as e:
+        print(f"Speed adjustment failed: {e}")
+        return wav_bytes  # Return original if adjustment fails
+
 def cleanup_memory():
     """Clean up system memory."""
     gc.collect()
@@ -199,6 +241,14 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
                 maximum=10.0,
                 step=0.1
             )
+            speed_factor = gr.Slider(
+                label="Speed Adjustment",
+                value=1.0,
+                minimum=0.5,
+                maximum=2.0,
+                step=0.1,
+                info="1.0 = normal speed, <1.0 = slower, >1.0 = faster"
+            )
 
             generate_btn = gr.Button("Generate Speech", variant="primary")
 
@@ -207,7 +257,7 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
 
     generate_btn.click(
         fn=generate_speech,
-        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w],
+        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w, speed_factor],
         outputs=[output_audio]
     )
 
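
The speed control added in adjust_speed works by overriding the frame rate on pydub's raw sample data and then resampling back to the original rate, so the same samples simply play faster or slower. Below is a minimal standalone sketch of that idea, assuming pydub (with its ffmpeg dependency) is installed; the change_speed helper and the file paths are placeholders, not part of the commit. Note that straight resampling of this kind shifts pitch along with speed.

# Standalone sketch of the frame-rate speed trick used in adjust_speed.
# Assumes pydub is installed; change_speed and the file paths are illustrative only.
from pydub import AudioSegment

def change_speed(segment: AudioSegment, speed_factor: float) -> AudioSegment:
    # Overriding frame_rate makes the existing samples play speed_factor times
    # faster (or slower); set_frame_rate() then resamples back to the original
    # rate so players read the result at a standard sample rate.
    resampled = segment._spawn(
        segment.raw_data,
        overrides={"frame_rate": int(segment.frame_rate * speed_factor)},
    )
    return resampled.set_frame_rate(segment.frame_rate)

audio = AudioSegment.from_file("input.wav")
change_speed(audio, 1.25).export("output_fast.wav", format="wav")  # 25% faster
change_speed(audio, 0.8).export("output_slow.wav", format="wav")   # 20% slower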
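
On the UI side, the new speed_factor slider is wired into the existing generate_btn.click call: Gradio passes the current value of each component in the inputs list positionally, so the slider value arrives as the sixth argument of generate_speech. A minimal sketch of that wiring follows, assuming gradio is installed; the stub handler and the other component definitions are placeholders, while the speed slider's parameters are taken from the diff.

# Sketch of how the slider value flows into the click handler.
# Only speed_factor's parameters come from the commit; everything else is illustrative.
import gradio as gr

def generate_speech_stub(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor):
    # Values are passed positionally in the order of the inputs list below,
    # so speed_factor is received last.
    print(f"speed_factor = {speed_factor}")
    return None  # a real handler would return audio for output_audio

with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
    reference_audio = gr.Audio(label="Reference Audio", type="filepath")
    text_input = gr.Textbox(label="Text to Generate")
    infer_timestep = gr.Slider(label="Inference Timesteps", value=32, minimum=1, maximum=100, step=1)
    p_w = gr.Slider(label="p_w", value=1.0, minimum=0.1, maximum=10.0, step=0.1)
    t_w = gr.Slider(label="t_w", value=1.0, minimum=0.1, maximum=10.0, step=0.1)
    speed_factor = gr.Slider(
        label="Speed Adjustment",
        value=1.0,
        minimum=0.5,
        maximum=2.0,
        step=0.1,
        info="1.0 = normal speed, <1.0 = slower, >1.0 = faster",
    )
    output_audio = gr.Audio(label="Generated Audio")
    generate_btn = gr.Button("Generate Speech", variant="primary")

    generate_btn.click(
        fn=generate_speech_stub,
        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w, speed_factor],
        outputs=[output_audio],
    )

demo.launch()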