le quy don committed
Update ban goc.py

ban goc.py CHANGED (+52 -9)
@@ -54,7 +54,7 @@ def reset_model():
         print(f"Failed to reinitialize model: {e}")
         return False
 
-def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
+def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor):
     if not inp_audio or not inp_text:
         gr.Warning("Please provide both reference audio and text to generate.")
         return None
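A side note on the new signature: because the click wiring later in this diff is updated in the same commit, adding speed_factor without a default is safe inside the Space. If the function is ever called from anywhere else, giving the argument a default that matches the slider's 1.0 would keep the old five-argument calls working. A suggestion only, not part of the commit:

def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w, speed_factor=1.0):
    ...  # body unchanged; 1.0 means no speed adjustment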
@@ -82,6 +82,11 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
         with torch.no_grad(): # Use no_grad for inference
             resource_context = infer_pipe.preprocess(file_content)
             wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
+
+        # Apply speed adjustment if needed
+        if speed_factor != 1.0:
+            wav_bytes = adjust_speed(wav_bytes, speed_factor)
+
         # Clean up memory after successful generation
         cleanup_memory()
         return wav_bytes
@@ -101,6 +106,43 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
         cleanup_memory()
         return None
 
+def adjust_speed(wav_bytes, speed_factor):
+    """Adjust the speed of the audio without changing pitch"""
+    try:
+        # Create temp file
+        temp_input = "temp_input.wav"
+        temp_output = "temp_output.wav"
+
+        with open(temp_input, "wb") as f:
+            f.write(wav_bytes)
+
+        # Load audio
+        audio = AudioSegment.from_file(temp_input)
+
+        # Apply speed change
+        if speed_factor != 1.0:
+            # Manually adjust frame rate to change speed without pitch alteration
+            new_frame_rate = int(audio.frame_rate * speed_factor)
+            audio = audio._spawn(audio.raw_data, overrides={
+                "frame_rate": new_frame_rate
+            }).set_frame_rate(audio.frame_rate)
+
+        # Export result
+        audio.export(temp_output, format="wav")
+
+        # Read and return
+        with open(temp_output, "rb") as f:
+            result = f.read()
+
+        # Clean up temp files
+        os.remove(temp_input)
+        os.remove(temp_output)
+
+        return result
+    except Exception as e:
+        print(f"Speed adjustment failed: {e}")
+        return wav_bytes # Return original if adjustment fails
+
 def cleanup_memory():
     """Clean up system memory."""
     gc.collect()
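Two observations on the new helper. It uses AudioSegment (pydub) and os without the diff adding imports, so those are presumably already imported in ban goc.py. Also, the frame-rate trick (_spawn with a scaled frame_rate, then set_frame_rate back to the original) changes tempo by resampling, which shifts pitch along with speed, so the docstring's "without changing pitch" does not strictly hold; and the fixed temp_input.wav / temp_output.wav names can collide if two generations run at once. A pitch-preserving, collision-safe variant could time-stretch with ffmpeg's atempo filter instead. The sketch below is an alternative under assumptions, not part of the commit: the name adjust_speed_atempo is hypothetical and it presumes ffmpeg is installed in the Space.

import os
import subprocess
import tempfile

def adjust_speed_atempo(wav_bytes, speed_factor):
    """Time-stretch WAV bytes with ffmpeg's atempo filter (tempo changes, pitch does not)."""
    # Unique temp files avoid clashes between concurrent requests
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fin, \
         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fout:
        fin.write(wav_bytes)
        in_path, out_path = fin.name, fout.name
    try:
        # The slider's 0.5-2.0 range sits inside atempo's supported range
        subprocess.run(
            ["ffmpeg", "-y", "-i", in_path, "-filter:a", f"atempo={speed_factor}", out_path],
            check=True, capture_output=True,
        )
        with open(out_path, "rb") as f:
            return f.read()
    except Exception as e:
        print(f"Speed adjustment failed: {e}")
        return wav_bytes  # fall back to the unmodified audio
    finally:
        for path in (in_path, out_path):
            if os.path.exists(path):
                os.remove(path)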
@@ -157,13 +199,6 @@ def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
         raise ValueError(f"Failed to process audio: {str(e)}")
 
 with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
-    gr.Markdown("# MegaTTS 3 Voice Cloning")
-    gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
-    gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
-    gr.Markdown("**Please use this Space responsibly and do not abuse it!**")
-    gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
-    gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
-
     with gr.Row():
         with gr.Column():
             reference_audio = gr.Audio(
@@ -199,6 +234,14 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
                 maximum=10.0,
                 step=0.1
             )
+            speed_factor = gr.Slider(
+                label="Speed Adjustment",
+                value=1.0,
+                minimum=0.5,
+                maximum=2.0,
+                step=0.1,
+                info="1.0 = normal speed, <1.0 = slower, >1.0 = faster"
+            )
 
             generate_btn = gr.Button("Generate Speech", variant="primary")
 
@@ -207,7 +250,7 @@ with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
 
     generate_btn.click(
         fn=generate_speech,
-        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w],
+        inputs=[reference_audio, text_input, infer_timestep, p_w, t_w, speed_factor],
         outputs=[output_audio]
     )
 
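Gradio maps the inputs list onto the function parameters by position, so appending speed_factor in both the signature and the click call keeps the wiring consistent, and the slider's default of 1.0 means adjust_speed is skipped unless the user moves it. The speed path itself can be exercised without loading the TTS model; a quick check that could be run next to the new function (reference.wav is a placeholder for any local WAV clip, and pydub is assumed importable in the Space):

import io
from pydub import AudioSegment

with open("reference.wav", "rb") as f:
    original_bytes = f.read()

faster_bytes = adjust_speed(original_bytes, 2.0)

original = AudioSegment.from_file(io.BytesIO(original_bytes), format="wav")
faster = AudioSegment.from_file(io.BytesIO(faster_bytes), format="wav")
# Lengths are in milliseconds; at speed_factor=2.0 the output should be roughly half as long
print(len(original), len(faster))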