MegaTTS3-Voice-Cloning

Running

App Files Files Community

mrfakename committed 28 days ago

Commit

1ca3adb

1 Parent(s): cb4e009

fix stability

Browse files

Files changed (4) hide show

app.py +113 -8
packages.txt +1 -0
requirements.txt +2 -1
tts/frontend_function.py +20 -1

app.py CHANGED Viewed

@@ -4,6 +4,11 @@ import os
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 import gradio as gr
 import traceback
 from huggingface_hub import snapshot_download
 from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
@@ -33,6 +38,21 @@ print("Initializing MegaTTS3 model...")
 infer_pipe = MegaTTS3DiTInfer()
 print("Model loaded successfully!")
 @spaces.GPU
 def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
     if not inp_audio or not inp_text:
@@ -42,25 +62,110 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
     try:
         print(f"Generating speech with: {inp_text}...")
-        # Convert and prepare audio
-        convert_to_wav(inp_audio)
-        wav_path = os.path.splitext(inp_audio)[0] + '.wav'
-        cut_wav(wav_path, max_len=28)
         # Read audio file
         with open(wav_path, 'rb') as file:
             file_content = file.read()
-        # Generate speech
-        resource_context = infer_pipe.preprocess(file_content)
-        wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
-        return wav_bytes
     except Exception as e:
         traceback.print_exc()
         gr.Warning(f"Speech generation failed: {str(e)}")
         return None
 with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
     gr.Markdown("# MegaTTS 3 Voice Cloning")

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 import gradio as gr
 import traceback
+import gc
+import numpy as np
+import librosa
+from pydub import AudioSegment
+from pydub.effects import normalize
 from huggingface_hub import snapshot_download
 from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
 infer_pipe = MegaTTS3DiTInfer()
 print("Model loaded successfully!")
+def reset_model():
+    """Reset the inference pipeline to recover from CUDA errors."""
+    global infer_pipe
+    try:
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        print("Reinitializing MegaTTS3 model...")
+        infer_pipe = MegaTTS3DiTInfer()
+        print("Model reinitialized successfully!")
+        return True
+    except Exception as e:
+        print(f"Failed to reinitialize model: {e}")
+        return False
 @spaces.GPU
 def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
     if not inp_audio or not inp_text:
     try:
         print(f"Generating speech with: {inp_text}...")
+        # Check CUDA availability and clear cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            print(f"CUDA device: {torch.cuda.get_device_name()}")
+        else:
+            gr.Warning("CUDA is not available. Please check your GPU setup.")
+            return None
+        # Robustly preprocess audio
+        try:
+            processed_audio_path = preprocess_audio_robust(inp_audio)
+            # Use existing cut_wav for final trimming
+            cut_wav(processed_audio_path, max_len=28)
+            wav_path = processed_audio_path
+        except Exception as audio_error:
+            gr.Warning(f"Audio preprocessing failed: {str(audio_error)}")
+            return None
         # Read audio file
         with open(wav_path, 'rb') as file:
             file_content = file.read()
+        # Generate speech with proper error handling
+        try:
+            resource_context = infer_pipe.preprocess(file_content)
+            wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
+            # Clean up memory after successful generation
+            cleanup_memory()
+            return wav_bytes
+        except RuntimeError as cuda_error:
+            if "CUDA" in str(cuda_error):
+                print(f"CUDA error detected: {cuda_error}")
+                # Try to reset the model to recover from CUDA errors
+                if reset_model():
+                    gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
+                else:
+                    gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
+                return None
+            else:
+                raise cuda_error
     except Exception as e:
         traceback.print_exc()
         gr.Warning(f"Speech generation failed: {str(e)}")
+        # Clean up CUDA memory on any error
+        cleanup_memory()
         return None
+def cleanup_memory():
+    """Clean up GPU and system memory."""
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
+    """Robustly preprocess audio to prevent CUDA errors."""
+    try:
+        # Load with pydub for robust format handling
+        audio = AudioSegment.from_file(audio_path)
+        # Convert to mono if stereo
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+        # Limit duration to prevent memory issues
+        if len(audio) > max_duration * 1000:  # pydub uses milliseconds
+            audio = audio[:max_duration * 1000]
+        # Normalize audio to prevent clipping
+        audio = normalize(audio)
+        # Convert to target sample rate
+        audio = audio.set_frame_rate(target_sr)
+        # Export to temporary WAV file with specific parameters
+        temp_path = audio_path.replace(os.path.splitext(audio_path)[1], '_processed.wav')
+        audio.export(
+            temp_path,
+            format="wav",
+            parameters=["-acodec", "pcm_s16le", "-ac", "1", "-ar", str(target_sr)]
+        )
+        # Validate the audio with librosa
+        wav, sr = librosa.load(temp_path, sr=target_sr, mono=True)
+        # Check for invalid values
+        if np.any(np.isnan(wav)) or np.any(np.isinf(wav)):
+            raise ValueError("Audio contains NaN or infinite values")
+        # Ensure reasonable amplitude range
+        if np.max(np.abs(wav)) < 1e-6:
+            raise ValueError("Audio signal is too quiet")
+        # Re-save the validated audio
+        import soundfile as sf
+        sf.write(temp_path, wav, sr)
+        return temp_path
+    except Exception as e:
+        print(f"Audio preprocessing failed: {e}")
+        raise ValueError(f"Failed to process audio: {str(e)}")
 with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
     gr.Markdown("# MegaTTS 3 Voice Cloning")

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt CHANGED Viewed

@@ -16,4 +16,5 @@ torchdiffeq==0.2.5
 openai-whisper==20240930
 httpx==0.28.1
 gradio==5.23.1
-hf-transfer

 openai-whisper==20240930
 httpx==0.28.1
 gradio==5.23.1
+hf-transfer
+soundfile

tts/frontend_function.py CHANGED Viewed

@@ -16,6 +16,7 @@ import torch
 import torch.nn.functional as F
 import whisper
 import librosa
 from copy import deepcopy
 from tts.utils.text_utils.ph_tone_convert import split_ph_timestamp, split_ph
 from tts.utils.audio_utils.align import mel2token_to_dur
@@ -39,8 +40,26 @@ def g2p(self, text_inp):
 ''' Get phoneme2mel align of prompt speech '''
 def align(self, wav):
     with torch.inference_mode():
         whisper_wav = librosa.resample(wav, orig_sr=self.sr, target_sr=16000)
-        mel = torch.FloatTensor(whisper.log_mel_spectrogram(whisper_wav).T).to(self.device)[None].transpose(1,2)
         prompt_max_frame = mel.size(2) // self.fm * self.fm
         mel = mel[:, :, :prompt_max_frame]
         token = torch.LongTensor([[798]]).to(self.device)

 import torch.nn.functional as F
 import whisper
 import librosa
+import numpy as np
 from copy import deepcopy
 from tts.utils.text_utils.ph_tone_convert import split_ph_timestamp, split_ph
 from tts.utils.audio_utils.align import mel2token_to_dur
 ''' Get phoneme2mel align of prompt speech '''
 def align(self, wav):
     with torch.inference_mode():
+        # Validate input audio
+        if np.any(np.isnan(wav)) or np.any(np.isinf(wav)):
+            raise ValueError("Input audio contains NaN or infinite values")
         whisper_wav = librosa.resample(wav, orig_sr=self.sr, target_sr=16000)
+        # Validate resampled audio
+        if np.any(np.isnan(whisper_wav)) or np.any(np.isinf(whisper_wav)):
+            raise ValueError("Resampled audio contains NaN or infinite values")
+        # Get mel spectrogram with validation
+        mel_spec = whisper.log_mel_spectrogram(whisper_wav)
+        if np.any(np.isnan(mel_spec)) or np.any(np.isinf(mel_spec)):
+            raise ValueError("Mel spectrogram contains NaN or infinite values")
+        mel = torch.FloatTensor(mel_spec.T).to(self.device)[None].transpose(1,2)
+        # Validate tensor before further processing
+        if torch.any(torch.isnan(mel)) or torch.any(torch.isinf(mel)):
+            raise ValueError("Mel tensor contains NaN or infinite values")
         prompt_max_frame = mel.size(2) // self.fm * self.fm
         mel = mel[:, :, :prompt_max_frame]
         token = torch.LongTensor([[798]]).to(self.device)