"""Streamlit app: upload an audio/video file and run it through a TTS model."""

import subprocess
from io import BytesIO

import moviepy.editor as mp
import soundfile as sf
import streamlit as st
from TTS.api import TTS

# Set up the model for text-to-speech (TTS)
MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"  # Example TTS model; adjust as needed
tts = TTS(model_name=MODEL_NAME, progress_bar=True, gpu=False)


def extract_audio_from_mp4(mp4_file):
    """Extract the audio track of an MP4 into ``temp_audio.wav``.

    moviepy is tried first; if it fails for any reason, the ``ffmpeg``
    command-line tool is used as a fallback.

    Args:
        mp4_file: Path to the MP4 file. A buffer-like upload exposing
            ``getbuffer()`` is also accepted by the ffmpeg fallback, in which
            case it is first written to ``uploaded_video.mp4``.

    Returns:
        The path of the written WAV file, or ``None`` if both extraction
        methods failed (an error is shown in the Streamlit UI).
    """
    audio_path = "temp_audio.wav"

    try:
        # Attempt to use moviepy to extract audio
        video = mp.VideoFileClip(mp4_file)
        try:
            if video.audio is None:
                raise ValueError("video has no audio track")
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the reader held by the clip whether or not the
            # export succeeded.
            video.close()
        return audio_path
    except Exception as e:  # any moviepy failure triggers the ffmpeg fallback
        st.error(f"Error extracting audio with moviepy: {e}. Trying ffmpeg extraction...")

    # Fallback to using ffmpeg for audio extraction if moviepy fails
    try:
        if hasattr(mp4_file, "getbuffer"):
            # Buffer-like upload: ffmpeg needs a real file on disk.
            mp4_path = "uploaded_video.mp4"
            with open(mp4_path, "wb") as f:
                f.write(mp4_file.getbuffer())
        else:
            # Already a filesystem path (this is what main() passes).
            mp4_path = mp4_file

        subprocess.run(
            [
                "ffmpeg",
                "-y",  # overwrite a stale/partial temp_audio.wav instead of prompting
                "-i", mp4_path,
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "44100",
                "-ac", "2",
                audio_path,
            ],
            check=True,  # a non-zero ffmpeg exit must not be reported as success
        )
        return audio_path
    except Exception as ffmpeg_error:
        st.error(f"Error with ffmpeg extraction: {ffmpeg_error}")
        return None


def load_audio(file):
    """Read an audio file with soundfile.

    Args:
        file: Path (or file-like object) passed straight to ``sf.read``.

    Returns:
        ``(audio_data, sample_rate)`` on success, ``(None, None)`` if reading
        failed (an error is shown in the Streamlit UI).
    """
    try:
        audio_data, sample_rate = sf.read(file)
        return audio_data, sample_rate
    except Exception as e:
        st.error(f"Error loading audio: {e}")
        return None, None


def save_audio(output_audio, sample_rate):
    """Write ``output_audio`` to ``output_cloned_voice.wav`` and return that path."""
    output_path = "output_cloned_voice.wav"
    sf.write(output_path, output_audio, sample_rate)
    return output_path


def main():
    """Render the Streamlit page: upload, preview, process and download."""
    st.title("Voice Cloning Tool")
    st.markdown("Upload an MP4, WAV, or MP3 file, and get the cloned voice output.")

    # File upload
    audio_file = st.file_uploader("Upload your audio file", type=["wav", "mp3", "mp4"])
    if audio_file is None:
        return

    # Bind these up front so a failed MP4 extraction falls through to the
    # "No audio data" message instead of raising NameError.
    audio_data, sample_rate = None, None

    if audio_file.type == "video/mp4":
        # Save the uploaded file to a temporary location
        with open("uploaded_video.mp4", "wb") as f:
            f.write(audio_file.getbuffer())

        # Extract audio from MP4
        audio_path = extract_audio_from_mp4("uploaded_video.mp4")
        if audio_path:
            st.audio(audio_path, format="audio/wav")
            # Load audio for TTS processing
            audio_data, sample_rate = load_audio(audio_path)
    else:
        # For audio files directly (WAV/MP3)
        st.audio(audio_file, format=f"audio/{audio_file.type.split('/')[1]}")

        # The upload is stored under a fixed name regardless of its format.
        with open("temp_audio.wav", "wb") as f:
            f.write(audio_file.getbuffer())
        audio_data, sample_rate = load_audio("temp_audio.wav")

    if audio_data is None:
        st.error("No audio data to process.")
        return

    try:
        st.text("Processing your input...")
        # NOTE(review): Coqui's TTS.tts() is documented as taking text, and
        # MODEL_NAME looks like a single-speaker model; passing a decoded
        # waveform here is probably not a supported cloning path. The model's
        # output rate may also differ from the input's sample_rate. Confirm
        # the intended API (e.g. a speaker_wav-capable model) before relying
        # on this step.
        output_audio = tts.tts(audio_data)
        output_path = save_audio(output_audio, sample_rate)
    except Exception as e:
        st.error(f"Error processing audio: {e}")
        return

    st.audio(output_path, format="audio/wav")

    # Streamlit does not serve arbitrary files from the working directory, so
    # a markdown link to the path would be dead; hand the bytes to a download
    # button instead.
    with open(output_path, "rb") as f:
        st.download_button(
            label="Download Cloned Voice",
            data=f.read(),
            file_name=output_path,
            mime="audio/wav",
        )


if __name__ == "__main__":
    main()