# Amelia-James — "Update app.py" — 550fb6b (verified)
import streamlit as st
import moviepy.editor as mp
import soundfile as sf
from io import BytesIO
import subprocess
from TTS.api import TTS
# Set up the model for text-to-speech (TTS)
# MODEL_NAME is the model identifier string handed to the TTS constructor below.
MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC" # Example TTS model; adjust as needed
# Module-level TTS instance shared by the rest of the file; it is created with
# the progress bar enabled and the GPU disabled.
# NOTE(review): this runs at import time, so the model is presumably loaded
# again whenever Streamlit re-executes the script — confirm whether caching
# the instance is wanted.
tts = TTS(model_name=MODEL_NAME, progress_bar=True, gpu=False)
# Function to extract audio from MP4 file using ffmpeg if moviepy fails
def extract_audio_from_mp4(mp4_file):
    """Extract the audio track of an MP4 into ``temp_audio.wav``.

    moviepy is tried first; if it raises for any reason, the ``ffmpeg``
    command-line tool is used as a fallback.

    Args:
        mp4_file: Either a filesystem path to the video, or an uploaded
            file object exposing ``getbuffer()``. In the latter case the
            bytes are first written to ``uploaded_video.mp4`` for ffmpeg.

    Returns:
        The path of the written WAV file, or ``None`` when both extraction
        strategies fail (an error is shown in the Streamlit UI).
    """
    audio_path = "temp_audio.wav"

    try:
        # Attempt to use moviepy to extract audio
        video = mp.VideoFileClip(mp4_file)
        try:
            audio = video.audio
            audio.write_audiofile(audio_path)
        finally:
            # Release the reader held by the clip, even if writing the audio failed.
            video.close()
        return audio_path
    except Exception as e:
        st.error(f"Error extracting audio with moviepy: {e}. Trying ffmpeg extraction...")

    # Fallback to using ffmpeg for audio extraction if moviepy fails
    try:
        if hasattr(mp4_file, "getbuffer"):
            # Uploaded file object: ffmpeg needs a real file on disk.
            mp4_path = "uploaded_video.mp4"
            with open(mp4_path, "wb") as f:
                f.write(mp4_file.getbuffer())
        else:
            # Already a path: use it directly rather than calling getbuffer() on it.
            mp4_path = str(mp4_file)

        subprocess.run(
            [
                "ffmpeg",
                "-y",  # overwrite a stale temp_audio.wav instead of prompting
                "-i", mp4_path,
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "44100",
                "-ac", "2",
                audio_path,
            ],
            check=True,  # a non-zero ffmpeg exit must count as a failure
        )
        return audio_path
    except Exception as ffmpeg_error:
        st.error(f"Error with ffmpeg extraction: {ffmpeg_error}")
        return None
# Function to load audio file
def load_audio(file):
    """Read an audio file with soundfile.

    Args:
        file: Whatever ``sf.read`` accepts as its first argument.

    Returns:
        A ``(audio_data, sample_rate)`` pair as returned by ``sf.read``, or
        ``(None, None)`` if reading raised; in that case the error is shown
        in the Streamlit UI.
    """
    try:
        samples, rate = sf.read(file)
    except Exception as e:
        st.error(f"Error loading audio: {e}")
        return None, None
    else:
        return samples, rate
# Function to save the generated audio to a file
def save_audio(output_audio, sample_rate):
    """Write *output_audio* to ``output_cloned_voice.wav`` and return that path.

    Args:
        output_audio: Audio samples passed to ``sf.write`` as the data.
        sample_rate: Sample rate passed to ``sf.write``.

    Returns:
        The fixed output path ``"output_cloned_voice.wav"``.
    """
    destination = "output_cloned_voice.wav"
    sf.write(destination, data=output_audio, samplerate=sample_rate)
    return destination
# Streamlit app
def main():
    """Render the Streamlit page: upload a file, load its audio, run the model.

    Flow:
      1. Show the title, instructions and a file uploader (WAV/MP3/MP4).
      2. For an MP4 upload, save it to ``uploaded_video.mp4`` and extract its
         audio; for other uploads, save the bytes to ``temp_audio.wav``.
      3. Load the audio, pass it to the module-level ``tts`` object, save the
         result and offer it for playback and download.

    Any failure is reported through ``st.error`` rather than raised.
    """
    st.title("Voice Cloning Tool")
    st.markdown("Upload an MP4, WAV, or MP3 file, and get the cloned voice output.")

    # File upload
    audio_file = st.file_uploader("Upload your audio file", type=["wav", "mp3", "mp4"])
    if audio_file is None:
        return

    # Start from "nothing loaded" so a failed MP4 extraction reaches the
    # error message below instead of raising NameError on unbound names.
    audio_data, sample_rate = None, None

    if audio_file.type == "video/mp4":
        # Save the uploaded file to a temporary location
        with open("uploaded_video.mp4", "wb") as f:
            f.write(audio_file.getbuffer())

        # Extract audio from MP4
        audio_path = extract_audio_from_mp4("uploaded_video.mp4")
        if audio_path:
            st.audio(audio_path, format="audio/wav")
            # Load audio for TTS processing
            audio_data, sample_rate = load_audio(audio_path)
    else:
        # For audio files directly (WAV/MP3)
        st.audio(audio_file, format=f"audio/{audio_file.type.split('/')[1]}")
        # Persist the upload so it can be read back from disk
        with open("temp_audio.wav", "wb") as f:
            f.write(audio_file.getbuffer())
        audio_data, sample_rate = load_audio("temp_audio.wav")

    if audio_data is None:
        st.error("No audio data to process.")
        return

    # Perform voice cloning (This assumes your TTS model supports some form of input)
    try:
        st.text("Processing your input...")
        # NOTE(review): the TTS API appears to expect text here, not audio
        # samples, and the configured model looks single-speaker — confirm
        # the intended cloning model and inputs.
        output_audio = tts.tts(audio_data)
        # NOTE(review): the output is saved at the *input* file's sample rate;
        # verify this matches the model's output rate.
        output_path = save_audio(output_audio, sample_rate)

        st.audio(output_path, format="audio/wav")
        # A markdown link to a local path is not served by Streamlit, so hand
        # the bytes to a download button instead.
        with open(output_path, "rb") as f:
            st.download_button(
                "Download Cloned Voice",
                data=f.read(),
                file_name=output_path,
                mime="audio/wav",
            )
    except Exception as e:
        st.error(f"Error processing audio: {e}")
# Run the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()