import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from dotenv import load_dotenv
from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
from mutagen.mp3 import MP3
from PIL import Image
from pydub import AudioSegment
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")


def resize(img_list):
    # Resize each generated image to 256x256 and return them as numpy arrays
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        imResize = im.resize((256, 256), Image.LANCZOS)
        resize_img_list.append(np.array(imResize))
    return resize_img_list


def text2speech(text):
    # Using Microsoft's SpeechT5 model instead of FastSpeech2
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Preprocess the text input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech with a neutral (all-zero) speaker embedding;
    # SpeechT5 expects a 512-dimensional x-vector
    speaker_embeddings = torch.zeros((1, 512))
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save as a FLAC file (SpeechT5 outputs 16 kHz audio)
    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
    return "speech_output.flac"


def merge_audio_video(entities_num, resize_img_list, text_input):
    # Generate the narration, convert it to MP3, and measure its length
    speech = text2speech(text_input)
    flac_audio = AudioSegment.from_file(speech, "flac")
    flac_audio.export("audio.mp3", format="mp3")
    audio_length = int(MP3("audio.mp3").info.length)

    # Spread the images evenly across the audio duration
    fps = entities_num / audio_length
    fps = float(format(fps, '.5f'))

    clip = ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')

    # Attach the narration track to the image sequence
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    audioclip = AudioFileClip('audio.mp3')
    mergedclip = videoclip.set_audio(audioclip)
    return mergedclip


def engine(text_input):
    # Extract named entities from the input text
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)

    # Generate one image per entity with latent diffusion
    img_list = []
    latentdiffusion = gr.Interface.load("spaces/multimodalart/latentdiffusion", api_key=HF_TOKEN)
    for ent in entities:
        img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
        img_list.append(img)

    # Resize the images, merge them with the narration, and write the final video
    resize_img_list = resize(img_list)
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    mergedclip.write_videofile('mergedvideo.mp4')
    return 'mergedvideo.mp4'


app = gr.Interface(
    fn=engine,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=gr.Video(label='Final Merged Video'),
    description="