"""Gradio app: turn input text into a short narrated video.

Pipeline: run NER over the text, generate one latent-diffusion image per
detected entity, assemble the images into a clip paced to a text-to-speech
narration of the input, and return the merged mp4.
"""
# BUG FIX: the original called os.environ.get() without importing os.
import os

import gradio as gr
import moviepy.video.io.ImageSequenceClip
import numpy as np
import cv2  # kept from original; not referenced in this file
import mutagen  # kept from original; MP3 below is the part actually used
from moviepy.editor import AudioFileClip, VideoFileClip
from mutagen.mp3 import MP3
from PIL import Image
from pydub import AudioSegment

# Hugging Face API token for the hosted inference endpoints.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Hosted TTS model, loaded once at import time (module-level side effect,
# as in the original).
fastspeech = gr.Interface.load(
    "huggingface/facebook/fastspeech2-en-ljspeech", api_key=HF_TOKEN
)


def resize(img_list):
    """Open each image path in *img_list* and return 256x256 numpy arrays.

    Args:
        img_list: iterable of image file paths.

    Returns:
        list of numpy arrays, each resized to 256x256.
    """
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        # Image.ANTIALIAS is deprecated in newer Pillow (alias of LANCZOS);
        # kept for compatibility with the Pillow version this app targets.
        imResize = im.resize((256, 256), Image.ANTIALIAS)
        resize_img_list.append(np.array(imResize))
    return resize_img_list


def text2speech(text):
    """Synthesize *text* with FastSpeech2; returns the audio file path (FLAC)."""
    return fastspeech(text)


def merge_audio_video(entities_num, resize_img_list, text_input):
    """Build a video from the entity images, paced to a TTS narration.

    Args:
        entities_num: number of images (one per detected entity).
        resize_img_list: list of 256x256 image arrays (from resize()).
        text_input: the original text, narrated via text2speech().

    Returns:
        moviepy clip with the narration set as its audio track.
    """
    speech = text2speech(text_input)
    wav_audio = AudioSegment.from_file(speech, "flac")
    wav_audio.export("audio.mp3", format="mp3")

    audio_length = int(MP3("audio.mp3").info.length)
    # Spread the images evenly over the narration: images / seconds = fps.
    fps = entities_num / audio_length
    fps = float(format(fps, ".5f"))

    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(
        resize_img_list, fps=fps
    )
    clip.write_videofile("my_vid_tmp.mp4")

    videoclip = VideoFileClip("my_vid_tmp.mp4")
    audioclip = AudioFileClip("audio.mp3")
    # Removed dead locals from the original (`duration`, and `frame_count`
    # which actually held fps, not a frame count).
    return videoclip.set_audio(audioclip)


def engine(text_input):
    """Full pipeline: text -> entities -> images -> narrated video file.

    Args:
        text_input: free text from the UI textbox.

    Returns:
        path of the merged video file ('mergedvideo.mp4').
    """
    ner = gr.Interface.load(
        "huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN
    )
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)

    # BUG FIX: the original had the closing quote after api_key=HF_TOKEN,
    # so the token was embedded in the space name and never passed as a
    # keyword argument. Also hoisted the load out of the loop so the
    # remote interface is constructed once, not once per entity.
    latentdiffusion = gr.Interface.load(
        "spaces/multimodalart/latentdiffusion", api_key=HF_TOKEN
    )
    img_list = []
    for ent in entities:
        img = latentdiffusion(ent[0], "50", "256", "256", "1", 10)[0]
        img_list.append(img)

    resize_img_list = resize(img_list)
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    mergedclip.to_videofile("mergedvideo.mp4")
    return "mergedvideo.mp4"


# NOTE(review): the description string was truncated in the source file;
# the text below is a placeholder — restore the original copy if available.
app = gr.Interface(
    engine,
    gr.inputs.Textbox(lines=5, label="Input Text"),
    gr.outputs.Video(type=None, label="Final Merged video"),
    description="Generate a narrated video from input text.",
)