Spaces:
Running
Running
import functools
import os

import cv2
import gradio as gr
import moviepy.video.io.ImageSequenceClip
import numpy as np
import soundfile as sf
import torch
from dotenv import load_dotenv
from moviepy.editor import AudioFileClip, ImageSequenceClip, VideoFileClip
from mutagen.mp3 import MP3
from PIL import Image
from pydub import AudioSegment
from transformers import (
    AutoModel,
    AutoProcessor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline,
)
# Let python-dotenv populate the process environment, then read the
# Hugging Face token from it. HF_TOKEN is None when the variable is unset.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
def resize(img_list):
    """Load every image and return it as a 256x256 RGB numpy array.

    Args:
        img_list: Iterable of image sources accepted by ``PIL.Image.open``
            (file paths or file-like objects).

    Returns:
        A list with one numpy array per input image, in input order.
        An empty input yields an empty list.
    """
    frames = []
    for item in img_list:
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been copied into the resized image.
        with Image.open(item) as im:
            # Normalize the mode first: grayscale, palette and RGBA sources
            # would otherwise produce arrays with differing channel counts,
            # and the video clip built from these frames needs them uniform.
            frame = im.convert("RGB").resize((256, 256), Image.LANCZOS)
        frames.append(np.array(frame))
    return frames
@functools.lru_cache(maxsize=1)
def _load_speecht5():
    """Load the SpeechT5 processor, TTS model and vocoder once per process."""
    processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
    # The task-specific class is required: the generic AutoModel resolves to
    # the bare SpeechT5 backbone, which does not provide generate_speech().
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    # Without a vocoder generate_speech() returns a mel spectrogram rather
    # than an audio waveform.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return processor, model, vocoder


def text2speech(text):
    """Synthesize ``text`` with Microsoft's SpeechT5 and save it as FLAC.

    Args:
        text: The text to be spoken.

    Returns:
        The path of the written audio file, always ``"speech_output.flac"``.
        The file is overwritten on every call.
    """
    # Cached: loading the checkpoints on every request is needlessly slow.
    processor, model, vocoder = _load_speecht5()

    # Preprocess the text input into token ids.
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech with an all-zero "default" speaker embedding. The
    # embedding width is exposed by the config as `speaker_embedding_dim`.
    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Save as a FLAC file at the 16 kHz rate used for the output.
    sf.write("speech_output.flac", speech.detach().cpu().numpy(), samplerate=16000)
    return "speech_output.flac"
def merge_audio_video(entities_num, resize_img_list, text_input):
    """Build a slideshow video of the frames narrated by synthesized speech.

    The frame rate is chosen so that ``entities_num`` frames span the whole
    narration.

    Args:
        entities_num: Number of frames to show; must be positive.
        resize_img_list: Non-empty list of equally sized image arrays.
        text_input: Text that is converted to speech for the audio track.

    Returns:
        A ``VideoFileClip`` of ``my_vid_tmp.mp4`` with ``audio.mp3`` attached
        as its audio track. The caller is responsible for closing it.

    Raises:
        ValueError: If there are no frames, or the synthesized audio is empty.
    """
    # Fail fast, before the expensive speech synthesis, when there is nothing
    # to show (this would otherwise end in a division by zero / empty clip).
    if entities_num <= 0 or not resize_img_list:
        raise ValueError("at least one image frame is required to build a video")

    speech = text2speech(text_input)
    wav_audio = AudioSegment.from_file(speech, "flac")
    # export() hands back the open output file; close it so the data is
    # flushed to disk before the file is read again below.
    wav_audio.export("audio.mp3", format="mp3").close()

    # Keep the fractional duration: truncating to whole seconds gives 0 for
    # clips shorter than one second and makes the video end before the audio.
    audio_length = MP3("audio.mp3").info.length
    if audio_length <= 0:
        raise ValueError("synthesized audio has zero length")

    fps = round(entities_num / audio_length, 5)

    # NOTE(review): the fixed file names below are shared by every call, so
    # overlapping calls would overwrite each other's files.
    clip = ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')

    videoclip = VideoFileClip('my_vid_tmp.mp4')
    audioclip = AudioFileClip('audio.mp3')
    return videoclip.set_audio(audioclip)
def engine(text_input):
    """Turn a piece of text into a narrated video of generated images.

    Named entities are extracted from the text, one image is generated per
    entity, and the images are combined with a spoken rendition of the text.

    Args:
        text_input: The story text entered by the user.

    Returns:
        The path of the rendered video, always ``'mergedvideo.mp4'``.

    Raises:
        gr.Error: If no entity is found in the text, so there is nothing
            to illustrate.
    """
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
    tagged_spans = ner(text_input)
    # Drop items containing None.
    # NOTE(review): presumably these are (text, label) pairs where a None
    # label marks untagged text - confirm against the model's output format.
    entities = [span for span in tagged_spans if None not in span]
    if not entities:
        # Without this guard the pipeline fails much later with an opaque
        # division by zero while computing the frame rate.
        raise gr.Error(
            "No named entities were found in the input text, "
            "so there is nothing to illustrate."
        )

    # Loaded only once we know there is something to draw.
    latentdiffusion = gr.Interface.load("spaces/multimodalart/latentdiffusion", api_key=HF_TOKEN)
    # The first element of each entity is used as the image prompt, and the
    # first element of each result as the generated image.
    img_list = [
        latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
        for ent in entities
    ]

    resize_img_list = resize(img_list)
    mergedclip = merge_audio_video(len(entities), resize_img_list, text_input)
    try:
        mergedclip.write_videofile('mergedvideo.mp4')
    finally:
        # Release the clip's resources even if encoding fails.
        mergedclip.close()
    return 'mergedvideo.mp4'
# Web UI: one multi-line text box in, one video out, served by `engine`.
# NOTE(review): the symbols in `title` and `description` look like emoji that
# were decoded with the wrong character set - restore them from the original
# file if possible.
_text_box = gr.Textbox(lines=5, label="Input Text")
_video_out = gr.Video(label='Final Merged Video')
_example_prompts = [
    ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
]
_interface = gr.Interface(
    fn=engine,
    inputs=_text_box,
    outputs=_video_out,
    title="AI Pipeline Multi Model πποΈπΏ Movie Maker π¬ π§ π¨",
    description="<div>πποΈπΏ AI Movie Maker - Comedy π¬ π§ π¨</div>",
    article="<br><div></div>",
    examples=_example_prompts,
)
# NOTE(review): `app` is bound to whatever launch() returns, not to the
# Interface object itself.
app = _interface.launch(debug=True)