Spaces:
Running
Running
File size: 3,350 Bytes
9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 df17f8f 83acbfc 9e908c5 df17f8f a1c58e6 9e908c5 df17f8f 9e908c5 83acbfc 9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 a1c58e6 9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 df17f8f 9e908c5 83acbfc 9e908c5 df17f8f 9e908c5 df17f8f 83acbfc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import gradio as gr
import moviepy.video.io.ImageSequenceClip
from PIL import Image
from pydub import AudioSegment
from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
import numpy as np
import os
from mutagen.mp3 import MP3
import cv2
from dotenv import load_dotenv
from transformers import pipeline, AutoProcessor, AutoModel
import torch
import soundfile as sf
# Load environment variables via python-dotenv, then read the Hugging Face
# access token from the process environment (None when it is not set).
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
def resize(img_list, size=(256, 256)):
    """Open every image in *img_list* and resize it to a common frame size.

    Args:
        img_list: Iterable of image sources accepted by ``PIL.Image.open``
            (e.g. file paths).
        size: Target ``(width, height)`` in pixels. Defaults to ``(256, 256)``,
            the size the original implementation hard-coded.

    Returns:
        A list of NumPy arrays, one per input image, in input order.
    """
    resized_frames = []
    for item in img_list:
        # Image.open keeps the underlying file open; the context manager
        # releases it as soon as the resized copy has been produced.
        with Image.open(item) as im:
            resized = im.resize(size, Image.LANCZOS)
        resized_frames.append(np.array(resized))
    return resized_frames
# Holds the SpeechT5 processor/model/vocoder once they have been loaded so
# that repeated requests do not reload the checkpoints.
_SPEECHT5_CACHE = {}


def _load_speecht5():
    """Return ``(processor, model, vocoder)`` for SpeechT5, loading them once."""
    if not _SPEECHT5_CACHE:
        # Imported here so the block does not depend on a change to the
        # file-level import list.
        from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan

        processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        # AutoModel would return the bare SpeechT5Model, which has no
        # generate_speech(); the text-to-speech head is required.
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        # Without a vocoder generate_speech() yields a mel spectrogram,
        # not an audio waveform.
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Populate the cache in one step so a failed load never leaves it
        # half-filled.
        _SPEECHT5_CACHE.update(processor=processor, model=model, vocoder=vocoder)
    return (
        _SPEECHT5_CACHE["processor"],
        _SPEECHT5_CACHE["model"],
        _SPEECHT5_CACHE["vocoder"],
    )


def text2speech(text):
    """Synthesize *text* with Microsoft's SpeechT5 and save it as FLAC.

    Args:
        text: The text to speak.

    Returns:
        The path of the written audio file, ``"speech_output.flac"``
        (overwritten on every call).
    """
    processor, model, vocoder = _load_speecht5()

    # Preprocess the text input into token ids.
    inputs = processor(text=text, return_tensors="pt")

    # Placeholder all-zero speaker embedding; the config attribute holding
    # its width is `speaker_embedding_dim`.
    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Save as a FLAC file at the 16 kHz rate used by the original code.
    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
    return "speech_output.flac"
def merge_audio_video(entities_num, resize_img_list, text_input):
    """Build a slideshow video from the frames and narrate it with TTS audio.

    The frame rate is chosen so that ``entities_num`` frames span the length
    of the generated narration.

    Args:
        entities_num: Number of frames to spread over the audio duration.
        resize_img_list: Frames passed to ``ImageSequenceClip``.
        text_input: Text that is converted to speech for the audio track.

    Returns:
        The video clip with the narration set as its audio track.

    Raises:
        ValueError: If there are no frames or the narration has no duration.

    Side effects:
        Writes ``audio.mp3`` and ``my_vid_tmp.mp4`` in the working directory
        (plus whatever ``text2speech`` writes).
    """
    # Fail before the expensive speech synthesis when there is nothing to show.
    if entities_num <= 0:
        raise ValueError("at least one image frame is required to build the video")

    speech = text2speech(text_input)
    wav_audio = AudioSegment.from_file(speech, "flac")
    # export() hands back the open output file; close it so the handle is
    # not leaked.
    wav_audio.export("audio.mp3", format="mp3").close()

    # Keep the fractional duration: truncating to int made clips shorter than
    # one second divide by zero and made the video overrun longer audio.
    audio_length = MP3("audio.mp3").info.length
    if audio_length <= 0:
        raise ValueError("generated narration has zero duration")

    fps = round(entities_num / audio_length, 5)

    clip = ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')

    videoclip = VideoFileClip('my_vid_tmp.mp4')
    audioclip = AudioFileClip('audio.mp3')
    mergedclip = videoclip.set_audio(audioclip)
    return mergedclip
def engine(text_input):
    """Turn *text_input* into a narrated video, one image per named entity.

    Pipeline: remote NER model -> one latent-diffusion image per entity ->
    resize -> slideshow merged with text-to-speech narration.

    Args:
        text_input: The story text entered by the user.

    Returns:
        The path of the rendered video, ``'mergedvideo.mp4'``.

    Raises:
        ValueError: If the NER step yields no usable entities, since no
            frames could be generated.
    """
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
    # Drop results that contain a None field.
    # NOTE(review): assumes each result is tuple-like with the text to
    # illustrate at index 0 — confirm against the NER interface output.
    entities = [tupl for tupl in ner(text_input) if None not in tupl]
    if not entities:
        # Without this guard the failure surfaces later as an opaque error
        # while computing the frame rate.
        raise ValueError("no named entities found in the input text; cannot generate video frames")
    entities_num = len(entities)

    latentdiffusion = gr.Interface.load("spaces/multimodalart/latentdiffusion", api_key=HF_TOKEN)
    # The first element of each generation result is used as the image.
    img_list = [
        latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
        for ent in entities
    ]

    resize_img_list = resize(img_list)
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    mergedclip.write_videofile('mergedvideo.mp4')
    return 'mergedvideo.mp4'
# Build the Gradio UI around `engine` (text in, video out) and start serving.
# NOTE: `app` is bound to whatever launch() returns, not to the Interface.
# NOTE(review): the symbols in `description` and `title` look like
# mis-decoded emoji — restore them from the original file if available.
app = gr.Interface(
    fn=engine,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=gr.Video(label='Final Merged Video'),
    description="<div>πποΈπΏ AI Movie Maker - Comedy π¬ π§ π¨</div>",
    examples=[
        ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
    ],
    title="AI Pipeline Multi Model πποΈπΏ Movie Maker π¬ π§ π¨",
    article="<br><div></div>"
).launch(debug=True)