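"""AI Movie Maker: turn a text prompt into a short, narrated video.

Pipeline: flair NER extracts entities from the prompt, the multimodalart
latent-diffusion Space renders one image per entity, a text-to-speech model
narrates the prompt, and moviepy stitches the frames and narration into one mp4.
"""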
import os

import gradio as gr
import numpy as np
import soundfile as sf
from dotenv import load_dotenv
from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
from mutagen.mp3 import MP3
from PIL import Image
from pydub import AudioSegment
from transformers import pipeline
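# Note: pydub requires an ffmpeg binary on the PATH for the mp3 export below;
# moviepy also relies on ffmpeg (bundled via imageio-ffmpeg in recent versions).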



# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

def resize(img_list):
    """Resize each image file to 256x256 and return the frames as numpy arrays."""
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        im_resized = im.resize((256, 256), Image.LANCZOS)
        resize_img_list.append(np.array(im_resized))
    return resize_img_list

def merge_audio_video(entities_num, resize_img_list, text_input):
    """Build a slideshow from the images and dub it with the generated narration."""
    # Synthesize the narration, then convert it to mp3 so its length can be read.
    speech = text2speech(text_input)
    flac_audio = AudioSegment.from_file(speech, "flac")
    flac_audio.export("audio.mp3", format="mp3")
    audio_length = MP3("audio.mp3").info.length

    # Spread the frames evenly across the narration, so the video
    # duration matches the audio duration.
    fps = round(entities_num / audio_length, 5)

    clip = ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')

    videoclip = VideoFileClip('my_vid_tmp.mp4')
    audioclip = AudioFileClip('audio.mp3')
    mergedclip = videoclip.set_audio(audioclip)

    return mergedclip


# Load the text-to-speech pipeline from Hugging Face. Note:
# facebook/fastspeech2-en-ljspeech is a fairseq export; if transformers cannot
# load it directly, a pipeline-compatible checkpoint such as
# facebook/mms-tts-eng can be substituted.
fastspeech = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech", token=HF_TOKEN)

def text2speech(text):
    """Synthesize speech for the given text and save it as a FLAC file."""
    speech_output = fastspeech(text)
    # The pipeline returns {"audio": ndarray, "sampling_rate": int}; raw numpy
    # bytes are not a valid FLAC container, so encode with soundfile instead.
    sf.write(
        "speech_output.flac",
        np.squeeze(speech_output["audio"]),
        speech_output["sampling_rate"],
    )
    return "speech_output.flac"


def engine(text_input):
    """Full pipeline: extract entities, render one image each, then merge with narration."""
    # Named-entity recognition: each detected entity becomes one video frame.
    ner = gr.load("huggingface/flair/ner-english-ontonotes-large", hf_token=HF_TOKEN)
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)
    if not entities:
        raise gr.Error("No entities found in the input text.")

    # Generate one 256x256 image per entity with latent diffusion.
    latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion", hf_token=HF_TOKEN)
    img_list = []
    for ent in entities:
        img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
        img_list.append(img)

    resize_img_list = resize(img_list)
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    mergedclip.write_videofile('mergedvideo.mp4')

    return 'mergedvideo.mp4'


app = gr.Interface(
    fn=engine,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=gr.Video(label='Final Merged Video'),
    title="AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨",
    description="<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>",
    examples=[
        ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
    ],
)
app.launch(debug=True)
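# To run locally (assuming this file is saved as app.py, the Hugging Face
# Spaces convention, with HF_TOKEN set in a .env file): `python app.py`.
# Gradio serves the UI on http://127.0.0.1:7860 by default.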