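"""AI Movie Maker: a Gradio app that turns a short story into a narrated video.

Pipeline: extract named entities from the input text (flair NER), generate one
image per entity (multimodalart/latentdiffusion Space), synthesize narration
with SpeechT5 TTS, then merge the image sequence and the audio into an MP4
with moviepy.
"""
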
import gradio as gr
from PIL import Image
from pydub import AudioSegment
from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
import numpy as np
import os
from mutagen.mp3 import MP3
import soundfile as sf
from dotenv import load_dotenv
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import tempfile

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("API_KEY")

def cleanup_temp_files():
    # Remove intermediate artifacts only. The final mergedvideo.mp4 is kept:
    # engine() returns its path to Gradio, which still needs to serve the file.
    temp_files = [
        os.path.join(tempfile.gettempdir(), 'speech_output.flac'),
        os.path.join(tempfile.gettempdir(), 'audio.mp3'),
        os.path.join(tempfile.gettempdir(), 'my_vid_tmp.mp4'),
    ]
    for file in temp_files:
        if os.path.exists(file):
            try:
                os.remove(file)
            except OSError:
                pass

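# Resize each generated image to 256x256 so ImageSequenceClip receives
# uniformly sized frames, returned as numpy arrays.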
def resize(img_list):
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        imResize = im.resize((256, 256), Image.LANCZOS)
        resize_img_list.append(np.array(imResize))
    return resize_img_list

def text2speech(text):
    try:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        # AutoModel loads the bare SpeechT5Model, which has no generate_speech();
        # the TTS head plus a HiFi-GAN vocoder are needed to get a waveform.
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        
        inputs = processor(text=text, return_tensors="pt")
        # Zero speaker embeddings are valid but give a flat, generic voice.
        # (The config attribute is speaker_embedding_dim, not speaker_embedding_size.)
        speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))
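        # Sketch for a more natural voice (assumes the `datasets` library and the
        # Matthijs/cmu-arctic-xvectors dataset, as in the SpeechT5 model card):
        #   from datasets import load_dataset
        #   xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        #   speaker_embeddings = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)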
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        
        output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
        sf.write(output_path, speech.numpy(), samplerate=16000)
        return output_path
    except Exception as e:
        print(f"Error in text2speech: {str(e)}")
        raise

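# Pace the slideshow to the narration: with N images over L seconds of audio,
# fps = N / L makes the image sequence span the full soundtrack.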
def merge_audio_video(entities_num, resize_img_list, text_input):
    try:
        speech = text2speech(text_input)
        wav_audio = AudioSegment.from_file(speech, "flac")
        
        audio_path = os.path.join(tempfile.gettempdir(), "audio.mp3")
        wav_audio.export(audio_path, format="mp3")
        
        audio_length = max(int(MP3(audio_path).info.length), 1)  # guard against zero-length audio
        fps = max(entities_num / audio_length, 1.0)  # ensure fps is at least 1
        fps = round(fps, 5)
        
        temp_video = os.path.join(tempfile.gettempdir(), "my_vid_tmp.mp4")
        clip = ImageSequenceClip(resize_img_list, fps=fps)
        clip.write_videofile(temp_video, codec='libx264', fps=fps)
        
        videoclip = VideoFileClip(temp_video)
        audioclip = AudioFileClip(audio_path)
        mergedclip = videoclip.set_audio(audioclip)
        
        output_path = os.path.join(tempfile.gettempdir(), "mergedvideo.mp4")
        mergedclip.write_videofile(output_path)
        
        # Clean up clips
        videoclip.close()
        audioclip.close()
        mergedclip.close()
        
        return output_path
    except Exception as e:
        print(f"Error in merge_audio_video: {str(e)}")
        raise
    finally:
        cleanup_temp_files()

# Load models outside the Blocks context
ner = gr.load("huggingface/flair/ner-english-ontonotes-large", hf_token=HF_TOKEN)
latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion", hf_token=HF_TOKEN)

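# End-to-end pipeline: NER -> one image per entity -> resize -> merged video
# with TTS narration. Assumes the loaded flair interface yields entity tuples
# whose first element is the entity text, and that the latentdiffusion Space
# returns its generated image as the first output.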
def engine(text_input):
    try:
        entities = ner(text_input)
        entities = [tupl for tupl in entities if None not in tupl]
        entities_num = len(entities)
        
        if entities_num == 0:
            raise ValueError("No entities found in the input text")
        
        img_list = []
        for ent in entities:
            img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
            img_list.append(img)
        
        resize_img_list = resize(img_list)
        output_path = merge_audio_video(entities_num, resize_img_list, text_input)
        
        return output_path
    except Exception as e:
        print(f"Error in engine: {str(e)}")
        raise gr.Error(f"An error occurred: {str(e)}")
    finally:
        cleanup_temp_files()

with gr.Blocks() as app:
    gr.Markdown("# AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨")
    gr.Markdown("<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>")
    text_input = gr.Textbox(lines=5, label="Input Text")
    output_video = gr.Video(label='Final Merged Video')
    examples = gr.Examples(
        examples=[
            ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
        ],
        inputs=text_input
    )
    submit_button = gr.Button("Generate Video")
    submit_button.click(fn=engine, inputs=text_input, outputs=output_video)

    gr.Markdown("<br><div></div>")

app.launch(
    debug=True,
    share=True,  # Enable sharing
    server_name="0.0.0.0",  # Listen on all interfaces
    server_port=7860  # Specify port
)