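"""Gradio app: turn a text prompt into a narrated story video.

GPT-2 writes a short story, Stable Diffusion illustrates each sentence,
SpeechT5 narrates the full text, and MoviePy assembles the images and audio into an MP4.
"""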
import gradio as gr
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
import numpy as np
import os
import tempfile
import moviepy.editor as mpe
import soundfile as sf
import nltk
from pydub import AudioSegment
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Ensure the NLTK sentence-tokenizer data is available ('punkt_tab' is required on newer NLTK releases)
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Story generator
story_generator = pipeline('text-generation', model='gpt2-large', device=0 if device=='cuda' else -1)

# Stable Diffusion model
sd_model_id = "runwayml/stable-diffusion-v1-5"
sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model_id, torch_dtype=torch_dtype)
sd_pipe = sd_pipe.to(device)

# Text-to-Speech model
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts", torch_dtype=torch_dtype)
tts_model = tts_model.to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", torch_dtype=torch_dtype)
vocoder = vocoder.to(device)

def text2speech(text):
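    """Synthesize `text` into a WAV file with SpeechT5 + HiFi-GAN and return its path."""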
    try:
        inputs = tts_processor(text=text, return_tensors="pt").to(device)
        # A zero speaker embedding is a placeholder and yields a flat, neutral voice;
        # it must match the model's dtype (float16 on GPU) to avoid a dtype mismatch.
        speaker_embeddings = torch.zeros((1, 512), device=device, dtype=tts_model.dtype)
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        
        output_path = os.path.join(tempfile.gettempdir(), "speech_output.wav")
        # soundfile cannot write float16 arrays, so cast to float32 before saving
        sf.write(output_path, speech.cpu().float().numpy(), samplerate=16000)
        return output_path
    except Exception as e:
        print(f"Error in text2speech: {str(e)}")
        raise

def generate_story(prompt):
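    """Expand the user's prompt into a story with GPT-2; the returned text includes the prompt."""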
    generated = story_generator(prompt, max_length=500, num_return_sequences=1)
    story = generated[0]['generated_text']
    return story

def split_story_into_sentences(story):
    sentences = nltk.sent_tokenize(story)
    return sentences

def generate_images(sentences):
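    """Render one Stable Diffusion image per sentence and return the saved file paths."""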
    images = []
    for idx, sentence in enumerate(sentences):
        image = sd_pipe(sentence).images[0]
        # Save image to temporary file
        temp_file = tempfile.NamedTemporaryFile(suffix=f"_{idx}.png", delete=False)
        image.save(temp_file.name)
        images.append(temp_file.name)
    return images

def generate_audio(story_text):
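    """Narrate the full story once; return the audio file path and its duration in seconds."""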
    audio_path = text2speech(story_text)
    audio = AudioSegment.from_file(audio_path)
    total_duration = len(audio) / 1000  # duration in seconds
    return audio_path, total_duration

def compute_sentence_durations(sentences, total_duration):
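    """Split the narration's total duration across sentences in proportion to word count,
    so each image stays on screen roughly while its sentence is being spoken."""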
    total_words = sum(len(sentence.split()) for sentence in sentences)
    sentence_durations = []
    for sentence in sentences:
        num_words = len(sentence.split())
        duration = total_duration * (num_words / total_words)
        sentence_durations.append(duration)
    return sentence_durations

def create_video(images, durations, audio_path):
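    """Turn each image into a still clip of its sentence's duration, concatenate the clips,
    and attach the narration audio."""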
    clips = []
    for image_path, duration in zip(images, durations):
        clip = mpe.ImageClip(image_path).set_duration(duration)
        clips.append(clip)
    video = mpe.concatenate_videoclips(clips, method='compose')
    audio = mpe.AudioFileClip(audio_path)
    video = video.set_audio(audio)
    # Save video
    output_path = os.path.join(tempfile.gettempdir(), "final_video.mp4")
    # A modest fps keeps the still-image clips in sync with the narration (fps=1 rounds
    # fractional durations); AAC audio keeps the MP4 playable in browsers.
    video.write_videofile(output_path, fps=24, codec='libx264', audio_codec='aac')
    return output_path

def process_pipeline(prompt, progress=gr.Progress(track_tqdm=True)):
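    """End-to-end pipeline: story -> sentences -> images -> narration -> video file."""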
    try:
        # gr.Progress is not a context manager; report progress by calling the tracker
        # passed in as `progress` with a completion fraction and a description.
        progress(0.0, desc="Generating Story")
        story = generate_story(prompt)
        progress(0.2, desc="Splitting Story into Sentences")
        sentences = split_story_into_sentences(story)
        progress(0.3, desc="Generating Images for Sentences")
        images = generate_images(sentences)
        progress(0.6, desc="Generating Audio")
        audio_path, total_duration = generate_audio(story)
        progress(0.8, desc="Computing Durations")
        durations = compute_sentence_durations(sentences, total_duration)
        progress(0.9, desc="Creating Video")
        video_path = create_video(images, durations, audio_path)
        return video_path
    except Exception as e:
        print(f"Error in process_pipeline: {str(e)}")
        raise gr.Error(f"An error occurred: {str(e)}")

title = """<h1 align="center">AI Story Video Generator ๐ŸŽฅ</h1>
<p align="center">
Generate a story from a prompt, create images for each sentence, and produce a video with narration!
</p>
"""

with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
    gr.HTML(title)
    
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Enter a Prompt", lines=2)
            generate_button = gr.Button("Generate Video")
            progress_bar = gr.Markdown("")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
    
    generate_button.click(fn=process_pipeline, inputs=prompt_input, outputs=video_output)

demo.launch(debug=True)