File size: 5,021 Bytes
9e908c5
02a76d9
9e1ef69
 
9e908c5
 
df17f8f
a0010c7
9e1ef69
 
 
 
02a76d9
 
9e1ef69
 
 
 
 
 
 
 
 
9e908c5
9e1ef69
02a76d9
 
 
 
 
9e1ef69
 
 
02a76d9
 
 
 
9e1ef69
 
02a76d9
83acbfc
a0010c7
02a76d9
a0010c7
 
 
 
83acbfc
02a76d9
 
 
 
 
 
 
9e1ef69
 
 
 
 
 
 
 
 
 
 
 
 
 
02a76d9
9e1ef69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df17f8f
9e1ef69
 
 
 
 
 
 
 
 
 
 
 
dafecc5
bd79c84
a0010c7
dafecc5
 
bd79c84
dafecc5
 
 
bd79c84
dafecc5
 
 
bd79c84
dafecc5
 
 
bd79c84
dafecc5
 
 
bd79c84
dafecc5
 
 
bd79c84
dafecc5
 
 
bd79c84
dafecc5
bd79c84
9e1ef69
a0010c7
9e1ef69
a0010c7
dafecc5
9e1ef69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
from transformers import pipeline
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
import numpy as np
import os
import tempfile
import moviepy.editor as mpe
import nltk
from pydub import AudioSegment
import warnings
import asyncio
import edge_tts

# Suppress every UserWarning raised anywhere in the process.
warnings.filterwarnings("ignore", category=UserWarning)

# Ensure NLTK sentence-tokenizer data is downloaded.
# 'punkt' is the resource older NLTK releases load; newer releases
# (3.8.2+/3.9) load 'punkt_tab' instead, and nltk.sent_tokenize raises
# LookupError without it. Fetch both so the app works on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Model setup: choose the compute device and matching tensor precision.
# Half precision is used only when CUDA is available; otherwise float32.
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Text-generation pipeline (GPT-2 large) used to write the story.
# The pipeline receives a device index: 0 when running on CUDA, -1 otherwise.
story_generator = pipeline(
    'text-generation',
    model='gpt2-large',
    device=(0 if device == 'cuda' else -1),
)

# Stable Diffusion pipeline used to illustrate each sentence; it is loaded
# with the dtype chosen above and moved onto the selected device.
sd_model_id = "runwayml/stable-diffusion-v1-5"
sd_pipe = StableDiffusionPipeline.from_pretrained(
    sd_model_id,
    torch_dtype=torch_dtype,
).to(device)

# Text-to-Speech function using edge_tts
def text2speech(text):
    """Synchronously synthesize ``text`` to speech.

    Runs the ``_text2speech_async`` coroutine to completion with
    ``asyncio.run`` and returns whatever it returns (the path of the
    generated audio file).

    Any exception is logged to stdout and then re-raised unchanged.
    """
    try:
        audio_file = asyncio.run(_text2speech_async(text))
    except Exception as err:
        print(f"Error in text2speech: {str(err)}")
        raise
    else:
        return audio_file

async def _text2speech_async(text):
    """Synthesize ``text`` with edge_tts and save it to a temporary MP3.

    Args:
        text: The text to be spoken (voice ``en-US-AriaNeural``).

    Returns:
        Path of the newly created ``.mp3`` file. The file is not deleted
        automatically; the caller owns it.

    Raises:
        Whatever ``edge_tts`` raises while saving. In that case the
        temporary file is removed before the exception propagates.
    """
    communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")

    # Only reserve a unique file name here; close our own handle before
    # edge_tts opens the path itself to write the audio.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # BaseException so the cleanup also runs on task cancellation;
        # the original exception is always re-raised.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # best-effort cleanup; do not mask the real error
        raise
    return tmp_path

def generate_story(prompt):
    """Generate story text from ``prompt`` using the module-level pipeline.

    A single sequence is requested with ``max_length=500``; the
    ``'generated_text'`` field of that sequence is returned.
    """
    outputs = story_generator(prompt, max_length=500, num_return_sequences=1)
    first_sequence = outputs[0]
    return first_sequence['generated_text']

def split_story_into_sentences(story):
    """Return the sentences of ``story`` as produced by ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(story)

def generate_images(sentences):
    """Render one Stable Diffusion image per sentence.

    Args:
        sentences: Iterable of prompt strings, one image is generated for
            each, in order.

    Returns:
        List of paths to the generated PNG files, in the same order as
        ``sentences``. The files are created with ``delete=False`` and are
        therefore left on disk for the caller to use and clean up.
    """
    images = []
    for idx, sentence in enumerate(sentences):
        image = sd_pipe(sentence).images[0]
        # Reserve a unique file name and close the handle right away so no
        # open file descriptor is leaked per generated image.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{idx}.png") as temp_file:
            image_path = temp_file.name
        image.save(image_path)
        images.append(image_path)
    return images

def generate_audio(story_text):
    """Narrate ``story_text`` and measure the narration length.

    Returns:
        A ``(audio_path, total_duration)`` tuple: the path returned by
        ``text2speech`` and the clip duration in seconds.
    """
    narration_path = text2speech(story_text)
    narration = AudioSegment.from_file(narration_path)
    # len() of the segment is divided by 1000 to express the duration in seconds.
    duration_seconds = len(narration) / 1000
    return narration_path, duration_seconds

def compute_sentence_durations(sentences, total_duration):
    """Split ``total_duration`` across sentences in proportion to word count.

    Args:
        sentences: Sequence of sentence strings. Words are counted with
            ``str.split()`` (whitespace separated).
        total_duration: Total time to distribute (same unit as the result).

    Returns:
        List with one duration per sentence, in order. If ``sentences`` is
        empty the list is empty. If no sentence contains any word, the
        duration is divided evenly instead of dividing by zero.
    """
    # Count words once per sentence rather than once for the total and
    # again for each share.
    word_counts = [len(sentence.split()) for sentence in sentences]
    if not word_counts:
        return []

    total_words = sum(word_counts)
    if total_words == 0:
        # Nothing to weight by (e.g. whitespace-only sentences): fall back
        # to an even split so callers still get one duration per sentence.
        even_share = total_duration / len(word_counts)
        return [even_share] * len(word_counts)

    return [total_duration * (count / total_words) for count in word_counts]

def create_video(images, durations, audio_path):
    """Assemble a narrated slideshow video from still images.

    Args:
        images: Paths of the image files, in playback order.
        durations: Display time for each image; paired positionally with
            ``images`` (extra entries on either side are ignored by ``zip``).
        audio_path: Path of the narration audio to attach to the video.

    Returns:
        Path of the rendered ``.mp4`` file. Every call writes to a new,
        uniquely named file in the temp directory, so simultaneous or
        successive requests never overwrite each other's output.

    Raises:
        ValueError: If there is no image/duration pair to render.
    """
    clips = [
        mpe.ImageClip(image_path).set_duration(duration)
        for image_path, duration in zip(images, durations)
    ]
    if not clips:
        raise ValueError("Cannot create a video without any images.")

    video = None
    audio = None
    try:
        video = mpe.concatenate_videoclips(clips, method='compose')
        audio = mpe.AudioFileClip(audio_path)
        video = video.set_audio(audio)

        # Reserve a unique output name; close our handle so the encoder can
        # write to the path itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_file:
            output_path = tmp_file.name

        # Save video
        video.write_videofile(output_path, fps=1, codec='libx264')
        return output_path
    finally:
        # Release readers/resources held by the clips whether or not
        # rendering succeeded.
        for resource in (video, audio, *clips):
            if resource is not None:
                resource.close()
    
def process_pipeline(prompt, progress=gr.Progress()):
    """Run the full prompt-to-video pipeline.

    Stages, in order: generate the story, split it into sentences, render
    one image per sentence, narrate the story, compute how long each image
    is shown, and assemble the video. ``progress`` is called before each
    stage with the completed fraction and a description.

    Args:
        prompt: Text prompt the story is generated from.
        progress: Gradio progress tracker used to report stage progress.

    Returns:
        Path of the generated video file.

    Raises:
        gr.Error: If any stage fails; the original exception is chained as
            the cause and its message is included.
    """
    total_steps = 6
    # Per-sentence images and the narration audio are only needed until the
    # video has been rendered; they are removed when this function exits.
    intermediate_files = []
    try:
        progress(0 / total_steps, desc="Generating Story")
        story = generate_story(prompt)

        progress(1 / total_steps, desc="Splitting Story into Sentences")
        sentences = split_story_into_sentences(story)

        progress(2 / total_steps, desc="Generating Images for Sentences")
        images = generate_images(sentences)
        intermediate_files.extend(images)

        progress(3 / total_steps, desc="Generating Audio")
        audio_path, total_duration = generate_audio(story)
        intermediate_files.append(audio_path)

        progress(4 / total_steps, desc="Computing Durations")
        durations = compute_sentence_durations(sentences, total_duration)

        progress(5 / total_steps, desc="Creating Video")
        video_path = create_video(images, durations, audio_path)

        progress(1.0, desc="Completed")

        return video_path
    except Exception as e:
        print(f"Error in process_pipeline: {str(e)}")
        raise gr.Error(f"An error occurred: {str(e)}") from e
    finally:
        for path in intermediate_files:
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or still
                # locked must not hide the pipeline's real outcome.
                pass
        
# HTML header shown at the top of the page: app title plus a one-line
# description. The trailing emoji is the movie camera (U+1F3A5).
title = """<h1 align="center">AI Story Video Generator 🎥</h1>
<p align="center">
Generate a story from a prompt, create images for each sentence, and produce a video with narration!
</p>
"""

# Build the Gradio UI. Components are attached to the layout by the nesting
# of the context managers below, so statement order here defines the page.
with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
    # Page header (raw HTML defined in `title`).
    gr.HTML(title)
    
    # One row with two columns: the first holds the prompt box and the
    # trigger button, the second holds the resulting video.
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Enter a Prompt", lines=2)
            generate_button = gr.Button("Generate Video")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
    
    # Clicking the button runs the whole pipeline: the textbox value is passed
    # to process_pipeline and its return value is sent to the video component.
    generate_button.click(fn=process_pipeline, inputs=prompt_input, outputs=video_output)

# Launch the app at module level (runs whenever this file is executed or
# imported), with Gradio's debug flag enabled.
demo.launch(debug=True)