import gradio as gr
from transformers import pipeline
from diffusers import StableDiffusionPipeline
import torch
import os
import tempfile
import moviepy.editor as mpe
import nltk
from pydub import AudioSegment
import warnings
import asyncio
import edge_tts
warnings.filterwarnings("ignore", category=UserWarning)
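# NOTE: assumed dependencies (not pinned in the original): this script expects
# moviepy 1.x, since the `moviepy.editor` module was removed in moviepy 2.0,
# plus edge-tts, pydub (which needs ffmpeg on PATH), transformers, and diffusers.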
# Ensure NLTK data is downloaded
nltk.download('punkt')
# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
# Story generator
story_generator = pipeline(
    'text-generation',
    model='gpt2-large',
    device=0 if device == 'cuda' else -1
)
# Stable Diffusion model
sd_model_id = "runwayml/stable-diffusion-v1-5"
sd_pipe = StableDiffusionPipeline.from_pretrained(
    sd_model_id,
    torch_dtype=torch_dtype
)
sd_pipe = sd_pipe.to(device)
# Text-to-Speech function using edge_tts
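# (edge_tts calls Microsoft's online Edge TTS service, so this step needs
# network access at runtime.)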
def text2speech(text):
    """Synthesize narration for `text` and return the path to an MP3 file."""
    try:
        output_path = asyncio.run(_text2speech_async(text))
        return output_path
    except Exception as e:
        print(f"Error in text2speech: {str(e)}")
        raise
async def _text2speech_async(text):
    """Save edge-tts narration to a temporary MP3 and return its path."""
    communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
def generate_story(prompt):
    """Generate a short story continuation of `prompt` with GPT-2."""
    generated = story_generator(prompt, max_length=500, num_return_sequences=1)
    story = generated[0]['generated_text']
    return story
def split_story_into_sentences(story):
    sentences = nltk.sent_tokenize(story)
    return sentences
def generate_images(sentences):
    images = []
    for idx, sentence in enumerate(sentences):
        # Prompts past the text encoder's 77-token limit are truncated by the
        # pipeline, so very long sentences lose their tail.
        image = sd_pipe(sentence).images[0]
        # Save image to a temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f"_{idx}.png")
        image.save(temp_file.name)
        images.append(temp_file.name)
    return images
def generate_audio(story_text):
    audio_path = text2speech(story_text)
    audio = AudioSegment.from_file(audio_path)
    total_duration = len(audio) / 1000  # duration in seconds
    return audio_path, total_duration
def compute_sentence_durations(sentences, total_duration):
    total_words = sum(len(sentence.split()) for sentence in sentences)
    sentence_durations = []
    for sentence in sentences:
        num_words = len(sentence.split())
        duration = total_duration * (num_words / total_words)
        sentence_durations.append(duration)
    return sentence_durations
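# Worked example: with a 10 s narration and two sentences of 6 and 4 words,
# the proportional split above yields durations of 6.0 s and 4.0 s.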
def create_video(images, durations, audio_path):
    clips = []
    for image_path, duration in zip(images, durations):
        clip = mpe.ImageClip(image_path).set_duration(duration)
        clips.append(clip)
    video = mpe.concatenate_videoclips(clips, method='compose')
    audio = mpe.AudioFileClip(audio_path)
    video = video.set_audio(audio)
    # Save video; 24 fps preserves fractional clip durations (fps=1 would round
    # each still to whole seconds and drift out of sync with the narration)
    output_path = os.path.join(tempfile.gettempdir(), "final_video.mp4")
    video.write_videofile(output_path, fps=24, codec='libx264')
    return output_path
def process_pipeline(prompt, progress=gr.Progress()):
    try:
        total_steps = 6
        step = 0
        progress(step / total_steps, desc="Generating Story")
        story = generate_story(prompt)
        step += 1
        progress(step / total_steps, desc="Splitting Story into Sentences")
        sentences = split_story_into_sentences(story)
        step += 1
        progress(step / total_steps, desc="Generating Images for Sentences")
        images = generate_images(sentences)
        step += 1
        progress(step / total_steps, desc="Generating Audio")
        audio_path, total_duration = generate_audio(story)
        step += 1
        progress(step / total_steps, desc="Computing Durations")
        durations = compute_sentence_durations(sentences, total_duration)
        step += 1
        progress(step / total_steps, desc="Creating Video")
        video_path = create_video(images, durations, audio_path)
        step += 1
        progress(1.0, desc="Completed")
        return video_path
    except Exception as e:
        print(f"Error in process_pipeline: {str(e)}")
        raise gr.Error(f"An error occurred: {str(e)}")
title = """<h1 align="center">AI Story Video Generator 🎥</h1>
<p align="center">
Generate a story from a prompt, create images for each sentence, and produce a video with narration!
</p>
"""
with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
    gr.HTML(title)
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Enter a Prompt", lines=2)
            generate_button = gr.Button("Generate Video")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
    generate_button.click(fn=process_pipeline, inputs=prompt_input, outputs=video_output)
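# Optional: for long-running generations, enabling Gradio's request queue
# avoids HTTP timeouts (assumes a Gradio version that supports .queue()).
demo.queue()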
demo.launch(debug=True)