Spaces:

awacke1
/

AI-MovieMaker-Comedy

Running

App Files Files Community

AI-MovieMaker-Comedy / app.py

awacke1

Update app.py

9e1ef69 verified 8 months ago

raw

history blame

5.34 kB

	import gradio as gr
	from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from diffusers import StableDiffusionPipeline
	import torch
	from PIL import Image
	import numpy as np
	import os
	import tempfile
	import moviepy.editor as mpe
	import soundfile as sf
	import nltk
	from pydub import AudioSegment
	import warnings

	# Silence UserWarning noise emitted while the libraries below are loading.
	warnings.filterwarnings("ignore", category=UserWarning)

	# Ensure NLTK sentence-tokenizer data is downloaded.
	# 'punkt' serves older NLTK releases; newer releases load the tokenizer from
	# 'punkt_tab' instead, so fetch both to keep nltk.sent_tokenize working
	# regardless of the installed version.
	nltk.download('punkt')
	nltk.download('punkt_tab')

	# Initialize models: run on the GPU in half precision when one is available,
	# otherwise fall back to full precision on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	torch_dtype = torch.float16 if device == "cuda" else torch.float32

	# Story generator (transformers pipelines take a device index; -1 means CPU)
	story_generator = pipeline('text-generation', model='gpt2-large', device=0 if device=='cuda' else -1)

	# Stable Diffusion model
	sd_model_id = "runwayml/stable-diffusion-v1-5"
	sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model_id, torch_dtype=torch_dtype)
	sd_pipe = sd_pipe.to(device)

	# Text-to-Speech model (acoustic model + HiFi-GAN vocoder)
	tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts", torch_dtype=torch_dtype)
	tts_model = tts_model.to(device)
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", torch_dtype=torch_dtype)
	vocoder = vocoder.to(device)

	def text2speech(text):
	    """Synthesize *text* with SpeechT5 and save it as a 16 kHz WAV file.

	    An all-zero 512-dimensional speaker embedding is used as the voice.

	    Returns:
	        Path of the written file, ``speech_output.wav`` in the system temp
	        directory. The path is fixed, so each call overwrites the previous
	        output.

	    Raises:
	        Exception: any failure is printed and then re-raised unchanged.
	    """
	    try:
	        inputs = tts_processor(text=text, return_tensors="pt").to(device)
	        # The embedding must share the model's dtype: on CUDA the model is
	        # loaded in float16 and a float32 embedding causes a dtype mismatch.
	        speaker_embeddings = torch.zeros((1, 512), device=device, dtype=tts_model.dtype)
	        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

	        # soundfile cannot write float16 samples, so upcast before saving
	        # (a no-op when the model already runs in float32).
	        waveform = speech.detach().float().cpu().numpy()
	        output_path = os.path.join(tempfile.gettempdir(), "speech_output.wav")
	        sf.write(output_path, waveform, samplerate=16000)
	        return output_path
	    except Exception as e:
	        print(f"Error in text2speech: {str(e)}")
	        raise

	def generate_story(prompt):
	    """Return the text produced by the story generator for *prompt*.

	    A single sequence is requested, with ``max_length=500``.
	    """
	    outputs = story_generator(prompt, max_length=500, num_return_sequences=1)
	    first_result = outputs[0]
	    return first_result['generated_text']

	def split_story_into_sentences(story):
	    """Split *story* into sentences with NLTK's sentence tokenizer."""
	    return nltk.sent_tokenize(story)

	def generate_images(sentences):
	    """Render one Stable Diffusion image per sentence.

	    Each image is saved as a PNG in a temporary file that is deliberately
	    left on disk so it can be read back later by path.

	    Returns:
	        List of image file paths, in the same order as *sentences*.
	    """
	    image_paths = []
	    for idx, sentence in enumerate(sentences):
	        image = sd_pipe(sentence).images[0]
	        # delete=False keeps the file after the handle is closed; the
	        # context manager closes the handle so descriptors are not leaked.
	        with tempfile.NamedTemporaryFile(suffix=f"_{idx}.png", delete=False) as temp_file:
	            image.save(temp_file, format="PNG")
	        image_paths.append(temp_file.name)
	    return image_paths

	def generate_audio(story_text):
	    """Narrate *story_text* and measure the resulting audio.

	    Returns:
	        Tuple ``(audio_path, total_duration)`` with the duration in seconds.
	    """
	    wav_path = text2speech(story_text)
	    narration = AudioSegment.from_file(wav_path)
	    # The segment length is divided by 1000 to express it in seconds.
	    seconds = len(narration) / 1000
	    return wav_path, seconds

	def compute_sentence_durations(sentences, total_duration):
	    """Split *total_duration* across *sentences* in proportion to word count.

	    Args:
	        sentences: list of sentence strings.
	        total_duration: overall duration to distribute, in the caller's unit.

	    Returns:
	        List with one duration per sentence, in order. If the sentences
	        contain no words at all (for example only empty strings), the
	        duration is shared equally instead of dividing by zero. An empty
	        input yields an empty list.
	    """
	    word_counts = [len(sentence.split()) for sentence in sentences]
	    total_words = sum(word_counts)
	    if total_words == 0:
	        # Nothing to weight by: fall back to an even split. With no
	        # sentences the comprehension is empty, so no division happens.
	        return [total_duration / len(sentences) for _ in sentences]
	    return [total_duration * (count / total_words) for count in word_counts]

	def create_video(images, durations, audio_path):
	    """Assemble a narrated slideshow video from still images.

	    Args:
	        images: image file paths, one per slide.
	        durations: display time for each image, paired with *images* in order.
	        audio_path: path of the narration audio file.

	    Returns:
	        Path of the written file, ``final_video.mp4`` in the system temp
	        directory.

	    Raises:
	        ValueError: if there are no image/duration pairs to render.
	    """
	    clips = [
	        mpe.ImageClip(image_path).set_duration(duration)
	        for image_path, duration in zip(images, durations)
	    ]
	    if not clips:
	        raise ValueError("No images were provided to build the video.")

	    video = None
	    audio = None
	    try:
	        video = mpe.concatenate_videoclips(clips, method='compose')
	        audio = mpe.AudioFileClip(audio_path)
	        video = video.set_audio(audio)
	        # Save video
	        output_path = os.path.join(tempfile.gettempdir(), "final_video.mp4")
	        video.write_videofile(output_path, fps=1, codec='libx264')
	        return output_path
	    finally:
	        # Release readers held by the clips, whether or not rendering
	        # succeeded, so repeated requests do not accumulate open resources.
	        if audio is not None:
	            audio.close()
	        if video is not None:
	            video.close()
	        for clip in clips:
	            clip.close()

	def process_pipeline(prompt, progress=gr.Progress(track_tqdm=True)):
	    """Run the full prompt-to-video pipeline and return the video path.

	    Steps: generate a story, split it into sentences, render one image per
	    sentence, narrate the story, time each sentence, and assemble the video.

	    Args:
	        prompt: text prompt the story is generated from.
	        progress: Gradio progress tracker, called before each stage to
	            update the status shown to the user.

	    Raises:
	        gr.Error: wraps any failure so it is displayed in the UI.
	    """
	    try:
	        # gr.Progress is a callable tracker, not a context manager, and its
	        # constructor takes no description; report each stage by calling it.
	        progress(0.0, desc="Generating Story")
	        story = generate_story(prompt)
	        progress(0.15, desc="Splitting Story into Sentences")
	        sentences = split_story_into_sentences(story)
	        progress(0.2, desc="Generating Images for Sentences")
	        images = generate_images(sentences)
	        progress(0.7, desc="Generating Audio")
	        audio_path, total_duration = generate_audio(story)
	        progress(0.85, desc="Computing Durations")
	        durations = compute_sentence_durations(sentences, total_duration)
	        progress(0.9, desc="Creating Video")
	        video_path = create_video(images, durations, audio_path)
	        return video_path
	    except Exception as e:
	        print(f"Error in process_pipeline: {str(e)}")
	        # Chain the cause so the original traceback is kept in the logs.
	        raise gr.Error(f"An error occurred: {str(e)}") from e

	title = """<h1 align="center">AI Story Video Generator 🎥</h1>
	<p align="center">
	Generate a story from a prompt, create images for each sentence, and produce a video with narration!
	</p>
	"""

	# Page layout: header on top, then a row with the prompt controls in the
	# left column and the resulting video in the right column.
	with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
	    gr.HTML(value=title)

	    with gr.Row():
	        with gr.Column():
	            prompt_input = gr.Textbox(lines=2, label="Enter a Prompt")
	            generate_button = gr.Button(value="Generate Video")
	            progress_bar = gr.Markdown(value="")
	        with gr.Column():
	            video_output = gr.Video(label="Generated Video")

	    # Clicking the button runs the whole pipeline on the entered prompt.
	    generate_button.click(process_pipeline, prompt_input, video_output)

	demo.launch(debug=True)