Spaces:

awacke1
/

AI-MovieMaker-Comedy

Running

App Files Files Community

awacke1 committed on Oct 30, 2024

Commit

9e1ef69

verified ·

1 Parent(s): a0010c7

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -121

app.py CHANGED Viewed

@@ -1,141 +1,137 @@
 import gradio as gr
-import moviepy.video.io.ImageSequenceClip
 from PIL import Image
-from pydub import AudioSegment
-from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
 import numpy as np
 import os
-from mutagen.mp3 import MP3
-import soundfile as sf
-from dotenv import load_dotenv
-from transformers import AutoProcessor, AutoModel
-import torch
 import tempfile
-# Load environment variables
-load_dotenv()
-HF_TOKEN = os.getenv("API_KEY")
-def cleanup_temp_files():
-    temp_files = [
-        os.path.join(tempfile.gettempdir(), 'speech_output.flac'),
-        os.path.join(tempfile.gettempdir(), 'audio.mp3'),
-        os.path.join(tempfile.gettempdir(), 'my_vid_tmp.mp4'),
-        os.path.join(tempfile.gettempdir(), 'mergedvideo.mp4')
-    ]
-    for file in temp_files:
-        if os.path.exists(file):
-            try:
-                os.remove(file)
-            except:
-                pass
-def resize(img_list):
-    resize_img_list = []
-    for item in img_list:
-        im = Image.open(item)
-        imResize = im.resize((256, 256), Image.LANCZOS)
-        resize_img_list.append(np.array(imResize))
-    return resize_img_list
 def text2speech(text):
     try:
-        processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
-        model = AutoModel.from_pretrained("microsoft/speecht5_tts")
-        inputs = processor(text=text, return_tensors="pt")
-        speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
-        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-        output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
-        sf.write(output_path, speech.numpy(), samplerate=16000)
         return output_path
     except Exception as e:
         print(f"Error in text2speech: {str(e)}")
         raise
-def merge_audio_video(entities_num, resize_img_list, text_input):
-    try:
-        speech = text2speech(text_input)
-        wav_audio = AudioSegment.from_file(speech, "flac")
-        audio_path = os.path.join(tempfile.gettempdir(), "audio.mp3")
-        wav_audio.export(audio_path, format="mp3")
-        audio_length = int(MP3(audio_path).info.length)
-        fps = max(entities_num / audio_length, 1)  # Ensure fps is at least 1
-        fps = float(format(fps, '.5f'))
-        temp_video = os.path.join(tempfile.gettempdir(), "my_vid_tmp.mp4")
-        clip = ImageSequenceClip(resize_img_list, fps=fps)
-        clip.write_videofile(temp_video, codec='libx264', fps=fps)
-        videoclip = VideoFileClip(temp_video)
-        audioclip = AudioFileClip(audio_path)
-        mergedclip = videoclip.set_audio(audioclip)
-        output_path = os.path.join(tempfile.gettempdir(), "mergedvideo.mp4")
-        mergedclip.write_videofile(output_path)
-        # Clean up clips
-        videoclip.close()
-        audioclip.close()
-        mergedclip.close()
-        return output_path
-    except Exception as e:
-        print(f"Error in merge_audio_video: {str(e)}")
-        raise
-    finally:
-        cleanup_temp_files()
-# Load models outside the Blocks context
-ner = gr.load("huggingface/flair/ner-english-ontonotes-large")
-latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion")
-def engine(text_input):
     try:
-        entities = ner(text_input)
-        entities = [tupl for tupl in entities if None not in tupl]
-        entities_num = len(entities)
-        if entities_num == 0:
-            raise ValueError("No entities found in the input text")
-        img_list = []
-        for ent in entities:
-            img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
-            img_list.append(img)
-        resize_img_list = resize(img_list)
-        output_path = merge_audio_video(entities_num, resize_img_list, text_input)
-        return output_path
     except Exception as e:
-        print(f"Error in engine: {str(e)}")
         raise gr.Error(f"An error occurred: {str(e)}")
-    finally:
-        cleanup_temp_files()
-with gr.Blocks() as app:
-    gr.Markdown("# AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨")
-    gr.Markdown("<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>")
-    text_input = gr.Textbox(lines=5, label="Input Text")
-    output_video = gr.Video(label='Final Merged Video')
-    examples = gr.Examples(
-        examples=[
-            ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
-        ],
-        inputs=text_input
-    )
-    submit_button = gr.Button("Generate Video")
-    submit_button.click(fn=engine, inputs=text_input, outputs=output_video)
-    gr.Markdown("<br><div></div>")
-app.launch(
-    debug=True,
-    share=True,  # Enable sharing
-    server_name="0.0.0.0",  # Listen on all interfaces
-    server_port=7860  # Specify port
-)

 import gradio as gr
+from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from diffusers import StableDiffusionPipeline
+import torch
 from PIL import Image
 import numpy as np
 import os
 import tempfile
+import moviepy.editor as mpe
+import soundfile as sf
+import nltk
+from pydub import AudioSegment
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+# Ensure NLTK data is downloaded
+# ('punkt' is fetched at import time, before any request is served).
+nltk.download('punkt')
+# Initialize models
+# All models below are loaded once at module import and shared by every call.
+# Use the GPU with half precision when CUDA is available, otherwise CPU with float32.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if device == "cuda" else torch.float32
+# Story generator
+# The transformers pipeline takes a device index: 0 for the first GPU, -1 for CPU.
+story_generator = pipeline('text-generation', model='gpt2-large', device=0 if device=='cuda' else -1)
+# Stable Diffusion model
+sd_model_id = "runwayml/stable-diffusion-v1-5"
+sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model_id, torch_dtype=torch_dtype)
+sd_pipe = sd_pipe.to(device)
+# Text-to-Speech model
+# SpeechT5 processor + acoustic model, with the HiFi-GAN vocoder that is passed
+# to generate_speech() in text2speech().
+tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts", torch_dtype=torch_dtype)
+tts_model = tts_model.to(device)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", torch_dtype=torch_dtype)
+vocoder = vocoder.to(device)
 def text2speech(text):
+    """Synthesize ``text`` with SpeechT5 and return the path of the WAV file.
+
+    The audio is written to ``speech_output.wav`` in the system temp
+    directory at a 16 kHz sample rate; the file is overwritten on each call.
+    Any exception is logged to stdout and re-raised.
+    """
     try:
+        inputs = tts_processor(text=text, return_tensors="pt").to(device)
+        # Zero speaker embedding, created in the same dtype the model was
+        # loaded with: a float32 tensor would not match float16 weights on GPU.
+        speaker_embeddings = torch.zeros((1, 512), device=device, dtype=torch_dtype)
+        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+        output_path = os.path.join(tempfile.gettempdir(), "speech_output.wav")
+        # soundfile does not accept float16 arrays, so cast to float32 first
+        # (a no-op when the model already runs in float32).
+        sf.write(output_path, speech.float().cpu().numpy(), samplerate=16000)
         return output_path
     except Exception as e:
         print(f"Error in text2speech: {str(e)}")
         raise
+def generate_story(prompt):
+    """Return the text produced by ``story_generator`` for ``prompt``.
+
+    A single sequence is requested with ``max_length=500``.
+    """
+    outputs = story_generator(prompt, max_length=500, num_return_sequences=1)
+    return outputs[0]['generated_text']
+def split_story_into_sentences(story):
+    """Split ``story`` into a list of sentences with NLTK's ``sent_tokenize``."""
+    return nltk.sent_tokenize(story)
+def generate_images(sentences):
+    """Render one image per sentence and return the list of PNG file paths.
+
+    Each sentence is passed to ``sd_pipe`` as the prompt and the first
+    returned image is saved to a uniquely named temporary file. The files are
+    created with ``delete=False`` and are not removed by this function.
+    """
+    images = []
+    for idx, sentence in enumerate(sentences):
+        image = sd_pipe(sentence).images[0]
+        # Reserve a unique path, then close the handle before saving so that
+        # no open file descriptor is leaked for every generated image.
+        with tempfile.NamedTemporaryFile(suffix=f"_{idx}.png", delete=False) as temp_file:
+            image_path = temp_file.name
+        image.save(image_path)
+        images.append(image_path)
+    return images
+def generate_audio(story_text):
+    """Narrate ``story_text`` and return ``(audio_path, duration_in_seconds)``.
+
+    The audio file is produced by ``text2speech`` and then re-read with pydub
+    to measure its length.
+    """
+    narration_path = text2speech(story_text)
+    # The segment length is divided by 1000 to express the duration in seconds.
+    narration_length = len(AudioSegment.from_file(narration_path))
+    return narration_path, narration_length / 1000
+def compute_sentence_durations(sentences, total_duration):
+    """Split ``total_duration`` across ``sentences`` in proportion to word count.
+
+    Returns a list with one duration per sentence, in the same order. Words
+    are counted with ``str.split()``. If the sentences contain no words at
+    all, the time is shared equally instead of dividing by zero. An empty
+    ``sentences`` list yields an empty list.
+    """
+    word_counts = [len(sentence.split()) for sentence in sentences]
+    total_words = sum(word_counts)
+    if total_words == 0:
+        # Nothing to weight by (e.g. only empty strings): share time equally.
+        return [total_duration / len(sentences) for _ in sentences]
+    return [total_duration * (count / total_words) for count in word_counts]
+def create_video(images, durations, audio_path):
+    """Build a narrated slideshow and return the path of the rendered MP4.
+
+    Each image is shown for its paired duration (images and durations are
+    paired with ``zip``, so extra items on either side are ignored), the clips
+    are concatenated, and the audio at ``audio_path`` is attached. The result
+    is written to ``final_video.mp4`` in the system temp directory.
+
+    Raises:
+        ValueError: if there is no image/duration pair to render.
+    """
+    clips = [
+        mpe.ImageClip(image_path).set_duration(duration)
+        for image_path, duration in zip(images, durations)
+    ]
+    if not clips:
+        raise ValueError("No images available to create a video")
+    audio = mpe.AudioFileClip(audio_path)
+    try:
+        video = mpe.concatenate_videoclips(clips, method='compose')
+        video = video.set_audio(audio)
+        # Save video
+        output_path = os.path.join(tempfile.gettempdir(), "final_video.mp4")
+        video.write_videofile(output_path, fps=1, codec='libx264')
+        return output_path
+    finally:
+        # Release the audio reader and clip resources even if rendering fails.
+        audio.close()
+        for clip in clips:
+            clip.close()
+def process_pipeline(prompt, progress=gr.Progress(track_tqdm=True)):
+    """Run the full prompt-to-video pipeline and return the video file path.
+
+    Steps: generate a story, split it into sentences, render one image per
+    sentence, narrate the story, time each sentence, and assemble the video.
+    ``progress`` is the Gradio progress tracker injected by the event handler;
+    it is called before each step to update the status text shown in the UI.
+
+    Raises:
+        gr.Error: wrapping any exception raised by a pipeline step.
+    """
     try:
+        # gr.Progress is a callable tracker, not a context manager, and its
+        # constructor takes no ``desc``; report each stage by calling it.
+        progress(0.0, desc="Generating Story")
+        story = generate_story(prompt)
+        progress(0.15, desc="Splitting Story into Sentences")
+        sentences = split_story_into_sentences(story)
+        progress(0.2, desc="Generating Images for Sentences")
+        images = generate_images(sentences)
+        progress(0.7, desc="Generating Audio")
+        audio_path, total_duration = generate_audio(story)
+        progress(0.85, desc="Computing Durations")
+        durations = compute_sentence_durations(sentences, total_duration)
+        progress(0.9, desc="Creating Video")
+        video_path = create_video(images, durations, audio_path)
+        return video_path
     except Exception as e:
+        print(f"Error in process_pipeline: {str(e)}")
         raise gr.Error(f"An error occurred: {str(e)}")
+# HTML header rendered at the top of the page via gr.HTML below.
+title = """<h1 align="center">AI Story Video Generator 🎥</h1>
+<p align="center">
+Generate a story from a prompt, create images for each sentence, and produce a video with narration!
+</p>
+"""
+# Two-column layout: prompt and button on the left, resulting video on the right.
+with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
+    gr.HTML(title)
+    with gr.Row():
+        with gr.Column():
+            prompt_input = gr.Textbox(label="Enter a Prompt", lines=2)
+            generate_button = gr.Button("Generate Video")
+            # Empty Markdown placeholder; nothing in the code shown here updates it.
+            progress_bar = gr.Markdown("")
+        with gr.Column():
+            video_output = gr.Video(label="Generated Video")
+    # Clicking the button runs the whole pipeline and shows the returned video file.
+    generate_button.click(fn=process_pipeline, inputs=prompt_input, outputs=video_output)
+demo.launch(debug=True)