Update app.py
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from transformers import pipeline
 from diffusers import StableDiffusionPipeline
 import torch
 from PIL import Image
@@ -7,10 +7,11 @@ import numpy as np
 import os
 import tempfile
 import moviepy.editor as mpe
-import soundfile as sf
 import nltk
 from pydub import AudioSegment
 import warnings
+import asyncio
+import edge_tts
 
 warnings.filterwarnings("ignore", category=UserWarning)
 
@@ -22,33 +23,36 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if device == "cuda" else torch.float32
 
 # Story generator
-story_generator = pipeline('text-generation', model='gpt2-large', device=0 if device == 'cuda' else -1)
+story_generator = pipeline(
+    'text-generation',
+    model='gpt2-large',
+    device=0 if device == 'cuda' else -1
+)
 
 # Stable Diffusion model
 sd_model_id = "runwayml/stable-diffusion-v1-5"
-sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model_id, torch_dtype=torch_dtype)
+sd_pipe = StableDiffusionPipeline.from_pretrained(
+    sd_model_id,
+    torch_dtype=torch_dtype
+)
 sd_pipe = sd_pipe.to(device)
 
-# Text-to-Speech
-tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts", torch_dtype=torch_dtype)
-tts_model = tts_model.to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", torch_dtype=torch_dtype)
-vocoder = vocoder.to(device)
-
+# Text-to-Speech function using edge_tts
 def text2speech(text):
     try:
-        inputs = tts_processor(text=text, return_tensors="pt").to(device)
-        speaker_embeddings = torch.zeros((1, 512), device=device)
-        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-
-        output_path = os.path.join(tempfile.gettempdir(), "speech_output.wav")
-        sf.write(output_path, speech.cpu().numpy(), samplerate=16000)
+        output_path = asyncio.run(_text2speech_async(text))
         return output_path
     except Exception as e:
         print(f"Error in text2speech: {str(e)}")
         raise
 
+async def _text2speech_async(text):
+    communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+    await communicate.save(tmp_path)
+    return tmp_path
+
 def generate_story(prompt):
     generated = story_generator(prompt, max_length=500, num_return_sequences=1)
     story = generated[0]['generated_text']
@@ -63,7 +67,7 @@ def generate_images(sentences):
     for idx, sentence in enumerate(sentences):
         image = sd_pipe(sentence).images[0]
         # Save image to temporary file
-        temp_file = tempfile.NamedTemporaryFile(suffix=f"_{idx}.png")
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f"_{idx}.png")
         image.save(temp_file.name)
         images.append(temp_file.name)
     return images
@@ -98,18 +102,31 @@ def create_video(images, durations, audio_path):
 
 def process_pipeline(prompt, progress=gr.Progress(track_tqdm=True)):
     try:
+        with progress.tqdm(total=6) as pbar:
+            pbar.set_description("Generating Story")
             story = generate_story(prompt)
+            pbar.update(1)
+
+            pbar.set_description("Splitting Story into Sentences")
             sentences = split_story_into_sentences(story)
+            pbar.update(1)
+
+            pbar.set_description("Generating Images for Sentences")
             images = generate_images(sentences)
+            pbar.update(1)
+
+            pbar.set_description("Generating Audio")
             audio_path, total_duration = generate_audio(story)
+            pbar.update(1)
+
+            pbar.set_description("Computing Durations")
             durations = compute_sentence_durations(sentences, total_duration)
+            pbar.update(1)
+
+            pbar.set_description("Creating Video")
             video_path = create_video(images, durations, audio_path)
+            pbar.update(1)
+
         return video_path
     except Exception as e:
         print(f"Error in process_pipeline: {str(e)}")
@@ -128,7 +145,6 @@ with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
         with gr.Column():
            prompt_input = gr.Textbox(label="Enter a Prompt", lines=2)
            generate_button = gr.Button("Generate Video")
-           progress_bar = gr.Markdown("")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
 
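A few notes on the change. The rewritten TTS path calls Microsoft's Edge neural voices through the edge_tts package, so it needs network access at runtime instead of a local SpeechT5 checkpoint. A minimal standalone sketch of the same flow (assuming edge-tts is installed via pip install edge-tts; the sample text and voice here are illustrative):

    import asyncio
    import tempfile

    import edge_tts

    async def synthesize(text: str) -> str:
        # Stream synthesized speech from the Edge TTS service into an MP3 file.
        communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            path = tmp.name  # reserve a filename; edge_tts writes to it below
        await communicate.save(path)
        return path

    if __name__ == "__main__":
        print(asyncio.run(synthesize("Once upon a time, a fox crossed the river.")))

Wrapping the coroutine with asyncio.run() inside text2speech works because Gradio runs the handler in a plain worker thread with no running event loop; asyncio.run() would raise RuntimeError if one were already running.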
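The delete=False arguments added to NamedTemporaryFile are a real fix, not a style change: by default the backing file is removed as soon as the handle is closed (and on Windows the still-open file cannot be reopened by path), so PIL and moviepy would later find the frame images and audio missing. A quick illustration:

    import os
    import tempfile

    # Default behaviour: the backing file disappears when the handle closes.
    tmp = tempfile.NamedTemporaryFile(suffix=".png")
    tmp.close()
    print(os.path.exists(tmp.name))  # False

    # With delete=False the file survives and can be reopened by path later.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    tmp.close()
    print(os.path.exists(tmp.name))  # True
    os.remove(tmp.name)  # the caller now owns cleanup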
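The click wiring itself sits outside the hunks shown here; with the components in the diff it presumably looks roughly like this (a sketch, not the file's actual code):

    generate_button.click(
        fn=process_pipeline,    # story -> images -> audio -> video
        inputs=prompt_input,    # the Textbox defined above
        outputs=video_output,   # the Video component defined above
    )

Because process_pipeline declares progress=gr.Progress(track_tqdm=True), Gradio injects the progress tracker automatically; it is not listed in inputs. That is also why the placeholder progress_bar = gr.Markdown("") component could be dropped from the UI.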