awacke1 committed
Commit a0010c7 · verified · 1 Parent(s): 02cb7fa

Update app.py

Files changed (1)
  1. app.py +96 -46
app.py CHANGED
@@ -10,11 +10,26 @@ import soundfile as sf
 from dotenv import load_dotenv
 from transformers import AutoProcessor, AutoModel
 import torch
+import tempfile

 # Load environment variables
 load_dotenv()
 HF_TOKEN = os.getenv("API_KEY")

+def cleanup_temp_files():
+    temp_files = [
+        os.path.join(tempfile.gettempdir(), 'speech_output.flac'),
+        os.path.join(tempfile.gettempdir(), 'audio.mp3'),
+        os.path.join(tempfile.gettempdir(), 'my_vid_tmp.mp4'),
+        os.path.join(tempfile.gettempdir(), 'mergedvideo.mp4')
+    ]
+    for file in temp_files:
+        if os.path.exists(file):
+            try:
+                os.remove(file)
+            except:
+                pass
+
 def resize(img_list):
     resize_img_list = []
     for item in img_list:
@@ -24,68 +39,103 @@ def resize(img_list):
     return resize_img_list

 def text2speech(text):
-    # Using Microsoft's SpeechT5 model instead of FastSpeech2
-    processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
-    model = AutoModel.from_pretrained("microsoft/speecht5_tts")
-
-    # Preprocessing text input
-    inputs = processor(text=text, return_tensors="pt")
-
-    # Generate speech with default speaker embedding
-    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
-    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-
-    # Save as flac file
-    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
-    return "speech_output.flac"
+    try:
+        processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
+        model = AutoModel.from_pretrained("microsoft/speecht5_tts")
+
+        inputs = processor(text=text, return_tensors="pt")
+        speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
+        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+        output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
+        sf.write(output_path, speech.numpy(), samplerate=16000)
+        return output_path
+    except Exception as e:
+        print(f"Error in text2speech: {str(e)}")
+        raise

 def merge_audio_video(entities_num, resize_img_list, text_input):
-    speech = text2speech(text_input)
-    wav_audio = AudioSegment.from_file(speech, "flac")
-    wav_audio.export("audio.mp3", format="mp3")
-    audio_length = int(MP3("audio.mp3").info.length)
-    fps = entities_num / audio_length
-    fps = float(format(fps, '.5f'))
-
-    clip = ImageSequenceClip(resize_img_list, fps=fps)
-    clip.write_videofile('my_vid_tmp.mp4')
-
-    videoclip = VideoFileClip('my_vid_tmp.mp4')
-    audioclip = AudioFileClip('audio.mp3')
-    mergedclip = videoclip.set_audio(audioclip)
-
-    return mergedclip
+    try:
+        speech = text2speech(text_input)
+        wav_audio = AudioSegment.from_file(speech, "flac")
+
+        audio_path = os.path.join(tempfile.gettempdir(), "audio.mp3")
+        wav_audio.export(audio_path, format="mp3")
+
+        audio_length = int(MP3(audio_path).info.length)
+        fps = max(entities_num / audio_length, 1)  # Ensure fps is at least 1
+        fps = float(format(fps, '.5f'))
+
+        temp_video = os.path.join(tempfile.gettempdir(), "my_vid_tmp.mp4")
+        clip = ImageSequenceClip(resize_img_list, fps=fps)
+        clip.write_videofile(temp_video, codec='libx264', fps=fps)
+
+        videoclip = VideoFileClip(temp_video)
+        audioclip = AudioFileClip(audio_path)
+        mergedclip = videoclip.set_audio(audioclip)
+
+        output_path = os.path.join(tempfile.gettempdir(), "mergedvideo.mp4")
+        mergedclip.write_videofile(output_path)
+
+        # Clean up clips
+        videoclip.close()
+        audioclip.close()
+        mergedclip.close()
+
+        return output_path
+    except Exception as e:
+        print(f"Error in merge_audio_video: {str(e)}")
+        raise
+    finally:
+        cleanup_temp_files()

-with gr.Blocks() as app:
-    # Load models in Blocks context
-    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
-    latentdiffusion = gr.Interface.load("spaces/multimodalart/latentdiffusion")
+# Load models outside the Blocks context
+ner = gr.load("huggingface/flair/ner-english-ontonotes-large")
+latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion")

-    def engine(text_input):
+def engine(text_input):
+    try:
         entities = ner(text_input)
         entities = [tupl for tupl in entities if None not in tupl]
         entities_num = len(entities)
-        img_list = []

+        if entities_num == 0:
+            raise ValueError("No entities found in the input text")
+
+        img_list = []
         for ent in entities:
             img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
             img_list.append(img)

         resize_img_list = resize(img_list)
-        mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
-        mergedclip.write_videofile('mergedvideo.mp4')
+        output_path = merge_audio_video(entities_num, resize_img_list, text_input)

-        return 'mergedvideo.mp4'
+        return output_path
+    except Exception as e:
+        print(f"Error in engine: {str(e)}")
+        raise gr.Error(f"An error occurred: {str(e)}")
+    finally:
+        cleanup_temp_files()

-    interface = gr.Interface(
-        fn=engine,
-        inputs=gr.Textbox(lines=5, label="Input Text"),
-        outputs=gr.Video(label='Final Merged Video'),
-        description="<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>",
+with gr.Blocks() as app:
+    gr.Markdown("# AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨")
+    gr.Markdown("<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>")
+    text_input = gr.Textbox(lines=5, label="Input Text")
+    output_video = gr.Video(label='Final Merged Video')
+    examples = gr.Examples(
         examples=[
             ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
         ],
-        title="AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨",
-        article="<br><div></div>"
+        inputs=text_input
     )
-    interface.launch(debug=True)
+    submit_button = gr.Button("Generate Video")
+    submit_button.click(fn=engine, inputs=text_input, outputs=output_video)
+
+    gr.Markdown("<br><div></div>")
+
+app.launch(
+    debug=True,
+    share=True,  # Enable sharing
+    server_name="0.0.0.0",  # Listen on all interfaces
+    server_port=7860  # Specify port
+)
 
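A note on the TTS call pattern this commit keeps: loading microsoft/speecht5_tts through the generic AutoProcessor/AutoModel classes may resolve to the base SpeechT5Model, which does not expose generate_speech; the Transformers documentation uses the task-specific classes plus a vocoder and a real speaker embedding. A minimal sketch of that documented pattern follows (the x-vector dataset and index come from the docs example, not from this commit):

import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Task-specific classes from the SpeechT5 docs, rather than AutoProcessor/AutoModel.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Two space marines save the planet.", return_tensors="pt")

# Speaker x-vector from the CMU Arctic embeddings used in the HF docs example
# (index 7306), in place of the commit's all-zeros embedding.
embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

# Without the vocoder argument, generate_speech returns a spectrogram, not a waveform.
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech_output.flac", speech.numpy(), samplerate=16000)  # SpeechT5 outputs 16 kHz audio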
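The new max(entities_num / audio_length, 1) guard in merge_audio_video changes pacing as well as validity: once there are fewer entities than seconds of narration, the image track no longer stretches to cover the audio. A quick arithmetic check (illustrative numbers only, not from the commit):

entities_num, audio_length = 5, 20             # 5 images over a 20-second narration
old_fps = entities_num / audio_length          # 0.25 -> each image holds 4 s, video spans 20 s
new_fps = max(entities_num / audio_length, 1)  # clamped to 1 -> each image holds 1 s, video spans 5 s
print(old_fps, new_fps)                        # 0.25 1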