import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from dotenv import load_dotenv
from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
from mutagen.mp3 import MP3
from PIL import Image
from pydub import AudioSegment
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("API_KEY")
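
# Example .env file (the value below is a placeholder, not a real token):
#   API_KEY=hf_xxxxxxxxxxxxxxxxxxxx
# HF_TOKEN is optional; it is only needed if the models loaded below require
# authenticated access to the Hugging Face API.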

def cleanup_temp_files():
    """Remove intermediate audio/video files from previous runs.

    mergedvideo.mp4 is deliberately excluded: it is the value returned to
    Gradio, and deleting it in a ``finally`` block would remove the file
    before Gradio can serve it. It is simply overwritten on the next run.
    """
    temp_files = [
        os.path.join(tempfile.gettempdir(), 'speech_output.flac'),
        os.path.join(tempfile.gettempdir(), 'audio.mp3'),
        os.path.join(tempfile.gettempdir(), 'my_vid_tmp.mp4'),
    ]
    for file in temp_files:
        if os.path.exists(file):
            try:
                os.remove(file)
            except OSError:
                pass  # file may already be gone; nothing else to do

def resize(img_list):
    """Resize every image to 256x256 and return them as numpy arrays."""
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        imResize = im.resize((256, 256), Image.LANCZOS)
        resize_img_list.append(np.array(imResize))
    return resize_img_list
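
# ImageSequenceClip requires every frame to share a single shape, so resize()
# also normalizes any stray image dimensions coming back from the diffusion
# Space, not just the nominal 256x256 outputs.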

def text2speech(text):
    try:
        # SpeechT5 needs the task-specific TTS head plus a HiFi-GAN vocoder to
        # produce a waveform; the generic AutoModel class exposes neither
        # generate_speech with audio output nor the vocoder.
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        inputs = processor(text=text, return_tensors="pt")
        # Zero vector as a neutral placeholder voice; a real x-vector speaker
        # embedding (e.g. from the CMU Arctic set) sounds considerably better.
        speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
        sf.write(output_path, speech.numpy(), samplerate=16000)  # SpeechT5 emits 16 kHz audio
        return output_path
    except Exception as e:
        print(f"Error in text2speech: {str(e)}")
        raise
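
# Minimal smoke test for text2speech (assumes the model weights are already
# cached locally; kept commented out so it never runs inside the Space):
#
#   flac_path = text2speech("Two space marines walk into a bar.")
#   print(flac_path)  # e.g. /tmp/speech_output.flac on Linux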

def merge_audio_video(entities_num, resize_img_list, text_input):
    try:
        # Narrate the input text, then convert the FLAC speech to MP3.
        speech = text2speech(text_input)
        wav_audio = AudioSegment.from_file(speech, "flac")
        audio_path = os.path.join(tempfile.gettempdir(), "audio.mp3")
        wav_audio.export(audio_path, format="mp3")
        # Pace the slideshow so the images span the narration; clamp the audio
        # length to avoid dividing by zero on sub-second clips, and keep fps
        # at least 1.
        audio_length = max(int(MP3(audio_path).info.length), 1)
        fps = round(max(entities_num / audio_length, 1), 5)
        temp_video = os.path.join(tempfile.gettempdir(), "my_vid_tmp.mp4")
        clip = ImageSequenceClip(resize_img_list, fps=fps)
        clip.write_videofile(temp_video, codec='libx264', fps=fps)
        clip.close()
        videoclip = VideoFileClip(temp_video)
        audioclip = AudioFileClip(audio_path)
        mergedclip = videoclip.set_audio(audioclip)
        output_path = os.path.join(tempfile.gettempdir(), "mergedvideo.mp4")
        mergedclip.write_videofile(output_path)
        # Release the file handles held by the clips.
        videoclip.close()
        audioclip.close()
        mergedclip.close()
        return output_path
    except Exception as e:
        print(f"Error in merge_audio_video: {str(e)}")
        raise
    finally:
        cleanup_temp_files()
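
# Worked example of the pacing math above: 12 entities over a 24 s narration
# give fps = max(12 / 24, 1) = 1, i.e. the clamp holds each image for a full
# second; 120 entities over the same 24 s give fps = 5, or 0.2 s per image.
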
# Load models outside the Blocks context
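# gr.load() returns callables that proxy the remote model / Space through the
# Hugging Face API, so each ner(...) or latentdiffusion(...) call below is a
# network round trip; pass hf_token=HF_TOKEN here if the endpoints need auth.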
ner = gr.load("huggingface/flair/ner-english-ontonotes-large")
latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion")

def engine(text_input):
    try:
        # 1. Extract named entities; drop malformed tuples from the NER output.
        entities = ner(text_input)
        entities = [tupl for tupl in entities if None not in tupl]
        entities_num = len(entities)
        if entities_num == 0:
            raise ValueError("No entities found in the input text")
        # 2. Generate one image per entity, using the entity's surface text as
        #    the prompt. The positional arguments follow the Space's input
        #    order (prompt, steps, width, height, number of images, diversity).
        img_list = []
        for ent in entities:
            img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
            img_list.append(img)
        # 3. Narrate the full input text and merge it with the image sequence.
        resize_img_list = resize(img_list)
        output_path = merge_audio_video(entities_num, resize_img_list, text_input)
        return output_path
    except Exception as e:
        print(f"Error in engine: {str(e)}")
        raise gr.Error(f"An error occurred: {str(e)}")
    finally:
        cleanup_temp_files()

with gr.Blocks() as app:
    gr.Markdown("# AI Pipeline Multi Model 🎥🎞️🍿 Movie Maker 🎬 🧠 🎨")
    gr.Markdown("<div>🎥🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>")
    text_input = gr.Textbox(lines=5, label="Input Text")
    output_video = gr.Video(label='Final Merged Video')
    examples = gr.Examples(
        examples=[
            ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
        ],
        inputs=text_input
    )
    submit_button = gr.Button("Generate Video")
    submit_button.click(fn=engine, inputs=text_input, outputs=output_video)
    gr.Markdown("<br><div></div>")
app.launch(
    debug=True,
    share=True,             # Enable sharing
    server_name="0.0.0.0",  # Listen on all interfaces
    server_port=7860        # Specify port
)