Spaces:
Running
Running
import gradio as gr | |
import moviepy.video.io.ImageSequenceClip | |
from PIL import Image | |
from pydub import AudioSegment | |
from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip | |
import numpy as np | |
import os | |
from mutagen.mp3 import MP3 | |
import soundfile as sf | |
from dotenv import load_dotenv | |
from transformers import AutoProcessor, AutoModel | |
import torch | |
import tempfile | |
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()
# Hugging Face token read from the API_KEY environment variable.
# NOTE(review): HF_TOKEN is never referenced anywhere in this file — confirm
# it is still needed (gr.load may pick up credentials by other means) or drop it.
HF_TOKEN = os.getenv("API_KEY")
def cleanup_temp_files():
    """Best-effort removal of the temp files this pipeline writes.

    Missing files are fine (removal is attempted directly instead of the old
    race-prone exists-then-remove), and permission errors are ignored, so this
    is safe to call from any cleanup path including exception handlers.
    """
    temp_dir = tempfile.gettempdir()
    for name in ("speech_output.flac", "audio.mp3", "my_vid_tmp.mp4", "mergedvideo.mp4"):
        try:
            os.remove(os.path.join(temp_dir, name))
        except OSError:
            # FileNotFoundError / PermissionError are non-fatal here. The old
            # bare `except:` swallowed *every* exception, hiding real bugs.
            pass
def resize(img_list):
    """Load each image path in `img_list` as a 256x256 numpy frame.

    Every image is opened with PIL, resized to 256x256 with Lanczos
    resampling, and converted to an array suitable for ImageSequenceClip.
    Returns the list of frames in input order.
    """
    return [
        np.array(Image.open(path).resize((256, 256), Image.LANCZOS))
        for path in img_list
    ]
def text2speech(text):
    """Synthesize `text` to speech and write it to <tempdir>/speech_output.flac.

    Returns the path of the written FLAC file. Errors are logged and re-raised.

    NOTE(review): the processor and model are re-loaded on every call —
    consider caching them at module level. Also, AutoModel for
    "microsoft/speecht5_tts" may resolve to the bare SpeechT5Model, which does
    not expose generate_speech(); SpeechT5ForTextToSpeech is the documented
    TTS class — confirm against the transformers docs. The all-zeros speaker
    embedding likely yields a generic/degraded voice — verify output quality.
    """
    try:
        processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        model = AutoModel.from_pretrained("microsoft/speecht5_tts")
        inputs = processor(text=text, return_tensors="pt")
        # Zero vector used in place of a real x-vector speaker embedding.
        speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
        output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
        # Written at 16 kHz — the sample rate this code assumes for the model
        # output; confirm against the SpeechT5 docs.
        sf.write(output_path, speech.numpy(), samplerate=16000)
        return output_path
    except Exception as e:
        print(f"Error in text2speech: {str(e)}")
        raise
def merge_audio_video(entities_num, resize_img_list, text_input):
    """Narrate `text_input`, build a slideshow from `resize_img_list`, and mux them.

    Args:
        entities_num: number of frames (paces the slideshow to the narration).
        resize_img_list: list of HxWx3 numpy frames (see resize()).
        text_input: text to synthesize as the narration track.

    Returns:
        Path to the merged MP4 in the system temp directory. The caller owns
        that file; only the *intermediate* artifacts are removed here.

    Raises:
        Re-raises any pipeline failure after logging it.
    """
    tmp_dir = tempfile.gettempdir()
    speech_path = os.path.join(tmp_dir, "speech_output.flac")
    audio_path = os.path.join(tmp_dir, "audio.mp3")
    temp_video = os.path.join(tmp_dir, "my_vid_tmp.mp4")
    try:
        speech = text2speech(text_input)
        wav_audio = AudioSegment.from_file(speech, "flac")
        wav_audio.export(audio_path, format="mp3")
        # Use the float duration with a floor of 1s: the old int() truncation
        # could yield 0 for sub-second audio and divide by zero below.
        audio_length = max(MP3(audio_path).info.length, 1.0)
        fps = max(entities_num / audio_length, 1.0)  # at least 1 fps
        fps = round(fps, 5)
        clip = ImageSequenceClip(resize_img_list, fps=fps)
        clip.write_videofile(temp_video, codec='libx264', fps=fps)
        videoclip = VideoFileClip(temp_video)
        audioclip = AudioFileClip(audio_path)
        mergedclip = videoclip.set_audio(audioclip)
        output_path = os.path.join(tmp_dir, "mergedvideo.mp4")
        mergedclip.write_videofile(output_path)
        # Release ffmpeg readers/writers before the temp files are removed.
        videoclip.close()
        audioclip.close()
        mergedclip.close()
        return output_path
    except Exception as e:
        print(f"Error in merge_audio_video: {str(e)}")
        raise
    finally:
        # Remove only the intermediates. The previous code called
        # cleanup_temp_files() here, which also deleted mergedvideo.mp4 —
        # the very file whose path this function just returned.
        for path in (speech_path, audio_path, temp_video):
            try:
                os.remove(path)
            except OSError:
                pass
# Load the remote models once at import time (not per request):
#  - ner: Flair OntoNotes NER endpoint, used to pick entities worth illustrating.
#  - latentdiffusion: text-to-image Space that renders one image per entity.
ner = gr.load("huggingface/flair/ner-english-ontonotes-large")
latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion")
def engine(text_input):
    """Full pipeline: NER -> one generated image per entity -> narrated video.

    Args:
        text_input: free-form story text typed by the user.

    Returns:
        Path to the merged MP4, consumed by the gr.Video output component.

    Raises:
        gr.Error: user-facing wrapper around any pipeline failure.
    """
    try:
        entities = ner(text_input)
        # Drop malformed tuples the NER endpoint sometimes returns.
        entities = [tupl for tupl in entities if None not in tupl]
        entities_num = len(entities)
        if entities_num == 0:
            raise ValueError("No entities found in the input text")
        img_list = []
        for ent in entities:
            # ent[0] is the entity text; '50' diffusion steps, 256x256, 1 image.
            img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
            img_list.append(img)
        resize_img_list = resize(img_list)
        # merge_audio_video cleans up its own intermediates. Do NOT call
        # cleanup_temp_files() in a finally here: the old code did, and that
        # deleted mergedvideo.mp4 before Gradio could stream it back.
        return merge_audio_video(entities_num, resize_img_list, text_input)
    except Exception as e:
        print(f"Error in engine: {str(e)}")
        raise gr.Error(f"An error occurred: {str(e)}") from e
# --- Gradio UI. `app` is launched at the bottom of the module. ---
with gr.Blocks() as app:
    # NOTE(review): the emoji in both Markdown strings look mojibake'd
    # (UTF-8 bytes decoded as a single-byte codepage). Restore the intended
    # emoji from the original source if available — left byte-identical here.
    gr.Markdown("# AI Pipeline Multi Model πποΈπΏ Movie Maker π¬ π§ π¨")
    gr.Markdown("<div>πποΈπΏ AI Movie Maker - Comedy π¬ π§ π¨</div>")
    # User input text and the final merged-video output.
    text_input = gr.Textbox(lines=5, label="Input Text")
    output_video = gr.Video(label='Final Merged Video')
    # One canned example prompt the user can click to populate the textbox.
    examples = gr.Examples(
        examples=[
            ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
        ],
        inputs=text_input
    )
    submit_button = gr.Button("Generate Video")
    # Wire the button to the full text -> entities -> images -> video pipeline.
    submit_button.click(fn=engine, inputs=text_input, outputs=output_video)
    gr.Markdown("<br><div></div>")
# Start the server. debug=True surfaces tracebacks in the console;
# 0.0.0.0 is required inside containers (e.g. HF Spaces) so the reverse
# proxy can reach the app on port 7860.
# NOTE(review): share=True opens a public tunnel when run locally and is
# redundant on HF Spaces — confirm it is intended.
app.launch(
    debug=True,
    share=True,  # Enable sharing
    server_name="0.0.0.0",  # Listen on all interfaces
    server_port=7860  # Specify port
)