awacke1 committed
Commit 9e1ef69 · verified · 1 Parent(s): a0010c7

Update app.py

Files changed (1): app.py (+117 -121)
app.py CHANGED
@@ -1,141 +1,137 @@
  import gradio as gr
- import moviepy.video.io.ImageSequenceClip
  from PIL import Image
- from pydub import AudioSegment
- from moviepy.editor import ImageSequenceClip, VideoFileClip, AudioFileClip
  import numpy as np
  import os
- from mutagen.mp3 import MP3
- import soundfile as sf
- from dotenv import load_dotenv
- from transformers import AutoProcessor, AutoModel
- import torch
  import tempfile

- # Load environment variables
- load_dotenv()
- HF_TOKEN = os.getenv("API_KEY")
-
- def cleanup_temp_files():
-     temp_files = [
-         os.path.join(tempfile.gettempdir(), 'speech_output.flac'),
-         os.path.join(tempfile.gettempdir(), 'audio.mp3'),
-         os.path.join(tempfile.gettempdir(), 'my_vid_tmp.mp4'),
-         os.path.join(tempfile.gettempdir(), 'mergedvideo.mp4')
-     ]
-     for file in temp_files:
-         if os.path.exists(file):
-             try:
-                 os.remove(file)
-             except:
-                 pass
-
- def resize(img_list):
-     resize_img_list = []
-     for item in img_list:
-         im = Image.open(item)
-         imResize = im.resize((256, 256), Image.LANCZOS)
-         resize_img_list.append(np.array(imResize))
-     return resize_img_list

  def text2speech(text):
      try:
-         processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
-         model = AutoModel.from_pretrained("microsoft/speecht5_tts")
-
-         inputs = processor(text=text, return_tensors="pt")
-         speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
-         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)

-         output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
-         sf.write(output_path, speech.numpy(), samplerate=16000)
          return output_path
      except Exception as e:
          print(f"Error in text2speech: {str(e)}")
          raise

- def merge_audio_video(entities_num, resize_img_list, text_input):
-     try:
-         speech = text2speech(text_input)
-         wav_audio = AudioSegment.from_file(speech, "flac")
-
-         audio_path = os.path.join(tempfile.gettempdir(), "audio.mp3")
-         wav_audio.export(audio_path, format="mp3")
-
-         audio_length = int(MP3(audio_path).info.length)
-         fps = max(entities_num / audio_length, 1)  # Ensure fps is at least 1
-         fps = float(format(fps, '.5f'))
-
-         temp_video = os.path.join(tempfile.gettempdir(), "my_vid_tmp.mp4")
-         clip = ImageSequenceClip(resize_img_list, fps=fps)
-         clip.write_videofile(temp_video, codec='libx264', fps=fps)
-
-         videoclip = VideoFileClip(temp_video)
-         audioclip = AudioFileClip(audio_path)
-         mergedclip = videoclip.set_audio(audioclip)
-
-         output_path = os.path.join(tempfile.gettempdir(), "mergedvideo.mp4")
-         mergedclip.write_videofile(output_path)
-
-         # Clean up clips
-         videoclip.close()
-         audioclip.close()
-         mergedclip.close()
-
-         return output_path
-     except Exception as e:
-         print(f"Error in merge_audio_video: {str(e)}")
-         raise
-     finally:
-         cleanup_temp_files()

- # Load models outside the Blocks context
- ner = gr.load("huggingface/flair/ner-english-ontonotes-large")
- latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion")

- def engine(text_input):
      try:
-         entities = ner(text_input)
-         entities = [tupl for tupl in entities if None not in tupl]
-         entities_num = len(entities)
-
-         if entities_num == 0:
-             raise ValueError("No entities found in the input text")
-
-         img_list = []
-         for ent in entities:
-             img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
-             img_list.append(img)
-
-         resize_img_list = resize(img_list)
-         output_path = merge_audio_video(entities_num, resize_img_list, text_input)
-
-         return output_path
      except Exception as e:
-         print(f"Error in engine: {str(e)}")
          raise gr.Error(f"An error occurred: {str(e)}")
-     finally:
-         cleanup_temp_files()
-
- with gr.Blocks() as app:
-     gr.Markdown("# AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨")
-     gr.Markdown("<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>")
-     text_input = gr.Textbox(lines=5, label="Input Text")
-     output_video = gr.Video(label='Final Merged Video')
-     examples = gr.Examples(
-         examples=[
-             ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
-         ],
-         inputs=text_input
-     )
-     submit_button = gr.Button("Generate Video")
-     submit_button.click(fn=engine, inputs=text_input, outputs=output_video)
-
-     gr.Markdown("<br><div></div>")
-
- app.launch(
-     debug=True,
-     share=True,  # Enable sharing
-     server_name="0.0.0.0",  # Listen on all interfaces
-     server_port=7860  # Specify port
- )
 
  import gradio as gr
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from diffusers import StableDiffusionPipeline
+ import torch
  from PIL import Image
  import numpy as np
  import os
  import tempfile
+ import moviepy.editor as mpe
+ import soundfile as sf
+ import nltk
+ from pydub import AudioSegment
+ import warnings
+
+ warnings.filterwarnings("ignore", category=UserWarning)
+
+ # Ensure NLTK data is downloaded
+ nltk.download('punkt')
+
+ # Initialize models
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if device == "cuda" else torch.float32

+ # Story generator
+ story_generator = pipeline('text-generation', model='gpt2-large', device=0 if device == 'cuda' else -1)
+
+ # Stable Diffusion model
+ sd_model_id = "runwayml/stable-diffusion-v1-5"
+ sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model_id, torch_dtype=torch_dtype)
+ sd_pipe = sd_pipe.to(device)
+
+ # Text-to-Speech model
+ tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts", torch_dtype=torch_dtype)
+ tts_model = tts_model.to(device)
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", torch_dtype=torch_dtype)
+ vocoder = vocoder.to(device)

  def text2speech(text):
      try:
+         inputs = tts_processor(text=text, return_tensors="pt").to(device)
+         # Zero speaker embedding (neutral voice); match the model's dtype so
+         # fp16 inference on GPU does not fail on a dtype mismatch
+         speaker_embeddings = torch.zeros((1, 512), device=device, dtype=torch_dtype)
+         speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+         output_path = os.path.join(tempfile.gettempdir(), "speech_output.wav")
+         # Cast to float32 before writing; soundfile cannot serialize float16
+         sf.write(output_path, speech.float().cpu().numpy(), samplerate=16000)
          return output_path
      except Exception as e:
          print(f"Error in text2speech: {str(e)}")
          raise

+ def generate_story(prompt):
+     generated = story_generator(prompt, max_length=500, num_return_sequences=1)
+     story = generated[0]['generated_text']
+     return story
+
+ def split_story_into_sentences(story):
+     sentences = nltk.sent_tokenize(story)
+     return sentences
+
+ def generate_images(sentences):
+     images = []
+     for idx, sentence in enumerate(sentences):
+         image = sd_pipe(sentence).images[0]
+         # Save image to temporary file
+         temp_file = tempfile.NamedTemporaryFile(suffix=f"_{idx}.png", delete=False)
+         image.save(temp_file.name)
+         images.append(temp_file.name)
+     return images
+
+ def generate_audio(story_text):
+     audio_path = text2speech(story_text)
+     audio = AudioSegment.from_file(audio_path)
+     total_duration = len(audio) / 1000  # duration in seconds
+     return audio_path, total_duration
+
+ def compute_sentence_durations(sentences, total_duration):
+     total_words = sum(len(sentence.split()) for sentence in sentences)
+     sentence_durations = []
+     for sentence in sentences:
+         num_words = len(sentence.split())
+         duration = total_duration * (num_words / total_words)
+         sentence_durations.append(duration)
+     return sentence_durations
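+ # Note: compute_sentence_durations allocates narration time by word share; e.g.,
+ # sentences of 10 and 30 words over a 20 s narration get 20*(10/40) = 5 s and 15 s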
 
+ def create_video(images, durations, audio_path):
+     clips = []
+     for image_path, duration in zip(images, durations):
+         clip = mpe.ImageClip(image_path).set_duration(duration)
+         clips.append(clip)
+     video = mpe.concatenate_videoclips(clips, method='compose')
+     audio = mpe.AudioFileClip(audio_path)
+     video = video.set_audio(audio)
+     # Save video
+     output_path = os.path.join(tempfile.gettempdir(), "final_video.mp4")
+     video.write_videofile(output_path, fps=1, codec='libx264')
+     return output_path

+ def process_pipeline(prompt, progress=gr.Progress(track_tqdm=True)):
      try:
+         # gr.Progress is callable rather than a context manager; report each
+         # stage directly (the fractions are rough stage markers)
+         progress(0.0, desc="Generating Story")
+         story = generate_story(prompt)
+         progress(0.2, desc="Splitting Story into Sentences")
+         sentences = split_story_into_sentences(story)
+         progress(0.3, desc="Generating Images for Sentences")
+         images = generate_images(sentences)
+         progress(0.6, desc="Generating Audio")
+         audio_path, total_duration = generate_audio(story)
+         progress(0.8, desc="Computing Durations")
+         durations = compute_sentence_durations(sentences, total_duration)
+         progress(0.9, desc="Creating Video")
+         video_path = create_video(images, durations, audio_path)
+         return video_path
      except Exception as e:
+         print(f"Error in process_pipeline: {str(e)}")
          raise gr.Error(f"An error occurred: {str(e)}")
+
+ title = """<h1 align="center">AI Story Video Generator 🎥</h1>
+ <p align="center">
+ Generate a story from a prompt, create images for each sentence, and produce a video with narration!
+ </p>
+ """
+
+ with gr.Blocks(css=".container { max-width: 800px; margin: auto; }") as demo:
+     gr.HTML(title)
+
+     with gr.Row():
+         with gr.Column():
+             prompt_input = gr.Textbox(label="Enter a Prompt", lines=2)
+             generate_button = gr.Button("Generate Video")
+             progress_bar = gr.Markdown("")
+         with gr.Column():
+             video_output = gr.Video(label="Generated Video")
+
+     generate_button.click(fn=process_pipeline, inputs=prompt_input, outputs=video_output)
+
+ demo.launch(debug=True)
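
For quick verification outside the Gradio UI, here is a minimal headless sketch of the same pipeline. It is illustrative only: it assumes the helper functions above are importable from app.py without triggering demo.launch() (e.g., the launch call moved under a __main__ guard), and the prompt is an example.

# Headless sketch of the pipeline above (illustrative; see assumptions above)
from app import (generate_story, split_story_into_sentences, generate_images,
                 generate_audio, compute_sentence_durations, create_video)

story = generate_story("A robot learns to paint.")                # example prompt
sentences = split_story_into_sentences(story)
images = generate_images(sentences)                               # one PNG per sentence
audio_path, total_duration = generate_audio(story)                # narration WAV + length (s)
durations = compute_sentence_durations(sentences, total_duration)
print(create_video(images, durations, audio_path))                # path to the final MP4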